Data Cleaning - SQL

Background

In this project we use some cleaing techniques such as

  • Standardize Data Format
  • Populate Property Address data
  • Breaking out Address into Individual Column (Address, City, State)
  • Change Y and N to Yes and No in "Sold as Vacant" field
  • Remove Duplicates
  • Delete Unused Columns

SQL Code


/*
Cleaning Data in SQL Queries
*/

Select *
From PortfolioProject.dbo.Sheet1$

---------------------------------------------------------------------------------------------------


-- Standardize Data Format

ALTER TABLE Sheet1$
Add SaleDateConverted date;

Update Sheet1$
SET SaleDateConverted = CONVERT(date, SaleDate)

Select SaleDateConverted, CONVERT(date, SaleDate)
From PortfolioProject.dbo.Sheet1$

---------------------------------------------------------------------------------------------------


-- Populate Property Address data

Select *
From PortfolioProject.dbo.Sheet1$
--Where PropertyAddress is null
Order by ParcelID


Select a.ParcelID, a.PropertyAddress, b.ParcelID, b.PropertyAddress, ISNULL(a.PropertyAddress, b.PropertyAddress)
From PortfolioProject.dbo.Sheet1$ a
JOIN PortfolioProject.dbo.Sheet1$ b
on a.ParcelID = b.ParcelID
And a.[UniqueID ] <> b.[UniqueID ]
Where a.PropertyAddress is null


Update a
SET PropertyAddress = ISNULL(a.PropertyAddress, 'No Address')
From PortfolioProject.dbo.Sheet1$ a
JOIN PortfolioProject.dbo.Sheet1$ b
on a.ParcelID = b.ParcelID
And a.[UniqueID ] <> b.[UniqueID ]
Where a.PropertyAddress is null


---------------------------------------------------------------------------------------------------

-- Breaking out Address into Individual Column (Address, City, State)

Select PropertyAddress
From PortfolioProject.dbo.Sheet1$


SELECT 
SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress)) as Address
, SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress) + 1, LEN(PropertyAddress)) as Address
From PortfolioProject.dbo.Sheet1$


ALTER TABLE Sheet1$
Add PropertySplitAddress Nvarchar(255);

Update Sheet1$
SET PropertySplitAddress = SUBSTRING(PropertyAddress, 1, ABS(CHARINDEX(',', PropertyAddress) -1))

ALTER TABLE Sheet1$
Add PropertySplitCity Nvarchar(255);

Update Sheet1$
SET PropertySplitCity = SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress) + 1, LEN(PropertyAddress))

SELECT *
From PortfolioProject.dbo.Sheet1$





Select OwnerAddress
From PortfolioProject.dbo.Sheet1$

Select 
PARSENAME(REPLACE(OwnerAddress, ',', '.'), 3),
PARSENAME(REPLACE(OwnerAddress, ',', '.'), 2),
PARSENAME(REPLACE(OwnerAddress, ',', '.'), 1)
From PortfolioProject.dbo.Sheet1$



ALTER TABLE Sheet1$
Add OwnerSplitAddress Nvarchar(255);

Update Sheet1$
SET OwnerSplitAddress = PARSENAME(REPLACE(OwnerAddress, ',', '.'), 3)


ALTER TABLE Sheet1$
Add OwnerSplitCity Nvarchar(255);

Update Sheet1$
SET OwnerSplitCity = PARSENAME(REPLACE(OwnerAddress, ',', '.'), 2)


ALTER TABLE Sheet1$
Add OwnerSplitState Nvarchar(255);

Update Sheet1$
SET OwnerSplitState = PARSENAME(REPLACE(OwnerAddress, ',', '.'), 1)




---------------------------------------------------------------------------------------------------

-- Change Y and N to Yes and No in "Sold as Vacant" field

Select Distinct(SoldAsVacant), Count(SoldAsVacant)
From PortfolioProject.dbo.Sheet1$
Group By SoldAsVacant

Select SoldAsVacant, 
CASE When SoldAsVacant = 'Y' THEN 'Yes' 
When SoldAsVacant = 'N' THEN 'No'
ELSE SoldAsVacant
End
From PortfolioProject.dbo.Sheet1$


Update Sheet1$
SET SoldAsVacant = CASE When SoldAsVacant = 'Y' THEN 'Yes' 
When SoldAsVacant = 'N' THEN 'No'
ELSE SoldAsVacant
End




---------------------------------------------------------------------------------------------------

-- Remove Duplicates

WITH RowNumCTE AS(
Select *,
ROW_NUMBER() OVER (
PARTITION BY ParcelID,
		PropertyAddress,
		SalePrice,
		SaleDate,
		LegalReference
		ORDER BY
		UniqueID
		) row_num

From PortfolioProject.dbo.Sheet1$
--order by ParcelID
)
Select *
From RowNumCTE
Where row_num > 1
Order by PropertyAddress



Select *
From PortfolioProject.dbo.Sheet1$



---------------------------------------------------------------------------------------------------

-- Delete Unused Columns

Select *
From PortfolioProject.dbo.Sheet1$


ALTER TABLE PortfolioProject.dbo.Sheet1$
DROP COLUMN OwnerAddress, TaxDistrict, PropertyAddress, SaleDate