import pandas as pd
df = pd.read_csv('Nashville Housing Data for Data Cleaning.csv')
df.head()
SQL
Saved to variable
df_1SELECT *
FROM df
SQL
Saved to variable
df_1SELECT SaleDate, CAST(SaleDate AS DATETIME) AS SaleDateConverted
FROM df
-- UPDATE df
-- SET SaleDate = CONVERT(Date, SaleDate)
-- -- If it doesn't UPDATE properly
-- -- Create a column first before updating
-- ALTER TABLE df
-- ADD SaleDateConverted Date;
-- UPDATE df
-- SET SaleDateConverted = CONVERT(Date, SaleDate)
-- SELECT SaleDateConverted, SaleDate
-- FROM df
SQL
Saved to variable
df_1SELECT *
FROM df
WHERE ParcelID IN (
SELECT ParcelID
FROM df
WHERE PropertyAddress IS NULL
)
ORDER BY ParcelID, SaleDate
SQL
Saved to variable
df_1SELECT
df_1.UniqueID,
df_1.ParcelID,
df_1.PropertyAddress,
COALESCE(df_1.PropertyAddress, df_2.PropertyAddress) AS fixed_address
FROM df df_1
JOIN df df_2
ON df_1.ParcelID = df_2.ParcelID
AND df_1.UniqueID != df_2.UniqueID
WHERE df_1.ParcelID IN (
SELECT ParcelID
FROM df
WHERE PropertyAddress IS NULL
)
ORDER BY df_1.ParcelID, df_1.PropertyAddress
-- UPDATE df_1
-- SET PropertyAddress = COALESCE(df_1.PropertyAddress,df_2.PropertyAddress)
-- FROM df df_1
-- JOIN df df_2
-- ON df_1.ParcelID = df_2.ParcelID
-- AND df_1.UniqueID <> df_2.UniqueID
-- WHERE df_1.PropertyAddress IS NULL
SQL
Saved to variable
df_1SELECT PropertyAddress
FROM df
--ORDER BY ParcelID
LIMIT 5
SQL
Saved to variable
df_1SELECT
PropertyAddress,
-- SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) - 1) AS Address1
-- , SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress) + 1, LEN(PropertyAddress)) AS Address2
SUBSTR(PropertyAddress, 1, CAST(INSTR(PropertyAddress, ',') - 1 AS INT)) AS Address1,
SUBSTR(PropertyAddress,
CAST(INSTR(PropertyAddress, ',') + 1 AS INT),
CAST(LEN(PropertyAddress) - INSTR(PropertyAddress, ',') AS INT)) AS address2
FROM df
-- ALTER TABLE df
-- ADD PropertySplitAddress Nvarchar(255);
-- UPDATE df
-- SET PropertySplitAddress = SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) - 1)
-- ALTER TABLE df
-- ADD PropertySplitCity Nvarchar(255);
-- UPDATE df
-- SET PropertySplitCity = SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress) + 1, LEN(PropertyAddress))
-- SELECT PropertyAddress, PropertySplitAddress, PropertySplitCity
-- FROM df
SQL
Saved to variable
df_1SELECT OwnerAddress
FROM df
LIMIT 5
SQL
Saved to variable
df_1SELECT
OwnerAddress
,PARSENAME(REPLACE(OwnerAddress, ',', '.') , 3)
,PARSENAME(REPLACE(OwnerAddress, ',', '.') , 2)
,PARSENAME(REPLACE(OwnerAddress, ',', '.') , 1)
FROM df
RuntimeError: Catalog Error: Scalar Function with name parsename does not exist!
Did you mean "dayname"?
LINE 3: ,PARSENAME(REPLACE(...
^
SQL
Saved to variable
df_1SELECT SoldAsVacant, COUNT(SoldAsVacant)
FROM df
GROUP BY SoldAsVacant
ORDER BY 2
SQL
Saved to variable
df_1SELECT
DISTINCT SoldAsVacant,
CASE
WHEN SoldAsVacant = 'Y' THEN 'Yes'
WHEN SoldAsVacant = 'N' THEN 'No'
ELSE SoldAsVacant
END AS fixed_sold
FROM df
-- UPDATE df
-- SET SoldAsVacant =
-- CASE
-- WHEN SoldAsVacant = 'Y' THEN 'Yes'
-- WHEN SoldAsVacant = 'N' THEN 'No'
-- ELSE SoldAsVacant
-- END
SQL
Saved to variable
df_1SELECT UniqueID, COUNT(UniqueID)
FROM df
GROUP BY UniqueID
ORDER BY 2 DESC
SQL
Saved to variable
df_1WITH RowNumCTE AS (
SELECT
*,
ROW_NUMBER() OVER (
PARTITION BY
ParcelID,
PropertyAddress,
SalePrice,
SaleDate,
LegalReference
ORDER BY
UniqueID
) AS row_num
FROM df
)
SELECT
UniqueID,
ParcelID,
LandUse,
PropertyAddress,
row_num
FROM RowNumCTE
WHERE ParcelID IN (
SELECT ParcelID
FROM RowNumCTE
WHERE row_num != 1
)
ORDER BY PropertyAddress;
SQL
Saved to variable
df_1WITH RowCounts AS (
SELECT
ParcelID,
PropertyAddress,
SalePrice,
SaleDate,
LegalReference,
COUNT(*) AS row_count
FROM df
GROUP BY
ParcelID,
PropertyAddress,
SalePrice,
SaleDate,
LegalReference
)
SELECT *
FROM RowCounts
WHERE ParcelID IN (
SELECT ParcelID
FROM RowCounts
WHERE row_count != 1
)
ORDER BY PropertyAddress