# [DSS DeCal] [FA '22] Mini-Lecture 3: Data Cleaning

import numpy as np
import pandas as pd
from datascience import *

# Importing the dataset: read the CSV file into a pandas DataFrame.
india_census_df = pd.read_csv("census_india_2011.csv")

# Bare expression: displays the DataFrame when run as a notebook cell.
india_census_df

# QUESTION 1: How can we find the shape of the dataset?
# `.shape` is a (number of rows, number of columns) tuple.
india_census_df.shape

# QUESTION 3: Filter out the outliers from the "Population" column.
# Hint: a value is an outlier when it lies below Q1 - (1.5 * IQR) or above
# Q3 + (1.5 * IQR), where IQR = Q3 - Q1.
Q3 = np.percentile(india_census_df["Population"], 75)
Q1 = np.percentile(india_census_df["Population"], 25)
IQR = Q3 - Q1
up_bound = Q3 + (1.5 * IQR)
low_bound = Q1 - (1.5 * IQR)

# Keep only the rows whose population lies within the bounds (inclusive on
# both ends). Despite its name, `out` holds the rows that are NOT outliers.
in_range = india_census_df["Population"].between(low_bound, up_bound)
out = india_census_df[in_range]
out

# QUESTION 4: Check to see if there are any duplicates in the "District name" column.
# Return a Boolean.
# Hint: check out the pandas `Series.is_unique` property! It is True when
# every value in the column is distinct (i.e. there are no duplicates) and
# False when at least one value is repeated.
india_census_df["District name"].is_unique