import pandas as pd
import numpy as np
from datascience import *
# Import the dataset: read "census_india_2011.csv" (resolved relative to the
# current working directory) into a pandas DataFrame.
india_census_df = pd.read_csv("census_india_2011.csv")
# A bare expression as the last line of a notebook cell displays its value;
# NOTE(review): this file looks like a notebook export — run as a plain script
# this line has no visible effect.
india_census_df
District codeint64
1 - 640
State nameobject
UTTAR PRADESH11.1%
MADHYA PRADESH7.8%
33 others81.1%
0
1
JAMMU AND KASHMIR
1
2
JAMMU AND KASHMIR
2
3
JAMMU AND KASHMIR
3
4
JAMMU AND KASHMIR
4
5
JAMMU AND KASHMIR
5
6
JAMMU AND KASHMIR
6
7
JAMMU AND KASHMIR
7
8
JAMMU AND KASHMIR
8
9
JAMMU AND KASHMIR
9
10
JAMMU AND KASHMIR
# QUESTION 1: How can we find the shape of the dataset?
# DataFrame.shape is an attribute (not a method) holding the tuple
# (number of rows, number of columns).
india_census_df.shape
# QUESTION 3: Filter out the outliers from the "Population" column.
# Hint: a value is an outlier when it lies below Q1 - (1.5 * IQR) or above
# Q3 + (1.5 * IQR), where Q1 and Q3 are the 25th and 75th percentiles of the
# column and IQR = Q3 - Q1.
population = india_census_df["Population"]
# Series.quantile skips missing values, whereas np.percentile returns NaN as
# soon as one entry is NaN (which would make every comparison below False and
# silently produce an empty result). Both use linear interpolation by default.
Q1 = population.quantile(0.25)
Q3 = population.quantile(0.75)
IQR = Q3 - Q1
low_bound = Q1 - (1.5 * IQR)
up_bound = Q3 + (1.5 * IQR)
# Keep only the rows whose population lies within the fences (both ends
# inclusive). Despite its name, `out` holds the data with outliers removed.
out = india_census_df[population.between(low_bound, up_bound)]
out
District codeint64
1 - 640
State nameobject
UTTAR PRADESH11.2%
MADHYA PRADESH8%
33 others80.7%
131
132
UTTAR PRADESH
132
133
UTTAR PRADESH
133
134
UTTAR PRADESH
134
135
UTTAR PRADESH
135
136
UTTAR PRADESH
136
137
UTTAR PRADESH
137
138
UTTAR PRADESH
138
139
UTTAR PRADESH
139
140
UTTAR PRADESH
140
141
UTTAR PRADESH
# QUESTION 4: Check to see if there are any duplicates in the "District name" column.
# Return a Boolean.
# Hint: Series.is_unique is an attribute (no call parentheses). It is True
# when every value in the column is distinct, i.e. when there are NO
# duplicates, and False as soon as any value repeats.
india_census_df["District name"].is_unique