Structured data analysis 2: pandering

# Don't change this cell; just run it. import numpy as np import pandas as pd # Safe settings for Pandas. pd.set_option('mode.chained_assignment', 'raise') %matplotlib inline import matplotlib.pyplot as plt plt.style.use('fivethirtyeight') from client.api.notebook import Notebook ok = Notebook('pandering.ok')

# Read the unemployment table from the CSV file in the working directory.
unemployment = pd.read_csv("unemployment.csv")
# Leaving the frame as the last expression makes the notebook display it.
unemployment

_ = ok.grade('q1_1')

# Two orderings of the same table, largest values first: one keyed on the
# NEI column and one keyed on the NEI-PTER column.
by_nei = unemployment.sort_values(by="NEI", ascending=False)
by_nei_pter = unemployment.sort_values(by="NEI-PTER", ascending=False)

_ = ok.grade('q1_2')

# by_nei is ordered from the largest NEI downwards, so its first ten rows
# are the ten rows with the greatest NEI.
greatest_nei = by_nei.iloc[:10]
greatest_nei

_ = ok.grade('q1_3')

# PTER is the gap between the NEI-PTER and NEI columns, computed row by row.
# The result is a Series aligned with the rows of `unemployment`; no empty
# DataFrame needs to be created first.
pter = unemployment["NEI-PTER"] - unemployment["NEI"]
# Show the first five values.
pter.head()

_ = ok.grade('q1_4')

# Attach the PTER series to the table as a new column, then order the rows
# from the largest PTER value to the smallest.
unemployment["PTER"] = pter
by_pter = unemployment.sort_values(by="PTER", ascending=False)
# Show the first five values.
by_pter.head()

_ = ok.grade('q1_5')

# Date and PTER columns only, re-ordered chronologically for plotting and
# for the period averages below.
highpter = by_pter[["Date", "PTER"]].copy()
by_date = highpter.sort_values("Date", ascending=True)
by_date.plot(x="Date", xticks=[0, 15, 30, 45, 60, 75])

# Positions 56-67 of the date-ordered table are the twelve rows that this
# analysis reports as the 2008-2010 period.
mean_pter_2008_2010 = by_date["PTER"].iloc[56:68].mean()
print("The mean percentage of people who were PTER between 2008-2010 was", round(mean_pter_2008_2010, ndigits=2))

# NOTE(review): twelve rows for three years implies one row per quarter, so
# the twelve rows immediately before position 56 are taken as 2005-2007.
# Confirm against the Date column.
mean_pter_2005_2007 = by_date["PTER"].iloc[44:56].mean()

# A percentage increase is measured against the earlier (baseline) period.
difference = mean_pter_2008_2010 - mean_pter_2005_2007
difference_of_percent = (difference / mean_pter_2005_2007) * 100
print("Meaning that PTER increased by", round(difference_of_percent, ndigits=2), "percent in the period 2008-2010 compared to 2005-2007")

# Don't change this cell; just run it.
# From http://www2.census.gov/programs-surveys/popest/datasets/2010-2016/national/totals/nst-est2016-alldata.csv
# See https://www2.census.gov/programs-surveys/popest/datasets/2010-2016/national/totals/nst-est2016-alldata.pdf
# for column descriptions.

# Load the raw census table.
raw_pop = pd.read_csv('nst-est2016-alldata.csv')

# Keep only the rows whose summary level (SUMLEV) is 40, i.e. the
# US-state-level summaries.
is_state_summary = raw_pop['SUMLEV'] == 40
pop = raw_pop[is_state_summary]

# Columns of interest, paired position-by-position with the shorter names
# they are given below.
good_cols = ['REGION', 'NAME', 'POPESTIMATE2015', 'POPESTIMATE2016',
             'BIRTHS2016', 'DEATHS2016', 'NETMIG2016', 'RESIDUAL2016']
good_names = ['REGION', 'NAME', '2015', '2016',
              'BIRTHS', 'DEATHS', 'MIGRATION', 'OTHER']

# Narrow the table to those columns, then rename them.
pop = pop.loc[:, good_cols]
pop.columns = good_names

# Show the first five rows.
pop.head()

# National totals, summed with the vectorised Series.sum() rather than the
# Python builtin, which would iterate the column element by element.
total_us_population = pop["2015"].sum()
total_births = pop["BIRTHS"].sum()

# Births as a fraction of the 2015 population.
us_birth_rate = total_births / total_us_population
us_birth_rate

_ = ok.grade('q2_1')

# Growth rate per row: the 2015 -> 2016 change as a fraction of the 2015
# population. 2015 is the base year, matching the other rates computed from
# this table. Computed on whole columns, so it works for any number of rows.
pop_growth = ((pop["2016"] - pop["2015"]) / pop["2015"]).values
pop["GROWTH RATE"] = pop_growth

# Order rows from fastest to slowest growth and keep the names of the top five.
growth = pop.sort_values("GROWTH RATE", ascending=False)
fastest_growth = growth["NAME"].head(5)
fastest_growth

_ = ok.grade('q2_2')

# Net migration as a percentage of the 2015 population, one value per row.
# Computed on whole columns so that no row count is hard-coded.
annual_rate_migration = ((pop["MIGRATION"] / pop["2015"]) * 100).values

# Number of rows whose migration rate exceeds 1 percent.
n_movers = np.count_nonzero(annual_rate_migration > 1.0)
n_movers

_ = ok.grade('q2_3')

# Boolean mask of the rows in region 4.
# NOTE(review): the comparison is against the string "4", so REGION is
# expected to hold strings rather than integers; confirm the column dtype.
region_4 = pop["REGION"] == "4"

# Births in those rows, totalled with the vectorised Series.sum().
west_births = pop.loc[region_4, "BIRTHS"]
n_west_births = west_births.sum()
n_west_births

_ = ok.grade('q2_4')

# Flag each row whose 2016 population is below the region-4 birth total,
# then count the flagged rows.
smaller_than_west_births = pop["2016"] < n_west_births
n_less_than_west_births = np.count_nonzero(smaller_than_west_births)
n_less_than_west_births

_ = ok.grade('q2_5')

#- Generate a chart here to support your conclusion

# Births and deaths as fractions of the 2015 population, plus their ratio.
birth_rate = pop["BIRTHS"].div(pop["2015"])
death_rate = pop["DEATHS"].div(pop["2015"])
association = birth_rate.div(death_rate)

# Store all three as columns so the plot can refer to them by name.
pop["BIRTH RATE"] = birth_rate
pop["DEATH RATE"] = death_rate
pop["BIRTH/DEATH RATE"] = association

# One point per row: birth rate on the x-axis against death rate on the y-axis.
pop.plot(kind="scatter", x="BIRTH RATE", y="DEATH RATE")

# Just run this cell.
# Read the complaints table from the CSV file and preview its first five rows.
complaints = pd.read_csv("complaints.csv")
complaints.head(5)

# Number of complaint rows for each distinct value of the "product" column.
complaints_per_product = complaints["product"].value_counts()
complaints_per_product

_ = ok.grade('q3_1')

# Sort the per-product counts in ascending order before drawing them as a
# horizontal bar chart.
complaints_per_product_p = complaints_per_product.sort_values(ascending=True)
product_axes = complaints_per_product_p.plot(kind="barh")
product_axes.set_xlabel("complaints")

# Number of complaint rows for each distinct value of the "company" column.
complaints_per_company = complaints["company"].value_counts()
complaints_per_company

_ = ok.grade('q3_3')

# Take the first ten entries of the per-company counts, then re-sort them in
# ascending order for the horizontal bar chart.
first_ten_companies = complaints_per_company.head(10)
complaints_per_company_plot = first_ten_companies.sort_values(ascending=True)
company_axes = complaints_per_company_plot.plot(kind="barh")
company_axes.set_xlabel("complaints")

# Each company's share of all complaints. The grand total uses the
# vectorised Series.sum() rather than the Python builtin.
total_complaints = complaints_per_company.sum()
complaint_shares = complaints_per_company / total_complaints

# Keep the first ten entries and re-sort them in ascending order for the
# horizontal bar chart.
proportion_complaints_per_company = complaint_shares.head(10).sort_values(ascending=True)
proportion_complaints_per_company.plot(kind="barh").set_xlabel("proportion of total complaints")

# For your convenience, you can run this cell to run all the tests at once!
import os

# Grade every test file in the "tests" directory whose name starts with "q".
# - os.listdir returns names in arbitrary order, so they are sorted to make
#   the grading order reproducible.
# - Only ".py" files are considered, because the question name is obtained
#   by stripping that three-character suffix.
for test_file in sorted(os.listdir("tests")):
    if test_file.startswith('q') and test_file.endswith('.py'):
        _ = ok.grade(test_file[:-3])