# Data 102 HW 1

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats as stats
import seaborn as sns

# Load the data set; column 0 of the CSV is used as the row index.
df = pd.read_csv(filepath_or_buffer='policez.csv', index_col=0)

# Compare the observed values in column 'x' with the theoretical null:
# draw the N(0, 1) density on [-5, 5) and, on the same axes, a
# density-normalised histogram with bin edges every 0.5 from -10 up to 10.
fig, ax = plt.subplots(figsize=[12, 7])

x_axis = np.arange(-5, 5, 0.1)
ax.plot(x_axis, stats.norm.pdf(x_axis, loc=0, scale=1))

ax.hist(x='x', data=df, bins=np.arange(-10, 10, .5), density=True)

plt.show()

# One-sided (upper-tail) p-value of every observation under the N(0, 1) null.
# The survival function evaluates P(Z > x) directly; writing it as
# 1 - cdf(x) cancels catastrophically in the far right tail and rounds to
# exactly 0 there.
df['pvalues'] = scipy.stats.norm.sf(df['x'])
df.head()

# Benjamini-Hochberg (BH) procedure on df['pvalues'] at level 0.2.
bh_alpha = 0.2

# Rank the p-values from smallest (rank 1) to largest (rank m).
df = df.sort_values('pvalues').reset_index(drop=True)
df['rank'] = np.arange(1, len(df) + 1)

# BH critical value for rank k out of m tests: (k / m) * alpha.
df['critval'] = df['rank'] / len(df) * bh_alpha

# kth_pval is the largest p-value that does not exceed its own critical
# value (the BH definition uses <=). Every p-value at or below it is
# rejected, including ones that individually miss their own bound.
meets_bound = df['pvalues'] <= df['critval']
if meets_bound.any():
    kth_pval = df.loc[meets_bound, 'pvalues'].max()
else:
    # No p-value qualifies: reject nothing instead of failing on an
    # empty maximum.
    kth_pval = -np.inf

# 1 = discovery (null rejected), 0 = no discovery.
df['decisions'] = (df['pvalues'] <= kth_pval).astype(int)
df.head()

# Number of rows flagged as discoveries (decisions == 1), shown as the
# value of this expression.
f"{int((df['decisions'] == 1).sum())} discoveries"

# Horizontal strip plot of every p-value, one row per decision value
# (0 = not rejected, 1 = rejected).
sns.stripplot(
    x='pvalues',
    y='decisions',
    data=df,
    orient="h",
    order=[0, 1],
    alpha=0.8,
)

# Report the smallest critical value among the rows that were not rejected.
print(f"decision boundary at {min(df.loc[df['decisions'] == 0, 'critval'])}")

# Fresh copy of the data for the second analysis. The earlier load of this
# same file treats column 0 as the row index; do the same here so that
# column is not read in as an extra data column.
df1 = pd.read_csv('policez.csv', index_col=0)

# Same comparison as for the standard normal, but against a normal density
# with mean 0.1 and standard deviation 1.4, drawn over a density-normalised
# histogram of df1['x'] (bin edges every 0.5 from -10 up to 10).
# NOTE(review): 0.1 / 1.4 look like a hand-fitted empirical null - confirm.
fig, ax = plt.subplots(figsize=[12, 7])

x_axis = np.arange(-5, 5, 0.1)
ax.plot(x_axis, stats.norm.pdf(x_axis, loc=.1, scale=1.4))

ax.hist(x='x', data=df1, bins=np.arange(-10, 10, .5), density=True)

plt.show()

# Benjamini-Hochberg (BH) procedure at level 0.2, with p-values computed
# under a normal null with mean 0.1 and standard deviation 1.4.
bh_alpha = .2

# Upper-tail p-values. The survival function evaluates P(Z > x) directly;
# 1 - cdf(x) cancels catastrophically in the far right tail and rounds to
# exactly 0 there.
df1['pvalues'] = scipy.stats.norm.sf(df1['x'], 0.1, 1.4)

# Rank the p-values from smallest (rank 1) to largest (rank m).
df1 = df1.sort_values('pvalues').reset_index(drop=True)
df1['rank'] = np.arange(1, len(df1) + 1)

# BH critical value for rank k out of m tests: (k / m) * alpha.
df1['critval'] = df1['rank'] / len(df1) * bh_alpha

# kth_pval is the largest p-value that does not exceed its own critical
# value (the BH definition uses <=). Every p-value at or below it is
# rejected, including ones that individually miss their own bound.
meets_bound = df1['pvalues'] <= df1['critval']
if meets_bound.any():
    kth_pval = df1.loc[meets_bound, 'pvalues'].max()
else:
    # No p-value qualifies: reject nothing instead of failing on an
    # empty maximum.
    kth_pval = -np.inf

# 1 = discovery (null rejected), 0 = no discovery.
df1['decisions'] = (df1['pvalues'] <= kth_pval).astype(int)

print(f"{len(df1[df1['decisions'] == 1])} discoveries")

# Horizontal strip plot of every p-value, one row per decision value.
sns.stripplot(
    data=df1,
    x='pvalues',
    y='decisions',
    alpha=0.8,
    order=[0, 1],
    orient="h",
)

# Report the smallest critical value among the rows that were not rejected.
print(f"decision boundary at {min(df1[df1['decisions']==0]['critval'])}")