# Column names for the census income dataset; the CSV files have no header row.
cols = ['age', 'class of worker', 'detailed industry recode', 'detailed occupation recode', 'education',
'wage per hour', 'enroll in edu inst last wk', 'marital stat', 'major industry code',
'major occupation code', 'race', 'hispanic origin', 'sex', 'member of a labor union',
'reason for unemployment', 'full or part time employment stat', 'capital gains', 'capital losses',
'dividends from stocks', 'tax filer stat', 'region of previous residence',
'state of previous residence', 'detailed household and family stat',
'detailed household summary in household', 'instance weight', 'migration code-change in msa',
'migration code-change in reg', 'migration code-move within reg', 'live in this house 1 year ago',
'migration prev res in sunbelt', 'num persons worked for employer', 'family members under 18',
'country of birth father', 'country of birth mother', 'country of birth self', 'citizenship',
'own business or self employed', "fill inc questionnaire for veteran's admin",
'veterans benefits', 'weeks worked in year', 'year', 'annual salary > 50k']
import pandas as pd

# The raw file ships without a header row, so the column names are supplied here.
df = pd.read_csv('/work/census_income_learn.csv', names=cols)
df.head()
df.shape
df.iloc[0]
n_duplicates = df.shape[0] - df.drop_duplicates().shape[0]
print('The number of duplicates')
print(n_duplicates)
print('The percentage of duplicates')
print(n_duplicates / df.shape[0] * 100, '%')
print('shape before removing duplicates')
print(df.shape)
df.drop_duplicates(inplace=True)
print('shape after removing duplicates')
print(df.shape)
df.isna().sum().loc[lambda x: x>0]
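# The isna() check above only finds true NaNs. In this dataset missing values
# are typically encoded as the string '?' (often with surrounding whitespace),
# so -- assuming that convention holds for this file -- a sketch of an extra check:
question_marks = df.select_dtypes('object').apply(lambda s: s.str.strip().eq('?').sum())
print(question_marks.loc[lambda x: x > 0].to_string())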
# Binarise the target: only the over-50k label contains a '+'.
df['annual salary > 50k'] = df['annual salary > 50k'].apply(lambda x: 1 if '+' in x else 0)
print('label count')
print(df['annual salary > 50k'].value_counts())
print('label percentage')
print(df['annual salary > 50k'].value_counts(normalize=True)*100)
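# The class distribution is strongly imbalanced, which motivates the
# under-/over-sampling comparison further down.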
continuous_cols = ['age', 'wage per hour', 'capital gains',
                   'capital losses', 'num persons worked for employer',
                   'dividends from stocks', 'weeks worked in year', 'instance weight']
# 'instance weight' is a survey weight rather than a predictive feature, so the
# [:-1] slices below keep it out of the plots and the correlation matrix.
import plotly.express as px
for col in continuous_cols[:-1]:
    fig = px.histogram(df, x=col, facet_col='annual salary > 50k', width=700, height=350,
                       title=f'The distribution of {col}')
    fig.show()
    print(df.groupby('annual salary > 50k')[col].describe().round(1).to_string())
fig = px.imshow(df[continuous_cols[:-1]].corr(method='spearman').round(2), text_auto=True)
fig.show()
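# A textual companion to the heatmap (a sketch; the 0.5 cut-off is an arbitrary
# choice, not part of the original analysis): list the most correlated pairs.
import numpy as np
corr = df[continuous_cols[:-1]].corr(method='spearman')
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
print(upper.stack().loc[lambda s: s.abs() > 0.5].sort_values(key=abs, ascending=False).to_string())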
selected_continuous_cols = ['age', 'wage per hour', 'capital gains', 'capital losses',
'num persons worked for employer', 'dividends from stocks']
print('selected continuous columns')
print(selected_continuous_cols)
# A list comprehension keeps the original column order (a set difference would
# make the order nondeterministic across runs).
nominal_cols = [c for c in cols if c not in continuous_cols and c != 'annual salary > 50k']
for col in nominal_cols:
    print(f'{col} has {df[col].nunique()} unique values')
    counts = df[col].value_counts()
    percentages = (df[col].value_counts(normalize=True) * 100).round(1)
    print(counts.to_frame('count').join(percentages.to_frame('percentage')).to_string())
    print('--------')
nunique_sorted = df[nominal_cols].nunique().sort_values()
fig = px.bar(x=nunique_sorted.index, y=nunique_sorted.values,
             width=750, height=600,
             title='The number of unique values for each nominal feature',
             labels={'y': 'number of unique values', 'x': 'feature'},
             text_auto=True)
fig.show()
print(nunique_sorted.describe().to_string())
fig = px.box(x=nunique_sorted.values,
             width=700, height=300,
             title='The distribution of the number of unique values for all nominal features')
fig.show()
for col in ['class of worker', 'major industry code', 'major occupation code', 'education', 'sex', 'race']:
    # Share of each category earning above/below 50k: counts per (label, category)
    # divided by the category's total size.
    temp = (df.groupby('annual salary > 50k')[col].value_counts()
            / df.groupby(col)['annual salary > 50k'].size() * 100)
    temp = temp.astype(int).rename('percentage').reset_index()
    temp['annual salary > 50k'] = temp['annual salary > 50k'].astype('str')
    fig = px.bar(temp, x=col, y='percentage', color='annual salary > 50k',
                 width=750, height=500, barmode='group',
                 title=f'The percentage earning over & below 50k for each category of {col}')
    fig.show()
from scipy.stats import chi2_contingency
chi_statistics = []
p_values = []
for col in nominal_cols:
    chi, p, _, _ = chi2_contingency(pd.crosstab(index=df['annual salary > 50k'], columns=df[col]))
    chi_statistics.append(chi)
    p_values.append(p)
temp = pd.DataFrame({'col': nominal_cols, 'chi_statistic': chi_statistics, 'p_value': p_values})
temp.sort_values('chi_statistic', ascending=False, inplace=True)
# Keep the 15 nominal features with the largest chi-square statistics.
selected_nominal_cols = temp['col'].head(15).tolist()
selected_nominal_cols
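# The raw chi-square statistic grows with the contingency-table size, so ranking
# by it favours high-cardinality columns. A hedged alternative (a sketch, not
# part of the original selection) is Cramér's V, which normalises for both
# sample size and table shape:
import numpy as np
def cramers_v(confusion):
    chi, _, _, _ = chi2_contingency(confusion)
    n = confusion.to_numpy().sum()
    r, k = confusion.shape
    return np.sqrt(chi / (n * (min(r, k) - 1)))

print(sorted(((cramers_v(pd.crosstab(df['annual salary > 50k'], df[col])), col)
              for col in nominal_cols), reverse=True)[:15])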
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.tree import DecisionTreeClassifier
# The preprocessing is identical in every pipeline, so build it once per use:
# scale the continuous features and one-hot encode the nominal ones, ignoring
# categories unseen at fit time. Columns not listed -- including the label and
# 'instance weight' -- are dropped by default. Note the pipelines use the full
# nominal_cols set rather than the chi-square selection above.
def make_preprocessing():
    return ColumnTransformer(
        [
            ("standard_scaler", StandardScaler(), selected_continuous_cols),
            ("one_hot_encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), nominal_cols),
        ],
        verbose_feature_names_out=False,
    )

pl_no_sampling = Pipeline(
    [
        ("preprocessing", make_preprocessing()),
        ("classifier", DecisionTreeClassifier(random_state=42)),
    ]
)
pl_under_sampling = Pipeline(
    [
        ("preprocessing", make_preprocessing()),
        ("under_sampling", RandomUnderSampler(random_state=42)),
        ("classifier", DecisionTreeClassifier(random_state=42)),
    ]
)
pl_over_sampling = Pipeline(
    [
        ("preprocessing", make_preprocessing()),
        ("over_sampling", RandomOverSampler(random_state=42)),
        ("classifier", DecisionTreeClassifier(random_state=42)),
    ]
)
from sklearn.model_selection import cross_val_score
import numpy as np
# Passing the full df as X is safe: the ColumnTransformer keeps only the listed
# feature columns and drops everything else, including the label.
print('pipeline no sampling')
f1s = cross_val_score(pl_no_sampling, df, df["annual salary > 50k"], scoring='f1_macro')
print(f'macro f1 score of each fold {f1s}')
print(f'average macro f1 score {np.mean(f1s)}')
print('---')
print('pipeline under sampling')
f1s = cross_val_score(pl_under_sampling, df, df["annual salary > 50k"], scoring='f1_macro')
print(f'macro f1 score of each fold {f1s}')
print(f'average macro f1 score {np.mean(f1s)}')
print('---')
print('pipeline over sampling')
f1s = cross_val_score(pl_over_sampling, df, df["annual salary > 50k"], scoring='f1_macro')
print(f'macro f1 score of each fold {f1s}')
print(f'average macro f1 score {np.mean(f1s)}')
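# A further option in the same spirit (a hedged sketch, not part of the original
# comparison): reweight the classes instead of resampling the rows.
pl_class_weight = Pipeline(
    [
        ("preprocessing", make_preprocessing()),
        ("classifier", DecisionTreeClassifier(class_weight="balanced", random_state=42)),
    ]
)
print('pipeline class weighting')
f1s = cross_val_score(pl_class_weight, df, df["annual salary > 50k"], scoring='f1_macro')
print(f'average macro f1 score {np.mean(f1s)}')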
from sklearn.ensemble import RandomForestClassifier
pl_no_sampling_rf = Pipeline(
    [
        ("preprocessing", make_preprocessing()),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)
print('pipeline no sampling + random forest model')
f1s = cross_val_score(pl_no_sampling_rf, df, df["annual salary > 50k"], scoring='f1_macro')
print(f'macro f1 score of each fold {f1s}')
print(f'average macro f1 score {np.mean(f1s)}')
# Sweep the forest size by updating the pipeline in place; cross_val_score
# clones the estimator, so reusing one pipeline object is safe.
for n in [10, 50, 100, 150, 200, 500]:
    pl_no_sampling_rf.set_params(classifier__n_estimators=n)
    f1s = cross_val_score(pl_no_sampling_rf, df, df["annual salary > 50k"], scoring='f1_macro')
    print(f'n estimators is {n}')
    print(f'average macro f1 score {np.mean(f1s)}')
    print('---')
df_test = pd.read_csv('/work/census_income_test.csv', names=cols)
df_test.shape
from sklearn.metrics import classification_report
# Fit the configuration chosen after the sweep (200 trees) on the full training set.
pl_no_sampling_rf = Pipeline(
    [
        ("preprocessing", make_preprocessing()),
        ("classifier", RandomForestClassifier(n_estimators=200, random_state=42)),
    ]
)
pl_no_sampling_rf.fit(df, df['annual salary > 50k'])
y_pred = pl_no_sampling_rf.predict(df_test)
# Apply the same label mapping to the test labels as was applied to training.
y_true = df_test['annual salary > 50k'].apply(lambda x: 1 if '+' in x else 0)
print(classification_report(y_true, y_pred))
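# A confusion matrix complements the report (an extra, not in the original code):
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_true, y_pred))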
from sklearn.metrics import PrecisionRecallDisplay
PrecisionRecallDisplay.from_estimator(pl_no_sampling_rf, df_test, y_true)
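# When run as a plain script rather than in a notebook, the precision-recall
# display needs an explicit matplotlib show:
import matplotlib.pyplot as plt
plt.show()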