# Column names for the census income dataset; the CSV files have no header row.
cols = ['age', 'class of worker', 'detailed industry recode', 'detailed occupation recode', 'education',
'wage per hour', 'enroll in edu inst last wk', 'marital stat', 'major industry code',
'major occupation code', 'race', 'hispanic origin', 'sex', 'member of a labor union',
'reason for unemployment', 'full or part time employment stat', 'capital gains', 'capital losses',
'dividends from stocks', 'tax filer stat', 'region of previous residence',
'state of previous residence', 'detailed household and family stat',
'detailed household summary in household', 'instance weight', 'migration code-change in msa',
'migration code-change in reg', 'migration code-move within reg', 'live in this house 1 year ago',
'migration prev res in sunbelt', 'num persons worked for employer', 'family members under 18',
'country of birth father', 'country of birth mother', 'country of birth self', 'citizenship',
'own business or self employed', "fill inc questionnaire for veteran's admin",
'veterans benefits', 'weeks worked in year', 'year', 'annual salary > 50k']
import pandas as pd

# The raw file ships without a header row, so the column names are supplied here.
df = pd.read_csv('/work/census_income_learn.csv', names=cols)
df.head()
df.shape
df.iloc[0]
n_duplicates = df.shape[0] - df.drop_duplicates().shape[0]
print('The number of duplicates')
print(n_duplicates)
print('The percentage of duplicates')
print(n_duplicates / df.shape[0] * 100, '%')
print('shape before removing duplicates')
print(df.shape)
df.drop_duplicates(inplace=True)
print('shape after removing duplicates')
print(df.shape)
df.isna().sum().loc[lambda x: x>0]
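# The isna() check above only finds true NaNs. In this dataset missing values
# are typically encoded as the string '?' (often with surrounding whitespace),
# so -- assuming that convention holds for this file -- a sketch of an extra check:
question_marks = df.select_dtypes('object').apply(lambda s: s.str.strip().eq('?').sum())
print(question_marks.loc[lambda x: x > 0].to_string())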
# Binarise the target: only the over-50k label contains a '+'.
df['annual salary > 50k'] = df['annual salary > 50k'].apply(lambda x: 1 if '+' in x else 0)
print('label count')
print(df['annual salary > 50k'].value_counts())
print('label percentage')
print(df['annual salary > 50k'].value_counts(normalize=True)*100)
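# The class distribution is strongly imbalanced, which motivates the
# under-/over-sampling comparison further down.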
continuous_cols = ['age', 'wage per hour', 'capital gains',
                   'capital losses', 'num persons worked for employer',
                   'dividends from stocks', 'weeks worked in year', 'instance weight']
# 'instance weight' is a survey weight rather than a predictive feature, so the
# [:-1] slices below keep it out of the plots and the correlation matrix.
import plotly.express as px
for col in continuous_cols[:-1]:
    fig = px.histogram(df, x=col, facet_col='annual salary > 50k', width=700, height=350,
                       title=f'The distribution of {col}')
    fig.show()
    print(df.groupby('annual salary > 50k')[col].describe().round(1).to_string())
fig = px.imshow(df[continuous_cols[:-1]].corr(method='spearman').round(2), text_auto=True)
fig.show()
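# A textual companion to the heatmap (a sketch; the 0.5 cut-off is an arbitrary
# choice, not part of the original analysis): list the most correlated pairs.
import numpy as np
corr = df[continuous_cols[:-1]].corr(method='spearman')
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
print(upper.stack().loc[lambda s: s.abs() > 0.5].sort_values(key=abs, ascending=False).to_string())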
selected_continuous_cols = ['age', 'wage per hour', 'capital gains', 'capital losses',
'num persons worked for employer', 'dividends from stocks']
print('selected continuous columns')
print(selected_continuous_cols)
# A list comprehension keeps the original column order (a set difference would
# make the order nondeterministic across runs).
nominal_cols = [c for c in cols if c not in continuous_cols and c != 'annual salary > 50k']
for col in nominal_cols:
    print(f'{col} has {df[col].nunique()} unique values')
    counts = df[col].value_counts()
    percentages = (df[col].value_counts(normalize=True) * 100).round(1)
    print(counts.to_frame('count').join(percentages.to_frame('percentage')).to_string())
    print('--------')
nunique_sorted = df[nominal_cols].nunique().sort_values()
fig = px.bar(x=nunique_sorted.index, y=nunique_sorted.values,
             width=750, height=600,
             title='The number of unique values for each nominal feature',
             labels={'y': 'number of unique values', 'x': 'feature'},
             text_auto=True)
fig.show()
print(nunique_sorted.describe().to_string())
fig = px.box(x=nunique_sorted.values,
             width=700, height=300,
             title='The distribution of the number of unique values for all nominal features')
fig.show()
for col in ['class of worker', 'major industry code', 'major occupation code', 'education', 'sex', 'race']:
    # Share of each category earning above/below 50k: counts per (label, category)
    # divided by the category's total size.
    temp = (df.groupby('annual salary > 50k')[col].value_counts()
            / df.groupby(col)['annual salary > 50k'].size() * 100)
    temp = temp.astype(int).rename('percentage').reset_index()
    temp['annual salary > 50k'] = temp['annual salary > 50k'].astype('str')
    fig = px.bar(temp, x=col, y='percentage', color='annual salary > 50k',
                 width=750, height=500, barmode='group',
                 title=f'The percentage earning over & below 50k for each category of {col}')
    fig.show()
from scipy.stats import chi2_contingency
chi_statistics = []
p_values = []
for col in nominal_cols:
    chi, p, _, _ = chi2_contingency(pd.crosstab(index=df['annual salary > 50k'], columns=df[col]))
    chi_statistics.append(chi)
    p_values.append(p)
temp = pd.DataFrame({'col': nominal_cols, 'chi_statistic': chi_statistics, 'p_value': p_values})
temp.sort_values('chi_statistic', ascending=False, inplace=True)
# Keep the 15 nominal features with the largest chi-square statistics.
selected_nominal_cols = temp['col'].head(15).tolist()
selected_nominal_cols
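# The raw chi-square statistic grows with the contingency-table size, so ranking
# by it favours high-cardinality columns. A hedged alternative (a sketch, not
# part of the original selection) is Cramér's V, which normalises for both
# sample size and table shape:
import numpy as np
def cramers_v(confusion):
    chi, _, _, _ = chi2_contingency(confusion)
    n = confusion.to_numpy().sum()
    r, k = confusion.shape
    return np.sqrt(chi / (n * (min(r, k) - 1)))

print(sorted(((cramers_v(pd.crosstab(df['annual salary > 50k'], df[col])), col)
              for col in nominal_cols), reverse=True)[:15])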
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.tree import DecisionTreeClassifier
# The preprocessing is identical in every pipeline, so build it once per use:
# scale the continuous features and one-hot encode the nominal ones, ignoring
# categories unseen at fit time. Columns not listed -- including the label and
# 'instance weight' -- are dropped by default. Note the pipelines use the full
# nominal_cols set rather than the chi-square selection above.
def make_preprocessing():
    return ColumnTransformer(
        [
            ("standard_scaler", StandardScaler(), selected_continuous_cols),
            ("one_hot_encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), nominal_cols),
        ],
        verbose_feature_names_out=False,
    )

pl_no_sampling = Pipeline(
    [
        ("preprocessing", make_preprocessing()),
        ("classifier", DecisionTreeClassifier(random_state=42)),
    ]
)
pl_under_sampling = Pipeline(
    [
        ("preprocessing", make_preprocessing()),
        ("under_sampling", RandomUnderSampler(random_state=42)),
        ("classifier", DecisionTreeClassifier(random_state=42)),
    ]
)
pl_over_sampling = Pipeline(
    [
        ("preprocessing", make_preprocessing()),
        ("over_sampling", RandomOverSampler(random_state=42)),
        ("classifier", DecisionTreeClassifier(random_state=42)),
    ]
)
from sklearn.model_selection import cross_val_score
import numpy as np
# Passing the full df as X is safe: the ColumnTransformer keeps only the listed
# feature columns and drops everything else, including the label.
print('pipeline no sampling')
f1s = cross_val_score(pl_no_sampling, df, df["annual salary > 50k"], scoring='f1_macro')
print(f'macro f1 score of each fold {f1s}')
print(f'average macro f1 score {np.mean(f1s)}')
print('---')
print('pipeline under sampling')
f1s = cross_val_score(pl_under_sampling, df, df["annual salary > 50k"], scoring='f1_macro')
print(f'macro f1 score of each fold {f1s}')
print(f'average macro f1 score {np.mean(f1s)}')
print('---')
print('pipeline over sampling')
f1s = cross_val_score(pl_over_sampling, df, df["annual salary > 50k"], scoring='f1_macro')
print(f'macro f1 score of each fold {f1s}')
print(f'average macro f1 score {np.mean(f1s)}')
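# A further option in the same spirit (a hedged sketch, not part of the original
# comparison): reweight the classes instead of resampling the rows.
pl_class_weight = Pipeline(
    [
        ("preprocessing", make_preprocessing()),
        ("classifier", DecisionTreeClassifier(class_weight="balanced", random_state=42)),
    ]
)
print('pipeline class weighting')
f1s = cross_val_score(pl_class_weight, df, df["annual salary > 50k"], scoring='f1_macro')
print(f'average macro f1 score {np.mean(f1s)}')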
from sklearn.ensemble import RandomForestClassifier
pl_no_sampling_rf = Pipeline(
    [
        ("preprocessing", make_preprocessing()),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)
print('pipeline no sampling + random forest model')
f1s = cross_val_score(pl_no_sampling_rf, df, df["annual salary > 50k"], scoring='f1_macro')
print(f'macro f1 score of each fold {f1s}')
print(f'average macro f1 score {np.mean(f1s)}')
# Sweep the forest size by updating the pipeline in place; cross_val_score
# clones the estimator, so reusing one pipeline object is safe.
for n in [10, 50, 100, 150, 200, 500]:
    pl_no_sampling_rf.set_params(classifier__n_estimators=n)
    f1s = cross_val_score(pl_no_sampling_rf, df, df["annual salary > 50k"], scoring='f1_macro')
    print(f'n estimators is {n}')
    print(f'average macro f1 score {np.mean(f1s)}')
    print('---')
df_test = pd.read_csv('/work/census_income_test.csv', names=cols)
df_test.shape
from sklearn.metrics import classification_report
# Fit the configuration chosen after the sweep (200 trees) on the full training set.
pl_no_sampling_rf = Pipeline(
    [
        ("preprocessing", make_preprocessing()),
        ("classifier", RandomForestClassifier(n_estimators=200, random_state=42)),
    ]
)
pl_no_sampling_rf.fit(df, df['annual salary > 50k'])
y_pred = pl_no_sampling_rf.predict(df_test)
# Apply the same label mapping to the test labels as was applied to training.
y_true = df_test['annual salary > 50k'].apply(lambda x: 1 if '+' in x else 0)
print(classification_report(y_true, y_pred))
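# A confusion matrix complements the report (an extra, not in the original code):
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_true, y_pred))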
from sklearn.metrics import PrecisionRecallDisplay
PrecisionRecallDisplay.from_estimator(pl_no_sampling_rf, df_test, y_true)
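# When run as a plain script rather than in a notebook, the precision-recall
# display needs an explicit matplotlib show:
import matplotlib.pyplot as plt
plt.show()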