import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates
%matplotlib inline
# Tokens in the raw file that should be read as missing values (NaN).
missing_values = [" ?"]

# Read the file without a header row and name the fifteen columns directly.
data = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    header=None,
    names=[
        'Age', 'Work-Class', 'Fnlwgt', 'Education', 'Education-Num',
        'Marital-Status', 'Occupation', 'Relationship', 'Race', 'Sex',
        'Capital-Gain', 'Capital-Loss', 'Hours-Per-Week', 'Native-Country',
        'Num',
    ],
    na_values=missing_values,
)

# Bare expression: displays the frame when this runs as a notebook cell.
data
# Columns summarised with numeric statistics below.
continousCol = ['Age', "Fnlwgt", 'Education-Num', 'Capital-Gain', 'Capital-Loss', 'Hours-Per-Week']


def applyFunc(f):
    """Return a list of ``(column, f(data[column]))`` pairs.

    ``f`` is called once per entry of ``continousCol`` with that column of
    ``data`` as its only argument.
    """
    return [(name, f(data[name])) for name in continousCol]


# Print a heading followed by the per-column result for each statistic.
for heading, statistic in (
    ('Average', np.average),
    ('Standard Deviation', np.std),
    ('Min Value', np.min),
    ('Max Value', np.max),
):
    print(heading)
    print(applyFunc(statistic))
Average
[('Age', 38.58164675532078), ('Fnlwgt', 189778.36651208502), ('Education-Num', 10.0806793403151), ('Capital-Gain', 1077.6488437087312), ('Capital-Loss', 87.303829734959), ('Hours-Per-Week', 40.437455852092995)]
Standard Deviation
[('Age', 13.640223092304275), ('Fnlwgt', 105548.3568808908), ('Education-Num', 2.5726808256012865), ('Capital-Gain', 7385.178676947626), ('Capital-Loss', 402.9540308274866), ('Hours-Per-Week', 12.34723907570799)]
Min Value
[('Age', 17), ('Fnlwgt', 12285), ('Education-Num', 1), ('Capital-Gain', 0), ('Capital-Loss', 0), ('Hours-Per-Week', 1)]
Max Value
[('Age', 90), ('Fnlwgt', 1484705), ('Education-Num', 16), ('Capital-Gain', 99999), ('Capital-Loss', 4356), ('Hours-Per-Week', 99)]
def counts(index):
    """Tally how often each value occurs in ``data[index]``.

    Returns a dict mapping every value seen in that column to its number of
    occurrences; keys appear in the order the values are first encountered.
    Missing values are tallied like any other value.
    """
    tally = {}
    for value in data[index]:
        if value in tally:
            tally[value] += 1
        else:
            tally[value] = 1
    return tally
# Categorical columns whose value frequencies are tallied and printed.
discreteCol = ["Work-Class", "Education", "Marital-Status", "Occupation", "Relationship", "Race", "Sex", "Native-Country"]

# Maps each categorical column to its most frequent value.
# NOTE(review): counts() also tallies missing values, so a column dominated by
# NaN would get NaN recorded here - confirm that cannot happen for this data.
commons = {}
for column in discreteCol:
    tally = counts(column)
    print(f"{column}: {tally}")
    commons[column] = max(tally, key=tally.get)
Work-Class: {' State-gov': 1298, ' Self-emp-not-inc': 2541, ' Private': 22696, ' Federal-gov': 960, ' Local-gov': 2093, nan: 1836, ' Self-emp-inc': 1116, ' Without-pay': 14, ' Never-worked': 7}
Education: {' Bachelors': 5355, ' HS-grad': 10501, ' 11th': 1175, ' Masters': 1723, ' 9th': 514, ' Some-college': 7291, ' Assoc-acdm': 1067, ' Assoc-voc': 1382, ' 7th-8th': 646, ' Doctorate': 413, ' Prof-school': 576, ' 5th-6th': 333, ' 10th': 933, ' 1st-4th': 168, ' Preschool': 51, ' 12th': 433}
Marital-Status: {' Never-married': 10683, ' Married-civ-spouse': 14976, ' Divorced': 4443, ' Married-spouse-absent': 418, ' Separated': 1025, ' Married-AF-spouse': 23, ' Widowed': 993}
Occupation: {' Adm-clerical': 3770, ' Exec-managerial': 4066, ' Handlers-cleaners': 1370, ' Prof-specialty': 4140, ' Other-service': 3295, ' Sales': 3650, ' Craft-repair': 4099, ' Transport-moving': 1597, ' Farming-fishing': 994, ' Machine-op-inspct': 2002, ' Tech-support': 928, nan: 1843, ' Protective-serv': 649, ' Armed-Forces': 9, ' Priv-house-serv': 149}
Relationship: {' Not-in-family': 8305, ' Husband': 13193, ' Wife': 1568, ' Own-child': 5068, ' Unmarried': 3446, ' Other-relative': 981}
Race: {' White': 27816, ' Black': 3124, ' Asian-Pac-Islander': 1039, ' Amer-Indian-Eskimo': 311, ' Other': 271}
Sex: {' Male': 21790, ' Female': 10771}
Native-Country: {' United-States': 29170, ' Cuba': 95, ' Jamaica': 81, ' India': 100, nan: 583, ' Mexico': 643, ' South': 80, ' Puerto-Rico': 114, ' Honduras': 13, ' England': 90, ' Canada': 121, ' Germany': 137, ' Iran': 43, ' Philippines': 198, ' Italy': 73, ' Poland': 60, ' Columbia': 59, ' Cambodia': 19, ' Thailand': 18, ' Ecuador': 28, ' Laos': 18, ' Taiwan': 51, ' Haiti': 44, ' Portugal': 37, ' Dominican-Republic': 70, ' El-Salvador': 106, ' France': 29, ' Guatemala': 64, ' China': 75, ' Japan': 62, ' Yugoslavia': 16, ' Peru': 31, ' Outlying-US(Guam-USVI-etc)': 14, ' Scotland': 12, ' Trinadad&Tobago': 19, ' Greece': 29, ' Nicaragua': 34, ' Vietnam': 67, ' Hong': 20, ' Ireland': 24, ' Hungary': 13, ' Holand-Netherlands': 1}
# One bar chart of value frequencies per categorical column, on a two-row grid.
# The column count is derived once (rounded up so an odd number of columns
# still fits) and the panels are walked in row-major order, so the layout no
# longer relies on a hard-coded width of four.
n_cols = (len(discreteCol) + 1) // 2
# squeeze=False keeps `axes` two-dimensional even for a single-column grid.
fig, axes = plt.subplots(nrows=2, ncols=n_cols, figsize=(25, 10), squeeze=False)
for ax, column in zip(axes.ravel(), discreteCol):
    data[column].value_counts().plot(kind='bar', ax=ax)
    # Label each panel so the charts can be told apart.
    ax.set_title(column)
# One histogram per numeric column, on a two-row grid.
# The column count is derived once (rounded up so an odd number of columns
# still fits) and the panels are walked in row-major order, so the layout no
# longer relies on a hard-coded width of three.
n_cols = (len(continousCol) + 1) // 2
# squeeze=False keeps `axes` two-dimensional even for a single-column grid.
fig, axes = plt.subplots(nrows=2, ncols=n_cols, figsize=(25, 10), squeeze=False)
for ax, column in zip(axes.ravel(), continousCol):
    ax.hist(data[column])
    # Label each panel so the histograms can be told apart.
    ax.set_title(column)
# Two side-by-side scatter plots with 'Age' on the x-axis:
# 'Capital-Gain' on the left panel and 'Capital-Loss' on the right.
fig, a = plt.subplots(1, 2, figsize=(25, 10))
for panel, y_column in zip(a, ('Capital-Gain', 'Capital-Loss')):
    panel.scatter(data['Age'], data[y_column])
plt.show()
def plotParallelCoordinates(col):
    """Draw a parallel-coordinates plot of ``data`` with ``col`` as class column.

    Every column in ``discreteCol`` other than ``col``, together with "Num"
    and "Fnlwgt", is dropped before plotting. Returns whatever
    ``parallel_coordinates`` returns.
    """
    unwanted = [name for name in discreteCol + ["Num", "Fnlwgt"] if name != col]
    trimmed = data.drop(columns=unwanted)
    return parallel_coordinates(trimmed, col)
# Parallel-coordinates plots, each using one categorical column as the class
# column.
plotParallelCoordinates("Marital-Status")
plotParallelCoordinates("Education")
# NOTE(review): "Relationship" is plotted twice in a row - possibly an
# unintended repeat of the same call; confirm before removing either one.
plotParallelCoordinates("Relationship")
plotParallelCoordinates("Relationship")
# Build a copy of the data set without the "Num" column in which missing
# categorical entries are filled with each column's most common value.
# read_csv already maps ' ?' to NaN through na_values; the replace() is kept
# as a harmless safety net. np.nan is used because the np.NaN alias no longer
# exists in NumPy 2.0.
data2 = data.replace(' ?', np.nan).drop("Num", axis=1)
for x in discreteCol:
    # Assign the whole column back instead of chained indexing
    # (data2[x][mask] = ...): the chained form triggers SettingWithCopyWarning
    # and writes to a temporary copy under pandas Copy-on-Write.
    data2[x] = data2[x].fillna(commons[x])
# Report, per column, whether any missing value remains.
print(data2.isna().any())
Age False
Work-Class False
Fnlwgt False
Education False
Education-Num False
Marital-Status False
Occupation False
Relationship False
Race False
Sex False
Capital-Gain False
Capital-Loss False
Hours-Per-Week False
Native-Country False
dtype: bool
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
This is separate from the ipykernel package so we can avoid doing imports until
# Bar charts of value frequencies for each categorical column of `data`,
# on a two-row grid.
# The column count is derived once (rounded up so an odd number of columns
# still fits) and the panels are walked in row-major order, so the layout no
# longer relies on a hard-coded width of four.
n_cols = (len(discreteCol) + 1) // 2
# squeeze=False keeps `axes` two-dimensional even for a single-column grid.
fig, axes = plt.subplots(nrows=2, ncols=n_cols, figsize=(25, 10), squeeze=False)
for ax, column in zip(axes.ravel(), discreteCol):
    data[column].value_counts().plot(kind='bar', ax=ax)
    # Label each panel so the charts can be told apart.
    ax.set_title(column)
# Bar charts of value frequencies for each categorical column of `data2`,
# on a two-row grid.
# The column count is derived once (rounded up so an odd number of columns
# still fits) and the panels are walked in row-major order, so the layout no
# longer relies on a hard-coded width of four.
n_cols = (len(discreteCol) + 1) // 2
# squeeze=False keeps `axes` two-dimensional even for a single-column grid.
fig, axes = plt.subplots(nrows=2, ncols=n_cols, figsize=(25, 10), squeeze=False)
for ax, column in zip(axes.ravel(), discreteCol):
    data2[column].value_counts().plot(kind='bar', ax=ax)
    # Label each panel so the charts can be told apart.
    ax.set_title(column)