from jupyterthemes.stylefx import set_nb_theme
set_nb_theme('gruvboxd')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msn
from copy import deepcopy
from warnings import filterwarnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
# Load the tab-separated fruit dataset; `raw` is kept untouched and all
# exploration happens on the copy `df`.
raw = pd.read_csv('Fruit.txt', sep='\t')
df = raw.copy()  # DataFrame.copy() is a deep copy by default
df.sample(6)
# Rows that contain at least one missing value.
df[df.isna().any(axis=1)]
# We have fewer mandarin records than records of the other fruits.
# Aggregate the numeric columns only: recent pandas versions raise when asked
# for the mean of a string column instead of silently dropping it, and the
# string alias 'mean' avoids the deprecation warning emitted for np.mean.
numeric_cols = df.select_dtypes(include='number').columns
fruit_table = pd.pivot_table(
    data=df,
    index='fruit_name',
    values=list(numeric_cols),
    aggfunc=['mean', 'count'],
).iloc[:, :6]
fruit_lab_table = pd.pivot_table(
    data=df,
    index=['fruit_name', 'fruit_subtype'],
    values=list(numeric_cols),
    aggfunc=['mean', 'count'],
).iloc[:, :6]
display(fruit_table, fruit_lab_table)
# Continuous features: every numeric column except the first one.
con_feats = numeric_cols[1:]
def compare_mean_visual(data, cat_col: str, con_col: str) -> None:
    """Show a box plot of `con_col` per `cat_col` category with the points overlaid.

    Args:
        data: DataFrame holding both columns.
        cat_col: Name of the categorical column used for the x axis.
        con_col: Name of the continuous column used for the y axis.

    Raises:
        AssertionError: If `cat_col` or `con_col` is not a column of `data`.
    """
    # Check each name separately: `cat_col and con_col in data.columns` parses
    # as `cat_col and (con_col in data.columns)`, which never looks up cat_col.
    missing = [col for col in (cat_col, con_col) if col not in data.columns]
    assert not missing, f"column(s) not found in data: {missing}"

    sns.set(rc={'figure.figsize': (10, 7)})
    sns.set_palette("Reds", 4)
    sns.boxplot(x=cat_col, y=con_col, data=data)
    # Overlay the individual observations on top of the boxes.
    sns.stripplot(x=cat_col, y=con_col, data=data, jitter=0.3, color="0.3")
    plt.xlabel("")
    plt.show()
# One box/strip plot per continuous feature, grouped by fruit name.
for feat in con_feats:
    compare_mean_visual(data=df, cat_col='fruit_name', con_col=feat)
def find_outlier_records(data, column_name: str, no_std=3):
    """Count the rows whose `column_name` value is far from the column mean.

    A row is an outlier when its value lies more than `no_std` population
    standard deviations above or below the mean. Missing values are ignored
    when computing the mean and standard deviation and are never counted.

    Args:
        data: DataFrame containing `column_name`.
        column_name: Name of the numeric column to inspect.
        no_std: Number of standard deviations that defines the borders.

    Returns:
        The number of outlier rows; the same number is also printed.

    Raises:
        AssertionError: If `column_name` is not a column of `data`.
    """
    assert column_name in data.columns

    column = data[column_name]
    center = column.mean()
    spread = column.std(ddof=0)  # ddof=0 matches numpy's default std
    upper_border = center + no_std * spread
    lower_border = center - no_std * spread

    # A single vectorized comparison replaces the per-value value_counts() calls.
    outlier_mask = (column > upper_border) | (column < lower_border)
    no_outliers = int(outlier_mask.sum())

    print(f'''Number of outlier records in {column_name} column: {no_outliers}''')
    return no_outliers
# Very few records lie outside the default three-standard-deviation borders.
for feat in con_feats:
    find_outlier_records(data=df, column_name=feat)
# Recorded output:
#   Number of outlier records in mass column: 3
#   Number of outlier records in width column: 1
#   Number of outlier records in height column: 0
#   Number of outlier records in color_score column: 0
# Keep column 1 (used as the target below) plus columns 3 onward (the
# features); columns 0 and 2 are dropped.
df = pd.concat(
    [
        df.iloc[:, 1:2],
        df.iloc[:, 3:],
    ],
    axis=1,
)
df.sample(3)
# After the concat the target sits in the first column.
X = df.iloc[:, 1:]
y = df.iloc[:, 0]
# Stratified 85/15 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=123,
    stratify=y,
)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
# NOTE(review): this encodes `fruit_name` inside `df` only; `y_train` and
# `y_test` were created before this line and are not modified by it.
le = LabelEncoder()
df['fruit_name'] = le.fit_transform(df['fruit_name'])

# Learn the scaling statistics from the training split only and reuse them for
# the test split. Fitting a second scaler on the test data would scale the two
# splits differently and leak test-set statistics into the evaluation.
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_train)
X_test = scaler.transform(X_test)
X_test = pd.DataFrame(X_test)

# Fixed seed so the fitted tree (and therefore the report) is reproducible.
d_tree = DecisionTreeClassifier(random_state=123).fit(X_train, y_train)
pred = d_tree.predict(X_test)
pred
# A better result would probably have been reached had we had more data.
print(classification_report(y_true=y_test, y_pred=pred))
# Recorded output:
#               precision    recall  f1-score   support
#        apple       1.00      0.67      0.80         3
#        lemon       1.00      1.00      1.00         2
#     mandarin       1.00      1.00      1.00         1
#       orange       0.75      1.00      0.86         3
#     accuracy                           0.89         9
#    macro avg       0.94      0.92      0.91         9
# weighted avg       0.92      0.89      0.89         9