#Import relevant libraries
import altair as alt
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import (
FunctionTransformer,
Normalizer,
OneHotEncoder,
StandardScaler,
normalize,
scale)
from sklearn.metrics import plot_confusion_matrix, classification_report
from sklearn.svm import SVC, SVR
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
# Load the cheese data and hold out 20% of the rows as a test set.
cheese_df = pd.read_csv("data/cheese_data.csv")
train_df, test_df = train_test_split(
    cheese_df,
    test_size=0.2,
    random_state=123,  # fixed seed so the split is reproducible
)
train_df.head()

# Summary statistics for every column, numeric and non-numeric alike.
train_df.describe(include="all")

# Per-column non-null counts and dtypes.
train_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 833 entries, 482 to 1041
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CheeseId 833 non-null int64
1 ManufacturerProvCode 833 non-null object
2 ManufacturingTypeEn 833 non-null object
3 MoisturePercent 821 non-null float64
4 FlavourEn 637 non-null object
5 CharacteristicsEn 522 non-null object
6 Organic 833 non-null int64
7 CategoryTypeEn 813 non-null object
8 MilkTypeEn 832 non-null object
9 MilkTreatmentTypeEn 781 non-null object
10 RindTypeEn 579 non-null object
11 CheeseName 833 non-null object
12 FatLevel 833 non-null object
dtypes: float64(1), int64(2), object(10)
memory usage: 91.1+ KB
# Separate features from the target (ManufacturingTypeEn). The same columns
# are removed from both splits: the ones this analysis does not use as
# features, plus the target itself.
drop_cols = [
    "FlavourEn",
    "CharacteristicsEn",
    "RindTypeEn",
    "CheeseId",
    "ManufacturingTypeEn",
]
X_train = train_df.drop(columns=drop_cols)
y_train = train_df["ManufacturingTypeEn"]
X_test = test_df.drop(columns=drop_cols)
y_test = test_df["ManufacturingTypeEn"]
X_train.head()
# Bar chart of how many training rows fall into each target class.
manf_df = y_train.to_frame()
manf_type_plot = (
    alt.Chart(manf_df, width=500, height=300)
    .mark_bar()
    .encode(
        alt.X("ManufacturingTypeEn", title="Manufacturing Type"),
        alt.Y("count():Q"),
    )
    .properties(title="Chart 1: Distribution of Target")
)
manf_type_plot
# Bar chart of how many training rows fall into each milk treatment type.
milk_type_plot = (
    alt.Chart(X_train, height=300, width=500)
    .mark_bar()
    .encode(
        alt.X("MilkTreatmentTypeEn:N", title="Milk Treatment Type"),
        alt.Y("count():Q"),
    )
    .properties(title="Chart 2: Distribution of Milk Treatment Type")
)
milk_type_plot
# Baseline: always predict the most frequent training class.
# fit() returns the estimator itself, so construction and fitting can be chained.
dummy = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
# Accuracy of that constant prediction on the training data.
dummy.score(X_train, y_train)
# Group the feature columns by the preprocessing each group receives.
num_feats = ["MoisturePercent"]
cat_feats = ["ManufacturerProvCode", "MilkTypeEn", "MilkTreatmentTypeEn"]
binary_feats = ["Organic", "FatLevel"]
# A bare string rather than a list, so the column transformer passes a
# 1-D column of text to CountVectorizer.
text_feats = "CheeseName"

# Numeric: fill missing values with the column median.
# NOTE(review): the value is imputed but not scaled; KNN is distance-based,
# so this column may dominate the one-hot/count features - consider scaling.
num_trans = make_pipeline(SimpleImputer(strategy="median"))

# Categorical: impute the most frequent category, then one-hot encode.
# NOTE(review): fill_value is only used with strategy="constant", so
# "missing" has no effect here - confirm which behaviour was intended.
cat_trans = make_pipeline(
    SimpleImputer(strategy="most_frequent", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Binary: one integer column per two-level feature (dense output).
binary_trans = make_pipeline(
    OneHotEncoder(drop="if_binary", dtype="int", sparse=False)
)

# Text: bag-of-words counts of the cheese name.
text_trans = make_pipeline(CountVectorizer())

# Route each column group through its transformer.
preprocessor = make_column_transformer(
    (num_trans, num_feats),
    (cat_trans, cat_feats),
    (binary_trans, binary_feats),
    (text_trans, text_feats),
)

# Preprocessing followed by a KNN classifier with uniform neighbour weights.
main_pipe = make_pipeline(preprocessor, KNeighborsClassifier(weights="uniform"))

# 10-fold cross-validation; keep train scores too, then average every
# reported column (fit/score times and train/test scores) across folds.
knn_scores = cross_validate(
    main_pipe,
    X_train,
    y_train,
    cv=10,
    return_train_score=True,
)
knn_scores_df = pd.DataFrame(knn_scores).mean()
knn_scores_df
# Group the feature columns by the preprocessing each group receives.
num_feats = ["MoisturePercent"]
cat_feats = ["ManufacturerProvCode", "MilkTypeEn", "MilkTreatmentTypeEn"]
binary_feats = ["Organic", "FatLevel"]
# A bare string rather than a list, so the column transformer passes a
# 1-D column of text to CountVectorizer.
text_feats = "CheeseName"

# Numeric: fill missing values with the column median.
# NOTE(review): the value is imputed but not scaled; SVC's default kernel
# is distance-based, so this column may dominate - consider scaling.
num_trans = make_pipeline(SimpleImputer(strategy="median"))

# Categorical: impute the most frequent category, then one-hot encode.
# NOTE(review): fill_value is only used with strategy="constant", so
# "missing" has no effect here - confirm which behaviour was intended.
cat_trans = make_pipeline(
    SimpleImputer(strategy="most_frequent", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Binary: one integer column per two-level feature (dense output).
binary_trans = make_pipeline(
    OneHotEncoder(drop="if_binary", dtype="int", sparse=False)
)

# Text: bag-of-words counts of the cheese name.
text_trans = make_pipeline(CountVectorizer())

# Route each column group through its transformer.
preprocessor = make_column_transformer(
    (num_trans, num_feats),
    (cat_trans, cat_feats),
    (binary_trans, binary_feats),
    (text_trans, text_feats),
)

# Preprocessing followed by an SVC whose class weights are balanced
# against the class frequencies.
main_pipe_svc = make_pipeline(preprocessor, SVC(class_weight="balanced"))

# 10-fold cross-validation; keep train scores too, then average every
# reported column across folds.
svc_scores = cross_validate(
    main_pipe_svc,
    X_train,
    y_train,
    cv=10,
    return_train_score=True,
)
svc_scores_df = pd.DataFrame(svc_scores).mean()
svc_scores_df
# Group the feature columns by the preprocessing each group receives.
num_feats = ["MoisturePercent"]
cat_feats = ["ManufacturerProvCode", "MilkTypeEn", "MilkTreatmentTypeEn"]
binary_feats = ["Organic", "FatLevel"]
# A bare string rather than a list, so the column transformer passes a
# 1-D column of text to CountVectorizer.
text_feats = "CheeseName"

# Hyperparameter candidates. Keys are prefixed with the step name that
# make_pipeline derives from the estimator class (lower-cased).
param_grid = {
    "kneighborsclassifier__n_neighbors": [1, 5, 10, 20, 30, 40, 50],
    "kneighborsclassifier__weights": ["uniform", "distance"],
}

# Numeric: fill missing values with the column median.
num_trans = make_pipeline(SimpleImputer(strategy="median"))

# Categorical: impute the most frequent category, then one-hot encode.
# NOTE(review): fill_value is only used with strategy="constant", so
# "missing" has no effect here - confirm which behaviour was intended.
cat_trans = make_pipeline(
    SimpleImputer(strategy="most_frequent", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Binary: one integer column per two-level feature (dense output).
binary_trans = make_pipeline(
    OneHotEncoder(drop="if_binary", dtype="int", sparse=False)
)

# Text: bag-of-words counts of the cheese name.
text_trans = make_pipeline(CountVectorizer())

# Route each column group through its transformer.
preprocessor = make_column_transformer(
    (num_trans, num_feats),
    (cat_trans, cat_feats),
    (binary_trans, binary_feats),
    (text_trans, text_feats),
)

# Preprocessing followed by a KNN classifier; the search sets its parameters.
main_pipe = make_pipeline(preprocessor, KNeighborsClassifier())

# Randomized hyperparameter tuning: 10 sampled candidates, each scored
# with 10-fold cross-validation on micro-averaged F1.
knn_random_search = RandomizedSearchCV(
    main_pipe,
    param_grid,
    n_iter=10,
    cv=10,
    scoring="f1_micro",
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
    random_state=123,
)

# Run the search on the training data.
knn_random_search.fit(X_train, y_train)
Fitting 10 folds for each of 10 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 9.1s finished
# Report the best mean cross-validated score and the parameters that
# produced it, one per line.
print(
    knn_random_search.best_score_,
    knn_random_search.best_params_,
    sep="\n",
)
0.6206110154905334
{'kneighborsclassifier__weights': 'distance', 'kneighborsclassifier__n_neighbors': 5}
# Group the feature columns by the preprocessing each group receives.
num_feats = ["MoisturePercent"]
cat_feats = ["ManufacturerProvCode", "MilkTypeEn", "MilkTreatmentTypeEn"]
binary_feats = ["Organic", "FatLevel"]
# A bare string rather than a list, so the column transformer passes a
# 1-D column of text to CountVectorizer.
text_feats = "CheeseName"

# Hyperparameter candidates for the SVC step. Keys are prefixed with the
# step name that make_pipeline derives from the estimator class.
# The C list previously read [0.1, 10, 10, 100]: the duplicated 10 meant
# C=1.0 was never tried and identical candidates could be sampled twice.
# Any previously recorded best score/params came from that old grid, so
# re-run the search to refresh them.
param_grid_svc = {
    "svc__gamma": [0.1, 1.0, 10, 100],
    "svc__C": [0.1, 1.0, 10, 100],
}

# Numeric: fill missing values with the column median.
num_trans = make_pipeline(SimpleImputer(strategy="median"))

# Categorical: impute the most frequent category, then one-hot encode.
# NOTE(review): fill_value is only used with strategy="constant", so
# "missing" has no effect here - confirm which behaviour was intended.
cat_trans = make_pipeline(
    SimpleImputer(strategy="most_frequent", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Binary: one integer column per two-level feature (dense output).
binary_trans = make_pipeline(
    OneHotEncoder(drop="if_binary", dtype="int", sparse=False)
)

# Text: bag-of-words counts of the cheese name.
text_trans = make_pipeline(CountVectorizer())

# Route each column group through its transformer.
preprocessor = make_column_transformer(
    (num_trans, num_feats),
    (cat_trans, cat_feats),
    (binary_trans, binary_feats),
    (text_trans, text_feats),
)

# Preprocessing followed by an SVC whose class weights are balanced
# against the class frequencies.
main_pipe_svc = make_pipeline(preprocessor, SVC(class_weight="balanced"))

# Randomized hyperparameter tuning: 10 sampled candidates, each scored
# with 10-fold cross-validation on micro-averaged F1.
svc_scores2 = RandomizedSearchCV(
    main_pipe_svc,
    param_grid_svc,
    n_iter=10,
    cv=10,
    scoring="f1_micro",
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
    random_state=123,
)

# Run the search on the training data.
svc_scores2.fit(X_train, y_train)
Fitting 10 folds for each of 10 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 15.9s finished
# Report the best mean cross-validated score and the parameters that
# produced it, one per line.
print(
    svc_scores2.best_score_,
    svc_scores2.best_params_,
    sep="\n",
)
0.6794176706827308
{'svc__gamma': 0.1, 'svc__C': 10}
# Score the tuned SVC search object on the held-out test set.
test_score = svc_scores2.score(X_test, y_test)
test_score

# Confusion matrix on the test set; normalize="all" expresses each cell
# as a fraction of all test samples.
test_score_plot = plot_confusion_matrix(
    svc_scores2,
    X_test,
    y_test,
    normalize="all",
)
test_score_plot

# Test-set predictions, and the class labels known to the fitted model.
y_pred = svc_scores2.predict(X_test)
svc_scores2.classes_

# Per-class precision / recall / F1. The defaults (no sample weights,
# two decimal places) match what was previously passed explicitly.
print(classification_report(y_test, y_pred))
precision recall f1-score support
Artisan 0.69 0.63 0.66 75
Farmstead 0.69 0.51 0.59 49
Industrial 0.70 0.87 0.78 85
accuracy 0.70 209
macro avg 0.70 0.67 0.67 209
weighted avg 0.70 0.70 0.69 209
# Reattach the target column to the test features; both share the same
# row index, so the rows line up one-to-one.
test_merged = X_test.join(y_test)
test_merged
# Bar chart of test-set counts per manufacturing type, coloured by
# milk treatment type.
which_milk_plot = (
    alt.Chart(test_merged, height=300, width=500)
    .mark_bar()
    .encode(
        x=alt.X("ManufacturingTypeEn:N", title="Manufacturing Type"),
        y=alt.Y("count():Q"),
        color=alt.Color("MilkTreatmentTypeEn"),
    )
    .properties(title="Chart 3: Manufacturing Type vs. Milk Treatment Type")
)
which_milk_plot
#clean up: reformat the notebook with black
!black final_project.ipynb
reformatted final_project.ipynb
All done! ✨ 🍰 ✨
1 file reformatted.