Water quality dataset - what makes clean water?
!pip install missingno
# Loading in libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from collections import Counter
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
Loading in the data
# Loading in the data as a csv file
dataset = pd.read_csv('water_potability.csv')
dataset.head(5)
# Getting basic information about the dataset
dataset.info()
dataset.describe()
# Define colors used as color codes
blue = '#51C4D3' # To mark drinkable water
green = '#74C365' # To mark undrinkable water
red = '#CD6155' # For further markings
orange = '#DC7633' # For further markings
# Plot the colors as a palplot
sns.palplot([blue])
sns.palplot([green])
sns.palplot([red])
sns.palplot([orange])
Exploratory Data Analysis
# Count the occurrences of each value in the Potability column
dataset['Potability'].value_counts()
# Clear matplotlib and set style
plt.clf()
plt.style.use('ggplot')
# Create subplot and pie chart
fig1, ax1 = plt.subplots()
ax1.pie(dataset['Potability'].value_counts(), colors=[green, blue], labels=['Not drinkable', 'Drinkable'], autopct='%1.1f%%', startangle=0, rotatelabels=False)
#draw circle
centre_circle = plt.Circle((0,0),0.80, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
# Equal aspect ratio ensures that pie is drawn as a circle
ax1.axis('equal')
# Set tighten layout and show plot
plt.tight_layout()
plt.show()
# Define a list of features to be plotted
feature = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']
# Drop Potability column
plotter_dataset = dataset.drop('Potability', axis=1)
plt.figure(figsize = (15, 15))
# Iterate through the feature list and plot a histplot for each feature
for i, col in enumerate(feature):
    plt.subplot(3, 3, i + 1)
    # Histplot of the feature, split by Potability (0 = not drinkable -> green, 1 = drinkable -> blue)
    sns.histplot(
        data=plotter_dataset,
        x=plotter_dataset[col],
        hue=dataset['Potability'],
        palette=[green, blue],
        kde=True,
        multiple='stack',
        alpha=0.8
    )
    # Rotate the xticks for better readability
    plt.xticks(rotation=25)
Further filtering of the data
# Define a plotting function
def mask_plotter(column, mask1, mask2, label1, label2, color2, title):
    # Set size and style
    plt.figure(figsize=(5, 5))
    plt.style.use('ggplot')
    # Create two histplots, one for each mask
    sns.histplot(data=dataset[mask1], x=column, multiple='stack', color=blue, label=label1)
    sns.histplot(data=dataset[mask2], x=column, multiple='stack', color=color2, label=label2)
    # Add title, legend and show plot
    plt.title(title)
    plt.legend()
    plt.show()
# Create masks to filter ph-values within a specific range
drinkable_ph_mask = (dataset['ph'] > 6.5) & (dataset['ph'] < 9)
undrinkable_ph_mask = (dataset['ph'] < 6.5) | (dataset['ph'] > 9)
# Create masks to separate soft/moderate and hard water by Hardness
soft_mask = (dataset['Hardness'] < 150)
hard_mask = (dataset['Hardness'] > 150)
# Create masks to filter Sulfate values by the EU and WHO recommendations
eu_recommendation = (dataset['Sulfate'] < 250)
who_recommendation = (dataset['Sulfate'] > 250) & (dataset['Sulfate'] < 500)
# Plot ph-values
mask_plotter('ph', drinkable_ph_mask, undrinkable_ph_mask, 'Safe ph value', 'Unsafe ph value', red, 'ph-values')
# Plot Hardness
mask_plotter('Hardness', soft_mask, hard_mask, 'Soft - moderate water', 'Hard water', red, 'Hardness')
# Plot Sulfate
mask_plotter('Sulfate', eu_recommendation, who_recommendation, 'Within EU limits', 'Within WHO limits', orange, 'Sulfate')
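As a quick sanity check, the masks can be cross-tabulated against the Potability label to see how well these textbook thresholds line up with the labels in the dataset. This is only a minimal sketch using the masks defined above; note that rows with missing values count as False, because comparisons with NaN evaluate to False.
# Compare the threshold-based masks with the labelled potability
print(pd.crosstab(drinkable_ph_mask, dataset['Potability'], rownames=['ph 6.5-9'], colnames=['Potability']))
print(pd.crosstab(hard_mask, dataset['Potability'], rownames=['Hardness > 150'], colnames=['Potability']))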
Data preparation
Dealing with missing data
# Plot out missing values
fig = msno.matrix(dataset)
# Mean and median of ph
print("Mean of ph: " + str(dataset['ph'].mean()))
print("Median of ph: " + str(dataset['ph'].median()))
print('-------------------------------------------')
# Mean and median of Sulfates
print("Mean of Sulfate: " + str(dataset['Sulfate'].mean()))
print("Median of Sulfate: " + str(dataset['Sulfate'].median()))
print('-------------------------------------------')
# Mean and median of Trihalomethanes
print("Mean of Trihalomethanes: " + str(dataset['Trihalomethanes'].mean()))
print("Median of Trihalomethanes: " + str(dataset['Trihalomethanes'].median()))
print('-------------------------------------------')
# Replacing NaN values with the median of each column
dataset['ph'] = dataset['ph'].fillna(dataset['ph'].median())
dataset['Sulfate'] = dataset['Sulfate'].fillna(dataset['Sulfate'].median())
dataset['Trihalomethanes'] = dataset['Trihalomethanes'].fillna(dataset['Trihalomethanes'].median())
# Checking if there are missing values left
dataset.isnull().sum()
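The median imputation above is computed on the full dataset, so a little information from the later test split leaks into the training data. A minimal sketch of a leakage-free alternative, assuming scikit-learn's SimpleImputer, would fit the median on the training portion only, for example via the Pipeline imported at the top:
# Sketch only: median imputation fitted on the training data alone (assumes sklearn.impute.SimpleImputer)
from sklearn.impute import SimpleImputer
impute_pipe = Pipeline([('impute', SimpleImputer(strategy='median'))])
# After the train/test split further below, this would replace the fillna calls:
# X_train_imp = impute_pipe.fit_transform(X_train)
# X_test_imp = impute_pipe.transform(X_test)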
Checking for correlations
# Set figure size
plt.figure(figsize=(10, 10))
# Create heatmap
sns.heatmap(dataset.corr(), annot=True)
# Show plot
plt.show()
# Setting features (X) and targets(y)
X = dataset.drop('Potability', axis=1)
y = dataset['Potability']
# Split data into training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
# Scale the data. The scaler is fitted on the training data only; the test data is just transformed
# Only the X values are scaled, because y contains only the labels 0 and 1, which we don't want to scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Create figure and list containing the axes
fig, ax = plt.subplots(2, 1)
# Plot histogram of before and after scaling
X_train.iloc[:, 2].hist(ax=ax[0])
ax[1].hist(X_train_scaled[:, 2])
plt.show()
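Note that the scaled arrays are only visualised here; the searches below are fitted on the unscaled X_train. For distance- and gradient-based models such as KNeighborsClassifier and LogisticRegression, scaling usually helps, so here is a minimal sketch of how the scaler could be wired into a search with the already imported Pipeline (shown for KNN only, as an assumption rather than part of the original workflow):
# Sketch: scaling inside a pipeline, so cross validation fits the scaler on each training fold
knn_pipe = Pipeline([
    ('scale', StandardScaler()),
    ('knn', KNeighborsClassifier())
])
# Parameters of a pipeline step are addressed as '<step name>__<parameter>'
knn_pipe_params = {'knn__n_neighbors': [*range(1, 11)]}
rs_knn_pipe = RandomizedSearchCV(estimator=knn_pipe, param_distributions=knn_pipe_params, cv=5)
# rs_knn_pipe.fit(X_train, y_train)  # uncomment to run the scaled search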
Building a model
Randomized, cross validated search
# Set up all the models that are to be tested
lr = LogisticRegression(solver='liblinear') # liblinear supports both the l1 and l2 penalties searched below
rfc = RandomForestClassifier()
dtc = DecisionTreeClassifier()
xgb = GradientBoostingClassifier()
knn = KNeighborsClassifier()
# lr param grid (C must be strictly positive)
lr_params = {
    'penalty' : ['l1', 'l2'],
    'C' : np.linspace(0.1, 5, 10)}
# Instantiate cross validated logistic regression random search
rs_lr = RandomizedSearchCV(estimator=lr, param_distributions=lr_params, cv=5)
# Fit lr to training data
rs_lr.fit(X_train, y_train)
# rfc param grid
rfc_params = {
    'n_estimators' : [*range(25, 400, 20)],
    'criterion' : ['gini', 'entropy'],
    'max_depth': [*range(1, 11)],
    'min_samples_split' : [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3, 4]}
# Instantiate cross validated random forest random search
rs_rfc = RandomizedSearchCV(estimator=rfc, param_distributions=rfc_params, cv=5)
# Fit rfc to training data
rs_rfc.fit(X_train, y_train)
# dtc param grid
dtc_params = {
    'criterion' : ['gini', 'entropy'],
    'splitter' : ['best', 'random'],
    'max_depth': [*range(1, 11)],
    'min_samples_split' : [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3, 4]}
# Instantiate cross validated decision tree random search
rs_dtc = RandomizedSearchCV(estimator=dtc, param_distributions=dtc_params, cv=5)
# Fit dtc to training data
rs_dtc.fit(X_train, y_train)
# xgb param grid (learning_rate must be strictly positive)
# Note: 'deviance' was renamed to 'log_loss' in newer scikit-learn versions
xgb_params = {
    'loss' : ['deviance', 'exponential'],
    'learning_rate' : np.linspace(0.01, 1, 10),
    'n_estimators' : [*range(25, 500, 10)]}
# Instantiate cross validated gradient boosting random search
rs_xgb = RandomizedSearchCV(estimator=xgb, param_distributions=xgb_params, cv=5)
# Fit xgb to training data
rs_xgb.fit(X_train, y_train)
# knn param grid
knn_params = {
    'n_neighbors' : [*range(1, 11)],
    'weights' : ['uniform', 'distance'],
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size' : [10, 20, 30, 40]}
# Instantiate cross validated k-nearest neighbor random search
rs_knn = RandomizedSearchCV(estimator=knn, param_distributions=knn_params, cv=5)
# Fit knn to training data
rs_knn.fit(X_train, y_train)
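Before comparing the models graphically, the tuned settings can be inspected directly. A small sketch using the fitted search objects from above:
# Print the best cross validated score and parameter set found by each random search
for name, search in [('Logistic Regression', rs_lr), ('Random forest', rs_rfc),
                     ('Decision tree', rs_dtc), ('Gradient boosting', rs_xgb),
                     ('K-Neighbors', rs_knn)]:
    print(name, round(search.best_score_, 3), search.best_params_)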
Selecting the best model
# Set up all models scores as a df
score_df = pd.DataFrame({'Logistic Regression' : [rs_lr.best_score_], 'Decision tree' : [rs_dtc.best_score_], \
                         'Random forest' : [rs_rfc.best_score_], 'Gradient Boosting': [rs_xgb.best_score_], 'K-Neighbors' : [rs_knn.best_score_]})
# Plot bar plot
plt.style.use('default')
color_palette = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
score_df.plot(kind='bar', edgecolor='white', colormap=color_palette, linewidth=5, figsize=(8, 4), xlabel='Models')
# Show plot
plt.show()
Further improvement using cross validated grid search
'''
# rfc param grid
rfc_params = {
    'n_estimators' : [*range(25, 400, 20)],
    'criterion' : ['gini', 'entropy'],
    'max_depth': [*range(1, 5)],
    'min_samples_split' : [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3, 4]}
# Instantiate cross validated random forest grid search
gs_rfc = GridSearchCV(estimator=rfc, param_grid=rfc_params, cv=5)
# Fit rfc to training data
gs_rfc.fit(X_train, y_train)
'''
# Print out feature importance
#feature_imp = pd.Series(gs_rfc.best_estimator_.feature_importances_, index=X.columns).sort_values(ascending=False)
#print(feature_imp)
#print(gs_rfc.best_estimator_.feature_importances_)
#print(gs_rfc.best_estimator_)
The best model
from sklearn.metrics import accuracy_score
# Set up the best classifier
rfc_v02 = RandomForestClassifier(max_depth=4, min_samples_leaf=3, min_samples_split=6, n_estimators=265)
# Train it
rfc_v02.fit(X_train, y_train)
# Predict on unseen data
y_pred = rfc_v02.predict(X_test)
# Get the score of the model
print('Correct Prediction (%): ', accuracy_score(y_test, y_pred, normalize = True) * 100.0)
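Accuracy alone can be misleading on an imbalanced target like Potability, so it is worth looking at per-class results as well. A short sketch (an addition, not part of the original notebook) using scikit-learn's confusion matrix and classification report:
# Sketch: confusion matrix and per-class precision/recall for the final model
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=['Not drinkable', 'Drinkable']))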
Feature importances
# Get feature importance
feature_imp = pd.Series(rfc_v02.feature_importances_, index=X.columns).sort_values(ascending=False)
# Clear matplotlib
plt.clf()
# Create subplot and pie chart
fig1, ax1 = plt.subplots()
ax1.pie(feature_imp, colors=sns.cubehelix_palette(start=.5, rot=-.75, n_colors=9), labels=feature_imp.index, autopct='%1.1f%%', startangle=0, rotatelabels=True)
#draw circle
centre_circle = plt.Circle((0,0),0.80, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
# Equal aspect ratio ensures that pie is drawn as a circle
ax1.axis('equal')
# Set tighten layout and show plot
plt.title('Feature importance in %')
plt.tight_layout()
plt.show()