Water quality dataset - what makes clean water?
!pip install missingno
# Loading in libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from collections import Counter
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
Loading in the data
# Loading in the data as a csv file
dataset = pd.read_csv('water_potability.csv')
dataset.head(5)
# Getting basic information about the dataset
dataset.info()
dataset.describe()
# Define colors used as color codes
blue = '#51C4D3' # To mark drinkable water
green = '#74C365' # To mark undrinkable water
red = '#CD6155' # For further markings
orange = '#DC7633' # For further markings
# Plot the colors as a palplot
sns.palplot([blue])
sns.palplot([green])
sns.palplot([red])
sns.palplot([orange])
Exploratory Data Analysis
# Count the occurrences of each value in the Potability column
dataset['Potability'].value_counts()
# Clear matplotlib and set style
plt.clf()
plt.style.use('ggplot')
# Create subplot and pie chart
fig1, ax1 = plt.subplots()
ax1.pie(dataset['Potability'].value_counts(), colors=[green, blue], labels=['Not drinkable', 'Drinkable'], autopct='%1.1f%%', startangle=0, rotatelabels=False)
#draw circle
centre_circle = plt.Circle((0,0),0.80, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
# Equal aspect ratio ensures that pie is drawn as a circle
ax1.axis('equal')
# Set tighten layout and show plot
plt.tight_layout()
plt.show()
# Define a list of features to be plotted
feature = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']
# Drop Potability column
plotter_dataset = dataset.drop('Potability', axis=1)
plt.figure(figsize = (15, 15))
# Iterate through the feature list and plot a histplot for each feature
for i, col in enumerate(feature):
    plt.subplot(3, 3, i + 1)
    # Histplot of the feature, split by Potability (0 = not drinkable -> green, 1 = drinkable -> blue)
    sns.histplot(
        data=plotter_dataset,
        x=plotter_dataset[col],
        hue=dataset['Potability'],
        palette=[green, blue],
        kde=True,
        multiple='stack',
        alpha=0.8
    )
    # Rotate the xticks for better readability
    plt.xticks(rotation=25)
Further filtering of the data
# Define a plotting function
def mask_plotter(column, mask1, mask2, label1, label2, color2, title):
    # Set size and style
    plt.figure(figsize=(5, 5))
    plt.style.use('ggplot')
    # Create two histplots, one for each mask
    sns.histplot(data=dataset[mask1], x=column, multiple='stack', color=blue, label=label1)
    sns.histplot(data=dataset[mask2], x=column, multiple='stack', color=color2, label=label2)
    # Add title, legend and show plot
    plt.title(title)
    plt.legend()
    plt.show()
# Create masks to filter ph-values within a specific range
drinkable_ph_mask = (dataset['ph'] > 6.5) & (dataset['ph'] < 9)
undrinkable_ph_mask = (dataset['ph'] < 6.5) | (dataset['ph'] > 9)
# Create masks to separate soft/moderate and hard water by Hardness
soft_mask = (dataset['Hardness'] < 150)
hard_mask = (dataset['Hardness'] > 150)
# Create masks to filter Sulfate values by the EU and WHO recommendations
eu_recommendation = (dataset['Sulfate'] < 250)
who_recommendation = (dataset['Sulfate'] > 250) & (dataset['Sulfate'] < 500)
# Plot ph-values
mask_plotter('ph', drinkable_ph_mask, undrinkable_ph_mask, 'Safe ph value', 'Unsafe ph value', red, 'ph-values')
# Plot Hardness
mask_plotter('Hardness', soft_mask, hard_mask, 'Soft - moderate water', 'Hard water', red, 'Hardness')
# Plot Sulfate
mask_plotter('Sulfate', eu_recommendation, who_recommendation, 'Within EU limits', 'Within WHO limits', orange, 'Sulfate')
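As a quick sanity check, the masks can be cross-tabulated against the Potability label to see how well these textbook thresholds line up with the labels in the dataset. This is only a minimal sketch using the masks defined above; note that rows with missing values count as False, because comparisons with NaN evaluate to False.
# Compare the threshold-based masks with the labelled potability
print(pd.crosstab(drinkable_ph_mask, dataset['Potability'], rownames=['ph 6.5-9'], colnames=['Potability']))
print(pd.crosstab(hard_mask, dataset['Potability'], rownames=['Hardness > 150'], colnames=['Potability']))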
Data preparation
Dealing with missing data
# Plot out missing values
fig = msno.matrix(dataset)
# Mean and median of ph
print("Mean of ph: " + str(dataset['ph'].mean()))
print("Median of ph: " + str(dataset['ph'].median()))
print('-------------------------------------------')
# Mean and median of Sulfates
print("Mean of Sulfate: " + str(dataset['Sulfate'].mean()))
print("Median of Sulfate: " + str(dataset['Sulfate'].median()))
print('-------------------------------------------')
# Mean and median of Trihalomethanes
print("Mean of Trihalomethanes: " + str(dataset['Trihalomethanes'].mean()))
print("Median of Trihalomethanes: " + str(dataset['Trihalomethanes'].median()))
print('-------------------------------------------')
# Replacing NaN values with the median of each column
dataset['ph'] = dataset['ph'].fillna(dataset['ph'].median())
dataset['Sulfate'] = dataset['Sulfate'].fillna(dataset['Sulfate'].median())
dataset['Trihalomethanes'] = dataset['Trihalomethanes'].fillna(dataset['Trihalomethanes'].median())
# Checking if there are missing values left
dataset.isnull().sum()
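The median imputation above is computed on the full dataset, so a little information from the later test split leaks into the training data. A minimal sketch of a leakage-free alternative, assuming scikit-learn's SimpleImputer, would fit the median on the training portion only, for example via the Pipeline imported at the top:
# Sketch only: median imputation fitted on the training data alone (assumes sklearn.impute.SimpleImputer)
from sklearn.impute import SimpleImputer
impute_pipe = Pipeline([('impute', SimpleImputer(strategy='median'))])
# After the train/test split further below, this would replace the fillna calls:
# X_train_imp = impute_pipe.fit_transform(X_train)
# X_test_imp = impute_pipe.transform(X_test)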
Checking for correlations
# Set figure size
plt.figure(figsize=(10, 10))
# Create heatmap
sns.heatmap(dataset.corr(), annot=True)
# Show plot
plt.show()
# Setting features (X) and targets(y)
X = dataset.drop('Potability', axis=1)
y = dataset['Potability']
# Split data into training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
# Scale the data. The scaler is fitted on the training data only; the test data is just transformed
# Only the X values are scaled, because y contains only the labels 0 and 1, which we don't want to scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Create figure and list containing the axes
fig, ax = plt.subplots(2, 1)
# Plot histogram of before and after scaling
X_train.iloc[:, 2].hist(ax=ax[0])
ax[1].hist(X_train_scaled[:, 2])
plt.show()
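Note that the scaled arrays are only visualised here; the searches below are fitted on the unscaled X_train. For distance- and gradient-based models such as KNeighborsClassifier and LogisticRegression, scaling usually helps, so here is a minimal sketch of how the scaler could be wired into a search with the already imported Pipeline (shown for KNN only, as an assumption rather than part of the original workflow):
# Sketch: scaling inside a pipeline, so cross validation fits the scaler on each training fold
knn_pipe = Pipeline([
    ('scale', StandardScaler()),
    ('knn', KNeighborsClassifier())
])
# Parameters of a pipeline step are addressed as '<step name>__<parameter>'
knn_pipe_params = {'knn__n_neighbors': [*range(1, 11)]}
rs_knn_pipe = RandomizedSearchCV(estimator=knn_pipe, param_distributions=knn_pipe_params, cv=5)
# rs_knn_pipe.fit(X_train, y_train)  # uncomment to run the scaled search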
Building a model
Randomized, cross validated search
# Set up all the models that are to be tested
lr = LogisticRegression(solver='liblinear') # liblinear supports both the l1 and l2 penalties searched below
rfc = RandomForestClassifier()
dtc = DecisionTreeClassifier()
xgb = GradientBoostingClassifier()
knn = KNeighborsClassifier()
# lr param grid (C must be strictly positive)
lr_params = {
    'penalty' : ['l1', 'l2'],
    'C' : np.linspace(0.1, 5, 10)}
# Instantiate cross validated logistic regression random search
rs_lr = RandomizedSearchCV(estimator=lr, param_distributions=lr_params, cv=5)
# Fit lr to training data
rs_lr.fit(X_train, y_train)
# rfc param grid
rfc_params = {
    'n_estimators' : [*range(25, 400, 20)],
    'criterion' : ['gini', 'entropy'],
    'max_depth': [*range(1, 11)],
    'min_samples_split' : [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3, 4]}
# Instantiate cross validated random forest random search
rs_rfc = RandomizedSearchCV(estimator=rfc, param_distributions=rfc_params, cv=5)
# Fit rfc to training data
rs_rfc.fit(X_train, y_train)
# dtc param grid
dtc_params = {
    'criterion' : ['gini', 'entropy'],
    'splitter' : ['best', 'random'],
    'max_depth': [*range(1, 11)],
    'min_samples_split' : [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3, 4]}
# Instantiate cross validated decision tree random search
rs_dtc = RandomizedSearchCV(estimator=dtc, param_distributions=dtc_params, cv=5)
# Fit dtc to training data
rs_dtc.fit(X_train, y_train)
# xgb param grid (learning_rate must be strictly positive)
# Note: 'deviance' was renamed to 'log_loss' in newer scikit-learn versions
xgb_params = {
    'loss' : ['deviance', 'exponential'],
    'learning_rate' : np.linspace(0.01, 1, 10),
    'n_estimators' : [*range(25, 500, 10)]}
# Instantiate cross validated gradient boosting random search
rs_xgb = RandomizedSearchCV(estimator=xgb, param_distributions=xgb_params, cv=5)
# Fit xgb to training data
rs_xgb.fit(X_train, y_train)
# knn param grid
knn_params = {
    'n_neighbors' : [*range(1, 11)],
    'weights' : ['uniform', 'distance'],
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size' : [10, 20, 30, 40]}
# Instantiate cross validated k-nearest neighbor random search
rs_knn = RandomizedSearchCV(estimator=knn, param_distributions=knn_params, cv=5)
# Fit knn to training data
rs_knn.fit(X_train, y_train)
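Before comparing the models graphically, the tuned settings can be inspected directly. A small sketch using the fitted search objects from above:
# Print the best cross validated score and parameter set found by each random search
for name, search in [('Logistic Regression', rs_lr), ('Random forest', rs_rfc),
                     ('Decision tree', rs_dtc), ('Gradient boosting', rs_xgb),
                     ('K-Neighbors', rs_knn)]:
    print(name, round(search.best_score_, 3), search.best_params_)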
Selecting the best model
# Set up all models scores as a df
score_df = pd.DataFrame({'Logistic Regression' : [rs_lr.best_score_], 'Decision tree' : [rs_dtc.best_score_], \
                         'Random forest' : [rs_rfc.best_score_], 'Gradient Boosting': [rs_xgb.best_score_], 'K-Neighbors' : [rs_knn.best_score_]})
# Plot bar plot
plt.style.use('default')
color_palette = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
score_df.plot(kind='bar', edgecolor='white', colormap=color_palette, linewidth=5, figsize=(8, 4), xlabel='Models')
# Show plot
plt.show()
Further improvement using cross validated grid search
'''
# rfc param grid
rfc_params = {
    'n_estimators' : [*range(25, 400, 20)],
    'criterion' : ['gini', 'entropy'],
    'max_depth': [*range(1, 5)],
    'min_samples_split' : [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3, 4]}
# Instantiate cross validated random forest grid search
gs_rfc = GridSearchCV(estimator=rfc, param_grid=rfc_params, cv=5)
# Fit rfc to training data
gs_rfc.fit(X_train, y_train)
'''
# Print out feature importance
#feature_imp = pd.Series(gs_rfc.best_estimator_.feature_importances_, index=X.columns).sort_values(ascending=False)
#print(feature_imp)
#print(gs_rfc.best_estimator_.feature_importances_)
#print(gs_rfc.best_estimator_)
The best model
from sklearn.metrics import accuracy_score
# Set up the best classifier
rfc_v02 = RandomForestClassifier(max_depth=4, min_samples_leaf=3, min_samples_split=6, n_estimators=265)
# Train it
rfc_v02.fit(X_train, y_train)
# Predict on unseen data
y_pred = rfc_v02.predict(X_test)
# Get the score of the model
print('Correct Prediction (%): ', accuracy_score(y_test, y_pred, normalize = True) * 100.0)
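Accuracy alone can be misleading on an imbalanced target like Potability, so it is worth looking at per-class results as well. A short sketch (an addition, not part of the original notebook) using scikit-learn's confusion matrix and classification report:
# Sketch: confusion matrix and per-class precision/recall for the final model
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=['Not drinkable', 'Drinkable']))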
Feature importances
# Get feature importance
feature_imp = pd.Series(rfc_v02.feature_importances_, index=X.columns).sort_values(ascending=False)
# Clear matplotlib
plt.clf()
# Create subplot and pie chart
fig1, ax1 = plt.subplots()
ax1.pie(feature_imp, colors=sns.cubehelix_palette(start=.5, rot=-.75, n_colors=9), labels=feature_imp.index, autopct='%1.1f%%', startangle=0, rotatelabels=True)
#draw circle
centre_circle = plt.Circle((0,0),0.80, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
# Equal aspect ratio ensures that pie is drawn as a circle
ax1.axis('equal')
# Set tighten layout and show plot
plt.title('Feature importance in %')
plt.tight_layout()
plt.show()