import numpy as np
import pandas as pd
import klib
import plotly.express as px
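# Workflow: load the fish sampling data, recode and one-hot encode the
# categorical variables, impute missing weights with a regression model,
# then model mercury concentration (Hg) and inspect feature importances.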
gen_df = pd.read_csv('/content/drive/MyDrive/PersonalProjects/Tesis_pa/evaluaciones especies final Julio 6.xls - Resultado Gral.csv')
gen_df.head(3)
gen_df['Especie'] = gen_df['Especie'].replace({1.0: 'Bocachico',
                                               2.0: 'Moncholo',
                                               3.0: 'Pacora'})
gen_df['Cienaga'] = gen_df['Cienaga'].replace({1.0: 'Achi',
                                               2.0: 'Ayapel',
                                               3.0: 'Desc'})
print(gen_df['Especie'].value_counts())
print(gen_df['Cienaga'].value_counts())
# One-hot encode the two categorical columns
one_hot_especies = pd.get_dummies(gen_df['Especie'])
one_hot_cienaga = pd.get_dummies(gen_df['Cienaga'])
# Drop the original columns now that they are encoded
gen_df = gen_df.drop('Especie', axis=1)
gen_df = gen_df.drop('Cienaga', axis=1)
# Join the encoded columns back onto the frame
gen_df = gen_df.join(one_hot_especies)
gen_df = gen_df.join(one_hot_cienaga)
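# Note: the encode/drop/join steps above are equivalent to the one-liner
# pd.get_dummies(gen_df, columns=['Especie', 'Cienaga'])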
gen_df
df = gen_df.dropna(how='all')
df = df[:-1]  # drop a remaining NaN row at the end
# Overview of missing values per column
klib.missingval_plot(df)
null_df = df[df['Peso (g)'].isnull()].copy()  # rows with missing weight (copy so we can assign to it later)
null_df
clean_df = df.dropna()
# Features: the two measurement columns plus the one-hot encoded columns
# (selected by position); target: 'Peso (g)'
X = clean_df[list(clean_df.columns[1:3]) + list(clean_df.columns[18:])]
y = clean_df[clean_df.columns[4]].values
clean_df.head()
y
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Don't cheat - fit only on training data
scaler.fit(X_train)
X_train = scaler.transform(X_train)
# apply same transformation to test data
X_test = scaler.transform(X_test)
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train, y_train)
reg.coef_
reg.intercept_
reg.predict(X_test)
print('The R2 score (R squared) on the test set is:')
reg.score(X_test, y_test)
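# Optional sketch: report the error in the target's own units (grams)
# alongside R2; assumes 'reg' is still the fitted LinearRegression above.
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, reg.predict(X_test)))
print('RMSE (g):', rmse)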
from sklearn.ensemble import GradientBoostingRegressor
reg = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)
reg.predict(X_test[1:2])
print('The R2 score (R squared) on the test set is:')
reg.score(X_test, y_test)
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=9).fit(X_train, y_train)
neigh.predict(X_test)
print('The R2 score (R squared) on the test set is:')
neigh.score(X_test, y_test)
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
# SGD needs scaled inputs; a pipeline is the most convenient way to bundle
# scaling with the estimator. (X_train was already standardized above, so the
# pipeline's scaler is redundant here, though harmless.)
reg = make_pipeline(StandardScaler(),
                    SGDRegressor(max_iter=3000, tol=1e-3)).fit(X_train, y_train)
reg.predict(X_test[:2])
print('The R2 score (R squared) on the test set is:')
reg.score(X_test, y_test)
from sklearn.neural_network import MLPRegressor
regr = MLPRegressor(hidden_layer_sizes=(100,), random_state=1, max_iter=50000).fit(X_train, y_train)
regr.predict(X_test[:2])
print('The R2 score (R squared) on the test set is:')
regr.score(X_test, y_test)
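# Optional sketch: a side-by-side comparison of the candidate imputation
# models (refits each one; hyperparameters mirror the cells above).
for name, est in [
    ('LinearRegression', LinearRegression()),
    ('GradientBoosting', GradientBoostingRegressor(random_state=0)),
    ('KNN k=9', KNeighborsRegressor(n_neighbors=9)),
    ('SGD pipeline', make_pipeline(StandardScaler(), SGDRegressor(max_iter=3000, tol=1e-3))),
    ('MLP', MLPRegressor(hidden_layer_sizes=(100,), random_state=1, max_iter=50000)),
]:
    print(name, est.fit(X_train, y_train).score(X_test, y_test))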
null_X = null_df[list(null_df.columns[1:3]) + list(null_df.columns[18:])]
null_X = scaler.transform(null_X)  # reuse the scaler fitted on the training data
# 'reg' is the last model bound to that name above (the SGD pipeline);
# substitute whichever fitted model scored best if preferred
null_df['Peso (g)'] = reg.predict(null_X)
null_df.head(3)
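# Optional sanity check: the imputed weights should fall within the range
# of the observed ones.
print(clean_df['Peso (g)'].describe())
print(null_df['Peso (g)'].describe())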
df = pd.concat([clean_df, null_df])
df
# Features selected by position (column 5, the target, is excluded);
# target: 'Hg peces (ppm)', log-transformed
X = df[list(df.columns[1:3]) + [df.columns[4]] + list(df.columns[6:])]
y = df[df.columns[5]]
y = np.log(y)
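# Concentration data are typically right-skewed, hence the log transform;
# note that the R2 scores below are therefore computed on the log scale.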
X.head(2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1)
scaler = StandardScaler()
# Don't cheat - fit only on training data
scaler.fit(X_train)
X_train = scaler.transform(X_train)
# apply same transformation to test data
X_test = scaler.transform(X_test)
reg = LinearRegression().fit(X_train, y_train)
reg.coef_
reg.intercept_
reg.predict(X_test)
print('The R2 score (R squared) on the test set is:')
reg.score(X_test, y_test)
gbreg = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)
gbreg.predict(X_test[1:2])
print('The R2 score (R squared) on the test set is:')
gbreg.score(X_test, y_test)
# Useful for large datasets
from sklearn.ensemble import HistGradientBoostingRegressor
est = HistGradientBoostingRegressor().fit(X_train, y_train)
print('The R2 score (R squared) on the test set is:')
est.score(X_test, y_test)
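# Note: HistGradientBoostingRegressor has native support for missing values,
# so it could also be trained without the weight-imputation step above.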
from sklearn.ensemble import RandomForestRegressor
rfregr = RandomForestRegressor(max_depth=2, random_state=0).fit(X_train, y_train)
print('The R2 score (R squared) on the test set is:')
rfregr.score(X_test, y_test)
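# Optional sketch: max_depth=2 is a strong constraint; a small illustrative
# grid search over depth (the grid values are assumptions, not tuned results).
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(RandomForestRegressor(random_state=0),
                    {'max_depth': [2, 4, 8, None]}, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)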
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
# cross_val_score(regressor, X, y, cv=10) would give a cross-validated estimate
print('The R2 score (R squared) on the test set is:')
regressor.score(X_test, y_test)
neigh = KNeighborsRegressor(n_neighbors=9).fit(X_train, y_train)
neigh.predict(X_test)
print('The R2 score (R squared) on the test set is:')
neigh.score(X_test, y_test)
# X_train/X_test were already standardized above, so SGDRegressor can be fit
# directly here (otherwise, scale the input, most conveniently via a pipeline)
sgdreg = SGDRegressor(max_iter=3000, tol=1e-3).fit(X_train, y_train)
sgdreg.predict(X_test[:2])
print('The R2 score (R squared) on the test set is:')
sgdreg.score(X_test, y_test)
regr = MLPRegressor(hidden_layer_sizes=(100,), random_state=1, max_iter=50000).fit(X_train, y_train)
regr.predict(X_test[:2])
print('The R2 score (R squared) on the test set is:')
regr.score(X_test, y_test)
df[list(df.columns[1:3])+[df.columns[4]]+list(df.columns[6:])].head(2)
df_X = df[list(df.columns[1:3])+[df.columns[4]]+list(df.columns[6:])]
df_X = scaler.transform(df_X)
predicted_df = df.copy(deep=True)
predicted_df['Hg peces (ppm)'] = np.exp(gbreg.predict(df_X))  # invert the natural-log transform with exp to recover ppm
predicted_df['Predicted'] = 'Predicted'
gbreg_df = pd.concat([df, predicted_df])
gbreg_df.fillna('Real', inplace=True)  # rows from the original df get the 'Real' label
gbreg_df.head(3)
fig = px.scatter(
    gbreg_df, x="Lon(cm)", y="Hg peces (ppm)", color="Predicted",
    color_continuous_scale='viridis', template='none', trendline="ols",
    title='Gradient Boosting Model Predictions & Real Values'
)
fig.show()
df_X = df[list(df.columns[1:3])+[df.columns[4]]+list(df.columns[6:])]
df_X = scaler.transform(df_X)
predicted_df = df.copy(deep=True)
predicted_df['Hg peces (ppm)'] = np.exp(reg.predict(df_X))  # invert the natural-log transform with exp to recover ppm
predicted_df['Predicted'] = 'Predicted'
reg_df = pd.concat([df, predicted_df])
reg_df.fillna('Real', inplace=True)  # rows from the original df get the 'Real' label
fig = px.scatter(
    reg_df, x="Lon(cm)", y="Hg peces (ppm)", color="Predicted",
    color_continuous_scale='viridis', template='none', trendline="ols",
    title='Linear Model Predictions & Real Values'
)
fig.show()
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
df_X = df[list(df.columns[1:3])+[df.columns[4]]+list(df.columns[6:])]
feature_importance = gbreg.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + 0.5
fig = plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align="center")
plt.yticks(pos, np.array(df_X.columns)[sorted_idx])
plt.title("Gradient Boosting Feature Importance (MDI)")
result = permutation_importance(
    gbreg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)
sorted_idx = result.importances_mean.argsort()
plt.subplot(1, 2, 2)
plt.boxplot(
    result.importances[sorted_idx].T,
    vert=False,
    labels=np.array(df_X.columns)[sorted_idx],
)
plt.title("Gradient Boosting Permutation Importance (test set)")
fig.tight_layout()
plt.show()
result = permutation_importance(
    reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)
sorted_idx = result.importances_mean.argsort()
fig = plt.figure(figsize=(8, 6))  # start a new figure; the previous one was already shown
plt.boxplot(
    result.importances[sorted_idx].T,
    vert=False,
    labels=np.array(df_X.columns)[sorted_idx],
)
plt.title("Linear Model Permutation Importance (test set)")
fig.tight_layout()
plt.show()