import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
#sns.set_style('darkgrid')
plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')
np.random.seed(0)
syn_data = [{"Qo": np.random.uniform(5, 1501, 250), "TVD": np.random.uniform(100, 16001, 250),
"API": np.random.uniform(8, 15, 250), "Temperature": np.random.uniform(100, 351, 250),
"Azimuthal_Angle": np.random.uniform(0, 21, 250), "AL_Method": "Rod Pump"},
{"Qo": np.random.uniform(100, 10001, 200), "TVD": np.random.uniform(5000, 15001, 200),
"API": np.random.uniform(15, 30, 200), "Temperature": np.random.uniform(100, 401, 200),
"Azimuthal_Angle": np.random.uniform(0, 71, 200), "AL_Method": "Gas Lift"},
{"Qo": np.random.uniform(50, 4001, 165), "TVD": np.random.uniform(5000, 15001, 165),
"API": np.random.uniform(8, 30, 165), "Temperature": np.random.uniform(100, 501, 165),
"Azimuthal_Angle": np.random.uniform(0, 91, 165), "AL_Method": "Hydraulic Pump"},
{"Qo": np.random.uniform(100, 30001, 325), "TVD": np.random.uniform(5000, 15001, 325),
"API": np.random.uniform(10, 35, 325), "Temperature": np.random.uniform(100, 401, 325),
"Azimuthal_Angle": np.random.uniform(0, 81, 325), "AL_Method": "ESP"},
{"Qo": np.random.uniform(5, 2201, 200), "TVD": np.random.uniform(2000, 10001, 200),
"API": np.random.uniform(10, 35, 200), "Temperature": np.random.uniform(75, 251, 200),
"Azimuthal_Angle": np.random.uniform(0, 91, 200), "AL_Method": "PCP"}]
data = pd.DataFrame()
for values in syn_data:
    df = pd.DataFrame(values)
    data = pd.concat([data, df], ignore_index=True)
data
data.AL_Method.value_counts()
columns = ['Qo', 'TVD', 'API', 'Temperature', 'Azimuthal_Angle']
for column in columns:
    data[column] = data[column].round(decimals=3)
data
# Rename columns
data.rename(columns={'Qo':'Qo (bpd)', 'TVD':'TVD (ft)', 'Temperature':'Temperatura (F)',
'Azimuthal_Angle':'Ángulo de incl', 'AL_Method':'Método de LA'}, inplace=True)
data
data.info()
data.describe()
data["Qo (bpd)"] = data["Qo (bpd)"].astype('float64')
data["TVD (ft)"] = data["TVD (ft)"].astype('float64')
data["API"] = data["API"].astype('float64')
data["Temperatura (F)"] = data["Temperatura (F)"].astype('float64')
data["Ángulo de incl"] = data["Ángulo de incl"].astype('float64')
data.info()
data
fig, ax = plt.subplots(figsize=(14,8))
sns.heatmap(data.isnull(), cmap='gist_heat')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=12)
plt.show()
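# Quick numeric check to complement the heatmap above (added sketch; the
# synthetic dataset is expected to contain no missing values):
print(data.isnull().sum())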
# Pair plot
pair_plot = sns.pairplot(data=data, hue="Método de LA", height=1.5)
def box_plots(data, h_column, v_column):
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.boxplot(data=data, x=h_column, y=v_column, ax=ax)
    ax.set_xlabel("")
    ax.set_ylabel(v_column, fontsize=12)
    ax.set_xticklabels(ax.get_xticklabels(), fontsize=12)
    ax.set_title('Boxplot: {} VS Métodos de LA'.format(v_column), fontname="Times New Roman", size=16, fontweight='bold')
    plt.show()
props = ["Qo (bpd)", "TVD (ft)", "API", "Temperatura (F)", "Ángulo de incl"]
fig, ax = plt.subplots(len(props),1, figsize=(10,25))
for axes, prop in zip(ax, props):
    sns.boxplot(data=data, x="Método de LA", y=prop, ax=axes)
    axes.set_xlabel("")
    axes.set_ylabel(prop, fontsize=12)
    axes.set_xticklabels(axes.get_xticklabels(), fontsize=12)
    axes.set_title('Boxplot: {} VS Métodos de LA'.format(prop), fontname="Times New Roman", size=16, fontweight='bold')
plt.tight_layout()
sns.displot(data=data, x="Qo (bpd)", hue="Método de LA")
plt.show()
#fig, ax = plt.subplots(5,1, figsize=(8,20))
props = ["Qo (bpd)", "TVD (ft)", "API", "Temperatura (F)", "Ángulo de incl"]
fig, ax = plt.subplots(len(props),1, figsize=(10,25))
for axes, prop in zip(ax, props):
    sns.kdeplot(data=data, x=prop, fill=True, ax=axes)  # 'fill' replaces the deprecated 'shade' argument
    axes.set_xlabel(prop, fontsize=12)
    axes.set_title(f'Función de densidad de probabilidades: {prop}', fontname="Times New Roman", size=16, fontweight='bold')
plt.tight_layout()
# Pearson correlation
corr = data.corr(numeric_only=True)  # exclude the categorical 'Método de LA' column
# Heatmap of pearson's correlations
fig, ax = plt.subplots(figsize=(14,8))
sns.heatmap(data=corr, cmap='RdYlGn', annot=True, linewidths=0.01, linecolor='black', square=True, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=30, fontsize=12)
ax.set_yticklabels(ax.get_yticklabels(), rotation=30, fontsize=12)
plt.show()
# Feature selection: keep the first four columns (Qo, TVD, API, Temperatura) as
# features; 'Ángulo de incl' is not used and 'Método de LA' is the target
X = data.iloc[:, :4].values  # feature matrix
y = data.iloc[:, -1]         # target variable
# Split the data in training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
# Call the machine learning algorithms for classification
knn = KNeighborsClassifier()
decision_tree = DecisionTreeClassifier()
svc = SVC()
rf = RandomForestClassifier()
# Define the hyperparameters to tune for each algorithm
knn_params = {'n_neighbors': np.arange(1, 30)}
dt_params = {'max_depth': np.arange(2,20,1)}
svc_params= {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
rf_params = {'max_depth': np.arange(2, 20, 1)}  # max_depth must be a positive integer
# Function to train each algorithm
def training_model(model, param_grid):
    model = GridSearchCV(model, param_grid=param_grid, scoring='accuracy')  # 5-fold CV by default
    model.fit(X_train, y_train)
    return model
# KNN Algorithm
knn_model = training_model(knn, knn_params)
knn_model.best_params_
knn_model_final = knn_model.best_estimator_.fit(X_train, y_train)
# Prediction on test set
y_pred_knn = knn_model_final.predict(X_test)
# Accuracy of model
print(knn_model_final.score(X_train, y_train))
print(knn_model_final.score(X_test, y_test))
# Accuracy
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(accuracy_knn)
# Plot of Accuracy vs K values using the training and testing data
# Function to plot Accuracy vs main hyperparameter of each algorithm
def evaluation_plot_knn(init, stop, step, algorithm):
    fig, ax = plt.subplots(figsize=(15, 8))
    param = np.arange(init, stop, step)
    train_accuracy = np.empty(len(param))
    test_accuracy = np.empty(len(param))
    for i, k in enumerate(param):
        model = algorithm(n_neighbors=k)
        model.fit(X_train, y_train)
        train_accuracy[i] = model.score(X_train, y_train)
        test_accuracy[i] = model.score(X_test, y_test)
    ax.plot(param, test_accuracy, label='Prueba')
    ax.plot(param, train_accuracy, label='Entrenamiento')
    plt.legend(fontsize=12)
    ax.set_xlabel('K neighbors', size=12)
    ax.set_ylabel('Accuracy')
    ax.set_title('Accuracy Vs K neighbors', fontname="Times New Roman", size=16, fontweight='bold')
    plt.show()
evaluation_plot_knn(1, 30, 1, KNeighborsClassifier)
# Decision tree algorithm
dt_model = training_model(decision_tree, dt_params)
dt_model.best_params_
dt_model_final = dt_model.best_estimator_.fit(X_train, y_train)
# Prediction on test set
y_pred_dt = dt_model_final.predict(X_test)
# Accuracy of model
print(dt_model_final.score(X_train, y_train))
print(dt_model_final.score(X_test, y_test))
# Accuracy
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(accuracy_dt)
def evaluation_plot_tree(init, stop, step, algorithm):
    fig, ax = plt.subplots(figsize=(15, 8))
    max_depth = np.arange(init, stop, step)
    train_accuracy = np.empty(len(max_depth))
    test_accuracy = np.empty(len(max_depth))
    for i, k in enumerate(max_depth):
        Dt = algorithm(max_depth=k)
        Dt.fit(X_train, y_train)
        train_accuracy[i] = Dt.score(X_train, y_train)
        test_accuracy[i] = Dt.score(X_test, y_test)
    ax.plot(max_depth, test_accuracy, label='Prueba')
    ax.plot(max_depth, train_accuracy, label='Entrenamiento')
    plt.legend(fontsize=12)
    ax.set_xlabel('Max depth', size=12)
    ax.set_ylabel('Accuracy')
    ax.set_title('Accuracy Vs Max depth', fontname="Times New Roman", size=16, fontweight='bold')
    plt.show()
evaluation_plot_tree(2, 20, 1, DecisionTreeClassifier)
# Support vector classifier algorithm
svc_model = training_model(svc, svc_params)
svc_model.best_params_
svc_model_final = svc_model.best_estimator_.fit(X_train, y_train)
# Prediction on test set
y_pred_svc = svc_model_final.predict(X_test)
# Accuracy of model
print(svc_model_final.score(X_train, y_train))
print(svc_model_final.score(X_test, y_test))
# Accuracy
accuracy_svc = accuracy_score(y_test, y_pred_svc)
print(accuracy_svc)
fig, ax = plt.subplots(figsize=(15,8))
C_values = [0.001, 0.01, 0.1, 1, 10, 100]
train_accuracy = np.empty(len(C_values))
test_accuracy = np.empty(len(C_values))
for i, k in enumerate(C_values):
    svc = SVC(C=k)
    svc.fit(X_train, y_train)
    train_accuracy[i] = svc.score(X_train, y_train)
    test_accuracy[i] = svc.score(X_test, y_test)
ax.plot(C_values, test_accuracy, label='Prueba')
ax.plot(C_values, train_accuracy, label='Entrenamiento')
plt.legend(fontsize=12)
ax.set_xscale('log')  # C spans several orders of magnitude
ax.set_xlabel('C', fontsize=12)
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy Vs C', fontname="Times New Roman", size=16, fontweight='bold')
plt.show()
# Random forest classifier algorithm
rf_model = training_model(rf, rf_params)
rf_model.best_params_
rf_model_final = rf_model.best_estimator_.fit(X_train, y_train)
# Prediction on test set
y_pred_rf = rf_model_final.predict(X_test)
# Accuracy of model
print(rf_model_final.score(X_train, y_train))
print(rf_model_final.score(X_test, y_test))
# Accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(accuracy_rf)
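# Sketch (added, not in the original cells): the fitted random forest exposes
# feature_importances_, a quick way to see which of the four inputs drives the
# classification.
feature_names = ["Qo (bpd)", "TVD (ft)", "API", "Temperatura (F)"]
importances = pd.Series(rf_model_final.feature_importances_, index=feature_names)
print(importances.sort_values(ascending=False))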
def evaluation_plot_forest(init, stop, step, algorithm):
    fig, ax = plt.subplots(figsize=(15, 8))
    param = np.arange(init, stop, step)
    train_accuracy = np.empty(len(param))
    test_accuracy = np.empty(len(param))
    for i, k in enumerate(param):
        model = algorithm(max_depth=k)
        model.fit(X_train, y_train)
        train_accuracy[i] = model.score(X_train, y_train)
        test_accuracy[i] = model.score(X_test, y_test)
    ax.plot(param, test_accuracy, label='Prueba')
    ax.plot(param, train_accuracy, label='Entrenamiento')
    plt.legend(fontsize=12)
    ax.set_xlabel('Max depth', fontsize=12)
    ax.set_ylabel('Accuracy')
    ax.set_title('Accuracy Vs Max depth', fontname="Times New Roman", size=16, fontweight='bold')
    plt.show()
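# Illustrative call (an assumption, mirroring the KNN and decision tree plots
# above) over the same depth range used in the random forest grid search:
evaluation_plot_forest(2, 20, 1, RandomForestClassifier)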
accuracy = [accuracy_knn, accuracy_dt, accuracy_svc, accuracy_rf]
algorithms = ['KNN', 'Decision Tree', 'SVC', 'Random Forest']
model_select = pd.DataFrame({'Model': algorithms, 'Accuracy': accuracy})
model_select
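# Sketch (added as an assumption): accuracy alone can hide per-class behaviour,
# so a classification report shows precision and recall per lift method for the
# decision tree model used in the recommendations below.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_dt))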
def method_recommended(model, qo, tvd, api, temp):
    # Predict the artificial lift method for a single well and return the label
    prediction = model.predict(np.array([qo, tvd, api, temp]).reshape(1, -1))[0]
    print(f'The AL method recommended is: {prediction}')
    return prediction
predict_1 = method_recommended(dt_model_final, 720, 10355, 29.6, 214)
predict_1
predict_2 = method_recommended(dt_model_final, 166, 10085, 30, 232)
predict_2
predict_3 = method_recommended(dt_model_final, 1071, 9681, 24.1, 222)
predict_3
predict_4 = method_recommended(dt_model_final, 411.4, 9612, 30.8, 215)
predict_4
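# Sketch (assumption): the same recommendation can be produced for several
# wells at once by passing a 2-D array of [Qo, TVD, API, Temperatura] rows.
candidate_wells = np.array([[720, 10355, 29.6, 214],
                            [166, 10085, 30.0, 232]])
print(dt_model_final.predict(candidate_wells))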