XEducation
# Import libraries
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
# Data Visualization
from matplotlib.pyplot import xticks
%matplotlib inline
#Model Building
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
print(f"Python version {sys.version}")
print(f"pandas version: {pd.__version__}")
Importing & Data Cleaning
df = pd.DataFrame(pd.read_csv(r"/datasets/google-drive/XEducation_LeadScoring/XEducation_LeadsDataset.csv"))
pd.options.display.max_columns = None
display(df.head(10))
#Eliminamos llaves duplicadas
"""Eliminar filas que tengan ID duplicado, si las hay"""
print(df.shape)
df.drop_duplicates(subset=['Prospect ID'])
print(df.shape)
#Escaneamos la estructura de los datos
df.info()
#Escaneamos las estadísticas de los datos numéricos
df.describe()
#Eliminamos columnas de información seleccionadas
df_clean = df.drop(['Lead Number', 'Tags','Lead Quality', 'Lead Profile', 'Asymmetrique Activity Index', 'Asymmetrique Profile Index',
'Asymmetrique Activity Score', 'Asymmetrique Profile Score'], axis = 1)
pd.options.display.max_columns = None
display(df_clean.head(5))
#Trabajamos con los valores <Select> de nuestra base de datos
"""Convertimos los valores Select en NaN, pues no aportan nada de valor a nuestro análisis"""
df_clean = df_clean.replace('Select', np.nan)
round(100*(df_clean.isnull().sum()/len(df_clean.index)))
#Eliminamos la columna no representativa
df_clean = df_clean.drop(['How did you hear about X Education'], axis = 1)
pd.options.display.max_columns = None
display(df_clean.head(5))
df_clean.shape
Country
df_clean['Country'].describe()
"""Podemos cambiar los valores nulos por <India>, debido a que tiene una frecuencia superior al 50%"""
df_clean['Country'] = df_clean['Country'].replace(np.nan, 'India')
round(100*(df_clean['Country'].isnull().sum()/len(df_clean['Country'].index)))
Specialization
df_clean['Specialization'].describe()
"""Nuestro dato más alto no es representativo, por lo que englobamos el resto de opciones en una nueva categoría"""
df_clean['Specialization'] = df_clean['Specialization'].replace(np.nan, 'Other')
round(100*(df_clean['Specialization'].isnull().sum()/len(df_clean['Specialization'].index)))
What is your current ocupation
df_clean['What is your current occupation'].describe()
"""Podemos cambiar los valores nulos por <Unemployed>, debido a que tiene una frecuencia superior al 50%"""
df_clean['What is your current occupation'] = df_clean['What is your current occupation'].replace(np.nan, 'Unemployed')
round(100*(df_clean['What is your current occupation'].isnull().sum()/len(df_clean['What is your current occupation'].index)))
What matters most to you in choosing a course
df_clean['What matters most to you in choosing a course'].describe()
"""Podemos cambiar los valores nulos por <Better Career Prospects>, debido a que tiene una frecuencia superior al 50%"""
df_clean['What matters most to you in choosing a course'] = df_clean['What matters most to you in choosing a course'].replace(np.nan, 'Better Career Prospects')
round(100*(df_clean['What matters most to you in choosing a course'].isnull().sum()/len(df_clean['What matters most to you in choosing a course'].index)))
City
df_clean['City'].describe()
"""Podemos cambiar los valores nulos por <Mumbai>, debido a que tiene una frecuencia superior al 50%"""
df_clean['City'] = df_clean['City'].replace(np.nan, 'Mumbai')
round(100*(df_clean['City'].isnull().sum()/len(df_clean['City'].index)))
"""Por último, eliminamos las filas de las columnas que tienen valores NaN mínimos"""
df_xedu = df_clean.dropna()
round(100*(df_xedu.isnull().sum()/len(df_xedu.index)))
df_xedu.shape
df_xedu.to_csv('XEducation_Leads.csv')
Exploratory Data Analysis
#Evaluamos la columna objetivo
conv = (df_xedu['Converted'].sum() / len(df_xedu['Converted']) * 100)
no_conv = ((df_xedu['Converted']==0).sum() / len(df_xedu['Converted'])*100)
sns.barplot(x=['Convertidos', "No Convertidos"], y=[conv, no_conv])
print('% Convertidos: ' + str(round(conv,2)) + " - " + str(df_xedu['Converted'].sum()))
print('% No Convertidos: ' + str(round(no_conv, 2)) + " - " + str((df_xedu['Converted']==0).sum()))
Lead Origin
#Columna Lead Origin
sns.countplot(data = df_xedu, x = "Lead Origin", hue = "Converted")
xticks(rotation = 90)
Lead Source
#Columna Lead Source
sns.countplot(data = df_xedu, x = "Lead Source", hue = "Converted")
xticks(rotation = 90)
"""Englobamos las fuentes minoritarias en una sola fuente para obtener menos ruido"""
df_xedu['Lead Source'] = df_xedu['Lead Source'].replace('google', 'Google')
df_xedu['Lead Source'] = df_xedu['Lead Source'].replace(['blog', 'Pay per Click Ads', 'bing', 'Social Media',
'WeLearn', 'Click2call', 'Live Chat', 'welearnblog_Home', 'youtubechannel', 'testone', 'Press_Release', 'NC_EDM'], 'Others')
sns.countplot(data = df_xedu, x = "Lead Source", hue = "Converted")
xticks(rotation = 90)
Do Not Email
#Columna Do Not Email
sns.countplot(data = df_xedu, x = "Do Not Email", hue = "Converted")
xticks(rotation = 90)
Do Not Call
#Columna Do Not Call
sns.countplot(data = df_xedu, x = "Do Not Call", hue = "Converted")
xticks(rotation = 90)
TotalVisits
#Columna TotalVisits
sns.boxplot(data = df_xedu, x = 'Converted', y = 'TotalVisits')
#Reducimos los outliers
perc = df_xedu['TotalVisits'].quantile([0.05,0.90]).values
df_xedu['TotalVisits'][df_xedu['TotalVisits'] <= perc[0]] = perc[0]
df_xedu['TotalVisits'][df_xedu['TotalVisits'] >= perc[1]] = perc[1]
sns.boxplot(data = df_xedu, x = 'Converted', y = 'TotalVisits')
Total Time Spent on Website
#Columna Total Time Spent on Website
sns.boxplot(data = df_xedu, x = 'Converted', y = 'Total Time Spent on Website')
#Reducimos los outliers
perc = df_xedu['Total Time Spent on Website'].quantile([0.05,0.80]).values
df_xedu['Total Time Spent on Website'][df_xedu['Total Time Spent on Website'] <= perc[0]] = perc[0]
df_xedu['Total Time Spent on Website'][df_xedu['Total Time Spent on Website'] >= perc[1]] = perc[1]
sns.boxplot(data = df_xedu, x = 'Converted', y = 'Total Time Spent on Website')
Page Views Per Visit
#Columna Page Views Per Visit
sns.boxplot(data = df_xedu, x = 'Converted', y = 'Page Views Per Visit')
#Reducimos los outliers
perc = df_xedu['Page Views Per Visit'].quantile([0.05,0.90]).values
df_xedu['Page Views Per Visit'][df_xedu['Page Views Per Visit'] <= perc[0]] = perc[0]
df_xedu['Page Views Per Visit'][df_xedu['Page Views Per Visit'] >= perc[1]] = perc[1]
sns.boxplot(data = df_xedu, x = 'Converted', y = 'Page Views Per Visit')
Last Activity
#Columna Last Activity
sns.countplot(data = df_xedu, x = "Last Activity", hue = "Converted")
xticks(rotation = 90)
"""Englobamos las fuentes minoritarias en una sola fuente para obtener menos ruido"""
df_xedu['Last Activity'] = df_xedu['Last Activity'].replace(['Email Marked Spam', 'Email Received', 'Resubscribed to emails', 'Approached upfront',
'Visited Booth in Tradeshow', 'View in browser link Clicked'], 'Others')
sns.countplot(data = df_xedu, x = "Last Activity", hue = "Converted")
xticks(rotation = 90)
Country
#Columna Country
sns.countplot(data = df_xedu, x = "Country", hue = "Converted")
xticks(rotation = 90)
"""Englobamos las fuentes minoritarias en una sola fuente para obtener menos ruido"""
df_xedu['Country'] = df_xedu['Country'].replace(['Indonesia', 'Vietnam', 'Bangladesh', 'Philippines', 'Denmark', 'Switzerland', 'Liberia', 'Malaysia',
'unknown', 'Tanzania', 'South Africa', 'Italy', 'Kenya', 'Uganda', 'Asia/Pacific Region', 'Germany', 'Hong Kong', 'Nigeria', 'Sweden', 'Netherlands',
'Canada', 'China', 'Sri Lanka', 'France', 'Belgium', 'Qatar', 'Ghana', 'Bahrain', 'United Kingdom', 'Australia', 'Oman', 'Kuwait', 'Russia'], 'Others')
sns.countplot(data = df_xedu, x = "Country", hue = "Converted")
xticks(rotation = 90)
Specialization
#Columna Specialization
sns.countplot(data = df_xedu, x = "Specialization", hue = "Converted")
xticks(rotation = 90)
What is your current occupation
#Columna What is your current occupation
sns.countplot(data = df_xedu, x = "What is your current occupation", hue = "Converted")
xticks(rotation = 90)
What matters most to you in choosing a course
#Columna What matters most to you in choosing a course
sns.countplot(data = df_xedu, x = "What matters most to you in choosing a course", hue = "Converted")
xticks(rotation = 90)
Search
#Columna Search
sns.countplot(data = df_xedu, x = "Search", hue = "Converted")
xticks(rotation = 90)
Magazine
#Columna Magazine
sns.countplot(data = df_xedu, x = "Magazine", hue = "Converted")
xticks(rotation = 90)
Newspaper Article
#Columna Newspaper Article
sns.countplot(data = df_xedu, x = "Newspaper Article", hue = "Converted")
xticks(rotation = 90)
Digital Advertisement
#Columna Digital Advertisement
sns.countplot(data = df_xedu, x = "Digital Advertisement", hue = "Converted")
xticks(rotation = 90)
Through Recommendations
#Columna Through Recommendations
sns.countplot(data = df_xedu, x = "Through Recommendations", hue = "Converted")
xticks(rotation = 90)
Receive More Updates About Our Courses
#Columna Receive More Updates About Our Courses
sns.countplot(data = df_xedu, x = "Receive More Updates About Our Courses", hue = "Converted")
xticks(rotation = 90)
Update me on Supply Chain Content
#Columna Update me on Supply Chain Content
sns.countplot(data = df_xedu, x = "Update me on Supply Chain Content", hue = "Converted")
xticks(rotation = 90)
Get updates on DM Content
#Columna Get updates on DM Content
sns.countplot(data = df_xedu, x = "Get updates on DM Content", hue = "Converted")
xticks(rotation = 90)
City
#Columna City
sns.countplot(data = df_xedu, x = "City", hue = "Converted")
xticks(rotation = 90)
I agree to pay the amount through cheque
#Columna I agree to pay the amount through cheque
sns.countplot(data = df_xedu, x = "I agree to pay the amount through cheque", hue = "Converted")
xticks(rotation = 90)
A free copy of Mastering The Interview
#Columna A free copy of Mastering The Interview
sns.countplot(data = df_xedu, x = "A free copy of Mastering The Interview", hue = "Converted")
xticks(rotation = 90)
Last Notable Activity
#Columna Last Notable Activity
sns.countplot(data = df_xedu, x = "Last Notable Activity", hue = "Converted")
xticks(rotation = 90)
"""Englobamos las fuentes minoritarias en una sola fuente para obtener menos ruido"""
df_xedu['Last Notable Activity'] = df_xedu['Last Notable Activity'].replace(['Email Marked Spam', 'Email Received', 'Form Submitted on Website',
'View in browser link Clicked', 'Resubscribed to emails', 'Approached upfront'], 'Others')
sns.countplot(data = df_xedu, x = "Last Notable Activity", hue = "Converted")
xticks(rotation = 90)
X Education Forums
#Columna X Education Forums
sns.countplot(data = df_xedu, x = "X Education Forums", hue = "Converted")
xticks(rotation = 90)
Newspaper
#Columna Newspaper
sns.countplot(data = df_xedu, x = "Newspaper", hue = "Converted")
xticks(rotation = 90)
Data Preparation
#Eliminamos todas las columnas irrelevantes
df_xedu = df_xedu.drop(['Do Not Call', 'TotalVisits','Page Views Per Visit', 'Country', 'What matters most to you in choosing a course',
'Search', 'Magazine', 'Newspaper Article', 'Digital Advertisement', 'Through Recommendations', 'Receive More Updates About Our Courses',
'Update me on Supply Chain Content', 'Get updates on DM Content', 'I agree to pay the amount through cheque', 'X Education Forums', 'Newspaper'], axis = 1)
#Guardamos nuestro dataframe en una variable distinta para ocuparlo al final
df_final_test = df_xedu
pd.options.display.max_columns = None
display(df_xedu.head(5))
df_xedu.shape
df_xedu.info()
df_xedu.describe()
#Convertimos las variables binarias en datos numéricos
df_xedu['Do Not Email'] = df_xedu['Do Not Email'].replace('No', 0)
df_xedu['Do Not Email'] = df_xedu['Do Not Email'].replace('Yes', 1)
df_xedu['A free copy of Mastering The Interview'] = df_xedu['A free copy of Mastering The Interview'].replace('No', 0)
df_xedu['A free copy of Mastering The Interview'] = df_xedu['A free copy of Mastering The Interview'].replace('Yes', 1)
pd.options.display.max_columns = None
display(df_xedu.head(10))
#Cambiamos los nombres de las columnas para manipular el dataset con mayor simplicidad y orden
df_xedu.columns = ['id', 'lead_origin', 'lead_source', 'do_not_email', 'converted', 'total_time_on_website', 'last_activity', 'specialization',
'current_ocupation', 'city', 'free_copy_of_interview', 'last_notable_activity']
#Ahora convertiremos nuestra variable continua en variable categórica usando los cuartiles
df_xedu['total_time_on_website'].describe()
sns.boxplot(data = df_xedu, x = 'converted', y = 'total_time_on_website')
condition_muy_bajo = (df_xedu['total_time_on_website'].astype(int) >= 0) & (df_xedu['total_time_on_website'].astype(int) <= 11)
condition_bajo = (df_xedu['total_time_on_website'].astype(int) > 11) & (df_xedu['total_time_on_website'].astype(int) <= 246)
condition_medio = (df_xedu['total_time_on_website'].astype(int) > 246) & (df_xedu['total_time_on_website'].astype(int) <= 922)
condition_alto = df_xedu['total_time_on_website'].astype(int) > 922
df_xedu.loc[condition_muy_bajo, 'total_time_on_website'] = 'muy_bajo'
df_xedu.loc[condition_bajo, 'total_time_on_website'] = 'bajo'
df_xedu.loc[condition_medio, 'total_time_on_website'] = 'medio'
df_xedu.loc[condition_alto, 'total_time_on_website'] = 'alto'
pd.options.display.max_columns = None
display(df_xedu.head(10))
#Utilizamos get_dummies para convertir variables categóricas en variables numéricas
df_xedu_dummy = pd.get_dummies(df_xedu[['lead_origin', 'lead_source', 'total_time_on_website','last_activity', 'specialization',
'current_ocupation','city','last_notable_activity']])
df_xedu_dummy.head(10)
df_xedu = pd.concat([df_xedu, df_xedu_dummy], axis=1)
df_xedu = df_xedu.drop(['lead_origin', 'lead_source', 'total_time_on_website','last_activity', 'specialization',
'current_ocupation','city','last_notable_activity'], axis = 1)
df_xedu.head(10)
df_xedu.shape
#Intentamos identificar grados altos de correlación entre las variables mediante un heatmap
import random
def unico(x,L):
esUnico=True
for i in range(len(L)):
if x==L[i]:
esUnico=False
break
return esUnico
L=[]
j=0
while j<15:
x=random.randint(0,74)
if unico(x,L):
L.append(x)
j+=1
lista_corr = []
for i in L:
lista_corr.append(df_xedu.columns[i])
corr_matrix = df_xedu[lista_corr].corr()
sns.heatmap(corr_matrix, cmap="coolwarm")
#Importamos el dataframe final
df_xedu.to_csv('XEducation_Dummies.csv')
Model Building
#Observamos nuestro dataset
pd.options.display.max_columns = None
display(df_xedu.head(10))
#Cargamos las variables de entrada, excluyendo la columna "converted"
X = df_xedu.drop(['id','converted'], axis=1)
y = df_xedu['converted']
X.shape
#Ajustamos nuestro modelo a nuestras variables de entrada
model = linear_model.LogisticRegression()
model.fit(X,y)
#Dividimos nuestro dataset en un set de entrenamiento y un set de prueba
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, y, train_size=0.7, test_size=0.3, random_state = 100)
#Compilamos nuestro modelo con el set de entrenamiento
name = 'Score en el Set de Entrenamiento'
kfold = model_selection.KFold(n_splits = 10, random_state = 100, shuffle = True)
cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv = kfold, scoring = 'accuracy')
msg = "%s: %f" % (name, cv_results.mean())
#Grado de Certeza del Modelo aplicado al Set de Entrenamiento
print(msg)
#Grado de Certeza del Modelo aplicado al Set de Pruebas
name_s = 'Score en el Set de Pruebas'
predictions = model.predict(X_validation)
print("%s: %f" % (name_s, accuracy_score(Y_validation, predictions)))
#Plasmamos la Matriz de Confusión
print(confusion_matrix(Y_validation, predictions))
#Traemos de vuelta nuestro dataset original
column_prediction = model.predict(X)
df_final_test.insert(5, 'prediction', column_prediction)
column_probability = []
array = model.predict_proba(X)
lista = list(range(0,len(array),1))
for i in lista:
column_probability.append(array[i][1])
df_final_test.insert(6, 'probability', column_probability)
pd.options.display.max_columns = None
display(df_final_test)