XEducation

# Import libraries import sys import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from IPython.display import display # Data Visualization from matplotlib.pyplot import xticks %matplotlib inline #Model Building from sklearn.model_selection import train_test_split from sklearn import linear_model from sklearn import model_selection from sklearn.metrics import classification_report from sklearn.metrics import confusion_matrix from sklearn.metrics import accuracy_score print(f"Python version {sys.version}") print(f"pandas version: {pd.__version__}")

Importing & Data Cleaning

df = pd.DataFrame(pd.read_csv(r"/datasets/google-drive/XEducation_LeadScoring/XEducation_LeadsDataset.csv")) pd.options.display.max_columns = None display(df.head(10))

#Eliminamos llaves duplicadas """Eliminar filas que tengan ID duplicado, si las hay""" print(df.shape) df.drop_duplicates(subset=['Prospect ID']) print(df.shape)

#Escaneamos la estructura de los datos df.info()

#Escaneamos las estadísticas de los datos numéricos df.describe()

#Eliminamos columnas de información seleccionadas df_clean = df.drop(['Lead Number', 'Tags','Lead Quality', 'Lead Profile', 'Asymmetrique Activity Index', 'Asymmetrique Profile Index', 'Asymmetrique Activity Score', 'Asymmetrique Profile Score'], axis = 1) pd.options.display.max_columns = None display(df_clean.head(5))

#Trabajamos con los valores <Select> de nuestra base de datos """Convertimos los valores Select en NaN, pues no aportan nada de valor a nuestro análisis""" df_clean = df_clean.replace('Select', np.nan) round(100*(df_clean.isnull().sum()/len(df_clean.index)))

#Eliminamos la columna no representativa df_clean = df_clean.drop(['How did you hear about X Education'], axis = 1) pd.options.display.max_columns = None display(df_clean.head(5))

df_clean.shape

Country

df_clean['Country'].describe()

"""Podemos cambiar los valores nulos por <India>, debido a que tiene una frecuencia superior al 50%""" df_clean['Country'] = df_clean['Country'].replace(np.nan, 'India') round(100*(df_clean['Country'].isnull().sum()/len(df_clean['Country'].index)))

Specialization

df_clean['Specialization'].describe()

"""Nuestro dato más alto no es representativo, por lo que englobamos el resto de opciones en una nueva categoría""" df_clean['Specialization'] = df_clean['Specialization'].replace(np.nan, 'Other') round(100*(df_clean['Specialization'].isnull().sum()/len(df_clean['Specialization'].index)))

What is your current ocupation

df_clean['What is your current occupation'].describe()

"""Podemos cambiar los valores nulos por <Unemployed>, debido a que tiene una frecuencia superior al 50%""" df_clean['What is your current occupation'] = df_clean['What is your current occupation'].replace(np.nan, 'Unemployed') round(100*(df_clean['What is your current occupation'].isnull().sum()/len(df_clean['What is your current occupation'].index)))

What matters most to you in choosing a course

df_clean['What matters most to you in choosing a course'].describe()

"""Podemos cambiar los valores nulos por <Better Career Prospects>, debido a que tiene una frecuencia superior al 50%""" df_clean['What matters most to you in choosing a course'] = df_clean['What matters most to you in choosing a course'].replace(np.nan, 'Better Career Prospects') round(100*(df_clean['What matters most to you in choosing a course'].isnull().sum()/len(df_clean['What matters most to you in choosing a course'].index)))

City

df_clean['City'].describe()

"""Podemos cambiar los valores nulos por <Mumbai>, debido a que tiene una frecuencia superior al 50%""" df_clean['City'] = df_clean['City'].replace(np.nan, 'Mumbai') round(100*(df_clean['City'].isnull().sum()/len(df_clean['City'].index)))

"""Por último, eliminamos las filas de las columnas que tienen valores NaN mínimos""" df_xedu = df_clean.dropna() round(100*(df_xedu.isnull().sum()/len(df_xedu.index)))

df_xedu.shape

df_xedu.to_csv('XEducation_Leads.csv')

Exploratory Data Analysis

#Evaluamos la columna objetivo conv = (df_xedu['Converted'].sum() / len(df_xedu['Converted']) * 100) no_conv = ((df_xedu['Converted']==0).sum() / len(df_xedu['Converted'])*100) sns.barplot(x=['Convertidos', "No Convertidos"], y=[conv, no_conv]) print('% Convertidos: ' + str(round(conv,2)) + " - " + str(df_xedu['Converted'].sum())) print('% No Convertidos: ' + str(round(no_conv, 2)) + " - " + str((df_xedu['Converted']==0).sum()))

Lead Origin

#Columna Lead Origin sns.countplot(data = df_xedu, x = "Lead Origin", hue = "Converted") xticks(rotation = 90)

Lead Source

#Columna Lead Source sns.countplot(data = df_xedu, x = "Lead Source", hue = "Converted") xticks(rotation = 90)

"""Englobamos las fuentes minoritarias en una sola fuente para obtener menos ruido""" df_xedu['Lead Source'] = df_xedu['Lead Source'].replace('google', 'Google') df_xedu['Lead Source'] = df_xedu['Lead Source'].replace(['blog', 'Pay per Click Ads', 'bing', 'Social Media', 'WeLearn', 'Click2call', 'Live Chat', 'welearnblog_Home', 'youtubechannel', 'testone', 'Press_Release', 'NC_EDM'], 'Others') sns.countplot(data = df_xedu, x = "Lead Source", hue = "Converted") xticks(rotation = 90)

Do Not Email

#Columna Do Not Email sns.countplot(data = df_xedu, x = "Do Not Email", hue = "Converted") xticks(rotation = 90)

Do Not Call

#Columna Do Not Call sns.countplot(data = df_xedu, x = "Do Not Call", hue = "Converted") xticks(rotation = 90)

TotalVisits

#Columna TotalVisits sns.boxplot(data = df_xedu, x = 'Converted', y = 'TotalVisits')

#Reducimos los outliers perc = df_xedu['TotalVisits'].quantile([0.05,0.90]).values df_xedu['TotalVisits'][df_xedu['TotalVisits'] <= perc[0]] = perc[0] df_xedu['TotalVisits'][df_xedu['TotalVisits'] >= perc[1]] = perc[1] sns.boxplot(data = df_xedu, x = 'Converted', y = 'TotalVisits')

Total Time Spent on Website

#Columna Total Time Spent on Website sns.boxplot(data = df_xedu, x = 'Converted', y = 'Total Time Spent on Website')

#Reducimos los outliers perc = df_xedu['Total Time Spent on Website'].quantile([0.05,0.80]).values df_xedu['Total Time Spent on Website'][df_xedu['Total Time Spent on Website'] <= perc[0]] = perc[0] df_xedu['Total Time Spent on Website'][df_xedu['Total Time Spent on Website'] >= perc[1]] = perc[1] sns.boxplot(data = df_xedu, x = 'Converted', y = 'Total Time Spent on Website')

Page Views Per Visit

#Columna Page Views Per Visit sns.boxplot(data = df_xedu, x = 'Converted', y = 'Page Views Per Visit')

#Reducimos los outliers perc = df_xedu['Page Views Per Visit'].quantile([0.05,0.90]).values df_xedu['Page Views Per Visit'][df_xedu['Page Views Per Visit'] <= perc[0]] = perc[0] df_xedu['Page Views Per Visit'][df_xedu['Page Views Per Visit'] >= perc[1]] = perc[1] sns.boxplot(data = df_xedu, x = 'Converted', y = 'Page Views Per Visit')

Last Activity

#Columna Last Activity sns.countplot(data = df_xedu, x = "Last Activity", hue = "Converted") xticks(rotation = 90)

"""Englobamos las fuentes minoritarias en una sola fuente para obtener menos ruido""" df_xedu['Last Activity'] = df_xedu['Last Activity'].replace(['Email Marked Spam', 'Email Received', 'Resubscribed to emails', 'Approached upfront', 'Visited Booth in Tradeshow', 'View in browser link Clicked'], 'Others') sns.countplot(data = df_xedu, x = "Last Activity", hue = "Converted") xticks(rotation = 90)

Country

#Columna Country sns.countplot(data = df_xedu, x = "Country", hue = "Converted") xticks(rotation = 90)

"""Englobamos las fuentes minoritarias en una sola fuente para obtener menos ruido""" df_xedu['Country'] = df_xedu['Country'].replace(['Indonesia', 'Vietnam', 'Bangladesh', 'Philippines', 'Denmark', 'Switzerland', 'Liberia', 'Malaysia', 'unknown', 'Tanzania', 'South Africa', 'Italy', 'Kenya', 'Uganda', 'Asia/Pacific Region', 'Germany', 'Hong Kong', 'Nigeria', 'Sweden', 'Netherlands', 'Canada', 'China', 'Sri Lanka', 'France', 'Belgium', 'Qatar', 'Ghana', 'Bahrain', 'United Kingdom', 'Australia', 'Oman', 'Kuwait', 'Russia'], 'Others') sns.countplot(data = df_xedu, x = "Country", hue = "Converted") xticks(rotation = 90)

Specialization

#Columna Specialization sns.countplot(data = df_xedu, x = "Specialization", hue = "Converted") xticks(rotation = 90)

What is your current occupation

#Columna What is your current occupation sns.countplot(data = df_xedu, x = "What is your current occupation", hue = "Converted") xticks(rotation = 90)

What matters most to you in choosing a course

#Columna What matters most to you in choosing a course sns.countplot(data = df_xedu, x = "What matters most to you in choosing a course", hue = "Converted") xticks(rotation = 90)

Search

#Columna Search sns.countplot(data = df_xedu, x = "Search", hue = "Converted") xticks(rotation = 90)

Magazine

#Columna Magazine sns.countplot(data = df_xedu, x = "Magazine", hue = "Converted") xticks(rotation = 90)

Newspaper Article

#Columna Newspaper Article sns.countplot(data = df_xedu, x = "Newspaper Article", hue = "Converted") xticks(rotation = 90)

Digital Advertisement

#Columna Digital Advertisement sns.countplot(data = df_xedu, x = "Digital Advertisement", hue = "Converted") xticks(rotation = 90)

Through Recommendations

#Columna Through Recommendations sns.countplot(data = df_xedu, x = "Through Recommendations", hue = "Converted") xticks(rotation = 90)

Receive More Updates About Our Courses

#Columna Receive More Updates About Our Courses sns.countplot(data = df_xedu, x = "Receive More Updates About Our Courses", hue = "Converted") xticks(rotation = 90)

Update me on Supply Chain Content

#Columna Update me on Supply Chain Content sns.countplot(data = df_xedu, x = "Update me on Supply Chain Content", hue = "Converted") xticks(rotation = 90)

Get updates on DM Content

#Columna Get updates on DM Content sns.countplot(data = df_xedu, x = "Get updates on DM Content", hue = "Converted") xticks(rotation = 90)

City

#Columna City sns.countplot(data = df_xedu, x = "City", hue = "Converted") xticks(rotation = 90)

I agree to pay the amount through cheque

#Columna I agree to pay the amount through cheque sns.countplot(data = df_xedu, x = "I agree to pay the amount through cheque", hue = "Converted") xticks(rotation = 90)

A free copy of Mastering The Interview

#Columna A free copy of Mastering The Interview sns.countplot(data = df_xedu, x = "A free copy of Mastering The Interview", hue = "Converted") xticks(rotation = 90)

Last Notable Activity

#Columna Last Notable Activity sns.countplot(data = df_xedu, x = "Last Notable Activity", hue = "Converted") xticks(rotation = 90)

"""Englobamos las fuentes minoritarias en una sola fuente para obtener menos ruido""" df_xedu['Last Notable Activity'] = df_xedu['Last Notable Activity'].replace(['Email Marked Spam', 'Email Received', 'Form Submitted on Website', 'View in browser link Clicked', 'Resubscribed to emails', 'Approached upfront'], 'Others') sns.countplot(data = df_xedu, x = "Last Notable Activity", hue = "Converted") xticks(rotation = 90)

X Education Forums

#Columna X Education Forums sns.countplot(data = df_xedu, x = "X Education Forums", hue = "Converted") xticks(rotation = 90)

Newspaper

#Columna Newspaper sns.countplot(data = df_xedu, x = "Newspaper", hue = "Converted") xticks(rotation = 90)

Data Preparation

#Eliminamos todas las columnas irrelevantes df_xedu = df_xedu.drop(['Do Not Call', 'TotalVisits','Page Views Per Visit', 'Country', 'What matters most to you in choosing a course', 'Search', 'Magazine', 'Newspaper Article', 'Digital Advertisement', 'Through Recommendations', 'Receive More Updates About Our Courses', 'Update me on Supply Chain Content', 'Get updates on DM Content', 'I agree to pay the amount through cheque', 'X Education Forums', 'Newspaper'], axis = 1) #Guardamos nuestro dataframe en una variable distinta para ocuparlo al final df_final_test = df_xedu pd.options.display.max_columns = None display(df_xedu.head(5))

df_xedu.shape

df_xedu.info()

df_xedu.describe()

#Convertimos las variables binarias en datos numéricos df_xedu['Do Not Email'] = df_xedu['Do Not Email'].replace('No', 0) df_xedu['Do Not Email'] = df_xedu['Do Not Email'].replace('Yes', 1) df_xedu['A free copy of Mastering The Interview'] = df_xedu['A free copy of Mastering The Interview'].replace('No', 0) df_xedu['A free copy of Mastering The Interview'] = df_xedu['A free copy of Mastering The Interview'].replace('Yes', 1) pd.options.display.max_columns = None display(df_xedu.head(10))

#Cambiamos los nombres de las columnas para manipular el dataset con mayor simplicidad y orden df_xedu.columns = ['id', 'lead_origin', 'lead_source', 'do_not_email', 'converted', 'total_time_on_website', 'last_activity', 'specialization', 'current_ocupation', 'city', 'free_copy_of_interview', 'last_notable_activity']

#Ahora convertiremos nuestra variable continua en variable categórica usando los cuartiles df_xedu['total_time_on_website'].describe()

sns.boxplot(data = df_xedu, x = 'converted', y = 'total_time_on_website')

condition_muy_bajo = (df_xedu['total_time_on_website'].astype(int) >= 0) & (df_xedu['total_time_on_website'].astype(int) <= 11) condition_bajo = (df_xedu['total_time_on_website'].astype(int) > 11) & (df_xedu['total_time_on_website'].astype(int) <= 246) condition_medio = (df_xedu['total_time_on_website'].astype(int) > 246) & (df_xedu['total_time_on_website'].astype(int) <= 922) condition_alto = df_xedu['total_time_on_website'].astype(int) > 922 df_xedu.loc[condition_muy_bajo, 'total_time_on_website'] = 'muy_bajo' df_xedu.loc[condition_bajo, 'total_time_on_website'] = 'bajo' df_xedu.loc[condition_medio, 'total_time_on_website'] = 'medio' df_xedu.loc[condition_alto, 'total_time_on_website'] = 'alto' pd.options.display.max_columns = None display(df_xedu.head(10))

#Utilizamos get_dummies para convertir variables categóricas en variables numéricas df_xedu_dummy = pd.get_dummies(df_xedu[['lead_origin', 'lead_source', 'total_time_on_website','last_activity', 'specialization', 'current_ocupation','city','last_notable_activity']]) df_xedu_dummy.head(10)

df_xedu = pd.concat([df_xedu, df_xedu_dummy], axis=1) df_xedu = df_xedu.drop(['lead_origin', 'lead_source', 'total_time_on_website','last_activity', 'specialization', 'current_ocupation','city','last_notable_activity'], axis = 1) df_xedu.head(10)

df_xedu.shape

#Intentamos identificar grados altos de correlación entre las variables mediante un heatmap import random def unico(x,L): esUnico=True for i in range(len(L)): if x==L[i]: esUnico=False break return esUnico L=[] j=0 while j<15: x=random.randint(0,74) if unico(x,L): L.append(x) j+=1 lista_corr = [] for i in L: lista_corr.append(df_xedu.columns[i]) corr_matrix = df_xedu[lista_corr].corr() sns.heatmap(corr_matrix, cmap="coolwarm")

#Importamos el dataframe final df_xedu.to_csv('XEducation_Dummies.csv')

Model Building

#Observamos nuestro dataset pd.options.display.max_columns = None display(df_xedu.head(10))

#Cargamos las variables de entrada, excluyendo la columna "converted" X = df_xedu.drop(['id','converted'], axis=1) y = df_xedu['converted'] X.shape

#Ajustamos nuestro modelo a nuestras variables de entrada model = linear_model.LogisticRegression() model.fit(X,y)

#Dividimos nuestro dataset en un set de entrenamiento y un set de prueba X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, y, train_size=0.7, test_size=0.3, random_state = 100)

#Compilamos nuestro modelo con el set de entrenamiento name = 'Score en el Set de Entrenamiento' kfold = model_selection.KFold(n_splits = 10, random_state = 100, shuffle = True) cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv = kfold, scoring = 'accuracy') msg = "%s: %f" % (name, cv_results.mean())

#Grado de Certeza del Modelo aplicado al Set de Entrenamiento print(msg)

#Grado de Certeza del Modelo aplicado al Set de Pruebas name_s = 'Score en el Set de Pruebas' predictions = model.predict(X_validation) print("%s: %f" % (name_s, accuracy_score(Y_validation, predictions)))

#Plasmamos la Matriz de Confusión print(confusion_matrix(Y_validation, predictions))

#Traemos de vuelta nuestro dataset original column_prediction = model.predict(X) df_final_test.insert(5, 'prediction', column_prediction) column_probability = [] array = model.predict_proba(X) lista = list(range(0,len(array),1)) for i in lista: column_probability.append(array[i][1]) df_final_test.insert(6, 'probability', column_probability) pd.options.display.max_columns = None display(df_final_test)

.css-15w88e5{color:var(--chakra-colors-fg-neutral-primary);font-weight:inherit;letter-spacing:-0.09px;}XEducation

Importing &amp; Data Cleaning

Country

Specialization

What is your current ocupation

What matters most to you in choosing a course

City

Exploratory Data Analysis

Lead Origin

Lead Source

Do Not Email

Do Not Call

TotalVisits

Total Time Spent on Website

Page Views Per Visit

Last Activity

Country

Specialization

What is your current occupation

What matters most to you in choosing a course

Search

Magazine

Newspaper Article

Digital Advertisement

Through Recommendations

Receive More Updates About Our Courses

Update me on Supply Chain Content

Get updates on DM Content

City

I agree to pay the amount through cheque

A free copy of Mastering The Interview

Last Notable Activity

X Education Forums

Newspaper

Data Preparation

Model Building

XEducation

Importing & Data Cleaning