XEducation

# Import libraries import sys import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from IPython.display import display # Data Visualization from matplotlib.pyplot import xticks %matplotlib inline #Model Building from sklearn.model_selection import train_test_split from sklearn import linear_model from sklearn import model_selection from sklearn.metrics import classification_report from sklearn.metrics import confusion_matrix from sklearn.metrics import accuracy_score print(f"Python version {sys.version}") print(f"pandas version: {pd.__version__}")

Python version 3.7.11 (default, Jun 29 2021, 20:40:03) 
[GCC 8.3.0]
pandas version: 1.2.5

Importing & Data Cleaning

# Load the raw leads dataset and preview the first ten rows with every column shown.
pd.options.display.max_columns = None
df = pd.read_csv(r"/datasets/google-drive/XEducation_LeadScoring/XEducation_LeadsDataset.csv")
display(df.head(10))

# Remove rows whose 'Prospect ID' is duplicated, if there are any, and print
# the shape before and after so the effect is visible.
print(df.shape)
# drop_duplicates returns a new frame; without assigning it back the call
# has no effect on df.
df = df.drop_duplicates(subset=['Prospect ID'])
print(df.shape)

(9240, 37)
(9240, 37)

# Inspect the structure of the data: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Prospect ID                                    9240 non-null   object 
 1   Lead Number                                    9240 non-null   int64  
 2   Lead Origin                                    9240 non-null   object 
 3   Lead Source                                    9204 non-null   object 
 4   Do Not Email                                   9240 non-null   object 
 5   Do Not Call                                    9240 non-null   object 
 6   Converted                                      9240 non-null   int64  
 7   TotalVisits                                    9103 non-null   float64
 8   Total Time Spent on Website                    9240 non-null   int64  
 9   Page Views Per Visit                           9103 non-null   float64
 10  Last Activity                                  9137 non-null   object 
 11  Country                                        6779 non-null   object 
 12  Specialization                                 7802 non-null   object 
 13  How did you hear about X Education             7033 non-null   object 
 14  What is your current occupation                6550 non-null   object 
 15  What matters most to you in choosing a course  6531 non-null   object 
 16  Search                                         9240 non-null   object 
 17  Magazine                                       9240 non-null   object 
 18  Newspaper Article                              9240 non-null   object 
 19  X Education Forums                             9240 non-null   object 
 20  Newspaper                                      9240 non-null   object 
 21  Digital Advertisement                          9240 non-null   object 
 22  Through Recommendations                        9240 non-null   object 
 23  Receive More Updates About Our Courses         9240 non-null   object 
 24  Tags                                           5887 non-null   object 
 25  Lead Quality                                   4473 non-null   object 
 26  Update me on Supply Chain Content              9240 non-null   object 
 27  Get updates on DM Content                      9240 non-null   object 
 28  Lead Profile                                   6531 non-null   object 
 29  City                                           7820 non-null   object 
 30  Asymmetrique Activity Index                    5022 non-null   object 
 31  Asymmetrique Profile Index                     5022 non-null   object 
 32  Asymmetrique Activity Score                    5022 non-null   float64
 33  Asymmetrique Profile Score                     5022 non-null   float64
 34  I agree to pay the amount through cheque       9240 non-null   object 
 35  A free copy of Mastering The Interview         9240 non-null   object 
 36  Last Notable Activity                          9240 non-null   object 
dtypes: float64(4), int64(3), object(30)
memory usage: 2.6+ MB

# Summary statistics of the numeric columns.
df.describe()

# Drop the selected columns that are left out of the analysis.
columns_to_drop = [
    'Lead Number',
    'Tags',
    'Lead Quality',
    'Lead Profile',
    'Asymmetrique Activity Index',
    'Asymmetrique Profile Index',
    'Asymmetrique Activity Score',
    'Asymmetrique Profile Score',
]
df_clean = df.drop(columns=columns_to_drop)
pd.options.display.max_columns = None
display(df_clean.head(5))

# Handle the 'Select' values in the dataset: they add nothing to the
# analysis, so they are converted to NaN.
df_clean = df_clean.replace('Select', np.nan)
# Percentage of missing values per column, rounded to whole percents.
missing_share = df_clean.isnull().sum() / len(df_clean.index)
round(100 * missing_share)

# Drop the column judged non-representative.
df_clean = df_clean.drop(columns=['How did you hear about X Education'])
pd.options.display.max_columns = None
display(df_clean.head(5))

# Show the (rows, columns) shape of the cleaned frame.
df_clean.shape

Country

# Summarize the 'Country' column before imputing its missing values.
df_clean['Country'].describe()

# Missing values are replaced with 'India'; the original note states that it
# accounts for more than 50% of the values.
country = df_clean['Country'].fillna('India')
df_clean['Country'] = country
# Remaining percentage of missing values in the column.
round(100 * (country.isnull().sum() / len(country.index)))

Specialization

# Summarize the 'Specialization' column before imputing its missing values.
df_clean['Specialization'].describe()

# The most frequent value is not representative here, so the missing entries
# are grouped into a new 'Other' category instead.
specialization = df_clean['Specialization'].fillna('Other')
df_clean['Specialization'] = specialization
# Remaining percentage of missing values in the column.
round(100 * (specialization.isnull().sum() / len(specialization.index)))

What is your current occupation

# Summarize the occupation column before imputing its missing values.
df_clean['What is your current occupation'].describe()

# Missing values are replaced with 'Unemployed'; the original note states
# that it accounts for more than 50% of the values.
occupation = df_clean['What is your current occupation'].fillna('Unemployed')
df_clean['What is your current occupation'] = occupation
# Remaining percentage of missing values in the column.
round(100 * (occupation.isnull().sum() / len(occupation.index)))

What matters most to you in choosing a course

# Summarize the course-motivation column before imputing its missing values.
df_clean['What matters most to you in choosing a course'].describe()

# Missing values are replaced with 'Better Career Prospects'; the original
# note states that it accounts for more than 50% of the values.
motivation = df_clean['What matters most to you in choosing a course'].fillna('Better Career Prospects')
df_clean['What matters most to you in choosing a course'] = motivation
# Remaining percentage of missing values in the column.
round(100 * (motivation.isnull().sum() / len(motivation.index)))

City

# Summarize the 'City' column before imputing its missing values.
df_clean['City'].describe()

# Missing values are replaced with 'Mumbai'; the original note states that it
# accounts for more than 50% of the values (TODO confirm against describe()).
city = df_clean['City'].fillna('Mumbai')
df_clean['City'] = city
# Remaining percentage of missing values in the column.
round(100 * (city.isnull().sum() / len(city.index)))

# Finally, drop the rows that still hold NaN in the columns where only a
# minimal number of values is missing.
# .copy() makes df_xedu an independent frame: the result of dropna() is
# tracked by pandas as a derived copy, and assigning columns on it later
# raises SettingWithCopyWarning.
df_xedu = df_clean.dropna().copy()
# Percentage of missing values per column (expected to be 0 everywhere).
round(100*(df_xedu.isnull().sum()/len(df_xedu.index)))

# Show the (rows, columns) shape after dropping rows with missing values.
df_xedu.shape

# Persist the cleaned dataset to CSV in the working directory.
df_xedu.to_csv('XEducation_Leads.csv')

Exploratory Data Analysis

# Evaluate the target column: percentage and count of converted and
# non-converted leads.
converted = df_xedu['Converted']
n_leads = len(converted)
n_converted = converted.sum()
n_not_converted = (converted == 0).sum()
conv = n_converted / n_leads * 100
no_conv = n_not_converted / n_leads * 100
sns.barplot(x=['Convertidos', "No Convertidos"], y=[conv, no_conv])
print(f"% Convertidos: {round(conv, 2)} - {n_converted}")
print(f"% No Convertidos: {round(no_conv, 2)} - {n_not_converted}")

% Convertidos: 37.86 - 3435
% No Convertidos: 62.14 - 5639

Lead Origin

# 'Lead Origin': number of leads per category, split by conversion outcome.
sns.countplot(x="Lead Origin", hue="Converted", data=df_xedu)
xticks(rotation=90)

Lead Source

# 'Lead Source': number of leads per category, split by conversion outcome.
sns.countplot(x="Lead Source", hue="Converted", data=df_xedu)
xticks(rotation=90)

# Fold the minority sources into a single 'Others' bucket to reduce noise,
# after unifying the 'google' / 'Google' spelling.
minor_sources = [
    'blog', 'Pay per Click Ads', 'bing', 'Social Media', 'WeLearn',
    'Click2call', 'Live Chat', 'welearnblog_Home', 'youtubechannel',
    'testone', 'Press_Release', 'NC_EDM',
]
lead_source = df_xedu['Lead Source'].replace('google', 'Google')
df_xedu['Lead Source'] = lead_source.replace(minor_sources, 'Others')
sns.countplot(x="Lead Source", hue="Converted", data=df_xedu)
xticks(rotation=90)

/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.

Do Not Email

# 'Do Not Email': number of leads per category, split by conversion outcome.
sns.countplot(x="Do Not Email", hue="Converted", data=df_xedu)
xticks(rotation=90)

Do Not Call

# 'Do Not Call': number of leads per category, split by conversion outcome.
sns.countplot(x="Do Not Call", hue="Converted", data=df_xedu)
xticks(rotation=90)

TotalVisits

# 'TotalVisits': distribution per conversion outcome.
sns.boxplot(x='Converted', y='TotalVisits', data=df_xedu)

# Reduce the outliers by capping 'TotalVisits' at its 5th and 90th percentiles.
perc = df_xedu['TotalVisits'].quantile([0.05, 0.90]).values
# Series.clip replaces the former chained-indexing writes
# (df[col][mask] = value), which pandas reports as SettingWithCopyWarning and
# which are not guaranteed to reach the underlying frame.
df_xedu['TotalVisits'] = df_xedu['TotalVisits'].clip(lower=perc[0], upper=perc[1])
sns.boxplot(data=df_xedu, x='Converted', y='TotalVisits')

/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/series.py:992: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(~key, value, inplace=True)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/series.py:992: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(~key, value, inplace=True)

Total Time Spent on Website

# 'Total Time Spent on Website': distribution per conversion outcome.
sns.boxplot(x='Converted', y='Total Time Spent on Website', data=df_xedu)

# Reduce the outliers by capping 'Total Time Spent on Website' at its 5th and
# 80th percentiles.
# NOTE(review): the upper bound here is the 80th percentile, while the other
# capped columns use the 90th - confirm this is intentional.
perc = df_xedu['Total Time Spent on Website'].quantile([0.05, 0.80]).values
# Series.clip replaces the former chained-indexing writes
# (df[col][mask] = value), which pandas reports as SettingWithCopyWarning and
# which are not guaranteed to reach the underlying frame.
df_xedu['Total Time Spent on Website'] = df_xedu['Total Time Spent on Website'].clip(
    lower=perc[0], upper=perc[1]
)
sns.boxplot(data=df_xedu, x='Converted', y='Total Time Spent on Website')

/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/series.py:992: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(~key, value, inplace=True)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/series.py:992: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(~key, value, inplace=True)

Page Views Per Visit

# 'Page Views Per Visit': distribution per conversion outcome.
sns.boxplot(x='Converted', y='Page Views Per Visit', data=df_xedu)

# Reduce the outliers by capping 'Page Views Per Visit' at its 5th and 90th
# percentiles.
perc = df_xedu['Page Views Per Visit'].quantile([0.05, 0.90]).values
# Series.clip replaces the former chained-indexing writes
# (df[col][mask] = value), which pandas reports as SettingWithCopyWarning and
# which are not guaranteed to reach the underlying frame.
df_xedu['Page Views Per Visit'] = df_xedu['Page Views Per Visit'].clip(
    lower=perc[0], upper=perc[1]
)
sns.boxplot(data=df_xedu, x='Converted', y='Page Views Per Visit')

/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/series.py:992: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(~key, value, inplace=True)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/series.py:992: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(~key, value, inplace=True)

Last Activity

# 'Last Activity': number of leads per category, split by conversion outcome.
sns.countplot(x="Last Activity", hue="Converted", data=df_xedu)
xticks(rotation=90)

# Fold the minority activities into a single 'Others' bucket to reduce noise.
minor_activities = [
    'Email Marked Spam',
    'Email Received',
    'Resubscribed to emails',
    'Approached upfront',
    'Visited Booth in Tradeshow',
    'View in browser link Clicked',
]
df_xedu['Last Activity'] = df_xedu['Last Activity'].replace(minor_activities, 'Others')
sns.countplot(x="Last Activity", hue="Converted", data=df_xedu)
xticks(rotation=90)

/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until

Country

# 'Country': number of leads per category, split by conversion outcome.
sns.countplot(x="Country", hue="Converted", data=df_xedu)
xticks(rotation=90)

# Fold the minority countries into a single 'Others' bucket to reduce noise.
minor_countries = [
    'Indonesia', 'Vietnam', 'Bangladesh', 'Philippines', 'Denmark',
    'Switzerland', 'Liberia', 'Malaysia', 'unknown', 'Tanzania',
    'South Africa', 'Italy', 'Kenya', 'Uganda', 'Asia/Pacific Region',
    'Germany', 'Hong Kong', 'Nigeria', 'Sweden', 'Netherlands', 'Canada',
    'China', 'Sri Lanka', 'France', 'Belgium', 'Qatar', 'Ghana', 'Bahrain',
    'United Kingdom', 'Australia', 'Oman', 'Kuwait', 'Russia',
]
df_xedu['Country'] = df_xedu['Country'].replace(minor_countries, 'Others')
sns.countplot(x="Country", hue="Converted", data=df_xedu)
xticks(rotation=90)

/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.

Specialization

# 'Specialization': number of leads per category, split by conversion outcome.
sns.countplot(x="Specialization", hue="Converted", data=df_xedu)
xticks(rotation=90)

What is your current occupation

# 'What is your current occupation': number of leads per category, split by
# conversion outcome.
sns.countplot(x="What is your current occupation", hue="Converted", data=df_xedu)
xticks(rotation=90)

What matters most to you in choosing a course

# 'What matters most to you in choosing a course': number of leads per
# category, split by conversion outcome.
sns.countplot(x="What matters most to you in choosing a course", hue="Converted", data=df_xedu)
xticks(rotation=90)

Search

# 'Search': number of leads per category, split by conversion outcome.
sns.countplot(x="Search", hue="Converted", data=df_xedu)
xticks(rotation=90)

Magazine

# 'Magazine': number of leads per category, split by conversion outcome.
sns.countplot(x="Magazine", hue="Converted", data=df_xedu)
xticks(rotation=90)

Newspaper Article

# 'Newspaper Article': number of leads per category, split by conversion outcome.
sns.countplot(x="Newspaper Article", hue="Converted", data=df_xedu)
xticks(rotation=90)

Digital Advertisement

# 'Digital Advertisement': number of leads per category, split by conversion
# outcome.
sns.countplot(x="Digital Advertisement", hue="Converted", data=df_xedu)
xticks(rotation=90)

Through Recommendations

# 'Through Recommendations': number of leads per category, split by conversion
# outcome.
sns.countplot(x="Through Recommendations", hue="Converted", data=df_xedu)
xticks(rotation=90)

Receive More Updates About Our Courses

# 'Receive More Updates About Our Courses': number of leads per category,
# split by conversion outcome.
sns.countplot(x="Receive More Updates About Our Courses", hue="Converted", data=df_xedu)
xticks(rotation=90)

Update me on Supply Chain Content

# 'Update me on Supply Chain Content': number of leads per category, split by
# conversion outcome.
sns.countplot(x="Update me on Supply Chain Content", hue="Converted", data=df_xedu)
xticks(rotation=90)

Get updates on DM Content

# 'Get updates on DM Content': number of leads per category, split by
# conversion outcome.
sns.countplot(x="Get updates on DM Content", hue="Converted", data=df_xedu)
xticks(rotation=90)

City

# 'City': number of leads per category, split by conversion outcome.
sns.countplot(x="City", hue="Converted", data=df_xedu)
xticks(rotation=90)

I agree to pay the amount through cheque

# 'I agree to pay the amount through cheque': number of leads per category,
# split by conversion outcome.
sns.countplot(x="I agree to pay the amount through cheque", hue="Converted", data=df_xedu)
xticks(rotation=90)

A free copy of Mastering The Interview

# 'A free copy of Mastering The Interview': number of leads per category,
# split by conversion outcome.
sns.countplot(x="A free copy of Mastering The Interview", hue="Converted", data=df_xedu)
xticks(rotation=90)

Last Notable Activity

# 'Last Notable Activity': number of leads per category, split by conversion
# outcome.
sns.countplot(x="Last Notable Activity", hue="Converted", data=df_xedu)
xticks(rotation=90)

# Fold the minority activities into a single 'Others' bucket to reduce noise.
minor_notable_activities = [
    'Email Marked Spam',
    'Email Received',
    'Form Submitted on Website',
    'View in browser link Clicked',
    'Resubscribed to emails',
    'Approached upfront',
]
df_xedu['Last Notable Activity'] = df_xedu['Last Notable Activity'].replace(
    minor_notable_activities, 'Others'
)
sns.countplot(x="Last Notable Activity", hue="Converted", data=df_xedu)
xticks(rotation=90)

/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until

X Education Forums

# 'X Education Forums': number of leads per category, split by conversion
# outcome.
sns.countplot(x="X Education Forums", hue="Converted", data=df_xedu)
xticks(rotation=90)

Newspaper

# 'Newspaper': number of leads per category, split by conversion outcome.
sns.countplot(x="Newspaper", hue="Converted", data=df_xedu)
xticks(rotation=90)

Data Preparation

# Drop every column judged irrelevant for the model.
irrelevant_columns = [
    'Do Not Call',
    'TotalVisits',
    'Page Views Per Visit',
    'Country',
    'What matters most to you in choosing a course',
    'Search',
    'Magazine',
    'Newspaper Article',
    'Digital Advertisement',
    'Through Recommendations',
    'Receive More Updates About Our Courses',
    'Update me on Supply Chain Content',
    'Get updates on DM Content',
    'I agree to pay the amount through cheque',
    'X Education Forums',
    'Newspaper',
]
df_xedu = df_xedu.drop(irrelevant_columns, axis=1)

# Keep the frame in a separate variable to use it at the end.
# .copy() is required: a plain assignment only aliases df_xedu, so the later
# in-place recoding, renaming and binning of df_xedu would also rewrite this
# supposedly saved frame.
df_final_test = df_xedu.copy()
pd.options.display.max_columns = None
display(df_xedu.head(5))

# Show the (rows, columns) shape after dropping the irrelevant columns.
df_xedu.shape

# Inspect the remaining columns, their non-null counts and dtypes.
df_xedu.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9074 entries, 0 to 9239
Data columns (total 12 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Prospect ID                             9074 non-null   object 
 1   Lead Origin                             9074 non-null   object 
 2   Lead Source                             9074 non-null   object 
 3   Do Not Email                            9074 non-null   object 
 4   Converted                               9074 non-null   int64  
 5   Total Time Spent on Website             9074 non-null   float64
 6   Last Activity                           9074 non-null   object 
 7   Specialization                          9074 non-null   object 
 8   What is your current occupation         9074 non-null   object 
 9   City                                    9074 non-null   object 
 10  A free copy of Mastering The Interview  9074 non-null   object 
 11  Last Notable Activity                   9074 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 1.2+ MB

# Summary statistics of the remaining numeric columns.
df_xedu.describe()

# Convert the binary Yes/No columns into numeric 1/0 values.
yes_no_codes = {'No': 0, 'Yes': 1}
for binary_column in ('Do Not Email', 'A free copy of Mastering The Interview'):
    df_xedu[binary_column] = df_xedu[binary_column].replace(yes_no_codes)
pd.options.display.max_columns = None
display(df_xedu.head(10))

# Rename the columns (positionally) so the dataset is simpler and tidier to
# work with.
short_names = [
    'id',
    'lead_origin',
    'lead_source',
    'do_not_email',
    'converted',
    'total_time_on_website',
    'last_activity',
    'specialization',
    'current_ocupation',
    'city',
    'free_copy_of_interview',
    'last_notable_activity',
]
df_xedu.columns = short_names

# Next the continuous variable is turned into a categorical one using its
# quartiles; start by looking at them.
df_xedu['total_time_on_website'].describe()

# Distribution of the time spent on the website per conversion outcome.
sns.boxplot(data = df_xedu, x = 'converted', y = 'total_time_on_website')

# Replace the numeric time on the website with one of four labelled bands.
# All masks are computed from the integer-cast values before any label is
# written, so the writes cannot affect each other.
time_as_int = df_xedu['total_time_on_website'].astype(int)
band_masks = {
    'muy_bajo': (time_as_int >= 0) & (time_as_int <= 11),
    'bajo': (time_as_int > 11) & (time_as_int <= 246),
    'medio': (time_as_int > 246) & (time_as_int <= 922),
    'alto': time_as_int > 922,
}
for band_label, band_mask in band_masks.items():
    df_xedu.loc[band_mask, 'total_time_on_website'] = band_label
pd.options.display.max_columns = None
display(df_xedu.head(10))

# Use get_dummies to turn the categorical variables into numeric indicator
# columns.
categorical_columns = [
    'lead_origin',
    'lead_source',
    'total_time_on_website',
    'last_activity',
    'specialization',
    'current_ocupation',
    'city',
    'last_notable_activity',
]
df_xedu_dummy = pd.get_dummies(df_xedu[categorical_columns])
df_xedu_dummy.head(10)

# Append the indicator columns and remove the categorical columns they were
# derived from.
encoded_columns = [
    'lead_origin',
    'lead_source',
    'total_time_on_website',
    'last_activity',
    'specialization',
    'current_ocupation',
    'city',
    'last_notable_activity',
]
df_xedu = pd.concat([df_xedu, df_xedu_dummy], axis=1).drop(columns=encoded_columns)
df_xedu.head(10)

# Show the (rows, columns) shape after one-hot encoding.
df_xedu.shape

# Try to spot highly correlated variables with a heatmap over a random subset
# of the columns.
import random


def unico(x, L):
    """Return True when *x* is not already present in *L*."""
    return x not in L


# Draw 15 distinct column positions (fewer if the frame has fewer columns).
# random.sample guarantees uniqueness and takes its range from the frame
# itself, instead of retrying random.randint against a hard-coded 0..74 range
# that silently breaks when the number of columns changes.
n_columns = len(df_xedu.columns)
L = random.sample(range(n_columns), min(15, n_columns))
lista_corr = [df_xedu.columns[i] for i in L]
corr_matrix = df_xedu[lista_corr].corr()
sns.heatmap(corr_matrix, cmap="coolwarm")

# Export the final, fully encoded dataframe to CSV.
df_xedu.to_csv('XEducation_Dummies.csv')

Model Building

# Take a look at the dataset that feeds the model.
pd.options.display.max_columns = None
display(df_xedu.head(10))

# Build the input features, leaving out the identifier and the 'converted'
# target, which becomes y.
y = df_xedu['converted']
X = df_xedu.drop(columns=['id', 'converted'])
X.shape

# Fit the model to the input variables.
# max_iter is raised above the default because lbfgs stops at the iteration
# limit on this data and emits a ConvergenceWarning.
# NOTE(review): this fit uses every row of X; any score later computed on a
# split taken from the same X is optimistic unless the model is refitted on
# the training part only.
model = linear_model.LogisticRegression(max_iter=1000)
model.fit(X, y)

/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)

# Split the dataset into a training set (70%) and a test set (30%) with a
# fixed seed.
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X,
    y,
    test_size=0.3,
    train_size=0.7,
    random_state=100,
)

# Estimate the accuracy on the training set with shuffled 10-fold
# cross-validation and keep the mean score as a printable message.
name = 'Score en el Set de Entrenamiento'
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=100)
cv_results = model_selection.cross_val_score(
    model, X_train, Y_train, scoring='accuracy', cv=kfold
)
msg = f"{name}: {cv_results.mean():f}"

/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)

# Accuracy of the model on the training set.
print(msg)

Score en el Set de Entrenamiento: 0.811368

# Accuracy of the model on the test set.
name_s = 'Score en el Set de Pruebas'
# Refit on the training split only. The model was previously fitted on the
# full X, y (cross_val_score works on clones and does not refit it), so it had
# already seen the validation rows and this score was inflated by leakage.
model.fit(X_train, Y_train)
predictions = model.predict(X_validation)
print("%s: %f" % (name_s, accuracy_score(Y_validation, predictions)))

Score en el Set de Pruebas: 0.822622

# Print the confusion matrix of the test-set predictions.
test_confusion = confusion_matrix(Y_validation, predictions)
print(test_confusion)

[[1494  240]
 [ 243  746]]

# Bring back the saved dataset and attach the model output to it.
column_prediction = model.predict(X)
# Column 1 of predict_proba is taken as the probability of conversion
# (assumes the model's classes are ordered 0, 1 - TODO confirm).
column_probability = list(model.predict_proba(X)[:, 1])
df_final_test.insert(5, 'prediction', column_prediction)
df_final_test.insert(6, 'probability', column_probability)
pd.options.display.max_columns = None
display(df_final_test)

XEducation

Importing & Data Cleaning

Country

Specialization

What is your current occupation

What matters most to you in choosing a course

City

Exploratory Data Analysis

Lead Origin

Lead Source

Do Not Email

Do Not Call

TotalVisits

Total Time Spent on Website

Page Views Per Visit

Last Activity

Country

Specialization

What is your current occupation

What matters most to you in choosing a course

Search

Magazine

Newspaper Article

Digital Advertisement

Through Recommendations

Receive More Updates About Our Courses

Update me on Supply Chain Content

Get updates on DM Content

City

I agree to pay the amount through cheque

A free copy of Mastering The Interview

Last Notable Activity

X Education Forums

Newspaper

Data Preparation

Model Building

Importing & Data Cleaning