XEducation
# Import libraries
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
# Data Visualization
from matplotlib.pyplot import xticks
%matplotlib inline
#Model Building
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
print(f"Python version {sys.version}")
print(f"pandas version: {pd.__version__}")
Python version 3.7.11 (default, Jun 29 2021, 20:40:03)
[GCC 8.3.0]
pandas version: 1.2.5
Importing & Data Cleaning
df = pd.DataFrame(pd.read_csv(r"/datasets/google-drive/XEducation_LeadScoring/XEducation_LeadsDataset.csv"))
pd.options.display.max_columns = None
display(df.head(10))
#Eliminamos llaves duplicadas
"""Eliminar filas que tengan ID duplicado, si las hay"""
print(df.shape)
df.drop_duplicates(subset=['Prospect ID'])
print(df.shape)
(9240, 37)
(9240, 37)
#Escaneamos la estructura de los datos
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Prospect ID 9240 non-null object
1 Lead Number 9240 non-null int64
2 Lead Origin 9240 non-null object
3 Lead Source 9204 non-null object
4 Do Not Email 9240 non-null object
5 Do Not Call 9240 non-null object
6 Converted 9240 non-null int64
7 TotalVisits 9103 non-null float64
8 Total Time Spent on Website 9240 non-null int64
9 Page Views Per Visit 9103 non-null float64
10 Last Activity 9137 non-null object
11 Country 6779 non-null object
12 Specialization 7802 non-null object
13 How did you hear about X Education 7033 non-null object
14 What is your current occupation 6550 non-null object
15 What matters most to you in choosing a course 6531 non-null object
16 Search 9240 non-null object
17 Magazine 9240 non-null object
18 Newspaper Article 9240 non-null object
19 X Education Forums 9240 non-null object
20 Newspaper 9240 non-null object
21 Digital Advertisement 9240 non-null object
22 Through Recommendations 9240 non-null object
23 Receive More Updates About Our Courses 9240 non-null object
24 Tags 5887 non-null object
25 Lead Quality 4473 non-null object
26 Update me on Supply Chain Content 9240 non-null object
27 Get updates on DM Content 9240 non-null object
28 Lead Profile 6531 non-null object
29 City 7820 non-null object
30 Asymmetrique Activity Index 5022 non-null object
31 Asymmetrique Profile Index 5022 non-null object
32 Asymmetrique Activity Score 5022 non-null float64
33 Asymmetrique Profile Score 5022 non-null float64
34 I agree to pay the amount through cheque 9240 non-null object
35 A free copy of Mastering The Interview 9240 non-null object
36 Last Notable Activity 9240 non-null object
dtypes: float64(4), int64(3), object(30)
memory usage: 2.6+ MB
#Escaneamos las estadísticas de los datos numéricos
df.describe()
#Eliminamos columnas de información seleccionadas
df_clean = df.drop(['Lead Number', 'Tags','Lead Quality', 'Lead Profile', 'Asymmetrique Activity Index', 'Asymmetrique Profile Index',
'Asymmetrique Activity Score', 'Asymmetrique Profile Score'], axis = 1)
pd.options.display.max_columns = None
display(df_clean.head(5))
#Trabajamos con los valores <Select> de nuestra base de datos
"""Convertimos los valores Select en NaN, pues no aportan nada de valor a nuestro análisis"""
df_clean = df_clean.replace('Select', np.nan)
round(100*(df_clean.isnull().sum()/len(df_clean.index)))
#Eliminamos la columna no representativa
df_clean = df_clean.drop(['How did you hear about X Education'], axis = 1)
pd.options.display.max_columns = None
display(df_clean.head(5))
df_clean.shape
Country
df_clean['Country'].describe()
"""Podemos cambiar los valores nulos por <India>, debido a que tiene una frecuencia superior al 50%"""
df_clean['Country'] = df_clean['Country'].replace(np.nan, 'India')
round(100*(df_clean['Country'].isnull().sum()/len(df_clean['Country'].index)))
Specialization
df_clean['Specialization'].describe()
"""Nuestro dato más alto no es representativo, por lo que englobamos el resto de opciones en una nueva categoría"""
df_clean['Specialization'] = df_clean['Specialization'].replace(np.nan, 'Other')
round(100*(df_clean['Specialization'].isnull().sum()/len(df_clean['Specialization'].index)))
What is your current ocupation
df_clean['What is your current occupation'].describe()
"""Podemos cambiar los valores nulos por <Unemployed>, debido a que tiene una frecuencia superior al 50%"""
df_clean['What is your current occupation'] = df_clean['What is your current occupation'].replace(np.nan, 'Unemployed')
round(100*(df_clean['What is your current occupation'].isnull().sum()/len(df_clean['What is your current occupation'].index)))
What matters most to you in choosing a course
df_clean['What matters most to you in choosing a course'].describe()
"""Podemos cambiar los valores nulos por <Better Career Prospects>, debido a que tiene una frecuencia superior al 50%"""
df_clean['What matters most to you in choosing a course'] = df_clean['What matters most to you in choosing a course'].replace(np.nan, 'Better Career Prospects')
round(100*(df_clean['What matters most to you in choosing a course'].isnull().sum()/len(df_clean['What matters most to you in choosing a course'].index)))
City
df_clean['City'].describe()
"""Podemos cambiar los valores nulos por <Mumbai>, debido a que tiene una frecuencia superior al 50%"""
df_clean['City'] = df_clean['City'].replace(np.nan, 'Mumbai')
round(100*(df_clean['City'].isnull().sum()/len(df_clean['City'].index)))
"""Por último, eliminamos las filas de las columnas que tienen valores NaN mínimos"""
df_xedu = df_clean.dropna()
round(100*(df_xedu.isnull().sum()/len(df_xedu.index)))
df_xedu.shape
df_xedu.to_csv('XEducation_Leads.csv')
Exploratory Data Analysis
#Evaluamos la columna objetivo
conv = (df_xedu['Converted'].sum() / len(df_xedu['Converted']) * 100)
no_conv = ((df_xedu['Converted']==0).sum() / len(df_xedu['Converted'])*100)
sns.barplot(x=['Convertidos', "No Convertidos"], y=[conv, no_conv])
print('% Convertidos: ' + str(round(conv,2)) + " - " + str(df_xedu['Converted'].sum()))
print('% No Convertidos: ' + str(round(no_conv, 2)) + " - " + str((df_xedu['Converted']==0).sum()))
% Convertidos: 37.86 - 3435
% No Convertidos: 62.14 - 5639
Lead Origin
#Columna Lead Origin
sns.countplot(data = df_xedu, x = "Lead Origin", hue = "Converted")
xticks(rotation = 90)
Lead Source
#Columna Lead Source
sns.countplot(data = df_xedu, x = "Lead Source", hue = "Converted")
xticks(rotation = 90)
"""Englobamos las fuentes minoritarias en una sola fuente para obtener menos ruido"""
df_xedu['Lead Source'] = df_xedu['Lead Source'].replace('google', 'Google')
df_xedu['Lead Source'] = df_xedu['Lead Source'].replace(['blog', 'Pay per Click Ads', 'bing', 'Social Media',
'WeLearn', 'Click2call', 'Live Chat', 'welearnblog_Home', 'youtubechannel', 'testone', 'Press_Release', 'NC_EDM'], 'Others')
sns.countplot(data = df_xedu, x = "Lead Source", hue = "Converted")
xticks(rotation = 90)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
after removing the cwd from sys.path.
Do Not Email
#Columna Do Not Email
sns.countplot(data = df_xedu, x = "Do Not Email", hue = "Converted")
xticks(rotation = 90)
Do Not Call
#Columna Do Not Call
sns.countplot(data = df_xedu, x = "Do Not Call", hue = "Converted")
xticks(rotation = 90)
TotalVisits
#Columna TotalVisits
sns.boxplot(data = df_xedu, x = 'Converted', y = 'TotalVisits')
#Reducimos los outliers
perc = df_xedu['TotalVisits'].quantile([0.05,0.90]).values
df_xedu['TotalVisits'][df_xedu['TotalVisits'] <= perc[0]] = perc[0]
df_xedu['TotalVisits'][df_xedu['TotalVisits'] >= perc[1]] = perc[1]
sns.boxplot(data = df_xedu, x = 'Converted', y = 'TotalVisits')
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
This is separate from the ipykernel package so we can avoid doing imports until
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/series.py:992: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self._where(~key, value, inplace=True)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
after removing the cwd from sys.path.
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/series.py:992: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self._where(~key, value, inplace=True)
Total Time Spent on Website
#Columna Total Time Spent on Website
sns.boxplot(data = df_xedu, x = 'Converted', y = 'Total Time Spent on Website')
#Reducimos los outliers
perc = df_xedu['Total Time Spent on Website'].quantile([0.05,0.80]).values
df_xedu['Total Time Spent on Website'][df_xedu['Total Time Spent on Website'] <= perc[0]] = perc[0]
df_xedu['Total Time Spent on Website'][df_xedu['Total Time Spent on Website'] >= perc[1]] = perc[1]
sns.boxplot(data = df_xedu, x = 'Converted', y = 'Total Time Spent on Website')
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
This is separate from the ipykernel package so we can avoid doing imports until
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/series.py:992: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self._where(~key, value, inplace=True)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
after removing the cwd from sys.path.
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/series.py:992: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self._where(~key, value, inplace=True)
Page Views Per Visit
#Columna Page Views Per Visit
sns.boxplot(data = df_xedu, x = 'Converted', y = 'Page Views Per Visit')
#Reducimos los outliers
perc = df_xedu['Page Views Per Visit'].quantile([0.05,0.90]).values
df_xedu['Page Views Per Visit'][df_xedu['Page Views Per Visit'] <= perc[0]] = perc[0]
df_xedu['Page Views Per Visit'][df_xedu['Page Views Per Visit'] >= perc[1]] = perc[1]
sns.boxplot(data = df_xedu, x = 'Converted', y = 'Page Views Per Visit')
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
This is separate from the ipykernel package so we can avoid doing imports until
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/series.py:992: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self._where(~key, value, inplace=True)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
after removing the cwd from sys.path.
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/series.py:992: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self._where(~key, value, inplace=True)
Last Activity
#Columna Last Activity
sns.countplot(data = df_xedu, x = "Last Activity", hue = "Converted")
xticks(rotation = 90)
"""Englobamos las fuentes minoritarias en una sola fuente para obtener menos ruido"""
df_xedu['Last Activity'] = df_xedu['Last Activity'].replace(['Email Marked Spam', 'Email Received', 'Resubscribed to emails', 'Approached upfront',
'Visited Booth in Tradeshow', 'View in browser link Clicked'], 'Others')
sns.countplot(data = df_xedu, x = "Last Activity", hue = "Converted")
xticks(rotation = 90)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
This is separate from the ipykernel package so we can avoid doing imports until
Country
#Columna Country
sns.countplot(data = df_xedu, x = "Country", hue = "Converted")
xticks(rotation = 90)
"""Englobamos las fuentes minoritarias en una sola fuente para obtener menos ruido"""
df_xedu['Country'] = df_xedu['Country'].replace(['Indonesia', 'Vietnam', 'Bangladesh', 'Philippines', 'Denmark', 'Switzerland', 'Liberia', 'Malaysia',
'unknown', 'Tanzania', 'South Africa', 'Italy', 'Kenya', 'Uganda', 'Asia/Pacific Region', 'Germany', 'Hong Kong', 'Nigeria', 'Sweden', 'Netherlands',
'Canada', 'China', 'Sri Lanka', 'France', 'Belgium', 'Qatar', 'Ghana', 'Bahrain', 'United Kingdom', 'Australia', 'Oman', 'Kuwait', 'Russia'], 'Others')
sns.countplot(data = df_xedu, x = "Country", hue = "Converted")
xticks(rotation = 90)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
after removing the cwd from sys.path.
Specialization
#Columna Specialization
sns.countplot(data = df_xedu, x = "Specialization", hue = "Converted")
xticks(rotation = 90)
What is your current occupation
#Columna What is your current occupation
sns.countplot(data = df_xedu, x = "What is your current occupation", hue = "Converted")
xticks(rotation = 90)
What matters most to you in choosing a course
#Columna What matters most to you in choosing a course
sns.countplot(data = df_xedu, x = "What matters most to you in choosing a course", hue = "Converted")
xticks(rotation = 90)
Search
#Columna Search
sns.countplot(data = df_xedu, x = "Search", hue = "Converted")
xticks(rotation = 90)
Magazine
#Columna Magazine
sns.countplot(data = df_xedu, x = "Magazine", hue = "Converted")
xticks(rotation = 90)
Newspaper Article
#Columna Newspaper Article
sns.countplot(data = df_xedu, x = "Newspaper Article", hue = "Converted")
xticks(rotation = 90)
Digital Advertisement
#Columna Digital Advertisement
sns.countplot(data = df_xedu, x = "Digital Advertisement", hue = "Converted")
xticks(rotation = 90)
Through Recommendations
#Columna Through Recommendations
sns.countplot(data = df_xedu, x = "Through Recommendations", hue = "Converted")
xticks(rotation = 90)
Receive More Updates About Our Courses
#Columna Receive More Updates About Our Courses
sns.countplot(data = df_xedu, x = "Receive More Updates About Our Courses", hue = "Converted")
xticks(rotation = 90)
Update me on Supply Chain Content
#Columna Update me on Supply Chain Content
sns.countplot(data = df_xedu, x = "Update me on Supply Chain Content", hue = "Converted")
xticks(rotation = 90)
Get updates on DM Content
#Columna Get updates on DM Content
sns.countplot(data = df_xedu, x = "Get updates on DM Content", hue = "Converted")
xticks(rotation = 90)
City
#Columna City
sns.countplot(data = df_xedu, x = "City", hue = "Converted")
xticks(rotation = 90)
I agree to pay the amount through cheque
#Columna I agree to pay the amount through cheque
sns.countplot(data = df_xedu, x = "I agree to pay the amount through cheque", hue = "Converted")
xticks(rotation = 90)
A free copy of Mastering The Interview
#Columna A free copy of Mastering The Interview
sns.countplot(data = df_xedu, x = "A free copy of Mastering The Interview", hue = "Converted")
xticks(rotation = 90)
Last Notable Activity
#Columna Last Notable Activity
sns.countplot(data = df_xedu, x = "Last Notable Activity", hue = "Converted")
xticks(rotation = 90)
"""Englobamos las fuentes minoritarias en una sola fuente para obtener menos ruido"""
df_xedu['Last Notable Activity'] = df_xedu['Last Notable Activity'].replace(['Email Marked Spam', 'Email Received', 'Form Submitted on Website',
'View in browser link Clicked', 'Resubscribed to emails', 'Approached upfront'], 'Others')
sns.countplot(data = df_xedu, x = "Last Notable Activity", hue = "Converted")
xticks(rotation = 90)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
This is separate from the ipykernel package so we can avoid doing imports until
X Education Forums
#Columna X Education Forums
sns.countplot(data = df_xedu, x = "X Education Forums", hue = "Converted")
xticks(rotation = 90)
Newspaper
#Columna Newspaper
sns.countplot(data = df_xedu, x = "Newspaper", hue = "Converted")
xticks(rotation = 90)
Data Preparation
#Eliminamos todas las columnas irrelevantes
df_xedu = df_xedu.drop(['Do Not Call', 'TotalVisits','Page Views Per Visit', 'Country', 'What matters most to you in choosing a course',
'Search', 'Magazine', 'Newspaper Article', 'Digital Advertisement', 'Through Recommendations', 'Receive More Updates About Our Courses',
'Update me on Supply Chain Content', 'Get updates on DM Content', 'I agree to pay the amount through cheque', 'X Education Forums', 'Newspaper'], axis = 1)
#Guardamos nuestro dataframe en una variable distinta para ocuparlo al final
df_final_test = df_xedu
pd.options.display.max_columns = None
display(df_xedu.head(5))
df_xedu.shape
df_xedu.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9074 entries, 0 to 9239
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Prospect ID 9074 non-null object
1 Lead Origin 9074 non-null object
2 Lead Source 9074 non-null object
3 Do Not Email 9074 non-null object
4 Converted 9074 non-null int64
5 Total Time Spent on Website 9074 non-null float64
6 Last Activity 9074 non-null object
7 Specialization 9074 non-null object
8 What is your current occupation 9074 non-null object
9 City 9074 non-null object
10 A free copy of Mastering The Interview 9074 non-null object
11 Last Notable Activity 9074 non-null object
dtypes: float64(1), int64(1), object(10)
memory usage: 1.2+ MB
df_xedu.describe()
#Convertimos las variables binarias en datos numéricos
df_xedu['Do Not Email'] = df_xedu['Do Not Email'].replace('No', 0)
df_xedu['Do Not Email'] = df_xedu['Do Not Email'].replace('Yes', 1)
df_xedu['A free copy of Mastering The Interview'] = df_xedu['A free copy of Mastering The Interview'].replace('No', 0)
df_xedu['A free copy of Mastering The Interview'] = df_xedu['A free copy of Mastering The Interview'].replace('Yes', 1)
pd.options.display.max_columns = None
display(df_xedu.head(10))
#Cambiamos los nombres de las columnas para manipular el dataset con mayor simplicidad y orden
df_xedu.columns = ['id', 'lead_origin', 'lead_source', 'do_not_email', 'converted', 'total_time_on_website', 'last_activity', 'specialization',
'current_ocupation', 'city', 'free_copy_of_interview', 'last_notable_activity']
#Ahora convertiremos nuestra variable continua en variable categórica usando los cuartiles
df_xedu['total_time_on_website'].describe()
sns.boxplot(data = df_xedu, x = 'converted', y = 'total_time_on_website')
condition_muy_bajo = (df_xedu['total_time_on_website'].astype(int) >= 0) & (df_xedu['total_time_on_website'].astype(int) <= 11)
condition_bajo = (df_xedu['total_time_on_website'].astype(int) > 11) & (df_xedu['total_time_on_website'].astype(int) <= 246)
condition_medio = (df_xedu['total_time_on_website'].astype(int) > 246) & (df_xedu['total_time_on_website'].astype(int) <= 922)
condition_alto = df_xedu['total_time_on_website'].astype(int) > 922
df_xedu.loc[condition_muy_bajo, 'total_time_on_website'] = 'muy_bajo'
df_xedu.loc[condition_bajo, 'total_time_on_website'] = 'bajo'
df_xedu.loc[condition_medio, 'total_time_on_website'] = 'medio'
df_xedu.loc[condition_alto, 'total_time_on_website'] = 'alto'
pd.options.display.max_columns = None
display(df_xedu.head(10))
#Utilizamos get_dummies para convertir variables categóricas en variables numéricas
df_xedu_dummy = pd.get_dummies(df_xedu[['lead_origin', 'lead_source', 'total_time_on_website','last_activity', 'specialization',
'current_ocupation','city','last_notable_activity']])
df_xedu_dummy.head(10)
df_xedu = pd.concat([df_xedu, df_xedu_dummy], axis=1)
df_xedu = df_xedu.drop(['lead_origin', 'lead_source', 'total_time_on_website','last_activity', 'specialization',
'current_ocupation','city','last_notable_activity'], axis = 1)
df_xedu.head(10)
df_xedu.shape
#Intentamos identificar grados altos de correlación entre las variables mediante un heatmap
import random
def unico(x,L):
esUnico=True
for i in range(len(L)):
if x==L[i]:
esUnico=False
break
return esUnico
L=[]
j=0
while j<15:
x=random.randint(0,74)
if unico(x,L):
L.append(x)
j+=1
lista_corr = []
for i in L:
lista_corr.append(df_xedu.columns[i])
corr_matrix = df_xedu[lista_corr].corr()
sns.heatmap(corr_matrix, cmap="coolwarm")
#Importamos el dataframe final
df_xedu.to_csv('XEducation_Dummies.csv')
Model Building
#Observamos nuestro dataset
pd.options.display.max_columns = None
display(df_xedu.head(10))
#Cargamos las variables de entrada, excluyendo la columna "converted"
X = df_xedu.drop(['id','converted'], axis=1)
y = df_xedu['converted']
X.shape
#Ajustamos nuestro modelo a nuestras variables de entrada
model = linear_model.LogisticRegression()
model.fit(X,y)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
#Dividimos nuestro dataset en un set de entrenamiento y un set de prueba
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, y, train_size=0.7, test_size=0.3, random_state = 100)
#Compilamos nuestro modelo con el set de entrenamiento
name = 'Score en el Set de Entrenamiento'
kfold = model_selection.KFold(n_splits = 10, random_state = 100, shuffle = True)
cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv = kfold, scoring = 'accuracy')
msg = "%s: %f" % (name, cv_results.mean())
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
#Grado de Certeza del Modelo aplicado al Set de Entrenamiento
print(msg)
Score en el Set de Entrenamiento: 0.811368
#Grado de Certeza del Modelo aplicado al Set de Pruebas
name_s = 'Score en el Set de Pruebas'
predictions = model.predict(X_validation)
print("%s: %f" % (name_s, accuracy_score(Y_validation, predictions)))
Score en el Set de Pruebas: 0.822622
#Plasmamos la Matriz de Confusión
print(confusion_matrix(Y_validation, predictions))
[[1494 240]
[ 243 746]]
#Traemos de vuelta nuestro dataset original
column_prediction = model.predict(X)
df_final_test.insert(5, 'prediction', column_prediction)
column_probability = []
array = model.predict_proba(X)
lista = list(range(0,len(array),1))
for i in lista:
column_probability.append(array[i][1])
df_final_test.insert(6, 'probability', column_probability)
pd.options.display.max_columns = None
display(df_final_test)