¿Cómo funcionan las conexiones a bases de datos en Python?
import pandas as pd
import os
from datetime import datetime
import time
pd.options.display.max_columns = None
# Cargamos la base
df = pd.read_csv('marketing_data.csv')
# Arreglando el formato de la fecha en la columna Dt_Customer
def fix_fecha(x):
mes, dia, año = x[0], x[1], x[2]
fecha = mes + '/' + dia + '/20' + año
fecha = datetime.strptime(fecha, '%m/%d/%Y').date()
return fecha
df.Dt_Customer = df.Dt_Customer.apply(lambda x: fix_fecha(x.split('/')))
# Arreglando el formato numerico de la columna Income
df = df.rename(columns={' Income ': 'Income'})
for i in ('$', ',', '.'):
df.Income = df.Income.str.replace(i, '')
df.Income = pd.to_numeric(df.Income)
df.Income = df.Income.fillna(0)
# Incrementando el tamaño del DataFrame
for i in range(0, 1):
print(len(df))
df = pd.concat([df, df], ignore_index=True)
print(len(df))
2240
4480
<ipython-input-34-9cb816784541>:15: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.
df.Income = df.Income.str.replace(i, '')
df.head()
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4480 entries, 0 to 4479
Data columns (total 28 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 4480 non-null int64
1 Year_Birth 4480 non-null int64
2 Education 4480 non-null object
3 Marital_Status 4480 non-null object
4 Income 4480 non-null float64
5 Kidhome 4480 non-null int64
6 Teenhome 4480 non-null int64
7 Dt_Customer 4480 non-null object
8 Recency 4480 non-null int64
9 MntWines 4480 non-null int64
10 MntFruits 4480 non-null int64
11 MntMeatProducts 4480 non-null int64
12 MntFishProducts 4480 non-null int64
13 MntSweetProducts 4480 non-null int64
14 MntGoldProds 4480 non-null int64
15 NumDealsPurchases 4480 non-null int64
16 NumWebPurchases 4480 non-null int64
17 NumCatalogPurchases 4480 non-null int64
18 NumStorePurchases 4480 non-null int64
19 NumWebVisitsMonth 4480 non-null int64
20 AcceptedCmp3 4480 non-null int64
21 AcceptedCmp4 4480 non-null int64
22 AcceptedCmp5 4480 non-null int64
23 AcceptedCmp1 4480 non-null int64
24 AcceptedCmp2 4480 non-null int64
25 Response 4480 non-null int64
26 Complain 4480 non-null int64
27 Country 4480 non-null object
dtypes: float64(1), int64(23), object(4)
memory usage: 980.1+ KB
Conexion Estandar Mediante el Cursor de PyMySQL
import pymysql.cursors
connection = pymysql.connect(host='localhost',
user='root',
password='12345',
database='Marketing')
cursor = connection.cursor()
sql = "SHOW TABLES"
cursor.execute(sql)
result = cursor.fetchall()
print(result)
()
query_create_table = """
CREATE TABLE Marketing (
ID INT NULL,
Year_Birth FLOAT NULL,
Education VARCHAR(250) NULL,
Marital_Status VARCHAR(250) NULL,
Income FLOAT NULL,
Kidhome FLOAT NULL,
Teenhome FLOAT NULL,
Dt_Customer DATE NULL,
Recency FLOAT NULL,
MntWines FLOAT NULL,
MntFruits FLOAT NULL,
MntMeatProducts FLOAT NULL,
MntFishProducts FLOAT NULL,
MntSweetProducts FLOAT NULL,
MntGoldProds FLOAT NULL,
NumDealsPurchases FLOAT NULL,
NumWebPurchases FLOAT NULL,
NumCatalogPurchases FLOAT NULL,
NumStorePurchases FLOAT NULL,
NumWebVisitsMonth FLOAT NULL,
AcceptedCmp3 FLOAT NULL,
AcceptedCmp4 FLOAT NULL,
AcceptedCmp5 FLOAT NULL,
AcceptedCmp1 FLOAT NULL,
AcceptedCmp2 FLOAT NULL,
Response FLOAT NULL,
Complain FLOAT NULL,
Country VARCHAR(250) NULL
)
"""
cursor.execute(query_create_table)
result = cursor.fetchone()
sql = "SHOW TABLES"
cursor.execute(sql)
result = cursor.fetchall()
print(result)
(('marketing',),)
# Cramos el nombre de las columnas
name_colum = "`,`".join([str(i) for i in df.columns.tolist()])
# iteramos en cada linea del df para tomar cada uno de los valores e ingestarlos
inicio = time.time()
for i, row in df.iterrows():
sql = "INSERT INTO `Marketing` (`" +name_colum + "`) VALUES (" + "%s,"*(len(row)-1) + "%s)"
cursor.execute(sql, tuple(row))
connection.commit()
fin = time.time()
print(fin-inicio)
11.69935917854309
Conexion con SQLAlchemy y Pandas
from sqlalchemy import create_engine
engine = create_engine("mysql+pymysql://{user}:{pw}@localhost/{db}"
.format(user="root",
pw="12345",
db="Marketing"))
inicio = time.time()
df.to_sql('Marketing2', engine, chunksize=500000,
method='multi', index=False, if_exists='append')
fin = time.time()
print(fin-inicio)
3.0159225463867188
C:\Users\flavi_rlghtvx\anaconda3\lib\site-packages\pandas\io\sql.py:1444: UserWarning: The provided table name 'Marketing2' is not found exactly as such in the database after writing the table, possibly due to case sensitivity issues. Consider using lower case table names.
warnings.warn(msg, UserWarning)
El Caso especial en Accion
from sqlalchemy import create_engine
import pyodbc
pyodbc.drivers()
import urllib
connection_string = (
'DRIVER=MySQL ODBC 8.0 Unicode Driver;'
'SERVER=localhost;'
'DATABASE=Marketing;'
'UID=root;'
'PWD=12345;'
)
params = urllib.parse.quote_plus(connection_string)
engine = create_engine("mysql+pyodbc:///?odbc_connect={}".format(params))
engine.connect()
from sqlalchemy import event
@event.listens_for(engine, "before_cursor_execute")
def receive_before_cursor_execute(
conn, cursor, statement, params, context, executemany
):
if executemany:
cursor.fast_executemany = True
# creando un buen chucksize
chunksize = (len(df) // 4) + 2
# ejecutando la ingesta
inicio = time.time()
df.to_sql('Marketing3', engine, chunksize=chunksize,
index=False, if_exists='append')
fin = time.time()
print(fin-inicio)
1.32609224319458
C:\Users\flavi_rlghtvx\anaconda3\lib\site-packages\pandas\io\sql.py:1444: UserWarning: The provided table name 'Marketing3' is not found exactly as such in the database after writing the table, possibly due to case sensitivity issues. Consider using lower case table names.
warnings.warn(msg, UserWarning)
Comparando
def pymysql_ins(df):
import pymysql.cursors
connection = pymysql.connect(host='localhost',
user='root',
password='280214',
database='DataGalaxy')
cursor = connection.cursor()
name_colum = "`,`".join([str(i) for i in df.columns.tolist()])
inicio = time.time()
for i, row in df.iterrows():
sql = "INSERT INTO `Marketing` (`" +name_colum + "`) VALUES (" + "%s,"*(len(row)-1) + "%s)"
cursor.execute(sql, tuple(row))
connection.commit()
fin = time.time()
tiempo_inser = fin-inicio
tamaño_df = len(df)
tipo = 'pymysql'
list_tiempo_inser.append(tiempo_inser)
list_tamaño_df.append(tamaño_df)
list_tipo.append(tipo)
def pan_alch_ins(df):
from sqlalchemy import create_engine
engine = create_engine("mysql+pymysql://{user}:{pw}@localhost/{db}"
.format(user="root",
pw="12345",
db="Marketing"))
inicio = time.time()
chunksize = (len(df) // 4) + 2
df.to_sql('Marketing2', engine, chunksize=chunksize,
method='multi', index=False, if_exists='append')
fin = time.time()
tiempo_inser = fin-inicio
tamaño_df = len(df)
tipo = 'pan_alch'
list_tiempo_inser.append(tiempo_inser)
list_tamaño_df.append(tamaño_df)
list_tipo.append(tipo)
def pyodbc_ins(df):
from sqlalchemy import create_engine
from sqlalchemy import event
import pyodbc
import urllib
connection_string = (
'DRIVER=MySQL ODBC 8.0 Unicode Driver;'
'SERVER=localhost;'
'DATABASE=Marketing;'
'UID=root;'
'PWD=12345;'
)
params = urllib.parse.quote_plus(connection_string)
engine = create_engine("mysql+pyodbc:///?odbc_connect={}".format(params))
@event.listens_for(engine, "before_cursor_execute")
def receive_before_cursor_execute(
conn, cursor, statement, params, context, executemany
):
if executemany:
cursor.fast_executemany = True
chunksize = (len(df) // 4) + 2
inicio = time.time()
df.to_sql('Marketing3', engine, chunksize=chunksize,
index=False, if_exists='append')
fin = time.time()
tiempo_inser = fin-inicio
tamaño_df = len(df)
tipo = 'pyodbc'
list_tiempo_inser.append(tiempo_inser)
list_tamaño_df.append(tamaño_df)
list_tipo.append(tipo)
list_tiempo_inser = []
list_tamaño_df = []
list_tipo = []
for uper_df in range(5):
pymysql_ins(df)
pan_alch_ins(df)
pyodbc_ins(df)
df = pd.concat([df, df], ignore_index=True)
df_comparacion = pd.DataFrame({ 'TIPO' : list_tipo,
'TAMAÑO' : list_tamaño_df,
'TIEMPO' : list_tiempo_inser})
C:\Users\flavi_rlghtvx\anaconda3\lib\site-packages\pandas\io\sql.py:1444: UserWarning: The provided table name 'Marketing2' is not found exactly as such in the database after writing the table, possibly due to case sensitivity issues. Consider using lower case table names.
warnings.warn(msg, UserWarning)
C:\Users\flavi_rlghtvx\anaconda3\lib\site-packages\pandas\io\sql.py:1444: UserWarning: The provided table name 'Marketing3' is not found exactly as such in the database after writing the table, possibly due to case sensitivity issues. Consider using lower case table names.
warnings.warn(msg, UserWarning)
df_comparacion
import matplotlib.pyplot as plt
x = list(df_comparacion['TAMAÑO'].unique())
y_pymysql = list(df_comparacion.query('TIPO == "pymysql"')['TIEMPO'])
y_pan_alch = list(df_comparacion.query('TIPO == "pan_alch"')['TIEMPO'])
y_pyodbc = list(df_comparacion.query('TIPO == "pyodbc"')['TIEMPO'])
plt.plot(x, y_pymysql, label='y=pymysql')
plt.plot(x, y_pan_alch, label='y=pan_alch')
plt.plot(x, y_pyodbc, label='y=pyodbc')
plt.xlabel('TAMAÑO DE DATAFRAME')
plt.ylabel('TIEMPO DE INSERCION')
plt.title('COMPARACION')
plt.legend()
plt.show()