Optimizacion de Insecion de Datos a una Base SQL con Python

¿Cómo funcionan las conexiones a bases de datos en Python?

import pandas as pd import os from datetime import datetime import time pd.options.display.max_columns = None

# Cargamos la base df = pd.read_csv('marketing_data.csv') # Arreglando el formato de la fecha en la columna Dt_Customer def fix_fecha(x): mes, dia, año = x[0], x[1], x[2] fecha = mes + '/' + dia + '/20' + año fecha = datetime.strptime(fecha, '%m/%d/%Y').date() return fecha df.Dt_Customer = df.Dt_Customer.apply(lambda x: fix_fecha(x.split('/'))) # Arreglando el formato numerico de la columna Income df = df.rename(columns={' Income ': 'Income'}) for i in ('$', ',', '.'): df.Income = df.Income.str.replace(i, '') df.Income = pd.to_numeric(df.Income) df.Income = df.Income.fillna(0) # Incrementando el tamaño del DataFrame for i in range(0, 1): print(len(df)) df = pd.concat([df, df], ignore_index=True) print(len(df))

df.head()

df.info()

Conexion Estandar Mediante el Cursor de PyMySQL

import pymysql.cursors connection = pymysql.connect(host='localhost', user='root', password='12345', database='Marketing')

cursor = connection.cursor() sql = "SHOW TABLES" cursor.execute(sql) result = cursor.fetchall() print(result)

query_create_table = """ CREATE TABLE Marketing ( ID INT NULL, Year_Birth FLOAT NULL, Education VARCHAR(250) NULL, Marital_Status VARCHAR(250) NULL, Income FLOAT NULL, Kidhome FLOAT NULL, Teenhome FLOAT NULL, Dt_Customer DATE NULL, Recency FLOAT NULL, MntWines FLOAT NULL, MntFruits FLOAT NULL, MntMeatProducts FLOAT NULL, MntFishProducts FLOAT NULL, MntSweetProducts FLOAT NULL, MntGoldProds FLOAT NULL, NumDealsPurchases FLOAT NULL, NumWebPurchases FLOAT NULL, NumCatalogPurchases FLOAT NULL, NumStorePurchases FLOAT NULL, NumWebVisitsMonth FLOAT NULL, AcceptedCmp3 FLOAT NULL, AcceptedCmp4 FLOAT NULL, AcceptedCmp5 FLOAT NULL, AcceptedCmp1 FLOAT NULL, AcceptedCmp2 FLOAT NULL, Response FLOAT NULL, Complain FLOAT NULL, Country VARCHAR(250) NULL ) """

cursor.execute(query_create_table) result = cursor.fetchone() sql = "SHOW TABLES" cursor.execute(sql) result = cursor.fetchall() print(result)

# Cramos el nombre de las columnas name_colum = "`,`".join([str(i) for i in df.columns.tolist()]) # iteramos en cada linea del df para tomar cada uno de los valores e ingestarlos inicio = time.time() for i, row in df.iterrows(): sql = "INSERT INTO `Marketing` (`" +name_colum + "`) VALUES (" + "%s,"*(len(row)-1) + "%s)" cursor.execute(sql, tuple(row)) connection.commit() fin = time.time() print(fin-inicio)

Conexion con SQLAlchemy y Pandas

from sqlalchemy import create_engine

engine = create_engine("mysql+pymysql://{user}:{pw}@localhost/{db}" .format(user="root", pw="12345", db="Marketing"))

inicio = time.time() df.to_sql('Marketing2', engine, chunksize=500000, method='multi', index=False, if_exists='append') fin = time.time() print(fin-inicio)

El Caso especial en Accion

from sqlalchemy import create_engine

import pyodbc pyodbc.drivers()

import urllib connection_string = ( 'DRIVER=MySQL ODBC 8.0 Unicode Driver;' 'SERVER=localhost;' 'DATABASE=Marketing;' 'UID=root;' 'PWD=12345;' ) params = urllib.parse.quote_plus(connection_string) engine = create_engine("mysql+pyodbc:///?odbc_connect={}".format(params))

engine.connect()

from sqlalchemy import event @event.listens_for(engine, "before_cursor_execute") def receive_before_cursor_execute( conn, cursor, statement, params, context, executemany ): if executemany: cursor.fast_executemany = True

# creando un buen chucksize chunksize = (len(df) // 4) + 2 # ejecutando la ingesta inicio = time.time() df.to_sql('Marketing3', engine, chunksize=chunksize, index=False, if_exists='append') fin = time.time() print(fin-inicio)

Comparando

def pymysql_ins(df): import pymysql.cursors connection = pymysql.connect(host='localhost', user='root', password='280214', database='DataGalaxy') cursor = connection.cursor() name_colum = "`,`".join([str(i) for i in df.columns.tolist()]) inicio = time.time() for i, row in df.iterrows(): sql = "INSERT INTO `Marketing` (`" +name_colum + "`) VALUES (" + "%s,"*(len(row)-1) + "%s)" cursor.execute(sql, tuple(row)) connection.commit() fin = time.time() tiempo_inser = fin-inicio tamaño_df = len(df) tipo = 'pymysql' list_tiempo_inser.append(tiempo_inser) list_tamaño_df.append(tamaño_df) list_tipo.append(tipo)

def pan_alch_ins(df): from sqlalchemy import create_engine engine = create_engine("mysql+pymysql://{user}:{pw}@localhost/{db}" .format(user="root", pw="12345", db="Marketing")) inicio = time.time() chunksize = (len(df) // 4) + 2 df.to_sql('Marketing2', engine, chunksize=chunksize, method='multi', index=False, if_exists='append') fin = time.time() tiempo_inser = fin-inicio tamaño_df = len(df) tipo = 'pan_alch' list_tiempo_inser.append(tiempo_inser) list_tamaño_df.append(tamaño_df) list_tipo.append(tipo)

def pyodbc_ins(df): from sqlalchemy import create_engine from sqlalchemy import event import pyodbc import urllib connection_string = ( 'DRIVER=MySQL ODBC 8.0 Unicode Driver;' 'SERVER=localhost;' 'DATABASE=Marketing;' 'UID=root;' 'PWD=12345;' ) params = urllib.parse.quote_plus(connection_string) engine = create_engine("mysql+pyodbc:///?odbc_connect={}".format(params)) @event.listens_for(engine, "before_cursor_execute") def receive_before_cursor_execute( conn, cursor, statement, params, context, executemany ): if executemany: cursor.fast_executemany = True chunksize = (len(df) // 4) + 2 inicio = time.time() df.to_sql('Marketing3', engine, chunksize=chunksize, index=False, if_exists='append') fin = time.time() tiempo_inser = fin-inicio tamaño_df = len(df) tipo = 'pyodbc' list_tiempo_inser.append(tiempo_inser) list_tamaño_df.append(tamaño_df) list_tipo.append(tipo)

list_tiempo_inser = [] list_tamaño_df = [] list_tipo = [] for uper_df in range(5): pymysql_ins(df) pan_alch_ins(df) pyodbc_ins(df) df = pd.concat([df, df], ignore_index=True) df_comparacion = pd.DataFrame({ 'TIPO' : list_tipo, 'TAMAÑO' : list_tamaño_df, 'TIEMPO' : list_tiempo_inser})

df_comparacion

import matplotlib.pyplot as plt x = list(df_comparacion['TAMAÑO'].unique()) y_pymysql = list(df_comparacion.query('TIPO == "pymysql"')['TIEMPO']) y_pan_alch = list(df_comparacion.query('TIPO == "pan_alch"')['TIEMPO']) y_pyodbc = list(df_comparacion.query('TIPO == "pyodbc"')['TIEMPO']) plt.plot(x, y_pymysql, label='y=pymysql') plt.plot(x, y_pan_alch, label='y=pan_alch') plt.plot(x, y_pyodbc, label='y=pyodbc') plt.xlabel('TAMAÑO DE DATAFRAME') plt.ylabel('TIEMPO DE INSERCION') plt.title('COMPARACION') plt.legend() plt.show()

.css-15w88e5{color:var(--chakra-colors-fg-neutral-primary);font-weight:inherit;letter-spacing:-0.09px;}¿Cómo funcionan las conexiones a bases de datos en Python?

Conexion Estandar Mediante el Cursor de PyMySQL

Conexion con SQLAlchemy y Pandas

El Caso especial en Accion

Comparando

¿Cómo funcionan las conexiones a bases de datos en Python?