Trabajo realizado por Ma. Soledad García Ortiz para Big Data - CaC 4.0. 2022. Profesora: Regina Molares.
# Levanto los datos en 3 diferentes dataframes
#ARTICULOS
conn = sql3.connect("/work/data/articles.db")
sql_query = pd.read_sql_query("SELECT * FROM articles", conn) #Pandas no necesita "cursor" porq tiene un metodo para levantar querys de sql
df_articles = pd.DataFrame(sql_query, columns=["article_id", "article_name", "unit_price"])
print(df_articles)
#VENDEDORES
df_sellers = pd.read_excel("/work/data/sellers.xlsx", index_col=0)
print(df_sellers)
#ORDENES
df_orders = pd.read_csv("/work/data/orders.csv")
print(df_orders)
# Exploración del df de artículos
print("Muestra de datos")
print(df_articles.head())
print("\nFormato del dataframe")
print(df_articles.shape)
print("\nBusqueda de valores nulos")
print(df_articles.isnull().sum())
print("\nFormato de datos")
print(df_articles.dtypes)
Muestra de datos
article_id article_name unit_price
0 20015 Smartphone 525.00
1 20016 Full Pc 2127.81
2 20017 Monitor 230.00
3 20018 Tablet 130.00
4 20019 Desk 130.10
Formato del dataframe
(31, 3)
Busqueda de valores nulos
article_id 0
article_name 0
unit_price 0
dtype: int64
Formato de datos
article_id int64
article_name object
unit_price object
dtype: object
#Reemplazar los datos en las nuevas columnas
#print(my_df.index)
for i in range(len(my_df.index)):
#columna article_name
#cargo el nombre del articulo en una variable
#article = df_articles.loc[my_df.loc[i]['article_name']]['article_name']
#se lo asigno a la columna y registro que corresponde
#my_df.loc[i,"article_name"] = article
# columna total amount
#my_df.loc[i, 'total_amount'] = my_df.loc[i, 'quantity'] * df_articles.loc[my_df.loc[i]['total_amount']]['unit_price']
#columna de seller_name
#my_df.loc[i,"seller_name"] = df_sellers.loc[my_df.loc[i]["seller_name"]]["seller_name"]
print(my_df.head())
order_id week article_id quantity seller_id country_name \
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
article_name total_amount seller_name
0 Water Cooling 675.0 Cirilo Grandham
1 Mouse 454.5 Ewell Peres
2 Netbook 725.0 Janel O'Curran
3 Tablet 1170.0 Janel O'Curran
4 Case 227.4 Daisie Slograve
#elimino las columnas que no necesito
#my_df.drop(['order_id','article_id','seller_id'], axis='columns', inplace=True)
print(my_df.head())
print(my_df.shape)
week quantity country_name article_name total_amount seller_name
0 1 10 Peru Water Cooling 675.0 Cirilo Grandham
1 1 15 Peru Mouse 454.5 Ewell Peres
2 1 5 Bolivia Netbook 725.0 Janel O'Curran
3 1 9 Brazil Tablet 1170.0 Janel O'Curran
4 1 6 Mexico Case 227.4 Daisie Slograve
(1000, 6)
week quantity total_amount
country_name
Puerto Rico 3 12 1265.43
week quantity total_amount
article_name
CPU 1 9 1256.58
Usb Cable 2 3 8.85