import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
dict_data2 = {
'edad' : [ 10, 9, 13, 14, 12, 11, 12],
'cm' : [ 115, 110, 130, 155, 125, 120, 125],
'pais' : [ 'co', 'mx', 'co', 'mx', 'mx', 'ch', 'ch'],
'genero' : [ 'M', 'F', 'F', 'M', 'M', 'M', 'F'],
'Q1' : [ 5, 10, 8, np.nan, 7, 8, 3],
'Q2' : [ 7, 9, 9, 8, 8, 8, 9.]
}
df2 = pd.DataFrame(dict_data2,index=['ana','benito','camilo','daniel','erika','fabian','gabriela'])
df2
df2.columns
df2.values
df2.index
# Shows the min, max, mean, std, etc. of the numeric columns.
df2.describe()
df2['edad'].value_counts()
df2['edad'].unique()
df2.loc[['ana','erika'],['edad','cm','Q1']]
df2.iloc[[2,3],[1,2,3]]
df2.iloc[:,[1,2,3]]
df2[(df2['edad'] >= 12) & (df2['pais'] == 'mx')]
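The same row filter can also be written with DataFrame.query, which some find more readable; a sketch equivalent to the boolean mask above:
# query() references column names directly inside the expression string
df2.query("edad >= 12 and pais == 'mx'")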
dir_bd = 'db/{}'
df2.to_csv(dir_bd.format('test.csv'))
df_read = pd.read_csv('db/test.csv')
df_read.head(1)
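Note that to_csv writes the row labels as a plain extra column; a sketch to round-trip the name index cleanly:
# index_col=0 restores the saved row labels as the index
df_read = pd.read_csv('db/test.csv', index_col=0)
# or drop the labels at write time: df2.to_csv(dir_bd.format('test.csv'), index=False)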
import pandas as pd
import psycopg2
conn_sql = psycopg2.connect(user="user_name",
                            password="password",
                            host="xxx.xxx.xxx.xxx",
                            port="5432",
                            database="postgres_db_name")
query_sql = '''
select *
from table_name
limit 10
'''
df = pd.read_sql(query_sql, conn_sql)
df.head(5)
import pandas as pd
import pyodbc
driver = '{SQL Server}'
server_name = 'server_name'
db_name = 'database_name'
user = 'user'
password = 'password'
sql_conn = pyodbc.connect(
    'DRIVER={};SERVER={};DATABASE={};UID={};PWD={}'.format(
        driver, server_name, db_name, user, password))
# Note: UID/PWD (SQL authentication) and Trusted_Connection=yes (Windows
# authentication) are mutually exclusive; pass one or the other.
dsn = 'odbc_datasource_name'
sql_conn = pyodbc.connect('DSN={};UID={};PWD={}'.format(dsn, user, password))
Next we simply define our SQL query:
query_sql = 'select top 10 * from table_name'  # SQL Server uses TOP, not LIMIT
df = pd.read_sql(query_sql, sql_conn)
df.head(5)
import pandas as pd
import sqlalchemy as sql
We pick our database engine: Oracle, MySQL, or whichever you prefer:
# database_type = 'mysql'  # choose exactly one dialect
database_type = 'oracle'
user = 'user_name'
password = 'password'
host = 'xxx.xxx.xxx.xxx:port'
database = 'database_name'
conn_string = '{}://{}:{}@{}/{}'.format(
database_type, user, password, host, database)
sql_conn = sql.create_engine(conn_string)
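If the bare dialect cannot locate a default DB-API driver, it can be pinned explicitly in the URL; a sketch assuming the pymysql package is installed:
# dialect+driver:// tells SQLAlchemy which DB-API implementation to load
conn_string = 'mysql+pymysql://{}:{}@{}/{}'.format(user, password, host, database)
sql_conn = sql.create_engine(conn_string)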
query_sql = '''
select *
from table_name
limit 10
'''
df = pd.read_sql(query_sql, sql_conn)
df.head(5)
import pandas as pd
df_meteorites = pd.read_csv('db/Meteorite_Landings.csv')
df_meteorites.tail(1) # View the last rows
df_meteorites.sample(1) # Show random rows
df_meteorites.shape
df_meteorites.size
pd.options.display.float_format='{:,.2f}'.format
df_meteorites.describe(include='all')
df_meteorites.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45716 entries, 0 to 45715
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   name         45716 non-null  object
 1   id           45716 non-null  int64
 2   nametype     45716 non-null  object
 3   recclass     45716 non-null  object
 4   mass (g)     45585 non-null  float64
 5   fall         45716 non-null  object
 6   year         45425 non-null  object
 7   reclat       38401 non-null  float64
 8   reclong      38401 non-null  float64
 9   GeoLocation  38401 non-null  object
dtypes: float64(3), int64(1), object(6)
memory usage: 3.5+ MB
df_meteorites.convert_dtypes().dtypes
df_meteorites[['fall','nametype']] = df_meteorites[['fall','nametype']].astype('category')
df_meteorites.dtypes
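A quick way to verify what the category cast buys is comparing memory usage before and after; deep=True counts the string payloads themselves:
df_meteorites['fall'].astype('object').memory_usage(deep=True)  # as plain strings
df_meteorites['fall'].memory_usage(deep=True)                   # as category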
pd.get_dummies(df_meteorites['fall'])
df_meteorites[['fell','found']] = pd.get_dummies(df_meteorites['fall'])
df_meteorites.head(2)
df_meteorites.rename(columns={'mass (g)':'mass'}, inplace=True)
df_meteorites.dtypes
# df_meteorites.drop(['column_name'], axis=1, inplace=True)
df_meteorites.drop(columns = ['id','recclass'],index=[1,2,4,6]).head(1)
df_meteorites['year'] = pd.to_datetime(
    df_meteorites['year'],
    errors='coerce',
    format='%m/%d/%Y %I:%M:%S %p'  # 12-hour clock: %I (not %H) pairs with %p
)
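Once year is a true datetime, the .dt accessor exposes its components; for example:
df_meteorites['year'].dt.year.head(3)   # just the year number
df_meteorites['year'].dt.month.value_counts().head(3)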
df_meteorites.convert_dtypes().dtypes
df = df_meteorites.copy(deep=True)
df.drop(['id'], axis=1, inplace=True)
df.head(1)
import pandas as pd
import numpy as np
df_lmerged = pd.read_csv('db/london_merged.csv')
# Convert the column to datetime dtype
df_lmerged['timestamp'] = pd.to_datetime(df_lmerged['timestamp'])
df_lmerged['hour'] = df_lmerged['timestamp'].dt.hour
df_lmerged['hour']
df = df_lmerged.iloc[:, 1:]
df['t1'].iloc[::2].sub(df['t2'],fill_value=1000)
df['t1'].add(df['t2'])
def fun_1(x, a=4, b=1):
    y = x**2 + a*x/b
    return y
df['hour'].apply(fun_1)
df['hour'].apply(fun_1,a=20,b=-100)
df['t1'].apply(lambda x: x+273)
df.applymap(lambda x: x*510)  # pandas >= 2.1 renames this to DataFrame.map
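applymap calls the function once per cell, so for simple arithmetic like the above the vectorized form is equivalent and much faster:
df * 510        # same result as the applymap above
df['t1'] + 273  # same as the lambda applied to t1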
import pandas as pd
import numpy as np
df_prob = pd.read_csv('db/poblacion.csv')
pd.options.display.float_format = '{:,.1f}'.format
df_prob['year'] = pd.Categorical(df_prob['year'].apply(str))
id_filtro = df_prob['Country'].isin(['Aruba','Colombia'])
df_sample = df_prob[id_filtro]
df_sample = df_sample.set_index(['Country','year']).sort_index()
df_sample
df_sample.loc['Colombia',:].loc['2016',:]
df_sample.xs(('Aruba','2018'))
df_sample.xs('2018', level='year')
df_countries = df_prob.set_index(['Country','year']).sort_index(ascending=[True,True])
df_countries.head(4)
ids = pd.IndexSlice
df_countries.loc[ids['Aruba':'Austria','2015':'2017'],:].sort_index()
df_countries.index.get_level_values(0)
df_countries['pop']['Colombia']['2016']
df_countries.groupby(level='year').sum()  # sum(level=...) was removed in pandas 2.0
df_sample.unstack('year')
np.set_printoptions(precision=2)
x1 = np.random.rand(2,5)*10
x2 = np.random.rand(2,5)*10
s1 = pd.Series(x1[0],index=['a','b','c','d','e'])
s2 = pd.Series(x2[0], index=['c','b','e','f','g'])
df1 = pd.DataFrame(np.random.rand(3,2)*10,columns=['a','b'])
df2 = pd.DataFrame(np.random.rand(3,2)*-1,columns=['a','b'], index=[2,3,4])
np.concatenate([x1,x2], axis=1)
s1 = s1.reset_index(drop=True)
s2 = s2.reset_index(drop=True)
pd.concat([df1,df2], axis=1)
pd.concat([df1,df2], axis=1, join='inner')
pd.concat([df1, df2, df1])  # DataFrame.append was removed in pandas 2.0
pd.concat([df1.T, df2.T]).T
df_left = pd.DataFrame(
{'X':['x0','x1','x2','x3'],
'W':['w0','w1','w2','w3'],
'Y':['y0','y1','y2','y3'],
'Mix':['y2','y3','a2','a3']},
index = [0,1,2,3])
df_right = pd.DataFrame(
{'Z':['z2','z3','z4','z5'],
'A':['a2','a3','a4','a5'],
'Y':['y2','y3','y4','y5']},
index = [2,3,4,5])
pd.merge(df_left,df_right, how='inner', on='Y')
pd.merge(df_left,df_right, how='inner', left_on='Mix', right_on='Y', suffixes=['_left','_right'])
pd.merge(df_left,df_right, how='inner', left_on='Mix', right_on='A', suffixes=['_left','_right'])
df_left = pd.DataFrame(
{'X': ['x0', 'x1', 'x2', 'x3'],
'W': ['w0', 'w1', 'w2', 'w3'],
'Y': ['y0', 'y1', 'y2', 'y3'] },
index=[0,1,2,3])
df_right = pd.DataFrame(
{'Z': ['z2', 'z3', 'z4', 'z5'],
'A': ['a2', 'a3', 'a4', 'a5'],
'Y': ['y2', 'y3', 'y4', 'y5']},
index=[2,3,4,5])
pd.merge(df_left,df_right, how='inner')
pd.merge(df_left,df_right, how='left', on='Y')
pd.merge(df_left,df_right, how='right', on='Y')
pd.merge(df_left,df_right, how='outer', on='Y')
df_left2 = pd.DataFrame(
{'X': ['x0', 'x1', 'x2', 'x3'],
'W': ['w0', 'w1', 'w2', 'w3'],
'Y': ['y0', 'y1', 'y2', 'y3'],
'A': ['a0', 'a1', 'a2', 'a3'],})
pd.merge(df_left2,df_right,how='outer', on=['Y','A'])
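When debugging a merge, indicator=True adds a _merge column saying which side each row came from:
pd.merge(df_left2, df_right, how='outer', on=['Y','A'], indicator=True)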
import numpy as np
import pandas as pd
df = pd.DataFrame(np.arange(0, 15).reshape(5, 3), columns=['a', 'b', 'c'])
df
df['d'] = np.nan
df['e'] = np.arange(15, 20)
df.loc[5,:] = pd.NA
df.loc[4,'a'] = pd.NA
df.loc[0,'d'] = 1
df.loc[5,'d'] = 10
df
df.isnull()
df.isnull().sum()
df.notnull().sum(axis=1)
df.size-df.isnull().sum().sum()
df[df['a'].notnull()]
df.dropna()
df[['a']].dropna()
df.fillna(0)
df.ffill()        # formerly df.fillna(method='ffill')
df.bfill()        # formerly df.fillna(method='bfill')
df.bfill(axis=1)  # backfill across columns instead of rows
fill = pd.Series([100, 101, 102])
fill
df['d'] = df['d'].fillna(fill)
df['d']
df.fillna(df.median())
df_d = pd.concat([df[['d']], df[['d']].interpolate()],axis=1)
df_d.columns = ['d_antes','d_interpolado']
df_d
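interpolate defaults to a linear fill between the nearest non-null neighbors; two of its knobs, as a sketch:
df[['d']].interpolate(limit=1)                 # fill at most one consecutive NaN
df[['d']].interpolate(limit_direction='both')  # 'both' also fills NaNs before the first valid value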
pd.options.display.float_format = '{:,.3f}'.format
df = sns.load_dataset('diamonds')
df.groupby('cut').mean(numeric_only=True)
for key_group, group in df.groupby('cut'):
    grouped_price = group['price'].mean()
    print('Cut: {}, Price: {}, \n'.format(key_group, grouped_price))
Cut: Ideal, Price: 3457.541970210199,
Cut: Premium, Price: 4584.2577042999055,
Cut: Very Good, Price: 3981.7598907465654,
Cut: Good, Price: 3928.864451691806,
Cut: Fair, Price: 4358.757763975155,
df.groupby(['cut','color'])['price'].mean().to_frame()
def mean_kilo(x):
    return np.mean(x) / 1000  # mean in thousands, so the > 4 filter below reads as > 4,000
df.groupby(['cut','color'])['price'].aggregate(['min',np.mean,max,mean_kilo]).head(10)
dict_agg = {'carat' : [min,max], 'price' : [np.mean,mean_kilo]}
df.groupby(['cut','color']).aggregate(dict_agg)
def f_filter(x):
    return mean_kilo(x['price']) > 4
df.groupby(['cut']).filter(f_filter)
df.groupby(['cut']).filter(f_filter)['cut'].unique()
df = sns.load_dataset('tips')
df['day'].value_counts() / df['day'].value_counts().sum()*100
df.groupby('sex').median(numeric_only=True)
df['prct_tip'] = df['tip']/df['total_bill']
df.head(2)
df.groupby('sex')[['prct_tip']].describe()
df.groupby('sex')[['prct_tip','total_bill']].describe()
def mean_eur_to_usd(x):
    return np.mean(x) * 1.12
df.groupby(['sex','time'])[['prct_tip','total_bill']].apply(mean_eur_to_usd)
df.groupby(['sex','time'])[['total_bill','prct_tip']].agg([np.mean,np.max])
dict_agg = {'tip':[min, max],'total_bill':[np.mean, mean_eur_to_usd]}
df.groupby(['sex','time'])[['total_bill','tip']].agg(dict_agg)
def filter_bill(x):
    return mean_eur_to_usd(x['total_bill'].mean()) > 20
df_filtered = df.groupby(['sex','time']).filter(filter_bill)
df_filtered.head(3)
df_filtered.groupby(['sex','time']).count()
df['ones'] = 1
df_g = df.groupby(['sex','smoker'])[['ones']].sum()
df_g
df_g.groupby(level=0).apply(
    lambda x: x / x.sum() * 100
)
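The same within-group percentage can be computed without apply, using transform to broadcast the level-0 sums back to the original shape:
df_g / df_g.groupby(level=0).transform('sum') * 100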
pd.cut(df['total_bill'], bins = 3).value_counts()
pd.cut(df['total_bill'], bins=[3,18,35,70]).value_counts()
df['bin_total'] = pd.cut(df['total_bill'], bins=[3,18,35,70])
df
df.groupby(['time','bin_total'])[['ones']].count().groupby(level=0).apply(lambda x: x/x.sum()*100)
df = pd.DataFrame({'a': ['w'] * 4 + ['x'] * 3 + ['y'] * 2 + ['z']+['v'],
'b': [1, 1, 1, 1, 2, 2, 2, 3, 3, 4,5]})
df.duplicated() # Flag duplicate rows
# keep='first' or 'last' marks every occurrence except the first/last one
df.duplicated(keep='first')
df.duplicated(keep='last')
# Keep only rows that are not duplicated; ~ is negation
df[~ df.duplicated()]
# keep=False marks every duplicated record
df.duplicated(keep=False)
df[df.duplicated(keep=False)]
# Drop duplicated records; keep defaults to 'first'
df.drop_duplicates(['a'],keep='last')
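drop_duplicates takes the same keep options as duplicated; a couple more combinations as a sketch:
df.drop_duplicates()                   # full-row duplicates, keeps first occurrence
df.drop_duplicates(['a'], keep=False)  # drop every row whose 'a' value repeats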
pd.options.display.float_format = '{:,.3f}'.format
df = sns.load_dataset('tips')
df_gp = df.groupby(['sex','time'])[['total_bill']].mean().reset_index()
df_gp
df_gp.pivot_table(values='total_bill', index='sex', columns='time')
df_pivot = df.pivot_table(values='total_bill', index='sex', columns='time', aggfunc=[np.median,np.std])
df_pivot
df_pivot.unstack().reset_index()
df.pivot_table(values=['total_bill','tip'], index='smoker', columns='time', aggfunc=[np.median,np.std])
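pivot_table can also append row and column totals with margins=True (labeled 'All' by default):
df.pivot_table(values='total_bill', index='sex', columns='time',
               aggfunc='mean', margins=True)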
df = pd.read_csv('db/covid_19_data.csv')
df['ObservationDate'] = pd.to_datetime(df['ObservationDate'])
df_time = df.groupby('ObservationDate').sum(numeric_only=True)
df_diff = df_time.diff()
df_diff.cumsum().head(4)
df_diff.resample('W-SUN').sum()
df_cum = df_time.resample('12h').sum(min_count=1)
df_cum
df_cum.bfill()
df_cum.ffill()
df_cum = df_cum.interpolate()
df_cum.head(3)
df_cum['rate'] = 1 - df_cum['Deaths'] / df_cum['Confirmed']
df_cum = df_cum.reset_index()
df_cum.groupby(pd.Grouper(key='ObservationDate', freq='M'))[['rate']].mean()
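pd.Grouper on a datetime key is the column-based cousin of resample; the same monthly mean via the index, as a sketch:
df_cum.set_index('ObservationDate')[['rate']].resample('M').mean()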