Load a dataset
import pandas as pd
df = pd.read_csv('cars.csv')
df.head()
df.describe()
Other measures of central tendency
from scipy.stats import hmean
hmean(df['year_produced'])
from scipy.stats import gmean
gmean(df['year_produced'])
import numpy as np
np.average(df['year_produced'], weights=[x for x in range(0,df['year_produced'].count())])
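np.average with weights computes sum(w * x) / sum(w); a quick sanity check against the manual formula, using the same increasing weights as above:
x = df['year_produced']
w = np.arange(len(x))              # same weights as above: 0, 1, 2, ...
(w * x).sum() / w.sum()            # should match np.average(x, weights=w)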
Measures of central tendency
df['price_usd'].mean()
df['price_usd'].median()
df['price_usd'].plot.hist(bins = 50)
import seaborn as sns
sns.displot(df, x= 'price_usd',hue = 'engine_type',multiple = 'dodge')
df.groupby('engine_type').count()
Q7_df = df[(df['manufacturer_name']=='Audi') & (df['model_name']=='Q7')]
sns.histplot(Q7_df, x = 'price_usd',hue = 'year_produced')
sns.set(style="ticks", color_codes=True)
sns.pairplot(df,vars = ['price_usd', 'odometer_value','year_produced'], hue="engine_type")
Measures of dispersion
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df['price_usd'].std()
rango = df['price_usd'].max() -df['price_usd'].min()
rango
median = df['price_usd'].median()
Q1 = df['price_usd'].quantile(q=0.25)
Q3 = df['price_usd'].quantile(q=0.75)
min_val = df['price_usd'].quantile(q=0)
max_val = df['price_usd'].quantile(q=1)
print(min_val, Q1,median,Q3,max_val)
iqr = Q3-Q1
iqr
Limits for outlier detection (symmetrically distributed data)
$$\text{Data between } Q_1 - 1.5 \times IQR \text{ and } Q_3 + 1.5 \times IQR$$
minlimit = Q1 - 1.5*iqr
maxlimit = Q3 + 1.5*iqr
print(minlimit, maxlimit)
sns.histplot(df['price_usd'])
sns.boxplot(df['price_usd'])
sns.boxplot(x = 'engine_type',y = 'price_usd',data = df)
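With these limits we can, for example, keep only the rows inside the whiskers; a minimal sketch (the 1.5 factor is the usual convention and can be adjusted):
no_outliers = df[(df['price_usd'] >= minlimit) & (df['price_usd'] <= maxlimit)]  # drop IQR-based outliers
print(len(df), len(no_outliers))
sns.histplot(no_outliers['price_usd'])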
Scatter plots
import seaborn as sns
import pandas as pd
iris = sns.load_dataset('iris')
iris.head()
sns.scatterplot(iris, x= 'sepal_length',y = 'petal_length',hue = 'species')
sns.scatterplot(iris, x= 'sepal_length',y = 'sepal_width',hue = 'species')
Scaling numerical data
import timeit
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
X, y = datasets.load_diabetes(return_X_y=True)
raw = X[:, None, 2]  # keep a single feature (BMI) as a 2-D column array
Max-min scaling
max_raw = raw.max()
min_raw = raw.min()
scaled = (2*raw - max_raw - min_raw)/(max_raw - min_raw)  # max-min transformation to the range [-1, 1]
fig, axs = plt.subplots(2,1,sharex = True)
axs[0].hist(raw)
axs[1].hist(scaled)
Models for training
def train_raw():
    linear_model.LinearRegression().fit(raw, y)

def train_scaled():
    linear_model.LinearRegression().fit(scaled, y)
raw_time = timeit.timeit(train_raw,number = 100)
scaled_time = timeit.timeit(train_scaled,number = 100)
print('train_raw:{}'.format(raw_time))
print('train_scaled:{}'.format(scaled_time))
print('difference:{}'.format(raw_time-scaled_time))
Max-min scaling: better for uniformly distributed data
Z-score scaling: better for data that are (approximately) normally distributed (Gaussian bell shape)
Z-score scaling
media = np.mean(raw)
std = np.std(raw)
scaled = (raw-media)/std
fig, axs = plt.subplots(2,1,sharex = True)
axs[0].hist(raw)
axs[1].hist(scaled)
raw_time = timeit.timeit(train_raw,number = 100)
scaled_time = timeit.timeit(train_scaled,number = 100)  # train_scaled now picks up the z-score scaled array
print('train_raw:{}'.format(raw_time))
print('train_scaled:{}'.format(scaled_time))
print('difference:{}'.format(raw_time-scaled_time))
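In practice the same two transformations are usually done with sklearn's scalers; a minimal sketch (note that MinMaxScaler maps to [0, 1] by default, unlike the [-1, 1] formula above):
from sklearn.preprocessing import MinMaxScaler, StandardScaler
minmax_scaled = MinMaxScaler().fit_transform(raw)     # max-min scaling to [0, 1]
zscore_scaled = StandardScaler().fit_transform(raw)   # z-score scaling
print(minmax_scaled.min(), minmax_scaled.max())
print(zscore_scaled.mean().round(3), zscore_scaled.std().round(3))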
Non-linear transformations
df = pd.read_csv('cars.csv')
df.price_usd.hist()
We can see the distribution is heavily skewed
Transformation with tanh(x)
p = 10000  # scale factor for the tanh argument
df.price_usd.apply(lambda x: np.tanh(x/p)).hist()
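tanh is only one option; sklearn also offers learned non-linear transforms such as PowerTransformer (Yeo-Johnson / Box-Cox), which fits the transformation parameter from the data; a sketch:
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer(method='yeo-johnson')             # also handles zeros and negative values
price_transformed = pt.fit_transform(df[['price_usd']])
pd.Series(price_transformed[:, 0]).hist()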
Processing categorical variables
import pandas as pd
df = pd.read_csv('cars.csv')
pd.get_dummies(df['engine_type'])
import sklearn.preprocessing as preprocessing
encoder = preprocessing.OneHotEncoder(handle_unknown = 'ignore')
encoder.fit(df[['engine_type']].values)
encoder.transform([['gasoline'],['diesel'],['aceite']]).toarray()
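Note that 'aceite' is a category the encoder never saw, so with handle_unknown='ignore' it encodes as all zeros. To see which column corresponds to which category (assuming sklearn >= 1.0 for get_feature_names_out):
encoder.categories_                 # categories learned from engine_type
encoder.get_feature_names_out()     # column order of the one-hot array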
Discrete numerical variables can be encoded as categorical
encoder.fit(df[['year_produced']].values)
encoder.transform([[2016],[2009],[1990]]).toarray()  # the encoder was fit on integers, so pass integers
Covariance matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
iris = sns.load_dataset('iris')
iris.columns
scaler = StandardScaler()
scaled = scaler.fit_transform(iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])
scaled.T
covariance_matrix = np.cov(scaled.T)
covariance_matrix
plt.figure(figsize =(10,10))
sns.set(font_scale = 1.5)
sns.heatmap(covariance_matrix,
            cbar=True,
            annot=True,
            square=True,
            fmt='.2f',
            annot_kws={'size': 12},
            yticklabels=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
            xticklabels=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
Compute the principal components of the covariance matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
iris = sns.load_dataset('iris')
scaler = StandardScaler()
scaled = scaler.fit_transform(iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])
covariance_matrix = np.cov(scaled.T)
sns.jointplot(x = iris['petal_length'],y = iris['petal_width'] )
sns.jointplot(x = scaled[:,2], y = scaled[:,3])
Eigenvector and eigenvalue decomposition
eigen_values, eigen_vectors = np.linalg.eig(covariance_matrix)
Let's look at the variance captured by each eigenvalue
variance_explained = []
for i in eigen_values:
    variance_explained.append(i / sum(eigen_values) * 100)
print(variance_explained)
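Before moving to sklearn, the projection can also be done by hand: sort the eigenvectors by eigenvalue and multiply the scaled data by the top two; a minimal sketch:
order = np.argsort(eigen_values)[::-1]   # indices of eigenvalues, largest first
top2 = eigen_vectors[:, order[:2]]       # eigenvectors are the columns
manual_reduced = scaled @ top2           # shape (150, 2)
manual_reduced[:5]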
PCA for dimensionality reduction
Now let's use the sklearn library
from sklearn.decomposition import PCA
pca = PCA(n_components= 2)
pca.fit(scaled)
pca.explained_variance_ratio_
reduced = pca.transform(scaled)
iris['pca_1'] = reduced[:,0]
iris['pca_2'] = reduced[:,1]
iris
sns.jointplot(iris,x = 'pca_1', y = 'pca_2',hue = 'species')
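As a sanity check, sklearn's explained_variance_ratio_ should match the manual eigenvalue percentages computed earlier (sorted, since np.linalg.eig does not guarantee any order):
print(pca.explained_variance_ratio_ * 100)              # sklearn
print(sorted(variance_explained, reverse=True)[:2])     # manual eigendecomposition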