# Curso de estadística descriptiva 2021

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns

# Load the cars dataset used throughout these notes.
df = pd.read_csv("cars.csv")

# Preview of the first rows, shaded with the 'coolwarm' colour map
# (axis=None applies the gradient over the whole table, not per column).
preview = df.head()
preview.style.background_gradient(cmap='coolwarm', axis=None)

# Data type of every column.
df.dtypes

# Summary statistics of the numeric columns, shaded the same way.
summary = df.describe()
summary.style.background_gradient(cmap='coolwarm', axis=None)

import pandas as pd

df = pd.read_csv('cars.csv')

# Mean of the price_usd column.
# (In the flattened source this expression was swallowed by the comment.)
df['price_usd'].mean()

# Median of the price_usd column.
# (The original heading had no code under it.)
df['price_usd'].median()

# Frequency histogram of price_usd, drawn with Plotly Express.
fig1 = px.histogram(
    df,
    x='price_usd',
    opacity=0.8,
    color_discrete_sequence=['#FA0087'],
)
price_hist_layout = {
    'title': 'Grafica historial del precio vehiculos',
    'font': {'family': 'verdana', 'size': 16, 'color': 'white'},
    'template': 'plotly_dark',
    'height': 500,
    'width': 900,
}
fig1.update_layout(**price_hist_layout)
fig1.show()

# Plotly is the main plotting library used in these notes.
# Price histogram split by car manufacturer: `color` draws one histogram
# per category of manufacturer_name.
brand_palette = px.colors.qualitative.Dark24
fig1 = px.histogram(
    df,
    x='price_usd',
    color='manufacturer_name',
    color_discrete_sequence=brand_palette,
    opacity=0.8,
)
fig1.update_layout(
    title='Graficas de precio separado por marca de vehiculo',
    template='plotly_dark',
    font={'family': 'verdana', 'size': 16, 'color': 'white'},
    width=900,
    height=500,
    legend_bgcolor='rgb(117, 112, 179)',
)
fig1.show()

# Stacked price histogram, one colour per engine type.
fig1 = px.histogram(
    df,
    x='price_usd',
    color='engine_type',
    color_discrete_sequence=['rgb(175, 100, 88)', '#479B55', '#FA0087'],
    opacity=0.8,
)
fig1.update_layout(
    # The original title was copied from the per-manufacturer chart; this
    # figure is split by engine_type, so the title now says so.
    title='Graficas de precio separado por tipo de motor',
    font=dict(family='verdana', size=16, color='white'),
    template='plotly_dark',
    height=500,
    width=900,
    legend_bgcolor='rgb(117, 112, 179)',
)
fig1.show()

# Number of non-null values in every column, per engine type.
by_engine_type = df.groupby('engine_type')
by_engine_type.count()

# Keep only the rows for the Audi Q7 model.
is_audi = df['manufacturer_name'] == 'Audi'
is_q7 = df['model_name'] == 'Q7'
Q7_df = df[is_audi & is_q7]

# Price histogram of those cars, one colour per production year.
fig = px.histogram(
    Q7_df,
    x='price_usd',
    color='year_produced',
    color_discrete_sequence=px.colors.qualitative.Dark24,
    opacity=0.8,
)
q7_layout = {
    'title': 'Precio de marca Audi, modelo Q7',
    'font': {'family': 'verdana', 'size': 16, 'color': 'white'},
    'template': 'plotly_dark',
    'height': 500,
    'width': 900,
    'legend_bgcolor': 'rgb(117, 112, 179)',
}
fig.update_layout(**q7_layout)
fig.show()

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

df = pd.read_csv('cars.csv')
price = df['price_usd']

# Standard deviation of the price.
price.std()

# Range = maximum value - minimum value.
rango = price.max() - price.min()
rango

# Five-number summary of price_usd: minimum, Q1, median, Q3, maximum.
price_usd = df['price_usd']
min_val = price_usd.quantile(q=0)
Q1 = price_usd.quantile(q=0.25)  # value below which the first 25% of the data falls
median = price_usd.median()
Q3 = price_usd.quantile(q=0.75)
max_val = price_usd.quantile(q=1)
print(min_val, Q1, median, Q3, max_val)

# Interquartile range: spread of the middle 50% of the data.
iqr = Q3 - Q1
iqr

# Outlier fences at 1.5 * IQR beyond each quartile.
# The original upper fence used 1.3, which was inconsistent with the lower
# fence; both sides now use the same 1.5 multiplier.
minlimit = Q1 - 1.5 * iqr
maxlimit = Q3 + 1.5 * iqr
print(minlimit, maxlimit)

# Layout shared by the two figures below (same title and dark theme).
price_layout = {
    'title': 'Grafica de cantidad de autos por precio',
    'font': {'family': 'verdana', 'size': 16, 'color': 'white'},
    'template': 'plotly_dark',
    'height': 500,
    'width': 900,
}
pink = ['#FA0087']

# Histogram of the price distribution.
fig1 = px.histogram(df, x='price_usd', color_discrete_sequence=pink, opacity=0.8)
fig1.update_layout(**price_layout)
fig1.show()

# Box plot of the same column.
fig2 = px.box(df, x='price_usd', color_discrete_sequence=pink)
fig2.update_layout(**price_layout)
fig2.show()

# One horizontal box plot of price per fuel type, each in its own colour.
fuel_palette = px.colors.qualitative.Dark24
fig = px.box(
    df,
    x='price_usd',
    y='engine_fuel',
    color='engine_fuel',
    color_discrete_sequence=fuel_palette,
)
fig.update_layout(
    title='Grafica de precio por tipo de combustible usado',
    template='plotly_dark',
    font={'family': 'verdana', 'size': 16, 'color': 'white'},
    width=900,
    height=500,
    legend_bgcolor='rgb(117, 112, 179)',
)
fig.show()

import pandas as pd
import seaborn as sns

# Iris sample dataset shipped with seaborn.
iris = sns.load_dataset('iris')

# First five rows.
iris.head(n=5)

# Scatter plot of petal length against sepal length, coloured by species.
species_colors = ['rgb(175, 100, 88)', '#479B55', '#FA0087']
fig1 = px.scatter(
    iris,
    x='sepal_length',
    y='petal_length',
    color='species',
    opacity=0.8,
    color_discrete_sequence=species_colors,
)
scatter_layout = {
    'title': 'Longitud de petalo/Longitud sepalo',
    'xaxis_title': 'Longitud de sepalo',
    'yaxis_title': 'Longitud de petalo',
    'font': {'family': 'verdana', 'size': 16, 'color': 'white'},
    'template': 'plotly_dark',
    'height': 500,
    'width': 900,
    'legend_bgcolor': 'rgb(117, 112, 179)',
}
fig1.update_layout(**scatter_layout)
fig1.show()

# Same scatter plot, with marginal box plots added on both axes.
scatter_options = {
    'x': 'sepal_length',
    'y': 'petal_length',
    'color': 'species',
    'color_discrete_sequence': ['rgb(175, 100, 88)', '#479B55', '#FA0087'],
    'marginal_x': 'box',
    'marginal_y': 'box',
    'opacity': 0.8,
}
fig1 = px.scatter(iris, **scatter_options)
fig1.update_layout(
    title='Longitud de petalo/Longitud sepalo',
    xaxis_title='Longitud de sepalo',
    yaxis_title='Longitud de petalo',
    template='plotly_dark',
    font={'family': 'verdana', 'size': 16, 'color': 'white'},
    width=900,
    height=500,
    legend_bgcolor='rgb(117, 112, 179)',
)
fig1.show()

# Box plot of sepal length for each iris species.
fig = px.box(
    iris,
    x='species',
    y='sepal_length',
    color='species',
    color_discrete_sequence=['rgb(175, 100, 88)', '#479B55', '#FA0087'],
)
fig.update_layout(
    title='Comparativa Tamaño sepalo flor por especie',
    xaxis_title='Especie',
    # The y axis plots sepal_length; the original label said "petalo".
    yaxis_title='Longitud de sepalo',
    font=dict(family='verdana', size=16, color='white'),
    template='plotly_dark',
    height=500,
    width=900,
    legend_bgcolor='rgb(117, 112, 179)',
)
fig.show()

# Overlaid bar chart of sepal length per iris species.
fig = px.bar(
    iris,
    x='species',
    y='sepal_length',
    color='species',
    color_discrete_sequence=['rgb(175, 100, 88)', '#479B55', '#FA0087'],
    opacity=0.8,
    barmode='overlay',
)
fig.update_layout(
    title='Comparativa Tamaño sepalo flor por especie',
    xaxis_title='Especie',
    # The y axis plots sepal_length; the original label said "petalo".
    yaxis_title='Longitud de sepalo',
    font=dict(family='verdana', size=16, color='white'),
    template='plotly_dark',
    height=500,
    width=900,
    legend_bgcolor='rgb(117, 112, 179)',
    # Order the species categories in descending alphabetical order.
    xaxis={'categoryorder': 'category descending'},
)
fig.show()

import timeit  # used to time how long the models take to train

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model  # sample datasets + linear regression

# Load the diabetes dataset as a (features, target) pair.
X, y = datasets.load_diabetes(return_X_y=True)

# Keep only the feature at column index 2, with an extra axis inserted so the
# result is a single-column 2-D array, the layout the later steps take as input.
raw = X[:, np.newaxis, 2]

# Linear (max-min) scaling of the raw data into the interval [-1, 1].
# ndarray.max()/min() replace the builtin max()/min(), which iterate row by
# row in Python and only work here because every row holds a single element.
max_raw = raw.max()
min_raw = raw.min()
scaled = (2*raw - max_raw - min_raw)/(max_raw - min_raw)

# Z-score normalization: subtract the mean and divide by the standard deviation.
avg = np.average(raw)  # mean of raw
std = np.std(raw)      # population standard deviation of raw
z_scaled = (raw - avg)/std

# Plot the data before and after each scaling, on a shared x axis, to compare
# how the two methods change the distribution.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True, tight_layout=True)

ax1.hist(raw, color='#FA0087', alpha=0.8)
ax1.set_title('Primer Plot usando raw')

ax2.hist(scaled, color='#FA0087', alpha=0.8)
ax2.set_title('Segunto Plot usando scaled')

ax3.hist(z_scaled, color='#FA0087', alpha=0.8)
ax3.set_title('Tercer Plot usando z_scaled')

plt.show()

# Training routines: each fits a fresh linear regression on one version of the data.


def train_raw():
    """Fit a linear regression of ``y`` on the unscaled feature ``raw``."""
    model = linear_model.LinearRegression()
    model.fit(raw, y)


def train_scaled():
    """Fit a linear regression of ``y`` on the max-min scaled feature ``scaled``."""
    model = linear_model.LinearRegression()
    model.fit(scaled, y)


def train_z_scaled():
    """Fit a linear regression of ``y`` on the z-score feature ``z_scaled``."""
    model = linear_model.LinearRegression()
    model.fit(z_scaled, y)

# Time each training routine: timeit runs the callable 100 times and returns
# the total elapsed time in seconds.
raw_time = timeit.timeit(train_raw, number=100)
scaled_time = timeit.timeit(train_scaled, number=100)
# The original timed train_raw a second time here, so the z-score timing
# never measured the z-scaled model.
z_scaled_time = timeit.timeit(train_z_scaled, number=100)

print(f'train raw: {raw_time}')
print(f'train scaled: {scaled_time}')
print(f'trainning time for z_scaled data : {z_scaled_time}')

df = pd.read_csv('cars.csv')

# This histogram shows how strongly skewed the price distribution is.
fig2 = px.histogram(
    df,
    x='price_usd',
    nbins=10,
    opacity=0.8,
    color_discrete_sequence=['#FA0087'],
)
skew_layout = {
    'title': 'Grafica curso estadistica descriptiva',
    'font': {'family': 'verdana', 'size': 16, 'color': 'white'},
    'template': 'plotly_dark',
    'height': 500,
    'width': 900,
}
fig2.update_layout(**skew_layout)
fig2.show()

# Non-linear transformation with tanh(x / p).
p = 8000

# np.tanh is applied to the whole column at once instead of calling a Python
# lambda per row through .apply; the resulting Series keeps the column name.
squashed_price = np.tanh(df.price_usd / p)

fig2 = px.histogram(
    df,
    x=squashed_price,
    nbins=10,
    color_discrete_sequence=['#FA0087'],
    opacity=0.8,
)
fig2.update_layout(
    title='Grafica curso estadistica descriptiva',
    font=dict(family='verdana', size=16, color='white'),
    template='plotly_dark',
    height=500,
    width=900,
)
fig2.show()

import pandas as pd

df = pd.read_csv('cars.csv')

# One-hot encode the engine_type column with pandas: one indicator column
# per category.
engine_type = df['engine_type']
pd.get_dummies(engine_type)

from sklearn import preprocessing

# One-hot encoder configured with handle_unknown='ignore', so categories not
# seen during fit do not raise at transform time.
encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')

# Learn the categories of engine_type (passed as a single-column 2-D array).
engine_type_values = df[['engine_type']].values
encoder.fit(engine_type_values)

# Encode three sample values; 'aceite' is presumably not a category in the
# data and is there to show how an unknown value is encoded.
samples = [['gasoline'], ['diesel'], ['aceite']]
encoder.transform(samples).toarray()

# Re-fit the same encoder, this time treating the numeric year_produced
# column as a categorical variable.
year_values = df[['year_produced']].values
encoder.fit(year_values)

# Encode three sample years; 190 is presumably not a year present in the data.
sample_years = [[2016], [2009], [190]]
encoder.transform(sample_years).toarray()

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler

iris = sns.load_dataset('iris')

# Pairwise scatter plots of the iris columns, coloured by species; corner=True
# draws only the lower triangle of the grid.
# Note: this kind of plot stops being useful when there are too many variables.
g = sns.pairplot(
    iris,
    hue='species',
    palette='Set2',
    kind='scatter',
    height=1.5,
    corner=True,
)
pair_figure = g.fig
pair_figure.set_size_inches(16, 8)
pair_figure.suptitle('Grafica de covarianza')
plt.show()

# Column names of the iris dataset.
iris.columns

# Standardize the four numeric measurement columns.
measurement_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
scaler = StandardScaler()
scaled = scaler.fit_transform(iris[measurement_columns])

# Transposed view: one row per variable.
scaled.T

# Covariance matrix of the standardized variables (np.cov takes one row per
# variable, hence the transpose).
covariance_matrix = np.cov(scaled.T)
covariance_matrix

# Same matrix as a DataFrame labelled with the variable names on both axes.
variable_labels = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
iris_covariance_matrix = pd.DataFrame(
    covariance_matrix,
    index=list(variable_labels),
    columns=list(variable_labels),
)
iris_covariance_matrix

# Heat map of the covariance matrix.
fig = px.imshow(iris_covariance_matrix, color_continuous_scale='Portland')
covariance_layout = {
    'title': 'Heatmaps de matrix de covarianza de nuestro dataset iris',
    'font': {'family': 'verdana', 'size': 16, 'color': 'white'},
    'template': 'plotly_dark',
    'height': 750,
    'width': 900,
}
fig.update_layout(**covariance_layout)
fig.show()

# Heat map of the correlation matrix of the iris measurements.
# Only numeric columns are correlated: on pandas >= 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on the string
# 'species' column.
iris_correlation = iris.select_dtypes(include='number').corr()

fig = px.imshow(iris_correlation, color_continuous_scale='Portland')
fig.update_layout(
    title='Heatmaps de correlacion de nuestro dataset iris',
    font=dict(family='verdana', size=16, color='white'),
    template='plotly_dark',
    height=750,
    width=900,
)
fig.show()