Tipo de datos
import pandas as pd
# Load the cars dataset from a CSV file in the working directory.
df = pd.read_csv("cars.csv")
# Bare expression: a notebook renders the DataFrame when this ends a cell.
df
This chart is empty
Chart was probably not set up properly in the notebook
This chart is empty
Chart was probably not set up properly in the notebook
# Column dtypes, then pandas' summary statistics for the DataFrame.
df.dtypes
df.describe()

import pandas as pd

df = pd.read_csv("cars.csv")

# Central tendency of the price_usd column: mean and median.
price_usd = df["price_usd"]
price_usd.mean()
price_usd.median()

# Histogram of price_usd using 20 bins.
price_usd.plot.hist(bins=20)

import seaborn as sns

# Price distributions split by manufacturer, then by engine type with the
# per-type histograms stacked on top of each other.
sns.displot(data=df, x="price_usd", hue="manufacturer_name")
sns.displot(data=df, x="price_usd", hue="engine_type", multiple="stack")

# Number of non-null entries per column for each engine type.
df.groupby("engine_type").count()

# Restrict the data to a single model (Audi Q7) and inspect it.
is_audi = df["manufacturer_name"] == "Audi"
is_q7 = df["model_name"] == "Q7"
Q7_df = df[is_audi & is_q7]
Q7_df

# Q7 price histograms, coloured by production year and by engine type.
sns.histplot(data=Q7_df, x="price_usd", hue="year_produced")
sns.histplot(data=Q7_df, x="price_usd", hue="engine_type")
Medidas de dispersión
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Reload the dataset and keep a handle on the price column.
df = pd.read_csv("cars.csv")
price_usd = df["price_usd"]

# Standard deviation of the price.
price_usd.std()

# Range: distance between the largest and the smallest price.
rango = price_usd.max() - price_usd.min()
rango

# Five-number summary: minimum, first quartile, median, third quartile and
# maximum. Each quantile is requested with a scalar q, one call per value.
median = price_usd.median()
min_val, Q1, Q3, max_val = (
    price_usd.quantile(q=q) for q in (0, 0.25, 0.75, 1)
)
print(min_val, Q1, median, Q3, max_val)
1.0 2100.0 4800.0 8990.0 50000.0
# Interquartile range: width of the interval between the first and third
# quartiles.
IQR = Q3 - Q1
IQR

# Outlier limits (Tukey fences): values outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are treated as outliers.
minlimit = Q1 - 1.5 * IQR
# The upper limit ADDS the margin to Q3. Subtracting it (as this line did
# before) puts the upper limit below Q1, e.g. the previously recorded
# -1345.0, which would flag almost every price as an outlier.
maxlimit = Q3 + 1.5 * IQR
print(minlimit, maxlimit)
-8235.0 -1345.0
# Histogram of the price column.
sns.histplot(df["price_usd"])
# Box plot of the same column. The series is passed explicitly as `x`:
# seaborn deprecated positional data vectors here, and from version 0.12 the
# only valid positional argument is `data`, so the positional form can be
# misinterpreted.
sns.boxplot(x=df["price_usd"])
/shared-libs/python3.7/py/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
FutureWarning
# Box plot of price_usd grouped by the engine_fuel column (one box per value).
sns.boxplot(x="engine_fuel", y="price_usd", data=df)
Exploración visual de los datos
Diagramas de dispersión en el análisis de datos
import pandas as pd
import seaborn as sns
# Load seaborn's iris example dataset and preview the first rows.
iris = sns.load_dataset("iris")
iris.head()

# Sepal length vs. petal length, coloured by species: first as a scatter
# plot, then as a joint plot.
length_by_species = {
    "data": iris,
    "x": "sepal_length",
    "y": "petal_length",
    "hue": "species",
}
sns.scatterplot(**length_by_species)
sns.jointplot(**length_by_species)

# Sepal length per species, as a box plot and as a bar plot.
sepal_by_species = {"data": iris, "x": "species", "y": "sepal_length"}
sns.boxplot(**sepal_by_species)
sns.barplot(**sepal_by_species)

# Sepal width vs. petal width with a regression fit, coloured by species.
sns.lmplot(data=iris, hue="species", x="sepal_width", y="petal_width")
Transformaciones lineales
import pandas as pd
import numpy as np
import timeit #medir performance de modelos
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model #regresion lineal
# Load the diabetes dataset as (features, target) arrays.
X, y = datasets.load_diabetes(return_X_y=True)
# Keep only feature column 2; the None index keeps the result 2-D, with a
# single column.
raw = X[:, None, 2]

# Scaling rule: linear min-max map of the feature onto [-1, 1].
# The axis=0 reductions run inside NumPy instead of iterating the rows with
# the builtin max()/min(), and return the same one-element arrays.
max_raw = raw.max(axis=0)
min_raw = raw.min(axis=0)
scaled = (2 * raw - max_raw - min_raw) / (max_raw - min_raw)

# Compare the data before (top) and after (bottom) scaling.
fig, axs = plt.subplots(2, 1, sharex=True)
axs[0].hist(raw)
axs[1].hist(scaled)
# Training functions to benchmark.
# NOTE: these bodies used to read `linear_model:LinearRegression()...`. With a
# colon that is a bare variable annotation, which Python does not evaluate
# inside a function, so no model was ever fitted and the previously recorded
# microsecond timings measured empty calls.
def train_raw():
    """Fit a linear regression on the unscaled feature."""
    linear_model.LinearRegression().fit(raw, y)


def train_scaled():
    """Fit a linear regression on the scaled feature."""
    linear_model.LinearRegression().fit(scaled, y)


# Run each training function 100 times and record the total elapsed time.
raw_time = timeit.timeit(train_raw, number=100)
scaled_time = timeit.timeit(train_scaled, number=100)
print("train raw: {}".format(raw_time))
print("train scaled: {}".format(scaled_time))
# Author's expectation: training is faster on scaled data because scaling
# helps the algorithm converge better. Re-run to confirm with real fits.
train raw: 7.267000910360366e-06
train scaled: 7.0789974415674806e-06
Transformaciones no lineales
df = pd.read_csv("cars.csv")
# Histogram of the untransformed prices.
df.price_usd.hist()

# tanh(x / p) transformation; p sets the price scale at which tanh starts to
# saturate.
p = 10000
# np.tanh is applied to the whole column at once instead of calling a Python
# lambda per row; the result is still a Series, so .hist() works unchanged.
np.tanh(df.price_usd / p).hist()
Pipelines para procesar variables categóricas
import pandas as pd
df = pd.read_csv("cars.csv")

# pandas one-hot encoding: one indicator column per engine_type value, set
# for the row's own value and unset for the others.
pd.get_dummies(df["engine_type"])

# NOTE: the alias is spelled `prepocessing` (sic); it is kept as-is because
# other cells may refer to it.
import sklearn.preprocessing as prepocessing

# handle_unknown="ignore" lets transform() accept values that were not seen
# during fit instead of raising an error.
encoder = prepocessing.OneHotEncoder(handle_unknown="ignore")

# Fit on the engine types, then encode three sample values ("aceite" is
# presumably not present in the data and exercises the unknown-value path).
engine_types = df[["engine_type"]].values
encoder.fit(engine_types)
engine_samples = [["gasoline"], ["diesel"], ["aceite"]]
encoder.transform(engine_samples).toarray()

# Re-fit the same encoder on the production years, treating each year as a
# category, and encode three sample years.
production_years = df[["year_produced"]].values
encoder.fit(production_years)
year_samples = [[2016], [2009], [1990]]
encoder.transform(year_samples).toarray()
Correlaciones: covarianza y coeficiente de correlación
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
# Load the iris example dataset and plot every pairwise relationship,
# coloured by species.
iris = sns.load_dataset("iris")
sns.pairplot(iris, hue="species")
iris.columns

# Numeric feature columns; reused for scaling and as heat-map tick labels.
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Standardise the features before computing their covariance.
scaler = StandardScaler()
scaled = scaler.fit_transform(iris[feature_names])
scaled.T

# np.cov expects one variable per row, hence the transpose.
covariance_matrix = np.cov(scaled.T)
covariance_matrix

# Heat map of the covariance matrix, annotated with two-decimal values.
plt.figure(figsize=(10, 10))
sns.set(font_scale=1.5)
heatmap_options = dict(
    cbar=True,
    annot=True,
    square=True,
    fmt=".2f",
    annot_kws={"size": 12},
    yticklabels=feature_names,
    xticklabels=feature_names,
)
hm = sns.heatmap(covariance_matrix, **heatmap_options)