# Quick first look at the cars dataset: preview rows, column names,
# dtypes, and numeric summary statistics. Bare expressions are
# notebook-style cells; they only display output in an interactive session.
import pandas as pd
df = pd.read_csv('cars.csv')
df.head()
df.columns
df.dtypes
df.describe()
# Central tendency and distribution of car prices, then drill-downs
# into two specific models (notebook-style cells).
import pandas as pd
df = pd.read_csv('cars.csv')
df['price_usd'].mean()
df['price_usd'].median()
df['price_usd'].plot.hist(bins=20)
import seaborn as sns
# Price distribution split by categorical columns.
sns.displot(df, x='price_usd', hue='manufacturer_name')
sns.displot(df, x='price_usd', hue='engine_type', multiple='stack')
df.groupby('engine_type').count()
# Audi Q7 subset: price by production year.
is_audi = df['manufacturer_name'] == 'Audi'
is_q7 = df['model_name'] == 'Q7'
audi_q7 = df[is_audi & is_q7]
sns.histplot(audi_q7, x='price_usd', hue='year_produced')
df.groupby('transmission').count()
# Manual-transmission Ford Focus subset.
is_ford = df['manufacturer_name'] == 'Ford'
is_manual = df['transmission'] == 'mechanical'
is_focus = df['model_name'] == 'Focus'
ford_focus = df[is_ford & is_manual & is_focus]
ford_focus
sns.histplot(ford_focus, x='price_usd', hue='year_produced')
sns.histplot(ford_focus, x='price_usd', hue='color')
sns.histplot(ford_focus, x='price_usd', hue='engine_type')
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Dispersion measures for the price column: standard deviation, range,
# quartiles, IQR, and Tukey fences, followed by distribution plots.
df = pd.read_csv('cars.csv')
price = df['price_usd']
# Standard deviation.
price.std()
# Range = max value - min value.
price.max() - price.min()
# Five-number summary in a single quantile call
# (quantile(0)/quantile(1) are the min/max; quantile(0.5) is the median).
min_val, q1, median, q3, max_val = price.quantile([0.0, 0.25, 0.5, 0.75, 1.0])
print("Min=", min_val, "Q1=", q1, "Q2=", median, "Q3=", q3, "Max=", max_val)
# Interquartile range.
iqr = q3 - q1
iqr
# Tukey fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are outliers.
min_limit = q1 - (1.5 * iqr)
max_limit = q3 + (1.5 * iqr)
print("Valor mínimo =", min_limit, "| Valor máximo =", max_limit)
sns.histplot(price)
sns.boxplot(price)
sns.boxplot(data=df, x='engine_fuel', y='price_usd')
sns.boxplot(data=df, x='color', y='price_usd')
import pandas as pd
import seaborn as sns
# Explore seaborn's built-in iris dataset and compare bivariate views.
iris = sns.load_dataset('iris')
iris.head()
iris.dtypes
iris.describe()
# scatterplot: petal vs sepal length, colored by species
sns.scatterplot(data=iris, x='sepal_length', y='petal_length', hue='species')
# jointplot: same scatter plus marginal distributions per species
sns.jointplot(data=iris, x='sepal_length', y='petal_length', hue='species')
# boxplot: sepal-length spread per species
sns.boxplot(data=iris, x='species', y='sepal_length')
# barplot: petal length aggregated per species (seaborn's default estimator)
sns.barplot(data=iris, y='species', x='petal_length')
import timeit
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
# Min-max scaling demo on one diabetes feature: linearly map the feature
# onto [-1, 1], then compare LinearRegression fit times on raw vs scaled data.
X, y = datasets.load_diabetes(return_X_y=True)
raw = X[:, None, 2]  # one feature column, kept 2-D as sklearn expects
# Scaling rule: map [min, max] linearly onto [-1, 1].
# Use ndarray.max()/min() (true scalars) rather than the builtins, which
# would iterate over the length-1 row arrays of the 2-D array.
max_raw = raw.max()
min_raw = raw.min()
scaled = (2*raw - max_raw - min_raw) / (max_raw - min_raw)
fig, axs = plt.subplots(2, 1, sharex=True)
axs[0].hist(raw)
axs[1].hist(scaled)
# Training benchmarks: same model, raw vs scaled input, 100 fits each.
def train_raw():
    linear_model.LinearRegression().fit(raw, y)
def train_scaled():
    linear_model.LinearRegression().fit(scaled, y)
raw_time = timeit.timeit(train_raw, number=100)
scaled_time = timeit.timeit(train_scaled, number=100)
print("train raw: {}".format(raw_time))
print("train scaled: {}".format(scaled_time))
# Z-score scaling: center on the mean, divide by the standard deviation.
import numpy as np
mean_raw = np.mean(raw)
stdev_raw = np.std(raw)
scaled_zscore = (raw - mean_raw) / stdev_raw
fig, axs = plt.subplots(2, 1, sharex=True)
axs[0].hist(raw)
axs[1].hist(scaled_zscore)
# Training benchmark: raw vs z-scored input.
def train_raw():
    linear_model.LinearRegression().fit(raw, y)
def train_scaled():
    linear_model.LinearRegression().fit(scaled_zscore, y)
# BUG FIX: the raw model was timed with number=1 while the scaled model
# used number=100, so the two printed times were not comparable.
# Use number=100 for both (matching the min-max benchmark above).
raw_time = timeit.timeit(train_raw, number=100)
scaled_time = timeit.timeit(train_scaled, number=100)
print("train raw: {}".format(raw_time))
print("train scaled: {}".format(scaled_time))
# Non-linear (tanh) transform to squash the heavy right tail of prices.
df = pd.read_csv('cars.csv')
df['price_usd'].hist()
scale_factor = 10_000
# Vectorized np.tanh over the Series is equivalent to applying it per element.
np.tanh(df['price_usd'] / scale_factor).hist()
# One-hot encoding two ways: pandas dummies vs sklearn's OneHotEncoder.
import pandas as pd
df = pd.read_csv("cars.csv")
pd.get_dummies(df['engine_type'])
from sklearn import preprocessing
one_hot = preprocessing.OneHotEncoder(handle_unknown='ignore')
one_hot.fit(df[['engine_type']].values)
# An unseen category ('aceite') encodes as an all-zero row because of
# handle_unknown='ignore'.
one_hot.transform([['gasoline'], ['diesel'], ['aceite']]).toarray()
# Re-fit on a numeric column; the unseen year 2031 also maps to all zeros.
one_hot.fit(df[['year_produced']].values)
one_hot.transform([[2016], [2009], [2031]]).toarray()
# Standardize the four iris features and visualize their covariance matrix.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
iris = sns.load_dataset('iris')
sns.pairplot(iris, hue='species')
iris.columns
# Hoist the feature list so it is defined once and reused for the axis labels.
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
scaler = StandardScaler()
scaled = scaler.fit_transform(iris[feature_names])
scaled.T
# np.cov expects variables in rows and observations in columns.
covariance_matrix = np.cov(scaled.T)
covariance_matrix
plt.figure(figsize=(10, 10))
sns.set(font_scale=1.5)
hm = sns.heatmap(covariance_matrix,
                 cbar=True,
                 annot=True,
                 square=True,
                 fmt='.2f',
                 annot_kws={'size': 12},
                 yticklabels=feature_names,
                 xticklabels=feature_names)
# Since the data is standardized, this equals the correlation heatmap:
# sns.heatmap(iris.corr(), annot=True)
# Small linear-algebra demos with NumPy: matrix product, inverse,
# and eigen decomposition.
import numpy as np
# Matrix multiplication via the @ operator (equivalent to np.matmul).
A = np.array([[2, 4], [-1, 2]])
B = np.array([[2, 3], [-4, 1]])
A @ B
# Matrix inverse.
A = np.array([[2, 4], [-1, 2]])
Ainversa = np.linalg.inv(A)
Ainversa
# Sanity check: A @ A^-1 should give the identity matrix.
A @ Ainversa
# Eigenvalues and eigenvectors of a new matrix A.
A = np.array([[1, 2], [1, 0]])
values, vectors = np.linalg.eig(A)
print(values, "\n")
print(vectors)
# Each eigenpair satisfies A v = lambda v; the two expressions below
# evaluate to the same vector.
A @ vectors.T[1]
values[1] * vectors.T[1]
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
# PCA on the standardized iris features: eigen decomposition of the
# covariance matrix, explained variance per component, and a
# 2-component projection plotted against species.
iris = sns.load_dataset('iris')
scaler = StandardScaler()
scaled = scaler.fit_transform(
    iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']].values
)
covariance_matrix = np.cov(scaled.T)
covariance_matrix
sns.pairplot(iris)
sns.jointplot(x=iris['petal_length'], y=iris['petal_width'])
sns.jointplot(x=scaled[:, 2], y=scaled[:, 3])
eigen_values, eigen_vectors = np.linalg.eig(covariance_matrix)
eigen_values
eigen_vectors
# Percentage of the total variance captured by each eigenvalue.
# Hoist the (loop-invariant) total out of the loop and use a comprehension
# instead of recomputing sum(eigen_values) on every iteration.
total_variance = sum(eigen_values)
variance_explained = [(ev / total_variance) * 100 for ev in eigen_values]
print(variance_explained)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(scaled)
pca.explained_variance_ratio_
reduced_scaled = pca.transform(scaled)
reduced_scaled
scaled
iris['pca_1'] = reduced_scaled[:, 0]
iris['pca_2'] = reduced_scaled[:, 1]
iris
# BUG FIX: jointplot no longer accepts positional x/y arguments
# (removed in seaborn 0.12, raising TypeError); pass data/x/y/hue
# as keywords instead.
sns.jointplot(data=iris, x='pca_1', y='pca_2', hue='species')