import timeit
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
# Load the diabetes regression data and keep only feature column 2 as a
# 2-D array of shape (n_samples, 1), the layout LinearRegression.fit expects.
X, y = datasets.load_diabetes(return_X_y=True)
raw = X[:, None, 2]

# Min-max scaling onto [-1, 1]: the column maximum maps to 1, the minimum to -1.
# ndarray reductions replace the builtin max()/min(), which iterate row by row
# in Python and fail with an ambiguous-truth-value error for multi-column input.
# axis=0 keeps the same shape-(1,) result the builtins produced here.
max_raw = raw.max(axis=0)
min_raw = raw.min(axis=0)
scaled = (2 * raw - max_raw - min_raw) / (max_raw - min_raw)

# Z-score normalisation: subtract the mean and divide by the standard deviation.
avg = np.average(raw)
std = np.std(raw)
z_scaled = (raw - avg) / std

# One histogram per variant, stacked vertically. The x axis is shared so the
# change in range between raw, min-max scaled and z-scored data is visible.
fig, axs = plt.subplots(3, 1, sharex=True, tight_layout=True)
axs[0].hist(raw)
axs[1].hist(scaled)
axs[2].hist(z_scaled)
# training functions, one per feature scaling (used as timeit targets)
def train_raw():
    """Fit a fresh LinearRegression on the unscaled feature `raw` against `y`.

    The fitted estimator is discarded; the function exists to be timed.
    """
    estimator = linear_model.LinearRegression()
    estimator.fit(raw, y)
def train_scaled():
    """Fit a fresh LinearRegression on the min-max scaled feature `scaled` against `y`.

    The fitted estimator is discarded; the function exists to be timed.
    """
    estimator = linear_model.LinearRegression()
    estimator.fit(scaled, y)
def train_z_scaled():
    """Fit a fresh LinearRegression on the z-scored feature `z_scaled` against `y`.

    The fitted estimator is discarded; the function exists to be timed.
    """
    estimator = linear_model.LinearRegression()
    estimator.fit(z_scaled, y)
# Time 100 fits for each scaling variant. Each measurement must target its own
# training function; timing train_raw three times would compare the raw fit
# against itself and make the report below meaningless.
raw_time = timeit.timeit(train_raw, number=100)
scaled_time = timeit.timeit(train_scaled, number=100)
z_scaled_time = timeit.timeit(train_z_scaled, number=100)

# Report the total seconds for the 100 runs of each variant.
print('trainning time for raw data : {} '.format(raw_time))
print('trainning time for scaled data : {}'.format(scaled_time))
print('trainning time for z_scaled data : {}'.format(z_scaled_time))
# Load the cars dataset and plot the distribution of the raw prices.
df = pd.read_csv('cars.csv')
df.price_usd.hist()

# Non-linear squashing with tanh: p is the price scale, so prices well above p
# saturate toward 1 while prices well below p stay close to linear.
# np.tanh is applied to the whole Series in one vectorized call instead of a
# per-element Python lambda; the result is still a Series, so .hist() works.
# NOTE(review): with no explicit ax=, both histograms presumably land on the
# current matplotlib axes when run as a plain script - confirm intended.
p = 10000
np.tanh(df.price_usd / p).hist()