!pip install statsmodels==0.14.0
Run to view results
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import scipy.stats as stats
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import median_absolute_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
Run to view results
# Load the Boston housing data; fields in this file are separated by ';'.
boston_csv_path = "datasets/AT_BostonHousing_Data.csv"
df = pd.read_csv(boston_csv_path, sep=";")
# Preview the first ten rows.
df.head(n=10)
Run to view results
# Count the missing values in every column.
missing_mask = df.isna()
missing_mask.sum()
Run to view results
# Keep three columns of interest and show their summary statistics.
df_numeric_stats = df.loc[:, ["age", "tax", "medv"]]
df_numeric_stats.describe()
Run to view results
# One box plot per column, laid out side by side in a single 1x3 figure.
plt.figure(figsize=(12, 6))
boxplot_titles = {
    'age': 'Age Box Plot',
    'tax': 'Tax Boxplot',
    'medv': 'Medv Boxplot',
}
for subplot_pos, (box_col, box_title) in enumerate(boxplot_titles.items(), start=1):
    plt.subplot(1, 3, subplot_pos)
    sns.boxplot(x=df_numeric_stats[box_col], data=df)
    plt.title(box_title)
plt.tight_layout()
plt.show()
Run to view results
# Summary statistics for the 'rm' column.
rm_column = df['rm']
rm_column.describe()
Run to view results
def plot_hist(df: pd.DataFrame, col: str) -> None:
    """Show a 30-bin histogram with a KDE overlay for one column of ``df``.

    Args:
        df: Frame that contains the column to plot.
        col: Label of the column; also used in the title and the x-axis label.
    """
    plt.figure(figsize=(10, 6))
    sns.histplot(df[col], bins=30, kde=True, color='red', edgecolor='black')
    plt.title(f'Distribuição da Coluna "{col}"')
    plt.xlabel(col)
    plt.ylabel('Frequência')
    plt.show()
# Distribution of the 'rm' column.
plot_hist(df, col='rm')
Run to view results
def plot_corr_heatmap(df: pd.DataFrame) -> None:
    """Show an annotated heatmap of the pairwise correlations in ``df``.

    Args:
        df: Frame whose correlation matrix (``df.corr()``) is plotted.
    """
    plt.figure(figsize=(12, 10))
    correlation_matrix = df.corr()
    # Hide the upper triangle (diagonal included) so each pair appears once.
    upper_triangle = np.triu(np.ones_like(correlation_matrix, dtype=bool))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        cmap='coolwarm',
        fmt=".2f",
        linewidths=.5,
        mask=upper_triangle,
    )
    plt.title('Mapa de Calor - Correlação entre todas as variáveis')
    plt.show()
# Correlation heatmap over every column of the housing data.
plot_corr_heatmap(df=df)
Run to view results
# Separate the target column from the remaining (predictor) columns.
dependent_var = 'medv'
data_X = df.drop(dependent_var, axis=1)
data_Y = df[dependent_var]
independent_names = ', '.join(data_X.columns)
print(f"Variáveis Independentes: {independent_names}")
print(f"Variável Dependente: {dependent_var}")
Run to view results
# Lower-triangle grid of pairwise scatter plots, each with a red fitted line.
# Save the image or open the link in another tab for a better view.
pair_scatter_style = {'alpha': 0.15}
pair_line_style = {'color': 'red'}
g = sns.PairGrid(df, corner=True)
g = g.map_lower(
    sns.regplot,
    scatter_kws=pair_scatter_style,
    line_kws=pair_line_style,
)
plt.show()
Run to view results
# Restrict the predictors to the three columns used by the models below.
data_X = df.loc[:, ['zn', 'dis', 'rm']]
data_X.head()
Run to view results
# Hold out 25% of the rows for testing; shuffle=False splits the rows in
# their existing order instead of sampling them randomly.
split_parts = train_test_split(data_X, data_Y, test_size=0.25, shuffle=False)
train_X, test_X, train_Y, test_Y = split_parts
print(f"Train X: {train_X.shape} | Test X: {test_X.shape} | Train Y: {train_Y.shape[0]} | Test Y: {test_Y.shape[0]}")
Run to view results
def print_min_max(df: pd.DataFrame, column: str) -> None:
    """Print the smallest and largest value found in one column of ``df``.

    Args:
        df: Frame that contains ``column``.
        column: Label of the column to inspect.
    """
    min_value = df[column].min()
    max_value = df[column].max()
    print(f"Column: {column}")
    print(f"Minimum Value: {min_value}")
    print(f"Maximum Value: {max_value}")
    print()  # blank line so consecutive reports stay visually separated
column_name = 'your_column'  # placeholder value; not used by the calls below
# Report the value range of each selected feature.
for feature_label in ('zn', 'dis', 'rm'):
    print_min_max(df, feature_label)
Run to view results
# Fit the scaler on the training features, then rescale those same features.
MinMax_scaler = MinMaxScaler()
MinMax_scaler.fit(train_X)
train_X_minmax = MinMax_scaler.transform(train_X)
train_X_minmax
Run to view results
# Wrap the scaled array in a labelled frame and summarise it.
minmax_column_labels = ['zn', 'dis', 'rm']
df_X_train_minmax = pd.DataFrame(data=train_X_minmax, columns=minmax_column_labels)
df_X_train_minmax.describe()
Run to view results
# Simple linear regression: predict the target from 'rm' alone.
# The single feature is reshaped into a one-column 2-D array for fit/predict.
train_X_rm = train_X['rm'].values.reshape(-1, 1)
test_X_rm = test_X['rm'].values.reshape(-1, 1)
regressor_simple = LinearRegression().fit(train_X_rm, train_Y)
predict_y_simple = regressor_simple.predict(test_X_rm)
mse_simple = mean_squared_error(test_Y, predict_y_simple)
print('RMSE para regressão linear simples: ', math.sqrt(mse_simple))
Run to view results
# Multiple linear regression: predict the target from every column of train_X.
regressor_multiple = LinearRegression().fit(train_X, train_Y)
predict_y_multiple = regressor_multiple.predict(test_X)
mse_multiple = mean_squared_error(test_Y, predict_y_multiple)
print('RMSE para regressão linear múltipla: ', math.sqrt(mse_multiple))
Run to view results
# Load the asset prices, using the 'Date' column as the row index.
assets_csv_path = 'datasets/AT_Ativos_PETR_VALE.csv'
df_assets = pd.read_csv(assets_csv_path, index_col='Date')
df_assets.head(n=5)
Run to view results
# Reshape from one column per asset to long format (Date, Asset, Price).
# Copy the index into a regular column so melt can use it as the id variable.
df_assets['Date'] = df_assets.index
df_long_assets = df_assets.melt(id_vars='Date', var_name='Asset', value_name='Price')
# The index is taken from 'Date' before that column is converted to datetime.
df_long_assets.index = df_long_assets['Date']
df_long_assets['Date'] = pd.to_datetime(df_long_assets['Date'])
df_long_assets.head()
Run to view results
# Price history of every asset on one chart, with a tick every three months
# labelled as abbreviated month over year.
loc = mdates.MonthLocator(interval=3)
fmt = mdates.DateFormatter('%b\n%Y')
plt.figure(figsize=(20, 10))
ax = sns.lineplot(data=df_long_assets, x='Date', y='Price', hue='Asset')
plt.ylabel('Price', fontsize=10)
plt.xticks(fontsize=9)
ax.xaxis.set_major_locator(loc)
ax.xaxis.set_major_formatter(fmt)
Run to view results
# List the distinct asset labels present in the long-format frame.
asset_labels = df_long_assets['Asset']
asset_labels.unique()
Run to view results
# Keep only the PETR3 rows and confirm that no other asset remains.
is_petr3 = df_long_assets['Asset'] == 'PETR3'
df_long_petr3 = df_long_assets[is_petr3]
df_long_petr3['Asset'].unique()
Run to view results
# PETR3 price together with its 15-, 50- and 100-row rolling means.
fig, ax = plt.subplots(figsize=(20, 8))
petr3_dates = df_long_petr3['Date']
petr3_prices = df_long_petr3['Price']
ax.plot(petr3_dates, petr3_prices, label='PETR3')
for window_size in (15, 50, 100):
    rolling_mean = petr3_prices.rolling(window=window_size).mean()
    ax.plot(petr3_dates, rolling_mean, label=f'Mean {window_size}d')
# One major tick every three months, labelled as abbreviated month over year.
loc = mdates.MonthLocator(interval=3)
fmt = mdates.DateFormatter('%b\n%Y')
ax.xaxis.set_major_locator(loc)
ax.xaxis.set_major_formatter(fmt)
ax.legend()
plt.show()
Run to view results
# Keep only the VALE3 rows and confirm that no other asset remains.
is_vale3 = df_long_assets['Asset'] == 'VALE3'
df_long_vale3 = df_long_assets[is_vale3]
df_long_vale3['Asset'].unique()
Run to view results
# VALE3 price together with its 200-row rolling mean.
fig, ax = plt.subplots(figsize=(20, 8))
# The series plotted here is VALE3, so the legend entry must say VALE3
# (it previously read 'PETR3', copied from the PETR3 chart).
ax.plot(df_long_vale3['Date'], df_long_vale3['Price'], label='VALE3')
ax.plot(df_long_vale3['Date'], df_long_vale3['Price'].rolling(window=200).mean(), label='Mean 200d')
# One major tick every three months, labelled as abbreviated month over year.
loc = mdates.MonthLocator(interval=3)
ax.xaxis.set_major_locator(loc)
fmt = mdates.DateFormatter('%b\n%Y')
ax.xaxis.set_major_formatter(fmt)
ax.legend()
plt.show()
Run to view results