!pip install statsmodels==0.14.0
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import scipy.stats as stats
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import median_absolute_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Load the semicolon-delimited Boston housing CSV into a DataFrame.
df = pd.read_csv("datasets/AT_BostonHousing_Data.csv", delimiter=";")
# Notebook-style inspection: first ten rows, then missing-value count per column.
df.head(n=10)
df.isnull().sum()
# Subset of columns used for the summary statistics and box plots.
df_numeric_stats = df.loc[:, ["age", "tax", "medv"]]
df_numeric_stats.describe()
# One figure with three side-by-side box plots, one per summarised column.
plt.figure(figsize=(12, 6))
_boxplot_panels = (
    ('age', 'Age Box Plot'),
    ('tax', 'Tax Boxplot'),
    ('medv', 'Medv Boxplot'),
)
for _position, (_column, _title) in enumerate(_boxplot_panels, start=1):
    # Subplot positions are 1-based: 1 row, 3 columns.
    plt.subplot(1, 3, _position)
    sns.boxplot(x=df_numeric_stats[_column], data=df)
    plt.title(_title)
plt.tight_layout()
plt.show()
df['rm'].describe()


def plot_hist(df, col, bins=30, color='red'):
    """Show a histogram with a KDE overlay for one column of ``df``.

    Args:
        df: Object indexable by column name (``df[col]``), e.g. a DataFrame.
        col: Column to plot; also used in the title and as the x-axis label.
        bins: Number of histogram bins passed to ``sns.histplot``.
            Defaults to 30, the value previously hard-coded.
        color: Bar colour passed to ``sns.histplot``.
            Defaults to ``'red'``, the value previously hard-coded.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    plt.figure(figsize=(10, 6))
    sns.histplot(df[col], bins=bins, kde=True, color=color, edgecolor='black')
    plt.title(f'Distribuição da Coluna "{col}"')
    plt.xlabel(col)
    plt.ylabel('Frequência')
    plt.show()


plot_hist(df, 'rm')
def plot_corr_heatmap(df):
    """Show a heatmap of the pairwise correlations between columns of ``df``.

    Only numeric and boolean columns are correlated. Restricting the frame
    up front keeps the call working when ``df`` also holds text columns,
    which would make a plain ``df.corr()`` raise on pandas >= 2.0.

    Args:
        df: DataFrame whose columns are correlated against each other.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    correlation_matrix = numeric_df.corr()
    # True on and above the diagonal: those cells are hidden, so each pair
    # appears once and the diagonal is omitted.
    upper_mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

    plt.figure(figsize=(12, 10))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        cmap='coolwarm',
        fmt=".2f",
        linewidths=.5,
        mask=upper_mask,
    )
    plt.title('Mapa de Calor - Correlação entre todas as variáveis')
    plt.show()


plot_corr_heatmap(df)
# Separate the regression target column from the remaining columns.
dependent_var = 'medv'
data_X = df.drop(dependent_var, axis=1)
data_Y = df[dependent_var]
_independent_names = ', '.join(data_X.columns)
print(f"Variáveis Independentes: {_independent_names}")
print(f"Variável Dependente: {dependent_var}")

# Lower-triangle grid of pairwise scatter plots, each with a red regression fit.
g = sns.PairGrid(df, corner=True).map_lower(
    sns.regplot,
    scatter_kws={'alpha': 0.15},
    line_kws={'color': 'red'},
)
plt.show()
# Save the image or open the link in another tab for a better view
# Narrow the predictors down to three columns.
data_X = df.loc[:, ['zn', 'dis', 'rm']]
data_X.head()
# NOTE(review): shuffle=False means the last 25% of rows, in file order,
# become the test set — confirm the row order carries no structure.
train_X, test_X, train_Y, test_Y = train_test_split(
    data_X,
    data_Y,
    test_size=0.25,
    shuffle=False,
)
print(
    f"Train X: {train_X.shape} | Test X: {test_X.shape} | "
    f"Train Y: {len(train_Y)} | Test Y: {len(test_Y)}"
)
def print_min_max(df, column):
    """Print the minimum and maximum of one column, followed by a blank line.

    Args:
        df: Object indexable by column name whose columns expose ``min()``
            and ``max()``, e.g. a DataFrame.
        column: Name of the column to summarise.

    Returns:
        None. The values are written to standard output.
    """
    min_value = df[column].min()
    max_value = df[column].max()
    print(f"Column: {column}")
    print(f"Minimum Value: {min_value}")
    print(f"Maximum Value: {max_value}")
    # Blank separator line so consecutive calls stay readable.
    print()
# NOTE(review): placeholder value; not referenced anywhere in the visible code.
column_name = 'your_column'
# Report the raw value range of each selected predictor.
for _feature in ('zn', 'dis', 'rm'):
    print_min_max(df, _feature)

# Fit the min-max scaler on the training features and rescale them.
MinMax_scaler = MinMaxScaler()
train_X_minmax = MinMax_scaler.fit(train_X).transform(train_X)
train_X_minmax
# Wrap the scaled array in a DataFrame to inspect the resulting statistics.
df_X_train_minmax = pd.DataFrame(data=train_X_minmax, columns=['zn', 'dis', 'rm'])
df_X_train_minmax.describe()
# Simple linear regression: 'rm' is the only predictor.
# scikit-learn expects 2-D feature input, hence the reshape to one column.
train_X_rm = train_X['rm'].to_numpy().reshape(-1, 1)
test_X_rm = test_X['rm'].to_numpy().reshape(-1, 1)
regressor_simple = LinearRegression().fit(train_X_rm, train_Y)
predict_y_simple = regressor_simple.predict(test_X_rm)
_rmse_simple = math.sqrt(mean_squared_error(test_Y, predict_y_simple))
print('RMSE para regressão linear simples: ', _rmse_simple)

# Multiple linear regression: every column of train_X is a predictor.
regressor_multiple = LinearRegression().fit(train_X, train_Y)
predict_y_multiple = regressor_multiple.predict(test_X)
_rmse_multiple = math.sqrt(mean_squared_error(test_Y, predict_y_multiple))
print('RMSE para regressão linear múltipla: ', _rmse_multiple)
# Load the asset prices; the 'Date' column becomes the index.
df_assets = pd.read_csv('datasets/AT_Ativos_PETR_VALE.csv', index_col='Date')
df_assets.head()
# Copy the index back into a regular column so melt can use it as the id.
df_assets['Date'] = df_assets.index
# Wide -> long: one row per (Date, Asset) pair with its Price.
df_long_assets = df_assets.melt(id_vars='Date', var_name='Asset', value_name='Price')
# Index by the still-unparsed Date values while keeping the column.
df_long_assets = df_long_assets.set_index('Date', drop=False)
# Only the column is converted to datetime; the index keeps the raw values.
df_long_assets['Date'] = pd.to_datetime(df_long_assets['Date'])
df_long_assets.head()
# Price over time, one line per asset.
plt.figure(figsize=(20, 10))
ax = sns.lineplot(data=df_long_assets, x='Date', y='Price', hue='Asset')
ax.set_ylabel('Price', fontsize=10)
plt.xticks(fontsize=9)
# Major x ticks every three months, labelled as abbreviated month over year.
loc = mdates.MonthLocator(interval=3)
fmt = mdates.DateFormatter('%b\n%Y')
ax.xaxis.set_major_locator(loc)
ax.xaxis.set_major_formatter(fmt)
df_long_assets['Asset'].unique()
# Keep only the PETR3 rows.
df_long_petr3 = df_long_assets.loc[df_long_assets['Asset'] == 'PETR3']
df_long_petr3['Asset'].unique()

# PETR3 price with its 15-, 50- and 100-row rolling means overlaid.
fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(df_long_petr3['Date'], df_long_petr3['Price'], label='PETR3')
for _window in (15, 50, 100):
    _rolling_mean = df_long_petr3['Price'].rolling(window=_window).mean()
    ax.plot(df_long_petr3['Date'], _rolling_mean, label=f'Mean {_window}d')
# Major x ticks every three months, labelled as abbreviated month over year.
loc = mdates.MonthLocator(interval=3)
fmt = mdates.DateFormatter('%b\n%Y')
ax.xaxis.set_major_locator(loc)
ax.xaxis.set_major_formatter(fmt)
ax.legend()
plt.show()
# Keep only the VALE3 rows.
df_long_vale3 = df_long_assets[df_long_assets['Asset'] == 'VALE3']
df_long_vale3['Asset'].unique()

# VALE3 price with its 200-row rolling mean overlaid.
fig, ax = plt.subplots(figsize=(20, 8))
# The series plotted here is VALE3, so the legend entry must say so
# (it was previously labelled 'PETR3').
ax.plot(df_long_vale3['Date'], df_long_vale3['Price'], label='VALE3')
ax.plot(
    df_long_vale3['Date'],
    df_long_vale3['Price'].rolling(window=200).mean(),
    label='Mean 200d',
)
# Major x ticks every three months, labelled as abbreviated month over year.
loc = mdates.MonthLocator(interval=3)
ax.xaxis.set_major_locator(loc)
fmt = mdates.DateFormatter('%b\n%Y')
ax.xaxis.set_major_formatter(fmt)
ax.legend()
plt.show()