import pandas as pd
import numpy as np
data = pd.read_csv('https://www.ic.unicamp.br/~wainer/cursos/1s2021/432/solar-flare.csv', sep=' ', header=None, skiprows=[0])
data.head()
data.describe()
data.tail()
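# Optional sanity check (a small sketch, not part of the original flow): confirm the
# expected 13 space-separated columns, the dtypes, and that there are no missing values
# before encoding and scaling.
data.info()
print(data.isna().sum())
print(data.shape)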
from sklearn.preprocessing import OneHotEncoder
def convert_one_hot_encoding(list_columns, data):
    # one-hot encode each listed column and collect the results in a single DataFrame
    new_df = pd.DataFrame()
    for idx in list_columns:
        ohe = OneHotEncoder(categories='auto')
        column_ohe = data[[idx]].to_numpy()
        column_ohe = ohe.fit_transform(column_ohe).toarray().astype(int)
        # the prefix is already built into the column names, e.g. 'col_0_B'
        cols = ['col_' + str(idx) + '_' + str(col) for col in ohe.categories_[0]]
        temp_df = pd.DataFrame(column_ohe, columns=cols)
        new_df = pd.concat([new_df, temp_df], axis=1)
    return new_df
columns = [0, 1, 2]
new_df = convert_one_hot_encoding(columns, data)
new_df.head()
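# A shorter alternative sketch (not what is used below): pd.get_dummies produces the same
# indicator columns directly; the 'col_0'/'col_1'/'col_2' prefixes are chosen here only to
# mirror the names generated by convert_one_hot_encoding.
alt_df = pd.get_dummies(data[[0, 1, 2]], prefix=['col_0', 'col_1', 'col_2']).astype(int)
alt_df.head()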
# replace the first three categorical columns with their one-hot encoding
data.drop([0, 1, 2], axis=1, inplace=True)
data = pd.concat([new_df, data], axis=1)
data.head()
data.tail()
from sklearn.preprocessing import StandardScaler
def centering_scaling(data, columns):
    # fit a StandardScaler on the selected columns and overwrite them with the standardized values
    scale = StandardScaler().fit(data[columns].values)
    data[columns] = scale.transform(data[columns].values)

# standardize every column except the three targets (columns 10, 11 and 12)
columns = [column for column in data.columns if column not in (10, 11, 12)]
centering_scaling(data, columns)
data.head()
data.describe()
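# Caveat (sketch only, not part of the original flow): the scaler above is fit on the full
# dataset before cross-validation, so statistics from the test folds leak into the scaling.
# A leak-free variant fits the scaler inside a Pipeline on each training fold;
# 'unscaled_X' and 'target' below are hypothetical placeholders for the raw inputs.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
leakfree_pipe = Pipeline([('scale', StandardScaler()), ('reg', LinearRegression())])
# cross_validate(leakfree_pipe, unscaled_X, target, cv=5)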
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
pca = PCA(n_components=len(data.columns) - 3)
principal_components = pca.fit_transform(data[columns])
# label the derived features PC1..PCn instead of reusing the original column names
pca_df = pd.DataFrame(data=principal_components,
                      columns=['PC' + str(i + 1) for i in range(pca.n_components_)])
pca_df.head()
values = np.arange(pca.n_components_) + 1
plt.plot(values, pca.explained_variance_ratio_, 'ro-', linewidth=2)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Proportion of Variance Explained')
plt.show()
# smallest number of components whose cumulative explained variance exceeds 90%
x = np.cumsum(pca.explained_variance_ratio_)
np.argmax(x > 0.9) + 1
print ("Proportion of Variance Explained : ", pca.explained_variance_ratio_)
out_sum = np.cumsum(pca.explained_variance_ratio_)
print ("Cumulative Prop. Variance Explained: ", out_sum)
Proportion of Variance Explained : [1.78932757e-01 1.56177213e-01 1.00060310e-01 8.77959402e-02
6.15151684e-02 5.39747695e-02 5.19490713e-02 4.93541754e-02
4.49881049e-02 4.01448024e-02 3.76184782e-02 2.89743775e-02
2.54481203e-02 2.43993652e-02 2.35389887e-02 2.02297599e-02
1.44686185e-02 4.29980237e-04 5.94906914e-31 1.28941439e-31
2.39893589e-32 1.29770050e-32 0.00000000e+00]
Cumulative Prop. Variance Explained: [0.17893276 0.33510997 0.43517028 0.52296622 0.58448139 0.63845616
0.69040523 0.7397594 0.78474751 0.82489231 0.86251079 0.89148517
0.91693329 0.94133265 0.96487164 0.9851014 0.99957002 1.
1. 1. 1. 1. 1. ]
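# Complementary sketch: plot the cumulative proportion of variance with a 0.9 reference
# line, reusing the same pca object and the 'values' array from the scree plot above.
plt.plot(values, np.cumsum(pca.explained_variance_ratio_), 'bo-', linewidth=2)
plt.axhline(y=0.9, color='gray', linestyle='--')
plt.title('Cumulative Variance Explained')
plt.xlabel('Principal Component')
plt.ylabel('Cumulative Proportion of Variance Explained')
plt.show()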
# keep the first 13 feature columns (13 was the component count found above)
final_cols = columns[:13]
print(final_cols)
['col_0_B', 'col_0_C', 'col_0_D', 'col_0_E', 'col_0_F', 'col_0_H', 'col_1_A', 'col_1_H', 'col_1_K', 'col_1_R', 'col_1_S', 'col_1_X', 'col_2_C']
final_df = data[final_cols]
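# Note: final_cols keeps the first 13 original one-hot feature columns, not the first 13
# principal components. If the intent was to use the PCA-reduced representation that reaches
# 90% of the variance, a sketch would be (the results below were NOT computed on this):
pca_13 = PCA(n_components=13)
X_pca = pca_13.fit_transform(data[columns])  # shape: (n_samples, 13)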
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
X = final_df.values
# columns 10, 11 and 12 of the original file are the three regression targets
Y_1 = data[10].values
Y_2 = data[11].values
Y_3 = data[12].values
cv_split = ShuffleSplit(n_splits=5, test_size=.30, random_state=193)
linear_reg = LinearRegression()
cv_results = cross_validate(linear_reg, X, Y_1, cv=cv_split, scoring=('neg_root_mean_squared_error', 'neg_mean_absolute_error'))
rmse_res = cv_results['test_neg_root_mean_squared_error']
mae_res = cv_results['test_neg_mean_absolute_error']
print("Label 1:")
print(f'\tRMSE: {-1*rmse_res}')
print(f'\tmean RMSE = {-1*np.mean(rmse_res)}\n')
print(f'\tMAE: {-1*mae_res}')
print(f'\tmean MAE = {-1*np.mean(mae_res)}')
Label 1:
RMSE: [0.79308892 0.68152147 0.79819238 0.83312734 0.66628359]
mean RMSE = 0.7544427379858405
MAE: [0.45180137 0.41079952 0.4343813 0.44391281 0.37948536]
mean MAE = 0.42407607026201594
linear_reg = LinearRegression()
cv_results = cross_validate(linear_reg, X, Y_2, cv=cv_split, scoring=('neg_root_mean_squared_error', 'neg_mean_absolute_error'))
rmse_res = cv_results['test_neg_root_mean_squared_error']
mae_res = cv_results['test_neg_mean_absolute_error']
print("Label 2:")
print(f'\tRMSE: {-1*rmse_res}')
print(f'\tmean RMSE = {-1*np.mean(rmse_res)}\n')
print(f'\tMAE: {-1*mae_res}')
print(f'\tmean MAE = {-1*np.mean(mae_res)}')
Label 2:
RMSE: [0.26902859 0.41677828 0.34655151 0.32508482 0.20938649]
mean RMSE = 0.3133659398516363
MAE: [0.0937963 0.10206958 0.10321677 0.09391941 0.07648187]
mean MAE = 0.09389678649602644
linear_reg = LinearRegression()
cv_results = cross_validate(linear_reg, X, Y_3, cv=cv_split, scoring=('neg_root_mean_squared_error', 'neg_mean_absolute_error'))
rmse_res = cv_results['test_neg_root_mean_squared_error']
mae_res = cv_results['test_neg_mean_absolute_error']
print("Label 3:")
print(f'\tRMSE: {-1*rmse_res}')
print(f'\tmean RMSE = {-1*np.mean(rmse_res)}\n')
print(f'\tMAE: {-1*mae_res}')
print(f'\tmean MAE = {-1*np.mean(mae_res)}')
Label 3:
RMSE: [0.07986859 0.14419054 0.03994121 0.05662712 0.05211832]
mean RMSE = 0.07454915628038052
MAE: [0.01839322 0.0179816 0.01102435 0.01254235 0.00967835]
mean MAE = 0.013923973257511624
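# The three evaluation cells above repeat the same code. A compact sketch (same X, targets
# and cv_split) that loops over the labels and adds a mean-predicting DummyRegressor as a
# baseline for comparison; the scores themselves are not reproduced here.
from sklearn.dummy import DummyRegressor

def evaluate(model, X, y, cv, name):
    res = cross_validate(model, X, y, cv=cv,
                         scoring=('neg_root_mean_squared_error', 'neg_mean_absolute_error'))
    print(f'{name}: mean RMSE = {-np.mean(res["test_neg_root_mean_squared_error"]):.4f}, '
          f'mean MAE = {-np.mean(res["test_neg_mean_absolute_error"]):.4f}')

for label, y in [('Label 1', Y_1), ('Label 2', Y_2), ('Label 3', Y_3)]:
    evaluate(LinearRegression(), X, y, cv_split, label + ' (linear)')
    evaluate(DummyRegressor(strategy='mean'), X, y, cv_split, label + ' (baseline)')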