import pandas as pd
# Load the practice dataset; the first CSV column becomes the row index.
csv_path = "ArcTecSw_2021_BigData_Practica_Part3_amoni.csv"
df = pd.read_csv(csv_path, index_col=0)
# Quick structural overview: dimensions, first rows, dtypes and null counts.
df.shape
df.head()
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 147 entries, 0 to 214
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 target 147 non-null float64
1 1 145 non-null float64
2 2 145 non-null float64
3 3 32 non-null float64
4 4 31 non-null float64
5 5 33 non-null float64
6 6 37 non-null float64
7 7 37 non-null float64
dtypes: float64(8)
memory usage: 10.3 KB
# Summary statistics and pairwise correlations of every column.
df.describe()
df.corr()
import matplotlib.pyplot as plt
# 3x3 grid, 15 wide by 12 high: one scatter of `target` (x axis) against each
# of the seven numbered columns (y axis), filled row by row.
f, ax = plt.subplots(nrows=3, ncols=3, figsize=(15, 12))
plotted_columns = ['1', '2', '3', '4', '5', '6', '7']
for panel, column in zip(ax.flat, plotted_columns):
    panel.scatter(df['target'], df[column])
# Only seven panels are used; remove the two trailing empty axes.
for unused_panel in ax.flat[len(plotted_columns):]:
    unused_panel.remove()
# Keep only `target`, '1' and '2'; columns '3'..'7' are discarded.
discarded_columns = ['3', '4', '5', '6', '7']
processed_df = df.drop(columns=discarded_columns)
processed_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 147 entries, 0 to 214
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 target 147 non-null float64
1 1 145 non-null float64
2 2 145 non-null float64
dtypes: float64(3)
memory usage: 4.6 KB
# Discard every row that has a missing value in any remaining column.
processed_df = processed_df.dropna(axis=0, how='any')
processed_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 144 entries, 0 to 214
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 target 144 non-null float64
1 1 144 non-null float64
2 2 144 non-null float64
dtypes: float64(3)
memory usage: 4.5 KB
processed_df.head()
# Engineered features are derived from the two station readings ONLY.
# The previous version computed `relation` as target / mean, which encodes
# the label inside a model input (target leakage) and cannot be computed for
# unseen data. It also stored the station ratio under the name 'mean'.
station_1 = processed_df['1']
station_2 = processed_df['2']
# Average of the two station readings.
mean = (station_1 + station_2) / 2
# Ratio between the two station readings.
# NOTE(review): a zero reading in column '2' yields inf here - confirm the
# data never contains zeros, or filter such rows before fitting.
relation = station_1 / station_2
processed_df['relation'] = relation
processed_df['mean'] = mean
processed_df.head()
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for evaluation; the split is seeded so it is
# reproducible. The target is kept as a one-column DataFrame so that code
# relying on y_train / y_test being DataFrames keeps working.
feature_columns = ["1", "2", "relation", "mean"]
x_train, x_test, y_train, y_test = train_test_split(
    processed_df[feature_columns],
    processed_df[["target"]],
    test_size=0.10,
    random_state=42,
)
from sklearn.linear_model import SGDRegressor
# Seed the optimizer as well, otherwise every run yields different
# coefficients and scores even though the split itself is fixed.
reg_sgdr = SGDRegressor(random_state=42)
# The estimator expects a 1-d target; flattening the (n_samples, 1) column
# avoids the DataConversionWarning raised for column-vector y.
reg_sgdr.fit(x_train, y_train.values.ravel())
/shared-libs/python3.8/py/lib/python3.8/site-packages/sklearn/utils/validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
return f(*args, **kwargs)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Evaluate the fitted model on the held-out split: predict once, then report
# MAE, MSE and R^2 from the same predictions.
sgd_test_predictions = reg_sgdr.predict(x_test)
mean_absolute_error(y_test, sgd_test_predictions)
mean_squared_error(y_test, sgd_test_predictions)
r2_score(y_test, sgd_test_predictions)
from sklearn import linear_model
# Poisson GLM with alpha=0. The name `reg_sgdr` is reused for every model in
# this file, so it no longer refers to the SGD regressor from here on.
reg_sgdr = linear_model.PoissonRegressor(alpha=0)
# Pass the target as a flat 1-d array: a one-column DataFrame triggers
# sklearn's DataConversionWarning for column-vector y.
reg_sgdr.fit(x_train, y_train.values.ravel())
/shared-libs/python3.8/py/lib/python3.8/site-packages/sklearn/utils/validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
return f(*args, **kwargs)
# Held-out MAE and R^2 for the model currently bound to `reg_sgdr`;
# predictions are computed once and shared by both metrics.
current_test_predictions = reg_sgdr.predict(x_test)
mean_absolute_error(y_test, current_test_predictions)
r2_score(y_test, current_test_predictions)
from sklearn.linear_model import LinearRegression
# Ordinary least-squares baseline, trained on the same split as the other
# models and bound to the shared `reg_sgdr` name.
reg_sgdr = LinearRegression()
reg_sgdr.fit(x_train, y_train)
# Held-out MAE and R^2 from a single set of predictions.
linear_test_predictions = reg_sgdr.predict(x_test)
mean_absolute_error(y_test, linear_test_predictions)
r2_score(y_test, linear_test_predictions)
# Ridge regression with regularization strength alpha=0.5.
ridge_alpha = .5
reg_sgdr = linear_model.Ridge(alpha=ridge_alpha)
reg_sgdr.fit(x_train, y_train)
# Held-out MAE and R^2 from a single set of predictions.
ridge_half_predictions = reg_sgdr.predict(x_test)
mean_absolute_error(y_test, ridge_half_predictions)
r2_score(y_test, ridge_half_predictions)
# Ridge regression again, this time with the library's default alpha.
ridge_default = linear_model.Ridge()
ridge_default.fit(x_train, y_train)
reg_sgdr = ridge_default
# Held-out MAE and R^2 from a single set of predictions.
ridge_default_predictions = reg_sgdr.predict(x_test)
mean_absolute_error(y_test, ridge_default_predictions)
r2_score(y_test, ridge_default_predictions)