import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression
%matplotlib inline
# Location of the CSV that is loaded into `df` (fetched over HTTPS by read_csv).
file_name = (
    'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-SkillsNetwork/labs/FinalModule_Coursera/data/kc_house_data_NaN.csv'
)
df = pd.read_csv(file_name)

# First look at the raw table: leading rows, column dtypes, summary statistics.
df.head()
df.dtypes
df.describe()

# Remove the "id" and "Unnamed: 0" columns in place; `columns=` already
# selects the column axis, so no separate `axis` argument is needed.
df.drop(columns=["id", "Unnamed: 0"], inplace=True)

# Summary statistics again, now without the two dropped columns.
df.describe()
# Report how many values are missing in each column before imputation.
print("Numero de valores NaN para la columna bedrooms :", df['bedrooms'].isnull().sum())
print("Numero de valores NaN para la columna bathrooms :", df['bathrooms'].isnull().sum())

# Replace missing values with the column mean.
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary Series,
# emits a chained-assignment warning on pandas 2.x and leaves `df` unchanged
# once Copy-on-Write is enabled.
mean = df['bedrooms'].mean()
df['bedrooms'] = df['bedrooms'].fillna(mean)

mean = df['bathrooms'].mean()
df['bathrooms'] = df['bathrooms'].fillna(mean)

# Report the counts again to confirm that no NaN values remain.
print("Numero de valores NaN para la columna bedrooms :", df['bedrooms'].isnull().sum())
print("Numero de valores NaN para la columna bathrooms :", df['bathrooms'].isnull().sum())
# Number of rows for each distinct value of "floors", as a one-column frame.
df["floors"].value_counts().to_frame()

# Distribution of price for each value of "waterfront".
sns.boxplot(x="waterfront", y="price", data=df)

# Scatter plot with a fitted regression line of price against sqft_above.
sns.regplot(x="sqft_above", y="price", data=df)

# Correlation of every numeric column with price, sorted ascending.
# Restricting to numeric columns first keeps this working on pandas >= 2.0,
# where DataFrame.corr() no longer drops non-numeric columns silently and
# raises if any column cannot be converted to float.
df.select_dtypes(include="number").corr()['price'].sort_values()
# Simple linear regression: predict price from the single feature "long".
X, Y = df[['long']], df['price']

lm = LinearRegression()
lm.fit(X=X, y=Y)

# R^2 of the fitted model on the same data it was trained on.
lm.score(X=X, y=Y)
# Simple linear regression: predict price from the single feature "sqft_living".
x, y = df[["sqft_living"]], df["price"]

lm2 = LinearRegression()
lm2.fit(X=x, y=y)

# R^2 of the fitted model on the same data it was trained on.
lm2.score(X=x, y=y)
# Columns used as predictors for the multiple linear regression.
features = [
    "floors",
    "waterfront",
    "lat",
    "bedrooms",
    "sqft_basement",
    "view",
    "bathrooms",
    "sqft_living15",
    "sqft_above",
    "grade",
    "sqft_living",
]
x, y = df[features], df["price"]

# Multiple linear regression of price on all listed features.
lm3 = LinearRegression()
lm3.fit(X=x, y=y)

# R^2 of the fitted model on the same data it was trained on.
lm3.score(X=x, y=y)
# Pipeline steps, applied in order: standardise the features, expand them
# into polynomial terms (without the constant bias column), then fit a
# linear regression on the expanded features.
Input = [
    ('scale', StandardScaler()),
    ('polynomial', PolynomialFeatures(include_bias=False)),
    ('model', LinearRegression()),
]
x, y = df[features], df["price"]

pipe = Pipeline(steps=Input)
pipe.fit(x, y)

# R^2 of the whole pipeline on the same data it was trained on.
pipe.score(x, y)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
print("done")

# Predictor columns and target used for the held-out evaluation below.
features = [
    "floors",
    "waterfront",
    "lat",
    "bedrooms",
    "sqft_basement",
    "view",
    "bathrooms",
    "sqft_living15",
    "sqft_above",
    "grade",
    "sqft_living",
]
X, Y = df[features], df['price']

# Hold out 15% of the rows for testing; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=1,
)

print("number of test samples:", len(x_test))
print("number of training samples:", len(x_train))
from sklearn.linear_model import Ridge
# Ridge regression (regularisation strength alpha=0.1) trained on the
# training split only.
ridge = Ridge(alpha=0.1)
ridge.fit(X=x_train, y=y_train)

# R^2 on the held-out test split.
ridge.score(X=x_test, y=y_test)
# Second-order polynomial expansion of the features.
pr2 = PolynomialFeatures(degree=2)

# Fit the transformer on the training split only, then reuse that fitted
# transformer for the test split. Calling fit_transform on the test data
# would re-fit the transformer on data the model must not learn from.
x_train_pr = pr2.fit_transform(x_train)
x_test_pr = pr2.transform(x_test)

# Ridge regression (alpha=0.1) on the polynomial features.
ridge2 = Ridge(alpha=0.1)
ridge2.fit(x_train_pr, y_train)

# R^2 on the held-out test split.
ridge2.score(x_test_pr, y_test)