import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression
%matplotlib inline
# Remote location of the house-data CSV (kc_house_data_NaN.csv) used by every later cell.
file_name = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-SkillsNetwork/labs/FinalModule_Coursera/data/kc_house_data_NaN.csv'

# Read the CSV straight from the URL into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=file_name)

# Preview the first five rows.
df.head(5)
Unnamed: 0int64
idint64
0
0
7129300520
1
1
6414100192
2
2
5631500400
3
3
2487200875
4
4
1954400510
# Show the dtype pandas inferred for each column.
df.dtypes
# Summary statistics per column (count, mean, std, min, quartiles, max).
df.describe()
Unnamed: 0float64
idfloat64
count
21613
21613
mean
10806
4580301521
std
6239.28002
2876565571
min
0
1000102
25%
5403
2123049194
50%
10806
3904930410
75%
16209
7308900445
max
21612
9900000190
# Remove the record id and the "Unnamed: 0" column (appears to be a saved
# row counter -- it runs 0..21612 in the summary above) in place.
unwanted_columns = ["id", "Unnamed: 0"]
df.drop(columns=unwanted_columns, inplace=True)

# Re-check the summary statistics of the remaining columns.
df.describe()
pricefloat64
bedroomsfloat64
count
21613
21600
mean
540088.1418
3.37287037
std
367127.1965
0.9266566928
min
75000
1
25%
321950
3
50%
450000
3
75%
645000
4
max
7700000
33
# Report how many entries are missing in the bedrooms and bathrooms columns.
for message, column in (
    ("Numero de valores NaN para la columna bedrooms :", "bedrooms"),
    ("Numero de valores NaN para la columna bathrooms :", "bathrooms"),
):
    print(message, df[column].isna().sum())
Numero de valores NaN para la columna bedrooms : 13
Numero de valores NaN para la columna bathrooms : 10
# Fill missing bedrooms/bathrooms values with the mean of the respective column.
#
# The result is assigned back to the column rather than calling
# ``df[col].replace(..., inplace=True)``: that form mutates a temporary
# selection, which emits chained-assignment warnings in pandas 2.x and leaves
# ``df`` unchanged once Copy-on-Write is active.
for column in ("bedrooms", "bathrooms"):
    # The mean is taken before filling; pandas skips NaN when averaging.
    mean = df[column].mean()
    df[column] = df[column].fillna(mean)
# Report how many entries are still missing in the bedrooms and bathrooms columns.
for message, column in (
    ("Numero de valores NaN para la columna bedrooms :", "bedrooms"),
    ("Numero de valores NaN para la columna bathrooms :", "bathrooms"),
):
    print(message, df[column].isna().sum())
Numero de valores NaN para la columna bedrooms : 0
Numero de valores NaN para la columna bathrooms : 0
# Count how many rows there are for each distinct value of "floors" ...
floor_counts = df["floors"].value_counts()
# ... and show the tally as a one-column DataFrame.
floor_counts.to_frame()
floorsint64
1
10680
2
8241
1.5
1910
3
613
2.5
161
3.5
8
# Box plot of price grouped by the waterfront column.
sns.boxplot(data=df, x="waterfront", y="price")
# Scatter plot of price against sqft_above with a fitted regression line.
sns.regplot(data=df, x="sqft_above", y="price")
# Correlation of every numeric column with price, sorted ascending.
#
# Non-numeric columns are removed first: since pandas 2.0 DataFrame.corr()
# no longer drops them silently and raises instead.
# NOTE(review): the frame presumably still holds a string date column at this
# point -- confirm against df.dtypes.
numeric_columns = df.select_dtypes(include="number")
numeric_columns.corr()['price'].sort_values()
# Simple linear regression: predict price from the "long" column alone.
X = df[['long']]
Y = df['price']

# fit() returns the estimator itself, so construction and fitting can be chained.
lm = LinearRegression().fit(X, Y)

# Score the model on the same data it was fitted on.
lm.score(X, Y)
# Simple linear regression: predict price from "sqft_living" alone.
x = df[["sqft_living"]]
y = df["price"]

# fit() returns the estimator itself, so construction and fitting can be chained.
lm2 = LinearRegression().fit(x, y)

# Score the model on the same data it was fitted on.
lm2.score(x, y)
# Predictor columns for the multi-feature linear model.
features = [
    "floors",
    "waterfront",
    "lat",
    "bedrooms",
    "sqft_basement",
    "view",
    "bathrooms",
    "sqft_living15",
    "sqft_above",
    "grade",
    "sqft_living",
]
x = df[features]
y = df["price"]

# Multiple linear regression on all listed features.
lm3 = LinearRegression().fit(x, y)

# Score the model on the same data it was fitted on.
lm3.score(x, y)
# Pipeline stages: standardise the features, expand them into polynomial
# terms (without the constant bias column), then fit a linear regression.
Input = [
    ('scale', StandardScaler()),
    ('polynomial', PolynomialFeatures(include_bias=False)),
    ('model', LinearRegression()),
]
x = df[features]
y = df["price"]

pipe = Pipeline(steps=Input)
pipe.fit(x, y)

# Score the fitted pipeline on the same data it was fitted on.
pipe.score(x, y)
# Model-selection helpers; train_test_split is used for the hold-out split that follows.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# Printed once the imports above have run.
print("done")
done
# Predictor columns and target for the hold-out evaluation.
features = [
    "floors",
    "waterfront",
    "lat",
    "bedrooms",
    "sqft_basement",
    "view",
    "bathrooms",
    "sqft_living15",
    "sqft_above",
    "grade",
    "sqft_living",
]
X = df[features]
Y = df['price']

# Hold out 15% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=1,
)

print("number of test samples:", len(x_test))
print("number of training samples:", len(x_train))
number of test samples: 3242
number of training samples: 18371
from sklearn.linear_model import Ridge
# Ridge regression (alpha=0.1) fitted on the training split;
# fit() returns the estimator, so construction and fitting are chained.
ridge = Ridge(alpha=0.1).fit(x_train, y_train)

# Score on the held-out test split.
ridge.score(x_test, y_test)
# Ridge regression on a second-order polynomial expansion of the features.
pr2 = PolynomialFeatures(degree=2)

# Fit the transformer on the training split only, then apply that same fitted
# transformer to the test split. Calling fit_transform on the test data would
# re-fit the transformer on held-out data, which is the pattern that leaks
# test information in stateful preprocessors.
x_train_pr = pr2.fit_transform(x_train)
x_test_pr = pr2.transform(x_test)

ridge2 = Ridge(alpha=0.1)
ridge2.fit(x_train_pr, y_train)

# Score on the transformed held-out test split.
ridge2.score(x_test_pr, y_test)