import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Load the Boston housing dataset from scikit-learn and print its description.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this script only runs against an older scikit-learn release.
boston_dataset = load_boston()
print(boston_dataset.DESCR)

# One column per feature, with the regression target appended as MEDV.
feature_frame = pd.DataFrame(
    data=boston_dataset.data,
    columns=boston_dataset.feature_names,
)
df = feature_frame.assign(MEDV=boston_dataset.target)
# --- Exploratory look at the data --------------------------------------------
# Outside a notebook a bare expression is evaluated and discarded, so every
# summary is printed explicitly.
print(df)
print("Shape:", df.shape)
print(df.dtypes)
print(df.columns.unique())
print(df.nunique())
print(df.isnull().sum())
print(df.describe())

# Pairwise correlation between all columns. `corr` is a method and must be
# called; the result is computed once and reused for the heatmap below.
corr_matrix = df.corr()
print(corr_matrix)

plt.figure(figsize=(10, 10))
sns.heatmap(data=corr_matrix, annot=True)
# Optional pair plot, left disabled:
# sns.pairplot(df, size=2.5)

# Box plot of every column, on its own (larger) figure.
plt.figure(figsize=(20, 20))
df.boxplot()

# Summary statistics of the target column (MEDV).
print("Minimum price:", df.MEDV.min())
print("Maximum price:", df.MEDV.max())
print("Standard deviation of price:", df.MEDV.std())
# Feature matrix: every column except the MEDV target. Target vector: MEDV.
feature_columns = df.drop(columns='MEDV')
x = np.array(feature_columns)
y = np.array(df['MEDV'])
# Notebook-style inspection; these have no effect when run as a script.
x
y

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Notebook-style size checks of each split (no effect when run as a script).
len(x_train)
len(y_train)
len(x_test)
len(y_test)
# Fit a linear regression on the training split.
model = LinearRegression()
model.fit(x_train, y_train)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

# Predict on the held-out test split.
y_pred = model.predict(x_test)
print("Predictions:", y_pred)

# Evaluation metrics on the test split. They are stored and printed so the
# script actually reports them instead of discarding the values.
# `model.score(x_test, y_test)` returns the same R^2 as `r2_score`, so it is
# computed only once here.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"R^2:  {r2:.4f}")
print(f"MSE:  {mse:.4f}")
print(f"RMSE: {mse ** 0.5:.4f}")
print(f"MAE:  {mae:.4f}")
# Scatter of actual vs. predicted prices for the test split.
# Start a fresh figure so the scatter is not drawn on top of whichever figure
# happens to be current when this runs as a script.
plt.figure()
plt.scatter(y_test, y_pred)
plt.xlabel("actual price")
plt.ylabel("Predicted price")
plt.grid()

# Reference line y = x (perfect prediction). Both end points use the same
# value on each axis; pairing min/max of y_test with min/max of y_pred would
# give a line that is generally NOT the identity line.
line_low = min(np.min(y_test), np.min(y_pred))
line_high = max(np.max(y_test), np.max(y_pred))
plt.plot([line_low, line_high], [line_low, line_high], color='red')
plt.title("Actual v/s Predicted price")
# Separate exploration: load a second CSV into df2 and plot it with klib.
import klib
from sklearn.datasets import load_diabetes, load_iris

train_csv_path = '/work/train.csv'
df2 = pd.read_csv(train_csv_path)
# Disabled column rename, kept for reference:
# df.columns =load_iris().feature_names
df2  # notebook-style inspection; no effect when run as a script

# Run klib's categorical plot first, then its distribution plot, on df2.
for make_plot in (klib.cat_plot, klib.dist_plot):
    make_plot(df2)
# Persist the fitted model to 'house_price_model.pkl' in the current working
# directory using joblib.
import joblib
joblib.dump(model, 'house_price_model.pkl')
df  # notebook-style inspection; no effect when run as a script

# NOTE(review): the names below were bare identifiers in the script. None of
# them is defined as a variable, so executing them raised NameError. They are
# kept here as a comment; presumably they are the feature column names of
# `df`, in the order the saved model expects its inputs (confirm against
# `boston_dataset.feature_names`).
#   CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT