import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('./TrainAndValid.csv', low_memory=False)
df.head()
df.info()
fig, ax = plt.subplots()
ax.scatter(df['saledate'][:1000], df["SalePrice"][:1000])
df.saledate[:1000]
df.saledate.dtype
df.SalePrice.plot.hist()
df = pd.read_csv('./TrainAndValid.csv',
                 low_memory=False,
                 parse_dates=["saledate"])
df.saledate.dtype
df['saledate'][:1000]
fig, ax = plt.subplots()
ax.scatter(df['saledate'][:1000], df['SalePrice'][:1000])
df.head()
df.head().T
df.saledate.head(20)
df.sort_values(by=["saledate"], inplace=True, ascending=True)
df.saledate.head(20)
## Make a copy of the original DataFrame
df_tmp = df.copy()
## Add datetime parameter for saledate column
df_tmp["SaleYear"] = df_tmp.saledate.dt.year
df_tmp["SaleMonth"] = df_tmp.saledate.dt.month
df_tmp["SaleDay"] = df_tmp.saledate.dt.day
df_tmp["SaleDayOfWeek"] = df_tmp.saledate.dt.dayofweek
df_tmp["SaleDayOfYear"] = df_tmp.saledate.dt.dayofyear
df_tmp.head().T
# Now we've enriched our DataFrame with datetime features
df_tmp.drop("saledate", axis=1, inplace=True)
df_tmp.state.value_counts()
len(df_tmp)
df_tmp.head()
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_jobs=1, random_state=42)
model.fit(df_tmp.drop("SalePrice", axis=1), df_tmp["SalePrice"])
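# This fit fails with a ValueError ("could not convert string to float") —
# the DataFrame still contains string columns and missing values,
# which scikit-learn models can't work with directly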
df_tmp.info()
df_tmp["UsageBand"].dtype
df_tmp.isna().sum()
df_tmp.head().T
pd.api.types.is_string_dtype(df_tmp["UsageBand"])
for label, content in df_tmp.items():
    if pd.api.types.is_string_dtype(content):
        print(label)
for label, content in df_tmp.items():
    if pd.api.types.is_string_dtype(content):
        df_tmp[label] = content.astype("category").cat.as_ordered()
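# The string columns are now stored as ordered categories — each value gets an
# underlying integer code, which we can inspect via the .cat accessor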
df_tmp.info()
df_tmp.state.cat.categories
len(df_tmp.state.cat.categories)
df_tmp.state.cat.codes
for label, content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        print(label)
df_tmp.ModelID
df_tmp.isna().sum()
# Check which numeric columns have missing values
for label, content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            print(label)
# Fill missing numeric values with the column median
for label, content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            # Add a binary column which tells us if the data was missing
            df_tmp[label+"_is_missing"] = pd.isnull(content)
            # Fill missing values with the median
            df_tmp[label] = content.fillna(content.median())
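# Why the median rather than the mean? The median is robust to outliers: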
hundreds = np.full((1000,), 100)
hundreds_billion = np.append(hundreds, 1000000000)
np.mean(hundreds), np.mean(hundreds_billion), np.median(hundreds), np.median(hundreds_billion)
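# A single huge outlier drags the mean from 100 up to almost 1,000,000,
# while the median stays at 100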
# Check whether any numeric columns still have missing values
for label, content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            print(label)
df_tmp.auctioneerID_is_missing.value_counts()
for label, content in df_tmp.items():
    if not pd.api.types.is_numeric_dtype(content):
        print(label)
# Turn categorical variables into numbers and fill missing values
for label, content in df_tmp.items():
    if not pd.api.types.is_numeric_dtype(content):
        # Add a binary column to flag missing values
        df_tmp[label+"_is_missing"] = pd.isnull(content)
        # Turn categories into numbers; +1 because pandas encodes missing values as -1
        df_tmp[label] = pd.Categorical(content).codes+1
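# (Optional) sanity check — every column should now be numeric
# (the boolean _is_missing columns count as numeric in pandas)
all(pd.api.types.is_numeric_dtype(content) for _, content in df_tmp.items())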
pd.Categorical(df_tmp["state"]).codes + 1
pd.Categorical(df_tmp["UsageBand"]).codes
df_tmp.info()
df_tmp.head().T
df_tmp.isna().sum()
%%time
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_jobs=-1, random_state=42)
model.fit(df_tmp.drop("SalePrice", axis=1), df_tmp["SalePrice"])
# Score model
model.score(df_tmp.drop("SalePrice", axis=1), df_tmp["SalePrice"])
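# Careful: this score comes from predicting on the same data the model was
# trained on, so it's far too optimistic — we need a separate validation set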
df_tmp.SaleYear.value_counts()
from sklearn.model_selection import train_test_split
# Split into validation (year 2012) and training (all other years) sets —
# we're predicting future sale prices, so the validation data should come after the training data
df_val = df_tmp[df_tmp.SaleYear == 2012]
df_train = df_tmp[df_tmp.SaleYear != 2012]
len(df_val), len(df_train)
# Split data into X and y
X_train, y_train = df_train.drop("SalePrice", axis=1), df_train.SalePrice
X_valid, y_valid = df_val.drop("SalePrice", axis=1), df_val.SalePrice
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape
# Create evaluation function (the competition uses Root Mean Squared Log Error, RMSLE)
from sklearn.metrics import mean_squared_log_error, mean_absolute_error
def rmsle(y_test, y_preds):
    return np.sqrt(mean_squared_log_error(y_test, y_preds))
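# Quick check: a perfect prediction should give an RMSLE of 0.0
rmsle(np.array([100, 200]), np.array([100, 200]))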
# Create function to evaluate our model
def show_scores(model):
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_valid)
    scores = {"Training MAE": mean_absolute_error(y_train, train_preds),
              "Valid MAE": mean_absolute_error(y_valid, val_preds),
              "Training RMSLE": rmsle(y_train, train_preds),
              "Valid RMSLE": rmsle(y_valid, val_preds),
              "Training R^2": model.score(X_train, y_train),
              "Valid R^2": model.score(X_valid, y_valid)}
    return scores
len(X_train)
# Change max samples in RandomForestRegressor
model = RandomForestRegressor(n_jobs=-1, random_state=42, max_samples=10000)
%%time
# Cutting down the max number of samples each tree can see improves training time
model.fit(X_train, y_train)
show_scores(model)
np.arange(2, 20, 2)
%%time
from sklearn.model_selection import RandomizedSearchCV
rf_grid = {"n_estimators": np.arange(10, 100, 10),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           # "auto" was removed in newer scikit-learn versions; 1.0 (use all features) matches its old behaviour
           "max_features": [0.5, 1, "sqrt", 1.0],
           "max_samples": [10000],
           "min_samples_leaf": np.arange(1, 20, 2)}
rs_model = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1, random_state=42),
                              param_distributions=rf_grid,
                              cv=5,
                              n_iter=2,
                              verbose=True)
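# Note: n_iter=2 tries only 2 random hyperparameter combinations;
# a larger n_iter gives a more thorough (but slower) search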
rs_model.fit(X_train, y_train)
# Find the best model hyperparameters
rs_model.best_params_
# Evaluate the RandomizedSearchCV
show_scores(rs_model)
%%time
ideal_model = RandomForestRegressor(n_estimators=40,
                                    min_samples_leaf=1,
                                    min_samples_split=14,
                                    max_features=0.5,
                                    n_jobs=-1,
                                    max_samples=None,
                                    random_state=42)  # set random_state for reproducibility
# Fit the ideal model
ideal_model.fit(X_train, y_train)
# Scores for ideal model (trained on all the data)
show_scores(ideal_model)
# Scores for rs_model (only trained on 10,000 examples)
show_scores(rs_model)
# Import the test data
df_test = pd.read_csv('./bulldozer/Test.csv',
                      low_memory=False,
                      parse_dates=["saledate"])
df_test.head()
df_test.shape
# Make predictions on the test dataset
test_preds = ideal_model.predict(df_test)
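# This raises an error — the test data hasn't been through the same preprocessing
# as the training data (it still has raw dates, strings and missing values)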
df_test.info()
df_test.isna().sum()
df_test.columns
X_train.columns
def preprocess_data(df):
    # Add datetime parameters for the saledate column
    df["SaleYear"] = df.saledate.dt.year
    df["SaleMonth"] = df.saledate.dt.month
    df["SaleDay"] = df.saledate.dt.day
    df["SaleDayOfWeek"] = df.saledate.dt.dayofweek
    df["SaleDayOfYear"] = df.saledate.dt.dayofyear
    # Drop the original saledate column
    df.drop("saledate", axis=1, inplace=True)
    for label, content in df.items():
        # Fill missing numeric values with the median
        if pd.api.types.is_numeric_dtype(content):
            if pd.isnull(content).sum():
                df[label+"_is_missing"] = pd.isnull(content)
                df[label] = content.fillna(content.median())
        # Turn categorical variables into numbers
        if not pd.api.types.is_numeric_dtype(content):
            df[label+"_is_missing"] = pd.isnull(content)
            # We add the +1 because pandas encodes missing categories as -1
            df[label] = pd.Categorical(content).codes+1
    return df
df_test = preprocess_data(df_test)
df_test.head()
X_train.head()
# Make predictions on the updated test data
test_preds = ideal_model.predict(df_test)
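# Still fails — the test DataFrame's columns don't exactly match X_train's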
# We can find how columns differ using python sets
set(X_train.columns) - set(df_test.columns)
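# Preprocessing never created auctioneerID_is_missing for the test set
# (its auctioneerID column has no missing values), so add it manually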
df_test["auctioneerID_is_missing"] = False
df_test.head()
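# Match the order of the test columns to the training columns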
df_test = df_test.reindex(X_train.columns, axis=1)
# Make predictions on the test data
test_preds = ideal_model.predict(df_test)
test_preds
df_preds = pd.DataFrame()
df_preds["SalesID"] = df_test["SalesID"]
df_preds["SalesPrice"] = test_preds
df_preds
# Export prediction data
df_preds.to_csv("./bulldozer/test_predictions.csv", index=False)
# Find feature importance
ideal_model.feature_importances_
def plot_features(columns, importances, n=20):
    df = (pd.DataFrame({"features": columns,
                        "features_importances": importances})
          .sort_values("features_importances", ascending=False)
          .reset_index(drop=True))
    # Plot the top n features
    fig, ax = plt.subplots()
    ax.barh(df["features"][:n], df["features_importances"][:n])
    ax.set_ylabel("Features")
    ax.set_xlabel("Feature importance")
    ax.invert_yaxis()
plot_features(X_train.columns, ideal_model.feature_importances_)
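# Cross-check a high-ranking feature against the raw data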
df["Enclosure"].value_counts()