import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('./TrainAndValid.csv', low_memory=False)
df.head()
df.info()
fig, ax = plt.subplots()
ax.scatter(df['saledate'][:1000], df["SalePrice"][:1000])
df.saledate[:1000]
df.saledate.dtype
df.SalePrice.plot.hist()
df = pd.read_csv('./TrainAndValid.csv',
                 low_memory=False,
                 parse_dates=["saledate"])
df.saledate.dtype
df['saledate'][:1000]
fig, ax = plt.subplots()
ax.scatter(df['saledate'][:1000], df['SalePrice'][:1000])
df.head()
df.head().T
df.saledate.head(20)
df.sort_values(by=["saledate"], inplace=True, ascending=True)
df.saledate.head(20)
## Make a copy of the original DataFrame
df_tmp = df.copy()
## Add datetime parameter for saledate column
df_tmp["SaleYear"] = df_tmp.saledate.dt.year
df_tmp["SaleMonth"] = df_tmp.saledate.dt.month
df_tmp["SaleDay"] = df_tmp.saledate.dt.day
df_tmp["SaleDayOfWeek"] = df_tmp.saledate.dt.dayofweek
df_tmp["SaleDayOfYear"] = df_tmp.saledate.dt.dayofyear
df_tmp.head().T
# Now we've enriched our DataFrame with datetime features
df_tmp.drop("saledate", axis=1, inplace=True)
df_tmp.state.value_counts()
len(df_tmp)
df_tmp.head()
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_jobs=1, random_state=42)
model.fit(df_tmp.drop("SalePrice", axis=1), df_tmp["SalePrice"])
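# This fit fails with a ValueError ("could not convert string to float") —
# the DataFrame still contains string columns and missing values,
# which scikit-learn models can't work with directly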
df_tmp.info()
df_tmp["UsageBand"].dtype
df_tmp.isna().sum()
df_tmp.head().T
pd.api.types.is_string_dtype(df_tmp["UsageBand"])
for label, content in df_tmp.items():
    if pd.api.types.is_string_dtype(content):
        print(label)
for label, content in df_tmp.items():
    if pd.api.types.is_string_dtype(content):
        df_tmp[label] = content.astype("category").cat.as_ordered()
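# The string columns are now stored as ordered categories — each value gets an
# underlying integer code, which we can inspect via the .cat accessor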
df_tmp.info()
df_tmp.state.cat.categories
len(df_tmp.state.cat.categories)
df_tmp.state.cat.codes
for label, content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        print(label)
df_tmp.ModelID
df_tmp.isna().sum()
# Check which numeric columns have missing values
for label, content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            print(label)
# Fill missing numeric values with the column median
for label, content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            # Add a binary column which tells us if the data was missing
            df_tmp[label+"_is_missing"] = pd.isnull(content)
            # Fill missing values with the median
            df_tmp[label] = content.fillna(content.median())
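# Why the median rather than the mean? The median is robust to outliers: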
hundreds = np.full((1000,), 100)
hundreds_billion = np.append(hundreds, 1000000000)
np.mean(hundreds), np.mean(hundreds_billion), np.median(hundreds), np.median(hundreds_billion)
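# A single huge outlier drags the mean from 100 up to almost 1,000,000,
# while the median stays at 100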
# Check whether any numeric columns still have missing values
for label, content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            print(label)
df_tmp.auctioneerID_is_missing.value_counts()
for label, content in df_tmp.items():
    if not pd.api.types.is_numeric_dtype(content):
        print(label)
# Turn categorical variables into numbers and fill missing values
for label, content in df_tmp.items():
    if not pd.api.types.is_numeric_dtype(content):
        # Add a binary column to flag missing values
        df_tmp[label+"_is_missing"] = pd.isnull(content)
        # Turn categories into numbers; +1 because pandas encodes missing values as -1
        df_tmp[label] = pd.Categorical(content).codes+1
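# (Optional) sanity check — every column should now be numeric
# (the boolean _is_missing columns count as numeric in pandas)
all(pd.api.types.is_numeric_dtype(content) for _, content in df_tmp.items())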
pd.Categorical(df_tmp["state"]).codes + 1
pd.Categorical(df_tmp["UsageBand"]).codes
df_tmp.info()
df_tmp.head().T
df_tmp.isna().sum()
%%time
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_jobs=-1, random_state=42)
model.fit(df_tmp.drop("SalePrice", axis=1), df_tmp["SalePrice"])
# Score model
model.score(df_tmp.drop("SalePrice", axis=1), df_tmp["SalePrice"])
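# Careful: this score comes from predicting on the same data the model was
# trained on, so it's far too optimistic — we need a separate validation set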
df_tmp.SaleYear.value_counts()
from sklearn.model_selection import train_test_split
# Split into validation (year 2012) and training (all other years) sets —
# we're predicting future sale prices, so the validation data should come after the training data
df_val = df_tmp[df_tmp.SaleYear == 2012]
df_train = df_tmp[df_tmp.SaleYear != 2012]
len(df_val), len(df_train)
# Split data into X and y
X_train, y_train = df_train.drop("SalePrice", axis=1), df_train.SalePrice
X_valid, y_valid = df_val.drop("SalePrice", axis=1), df_val.SalePrice
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape
# Create evaluation function (the competition uses Root Mean Squared Log Error, RMSLE)
from sklearn.metrics import mean_squared_log_error, mean_absolute_error
def rmsle(y_test, y_preds):
    return np.sqrt(mean_squared_log_error(y_test, y_preds))
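# Quick check: a perfect prediction should give an RMSLE of 0.0
rmsle(np.array([100, 200]), np.array([100, 200]))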
# Create function to evaluate our model
def show_scores(model):
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_valid)
    scores = {"Training MAE": mean_absolute_error(y_train, train_preds),
              "Valid MAE": mean_absolute_error(y_valid, val_preds),
              "Training RMSLE": rmsle(y_train, train_preds),
              "Valid RMSLE": rmsle(y_valid, val_preds),
              "Training R^2": model.score(X_train, y_train),
              "Valid R^2": model.score(X_valid, y_valid)}
    return scores
len(X_train)
# Change max samples in RandomForestRegressor
model = RandomForestRegressor(n_jobs=-1, random_state=42, max_samples=10000)
%%time
# Cutting down the max number of samples each tree can see improves training time
model.fit(X_train, y_train)
show_scores(model)
np.arange(2, 20, 2)
%%time
from sklearn.model_selection import RandomizedSearchCV
rf_grid = {"n_estimators": np.arange(10, 100, 10),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           # "auto" was removed in newer scikit-learn versions; 1.0 (use all features) matches its old behaviour
           "max_features": [0.5, 1, "sqrt", 1.0],
           "max_samples": [10000],
           "min_samples_leaf": np.arange(1, 20, 2)}
rs_model = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1, random_state=42),
                              param_distributions=rf_grid,
                              cv=5,
                              n_iter=2,
                              verbose=True)
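# Note: n_iter=2 tries only 2 random hyperparameter combinations;
# a larger n_iter gives a more thorough (but slower) search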
rs_model.fit(X_train, y_train)
# Find the best model hyperparameters
rs_model.best_params_
# Evaluate the RandomizedSearchCV
show_scores(rs_model)
%%time
ideal_model = RandomForestRegressor(n_estimators=40,
                                    min_samples_leaf=1,
                                    min_samples_split=14,
                                    max_features=0.5,
                                    n_jobs=-1,
                                    max_samples=None,
                                    random_state=42)  # set random_state for reproducibility
# Fit the ideal model
ideal_model.fit(X_train, y_train)
# Scores for ideal model (trained on all the data)
show_scores(ideal_model)
# Scores for rs_model (only trained on 10,000 examples)
show_scores(rs_model)
# Import the test data
df_test = pd.read_csv('./bulldozer/Test.csv',
                      low_memory=False,
                      parse_dates=["saledate"])
df_test.head()
df_test.shape
# Make predictions on the test dataset
test_preds = ideal_model.predict(df_test)
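# This raises an error — the test data hasn't been through the same preprocessing
# as the training data (it still has raw dates, strings and missing values)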
df_test.info()
df_test.isna().sum()
df_test.columns
X_train.columns
def preprocess_data(df):
    # Add datetime parameters for the saledate column
    df["SaleYear"] = df.saledate.dt.year
    df["SaleMonth"] = df.saledate.dt.month
    df["SaleDay"] = df.saledate.dt.day
    df["SaleDayOfWeek"] = df.saledate.dt.dayofweek
    df["SaleDayOfYear"] = df.saledate.dt.dayofyear
    # Drop the original saledate column
    df.drop("saledate", axis=1, inplace=True)
    for label, content in df.items():
        # Fill missing numeric values with the median
        if pd.api.types.is_numeric_dtype(content):
            if pd.isnull(content).sum():
                df[label+"_is_missing"] = pd.isnull(content)
                df[label] = content.fillna(content.median())
        # Turn categorical variables into numbers
        if not pd.api.types.is_numeric_dtype(content):
            df[label+"_is_missing"] = pd.isnull(content)
            # We add the +1 because pandas encodes missing categories as -1
            df[label] = pd.Categorical(content).codes+1
    return df
df_test = preprocess_data(df_test)
df_test.head()
X_train.head()
# Make predictions on the updated test data
test_preds = ideal_model.predict(df_test)
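# Still fails — the test DataFrame's columns don't exactly match X_train's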
# We can find how columns differ using python sets
set(X_train.columns) - set(df_test.columns)
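# Preprocessing never created auctioneerID_is_missing for the test set
# (its auctioneerID column has no missing values), so add it manually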
df_test["auctioneerID_is_missing"] = False
df_test.head()
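# Match the order of the test columns to the training columns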
df_test = df_test.reindex(X_train.columns, axis=1)
# Make predictions on the test data
test_preds = ideal_model.predict(df_test)
test_preds
df_preds = pd.DataFrame()
df_preds["SalesID"] = df_test["SalesID"]
df_preds["SalesPrice"] = test_preds
df_preds
# Export prediction data
df_preds.to_csv("./bulldozer/test_predictions.csv", index=False)
# Find feature importance
ideal_model.feature_importances_
def plot_features(columns, importances, n=20):
    df = (pd.DataFrame({"features": columns,
                        "features_importances": importances})
          .sort_values("features_importances", ascending=False)
          .reset_index(drop=True))
    # Plot the top n features
    fig, ax = plt.subplots()
    ax.barh(df["features"][:n], df["features_importances"][:n])
    ax.set_ylabel("Features")
    ax.set_xlabel("Feature importance")
    ax.invert_yaxis()
plot_features(X_train.columns, ideal_model.feature_importances_)
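# Cross-check a high-ranking feature against the raw data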
df["Enclosure"].value_counts()