Dappling in Machine Learning

# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the read-only "../input/" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# save filepath to variable for easier access melbourne_file_path = '../input/melbourne-housing-snapshot/melb_data.csv' # read the data and store data in DataFrame titled melbourne_data melbourne_data = pd.read_csv(melbourne_file_path) # print a summary of the data in Melbourne data melbourne_data.describe()

# Path of the file to read iowa_file_path = '../input/home-data-for-ml-course/train.csv' # Fill in the line below to read the file into a variable home_data home_data = pd.read_csv(iowa_file_path)

home_data.describe()

melbourne_data.columns

melbourne_data = melbourne_data.dropna(axis=0)

mel_y = melbourne_data.Price

melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude'] mel_X = melbourne_data[melbourne_features] mel_X.describe()

mel_X.head()

from sklearn.tree import DecisionTreeRegressor melbourne_model = DecisionTreeRegressor(random_state=1) melbourne_model.fit(mel_X, mel_y)

print("Making predictions for the following 5 houses:") print(mel_X.head()) print("The predictions are") print(melbourne_model.predict(mel_X.head()))

home_data.columns

iowa_y = home_data.SalePrice feature_names = ['LotArea','YearBuilt','1stFlrSF','2ndFlrSF','FullBath','BedroomAbvGr','TotRmsAbvGrd'] iowa_X = home_data[feature_names]

iowa_X.describe()

iowa_X.head()

iowa_model = DecisionTreeRegressor(random_state=1) iowa_model.fit(iowa_X, iowa_y)

predictions = iowa_model.predict(iowa_X) print(predictions)

home_data.head

# Load data melbourne_file_path = '../input/melbourne-housing-snapshot/melb_data.csv' melbourne_data = pd.read_csv(melbourne_file_path) # Filter rows with missing price values filtered_melbourne_data = melbourne_data.dropna(axis=0) # Choose target and features mel_y = filtered_melbourne_data.Price melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude'] mel_X = filtered_melbourne_data[melbourne_features] from sklearn.tree import DecisionTreeRegressor # Define model melbourne_model = DecisionTreeRegressor() # Fit model melbourne_model.fit(mel_X, mel_y)

from sklearn.metrics import mean_absolute_error predicted_home_prices = melbourne_model.predict(mel_X) mean_absolute_error(mel_y, predicted_home_prices)

from sklearn.model_selection import train_test_split # split data into training and validation data, for both features and target # The split is based on a random number generator. Supplying a numeric value to # the random_state argument guarantees we get the same split every time we # run this script. train_X, val_X, train_y, val_y = train_test_split(mel_X, mel_y, random_state = 0) # Define model melbourne_model = DecisionTreeRegressor() # Fit model melbourne_model.fit(train_X, train_y) # get predicted prices on validation data val_predictions = melbourne_model.predict(val_X) print(mean_absolute_error(val_y, val_predictions))

iowa_X.head()

iowa_X.describe()

train_X, val_X, train_y, val_y = train_test_split(iowa_X, iowa_y, random_state = 1)

iowa_model = DecisionTreeRegressor(random_state = 1) iowa_model.fit(train_X, train_y)

val_predictions = iowa_model.predict(val_X)

print(val_predictions) print(val_y)

print(val_y)

val_mae = mean_absolute_error(val_y, val_predictions) print(val_mae)

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y): model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0) model.fit(train_X, train_y) preds_val = model.predict(val_X) mae = mean_absolute_error(val_y, preds_val) return(mae)

# Load data melbourne_file_path = '../input/melbourne-housing-snapshot/melb_data.csv' melbourne_data = pd.read_csv(melbourne_file_path) # Filter rows with missing values filtered_melbourne_data = melbourne_data.dropna(axis=0) # Choose target and features y = filtered_melbourne_data.Price melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude'] X = filtered_melbourne_data[melbourne_features] from sklearn.model_selection import train_test_split # split data into training and validation data, for both features and target train_X, val_X, train_y, val_y = train_test_split(X, y,random_state = 0)

# compare MAE with differing values of max_leaf_nodes for max_leaf_nodes in [5, 50, 500, 5000]: my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y) print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" %(max_leaf_nodes, my_mae))

for max_leaf_nodes in [19, 69, 420, 9000]: my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y) print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" %(max_leaf_nodes, my_mae))

# Path of the file to read iowa_file_path = '../input/home-data-for-ml-course/train.csv' home_data = pd.read_csv(iowa_file_path) # Create target object and call it y y = home_data.SalePrice # Create X features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd'] X = home_data[features] # Split into validation and training data train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1) # Specify Model iowa_model = DecisionTreeRegressor(random_state=1) # Fit Model iowa_model.fit(train_X, train_y) # Make validation predictions and calculate mean absolute error val_predictions = iowa_model.predict(val_X) val_mae = mean_absolute_error(val_predictions, val_y) print("Validation MAE: {:,.0f}".format(val_mae))

candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500] # loop to find the ideal tree size from candidate_max_leaf_nodes scores = {leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y) for leaf_size in candidate_max_leaf_nodes} best_tree_size = min(scores, key=scores.get) print(scores) print(best_tree_size)

# Load data melbourne_file_path = '../input/melbourne-housing-snapshot/melb_data.csv' melbourne_data = pd.read_csv(melbourne_file_path) # Filter rows with missing values melbourne_data = melbourne_data.dropna(axis=0) # Choose target and features y = melbourne_data.Price melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude'] X = melbourne_data[melbourne_features]

train_X, val_X, train_y, val_y = train_test_split(X, y,random_state = 0)

from sklearn.ensemble import RandomForestRegressor forest_model = RandomForestRegressor(random_state=1) forest_model.fit(train_X, train_y) melb_preds = forest_model.predict(val_X) print(mean_absolute_error(val_y, melb_preds))

# Path of the file to read iowa_file_path = '../input/home-data-for-ml-course/train.csv' home_data = pd.read_csv(iowa_file_path) # Create target object and call it y y = home_data.SalePrice # Create X features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd'] X = home_data[features] # Split into validation and training data train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1) # Specify Model iowa_model = DecisionTreeRegressor(random_state=1) # Fit Model iowa_model.fit(train_X, train_y) # Make validation predictions and calculate mean absolute error val_predictions = iowa_model.predict(val_X) val_mae = mean_absolute_error(val_predictions, val_y) print("Validation MAE when not specifying max_leaf_nodes: {:,.0f}".format(val_mae)) # Using best value for max_leaf_nodes iowa_model = DecisionTreeRegressor(max_leaf_nodes=100, random_state=1) iowa_model.fit(train_X, train_y) val_predictions = iowa_model.predict(val_X) val_mae = mean_absolute_error(val_predictions, val_y) print("Validation MAE for best value of max_leaf_nodes: {:,.0f}".format(val_mae))

# Define the model. Set random_state to 1 rf_model = RandomForestRegressor(random_state=1) # fit your model rf_model.fit(train_X, train_y) # Calculate the mean absolute error of your Random Forest model on the validation data rf_model_predictions=rf_model.predict(val_X) rf_val_mae = mean_absolute_error(rf_model_predictions, val_y) print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))