Task 0
We added requirement.txt with scikit-learn and graphviz,
then ran pip install -r requirement.txt in the terminal.
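As a sketch, the requirement.txt from Task 0 could contain the following (note that the pip package for sklearn is named scikit-learn; any version pins would be an assumption, so none are given):

```
# requirement.txt -- packages needed for the tasks below
scikit-learn
graphviz
```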
Task 1
Reading the data
Training the baseline classifier
Trying out some different classifiers
Tuning the hyperparameters
Tuning shows that max_depth = 7 gives us the best accuracy.
Final evaluation
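The Task 1 workflow above (baseline, candidate classifiers, tuning) can be sketched as follows. The dataset here is synthetic and the classifier choices are illustrative assumptions; the real task reads its own data:

```python
# Sketch of Task 1: a dummy baseline, a couple of candidate classifiers
# compared by cross-validation, and the tuned tree with max_depth = 7.
# make_classification stands in for the actual dataset.
from sklearn.datasets import make_classification
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=500, n_features=10, random_state=0)

models = {
    "dummy": DummyClassifier(strategy="most_frequent"),
    "logreg": LogisticRegression(max_iter=1000),
    "tree(max_depth=7)": DecisionTreeClassifier(max_depth=7, random_state=0),
}
# mean 5-fold cross-validation accuracy for each model
scores = {name: cross_val_score(m, X, y, cv=5).mean() for name, m in models.items()}
for name, score in scores.items():
    print(f"{name:20s} {score:.3f}")
```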
Task 2
Use the defined class TreeClassifier as your classifier
Also draw the tree
Tune the hyperparameter max_depth to get the best cross-validation performance
max_depth = 5 gives the highest mean cross-validation accuracy in range(2, 11), with a score of 0.9123529411764706.
Evaluate the classifier on the test set
The mean score on the test data is 0.8709712722298221.
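The Task 2 tuning-and-evaluation workflow can be sketched like this, with sklearn's DecisionTreeClassifier standing in for the provided TreeClassifier class and synthetic data standing in for the real dataset (both assumptions):

```python
# Sketch of Task 2: select max_depth in range(2, 11) by mean cross-validation
# accuracy, then retrain on the full training set and score once on the test
# set. DecisionTreeClassifier is a stand-in for the assignment's TreeClassifier.
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=800, n_features=12, random_state=1)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=1)

best_depth, best_score = None, -1.0
for depth in range(2, 11):
    score = cross_val_score(
        DecisionTreeClassifier(max_depth=depth, random_state=1),
        Xtrain, ytrain, cv=5,
    ).mean()
    if score > best_score:
        best_depth, best_score = depth, score

clf = DecisionTreeClassifier(max_depth=best_depth, random_state=1).fit(Xtrain, ytrain)
test_score = clf.score(Xtest, ytest)
print(best_depth, round(best_score, 3), round(test_score, 3))
```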
Task 3
LinearRegression
Linear regression assumes a linear relationship between the input data and the output data: a linear combination of all input variables produces the output.
Ridge
Ridge regression is a regularized linear regression. It shrinks the coefficients of input variables that do not contribute much to the prediction task.
Lasso
Lasso is also a regularization model that makes use of shrinkage, but it uses L1 regularization, in contrast to Ridge, which uses L2; the L1 penalty can drive some coefficients exactly to zero.
DecisionTreeRegressor
Decision tree regression observes the features of an object and trains a tree-structured model that predicts a meaningful continuous output for future data.
RandomForestRegressor
A random forest regressor fits multiple decision trees on various sub-samples of the dataset and then averages their predictions to increase accuracy.
GradientBoostingRegressor
A gradient boosting regressor builds a predictive model from an ensemble of weak predictive models, typically shallow decision trees added in stages.
MLPRegressor
MLPRegressor implements a multi-layer perceptron (MLP) that trains using backpropagation. It uses no activation function in the output layer, and it supports multi-output regression, in which a sample can have more than one target.
To compare cross-validation scores, we replaced the dummy regressor with these seven more meaningful regressors. Not all of them scored significantly higher than the dummy baseline: the Gradient Boosting Regressor achieved a comparatively higher score than the other regressors, followed by the Random Forest Regressor. Lasso, Ridge, and Linear Regression performed quite similarly to each other and better than the dummy regressor, while the Decision Tree Regressor and MLP Regressor did not beat the dummy regressor.
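The comparison described above can be sketched as a cross-validation loop over all seven regressors plus the dummy baseline. Synthetic data stands in for the actual dataset, and default hyperparameters are an assumption:

```python
# Sketch of the Task 3 comparison: mean cross-validated R^2 for each regressor
# against a DummyRegressor baseline. make_regression is a stand-in dataset.
from sklearn.datasets import make_regression
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=400, n_features=8, noise=10.0, random_state=0)

models = {
    "dummy": DummyRegressor(),
    "linear": LinearRegression(),
    "ridge": Ridge(),
    "lasso": Lasso(),
    "tree": DecisionTreeRegressor(random_state=0),
    "forest": RandomForestRegressor(random_state=0),
    "boosting": GradientBoostingRegressor(random_state=0),
    "mlp": MLPRegressor(max_iter=1000, random_state=0),
}
results = {name: cross_val_score(m, X, y, cv=5, scoring="r2").mean()
           for name, m in models.items()}
# print models from best to worst mean R^2
for name, score in sorted(results.items(), key=lambda kv: -kv[1]):
    print(f"{name:10s} {score:.3f}")
```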
Finally, we train the best regressor on the full training set and evaluate it.
Task 4
Step 1. Implementing the regression model
Step 2. Sanity check
generate such a dataset and plot it
What kind of decision tree would we want to describe this data?
The output, or target (yy), is continuous rather than categorical or discrete, so classifier algorithms such as DecisionTreeClassifier are not an option. Because the target takes continuous values, regression is our choice, and we think DecisionTreeRegressor is the better option.
Train your decision tree regressor algorithm then draw the tree
Select the tree depth according to your common sense
Does the result make sense?
What happens if we allow the tree depth to be a large number?
Step 3. Predicting apartment prices using decision tree regression
Please describe which tree depth gives the best result.
To find the best max_depth, we ran the algorithm in a loop and compared the scores.
Plot
Based on the plot, the score keeps improving as the depth increases beyond 7, and the graph shows that 17 is the best depth in range(2, 18).
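The depth search described above can be sketched as a loop over max_depth values compared by mean cross-validation score. Synthetic data stands in for the apartment-price dataset:

```python
# Sketch of the Step 3 depth search: score DecisionTreeRegressor for each
# max_depth in range(2, 18) by 5-fold cross-validation and keep the best.
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=600, n_features=6, noise=5.0, random_state=2)

scores = {
    depth: cross_val_score(
        DecisionTreeRegressor(max_depth=depth, random_state=2), X, y, cv=5
    ).mean()
    for depth in range(2, 18)
}
best_depth = max(scores, key=scores.get)
print(best_depth, round(scores[best_depth], 3))
```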
Evaluation score on the test set.
Step 4. Underfitting and overfitting
- draw a plot that shows the evaluation score on the training set and on the test set
- for different values of max_depth, ranging from 0 to 12
- do not use cross-validation this time
To evaluate the regression model we use the mean squared error (MSE), which is the sum of the squared prediction errors (real output minus predicted output) divided by the number of data points: MSE = (1/n) * sum_i (y_i - yhat_i)^2.
The plot shows the RMSE (the square root of the MSE) on both the training data and the test data for each max_depth.
We can see that the training RMSE keeps decreasing as max_depth increases, while the test RMSE decreases only up to a depth of 5 and then rises as the depth grows further. So we think the model starts to overfit when max_depth is greater than 5.
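The Step 4 experiment can be sketched as follows: compute train-set and test-set RMSE for a range of max_depth values without cross-validation. The synthetic sine dataset here is an assumption standing in for the real data (and max_depth must be at least 1 in sklearn, so the loop starts there):

```python
# Sketch of the underfitting/overfitting experiment: RMSE on the training set
# and on a held-out test set for increasing max_depth, no cross-validation.
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)
X = rng.uniform(0, 10, size=(400, 1))
y = np.sin(X[:, 0]) + rng.normal(scale=0.3, size=400)
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3, random_state=0)

train_rmse, test_rmse = [], []
depths = range(1, 13)
for depth in depths:
    model = DecisionTreeRegressor(max_depth=depth, random_state=0).fit(Xtr, ytr)
    # RMSE = sqrt(MSE); computed manually to stay version-independent
    train_rmse.append(mean_squared_error(ytr, model.predict(Xtr)) ** 0.5)
    test_rmse.append(mean_squared_error(yte, model.predict(Xte)) ** 0.5)

# training error shrinks monotonically; test error bottoms out, then rises
print(round(min(test_rmse), 3), round(test_rmse[-1], 3))
```

Plotting train_rmse and test_rmse against depths (e.g. with matplotlib) reproduces the U-shaped test curve discussed above.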