import pandas as pd
from sklearn.model_selection import train_test_split
# Read the CSV file.
data = pd.read_csv('CTG.csv', skiprows=1)
# Select the relevant numerical columns.
selected_cols = ['LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'ASTV', 'MSTV', 'ALTV',
'MLTV', 'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean',
'Median', 'Variance', 'Tendency', 'NSP']
# Filter out rows with missing values.
data = data[selected_cols].dropna()
# Shuffle the dataset.
data_shuffled = data.sample(frac=1.0, random_state=0)
# Split into input part X and output part Y.
X = data_shuffled.drop('NSP', axis=1)
# Map the diagnosis code to a human-readable label.
def to_label(y):
    return [None, 'normal', 'suspect', 'pathologic'][int(y)]
Y = data_shuffled['NSP'].apply(to_label)
# Partition the data into training and test sets.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=0)
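# The NSP classes are imbalanced (most recordings are 'normal'). An optional
# variation, not used in the cells below: pass stratify=Y so both parts keep
# the same label proportions. Stored under new names so later results are unaffected.
Xtrain_s, Xtest_s, Ytrain_s, Ytest_s = train_test_split(
    X, Y, test_size=0.2, random_state=0, stratify=Y)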
# We can also call data_shuffled.head() to take a peek at the data set.
X.head()
from sklearn.dummy import DummyClassifier
clf = DummyClassifier(strategy='most_frequent')
from sklearn.model_selection import cross_val_score
cross_val_score(clf, Xtrain, Ytrain, cv=5)
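# The call above returns one accuracy per fold. To compare classifiers with a
# single number, summarize the folds with their mean (and spread):
scores = cross_val_score(clf, Xtrain, Ytrain, cv=5)
print("mean = {:.3f}, std = {:.3f}".format(scores.mean(), scores.std()))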
from sklearn.tree import DecisionTreeClassifier
# Results are about the same if we set max_depth to 10.
clf1 = DecisionTreeClassifier(max_depth=10)
cross_val_score(clf1, Xtrain, Ytrain, cv=5)
from sklearn.ensemble import RandomForestClassifier
clf2 = RandomForestClassifier(n_estimators=100)
cross_val_score(clf2, Xtrain, Ytrain, cv=5)
from sklearn.ensemble import GradientBoostingClassifier
clf3 = GradientBoostingClassifier()
cross_val_score(clf3, Xtrain, Ytrain, cv=5)
from sklearn.linear_model import Perceptron
clf4 = Perceptron()
cross_val_score(clf4, Xtrain, Ytrain, cv=5)
# Standardize the data.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    StandardScaler().fit_transform(X), Y, test_size=0.2, random_state=0)
clf5 = LogisticRegression(max_iter=1000)
cross_val_score(clf5, Xtrain, Ytrain, cv=5)
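# Note: the scaler above is fit on all of X before splitting, so the test rows
# influence the scaling. A leak-free sketch (an alternative, not what the cell
# above does) wraps the scaler and model in a Pipeline, re-splitting raw X:
from sklearn.pipeline import make_pipeline
Xtr_raw, Xte_raw, Ytr_raw, Yte_raw = train_test_split(X, Y, test_size=0.2, random_state=0)
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
cross_val_score(pipe, Xtr_raw, Ytr_raw, cv=5)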
# Increase max_iter from 1000 to 10000 and set dual=True in the LinearSVC classifier.
from sklearn.svm import LinearSVC
clf6 = LinearSVC(max_iter=10000, dual=True)
cross_val_score(clf6, Xtrain, Ytrain, cv=5)
from sklearn.neural_network import MLPClassifier
clf7 = MLPClassifier(max_iter=1000)
cross_val_score(clf7, Xtrain, Ytrain, cv=5)
from sklearn.metrics import accuracy_score
clf3.fit(Xtrain, Ytrain)
Yguess = clf3.predict(Xtest)
print(accuracy_score(Ytest, Yguess))
0.9295774647887324
# Alternatively
clf3.fit(Xtrain, Ytrain)
clf3.score(Xtest, Ytest)
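# Accuracy alone hides which classes get confused. A per-label breakdown of the
# test predictions (precision, recall, F1) can be printed like this:
from sklearn.metrics import classification_report
print(classification_report(Ytest, Yguess))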
!pip install graphviz
%run DecisionTree.py
import numpy as np
test = []
max_depth = 30
for i in range(max_depth):
    cls = TreeClassifier(max_depth=i+1)
    score = cross_val_score(cls, Xtrain, Ytrain, cv=5, scoring='accuracy').mean()
    test.append(score)
# Depth (1-based) that achieved the highest mean cross-validation accuracy.
max_depth_best = np.arange(1, max_depth+1)[np.argmax(test)]
import matplotlib.pyplot as plt
plt.figure(figsize=(12,7))
plt.style.use('ggplot')
plt.xticks(range(1, max_depth+1))
plt.plot(range(1, max_depth+1), test, 'bo-', label="max_depth")
plt.xlabel('Max Depth', fontsize=15)
plt.ylabel('Mean of 5-Fold Cross Validation Score', fontsize=15)
plt.legend(fontsize=15, loc='lower right')
plt.grid(True)
plt.show()
print("Best Cross Validation Score: {cv_score:.2f}% when max_depth = {md:d}"\
.format(cv_score=max(test)*100, md=max_depth_best))
Best Cross Validation Score: 91.35% when max_depth = 13
cls = TreeClassifier(max_depth=max_depth_best)
cls.fit(Xtrain, Ytrain)
Ypredict = cls.predict(Xtest)
print("Best performance is {score:.2f}% when # of Tree = {treenum:d}"\
.format(score = accuracy_score(Ytest, Ypredict)*100, treenum = max_depth_best))
Best performance is 87.09% when # of Tree = 13
# Draw a shallower tree (a quarter of the best depth) so the plot stays legible.
cls = TreeClassifier(max_depth=max_depth_best // 4)
cls.fit(Xtrain, Ytrain)
cls.draw_tree()
# Read the CSV file using Pandas.
alldata = pd.read_csv('sberbank.csv')
# Convert the timestamp string to an integer representing the year.
def get_year(timestamp):
    return int(timestamp[:4])
alldata['year'] = alldata.timestamp.apply(get_year)
# Select the 7 input columns and the output column.
selected_columns = ['price_doc', 'year', 'full_sq', 'life_sq', 'floor', 'num_room', 'kitch_sq', 'full_all']
alldata = alldata[selected_columns]
alldata = alldata.dropna()
# Shuffle.
alldata_shuffled = alldata.sample(frac=1.0, random_state=0)
# Separate the input and output columns.
X = alldata_shuffled.drop('price_doc', axis=1)
# For the output, we'll use the log of the sales price.
Y = alldata_shuffled['price_doc'].apply(np.log)
# Split into training and test sets.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=0)
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_validate
m1 = DummyRegressor()
cross_validate(m1, Xtrain, Ytrain, scoring='neg_mean_squared_error')
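# The dummy baseline predicts the mean log price for every apartment. As an
# illustrative comparison (not part of the original cells), a random forest
# regressor can be cross-validated the same way:
from sklearn.ensemble import RandomForestRegressor
m2 = RandomForestRegressor(n_estimators=100, random_state=0)
cross_validate(m2, Xtrain, Ytrain, scoring='neg_mean_squared_error')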
from sklearn.metrics import mean_squared_error
# Fit the baseline on the training set and measure its test-set error.
m1.fit(Xtrain, Ytrain)
mean_squared_error(Ytest, m1.predict(Xtest))
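# Since the target is the log of the sale price, predictions can be mapped back
# to the original price scale with np.exp (a quick illustrative check):
print(np.exp(m1.predict(Xtest))[:5])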