Download the regression and classification dataset:
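A minimal sketch of the download step, assuming the two archives are available at direct-download links; the URLs and archive names below are placeholders, not the actual dataset locations.

```python
# Minimal sketch: download both dataset archives into the working directory.
# The URLs and file names are placeholders for the real dataset locations.
import urllib.request

DATASET_URLS = {
    "regression.zip": "https://example.com/path/to/regression.zip",          # placeholder URL
    "classification.zip": "https://example.com/path/to/classification.zip",  # placeholder URL
}

for filename, url in DATASET_URLS.items():
    urllib.request.urlretrieve(url, filename)  # save each archive locally
    print(f"Downloaded {filename}")
```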
Unzip both datasets to target folders:
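A minimal sketch of the extraction step, assuming the archives downloaded above; the target folder names are illustrative.

```python
# Minimal sketch: extract each archive into its own target folder.
import zipfile

ARCHIVES = {
    "regression.zip": "data/regression",          # assumed target folder
    "classification.zip": "data/classification",  # assumed target folder
}

for archive, target in ARCHIVES.items():
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(target)  # unpack every file in the archive into the target folder
```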
A function to write model predictions to a CSV file that can be uploaded directly to Kaggle for score comparison
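A minimal sketch of such a helper; the column names `Id` and `pop` are assumptions and should be replaced with whatever the competition's sample submission file uses.

```python
# Minimal sketch of a submission helper; column names are assumptions
# and must match the competition's sample_submission.csv.
import pandas as pd

def write_submission(ids, predictions, path="submission.csv"):
    """Write predictions to a CSV file in the format Kaggle expects for upload."""
    submission = pd.DataFrame({"Id": ids, "pop": predictions})
    submission.to_csv(path, index=False)  # Kaggle submissions must not include the index
    return path
```

The resulting file can then be uploaded on the competition page (or with the Kaggle CLI's `kaggle competitions submit` command) to obtain the leaderboard score.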
A regression problem that aims to predict the popularity score of a song
RMSE on Kaggle test data: 8.91431
RMSE on Kaggle test data: 9.00005
RMSE on Kaggle test data: 7.59224 (score when the full training set is used to train the model)
Conclusion
A classification problem that aims to predict the top genre that a song belongs to
Null top genre values were filled with "adult standards" because it was the most frequent genre, as identified in an earlier plot.
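A minimal sketch of this imputation step, assuming the genre column is named `top genre` and the training file lives at the path shown; both are assumptions.

```python
# Minimal sketch: fill missing genres with the most frequent genre found earlier.
# The column name "top genre" and the file path are assumptions.
import pandas as pd

train = pd.read_csv("data/classification/train.csv")
train["top genre"] = train["top genre"].fillna("adult standards")
```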
One-hot encoding the title and artist columns for both the training and test datasets
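A minimal sketch of the encoding step, assuming pandas DataFrames named `train` and `test` with `title` and `artist` columns; aligning afterwards keeps the two column sets consistent.

```python
# Minimal sketch: one-hot encode the categorical columns in both splits,
# then align so the test set has exactly the training set's dummy columns.
import pandas as pd

train_enc = pd.get_dummies(train, columns=["title", "artist"])
test_enc = pd.get_dummies(test, columns=["title", "artist"])

# Categories absent from the test set become all-zero columns.
train_enc, test_enc = train_enc.align(test_enc, join="left", axis=1, fill_value=0)
```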
We scaled the data with RobustScaler() because it performed best of the scalers we tried. StandardScaler standardizes each feature to zero mean and unit variance, following the standard normal distribution. RobustScaler, by contrast, scales features using statistics that are robust to outliers: it removes the median and scales the data by the range between the 1st quartile (25th percentile) and the 3rd quartile (75th percentile), known as the interquartile range (IQR).
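A minimal sketch of the scaling step, assuming `X_train` and `X_test` are the numeric feature matrices; the scaler is fit on the training data only and reused for the test data.

```python
# Minimal sketch: centre on the median and scale by the IQR, fitting on training data only.
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learn median and IQR from training data
X_test_scaled = scaler.transform(X_test)        # apply the same transformation to test data
```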
Accuracy on Kaggle test data: 0.23214
Accuracy on Kaggle test data: 0.50000
Gaussian Naive Bayes is a variant of Naive Bayes that assumes continuous features follow a Gaussian (normal) distribution. Naive Bayes is a family of supervised classification algorithms based on Bayes' theorem; despite being a simple technique, it often performs well in practice.
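A minimal sketch of fitting the classifier, assuming the scaled feature matrices from above and a label vector `y_train`; variable names are assumptions.

```python
# Minimal sketch: fit Gaussian Naive Bayes and predict the top genre for the test songs.
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train_scaled, y_train)          # one Gaussian per feature per class
test_pred = gnb.predict(X_test_scaled)    # predicted top genre for each test song
```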
Computing the accuracy score on the hold-out portion of the training data that we split off earlier
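A minimal sketch of that check, assuming a hold-out split `(X_val, y_val)` was created earlier (e.g. with `train_test_split`); the names are assumptions.

```python
# Minimal sketch: score the fitted model on the hold-out validation split.
from sklearn.metrics import accuracy_score

val_pred = gnb.predict(X_val)
print("Validation accuracy:", accuracy_score(y_val, val_pred))
```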
Accuracy on Kaggle test data: 0.41071 (score when the full training set is used to train the model)