#Pre-Installed Imports
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import pandas as pd
import statistics as stat
import sys
import random
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn import linear_model
#Allison Imports
from statsmodels.stats.power import TTestIndPower
#Isha Imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.dates as mdates
import seaborn
import datetime as dt
import statsmodels.api as sm
# pip install yellowbrick
from yellowbrick.datasets import load_concrete
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.style import set_palette
from sklearn import preprocessing
import plotly
import plotly.express as px
#Mary Imports
import warnings
warnings.filterwarnings('ignore')
from sklearn.decomposition import PCA as sk_PCA
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans as sk_KMeans
from sklearn.tree import _tree, DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances_argmin
from sklearn.metrics import silhouette_samples, silhouette_score
from dython.nominal import associations # must pip install dython to run
from IPython.display import display, HTML
import seaborn as sns
sns.set()
#Annabelle Imports
from dataclasses import dataclass
from time import time
%matplotlib inline
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve, auc, roc_curve
import requests
warnings.filterwarnings('ignore')
from numpy import random
random.seed(17669368)
anime = pd.read_csv('NYU_IntroCapstone_ANIME - Sheet1.csv')
broadway = pd.read_csv('NYU_IntroCapstone_BROADWAY - Sheet1.csv')
classical = pd.read_csv('NYU_IntroCapstone_CLASSICAL - Sheet1.csv')
country = pd.read_csv('NYU_IntroCapstone_COUNTRY - Sheet1.csv')
dance_electronic = pd.read_csv('NYU_IntroCapstone_DANCE_ELECTRONIC - Sheet1.csv')
disney = pd.read_csv('NYU_IntroCapstone_DISNEY - Sheet1.csv')
happy_holidays = pd.read_csv('NYU_IntroCapstone_HAPPY_HOLIDAYS - Sheet1.csv')
hip_hop = pd.read_csv('NYU_IntroCapstone_HIP_HOP - Sheet1.csv')
jazz = pd.read_csv('NYU_IntroCapstone_JAZZ - Sheet1.csv')
latin = pd.read_csv('NYU_IntroCapstone_LATIN - Sheet1.csv')
pop = pd.read_csv('NYU_IntroCapstone_POP - Sheet1.csv')
rock = pd.read_csv('NYU_IntroCapstone_ROCK - Sheet1.csv')
data_df = pd.concat([anime, broadway, classical, country, dance_electronic, disney, happy_holidays, hip_hop, jazz, latin, pop, rock], axis=0)
data_df = data_df.drop_duplicates(subset=['Title', 'Artist'])
data_df = data_df.drop('#', axis = 1)
data_df = data_df.dropna(axis=0, how='any').reset_index(drop=True) #remove rows that have any NaN
#Convert Length Column from Minutes and Seconds to Seconds
#Function to Get Seconds
def get_sec(time_str):
'''
This function takes the time in a min:sec format and converts it to seconds
'''
time_str = str(time_str)
m, s = time_str.split(':')
return int(m) * 60 + int(s)
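#Example (illustrative): get_sec('3:45') returns 3*60 + 45 = 225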
#Apply the get_sec Function to the Data (elif statements for improperly formatted data)
for i in range(data_df.shape[0]): # 14,892 rows after cleaning
if i == 2979:
data_df.iloc[i,8] = 1578
elif i == 4928:
data_df.iloc[i,8] = 1623
elif i == 9846:
data_df.iloc[i,8] = 1707
elif i == 10249:
data_df.iloc[i,8] = 2573
elif i == 10386:
data_df.iloc[i,8] = 1572
elif i == 10404:
data_df.iloc[i,8] = 1562
elif i == 10714:
data_df.iloc[i,8] = 1904
else:
data_df.iloc[i,8] = get_sec(data_df.iloc[i,8])
display(data_df)
#Copy over data frame
inference_df = data_df.copy()
inference_df['Length'] = inference_df['Length'].astype(float)
# Create genre & attributes list:
genres = inference_df.Genre.unique()
attributes = inference_df.columns.values.tolist()[3:11]
#Various Histograms to begin visualizing the difference between genres for certain attributes
color = ['red','green','blue','cyan','magenta','yellow','black','orange','purple','white','silver','maroon']
#sample size
display("Sample Sizes:")
display(inference_df.groupby(inference_df['Genre']).count().reset_index().iloc[:,0:2])
display("Plots:")
#each genre's attribute histogram
for attribute in attributes:
for i, genre in enumerate(genres):
filtered_df = inference_df[(inference_df.Genre == genre)]
plt.hist(filtered_df[attribute], rwidth = 0.2, color = color[i], edgecolor='gray', linewidth=1.2, label = genre, density = True)
plt.ylabel("Probability Density")
plt.xlabel(attribute)
plt.legend(loc='upper center', bbox_to_anchor=(1.25, 1),
fancybox=True, shadow=True, ncol=1)
plt.title(attribute)
plt.show()
#bar chart of average attribute per genre
for attribute in attributes:
fig, ax = plt.subplots(figsize=(10,4))
    X = inference_df.groupby(inference_df['Genre']).mean(numeric_only=True).reset_index()
    plt.bar(range(len(X)), X[attribute], color=color, edgecolor='gray', linewidth=1.2)
    plt.xticks(range(len(X)), X['Genre'], fontsize=8, rotation=90)
plt.ylabel(attribute)
plt.xlabel('Genre')
plt.show()
#Check variance using Levene test - Results show variance is not the same across genres
for attribute in attributes:
    variance_test = stats.levene(*(inference_df.loc[inference_df.Genre == genre, attribute] for genre in genres))
    display(attribute, variance_test) #use this line to see that multiple p-values are lower than 0.005, therefore we cannot assume equal variance
#Welch's t-test of all the combos of genres and features
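#(Welch's statistic is t = (mean1 - mean2) / sqrt(s1^2/n1 + s2^2/n2); it does not assume equal variances, consistent with the Levene results above)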
used_genre_attribute_combos = np.empty(0)
genre1_list = np.empty(0)
genre2_list = np.empty(0)
attribute_list = np.empty(0)
result_list = np.empty(0)
stat_list = np.empty(0)
effect_size_list = np.empty(0)
power_list = np.empty(0)
alpha = 0.005 #significance level
for attribute in attributes:
for genre1 in genres:
for genre2 in genres:
if (genre1 == genre2): #ignores comparing to itself
continue
elif ((genre1 + genre2 + attribute) in used_genre_attribute_combos): #ignores duplicate pairs
continue
else:
filtered_df1 = inference_df[(inference_df.Genre == genre1)]
filtered_df2 = inference_df[(inference_df.Genre == genre2)]
genre1_attribute_list = np.array(filtered_df1.loc[:,attribute])
genre2_attribute_list = np.array(filtered_df2.loc[:,attribute])
combined_genres_attribute_list = pd.concat([filtered_df1.loc[:,attribute], filtered_df2.loc[:,attribute]], axis=0)
#welch's t-test
result = stats.ttest_ind(genre1_attribute_list, genre2_attribute_list, equal_var=False)
#effect size
effect_size = np.absolute((np.mean(genre1_attribute_list) - np.mean(genre2_attribute_list))/np.std(combined_genres_attribute_list))
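                #(this is Cohen's d with the combined-sample SD as the denominator, an approximation to the pooled SD)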
#power
power_func = TTestIndPower()
power = power_func.solve_power(effect_size=effect_size, nobs1=filtered_df1.shape[0], alpha=alpha, power=None, ratio=(filtered_df2.shape[0]/filtered_df1.shape[0]), alternative='two-sided')
#lists of results
genre1_list = np.append(genre1_list, genre1)
genre2_list = np.append(genre2_list, genre2)
attribute_list = np.append(attribute_list, attribute)
result_list = np.append(result_list, result.pvalue)
stat_list = np.append(stat_list, result.statistic)
effect_size_list = np.append(effect_size_list, effect_size)
power_list = np.append(power_list,power)
used_genre_attribute_combos = np.append(used_genre_attribute_combos,(genre2 + genre1 + attribute))
cols = {'Attribute': attribute_list, 'Genre 1': genre1_list, 'Genre 2': genre2_list, 'P-Value': result_list, 'Test Statistic': stat_list, 'Effect Size': effect_size_list, 'Power': power_list}
#create df of results:
welch_t_test_result_df = pd.DataFrame(data=cols).sort_values('P-Value').reset_index()
#Show df of results:
with pd.option_context('display.max_rows', None,
'display.max_columns', None,
'display.precision', 3,
):
display("Results of Welch's T-Test:")
display(welch_t_test_result_df.sort_values('Genre 1').sort_values('Attribute'))
display(welch_t_test_result_df.shape)
#Find Number of Significant Results, Significant/High Powered Results, & Significant/High Powered Results/High Effect Size Results
display("Out of 66 possible combinations:")
#Significant
significant_df = welch_t_test_result_df[welch_t_test_result_df['P-Value'] < alpha].reset_index()
significant_df = significant_df.rename(columns={"P-Value": "Results"})
results_by_genre = pd.pivot_table(significant_df, values='Results', columns=['Attribute'], aggfunc='count', fill_value=0)
results_by_genre = results_by_genre.loc[:,attributes]
display("With Significant Values (p-value < 0.005)")
display(results_by_genre)
#Significant and High Power
significant_df = welch_t_test_result_df[(welch_t_test_result_df['P-Value'] < alpha)
& (welch_t_test_result_df['Power'] > 0.8)]
significant_df = significant_df.rename(columns={"P-Value": "Results"})
results_by_genre = pd.pivot_table(significant_df, values='Results', columns=['Attribute'], aggfunc='count', fill_value=0)
results_by_genre = results_by_genre.loc[:,attributes]
display("With Significant Values (p-value < 0.005) and High Power (1-Beta > 0.8)")
display(results_by_genre)
#Significant, High Power, and High Effect Size
significant_df = welch_t_test_result_df[(welch_t_test_result_df['P-Value'] < alpha)
& (welch_t_test_result_df['Power'] > 0.8)
& (welch_t_test_result_df['Effect Size'] > 0.8)]
significant_df = significant_df.rename(columns={"P-Value": "Results"})
results_by_genre = pd.pivot_table(significant_df, values='Results', columns=['Attribute'], aggfunc='count', fill_value=0)
results_by_genre = results_by_genre.loc[:,attributes]
display("With Significant Values (p-value < 0.005), High Power (1-Beta > 0.8), and High Effect Size (Cohen's d > 0.8)")
display(results_by_genre)
#Plots for Highest Differences in Length and Energy
#Length
length_significant_df = welch_t_test_result_df[(welch_t_test_result_df['P-Value'] < alpha)
& (welch_t_test_result_df['Power'] > 0.8)
& (welch_t_test_result_df['Effect Size'] > 0.8)
& (welch_t_test_result_df['Attribute'] == 'Length')]
plt.hist(inference_df['Length'][inference_df['Genre']=='Anime'], density = True, color ='red', bins = 10, rwidth = 0.7, label = "Anime")
plt.hist(inference_df['Length'][inference_df['Genre']=='Pop'], density = True, color = 'silver', bins = 10, rwidth = 0.7, label = "Pop")
plt.ylabel("Probability Density")
plt.xlabel('Length (seconds)')
plt.legend()
plt.show()
display(length_significant_df)
#Energy
energy_significant_df = welch_t_test_result_df[(welch_t_test_result_df['P-Value'] < alpha)
& (welch_t_test_result_df['Power'] > 0.8)
& (welch_t_test_result_df['Effect Size'] > 0.8)
& (welch_t_test_result_df['Attribute'] == 'Energy')]
plt.hist(inference_df['Energy'][inference_df['Genre']=='Anime'], density = True, bins = 10, color ='red', rwidth = 0.7, label = "Anime")
plt.hist(inference_df['Energy'][inference_df['Genre']=='Classical'], density = True, color = 'blue', bins = 10, rwidth = 0.7, label = "Classical")
plt.ylabel("Probability Density")
plt.xlabel('Energy')
plt.legend()
plt.show()
display(energy_significant_df)
#Find which genres differ most from each other:
#(In the report, the bottom-left corner is the top-right portion transposed, for better readability)
genre_vs_genre_pivot = pd.pivot_table(significant_df, values='Results', index=['Genre 1'],
                                      columns=['Genre 2'], aggfunc='count', fill_value=0)
display(genre_vs_genre_pivot)
# set random seed (Allison Redfern's NetID #).
allison_num_rand = 17669368
random.seed(allison_num_rand)
# remove 2 unnecessary columns (A.Sep, Rnd).
new_full_df = data_df.copy()
new_full_df['Length'] = new_full_df['Length'].astype(float)
new_full_df = new_full_df.drop(columns=['A.Sep', 'Rnd'])
# check.
print(new_full_df.shape)
display(new_full_df)
# build a baseline multiple Linear Regression model.
# explore.
print(new_full_df.shape)
print(new_full_df.columns)
display(new_full_df.head())
popularity = new_full_df['Pop.']
print('popularity: ')
print(popularity.value_counts())
print('min: ', popularity.min())
print('max: ', popularity.max())
print('\n')
# feature set.
feature_df = new_full_df.drop(columns=['Pop.'])
# drop all non-numeric features.
feature_df_new = feature_df.drop(columns=['Title', 'Artist', 'Genre', 'Release'])
print(feature_df_new.shape)
print(feature_df_new.columns)
# build a baseline multiple Linear Regression model.
# set-up.
X = feature_df_new
y = popularity
# use random_state netID.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=allison_num_rand)
# fit.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
regr = LinearRegression()
regr.fit(X_train, y_train)
# predict.
y_pred = regr.predict(X_test)
# coefficients.
print("coefficients: ", regr.coef_)
# mean squared error.
print("mean squared error: ", mean_squared_error(y_test, y_pred))
# coefficient of determination.
print("coefficient of determination: ", r2_score(y_test, y_pred))
# build an exploratory simple Linear Regression model.
# check 'Loud' predictor power on 'Pop.' outcome.
X_new = feature_df_new['Loud'].to_numpy().reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=allison_num_rand)
regr = LinearRegression()
regr.fit(X_train, y_train)
# predict.
y_pred = regr.predict(X_test)
# coefficients.
print("coefficients: ", regr.coef_)
# mean squared error.
print("mean squared error: ", mean_squared_error(y_test, y_pred))
# coefficient of determination.
print("coefficient of determination: ", r2_score(y_test, y_pred))
# investigation.
df_releasedate = new_full_df['Release']
display(df_releasedate.head())
print(df_releasedate.shape)
release_list = df_releasedate.tolist()
print(type(release_list[0]))
dates_release = pd.to_datetime(release_list)
print(dates_release)
print(popularity)
# plot popularity over time.
# set-up.
popularity_list = popularity.tolist()
series_pop = pd.Series(data=popularity_list, index=dates_release)
series_popOverTime = series_pop.sort_index(ascending=True)
df_popOverTime = series_popOverTime.to_frame()
df_popOverTime.reset_index(inplace=True)
# only visualize popularity past 1970.
df_filtered_popOverTime = df_popOverTime[df_popOverTime['index'] > '1970-01-01']
# x, y.
index_new_series = df_filtered_popOverTime['index'].tolist()
data_new_series = df_filtered_popOverTime[0].tolist()
series_filtered_popOverTime = pd.Series(data=data_new_series, index=index_new_series)
# plot.
plt.style.use('seaborn') # renamed to 'seaborn-v0_8' in newer matplotlib versions
seaborn.set_palette(palette='Pastel2', color_codes=True)
plt.plot_date(index_new_series, data_new_series, c=[0.7019607, 0.88627, 0.803921], alpha=0.5)
# trend line.
mdates_index_new_series = mdates.date2num(index_new_series)
z = np.polyfit(mdates_index_new_series, data_new_series, 1)
p = np.poly1d(z)
plt.plot(index_new_series, p(mdates_index_new_series), c='grey') # , alpha=0.5)
# show.
plt.tight_layout()
plt.xlabel('year')
plt.ylabel('popularity')
plt.show()
plt.close()
# feature engineering.
# calculate attributes.
df_calculate_attributes = df_popOverTime.rename(columns={"index": "date"})
display(df_calculate_attributes)
# ATTRIBUTE 1: MIN popularity each year.
min_popularity_year = df_calculate_attributes.groupby(df_calculate_attributes.date.dt.year)[0].min()
print('MIN POPULARITY: ')
display(min_popularity_year.tail(3))
# ATTRIBUTE 2: MAX popularity each year.
max_popularity_year = df_calculate_attributes.groupby(df_calculate_attributes.date.dt.year)[0].max()
print('MAX POPULARITY: ')
display(max_popularity_year.tail(3))
# ATTRIBUTE 3: COUNT of songs released each year.
count_songs_year = df_calculate_attributes.groupby(df_calculate_attributes.date.dt.year)[0].count()
print('NUMBER OF SONGS: ')
display(count_songs_year.tail(3))
print('shape: ')
print(count_songs_year.shape)
# distinct years in the df.
years_present = count_songs_year.index.values.tolist()
print('years present: ', years_present)
print(len(years_present))
# ATTRIBUTES 4, 5, 6, 7: mean, median, std, sum popularity each year.
mean_popularity_year = df_calculate_attributes.groupby(df_calculate_attributes.date.dt.year)[0].mean()
median_popularity_year = df_calculate_attributes.groupby(df_calculate_attributes.date.dt.year)[0].median()
std_popularity_year = df_calculate_attributes.groupby(df_calculate_attributes.date.dt.year)[0].std()
sum_popularity_year = df_calculate_attributes.groupby(df_calculate_attributes.date.dt.year)[0].sum()
# add 'year' attribute to original data.
# make a copy to modify.
original_df = new_full_df.copy()
# obtain year of song release from 'Release' attribute.
original_df['Release'] = pd.to_datetime(original_df['Release'])
release_list = original_df['Release']
# create list of years.
attribute_4_years_list = release_list.dt.year.tolist()
print('\n')
print('number of songs: ', len(attribute_4_years_list))
# add feature to df.
original_df['year'] = attribute_4_years_list
# check.
display(original_df)
# merge time-related calculated features onto original df by 'year'.
# add to df.
min_attribute_to_add = []
max_attribute_to_add = []
count_attribute_to_add = []
mean_attribute_to_add = []
median_attribute_to_add = []
std_attribute_to_add = []
sum_attribute_to_add = []
# lists.
min_popularity_year_list = list(min_popularity_year)
max_popularity_year_list = list(max_popularity_year)
count_songs_year_list = list(count_songs_year)
mean_popularity_year_list = list(mean_popularity_year)
median_popularity_year_list = list(median_popularity_year)
std_popularity_year_list = list(std_popularity_year)
sum_popularity_year_list = list(sum_popularity_year)
# years present.
index_of_each_year_list = years_present
print(index_of_each_year_list)
year_list_original = original_df['year'].tolist()
# check that index matches.
print(index_of_each_year_list.index(1926))
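# (For reference, a more compact equivalent of the loop below using pandas groupby/merge;
# left as a comment so the original, explicit approach still runs:)
# year_stats = df_calculate_attributes.groupby(df_calculate_attributes.date.dt.year)[0] \
#     .agg(['min', 'max', 'count', 'mean', 'median', 'std', 'sum'])
# merged = original_df.merge(year_stats, left_on='year', right_index=True, how='left')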
# loop through each row in df.
for i in range(original_df.shape[0]):
curr_year = year_list_original[i]
index_of_curr_year = index_of_each_year_list.index(curr_year)
# match year to corresponding info.
min_corresponding = min_popularity_year_list[index_of_curr_year]
max_corresponding = max_popularity_year_list[index_of_curr_year]
count_corresponding = count_songs_year_list[index_of_curr_year]
mean_corresponding = mean_popularity_year_list[index_of_curr_year]
median_corresponding = median_popularity_year_list[index_of_curr_year]
std_corresponding = std_popularity_year_list[index_of_curr_year]
sum_corresponding = sum_popularity_year_list[index_of_curr_year]
# append to respective lists.
min_attribute_to_add.append(min_corresponding)
max_attribute_to_add.append(max_corresponding)
count_attribute_to_add.append(count_corresponding)
mean_attribute_to_add.append(mean_corresponding)
median_attribute_to_add.append(median_corresponding)
std_attribute_to_add.append(std_corresponding)
sum_attribute_to_add.append(sum_corresponding)
# check lengths.
print(len(min_attribute_to_add))
print(len(max_attribute_to_add))
print(len(count_attribute_to_add))
print(len(mean_attribute_to_add))
print(len(median_attribute_to_add))
print(len(std_attribute_to_add))
print(len(sum_attribute_to_add))
# check values.
print(min_attribute_to_add[:5])
print(max_attribute_to_add[:5])
print(count_attribute_to_add[:5])
print(mean_attribute_to_add[:5])
print(median_attribute_to_add[:5])
print(std_attribute_to_add[:5])
print(sum_attribute_to_add[:5])
# add to dataframe.
original_df['minPop'] = min_attribute_to_add
original_df['maxPop'] = max_attribute_to_add
original_df['countYear'] = count_attribute_to_add
original_df['meanPop'] = mean_attribute_to_add
original_df['medianPop'] = median_attribute_to_add
original_df['stdPop'] = std_attribute_to_add
original_df['sumPop'] = sum_attribute_to_add
# check.
display(original_df)
# build a multiple Linear Regression model on original df with TIME data.
# obtain only time features.
time_feature_set = original_df.iloc[:, 12:]
display(time_feature_set)
# handle NaN values (likely years with a single release, where std is undefined).
original_df.at[1278, 'stdPop'] = 0
original_df.at[6270, 'stdPop'] = 0
original_df.at[9448, 'stdPop'] = 0
# set-up.
X = original_df.drop(columns=['Title', 'Artist', 'Release', 'Genre', 'Pop.'])
y = popularity
# use random_state netID.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=allison_num_rand)
# fit.
regr = LinearRegression()
regr.fit(X_train, y_train)
# predict.
y_pred = regr.predict(X_test)
# coefficients.
print("coefficients: ", regr.coef_)
# mean squared error.
print("mean squared error: ", mean_squared_error(y_test, y_pred))
# coefficient of determination.
print("coefficient of determination: ", r2_score(y_test, y_pred))
# investigation.
genre_counts = new_full_df['Genre'].value_counts()
print(genre_counts)
# plot popularity by genre to investigate possible effects.
# plot.
fig = px.scatter(new_full_df, y="Pop.", x="medianPop", color="Genre",
                 color_discrete_sequence=["red", "green", "blue", "cyan", "magenta",
                                          "yellow", "black", "orange", "purple", "white",
                                          "silver", "maroon"])
# remove gridlines.
fig.update_layout(xaxis=dict(showgrid=False),
yaxis=dict(showgrid=False))
# change size.
fig.update_traces(marker_size=4)
# display.
fig.show()
# convert categorical variable into numeric representation.
# list of genres.
genre_list = new_full_df['Genre'].tolist()
# create indicator variables.
new_genre_df = pd.DataFrame({'Genre': genre_list})
genre_df_separate = pd.get_dummies(new_genre_df)
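# e.g. a row with Genre == 'Anime' gets 1 in the Genre_Anime indicator column and 0 in the other 11.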
# check.
display(genre_df_separate)
# build a multiple Linear Regression model on only GENRE data.
# obtain only genre features.
genre_feature_set = genre_df_separate
display(genre_df_separate)
# set-up.
X = genre_feature_set
y = popularity
# use random_state netID.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=allison_num_rand)
# fit.
regr = LinearRegression()
regr.fit(X_train, y_train)
# predict.
y_pred = regr.predict(X_test)
# coefficients.
print("coefficients: ", regr.coef_)
# mean squared error.
print("mean squared error: ", mean_squared_error(y_test, y_pred))
# coefficient of determination.
print("coefficient of determination: ", r2_score(y_test, y_pred))
# create combined feature set.
# combine original attributes (12), time attributes (8), genre attributes (12).
feature_set_concat = pd.concat([original_df, genre_df_separate], axis=1)
display(feature_set_concat)
# drop non-numeric columns.
filtered_feature_set = feature_set_concat.drop(columns=['Title', 'Artist', 'Release', 'Genre', 'Pop.'])
display(filtered_feature_set)
# normalize labels.
labels_popularity_list = popularity.tolist()
labels_popularity_list_normalized = [(x/100) for x in labels_popularity_list]
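# e.g. a popularity score of 73 becomes 0.73.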
# check.
print(labels_popularity_list_normalized[:10])
# build a Regularized Regression model (Ridge); train/test on combined feature set.
# set-up.
X = filtered_feature_set
y = labels_popularity_list_normalized
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=allison_num_rand)
# hyperparameter tuning (note: np.arange(1, 101, 100) contains only alpha=1; widen the grid to search more values).
alpha_list = np.arange(1, 101, 100)
# cross-validation.
model_standardized = linear_model.RidgeCV(alphas = alpha_list, cv=10)
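# (Ridge minimizes ||y - Xw||^2 + alpha*||w||^2; RidgeCV picks alpha from alpha_list by cross-validation.)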
# standardize training/testing sets.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# fit Ridge on standardized data.
model_standardized.fit(X_train_scaled, y_train)
# obtain predictions.
scaled_predictions = model_standardized.predict(X_test_scaled)
# measure performance.
# coefficients.
print("coefficients: ", model_standardized.coef_)
# root mean squared error (squared=False makes mean_squared_error return the RMSE).
print("root mean squared error: ", mean_squared_error(y_test, scaled_predictions, squared=False))
# coefficient of determination.
print("coefficient of determination: ", r2_score(y_test, scaled_predictions))
"""
coefficients: [ 8.86402004e-07 -1.76711787e-02 1.31199909e-02 2.55697739e-02
-7.22367567e-03 -7.49757374e-03 -5.11259157e-03 -6.97105126e-03
4.94013254e-03 -2.32173221e-03 2.74905341e-02 2.77715155e-02
-1.18636196e-02 1.70466642e-03 -4.73663403e-02 -4.86293602e-03
-5.27887719e-02 -8.56424514e-03 2.00124357e-02 7.79449638e-03
1.67410961e-04 -2.67200390e-02 2.63725393e-02 -4.97467324e-02
4.07161417e-02 4.25784420e-02 -1.67247894e-04]
root mean squared error: 0.1233510219111883
coefficient of determination: 0.5097335012402788
"""
# build a Residual Plot to validate no bias in model predictions.
# plot.
set_palette('pastel', color_codes=True)
visualizer = ResidualsPlot(model_standardized, train_color='b', test_color='g')
# fit.
visualizer.fit(X_train_scaled, y_train) # fit the training data to the visualizer.
visualizer.score(X_test_scaled, y_test) # evaluate the model on the test data.
visualizer.show()
# feature importance (rank magnitude of weights found by Ridge model).
# list of attributes.
feature_list_new = list(filtered_feature_set.columns.values)
print('attributes: ', feature_list_new)
# analyze coefficients.
print('\n')
weights_new = list(model_standardized.coef_)
print('weights: ', weights_new)
weights_series_new = pd.Series(weights_new)
weights_series_abs_new = weights_series_new.abs()
# order weights.
weights_series_reordered_new = weights_series_abs_new.sort_values(ascending=False)
print('\n')
print('order of weights: ')
print(weights_series_reordered_new)
weights_index_new = weights_series_reordered_new.index.tolist()
print('index of reordered weights: ', weights_index_new)
print('\n')
print('* * * * * * * * * *')
# output in order of weights.
print('attributes, in order of IMPORTANCE: ')
weights_index_names_new = [feature_list_new[i] for i in weights_index_new]
print(weights_index_names_new)
# run a Regression Analysis to determine significance of coefficients.
# alpha value selected by cross-validation.
new_alpha = model_standardized.alpha_
X2 = sm.add_constant(X_train_scaled)
# build, fit OLS.
est = sm.OLS(y_train, X2).fit()
# analysis summary.
print(est.summary())
# pearson correlations.
# calculate.
display(filtered_feature_set)
num_cols = filtered_feature_set.shape[1]
col_names = list(filtered_feature_set.columns)
pop = new_full_df['Pop.']
for i in range(num_cols):
curr_col = col_names[i]
pearson = pop.corr(filtered_feature_set[curr_col])
# output per attribute.
print('corr Pop. & ' + str(curr_col) + ': ' + str(pearson))
PCA_df = data_df.copy()
PCA_df['Length'] = PCA_df['Length'].astype(float)
X = PCA_df.drop(columns=['Title', 'Artist', 'Release', 'Rnd', 'A.Sep', 'Genre'])
y = PCA_df['Genre']
# Check size of features and labels
print(X.shape)
print(y.shape)
# features only
complete_correlation= associations(X, figsize=(10,10))
complete_correlation
# PCA Pipeline
pca_pipeline = Pipeline([('scaling', StandardScaler()), ('pca', sk_PCA())])
pca_pipeline.fit(X) #features only, no labels
# Kaiser Criterion: consider all principal components with eigenvalues greater than 1.0
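# (on standardized data, an eigenvalue above 1.0 means the component explains more variance than a single standardized feature)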
eigVals=pca_pipeline[1].explained_variance_
nComponents = 8
x = np.linspace(1,nComponents, nComponents)
plt.figure(figsize=(10, 8))
plt.bar(x, eigVals, color='gray')
plt.plot([0,nComponents],[1,1],color='red',label='Kaiser Criterion') # red Kaiser criterion line
plt.xlabel('Principal Component', fontsize=25)
plt.ylabel('Eigenvalue', fontsize=25)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend()
plt.show()
# Print Eigenvalues
for index, value in enumerate(eigVals):
print(value)
# Explained Variance Calculation
covarExplained = eigVals/sum(eigVals)*100
print("Variance explained by the 2 PCs above is: %.3f " % (sum(covarExplained[:2])))
# PCA Components
scaler = StandardScaler()
X_comp=scaler.fit_transform(X)
pca=sk_PCA(n_components=2)
X_new=pca.fit_transform(X_comp)
explained_variance=pca.explained_variance_ratio_
sum_variance = sum(explained_variance)
print("Variance explained by the 2 PCs above is: %.4f " % (sum_variance))
components=pca.components_
components=pd.DataFrame(components, columns=X.columns)
display(components)
# PCA Pipeline for Silhouette
music_pca_pipeline = Pipeline([('scaling', StandardScaler()), ('pca', sk_PCA(n_components=2))])
music_processed = music_pca_pipeline.fit_transform(X.values)
numClusters = 9 # how many clusters are we looping over? (from 2 to 10)
Q = np.empty([numClusters,1])*np.NaN # init container to store sums
# Compute kMeans:
plt.figure(figsize=(16, 8))
for ii in range(2, 11): # Loop through each cluster (from 2 to 10!)
kMeans = sk_KMeans(n_clusters = int(ii)).fit(music_processed) # compute kmeans using scikit
cId = kMeans.labels_ # vector of cluster IDs that the row belongs to
cCoords = kMeans.cluster_centers_ # coordinate location for center of each cluster
    s = silhouette_samples(music_processed,cId) # silhouette coefficient of each sample (ranges from -1 to 1)
Q[ii-2] = sum(s) # take the sum
# Plot data:
plt.subplot(3,3,ii-1)
plt.hist(s,bins=20)
plt.xlim(-0.2,1)
plt.ylim(0,250)
plt.xlabel('Silhouette score')
plt.ylabel('Count')
plt.title('Sum: {}'.format(int(Q[ii-2]))) # sum rounded to nearest integer
plt.tight_layout() # adjusts subplot padding
# Plot Silhouette Analysis
plt.plot(np.linspace(2,10,9),Q)
plt.xlabel('Number of clusters', fontsize=15)
plt.ylabel('Sum of silhouette scores', fontsize=15)
plt.show()
# Define PCA
class PCA():
"""A method for doing dimensionality reduction by transforming the feature
space to a lower dimensionality, removing correlation between features and
maximizing the variance along each feature axis.
"""
    def __init__(self):
self.eigenValues=None
self.components=None
def transform(self, X, n_components):
""" Fit the dataset to the number of principal components specified in the
constructor and return the transformed dataset """
covariance_matrix = self.calculate_covariance_matrix(X)
# Where (eigenvector[:,0] corresponds to eigenvalue[0])
eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)
# Sort the eigenvalues and corresponding eigenvectors from largest
# to smallest eigenvalue and select the first n_components
        idx = eigenvalues.argsort()[::-1] # indices that sort the eigenvalues in descending order
eigenvalues = eigenvalues[idx][:n_components]
eigenvectors = np.atleast_1d(eigenvectors[:, idx])[:, :n_components]
# Set the object variables
self.eigenValues=eigenvalues
self.components=eigenvectors
# Project the data onto principal components
X_transformed = X.dot(eigenvectors)
return X_transformed
def calculate_covariance_matrix(self, X, Y=None):
""" Calculate the covariance matrix for the dataset X """
if Y is None:
Y = X
n_samples = np.shape(X)[0]
covariance_matrix = (1 / (n_samples-1)) * (X - X.mean(axis=0)).T.dot(Y - Y.mean(axis=0))
return np.array(covariance_matrix, dtype=float)
# Labels
print("Genres: ", y.unique())
target_names = y.unique()
# Convert genres into unique ids
y_codes = pd.DataFrame(y)
y_codes['id'] = y_codes.groupby(['Genre']).ngroup()
y_codes_arr = np.array(y_codes['id'])
print(y_codes_arr)
# PCA for visualization
pca=PCA()
X_transformed=pca.transform(StandardScaler().fit_transform(X), 2) # 2 principal components
target_ids = range(len(target_names))
print(target_ids)
colors = ['red','green','blue','cyan','magenta','yellow','black','orange','purple','white','silver','maroon']
plt.figure(figsize=(20, 13))
for i, c, label in zip(target_ids, colors, target_names):
plt.scatter(X_transformed[y_codes_arr == i, 0], X_transformed[y_codes_arr == i, 1], c=c, alpha=0.6, label=label)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
plt.title('Principal Component Analysis: Music Genres', fontsize=45)
plt.xlabel('Principal Component 1', fontsize=35)
plt.ylabel('Principal Component 2', fontsize=35)
plt.ylim(-5,4)
plt.xlim(-4,7)
plt.legend()
plt.show()
# K-Means
def KMeans(X, n_clusters, rseed=17669368):
    # Lloyd's algorithm: alternate between assigning points to the nearest center
    # and moving each center to the mean of its assigned points.
    # 1. Randomly choose clusters
    rng = np.random.RandomState(rseed)
    i = rng.permutation(X.shape[0])[:n_clusters]
    centers = X[i]
while True:
# 2a. Assign labels based on closest center
labels = pairwise_distances_argmin(X, centers)
# 2b. Find new centers from means of points
new_centers = np.array([X[labels == i].mean(0)
for i in range(n_clusters)])
# 2c. Check for convergence
if np.all(centers == new_centers):
break
centers = new_centers
return centers, labels
# Plotting K-Means clusters
n_clusters=range(2, 8)
plt.figure(figsize=(18, 12), layout="constrained")
for i, c_num in enumerate(n_clusters):
centers, labels = KMeans(X_transformed, c_num)
plt.subplot(3, 2, i+1)
plt.title('K-Means Clustering with n_clusters={}'.format(c_num))
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.ylim(-4.5,3.5)
plt.scatter(X_transformed[:, 0], X_transformed[:, 1], c=labels, cmap='Pastel2')
plt.plot(centers[:, 0], centers[:, 1], '*',markersize=10,color='red')
# Optimal Cluster # for K-Means
n_clusters=2
plt.figure(figsize=(20, 13), layout="constrained")
centers, labels = KMeans(X_transformed, 2)
# plt.title('K-Means Clustering with n_clusters={}'.format(2), fontsize=45)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
plt.xlabel("Principal Component 1", fontsize=40)
plt.ylabel("Principal Component 2", fontsize=40)
plt.ylim(-5,4)
plt.xlim(-4,7)
plt.scatter(X_transformed[:, 0], X_transformed[:, 1], c=labels, cmap='Pastel2')
plt.plot(centers[:, 0], centers[:, 1], '*',markersize=10,color='red');
# Cluster Report
def pretty_print(df):
return display( HTML( df.to_html().replace("\\n","<br>") ) )
def get_class_rules(tree: DecisionTreeClassifier, feature_names: list):
inner_tree: _tree.Tree = tree.tree_
classes = tree.classes_
class_rules_dict = dict()
def tree_dfs(node_id=0, current_rule=[]):
# feature[i] holds the feature to split on, for the internal node i.
split_feature = inner_tree.feature[node_id]
if split_feature != _tree.TREE_UNDEFINED: # internal node
name = feature_names[split_feature]
threshold = inner_tree.threshold[node_id]
# left child
left_rule = current_rule + ["({} <= {})".format(name, threshold)]
tree_dfs(inner_tree.children_left[node_id], left_rule)
# right child
right_rule = current_rule + ["({} > {})".format(name, threshold)]
tree_dfs(inner_tree.children_right[node_id], right_rule)
else: # leaf
dist = inner_tree.value[node_id][0]
dist = dist/dist.sum()
max_idx = dist.argmax()
if len(current_rule) == 0:
rule_string = "ALL"
else:
rule_string = " and ".join(current_rule)
# register new rule to dictionary
selected_class = classes[max_idx]
class_probability = dist[max_idx]
class_rules = class_rules_dict.get(selected_class, [])
class_rules.append((rule_string, class_probability))
class_rules_dict[selected_class] = class_rules
tree_dfs() # start from root, node_id = 0
return class_rules_dict
def cluster_report(data: pd.DataFrame, clusters, min_samples_leaf=50, pruning_level=0.01):
# Create Model
tree = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, ccp_alpha=pruning_level)
tree.fit(data, clusters)
# Generate Report
feature_names = data.columns
class_rule_dict = get_class_rules(tree, feature_names)
report_class_list = []
for class_name in class_rule_dict.keys():
rule_list = class_rule_dict[class_name]
combined_string = ""
for rule in rule_list:
combined_string += "[{}] {}\n\n".format(rule[1], rule[0])
report_class_list.append((class_name, combined_string))
cluster_instance_df = pd.Series(clusters).value_counts().reset_index()
cluster_instance_df.columns = ['class_name', 'instance_count']
report_df = pd.DataFrame(report_class_list, columns=['class_name', 'rule_list'])
report_df = pd.merge(cluster_instance_df, report_df, on='class_name', how='left')
pretty_print(report_df.sort_values(by='class_name')[['class_name', 'instance_count', 'rule_list']])
pc = pca_pipeline.fit_transform(X)
kmeans_model = sk_KMeans(n_clusters=2)
y_cluster = kmeans_model.fit_predict(pc)
cluster_report(X, y_cluster, min_samples_leaf=20, pruning_level=0.05)
#Define Functions Used in Neural Network
def sigmoid(z):
return 1.0 / (1.0 + np.exp(-z))
def sigmoid_prime(z):
return sigmoid(z) * (1 - sigmoid(z))
def cost(output_activations, y):
return 1/2*(np.sum((output_activations-y)**2))
def cost_derivative(output_activations, y):
return (output_activations - y)
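#Note: sigmoid'(z) = sigmoid(z)*(1 - sigmoid(z)), and cost_derivative is the gradient of the
#quadratic cost C = (1/2)*sum((a - y)^2) with respect to the output activations a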
#Create a Neural Network Class
@dataclass
class Network:
num_layers: int
biases: list
weights: list
#Initialize Biases and Weights Randomly from Standard Normal Distribution
def init_network(layers):
np.random.seed(17669368)
return Network(
len(layers),
[np.random.randn(y, 1) for y in layers[1:]],
[np.random.randn(y, x) for x, y in zip(layers[:-1], layers[1:])])
#Feedforward Function
def feedforward(nn, a):
for b, w in zip(nn.biases, nn.weights):
a = sigmoid(np.dot(w, a) + b)
return a
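#(feedforward applies a' = sigmoid(W a + b) layer by layer and returns the output-layer activations)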
#Evaluate Performance on Validation and Test Data
def evaluate(nn, test_data):
test_results = [(np.argmax(feedforward(nn, x)), y) for (x, y) in test_data]
return sum(int(x == y) for (x, y) in test_results)
#Learning Process for the Neural Network
def learn(nn, training_data, epochs, mini_batch_size, learning_rate, test_data = None):
n = len(training_data)
    for j in range(epochs):
        random.seed(17669368) # note: re-seeding each epoch means every epoch sees the same shuffle order
        random.shuffle(training_data)
        mini_batches = [training_data[k: k + mini_batch_size] for k in range(0, n, mini_batch_size)]
        epoch_cost = 0.0 # running sum of batch losses for this epoch
        for mini_batch in mini_batches:
            epoch_cost += batch_stochastic_gradient_descent(nn, mini_batch, learning_rate)
        if test_data:
            print('Epoch {0}: accuracy {1}% , Cost: {2}'.format(f'{j + 1:2}', 100.0 * evaluate(nn, test_data) / len(test_data), np.round(epoch_cost/n, 3)))
        else:
            print('Epoch {0} complete, Cost: {1}'.format(f'{j + 1:2}', np.round(epoch_cost/n, 3)))
#Stochastic Gradient Descent for the Batch
def batch_stochastic_gradient_descent(nn, mini_batch, eta):
nabla_b = [np.zeros(b.shape) for b in nn.biases]
nabla_w = [np.zeros(w.shape) for w in nn.weights]
loss, nabla_b, nabla_w = batch_backprop(nn, mini_batch)
nn.weights = [w - (eta / len(mini_batch)) * nw for w, nw in zip(nn.weights, nabla_w)]
nn.biases = [b - (eta / len(mini_batch)) * nb for b, nb in zip(nn.biases, nabla_b)]
return loss
#Performs Backpropagation to Update Weights and Biases Based on Gradient Descent
def batch_backprop(nn, mini_batch):
nabla_b = [np.zeros(b.shape) for b in nn.biases]
nabla_w = [np.zeros(w.shape) for w in nn.weights]
#Mini-batch Components
ax, ay = tuple(t for t in np.asarray(mini_batch).transpose())
x = np.stack(ax)
y = np.stack(ay)
#Feedforward
activation = x
activations = [x]
zs = []
for b, w in zip(nn.biases, nn.weights):
z = np.matmul(w, activation) + b
zs.append(z)
activation = sigmoid(z)
activations.append(activation)
loss = cost(activations[-1].squeeze(), y.squeeze())
#Backward pass
#Start from Output Layer
delta = cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
nabla_b[-1] = delta.sum(axis = 0)
nabla_w[-1] = np.matmul(delta, activations[-2].transpose(0, 2, 1)).sum(axis = 0)
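    #(standard output-layer backprop equations: delta_L = dC/da * sigmoid'(z_L),
    # dC/db = delta_L, dC/dW = delta_L . a_(L-1)^T, summed over the mini-batch)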
#Back to Input Layer
for i in reversed(range(2, nn.num_layers)):
z = zs[-i]
sp = sigmoid_prime(z)
delta = np.matmul(nn.weights[-i + 1].transpose(), delta) * sp
nabla_b[-i] = delta.sum(axis = 0)
nabla_w[-i] = np.matmul(delta, activations[-i - 1].transpose(0, 2, 1)).sum(axis = 0)
return (loss, nabla_b, nabla_w)
#Make a Copy of Original Dataset for Subsection
neural_df = data_df[['BPM','Energy','Dance','Loud','Valence','Length','Acoustic','Pop.','Genre']].copy()
#Turn the Genre Labels Into Digits 0-11
label_data = LabelEncoder()
neural_df['Genre'] = label_data.fit_transform(neural_df['Genre'])
display(neural_df)
#Load and Transform Data
neural_data = neural_df.copy()
Y = neural_data['Genre'].tolist()
neural_data = neural_data.drop(['Genre'], axis=1)
X = neural_data.to_numpy()
y = np.array(Y)
#80/10/10 Train/Validate/Test Split
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.8, random_state=17669368)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5, random_state=17669368)
#Format Data
train = (X_train.astype(float),y_train)
validate = (X_valid.astype(float),y_valid)
test = (X_test.astype(float),y_test)
#Define Data Wrapper, One Hot Encoder, and Print Shape for Output
def load_data_wrapper():
tr_d = train
va_d = validate
te_d = test
training_inputs = [np.reshape(x, (8, 1)) for x in tr_d[0]]
training_results = [one_hot_encode(y) for y in tr_d[1]]
training_data = zip(training_inputs, training_results)
validation_inputs = [np.reshape(x, (8, 1)) for x in va_d[0]]
validation_data = zip(validation_inputs, va_d[1])
test_inputs = [np.reshape(x, (8, 1)) for x in te_d[0]]
test_data = zip(test_inputs, te_d[1])
return (list(training_data), list(validation_data), list(test_data))
def one_hot_encode(j):
e = np.zeros((12, 1))
e[j] = 1.0
return e
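# e.g. one_hot_encode(3) returns a (12, 1) column vector with 1.0 in row 3 and zeros elsewhere.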
def print_shape(name, data):
print('Shape of {0}: {1}'.format(name, data.shape))
#Train and Test Neural Network
#8 Inputs, 10 Neurons in Hidden Layer, 12 Labels
training_data, validation_data, test_data = load_data_wrapper()
nn = init_network([8, 10, 12])
for l in range(0, nn.num_layers - 1):
print('\nNetwork layer {0}'.format(l + 2))
print_shape('weights', nn.weights[l])
print_shape('biases', nn.biases[l])
#Hyperparameters
epochs = 30
mini_batch_size = 10
learning_rate = 3.0
#Learning Process
print('\nLearning process started...\n')
time_start = time()
learn(nn, training_data, epochs, mini_batch_size, learning_rate, validation_data)
time_end = time()
time_elapsed = time_end - time_start
print('\nLearning process complete in {0} seconds ({1} seconds per epoch)!\n'.format(f'{time_elapsed:.0f}', f'{time_elapsed / epochs:.1f}'))
#Use Held Out Test Data to Determine Accuracy
print('Test (held-out, previously unseen data): accuracy {0}%'.format(100.0 * evaluate(nn, test_data) / len(test_data)))
#Load and Transform Data
neural_data = neural_df.copy()
Y = neural_data['Genre'].tolist()
neural_data = neural_data.drop(['Genre'], axis=1)
X = neural_data.to_numpy()
y = np.array(Y)
#80/10/10 Train/Validate/Test Split
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.8, random_state=17669368)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5, random_state=17669368)
#Standardize Data (Fitting the scaler to the training data and then transforming the training and test data with it)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_valid = scaler.transform(X_valid)
#Format Data as Tuples
train = (X_train.astype(float),y_train)
validate = (X_valid.astype(float),y_valid)
test = (X_test.astype(float),y_test)
#Redefine Data Wrapper and One Hot Encoder
def load_data_wrapper():
tr_d = train
va_d = validate
te_d = test
training_inputs = [np.reshape(x, (8, 1)) for x in tr_d[0]]
training_results = [one_hot_encode(y) for y in tr_d[1]]
training_data = zip(training_inputs, training_results)
validation_inputs = [np.reshape(x, (8, 1)) for x in va_d[0]]
validation_data = zip(validation_inputs, va_d[1])
test_inputs = [np.reshape(x, (8, 1)) for x in te_d[0]]
test_data = zip(test_inputs, te_d[1])
return (list(training_data), list(validation_data), list(test_data))
def one_hot_encode(j):
e = np.zeros((12, 1))
e[j] = 1.0
return e
#Train and Test Neural Network
#8 Inputs, 10 in Hidden Layer, 12 Labels
training_data, validation_data, test_data = load_data_wrapper()
nn = init_network([8, 10, 12])
for l in range(0, nn.num_layers - 1):
print('\nNetwork layer {0}'.format(l + 2))
print_shape('weights', nn.weights[l])
print_shape('biases', nn.biases[l])
#Hyperparameters
epochs = 30
mini_batch_size = 10
learning_rate = 3.0
#Learning Process
print('\nLearning process started...\n')
time_start = time()
learn(nn, training_data, epochs, mini_batch_size, learning_rate, validation_data)
time_end = time()
time_elapsed = time_end - time_start
print('\nLearning process complete in {0} seconds ({1} seconds per epoch)!\n'.format(f'{time_elapsed:.0f}', f'{time_elapsed / epochs:.1f}'))
#Use Held Out Test Data to Determine Accuracy
print('Test (held-out, previously unseen data): accuracy {0}%'.format(100.0 * evaluate(nn, test_data) / len(test_data)))
#Determine the Number of Samples for Each Genre
neural_df2 = neural_df.copy()
genre_0 = neural_df2[neural_df2['Genre']==0]
genre_1 = neural_df2[neural_df2['Genre']==1]
genre_2 = neural_df2[neural_df2['Genre']==2]
genre_3 = neural_df2[neural_df2['Genre']==3]
genre_4 = neural_df2[neural_df2['Genre']==4]
genre_5 = neural_df2[neural_df2['Genre']==5]
genre_6 = neural_df2[neural_df2['Genre']==6]
genre_7 = neural_df2[neural_df2['Genre']==7]
genre_8 = neural_df2[neural_df2['Genre']==8]
genre_9 = neural_df2[neural_df2['Genre']==9]
genre_10 = neural_df2[neural_df2['Genre']==10]
genre_11 = neural_df2[neural_df2['Genre']==11]
#Initialize List of the Number of Samples for Each Genre (the row counts of genre_0 through genre_11 above)
samples = [629,1101,1338,1109,1615,471,1161,1447,1966,2103,1163,789]
#Initialize List of the Genres
genres = [0,1,2,3,4,5,6,7,8,9,10,11]
#Initialize List of Lists of the Genres Not Being Singled Out
rest = [[g for g in genres if g != k] for k in genres]
#Change Data Wrapper and Change One Hot Encoder to 2 Outputs
def load_data_wrapper(train,test,validate):
tr_d = train
va_d = validate
te_d = test
training_inputs = [np.reshape(x, (8, 1)) for x in tr_d[0]]
training_results = [one_hot_encode(y) for y in tr_d[1]]
training_data = zip(training_inputs, training_results)
validation_inputs = [np.reshape(x, (8, 1)) for x in va_d[0]]
validation_data = zip(validation_inputs, va_d[1])
test_inputs = [np.reshape(x, (8, 1)) for x in te_d[0]]
test_data = zip(test_inputs, te_d[1])
return (list(training_data), list(validation_data), list(test_data))
def one_hot_encode(j):
e = np.zeros((2, 1))
e[j] = 1.0
return e
#Data Preprocessing Function
def binary_classifier(genrenumber,restofgenres,numsamples):
'''
    This function takes the coded genre number, a list of the rest of the genre numbers, and the number
    of samples in the dataset for that particular genre. It transforms the data for a genre into
    a one-vs-all binary format for use in the neural network.
Params:
genrenumber: (int)
restofgenres: (list)
numsamples: (int)
Return:
train: (tuple)
test: (tuple)
validate: (tuple)
'''
#Replace Labels with 0s and 1s
neural_df2 = neural_df.copy()
neural_df2 = neural_df2.replace({'Genre': genrenumber}, 20)
neural_df2 = neural_df2.replace({'Genre': restofgenres}, 0)
neural_df2 = neural_df2.replace({'Genre': 20}, 1)
dataframe1 = neural_df2[neural_df2['Genre']==1]
dataframe1 = pd.DataFrame(dataframe1)
dataframe2 = neural_df2[neural_df2['Genre']==0]
dataframe2 = pd.DataFrame(dataframe2)
dataframe2 = dataframe2.sample(n=numsamples, replace=False, random_state=17669368)
df_stacked = pd.concat([dataframe1, dataframe2])
#Load and Transform Data
Y = df_stacked['Genre'].tolist()
df_stacked = df_stacked.drop(['Genre'], axis=1)
X = df_stacked.to_numpy()
y = np.array(Y)
#80/10/10 Train/Validate/Test Split
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.8, random_state=17669368)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5, random_state=17669368)
#Standardize Data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_valid = scaler.transform(X_valid)
#Format Data
train = (X_train.astype(float),y_train)
validate = (X_valid.astype(float),y_valid)
test = (X_test.astype(float),y_test)
return train, test, validate
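# e.g. binary_classifier(0, rest[0], samples[0]) yields a balanced genre-0-vs-rest split, as used in the loop below.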
#Loop Through Each Genre and Run One-vs-All Through the Neural Network
#Calculates Accuracy, ROC Curve, AUC
colors = ['red','green','blue','cyan','magenta','yellow','black','orange','purple','white','silver','maroon']
labels = ['Anime', 'Broadway', 'Classical', 'Country', 'Dance Electronic', 'Disney', 'Happy Holidays', 'Hip Hop', 'Jazz', 'Latin', 'Pop','Rock']
TP = []
FN = []
FP = []
TN = []
for a, b, c, d, e in zip(genres, rest, samples, colors, labels):
#Load Data
train, test, validate = binary_classifier(a,b,c)
#Train and Test Neural Network
#8 Input Features, 6 Neurons in Hidden Layer, 2 Output Labels
training_data, validation_data, test_data = load_data_wrapper(train,test,validate)
nn = init_network([8, 6, 2])
print("\nGenre: "+str(a))
for l in range(0, nn.num_layers - 1):
print('\nNetwork layer {0}'.format(l + 2))
print_shape('weights', nn.weights[l])
print_shape('biases', nn.biases[l])
    #Hyperparameters
epochs = 30
mini_batch_size = 10
learning_rate = 3.0
#Learning Process
print('\nLearning process started...\n')
time_start = time()
learn(nn, training_data, epochs, mini_batch_size, learning_rate, validation_data)
time_end = time()
time_elapsed = time_end - time_start
print('\nLearning process complete in {0} seconds ({1} seconds per epoch)!\n'.format(f'{time_elapsed:.0f}', f'{time_elapsed / epochs:.1f}'))
#Use Held Out Test Data to Determine Accuracy
    print('Test (held-out, previously unseen data): accuracy {0}%'.format(100.0 * evaluate(nn, test_data) / len(test_data)))
#Get Predicted and True Values
test_results = [(np.argmax(feedforward(nn, x)), y) for (x, y) in test_data]
y_pred = []
y_true = []
for i in range(len(test_results)):
y_pred.append(test_results[i][0])
y_true.append(test_results[i][1])
#ROC Curves
fpr, tpr, thresholds = roc_curve(y_true, y_pred)
fig = plt.figure(figsize = (7,6))
plt.plot(fpr, tpr, color=d)
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.title('ROC Curve for '+str(e), fontsize=20)
plt.show()
#Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
TP.append(conf_matrix[1][1])
FN.append(conf_matrix[1][0])
FP.append(conf_matrix[0][1])
TN.append(conf_matrix[0][0])
disp=ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=[0,1])
disp.plot()
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for '+str(e))
plt.grid()
plt.show()
print("Accuracy of the model is: %.3f" % accuracy_score(y_pred, y_true))
auc = roc_auc_score(y_true, y_pred)
print("AUC: "+str(auc))
#Micro Averaged ROC
total_instances_1 = sum(TP) + sum(FN)
total_instances_2 = sum(FP) + sum(TN)
#Micro-averaged TPR and FPR
micro_TPR = sum(TP) / total_instances_1
micro_FPR = sum(FP) / total_instances_2
micro_TPR = [0, micro_TPR, 1]
micro_FPR = [0, micro_FPR, 1]
thresholds = [2, 1, 0] # nominal thresholds for the three points (not used in the plot)
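#(with hard 0/1 predictions each classifier contributes a single operating point, so the micro-averaged ROC below is a three-point polyline anchored at (0,0) and (1,1))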
#Plot the ROC Curve
fig = plt.figure(figsize = (7,6))
plt.plot(micro_FPR, micro_TPR, color='tomato')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.title('Micro-Averaged ROC Curve', fontsize=20)
plt.show()
#Load and Transform Data
neural_data = neural_df.copy()
Y = neural_data['Genre'].tolist()
neural_data = neural_data.drop(['Genre'], axis=1)
X = neural_data.to_numpy()
y = np.array(Y)
#80/20 Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=17669368)
#Random Forest Classifier Optimized Using Grid Search
#Takes an Hour to Run
#Parameters
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 3)]
max_features = [round(x,2) for x in np.linspace(start = 0.3, stop = 1.0, num = 3)]
min_samples_leaf = [int(x) for x in np.linspace(start = 300, stop = 600, num = 3)]
bootstrap = [True, False]
#Create Parameter Grid
param_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
#Set the Estimator and Grid Search Object and Fit to Training Data
random_forest=RandomForestClassifier()
rf_grid = GridSearchCV(random_forest, param_grid, cv=5, scoring="accuracy", refit=True)
rf_grid.fit(X_train, y_train)
#Get Best Parameters and Best Score
print("Best parameters:", rf_grid.best_params_)
print("Best score: {:.2f}".format(rf_grid.best_score_))
#Get Accuracy on Unseen Data
accuracy = rf_grid.score(X_test, y_test)
print(accuracy)
#Get Accuracy on Unseen Data
y_pred = rf_grid.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.4f}".format(accuracy))
#Load and Transform Data
neural_data = neural_df.copy()
Y = neural_data['Genre'].tolist()
neural_data = neural_data.drop(['Genre'], axis=1)
X = neural_data.to_numpy()
y = np.array(Y)
#80/20 Train/Test Split and Standardize Data
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=17669368)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
#Gradient Boosting Classifier Optimized Using Grid Search
#**Takes At Least 45Min-1Hr to Run
#Parameters
n_estimators = [100, 200, 300]
learning_rate = [0.1, 0.2, 0.3]
max_depth = [1, 2, 3]
#Create Parameter Grid
param_grid = {
'n_estimators': n_estimators,
'learning_rate': learning_rate,
'max_depth': max_depth}
#Set the Estimator and Grid Search Object and Fit to Training Data
gradient_boosting = GradientBoostingClassifier()
gb_grid = GridSearchCV(gradient_boosting, param_grid, cv=5, scoring='accuracy', refit=True)
gb_grid.fit(X_train, y_train)
#Get Best Parameters and Best Score
print("Best parameters:", gb_grid.best_params_)
print("Best score: {:.2f}".format(gb_grid.best_score_))
#Get Accuracy on Unseen Data
y_pred = gb_grid.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.4f}".format(accuracy))
#Load and Transform Data
neural_data = neural_df.copy()
Y = neural_data['Genre'].tolist()
neural_data = neural_data.drop(['Genre'], axis=1)
X = neural_data.to_numpy()
y = np.array(Y)
#80/20 Train/Test Split and Standardize Data
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=17669368)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
#Support Vector Machine Classifier Optimized Using GridSearch
#**Takes a Few Hours to Run
#Parameters
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
C = [0.1, 1, 10, 100]
gamma = ['scale', 'auto']
degree = [2, 3, 4]
#Create Parameter Grid
param_grid = {
'kernel': kernel,
'C': C,
'gamma': gamma,
'degree': degree}
#Set the Estimator and Grid Search Object and Fit to Training Data
support_vector = SVC()
SVC_grid = GridSearchCV(support_vector, param_grid, cv=5, scoring='accuracy', refit=True)
SVC_grid.fit(X_train, y_train)
#Get Best Parameters and Best Score
print("Best parameters:", SVC_grid.best_params_)
print("Best score: {:.2f}".format(SVC_grid.best_score_))
#Get Accuracy on Unseen Data
y_pred = SVC_grid.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.4f}".format(accuracy))
import random
from random import sample
import scipy
from scipy import stats
# explore popularity differences between single-genre and multi-genre artists.
# df.
extra_credit = data_df[['Artist','BPM','Energy','Dance','Loud','Valence','Length','Acoustic','Pop.','Genre']]
# group by Artist Name and Count Number of Unique Genre Labels.
genre_counts = extra_credit.groupby("Artist")["Genre"].nunique().reset_index(name="Genre Count")
# extract Artist Names.
five = genre_counts[genre_counts['Genre Count']==5]
four = genre_counts[genre_counts['Genre Count']==4]
three = genre_counts[genre_counts['Genre Count']==3]
two = genre_counts[genre_counts['Genre Count']==2]
one = genre_counts[genre_counts['Genre Count']==1]
display(five)
display(four)
display(three)
display(two)
display(one)
# create multi-genre artist df.
multi_artists = pd.concat([four['Artist'], three['Artist'], two['Artist']], ignore_index=True)
multi_artists = multi_artists.drop_duplicates()
print('\n')
display(multi_artists)
multi_artists_list = list(multi_artists)
print(multi_artists_list[:10])
print('\n')
print(len(four['Artist'].tolist()))
print(len(three['Artist'].tolist()))
print(len(two['Artist'].tolist()))
# total artists.
total_artists = extra_credit['Artist'].drop_duplicates().tolist()
print('# of total artists: ', len(total_artists))
# popularity scores from songs of these multi-genre artists.
df_multi_artists = extra_credit[extra_credit['Artist'].isin(multi_artists_list)]
display(df_multi_artists)
num_samples = df_multi_artists.shape[0]
print('number of samples: ')
print(num_samples)
multi_genre_popularity = df_multi_artists['Pop.'].tolist()
print('multi_genre popularity MEAN: ', stat.mean(multi_genre_popularity))
# randomly select 2497 samples from single-genre artist datapoints.
num_dp = multi_artists.shape[0]
one_genre_artists = one['Artist'].tolist()
print(len(one_genre_artists))
df_singleArtists = extra_credit[extra_credit['Artist'].isin(one_genre_artists)]
display(df_singleArtists)
# randomly select samples from this df.
single_genre_popularity_all = df_singleArtists['Pop.'].tolist()
single_genre_popularity = random.sample(single_genre_popularity_all, num_samples)
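# note: random.sample is unseeded here, so the exact draw (and the t-test result below) can vary between runs.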
print('number of samples: ')
print(len(single_genre_popularity))
print('single_genre popularity MEAN: ', stat.mean(single_genre_popularity))
# let's run a t-test to see if these differences are significant.
# running a WELCH'S T-TEST.
t_stat, t_pvalue = scipy.stats.ttest_ind(multi_genre_popularity, single_genre_popularity, equal_var=False)
print('t statistic: ', t_stat)
print('t-test p-value: ', t_pvalue)
'''
t statistic: 22.743030380040615
t-test p-value: 5.030839473407017e-109
'''
# interpretation: we ran an independent samples t-test under the assumption that
# population variances were not equal between samples (Welch's Test).
# We obtained an extremely significant p-value, and can reject the null hypothesis that
# the means of the two populations are equivalent.