#Pre-Installed Imports
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import pandas as pd
import statistics as stat
import sys
import random
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn import linear_model
#Allison Imports
from statsmodels.stats.power import TTestIndPower
#Isha Imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.dates as mdates
import seaborn
import datetime as dt
import statsmodels.api as sm
# pip install yellowbrick
from yellowbrick.datasets import load_concrete
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.style import set_palette
from sklearn import preprocessing
import plotly
import plotly.express as px
#Mary Imports
import warnings
warnings.filterwarnings('ignore')
from sklearn.decomposition import PCA as sk_PCA
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans as sk_KMeans
from sklearn.tree import _tree, DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances_argmin
from sklearn.metrics import silhouette_samples, silhouette_score
from dython.nominal import associations # must pip install dython to run
from IPython.display import display, HTML
import seaborn as sns
sns.set()
#Annabelle Imports
from dataclasses import dataclass
from time import time
%matplotlib inline
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve, auc, roc_curve
import requests
warnings.filterwarnings('ignore')
from numpy import random
random.seed(17669368)
anime = pd.read_csv('NYU_IntroCapstone_ANIME - Sheet1.csv')
broadway = pd.read_csv('NYU_IntroCapstone_BROADWAY - Sheet1.csv')
classical = pd.read_csv('NYU_IntroCapstone_CLASSICAL - Sheet1.csv')
country = pd.read_csv('NYU_IntroCapstone_COUNTRY - Sheet1.csv')
dance_electronic = pd.read_csv('NYU_IntroCapstone_DANCE_ELECTRONIC - Sheet1.csv')
disney = pd.read_csv('NYU_IntroCapstone_DISNEY - Sheet1.csv')
happy_holidays = pd.read_csv('NYU_IntroCapstone_HAPPY_HOLIDAYS - Sheet1.csv')
hip_hop = pd.read_csv('NYU_IntroCapstone_HIP_HOP - Sheet1.csv')
jazz = pd.read_csv('NYU_IntroCapstone_JAZZ - Sheet1.csv')
latin = pd.read_csv('NYU_IntroCapstone_LATIN - Sheet1.csv')
pop = pd.read_csv('NYU_IntroCapstone_POP - Sheet1.csv')
rock = pd.read_csv('NYU_IntroCapstone_ROCK - Sheet1.csv')
data_df = pd.concat([anime, broadway, classical, country, dance_electronic, disney, happy_holidays, hip_hop, jazz, latin, pop, rock], axis=0)
data_df = data_df.drop_duplicates(subset=['Title', 'Artist'])
data_df = data_df.drop('#', axis = 1)
data_df = data_df.dropna(axis=0, how='any').reset_index(drop=True) #remove rows that have any NaN
#Convert Length Column from Minutes and Seconds to Seconds
#Function to Get Seconds
def get_sec(time_str):
'''
This function takes the time in a min:sec format and converts it to seconds
'''
time_str = str(time_str)
m, s = time_str.split(':')
return int(m) * 60 + int(s)
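#Example (illustrative): get_sec('3:45') returns 3*60 + 45 = 225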
#Apply the get_sec Function to the Data (elif statements for improperly formatted data)
for i in range(data_df.shape[0]): # 14,892 rows after cleaning
if i == 2979:
data_df.iloc[i,8] = 1578
elif i == 4928:
data_df.iloc[i,8] = 1623
elif i == 9846:
data_df.iloc[i,8] = 1707
elif i == 10249:
data_df.iloc[i,8] = 2573
elif i == 10386:
data_df.iloc[i,8] = 1572
elif i == 10404:
data_df.iloc[i,8] = 1562
elif i == 10714:
data_df.iloc[i,8] = 1904
else:
data_df.iloc[i,8] = get_sec(data_df.iloc[i,8])
display(data_df)
#Copy over data frame
inference_df = data_df.copy()
inference_df['Length'] = inference_df['Length'].astype(float)
# Create genre & attributes list:
genres = inference_df.Genre.unique()
attributes = inference_df.columns.values.tolist()[3:11]
#Various Histograms to begin visualizing the difference between genres for certain attributes
color = ['red','green','blue','cyan','magenta','yellow','black','orange','purple','white','silver','maroon']
#sample size
display("Sample Sizes:")
display(inference_df.groupby(inference_df['Genre']).count().reset_index().iloc[:,0:2])
display("Plots:")
#each genre's attribute histogram
for attribute in attributes:
for i, genre in enumerate(genres):
filtered_df = inference_df[(inference_df.Genre == genre)]
plt.hist(filtered_df[attribute], rwidth = 0.2, color = color[i], edgecolor='gray', linewidth=1.2, label = genre, density = True)
plt.ylabel("Probability Density")
plt.xlabel(attribute)
plt.legend(loc='upper center', bbox_to_anchor=(1.25, 1),
fancybox=True, shadow=True, ncol=1)
plt.title(attribute)
plt.show()
#bar chart of average attribute per genre
for attribute in attributes:
fig, ax = plt.subplots(figsize=(10,4))
    X = inference_df.groupby(inference_df['Genre']).mean(numeric_only=True).reset_index()
    plt.bar(range(len(X)), X[attribute], color=color, edgecolor='gray', linewidth=1.2)
    plt.xticks(range(len(X)), X['Genre'], fontsize=8, rotation=90)
plt.ylabel(attribute)
plt.xlabel('Genre')
plt.show()
#Check variance using Levene test - Results show variance is not the same across genres
for attribute in attributes:
    variance_test = stats.levene(*(inference_df.loc[inference_df.Genre == genre, attribute] for genre in genres))
    display(attribute, variance_test) #use this line to see that multiple p-values are lower than 0.005, therefore we cannot assume equal variance
#Welch's t-test of all the combos of genres and features
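#(Welch's statistic is t = (mean1 - mean2) / sqrt(s1^2/n1 + s2^2/n2); it does not assume equal variances, consistent with the Levene results above)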
used_genre_attribute_combos = np.empty(0)
genre1_list = np.empty(0)
genre2_list = np.empty(0)
attribute_list = np.empty(0)
result_list = np.empty(0)
stat_list = np.empty(0)
effect_size_list = np.empty(0)
power_list = np.empty(0)
alpha = 0.005 #significance level
for attribute in attributes:
for genre1 in genres:
for genre2 in genres:
if (genre1 == genre2): #ignores comparing to itself
continue
elif ((genre1 + genre2 + attribute) in used_genre_attribute_combos): #ignores duplicate pairs
continue
else:
filtered_df1 = inference_df[(inference_df.Genre == genre1)]
filtered_df2 = inference_df[(inference_df.Genre == genre2)]
genre1_attribute_list = np.array(filtered_df1.loc[:,attribute])
genre2_attribute_list = np.array(filtered_df2.loc[:,attribute])
combined_genres_attribute_list = pd.concat([filtered_df1.loc[:,attribute], filtered_df2.loc[:,attribute]], axis=0)
#welch's t-test
result = stats.ttest_ind(genre1_attribute_list, genre2_attribute_list, equal_var=False)
#effect size
effect_size = np.absolute((np.mean(genre1_attribute_list) - np.mean(genre2_attribute_list))/np.std(combined_genres_attribute_list))
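                #(this is Cohen's d with the combined-sample SD as the denominator, an approximation to the pooled SD)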
#power
power_func = TTestIndPower()
power = power_func.solve_power(effect_size=effect_size, nobs1=filtered_df1.shape[0], alpha=alpha, power=None, ratio=(filtered_df2.shape[0]/filtered_df1.shape[0]), alternative='two-sided')
#lists of results
genre1_list = np.append(genre1_list, genre1)
genre2_list = np.append(genre2_list, genre2)
attribute_list = np.append(attribute_list, attribute)
result_list = np.append(result_list, result.pvalue)
stat_list = np.append(stat_list, result.statistic)
effect_size_list = np.append(effect_size_list, effect_size)
power_list = np.append(power_list,power)
used_genre_attribute_combos = np.append(used_genre_attribute_combos,(genre2 + genre1 + attribute))
cols = {'Attribute': attribute_list, 'Genre 1': genre1_list, 'Genre 2': genre2_list, 'P-Value': result_list, 'Test Statistic': stat_list, 'Effect Size': effect_size_list, 'Power': power_list}
#create df of results:
welch_t_test_result_df = pd.DataFrame(data=cols).sort_values('P-Value').reset_index()
#Show df of results:
with pd.option_context('display.max_rows', None,
'display.max_columns', None,
'display.precision', 3,
):
display("Results of Welch's T-Test:")
display(welch_t_test_result_df.sort_values('Genre 1').sort_values('Attribute'))
display(welch_t_test_result_df.shape)
#Find Number of Significant Results, Significant/High Powered Results, & Significant/High Powered Results/High Effect Size Results
display("Out of 66 possible combinations:")
#Significant
significant_df = welch_t_test_result_df[welch_t_test_result_df['P-Value'] < alpha].reset_index()
significant_df = significant_df.rename(columns={"P-Value": "Results"})
results_by_genre = pd.pivot_table(significant_df, values='Results', columns=['Attribute'], aggfunc='count', fill_value=0)
results_by_genre = results_by_genre.loc[:,attributes]
display("With Significant Values (p-value < 0.005)")
display(results_by_genre)
#Significant and High Power
significant_df = welch_t_test_result_df[(welch_t_test_result_df['P-Value'] < alpha)
& (welch_t_test_result_df['Power'] > 0.8)]
significant_df = significant_df.rename(columns={"P-Value": "Results"})
results_by_genre = pd.pivot_table(significant_df, values='Results', columns=['Attribute'], aggfunc='count', fill_value=0)
results_by_genre = results_by_genre.loc[:,attributes]
display("With Significant Values (p-value < 0.005) and High Power (1-Beta > 0.8)")
display(results_by_genre)
#Significant, High Power, and High Effect Size
significant_df = welch_t_test_result_df[(welch_t_test_result_df['P-Value'] < alpha)
& (welch_t_test_result_df['Power'] > 0.8)
& (welch_t_test_result_df['Effect Size'] > 0.8)]
significant_df = significant_df.rename(columns={"P-Value": "Results"})
results_by_genre = pd.pivot_table(significant_df, values='Results', columns=['Attribute'], aggfunc='count', fill_value=0)
results_by_genre = results_by_genre.loc[:,attributes]
display("With Significant Values (p-value < 0.005), High Power (1-Beta > 0.8), and High Effect Size (Cohen's d > 0.8)")
display(results_by_genre)
#Plots for Highest Differences in Length and Energy
#Length
length_significant_df = welch_t_test_result_df[(welch_t_test_result_df['P-Value'] < alpha)
& (welch_t_test_result_df['Power'] > 0.8)
& (welch_t_test_result_df['Effect Size'] > 0.8)
& (welch_t_test_result_df['Attribute'] == 'Length')]
plt.hist(inference_df['Length'][inference_df['Genre']=='Anime'], density = True, color ='red', bins = 10, rwidth = 0.7, label = "Anime")
plt.hist(inference_df['Length'][inference_df['Genre']=='Pop'], density = True, color = 'silver', bins = 10, rwidth = 0.7, label = "Pop")
plt.ylabel("Probability Density")
plt.xlabel('Length (seconds)')
plt.legend()
plt.show()
display(length_significant_df)
#Energy
energy_significant_df = welch_t_test_result_df[(welch_t_test_result_df['P-Value'] < alpha)
& (welch_t_test_result_df['Power'] > 0.8)
& (welch_t_test_result_df['Effect Size'] > 0.8)
& (welch_t_test_result_df['Attribute'] == 'Energy')]
plt.hist(inference_df['Energy'][inference_df['Genre']=='Anime'], density = True, bins = 10, color ='red', rwidth = 0.7, label = "Anime")
plt.hist(inference_df['Energy'][inference_df['Genre']=='Classical'], density = True, color = 'blue', bins = 10, rwidth = 0.7, label = "Classical")
plt.ylabel("Probability Density")
plt.xlabel('Energy')
plt.legend()
plt.show()
display(energy_significant_df)
#Find which genres differ most from each other:
#(In the report, the bottom-left corner is the top-right portion transposed, for better readability)
genre_vs_genre_pivot = pd.pivot_table(significant_df, values='Results', index=['Genre 1'],
                                      columns=['Genre 2'], aggfunc='count', fill_value=0)
display(genre_vs_genre_pivot)
# set random seed (Allison Redfern's NetID #).
allison_num_rand = 17669368
random.seed(allison_num_rand)
# remove 2 unnecessary columns (A.Sep, Rnd).
new_full_df = data_df.copy()
new_full_df['Length'] = new_full_df['Length'].astype(float)
new_full_df = new_full_df.drop(columns=['A.Sep', 'Rnd'])
# check.
print(new_full_df.shape)
display(new_full_df)
# build a baseline multiple Linear Regression model.
# explore.
print(new_full_df.shape)
print(new_full_df.columns)
display(new_full_df.head())
popularity = new_full_df['Pop.']
print('popularity: ')
print(popularity.value_counts())
print('min: ', popularity.min())
print('max: ', popularity.max())
print('\n')
# feature set.
feature_df = new_full_df.drop(columns=['Pop.'])
# drop all non-numeric features.
feature_df_new = feature_df.drop(columns=['Title', 'Artist', 'Genre', 'Release'])
print(feature_df_new.shape)
print(feature_df_new.columns)
# build a baseline multiple Linear Regression model.
# set-up.
X = feature_df_new
y = popularity
# use random_state netID.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=allison_num_rand)
# fit.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
regr = LinearRegression()
regr.fit(X_train, y_train)
# predict.
y_pred = regr.predict(X_test)
# coefficients.
print("coefficients: ", regr.coef_)
# mean squared error.
print("mean squared error: ", mean_squared_error(y_test, y_pred))
# coefficient of determination.
print("coefficient of determination: ", r2_score(y_test, y_pred))
# build an exploratory simple Linear Regression model.
# check 'Loud' predictor power on 'Pop.' outcome.
X_new = feature_df_new['Loud'].to_numpy().reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=allison_num_rand)
regr = LinearRegression()
regr.fit(X_train, y_train)
# predict.
y_pred = regr.predict(X_test)
# coefficients.
print("coefficients: ", regr.coef_)
# mean squared error.
print("mean squared error: ", mean_squared_error(y_test, y_pred))
# coefficient of determination.
print("coefficient of determination: ", r2_score(y_test, y_pred))
# investigation.
df_releasedate = new_full_df['Release']
display(df_releasedate.head())
print(df_releasedate.shape)
release_list = df_releasedate.tolist()
print(type(release_list[0]))
dates_release = pd.to_datetime(release_list)
print(dates_release)
print(popularity)
# plot popularity over time.
# set-up.
popularity_list = popularity.tolist()
series_pop = pd.Series(data=popularity_list, index=dates_release)
series_popOverTime = series_pop.sort_index(ascending=True)
df_popOverTime = series_popOverTime.to_frame()
df_popOverTime.reset_index(inplace=True)
# only visualize popularity past 1970.
df_filtered_popOverTime = df_popOverTime[df_popOverTime['index'] > '1970-01-01']
# x, y.
index_new_series = df_filtered_popOverTime['index'].tolist()
data_new_series = df_filtered_popOverTime[0].tolist()
series_filtered_popOverTime = pd.Series(data=data_new_series, index=index_new_series)
# plot.
plt.style.use('seaborn') # renamed to 'seaborn-v0_8' in newer matplotlib versions
seaborn.set_palette(palette='Pastel2', color_codes=True)
plt.plot_date(index_new_series, data_new_series, c=[0.7019607, 0.88627, 0.803921], alpha=0.5)
# trend line.
mdates_index_new_series = mdates.date2num(index_new_series)
z = np.polyfit(mdates_index_new_series, data_new_series, 1)
p = np.poly1d(z)
plt.plot(index_new_series, p(mdates_index_new_series), c='grey') # , alpha=0.5)
# show.
plt.tight_layout()
plt.xlabel('year')
plt.ylabel('popularity')
plt.show()
plt.close()
# feature engineering.
# calculate attributes.
df_calculate_attributes = df_popOverTime.rename(columns={"index": "date"})
display(df_calculate_attributes)
# ATTRIBUTE 1: MIN popularity each year.
min_popularity_year = df_calculate_attributes.groupby(df_calculate_attributes.date.dt.year)[0].min()
print('MIN POPULARITY: ')
display(min_popularity_year.tail(3))
# ATTRIBUTE 2: MAX popularity each year.
max_popularity_year = df_calculate_attributes.groupby(df_calculate_attributes.date.dt.year)[0].max()
print('MAX POPULARITY: ')
display(max_popularity_year.tail(3))
# ATTRIBUTE 3: COUNT of songs released each year.
count_songs_year = df_calculate_attributes.groupby(df_calculate_attributes.date.dt.year)[0].count()
print('NUMBER OF SONGS: ')
display(count_songs_year.tail(3))
print('shape: ')
print(count_songs_year.shape)
# distinct years in the df.
years_present = count_songs_year.index.values.tolist()
print('years present: ', years_present)
print(len(years_present))
# ATTRIBUTES 4, 5, 6, 7: mean, median, std, sum popularity each year.
mean_popularity_year = df_calculate_attributes.groupby(df_calculate_attributes.date.dt.year)[0].mean()
median_popularity_year = df_calculate_attributes.groupby(df_calculate_attributes.date.dt.year)[0].median()
std_popularity_year = df_calculate_attributes.groupby(df_calculate_attributes.date.dt.year)[0].std()
sum_popularity_year = df_calculate_attributes.groupby(df_calculate_attributes.date.dt.year)[0].sum()
# add 'year' attribute to original data.
# make a copy to modify.
original_df = new_full_df.copy()
# obtain year of song release from 'Release' attribute.
original_df['Release'] = pd.to_datetime(original_df['Release'])
release_list = original_df['Release']
# create list of years.
attribute_4_years_list = release_list.dt.year.tolist()
print('\n')
print('number of songs: ', len(attribute_4_years_list))
# add feature to df.
original_df['year'] = attribute_4_years_list
# check.
display(original_df)
# merge time-related calculated features onto original df by 'year'.
# add to df.
min_attribute_to_add = []
max_attribute_to_add = []
count_attribute_to_add = []
mean_attribute_to_add = []
median_attribute_to_add = []
std_attribute_to_add = []
sum_attribute_to_add = []
# lists.
min_popularity_year_list = list(min_popularity_year)
max_popularity_year_list = list(max_popularity_year)
count_songs_year_list = list(count_songs_year)
mean_popularity_year_list = list(mean_popularity_year)
median_popularity_year_list = list(median_popularity_year)
std_popularity_year_list = list(std_popularity_year)
sum_popularity_year_list = list(sum_popularity_year)
# years present.
index_of_each_year_list = years_present
print(index_of_each_year_list)
year_list_original = original_df['year'].tolist()
# check that index matches.
print(index_of_each_year_list.index(1926))
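# (For reference, a more compact equivalent of the loop below using pandas groupby/merge;
# left as a comment so the original, explicit approach still runs:)
# year_stats = df_calculate_attributes.groupby(df_calculate_attributes.date.dt.year)[0] \
#     .agg(['min', 'max', 'count', 'mean', 'median', 'std', 'sum'])
# merged = original_df.merge(year_stats, left_on='year', right_index=True, how='left')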
# loop through each row in df.
for i in range(original_df.shape[0]):
curr_year = year_list_original[i]
index_of_curr_year = index_of_each_year_list.index(curr_year)
# match year to corresponding info.
min_corresponding = min_popularity_year_list[index_of_curr_year]
max_corresponding = max_popularity_year_list[index_of_curr_year]
count_corresponding = count_songs_year_list[index_of_curr_year]
mean_corresponding = mean_popularity_year_list[index_of_curr_year]
median_corresponding = median_popularity_year_list[index_of_curr_year]
std_corresponding = std_popularity_year_list[index_of_curr_year]
sum_corresponding = sum_popularity_year_list[index_of_curr_year]
# append to respective lists.
min_attribute_to_add.append(min_corresponding)
max_attribute_to_add.append(max_corresponding)
count_attribute_to_add.append(count_corresponding)
mean_attribute_to_add.append(mean_corresponding)
median_attribute_to_add.append(median_corresponding)
std_attribute_to_add.append(std_corresponding)
sum_attribute_to_add.append(sum_corresponding)
# check lengths.
print(len(min_attribute_to_add))
print(len(max_attribute_to_add))
print(len(count_attribute_to_add))
print(len(mean_attribute_to_add))
print(len(median_attribute_to_add))
print(len(std_attribute_to_add))
print(len(sum_attribute_to_add))
# check values.
print(min_attribute_to_add[:5])
print(max_attribute_to_add[:5])
print(count_attribute_to_add[:5])
print(mean_attribute_to_add[:5])
print(median_attribute_to_add[:5])
print(std_attribute_to_add[:5])
print(sum_attribute_to_add[:5])
# add to dataframe.
original_df['minPop'] = min_attribute_to_add
original_df['maxPop'] = max_attribute_to_add
original_df['countYear'] = count_attribute_to_add
original_df['meanPop'] = mean_attribute_to_add
original_df['medianPop'] = median_attribute_to_add
original_df['stdPop'] = std_attribute_to_add
original_df['sumPop'] = sum_attribute_to_add
# check.
display(original_df)
# build a multiple Linear Regression model on original df with TIME data.
# obtain only time features.
time_feature_set = original_df.iloc[:, 12:]
display(time_feature_set)
# handle NaN values (likely years with a single release, where std is undefined).
original_df.at[1278, 'stdPop'] = 0
original_df.at[6270, 'stdPop'] = 0
original_df.at[9448, 'stdPop'] = 0
# set-up.
X = original_df.drop(columns=['Title', 'Artist', 'Release', 'Genre', 'Pop.'])
y = popularity
# use random_state netID.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=allison_num_rand)
# fit.
regr = LinearRegression()
regr.fit(X_train, y_train)
# predict.
y_pred = regr.predict(X_test)
# coefficients.
print("coefficients: ", regr.coef_)
# mean squared error.
print("mean squared error: ", mean_squared_error(y_test, y_pred))
# coefficient of determination.
print("coefficient of determination: ", r2_score(y_test, y_pred))
# investigation.
genre_counts = new_full_df['Genre'].value_counts()
print(genre_counts)
# plot popularity by genre to investigate possible effects.
# plot.
fig = px.scatter(new_full_df, y="Pop.", x="medianPop", color="Genre",
                 color_discrete_sequence=["red", "green", "blue", "cyan", "magenta",
                                          "yellow", "black", "orange", "purple", "white",
                                          "silver", "maroon"])
# remove gridlines.
fig.update_layout(xaxis=dict(showgrid=False),
yaxis=dict(showgrid=False))
# change size.
fig.update_traces(marker_size=4)
# display.
fig.show()
# convert categorical variable into numeric representation.
# list of genres.
genre_list = new_full_df['Genre'].tolist()
# create indicator variables.
new_genre_df = pd.DataFrame({'Genre': genre_list})
genre_df_separate = pd.get_dummies(new_genre_df)
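# e.g. a row with Genre == 'Anime' gets 1 in the Genre_Anime indicator column and 0 in the other 11.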
# check.
display(genre_df_separate)
# build a multiple Linear Regression model on only GENRE data.
# obtain only genre features.
genre_feature_set = genre_df_separate
display(genre_df_separate)
# set-up.
X = genre_feature_set
y = popularity
# use random_state netID.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=allison_num_rand)
# fit.
regr = LinearRegression()
regr.fit(X_train, y_train)
# predict.
y_pred = regr.predict(X_test)
# coefficients.
print("coefficients: ", regr.coef_)
# mean squared error.
print("mean squared error: ", mean_squared_error(y_test, y_pred))
# coefficient of determination.
print("coefficient of determination: ", r2_score(y_test, y_pred))
# create combined feature set.
# combine original attributes (12), time attributes (8), genre attributes (12).
feature_set_concat = pd.concat([original_df, genre_df_separate], axis=1)
display(feature_set_concat)
# drop non-numeric columns.
filtered_feature_set = feature_set_concat.drop(columns=['Title', 'Artist', 'Release', 'Genre', 'Pop.'])
display(filtered_feature_set)
# normalize labels.
labels_popularity_list = popularity.tolist()
labels_popularity_list_normalized = [(x/100) for x in labels_popularity_list]
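# e.g. a popularity score of 73 becomes 0.73.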
# check.
print(labels_popularity_list_normalized[:10])
# build a Regularized Regression model (Ridge); train/test on combined feature set.
# set-up.
X = filtered_feature_set
y = labels_popularity_list_normalized
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=allison_num_rand)
# hyperparameter tuning (note: np.arange(1, 101, 100) contains only alpha=1; widen the grid to search more values).
alpha_list = np.arange(1, 101, 100)
# cross-validation.
model_standardized = linear_model.RidgeCV(alphas = alpha_list, cv=10)
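# (Ridge minimizes ||y - Xw||^2 + alpha*||w||^2; RidgeCV picks alpha from alpha_list by cross-validation.)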
# standardize training/testing sets.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# fit Ridge on standardized data.
model_standardized.fit(X_train_scaled, y_train)
# obtain predictions.
scaled_predictions = model_standardized.predict(X_test_scaled)
# measure performance.
# coefficients.
print("coefficients: ", model_standardized.coef_)
# root mean squared error (squared=False makes mean_squared_error return the RMSE).
print("root mean squared error: ", mean_squared_error(y_test, scaled_predictions, squared=False))
# coefficient of determination.
print("coefficient of determination: ", r2_score(y_test, scaled_predictions))
"""
coefficients: [ 8.86402004e-07 -1.76711787e-02 1.31199909e-02 2.55697739e-02
-7.22367567e-03 -7.49757374e-03 -5.11259157e-03 -6.97105126e-03
4.94013254e-03 -2.32173221e-03 2.74905341e-02 2.77715155e-02
-1.18636196e-02 1.70466642e-03 -4.73663403e-02 -4.86293602e-03
-5.27887719e-02 -8.56424514e-03 2.00124357e-02 7.79449638e-03
1.67410961e-04 -2.67200390e-02 2.63725393e-02 -4.97467324e-02
4.07161417e-02 4.25784420e-02 -1.67247894e-04]
root mean squared error: 0.1233510219111883
coefficient of determination: 0.5097335012402788
"""
# build a Residual Plot to validate no bias in model predictions.
# plot.
set_palette('pastel', color_codes=True)
visualizer = ResidualsPlot(model_standardized, train_color='b', test_color='g')
# fit.
visualizer.fit(X_train_scaled, y_train) # fit the training data to the visualizer.
visualizer.score(X_test_scaled, y_test) # evaluate the model on the test data.
visualizer.show()
# feature importance (rank magnitude of weights found by Ridge model).
# list of attributes.
feature_list_new = list(filtered_feature_set.columns.values)
print('attributes: ', feature_list_new)
# analyze coefficients.
print('\n')
weights_new = list(model_standardized.coef_)
print('weights: ', weights_new)
weights_series_new = pd.Series(weights_new)
weights_series_abs_new = weights_series_new.abs()
# order weights.
weights_series_reordered_new = weights_series_abs_new.sort_values(ascending=False)
print('\n')
print('order of weights: ')
print(weights_series_reordered_new)
weights_index_new = weights_series_reordered_new.index.tolist()
print('index of reordered weights: ', weights_index_new)
print('\n')
print('* * * * * * * * * *')
# output in order of weights.
print('attributes, in order of IMPORTANCE: ')
weights_index_names_new = [feature_list_new[i] for i in weights_index_new]
print(weights_index_names_new)
# run a Regression Analysis to determine significance of coefficients.
# alpha value selected by cross-validation.
new_alpha = model_standardized.alpha_
X2 = sm.add_constant(X_train_scaled)
# build, fit OLS.
est = sm.OLS(y_train, X2).fit()
# analysis summary.
print(est.summary())
# pearson correlations.
# calculate.
display(filtered_feature_set)
num_cols = filtered_feature_set.shape[1]
col_names = list(filtered_feature_set.columns)
pop = new_full_df['Pop.']
for i in range(num_cols):
curr_col = col_names[i]
pearson = pop.corr(filtered_feature_set[curr_col])
# output per attribute.
print('corr Pop. & ' + str(curr_col) + ': ' + str(pearson))
PCA_df = data_df.copy()
PCA_df['Length'] = PCA_df['Length'].astype(float)
X = PCA_df.drop(columns=['Title', 'Artist', 'Release', 'Rnd', 'A.Sep', 'Genre'])
y = PCA_df['Genre']
# Check size of features and labels
print(X.shape)
print(y.shape)
# features only
complete_correlation= associations(X, figsize=(10,10))
complete_correlation
# PCA Pipeline
pca_pipeline = Pipeline([('scaling', StandardScaler()), ('pca', sk_PCA())])
pca_pipeline.fit(X) #features only, no labels
# Kaiser Criterion: consider all principal components with eigenvalues greater than 1.0
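# (on standardized data, an eigenvalue above 1.0 means the component explains more variance than a single standardized feature)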
eigVals=pca_pipeline[1].explained_variance_
nComponents = 8
x = np.linspace(1,nComponents, nComponents)
plt.figure(figsize=(10, 8))
plt.bar(x, eigVals, color='gray')
plt.plot([0,nComponents],[1,1],color='red',label='Kaiser Criterion') # red Kaiser criterion line
plt.xlabel('Principal Component', fontsize=25)
plt.ylabel('Eigenvalue', fontsize=25)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend()
plt.show()
# Print Eigenvalues
for index, value in enumerate(eigVals):
print(value)
# Explained Variance Calculation
covarExplained = eigVals/sum(eigVals)*100
print("Variance explained by the 2 PCs above is: %.3f " % (sum(covarExplained[:2])))
# PCA Components
scaler = StandardScaler()
X_comp=scaler.fit_transform(X)
pca=sk_PCA(n_components=2)
X_new=pca.fit_transform(X_comp)
explained_variance=pca.explained_variance_ratio_
sum_variance = sum(explained_variance)
print("Variance explained by the 2 PCs above is: %.4f " % (sum_variance))
components=pca.components_
components=pd.DataFrame(components, columns=X.columns)
display(components)
# PCA Pipeline for Silhouette
music_pca_pipeline = Pipeline([('scaling', StandardScaler()), ('pca', sk_PCA(n_components=2))])
music_processed = music_pca_pipeline.fit_transform(X.values)
numClusters = 9 # how many clusters are we looping over? (from 2 to 10)
Q = np.empty([numClusters,1])*np.NaN # init container to store sums
# Compute kMeans:
plt.figure(figsize=(16, 8))
for ii in range(2, 11): # Loop through each cluster (from 2 to 10!)
kMeans = sk_KMeans(n_clusters = int(ii)).fit(music_processed) # compute kmeans using scikit
cId = kMeans.labels_ # vector of cluster IDs that the row belongs to
cCoords = kMeans.cluster_centers_ # coordinate location for center of each cluster
    s = silhouette_samples(music_processed,cId) # silhouette coefficient of each sample (ranges from -1 to 1)
Q[ii-2] = sum(s) # take the sum
# Plot data:
plt.subplot(3,3,ii-1)
plt.hist(s,bins=20)
plt.xlim(-0.2,1)
plt.ylim(0,250)
plt.xlabel('Silhouette score')
plt.ylabel('Count')
plt.title('Sum: {}'.format(int(Q[ii-2]))) # sum rounded to nearest integer
plt.tight_layout() # adjusts subplot padding
# Plot Silhouette Analysis
plt.plot(np.linspace(2,10,9),Q)
plt.xlabel('Number of clusters', fontsize=15)
plt.ylabel('Sum of silhouette scores', fontsize=15)
plt.show()
# Define PCA
class PCA():
"""A method for doing dimensionality reduction by transforming the feature
space to a lower dimensionality, removing correlation between features and
maximizing the variance along each feature axis.
"""
    def __init__(self):
self.eigenValues=None
self.components=None
def transform(self, X, n_components):
""" Fit the dataset to the number of principal components specified in the
constructor and return the transformed dataset """
covariance_matrix = self.calculate_covariance_matrix(X)
# Where (eigenvector[:,0] corresponds to eigenvalue[0])
eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)
# Sort the eigenvalues and corresponding eigenvectors from largest
# to smallest eigenvalue and select the first n_components
        idx = eigenvalues.argsort()[::-1] # indices that sort the eigenvalues in descending order
eigenvalues = eigenvalues[idx][:n_components]
eigenvectors = np.atleast_1d(eigenvectors[:, idx])[:, :n_components]
# Set the object variables
self.eigenValues=eigenvalues
self.components=eigenvectors
# Project the data onto principal components
X_transformed = X.dot(eigenvectors)
return X_transformed
def calculate_covariance_matrix(self, X, Y=None):
""" Calculate the covariance matrix for the dataset X """
if Y is None:
Y = X
n_samples = np.shape(X)[0]
covariance_matrix = (1 / (n_samples-1)) * (X - X.mean(axis=0)).T.dot(Y - Y.mean(axis=0))
return np.array(covariance_matrix, dtype=float)
# Labels
print("Genres: ", y.unique())
target_names = y.unique()
# Convert genres into unique ids
y_codes = pd.DataFrame(y)
y_codes['id'] = y_codes.groupby(['Genre']).ngroup()
y_codes_arr = np.array(y_codes['id'])
print(y_codes_arr)
# PCA for visualization
pca=PCA()
X_transformed=pca.transform(StandardScaler().fit_transform(X), 2) # 2 principal components
target_ids = range(len(target_names))
print(target_ids)
colors = ['red','green','blue','cyan','magenta','yellow','black','orange','purple','white','silver','maroon']
plt.figure(figsize=(20, 13))
for i, c, label in zip(target_ids, colors, target_names):
plt.scatter(X_transformed[y_codes_arr == i, 0], X_transformed[y_codes_arr == i, 1], c=c, alpha=0.6, label=label)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
plt.title('Principal Component Analysis: Music Genres', fontsize=45)
plt.xlabel('Principal Component 1', fontsize=35)
plt.ylabel('Principal Component 2', fontsize=35)
plt.ylim(-5,4)
plt.xlim(-4,7)
plt.legend()
plt.show()
# K-Means
def KMeans(X, n_clusters, rseed=17669368):
    # Lloyd's algorithm: alternate between assigning points to the nearest center
    # and moving each center to the mean of its assigned points.
    # 1. Randomly choose clusters
    rng = np.random.RandomState(rseed)
    i = rng.permutation(X.shape[0])[:n_clusters]
    centers = X[i]
while True:
# 2a. Assign labels based on closest center
labels = pairwise_distances_argmin(X, centers)
# 2b. Find new centers from means of points
new_centers = np.array([X[labels == i].mean(0)
for i in range(n_clusters)])
# 2c. Check for convergence
if np.all(centers == new_centers):
break
centers = new_centers
return centers, labels
# Plotting K-Means clusters
n_clusters=range(2, 8)
plt.figure(figsize=(18, 12), layout="constrained")
for i, c_num in enumerate(n_clusters):
centers, labels = KMeans(X_transformed, c_num)
plt.subplot(3, 2, i+1)
plt.title('K-Means Clustering with n_clusters={}'.format(c_num))
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.ylim(-4.5,3.5)
plt.scatter(X_transformed[:, 0], X_transformed[:, 1], c=labels, cmap='Pastel2')
plt.plot(centers[:, 0], centers[:, 1], '*',markersize=10,color='red')
# Optimal Cluster # for K-Means
n_clusters=2
plt.figure(figsize=(20, 13), layout="constrained")
centers, labels = KMeans(X_transformed, 2)
# plt.title('K-Means Clustering with n_clusters={}'.format(2), fontsize=45)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
plt.xlabel("Principal Component 1", fontsize=40)
plt.ylabel("Principal Component 2", fontsize=40)
plt.ylim(-5,4)
plt.xlim(-4,7)
plt.scatter(X_transformed[:, 0], X_transformed[:, 1], c=labels, cmap='Pastel2')
plt.plot(centers[:, 0], centers[:, 1], '*',markersize=10,color='red');
# Cluster Report
def pretty_print(df):
return display( HTML( df.to_html().replace("\\n","<br>") ) )
def get_class_rules(tree: DecisionTreeClassifier, feature_names: list):
inner_tree: _tree.Tree = tree.tree_
classes = tree.classes_
class_rules_dict = dict()
def tree_dfs(node_id=0, current_rule=[]):
# feature[i] holds the feature to split on, for the internal node i.
split_feature = inner_tree.feature[node_id]
if split_feature != _tree.TREE_UNDEFINED: # internal node
name = feature_names[split_feature]
threshold = inner_tree.threshold[node_id]
# left child
left_rule = current_rule + ["({} <= {})".format(name, threshold)]
tree_dfs(inner_tree.children_left[node_id], left_rule)
# right child
right_rule = current_rule + ["({} > {})".format(name, threshold)]
tree_dfs(inner_tree.children_right[node_id], right_rule)
else: # leaf
dist = inner_tree.value[node_id][0]
dist = dist/dist.sum()
max_idx = dist.argmax()
if len(current_rule) == 0:
rule_string = "ALL"
else:
rule_string = " and ".join(current_rule)
# register new rule to dictionary
selected_class = classes[max_idx]
class_probability = dist[max_idx]
class_rules = class_rules_dict.get(selected_class, [])
class_rules.append((rule_string, class_probability))
class_rules_dict[selected_class] = class_rules
tree_dfs() # start from root, node_id = 0
return class_rules_dict
def cluster_report(data: pd.DataFrame, clusters, min_samples_leaf=50, pruning_level=0.01):
# Create Model
tree = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, ccp_alpha=pruning_level)
tree.fit(data, clusters)
# Generate Report
feature_names = data.columns
class_rule_dict = get_class_rules(tree, feature_names)
report_class_list = []
for class_name in class_rule_dict.keys():
rule_list = class_rule_dict[class_name]
combined_string = ""
for rule in rule_list:
combined_string += "[{}] {}\n\n".format(rule[1], rule[0])
report_class_list.append((class_name, combined_string))
cluster_instance_df = pd.Series(clusters).value_counts().reset_index()
cluster_instance_df.columns = ['class_name', 'instance_count']
report_df = pd.DataFrame(report_class_list, columns=['class_name', 'rule_list'])
report_df = pd.merge(cluster_instance_df, report_df, on='class_name', how='left')
pretty_print(report_df.sort_values(by='class_name')[['class_name', 'instance_count', 'rule_list']])
pc = pca_pipeline.fit_transform(X)
kmeans_model = sk_KMeans(n_clusters=2)
y_cluster = kmeans_model.fit_predict(pc)
cluster_report(X, y_cluster, min_samples_leaf=20, pruning_level=0.05)
#Define Functions Used in Neural Network
def sigmoid(z):
return 1.0 / (1.0 + np.exp(-z))
def sigmoid_prime(z):
return sigmoid(z) * (1 - sigmoid(z))
def cost(output_activations, y):
return 1/2*(np.sum((output_activations-y)**2))
def cost_derivative(output_activations, y):
return (output_activations - y)
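#Note: sigmoid'(z) = sigmoid(z)*(1 - sigmoid(z)), and cost_derivative is the gradient of the
#quadratic cost C = (1/2)*sum((a - y)^2) with respect to the output activations a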
#Create a Neural Network Class
@dataclass
class Network:
num_layers: int
biases: list
weights: list
#Initialize Biases and Weights Randomly from Standard Normal Distribution
def init_network(layers):
np.random.seed(17669368)
return Network(
len(layers),
[np.random.randn(y, 1) for y in layers[1:]],
[np.random.randn(y, x) for x, y in zip(layers[:-1], layers[1:])])
#Feedforward Function
def feedforward(nn, a):
for b, w in zip(nn.biases, nn.weights):
a = sigmoid(np.dot(w, a) + b)
return a
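#(feedforward applies a' = sigmoid(W a + b) layer by layer and returns the output-layer activations)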
#Evaluate Performance on Validation and Test Data
def evaluate(nn, test_data):
test_results = [(np.argmax(feedforward(nn, x)), y) for (x, y) in test_data]
return sum(int(x == y) for (x, y) in test_results)
#Learning Process for the Neural Network
def learn(nn, training_data, epochs, mini_batch_size, learning_rate, test_data = None):
n = len(training_data)
    for j in range(epochs):
        random.seed(17669368) # note: re-seeding each epoch means every epoch sees the same shuffle order
        random.shuffle(training_data)
        mini_batches = [training_data[k: k + mini_batch_size] for k in range(0, n, mini_batch_size)]
        epoch_cost = 0.0 # running sum of batch losses for this epoch
        for mini_batch in mini_batches:
            epoch_cost += batch_stochastic_gradient_descent(nn, mini_batch, learning_rate)
        if test_data:
            print('Epoch {0}: accuracy {1}% , Cost: {2}'.format(f'{j + 1:2}', 100.0 * evaluate(nn, test_data) / len(test_data), np.round(epoch_cost/n, 3)))
        else:
            print('Epoch {0} complete, Cost: {1}'.format(f'{j + 1:2}', np.round(epoch_cost/n, 3)))
#Stochastic Gradient Descent for the Batch
def batch_stochastic_gradient_descent(nn, mini_batch, eta):
nabla_b = [np.zeros(b.shape) for b in nn.biases]
nabla_w = [np.zeros(w.shape) for w in nn.weights]
loss, nabla_b, nabla_w = batch_backprop(nn, mini_batch)
nn.weights = [w - (eta / len(mini_batch)) * nw for w, nw in zip(nn.weights, nabla_w)]
nn.biases = [b - (eta / len(mini_batch)) * nb for b, nb in zip(nn.biases, nabla_b)]
return loss
#Performs Backpropagation to Update Weights and Biases Based on Gradient Descent
def batch_backprop(nn, mini_batch):
nabla_b = [np.zeros(b.shape) for b in nn.biases]
nabla_w = [np.zeros(w.shape) for w in nn.weights]
#Mini-batch Components
ax, ay = tuple(t for t in np.asarray(mini_batch).transpose())
x = np.stack(ax)
y = np.stack(ay)
#Feedforward
activation = x
activations = [x]
zs = []
for b, w in zip(nn.biases, nn.weights):
z = np.matmul(w, activation) + b
zs.append(z)
activation = sigmoid(z)
activations.append(activation)
loss = cost(activations[-1].squeeze(), y.squeeze())
#Backward pass
#Start from Output Layer
delta = cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
nabla_b[-1] = delta.sum(axis = 0)
nabla_w[-1] = np.matmul(delta, activations[-2].transpose(0, 2, 1)).sum(axis = 0)
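    #(standard output-layer backprop equations: delta_L = dC/da * sigmoid'(z_L),
    # dC/db = delta_L, dC/dW = delta_L . a_(L-1)^T, summed over the mini-batch)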
#Back to Input Layer
for i in reversed(range(2, nn.num_layers)):
z = zs[-i]
sp = sigmoid_prime(z)
delta = np.matmul(nn.weights[-i + 1].transpose(), delta) * sp
nabla_b[-i] = delta.sum(axis = 0)
nabla_w[-i] = np.matmul(delta, activations[-i - 1].transpose(0, 2, 1)).sum(axis = 0)
return (loss, nabla_b, nabla_w)
#Make a Copy of Original Dataset for Subsection
neural_df = data_df[['BPM','Energy','Dance','Loud','Valence','Length','Acoustic','Pop.','Genre']].copy()
#Turn the Genre Labels Into Digits 0-11
label_data = LabelEncoder()
neural_df['Genre'] = label_data.fit_transform(neural_df['Genre'])
display(neural_df)
#Load and Transform Data
neural_data = neural_df.copy()
Y = neural_data['Genre'].tolist()
neural_data = neural_data.drop(['Genre'], axis=1)
X = neural_data.to_numpy()
y = np.array(Y)
#80/10/10 Train/Validate/Test Split
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.8, random_state=17669368)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5, random_state=17669368)
#Format Data
train = (X_train.astype(float),y_train)
validate = (X_valid.astype(float),y_valid)
test = (X_test.astype(float),y_test)
#Define Data Wrapper, One Hot Encoder, and Print Shape for Output
def load_data_wrapper():
tr_d = train
va_d = validate
te_d = test
training_inputs = [np.reshape(x, (8, 1)) for x in tr_d[0]]
training_results = [one_hot_encode(y) for y in tr_d[1]]
training_data = zip(training_inputs, training_results)
validation_inputs = [np.reshape(x, (8, 1)) for x in va_d[0]]
validation_data = zip(validation_inputs, va_d[1])
test_inputs = [np.reshape(x, (8, 1)) for x in te_d[0]]
test_data = zip(test_inputs, te_d[1])
return (list(training_data), list(validation_data), list(test_data))
def one_hot_encode(j):
e = np.zeros((12, 1))
e[j] = 1.0
return e
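# e.g. one_hot_encode(3) returns a (12, 1) column vector with 1.0 in row 3 and zeros elsewhere.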
def print_shape(name, data):
print('Shape of {0}: {1}'.format(name, data.shape))
#Train and Test Neural Network
#8 Inputs, 10 Neurons in Hidden Layer, 12 Labels
training_data, validation_data, test_data = load_data_wrapper()
nn = init_network([8, 10, 12])
for l in range(0, nn.num_layers - 1):
print('\nNetwork layer {0}'.format(l + 2))
print_shape('weights', nn.weights[l])
print_shape('biases', nn.biases[l])
#Hyperparameters
epochs = 30
mini_batch_size = 10
learning_rate = 3.0
#Learning Process
print('\nLearning process started...\n')
time_start = time()
learn(nn, training_data, epochs, mini_batch_size, learning_rate, validation_data)
time_end = time()
time_elapsed = time_end - time_start
print('\nLearning process complete in {0} seconds ({1} seconds per epoch)!\n'.format(f'{time_elapsed:.0f}', f'{time_elapsed / epochs:.1f}'))
#Use Held Out Test Data to Determine Accuracy
print('Test (held-out, previously unseen data): accuracy {0}%'.format(100.0 * evaluate(nn, test_data) / len(test_data)))
#Load and Transform Data
neural_data = neural_df.copy()
Y = neural_data['Genre'].tolist()
neural_data = neural_data.drop(['Genre'], axis=1)
X = neural_data.to_numpy()
y = np.array(Y)
#80/10/10 Train/Validate/Test Split
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.8, random_state=17669368)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5, random_state=17669368)
#Standardize Data (Fitting the scaler to the training data and then transforming the training and test data with it)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_valid = scaler.transform(X_valid)
#Format Data as Tuples
train = (X_train.astype(float),y_train)
validate = (X_valid.astype(float),y_valid)
test = (X_test.astype(float),y_test)
#Redefine Data Wrapper and One Hot Encoder
def load_data_wrapper():
tr_d = train
va_d = validate
te_d = test
training_inputs = [np.reshape(x, (8, 1)) for x in tr_d[0]]
training_results = [one_hot_encode(y) for y in tr_d[1]]
training_data = zip(training_inputs, training_results)
validation_inputs = [np.reshape(x, (8, 1)) for x in va_d[0]]
validation_data = zip(validation_inputs, va_d[1])
test_inputs = [np.reshape(x, (8, 1)) for x in te_d[0]]
test_data = zip(test_inputs, te_d[1])
return (list(training_data), list(validation_data), list(test_data))
def one_hot_encode(j):
e = np.zeros((12, 1))
e[j] = 1.0
return e
#Train and Test Neural Network
#8 Inputs, 10 in Hidden Layer, 12 Labels
training_data, validation_data, test_data = load_data_wrapper()
nn = init_network([8, 10, 12])
for l in range(0, nn.num_layers - 1):
print('\nNetwork layer {0}'.format(l + 2))
print_shape('weights', nn.weights[l])
print_shape('biases', nn.biases[l])
#Hyperparameters
epochs = 30
mini_batch_size = 10
learning_rate = 3.0
#Learning Process
print('\nLearning process started...\n')
time_start = time()
learn(nn, training_data, epochs, mini_batch_size, learning_rate, validation_data)
time_end = time()
time_elapsed = time_end - time_start
print('\nLearning process complete in {0} seconds ({1} seconds per epoch)!\n'.format(f'{time_elapsed:.0f}', f'{time_elapsed / epochs:.1f}'))
#Use Held Out Test Data to Determine Accuracy
print('Test (held-out, previously unseen data): accuracy {0}%'.format(100.0 * evaluate(nn, test_data) / len(test_data)))
#Determine the Number of Samples for Each Genre
neural_df2 = neural_df.copy()
genre_0 = neural_df2[neural_df2['Genre']==0]
genre_1 = neural_df2[neural_df2['Genre']==1]
genre_2 = neural_df2[neural_df2['Genre']==2]
genre_3 = neural_df2[neural_df2['Genre']==3]
genre_4 = neural_df2[neural_df2['Genre']==4]
genre_5 = neural_df2[neural_df2['Genre']==5]
genre_6 = neural_df2[neural_df2['Genre']==6]
genre_7 = neural_df2[neural_df2['Genre']==7]
genre_8 = neural_df2[neural_df2['Genre']==8]
genre_9 = neural_df2[neural_df2['Genre']==9]
genre_10 = neural_df2[neural_df2['Genre']==10]
genre_11 = neural_df2[neural_df2['Genre']==11]
#Initialize List of the Number of Samples for Each Genre (the row counts of genre_0 through genre_11 above)
samples = [629,1101,1338,1109,1615,471,1161,1447,1966,2103,1163,789]
#Initialize List of the Genres
genres = [0,1,2,3,4,5,6,7,8,9,10,11]
#Initialize List of Lists of the Genres Not Being Singled Out
rest = [[g for g in genres if g != k] for k in genres]
#Change Data Wrapper and Change One Hot Encoder to 2 Outputs
def load_data_wrapper(train,test,validate):
tr_d = train
va_d = validate
te_d = test
training_inputs = [np.reshape(x, (8, 1)) for x in tr_d[0]]
training_results = [one_hot_encode(y) for y in tr_d[1]]
training_data = zip(training_inputs, training_results)
validation_inputs = [np.reshape(x, (8, 1)) for x in va_d[0]]
validation_data = zip(validation_inputs, va_d[1])
test_inputs = [np.reshape(x, (8, 1)) for x in te_d[0]]
test_data = zip(test_inputs, te_d[1])
return (list(training_data), list(validation_data), list(test_data))
def one_hot_encode(j):
e = np.zeros((2, 1))
e[j] = 1.0
return e
#Data Preprocessing Function
def binary_classifier(genrenumber,restofgenres,numsamples):
'''
    This function takes the coded genre number, a list of the rest of the genre numbers, and the number
    of samples in the dataset for that particular genre. It transforms the data for a genre into
    a one-vs-all binary format for use in the neural network.
Params:
genrenumber: (int)
restofgenres: (list)
numsamples: (int)
Return:
train: (tuple)
test: (tuple)
validate: (tuple)
'''
#Replace Labels with 0s and 1s
neural_df2 = neural_df.copy()
neural_df2 = neural_df2.replace({'Genre': genrenumber}, 20)
neural_df2 = neural_df2.replace({'Genre': restofgenres}, 0)
neural_df2 = neural_df2.replace({'Genre': 20}, 1)
dataframe1 = neural_df2[neural_df2['Genre']==1]
dataframe1 = pd.DataFrame(dataframe1)
dataframe2 = neural_df2[neural_df2['Genre']==0]
dataframe2 = pd.DataFrame(dataframe2)
dataframe2 = dataframe2.sample(n=numsamples, replace=False, random_state=17669368)
df_stacked = pd.concat([dataframe1, dataframe2])
#Load and Transform Data
Y = df_stacked['Genre'].tolist()
df_stacked = df_stacked.drop(['Genre'], axis=1)
X = df_stacked.to_numpy()
y = np.array(Y)
#80/10/10 Train/Validate/Test Split
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.8, random_state=17669368)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5, random_state=17669368)
#Standardize Data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_valid = scaler.transform(X_valid)
#Format Data
train = (X_train.astype(float),y_train)
validate = (X_valid.astype(float),y_valid)
test = (X_test.astype(float),y_test)
return train, test, validate
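# e.g. binary_classifier(0, rest[0], samples[0]) yields a balanced genre-0-vs-rest split, as used in the loop below.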
#Loop Through Each Genre and Run One-vs-All Through the Neural Network
#Calculates Accuracy, ROC Curve, AUC
colors = ['red','green','blue','cyan','magenta','yellow','black','orange','purple','white','silver','maroon']
labels = ['Anime', 'Broadway', 'Classical', 'Country', 'Dance Electronic', 'Disney', 'Happy Holidays', 'Hip Hop', 'Jazz', 'Latin', 'Pop','Rock']
TP = []
FN = []
FP = []
TN = []
for a, b, c, d, e in zip(genres, rest, samples, colors, labels):
#Load Data
train, test, validate = binary_classifier(a,b,c)
#Train and Test Neural Network
#8 Input Features, 6 Neurons in Hidden Layer, 2 Output Labels
training_data, validation_data, test_data = load_data_wrapper(train,test,validate)
nn = init_network([8, 6, 2])
print("\nGenre: "+str(a))
for l in range(0, nn.num_layers - 1):
print('\nNetwork layer {0}'.format(l + 2))
print_shape('weights', nn.weights[l])
print_shape('biases', nn.biases[l])
    #Hyperparameters
epochs = 30
mini_batch_size = 10
learning_rate = 3.0
#Learning Process
print('\nLearning process started...\n')
time_start = time()
learn(nn, training_data, epochs, mini_batch_size, learning_rate, validation_data)
time_end = time()
time_elapsed = time_end - time_start
print('\nLearning process complete in {0} seconds ({1} seconds per epoch)!\n'.format(f'{time_elapsed:.0f}', f'{time_elapsed / epochs:.1f}'))
#Use Held Out Test Data to Determine Accuracy
    print('Test (held-out, previously unseen data): accuracy {0}%'.format(100.0 * evaluate(nn, test_data) / len(test_data)))
#Get Predicted and True Values
test_results = [(np.argmax(feedforward(nn, x)), y) for (x, y) in test_data]
y_pred = []
y_true = []
for i in range(len(test_results)):
y_pred.append(test_results[i][0])
y_true.append(test_results[i][1])
#ROC Curves
fpr, tpr, thresholds = roc_curve(y_true, y_pred)
fig = plt.figure(figsize = (7,6))
plt.plot(fpr, tpr, color=d)
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.title('ROC Curve for '+str(e), fontsize=20)
plt.show()
#Confusion Matrix
conf_matrix = confusion_matrix(y_true, y_pred)
TP.append(conf_matrix[1][1])
FN.append(conf_matrix[1][0])
FP.append(conf_matrix[0][1])
TN.append(conf_matrix[0][0])
disp=ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=[0,1])
disp.plot()
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for '+str(e))
plt.grid()
plt.show()
print("Accuracy of the model is: %.3f" % accuracy_score(y_pred, y_true))
auc = roc_auc_score(y_true, y_pred)
print("AUC: "+str(auc))
#Micro Averaged ROC
total_instances_1 = sum(TP) + sum(FN)
total_instances_2 = sum(FP) + sum(TN)
#Micro-averaged TPR and FPR
micro_TPR = sum(TP) / total_instances_1
micro_FPR = sum(FP) / total_instances_2
micro_TPR = [0, micro_TPR, 1]
micro_FPR = [0, micro_FPR, 1]
thresholds = [2, 1, 0] # nominal thresholds for the three points (not used in the plot)
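#(with hard 0/1 predictions each classifier contributes a single operating point, so the micro-averaged ROC below is a three-point polyline anchored at (0,0) and (1,1))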
#Plot the ROC Curve
fig = plt.figure(figsize = (7,6))
plt.plot(micro_FPR, micro_TPR, color='tomato')
plt.xlabel('False Positive Rate', fontsize=20)
plt.ylabel('True Positive Rate', fontsize=20)
plt.title('Micro-Averaged ROC Curve', fontsize=20)
plt.show()
#Load and Transform Data
neural_data = neural_df.copy()
Y = neural_data['Genre'].tolist()
neural_data = neural_data.drop(['Genre'], axis=1)
X = neural_data.to_numpy()
y = np.array(Y)
#80/20 Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=17669368)
#Random Forest Classifier Optimized Using Grid Search
#Takes an Hour to Run
#Parameters
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 3)]
max_features = [round(x,2) for x in np.linspace(start = 0.3, stop = 1.0, num = 3)]
min_samples_leaf = [int(x) for x in np.linspace(start = 300, stop = 600, num = 3)]
bootstrap = [True, False]
#Create Parameter Grid
param_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
#Set the Estimator and Grid Search Object and Fit to Training Data
random_forest=RandomForestClassifier()
rf_grid = GridSearchCV(random_forest, param_grid, cv=5, scoring="accuracy", refit=True)
rf_grid.fit(X_train, y_train)
#Get Best Parameters and Best Score
print("Best parameters:", rf_grid.best_params_)
print("Best score: {:.2f}".format(rf_grid.best_score_))
#Get Accuracy on Unseen Data
accuracy = rf_grid.score(X_test, y_test)
print(accuracy)
#Get Accuracy on Unseen Data
y_pred = rf_grid.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.4f}".format(accuracy))
#Load and Transform Data
neural_data = neural_df.copy()
Y = neural_data['Genre'].tolist()
neural_data = neural_data.drop(['Genre'], axis=1)
X = neural_data.to_numpy()
y = np.array(Y)
#80/20 Train/Test Split and Standardize Data
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=17669368)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
#Gradient Boosting Classifier Optimized Using Grid Search
#**Takes At Least 45Min-1Hr to Run
#Parameters
n_estimators = [100, 200, 300]
learning_rate = [0.1, 0.2, 0.3]
max_depth = [1, 2, 3]
#Create Parameter Grid
param_grid = {
'n_estimators': n_estimators,
'learning_rate': learning_rate,
'max_depth': max_depth}
#Set the Estimator and Grid Search Object and Fit to Training Data
gradient_boosting = GradientBoostingClassifier()
gb_grid = GridSearchCV(gradient_boosting, param_grid, cv=5, scoring='accuracy', refit=True)
gb_grid.fit(X_train, y_train)
#Get Best Parameters and Best Score
print("Best parameters:", gb_grid.best_params_)
print("Best score: {:.2f}".format(gb_grid.best_score_))
#Get Accuracy on Unseen Data
y_pred = gb_grid.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.4f}".format(accuracy))
#Load and Transform Data
neural_data = neural_df.copy()
Y = neural_data['Genre'].tolist()
neural_data = neural_data.drop(['Genre'], axis=1)
X = neural_data.to_numpy()
y = np.array(Y)
#80/20 Train/Test Split and Standardize Data
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=17669368)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
#Support Vector Machine Classifier Optimized Using GridSearch
#**Takes a Few Hours to Run
#Parameters
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
C = [0.1, 1, 10, 100]
gamma = ['scale', 'auto']
degree = [2, 3, 4]
#Create Parameter Grid
param_grid = {
'kernel': kernel,
'C': C,
'gamma': gamma,
'degree': degree}
#Set the Estimator and Grid Search Object and Fit to Training Data
support_vector = SVC()
SVC_grid = GridSearchCV(support_vector, param_grid, cv=5, scoring='accuracy', refit=True)
SVC_grid.fit(X_train, y_train)
#Get Best Parameters and Best Score
print("Best parameters:", SVC_grid.best_params_)
print("Best score: {:.2f}".format(SVC_grid.best_score_))
#Get Accuracy on Unseen Data
y_pred = SVC_grid.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.4f}".format(accuracy))
import random
from random import sample
import scipy
from scipy import stats
# explore popularity differences between single-genre and multi-genre artists.
# df.
extra_credit = data_df[['Artist','BPM','Energy','Dance','Loud','Valence','Length','Acoustic','Pop.','Genre']]
# group by Artist Name and Count Number of Unique Genre Labels.
genre_counts = extra_credit.groupby("Artist")["Genre"].nunique().reset_index(name="Genre Count")
# extract Artist Names.
five = genre_counts[genre_counts['Genre Count']==5]
four = genre_counts[genre_counts['Genre Count']==4]
three = genre_counts[genre_counts['Genre Count']==3]
two = genre_counts[genre_counts['Genre Count']==2]
one = genre_counts[genre_counts['Genre Count']==1]
display(five)
display(four)
display(three)
display(two)
display(one)
# create multi-genre artist df.
multi_artists = pd.concat([four['Artist'], three['Artist'], two['Artist']], ignore_index=True)
multi_artists = multi_artists.drop_duplicates()
print('\n')
display(multi_artists)
multi_artists_list = list(multi_artists)
print(multi_artists_list[:10])
print('\n')
print(len(four['Artist'].tolist()))
print(len(three['Artist'].tolist()))
print(len(two['Artist'].tolist()))
# total artists.
total_artists = extra_credit['Artist'].drop_duplicates().tolist()
print('# of total artists: ', len(total_artists))
# popularity scores from songs of these multi-genre artists.
df_multi_artists = extra_credit[extra_credit['Artist'].isin(multi_artists_list)]
display(df_multi_artists)
num_samples = df_multi_artists.shape[0]
print('number of samples: ')
print(num_samples)
multi_genre_popularity = df_multi_artists['Pop.'].tolist()
print('multi_genre popularity MEAN: ', stat.mean(multi_genre_popularity))
# randomly select 2497 samples from single-genre artist datapoints.
num_dp = multi_artists.shape[0]
one_genre_artists = one['Artist'].tolist()
print(len(one_genre_artists))
df_singleArtists = extra_credit[extra_credit['Artist'].isin(one_genre_artists)]
display(df_singleArtists)
# randomly select samples from this df.
single_genre_popularity_all = df_singleArtists['Pop.'].tolist()
single_genre_popularity = random.sample(single_genre_popularity_all, num_samples)
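# note: random.sample is unseeded here, so the exact draw (and the t-test result below) can vary between runs.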
print('number of samples: ')
print(len(single_genre_popularity))
print('single_genre popularity MEAN: ', stat.mean(single_genre_popularity))
# let's run a t-test to see if these differences are significant.
# running a WELCH'S T-TEST.
t_stat, t_pvalue = scipy.stats.ttest_ind(multi_genre_popularity, single_genre_popularity, equal_var=False)
print('t statistic: ', t_stat)
print('t-test p-value: ', t_pvalue)
'''
t statistic: 22.743030380040615
t-test p-value: 5.030839473407017e-109
'''
# interpretation: we ran an independent samples t-test under the assumption that
# population variances were not equal between samples (Welch's Test).
# We obtained an extremely significant p-value, and can reject the null hypothesis that
# the means of the two populations are equivalent.