# import libraries
import matplotlib.pyplot as plt # data visualization doc: https://matplotlib.org/2.0.2/api/pyplot_api.html
import pandas as pd # data science essentials doc: https://pandas.pydata.org/docs/
import seaborn as sns # enhanced data visualization doc: https://seaborn.pydata.org/
import numpy as np # numpy library for math functions and arrays doc: https://numpy.org/doc/
from sklearn.model_selection import train_test_split # train-test split doc: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
from sklearn.metrics import make_scorer, accuracy_score # metrics doc: https://scikit-learn.org/stable/modules/model_evaluation.html
from sklearn.preprocessing import StandardScaler # standard scaler doc: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
from tpot import TPOTClassifier # doc: http://epistasislab.github.io/tpot/
import datetime # datetime doc: https://docs.python.org/3/library/datetime.html
from scipy.stats import boxcox # Box-Cox transformation doc: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.boxcox.html
# read train dataset
train = pd.read_csv('/work/train.csv')
train = train.set_index('id')
# read test dataset
test = pd.read_csv('/work/test.csv')
test = test.set_index('id')
# create a column to identify if train or test
train['is_train'] = True
test['is_train'] = False
# merge the dataset test and train into all_data
all_data = pd.concat([train,test], axis=0)
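# sanity check (optional addition): the id index should remain unique after the concat,
# otherwise the boolean masks and merges used later could misbehave
assert all_data.index.is_unique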
                track                artist
id
20953    Tender Lover              Babyface
30386     Seven Years                Saosin
39282    Boy With Luv  BTS Featuring Halsey
37222   It's All Gone             The Black
24866  Beng Beng Beng             Femi Kuti
                            track       artist
id
26896  I Didn't Want To Need You        Heart
24215        LOST INTO THE NIGHT        ELISA
21493          Janie's Got A Gun    Aerosmith
30984     Fallout (Of Our Being)       Nausea
5918           Ain't Nobody Home  Howard Tate
<class 'pandas.core.frame.DataFrame'>
Int64Index: 20553 entries, 20953 to 5918
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   track             20553 non-null  object
 1   artist            20553 non-null  object
 2   uri               20553 non-null  object
 3   danceability      20553 non-null  float64
 4   energy            20553 non-null  float64
 5   key               20553 non-null  int64
 6   loudness          20553 non-null  float64
 7   mode              20553 non-null  int64
 8   speechiness       20553 non-null  float64
 9   acousticness      20553 non-null  float64
 10  instrumentalness  20553 non-null  float64
 11  liveness          20553 non-null  float64
 12  valence           20553 non-null  float64
 13  tempo             20553 non-null  float64
 14  duration_ms       20553 non-null  int64
 15  time_signature    20553 non-null  int64
 16  chorus_hit        20553 non-null  float64
 17  sections          20553 non-null  int64
 18  Hit_or_Flop       20553 non-null  int64
 19  Era               20553 non-null  object
 20  is_train          20553 non-null  bool
dtypes: bool(1), float64(10), int64(6), object(4)
memory usage: 3.3+ MB
       danceability        energy
count  20553.000000  20553.000000
mean       0.541097      0.578579
std        0.177122      0.253391
min        0.000000      0.000276
25%        0.422000      0.394000
50%        0.553000      0.601000
75%        0.670000      0.786000
max        0.988000      0.999000
   track  artist
0      0       0
corr_matrix = train.corr(method='pearson', min_periods=30).round(decimals=2)
# specifying plot size (making it bigger)
fig, ax = plt.subplots(figsize=(10, 10))
# drawing a freezing cold heatmap
sns.heatmap(data=corr_matrix,    # the correlation matrix
            cmap='Blues',        # cool blue color palette
            square=True,         # keep the cells square
            annot=True,          # write the correlation values in each cell
            linecolor='black',   # lines between cells
            linewidths=0.5)      # thickness of those lines
# title and displaying the plot
plt.title(label="""
Linear Correlation Heatmap
""")
# rendering the visualization
plt.show()
# per-era mean and standard deviation of the audio features, computed on the hit songs in train
era_stats = train.loc[train['Hit_or_Flop'] == 1].groupby('Era')[
    ['danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness',
     'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
     'chorus_hit', 'sections']].agg(['mean', 'std'])

def is_hit_song(row):
    """Return True if every audio feature of the row lies within 2.5 standard
    deviations of the mean of its era's hit songs."""
    era = row['Era']
    features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                'instrumentalness', 'liveness', 'valence', 'tempo']
    for feature in features:
        mean = era_stats.loc[era, (feature, 'mean')]
        std = era_stats.loc[era, (feature, 'std')] * 2.5
        if not (mean - std <= row[feature] <= mean + std):
            return False
    return True

# flag songs whose feature profile matches the hit profile of their era
all_data['adapted'] = all_data.apply(is_hit_song, axis=1).astype(int)
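# (optional sketch, not part of the original run) a vectorized equivalent of the
# row-wise apply above: broadcast each era's hit means/stds onto every row and
# check all features at once; feat_cols repeats the feature list used in is_hit_song
feat_cols = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
             'instrumentalness', 'liveness', 'valence', 'tempo']
era_means = era_stats.xs('mean', axis=1, level=1)[feat_cols]
era_stds = era_stats.xs('std', axis=1, level=1)[feat_cols]
m = era_means.reindex(all_data['Era']).to_numpy()
s = era_stds.reindex(all_data['Era']).to_numpy() * 2.5
vals = all_data[feat_cols].to_numpy()
adapted_vectorized = ((vals >= m - s) & (vals <= m + s)).all(axis=1).astype(int)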
# calculate the sum of hits for each artist
sum_hit = train[train['Hit_or_Flop'] == 1].groupby('artist')['Hit_or_Flop'].sum().reset_index()
# calculate the total number of songs for each artist
songs_live = train.groupby('artist')['track'].count().reset_index()
# merge the two dataframes
artist_stats = sum_hit.merge(songs_live, on='artist')
# calculate the hit ratio and add it as a new column
artist_stats['hit_ratio'] = artist_stats['Hit_or_Flop'] / artist_stats['track']
# merge the hit_ratio column back into the original dataframe
all_data = all_data.reset_index().merge(artist_stats[['artist', 'hit_ratio']], on='artist', how='left').set_index('id')
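# Note (added): the left merge above leaves hit_ratio as NaN for any artist with no
# hit in the training data (including artists that only appear in the test set);
# those NaNs are presumably what triggers TPOT's "Imputing missing values in
# feature set" messages further down.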
# Add a new column 'has_a_feature' to the all_data DataFrame that indicates if the artist's name
# contains the words 'featuring' or 'ft' (case-insensitive). This column will contain 1 if the artist
# has a feature and 0 if not.
all_data['has_a_feature'] = all_data['artist'].str.contains('featuring|ft', case=False, regex=True).astype(int)
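# Caveat (added): the pattern 'featuring|ft' matches 'ft' anywhere in the name
# (e.g. 'Taylor Swift' would be flagged); a stricter variant would use word
# boundaries, e.g. r'\b(?:featuring|ft)\b', if the feature were rebuilt.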
# Count the number of hit songs per artist and keep, in 'popular_artists', the names
# of artists with more than one hit song.
popular_artists = all_data[all_data['Hit_or_Flop'] == 1]['artist'].value_counts()
popular_artists = popular_artists[popular_artists > 1].index.tolist()
# Add a new column 'is_popular_artist' to the all_data DataFrame that indicates if the artist has more than
# one hit song. This column will contain 1 if the artist is popular and 0 if not.
all_data['is_popular_artist'] = all_data['artist'].isin(popular_artists).astype(int)
####################### SKEWNESS ############################
# set a threshold for skewness
skew_threshold = 0.5
# calculate the skewness of each column
skewness = all_data.skew()
# create a list of column names where the absolute value of skewness is greater than the threshold
skewed_columns = list(skewness[abs(skewness) > skew_threshold].index)
# columns to exclude from the log transformation below
to_remove = ["mode", "time_signature", "adapted", "Hit_or_Flop", "has_a_feature", "is_popular_artist", "sections", "loudness"]
skewed_columns = [elem for elem in skewed_columns if elem not in to_remove]
# create a grid of subplots
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(20, 40))
# plot each histogram on a separate subplot
for ax, col in zip(axes.flat, skewed_columns):
    sns.histplot(data=all_data, x=col, kde=False, ax=ax)
    ax.set_title(f"Histogram of {col}")
# adjust spacing between subplots
plt.subplots_adjust(wspace=0.5)
# display the plot
plt.show()
# for loop for the skewed columns
for col in skewed_columns:
    # log-transform and store in a new column (the small offset avoids log(0))
    if col in all_data.columns:
        all_data['log_' + col] = np.log(all_data[col] + 0.001)
# drop transformed variables
all_data = all_data.drop(skewed_columns, axis = 1)
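# (note, not applied here) np.log1p(x) computes log(1 + x) and handles exact zeros
# directly, so it is a common alternative to the manual +0.001 offset used above:
# all_data['log_' + col] = np.log1p(all_data[col])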
# add a constant to loudness variable to make all values positive
all_data['loudness_shifted'] = all_data['loudness'] - all_data['loudness'].min() + 1
# apply the Box-Cox transformation to the shifted 'loudness' variable
all_data['loudness_transformed'], lambda_ = boxcox(all_data['loudness_shifted'])
# print the optimal lambda value for transformation
print(f"Optimal lambda value: {lambda_}")
# drop the original 'loudness' column and the shifted column
all_data = all_data.drop(['loudness', 'loudness_shifted'], axis=1)
Optimal lambda value: 3.776049741716768
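# (optional sketch, with hypothetical values) because the transform was fitted on the
# combined train/test frame, any genuinely new loudness values would need the same
# shift and the same fitted lambda rather than a re-estimated one; scipy.special
# provides the inverse transform
from scipy.special import inv_boxcox

new_shifted = np.array([5.2, 12.7, 20.1])             # hypothetical shifted loudness values
new_transformed = boxcox(new_shifted, lmbda=lambda_)  # reuse the lambda printed above
recovered = inv_boxcox(new_transformed, lambda_)      # back to the shifted scale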
# ONE HOT ENCODING
# the variables Era and time_signature need to be one-hot encoded
for col in ['Era', 'time_signature']:
    # create a df with the one-hot encoded variables
    one_hot_encoded_features = pd.get_dummies(all_data[col])
    # note: this doubles each category name (e.g. '00s' becomes '00s00s')
    one_hot_encoded_features.columns = [c + c for c in one_hot_encoded_features.columns]
    # combine the one-hot encoding with the full dataset
    all_data = pd.concat([all_data, one_hot_encoded_features], axis=1)
    # drop the original column, which is no longer needed
    all_data = all_data.drop(col, axis=1)
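# (alternative sketch, not what was run above) prefixing the dummies with the source
# column name keeps every column name a string, yields readable names such as
# 'Era_00s' or 'time_signature_4', and avoids the sklearn FutureWarning about
# mixed int/str feature names seen in the TPOT output below
def one_hot_with_prefix(df, cols):
    for c in cols:
        dummies = pd.get_dummies(df[c].astype(str), prefix=c)
        df = pd.concat([df.drop(columns=c), dummies], axis=1)
    return df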
# separate the full dataset into test and train
train = all_data[all_data.is_train==True]
test = all_data[all_data.is_train==False]
corr_matrix = train.corr(method='pearson', min_periods=30).round(decimals=2)
corr_matrix['Hit_or_Flop'].to_frame().T
danceability 0.35
energy 0.19
key 0.01
mode 0.08
acousticness -0.25
valence 0.25
tempo 0.04
sections -0.06
Hit_or_Flop 1.00
is_train NaN
adapted 0.43
has_a_feature 0.21
is_popular_artist 0.76
log_speechiness -0.07
log_instrumentalness -0.44
log_liveness -0.06
log_duration_ms 0.02
log_chorus_hit 0.03
log_hit_ratio 0.80
loudness_transformed 0.26
00s00s -0.01
10s10s -0.00
60s60s -0.00
70s70s 0.01
80s80s 0.00
90s90s 0.00
0 -0.00
2 -0.06
6 -0.12
8 0.15
10 -0.07
Name: Hit_or_Flop, dtype: float64
# drop the test/train and categorical columns
train = train.drop(['is_train', 'track', 'artist', 'uri'], axis = 1)
test = test.drop(['is_train', 'track', 'artist', 'uri'], axis = 1)
# split the training data into training and validation sets, using seed 219 so the results are replicable
x_train, x_test, y_train, y_test = train_test_split(
    train.drop('Hit_or_Flop', axis=1),
    train['Hit_or_Flop'],
    test_size=0.25,
    random_state=219)
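# (optional variant, not used below) a stratified split preserves the hit/flop ratio
# in both folds; stratify is a standard train_test_split parameter and the suffixed
# variable names here are only illustrative
x_train_s, x_test_s, y_train_s, y_test_s = train_test_split(
    train.drop('Hit_or_Flop', axis=1),
    train['Hit_or_Flop'],
    test_size=0.25,
    random_state=219,
    stratify=train['Hit_or_Flop'])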
def tpot_accuracy(x_train, y_train, x_test, y_test, generations=100, population_size=100):
    # Define custom scoring function as accuracy
    acc_scorer = make_scorer(accuracy_score)
    # Define the TPOTClassifier with the custom scoring function
    tpot = TPOTClassifier(generations=generations,
                          population_size=population_size,
                          scoring=acc_scorer,
                          verbosity=2,
                          random_state=42,
                          n_jobs=-1)
    # Fit the TPOT classifier to the training data
    tpot.fit(x_train, y_train)
    # Calculate predictions on the held-out set
    y_pred = tpot.predict(x_test)
    # Calculate accuracy on the held-out set
    accuracy = accuracy_score(y_test, y_pred)
    return tpot, accuracy
tpot, accuracy = tpot_accuracy(x_train, np.ravel(y_train), x_test, np.ravel(y_test), generations=10, population_size=75)
Imputing missing values in feature set
/shared-libs/python3.9/py/lib/python3.9/site-packages/sklearn/utils/validation.py:1858: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['int', 'str']. An error will be raised in 1.2.
warnings.warn(
Generation 1 - Current best internal CV score: 0.9146232621461646
Generation 2 - Current best internal CV score: 0.9165044834634595
Generation 3 - Current best internal CV score: 0.9172831354376211
Generation 4 - Current best internal CV score: 0.9172831354376211
Generation 5 - Current best internal CV score: 0.9172831354376211
Generation 6 - Current best internal CV score: 0.9172831354376211
Generation 7 - Current best internal CV score: 0.9172831354376211
Generation 8 - Current best internal CV score: 0.9172831354376211
Generation 9 - Current best internal CV score: 0.9179318542180297
Generation 10 - Current best internal CV score: 0.9179318542180297
Best pipeline: ExtraTreesClassifier(PolynomialFeatures(input_matrix, degree=2, include_bias=False, interaction_only=False), bootstrap=False, criterion=entropy, max_features=0.55, min_samples_leaf=1, min_samples_split=9, n_estimators=100)
Imputing missing values in feature set
# display the held-out accuracy of the best pipeline
accuracy
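# (added note) the fitted TPOT object can also export the winning pipeline as a
# standalone scikit-learn script; the filename here is just an example
tpot.export('tpot_best_pipeline.py')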
# best internal CV score per generation, copied from the TPOT log above
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
y = [0.9146232621461646, 0.9165044834634595, 0.9172831354376211, 0.9172831354376211,
0.9172831354376211, 0.9172831354376211, 0.9172831354376211, 0.9172831354376211,
0.9179318542180297, 0.9179318542180297]
# Set the style
sns.set_style('darkgrid')
# Create the plot
plt.plot(x, y)
# Add labels and title
plt.xlabel('Generation')
plt.ylabel('CV Score')
plt.title('TPOT best internal CV score by generation')
# Show the plot
plt.show()
# predict on the competition test set and build the submission file
y_prediction = tpot.predict(test.drop('Hit_or_Flop', axis=1))
submission = pd.DataFrame()
submission['Hit_or_Flop'] = y_prediction
submission['Id'] = test.index
submission = submission.set_index('Id')
submission.to_csv('submission_team_6_tpot.csv')
Imputing missing values in feature set