# import libraries
import matplotlib.pyplot as plt # data visualization doc: https://matplotlib.org/2.0.2/api/pyplot_api.html
import pandas as pd # data science essentials doc: https://pandas.pydata.org/docs/
import seaborn as sns # enhanced data visualization doc: https://seaborn.pydata.org/
import numpy as np # numpy library for math functions and arrays doc: https://numpy.org/doc/
from sklearn.model_selection import train_test_split # train-test split doc: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
from sklearn.metrics import make_scorer, accuracy_score # metrics doc: https://scikit-learn.org/stable/modules/model_evaluation.html
from sklearn.preprocessing import StandardScaler # standard scaler doc: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
from tpot import TPOTClassifier # doc: http://epistasislab.github.io/tpot/
import datetime # datetime doc: https://docs.python.org/3/library/datetime.html
from scipy.stats import boxcox # Box-Cox transformation doc: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.boxcox.html
# read train dataset
train = pd.read_csv('/work/train.csv')
train = train.set_index('id')
# read test dataset
test = pd.read_csv('/work/test.csv')
test = test.set_index('id')
# create a column to identify if train or test
train['is_train'] = True
test['is_train'] = False
# merge the dataset test and train into all_data
all_data = pd.concat([train,test], axis=0)
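# sanity check (optional addition): the id index should remain unique after the concat,
# otherwise the boolean masks and merges used later could misbehave
assert all_data.index.is_unique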
                track                artist
id
20953    Tender Lover              Babyface
30386     Seven Years                Saosin
39282    Boy With Luv  BTS Featuring Halsey
37222   It's All Gone             The Black
24866  Beng Beng Beng             Femi Kuti
                            track       artist
id
26896  I Didn't Want To Need You        Heart
24215        LOST INTO THE NIGHT        ELISA
21493          Janie's Got A Gun    Aerosmith
30984     Fallout (Of Our Being)       Nausea
5918           Ain't Nobody Home  Howard Tate
<class 'pandas.core.frame.DataFrame'>
Int64Index: 20553 entries, 20953 to 5918
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   track             20553 non-null  object
 1   artist            20553 non-null  object
 2   uri               20553 non-null  object
 3   danceability      20553 non-null  float64
 4   energy            20553 non-null  float64
 5   key               20553 non-null  int64
 6   loudness          20553 non-null  float64
 7   mode              20553 non-null  int64
 8   speechiness       20553 non-null  float64
 9   acousticness      20553 non-null  float64
 10  instrumentalness  20553 non-null  float64
 11  liveness          20553 non-null  float64
 12  valence           20553 non-null  float64
 13  tempo             20553 non-null  float64
 14  duration_ms       20553 non-null  int64
 15  time_signature    20553 non-null  int64
 16  chorus_hit        20553 non-null  float64
 17  sections          20553 non-null  int64
 18  Hit_or_Flop       20553 non-null  int64
 19  Era               20553 non-null  object
 20  is_train          20553 non-null  bool
dtypes: bool(1), float64(10), int64(6), object(4)
memory usage: 3.3+ MB
       danceability        energy
count  20553.000000  20553.000000
mean       0.541097      0.578579
std        0.177122      0.253391
min        0.000000      0.000276
25%        0.422000      0.394000
50%        0.553000      0.601000
75%        0.670000      0.786000
max        0.988000      0.999000
   track  artist
0      0       0
corr_matrix = train.corr(method='pearson', min_periods=30).round(decimals=2)
# specifying plot size (making it bigger)
fig, ax = plt.subplots(figsize=(10, 10))
# drawing a freezing cold heatmap
sns.heatmap(data=corr_matrix,    # the correlation matrix
            cmap='Blues',        # cool blue color palette
            square=True,         # keep the cells square
            annot=True,          # write the correlation values in each cell
            linecolor='black',   # lines between cells
            linewidths=0.5)      # thickness of those lines
# title and displaying the plot
plt.title(label="""
Linear Correlation Heatmap
""")
# rendering the visualization
plt.show()
# per-era mean and standard deviation of the audio features, computed on the hit songs in train
era_stats = train.loc[train['Hit_or_Flop'] == 1].groupby('Era')[
    ['danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness',
     'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
     'chorus_hit', 'sections']].agg(['mean', 'std'])

def is_hit_song(row):
    """Return True if every audio feature of the row lies within 2.5 standard
    deviations of the mean of its era's hit songs."""
    era = row['Era']
    features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                'instrumentalness', 'liveness', 'valence', 'tempo']
    for feature in features:
        mean = era_stats.loc[era, (feature, 'mean')]
        std = era_stats.loc[era, (feature, 'std')] * 2.5
        if not (mean - std <= row[feature] <= mean + std):
            return False
    return True

# flag songs whose feature profile matches the hit profile of their era
all_data['adapted'] = all_data.apply(is_hit_song, axis=1).astype(int)
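# (optional sketch, not part of the original run) a vectorized equivalent of the
# row-wise apply above: broadcast each era's hit means/stds onto every row and
# check all features at once; feat_cols repeats the feature list used in is_hit_song
feat_cols = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
             'instrumentalness', 'liveness', 'valence', 'tempo']
era_means = era_stats.xs('mean', axis=1, level=1)[feat_cols]
era_stds = era_stats.xs('std', axis=1, level=1)[feat_cols]
m = era_means.reindex(all_data['Era']).to_numpy()
s = era_stds.reindex(all_data['Era']).to_numpy() * 2.5
vals = all_data[feat_cols].to_numpy()
adapted_vectorized = ((vals >= m - s) & (vals <= m + s)).all(axis=1).astype(int)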
# calculate the sum of hits for each artist
sum_hit = train[train['Hit_or_Flop'] == 1].groupby('artist')['Hit_or_Flop'].sum().reset_index()
# calculate the total number of songs for each artist
songs_live = train.groupby('artist')['track'].count().reset_index()
# merge the two dataframes
artist_stats = sum_hit.merge(songs_live, on='artist')
# calculate the hit ratio and add it as a new column
artist_stats['hit_ratio'] = artist_stats['Hit_or_Flop'] / artist_stats['track']
# merge the hit_ratio column back into the original dataframe
all_data = all_data.reset_index().merge(artist_stats[['artist', 'hit_ratio']], on='artist', how='left').set_index('id')
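# Note (added): the left merge above leaves hit_ratio as NaN for any artist with no
# hit in the training data (including artists that only appear in the test set);
# those NaNs are presumably what triggers TPOT's "Imputing missing values in
# feature set" messages further down.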
# Add a new column 'has_a_feature' to the all_data DataFrame that indicates if the artist's name
# contains the words 'featuring' or 'ft' (case-insensitive). This column will contain 1 if the artist
# has a feature and 0 if not.
all_data['has_a_feature'] = all_data['artist'].str.contains('featuring|ft', case=False, regex=True).astype(int)
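# Caveat (added): the pattern 'featuring|ft' matches 'ft' anywhere in the name
# (e.g. 'Taylor Swift' would be flagged); a stricter variant would use word
# boundaries, e.g. r'\b(?:featuring|ft)\b', if the feature were rebuilt.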
# Count the number of hit songs per artist and keep, in 'popular_artists', the names
# of artists with more than one hit song.
popular_artists = all_data[all_data['Hit_or_Flop'] == 1]['artist'].value_counts()
popular_artists = popular_artists[popular_artists > 1].index.tolist()
# Add a new column 'is_popular_artist' to the all_data DataFrame that indicates if the artist has more than
# one hit song. This column will contain 1 if the artist is popular and 0 if not.
all_data['is_popular_artist'] = all_data['artist'].isin(popular_artists).astype(int)
####################### SKEWNESS ############################
# set a threshold for skewness
skew_threshold = 0.5
# calculate the skewness of each column
skewness = all_data.skew()
# create a list of column names where the absolute value of skewness is greater than the threshold
skewed_columns = list(skewness[abs(skewness) > skew_threshold].index)
# columns to exclude from the log transformation below
to_remove = ["mode", "time_signature", "adapted", "Hit_or_Flop", "has_a_feature", "is_popular_artist", "sections", "loudness"]
skewed_columns = [elem for elem in skewed_columns if elem not in to_remove]
# create a grid of subplots
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(20, 40))
# plot each histogram on a separate subplot
for ax, col in zip(axes.flat, skewed_columns):
    sns.histplot(data=all_data, x=col, kde=False, ax=ax)
    ax.set_title(f"Histogram of {col}")
# adjust spacing between subplots
plt.subplots_adjust(wspace=0.5)
# display the plot
plt.show()
# for loop for the skewed columns
for col in skewed_columns:
    # log-transform and store in a new column (the small offset avoids log(0))
    if col in all_data.columns:
        all_data['log_' + col] = np.log(all_data[col] + 0.001)
# drop transformed variables
all_data = all_data.drop(skewed_columns, axis = 1)
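# (note, not applied here) np.log1p(x) computes log(1 + x) and handles exact zeros
# directly, so it is a common alternative to the manual +0.001 offset used above:
# all_data['log_' + col] = np.log1p(all_data[col])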
# add a constant to loudness variable to make all values positive
all_data['loudness_shifted'] = all_data['loudness'] - all_data['loudness'].min() + 1
# apply the Box-Cox transformation to the shifted 'loudness' variable
all_data['loudness_transformed'], lambda_ = boxcox(all_data['loudness_shifted'])
# print the optimal lambda value for transformation
print(f"Optimal lambda value: {lambda_}")
# drop the original 'loudness' column and the shifted column
all_data = all_data.drop(['loudness', 'loudness_shifted'], axis=1)
Optimal lambda value: 3.776049741716768
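# (optional sketch, with hypothetical values) because the transform was fitted on the
# combined train/test frame, any genuinely new loudness values would need the same
# shift and the same fitted lambda rather than a re-estimated one; scipy.special
# provides the inverse transform
from scipy.special import inv_boxcox

new_shifted = np.array([5.2, 12.7, 20.1])             # hypothetical shifted loudness values
new_transformed = boxcox(new_shifted, lmbda=lambda_)  # reuse the lambda printed above
recovered = inv_boxcox(new_transformed, lambda_)      # back to the shifted scale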
# ONE HOT ENCODING
# the variables Era and time_signature need to be one-hot encoded
for col in ['Era', 'time_signature']:
    # create a df with the one-hot encoded variables
    one_hot_encoded_features = pd.get_dummies(all_data[col])
    # note: this doubles each category name (e.g. '00s' becomes '00s00s')
    one_hot_encoded_features.columns = [c + c for c in one_hot_encoded_features.columns]
    # combine the one-hot encoding with the full dataset
    all_data = pd.concat([all_data, one_hot_encoded_features], axis=1)
    # drop the original column, which is no longer needed
    all_data = all_data.drop(col, axis=1)
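# (alternative sketch, not what was run above) prefixing the dummies with the source
# column name keeps every column name a string, yields readable names such as
# 'Era_00s' or 'time_signature_4', and avoids the sklearn FutureWarning about
# mixed int/str feature names seen in the TPOT output below
def one_hot_with_prefix(df, cols):
    for c in cols:
        dummies = pd.get_dummies(df[c].astype(str), prefix=c)
        df = pd.concat([df.drop(columns=c), dummies], axis=1)
    return df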
# separate the full dataset into test and train
train = all_data[all_data.is_train==True]
test = all_data[all_data.is_train==False]
corr_matrix = train.corr(method='pearson', min_periods=30).round(decimals=2)
corr_matrix['Hit_or_Flop'].to_frame().T
danceability 0.35
energy 0.19
key 0.01
mode 0.08
acousticness -0.25
valence 0.25
tempo 0.04
sections -0.06
Hit_or_Flop 1.00
is_train NaN
adapted 0.43
has_a_feature 0.21
is_popular_artist 0.76
log_speechiness -0.07
log_instrumentalness -0.44
log_liveness -0.06
log_duration_ms 0.02
log_chorus_hit 0.03
log_hit_ratio 0.80
loudness_transformed 0.26
00s00s -0.01
10s10s -0.00
60s60s -0.00
70s70s 0.01
80s80s 0.00
90s90s 0.00
0 -0.00
2 -0.06
6 -0.12
8 0.15
10 -0.07
Name: Hit_or_Flop, dtype: float64
# drop the test/train and categorical columns
train = train.drop(['is_train', 'track', 'artist', 'uri'], axis = 1)
test = test.drop(['is_train', 'track', 'artist', 'uri'], axis = 1)
# split the training data into training and validation sets, using seed 219 so the results are replicable
x_train, x_test, y_train, y_test = train_test_split(
    train.drop('Hit_or_Flop', axis=1),
    train['Hit_or_Flop'],
    test_size=0.25,
    random_state=219)
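# (optional variant, not used below) a stratified split preserves the hit/flop ratio
# in both folds; stratify is a standard train_test_split parameter and the suffixed
# variable names here are only illustrative
x_train_s, x_test_s, y_train_s, y_test_s = train_test_split(
    train.drop('Hit_or_Flop', axis=1),
    train['Hit_or_Flop'],
    test_size=0.25,
    random_state=219,
    stratify=train['Hit_or_Flop'])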
def tpot_accuracy(x_train, y_train, x_test, y_test, generations=100, population_size=100):
    # Define custom scoring function as accuracy
    acc_scorer = make_scorer(accuracy_score)
    # Define the TPOTClassifier with the custom scoring function
    tpot = TPOTClassifier(generations=generations,
                          population_size=population_size,
                          scoring=acc_scorer,
                          verbosity=2,
                          random_state=42,
                          n_jobs=-1)
    # Fit the TPOT classifier to the training data
    tpot.fit(x_train, y_train)
    # Calculate predictions on the held-out set
    y_pred = tpot.predict(x_test)
    # Calculate accuracy on the held-out set
    accuracy = accuracy_score(y_test, y_pred)
    return tpot, accuracy
tpot, accuracy = tpot_accuracy(x_train, np.ravel(y_train), x_test, np.ravel(y_test), generations=10, population_size=75)
Imputing missing values in feature set
/shared-libs/python3.9/py/lib/python3.9/site-packages/sklearn/utils/validation.py:1858: FutureWarning: Feature names only support names that are all strings. Got feature names with dtypes: ['int', 'str']. An error will be raised in 1.2.
warnings.warn(
Generation 1 - Current best internal CV score: 0.9146232621461646
Generation 2 - Current best internal CV score: 0.9165044834634595
Generation 3 - Current best internal CV score: 0.9172831354376211
Generation 4 - Current best internal CV score: 0.9172831354376211
Generation 5 - Current best internal CV score: 0.9172831354376211
Generation 6 - Current best internal CV score: 0.9172831354376211
Generation 7 - Current best internal CV score: 0.9172831354376211
Generation 8 - Current best internal CV score: 0.9172831354376211
Generation 9 - Current best internal CV score: 0.9179318542180297
Generation 10 - Current best internal CV score: 0.9179318542180297
Best pipeline: ExtraTreesClassifier(PolynomialFeatures(input_matrix, degree=2, include_bias=False, interaction_only=False), bootstrap=False, criterion=entropy, max_features=0.55, min_samples_leaf=1, min_samples_split=9, n_estimators=100)
Imputing missing values in feature set
# display the held-out accuracy of the best pipeline
accuracy
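# (added note) the fitted TPOT object can also export the winning pipeline as a
# standalone scikit-learn script; the filename here is just an example
tpot.export('tpot_best_pipeline.py')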
# best internal CV score per generation, copied from the TPOT log above
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
y = [0.9146232621461646, 0.9165044834634595, 0.9172831354376211, 0.9172831354376211,
0.9172831354376211, 0.9172831354376211, 0.9172831354376211, 0.9172831354376211,
0.9179318542180297, 0.9179318542180297]
# Set the style
sns.set_style('darkgrid')
# Create the plot
plt.plot(x, y)
# Add labels and title
plt.xlabel('Generation')
plt.ylabel('CV Score')
plt.title('TPOT best internal CV score by generation')
# Show the plot
plt.show()
# predict on the competition test set and build the submission file
y_prediction = tpot.predict(test.drop('Hit_or_Flop', axis=1))
submission = pd.DataFrame()
submission['Hit_or_Flop'] = y_prediction
submission['Id'] = test.index
submission = submission.set_index('Id')
submission.to_csv('submission_team_6_tpot.csv')
Imputing missing values in feature set