# import libraries
import matplotlib.pyplot as plt # data visualization doc: https://matplotlib.org/2.0.2/api/pyplot_api.html
import pandas as pd # data science essentials doc: https://pandas.pydata.org/docs/
import seaborn as sns # enhanced data visualization doc: https://seaborn.pydata.org/
import numpy as np # numpy library for math functions and arrays doc: https://numpy.org/doc/
from sklearn.model_selection import train_test_split # train-test split doc: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
from sklearn.metrics import make_scorer, accuracy_score # metrics doc: https://scikit-learn.org/stable/modules/model_evaluation.html
from sklearn.preprocessing import StandardScaler # standard scaler doc: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
from tpot import TPOTClassifier # doc: http://epistasislab.github.io/tpot/
import datetime # datetime doc: https://docs.python.org/3/library/datetime.html
from scipy.stats import boxcox
# Load the labelled training set and the test set, both keyed by song id.
# Each frame is tagged with an 'is_train' flag so the rows can be separated
# again after the shared feature engineering below.
train = pd.read_csv('/work/train.csv').set_index('id')
train['is_train'] = True

test = pd.read_csv('/work/test.csv').set_index('id')
test['is_train'] = False

# Stack train on top of test so every transformation is applied to both.
all_data = pd.concat([train, test], axis=0)
# Pairwise Pearson correlations between the numeric/boolean training columns,
# rounded to two decimals for readable annotations.
numeric_dtypes = ['float64', 'int64', 'bool']
train_numeric = train.select_dtypes(include=numeric_dtypes)
corr_matrix = train_numeric.corr(method='pearson', min_periods=30).round(decimals=2)

# A large square canvas keeps the annotated cells legible.
fig, ax = plt.subplots(figsize=(10, 10))

# Blue colour scale, square cells, correlation value printed in each cell,
# thin black separators between cells.
heatmap_style = {
    'cmap': 'Blues',
    'square': True,
    'annot': True,
    'linecolor': 'black',
    'linewidths': 0.5,
}
sns.heatmap(data=corr_matrix, **heatmap_style)

# The surrounding newlines pad the title away from the plot.
plt.title(label="\nLinear Correlation Heatmap\n")
plt.show()
# Per-era mean and standard deviation of each audio feature, computed over the
# hit songs (Hit_or_Flop == 1) of the training set.
# The hit mask is built from `train` itself: a mask taken from `all_data` also
# carries the test ids, so it does not line up with train's index.
era_feature_columns = ['danceability', 'energy', 'key', 'loudness', 'speechiness',
                       'acousticness', 'instrumentalness', 'liveness', 'valence',
                       'tempo', 'duration_ms', 'chorus_hit', 'sections']
train_hits = train.loc[train['Hit_or_Flop'] == 1]
era_stats = train_hits.groupby('Era')[era_feature_columns].agg(['mean', 'std'])
def is_hit_song(row, n_std=2.5):
    """Return True when a song's audio profile matches the hits of its era.

    Every checked feature of ``row`` must lie within ``n_std`` standard
    deviations of that feature's mean in the module-level ``era_stats``
    table (rows indexed by era, columns keyed by ``(feature, statistic)``).

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing ``'Era'``
            and the audio feature values.
        n_std: Half-width of the accepted band, in standard deviations.

    Returns:
        bool: False as soon as one feature falls outside its band, or when
        the song's era has no entry in ``era_stats``; True otherwise.
    """
    era = row['Era']
    # An era without reference statistics cannot be matched; answer False
    # instead of letting the lookup raise KeyError in the middle of an apply.
    if era not in era_stats.index:
        return False
    features = ('danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                'instrumentalness', 'liveness', 'valence', 'tempo')
    for feature in features:
        mean = era_stats.loc[era, (feature, 'mean')]
        tolerance = era_stats.loc[era, (feature, 'std')] * n_std
        # A NaN mean or std makes both comparisons False, so the song is rejected.
        if not (mean - tolerance <= row[feature] <= mean + tolerance):
            return False
    return True
# Flag (1/0) each song, train and test alike, whose audio features match the
# hit profile of its era according to is_hit_song.
adapted_flags = all_data.apply(is_hit_song, axis=1)
all_data['adapted'] = adapted_flags.astype(int)
# Number of hit songs per artist in the training set (artists with at least one hit).
sum_hit = train[train['Hit_or_Flop'] == 1].groupby('artist')['Hit_or_Flop'].sum().reset_index()
# Total number of training songs per artist.
songs_live = train.groupby('artist')['track'].count().reset_index()
# Keep every artist seen in training: a right merge preserves artists that
# never had a hit, whose missing hit count is then filled with 0. An inner
# merge would drop them and leave their hit_ratio as NaN instead of 0.
artist_stats = sum_hit.merge(songs_live, on='artist', how='right')
artist_stats['Hit_or_Flop'] = artist_stats['Hit_or_Flop'].fillna(0)
# Share of the artist's training songs that were hits.
artist_stats['hit_ratio'] = artist_stats['Hit_or_Flop'] / artist_stats['track']
# Attach the ratio to every row of the combined frame. Artists that do not
# appear in the training set keep NaN because of the left merge.
# NOTE(review): for training rows the ratio includes the row's own label --
# confirm this target leakage is acceptable.
all_data = all_data.reset_index().merge(artist_stats[['artist', 'hit_ratio']], on='artist', how='left').set_index('id')
# Add a 'has_a_feature' column: 1 when the artist string contains the word
# 'featuring' or 'ft' (case-insensitive), 0 otherwise.
# The word boundaries stop names that merely contain the letters "ft"
# (e.g. "Swift", "Daft") from being counted as collaborations; "ft." still
# matches because the boundary sits before the period.
# na=False maps a missing artist to 0 instead of breaking the int conversion.
all_data['has_a_feature'] = all_data['artist'].str.contains(
    r'\b(?:featuring|ft)\b', case=False, regex=True, na=False).astype(int)
# Count hit songs per artist, then keep the names of the artists that have
# more than one hit.
hits_per_artist = all_data.loc[all_data['Hit_or_Flop'] == 1, 'artist'].value_counts()
popular_artists = hits_per_artist[hits_per_artist > 1].index.tolist()
# 'is_popular_artist' is 1 for songs by one of those artists and 0 otherwise.
is_popular = all_data['artist'].isin(popular_artists)
all_data['is_popular_artist'] = is_popular.astype(int)
####################### SKEWNESS ############################
# Columns whose absolute skewness exceeds this value are treated as skewed.
skew_threshold = 0.5
# Skewness is measured on the numeric/boolean columns of the training frame.
all_data_numeric = train.select_dtypes(include=['float64', 'int64', 'bool'])
skewness = all_data_numeric.skew()
exceeds_threshold = skewness.abs() > skew_threshold
# Columns that are never log-transformed even when skewed.
to_remove = ["mode", "time_signature", "adapted", "Hit_or_Flop", "has_a_feature", "is_popular_artist", "sections", "loudness"]
skewed_columns = [
    name for name in skewness[exceeds_threshold].index
    if name not in to_remove
]
# One histogram per skewed column, laid out two per row.
n_hist_cols = 2
# Ceiling division sizes the grid from the number of columns; a fixed 3x2
# grid would silently skip every column after the sixth because zip stops at
# the shorter sequence. At least one row is kept so an empty list still plots.
n_hist_rows = max(1, (len(skewed_columns) + n_hist_cols - 1) // n_hist_cols)
fig, axes = plt.subplots(
    nrows=n_hist_rows,
    ncols=n_hist_cols,
    figsize=(20, 40 * n_hist_rows / 3),  # same height per row as a 3-row, 40-inch figure
    squeeze=False,  # always a 2-D array of axes, even with a single row
)
hist_axes = axes.ravel()
for ax, col in zip(hist_axes, skewed_columns):
    sns.histplot(data=all_data, x=col, kde=False, ax=ax)
    ax.set_title(f"Histogram of {col}")
# hide grid cells that received no histogram
for unused_ax in hist_axes[len(skewed_columns):]:
    unused_ax.set_visible(False)
# widen the horizontal gap between the two columns of plots
plt.subplots_adjust(wspace=0.5)
plt.show()
# Log-transform the skewed columns that are present in all_data.
# The same filtered list drives both the transform and the drop, so a column
# that is skipped here cannot make the drop raise KeyError afterwards.
log_source_columns = [col for col in skewed_columns if col in all_data.columns]
for col in log_source_columns:
    # the 0.001 offset keeps zero-valued entries finite under the logarithm
    all_data['log_' + col] = np.log(all_data[col] + 0.001)
# drop the original versions of the transformed columns
all_data = all_data.drop(log_source_columns, axis=1)
# Box-Cox needs strictly positive input, so shift loudness until its minimum
# value becomes 1.
loudness_minimum = all_data['loudness'].min()
all_data['loudness_shifted'] = all_data['loudness'] - loudness_minimum + 1
# boxcox returns the transformed values together with the fitted lambda.
loudness_boxcox_values, lambda_ = boxcox(all_data['loudness_shifted'])
all_data['loudness_transformed'] = loudness_boxcox_values
print(f"Optimal lambda value: {lambda_}")
# Only the transformed column is kept; the raw and shifted versions are removed.
all_data = all_data.drop(columns=['loudness', 'loudness_shifted'])
# ONE HOT ENCODING
# 'Era' and 'time_signature' are categorical and are replaced by indicator columns.
for col in ['Era', 'time_signature']:
    # prefix=col names each indicator '<feature>_<category>' (e.g. 'Era_60s').
    # Renaming with `category + category` would double the label ('60s60s'),
    # turn an integer category such as 4 into the column name 8, and leave
    # non-string column names in the frame.
    one_hot_encoded_features = pd.get_dummies(all_data[col], prefix=col)
    # append the indicator columns to the full dataset
    all_data = pd.concat([all_data, one_hot_encoded_features], axis=1)
    # the original categorical column is no longer needed
    all_data = all_data.drop(col, axis=1)
# Split the engineered frame back into its training and test rows using the
# 'is_train' flag.
train = all_data[all_data['is_train'].eq(True)]
test = all_data[all_data['is_train'].eq(False)]
# Correlation of every numeric/boolean engineered column with the target,
# shown as a single-row frame.
train_numeric = train.select_dtypes(include=['float64', 'int64', 'bool'])
corr_matrix = train_numeric.corr(method='pearson', min_periods=30).round(decimals=2)
corr_matrix[['Hit_or_Flop']].T
# Remove the split flag and the text columns before modelling.
non_model_columns = ['is_train', 'track', 'artist', 'uri']
train = train.drop(columns=non_model_columns)
test = test.drop(columns=non_model_columns)
# Hold out 25% of the labelled rows for validation; the fixed seed (219) makes
# the split reproducible.
model_inputs = train.drop(columns='Hit_or_Flop')
model_target = train['Hit_or_Flop']
x_train, x_test, y_train, y_test = train_test_split(
    model_inputs,
    model_target,
    test_size=0.25,
    random_state=219,
)
def tpot_rmse(x_train, y_train, x_test, y_test, generations=100, population_size=100):
    """Fit a TPOT classifier and report its accuracy on a hold-out set.

    Despite the function name, no RMSE is computed: the pipeline search is
    scored with accuracy and the returned metric is accuracy as well.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Hold-out features.
        y_test: Hold-out labels.
        generations: Value passed to TPOTClassifier as ``generations``.
        population_size: Value passed to TPOTClassifier as ``population_size``.

    Returns:
        tuple: ``(tpot, accuracy)`` -- the fitted TPOTClassifier and the
        accuracy of its predictions on ``x_test`` against ``y_test``.
    """
    search_settings = {
        'generations': generations,
        'population_size': population_size,
        # accuracy wrapped as a scorer object for the pipeline search
        'scoring': make_scorer(accuracy_score),
        'verbosity': 2,
        'random_state': 42,
        'n_jobs': -1,
    }
    tpot = TPOTClassifier(**search_settings)
    tpot.fit(x_train, y_train)
    # score the fitted pipeline on the hold-out set
    accuracy = accuracy_score(y_test, tpot.predict(x_test))
    return tpot, accuracy
# Run the pipeline search (10 generations, population of 75) on the training
# split and score it on the hold-out split. Labels are flattened to 1-D arrays.
y_train_flat = np.ravel(y_train)
y_test_flat = np.ravel(y_test)
tpot, accuracy = tpot_rmse(
    x_train,
    y_train_flat,
    x_test,
    y_test_flat,
    generations=10,
    population_size=75,
)
accuracy
# CV score per generation; the values are hard-coded here rather than read
# from the fitted model.
x = list(range(1, 11))
y = [
    0.9146232621461646, 0.9165044834634595,
    0.9172831354376211, 0.9172831354376211,
    0.9172831354376211, 0.9172831354376211,
    0.9172831354376211, 0.9172831354376211,
    0.9179318542180297, 0.9179318542180297,
]
# Dark grid background for the line chart.
sns.set_style('darkgrid')
plt.plot(x, y)
# Axis labels and title.
plt.xlabel('Generation')
plt.ylabel('CV Score')
plt.title('Tpot')
plt.show()
# Predict the label of every test row with the fitted pipeline; the
# placeholder target column is removed from the features first.
y_prediction = tpot.predict(test.drop('Hit_or_Flop', axis=1))
submission = pd.DataFrame()
# Presumably the labels became floats when the unlabelled test rows were
# concatenated with the training rows, so predictions come back as 1.0/0.0.
# Casting writes clean integer labels (1/0) to the file -- TODO confirm the
# expected submission format.
submission['Hit_or_Flop'] = np.asarray(y_prediction).astype(int)
# The song ids become the 'Id' index column of the CSV.
submission['Id'] = test.index
submission = submission.set_index('Id')
submission.to_csv('submission_team_6_tpot.csv')