import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re
# Load the Spotify 2010-2019 top-100 CSV.
df = pd.read_csv('/work/datasets/Spotify 2010 - 2019 Top 100.csv')
df  # notebook-style display; has no effect when run as a plain script

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

# Per-column count of missing values.
total_nan_values = df.isna().sum()
print("Total Number of NaN values:")
print(total_nan_values)

# Drop every row that contains at least one missing value.
df = df.dropna(axis=0)
df

# Number of songs per genre, most frequent first (display only).
df.groupby('top genre')['top genre'].count().sort_values(ascending=False)

# Genre/year pairs; 'top year' is renamed so the column name has no space.
genres_year = df[['top genre', 'top year']]
genres_year = genres_year.rename(columns={'top year': 'top_year'})
def cont_gen(ds, years=range(2010, 2020), top_n=3):
    """Tabulate the most frequent genres of each year.

    Args:
        ds: DataFrame with a 'top genre' column and a 'top_year' column.
        years: Iterable of year values matched against 'top_year'.
            Defaults to 2010 through 2019.
        top_n: Number of most frequent genres kept per year.

    Returns:
        DataFrame with one row per requested year (row label is the year as
        a string) and one column per genre that reached any year's top.
        Cells hold the song count, or NaN where the genre was not in that
        year's top. An empty DataFrame is returned when `years` is empty.
    """
    rows = []
    for year in years:
        counts = ds.loc[ds['top_year'] == year, 'top genre'].value_counts()
        # value_counts() is sorted descending, so head() keeps the top genres.
        rows.append(counts.head(top_n).rename(str(year)))
    # DataFrame.append was removed in pandas 2.0. Building the frame once from
    # the named Series gives the same layout (one row per Series name) and
    # avoids re-copying the frame on every iteration.
    return pd.DataFrame(rows)
# Top genres per year, drawn as a stacked bar chart.
genres_years = cont_gen(genres_year)
genres_years  # notebook-style display; has no effect in a plain script
f1 = plt.figure()
colors = ['tomato','lightseagreen', 'Aqua', 'Firebrick', 'DarkViolet', 'Gainsboro', 'Olive', 'Gold', 'Pink', 'Black', 'Blue', 'Red']
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so pick whichever name this install provides.
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)
genres_years.plot(kind='bar', ax=f1.gca(), stacked=True, figsize=(10, 9), color=colors, ylabel='Number of songs in the top', xlabel='Years', title='Most popular genres of the decade')
# Place the legend outside the axes, to the right of the bars.
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()

# Songs whose release year is before 2010 (display only).
artist = df.query('`year released` < 2010')
artist
# Of those, the ones whose top year is not 2010 (result is not stored).
artist.query('`top year` != 2010')
# Number of songs per artist, most frequent first (display only).
df.groupby('artist')['artist'].count().sort_values(ascending=False)

# Artist/year pairs; 'top year' is renamed so the column name has no space.
artist_decade = df[['artist', 'top year']]
artist_decade = artist_decade.rename(columns={'top year': 'top_year'})
def cont_art(ds, years=range(2010, 2020), top_n=3):
    """Tabulate the most frequent artists of each year.

    Args:
        ds: DataFrame with an 'artist' column and a 'top_year' column.
        years: Iterable of year values matched against 'top_year'.
            Defaults to 2010 through 2019.
        top_n: Number of most frequent artists kept per year.

    Returns:
        DataFrame with one row per requested year (row label is the year as
        a string) and one column per artist that reached any year's top.
        Cells hold the song count, or NaN where the artist was not in that
        year's top. An empty DataFrame is returned when `years` is empty.
    """
    rows = []
    for year in years:
        counts = ds.loc[ds['top_year'] == year, 'artist'].value_counts()
        # value_counts() is sorted descending, so head() keeps the top artists.
        rows.append(counts.head(top_n).rename(str(year)))
    # DataFrame.append was removed in pandas 2.0. Building the frame once from
    # the named Series gives the same layout (one row per Series name) and
    # avoids re-copying the frame on every iteration.
    return pd.DataFrame(rows)
# Top artists per year, drawn as a stacked bar chart.
artist_decades = cont_art(artist_decade)
artist_decades  # notebook-style display; has no effect in a plain script
f2 = plt.figure()
colors = ['tomato','lightseagreen', 'Aqua', 'Firebrick', 'DarkViolet', 'Gainsboro', 'Olive', 'Gold', 'Pink', 'Black', 'Blue', 'Red', 'Goldenrod', 'mistyrose', 'tan', 'azure', 'gray', 'yellow', 'white', 'greenyellow', 'rosybrown', 'royalblue', 'bisque']
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so pick whichever name this install provides.
plt.style.use(
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)
artist_decades.plot(kind='bar', ax=f2.gca(), stacked=True, figsize=(10, 9), color=colors, title='Most popular artists of the decade', ylabel='Number of songs in the top', xlabel='Years')
# Place the legend outside the axes, to the right of the bars.
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()

# 'val'/year pairs; 'top year' is renamed so the column name has no space.
sad = df[['val', 'top year']]
sad = sad.rename(columns={'top year': 'top_year'})
def cont_sad(ds, years=range(2010, 2020)):
    """Average the leading column of `ds` separately for each year.

    Args:
        ds: DataFrame whose columns are numeric and which has a 'top_year'
            column. The value column (e.g. 'val') is expected first.
        years: Iterable of year values matched against 'top_year'.
            Defaults to 2010 through 2019.

    Returns:
        dict mapping each requested year to the first entry of the
        column-wise mean of that year's rows (NaN when the year has no rows).
    """
    means = {}
    for year in years:
        # Filter the frame passed in; the previous version ignored `ds` and
        # always read the module-level `sad` frame.
        yearly = ds[ds['top_year'] == year]
        # .iloc[0] selects by position explicitly; a bare [0] on a
        # label-indexed Series is deprecated in recent pandas.
        means[year] = yearly.mean().iloc[0]
    return means
# Yearly 'val' averages as a two-column frame (year, prom_sad).
sad1 = pd.DataFrame(list(cont_sad(sad).items()), columns=['year', 'prom_sad'])
sad1  # notebook-style display; has no effect in a plain script

# Line chart of the yearly average.
sns.set_theme(style="darkgrid")
f3 = sns.lineplot(x="year", y="prom_sad", data=sad1)
f3.set(
    title='Graph of the change in the mood of the songs in the decade 2010-2019',
    ylabel='Positivity of the songs',
    xlabel='Years',
)
plt.show()

# 'nrgy'/year pairs with the year column renamed to 'top_year'.
nrgy = df[['nrgy', 'top year']].rename(columns={'top year': 'top_year'})
def cont_nrgy(ds, years=range(2010, 2020)):
    """Average the leading column of `ds` separately for each year.

    Args:
        ds: DataFrame whose columns are numeric and which has a 'top_year'
            column. The value column (e.g. 'nrgy') is expected first.
        years: Iterable of year values matched against 'top_year'.
            Defaults to 2010 through 2019.

    Returns:
        dict mapping each requested year to the first entry of the
        column-wise mean of that year's rows (NaN when the year has no rows).
    """
    means = {}
    for year in years:
        # Filter the frame passed in; the previous version ignored `ds` and
        # always read the module-level `nrgy` frame.
        yearly = ds[ds['top_year'] == year]
        # .iloc[0] selects by position explicitly; a bare [0] on a
        # label-indexed Series is deprecated in recent pandas.
        means[year] = yearly.mean().iloc[0]
    return means
# Yearly 'nrgy' averages as a two-column frame (year, prom_energy).
energy = pd.DataFrame(list(cont_nrgy(nrgy).items()), columns=['year', 'prom_energy'])
energy  # notebook-style display; has no effect in a plain script

# Line chart of the yearly average.
sns.set_theme(style="darkgrid")
f4 = sns.lineplot(x="year", y="prom_energy", data=energy)
f4.set(
    title='Graph of the change of energy of the songs in the decade 2010-2019',
    ylabel='Energy of the songs',
    xlabel='Years',
)
plt.show()

# 'dnce'/year pairs with the year column renamed to 'top_year'.
dnce = df[['dnce', 'top year']].rename(columns={'top year': 'top_year'})
def cont_dnce(ds, years=range(2010, 2020)):
    """Average the leading column of `ds` separately for each year.

    Args:
        ds: DataFrame whose columns are numeric and which has a 'top_year'
            column. The value column (e.g. 'dnce') is expected first.
        years: Iterable of year values matched against 'top_year'.
            Defaults to 2010 through 2019.

    Returns:
        dict mapping each requested year to the first entry of the
        column-wise mean of that year's rows (NaN when the year has no rows).
    """
    means = {}
    for year in years:
        # Filter the frame passed in; the previous version ignored `ds` and
        # always read the module-level `dnce` frame.
        yearly = ds[ds['top_year'] == year]
        # .iloc[0] selects by position explicitly; a bare [0] on a
        # label-indexed Series is deprecated in recent pandas.
        means[year] = yearly.mean().iloc[0]
    return means
# Yearly 'dnce' averages as a two-column frame (year, prom_dance).
dance = pd.DataFrame(list(cont_dnce(dnce).items()), columns=['year', 'prom_dance'])
dance  # notebook-style display; has no effect in a plain script

# Line chart of the yearly average.
sns.set_theme(style="darkgrid")
f5 = sns.lineplot(x="year", y="prom_dance", data=dance)
f5.set(
    title='Graph of the change of danceability of the songs in the decade 2010-2019',
    ylabel='Danceability of the songs',
    xlabel='Years',
)
plt.show()
# Concatenate every song title into one space-separated string.
text = " ".join(df['title'])
# Split on brackets and keep every other piece, which drops the text found
# between a bracket pair (assumes brackets are balanced and not nested).
# The pattern is a raw string: "\(" in a normal string literal is an invalid
# escape sequence that newer Python versions warn about.
text = "".join(re.split(r"\(|\)|\[|\]", text)[::2])
# Remove a few words that would otherwise show up in the cloud.
text = text.replace('Edit ', '')
text = text.replace('Radio ', '')
text = text.replace('Remix', '')
word_cloud = WordCloud(collocations=False, background_color='white').generate(text)
plt.imshow(word_cloud, interpolation='bilinear')
plt.title('Map of the most common words in song titles')
plt.axis("off")
plt.show()
# Overlay the three yearly averages on a single axes.
sns.set_theme(style="darkgrid")
# Each line gets a label: without one the three curves cannot be told apart
# and the shared y-axis carried a single series' column name.
f5 = sns.lineplot(data=dance, x="year", y="prom_dance", label='Danceability')
f4 = sns.lineplot(data=energy, x="year", y="prom_energy", label='Energy')
f3 = sns.lineplot(data=sad1, x="year", y="prom_sad", label='Positivity')
f5.set_xlabel('Years')
f5.set_ylabel('Yearly average')
plt.legend()
plt.show()
# Songs whose top year is after 2016.
df1 = df.rename(columns={'top year': 'top_year'})
df2 = df1[df1.top_year > 2016]
df2  # notebook-style display; has no effect in a plain script
# Keep only songs with 'dnce' above 66 and renumber rows from 0.
df2 = df2[df2.dnce > 66]
df2 = df2.reset_index()
df2['dnce'].sort_values(ascending=False)
# NOTE(review): these row positions are hard-coded, presumably read off the
# sort above; they raise IndexError if the filtered frame has fewer rows and
# point at different songs if the data changes. Confirm the intent.
df2.iloc[[106, 153, 37, 5, 46]]

# Correlation heatmap between 'dnce', 'nrgy' and 'val'.
expr = df[['dnce', 'nrgy', 'val']]
sns.heatmap(expr.corr(), annot=True)
# Every other figure in this script ends with plt.show(); without it the
# heatmap is never displayed when run outside a notebook.
plt.show()