pip install seaborn
Collecting seaborn
Downloading seaborn-0.10.1-py3-none-any.whl (215 kB)
|████████████████████████████████| 215 kB 3.4 MB/s eta 0:00:01
Requirement already satisfied: pandas>=0.22.0 in /opt/venv/lib/python3.7/site-packages (from seaborn) (1.0.5)
Requirement already satisfied: matplotlib>=2.1.2 in /opt/venv/lib/python3.7/site-packages (from seaborn) (3.2.2)
Requirement already satisfied: numpy>=1.13.3 in /opt/venv/lib/python3.7/site-packages (from seaborn) (1.19.0)
Requirement already satisfied: scipy>=1.0.1 in /opt/venv/lib/python3.7/site-packages (from seaborn) (1.5.1)
Requirement already satisfied: python-dateutil>=2.6.1 in /opt/venv/lib/python3.7/site-packages (from pandas>=0.22.0->seaborn) (2.8.1)
Requirement already satisfied: pytz>=2017.2 in /opt/venv/lib/python3.7/site-packages (from pandas>=0.22.0->seaborn) (2020.1)
Requirement already satisfied: cycler>=0.10 in /opt/venv/lib/python3.7/site-packages (from matplotlib>=2.1.2->seaborn) (0.10.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /opt/venv/lib/python3.7/site-packages (from matplotlib>=2.1.2->seaborn) (1.2.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /opt/venv/lib/python3.7/site-packages (from matplotlib>=2.1.2->seaborn) (2.4.7)
Requirement already satisfied: six>=1.5 in /opt/venv/lib/python3.7/site-packages (from python-dateutil>=2.6.1->pandas>=0.22.0->seaborn) (1.15.0)
Installing collected packages: seaborn
Successfully installed seaborn-0.10.1
WARNING: You are using pip version 20.1.1; however, version 20.2.1 is available.
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
#necessary packages for querying data:
import sys
import lyricsgenius as genius
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import re
import json
#other packages for data preprocessing/analysis
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob import Word
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#plotting
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
import seaborn as sns
sns.set()
#sns.set_context("talk")
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
#Accessing Spotify's Web API through Spotipy, a Python wrapper around that API.
#Credentials are read from the environment so that no secret is stored in this
#file: export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before running.
#NOTE(review): earlier revisions of this file embedded a client id/secret in
#plain text; those values should be rotated in the Spotify developer dashboard.
import os

client_credentials_manager = SpotifyClientCredentials(
    client_id=os.environ.get('SPOTIPY_CLIENT_ID'),
    client_secret=os.environ.get('SPOTIPY_CLIENT_SECRET'),
)
#Client used for every Spotify request below.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
#Interactive dataset builder. It reads from stdin, so it cannot run on DataHub;
#run it locally from a terminal or an IDE.
import os

#Column order of spotify.csv: track metadata first, then Spotify audio features.
AUDIO_FEATURE_COLUMNS = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'time_signature',
]
SPOTIFY_COLUMNS = [
    'name',
    'duration_ms',
    'popularity',
    'num_markets',
    'album',
    'disc_number',
    'is_explicit',
    'track_number',
    'release_date',
    'artist',
] + AUDIO_FEATURE_COLUMNS


def _track_row(track_info, track_features):
    """Flatten one Spotify track object plus its audio features into a CSV row."""
    row = {
        'name': track_info['name'],
        'duration_ms': track_info['duration_ms'],
        'popularity': track_info['popularity'],
        'num_markets': len(track_info['available_markets']),
        'album': track_info['album']['name'],
        'disc_number': track_info['disc_number'],
        'is_explicit': track_info['explicit'],
        'track_number': track_info['track_number'],
        'release_date': track_info['album']['release_date'],
        'artist': track_info['artists'][0]['name'],
    }
    for column in AUDIO_FEATURE_COLUMNS:
        row[column] = track_features[column]
    return row


def _export_spotify_features(client, query):
    """Write spotify.json and spotify.csv for the first artist matching *query*.

    *client* is the Spotipy client. Returns the DataFrame that was written to
    spotify.csv, or None when the search returns no artist.
    """
    #Get search results (first artist hit only)
    search_results = client.search(q=query, limit=1, offset=0, type="artist")
    matches = search_results['artists']['items']
    if not matches:
        print("No artist found for '" + query + "'.")
        return None

    #Artist details
    artist = matches[0]
    artist_name = artist['name']
    print(artist_name)
    print(str(artist['followers']['total']) + " followers")
    if artist['genres']:  #the genre list can be empty
        print(artist['genres'][0])
    artist_id = artist['id']  #2h93pZq0e7k5yf4dywlkpM for Frank Ocean

    #Only the first page (up to 50 releases) is requested.
    albums = client.artist_albums(artist_id, country="US", limit=50)
    album_ids = [album['uri'] for album in albums['items']]
    print(artist_name + ' has ' + str(len(album_ids)) + ' albums (on spotify, might be singles)!')

    all_tracks = [client.album_tracks(album_id, limit=50) for album_id in album_ids]

    #Prints how many tracks each album has (count on the left, album name on the right)
    for tracks, album in zip(all_tracks, albums['items']):
        print(
            len(tracks['items']),
            "\t",
            album['name'])

    #One list of track URIs per album, in album order
    track_ids = [
        [track['uri'] for track in tracks['items']]
        for tracks in all_tracks
    ]
    #Full track objects and audio features, fetched one album at a time
    track_objects = [client.tracks(track_id_list) for track_id_list in track_ids]
    audio_feature_objects = [
        client.audio_features(track_id_list) for track_id_list in track_ids
    ]

    #Keep the raw API responses next to the flattened CSV
    spotify_data = {
        "audio_features": audio_feature_objects,
        "tracks": track_objects}
    with open('spotify.json', 'w') as outfile:
        json.dump(spotify_data, outfile)

    #Collect rows in a list and build the frame once: DataFrame.append inside a
    #loop copies the whole frame on every call and no longer exists in pandas 2.
    rows = []
    for album_info, album_features in zip(track_objects, audio_feature_objects):
        for track_info, track_features in zip(album_info['tracks'], album_features):
            #Entries can be None when Spotify has no data for a track
            if track_info is None or track_features is None:
                continue
            rows.append(_track_row(track_info, track_features))

    df = pd.DataFrame(rows, columns=SPOTIFY_COLUMNS)
    df.to_csv("spotify.csv", index=False)
    if not df.empty:
        print(df.iloc[0])
    return df


def _export_lyrics(query):
    """Write lyric.csv with every Genius song found for the artist *query*.

    The Genius API token is read from the GENIUS_ACCESS_TOKEN environment
    variable. Returns the DataFrame that was written, or None on failure.
    """
    token = os.environ.get('GENIUS_ACCESS_TOKEN')
    if not token:
        print("Set the GENIUS_ACCESS_TOKEN environment variable first.")
        return None

    #A recorded run with the library's 5 second read timeout ended in
    #"Read timed out" followed by a TypeError, so allow more time per request.
    api = genius.Genius(token, timeout=15)
    try:
        artist2 = api.search_artist(query)
    except TypeError as err:
        #NOTE(review): presumably raised inside lyricsgenius once a timed-out
        #request yields None - confirm against the installed version.
        print("Genius request failed (" + str(err) + "); please try again.")
        return None
    if artist2 is None:
        print("No artist found for '" + query + "'.")
        return None

    lyric_df = pd.DataFrame(
        [
            {
                'title': song.title,
                'lyrics': song.lyrics,
                'album': song.album,
                'year': song.year,
            }
            for song in artist2.songs
        ],
        columns=['title', 'album', 'year', 'lyrics'],
    )
    lyric_df.to_csv("lyric.csv", index=False)
    print()
    print("Lyrics saved!")
    return lyric_df


while True:
    print()
    print(">>> Welcome to Build Your Own Spotify/Lyrics Dataset!")
    print()
    print("0 - Create csv for an artist's songs/audio features")
    print("1 - Find lyrics for an artist")
    print("2 - Exit")
    print()
    choice = input("Your choice: ")

    if choice == "0":
        print()
        searchQuery = input("Ok, what's their name?: ")
        print()
        #`sp` is the Spotipy client created above (the name `spotify` was never defined)
        df = _export_spotify_features(sp, searchQuery)
    elif choice == "1":
        print()
        searchQuery2 = input("Extract Lyrics from which Artist? :")
        print()
        lyric_df = _export_lyrics(searchQuery2)
    elif choice == "2":
        break
>>> Welcome to Build Your Own Spotify/Lyrics Dataset!
0 - Create csv for an artist's songs/audio features
1 - Find lyrics for an artist
2 - Exit
Your choice: 1
Extract Lyrics from which Artist? :Frank Ocean
Searching for songs by Frank Ocean...
Song 1: "Thinkin Bout You"
Song 2: "Nights"
Song 3: "Pink Matter"
Song 4: "Chanel"
Song 5: "Self Control"
Song 6: "Pyramids"
Song 7: "Ivy"
Song 8: "Super Rich Kids"
Song 9: "Nikes"
Song 10: "Lost"
Song 11: "Bad Religion"
Song 12: "Novacane"
Song 13: "Pink + White"
Song 14: "Forrest Gump"
Song 15: "White Ferrari"
Song 16: "Solo"
Song 17: "Swim Good"
Song 18: "Seigfried"
Song 19: "Biking"
Song 20: "Solo (Reprise)"
Song 21: "Sweet Life"
Song 22: "Futura Free"
Song 23: "Godspeed"
Song 24: "Pilot Jones"
Song 25: "DHL"
Song 26: "Skyline To"
Song 27: "We All Try"
Song 28: "Crack Rock"
Song 29: "Cayendo"
Song 30: "Sierra Leone"
Song 31: "Close to You"
Song 32: "Good Guy"
Song 33: "In My Room"
Song 34: "Frank Ocean’s Open Letter on Tumblr"
Song 35: "American Wedding"
Song 36: "Provider"
Song 37: "Be Yourself"
Song 38: "End/Golden Girl"
Song 39: "Acura Integurl"
Song 40: "Moon River"
Song 41: "Lens"
Song 42: "Strawberry Swing"
Song 43: "U-N-I-T-Y"
Song 44: "At Your Best (You Are Love)"
Song 45: "Pretty Sweet"
Song 46: "Monks"
Song 47: "Higgs"
Song 48: "Facebook Story"
Song 49: "Songs for Women"
Song 50: "Wiseman"
Song 51: "Nature Feels"
Song 52: "Slide on Me"
Song 53: "Dear April"
Song 54: "Rushes"
Song 55: "Comme Des Garçons"
Song 56: "Wither"
Song 57: "Lovecrimes"
Song 58: "Biking (Solo)"
Song 59: "Alabama"
Song 60: "Fertilizer"
Song 61: "There Will Be Tears"
Song 62: "Eyes Like Sky"
Song 63: "Lens (Version 2)"
Song 64: "Voodoo"
Song 65: "Blue Whale"
Song 66: "Not Just Money"
Song 67: "Miss You So"
Song 68: "Chanel (Remix)"
Song 69: "Mine"
Song 70: "Mitsubishi Sony"
Song 71: "Sideways"
Song 72: "Device Control"
Song 73: "Dust"
Song 74: "Hublots"
Song 75: "In Here Somewhere"
Song 76: "Scared of Beautiful"
Song 77: "Start"
Song 78: "Pyrite (Fool’s Gold)"
Song 79: "I Miss You"
Song 80: "Impietas + Deathwish (ASR)"
Song 81: "A Certain Way"
Song 82: "Rushes To"
Song 83: "Honeybaby"
Song 84: "Summer Remains"
Song 85: "If I’m in Love"
Song 86: "Pink Matter (Remix)"
Song 87: "Bitches Talkin’ (Metal Gear Solid)"
Song 88: "Bedtime Story"
Song 89: "Florida"
Song 90: "White"
Song 91: "Blasted"
Song 92: "Can’t Be the Last Time"
Song 93: "Hero"
Song 94: "Xenons"
Song 95: "Whip Appeal"
Song 96: "Bricks and Steel"
Song 97: "Versace"
Song 98: "Ohh in Love"
Song 99: "Memrise"
Song 100: "Dying for Your Love"
Song 101: "No Love"
Song 102: "Tumblr/Magazine Note"
Song 103: "Day Away"
Song 104: "Poolside Convo (Self Control Intro)"
Song 105: "Street Fighter"
Song 106: "Back"
Song 107: "Rocket Love"
Song 108: "Mitsubishi Sony (Magazine Version)"
Song 109: "Slide On Me (Remix)"
Song 110: "Wise Man"
Song 111: "Sucka for Love (Alternate Version)"
Song 112: "Hardest Thing"
Song 113: "4 Tears"
Song 114: "Orion"
Song 115: "Goldeneye"
Song 116: "Nikes (Video Version)"
Song 117: "Easy"
Song 118: "Denim"
Song 119: "Ready"
Song 120: "No Bonnie"
Song 121: "Soul Calibur"
Song 122: "Non-Stop"
Song 123: "Movie List"
Song 124: "Sucker for Love"
Song 125: "Broken Pieces"
Song 126: "Little Demon (Arca Remix)"
Song 127: "Anywhere"
Song 128: "Blonde Thank You Note (Tumblr)"
Song 129: "Richest Man in the Room"
Song 130: "It’s All Good"
Song 131: "Channel ORANGE [Artwork]"
Song 132: "Math"
Song 133: "Cayendo (Sango Remix)"
Song 134: "White Ferrari (Magazine Version)"
Song 135: "Lost Angel"
Song 136: "I Need It"
Song 137: "Godspeed Screenplay (Episode 1, Scene 1)"
Song 138: "Real"
Song 139: "Brave"
Song 140: "Dream Killa"
Song 141: "Girlfriend’s Best Friend"
Song 142: "One Look"
Song 143: "She Won’t Say Hello"
Song 144: "Quickly"
Song 145: "Stay If You Go"
"Holy Combat (BasedGod Interview)" is not valid. Skipping.
Song 146: "I Need Love"
Song 147: "Disillusioned"
Song 148: "Follow"
Song 149: "Little Demon"
Song 150: "PDA"
Song 151: "Time Machine"
Song 152: "The City"
Song 153: "Try"
Song 154: "Truce"
"blonded RADIO 001 Tracklist" is not valid. Skipping.
Song 155: "Only You"
Song 156: "J.O.B"
Song 157: "Dear April (Justice Remix)"
Song 158: "Overload"
"Channel ORANGE [Booklet]" is not valid. Skipping.
Song 159: "Night.s (Demo)"
Song 160: "Rolls Royce Bitch Freestyle"
Song 161: "Boyfriend"
Song 162: "Done"
Song 163: "Static"
Song 164: "Together"
Song 165: "Home."
"Frank Ocean Oyster Magazine Interview" is not valid. Skipping.
Song 166: "Tumblr Letter About Orlando Shooting"
Song 167: "Greedy Love"
Song 168: "Let Me"
Song 169: "Taste"
Song 170: "Simply"
Song 171: "Feel California"
Song 172: "Heartbreak + Jetlag"
Song 173: "Wake Up"
Song 174: "Godspeed Screenplay (Episode 1, Scene 3)"
Song 175: "I Miss That Life"
Song 176: "Letter of Apology"
Song 177: "Kamikaze"
Song 178: "Close To You / Never Can Say Medley"
Song 179: "So Comfortable"
Song 180: "Song List"
Song 181: "Old Terror"
Song 182: "Godspeed (dvsn remix)"
"blonded RADIO 006 Tracklist" is not valid. Skipping.
Song 183: "BITCH I THINK I’M TOM PETTY."
Song 184: "As i take myself to sleep."
"Frank Ocean GQ Man Of The Year Interview" is not valid. Skipping.
Song 185: "Read the Stars"
Song 186: "Got the Keys"
Song 187: "Holly Baby"
"blonded RADIO 002 Tracklist" is not valid. Skipping.
"blonded RADIO 004 Tracklist" is not valid. Skipping.
Song 188: "Flight"
Song 189: "Review of Kim Burrell’s “Home”"
Song 190: "STAY TRUE"
"blonded RADIO 003 Tracklist" is not valid. Skipping.
Song 191: "In My Room (BennY RevivaL Remix)"
Song 192: "The Weekend"
Song 193: "Questions From Tumblr"
Song 194: "Go Up"
Song 195: "Untitled (Tumblr Post #33700009336)"
Song 196: "Focus"
Song 197: "Night.s (Magazine Version)"
Song 198: "I feel u bro."
Song 199: "Tumblr Post on Death"
Song 200: "Thinking Bout You Pt. 2"
Song 201: "Provider V4"
Song 202: "Lights"
Song 203: "Wise Man (Revised)"
Song 204: "Note on Released Songs"
Song 205: "Sucker for Love [Alternate Song Version]"
Song 206: "Tumblr post on current life"
Song 207: "My Random"
Song 208: "AT LUNCH WITH MY FAMILY.."
Song 209: "Tumblr Post on Eyes"
Song 210: "Best Seller"
Song 211: "LIVING OR.."
Song 212: "Standing Still"
Song 213: "Ambience 003"
Song 214: "ON OCCASION."
Song 215: "Know Better"
"blonded RADIO 005 Tracklist" is not valid. Skipping.
Song 216: "Godspeed Screenplay (Episode 1, Scene 2)"
Song 217: "Writing in dark places."
Song 218: "THE INTERNET MADE FAME WACK…"
Song 219: "Experiences"
Song 220: "Note on Plane Ride"
Song 221: "White (Gentrification Mix)"
"blonded RADIO Jay Z Interview" is not valid. Skipping.
Song 222: "Tumblr Post on LA"
Song 223: "Note on Dreams"
"The Lonny Breaux Collection [Tracklist + Album Art]" is not valid. Skipping.
Song 224: "Modern Moses"
Song 225: "Tumblr post on Solidarity"
Song 226: "TEXTING.."
Song 227: "FUCK THE OPPOSITION."
Song 228: "CLEAN SOCKS > EVERYTHANG"
Song 229: "REMINDER"
Song 230: "Sade wisdom on the wall next to my door. in pastel."
Song 231: "Tumblr post on adjectives"
Song 232: "Open Air"
Song 233: "LISTEN TO TOMMY WITH A CANDLE BURNING…"
Song 234: "Tonight"
Song 235: "Tumblr post on Perseverance"
Song 236: "Abracadabra"
Song 237: "Dec. 27th"
Song 238: "Super Rich Kids (Live)"
Song 239: "Rinse."
Song 240: "Hopes and Dreams"
"Rolls Rolls Wraith Skit" is not valid. Skipping.
Song 241: "Back (To You)"
Song 242: "Thinkin bout you - spring sampler / 2012"
Song 243: "Lonny (New Mix)"
Song 244: "Private Show"
Song 245: "Nabil’s Photographs of Somalia 2011 Famine"
Song 246: "Godspeed Screenplay (Episode 1, Scene 6)"
Song 247: "Love on My Piano"
Song 248: "This Is Mom"
Song 249: "Half.."
Song 250: "Theme Song"
"blonded RADIO 008 Tracklist" is not valid. Skipping.
Song 251: "U Got it"
Song 252: "Godspeed Screenplay (Episode 1, Scene 5)"
Song 253: "Godspeed Screenplay (Episode 1, Scene 4)"
Song 254: "Break Down*"
Song 255: "So Fresh"
Song 256: "Freestyle"
"blonded RADIO Midterms pt. I, pt. II and pt. III Tracklists" is not valid. Skipping.
Song 257: "Chanel (Nick Leon Atmosphere Remix)"
Song 258: "Someboy Else"
Song 259: "No Such Thing As White Jesus"
Song 260: "Bitches Talk (Repeat)"
Timeout raised and caught:
HTTPSConnectionPool(host='api.genius.com', port=443): Read timed out. (read timeout=5)
TypeError: argument of type 'NoneType' is not iterable
# Load the Spotify track / audio-feature table.
# NOTE(review): presumably a renamed copy of the spotify.csv written by menu option 0 - confirm.
songs = pd.read_csv("frank_spotify.csv")
# Preview the first rows (only rendered when it is the last expression of a notebook cell).
songs.head()
# Print column names, non-null counts and dtypes.
songs.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421 entries, 0 to 420
Data columns (total 22 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 421 non-null object
1 duration_ms 421 non-null int64
2 popularity 421 non-null int64
3 num_markets 421 non-null int64
4 album 421 non-null object
5 disc_number 421 non-null int64
6 is_explicit 421 non-null bool
7 track_number 421 non-null int64
8 release_date 421 non-null object
9 artist 421 non-null object
10 danceability 421 non-null float64
11 energy 421 non-null float64
12 key 421 non-null int64
13 loudness 421 non-null float64
14 mode 421 non-null int64
15 speechiness 421 non-null float64
16 acousticness 421 non-null float64
17 instrumentalness 421 non-null float64
18 liveness 421 non-null float64
19 valence 421 non-null float64
20 tempo 421 non-null float64
21 time_signature 421 non-null int64
dtypes: bool(1), float64(9), int64(8), object(4)
memory usage: 69.6+ KB
# Distinct album names present in the Spotify table.
songs['album'].unique()
# Summary statistics of the numeric columns.
songs.describe()
# Load the Genius lyrics table.
# NOTE(review): presumably a renamed copy of the lyric.csv written by menu option 1 - confirm.
lyrics = pd.read_csv("frank_lyric.csv")
# Keep only the columns used below.
lyrics = lyrics[['title', 'album','lyrics',]]
lyrics.head()
lyrics.info() #we see that a lot of albums are missing, and some are just instrumentals.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 title 271 non-null object
1 album 187 non-null object
2 lyrics 264 non-null object
dtypes: object(3)
memory usage: 6.5+ KB
# Spotify releases (albums and singles) to keep.
song_albums = [
    'Blonde',
    'channel ORANGE',
    'Moon River',
    'Provider',
    'Biking (Solo)',
    'Lens',
    'Chanel',
    'Slide (feat. Frank Ocean & Migos)',
    'RAF',
    'Novacane',
    'Swim Good',
    'Thinkin Bout You',
    'Chanel',
]
in_kept_release = songs['album'].isin(song_albums)
songs_updated = songs[in_kept_release]
songs_updated.head()
len(songs_updated)
# Drop two duplicate versions by row label: 42 = Novacane (edited), 220 = RAF (non-explicit).
songs_updated = songs_updated.drop([42, 220])
len(songs_updated)

lyrics['album'].unique() # album names before cleaning

# Genius rows to keep: whole albums, plus individual titles from elsewhere.
lyric_albums = ['nostalgia, ULTRA.', 'channel ORANGE', 'Blonde ', 'Endless']
selected_titles = ['Moon River', 'Chanel', 'Summer Remains', 'Lens', 'Provider', 'Biking (Solo)']
album_matches = lyrics['album'].isin(lyric_albums)
title_matches = lyrics['title'].isin(selected_titles)
lyrics_updated = lyrics[album_matches | title_matches]
len(lyrics_updated)

# Rename the Genius values that do not match the Spotify titles, then drop row 143
# (presumably the 'Nostalgia, ULTRA [Artwork]' entry - confirm).
lyrics_updated = (
    lyrics_updated
    .replace("Close to You", "Close To You")
    .replace("End/Golden Girl", "End")
    .drop(143)
)
#Pre-Processing of Lyrics:
#Let pandas display up to 1250 characters per cell so long lyric strings stay readable
pd.set_option('display.max_colwidth', 1250)

#Work on an explicit copy: assigning into a slice of lyrics_updated is what
#triggers pandas' SettingWithCopyWarning.
test_lyrics = lyrics_updated[['title','lyrics']].copy()
#Missing lyrics become empty strings so the string steps below cannot fail on NaN
test_lyrics['lyrics'] = test_lyrics['lyrics'].fillna('').str.lower()

stop = stopwords.words('english')
stop += ['i\'ll']
stop_set = set(stop) #constant-time membership tests

#count stopwords (done before section tags and punctuation are stripped)
test_lyrics['stopwords'] = test_lyrics['lyrics'].apply(
    lambda text: len([word for word in text.split() if word in stop_set]))

#Pre-processing continued:
#Remove bracketed section tags, e.g.
#(Verse [0-9]|Pre-Chorus [0-9]|Hook [0-9]|Chorus|Outro|Verse|Refrain|Hook|Bridge|Intro|Instrumental|Skit).
#Non-greedy, so lyric text between two tags on the same line is kept;
#regex=True is explicit because newer pandas treats patterns as literals by default.
test_lyrics['lyrics'] = test_lyrics['lyrics'].str.replace(r'\[.*?\]', '', regex=True)

#removes stop words (this also collapses all whitespace to single spaces)
test_lyrics['lyrics'] = test_lyrics['lyrics'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_set))
test_lyrics.head()

#remove punctuation
test_lyrics['lyrics'] = test_lyrics['lyrics'].str.replace(r'[^\w\s]', '', regex=True)
test_lyrics['lyrics'] = test_lyrics['lyrics'].str.replace('\r\n', ' ', regex=False)

#remove japanese verse from solo: everything from the first '一' to the end of the line
test_lyrics['lyrics'] = test_lyrics['lyrics'].str.replace(r'一.*', '', regex=True)
test_lyrics.head(3)
/opt/venv/lib/python3.7/site-packages/pandas/core/indexing.py:1048: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self.obj[item_labels[indexer[info_axis]]] = value
#Songs containing the most stop words
test_lyrics.sort_values('stopwords', ascending = False).head(5)

#Vader Analysis:
sent_vader = SentimentIntensityAnalyzer()
#Score every lyric once and reuse the resulting dict for all four columns
#(scoring each lyric separately per column quadruples the work).
vader_scores = test_lyrics['lyrics'].apply(sent_vader.polarity_scores)
for vader_column, score_key in (
        ('vaderPos', 'pos'),
        ('vaderNeu', 'neu'),
        ('vaderNeg', 'neg'),
        ('vaderCombined', 'compound')):
    test_lyrics.loc[:, vader_column] = vader_scores.apply(
        lambda scores, key=score_key: scores[key])

final_lyrics = test_lyrics.copy()
final_lyrics.head(3)

#MERGING BOTH CSV's
#Inner join on the song title: an outer join would add rows (RAF, Slide, Endless)
#that lack either lyrics or audio features.
frank = songs_updated.merge(right = final_lyrics, how = 'inner', left_on = 'name', right_on = 'title')

#difference between valences (SPOTIFY - VADER)
frank['valenceDiff'] = (frank['valence'] - frank['vaderCombined'])

pd.set_option('max_colwidth', 50)
frank.head(3)

#remove unnecessary columns
frank = frank.drop(['title','artist','disc_number'], axis = 1)

#Change Index to Title of Track
frank = frank.set_index('name')

#Undo the '_x' suffix a merge adds to a shared column name (no-op if absent)
final_frank = frank.rename(columns = {'album_x': 'album'})
final_frank = final_frank.reset_index()
final_frank.head(3)

vader_cols = ['name', 'valence', 'vaderPos', 'vaderNeg', 'vaderCombined', 'valenceDiff']
final_frank[vader_cols].sort_values('vaderPos', ascending = False).head(3) #highest vaderPos = Swim Good
final_frank[vader_cols].sort_values('vaderNeg', ascending = False).head(3) #highest vaderNeg = Fertilizer
final_frank[vader_cols].sort_values('vaderCombined', ascending = False).head(3) #highest vaderCombined = Super Rich Kids
final_frank[vader_cols].sort_values('vaderCombined', ascending = True).head(3) #lowest vaderCombined = Nights
VALENCE_TYPES = ['musicalVal', 'lyricalVal', 'valenceDiff']


def _valence_table(tracks, extra_columns=()):
    """Return name/musicalVal/lyricalVal/valenceDiff (+ extra_columns) for *tracks*.

    Spotify valence becomes 'musicalVal' and the VADER compound score becomes
    'lyricalVal', both rounded to 3 decimals; 'valenceDiff' is left unrounded.
    Works on a copy, so the source frame is untouched and pandas raises no
    SettingWithCopyWarning.
    """
    columns = ['name', 'valence', 'vaderCombined', 'valenceDiff'] + list(extra_columns)
    table = tracks[columns].copy()
    for column in ('valence', 'vaderCombined'):
        table[column] = table[column].apply(lambda value: round(value, 3))
    return table.rename(columns={'valence': 'musicalVal', 'vaderCombined': 'lyricalVal'})


def _melt_valence(table):
    """Reshape a valence table to long form: one row per (song, valType)."""
    return table.melt(id_vars = 'name', value_vars = VALENCE_TYPES,
                      var_name = 'valType', value_name = 'valence')


#Analysis of Blonde:
blonde = _valence_table(final_frank.loc[final_frank['album'] == 'Blonde'])
final_blonde = _melt_valence(blonde)

#Analysis of Channel Orange:
c_orange = _valence_table(final_frank.loc[final_frank['album'] == 'channel ORANGE'])
final_orange = _melt_valence(c_orange)

#Other Albums/Singles that both SPOTIFY AND GENIUS have:
other_albums = ['Novacane', 'Swim Good', 'Chanel', 'Lens', 'Biking (Solo)', 'Provider', 'Moon River']
others = _valence_table(
    final_frank.loc[final_frank['album'].isin(other_albums)],
    extra_columns=['release_date'])
#Singles have no shared tracklist, so order them by release date instead
others = others.sort_values(['release_date'], ascending = True)
final_others = _melt_valence(others)
others
final_others
sns.set_style("whitegrid")


def _valence_pointplot(data, palette_name, marker_scale, x_label, title):
    """Draw one 20x9 point plot of valence per song, one line per valType."""
    plt.figure(figsize=(20, 9))
    axes = sns.pointplot(
        x ='name',
        y = 'valence',
        hue = 'valType',
        data=data,
        palette = sns.color_palette(palette_name),
        scale = marker_scale,
    )
    axes.set_ylabel("Valence Levels", weight = 'bold').set_fontsize('15')
    axes.set_xlabel(x_label, weight = 'bold').set_fontsize('15')
    axes.set_title(title, weight='bold').set_fontsize('25')
    return axes


#Valence (Blonde)
blonde_valence = _valence_pointplot(
    final_blonde, 'pastel', 2,
    "Title of Song (Ordered by Tracklist)",
    "Fig 1: Blonde - Musical vs Valence Levels")

#Valence (Channel Orange)
orange_valence = _valence_pointplot(
    final_orange, 'colorblind', 1.5,
    "Title of Song (Ordered by Tracklist)",
    "Fig 2: Channel Orange - Musical vs Valence Levels")
#The song "White" has no lyrical valence, which biases its valence difference.

#Valence (Others)
other_valence = _valence_pointplot(
    final_others, 'deep', 2,
    "Title of Song (Ordered by Release Date)",
    "Fig 3: Other Singles - Musical vs Valence Levels")
def calculate_valence_sum(valenceType, album):
    """Return the summed 'valence' of the rows of *album* whose 'valType' equals *valenceType*.

    Parameters:
        valenceType: value to match in the 'valType' column; the long-form
            tables in this file use 'lyricalVal', 'valenceDiff' and 'musicalVal'.
        album: long-form DataFrame with 'valType' and 'valence' columns, such
            as final_blonde, final_orange or final_others.

    Returns:
        The sum of the matching 'valence' values; 0 when no row matches.
    """
    matching = album[album['valType'] == valenceType]
    # Builtin sum starts at 0 and adds the values in row order.
    return sum(matching['valence'])
# Total lyrical (VADER compound) valence per album.
calculate_valence_sum('lyricalVal', final_blonde)
calculate_valence_sum('lyricalVal', final_orange)
# Total musical (Spotify) valence per album.
calculate_valence_sum('musicalVal', final_orange)
calculate_valence_sum('musicalVal', final_blonde)
# Blonde minus channel ORANGE, on the raw sums (not normalised by track count).
calculate_valence_sum('lyricalVal', final_blonde) - calculate_valence_sum('lyricalVal', final_orange)