import hashlib
import io
import numpy as np
import pandas as pd
import requests
# Country codes for the charts: Argentina, Chile, United States, Spain, United Kingdom.
countries = [
    "ar", "cl", "us", "es", "gb"
]
def get_download_url(country):
return f"https://spotifycharts.com/regional/{country}/daily/latest/download"
def get_country_df(country):
    # Download the latest daily chart for `country` and keep its top 10 rows.
    country_url = get_download_url(country)
    r = requests.get(country_url)
    csv_text = r.content.decode("utf-8")
    # header=1 skips the banner line Spotify prepends to the CSV;
    # on_bad_lines="skip" replaces the removed error_bad_lines=False argument.
    return pd.read_csv(io.StringIO(csv_text), header=1, on_bad_lines="skip").head(10)
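# Usage sketch (assumption: the spotifycharts.com endpoint still serves the CSV without
# authentication; the site has changed over time, so this may need extra headers/cookies):
# ar_top10 = get_country_df("ar")
# print(ar_top10["Track Name"].tolist())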
def get_all_countries_tracks_dict():
    # Build {country_code: [track names]} from locally saved weekly chart CSVs.
    # Note: this reads the weekly export files, while get_download_url above targets the daily endpoint.
    countries_dict = {}
    for country in countries:
        try:
            df = pd.read_csv(f'regional-{country}-weekly-latest.csv', header=1)
            tracks = list(df["Track Name"])
            if tracks:
                countries_dict[country] = tracks
        except Exception as e:
            print(e)
            print(f"{country}: failed to read chart file")
    return countries_dict
def get_all_tracks_as_set(countries_dict):
    # Union of every country's track names.
    unique_tracks = set()
    for tracks in countries_dict.values():
        unique_tracks.update(tracks)
    return unique_tracks
def LD(s, t):
    # Recursive Levenshtein-style distance that compares trailing 8-character blocks,
    # treating each block as one token (i.e. one hashed track of fixed width 8).
    if s == "":
        return len(t)
    if t == "":
        return len(s)
    if s[-8:] == t[-8:]:
        cost = 0
    else:
        cost = 1
    return min(LD(s[:-8], t) + 1,            # drop a block from s
               LD(s, t[:-8]) + 1,            # drop a block from t
               LD(s[:-8], t[:-8]) + cost)    # substitute the trailing blocks
def encode_set(unique_tracks):
    # Map each track name to a numeric token.
    # Note: the built-in hash() is salted per interpreter run unless PYTHONHASHSEED is fixed,
    # so these tokens are only stable within a single run.
    hash_dict = {}
    for track in unique_tracks:
        hash_dict[track] = str(abs(hash(track)))
    return hash_dict
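# A deterministic alternative (sketch): hashlib is imported at the top but never used;
# an md5 digest truncated to a fixed width gives run-stable, equal-length tokens.
# The 19-character width is only an assumption chosen to match the chunking in
# minimumEditDistance below.
def encode_set_stable(unique_tracks, width=19):
    hash_dict = {}
    for track in unique_tracks:
        digest = hashlib.md5(track.encode("utf-8")).hexdigest()
        hash_dict[track] = digest[:width]
    return hash_dict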
def get_all_countries_tracks_dict_hashed(countries_dict, all_tracks_encoded_dict):
countries_dict_hashed = {}
for country in countries_dict:
countries_dict_hashed[country] = [all_tracks_encoded_dict[track] for track in countries_dict[country]]
return countries_dict_hashed
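# Pipeline sketch: these helpers are never wired together later in the script, but the
# intended flow appears to be (assumption):
# unique_tracks = get_all_tracks_as_set(countries_dict)
# encoded = encode_set(unique_tracks)
# countries_dict_hashed = get_all_countries_tracks_dict_hashed(countries_dict, encoded)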
def minimumEditDistance(first, second):
    # Edit distance where the unit of comparison is a 19-character chunk
    # (one hashed track per chunk) rather than a single character.
    len_first = len(first) // 19
    len_second = len(second) // 19
    # DP matrix of size (len_first+1) x (len_second+1), initialised with zeros.
    matrix = np.zeros((len_first + 1, len_second + 1), dtype=int)
    # Fill the matrix: rows index chunks of `first`, columns index chunks of `second`.
    for i in range(len_first + 1):
        for j in range(len_second + 1):
            # Boundary cases: against an empty string the cost is all insertions/deletions.
            if i == 0:
                matrix[i][j] = j
            elif j == 0:
                matrix[i][j] = i
            else:
                first_chunk = first[(i - 1) * 19 : i * 19]
                second_chunk = second[(j - 1) * 19 : j * 19]
                cost = 0 if first_chunk == second_chunk else 1
                matrix[i][j] = min(matrix[i][j - 1] + 1,          # insertion
                                   matrix[i - 1][j] + 1,          # deletion
                                   matrix[i - 1][j - 1] + cost)   # substitution
    return matrix[len_first][len_second]
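# Usage sketch (assumptions: hashed tracks are padded/truncated to 19 characters so each
# chunk lines up with one track, and countries_dict_hashed has been built as in the
# pipeline sketch above; neither happens elsewhere in this script):
# ar = "".join(t.ljust(19)[:19] for t in countries_dict_hashed["ar"])
# cl = "".join(t.ljust(19)[:19] for t in countries_dict_hashed["cl"])
# print(minimumEditDistance(ar, cl))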
import scipy.stats as stats
def kt(left, right):
tau, p_value = stats.kendalltau(left, right)
return tau
def normalised_kendall_tau_distance(values1, values2):
"""Compute the Kendall tau distance."""
n = len(values1)
assert len(values2) == n, "Both lists have to be of equal length"
i, j = np.meshgrid(np.arange(n), np.arange(n))
a = np.argsort(values1)
b = np.argsort(values2)
ndisordered = np.logical_or(np.logical_and(a[i] < a[j], b[i] > b[j]), np.logical_and(a[i] > a[j], b[i] < b[j])).sum()
return ndisordered / (n * (n - 1))
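# Quick check with the classic example: the rankings [1,2,3,4,5] and [3,4,1,2,5] disagree
# on 4 of the 10 pairs, so the normalised distance should come out as 0.4.
# print(normalised_kendall_tau_distance([1, 2, 3, 4, 5], [3, 4, 1, 2, 5]))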
def jaccard_similarity(list1, list2):
intersection = len(list(set(list1).intersection(list2)))
union = (len(set(list1)) + len(set(list2))) - intersection
return float(intersection) / union
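# Sanity check: two 5-track lists sharing 3 tracks give 3 / (5 + 5 - 3) = 3/7 ≈ 0.43.
# print(jaccard_similarity(["a", "b", "c", "d", "e"], ["a", "b", "c", "x", "y"]))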
def distance_between_arrays(left, right):
    # Despite the name, this returns the Jaccard *similarity* (1.0 = identical track sets).
    if len(left) != len(right):
        print('Array sizes do not match')
    jaccard = jaccard_similarity(left, right)
    return jaccard
countries_dict = get_all_countries_tracks_dict()
#for country in countries_dict.keys():
# print(country, countries_dict[country])
corr_data = []
for country1 in list(countries_dict.keys()):
for country2 in list(countries_dict.keys()):
d = distance_between_arrays(countries_dict[country1], countries_dict[country2])
corr_data.append([country1, country2, d])
corr_data = [[x[0],x[1], x[2]] for x in corr_data]
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network
G=nx.Graph()
existing_nodes = []
existing_edges = []
def build_graph_for_all():
    # Add one node per country and one weighted edge per unordered country pair.
    for d in corr_data:
        if d[0] not in existing_nodes:
            G.add_node(str(d[0]))
            existing_nodes.append(str(d[0]))
        if d[1] not in existing_nodes:
            G.add_node(str(d[1]))
            existing_nodes.append(str(d[1]))
        if d[0] != d[1] and not (((d[0], d[1]) in existing_edges) or ((d[1], d[0]) in existing_edges)):
            existing_edges.append((d[0], d[1]))
            G.add_weighted_edges_from([[str(d[0]), str(d[1]), d[2]]])
#build_graph_for_all()
#pos = nx.spring_layout(G, weight='weight')
#nx.draw(G, pos=pos, width=1, with_labels=True)
#plt.savefig("path1.png")
cor_array = []
# corr_data holds 25 pairwise values (5 countries x 5 countries) in the order the loop above
# produced them, so consecutive slices of 5 form the rows of a 5x5 similarity matrix.
# The column labels follow the `countries` order: ar, cl, us, es, gb.
columns = ['Argentina', 'Chile', 'US', 'Spain', 'UK']
for i in range(0, len(corr_data), 5):
    tmp = []
    for c in corr_data[i : i+5]:
        tmp.append(c[2])
    cor_array.append(tmp)
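# Equivalent reshaping sketch: the same matrix can be built by pivoting corr_data into a
# labelled DataFrame, which keeps the country-to-row mapping explicit (a convenience only;
# the heatmap below keeps using cor_array):
# pairwise = pd.DataFrame(corr_data, columns=["left", "right", "jaccard"])
# cor_matrix = pairwise.pivot(index="left", columns="right", values="jaccard")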
import seaborn as sns
d = pd.DataFrame(data=cor_array,
columns=columns)
# Compute the correlation matrix
corr = d.corr()
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Create a custom diverging palette (used by the commented-out heatmap call below)
cmap = sns.diverging_palette(100, 7, s=75, l=40,
                             n=5, center="light", as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
#sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
# square=True, linewidths=.5, cbar_kws={"shrink": .5})
sns.heatmap(corr, mask=mask, center=0, annot=True,
fmt='.2f', square=True, cmap='YlGnBu')
ax.set_title('Jaccard Correlation Spotify Hits per Country')
plt.show()
Chile, Argentina and Spain group together, while the UK and the US form their own cluster. Shared language appears to be the obvious causal factor, and a much stronger one than, e.g., geographical proximity.
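A quick way to sanity-check that grouping (assuming countries_dict loaded all five charts) is to print the raw pairwise Jaccard values for Spanish-speaking pairs against the English-speaking pair, using the country codes defined in countries:

# Hypothetical spot check of the language grouping; not part of the analysis above.
for a, b in [("es", "ar"), ("es", "cl"), ("gb", "us"), ("es", "gb")]:
    if a in countries_dict and b in countries_dict:
        print(a, b, round(jaccard_similarity(countries_dict[a], countries_dict[b]), 2))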