The Weirdest State in the US
Set Up
import time
import os
import pandas as pd
import numpy as np
import altair as alt
from pytrends.request import TrendReq
from sklearn import metrics
import geopandas as gpd
pytrends = TrendReq(hl='en-US', tz=360)
keywords = pd.read_csv('keywords.csv',names=['keyword'],index_col=False).keyword
keywords = list(set(keywords)) # Remove any duplicates, in case we have some
print(f'Examples of some keywords used: {keywords[:5]}')
# Read the existing file, if any, so we don't re-fetch topics we already checked
if os.path.exists('./data/interest.csv'):
    df = pd.read_csv('./data/interest.csv', index_col=[0, 1])
else:
    df = pd.DataFrame()
# Skip keywords we already have data for
keywords_copy = keywords.copy()
for kw in keywords_copy:
    if kw in df.columns:
        keywords.remove(kw)
# Drop cached columns for keywords no longer in the list
for c in df.columns:
    if c not in keywords_copy:
        df.drop(c, axis=1, inplace=True)
# Iterate through each topic
for i, kw in enumerate(keywords):
    print('Fetching {} of {}'.format(i + 1, len(keywords)), end="\r", flush=True)
    # Autocomplete the topic, in case Google uses a different terminology
    code = pytrends.suggestions(kw)
    code = code[0]['mid']
    pytrends.build_payload(
        [code],
        timeframe='2019-01-01 2020-01-01',
        geo='US'
    )
    time.sleep(5)  # Sleep to avoid getting blocked by Google's servers
    # Get the actual interest (search activity) in the topic, at the state level
    res = pytrends.interest_by_region(resolution='REGION', inc_geo_code=True, inc_low_vol=False)
    res.geoCode = res.geoCode.str[-2:]  # Keep only the state abbreviation, e.g. 'US-NY' -> 'NY'
    res = res.reset_index().set_index(['geoName', 'geoCode'])
    res.rename({code: kw}, axis=1, inplace=True)  # Rename the column back to our original keyword
    time.sleep(5)  # Sleep to avoid getting blocked by Google's servers
    res /= 100  # Convert percentage to ratio
    df = pd.concat([df, res], axis=1)
    df.to_csv('./data/interest.csv')  # Save after every fetch, so we can resume if interrupted
df.head()
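At this point df should have one row per state, indexed by geoName and geoCode, and one column per keyword, with interest scaled to [0, 1]. A toy illustration of the shape (the keywords and numbers here are made up):

# Illustrative only: made-up keywords and values, the real table has 50 states
toy = pd.DataFrame(
    {'sourdough': [0.83, 1.00], 'surfing': [0.42, 0.67]},
    index=pd.MultiIndex.from_tuples([('Alabama', 'AL'), ('Alaska', 'AK')],
                                    names=['geoName', 'geoCode'])
)
print(toy)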
Creating a Distance Matrix
d = metrics.pairwise.manhattan_distances(df)
d[np.diag_indices_from(d)] = np.nan  # A state's distance to itself isn't informative, so mask it
d = pd.DataFrame(d, index=df.index.get_level_values(0), columns=df.index.get_level_values(0))
d = d / (df.max() - df.min()).sum()  # Divide by the maximum possible difference between two states, so distances fall in [0, 1]
# Reshape to long form for the heatmap
source = d.rename_axis('state1', axis=0).rename_axis('state2', axis=1).stack().reset_index()
source.rename(columns={0: 'distance'}, inplace=True)
alt.Chart(source).mark_rect().encode(
    x='state1:O',
    y='state2:O',
    color=alt.Color('distance:Q', scale=alt.Scale(scheme='plasma'), legend=None),
    tooltip=['state1', 'state2', 'distance']
).properties(
    width=450,
    height=450
)
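The division by (df.max() - df.min()).sum() above is what bounds the distances: two states can differ by at most the full observed range of every keyword. A quick toy check with made-up numbers:

# Toy sanity check of the normalization (made-up numbers)
toy = pd.DataFrame({'kw1': [0.0, 1.0], 'kw2': [0.2, 0.9]}, index=['A', 'B'])
raw = metrics.pairwise.manhattan_distances(toy)[0, 1]  # |0.0-1.0| + |0.2-0.9| = 1.7
max_possible = (toy.max() - toy.min()).sum()           # 1.0 + 0.7 = 1.7
print(raw / max_possible)                              # 1.0: these two states differ maximally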
Which states are most similar to each other?
s = 1 - d  # Convert distances to similarities
s = s.replace(1, np.nan)  # Mask perfect similarity (a state compared with itself)
# The similarity matrix stays wide (one column per state); the chart's fold transform will melt it
source = s.rename_axis('State', axis=0).rename_axis('Similarity_to', axis=1).reset_index()
# Get geospatial data so we can plot the map
gdf = gpd.read_file('https://cdn.jsdelivr.net/npm/us-atlas@3/states-10m.json').drop('id', axis=1)
gdf.rename(columns={'name': 'State'}, inplace=True)
source = gpd.GeoDataFrame(pd.merge(source, gdf, on='State'))  # Attach each state's geometry
# Clicking the transparent top layer drives the selection; it starts on New York
selection = alt.selection_single(fields=['Similarity_to'], empty='none', init={'Similarity_to': 'New York'})
map1 = alt.Chart(source).mark_geoshape(
    stroke='black',
    opacity=0.1,
).encode(
    tooltip='State',
).add_selection(
    selection
)
map2 = alt.Chart(source).mark_geoshape(
    stroke='black'
).encode(
    color=alt.Color('Similarity:Q', legend=None),
    tooltip=['State']
).transform_fold(
    list(source.drop(['State', 'geometry'], axis=1).columns),  # Keep State and geometry, fold the similarity columns
    ['Similarity_to', 'Similarity']
).transform_filter(
    selection
).properties(
    projection={'type': 'albersUsa'},
    width=700,
    height=400
)
map2 + map1
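The transform_fold step reshapes the wide similarity columns into long form inside the chart itself, so after the filter Vega-Lite only ever sees one Similarity_to/Similarity pair per state. It behaves roughly like pandas' melt; a sketch with made-up values:

# Roughly what transform_fold does, expressed with pandas melt (illustrative)
wide = pd.DataFrame({'State': ['Ohio'], 'Alabama': [0.8], 'Alaska': [0.6]})
long = wide.melt(id_vars='State', var_name='Similarity_to', value_name='Similarity')
#   State Similarity_to  Similarity
# 0  Ohio       Alabama         0.8
# 1  Ohio        Alaska         0.6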
# Same idea, but driven by a dropdown instead of clicks, and using a lookup instead of a pre-merged GeoDataFrame
sim = pd.DataFrame(source.drop('geometry', axis=1))  # The lookup table must be plain tabular data, so drop the geometry
input_dropdown = alt.binding_select(options=sim.State.unique())
selection = alt.selection_single(name=' ', fields=['Similarity_to'], bind=input_dropdown, init={'Similarity_to': 'New York'})
map1 = alt.Chart(gdf).mark_geoshape(
    stroke='black',
    color='green',
)
map2 = alt.Chart(gdf).mark_geoshape(
    stroke='black'
).encode(
    color=alt.Color('Similarity:Q', legend=None),
    tooltip=['State:O', 'Similarity:Q']
).properties(
    projection={'type': 'albersUsa'},
    width=700,
    height=400
).transform_lookup(
    lookup='State',
    from_=alt.LookupData(sim, 'State', list(sim.columns))
).transform_fold(
    list(sim.drop('State', axis=1).columns),  # Keep the State column, fold the rest
    ['Similarity_to', 'Similarity']
).transform_filter(
    selection
).add_selection(
    selection
)
map1 + map2
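transform_lookup is effectively a left join done inside the chart: each shape in gdf pulls in its row of similarity columns by state name. In pandas terms, roughly:

# Roughly what transform_lookup does, expressed as a pandas join (illustrative)
joined = gdf.merge(sim, on='State', how='left')

Keeping the geometry and the similarity table separate this way means neither has to be re-merged into a single GeoDataFrame, as the first map required.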
# A state's "weirdness" is its median distance to every other state
source = d.median(axis=1).reset_index()
source.columns = ['State', 'Weirdness']
alt.Chart(source, title='The Weirdest State in the US').mark_bar().encode(
    x=alt.X('State', sort='-y'),
    y=alt.Y('Weirdness'),
    color=alt.Color('Weirdness', legend=None),
    tooltip=['Weirdness']
).properties(
    width=980
)
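Using the median rather than the mean presumably keeps a single extreme pairing from dominating a state's score; a quick illustration with made-up distances:

# Illustrative: one outlier skews the mean far more than the median
dists = pd.Series([0.10, 0.12, 0.11, 0.95])
print(dists.mean())    # 0.32
print(dists.median())  # 0.115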