The Weirdest State in the US
Set Up
import time
import os
import pandas as pd
import numpy as np
import altair as alt
from pytrends.request import TrendReq
from sklearn import metrics
import geopandas as gpd
pytrends = TrendReq(hl='en-US', tz=360)
keywords = pd.read_csv('keywords.csv',names=['keyword'],index_col=False).keyword
keywords = list(set(keywords)) # Remove any duplicates, in case we have some
print(f'Examples of some keywords used: {keywords[:5]}')
# Read the existing file, if any, so we don't re-fetch topics we already checked
if os.path.exists('./data/interest.csv'):
    df = pd.read_csv('./data/interest.csv', index_col=[0, 1])
else:
    df = pd.DataFrame()
# Skip keywords we already have data for
keywords_copy = keywords.copy()
for kw in keywords_copy:
    if kw in df.columns:
        keywords.remove(kw)
# Drop cached columns for keywords no longer in the list
for c in df.columns:
    if c not in keywords_copy:
        df.drop(c, axis=1, inplace=True)
# Iterate through each topic
for i, kw in enumerate(keywords):
    print('Fetching {} of {}'.format(i + 1, len(keywords)), end="\r", flush=True)
    # Autocomplete the topic, in case Google uses a different terminology
    code = pytrends.suggestions(kw)
    code = code[0]['mid']
    pytrends.build_payload(
        [code],
        timeframe='2019-01-01 2020-01-01',
        geo='US'
    )
    time.sleep(5)  # Sleep to avoid getting blocked by Google's servers
    # Get the actual interest (search activity) in the topic, at the state level
    res = pytrends.interest_by_region(resolution='REGION', inc_geo_code=True, inc_low_vol=False)
    res.geoCode = res.geoCode.str[-2:]  # Keep only the state abbreviation, e.g. 'US-NY' -> 'NY'
    res = res.reset_index().set_index(['geoName', 'geoCode'])
    res.rename({code: kw}, axis=1, inplace=True)  # Rename the column back to our original keyword
    time.sleep(5)  # Sleep to avoid getting blocked by Google's servers
    res /= 100  # Convert percentage to ratio
    df = pd.concat([df, res], axis=1)
    df.to_csv('./data/interest.csv')  # Save after every fetch, so we can resume if interrupted
df.head()
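At this point df should have one row per state, indexed by geoName and geoCode, and one column per keyword, with interest scaled to [0, 1]. A toy illustration of the shape (the keywords and numbers here are made up):

# Illustrative only: made-up keywords and values, the real table has 50 states
toy = pd.DataFrame(
    {'sourdough': [0.83, 1.00], 'surfing': [0.42, 0.67]},
    index=pd.MultiIndex.from_tuples([('Alabama', 'AL'), ('Alaska', 'AK')],
                                    names=['geoName', 'geoCode'])
)
print(toy)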
Creating a Distance Matrix
d = metrics.pairwise.manhattan_distances(df)
d[np.diag_indices_from(d)] = np.nan  # A state's distance to itself isn't informative, so mask it
d = pd.DataFrame(d, index=df.index.get_level_values(0), columns=df.index.get_level_values(0))
d = d / (df.max() - df.min()).sum()  # Divide by the maximum possible difference between two states, so distances fall in [0, 1]
# Reshape to long form for the heatmap
source = d.rename_axis('state1', axis=0).rename_axis('state2', axis=1).stack().reset_index()
source.rename(columns={0: 'distance'}, inplace=True)
alt.Chart(source).mark_rect().encode(
    x='state1:O',
    y='state2:O',
    color=alt.Color('distance:Q', scale=alt.Scale(scheme='plasma'), legend=None),
    tooltip=['state1', 'state2', 'distance']
).properties(
    width=450,
    height=450
)
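The division by (df.max() - df.min()).sum() above is what bounds the distances: two states can differ by at most the full observed range of every keyword. A quick toy check with made-up numbers:

# Toy sanity check of the normalization (made-up numbers)
toy = pd.DataFrame({'kw1': [0.0, 1.0], 'kw2': [0.2, 0.9]}, index=['A', 'B'])
raw = metrics.pairwise.manhattan_distances(toy)[0, 1]  # |0.0-1.0| + |0.2-0.9| = 1.7
max_possible = (toy.max() - toy.min()).sum()           # 1.0 + 0.7 = 1.7
print(raw / max_possible)                              # 1.0: these two states differ maximally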
Which states are most similar to each other?
s = 1 - d  # Convert distances to similarities
s = s.replace(1, np.nan)  # Mask perfect similarity (a state compared with itself)
# The similarity matrix stays wide (one column per state); the chart's fold transform will melt it
source = s.rename_axis('State', axis=0).rename_axis('Similarity_to', axis=1).reset_index()
# Get geospatial data so we can plot the map
gdf = gpd.read_file('https://cdn.jsdelivr.net/npm/us-atlas@3/states-10m.json').drop('id', axis=1)
gdf.rename(columns={'name': 'State'}, inplace=True)
source = gpd.GeoDataFrame(pd.merge(source, gdf, on='State'))  # Attach each state's geometry
# Clicking the transparent top layer drives the selection; it starts on New York
selection = alt.selection_single(fields=['Similarity_to'], empty='none', init={'Similarity_to': 'New York'})
map1 = alt.Chart(source).mark_geoshape(
    stroke='black',
    opacity=0.1,
).encode(
    tooltip='State',
).add_selection(
    selection
)
map2 = alt.Chart(source).mark_geoshape(
    stroke='black'
).encode(
    color=alt.Color('Similarity:Q', legend=None),
    tooltip=['State']
).transform_fold(
    list(source.drop(['State', 'geometry'], axis=1).columns),  # Keep State and geometry, fold the similarity columns
    ['Similarity_to', 'Similarity']
).transform_filter(
    selection
).properties(
    projection={'type': 'albersUsa'},
    width=700,
    height=400
)
map2 + map1
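The transform_fold step reshapes the wide similarity columns into long form inside the chart itself, so after the filter Vega-Lite only ever sees one Similarity_to/Similarity pair per state. It behaves roughly like pandas' melt; a sketch with made-up values:

# Roughly what transform_fold does, expressed with pandas melt (illustrative)
wide = pd.DataFrame({'State': ['Ohio'], 'Alabama': [0.8], 'Alaska': [0.6]})
long = wide.melt(id_vars='State', var_name='Similarity_to', value_name='Similarity')
#   State Similarity_to  Similarity
# 0  Ohio       Alabama         0.8
# 1  Ohio        Alaska         0.6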
# Same idea, but driven by a dropdown instead of clicks, and using a lookup instead of a pre-merged GeoDataFrame
sim = pd.DataFrame(source.drop('geometry', axis=1))  # The lookup table must be plain tabular data, so drop the geometry
input_dropdown = alt.binding_select(options=sim.State.unique())
selection = alt.selection_single(name=' ', fields=['Similarity_to'], bind=input_dropdown, init={'Similarity_to': 'New York'})
map1 = alt.Chart(gdf).mark_geoshape(
    stroke='black',
    color='green',
)
map2 = alt.Chart(gdf).mark_geoshape(
    stroke='black'
).encode(
    color=alt.Color('Similarity:Q', legend=None),
    tooltip=['State:O', 'Similarity:Q']
).properties(
    projection={'type': 'albersUsa'},
    width=700,
    height=400
).transform_lookup(
    lookup='State',
    from_=alt.LookupData(sim, 'State', list(sim.columns))
).transform_fold(
    list(sim.drop('State', axis=1).columns),  # Keep the State column, fold the rest
    ['Similarity_to', 'Similarity']
).transform_filter(
    selection
).add_selection(
    selection
)
map1 + map2
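transform_lookup is effectively a left join done inside the chart: each shape in gdf pulls in its row of similarity columns by state name. In pandas terms, roughly:

# Roughly what transform_lookup does, expressed as a pandas join (illustrative)
joined = gdf.merge(sim, on='State', how='left')

Keeping the geometry and the similarity table separate this way means neither has to be re-merged into a single GeoDataFrame, as the first map required.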
# A state's "weirdness" is its median distance to every other state
source = d.median(axis=1).reset_index()
source.columns = ['State', 'Weirdness']
alt.Chart(source, title='The Weirdest State in the US').mark_bar().encode(
    x=alt.X('State', sort='-y'),
    y=alt.Y('Weirdness'),
    color=alt.Color('Weirdness', legend=None),
    tooltip=['Weirdness']
).properties(
    width=980
)
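Using the median rather than the mean presumably keeps a single extreme pairing from dominating a state's score; a quick illustration with made-up distances:

# Illustrative: one outlier skews the mean far more than the median
dists = pd.Series([0.10, 0.12, 0.11, 0.95])
print(dists.mean())    # 0.32
print(dists.median())  # 0.115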