# Installing covidcast
!pip install covidcast
from datetime import date
import pandas as pd
import covidcast
import matplotlib.pyplot as plt
# CONSTANTS
SOURCES = ['chng.smoothed_adj_outpatient_covid',
           'doctor-visits.smoothed_adj_cli',
           'hospital-admissions.smoothed_adj_covid19_from_claims',
           'google-symptoms.anosmia_smoothed_search',
           'google-symptoms.ageusia_smoothed_search',
           'indicator-combination.confirmed_incidence_num']
RENAMES = ['visits_with_covid',
           'visits_with_covid_symptoms',
           'admissions_with_covid',
           'google_anosmia_searches',
           'google_ageusia_searches',
           'confirmed_instances']
# get all CA counties and build a set of their FIPS IDs
CA_COUNTIES = covidcast.fips_to_name("^06.*", ties_method="all")[0]
CA_COUNTIES.pop('06000')  # drop the state-level entry so only counties remain
COUNTY_IDS = set(CA_COUNTIES.keys())
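# optional sanity check (a hedged check, assuming "^06.*" matched the 58 CA county
# FIPS codes plus the state-level 06000 entry popped above)
assert len(COUNTY_IDS) == 58, 'expected 58 California counties, got {}'.format(len(COUNTY_IDS))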
all_dfs = []
for idx, source in enumerate(SOURCES):
    src, signal = source.split('.')
    tmp_df = covidcast.signal(src, signal, geo_type="county", geo_values=list(COUNTY_IDS), time_type='day')
    all_dfs.append(tmp_df)
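# covidcast.signal() also takes start_day/end_day (datetime.date) to bound the
# download instead of pulling each signal's full history; a hedged sketch of a
# single bounded fetch (example_df and the dates are illustrative only):
example_df = covidcast.signal('doctor-visits', 'smoothed_adj_cli',
                              start_day=date(2020, 6, 1), end_day=date(2020, 6, 7),
                              geo_type="county", geo_values=list(COUNTY_IDS), time_type='day')
example_df.head()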
import pickle
# cache the fetched frames: the quoted dump block writes dfs.p once;
# later runs load it instead of re-downloading everything
'''
with open('dfs.p', 'wb') as p:
    pickle.dump(all_dfs, p)
'''
with open('dfs.p', 'rb') as p:
    all_dfs = pickle.load(p)
df_agg = covidcast.aggregate_signals(all_dfs)
df_agg.head()
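# aggregate_signals() joins the frames on geo_value/time_value and names each value
# column '<source>_<signal>_<i>_value', the pattern the rename loop below relies on;
# a quick look at the value columns it produced:
[c for c in df_agg.columns if c.endswith('_value')]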
# keep only the counties that have the Google symptom data we want to use
counties_used = all_dfs[4]['geo_value'].unique()  # counties present in the ageusia searches frame
df_agg = df_agg[df_agg['geo_value'].isin(counties_used)]
df_agg['geo_value'].unique()
for idx, source in enumerate(SOURCES):
    name = source.split('.')[0] + '_' + source.split('.')[1] + '_{}_value'.format(idx)
    df_agg = df_agg.rename(columns={name: RENAMES[idx]})
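# the loop above could also build one mapping and rename in a single call; running
# this after the loop is a harmless no-op since the old names are already gone
# (a sketch of the equivalent idiom, nothing new is renamed here)
rename_map = {source.replace('.', '_') + '_{}_value'.format(idx): RENAMES[idx]
              for idx, source in enumerate(SOURCES)}
df_agg = df_agg.rename(columns=rename_map)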
keep_columns = ['geo_value',
                'time_value',
                'visits_with_covid',
                'visits_with_covid_symptoms',
                'admissions_with_covid',
                'google_anosmia_searches',
                'google_ageusia_searches',
                'confirmed_instances']
df_agg = df_agg[keep_columns]
df_agg.head(15)
# drop rows with no label and reset the row index
df_agg = df_agg.dropna(subset=['confirmed_instances'])
df_agg.reset_index(drop=True, inplace=True)
df_agg.isna().sum()
missing = ['visits_with_covid', 'admissions_with_covid', 'google_anosmia_searches', 'google_ageusia_searches']
for feat in missing:
    df_agg[feat] = df_agg[feat].interpolate()
still_missing = ['google_anosmia_searches', 'google_ageusia_searches']
for feat in still_missing:
    df_agg[feat] = df_agg[feat].fillna(df_agg[feat].median())
df_agg.isna().sum()
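# note: interpolate() above runs over the whole concatenated frame, so a gap at the
# start of one county's rows can be filled from the previous county's values; a
# per-county alternative looks like this (a sketch computed for comparison only,
# per_county_interp is not written back into df_agg)
per_county_interp = df_agg.groupby('geo_value')[missing].transform(lambda s: s.interpolate())
per_county_interp.isna().sum()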
df_agg.head()
features = ['visits_with_covid', 'visits_with_covid_symptoms',
            'admissions_with_covid', 'google_anosmia_searches', 'google_ageusia_searches']
with open('data.csv', 'w') as f:
    f.write('county_id,visits_with_covid_n-1,visits_with_covid_symptoms_n-1,admissions_with_covid_n-1,'
            'google_anosmia_searches_n-1,google_ageusia_searches_n-1,visits_with_covid_n,visits_with_covid_symptoms_n,'
            'admissions_with_covid_n,google_anosmia_searches_n,google_ageusia_searches_n,instances_n+1\n')
    # loop through each county
    for county_id in df_agg['geo_value'].unique():
        tmp_df = df_agg[df_agg['geo_value'] == county_id].reset_index(drop=True)
        # stop one row early: the last day has no next-day label
        for i in range(1, len(tmp_df.index) - 1):
            f.write(str(county_id) + ',')
            # previous day's features (n-1)
            for val in tmp_df.iloc[i - 1][features].values:
                f.write(str(val) + ',')
            # current day's features (n)
            for val in tmp_df.iloc[i][features].values:
                f.write(str(val) + ',')
            # label: next day's confirmed case count (n+1)
            f.write(str(tmp_df.iloc[i + 1]['confirmed_instances']) + '\n')
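# the same lagged-feature table can also be built without the explicit write loop by
# shifting within each county (a hedged sketch, not the code used above; df_vec and
# the other names here are illustrative, with columns mirroring the data.csv header)
grouped = df_agg.groupby('geo_value')
frames = [df_agg[['geo_value']].rename(columns={'geo_value': 'county_id'}),
          grouped[features].shift(1).add_suffix('_n-1'),
          df_agg[features].add_suffix('_n'),
          grouped['confirmed_instances'].shift(-1).rename('instances_n+1')]
df_vec = pd.concat(frames, axis=1).dropna()
df_vec.head()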
# check that it works
df_test = pd.read_csv('data.csv', index_col=False)
df_test.head()
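# data.csv should round-trip with 12 fields per row: county_id, five n-1 features,
# five n features, and the n+1 label (a lightweight check)
assert df_test.shape[1] == 12
df_test.shape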