# Installing covidcast
!pip install covidcast
from datetime import date
import pandas as pd
import covidcast
import matplotlib.pyplot as plt
# CONSTANTS: COVIDcast "source.signal" pairs for the indicators we want to download
SOURCES = ['chng.smoothed_adj_outpatient_covid',
           'doctor-visits.smoothed_adj_cli',
           'hospital-admissions.smoothed_adj_covid19_from_claims',
           'google-symptoms.anosmia_smoothed_search',
           'google-symptoms.ageusia_smoothed_search',
           'indicator-combination.confirmed_incidence_num']
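# Shorter, human-readable column names; RENAMES[i] replaces the column produced by SOURCES[i]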
RENAMES = ['visits_with_covid',
           'visits_with_covid_symptoms',
           'admissions_with_covid',
           'google_anosmia_searches',
           'google_ageusia_searches',
           'confirmed_instances']
# Get all California counties (FIPS codes starting with "06") and build a set of their IDs;
# drop '06000', which refers to the state as a whole rather than an individual county
CA_COUNTIES = covidcast.fips_to_name("^06.*", ties_method="all")[0]
CA_COUNTIES.pop('06000')
COUNTY_IDS = set(CA_COUNTIES.keys())
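# Sanity check: California has 58 counties, so COUNTY_IDS should contain 58 FIPS codes
print(len(COUNTY_IDS), 'county FIPS codes selected')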
# Download each signal for every CA county over its full available history
all_dfs = []
for source in SOURCES:
    src, signal = source.split('.')
    tmp_df = covidcast.signal(src, signal, geo_type="county",
                              geo_values=list(COUNTY_IDS), time_type='day')
    all_dfs.append(tmp_df)
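# The calls above pull each signal's full history, which can be slow. covidcast.signal also
# accepts start_day/end_day date bounds to restrict the pull to a window. fetch_window below
# is a hypothetical helper sketching this; the dates are arbitrary examples and it is not used.
def fetch_window(src, signal, start=date(2020, 6, 1), end=date(2021, 1, 1)):
    # Same query as above, but bounded to [start, end]
    return covidcast.signal(src, signal, start_day=start, end_day=end,
                            geo_type="county", geo_values=list(COUNTY_IDS),
                            time_type='day')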
import pickle
# Cache the downloaded frames so later runs can skip the API calls above:
# run the dump once to create dfs.p, then reload it from disk on subsequent runs
# with open('dfs.p', 'wb') as p:
#     pickle.dump(all_dfs, p)
with open('dfs.p', 'rb') as p:
    all_dfs = pickle.load(p)
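# aggregate_signals joins the individual frames into one table keyed by (geo_value, time_value)
# and names each value column '{source}_{signal}_{i}_value', where i is the signal's position
# in the input list; the rename loop further down relies on this naming convention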
df_agg = covidcast.aggregate_signals(all_dfs)
df_agg.head()
# Keep only counties that appear in the Google symptoms data we want to use
# (all_dfs[4] is the ageusia search signal); other counties lack these features
counties_used = all_dfs[4]['geo_value'].unique()
df_agg = df_agg[df_agg['geo_value'].isin(counties_used)]
df_agg['geo_value'].unique()
# Rename the aggregated value columns to the shorter names in RENAMES
for idx, source in enumerate(SOURCES):
    src, signal = source.split('.')
    df_agg = df_agg.rename(columns={f'{src}_{signal}_{idx}_value': RENAMES[idx]})
keep_columns = ['geo_value',
                'time_value',
                'visits_with_covid',
                'visits_with_covid_symptoms',
                'admissions_with_covid',
                'google_anosmia_searches',
                'google_ageusia_searches',
                'confirmed_instances']
df_agg = df_agg[keep_columns]
df_agg.head(15)
# Rows without a confirmed-case count cannot serve as labels, so drop them,
# then check how many values are still missing per column
df_agg = df_agg.dropna(subset=['confirmed_instances'])
df_agg.reset_index(drop=True, inplace=True)
df_agg.isna().sum()
# Linearly interpolate features that have occasional gaps...
missing = ['visits_with_covid', 'admissions_with_covid', 'google_anosmia_searches', 'google_ageusia_searches']
for feat in missing:
    df_agg[feat] = df_agg[feat].interpolate()
# ...then fill anything interpolation could not reach (e.g. leading gaps) with the column median
still_missing = ['google_anosmia_searches', 'google_ageusia_searches']
for feat in still_missing:
    df_agg[feat] = df_agg[feat].fillna(df_agg[feat].median())
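# Note: the interpolation above runs over the stacked frame, so a gap at the edge of one
# county's block of rows can be filled using another county's values. A per-county variant
# (a sketch, not used here) would group before interpolating:
# for feat in missing:
#     df_agg[feat] = df_agg.groupby('geo_value')[feat].transform(lambda s: s.interpolate())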
df_agg.isna().sum()
df_agg.head()
# Model input features; confirmed_instances is kept aside as the label
features = ['visits_with_covid', 'visits_with_covid_symptoms',
            'admissions_with_covid', 'google_anosmia_searches', 'google_ageusia_searches']
# Write one training row per (county, day): features at day n-1 and day n,
# with confirmed cases at day n+1 as the label
with open('data.csv', 'w') as f:
    header = (['county_id']
              + [f'{feat}_n-1' for feat in features]
              + [f'{feat}_n' for feat in features]
              + ['instances_n+1'])
    f.write(','.join(header) + '\n')
    # loop through each county
    for county_id in df_agg['geo_value'].unique():
        tmp_df = df_agg[df_agg['geo_value'] == county_id].reset_index(drop=True)
        # skip the first day (no n-1 features) and the last day (no n+1 label)
        for i in range(1, len(tmp_df.index) - 1):
            f.write(str(county_id) + ',')
            for val in tmp_df.iloc[i - 1][features].values:
                f.write(str(val) + ',')
            for val in tmp_df.iloc[i][features].values:
                f.write(str(val) + ',')
            f.write(str(tmp_df.iloc[i + 1]['confirmed_instances']) + '\n')
# check that it works
df_test = pd.read_csv('data.csv', index_col=False)
df_test.head()
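# An equivalent, vectorized way to build the same lag/lead table with pandas groupby/shift
# (a sketch for comparison only; data.csv above was written by the explicit loop)
lagged = df_agg.sort_values(['geo_value', 'time_value']).copy()
grouped = lagged.groupby('geo_value')
for feat in features:
    lagged[f'{feat}_n-1'] = grouped[feat].shift(1)                  # previous day's value
lagged['instances_n+1'] = grouped['confirmed_instances'].shift(-1)  # next day's label
# drop each county's first day (no lagged features) and last day (no label)
lagged = lagged.dropna(subset=[f'{features[0]}_n-1', 'instances_n+1'])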