# Installing covidcast
!pip install covidcast
from datetime import date
import pandas as pd
import covidcast
import matplotlib.pyplot as plt
# CONSTANTS: COVIDcast "source.signal" pairs for the indicators we want to download
SOURCES = ['chng.smoothed_adj_outpatient_covid',
           'doctor-visits.smoothed_adj_cli',
           'hospital-admissions.smoothed_adj_covid19_from_claims',
           'google-symptoms.anosmia_smoothed_search',
           'google-symptoms.ageusia_smoothed_search',
           'indicator-combination.confirmed_incidence_num']
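# Shorter, human-readable column names; RENAMES[i] replaces the column produced by SOURCES[i]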
RENAMES = ['visits_with_covid',
           'visits_with_covid_symptoms',
           'admissions_with_covid',
           'google_anosmia_searches',
           'google_ageusia_searches',
           'confirmed_instances']
# Get all California counties (FIPS codes starting with "06") and build a set of their IDs;
# drop '06000', which refers to the state as a whole rather than an individual county
CA_COUNTIES = covidcast.fips_to_name("^06.*", ties_method="all")[0]
CA_COUNTIES.pop('06000')
COUNTY_IDS = set(CA_COUNTIES.keys())
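# Sanity check: California has 58 counties, so COUNTY_IDS should contain 58 FIPS codes
print(len(COUNTY_IDS), 'county FIPS codes selected')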
# Download each signal for every CA county over its full available history
all_dfs = []
for source in SOURCES:
    src, signal = source.split('.')
    tmp_df = covidcast.signal(src, signal, geo_type="county",
                              geo_values=list(COUNTY_IDS), time_type='day')
    all_dfs.append(tmp_df)
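# The calls above pull each signal's full history, which can be slow. covidcast.signal also
# accepts start_day/end_day date bounds to restrict the pull to a window. fetch_window below
# is a hypothetical helper sketching this; the dates are arbitrary examples and it is not used.
def fetch_window(src, signal, start=date(2020, 6, 1), end=date(2021, 1, 1)):
    # Same query as above, but bounded to [start, end]
    return covidcast.signal(src, signal, start_day=start, end_day=end,
                            geo_type="county", geo_values=list(COUNTY_IDS),
                            time_type='day')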
import pickle
# Cache the downloaded frames so later runs can skip the API calls above:
# run the dump once to create dfs.p, then reload it from disk on subsequent runs
# with open('dfs.p', 'wb') as p:
#     pickle.dump(all_dfs, p)
with open('dfs.p', 'rb') as p:
    all_dfs = pickle.load(p)
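# aggregate_signals joins the individual frames into one table keyed by (geo_value, time_value)
# and names each value column '{source}_{signal}_{i}_value', where i is the signal's position
# in the input list; the rename loop further down relies on this naming convention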
df_agg = covidcast.aggregate_signals(all_dfs)
df_agg.head()
# Keep only counties that appear in the Google symptoms data we want to use
# (all_dfs[4] is the ageusia search signal); other counties lack these features
counties_used = all_dfs[4]['geo_value'].unique()
df_agg = df_agg[df_agg['geo_value'].isin(counties_used)]
df_agg['geo_value'].unique()
# Rename the aggregated value columns to the shorter names in RENAMES
for idx, source in enumerate(SOURCES):
    src, signal = source.split('.')
    df_agg = df_agg.rename(columns={f'{src}_{signal}_{idx}_value': RENAMES[idx]})
keep_columns = ['geo_value',
                'time_value',
                'visits_with_covid',
                'visits_with_covid_symptoms',
                'admissions_with_covid',
                'google_anosmia_searches',
                'google_ageusia_searches',
                'confirmed_instances']
df_agg = df_agg[keep_columns]
df_agg.head(15)
# Rows without a confirmed-case count cannot serve as labels, so drop them,
# then check how many values are still missing per column
df_agg = df_agg.dropna(subset=['confirmed_instances'])
df_agg.reset_index(drop=True, inplace=True)
df_agg.isna().sum()
# Linearly interpolate features that have occasional gaps...
missing = ['visits_with_covid', 'admissions_with_covid', 'google_anosmia_searches', 'google_ageusia_searches']
for feat in missing:
    df_agg[feat] = df_agg[feat].interpolate()
# ...then fill anything interpolation could not reach (e.g. leading gaps) with the column median
still_missing = ['google_anosmia_searches', 'google_ageusia_searches']
for feat in still_missing:
    df_agg[feat] = df_agg[feat].fillna(df_agg[feat].median())
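# Note: the interpolation above runs over the stacked frame, so a gap at the edge of one
# county's block of rows can be filled using another county's values. A per-county variant
# (a sketch, not used here) would group before interpolating:
# for feat in missing:
#     df_agg[feat] = df_agg.groupby('geo_value')[feat].transform(lambda s: s.interpolate())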
df_agg.isna().sum()
df_agg.head()
# Model input features; confirmed_instances is kept aside as the label
features = ['visits_with_covid', 'visits_with_covid_symptoms',
            'admissions_with_covid', 'google_anosmia_searches', 'google_ageusia_searches']
# Write one training row per (county, day): features at day n-1 and day n,
# with confirmed cases at day n+1 as the label
with open('data.csv', 'w') as f:
    header = (['county_id']
              + [f'{feat}_n-1' for feat in features]
              + [f'{feat}_n' for feat in features]
              + ['instances_n+1'])
    f.write(','.join(header) + '\n')
    # loop through each county
    for county_id in df_agg['geo_value'].unique():
        tmp_df = df_agg[df_agg['geo_value'] == county_id].reset_index(drop=True)
        # skip the first day (no n-1 features) and the last day (no n+1 label)
        for i in range(1, len(tmp_df.index) - 1):
            f.write(str(county_id) + ',')
            for val in tmp_df.iloc[i - 1][features].values:
                f.write(str(val) + ',')
            for val in tmp_df.iloc[i][features].values:
                f.write(str(val) + ',')
            f.write(str(tmp_df.iloc[i + 1]['confirmed_instances']) + '\n')
# check that it works
df_test = pd.read_csv('data.csv', index_col=False)
df_test.head()
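# An equivalent, vectorized way to build the same lag/lead table with pandas groupby/shift
# (a sketch for comparison only; data.csv above was written by the explicit loop)
lagged = df_agg.sort_values(['geo_value', 'time_value']).copy()
grouped = lagged.groupby('geo_value')
for feat in features:
    lagged[f'{feat}_n-1'] = grouped[feat].shift(1)                  # previous day's value
lagged['instances_n+1'] = grouped['confirmed_instances'].shift(-1)  # next day's label
# drop each county's first day (no lagged features) and last day (no label)
lagged = lagged.dropna(subset=[f'{features[0]}_n-1', 'instances_n+1'])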