!pip install covidcast
from datetime import date
import covidcast
import pandas as pd
import numpy as np
# Look up every FIPS code that starts with "06" (California). With
# ties_method="all" the lookup returns one dict per query pattern, mapping
# each matching FIPS code to the list of names recorded for it.
ca_names_by_fips = covidcast.fips_to_name("^06.*", ties_method="all")[0]
ca_counties = list(ca_names_by_fips.values())

# Collapse each code's name list into a single string.
counties_string = ["".join(names) for names in ca_counties]
counties_string = counties_string[1:59]  # removing 'california'

# Take the FIPS codes directly from the lookup keys, sliced the same way as
# the names so both lists stay aligned. Going back from names to codes with
# name_to_fips is avoided on purpose: that lookup matches names across all
# states and returns the first hit, so a name shared with a county elsewhere
# can resolve to a non-California code.
ca_counties_fips = list(ca_names_by_fips)[1:59]

# NOTE(review): `google_sum_fips` is passed to every signal request further
# down but is never assigned anywhere in this file. It is bound here to the
# California county list so the script runs top to bottom; confirm this is
# the intended county set.
google_sum_fips = ca_counties_fips
ca_counties_fips
# Fetch the signal whose values are used as the labels. No start_day/end_day
# is passed here, so the date range is whatever covidcast.signal defaults to.
data = covidcast.signal(
    data_source="indicator-combination",
    signal="confirmed_incidence_num",
    geo_values=google_sum_fips,
)
data.head()
data.tail()

# The reported values are the labels.
labels = data["value"]
labels.size  # number of observations

# Count missing entries per column (checking the value column in particular).
data.isna().sum()
# Observation: many dates are missing in the month of November.
# Fetch the "smoothed_outpatient_cli" signal from the "chng" source for the
# selected counties between 2020-02-20 and 2021-11-12.
chng = covidcast.signal(
    data_source="chng",
    signal="smoothed_outpatient_cli",
    start_day=date(2020, 2, 20),
    end_day=date(2021, 11, 12),
    geo_values=google_sum_fips,
)

# Quick inspection: first/last rows, dimensions, missing values per column.
chng.head()
chng.tail()
chng.shape
chng.isna().sum()
# Fetch the "smoothed_covid19_from_claims" signal from the
# "hospital-admissions" source for the selected counties between
# 2020-02-20 and 2021-11-12.
hosp = covidcast.signal(
    data_source="hospital-admissions",
    signal="smoothed_covid19_from_claims",
    start_day=date(2020, 2, 20),
    end_day=date(2021, 11, 12),
    geo_values=google_sum_fips,
)

# Quick inspection: first/last rows and dimensions.
hosp.head()
hosp.tail()
hosp.shape
# The three "google-symptoms" requests share the same counties and date
# window (2020-02-20 to 2021-11-12), so those arguments are built once.
symptom_query = dict(
    geo_values=google_sum_fips,
    start_day=date(2020, 2, 20),
    end_day=date(2021, 11, 12),
)

# Combined anosmia + ageusia search signal.
google_sum = covidcast.signal(
    "google-symptoms", "sum_anosmia_ageusia_raw_search", **symptom_query
)
google_sum.head()

# Ageusia-only search signal.
google_ageusia = covidcast.signal(
    "google-symptoms", "ageusia_raw_search", **symptom_query
)
google_ageusia.head()

# Anosmia-only search signal.
google_anosmia = covidcast.signal(
    "google-symptoms", "anosmia_raw_search", **symptom_query
)
google_anosmia.head()
# Check whether the counties match up across signals: collect the county
# codes present in the ageusia frame that also appear in the hospital frame.
# NOTE(review): `ageusia_fips` and `hosp_fips` were never assigned in this
# file; they are derived here from each frame's `geo_value` column, which is
# presumably what the original notebook state held -- confirm.
ageusia_fips = np.unique(google_ageusia["geo_value"])
hosp_fips = np.unique(hosp["geo_value"])

# A set gives constant-time membership tests for the comparison below.
hosp_fips_lookup = set(hosp_fips)
fips = [code for code in ageusia_fips if code in hosp_fips_lookup]
# Fetch the "smoothed_cli" signal from the "doctor-visits" source for the
# selected counties between 2020-02-20 and 2021-11-12.
doc = covidcast.signal(
    data_source="doctor-visits",
    signal="smoothed_cli",
    start_day=date(2020, 2, 20),
    end_day=date(2021, 11, 12),
    geo_values=google_sum_fips,
)

# Quick inspection: first/last rows, dimensions, missing values per column.
doc.head()
doc.tail()
np.array(doc).shape
doc.isna().sum()
# Combine the seven signal frames into one wide table. Each frame's position
# in this list is the number embedded in its column names downstream (e.g.
# "..._0_value" for the first entry), so the order must stay as it is.
signal_frames = [
    hosp,
    chng,
    doc,
    google_sum,
    google_anosmia,
    google_ageusia,
    data,
]
merged = covidcast.aggregate_signals(signal_frames)
merged
merged.columns
# Keep only the keys and each signal's value column by dropping the
# per-signal bookkeeping columns plus `geo_type`.
# The columns are selected by suffix rather than listed one by one, so the
# drop does not raise KeyError when a bookkeeping column is absent from the
# merged frame. A plain "..._value" column matches none of these suffixes
# and is therefore kept.
metadata_suffixes = (
    "_issue",
    "_lag",
    "_missing_value",
    "_missing_stderr",
    "_missing_sample_size",
    "_stderr",
    "_sample_size",
)
metadata_columns = [
    column
    for column in merged.columns
    if column == "geo_type" or column.endswith(metadata_suffixes)
]
x = merged.drop(columns=metadata_columns)
x
x.columns
# Column names used repeatedly in this step.
ageusia_col = 'google-symptoms_ageusia_raw_search_5_value'
label_col = 'indicator-combination_confirmed_incidence_num_6_value'

# Missing values per column after the merge.
x.isna().sum()

# Counties for which the ageusia column is missing in the merged table.
ageusia_missing = x[ageusia_col].isna()
np.unique(x.loc[ageusia_missing, 'geo_value'])

x.info()

# Smallest label value; this inspection showed negative labels.
min(x[label_col])
x[x[label_col] < 0]

# Convert every negative label to its positive magnitude.
x[label_col] = x[label_col].abs()

# Drop the ageusia column because it is missing for a county.
x = x.drop(columns=[ageusia_col])
x
x.isna().sum()
# Impute gaps by forward filling from the previous observation within each
# county, then drop whatever is still missing and write the result to disk.

# Work on a copy: a plain `updated_x = x` would only alias the frame, so the
# fills below would silently overwrite `x` as well.
updated_x = x.copy()

# Signal columns to impute; all other columns are left untouched.
ffill_columns = [
    'hospital-admissions_smoothed_covid19_from_claims_0_value',
    'chng_smoothed_outpatient_cli_1_value',
    'google-symptoms_sum_anosmia_ageusia_raw_search_3_value',
    'google-symptoms_anosmia_raw_search_4_value',
]

# Grouping by county keeps one county's values from leaking into the next.
# NOTE(review): assumes rows within each county are in chronological order
# -- confirm against the merge output.
updated_x[ffill_columns] = updated_x.groupby('geo_value')[ffill_columns].ffill()
updated_x
updated_x.isna().sum()

# Drop remaining NA values (e.g. gaps with no earlier observation to copy).
updated_x = updated_x.dropna(axis=0)
updated_x.isna().sum()
updated_x

# Persist the prepared dataset (the index is written as the first column).
updated_x.to_csv(r'data_preparation.csv')