import numpy as np
import pandas as pd
import plotly.graph_objects as go
# Load the per-state case table from the jsDelivr mirror of the GitHub repo.
# Later cells treat column 0 as the state name and the remaining columns as
# one confirmed-count value per day.
data = pd.read_csv(
    filepath_or_buffer="https://cdn.jsdelivr.net/gh/jeffcore/covid-19-usa-by-state@master/COVID-19-Cases-USA-By-State.csv",
)
上面的数据是实时更新的
数据源头为约翰霍普金斯大学的数据
data  # bare expression: renders the DataFrame when it ends a notebook cell; a no-op in a plain script
# Reduce the table to (state name, latest confirmed count) pairs.
data_1 = data.iloc[:, [0, -1]]
data_2 = np.array(data_1)
# argsort gives the ascending order of the count column; flipping that order
# puts the state with the largest count first.
data_3 = data_2[np.argsort(data_2[:, 1])[::-1]]
按照最后一天的确诊倒序排序
# Bar chart of the latest confirmed count for every row, largest first.
fig = go.Figure(
    data=[
        go.Bar(
            x=data_3[:, 0],
            y=data_3[:, 1],
        ),
    ],
)
fig.update_layout(title={'text': '各州总确诊人数柱状图'})

# Same chart restricted to the five rows with the highest counts.
fig = go.Figure(
    data=[
        go.Bar(
            x=data_3[:5, 0],
            y=data_3[:5, 1],
        ),
    ],
)
fig.update_layout(title={'text': '最高的五个州总确诊人数柱状图'})
选出了前五个州作为重点州单独预测,分别是
- California CA 加利福尼亚州
- Texas TX 德克萨斯州
- Florida FL 佛罗里达州
- New York NY 纽约州
- Illinois IL 伊利诺伊州
美国各州地图
# Full state / territory name -> two-letter postal abbreviation, as expected
# by plotly's 'USA-states' location mode.
states = {
    "Alabama": "AL",
    "Alaska": "AK",
    "American Samoa": "AS",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "District of Columbia": "DC",
    "Florida": "FL",
    "Georgia": "GA",
    "Guam": "GU",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Northern Mariana Islands": "MP",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Puerto Rico": "PR",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virgin Islands": "VI",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}
美国各州与其简称的字典
# Translate each state name into its two-letter postal code for the map.
# Rows with no entry in `states` (the Grand Princess and Diamond Princess
# cruise ships, plus anything else the upstream feed may contain) cannot be
# placed on a USA-states map, so they are skipped rather than raising KeyError.
codes = []
confirmed = []
for name, count in data_3:
    code = states.get(name)
    if code is None:
        continue
    codes.append(code)
    confirmed.append(count)
# Bundle the two parallel lists into one frame for convenience.
df = pd.DataFrame({'codes': codes, 'confirmed': confirmed})
# Choropleth of the latest confirmed count per state, keyed by postal code.
fig = go.Figure(data=go.Choropleth(
    locations=df['codes'],
    z=df['confirmed'].astype(float),  # colour scale input, cast to float
    locationmode='USA-states',
    hovertext=df['codes'],
    colorscale='Reds',
    colorbar_title="Confirmed Number",
))
fig.update_layout(
    # The title previously misspelled the disease name as "Convid-19".
    title_text='American COVID-19 Counts',
    geo_scope='usa',  # restrict the base map to the United States
)
各州确诊人数的一个地图
# One line per row of the table: x is the 1-based day number, y is that row's
# confirmed counts, and the trace is labelled with the state name in column 0.
data_4 = np.array(data)
fig = go.Figure()
fig.add_traces([
    go.Scatter(x=np.arange(1, row.size), y=row[1:], name=row[0])
    for row in data_4
])
fig.update_layout(title_text='各州确诊人数折线图')
# Same line chart, limited to the five focus states (matched by name).
fig = go.Figure()
fig.add_traces([
    go.Scatter(x=np.arange(1, row.size), y=row[1:], name=row[0])
    for row in data_4
    if row[0] in ('California', 'Texas', 'Florida', 'New York', 'Illinois')
])
fig.update_layout(title_text='前五个州确诊人数折线图')
上面是五个重点州的确诊人数折线图,我们截取第 $300$ 天以后的数据进行训练
我们把每一个州分开,分别训练
# Illinois: quadratic fit of the confirmed counts from day 300 onward.
# The row is looked up by state name (column 0) instead of a fixed position,
# so the fit still targets Illinois if the upstream CSV adds or reorders rows.
row_IL = data.loc[data.iloc[:, 0] == 'Illinois'].iloc[0]
# Element 0 of the row is the state name, so index 300 is the 300th day column.
data_IL_train = np.array(np.array(row_IL)[300:], dtype=int)
# Training x-values: 0, 1, 2, ... counted from the start of the window.
data_IL_train_x = np.arange(data_IL_train.size)
a_IL, b_IL, c_IL = np.polyfit(data_IL_train_x, data_IL_train, deg=2)


def f_IL(x):
    """Evaluate the fitted Illinois quadratic at x (offset from the window start)."""
    return x * x * a_IL + x * b_IL + c_IL


# Overlay the observed series and the fitted curve.
fig = go.Figure(data=go.Scatter(
    x=data_IL_train_x, y=data_IL_train,
    line_color='rgb(16,50,92)', name='TrueData',
))
fig.add_trace(go.Scatter(
    x=data_IL_train_x, y=f_IL(data_IL_train_x),
    line_color='rgb(100,10,100)', name='Polyfit',
))
fig.update_layout(title_text='伊利诺斯确诊人数与拟合结果折线图')
对于伊利诺伊州,我们使用了一个二次多项式进行拟合,上图为实际数据与拟合结果的对比折线图
# California: quadratic fit of the confirmed counts from day 300 onward.
# The row is looked up by state name (column 0) instead of a fixed position,
# so the fit still targets California if the upstream CSV adds or reorders rows.
row_CA = data.loc[data.iloc[:, 0] == 'California'].iloc[0]
# Element 0 of the row is the state name, so index 300 is the 300th day column.
data_CA_train = np.array(np.array(row_CA)[300:], dtype=int)
# Training x-values: 0, 1, 2, ... counted from the start of the window.
data_CA_train_x = np.arange(data_CA_train.size)
a_CA, b_CA, c_CA = np.polyfit(data_CA_train_x, data_CA_train, deg=2)


def f_CA(x):
    """Evaluate the fitted California quadratic at x (offset from the window start)."""
    return x * x * a_CA + x * b_CA + c_CA


# Overlay the observed series and the fitted curve.
fig = go.Figure(data=go.Scatter(
    x=data_CA_train_x, y=data_CA_train,
    line_color='rgb(16,50,92)', name='TrueData',
))
fig.add_trace(go.Scatter(
    x=data_CA_train_x, y=f_CA(data_CA_train_x),
    line_color='rgb(100,10,100)', name='Polyfit',
))
fig.update_layout(title_text='加利佛尼亚确诊人数与拟合结果折线图')
对于加利福尼亚州,我们使用了一个二次多项式进行拟合,上图为实际数据与拟合结果的对比折线图
拟合结果并不是那么好
# New York: quadratic fit of the confirmed counts from day 300 onward.
# The row is looked up by state name (column 0) instead of a fixed position,
# so the fit still targets New York if the upstream CSV adds or reorders rows.
row_NY = data.loc[data.iloc[:, 0] == 'New York'].iloc[0]
# Element 0 of the row is the state name, so index 300 is the 300th day column.
data_NY_train = np.array(np.array(row_NY)[300:], dtype=int)
# Training x-values: 0, 1, 2, ... counted from the start of the window.
data_NY_train_x = np.arange(data_NY_train.size)
a_NY, b_NY, c_NY = np.polyfit(data_NY_train_x, data_NY_train, deg=2)


def f_NY(x):
    """Evaluate the fitted New York quadratic at x (offset from the window start)."""
    return x * x * a_NY + x * b_NY + c_NY


# Overlay the observed series and the fitted curve.
fig = go.Figure(data=go.Scatter(
    x=data_NY_train_x, y=data_NY_train,
    line_color='rgb(16,50,92)', name='TrueData',
))
fig.add_trace(go.Scatter(
    x=data_NY_train_x, y=f_NY(data_NY_train_x),
    line_color='rgb(100,10,100)', name='Polyfit',
))
fig.update_layout(title_text='纽约州确诊人数与拟合结果折线图')
对于纽约州,我们使用了一个二次多项式进行拟合,上图为实际数据与拟合结果的对比折线图
# Florida: quadratic fit of the confirmed counts from day 300 onward.
# The row is looked up by state name (column 0) instead of a fixed position,
# so the fit still targets Florida if the upstream CSV adds or reorders rows.
row_FL = data.loc[data.iloc[:, 0] == 'Florida'].iloc[0]
# Element 0 of the row is the state name, so index 300 is the 300th day column.
data_FL_train = np.array(np.array(row_FL)[300:], dtype=int)
# Training x-values: 0, 1, 2, ... counted from the start of the window.
data_FL_train_x = np.arange(data_FL_train.size)
a_FL, b_FL, c_FL = np.polyfit(data_FL_train_x, data_FL_train, deg=2)


def f_FL(x):
    """Evaluate the fitted Florida quadratic at x (offset from the window start)."""
    return x * x * a_FL + x * b_FL + c_FL


# Overlay the observed series and the fitted curve.
fig = go.Figure(data=go.Scatter(
    x=data_FL_train_x, y=data_FL_train,
    line_color='rgb(16,50,92)', name='TrueData',
))
fig.add_trace(go.Scatter(
    x=data_FL_train_x, y=f_FL(data_FL_train_x),
    line_color='rgb(100,10,100)', name='Polyfit',
))
fig.update_layout(title_text='佛罗里达确诊人数与拟合结果折线图')
对于佛罗里达州,我们使用了一个二次多项式进行拟合,上图为实际数据与拟合结果的对比折线图
# Texas: cubic fit of the confirmed counts from day 300 onward.
# The row is looked up by state name (column 0) instead of a fixed position,
# so the fit still targets Texas if the upstream CSV adds or reorders rows.
row_TX = data.loc[data.iloc[:, 0] == 'Texas'].iloc[0]
# Element 0 of the row is the state name, so index 300 is the 300th day column.
data_TX_train = np.array(np.array(row_TX)[300:], dtype=int)
# Training x-values: 0, 1, 2, ... counted from the start of the window.
data_TX_train_x = np.arange(data_TX_train.size)
a_TX, b_TX, c_TX, d_TX = np.polyfit(data_TX_train_x, data_TX_train, deg=3)


def f_TX(x):
    """Evaluate the fitted Texas cubic at x (offset from the window start)."""
    return a_TX * x * x * x + b_TX * x * x + c_TX * x + d_TX


# Overlay the observed series and the fitted curve.
fig = go.Figure(data=go.Scatter(
    x=data_TX_train_x, y=data_TX_train,
    line_color='rgb(16,50,92)', name='TrueData',
))
fig.add_trace(go.Scatter(
    x=data_TX_train_x, y=f_TX(data_TX_train_x),
    line_color='rgb(100,10,100)', name='Polyfit',
))
fig.update_layout(title_text='德克萨斯确诊人数与拟合结果折线图')
对于德克萨斯州,我们使用了一个三次多项式进行拟合,上图为实际数据与拟合结果的对比折线图
# Combined daily series for every row except the five focus states.
# The focus states are excluded by name (column 0) rather than by row
# position, so the right rows are dropped even if the CSV order changes.
# Column 0 holds the state name, so slicing the full array at column 300
# selects the same days as the per-state fits (`np.array(row)[300:]`).
# The earlier form stripped the name column first and then sliced [300:],
# which started one day later, left the series one sample short, and shifted
# this series' forecast by a day relative to the per-state forecasts.
data_5 = np.array(
    data[~data.iloc[:, 0].isin(['California', 'Texas', 'Florida', 'New York', 'Illinois'])]
)[:, 300:].sum(axis=0)
从数据集中排除五个重点州,然后对每一列(天)求和
# x-values 0, 1, 2, ... for the combined "other states" series.
x_all = np.arange(len(data_5))
# The summed series is cast to a plain integer array before plotting/fitting.
data_5 = np.asarray(data_5).astype(int)
fig = go.Figure(
    data=go.Scatter(
        x=x_all,
        y=data_5,
    ),
)
fig.update_layout(title_text='其他州合计确诊人数折线图')
上图是其他州综合的折线图
# Cubic least-squares fit for the combined "other states" series.
a_all, b_all, c_all, d_all = np.polyfit(x_all, data_5, 3)


def f_all(x):
    """Evaluate the fitted cubic for the combined series at x."""
    return a_all * x * x * x + b_all * x * x + c_all * x + d_all


# Overlay the observed series and the fitted curve.
fig = go.Figure(data=[
    go.Scatter(x=x_all, y=data_5, line_color='rgb(16,50,92)', name='TrueData'),
    go.Scatter(x=x_all, y=f_all(x_all), line_color='rgb(100,10,100)', name='Polyfit'),
])
fig.update_layout(title_text='其他州合计确诊人数与拟合结果折线图')
对于其他州的合计,我们使用了一个三次多项式进行拟合,上图为实际数据与拟合结果的对比折线图
# Forecast x-values: the seven consecutive values starting at
# (row length - 300), i.e. right after the last x used by the per-state fits.
_forecast_start = data.iloc[36, :].size - 300
range_ = np.arange(_forecast_start, _forecast_start + 7)

# California gets its own figure.
fig = go.Figure(data=[
    go.Scatter(x=range_, y=f_CA(range_), name='California', line_color='rgb(99, 110, 250)'),
])
fig.update_layout(title_text='加利佛尼亚州预测结果')

# Texas and Florida share one figure.
fig = go.Figure(data=[
    go.Scatter(x=range_, y=f_TX(range_), name='Texas', line_color='rgb(255, 161, 90)'),
    go.Scatter(x=range_, y=f_FL(range_), name='Florida', line_color='rgb(239, 85, 59)'),
])
fig.update_layout(title_text='德克萨斯州和佛罗里达州预测结果')

# Illinois and New York share one figure.
fig = go.Figure(data=[
    go.Scatter(x=range_, y=f_IL(range_), name='Illinois', line_color='rgb(0, 204, 150)'),
    go.Scatter(x=range_, y=f_NY(range_), name='New York', line_color='rgb(171, 99, 250)'),
])
fig.update_layout(title_text='伊利诺斯州和纽约州预测结果')

# Combined series for all remaining rows.
fig = go.Figure(data=[
    go.Scatter(x=range_, y=f_all(range_), name='Other'),
])
fig.update_layout(title_text='其他州合计预测结果')
我们预测的原理:
- 所有数据都可以表示为 $(x,y)$ 的形式,其中 $x$ 表示天数,$y$ 表示当天的确诊人数
- 我们尽可能地用一个二/三次函数 $y=f(x)$ 来拟合这些数据,用 `numpy.polyfit` 得到这样一个函数 $f(x)$。这个过程称为训练
- 进行预测时,我们把后面的天数代入 $f(x)$ 函数中,得到结果