import time
import pandas as pd
import numpy as np
from pyecharts.components import Table
from pyecharts.options import ComponentTitleOpts
from pyecharts.charts import *
import pyecharts.options as opts
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
# Load the liver-patient records from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='indian_liver_patient_dataset.csv')
# Notebook-style sanity checks (missing values per column, duplicated rows).
# They are bare expressions: nothing is stored, the result is only shown when
# the line ends a notebook cell.
df.isnull().sum()
df.duplicated(keep='first').sum()
# Count patients per (class, gender) pair; the count lands in the 'age' column.
df_t = df.groupby(['class', 'gender']).agg({'age': 'count'}).reset_index()
# Turn the numeric class codes into display labels. The replacement is limited
# to the 'class' column: a frame-wide replace would also rewrite any group
# whose *count* happens to be 1 or 2 into a label.
df_t['class'] = df_t['class'].replace({1: 'Ill', 2: 'No Ill'})

# Build the nested structure consumed by the sunburst chart:
#   [{name: <class label>, children: [{name: <gender>, value: <count>}, ...]}, ...]
# Nodes are looked up by class label, so the result does not depend on the
# rows of one class being adjacent.
data = []
nodes_by_class = {}
for _, row in df_t.iterrows():
    node = nodes_by_class.get(row['class'])
    if node is None:
        node = dict(name=row['class'], children=[])
        nodes_by_class[row['class']] = node
        data.append(node)
    # int() keeps the value a plain Python int for JSON serialisation.
    node['children'].append(dict(name=row['gender'], value=int(row['age'])))
# Class labels in first-seen order (kept for backward compatibility).
item = list(nodes_by_class)
# Per-level ring settings for the sunburst; the first entry is left empty,
# the next two describe the inner (class) ring and the outer (gender) ring.
_sunburst_levels = [
    {},
    {
        "r0": "0%",
        "r": "55%",
        "itemStyle": {"borderColor": "white", "borderWidth": 2},
    },
    {
        "r0": "50%",
        "r": "70%",
        "label": {"rotate": "tangential", "padding": 3, "silent": False},
        "itemStyle": {"borderColor": "white", "borderWidth": 3},
    },
]

# Label shown on every segment: "<name>:<value>people".
_sunburst_label = opts.LabelOpts(
    formatter='{b}:{c}people',
    font_family='Adobe',
    font_weight='lighter',
)

# Chart title, anchored to the top-right corner.
_sunburst_title = opts.TitleOpts(
    title="Number of patients with liver disease by gender",
    title_textstyle_opts=opts.TextStyleOpts(
        font_family='Adobe',
        font_weight='lighter',
        font_size=22,
    ),
    pos_right='0%',
    pos_top='5%',
)

# Sunburst of patient counts: class on the inner ring, gender on the outer one.
sunburst = Sunburst(init_opts=opts.InitOpts(theme='dark'))
sunburst.add(
    "",
    data_pair=data,
    radius=[0, "75%"],
    center=['75%', '60%'],
    levels=_sunburst_levels,
)
sunburst.set_series_opts(label_opts=_sunburst_label)
sunburst.set_global_opts(
    title_opts=_sunburst_title,
    legend_opts=opts.LegendOpts(is_show=True),
)
# Rich-text style table for chart labels: each key names a style that a label
# formatter can reference with the "{key|text}" syntax.
rich = dict(
    a=dict(color="#999", fontSize=14, lineHeight=22, align="center"),
    abg=dict(
        backgroundColor="#e3e3e3",
        width="100%",
        align="right",
        height=28,
        borderRadius=[4, 4, 0, 0],
    ),
    hr=dict(borderColor="#aaa", width="100%", borderWidth=0.5, height=0),
    b=dict(fontSize=16, lineHeight=33),
    per=dict(
        color="#eee",
        backgroundColor="#334455",
        padding=[2, 4],
        borderRadius=2,
    ),
    c=dict(align="right", fontSize=12),
)
# Bucket ages into six labelled life stages (pd.cut bins are right-inclusive
# by default: (0, 4], (4, 11], ...). The result is a categorical column.
df['age_cut'] = pd.cut(
    df.age,
    [0, 4, 11, 18, 35, 59, 1000],
    labels=["婴幼儿", "儿童", "少年", "青年", "中年", "老年"],
)
# Keep only the rows with class == 1.
df_t = df[df['class'] == 1]
# Count patients per age bucket. observed=False is pinned explicitly so that
# buckets with no patients stay in the result (as 0) regardless of the pandas
# version's default for categorical groupers.
df_t = (
    df_t.groupby(['class', 'age_cut'], observed=False)
    .agg({'age': 'count'})
    .unstack()
)
# Drop the outer ('age') column level, leaving one column per age bucket.
df_t.columns = df_t.columns.droplevel()
df_t = df_t.fillna(0)
# Bucket labels and their counts, in bucket order, for the pie chart.
xdata = df_t.columns.tolist()
ydata = df_t.iloc[0, :].tolist()
# Label for every slice, styled through the entries of `rich`.
_pie_label = opts.LabelOpts(
    is_show=True,
    formatter="{a|{b}}{abg|}\n {hr|}\n {c|{c}人\n 占比{d}%} ",
    rich=rich,
)

# Chart title.
_pie_title = opts.TitleOpts(
    title="Age composition of the affected population",
    title_textstyle_opts=opts.TextStyleOpts(
        font_family='Adobe',
        font_weight='lighter',
        font_size=22,
    ),
    pos_top='5%',
)

# Ring-shaped pie of the age-bucket counts, placed in the left part of the canvas.
pie = Pie(init_opts=opts.InitOpts(theme='dark', height='400px'))
pie.add(
    '',
    [[bucket, count] for bucket, count in zip(xdata, ydata)],
    radius=['50%', '65%'],
    center=['25%', '60%'],
)
pie.set_series_opts(
    label_opts=_pie_label,
    itemstyle_opts=opts.ItemStyleOpts(border_color='#000', border_width=2),
)
pie.set_global_opts(
    legend_opts=opts.LegendOpts(is_show=False),
    title_opts=_pie_title,
)
# Combine the pie and the sunburst into a single dark-themed canvas and
# render it inline in the notebook.
grid = (
    Grid(init_opts=opts.InitOpts(theme='dark', width='980px'))
    .add(pie, grid_opts=opts.GridOpts(pos_left='10%'))
    .add(sunburst, grid_opts=opts.GridOpts(pos_right='10%'))
)
grid.render_notebook()
# Per-class mean of every numeric column. numeric_only=True skips the
# non-numeric columns ('gender', 'age_cut'), which would otherwise raise a
# TypeError on pandas >= 2.
df_t = df.groupby('class').mean(numeric_only=True).reset_index()
# Label the classes. The replacement is limited to the 'class' column so that
# a mean that happens to equal exactly 1 or 2 is never turned into a label.
df_t['class'] = df_t['class'].replace({1: '患病', 2: '未患病'})
# Two decimals everywhere, whole numbers for age.
df_t = df_t.round(2)
df_t['age'] = df_t['age'].round(0)

# Header row and body rows in the shape expected by Table.add.
headers = df_t.columns.tolist()
rows = df_t.values.tolist()

table = Table()
attributes = {"class": "fl-table", "style": "margin: 0 auto"}  # centre the table
table.add(headers, rows, attributes)
table.set_global_opts(
    title_opts=ComponentTitleOpts(
        title="患病与未患病人群的信息比较",
        subtitle="Tips:部分数据保留2位小数",
    )
)
table.render_notebook()
# Remove the helper 'age_cut' column (by name, not by position).
df = df.drop(columns='age_cut')
# Encode the gender strings as integers.
df['gender'] = LabelEncoder().fit_transform(df['gender'])
# Class balance (display-only in a notebook).
df['class'].value_counts()

X = df.drop(columns='class')  # feature columns
Y = df[['class']]             # target column, kept as a one-column frame

# Split BEFORE oversampling. Duplicating rows first would put copies of the
# same patient into both the training and the test set, leaking test data into
# training and inflating every accuracy score.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Oversample class 2 in the training part only: appending its rows twice means
# each one appears three times in total, as in the original balancing.
_minority = y_train['class'] == 2
X_train = pd.concat([X_train, X_train[_minority], X_train[_minority]])
y_train = pd.concat([y_train, y_train[_minority], y_train[_minority]])
# Collects one [model repr, fit seconds, accuracy] record per evaluated model.
r = []


def modle_pro(modle_):
    """Fit *modle_* on the training split, score it on the test split, log the result.

    Reads the module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``
    and appends ``[str(modle_), fit_seconds, accuracy]`` to the module-level
    list ``r``. Returns ``None``.

    Every model is scored with ``accuracy_score``, so continuous predictions
    (e.g. from a regressor) are converted to integer labels first.
    """
    # Flatten the targets so a one-column frame and a Series behave the same.
    y_fit = np.ravel(y_train)
    y_true = np.ravel(y_test)

    # Time the training step only.
    started = time.perf_counter()
    modle_.fit(X_train, y_fit)
    use_time = round(time.perf_counter() - started, 4)

    # Round to the NEAREST integer label. A plain astype(int) truncates, which
    # would turn a regression output of 1.9 into class 1 instead of class 2.
    pre = np.rint(np.ravel(modle_.predict(X_test))).astype(np.int64)

    # accuracy_score expects (y_true, y_pred).
    acc_score = round(accuracy_score(y_true, pre), 4)

    r.append([str(modle_), use_time, acc_score])
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Candidate models; each keeps its own module-level name.
gnb = GaussianNB()
bnb = BernoulliNB()
svm = SVC(kernel='linear')
knn = KNeighborsClassifier(n_neighbors=2)
lr = LinearRegression()
lgr = LogisticRegression()
dtc_e = DecisionTreeClassifier(criterion='entropy')
dtc_g = DecisionTreeClassifier(criterion='gini')
rfc = RandomForestClassifier(n_estimators=100)

# Train and score every candidate, in the same order as before, so the
# records collected by modle_pro keep their order.
for _candidate in (gnb, bnb, svm, knn, lr, lgr, dtc_e, dtc_g, rfc):
    modle_pro(_candidate)
import pandas as pd

# Turn the collected records into a frame; note that this rebinds `df`.
_result_columns = ['modle_name', 'use_time', 'acc_score']
df = pd.DataFrame(data=r, columns=_result_columns)
# Ranking by accuracy, best first (display-only: the sorted frame is not stored).
df.sort_values(by='acc_score', ascending=False)