# Optimization
# 1. Suppress warning messages
import warnings
warnings.filterwarnings('ignore')
# 2. Configure matplotlib to render Chinese (CJK) characters in figures; SimHei is a common Windows font (only needed if figure text contains Chinese)
from matplotlib import pyplot as plt
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']
# Load the dataset
from pandas import read_csv
filename = 'sonar.all-data.csv'
dataset = read_csv(filename, header=None)
print(dataset)
from pandas import set_option
from matplotlib import pyplot
# Dataset dimensions
print('1. Dataset dimensions (rows, columns)')
print(dataset.shape)
# Data type of each column
# set_option('display.max_rows', 500)
print('2. Data types')
print(dataset.dtypes)
# Peek at the first 20 rows
# set_option('display.width', 100)
print('3. First 20 rows')
print(dataset.head(20))
# Descriptive statistics
# set_option('display.precision', 3)
print('4. Summary statistics (per column: count, mean, std, min, 25%, 50% median, 75%, max)')
print(dataset.describe())
# Class distribution
print(dataset.groupby(60).size())
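# For the standard UCI sonar data this shows two classes of similar size
# (about 111 'M' mines and 97 'R' rocks out of 208 rows), so plain accuracy is a reasonable metric.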
# Visualize relationships in the data
# Histograms
dataset.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
pyplot.suptitle('Attribute histograms')
pyplot.show()
# Density plots
dataset.plot(kind='density', subplots=True, layout=(8, 8), sharex=False, legend=False, fontsize=1)
pyplot.suptitle('Attribute density plots')
pyplot.show()
# Correlation matrix plot
fig = pyplot.figure()
fig.suptitle('Correlation matrix')
ax = fig.add_subplot(111)
cax = ax.matshow(dataset.corr(numeric_only=True), vmin=-1, vmax=1, interpolation='none')  # numeric_only excludes the class label column (pandas >= 1.5)
fig.colorbar(cax)
pyplot.show()
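# Reading the plot: the 60 attributes are energy levels in ordered frequency bands,
# so neighbouring attributes tend to correlate, visible as structure near the diagonal.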
from sklearn.model_selection import train_test_split
# Split out a validation (hold-out) set
# dataset is a pandas DataFrame; .values converts it to a NumPy ndarray
array = dataset.values
X = array[:, 0:60].astype(float)
Y = array[:, 60]
# test_size below: hold out 25% of the data for validation
validation_size = 0.25
seed = 7  # fix the random seed so results do not vary from run to run with a lucky or unlucky split
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size, random_state=seed)
# train_test_split shuffles the rows (seeded by random_state) before splitting, so the 75% training sample is not drawn in the original row order
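# Optional sanity check: with 208 rows, a 25% holdout leaves 156 training and 52 validation samples
print('train:', X_train.shape, 'validation:', X_validation.shape)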
# Imports
# Import the candidate algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Cross-validation
from sklearn.model_selection import KFold
# Scoring
from sklearn.model_selection import cross_val_score
# Evaluation metric
scoring = 'accuracy'
# Number of cross-validation folds
num_folds = 10
# Random seed for the evaluation baseline
seed = 7
# Build a dictionary of models
# Evaluate the algorithms on the raw (unscaled) data
models = {}
models['LR'] = LogisticRegression()  # logistic regression
models['LDA'] = LinearDiscriminantAnalysis()  # linear discriminant analysis
models['KNN'] = KNeighborsClassifier()  # k-nearest neighbors
models['CART'] = DecisionTreeClassifier()  # classification and regression tree
models['NB'] = GaussianNB()  # Gaussian naive Bayes
models['SVM'] = SVC()  # support vector machine
# Collect each model's fold scores in a list
results = []
for key in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)  # shuffle must be enabled when random_state is set (scikit-learn >= 0.22)
    cv_results = cross_val_score(models[key], X_train, Y_train, cv=kfold, scoring=scoring)  # estimator, training data, labels, CV scheme, metric
    results.append(cv_results)
    print('%s : %f (%f)' % (key, cv_results.mean(), cv_results.std()))  # mean and std of the fold accuracies, e.g. LR : 0.749167 (0.081048), i.e. about 74.9% accuracy
print('Detailed results:')
print(results)
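# Optional helper (a sketch, not used below): the same CV loop is repeated for the scaled
# pipelines and the ensembles further down, so it could be factored out like this:
def evaluate(estimators):
    """Cross-validate each estimator in a dict; return the per-fold accuracy arrays."""
    scores = []
    for name, est in estimators.items():
        cv = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
        folds = cross_val_score(est, X_train, Y_train, cv=cv, scoring=scoring)
        scores.append(folds)
        print('%s : %f (%f)' % (name, folds.mean(), folds.std()))
    return scores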
# Evaluate the algorithms on raw data: box plots of the fold accuracies
# In this run KNN has the highest accuracy and NB the lowest
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(models.keys())
pyplot.show()
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Evaluate the algorithms on standardized data (zero mean, unit variance)
pipelines = {}
pipelines['ScalerLR'] = Pipeline([('Scaler', StandardScaler()), ('LR', LogisticRegression())])
pipelines['ScalerLDA'] = Pipeline([('Scaler', StandardScaler()), ('LDA', LinearDiscriminantAnalysis())])
pipelines['ScalerKNN'] = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsClassifier())])
pipelines['ScalerCART'] = Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeClassifier())])
pipelines['ScalerNB'] = Pipeline([('Scaler', StandardScaler()), ('NB', GaussianNB())])
pipelines['ScalerSVM'] = Pipeline([('Scaler', StandardScaler()), ('SVM', SVC())])
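# Standardization mainly helps distance- and margin-based models (KNN, SVM) and penalized
# linear models; tree-based models such as CART are insensitive to monotonic feature scaling.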
results = []
for key in pipelines:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(pipelines[key], X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    print('%s : %f (%f)' % (key, cv_results.mean(), cv_results.std()))
print('Detailed results:')
print(results)
# Evaluate the algorithms on standardized data: box plots
# SVM now does best, followed by KNN
fig = pyplot.figure()
fig.suptitle('Scaled Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(pipelines.keys())
pyplot.show()
from sklearn.model_selection import GridSearchCV
# The models above ran with default parameters; now tune hyperparameters. For KNN: does the number of neighbors change the result?
# Tune KNN
scaler = StandardScaler().fit(X_train)  # fit the scaler on the training data
rescaledX = scaler.transform(X_train)  # standardize the training data
param_grid = {'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}  # candidate parameter values
model = KNeighborsClassifier()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)  # cross-validation scheme
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)  # grid search over the candidates
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))  # in this run a single neighbor (n_neighbors=1) scores best
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))
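# Optional: plot mean CV accuracy against the number of neighbors to see the trend
# (uses grid_result from the KNN search above).
neighbor_counts = [p['n_neighbors'] for p in grid_result.cv_results_['params']]
pyplot.plot(neighbor_counts, grid_result.cv_results_['mean_test_score'])
pyplot.xlabel('n_neighbors')
pyplot.ylabel('mean CV accuracy')
pyplot.title('KNN grid search')
pyplot.show()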
# Tune SVM
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train).astype(float)
param_grid = {}
param_grid['C'] = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0]
param_grid['kernel'] = ['linear', 'poly', 'rbf', 'sigmoid']
model = SVC()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))
# First run the ensembles with default parameters to gauge their potential
# Import four ensemble models
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
# Ensemble methods
ensembles = {}
ensembles['ScaledAB'] = Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostClassifier())])
ensembles['ScaledGBM'] = Pipeline([('Scaler', StandardScaler()), ('GBM', GradientBoostingClassifier())])
ensembles['ScaledRF'] = Pipeline([('Scaler', StandardScaler()), ('RFR', RandomForestClassifier())])
ensembles['ScaledET'] = Pipeline([('Scaler', StandardScaler()), ('ETR', ExtraTreesClassifier())])
results = []
for key in ensembles:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_result = cross_val_score(ensembles[key], X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))
# The ensembles do better overall than the single models above.
# If ~70% accuracy is acceptable and 80% is not required, though, these computationally heavier methods are unnecessary; weigh the optimization gain against the time invested.
# Ensemble methods: box plots. GBM looks best here, with few extreme values
fig = pyplot.figure()
fig.suptitle('Ensemble Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(ensembles.keys())
pyplot.show()
# Tune the GBM ensemble
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]}
model = GradientBoostingClassifier()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
# Tune the ExtraTrees (ET) ensemble
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]}
model = ExtraTreesClassifier()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
# Final evaluation metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# Finalize the model
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
model = SVC(C=1.5, kernel='rbf')
model.fit(X=rescaledX, y=Y_train)
# Evaluate on the validation set: accuracy, confusion matrix, and classification report (precision, recall, F1-score, support)
rescaled_validationX = scaler.transform(X_validation)
predictions = model.predict(rescaled_validationX)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
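# Optional follow-up: persist the fitted scaler and model for later reuse
# ('sonar_svc.joblib' is just an example file name).
from joblib import dump
dump({'scaler': scaler, 'model': model}, 'sonar_svc.joblib')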