# Optimization
# 1. Suppress warning messages
import warnings
warnings.filterwarnings('ignore')
# 2. Configure matplotlib to render Chinese (CJK) characters in figures; SimHei is a common Windows font (only needed if figure text contains Chinese)
from matplotlib import pyplot as plt
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']
# Load the dataset
from pandas import read_csv
filename = 'sonar.all-data.csv'
dataset = read_csv(filename, header=None)
print(dataset)
from pandas import set_option
from matplotlib import pyplot
# Dataset dimensions
print('1. Dataset dimensions (rows, columns)')
print(dataset.shape)
# Data type of each column
# set_option('display.max_rows', 500)
print('2. Data types')
print(dataset.dtypes)
# Peek at the first 20 rows
# set_option('display.width', 100)
print('3. First 20 rows')
print(dataset.head(20))
# Descriptive statistics
# set_option('display.precision', 3)
print('4. Summary statistics (per column: count, mean, std, min, 25%, 50% median, 75%, max)')
print(dataset.describe())
# Class distribution
print(dataset.groupby(60).size())
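# For the standard UCI sonar data this shows two classes of similar size
# (about 111 'M' mines and 97 'R' rocks out of 208 rows), so plain accuracy is a reasonable metric.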
# Visualize relationships in the data
# Histograms
dataset.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
pyplot.suptitle('Attribute histograms')
pyplot.show()
# Density plots
dataset.plot(kind='density', subplots=True, layout=(8, 8), sharex=False, legend=False, fontsize=1)
pyplot.suptitle('Attribute density plots')
pyplot.show()
# Correlation matrix plot
fig = pyplot.figure()
fig.suptitle('Correlation matrix')
ax = fig.add_subplot(111)
cax = ax.matshow(dataset.corr(numeric_only=True), vmin=-1, vmax=1, interpolation='none')  # numeric_only excludes the class label column (pandas >= 1.5)
fig.colorbar(cax)
pyplot.show()
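# Reading the plot: the 60 attributes are energy levels in ordered frequency bands,
# so neighbouring attributes tend to correlate, visible as structure near the diagonal.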
from sklearn.model_selection import train_test_split
# Split out a validation (hold-out) set
# dataset is a pandas DataFrame; .values converts it to a NumPy ndarray
array = dataset.values
X = array[:, 0:60].astype(float)
Y = array[:, 60]
# test_size below: hold out 25% of the data for validation
validation_size = 0.25
seed = 7  # fix the random seed so results do not vary from run to run with a lucky or unlucky split
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size, random_state=seed)
# train_test_split shuffles the rows (seeded by random_state) before splitting, so the 75% training sample is not drawn in the original row order
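# Optional sanity check: with 208 rows, a 25% holdout leaves 156 training and 52 validation samples
print('train:', X_train.shape, 'validation:', X_validation.shape)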
# Imports
# Import the candidate algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Cross-validation
from sklearn.model_selection import KFold
# Scoring
from sklearn.model_selection import cross_val_score
# Evaluation metric
scoring = 'accuracy'
# Number of cross-validation folds
num_folds = 10
# Random seed for the evaluation baseline
seed = 7
# Build a dictionary of models
# Evaluate the algorithms on the raw (unscaled) data
models = {}
models['LR'] = LogisticRegression()  # logistic regression
models['LDA'] = LinearDiscriminantAnalysis()  # linear discriminant analysis
models['KNN'] = KNeighborsClassifier()  # k-nearest neighbors
models['CART'] = DecisionTreeClassifier()  # classification and regression tree
models['NB'] = GaussianNB()  # Gaussian naive Bayes
models['SVM'] = SVC()  # support vector machine
# Collect each model's fold scores in a list
results = []
for key in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)  # shuffle must be enabled when random_state is set (scikit-learn >= 0.22)
    cv_results = cross_val_score(models[key], X_train, Y_train, cv=kfold, scoring=scoring)  # estimator, training data, labels, CV scheme, metric
    results.append(cv_results)
    print('%s : %f (%f)' % (key, cv_results.mean(), cv_results.std()))  # mean and std of the fold accuracies, e.g. LR : 0.749167 (0.081048), i.e. about 74.9% accuracy
print('Detailed results:')
print(results)
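# Optional helper (a sketch, not used below): the same CV loop is repeated for the scaled
# pipelines and the ensembles further down, so it could be factored out like this:
def evaluate(estimators):
    """Cross-validate each estimator in a dict; return the per-fold accuracy arrays."""
    scores = []
    for name, est in estimators.items():
        cv = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
        folds = cross_val_score(est, X_train, Y_train, cv=cv, scoring=scoring)
        scores.append(folds)
        print('%s : %f (%f)' % (name, folds.mean(), folds.std()))
    return scores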
# Evaluate the algorithms on raw data: box plots of the fold accuracies
# In this run KNN has the highest accuracy and NB the lowest
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(models.keys())
pyplot.show()
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Evaluate the algorithms on standardized data (zero mean, unit variance)
pipelines = {}
pipelines['ScalerLR'] = Pipeline([('Scaler', StandardScaler()), ('LR', LogisticRegression())])
pipelines['ScalerLDA'] = Pipeline([('Scaler', StandardScaler()), ('LDA', LinearDiscriminantAnalysis())])
pipelines['ScalerKNN'] = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsClassifier())])
pipelines['ScalerCART'] = Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeClassifier())])
pipelines['ScalerNB'] = Pipeline([('Scaler', StandardScaler()), ('NB', GaussianNB())])
pipelines['ScalerSVM'] = Pipeline([('Scaler', StandardScaler()), ('SVM', SVC())])
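# Standardization mainly helps distance- and margin-based models (KNN, SVM) and penalized
# linear models; tree-based models such as CART are insensitive to monotonic feature scaling.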
results = []
for key in pipelines:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(pipelines[key], X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    print('%s : %f (%f)' % (key, cv_results.mean(), cv_results.std()))
print('Detailed results:')
print(results)
# Evaluate the algorithms on standardized data: box plots
# SVM now does best, followed by KNN
fig = pyplot.figure()
fig.suptitle('Scaled Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(pipelines.keys())
pyplot.show()
from sklearn.model_selection import GridSearchCV
# The models above ran with default parameters; now tune hyperparameters. For KNN: does the number of neighbors change the result?
# Tune KNN
scaler = StandardScaler().fit(X_train)  # fit the scaler on the training data
rescaledX = scaler.transform(X_train)  # standardize the training data
param_grid = {'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}  # candidate parameter values
model = KNeighborsClassifier()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)  # cross-validation scheme
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)  # grid search over the candidates
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))  # in this run a single neighbor (n_neighbors=1) scores best
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))
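# Optional: plot mean CV accuracy against the number of neighbors to see the trend
# (uses grid_result from the KNN search above).
neighbor_counts = [p['n_neighbors'] for p in grid_result.cv_results_['params']]
pyplot.plot(neighbor_counts, grid_result.cv_results_['mean_test_score'])
pyplot.xlabel('n_neighbors')
pyplot.ylabel('mean CV accuracy')
pyplot.title('KNN grid search')
pyplot.show()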
# Tune SVM
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train).astype(float)
param_grid = {}
param_grid['C'] = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0]
param_grid['kernel'] = ['linear', 'poly', 'rbf', 'sigmoid']
model = SVC()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))
# First run the ensembles with default parameters to gauge their potential
# Import four ensemble models
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
# Ensemble methods
ensembles = {}
ensembles['ScaledAB'] = Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostClassifier())])
ensembles['ScaledGBM'] = Pipeline([('Scaler', StandardScaler()), ('GBM', GradientBoostingClassifier())])
ensembles['ScaledRF'] = Pipeline([('Scaler', StandardScaler()), ('RFR', RandomForestClassifier())])
ensembles['ScaledET'] = Pipeline([('Scaler', StandardScaler()), ('ETR', ExtraTreesClassifier())])
results = []
for key in ensembles:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_result = cross_val_score(ensembles[key], X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))
# The ensembles do better overall than the single models above.
# If ~70% accuracy is acceptable and 80% is not required, though, these computationally heavier methods are unnecessary; weigh the optimization gain against the time invested.
# Ensemble methods: box plots. GBM looks best here, with few extreme values
fig = pyplot.figure()
fig.suptitle('Ensemble Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(ensembles.keys())
pyplot.show()
# Tune the GBM ensemble
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]}
model = GradientBoostingClassifier()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
# Tune the ExtraTrees (ET) ensemble
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]}
model = ExtraTreesClassifier()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
# Final evaluation metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# Finalize the model
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
model = SVC(C=1.5, kernel='rbf')
model.fit(X=rescaledX, y=Y_train)
# Evaluate on the validation set: accuracy, confusion matrix, and classification report (precision, recall, F1-score, support)
rescaled_validationX = scaler.transform(X_validation)
predictions = model.predict(rescaled_validationX)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
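# Optional follow-up: persist the fitted scaler and model for later reuse
# ('sonar_svc.joblib' is just an example file name).
from joblib import dump
dump({'scaler': scaler, 'model': model}, 'sonar_svc.joblib')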