from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib.colors import ListedColormap
import pandas as pd
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import plotly_express as px
# Load the survey data and keep only the three predictor columns plus the target column.
data = pd.read_csv("data.csv")
data = data[
    [
        'Gender',
        'Age',
        'Employment status',
        'Attitudes towards the LIVE that sells products ',
    ]
]
data.shape  # bare expression: only displays a value in an interactive session
feature = data[['Gender', 'Age', 'Employment status']].values
label = data['Attitudes towards the LIVE that sells products '].values

# KNN prediction with scikit-learn's default classifier settings.
# NOTE(review): test_size=0.9 holds out 90% of the rows for testing, so the
# model is fitted on only 10% of the data -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(feature, label, test_size=0.9)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
predict_y = knn.predict(X_test)
print("KNN 准确率", accuracy_score(y_test, predict_y))

# Original note: "pick 9 neighbours". The loop below actually evaluates every
# odd k from 1 to 13 and prints the accuracy for each.
# NOTE(review): the two slices below are not used anywhere in the visible code.
X_train_new = X_train[:18]
X_train_validation = X_train[18:]
for k in range(1, 15, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    predict_y = knn.predict(X_test)
    print("K为%s的准确率" % k, accuracy_score(y_test, predict_y))

# Feature matrix and label vector of the raw data, used for plotting.
returnMat = np.array(data[['Gender', 'Age', 'Employment status']])
classLabelVector = data['Attitudes towards the LIVE that sells products ']
def showdatas(returnMat, classLabelVector):
    """Draw pairwise scatter plots of the three feature columns, coloured by label.

    Args:
        returnMat: 2-D array indexed as ``returnMat[:, j]``; columns 0, 1 and 2
            are plotted as gender, age and employment respectively.
        classLabelVector: iterable of class labels, expected to hold one label
            per row of ``returnMat``. Label 1 is drawn blue ("like"), 2 yellow
            ("indifferent") and 3 red ("dont like"). Any other value is drawn
            gray so the colour list always has one entry per label.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly

    # 2x2 canvas; only three panels are filled, the bottom-right one stays empty.
    fig, axs = plt.subplots(nrows=2, ncols=2, sharex=False, sharey=False, figsize=(13, 8))

    # One colour per label. A dict lookup with a fallback (instead of a chain of
    # independent ``if`` tests) guarantees no label is silently skipped, which
    # would leave the colour list shorter than the data.
    color_by_label = {1: 'blue', 2: 'yellow', 3: 'red'}
    LabelsColors = [color_by_label.get(label, 'gray') for label in classLabelVector]

    # Proxy artists for the legend, shared by all three panels.
    like = mlines.Line2D([], [], color='blue', marker='.',
                         markersize=6, label='like')
    indifferent = mlines.Line2D([], [], color='yellow', marker='.',
                                markersize=6, label='indifferent')
    dislike = mlines.Line2D([], [], color='red', marker='.',
                            markersize=6, label='dont like')
    legend_handles = [like, indifferent, dislike]

    # (axes, x column, y column, x name, y name) for each populated panel.
    panels = (
        (axs[0][0], 0, 1, 'gender', 'age'),
        (axs[0][1], 0, 2, 'gender', 'employment'),
        (axs[1][0], 1, 2, 'age', 'employment'),
    )
    for ax, x_col, y_col, x_name, y_name in panels:
        # Scatter plot: marker size 15, transparency 0.5.
        ax.scatter(x=returnMat[:, x_col], y=returnMat[:, y_col],
                   color=LabelsColors, s=15, alpha=0.5)
        # Title and axis labels.
        title_text = ax.set_title(f'{x_name},{y_name}')
        xlabel_text = ax.set_xlabel(x_name)
        ylabel_text = ax.set_ylabel(y_name)
        plt.setp(title_text, size=9, weight='bold', color='red')
        plt.setp(xlabel_text, size=7, weight='bold', color='black')
        plt.setp(ylabel_text, size=7, weight='bold', color='black')
        ax.legend(handles=legend_handles)

    # Display the figure.
    plt.show()
# Plot the raw (not yet normalised) features, coloured by class label.
showdatas(returnMat, classLabelVector)
# Feature normalization
def minmax(dataSet):
    """Rescale values to the [0, 1] range with min-max normalisation.

    Computes ``(dataSet - min) / (max - min)`` using ``dataSet.min()`` and
    ``dataSet.max()``. For a DataFrame these are per-column, so every column
    is scaled independently.

    Args:
        dataSet: object exposing ``min()`` and ``max()`` and supporting
            arithmetic, e.g. a pandas DataFrame or Series.

    Returns:
        The rescaled data, in the same container type as the input. A column
        (or whole input) whose values are all equal has zero range and is
        mapped to 0.0 instead of NaN.
    """
    minD = dataSet.min()
    span = dataSet.max() - minD
    # A zero range would divide 0 by 0 and produce NaN; dividing by 1 instead
    # leaves the already-zero numerator untouched.
    if isinstance(span, pd.Series):
        span = span.where(span != 0, 1)
    elif np.ndim(span) == 0 and span == 0:
        span = 1
    return (dataSet - minD) / span
# Min-max scale the first three columns and re-attach the fourth (label)
# column unchanged.
dataNew = pd.concat(
    [minmax(data.iloc[:, :3]), data.iloc[:, 3]],
    axis=1,
)
dataNew  # bare expression: only displays a value in an interactive session

# Feature matrix and label vector of the normalised data, used for plotting.
returnMat1 = np.array(dataNew[['Gender', 'Age', 'Employment status']])
classLabelVector1 = dataNew['Attitudes towards the LIVE that sells products ']
classLabelVector1  # bare expression: only displays a value in an interactive session
showdatas(returnMat1, classLabelVector1)

# Plain arrays of the normalised features and their labels.
feature1 = dataNew[['Gender', 'Age', 'Employment status']].values
label1 = dataNew['Attitudes towards the LIVE that sells products '].values
def randSplit(dataSet, rate=0.9, *, shuffle=False, random_state=None):
    """Split a DataFrame row-wise into a training part and a test part.

    Args:
        dataSet: pandas DataFrame to split.
        rate: fraction of rows that go to the training part; the training
            part receives the first ``int(n_rows * rate)`` rows.
        shuffle: when True, rows are shuffled before splitting. The default
            (False) keeps the original order, i.e. the split is purely
            positional.
        random_state: seed forwarded to ``DataFrame.sample`` when
            ``shuffle`` is True; ignored otherwise.

    Returns:
        A ``(train, test)`` tuple of DataFrames. Both are independent copies,
        so adding columns to them neither modifies ``dataSet`` nor triggers
        pandas' chained-assignment warning. ``train`` keeps its original
        index labels; ``test`` is re-indexed from 0.
    """
    if shuffle:
        dataSet = dataSet.sample(frac=1, random_state=random_state)
    n = dataSet.shape[0]  # number of rows
    m = int(n * rate)  # number of rows in the training part
    train = dataSet.iloc[:m, :].copy()  # leading rows, all columns
    test = dataSet.iloc[m:, :].copy()  # remaining rows, all columns
    test.index = range(test.shape[0])  # reset the test index to 0..len-1
    return train, test
# Call the split function on the normalised data (default rate: first 90% of rows for training).
train,test = randSplit(dataNew)
def dataClassify(train, test, k):
    """Classify every test row by majority vote among its k nearest training rows.

    Both frames are expected to share the same layout: all columns but the
    last column of ``train`` are features, and column ``train.shape[1] - 1``
    holds the class label in both frames.

    Args:
        train: DataFrame of training rows (features followed by the label).
        test: DataFrame of rows to classify. It is modified in place: a
            ``'predict'`` column holding the predicted labels is added (or
            overwritten).
        k: number of nearest neighbours that vote.

    Returns:
        The same ``test`` object, now carrying the ``'predict'`` column. The
        accuracy against the label column is printed as a side effect.
    """
    n = train.shape[1] - 1  # number of feature columns (everything but the label)
    train_features = train.iloc[:, :n]
    train_labels = train.iloc[:, n]

    result = []  # predicted label for each test row, in row order
    for i in range(test.shape[0]):
        # Euclidean distance from every training row to test row i.
        dist = list((((train_features - test.iloc[i, :n]) ** 2).sum(1)) ** 0.5)
        # Pair each distance with the label of its training row.
        dist_l = pd.DataFrame({'dist': dist, 'labels': train_labels})
        # Sort by distance and keep the k closest rows.
        nearest = dist_l.sort_values(by='dist')[:k]
        # Count votes per label; value_counts puts the most frequent label first.
        votes = nearest.loc[:, 'labels'].value_counts()
        result.append(votes.index[0])

    # Assign the plain list so values are placed by position. Wrapping it in a
    # default-indexed Series would align on index labels and yield NaN or
    # misplaced predictions when ``test`` is not indexed 0..m-1.
    test['predict'] = result
    # Compare the predictions with the true labels stored in column n.
    acc = (test['predict'] == test.iloc[:, n]).mean()
    print(f"模型预测准确率{acc}")
    return test
# Classify the held-out rows with k=9 neighbours; the function prints the accuracy.
dataClassify(train,test,9)