import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Location of the raw airline customer data set.
air_data_path = 'air_data.csv'
# Load the raw records, decoding the file as UTF-8.
air_data = pd.read_csv(air_data_path, encoding='utf-8')

# Per-column summary statistics, transposed so each source column is a row.
# `percentiles` selects which quantile rows describe() reports; an empty
# list is passed here.
_summary = air_data.describe(percentiles=[], include='all').T
# describe() reports the non-null count only, so the null count is derived
# from the total number of rows.
_summary['null'] = len(air_data) - _summary['count']
# Keep the null count, maximum and minimum, and relabel the headers.
explore = _summary[['null', 'max', 'min']].rename(
    columns={'null': u'空值数', 'max': u'最大值', 'min': u'最小值'}
)
# Bare expression: displays the table when run as a notebook cell.
explore
# Import retained from the original cell in case later code relies on it;
# the parsing below no longer needs it.
from datetime import datetime

# Parse the enrolment dates in one vectorised call instead of calling
# datetime.strptime once per row. The explicit format keeps the same
# 'YYYY/MM/DD' expectation as before.
ffp = pd.to_datetime(air_data['FFP_DATE'], format='%Y/%m/%d')
# Calendar year of enrolment for every member.
ffp_year = ffp.dt.year

# Histogram of the number of members who enrolled in each year.
fig = plt.figure(figsize=(8, 5))  # canvas size
plt.hist(ffp_year, bins='auto', color='#0504aa')
plt.xlabel('年份')
plt.ylabel('入会人数')
plt.title('各年份会员入会人数')
# Count members per gender. The tally is computed once with
# Series.value_counts(); the top-level pd.value_counts() helper used
# previously is deprecated in recent pandas releases.
gender_counts = air_data['GENDER'].value_counts()
male = gender_counts['男']
female = gender_counts['女']

# Pie chart of the gender split.
fig = plt.figure(figsize=(7, 4))
plt.pie(
    [male, female],
    labels=['男', '女'],
    colors=['lightskyblue', 'lightcoral'],
    autopct='%1.1f%%',  # format of the percentage printed inside each wedge
)
plt.title('会员性别比例')
# Count members per membership tier. The tally is computed once with
# Series.value_counts() rather than three separate calls to the deprecated
# pd.value_counts() helper.
tier_counts = air_data['FFP_TIER'].value_counts()
lv_four = tier_counts[4]
lv_five = tier_counts[5]
lv_six = tier_counts[6]

# Bar chart of the number of members in tiers 4, 5 and 6.
fig = plt.figure(figsize=(8, 5))
plt.bar(
    range(3),
    height=[lv_four, lv_five, lv_six],
    width=0.4,
    alpha=0.8,
    color='skyblue',
)
plt.xticks(range(3), ['4', '5', '6'])
plt.xlabel('会员等级')
plt.ylabel('会员人数')
plt.title('会员各级别人数')
# Member ages with missing values removed, stored as 64-bit integers.
age = air_data['AGE'].dropna().astype('int64')

# Box plot of the member age distribution.
fig = plt.figure(figsize=(5, 10))
plt.boxplot(
    age,
    patch_artist=True,
    labels=['会员年龄'],                    # x-axis tick label
    boxprops=dict(facecolor='lightblue'),  # fill colour of the box
)
plt.title('会员年龄分布箱线图')
# Draw the grid lines that belong to the y axis.
plt.grid(axis='y')
# Flight-activity columns.
lte = air_data['LAST_TO_END']
fc = air_data['FLIGHT_COUNT']
sks = air_data['SEG_KM_SUM']

# One box plot per column: (values, figure size, x tick label, title).
_box_specs = [
    (lte, (5, 8), '时长', '会员最后乘机至结束时长分布箱线图'),
    (fc, (5, 8), '飞行次数', '会员飞行次数分布箱线图'),
    (sks, (5, 10), '总飞行公里数', '客户总飞行公里数箱线图'),
]
for _values, _size, _tick, _title in _box_specs:
    # A new figure per plot; `fig` ends up bound to the last one created.
    fig = plt.figure(figsize=_size)
    plt.boxplot(
        _values,
        patch_artist=True,
        labels=[_tick],                       # x-axis tick label
        boxprops={'facecolor': 'lightblue'},  # fill colour of the box
    )
    plt.title(_title)
    # Draw the grid lines that belong to the y axis.
    plt.grid(axis='y')
# Number of point redemptions per member, shown as a 5-bin histogram.
ec = air_data['EXCHANGE_COUNT']
fig = plt.figure(figsize=(8, 5))  # canvas size
plt.hist(ec, bins=5, color='#0504aa')
plt.title('会员兑换积分次数分布直方图')
plt.xlabel('兑换次数')
plt.ylabel('会员人数')

# Total accumulated points per member, shown as a box plot.
ps = air_data['Points_Sum']
fig = plt.figure(figsize=(5, 8))
plt.boxplot(
    ps,
    patch_artist=True,
    labels=['总累计积分'],                    # x-axis tick label
    boxprops=dict(facecolor='lightblue'),  # fill colour of the box
)
plt.title('客户总累计积分箱线图')
# Draw the grid lines that belong to the y axis.
plt.grid(axis='y')
# Select the attributes to correlate. An explicit copy is taken so that the
# column assignments below write to an independent frame instead of a
# selection of `air_data`, which avoids pandas' SettingWithCopyWarning.
data_corr = air_data[['FFP_TIER', 'FLIGHT_COUNT', 'LAST_TO_END',
                      'SEG_KM_SUM', 'EXCHANGE_COUNT', 'Points_Sum']].copy()

# Missing ages are replaced with 0 before the integer cast.
# NOTE(review): the zero placeholders take part in the correlation below;
# confirm this is intended rather than leaving the values missing.
age1 = air_data['AGE'].fillna(0)
data_corr['AGE'] = age1.astype('int64')
# Enrolment year computed earlier in the script.
data_corr['ffp_year'] = ffp_year

# Pearson correlation matrix of the selected attributes.
dt_corr = data_corr.corr(method='pearson')
print('相关性矩阵为:\n', dt_corr)

# Annotated heat map of the correlation matrix.
plt.subplots(figsize=(10, 10))
sns.heatmap(dt_corr, annot=True, vmax=1, square=True, cmap='Blues')
# Path the cleaned data set is written to.
cleanedfile = './data_cleaned.csv'

# Re-read the raw data for the cleaning step.
airline_data = pd.read_csv(air_data_path, encoding='utf-8')
print('原始数据的形状为:', airline_data.shape)

# Drop records whose fare is missing in either of the two yearly columns.
airline_notnull = airline_data.dropna(subset=['SUM_YR_1', 'SUM_YR_2'])
print('删除缺失记录后数据的形状为:', airline_notnull.shape)

# Row filters. A record is kept when at least one yearly fare is non-zero
# AND it has a positive total distance with a non-zero average discount
# AND the age is not above 100.
index1 = airline_notnull['SUM_YR_1'].ne(0)
index2 = airline_notnull['SUM_YR_2'].ne(0)
index3 = (airline_notnull['SEG_KM_SUM'].gt(0)
          & airline_notnull['avg_discount'].ne(0))
index4 = airline_notnull['AGE'].gt(100)  # ages above 100 are discarded
_keep = (index1 | index2) & index3 & ~index4
airline = airline_notnull[_keep]
print('数据清洗后数据的形状为:', airline.shape)

# Persist the cleaned records (to_csv defaults are used unchanged).
airline.to_csv(cleanedfile)
# Load the cleaned data set written by the cleaning step.
cleanedfile = './data_cleaned.csv'
airline = pd.read_csv(cleanedfile, encoding='utf-8')

# Columns needed to build the L/R/F/M/C features.
airline_selection = airline[['FFP_DATE', 'LOAD_TIME', 'LAST_TO_END',
                             'FLIGHT_COUNT', 'SEG_KM_SUM', 'avg_discount']]
print('筛选的属性前5行为:\n', airline_selection.head())

# Feature L: time between enrolment (FFP_DATE) and LOAD_TIME, expressed in
# 30-day units. The whole-day count is read straight from the timedelta
# via .dt.days instead of formatting it as text and parsing the number
# back out.
L = (pd.to_datetime(airline_selection['LOAD_TIME'])
     - pd.to_datetime(airline_selection['FFP_DATE']))
L = L.dt.days / 30

# Combine L with the remaining four columns and name them L, R, F, M, C.
airline_features = pd.concat([L, airline_selection.iloc[:, 2:]], axis=1)
airline_features.columns = ['L', 'R', 'F', 'M', 'C']
print('构建的LRFMC属性前5行为:\n', airline_features.head())
# Standardise the five LRFMC features with scikit-learn's StandardScaler.
from sklearn.preprocessing import StandardScaler

_scaler = StandardScaler()
data = _scaler.fit_transform(airline_features)
# Saved positionally, so the array is stored under the key 'arr_0'.
np.savez('./airline_scale.npz', data)
print('标准化后LRFMC五个属性为:\n', data[:5, :])
from sklearn.cluster import KMeans  # k-means clustering

# Load the standardised features. The archive is opened as a context
# manager so the underlying file handle is closed after the array is read.
with np.load('./airline_scale.npz') as _npz:
    airline_scale = _npz['arr_0']

k = 5  # number of clusters

# Build the model with a fixed random seed of 123.
# `n_jobs` is no longer passed: the parameter was removed from KMeans in
# scikit-learn 1.0 and supplying it raises TypeError.
# `n_init` is pinned to 10, the long-standing default, so the result does
# not change with the installed version's default.
kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=123)
fit_kmeans = kmeans_model.fit(airline_scale)  # train the model

# Inspect the clustering result.
kmeans_cc = kmeans_model.cluster_centers_  # cluster centres
print('各类聚类中心为:\n', kmeans_cc)
kmeans_labels = kmeans_model.labels_  # cluster label of every sample
print('各样本的类别标签为:\n', kmeans_labels)
r1 = pd.Series(kmeans_model.labels_).value_counts()  # samples per cluster
print('最终每个类别的数目为:\n', r1)
# Put the cluster centres in a data frame, one row per cluster.
cluster_center = pd.DataFrame(
    kmeans_model.cluster_centers_,
    columns=['ZL', 'ZR', 'ZF', 'ZM', 'ZC'],
)
# Row i of cluster_centers_ is the centre of the cluster labelled i, so the
# default 0..k-1 index is already the correct cluster label. The previous
# index was built from the order in which labels first appear in the
# samples, which does not match the row order and mislabelled the centres.
cluster_center.index = pd.RangeIndex(len(cluster_center))
print(cluster_center)
# Radar chart of the customer clusters.
labels = ['ZL', 'ZR', 'ZF', 'ZM', 'ZC']
lstype = ['-', '--', (0, (3, 5, 1, 5, 1, 5)), ':', '-.']

# A radar polygon must be closed, so the first column (ZL) is appended
# again at the end before converting to an ndarray.
cluster_center = pd.concat([cluster_center, cluster_center[['ZL']]], axis=1)
centers = np.array(cluster_center.iloc[:, 0:])

# Legend names, one per row of `centers`. Rows are numbered by position,
# which is the order the centres were placed in `cluster_center`.
legen = [' customers' + str(i + 1) for i in range(len(centers))]

# Split the circle into one spoke per feature, then repeat the first angle
# so each plotted polygon closes.
n = len(labels)
angle = np.linspace(0, 2 * np.pi, n, endpoint=False)
angle = np.concatenate((angle, [angle[0]]))

# Draw on polar axes.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, polar=True)

# One line per cluster. Line styles are reused cyclically so more clusters
# than styles cannot cause an IndexError.
for i, (row, name) in enumerate(zip(centers, legen)):
    ax.plot(angle, row, linestyle=lstype[i % len(lstype)],
            linewidth=2, label=name)

# Feature labels on the spokes. The duplicated closing angle is dropped so
# the number of tick positions equals the number of labels; recent
# Matplotlib versions raise ValueError when the two counts differ.
ax.set_thetagrids(angle[:-1] * 180 / np.pi, labels)
plt.title('Customer Profile Analysis')
plt.legend()