import numpy as np
import pandas as pd
# Load the data; header=None means the file has no header row, so the
# column names are supplied explicitly.
df = pd.read_csv(
    '/datasets/data1/effect_tb.csv',
    header=None,
    names=["date", "uid", "click", "type"],
    encoding='gb18030',
)
!ls /datasets/data1
effect_tb.csv
# Print a summary of the frame: index, columns, non-null counts and dtypes
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 date 0 non-null object
1 uid 0 non-null object
2 click 0 non-null object
3 type 0 non-null object
dtypes: object(4)
memory usage: 0.0+ bytes
# Count fully duplicated rows
df.duplicated().sum()
# Count missing values per column
df.isnull().sum()
# Mean of `click` for experiment group 2 (type == 3)
df.loc[df["type"] == 3, "click"].mean()
# Mean of `click` for experiment group 1 (type == 2)
df.loc[df["type"] == 2, "click"].mean()
# Mean of `click` for the control group (type == 1)
df.loc[df["type"] == 1, "click"].mean()
# Hypothesis test: control (type 1) vs. strategy 2 (type 3)
is_control = df["type"] == 1
is_strategy2 = df["type"] == 3
has_click = df["click"] == 1
# Group sizes (row counts per group)
n_old = len(df[is_control])  # control
n_new = len(df[is_strategy2])  # strategy 2
# Click counts (non-null uid values among rows with click == 1)
c_old = df.loc[is_control & has_click, "uid"].count()
c_new = df.loc[is_strategy2 & has_click, "uid"].count()
import statsmodels.stats.proportion as sp
# alternative="smaller": one-sided test that the first proportion (control)
# is smaller than the second (strategy 2)
z_score, p = sp.proportions_ztest(
    count=[c_old, c_new],
    nobs=[n_old, n_new],
    alternative="smaller",
)
print("检验统计量z:", z_score, ",p值:", p)
检验统计量z: -59.66600946268368 ,p值: 0.0
# Look up the standard-normal quantile for alpha = 0.05
from scipy.stats import norm
alpha = 0.05
z_alpha = norm.ppf(alpha)
z_alpha
# Strategy 1 test: control (type 1) vs. strategy 1 (type 2), using the same
# one-sided two-proportion z-test (alternative="smaller").
# Both conditions are combined into ONE boolean mask. Chaining
# df[mask_a][mask_b] applies a full-length mask to an already-filtered frame,
# which pandas has to reindex and reports with
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index."
is_strategy1 = df["type"] == 2
strategy1_clicks = int((is_strategy1 & (df["click"] == 1)).sum())  # rows with type 2 and click 1
strategy1_rows = int(is_strategy1.sum())  # all rows with type 2
z_score, p = sp.proportions_ztest([c_old, strategy1_clicks],
                                  [n_old, strategy1_rows], alternative="smaller")
print("检验统计量z:", z_score, ",p值:", p)
检验统计量z: -14.362726203811503 ,p值: 4.433468512724253e-47
D:\software\anaconda\lib\site-packages\ipykernel_launcher.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
# Contingency table for control vs. strategy 2: one row per group, holding
# the group size and its click count.
df_ka = pd.DataFrame(
    {'用户数': [n_old, n_new], '点击数': [c_old, c_new]},
    index=['对照', '策略2'],
)
# Not-clicked = group size minus clicks
df_ka['未点击'] = df_ka['用户数'].sub(df_ka['点击数'])
df_ka
# Only the clicked / not-clicked columns form the 2x2 table under test
df_ka2 = df_ka.loc[:, ["点击数", "未点击"]]
from scipy import stats
stats.chi2_contingency(observed=df_ka2)
# Chi-square test for control (type 1) vs. strategy 1 (type 2)
# Group size (row count)
n_new2 = len(df[df.type == 2])  # strategy 1
# Click count (non-null uid values among rows with click == 1)
c_new2 = df[(df.type == 2) & (df.click == 1)]["uid"].count()
df_ka3 = pd.DataFrame(columns=['用户数', '点击数'], index=['对照', '策略1'],
                      data=[[n_old, c_old], [n_new2, c_new2]])
# Not-clicked = group size minus clicks
df_ka3['未点击'] = df_ka3["用户数"] - df_ka3['点击数']
df_ka3
# Pass only the clicked / not-clicked columns. Including the 用户数 totals
# column would make the observed table 2x3 (totals counted as a category)
# and give a wrong statistic and degrees of freedom.
stats.chi2_contingency(observed=df_ka3[["点击数", "未点击"]])