Assignment4_6305007

'ID 1 : 6305007 Chanokchon Timklay' 'ID 2 : 6305205 Noppanut Thongyard'

#1 Download the anscombes.csv file from the gist and show the raw table.
import pandas as pd

# The address is split across two literals only to keep the line short;
# the resulting string is unchanged.
url = (
    'https://gist.githubusercontent.com/ericbusboom/b2ac1d366c005cd2ed8c/'
    'raw/c92c66e43d144fa9c29dbd602d5af6988e8db533/anscombes.csv'
)
pd.read_csv(url)

#2 Per-dataset mean and sample variance of x and y.
import pandas as pd

url = (
    'https://gist.githubusercontent.com/ericbusboom/b2ac1d366c005cd2ed8c/'
    'raw/c92c66e43d144fa9c29dbd602d5af6988e8db533/anscombes.csv'
)

# The first CSV column is used as the row index, so `stat` keeps only the
# remaining columns. `stat` is reused by the cells that follow.
stat = pd.read_csv(url, index_col=0)

# Named aggregation builds the four output columns in a single pass and in
# the same order as before: x mean, y mean, x variance, y variance.
# The labels contain spaces, so they are passed through a dict.
summary = stat.groupby('dataset').agg(
    **{
        'x mean': ('x', 'mean'),
        'y mean': ('y', 'mean'),
        'x variance': ('x', 'var'),  # 'var' is the sample variance (ddof=1)
        'y variance': ('y', 'var'),
    }
)
summary

#2.1 Mean of x in dataset I.
# Select the x values of dataset I directly and take their mean instead of
# computing a full describe() table just to read one entry from it.
x_group_i = stat.loc[stat['dataset'] == 'I', 'x']
print('The mean of x in group I is', x_group_i.mean())

#2.2 Mean of x in dataset II.
# Select the x values of dataset II directly and take their mean instead of
# computing a full describe() table just to read one entry from it.
x_group_ii = stat.loc[stat['dataset'] == 'II', 'x']
print('The mean of x in group II is', x_group_ii.mean())

#2.3 Sample variance of y in dataset I.
# Series.var() uses ddof=1 by default, i.e. the sample variance. Calling it
# directly avoids the rounding introduced by squaring a standard deviation
# that was itself obtained from a square root.
y_group_i = stat.loc[stat['dataset'] == 'I', 'y']
print('The sample variance of y in group I is', y_group_i.var())

#2.4 Sample variance of y in dataset II.
# Series.var() uses ddof=1 by default, i.e. the sample variance. Calling it
# directly avoids the rounding introduced by squaring a standard deviation
# that was itself obtained from a square root.
y_group_ii = stat.loc[stat['dataset'] == 'II', 'y']
print('The sample variance of y in group II is', y_group_ii.var())

#3 Box plot of x, with one box per dataset.
# The y column is removed first so the frame holds only what is plotted.
without_y = stat.drop(columns='y')
without_y.boxplot(column='x', by='dataset')

#4 Box plot of y, with one box per dataset.
# The x column is removed first so the frame holds only what is plotted.
without_x = stat.drop(columns='x')
without_x.boxplot(column='y', by='dataset')

# Install plotnine for the interpreter that is running this code.
# A bare `pip install ...` line is a shell command, not Python: it only works
# through IPython's automagic and may target a different environment.
# Invoking pip as a module of sys.executable is valid Python and installs
# into the environment of the current interpreter.
import subprocess
import sys

subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'plotnine'])

#5 Histogram of x on a density scale with a density curve overlaid,
#  drawn in one panel per dataset.
# The wildcard import is kept as-is: the plotting cells below use the
# plotnine names it brings in without importing them again.
from plotnine import *

without_y = stat.drop(columns='y')

# Histogram heights are mapped to density so they share a scale with
# geom_density().
density_bars = geom_histogram(
    aes(y=after_stat('density')),
    fill='black',
    alpha=0.5,
)

(
    ggplot(without_y, aes(x='x'))
    + facet_wrap('dataset')
    + density_bars
    + geom_density()
    + ylim(0, 0.5)
)

#6 Histogram of y on a density scale with a density curve overlaid,
#  drawn in one panel per dataset.
without_x = stat.drop(columns='x')

# Histogram heights are mapped to density so they share a scale with
# geom_density().
density_bars = geom_histogram(
    aes(y=after_stat('density')),
    fill='black',
    alpha=0.5,
)

(
    ggplot(without_x, aes(x='y'))
    + facet_wrap('dataset')
    + density_bars
    + geom_density()
    + ylim(0, 0.5)
)

#7 Correlation matrix computed separately for each dataset.
rows_by_dataset = stat.groupby(by=['dataset'])
corre = rows_by_dataset.corr()
corre

#7.1 Correlation between x and y within dataset I.
# The previous query string contained an invalid escape sequence and a stray
# trailing backslash. A boolean mask avoids string parsing altogether, and
# the filter now runs once instead of twice.
group_i = stat[stat['dataset'] == 'I']
corrI = group_i['x'].corr(group_i['y'])
corrI

#7.2 Correlation between x and y within dataset II.
# The previous query string contained an invalid escape sequence and a stray
# trailing backslash. A boolean mask avoids string parsing altogether, and
# the filter now runs once instead of twice.
group_ii = stat[stat['dataset'] == 'II']
corrII = group_ii['x'].corr(group_ii['y'])
corrII

#8 Scatter plot of y against x, coloured by dataset, one panel per dataset.
# ggplot is given the DataFrame itself rather than a GroupBy object;
# facet_wrap('dataset') already splits the points into per-dataset panels.
(
    ggplot(stat, aes(x='x', y='y', color='dataset'))
    + geom_point(alpha=0.8)
    + facet_wrap('dataset')
)

#9 'From the graphs in (#8), we know:' 'In dataset 1 (red points): the data points are spread out.' 'In dataset 2 (green points): the data points follow a curve.' 'In dataset 3 (blue points): the data points lie along a straight line, with one outlier.' 'In dataset 4 (purple points): the data points form a straight line perpendicular to the x-axis, which means the data points have the same x value but different y values.'

#extra point