'ID 1 : 6305007 Chanokchon Timklay'
'ID 2 : 6305205 Noppanut Thongyard'
#1
# Read the CSV from the gist URL with default options and show the raw table.
import pandas as pd

url = 'https://gist.githubusercontent.com/ericbusboom/b2ac1d366c005cd2ed8c/raw/c92c66e43d144fa9c29dbd602d5af6988e8db533/anscombes.csv'

pd.read_csv(filepath_or_buffer=url)
#2
# Build one summary table with the mean and the sample variance of x and y
# for every dataset.
import pandas as pd
url = 'https://gist.githubusercontent.com/ericbusboom/b2ac1d366c005cd2ed8c/raw/c92c66e43d144fa9c29dbd602d5af6988e8db533/anscombes.csv'
# index_col=0 makes the first CSV column the row index instead of a data column.
stat = pd.read_csv(url, index_col=0)

grouped = stat.groupby(['dataset'])
Atable = grouped.mean()  # per-dataset means
Btable = grouped.var()   # per-dataset variances (pandas default ddof=1, i.e. sample variance)

# Suffix the column names and join the two tables side by side. This yields
# the columns 'x mean', 'y mean', 'x variance', 'y variance' directly, without
# copying columns under new names and dropping the originals afterwards.
summary = pd.concat(
    [
        Atable[['x', 'y']].add_suffix(' mean'),
        Btable[['x', 'y']].add_suffix(' variance'),
    ],
    axis=1,
)
summary
#2.1
# Each answer selects one dataset's column with a boolean mask and computes the
# single statistic it needs, rather than running describe() and reading one
# entry out of it.
x_group_I = stat.loc[stat['dataset'] == 'I', 'x']
print('The mean of x in group I is', x_group_I.mean())
#2.2
x_group_II = stat.loc[stat['dataset'] == 'II', 'x']
print('The mean of x in group II is', x_group_II.mean())
#2.3
y_group_I = stat.loc[stat['dataset'] == 'I', 'y']
# Series.var() defaults to ddof=1, the same sample variance that squaring
# describe()'s 'std' produced, without the extra sqrt/square round trip.
print('The sample variance of y in group I is', y_group_I.var())
#2.4
y_group_II = stat.loc[stat['dataset'] == 'II', 'y']
print('The sample variance of y in group II is', y_group_II.var())
#3
# Box plot of x with one box per dataset; y is removed from the frame first.
dely = stat.drop('y', axis='columns')
dely.boxplot(column='x', by='dataset')
#4
# Box plot of y with one box per dataset; x is removed from the frame first.
delx = stat.drop('x', axis='columns')
delx.boxplot(column='y', by='dataset')
# Make sure plotnine is available before it is imported below.
# A bare `pip install plotnine` is only understood by Jupyter (automagic); in a
# plain Python file it is a SyntaxError that stops the whole file from running.
# Invoking pip through the current interpreter works in both settings, and the
# find_spec guard skips the install when the package is already present.
import importlib.util
import subprocess
import sys

if importlib.util.find_spec('plotnine') is None:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'plotnine'])
#5
# Density-scaled histogram of x with a density curve on top, one panel per
# dataset, with the y-axis fixed to the range 0..0.5.
from plotnine import *
dely = stat.drop('y', axis='columns')
(
    ggplot(data=dely, mapping=aes(x='x'))
    + facet_wrap('dataset')
    + geom_histogram(aes(y=after_stat('density')), fill='black', alpha=0.5)
    + geom_density()
    + ylim(0, 0.5)
)
#6
# Density-scaled histogram of y with a density curve on top, one panel per
# dataset, with the y-axis fixed to the range 0..0.5.
delx = stat.drop('x', axis='columns')
(
    ggplot(data=delx, mapping=aes(x='y'))
    + facet_wrap('dataset')
    + geom_histogram(aes(y=after_stat('density')), fill='black', alpha=0.5)
    + geom_density()
    + ylim(0, 0.5)
)
#7
# Correlation matrix computed separately within each dataset group.
by_dataset = stat.groupby(['dataset'])
corre = by_dataset.corr()
corre
#7.1
# Correlation between x and y inside dataset I.
# Rows are picked with a boolean mask. The earlier query strings contained
# `\ `, which is not a valid Python escape sequence and left a stray backslash
# in the expression passed to DataFrame.query.
group_I = stat[stat['dataset'] == 'I']
corrI = group_I['x'].corr(group_I['y'])
corrI
#7.2
# Same computation for dataset II.
group_II = stat[stat['dataset'] == 'II']
corrII = group_II['x'].corr(group_II['y'])
corrII
#8
# Scatter plot of y against x, coloured by dataset and split into one panel
# per dataset.
# ggplot receives the DataFrame itself: its data argument is documented as a
# DataFrame, and the colour mapping plus facet_wrap already separate the points
# by dataset, so wrapping the frame in groupby() first is not needed.
(
    ggplot(stat, aes(x='x', y='y', color='dataset'))
    + geom_point(alpha=0.8)
    + facet_wrap('dataset')
)
#9
'From the scatter plots in (#8), we can see the following:'
'In dataset 1 (red points): the data points are spread out.'
'In dataset 2 (green points): the data points lie along a curve.'
'In dataset 3 (blue points): the data points lie on a straight line, apart from one outlier.'
'In dataset 4 (purple points): the data points form a straight line perpendicular to the x-axis, which means they have the same x value but different y values.'
#extra point