'ID 1 : 6305007 Chanokchon Timklay'
'ID 2 : 6305205 Noppanut Thongyard'
#1
import pandas as pd
# Location of the Anscombe's quartet CSV (pinned gist revision).
url = (
    'https://gist.githubusercontent.com/ericbusboom/b2ac1d366c005cd2ed8c'
    '/raw/c92c66e43d144fa9c29dbd602d5af6988e8db533/anscombes.csv'
)
# Preview the raw table; the result is displayed, not stored.
pd.read_csv(url)
idint64
0 - 43
datasetobject
I25%
II25%
2 others50%
10
10
I
11
11
II
12
12
II
13
13
II
14
14
II
15
15
II
16
16
II
17
17
II
18
18
II
19
19
II
#2
import pandas as pd
# Load Anscombe's quartet, using the first CSV column as the row index so
# that only `dataset`, `x` and `y` remain as data columns.
url = 'https://gist.githubusercontent.com/ericbusboom/b2ac1d366c005cd2ed8c/raw/c92c66e43d144fa9c29dbd602d5af6988e8db533/anscombes.csv'
stat = pd.read_csv(url, index_col=0)

# Per-dataset mean and sample variance (pandas default ddof=1) of x and y.
# One named aggregation replaces the earlier chain of concat/rename/drop
# steps and yields the same four columns in the same order.
stat.groupby('dataset').agg(**{
    'x mean': ('x', 'mean'),
    'y mean': ('y', 'mean'),
    'x variance': ('x', 'var'),
    'y variance': ('y', 'var'),
})
x meanfloat64
y meanfloat64
I
9
7.500909091
II
9
7.500909091
III
9
7.5
IV
9
7.500909091
#2.1
# Summary statistics of x restricted to dataset I; report only the mean.
group_one = stat.loc[stat['dataset'] == 'I']
a = group_one['x'].describe()
print('Themean of x in group I is', a['mean'])
Themean of x in group I is 9.0
#2.2
# Summary statistics of x restricted to dataset II; report only the mean.
group_two = stat.loc[stat['dataset'] == 'II']
b = group_two['x'].describe()
print('THe mean of x in group II is', b['mean'])
THe mean of x in group II is 9.0
#2.3
# y values belonging to dataset I.
aa = stat.loc[stat['dataset'] == 'I', 'y']
# Series.var() uses ddof=1 by default, i.e. the sample variance. Calling it
# directly avoids squaring an already-rounded standard deviation taken from
# describe(), which can lose the last digits of precision.
print('Tha sample variance of y in group I is', aa.var())
Tha sample variance of y in group I is 4.127269090909091
#2.4
# y values belonging to dataset II.
bb = stat.loc[stat['dataset'] == 'II', 'y']
# Series.var() uses ddof=1 by default, i.e. the sample variance. Calling it
# directly avoids squaring an already-rounded standard deviation taken from
# describe(), which can lose the last digits of precision.
print('The sample variance of y in group II is', bb.var())
The sample variance of y in group II is 4.127629090909091
#3
# Box plot of x, with one box per dataset.
dely = stat.drop(columns='y')
dely.boxplot(column='x', by='dataset')
#4
# Box plot of y, with one box per dataset.
delx = stat.drop(columns='x')
delx.boxplot(column='y', by='dataset')
pip install plotnine
Requirement already satisfied: plotnine in /usr/local/lib/python3.7/site-packages (0.8.0)
Requirement already satisfied: descartes>=1.1.0 in /usr/local/lib/python3.7/site-packages (from plotnine) (1.1.0)
Requirement already satisfied: matplotlib>=3.1.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from plotnine) (3.5.1)
Requirement already satisfied: numpy>=1.19.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from plotnine) (1.21.5)
Requirement already satisfied: mizani>=0.7.3 in /usr/local/lib/python3.7/site-packages (from plotnine) (0.7.3)
Requirement already satisfied: patsy>=0.5.1 in /usr/local/lib/python3.7/site-packages (from plotnine) (0.5.2)
Requirement already satisfied: scipy>=1.5.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from plotnine) (1.7.3)
Requirement already satisfied: statsmodels>=0.12.1 in /usr/local/lib/python3.7/site-packages (from plotnine) (0.13.2)
Requirement already satisfied: pandas>=1.1.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from plotnine) (1.3.5)
Requirement already satisfied: fonttools>=4.22.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (4.29.1)
Requirement already satisfied: pyparsing>=2.2.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (3.0.7)
Requirement already satisfied: cycler>=0.10 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (0.11.0)
Requirement already satisfied: pillow>=6.2.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (9.0.1)
Requirement already satisfied: python-dateutil>=2.7 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (2.8.2)
Requirement already satisfied: packaging>=20.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (21.3)
Requirement already satisfied: kiwisolver>=1.0.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib>=3.1.1->plotnine) (1.3.2)
Requirement already satisfied: palettable in /usr/local/lib/python3.7/site-packages (from mizani>=0.7.3->plotnine) (3.3.0)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.1->plotnine) (1.16.0)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=1.1.0->plotnine) (2021.3)
WARNING: You are using pip version 20.1.1; however, version 22.0.4 is available.
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
#5
from plotnine import *
# Density-scaled histogram of x with a kernel density curve overlaid,
# drawn in one panel per dataset.
dely = stat.drop(columns=['y'])
(
    ggplot(dely, aes(x='x'))
    + facet_wrap('dataset')
    # bins=7 is the value stat_bin reported choosing on its own; stating it
    # explicitly keeps the same plot without the "Pick better value" warning.
    + geom_histogram(
        aes(y=after_stat('density')),
        bins=7,
        fill='black',
        alpha=0.5,
    )
    + geom_density()
    + ylim(0, 0.5)
)
/usr/local/lib/python3.7/site-packages/plotnine/stats/stat_bin.py:95: PlotnineWarning: 'stat_bin()' using 'bins = 7'. Pick better value with 'binwidth'.
#6
# Density-scaled histogram of y with a kernel density curve overlaid,
# drawn in one panel per dataset.
delx = stat.drop(columns=['x'])
(
    ggplot(delx, aes(x='y'))
    + facet_wrap('dataset')
    # bins=7 is the value stat_bin reported choosing on its own; stating it
    # explicitly keeps the same plot without the "Pick better value" warning.
    + geom_histogram(
        aes(y=after_stat('density')),
        bins=7,
        fill='black',
        alpha=0.5,
    )
    + geom_density()
    + ylim(0, 0.5)
)
/usr/local/lib/python3.7/site-packages/plotnine/stats/stat_bin.py:95: PlotnineWarning: 'stat_bin()' using 'bins = 7'. Pick better value with 'binwidth'.
#7
# Correlation matrix of the numeric columns, computed separately within
# each dataset; the result is displayed as the cell output.
by_dataset = stat.groupby('dataset')
corre = by_dataset.corr()
corre
#7.1
# Correlation between x and y within dataset I (Series.corr defaults to
# Pearson). The rows are selected once with a boolean mask: the previous
# query string contained an invalid "\ " escape sequence, which left a stray
# backslash inside the query expression, and it ran the same query twice.
group_i = stat[stat['dataset'] == 'I']
corrI = group_i['x'].corr(group_i['y'])
corrI
#7.2
# Correlation between x and y within dataset II (Series.corr defaults to
# Pearson). The rows are selected once with a boolean mask: the previous
# query string contained an invalid "\ " escape sequence, which left a stray
# backslash inside the query expression, and it ran the same query twice.
group_ii = stat[stat['dataset'] == 'II']
corrII = group_ii['x'].corr(group_ii['y'])
corrII
#8
# Scatter plot of y against x, colored by dataset and drawn in one panel per
# dataset. The DataFrame is passed to ggplot directly: the color aesthetic
# and facet_wrap already split the points by dataset, so grouping the frame
# first adds nothing and hands ggplot a GroupBy object instead of a DataFrame.
(
    ggplot(stat, aes(x='x', y='y', color='dataset'))
    + geom_point(alpha=0.8)
    + facet_wrap('dataset')
)
#9
'From the scatter plots in (#8), we observe the following:'
'In dataset 1 (red points): the data points are spread out.'
'In dataset 2 (green points): the data points follow a curve.'
'In dataset 3 (blue points): the data points lie along a straight line, apart from one outlier.'
'In dataset 4 (purple points): the data points form a straight line perpendicular to the x-axis, which means they share the same x value but have different y values.'
#extra point