import numpy as np
import pandas as pd
import scipy.stats as sts
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Load the round-1 experimental records.
data = pd.read_csv('/home/jovyan/work/Daily experimental records - Data.csv')
print(data.columns)

# Keep only the variables analysed below and give them short names.
# The explicit .copy() makes data_sub an independent DataFrame rather than a
# slice of `data`, so renaming its columns and adding new columns later does
# not raise pandas' SettingWithCopyWarning.
data_sub = data[['pH 2 (day 1)', 'Solvent volume 2 (ml) (day 1)', 'Average Temperature (degree C)', 'Liquid amount (ml) (day 4)', 'Success']].copy()
data_sub.columns = ['pH', 'solvent volume', 'temp', 'liquid volume', 'success']
# Output of print(data.columns):
# Index(['Sample', 'pH 1 (day 1)', 'pH 2 (day 1)',
# 'Solvent volume 1 (ml) (day 1)', 'Solvent volume 2 (ml) (day 1)',
# 'Cooker (day 1)', 'Contained (day 1)',
# 'Temperature 1 (degree C) (day 1)', 'Temperature 2 (degree C) (day 2)',
# 'Temperature 3 (degree C) (day 4)', 'Average Temperature (degree C)',
# 'Container (day 1)', 'CO2 level (day 4)', 'Smell (day 4)',
# 'Taste (day 4)', 'Texture (day 4)', 'Color (day 4)',
# 'Liquid characteristics', 'Liquid amount (ml) (day 4)', 'Success',
# 'Note'],
# dtype='object')
data_sub

# Liquid volume grouped by temperature: rows at exactly 60 versus all others.
temp_1 = data_sub.loc[data_sub['temp'] == 60, 'liquid volume']
temp_2 = data_sub.loc[data_sub['temp'] != 60, 'liquid volume']

# Liquid volume grouped by the three pH levels used in the experiment.
pH_1 = data_sub.loc[data_sub['pH'] == 9.5, 'liquid volume']
pH_2 = data_sub.loc[data_sub['pH'] == 7, 'liquid volume']
pH_3 = data_sub.loc[data_sub['pH'] == 4, 'liquid volume']
# Mean liquid volume per temperature group, with the group standard
# deviation as error bars. Each chart is drawn on its own figure: without
# plt.figure(), running this file as a plain script would draw the second
# bar chart onto the same axes as the first.
# NOTE(review): temp_2 is selected as "temp != 60"; the 'Temp = 40C' label
# assumes every other row was run at 40 C -- confirm against the data.
plt.figure()
plt.bar(['Temp = 60C', 'Temp = 40C'], [temp_1.mean(), temp_2.mean()], yerr = [temp_1.std(), temp_2.std()], capsize=3)
plt.title('Averaged liquid volume at 2 temperature levels')
plt.xlabel('Conditions')
plt.ylabel('Liquid volume (ml)')
plt.show()

# Mean liquid volume per pH group, same layout as above.
plt.figure()
plt.bar(['pH = 9.5', 'pH = 7', 'pH = 4'], [pH_1.mean(), pH_2.mean(), pH_3.mean()], yerr = [pH_1.std(), pH_2.std(), pH_3.std()], capsize=3)
plt.title('Averaged liquid volume at 3 pH conditions')
plt.xlabel('Conditions')
plt.ylabel('Liquid volume (ml)')
plt.show()
# Standardise the three numeric predictors (zero mean, unit variance) and
# store them as '<name> scaled' columns. StandardScaler works column by
# column, so one call on the 2-D block gives the same values as fitting a
# separate scaler per column.
#
# DataFrame.assign returns a new frame instead of writing into data_sub in
# place, which avoids the SettingWithCopyWarning that the original
# `data_sub[...] = ...` assignments emitted (data_sub is a slice of `data`).
#
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down. For ordinary least squares this does not change the
# predictions, but fit on the training rows only if the model is changed.
raw_columns = ['pH', 'solvent volume', 'temp']
scaled_values = StandardScaler().fit_transform(data_sub[raw_columns].to_numpy())
data_sub = data_sub.assign(
    **{f'{column} scaled': scaled_values[:, position]
       for position, column in enumerate(raw_columns)}
)
# Round-1 model: liquid volume as a linear function of scaled pH and
# scaled temperature. 30% of the rows are held out; the fixed random_state
# keeps the split reproducible.
predictors = data_sub[['pH scaled', 'temp scaled']]
response = data_sub['liquid volume']
X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    response,
    test_size=0.3,
    random_state=123,
)

regr = LinearRegression()
regr.fit(X_train, y_train)

# Predictions for the held-out rows.
pred = regr.predict(X_test)
pred

# R2 score on the held-out rows
regr.score(X_test, y_test)

# fitted coefficients, in the order ['pH scaled', 'temp scaled']
regr.coef_
def cohens_d(x, y):
    """Return Cohen's d effect size between two independent samples.

    d = (mean(x) - mean(y)) / pooled_sd, where the pooled standard deviation
    combines the two *sample* variances (ddof=1) weighted by their degrees
    of freedom.

    Parameters
    ----------
    x, y : array-like of numbers (pandas Series, numpy array, list, ...)
        The two samples. NaN entries are dropped before any statistic is
        computed, so the sample sizes, means and variances all refer to the
        same observations.

    Returns
    -------
    float
        Positive when mean(x) > mean(y). NaN/inf if the pooled standard
        deviation is undefined or zero (e.g. fewer than two observations in
        total, or constant samples).
    """
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    n1 = x.shape[0]
    n2 = y.shape[0]

    # ddof=1 is stated explicitly: pandas' .var() defaults to the sample
    # variance but numpy's defaults to the population variance, so relying
    # on the default gave different answers depending on the input type.
    pooled_var = ((n1 - 1) * x.var(ddof=1) + (n2 - 1) * y.var(ddof=1)) / (n1 + n2 - 2)
    return (x.mean() - y.mean()) / pooled_var ** 0.5
# Round-1 group comparisons on liquid volume. Each comparison reports a
# two-sample t-test followed by the Cohen's d effect size for the same pair.
# equal_var=False selects Welch's t-test, which does not assume the two
# groups share a variance; it is used for every pair so the p-values are
# comparable with each other (the pH 4 vs pH 9.5 pair previously used the
# equal-variance Student test, unlike all other comparisons in this file).

# 60 C vs the other temperature group
sts.ttest_ind(temp_1, temp_2, equal_var=False)
cohens_d(temp_1, temp_2)

# pH 9.5 vs pH 7
sts.ttest_ind(pH_1, pH_2, equal_var=False)
cohens_d(pH_1, pH_2)

# pH 7 vs pH 4
sts.ttest_ind(pH_2, pH_3, equal_var=False)
cohens_d(pH_2, pH_3)

# pH 4 vs pH 9.5
sts.ttest_ind(pH_3, pH_1, equal_var=False)
cohens_d(pH_3, pH_1)
# Load the round-2 experimental records.
data2 = pd.read_csv('/home/jovyan/work/Daily experimental records - Round 2.csv')
data2

# CO2 volume for each of the three pH groups
basic = data2.loc[data2['pH'] == 9.5, 'CO2 (ml)']
neutral = data2.loc[data2['pH'] == 7, 'CO2 (ml)']
acidic = data2.loc[data2['pH'] == 4, 'CO2 (ml)']
# Mean CO2 volume per pH group, with the group standard deviation as error
# bars. A fresh figure is opened first so that, when the file runs as a
# plain script, these bars are not drawn onto the axes of an earlier chart.
plt.figure()
plt.bar(['pH = 9.5', 'pH = 7', 'pH = 4'], [basic.mean(), neutral.mean(), acidic.mean()], yerr = [basic.std(), neutral.std(), acidic.std()], capsize=3)
plt.title('Averaged CO2 volume at 3 pH conditions')
plt.xlabel('Conditions')
plt.ylabel('CO2 volume (ml)')
plt.show()
# Round-2 model: CO2 volume as a linear function of (unscaled) pH.
# pH is passed as a single-column 2-D array because scikit-learn expects
# features with shape (n_samples, n_features). 30% of rows are held out.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    data2[['pH']].to_numpy(),
    data2['CO2 (ml)'],
    test_size=0.3,
    random_state=123,
)

regr2 = LinearRegression()
regr2.fit(X_train_2, y_train_2)

# R2 score on the held-out rows
regr2.score(X_test_2, y_test_2)
# Round-2 group comparisons on CO2 volume: Cohen's d effect size and
# Welch's t-test (equal_var=False) for each pair of pH groups.

# pH 9.5 vs pH 7
cohens_d(basic, neutral)
sts.ttest_ind(basic, neutral, equal_var=False)

# pH 7 vs pH 4
cohens_d(neutral, acidic)
sts.ttest_ind(neutral, acidic, equal_var=False)

# pH 9.5 vs pH 4
cohens_d(basic, acidic)
sts.ttest_ind(basic, acidic, equal_var=False)