Import packages
import numpy as np
import pandas as pd
import scipy.stats as sts
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
Round 1
Load data
# Load the Round 1 experimental records from a hard-coded absolute path.
data = pd.read_csv('/home/jovyan/work/Daily experimental records - Data.csv')
print(data.columns)

# Raw CSV column name -> short name used throughout the rest of the analysis.
column_names = {
    'pH 2 (day 1)': 'pH',
    'Solvent volume 2 (ml) (day 1)': 'solvent volume',
    'Average Temperature (degree C)': 'temp',
    'Liquid amount (ml) (day 4)': 'liquid volume',
    'Success': 'success',
}

# Take an explicit copy of the selection: scaled columns are added to
# data_sub later on, and assigning into a plain selection of `data`
# triggers pandas' SettingWithCopyWarning.
data_sub = data[list(column_names)].copy()
data_sub = data_sub.rename(columns=column_names)
data_sub
Visualize data
# Split the liquid-volume measurements by temperature and by pH level.
liquid = data_sub['liquid volume']
temp_1 = liquid[data_sub['temp'] == 60]
# NOTE(review): every row whose temperature is not exactly 60 lands here and
# is labelled "40C" in the chart below - assumes only two temperature levels
# (and no missing temperatures) exist in the data; confirm.
temp_2 = liquid[data_sub['temp'] != 60]
pH_1 = liquid[data_sub['pH'] == 9.5]
pH_2 = liquid[data_sub['pH'] == 7]
pH_3 = liquid[data_sub['pH'] == 4]

# Chart 1: mean liquid volume per temperature level; error bars are one
# standard deviation. A fresh figure is opened for each chart so that the
# second bar chart is not drawn on top of the first one (and does not
# overwrite its title) when both are executed together.
plt.figure()
plt.bar(
    ['Temp = 60C', 'Temp = 40C'],
    [temp_1.mean(), temp_2.mean()],
    yerr=[temp_1.std(), temp_2.std()],
    capsize=3,
)
plt.title('Averaged liquid volume at 2 temperature levels')
plt.xlabel('Conditions')
plt.ylabel('Liquid volume (ml)')

# Chart 2: mean liquid volume per pH level, same error-bar convention.
plt.figure()
plt.bar(
    ['pH = 9.5', 'pH = 7', 'pH = 4'],
    [pH_1.mean(), pH_2.mean(), pH_3.mean()],
    yerr=[pH_1.std(), pH_2.std(), pH_3.std()],
    capsize=3,
)
plt.title('Averaged liquid volume at 3 pH conditions')
plt.xlabel('Conditions')
plt.ylabel('Liquid volume (ml)')
Linear regression
# Standardise each predictor with its own StandardScaler and store the result
# next to the raw values as "<name> scaled".
for feature in ('pH', 'solvent volume', 'temp'):
    # scikit-learn expects a 2-D (n_samples, 1) array for a single feature.
    raw_column = data_sub[feature].to_numpy().reshape(-1, 1)
    data_sub[feature + ' scaled'] = StandardScaler().fit_transform(raw_column)

# Hold out 30% of the rows for evaluation; the seed keeps the split reproducible.
predictors = data_sub[['pH scaled', 'temp scaled']]
response = data_sub['liquid volume']
X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    response,
    test_size=0.3,
    random_state=123,
)

# Fit liquid volume ~ scaled pH + scaled temperature (fit() returns the estimator).
regr = LinearRegression().fit(X_train, y_train)
pred = regr.predict(X_test)
pred
# R2 score on the held-out rows
regr.score(X_test, y_test)
# fitted coefficients, in the order ['pH scaled', 'temp scaled']
regr.coef_
Pairwise comparison
def cohens_d(x, y):
    """Return Cohen's d effect size for two independent samples.

    The difference of the sample means (``x`` minus ``y``) is divided by the
    pooled standard deviation, which is built from the unbiased (ddof=1)
    sample variances weighted by their degrees of freedom.

    Parameters
    ----------
    x, y : array-like of numbers
        The two samples (pandas Series, NumPy arrays or plain sequences).
        NaN entries are ignored.

    Returns
    -------
    float
        ``(mean(x) - mean(y)) / pooled_sd``. The result is NaN when a sample
        is too small for its variance to be defined.
    """
    sample_x = np.asarray(x, dtype=float)
    sample_y = np.asarray(y, dtype=float)
    # Drop missing values up front so the sample sizes used as weights agree
    # with the observations that actually enter the means and variances.
    sample_x = sample_x[~np.isnan(sample_x)]
    sample_y = sample_y[~np.isnan(sample_y)]
    n1 = sample_x.size
    n2 = sample_y.size
    # ddof=1 is spelled out: NumPy's default (ddof=0) is the population
    # variance, which would understate the pooled standard deviation.
    pooled_var = (
        (n1 - 1) * sample_x.var(ddof=1) + (n2 - 1) * sample_y.var(ddof=1)
    ) / (n1 + n2 - 2)
    return (sample_x.mean() - sample_y.mean()) / pooled_var ** 0.5
Temperature
# Compare liquid volume between the two temperature groups:
# Welch's t-test (no equal-variance assumption), then the effect size.
sts.ttest_ind(a=temp_1, b=temp_2, equal_var=False)
cohens_d(x=temp_1, y=temp_2)
pH
# Pairwise comparisons of liquid volume between the three pH groups.
# Each pair gets Welch's t-test (equal_var=False) followed by Cohen's d.

# pH 9.5 vs pH 7
sts.ttest_ind(pH_1, pH_2, equal_var=False)
cohens_d(pH_1, pH_2)

# pH 7 vs pH 4
sts.ttest_ind(pH_2, pH_3, equal_var=False)
cohens_d(pH_2, pH_3)

# pH 4 vs pH 9.5 - uses Welch's test like every other comparison in this
# analysis, so the three pH p-values rest on the same assumptions.
sts.ttest_ind(pH_3, pH_1, equal_var=False)
cohens_d(pH_3, pH_1)
Round 2
Load data
# Load the Round 2 experimental records from a hard-coded absolute path.
data2 = pd.read_csv('/home/jovyan/work/Daily experimental records - Round 2.csv')
data2
# CO2 measurements split into one group per pH level
basic = data2.loc[data2['pH'] == 9.5, 'CO2 (ml)']
neutral = data2.loc[data2['pH'] == 7, 'CO2 (ml)']
acidic = data2.loc[data2['pH'] == 4, 'CO2 (ml)']
Visualize data
# Bar chart of the mean CO2 volume per pH group; error bars are one
# standard deviation of each group.
groups_by_label = {'pH = 9.5': basic, 'pH = 7': neutral, 'pH = 4': acidic}
bar_heights = [group.mean() for group in groups_by_label.values()]
bar_errors = [group.std() for group in groups_by_label.values()]
plt.bar(list(groups_by_label), bar_heights, yerr=bar_errors, capsize=3)
plt.title('Averaged CO2 volume at 3 pH conditions')
plt.xlabel('Conditions')
plt.ylabel('CO2 volume (ml)')
Linear regression
# Regress CO2 volume on pH. scikit-learn needs a 2-D feature array, hence
# the reshape of the single pH column to (n_samples, 1).
ph_values = data2['pH'].to_numpy().reshape(-1, 1)
co2_volume = data2['CO2 (ml)']

# Hold out 30% of the rows for evaluation; the seed keeps the split reproducible.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    ph_values,
    co2_volume,
    test_size=0.3,
    random_state=123,
)

# fit() returns the estimator itself, so it can be bound in one step.
regr2 = LinearRegression().fit(X_train_2, y_train_2)
# R2 score on the held-out rows
regr2.score(X_test_2, y_test_2)
Pairwise comparisons
# Effect size (Cohen's d) for every pair of pH groups.
cohens_d(x=basic, y=neutral)
cohens_d(x=neutral, y=acidic)
cohens_d(x=basic, y=acidic)
# Welch's t-test (no equal-variance assumption) for the same three pairs.
sts.ttest_ind(a=basic, b=neutral, equal_var=False)
sts.ttest_ind(a=neutral, b=acidic, equal_var=False)
sts.ttest_ind(a=basic, b=acidic, equal_var=False)