# import standard scientific libraries
import os
import math
import numpy as np
import pandas as pd
# import ML models from scikit-learn (not all of these are used below)
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn import svm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing
import matplotlib.pyplot as plt
def plot_loss(history):
    plt.plot(history.history['loss'], label='loss')
    plt.xlabel('epoch')
    plt.ylabel('loss (MAE)')
    plt.legend()
    plt.grid(True)
RANDOM_SEED = 4
np.random.seed(RANDOM_SEED)
pd.set_option('display.max_columns', None)
pd.set_option("display.precision", 8)
dataset = "../dataset/"  # path to the dataset directory (unused below)
train = pd.read_csv("train.csv")  # [:66000]
train.shape
train
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68613 entries, 0 to 68612
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 MOFname 68613 non-null object
1 volume [A^3] 68613 non-null float64
2 weight [u] 68613 non-null float64
3 surface_area [m^2/g] 68613 non-null float64
4 void_fraction 68613 non-null float64
5 void_volume [cm^3/g] 68613 non-null float64
6 functional_groups 68290 non-null object
7 metal_linker 68613 non-null int64
8 organic_linker1 68613 non-null int64
9 organic_linker2 68613 non-null int64
10 topology 68613 non-null object
11 CO2/N2_selectivity 68613 non-null float64
12 heat_adsorption_CO2_P0.15bar_T298K [kcal/mol] 66526 non-null float64
13 CO2_working_capacity [mL/g] 68613 non-null float64
dtypes: float64(8), int64(3), object(3)
memory usage: 7.3+ MB
train = train.replace([np.inf, -np.inf], np.nan)
#train = train[train['heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]'].notna()]
train = train[train['functional_groups'].notna()]
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 68290 entries, 0 to 68612
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 MOFname 68290 non-null object
1 volume [A^3] 68290 non-null float64
2 weight [u] 68290 non-null float64
3 surface_area [m^2/g] 68290 non-null float64
4 void_fraction 68290 non-null float64
5 void_volume [cm^3/g] 68290 non-null float64
6 functional_groups 68290 non-null object
7 metal_linker 68290 non-null int64
8 organic_linker1 68290 non-null int64
9 organic_linker2 68290 non-null int64
10 topology 68290 non-null object
11 CO2/N2_selectivity 68290 non-null float64
12 heat_adsorption_CO2_P0.15bar_T298K [kcal/mol] 66203 non-null float64
13 CO2_working_capacity [mL/g] 68290 non-null float64
dtypes: float64(8), int64(3), object(3)
memory usage: 7.8+ MB
# train = pd.get_dummies(train, columns=["functional_groups"])
# train = pd.get_dummies(train, columns=["topology"])
# train
# integer-encode the two categorical columns
col = ["functional_groups", "topology"]
for i in col:
    train[i] = train[i].astype("category").cat.codes
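# A caveat worth noting (illustrative sketch with toy data, not part of the
# original run): .cat.codes numbers whatever categories appear in *this*
# frame, so encoding the train and test frames independently can assign the
# same label different integers. Reusing the training categories keeps the
# mapping stable, and unseen labels become -1.
tr = pd.DataFrame({"topology": ["pcu", "dia", "pcu"]})    # toy stand-in for train
te = pd.DataFrame({"topology": ["srs", "pcu"]})           # toy stand-in for test
cats = tr["topology"].astype("category").cat.categories   # categories seen in train
te["topology"] = pd.Categorical(te["topology"], categories=cats).codes
print(te)  # "pcu" keeps its train code (1); unseen "srs" becomes -1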
train = train.drop(['MOFname'], axis=1)
train = train[train['void_fraction'] > 0]
train = train[train['void_volume [cm^3/g]'] > 0]
# train = train[train['CO2/N2_selectivity'] > 0]
train = train[train['heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]'].notna()]
train = train[train['surface_area [m^2/g]'] > 0]
train
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 53778 entries, 1 to 66523
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 volume [A^3] 53778 non-null float64
1 weight [u] 53778 non-null float64
2 surface_area [m^2/g] 53778 non-null float64
3 void_fraction 53778 non-null float64
4 void_volume [cm^3/g] 53778 non-null float64
5 functional_groups 53778 non-null int16
6 metal_linker 53778 non-null int64
7 organic_linker1 53778 non-null int64
8 organic_linker2 53778 non-null int64
9 topology 53778 non-null int8
10 CO2/N2_selectivity 53778 non-null float64
11 heat_adsorption_CO2_P0.15bar_T298K [kcal/mol] 53778 non-null float64
12 CO2_working_capacity [mL/g] 53778 non-null float64
dtypes: float64(8), int16(1), int64(3), int8(1)
memory usage: 5.1 MB
# find columns containing NaN (any() over axis=0 reduces across rows)
train.isnull().any(axis=0)
# train.fillna(method='pad', inplace=True)
# train.groupby('functional_groups')['functional_groups'].count()
# count rows whose functional_groups code is 240
count = (train['functional_groups'] == 240).sum()
print(count)
1036
# find columns containing inf (all columns are numeric after encoding)
np.isinf(train).any(axis=0)
train
x = train.drop(['CO2_working_capacity [mL/g]'],axis=1)
y = train['CO2_working_capacity [mL/g]']
x_train, x_test, y_train, y_true = train_test_split(x, y, test_size=0.2, random_state=RANDOM_SEED)
scaler = StandardScaler()
import tensorflow as tf
x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)
x_test = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns)  # transform only: refitting on the test split would leak its statistics
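# A minimal sketch (an aside, not the pipeline used in this notebook):
# wrapping the scaler in a sklearn Pipeline makes the fit-on-train /
# transform-on-test split automatic, which avoids the fit_transform-on-test
# mistake fixed above. LinearRegression here is just a placeholder estimator.
from sklearn.pipeline import Pipeline
pipe = Pipeline([("scale", StandardScaler()), ("reg", LinearRegression())])
pipe.fit(x_train, y_train)        # the scaler is fit on the training split only
pipe_pred = pipe.predict(x_test)  # x_test is transformed with the train statistics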
normalizer = preprocessing.Normalization(axis=-1)
normalizer.adapt(np.array(x_train))
# the adapted means are ~0 (and variances ~1) because x_train was already
# standardized above, so this layer is nearly an identity here
print(normalizer.mean.numpy())
[-3.9039222e-17 2.8867553e-16 -1.5687945e-16 3.6055024e-16
9.1043345e-18 1.1532158e-16 -1.1961568e-16 1.4999701e-17
2.4895503e-16 -2.1439572e-17 -2.9236062e-16 -3.9916364e-16]
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import layers, regularizers
initializer = tf.keras.initializers.VarianceScaling(
    scale=0.1, mode='fan_in', distribution='uniform', seed=RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)
model = Sequential([normalizer])  # Sequential takes a list of layers; the normalizer fixes the input shape
model.add(Dense(416, kernel_initializer=initializer,
                kernel_regularizer=regularizers.l2(0.0001), activation='relu'))  # input
model.add(layers.Dropout(0.2))
model.add(Dense(416, kernel_initializer=initializer,
                kernel_regularizer=regularizers.l2(0.0001), activation='relu'))  # hidden 1
model.add(layers.Dropout(0.2))
model.add(Dense(416, kernel_initializer=initializer,
                kernel_regularizer=regularizers.l2(0.0001), activation='relu'))  # hidden 2
model.add(layers.Dropout(0.2))
model.add(Dense(416, kernel_initializer=initializer,
                kernel_regularizer=regularizers.l2(0.0001), activation='relu'))  # hidden 3
model.add(layers.Dropout(0.2))
model.add(Dense(1))  # output
model.summary()
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
normalization (Normalization (None, 12) 25
_________________________________________________________________
dense (Dense) (None, 416) 5408
_________________________________________________________________
dropout (Dropout) (None, 416) 0
_________________________________________________________________
dense_1 (Dense) (None, 416) 173472
_________________________________________________________________
dropout_1 (Dropout) (None, 416) 0
_________________________________________________________________
dense_2 (Dense) (None, 416) 173472
_________________________________________________________________
dropout_2 (Dropout) (None, 416) 0
_________________________________________________________________
dense_3 (Dense) (None, 416) 173472
_________________________________________________________________
dropout_3 (Dropout) (None, 416) 0
_________________________________________________________________
dense_4 (Dense) (None, 1) 417
=================================================================
Total params: 526,266
Trainable params: 526,241
Non-trainable params: 25
_________________________________________________________________
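# note on the summary above: the 25 non-trainable params are the Normalization
# layer's mean (12), variance (12), and sample count (1); each Dense layer has
# in_features*416 + 416 weights (e.g. 12*416 + 416 = 5408 for the first)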
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    0.001,
    decay_steps=x_train.shape[0] * 1000,
    decay_rate=0.1,
    staircase=False)
opt = tf.keras.optimizers.Adamax(lr_schedule)
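# What InverseTimeDecay computes (a quick check, an addition to the original):
# lr(step) = 0.001 / (1 + 0.1 * step / decay_steps). With decay_steps set to
# x_train.shape[0] * 1000 (~43 million) the rate barely moves over 98 epochs:
for step in (0, 337 * 98):  # first step and the last of 337 batches x 98 epochs
    print(step, float(lr_schedule(step)))
# both values print as ~0.001, so this run effectively used a constant rate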
# stop_early = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)
model.compile(loss='mean_absolute_error',optimizer=opt)
history = model.fit(x_train, y_train, epochs=98, batch_size=128)
Epoch 1/98
337/337 [==============================] - 15s 40ms/step - loss: 50.8373
Epoch 2/98
337/337 [==============================] - 13s 39ms/step - loss: 26.9328
Epoch 3/98
337/337 [==============================] - 14s 41ms/step - loss: 25.1312
... [epochs 4-95 omitted; the loss falls noisily from 24.11 to 18.06] ...
Epoch 96/98
337/337 [==============================] - 13s 39ms/step - loss: 17.8637
Epoch 97/98
337/337 [==============================] - 13s 38ms/step - loss: 17.9196
Epoch 98/98
337/337 [==============================] - 13s 39ms/step - loss: 17.8466
plot_loss(history)
loss_per_epoch = history.history['loss']  # training loss; no validation split was used
best_epoch = loss_per_epoch.index(min(loss_per_epoch)) + 1
print('Best epoch: %d' % best_epoch)
Best epoch: 96
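# A hedged alternative (not what was run above): the "best epoch" here is
# picked from *training* loss, which tends to favour the final epochs. With a
# validation split, EarlyStopping can pick it on held-out loss and restore
# those weights automatically, e.g.:
# stop_early = tf.keras.callbacks.EarlyStopping(
#     monitor='val_loss', patience=10, restore_best_weights=True)
# history = model.fit(x_train, y_train, validation_split=0.1,
#                     epochs=98, batch_size=128, callbacks=[stop_early])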
y_pred = model.predict(x_train)
log_mae = np.log(mean_absolute_error(y_train, y_pred))  # log-MAE on the training split
log_mae
y_pred = model.predict(x_test)
log_mae = np.log(mean_absolute_error(y_true, y_pred))  # log-MAE on the held-out split
log_mae
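# Sanity baseline (an addition, not in the original run): log-MAE of always
# predicting the training mean, to put the model's test log-MAE in context.
baseline = np.full(len(y_true), y_train.mean())
np.log(mean_absolute_error(y_true, baseline))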
pretest = pd.read_csv("test.csv")
pretest.shape
pretest.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 MOFname 17000 non-null object
1 volume [A^3] 17000 non-null float64
2 weight [u] 17000 non-null float64
3 surface_area [m^2/g] 17000 non-null float64
4 void_fraction 17000 non-null float64
5 void_volume [cm^3/g] 17000 non-null float64
6 functional_groups 17000 non-null object
7 metal_linker 17000 non-null int64
8 organic_linker1 17000 non-null int64
9 organic_linker2 17000 non-null int64
10 topology 17000 non-null object
11 CO2/N2_selectivity 17000 non-null float64
12 heat_adsorption_CO2_P0.15bar_T298K [kcal/mol] 17000 non-null float64
dtypes: float64(7), int64(3), object(3)
memory usage: 1.7+ MB
pretest['functional_groups'] = pretest['functional_groups'].replace({np.nan: 0})  # a no-op here: the column is 17000 non-null per the info() above
# train['void_fraction'] = train['void_fraction'].replace({'0': np.nan, 0: np.nan})
# train['void_volume [cm^3/g]'] = train['void_volume [cm^3/g]'].replace({'0': np.nan, 0: np.nan})
# train['functional_groups'] = train['functional_groups'].fillna(train.groupby('functional_groups')['functional_groups'].transform('mean'))
# train['void_fraction'] = train['void_fraction'].fillna(train.groupby('functional_groups')['void_fraction'].transform('mean'))
# train['void_volume [cm^3/g]'] = train['void_volume [cm^3/g]'].fillna(train.groupby('functional_groups')['void_volume [cm^3/g]'].transform('mean'))
pretest.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 MOFname 17000 non-null object
1 volume [A^3] 17000 non-null float64
2 weight [u] 17000 non-null float64
3 surface_area [m^2/g] 17000 non-null float64
4 void_fraction 17000 non-null float64
5 void_volume [cm^3/g] 17000 non-null float64
6 functional_groups 17000 non-null object
7 metal_linker 17000 non-null int64
8 organic_linker1 17000 non-null int64
9 organic_linker2 17000 non-null int64
10 topology 17000 non-null object
11 CO2/N2_selectivity 17000 non-null float64
12 heat_adsorption_CO2_P0.15bar_T298K [kcal/mol] 17000 non-null float64
dtypes: float64(7), int64(3), object(3)
memory usage: 1.7+ MB
col = ["functional_groups", "topology"]
for i in col:
    pretest[i] = pretest[i].astype("category").cat.codes  # encoded independently of train; see the category-consistency sketch above
pretest
pretest = pretest.drop(['MOFname'], axis=1)
# reuse the scaler fitted on x_train; the original fit_transform followed by
# inverse_transform was a round-trip back to the raw values, which fed the
# model unscaled inputs while x_test had been standardized
pretest = pd.DataFrame(scaler.transform(pretest), columns=pretest.columns)
pretest
pretest_pred = model.predict(pretest)
pretest_pred
submission = pd.DataFrame({
    "id": [str(i) for i in range(68614, 85614)],
    "CO2_working_capacity [mL/g]": pretest_pred.T[0]
})
submission.to_csv("submission.csv", index=False)
!ls
init.ipynb modelmaker.ipynb pre.ipynb submission.csv train.csv
kook-2.ipynb notebook-2.ipynb pretest.csv submission_NN.zip
kook.ipynb notebook.ipynb stang.ipynb test.csv
%%capture
!sudo apt-get update
!sudo apt-get install zip
!zip submission_NN.zip submission.csv
updating: submission.csv (deflated 55%)