# import standard scientific libraries
import os
import math
import numpy as np
import pandas as pd
# import ML models from scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn import svm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental import preprocessing
import matplotlib.pyplot as plt
def plot_loss(history):
    plt.plot(history.history['loss'], label='loss')
    plt.xlabel('epoch')
    plt.ylabel('MAE loss')
    plt.legend()
    plt.grid(True)
RANDOM_SEED = 4
np.random.seed(RANDOM_SEED)
pd.set_option('display.max_columns', None)
pd.set_option("display.precision", 8)
dataset = "../dataset/"
train = pd.read_csv(dataset + "train.csv")#[:66000]
train.shape
train
train.info()
train = train.replace([np.inf, -np.inf], np.nan)
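# map ±inf to NaN so the notna() and positivity filters below drop them
# together with genuinely missing values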
#train = train[train['heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]'].notna()]
train = train[train['functional_groups'].notna()]
train.info()
# train = pd.get_dummies(train, columns=["functional_groups"])
# train = pd.get_dummies(train, columns=["topology"])
# train
col = ["functional_groups", "topology"]
for i in col:
    train[i] = train[i].astype("category").cat.codes
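# NOTE: .cat.codes numbers categories per-DataFrame, so the same functional
# group or topology string can map to different integer codes in train and
# test. A minimal sketch of a shared encoding (illustrative names, not wired
# in; it would need to run before the loop above, while the raw strings exist):
# fg_cats = train["functional_groups"].astype("category").cat.categories
# pretest["functional_groups"] = pd.Categorical(
#     pretest["functional_groups"], categories=fg_cats).codes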
train = train.drop(['MOFname'],axis=1)
train = train[train['void_fraction'] > 0]
train = train[train['void_volume [cm^3/g]'] > 0]
# train = train[train['CO2/N2_selectivity'] > 0]
train = train[train['heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]'].notna()]
train = train[train['surface_area [m^2/g]'] > 0]
train
train.info()
# find rows having NaN
train.isnull().any(axis=0)
#train.fillna(method='pad', inplace=True)
#train.groupby('functional_groups')['functional_groups'].count()
# count rows whose encoded functional_groups value is 240
count = 0
for i in train['functional_groups']:
    if i == 240:
        count = count + 1
print(count)
# find row having inf
np.isinf(train).any(axis=0)
train
x = train.drop(['CO2_working_capacity [mL/g]'],axis=1)
y = train['CO2_working_capacity [mL/g]']
x_train, x_test, y_train, y_true = train_test_split(x, y, test_size=0.2,random_state=RANDOM_SEED)
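# hold out 20% of the rows; y_true keeps the held-out targets for the
# log-MAE evaluation further down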
import tensorflow as tf
scaler = StandardScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)
# scale the test split with the training statistics; do not refit the scaler on it
x_test = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns)
normalizer = preprocessing.Normalization(axis=-1)
normalizer.adapt(np.array(x_train))
print(normalizer.mean.numpy())
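# the Normalization layer stores per-feature mean/variance from x_train and
# standardizes inputs inside the model; since x_train already went through
# StandardScaler, this second pass is nearly an identity, but it keeps the
# preprocessing baked into the saved model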
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import datasets, layers, models
from tensorflow.keras import regularizers
from tensorflow.keras.layers import BatchNormalization
initializer = tf.keras.initializers.VarianceScaling(
    scale=0.1, mode='fan_in', distribution='uniform', seed=RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)
model = Sequential([normalizer])
# stack of four Dense(416) blocks with L2 regularization and dropout; the
# input shape is inferred from the adapted Normalization layer
for _ in range(4):
    model.add(Dense(416, kernel_initializer=initializer,
                    kernel_regularizer=regularizers.l2(0.0001), activation='relu'))
    model.add(layers.Dropout(0.2))
model.add(Dense(1))  # output
model.summary()
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    0.001,
    decay_steps=x_train.shape[0] * 1000,
    decay_rate=0.1,
    staircase=False)
opt = tf.keras.optimizers.Adamax(lr_schedule)
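# InverseTimeDecay gives lr = 0.001 / (1 + decay_rate * step / decay_steps);
# with decay_steps = len(x_train) * 1000 optimizer steps, the learning rate
# decays only slightly over the 98 epochs trained below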
# stop_early = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)
model.compile(loss='mean_absolute_error',optimizer=opt)
history = model.fit(x_train, y_train, epochs=98, batch_size=128)
plot_loss(history)
loss_per_epoch = history.history['loss']
best_epoch = loss_per_epoch.index(min(loss_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))
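# the "best epoch" above is read from the training loss, which usually falls
# monotonically; a minimal sketch using a validation signal instead, building
# on the EarlyStopping callback commented out above (hyperparameters assumed):
# stop_early = tf.keras.callbacks.EarlyStopping(
#     monitor='val_loss', patience=5, restore_best_weights=True)
# history = model.fit(x_train, y_train, epochs=98, batch_size=128,
#                     validation_split=0.2, callbacks=[stop_early])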
y_pred = model.predict(x_train)
log_mae = np.log(mean_absolute_error(y_train, y_pred))
log_mae
y_pred = model.predict(x_test)
log_mae = np.log(mean_absolute_error(y_true, y_pred))
log_mae
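# the gap between the training log-MAE above and this held-out log-MAE is a
# quick overfitting check: a large gap argues for stronger regularization or
# fewer epochs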
pretest = pd.read_csv(dataset + "test.csv")
pretest.shape
pretest.info()
pretest['functional_groups'] = pretest['functional_groups'].replace({np.nan:0})
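# unlike the training set, test rows cannot be dropped (every MOF needs a
# prediction), so missing functional_groups are imputed with 0 instead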
# train['void_fraction'] = train['void_fraction'].replace({'0':np.nan, 0:np.nan})
# train['void_volume [cm^3/g]'] = train['void_volume [cm^3/g]'].replace({'0':np. nan, 0:np.nan})
#train['functional_groups'] = train['functional_groups'].fillna(train.groupby('functional_groups')['functional_groups'].transform('mean'))
# train['void_fraction'] = train['void_fraction'].fillna(train.groupby('functional_groups')['void_fraction'].transform('mean'))
# train['void_volume [cm^3/g]'] = train['void_volume [cm^3/g]'].fillna(train.groupby('functional_groups')['void_volume [cm^3/g]'].transform('mean'))
pretest.info()
col = ["functional_groups", "topology"]
for i in col:
    pretest[i] = pretest[i].astype("category").cat.codes
pretest
pretest = pretest.drop(['MOFname'],axis=1)
# scale the test features with the scaler fitted on x_train; the original
# fit_transform followed by inverse_transform simply undid the scaling,
# feeding unscaled features to a model trained on scaled ones
pretest = pd.DataFrame(scaler.transform(pretest), columns=pretest.columns)
pretest
pretest_pred = model.predict(pretest)
pretest_pred
submission = pd.DataFrame({
"id": [str(i) for i in range(68614,85614)],
"CO2_working_capacity [mL/g]": pretest_pred.T[0]
})
submission.to_csv("submission.csv", index=False)
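# optional sanity check (not in the original run): the id range covers
# 85614 - 68614 = 17000 rows, one per test MOF
# assert len(submission) == 17000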
!ls
%%capture
!sudo apt-get update
!sudo apt-get install zip
!zip submission_NN.zip submission.csv