# This notebook contains the steps required to prepare the tabular data for the baseline model and the tensor for the multivariate recurrent neural network.

import pandas as pd
import numpy as np
import pickle
# Read the cleaned dataset from its Stata (.dta) file.
data = pd.read_stata('data/data_clear.dta')

## Extracting columns
# The selection list is assembled from four thematic groups of column names.
cols = (
    # participant identifier and the key that keeps track of the trial order
    ['subject', 'item']
    # trial attributes
    + ['ownerp', 'person_pp', 'group_pp', 'group_total', 'group_size', 'resptime', 'dec_ut']
    # gaze and fixation attributes
    + ['n', 'gazetime', 'fix_n', 'dur', 'fix_start', 'fix_end']
    # Area of Interest (AOI) attributes
    + ['textAOI', 'AOI_ownership', 'AOI_perperson', 'AOI_outside', 'AOI_total',
       'AOI_deontological', 'AOI_utilitarian']
)

# Keep only the selected columns, order the rows by subject, item and gaze
# time, then drop every row that has a missing value in any kept column.
# Original note: the dropna step is what cleans out the filler trials (where
# the group was the owner) -- presumably those rows contain NaN; verify.
trials = (
    data[cols]
    .sort_values(by=['subject', 'item', 'gazetime'])
    .dropna(how='any')
)

# Reduce 'textAOI' to a 0/1 flag (1 for any positive value, otherwise 0); per
# the original note, values above 1 do not carry extra information.
trials['textAOI'] = trials['textAOI'].apply(lambda value: int(value > 0))
# Shift 'item' down by one (original note: this yields 1-20). From here on it
# tracks the sequence of fixations; it is renamed to 'trial' further below.
trials['item'] = trials['item'] - 1
# Shift 'subject' down by 100 (original note: 101-206 becomes 1-106).
trials['subject'] = trials['subject'] - 100

## Renaming columns of interest
# Map the raw column names to more descriptive ones and bind the renamed
# frame back to `trials`.
trials = trials.rename(
    columns={
        'subject': 'participant',
        'item': 'trial',
        'textAOI': 'AOI_text',
        'AOI_ownership': 'AOI_allocated',
        'AOI_perperson': 'AOI_price_perperson',
        'AOI_total': 'AOI_price_total',
        'group_total': 'group_total',  # identity entry: the name stays the same
        'fix_n': 'fixation_n',
        'dur': 'duration',
        'fix_start': 'fixation_start',
        'fix_end': 'fixation_end',
    }
)

def filter_cutoff(x, pr=False, threshold=None):
    """Decide whether one (participant, trial) group is kept.

    Meant to be used as the predicate of ``groups.filter(...)``: a group is
    rejected as soon as its largest 'duration' value exceeds the cutoff, so
    the whole group is dropped rather than individual rows.

    Args:
        x: The rows of a single group; must support ``x['duration']``.
        pr: When truthy, print ``x.name`` for every rejected group (when
            called through ``groupby(...).filter`` this is expected to be
            the (participant, trial) key). A printed group can be inspected
            afterwards with e.g. ``groups.get_group((35, 2)).describe()``.
        threshold: Upper bound for 'duration'. When omitted, the
            module-level ``cutoff`` variable is used, which must have been
            assigned before the call.

    Returns:
        bool: ``False`` if any duration in the group is above the cutoff
        (the group is removed), ``True`` otherwise (the group is kept).
    """
    # Fall back to the global only when no explicit bound is given, so the
    # function can also be used without relying on module-level state.
    limit = cutoff if threshold is None else threshold
    exceeds = np.max(x['duration']) > limit
    if exceeds and pr:
        print(x.name)
    # `not` converts a NumPy boolean into a plain Python bool.
    return not exceeds

# Duration cutoff: the 0.999 quantile (99.9th percentile) of the 'duration' column.
# NOTE(review): this was previously described as the ".99 percentile" while the
# code uses 0.999 -- confirm which threshold is intended.
cutoff = trials['duration'].quantile(q=0.999)

# A (participant, trial) group that contains any duration above the cutoff is
# removed as a whole: the full trial is treated as an outlier / invalid
# observation time series. pr=True is forwarded to filter_cutoff so that every
# removed group is printed.
groups = trials.groupby(by=['participant', 'trial'])
trials_cutoff = groups.filter(filter_cutoff, pr=True)

# Report how many rows were removed by the filter.
print('amount of rows cutoff:', trials.shape[0] - trials_cutoff.shape[0])

# Persist an independent copy of the filtered trials as a pickle file.
clean_trials = trials_cutoff.copy()
clean_trials.to_pickle('data/trials.pkl')