Phocrastination: A Game-Based Approach to Assessing and Classifying Academic Procrastination in College Students
Load Libraries
# base
import pandas as pd
import numpy as np
import json
import warnings
# preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from scipy.stats import norm
from sklearn.preprocessing import MinMaxScaler
# modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif
# validation
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import r2_score
from sklearn.metrics import roc_curve, RocCurveDisplay, auc
# visualization
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.patches as patches
import plotly.graph_objects as go
Load Data
# Load p_features (per-participant game-play features) and q_scores
# (questionnaire scores) — both JSON objects keyed by participant ID.
with open('/work/p_features.json', 'r') as j:
    p_features = json.load(j)
with open('/work/q_scores.json', 'r') as j:
    q_scores = json.load(j)
# Sort both dicts by participant key so that feature rows and score rows
# line up positionally when iterated downstream.
p_features = dict(sorted(p_features.items()))
q_scores = dict(sorted(q_scores.items()))
Time Management Model
Subset Data
# Time-management design matrix: one row per participant, keeping only the
# game features relevant to time management.
tm_features = ['bowl_created_rem', 'bowl_washed_rem', 'order_collected_rem', 'order_given_rem', 'pre_post']
# NOTE(review): column order follows each participant's dict iteration order,
# not the order of tm_features — confirm the JSON key order is consistent.
tm_X = np.array([
    [val for var, val in data.items() if var in tm_features]
    for data in p_features.values()
])
print(tm_X.shape)
tm_X
# Target vector: the 'time management' questionnaire score per participant.
tm_y = np.array([
    val
    for data in q_scores.values()
    for var, val in data.items()
    if var == 'time management'
])
print(tm_y.shape)
tm_y
Descriptive Statistics for y
# Descriptive statistics for the time-management scores.
mean_value, median_value = np.mean(tm_y), np.median(tm_y)
std_deviation = tm_y.std()
min_value, max_value = tm_y.min(), tm_y.max()
q3 = np.percentile(tm_y, 75)
summary = [
    f"Mean: {mean_value:.2f}",
    f"Median: {median_value}",
    f"Standard Deviation: {std_deviation:.2f}",
    f"Minimum Value: {min_value}",
    f"Maximum Value: {max_value}",
    f"Third Quartile: {q3}",
]
print("\n".join(summary))
# Wrap the scores in a DataFrame for plotting
scores1 = {'Scores': tm_y}
scores1 = pd.DataFrame(scores1)
# Histogram of the score distribution via plotly.figure_factory
fig = ff.create_distplot([scores1['Scores']], group_labels=['distplot'], colors=['#d01c8b'], show_rug=False, show_curve=False)
# Add title
fig.update_layout(
    title_text='Distribution of Time Management Scores',
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Update x-axis range
fig.update_xaxes(range=[18, 90])
# Vertical line at the cutoff used later to binarize the scores (>= 60 -> class 1)
threshold = 60
fig.add_shape(
    go.layout.Shape(
        type='line',
        x0=threshold,
        x1=threshold,
        y0=0,
        y1=0.08,
        line=dict(color='red', width=2)
    )
)
# Display the plot
fig.show()
Correlation Matrix
# Pearson correlations between the time-management features (columns of tm_X).
tm_correlation_matrix = np.corrcoef(tm_X, rowvar=False)
fig = go.Figure(data=go.Heatmap(
    z=tm_correlation_matrix,
    x=tm_features,
    y=tm_features,
    colorscale="PiYG",
    zmin=-1,  # pin the color scale to the full correlation range
    zmax=1
))
# Customize the layout
fig.update_layout(
    title="Time Management Correlation Matrix of Features",
    xaxis=dict(title="Features"),
    yaxis=dict(title="Features"),
    height=500,
    paper_bgcolor='rgba(0,0,0,0)',
    width=500
)
# Show the plot
fig.show()
Feature Selection
# Univariate feature screening via ANOVA F-test on a train split.
# NOTE(review): tm_y is still the raw continuous score here, so f_classif treats
# every distinct score as its own class — confirm this pre-binarization screening
# is intended.
tm_X_train, tm_X_test, tm_y_train, tm_y_test = train_test_split(tm_X, tm_y, test_size=0.1, random_state=42)
selector = SelectKBest(f_classif, k=3)
selector.fit(tm_X_train, tm_y_train)
tm_selector_scores = selector.scores_
tm_selector_pvalues = selector.pvalues_
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [tm_features[i] for i in selected_feature_indices]
tm_selector_results = pd.DataFrame({
    'Feature': selected_feature_names,
    'Score': tm_selector_scores[selected_feature_indices],
    'P-value': tm_selector_pvalues[selected_feature_indices]
})
tm_selector_results = tm_selector_results.sort_values(by='Score', ascending=False)
tm_selector_results
SVM
# Binarize the scores at the cutoff: below 60 -> class 0, 60 and above -> class 1.
tm_y = np.where(tm_y < 60, 0, 1)
print(tm_y.shape)
tm_y
# Train/test split on the binarized labels
tm_X_train, tm_X_test, tm_y_train, tm_y_test = train_test_split(tm_X, tm_y, test_size=0.2, random_state=42)
# Standardize the features by removing the mean and scaling to unit variance
scaler = StandardScaler()
tm_X_train = scaler.fit_transform(tm_X_train)
tm_X_test = scaler.transform(tm_X_test)
# Then rescale to [0, 1].
# NOTE(review): MinMaxScaler on top of StandardScaler is redundant for most
# kernels — confirm the double scaling is intentional.
scaler2 = MinMaxScaler()
tm_X_train = scaler2.fit_transform(tm_X_train)
tm_X_test = scaler2.transform(tm_X_test)
# Create an SVM model (sigmoid kernel)
tm_svm = SVC(kernel='sigmoid')
# Train the model
tm_svm.fit(tm_X_train, tm_y_train)
# Make predictions on the test set
tm_y_pred = tm_svm.predict(tm_X_test)
# Evaluate the model
accuracy = accuracy_score(tm_y_test, tm_y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Display classification report
print("Classification Report:\n", classification_report(tm_y_test, tm_y_pred))
Confusion Matrix
# Confusion matrix for the time-management SVM on the held-out test set.
# (Removed a commented-out, abandoned experiment that recolored annotation text.)
conf_matrix1 = confusion_matrix(tm_y_test, tm_y_pred)
# Define class labels
class_labels = [0, 1]
# Create annotated heatmap using plotly.figure_factory
fig = ff.create_annotated_heatmap(conf_matrix1, x=class_labels, y=class_labels, colorscale='PiYG')
# Add title
fig.update_layout(title_text='Time Management Confusion Matrix')
# Add custom x/y axis titles as paper-anchored annotations
fig.add_annotation(dict(font=dict(color="black", size=14), x=0.5, y=-0.15, showarrow=False, text="Predicted label", xref="paper", yref="paper"))
fig.add_annotation(dict(font=dict(color="black", size=14), x=-0.15, y=0.5, showarrow=False, text="Actual label", textangle=-90, xref="paper", yref="paper"))
# Adjust margins to make room for y-axis title
fig.update_layout(
    margin=dict(t=50, l=200),
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Display the plot
fig.show()
Cross Validation
# Re-split the (now binarized) data
tm_X_train, tm_X_test, tm_y_train, tm_y_test = train_test_split(tm_X, tm_y, test_size=0.2, random_state=42)
# Sweep k (number of top features) and estimate accuracy with 10-fold CV
max_k = 5  # Set the maximum k value to test
avg_accuracies = []
for k in range(1, max_k + 1):
    selector = SelectKBest(f_classif, k=k)
    selector.fit(tm_X_train, tm_y_train)
    # Get selected feature indices
    selected_feature_indices = selector.get_support(indices=True)
    # Apply feature selection to training and test sets
    tm_X_train_selected = tm_X_train[:, selected_feature_indices]
    tm_X_test_selected = tm_X_test[:, selected_feature_indices]
    # Standardize the features
    scaler = StandardScaler()
    tm_X_train_selected = scaler.fit_transform(tm_X_train_selected)
    tm_X_test_selected = scaler.transform(tm_X_test_selected)
    # K-fold cross-validation.
    # NOTE(review): folds are drawn from the FULL tm_X, so rows from the held-out
    # test split also appear in CV folds, and feature selection was done on the
    # train split — mild leakage; confirm this is acceptable.
    n_splits = 10
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    avg_acc = 0
    for train_index, test_index in kfold.split(tm_X):
        # Extract training and test sets
        X_train, X_test = tm_X[train_index], tm_X[test_index]
        y_train, y_test = tm_y[train_index], tm_y[test_index]
        # Apply feature selection
        X_train_selected = X_train[:, selected_feature_indices]
        X_test_selected = X_test[:, selected_feature_indices]
        # Standardize per fold (scaler is refit on the fold's training part)
        X_train_selected = scaler.fit_transform(X_train_selected)
        X_test_selected = scaler.transform(X_test_selected)
        # Create and train the model
        svm_model = SVC(kernel='sigmoid')
        svm_model.fit(X_train_selected, y_train)
        # Make predictions on the test set
        y_pred = svm_model.predict(X_test_selected)
        # Measure performance on y_test and y_pred
        acc = accuracy_score(y_test, y_pred)
        avg_acc += acc
    avg_accuracy = avg_acc / n_splits
    avg_accuracies.append(avg_accuracy)
# Find k with best accuracy
max_accuracy_index = max(range(len(avg_accuracies)), key=avg_accuracies.__getitem__)
# Find the second-best accuracy index (only consumed by the commented print below)
second_best_k = None
for i, acc in enumerate(avg_accuracies):
    if i != max_accuracy_index and (second_best_k is None or acc > avg_accuracies[second_best_k]):
        second_best_k = i
# Selector for desired k
if second_best_k is not None:
    best_k_second = second_best_k + 1
selector = SelectKBest(f_classif, k=max_accuracy_index+1)
selector.fit(tm_X_train, tm_y_train)
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [tm_features[i] for i in selected_feature_indices]
# NOTE(review): these selected matrices are UNscaled slices of the raw split; the
# ROC section below trains on them without standardization — confirm intended.
tm_X_train_selected = tm_X_train[:, selected_feature_indices]
tm_X_test_selected = tm_X_test[:, selected_feature_indices]
print(f"Average Accuracy: {round(avg_accuracies[max_accuracy_index], 2)} for k={max_accuracy_index + 1}")
# print(f"Average Accuracy: {round(avg_accuracies[second_best_k], 2)} for k={best_k_second}")
print(f"Selected Features for k={max_accuracy_index+1}: {selected_feature_names}")
# Plot average CV accuracy as a function of k
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(1, max_k + 1)), y=avg_accuracies, mode='lines+markers', line=dict(color='#e8b0d3')))
fig.update_layout(
    title='Time Management Avg Accuracy vs. Top-k Features',
    xaxis=dict(title='Top-k Features'),
    yaxis=dict(title='Average Accuracy'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500
)
fig.show()
ROC Curve
# Create and train the model with the best k
svm_model = SVC(kernel='sigmoid', probability=True)  # Enable probability estimates for ROC curve
svm_model.fit(tm_X_train_selected, tm_y_train)
# Predicted probability of class 1 on the test set
y_probs = svm_model.predict_proba(tm_X_test_selected)[:, 1]
# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(tm_y_test, y_probs)
roc_auc = auc(fpr, tpr)
# Plot ROC curve using Plotly
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC curve (AUC = {roc_auc:.2f})', line=dict(color='#e8b0d3')))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Baseline', line=dict(dash='dash')))
fig.update_layout(
    title='Time Management ROC Curve',
    xaxis=dict(title='False Positive Rate (FPR)', range=[-0.02, 1.02]),
    yaxis=dict(title='True Positive Rate (TPR)'),
    legend=dict(x=0.43, y=0.03, traceorder='normal'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500
)
fig.show()
Perfectionism Model
Subset Data
# Perfectionism design matrix: drawing-behavior plus reminder features per participant.
perf_features = ["ratio_hit", "total_time_drawing", "redo", "color_switch", 'bowl_created_rem', 'bowl_washed_rem', 'order_collected_rem', 'order_given_rem']
# NOTE(review): column order follows each participant's dict iteration order,
# not the order of perf_features.
p_X = np.array([
    [val for var, val in data.items() if var in perf_features]
    for data in p_features.values()
])
p_X
# Target vector: the 'perfectionism' questionnaire score per participant.
p_y = np.array([
    val
    for data in q_scores.values()
    for var, val in data.items()
    if var == 'perfectionism'
])
p_y
Descriptive Statistics for y
# Descriptive statistics for the perfectionism scores.
mean_value, median_value = np.mean(p_y), np.median(p_y)
std_deviation = p_y.std()
min_value, max_value = p_y.min(), p_y.max()
q3 = np.percentile(p_y, 75)
summary = [
    f"Mean: {mean_value:.2f}",
    f"Median: {median_value}",
    f"Standard Deviation: {std_deviation:.2f}",
    f"Minimum Value: {min_value}",
    f"Maximum Value: {max_value}",
    f"Third Quartile: {q3}",
]
print("\n".join(summary))
# Wrap the scores in a DataFrame for plotting
scores2 = {'Scores': p_y}
scores2 = pd.DataFrame(scores2)
# Histogram of the score distribution via plotly.figure_factory
fig = ff.create_distplot([scores2['Scores']], group_labels=['distplot'], colors=['#d01c8b'], show_rug=False, show_curve=False)
# Add title
fig.update_layout(
    title_text='Distribution of Perfectionism Scores',
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Update x-axis range
fig.update_xaxes(range=[10, 70])
# Vertical line at the cutoff used later to binarize the scores (>= 52 -> class 1)
threshold = 52
fig.add_shape(
    go.layout.Shape(
        type='line',
        x0=threshold,
        x1=threshold,
        y0=0,
        y1=0.08,
        line=dict(color='red', width=2)
    )
)
# Display the plot
fig.show()
Correlation Matrix
# Pearson correlations between the perfectionism features (columns of p_X).
p_correlation_matrix = np.corrcoef(p_X, rowvar=False)
fig = go.Figure(data=go.Heatmap(
    z=p_correlation_matrix,
    x=perf_features,
    y=perf_features,
    colorscale="PiYG",
    zmin=-1,  # pin the color scale to the full correlation range
    zmax=1
))
# Customize the layout
fig.update_layout(
    title="Perfectionism Correlation Matrix of Features",
    xaxis=dict(title="Features"),
    yaxis=dict(title="Features"),
    height=500,
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Show the plot
fig.show()
Feature Selection
# Univariate feature screening via ANOVA F-test on a train split.
# NOTE(review): p_y is still the raw continuous score here (see note in the
# time-management section) — confirm pre-binarization screening is intended.
p_X_train, p_X_test, p_y_train, p_y_test = train_test_split(p_X, p_y, test_size=0.1, random_state=42)
selector = SelectKBest(f_classif, k=1)
selector.fit(p_X_train, p_y_train)
p_selector_scores = selector.scores_
p_selector_pvalues = selector.pvalues_
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [perf_features[i] for i in selected_feature_indices]
p_selector_results = pd.DataFrame({
    'Feature': selected_feature_names,
    'Score': p_selector_scores[selected_feature_indices],
    'P-value': p_selector_pvalues[selected_feature_indices]
})
p_selector_results = p_selector_results.sort_values(by='Score', ascending=False)
p_selector_results
SVM
# Binarize the scores at the cutoff: below 52 -> class 0, 52 and above -> class 1.
p_y = np.where(p_y < 52, 0, 1)
p_y
# Train/test split on the binarized labels.
# NOTE(review): test_size is 0.1 here but 0.2 in the other model sections — confirm.
p_X_train, p_X_test, p_y_train, p_y_test = train_test_split(p_X, p_y, test_size=0.1, random_state=42)
# Standardize the features by removing the mean and scaling to unit variance
scaler = StandardScaler()
p_X_train = scaler.fit_transform(p_X_train)
p_X_test = scaler.transform(p_X_test)
# Create an SVM model (RBF kernel)
p_svm = SVC(kernel='rbf')
# Train the model
p_svm.fit(p_X_train, p_y_train)
# Make predictions on the test set
p_y_pred = p_svm.predict(p_X_test)
# Evaluate the model
accuracy = accuracy_score(p_y_test, p_y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Display classification report
print("Classification Report:\n", classification_report(p_y_test, p_y_pred))
Confusion Matrix
# Confusion matrix for the perfectionism SVM on the held-out test set
conf_matrix2 = confusion_matrix(p_y_test, p_y_pred)
# Define class labels
class_labels = [0, 1]
# Create annotated heatmap using plotly.figure_factory
fig = ff.create_annotated_heatmap(conf_matrix2, x=class_labels, y=class_labels, colorscale='PiYG')
# Add title
fig.update_layout(title_text='Perfectionism Confusion Matrix')
# Add custom x/y axis titles as paper-anchored annotations
fig.add_annotation(dict(font=dict(color="black", size=14), x=0.5, y=-0.15, showarrow=False, text="Predicted label", xref="paper", yref="paper"))
fig.add_annotation(dict(font=dict(color="black", size=14),x=-0.15,y=0.5,showarrow=False,text="Actual label",textangle=-90,xref="paper",yref="paper"))
# Adjust margins to make room for y-axis title
fig.update_layout(
    margin=dict(t=50, l=200),
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Display the plot
fig.show()
Cross Validation
# Re-split the (now binarized) data
p_X_train, p_X_test, p_y_train, p_y_test = train_test_split(p_X, p_y, test_size=0.2, random_state=42)
# Sweep k (number of top features) and estimate accuracy with 10-fold CV
max_k = 8  # Set the maximum k value to test
avg_accuracies = []
for k in range(1, max_k + 1):
    selector = SelectKBest(f_classif, k=k)
    selector.fit(p_X_train, p_y_train)
    # Get selected feature indices
    selected_feature_indices = selector.get_support(indices=True)
    # Apply feature selection to training and test sets
    p_X_train_selected = p_X_train[:, selected_feature_indices]
    p_X_test_selected = p_X_test[:, selected_feature_indices]
    # Standardize the features
    scaler = StandardScaler()
    p_X_train_selected = scaler.fit_transform(p_X_train_selected)
    p_X_test_selected = scaler.transform(p_X_test_selected)
    # K-fold cross-validation.
    # NOTE(review): folds are drawn from the FULL p_X (held-out rows included) and
    # features were selected on the train split — mild leakage; confirm acceptable.
    n_splits = 10
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    avg_acc = 0
    for train_index, test_index in kfold.split(p_X):
        # Extract training and test sets
        X_train, X_test = p_X[train_index], p_X[test_index]
        y_train, y_test = p_y[train_index], p_y[test_index]
        # Apply feature selection
        X_train_selected = X_train[:, selected_feature_indices]
        X_test_selected = X_test[:, selected_feature_indices]
        # Standardize per fold (scaler refit on the fold's training part)
        X_train_selected = scaler.fit_transform(X_train_selected)
        X_test_selected = scaler.transform(X_test_selected)
        # Create and train the model
        svm_model = SVC(kernel='rbf')
        svm_model.fit(X_train_selected, y_train)
        # Make predictions on the test set
        y_pred = svm_model.predict(X_test_selected)
        # Measure performance on y_test and y_pred
        acc = accuracy_score(y_test, y_pred)
        avg_acc += acc
    avg_accuracy = avg_acc / n_splits
    avg_accuracies.append(avg_accuracy)
# Find k with best accuracy
max_accuracy_index = max(range(len(avg_accuracies)), key=avg_accuracies.__getitem__)
# Find the second-best accuracy index (currently unused beyond best_k_second)
second_best_k = None
for i, acc in enumerate(avg_accuracies):
    if i != max_accuracy_index and (second_best_k is None or acc > avg_accuracies[second_best_k]):
        second_best_k = i
# Selector for desired k
if second_best_k is not None:
    best_k_second = second_best_k + 1
selector = SelectKBest(f_classif, k=max_accuracy_index + 1) #max_accuracy_index + 1
selector.fit(p_X_train, p_y_train)
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [perf_features[i] for i in selected_feature_indices]
# NOTE(review): unscaled slices — the ROC section trains on them without scaling.
p_X_train_selected = p_X_train[:, selected_feature_indices]
p_X_test_selected = p_X_test[:, selected_feature_indices]
print(f"Average Accuracy: {round(avg_accuracies[max_accuracy_index], 2)} for k={max_accuracy_index + 1}")
print(f"Selected Features for k={max_accuracy_index + 1}: {selected_feature_names}")
# Plot average CV accuracy as a function of k
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(1, max_k + 1)), y=avg_accuracies, mode='lines+markers', line=dict(color='#e8b0d3')))
fig.update_layout(
    title='Perfectionism Avg Accuracy vs. Top-k Features',
    xaxis=dict(title='Top-k Features'),
    yaxis=dict(title='Average Accuracy'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500
)
fig.show()
ROC Curve
# Create and train the model with the best k
svm_model = SVC(kernel='rbf', probability=True)  # Enable probability estimates for ROC curve
svm_model.fit(p_X_train_selected, p_y_train)
# Predicted probability of class 1 on the test set
y_probs = svm_model.predict_proba(p_X_test_selected)[:, 1]
# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(p_y_test, y_probs)
roc_auc = auc(fpr, tpr)
# Plot ROC curve using Plotly
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC curve (AUC = {roc_auc:.2f})', line=dict(color='#e8b0d3')))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Baseline', line=dict(dash='dash')))
fig.update_layout(
    title='Perfectionism ROC Curve',
    xaxis=dict(title='False Positive Rate (FPR)', range=[-0.02, 1.02]),
    yaxis=dict(title='True Positive Rate (TPR)'),
    legend=dict(x=0.43, y=0.03, traceorder='normal'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500
)
fig.show()
Distractibility Model
Subset Data
# Distractibility design matrix: the three distraction-ratio features per participant.
d_features = ["distracted_ratio", "time_distracted_ratio", "cue_distracted_ratio"]
d_X = np.array([
    [val for var, val in data.items() if var in d_features]
    for data in p_features.values()
])
d_X
# Target vector: the 'distractibility' questionnaire score per participant.
d_y = np.array([
    val
    for data in q_scores.values()
    for var, val in data.items()
    if var == 'distractibility'
])
d_y
Descriptive Statistics for y
# Descriptive statistics for the distractibility scores.
mean_value, median_value = np.mean(d_y), np.median(d_y)
std_deviation = d_y.std()
min_value, max_value = d_y.min(), d_y.max()
q3 = np.percentile(d_y, 75)
summary = [
    f"Mean: {mean_value:.2f}",
    f"Median: {median_value}",
    f"Standard Deviation: {std_deviation:.2f}",
    f"Minimum Value: {min_value}",
    f"Maximum Value: {max_value}",
    f"Third Quartile: {q3}",
]
print("\n".join(summary))
# Wrap the scores in a DataFrame for plotting
scores3 = {'Scores': d_y}
scores3 = pd.DataFrame(scores3)
# Histogram of the score distribution via plotly.figure_factory
fig = ff.create_distplot([scores3['Scores']], group_labels=['distplot'], colors=['#d01c8b'], show_rug=False, show_curve=False)
# Add title ("Distractability" spelling kept as-is; it is a runtime string)
fig.update_layout(
    title_text='Distribution of Distractability Scores',
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Update x-axis range
fig.update_xaxes(range=[6, 31])
# Vertical line at the cutoff used later to binarize the scores (>= 21 -> class 1)
threshold = 21
fig.add_shape(
    go.layout.Shape(
        type='line',
        x0=threshold,
        x1=threshold,
        y0=0,
        y1=0.18,
        line=dict(color='red', width=2)
    )
)
# Display the plot
fig.show()
Correlation Matrix
# Pearson correlations between the distractibility features (columns of d_X).
d_correlation_matrix = np.corrcoef(d_X, rowvar=False)
fig = go.Figure(data=go.Heatmap(
    z=d_correlation_matrix,
    x=d_features,
    y=d_features,
    colorscale="PiYG",
    zmin=-1,  # pin the color scale to the full correlation range
    zmax=1
))
# Customize the layout
fig.update_layout(
    title="Distractability Correlation Matrix of Features",
    xaxis=dict(title="Features"),
    yaxis=dict(title="Features"),
    paper_bgcolor='rgba(0,0,0,0)',
    width=600,
    height=600,
)
# Show the plot
fig.show()
Feature Selection
# Univariate feature screening via ANOVA F-test on a train split.
# NOTE(review): k=3 equals the total feature count, so no feature is dropped here —
# this run only produces the score/p-value table.
d_X_train, d_X_test, d_y_train, d_y_test = train_test_split(d_X, d_y, test_size=0.1, random_state=42)
selector = SelectKBest(f_classif, k=3)
selector.fit(d_X_train, d_y_train)
d_selector_scores = selector.scores_
d_selector_pvalues = selector.pvalues_
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [d_features[i] for i in selected_feature_indices]
d_selector_results = pd.DataFrame({
    'Feature': selected_feature_names,
    'Score': d_selector_scores[selected_feature_indices],
    'P-value': d_selector_pvalues[selected_feature_indices]
})
d_selector_results = d_selector_results.sort_values(by='Score', ascending=False)
d_selector_results
SVM
# Binarize the scores at the cutoff: below 21 -> class 0, 21 and above -> class 1.
d_y = np.where(d_y < 21, 0, 1)
d_y
# Train/test split on the binarized labels
d_X_train, d_X_test, d_y_train, d_y_test = train_test_split(d_X, d_y, test_size=0.2, random_state=42)
# Standardize the features by removing the mean and scaling to unit variance
scaler = StandardScaler()
d_X_train = scaler.fit_transform(d_X_train)
d_X_test = scaler.transform(d_X_test)
# Create an SVM model (polynomial kernel)
d_svm = SVC(kernel='poly')
# Train the model
d_svm.fit(d_X_train, d_y_train)
# Make predictions on the test set
d_y_pred = d_svm.predict(d_X_test)
# Evaluate the model
accuracy = accuracy_score(d_y_test, d_y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Display classification report
print("Classification Report:\n", classification_report(d_y_test, d_y_pred))
Confusion Matrix
# Confusion matrix for the distractibility SVM on the held-out test set
conf_matrix3 = confusion_matrix(d_y_test, d_y_pred)
# Define class labels
class_labels = [0, 1]
# Create annotated heatmap using plotly.figure_factory
fig = ff.create_annotated_heatmap(conf_matrix3, x=class_labels, y=class_labels, colorscale='PiYG')
# Add title
fig.update_layout(title_text='Distractability Confusion Matrix')
# Add custom x/y axis titles as paper-anchored annotations
fig.add_annotation(dict(font=dict(color="black", size=14), x=0.5, y=-0.15, showarrow=False, text="Predicted label", xref="paper", yref="paper"))
fig.add_annotation(dict(font=dict(color="black", size=14),x=-0.15,y=0.5,showarrow=False,text="Actual label",textangle=-90,xref="paper",yref="paper"))
# Adjust margins to make room for y-axis title
fig.update_layout(
    margin=dict(t=50, l=200),
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Display the plot
fig.show()
Cross Validation
# Re-split the (now binarized) data
d_X_train, d_X_test, d_y_train, d_y_test = train_test_split(d_X, d_y, test_size=0.2, random_state=42)
# Sweep k (number of top features) and estimate accuracy with 10-fold CV
max_k = 3  # Set the maximum k value to test
avg_accuracies = []
for k in range(1, max_k + 1):
    selector = SelectKBest(f_classif, k=k)
    selector.fit(d_X_train, d_y_train)
    # Get selected feature indices
    selected_feature_indices = selector.get_support(indices=True)
    # Apply feature selection to training and test sets
    d_X_train_selected = d_X_train[:, selected_feature_indices]
    d_X_test_selected = d_X_test[:, selected_feature_indices]
    # Standardize the features
    scaler = StandardScaler()
    d_X_train_selected = scaler.fit_transform(d_X_train_selected)
    d_X_test_selected = scaler.transform(d_X_test_selected)
    # K-fold cross-validation.
    # NOTE(review): folds are drawn from the FULL d_X (held-out rows included) and
    # features were selected on the train split — mild leakage; confirm acceptable.
    n_splits = 10
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    avg_acc = 0
    for train_index, test_index in kfold.split(d_X):
        # Extract training and test sets
        X_train, X_test = d_X[train_index], d_X[test_index]
        y_train, y_test = d_y[train_index], d_y[test_index]
        # Apply feature selection
        X_train_selected = X_train[:, selected_feature_indices]
        X_test_selected = X_test[:, selected_feature_indices]
        # Standardize per fold (scaler refit on the fold's training part)
        X_train_selected = scaler.fit_transform(X_train_selected)
        X_test_selected = scaler.transform(X_test_selected)
        # Create and train the model
        svm_model = SVC(kernel='poly')
        svm_model.fit(X_train_selected, y_train)
        # Make predictions on the test set
        y_pred = svm_model.predict(X_test_selected)
        # Measure performance on y_test and y_pred
        acc = accuracy_score(y_test, y_pred)
        avg_acc += acc
    avg_accuracy = avg_acc / n_splits
    avg_accuracies.append(avg_accuracy)
# Find k with best accuracy
max_accuracy_index = max(range(len(avg_accuracies)), key=avg_accuracies.__getitem__)
# Find the second-best accuracy index (currently unused beyond best_k_second)
second_best_k = None
for i, acc in enumerate(avg_accuracies):
    if i != max_accuracy_index and (second_best_k is None or acc > avg_accuracies[second_best_k]):
        second_best_k = i
# Selector for desired k
if second_best_k is not None:
    best_k_second = second_best_k + 1
selector = SelectKBest(f_classif, k=max_accuracy_index + 1) # hardcode
selector.fit(d_X_train, d_y_train)
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [d_features[i] for i in selected_feature_indices]
# NOTE(review): unscaled slices — the ROC section trains on them without scaling.
d_X_train_selected = d_X_train[:, selected_feature_indices]
d_X_test_selected = d_X_test[:, selected_feature_indices]
print(f"Average Accuracy: {round(avg_accuracies[max_accuracy_index], 2)} for k={max_accuracy_index + 1}")
print(f"Selected Features for k={max_accuracy_index + 1}: {selected_feature_names}")
# Plot average CV accuracy as a function of k
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(1, max_k + 1)), y=avg_accuracies, mode='lines+markers', line=dict(color='#e8b0d3')))
fig.update_layout(
    title='Distractability Avg Accuracy vs. Top-k Features',
    xaxis=dict(title='Top-k Features'),
    yaxis=dict(title='Average Accuracy'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500
)
fig.show()
ROC Curve
# Create and train the model with the best k
svm_model = SVC(kernel='poly', probability=True)  # Enable probability estimates for ROC curve
svm_model.fit(d_X_train_selected, d_y_train)
# Predicted probability of class 1 on the test set
y_probs = svm_model.predict_proba(d_X_test_selected)[:, 1]
# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(d_y_test, y_probs)
roc_auc = auc(fpr, tpr)
# Plot ROC curve using Plotly
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC curve (AUC = {roc_auc:.2f})', line=dict(color='#e8b0d3')))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Baseline', line=dict(dash='dash')))
fig.update_layout(
    title='Distractability ROC Curve',
    xaxis=dict(title='False Positive Rate (FPR)', range=[-0.02, 1.02]),
    yaxis=dict(title='True Positive Rate (TPR)'),
    legend=dict(x=0.43, y=0.03, traceorder='normal'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500
)
fig.show()
Emotional Regulation Model
Subset Data
# Emotional-regulation design matrix: the "E"-prefixed variants of all game features.
er_features = ["Ebowl_created_rem", "Ebowl_washed_rem", "Eorder_collected_rem", "Eorder_given_rem", "Epre_post",
               "Eratio_hit", "Etotal_time_drawing", "Eredo", "Ecolor_switch", "Edistracted_ratio", "Etime_distracted_ratio", "Ecue_distracted_ratio"]
er_X = np.array([
    [val for var, val in data.items() if var in er_features]
    for data in p_features.values()
])
er_X
# Target vector: the 'emotional regulation' questionnaire score per participant.
er_y = np.array([
    val
    for data in q_scores.values()
    for var, val in data.items()
    if var == 'emotional regulation'
])
er_y
Descriptive Statistics for y
# Descriptive statistics for the emotional-regulation scores.
mean_value, median_value = np.mean(er_y), np.median(er_y)
std_deviation = er_y.std()
min_value, max_value = er_y.min(), er_y.max()
q3 = np.percentile(er_y, 75)
summary = [
    f"Mean: {mean_value:.2f}",
    f"Median: {median_value}",
    f"Standard Deviation: {std_deviation:.2f}",
    f"Minimum Value: {min_value}",
    f"Maximum Value: {max_value}",
    f"Third Quartile: {q3}",
]
print("\n".join(summary))
# Wrap the emotional-regulation scores in a DataFrame for plotting.
scores4 = {'Scores': er_y}
# BUG FIX: this previously read pd.DataFrame(scores2), plotting the
# perfectionism scores under the emotional-regulation title.
scores4 = pd.DataFrame(scores4)
# Histogram of the score distribution via plotly.figure_factory
fig = ff.create_distplot([scores4['Scores']], group_labels=['distplot'], colors=['#d01c8b'], show_rug=False, show_curve=False)
# Add title
fig.update_layout(
    title_text='Distribution of Emotional Regulation Scores',
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Update x-axis range
fig.update_xaxes(range=[10, 55])
# Threshold line.
# NOTE(review): drawn at 34, but the SVM section binarizes at < 33 —
# confirm which cutoff is intended.
threshold = 34
fig.add_shape(
    go.layout.Shape(
        type='line',
        x0=threshold,
        x1=threshold,
        y0=0,
        y1=0.08,
        line=dict(color='red', width=2)
    )
)
# Display the plot
fig.show()
Correlation Matrix
# Pearson correlations between the emotional-regulation features (columns of er_X).
er_correlation_matrix = np.corrcoef(er_X, rowvar=False)
fig = go.Figure(data=go.Heatmap(
    z=er_correlation_matrix,
    x=er_features,
    y=er_features,
    colorscale="PiYG",
    zmin=-1,  # pin the color scale to the full correlation range
    zmax=1
))
# Customize the layout
fig.update_layout(
    title="Emotional Regulation Correlation Matrix of Features",
    xaxis=dict(title="Features"),
    yaxis=dict(title="Features"),
    paper_bgcolor='rgba(0,0,0,0)',
    width=600,
    height=600,
)
# Show the plot
fig.show()
Feature Selection
# Univariate feature screening via ANOVA F-test on a train split.
# NOTE(review): er_y is still the raw continuous score here (see note in the
# time-management section) — confirm pre-binarization screening is intended.
er_X_train, er_X_test, er_y_train, er_y_test = train_test_split(er_X, er_y, test_size=0.1, random_state=42)
selector = SelectKBest(f_classif, k=4)
selector.fit(er_X_train, er_y_train)
er_selector_scores = selector.scores_
er_selector_pvalues = selector.pvalues_
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [er_features[i] for i in selected_feature_indices]
er_selector_results = pd.DataFrame({
    'Feature': selected_feature_names,
    'Score': er_selector_scores[selected_feature_indices],
    'P-value': er_selector_pvalues[selected_feature_indices]
})
er_selector_results = er_selector_results.sort_values(by='Score', ascending=False)
er_selector_results
SVM
# Binarize the scores at the cutoff: below 33 -> class 0, 33 and above -> class 1.
# NOTE(review): the distribution plot marks the threshold at 34 — confirm which
# cutoff is intended.
er_y = np.where(er_y < 33, 0, 1)
er_y
# Train/test split on the binarized labels
er_X_train, er_X_test, er_y_train, er_y_test = train_test_split(er_X, er_y, test_size=0.2, random_state=42)
# Standardize the features by removing the mean and scaling to unit variance
scaler = StandardScaler()
er_X_train = scaler.fit_transform(er_X_train)
er_X_test = scaler.transform(er_X_test)
# Create an SVM model (sigmoid kernel)
er_svm = SVC(kernel='sigmoid')
# Train the model
er_svm.fit(er_X_train, er_y_train)
# Make predictions on the test set
er_y_pred = er_svm.predict(er_X_test)
# Evaluate the model
accuracy = accuracy_score(er_y_test, er_y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Display classification report
print("Classification Report:\n", classification_report(er_y_test, er_y_pred))
Confusion Matrix
# Confusion matrix for the emotional-regulation SVM on the held-out test set
conf_matrix4 = confusion_matrix(er_y_test, er_y_pred)
# Define class labels
class_labels = [0, 1]
# Create annotated heatmap using plotly.figure_factory
fig = ff.create_annotated_heatmap(conf_matrix4, x=class_labels, y=class_labels, colorscale='PiYG')
# Add title
fig.update_layout(title_text='Emotional Regulation Confusion Matrix')
# Add custom x/y axis titles as paper-anchored annotations
fig.add_annotation(dict(font=dict(color="black", size=14), x=0.5, y=-0.15, showarrow=False, text="Predicted label", xref="paper", yref="paper"))
fig.add_annotation(dict(font=dict(color="black", size=14),x=-0.15,y=0.5,showarrow=False,text="Actual label",textangle=-90,xref="paper",yref="paper"))
# Adjust margins to make room for y-axis title
fig.update_layout(
    margin=dict(t=50, l=200),
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Display the plot
fig.show()
Cross Validation
# Sweep k = 1..max_k top features and score each candidate subset with
# 10-fold cross-validation, then refit a selector with the best k.
# Split the data into training and testing sets (this 90/10 split is reused
# by the ROC-curve cell below).
er_X_train, er_X_test, er_y_train, er_y_test = train_test_split(er_X, er_y, test_size=0.1, random_state=42)
# Feature selection
max_k = 12  # Set the maximum k value to test
avg_accuracies = []
for k in range(1, max_k + 1):
    # Rank features by ANOVA F-score on the training split only
    selector = SelectKBest(f_classif, k=k)
    selector.fit(er_X_train, er_y_train)
    # Get selected feature indices
    selected_feature_indices = selector.get_support(indices=True)
    # Apply feature selection to training and test sets
    er_X_train_selected = er_X_train[:, selected_feature_indices]
    er_X_test_selected = er_X_test[:, selected_feature_indices]
    # Standardize the features
    scaler = StandardScaler()
    er_X_train_selected = scaler.fit_transform(er_X_train_selected)
    er_X_test_selected = scaler.transform(er_X_test_selected)
    # K-fold cross-validation over the FULL data set.
    # NOTE(review): folds come from er_X/er_y, but the feature subset was
    # chosen on er_X_train above, which overlaps every fold's test portion —
    # possible selection leakage; confirm this is intended.
    n_splits = 10
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    avg_acc = 0
    for train_index, test_index in kfold.split(er_X):
        # Extract training and test sets
        X_train, X_test = er_X[train_index], er_X[test_index]
        y_train, y_test = er_y[train_index], er_y[test_index]
        # Apply feature selection
        X_train_selected = X_train[:, selected_feature_indices]
        X_test_selected = X_test[:, selected_feature_indices]
        # Standardize per fold (refits the same scaler object each iteration)
        X_train_selected = scaler.fit_transform(X_train_selected)
        X_test_selected = scaler.transform(X_test_selected)
        # Create and train the model
        svm_model = SVC(kernel='poly')
        svm_model.fit(X_train_selected, y_train)
        # Make predictions on the test set
        y_pred = svm_model.predict(X_test_selected)
        # Accumulate fold accuracy
        acc = accuracy_score(y_test, y_pred)
        avg_acc += acc
    # Mean accuracy across the folds for this k
    avg_accuracy = avg_acc / n_splits
    avg_accuracies.append(avg_accuracy)
# Find k with best accuracy (index i corresponds to k = i + 1)
max_accuracy_index = max(range(len(avg_accuracies)), key=avg_accuracies.__getitem__)
# Find the second max accuracy index
second_best_k = None
for i, acc in enumerate(avg_accuracies):
    if i != max_accuracy_index and (second_best_k is None or acc > avg_accuracies[second_best_k]):
        second_best_k = i
# NOTE(review): best_k_second is computed but never used below — the final
# selector is refit with the best k, not the runner-up.
if second_best_k is not None:
    best_k_second = second_best_k + 1
# Refit the selector at the best k found by the sweep
selector = SelectKBest(f_classif, k=max_accuracy_index + 1)
selector.fit(er_X_train, er_y_train)
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [er_features[i] for i in selected_feature_indices]
# NOTE(review): these selected matrices are NOT re-standardized here; the ROC
# cell below trains on them as-is — confirm that is intentional.
er_X_train_selected = er_X_train[:, selected_feature_indices]
er_X_test_selected = er_X_test[:, selected_feature_indices]
print(f"Average Accuracy: {round(avg_accuracies[max_accuracy_index], 2)} for k={max_accuracy_index + 1}")
print(f"Selected Features for k={max_accuracy_index + 1}: {selected_feature_names}")
# Line plot of mean cross-validation accuracy as a function of the number of
# selected top-k features.
k_values = list(range(1, max_k + 1))
fig = go.Figure(
    go.Scatter(x=k_values, y=avg_accuracies, mode='lines+markers', line=dict(color='#e8b0d3'))
)
fig.update_layout(
    title='Emotional Regulation Avg Accuracy vs. Top-k Features',
    xaxis=dict(title='Top-k Features'),
    yaxis=dict(title='Average Accuracy'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500,
)
fig.show()
ROC Curve
# Refit the poly-kernel SVM on the selected features with probability
# estimates enabled, then plot the ROC curve for the held-out split.
svm_model = SVC(kernel='poly', probability=True)
svm_model.fit(er_X_train_selected, er_y_train)

# Probability of the positive class (label 1) for each test sample
y_probs = svm_model.predict_proba(er_X_test_selected)[:, 1]

fpr, tpr, thresholds = roc_curve(er_y_test, y_probs)
roc_auc = auc(fpr, tpr)

# ROC curve plus the chance-level diagonal
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC curve (AUC = {roc_auc:.2f})', line=dict(color='#e8b0d3')))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Baseline', line=dict(dash='dash')))
fig.update_layout(
    title='Emotional Regulation ROC Curve',
    xaxis=dict(title='False Positive Rate (FPR)', range=[-0.02, 1.02]),
    yaxis=dict(title='True Positive Rate (TPR)'),
    legend=dict(x=0.43, y=0.03, traceorder='normal'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500,
)
fig.show()
Self-Discipline
Subset Data
# Build the self-discipline feature matrix: for every participant record,
# keep the values whose keys appear in sd_features, in the record's own order.
sd_features = ["Sbowl_created_rem", "Sbowl_washed_rem", "Sorder_collected_rem", "Sorder_given_rem", "Spre_post",
               "Sratio_hit", "Stotal_time_drawing", "Sredo", "Scolor_switch", "Sdistracted_ratio", "Stime_distracted_ratio", "Scue_distracted_ratio"]
sd_X = np.array([
    [val for var, val in data.items() if var in sd_features]
    for data in p_features.values()
])
sd_X
# Collect the 'self discipline' questionnaire score of every participant,
# in q_scores iteration order.
sd_y = np.array([
    val
    for data in q_scores.values()
    for var, val in data.items()
    if var == 'self discipline'
])
sd_y
Descriptive Statistics for y
# Summary statistics for the self-discipline questionnaire scores.
mean_value = np.mean(sd_y)
median_value = np.median(sd_y)
std_deviation = np.std(sd_y)
min_value = np.min(sd_y)
max_value = np.max(sd_y)
q3 = np.percentile(sd_y, 75)
summary_lines = [
    f"Mean: {mean_value:.2f}",
    f"Median: {median_value}",
    f"Standard Deviation: {std_deviation:.2f}",
    f"Minimum Value: {min_value}",
    f"Maximum Value: {max_value}",
    f"Third Quartile: {q3}",
]
print("\n".join(summary_lines))
# Wrap the scores in a DataFrame for the distplot helper
scores5 = {'Scores': sd_y}
scores5 = pd.DataFrame(scores5)
# Create a distplot using plotly.figure_factory (histogram only: rug and KDE
# curve suppressed)
fig = ff.create_distplot([scores5['Scores']], group_labels=['distplot'], colors=['#d01c8b'], show_rug=False, show_curve=False)
# Add title
fig.update_layout(
    title_text='Distribution of Self Discipline Scores',
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Constrain the x-axis (score) range
fig.update_xaxes(range=[13, 65])
# Vertical red line marking the score threshold used for binarization later
# (< 47 -> class 0, >= 47 -> class 1)
threshold = 47
fig.add_shape(
    go.layout.Shape(
        type='line',
        x0=threshold,
        x1=threshold,
        y0=0,
        y1=0.16,
        line=dict(color='red', width=2)
    )
)
# Display the plot
fig.show()
Correlation Matrix
# Pairwise Pearson correlations between the columns of sd_X, shown as a
# diverging heatmap fixed to the [-1, 1] range.
sd_correlation_matrix = np.corrcoef(sd_X, rowvar=False)
heatmap = go.Heatmap(
    z=sd_correlation_matrix,
    x=sd_features,
    y=sd_features,
    colorscale="PiYG",
    zmin=-1,
    zmax=1,
)
fig = go.Figure(data=heatmap)
fig.update_layout(
    title="Self Discipline Correlation Matrix of Features",
    xaxis=dict(title="Features"),
    yaxis=dict(title="Features"),
    paper_bgcolor='rgba(0,0,0,0)',
    width=600,
    height=600,
)
fig.show()
Feature Selection
# Univariate screening: rank features by ANOVA F-score on a 90% training
# split and report the single best one.
# NOTE(review): sd_y has not been binarized yet at this point, so f_classif
# treats each raw score as its own class — confirm this is intended.
sd_X_train, sd_X_test, sd_y_train, sd_y_test = train_test_split(sd_X, sd_y, test_size=0.1, random_state=42)

selector = SelectKBest(f_classif, k=1)
selector.fit(sd_X_train, sd_y_train)
sd_selector_scores = selector.scores_
sd_selector_pvalues = selector.pvalues_

# Restrict the report to the selected feature(s) only
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [sd_features[i] for i in selected_feature_indices]
sd_selector_results = pd.DataFrame({
    'Feature': selected_feature_names,
    'Score': sd_selector_scores[selected_feature_indices],
    'P-value': sd_selector_pvalues[selected_feature_indices],
}).sort_values(by='Score', ascending=False)
sd_selector_results
SVM
# Binarize self-discipline scores: below the threshold of 47 -> class 0,
# at or above it -> class 1.
sd_y = np.where(sd_y < 47, 0, 1)
sd_y
# Hold out 20% of the samples, standardize using training-set statistics only,
# then fit and score an RBF-kernel SVM classifier.
sd_X_train, sd_X_test, sd_y_train, sd_y_test = train_test_split(sd_X, sd_y, test_size=0.2, random_state=42)

scaler = StandardScaler()
sd_X_train = scaler.fit_transform(sd_X_train)
sd_X_test = scaler.transform(sd_X_test)

sd_svm = SVC(kernel='rbf')
sd_svm.fit(sd_X_train, sd_y_train)
sd_y_pred = sd_svm.predict(sd_X_test)

# Report overall accuracy plus per-class precision/recall/F1.
accuracy = accuracy_score(sd_y_test, sd_y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(sd_y_test, sd_y_pred))
Confusion Matrix
# Calculate confusion matrix
conf_matrix5 = confusion_matrix(sd_y_test, sd_y_pred)
# Define class labels
class_labels = [0, 1]
# Create annotated heatmap using plotly.figure_factory
fig = ff.create_annotated_heatmap(conf_matrix5, x=class_labels, y=class_labels, colorscale='PiYG')
# Replace the auto-generated cell annotations with the raw matrix values.
# NOTE(review): create_annotated_heatmap draws the matrix with its y-axis
# reversed, while this override places text at (x=j, y=i) directly — verify
# the numbers land on the intended cells.
fig.update_layout(annotations=[dict(text=str(val), x=j, y=i, showarrow=False)
                               for i, row in enumerate(conf_matrix5)
                               for j, val in enumerate(row)])
# Add title
fig.update_layout(title_text='Self Discipline Confusion Matrix')
# Add custom x-y-titles (paper-anchored, since the heatmap has no axis titles)
fig.add_annotation(dict(font=dict(color="black", size=14), x=0.5, y=-0.15, showarrow=False, text="Predicted label", xref="paper", yref="paper"))
fig.add_annotation(dict(font=dict(color="black", size=14),x=-0.15,y=0.5,showarrow=False,text="Actual label",textangle=-90,xref="paper",yref="paper"))
# Adjust margins to make room for the rotated y-axis title
fig.update_layout(
    margin=dict(t=50, l=200),
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Display the plot
fig.show()
Cross Validation
# Sweep k = 1..max_k top features and score each candidate subset with
# 10-fold cross-validation, then refit a selector with the best k.
# Split the data into training and testing sets (this 90/10 split is reused
# by the ROC-curve cell below).
sd_X_train, sd_X_test, sd_y_train, sd_y_test = train_test_split(sd_X, sd_y, test_size=0.1, random_state=42)
# Feature selection
max_k = 12  # Set the maximum k value to test
avg_accuracies = []
for k in range(1, max_k + 1):
    # Rank features by ANOVA F-score on the training split only
    selector = SelectKBest(f_classif, k=k)
    selector.fit(sd_X_train, sd_y_train)
    # Get selected feature indices
    selected_feature_indices = selector.get_support(indices=True)
    # Apply feature selection to training and test sets
    sd_X_train_selected = sd_X_train[:, selected_feature_indices]
    sd_X_test_selected = sd_X_test[:, selected_feature_indices]
    # Standardize the features
    scaler = StandardScaler()
    sd_X_train_selected = scaler.fit_transform(sd_X_train_selected)
    sd_X_test_selected = scaler.transform(sd_X_test_selected)
    # K-fold cross-validation over the FULL data set.
    # NOTE(review): folds come from sd_X/sd_y, but the feature subset was
    # chosen on sd_X_train above, which overlaps every fold's test portion —
    # possible selection leakage; confirm this is intended.
    n_splits = 10
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    avg_acc = 0
    for train_index, test_index in kfold.split(sd_X):
        # Extract training and test sets
        X_train, X_test = sd_X[train_index], sd_X[test_index]
        y_train, y_test = sd_y[train_index], sd_y[test_index]
        # Apply feature selection
        X_train_selected = X_train[:, selected_feature_indices]
        X_test_selected = X_test[:, selected_feature_indices]
        # Standardize per fold (refits the same scaler object each iteration)
        X_train_selected = scaler.fit_transform(X_train_selected)
        X_test_selected = scaler.transform(X_test_selected)
        # Create and train the model
        svm_model = SVC(kernel='rbf')
        svm_model.fit(X_train_selected, y_train)
        # Make predictions on the test set
        y_pred = svm_model.predict(X_test_selected)
        # Accumulate fold accuracy
        acc = accuracy_score(y_test, y_pred)
        avg_acc += acc
    # Mean accuracy across the folds for this k
    avg_accuracy = avg_acc / n_splits
    avg_accuracies.append(avg_accuracy)
# Find k with best accuracy (index i corresponds to k = i + 1)
max_accuracy_index = max(range(len(avg_accuracies)), key=avg_accuracies.__getitem__)
# Find the second max accuracy index
second_best_k = None
for i, acc in enumerate(avg_accuracies):
    if i != max_accuracy_index and (second_best_k is None or acc > avg_accuracies[second_best_k]):
        second_best_k = i
# NOTE(review): best_k_second is computed but never used below — the final
# selector is refit with the best k, not the runner-up.
if second_best_k is not None:
    best_k_second = second_best_k + 1
# Refit the selector at the best k found by the sweep
selector = SelectKBest(f_classif, k=max_accuracy_index + 1)
selector.fit(sd_X_train, sd_y_train)
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [sd_features[i] for i in selected_feature_indices]
# NOTE(review): these selected matrices are NOT re-standardized here; the ROC
# cell below trains on them as-is — confirm that is intentional.
sd_X_train_selected = sd_X_train[:, selected_feature_indices]
sd_X_test_selected = sd_X_test[:, selected_feature_indices]
print(f"Average Accuracy: {round(avg_accuracies[max_accuracy_index], 2)} for k={max_accuracy_index + 1}")
print(f"Selected Features for k={max_accuracy_index + 1}: {selected_feature_names}")
# Line plot of mean cross-validation accuracy as a function of the number of
# selected top-k features.
k_values = list(range(1, max_k + 1))
fig = go.Figure(
    go.Scatter(x=k_values, y=avg_accuracies, mode='lines+markers', line=dict(color='#e8b0d3'))
)
fig.update_layout(
    title='Self Discipline Avg Accuracy vs. Top-k Features',
    xaxis=dict(title='Top-k Features'),
    yaxis=dict(title='Average Accuracy'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500,
)
fig.show()
ROC Curve
# Refit the RBF-kernel SVM on the selected features with probability
# estimates enabled, then plot the ROC curve for the held-out split.
svm_model = SVC(kernel='rbf', probability=True)
svm_model.fit(sd_X_train_selected, sd_y_train)

# Probability of the positive class (label 1) for each test sample
y_probs = svm_model.predict_proba(sd_X_test_selected)[:, 1]

fpr, tpr, thresholds = roc_curve(sd_y_test, y_probs)
roc_auc = auc(fpr, tpr)

# ROC curve plus the chance-level diagonal
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC curve (AUC = {roc_auc:.2f})', line=dict(color='#e8b0d3')))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Baseline', line=dict(dash='dash')))
fig.update_layout(
    title='Self Discipline ROC Curve',
    xaxis=dict(title='False Positive Rate (FPR)', range=[-0.02, 1.02]),
    yaxis=dict(title='True Positive Rate (TPR)'),
    legend=dict(x=0.43, y=0.03, traceorder='normal'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500,
)
fig.show()
Procrastination
Subset Data
# Build the procrastination feature matrix: for every participant record,
# keep the values whose keys appear in pro_features, in the record's own order.
pro_features = ['pre_post', "ratio_hit", "total_time_drawing", "redo", "color_switch", 'bowl_created_rem', 'bowl_washed_rem', 'order_collected_rem', 'order_given_rem', "distracted_ratio", "time_distracted_ratio", "cue_distracted_ratio", "Ebowl_created_rem", "Ebowl_washed_rem", "Eorder_collected_rem", "Eorder_given_rem", "Epre_post", "Eratio_hit", "Etotal_time_drawing", "Eredo", "Ecolor_switch", "Edistracted_ratio", "Etime_distracted_ratio", "Ecue_distracted_ratio", "Sbowl_created_rem", "Sbowl_washed_rem", "Sorder_collected_rem", "Sorder_given_rem", "Spre_post", "Sratio_hit", "Stotal_time_drawing", "Sredo", "Scolor_switch", "Sdistracted_ratio", "Stime_distracted_ratio", "Scue_distracted_ratio"]
pro_X = np.array([
    [val for var, val in data.items() if var in pro_features]
    for data in p_features.values()
])
pro_X
# Collect the 'procrastination' questionnaire score of every participant,
# in q_scores iteration order.
pro_y = np.array([
    val
    for data in q_scores.values()
    for var, val in data.items()
    if var == 'procrastination'
])
pro_y
Descriptive Statistics for y
# Summary statistics for the procrastination questionnaire scores.
mean_value = np.mean(pro_y)
median_value = np.median(pro_y)
std_deviation = np.std(pro_y)
min_value = np.min(pro_y)
max_value = np.max(pro_y)
q3 = np.percentile(pro_y, 75)
summary_lines = [
    f"Mean: {mean_value:.2f}",
    f"Median: {median_value}",
    f"Standard Deviation: {std_deviation:.2f}",
    f"Minimum Value: {min_value}",
    f"Maximum Value: {max_value}",
    f"Third Quartile: {q3}",
]
print("\n".join(summary_lines))
# Wrap the scores in a DataFrame for the distplot helper
scores6 = {'Scores': pro_y}
scores6 = pd.DataFrame(scores6)
# Create a distplot using plotly.figure_factory (histogram only: rug and KDE
# curve suppressed)
fig = ff.create_distplot([scores6['Scores']], group_labels=['distplot'], colors=['#d01c8b'], show_rug=False, show_curve=False)
# Add title
fig.update_layout(
    title_text='Distribution of Procrastination Scores',
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Constrain the x-axis (score) range
fig.update_xaxes(range=[12, 60])
# Vertical red line marking the score threshold used for binarization later
# (< 44 -> class 0, >= 44 -> class 1)
threshold = 44
fig.add_shape(
    go.layout.Shape(
        type='line',
        x0=threshold,
        x1=threshold,
        y0=0,
        y1=0.16,
        line=dict(color='red', width=2)
    )
)
# Display the plot
fig.show()
Correlation Matrix
# Pairwise Pearson correlations between the columns of pro_X, shown as a
# diverging heatmap fixed to the [-1, 1] range.
pro_correlation_matrix = np.corrcoef(pro_X, rowvar=False)
heatmap = go.Heatmap(
    z=pro_correlation_matrix,
    x=pro_features,
    y=pro_features,
    colorscale="PiYG",
    zmin=-1,
    zmax=1,
)
fig = go.Figure(data=heatmap)
fig.update_layout(
    title="Procrastination Correlation Matrix of Features",
    xaxis=dict(title="Features"),
    yaxis=dict(title="Features"),
    paper_bgcolor='rgba(0,0,0,0)',
    width=650,
    height=650,
)
fig.show()
Feature Selection
# Univariate screening: rank features by ANOVA F-score on a 90% training
# split and report the top 5.
# NOTE(review): pro_y has not been binarized yet at this point, so f_classif
# treats each raw score as its own class — confirm this is intended.
pro_X_train, pro_X_test, pro_y_train, pro_y_test = train_test_split(pro_X, pro_y, test_size=0.1, random_state=42)

selector = SelectKBest(f_classif, k=5)
selector.fit(pro_X_train, pro_y_train)
pro_selector_scores = selector.scores_
pro_selector_pvalues = selector.pvalues_

# Restrict the report to the selected features only
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [pro_features[i] for i in selected_feature_indices]
pro_selector_results = pd.DataFrame({
    'Feature': selected_feature_names,
    'Score': pro_selector_scores[selected_feature_indices],
    'P-value': pro_selector_pvalues[selected_feature_indices],
}).sort_values(by='Score', ascending=False)
pro_selector_results
SVM
# Binarize procrastination scores: below the threshold of 44 -> class 0,
# at or above it -> class 1.
pro_y = np.where(pro_y < 44, 0, 1)
pro_y
# Split the data into training and testing sets
pro_X_train, pro_X_test, pro_y_train, pro_y_test = train_test_split(pro_X, pro_y, test_size=0.2, random_state=42)
# Scaling is disabled below, so this model is fit on raw feature values.
# NOTE(review): the other two SVMs standardize their inputs and an RBF kernel
# is scale-sensitive — confirm leaving these lines commented out is intended.
# # Normalization
# min_max_scaler = MinMaxScaler()
# pro_X_train = min_max_scaler.fit_transform(pro_X_train) # X_train
# pro_X_test = min_max_scaler.fit_transform(pro_X_test) # X_test
# # Standardize the features by removing the mean and scaling to unit variance
# scaler = StandardScaler()
# pro_X_train = scaler.fit_transform(pro_X_train)
# pro_X_test = scaler.transform(pro_X_test)
# Create an SVM model (RBF kernel)
pro_svm = SVC(kernel='rbf')
# Train the model
pro_svm.fit(pro_X_train, pro_y_train)
# Make predictions on the held-out 20%
pro_y_pred = pro_svm.predict(pro_X_test)
# Evaluate the model
accuracy = accuracy_score(pro_y_test, pro_y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Display classification report
print("Classification Report:\n", classification_report(pro_y_test, pro_y_pred))
Target Distribution
# Pie chart of the binary procrastination class balance.
# (Fixes the stray "0" expression that was fused onto the first comment.)
# Count the occurrences of each class; convert to a list once instead of twice.
label_list = pro_y.tolist()
class_counts = [label_list.count(0), label_list.count(1)]
# Calculate percentages
total_samples = len(pro_y)
percentages = [count / total_samples * 100 for count in class_counts]
# Create a Pie chart
fig = px.pie(names=['Not a Procrastinator (0)', 'Procrastinator (1)'], values=percentages, title='Procrastination Scores Distribution',
             labels={'labels': 'Score', 'values': 'Percentage'},
             color_discrete_sequence=['#d01c8b', '#e8b0d3'])
fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)'
)
# Show percentage values inside pie slices
fig.update_traces(textinfo='percent+label')
# Hide the legend
fig.update_layout(showlegend=False)
# Show the plot
fig.show()
Confusion Matrix
# Confusion matrix for the procrastination classifier, rendered as an
# annotated Plotly heatmap.
conf_matrix6 = confusion_matrix(pro_y_test, pro_y_pred)
class_labels = [0, 1]
fig = ff.create_annotated_heatmap(conf_matrix6, x=class_labels, y=class_labels, colorscale='PiYG')
fig.update_layout(title_text='Procrastination Confusion Matrix')
# The annotated heatmap has no built-in axis titles, so add the
# predicted/actual labels as paper-anchored annotations.
fig.add_annotation(dict(font=dict(color="black", size=14), x=0.5, y=-0.15, showarrow=False, text="Predicted label", xref="paper", yref="paper"))
fig.add_annotation(dict(font=dict(color="black", size=14), x=-0.15, y=0.5, showarrow=False, text="Actual label", textangle=-90, xref="paper", yref="paper"))
# Widen the left margin so the rotated "Actual label" annotation fits.
fig.update_layout(margin=dict(t=50, l=200), paper_bgcolor='rgba(0,0,0,0)', width=600)
fig.show()
Cross Validation
# Sweep k = 1..max_k top features and score each candidate subset with
# 10-fold cross-validation, then refit a selector with the best k.
# Split the data into training and testing sets (this 90/10 split is reused
# by the ROC-curve cell below).
pro_X_train, pro_X_test, pro_y_train, pro_y_test = train_test_split(pro_X, pro_y, test_size=0.1, random_state=42)
# Feature selection
max_k = 36  # Set the maximum k value to test (all pro_features columns)
avg_accuracies = []
for k in range(1, max_k + 1):
    # Rank features by ANOVA F-score on the training split only
    selector = SelectKBest(f_classif, k=k)
    selector.fit(pro_X_train, pro_y_train)
    # Get selected feature indices
    selected_feature_indices = selector.get_support(indices=True)
    # Apply feature selection to training and test sets
    pro_X_train_selected = pro_X_train[:, selected_feature_indices]
    pro_X_test_selected = pro_X_test[:, selected_feature_indices]
    # Standardize the features
    scaler = StandardScaler()
    pro_X_train_selected = scaler.fit_transform(pro_X_train_selected)
    pro_X_test_selected = scaler.transform(pro_X_test_selected)
    # K-fold cross-validation over the FULL data set.
    # NOTE(review): folds come from pro_X/pro_y, but the feature subset was
    # chosen on pro_X_train above, which overlaps every fold's test portion —
    # possible selection leakage; confirm this is intended.
    n_splits = 10
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    avg_acc = 0
    for train_index, test_index in kfold.split(pro_X):
        # Extract training and test sets
        X_train, X_test = pro_X[train_index], pro_X[test_index]
        y_train, y_test = pro_y[train_index], pro_y[test_index]
        # Apply feature selection
        X_train_selected = X_train[:, selected_feature_indices]
        X_test_selected = X_test[:, selected_feature_indices]
        # Standardize per fold (refits the same scaler object each iteration)
        X_train_selected = scaler.fit_transform(X_train_selected)
        X_test_selected = scaler.transform(X_test_selected)
        # Create and train the model
        svm_model = SVC(kernel='sigmoid')
        svm_model.fit(X_train_selected, y_train)
        # Make predictions on the test set
        y_pred = svm_model.predict(X_test_selected)
        # Accumulate fold accuracy
        acc = accuracy_score(y_test, y_pred)
        avg_acc += acc
    # Mean accuracy across the folds for this k
    avg_accuracy = avg_acc / n_splits
    avg_accuracies.append(avg_accuracy)
# Find k with best accuracy (index i corresponds to k = i + 1)
max_accuracy_index = max(range(len(avg_accuracies)), key=avg_accuracies.__getitem__)
# Find the second max accuracy index
second_best_k = None
for i, acc in enumerate(avg_accuracies):
    if i != max_accuracy_index and (second_best_k is None or acc > avg_accuracies[second_best_k]):
        second_best_k = i
# NOTE(review): best_k_second is computed but never used below — the final
# selector is refit with the best k, not the runner-up.
if second_best_k is not None:
    best_k_second = second_best_k + 1
# Refit the selector at the best k found by the sweep
selector = SelectKBest(f_classif, k=max_accuracy_index + 1)
selector.fit(pro_X_train, pro_y_train)
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [pro_features[i] for i in selected_feature_indices]
# NOTE(review): these selected matrices are NOT re-standardized here; the ROC
# cell below trains on them as-is — confirm that is intentional.
pro_X_train_selected = pro_X_train[:, selected_feature_indices]
pro_X_test_selected = pro_X_test[:, selected_feature_indices]
print(f"Average Accuracy: {round(avg_accuracies[max_accuracy_index], 2)} for k={max_accuracy_index + 1}")
print(f"Selected Features for k={max_accuracy_index + 1}: {selected_feature_names}")
# Line plot of mean cross-validation accuracy as a function of the number of
# selected top-k features.
k_values = list(range(1, max_k + 1))
fig = go.Figure(
    go.Scatter(x=k_values, y=avg_accuracies, mode='lines+markers', line=dict(color='#e8b0d3'))
)
fig.update_layout(
    title='Procrastination Avg Accuracy vs. Top-k Features',
    xaxis=dict(title='Top-k Features'),
    yaxis=dict(title='Average Accuracy'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500,
)
fig.show()
ROC Curve
# Refit the sigmoid-kernel SVM on the selected features with probability
# estimates enabled, then plot the ROC curve for the held-out split.
svm_model = SVC(kernel='sigmoid', probability=True)
svm_model.fit(pro_X_train_selected, pro_y_train)

# Probability of the positive class (label 1) for each test sample
y_probs = svm_model.predict_proba(pro_X_test_selected)[:, 1]

fpr, tpr, thresholds = roc_curve(pro_y_test, y_probs)
roc_auc = auc(fpr, tpr)

# ROC curve plus the chance-level diagonal
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC curve (AUC = {roc_auc:.2f})', line=dict(color='#e8b0d3')))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Baseline', line=dict(dash='dash')))
fig.update_layout(
    title='Procrastination ROC Curve',
    xaxis=dict(title='False Positive Rate (FPR)', range=[-0.02, 1.02]),
    yaxis=dict(title='True Positive Rate (TPR)'),
    legend=dict(x=0.43, y=0.03, traceorder='normal'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500,
)
fig.show()