Phocrastination: A Game-Based Approach to Assessing and Classifying Academic Procrastination in College Students
Load Libraries
# base
import pandas as pd
import numpy as np
import json
import warnings
# preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from scipy.stats import norm
from sklearn.preprocessing import MinMaxScaler
# modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif
# validation
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import r2_score
from sklearn.metrics import roc_curve, RocCurveDisplay, auc
# visualization
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.patches as patches
import plotly.graph_objects as go
Load Data
# Load p_features (per-participant game-play features) and q_scores
# (questionnaire scores) — both JSON objects keyed by participant ID.
with open('/work/p_features.json', 'r') as j:
    p_features = json.load(j)
with open('/work/q_scores.json', 'r') as j:
    q_scores = json.load(j)
# Sort both dicts by participant key so that feature rows and score rows
# line up positionally when iterated downstream.
p_features = dict(sorted(p_features.items()))
q_scores = dict(sorted(q_scores.items()))
Time Management Model
Subset Data
# Time-management design matrix: one row per participant, keeping only the
# game features relevant to time management.
tm_features = ['bowl_created_rem', 'bowl_washed_rem', 'order_collected_rem', 'order_given_rem', 'pre_post']
# NOTE(review): column order follows each participant's dict iteration order,
# not the order of tm_features — confirm the JSON key order is consistent.
tm_X = np.array([
    [val for var, val in data.items() if var in tm_features]
    for data in p_features.values()
])
print(tm_X.shape)
tm_X
# Target vector: the 'time management' questionnaire score per participant.
tm_y = np.array([
    val
    for data in q_scores.values()
    for var, val in data.items()
    if var == 'time management'
])
print(tm_y.shape)
tm_y
Descriptive Statistics for y
# Descriptive statistics for the time-management scores.
mean_value, median_value = np.mean(tm_y), np.median(tm_y)
std_deviation = tm_y.std()
min_value, max_value = tm_y.min(), tm_y.max()
q3 = np.percentile(tm_y, 75)
summary = [
    f"Mean: {mean_value:.2f}",
    f"Median: {median_value}",
    f"Standard Deviation: {std_deviation:.2f}",
    f"Minimum Value: {min_value}",
    f"Maximum Value: {max_value}",
    f"Third Quartile: {q3}",
]
print("\n".join(summary))
# Wrap the scores in a DataFrame for plotting
scores1 = {'Scores': tm_y}
scores1 = pd.DataFrame(scores1)
# Histogram of the score distribution via plotly.figure_factory
fig = ff.create_distplot([scores1['Scores']], group_labels=['distplot'], colors=['#d01c8b'], show_rug=False, show_curve=False)
# Add title
fig.update_layout(
    title_text='Distribution of Time Management Scores',
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Update x-axis range
fig.update_xaxes(range=[18, 90])
# Vertical line at the cutoff used later to binarize the scores (>= 60 -> class 1)
threshold = 60
fig.add_shape(
    go.layout.Shape(
        type='line',
        x0=threshold,
        x1=threshold,
        y0=0,
        y1=0.08,
        line=dict(color='red', width=2)
    )
)
# Display the plot
fig.show()
Correlation Matrix
# Pearson correlations between the time-management features (columns of tm_X).
tm_correlation_matrix = np.corrcoef(tm_X, rowvar=False)
fig = go.Figure(data=go.Heatmap(
    z=tm_correlation_matrix,
    x=tm_features,
    y=tm_features,
    colorscale="PiYG",
    zmin=-1,  # pin the color scale to the full correlation range
    zmax=1
))
# Customize the layout
fig.update_layout(
    title="Time Management Correlation Matrix of Features",
    xaxis=dict(title="Features"),
    yaxis=dict(title="Features"),
    height=500,
    paper_bgcolor='rgba(0,0,0,0)',
    width=500
)
# Show the plot
fig.show()
Feature Selection
# Univariate feature screening via ANOVA F-test on a train split.
# NOTE(review): tm_y is still the raw continuous score here, so f_classif treats
# every distinct score as its own class — confirm this pre-binarization screening
# is intended.
tm_X_train, tm_X_test, tm_y_train, tm_y_test = train_test_split(tm_X, tm_y, test_size=0.1, random_state=42)
selector = SelectKBest(f_classif, k=3)
selector.fit(tm_X_train, tm_y_train)
tm_selector_scores = selector.scores_
tm_selector_pvalues = selector.pvalues_
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [tm_features[i] for i in selected_feature_indices]
tm_selector_results = pd.DataFrame({
    'Feature': selected_feature_names,
    'Score': tm_selector_scores[selected_feature_indices],
    'P-value': tm_selector_pvalues[selected_feature_indices]
})
tm_selector_results = tm_selector_results.sort_values(by='Score', ascending=False)
tm_selector_results
SVM
# Binarize the scores at the cutoff: below 60 -> class 0, 60 and above -> class 1.
tm_y = np.where(tm_y < 60, 0, 1)
print(tm_y.shape)
tm_y
# Train/test split on the binarized labels
tm_X_train, tm_X_test, tm_y_train, tm_y_test = train_test_split(tm_X, tm_y, test_size=0.2, random_state=42)
# Standardize the features by removing the mean and scaling to unit variance
scaler = StandardScaler()
tm_X_train = scaler.fit_transform(tm_X_train)
tm_X_test = scaler.transform(tm_X_test)
# Then rescale to [0, 1].
# NOTE(review): MinMaxScaler on top of StandardScaler is redundant for most
# kernels — confirm the double scaling is intentional.
scaler2 = MinMaxScaler()
tm_X_train = scaler2.fit_transform(tm_X_train)
tm_X_test = scaler2.transform(tm_X_test)
# Create an SVM model (sigmoid kernel)
tm_svm = SVC(kernel='sigmoid')
# Train the model
tm_svm.fit(tm_X_train, tm_y_train)
# Make predictions on the test set
tm_y_pred = tm_svm.predict(tm_X_test)
# Evaluate the model
accuracy = accuracy_score(tm_y_test, tm_y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Display classification report
print("Classification Report:\n", classification_report(tm_y_test, tm_y_pred))
Confusion Matrix
# Confusion matrix for the time-management SVM on the held-out test set.
# (Removed a commented-out, abandoned experiment that recolored annotation text.)
conf_matrix1 = confusion_matrix(tm_y_test, tm_y_pred)
# Define class labels
class_labels = [0, 1]
# Create annotated heatmap using plotly.figure_factory
fig = ff.create_annotated_heatmap(conf_matrix1, x=class_labels, y=class_labels, colorscale='PiYG')
# Add title
fig.update_layout(title_text='Time Management Confusion Matrix')
# Add custom x/y axis titles as paper-anchored annotations
fig.add_annotation(dict(font=dict(color="black", size=14), x=0.5, y=-0.15, showarrow=False, text="Predicted label", xref="paper", yref="paper"))
fig.add_annotation(dict(font=dict(color="black", size=14), x=-0.15, y=0.5, showarrow=False, text="Actual label", textangle=-90, xref="paper", yref="paper"))
# Adjust margins to make room for y-axis title
fig.update_layout(
    margin=dict(t=50, l=200),
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Display the plot
fig.show()
Cross Validation
# Re-split the (now binarized) data
tm_X_train, tm_X_test, tm_y_train, tm_y_test = train_test_split(tm_X, tm_y, test_size=0.2, random_state=42)
# Sweep k (number of top features) and estimate accuracy with 10-fold CV
max_k = 5  # Set the maximum k value to test
avg_accuracies = []
for k in range(1, max_k + 1):
    selector = SelectKBest(f_classif, k=k)
    selector.fit(tm_X_train, tm_y_train)
    # Get selected feature indices
    selected_feature_indices = selector.get_support(indices=True)
    # Apply feature selection to training and test sets
    tm_X_train_selected = tm_X_train[:, selected_feature_indices]
    tm_X_test_selected = tm_X_test[:, selected_feature_indices]
    # Standardize the features
    scaler = StandardScaler()
    tm_X_train_selected = scaler.fit_transform(tm_X_train_selected)
    tm_X_test_selected = scaler.transform(tm_X_test_selected)
    # K-fold cross-validation.
    # NOTE(review): folds are drawn from the FULL tm_X, so rows from the held-out
    # test split also appear in CV folds, and feature selection was done on the
    # train split — mild leakage; confirm this is acceptable.
    n_splits = 10
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    avg_acc = 0
    for train_index, test_index in kfold.split(tm_X):
        # Extract training and test sets
        X_train, X_test = tm_X[train_index], tm_X[test_index]
        y_train, y_test = tm_y[train_index], tm_y[test_index]
        # Apply feature selection
        X_train_selected = X_train[:, selected_feature_indices]
        X_test_selected = X_test[:, selected_feature_indices]
        # Standardize per fold (scaler is refit on the fold's training part)
        X_train_selected = scaler.fit_transform(X_train_selected)
        X_test_selected = scaler.transform(X_test_selected)
        # Create and train the model
        svm_model = SVC(kernel='sigmoid')
        svm_model.fit(X_train_selected, y_train)
        # Make predictions on the test set
        y_pred = svm_model.predict(X_test_selected)
        # Measure performance on y_test and y_pred
        acc = accuracy_score(y_test, y_pred)
        avg_acc += acc
    avg_accuracy = avg_acc / n_splits
    avg_accuracies.append(avg_accuracy)
# Find k with best accuracy
max_accuracy_index = max(range(len(avg_accuracies)), key=avg_accuracies.__getitem__)
# Find the second-best accuracy index (only consumed by the commented print below)
second_best_k = None
for i, acc in enumerate(avg_accuracies):
    if i != max_accuracy_index and (second_best_k is None or acc > avg_accuracies[second_best_k]):
        second_best_k = i
# Selector for desired k
if second_best_k is not None:
    best_k_second = second_best_k + 1
selector = SelectKBest(f_classif, k=max_accuracy_index+1)
selector.fit(tm_X_train, tm_y_train)
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [tm_features[i] for i in selected_feature_indices]
# NOTE(review): these selected matrices are UNscaled slices of the raw split; the
# ROC section below trains on them without standardization — confirm intended.
tm_X_train_selected = tm_X_train[:, selected_feature_indices]
tm_X_test_selected = tm_X_test[:, selected_feature_indices]
print(f"Average Accuracy: {round(avg_accuracies[max_accuracy_index], 2)} for k={max_accuracy_index + 1}")
# print(f"Average Accuracy: {round(avg_accuracies[second_best_k], 2)} for k={best_k_second}")
print(f"Selected Features for k={max_accuracy_index+1}: {selected_feature_names}")
# Plot average CV accuracy as a function of k
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(1, max_k + 1)), y=avg_accuracies, mode='lines+markers', line=dict(color='#e8b0d3')))
fig.update_layout(
    title='Time Management Avg Accuracy vs. Top-k Features',
    xaxis=dict(title='Top-k Features'),
    yaxis=dict(title='Average Accuracy'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500
)
fig.show()
ROC Curve
# Create and train the model with the best k
svm_model = SVC(kernel='sigmoid', probability=True)  # Enable probability estimates for ROC curve
svm_model.fit(tm_X_train_selected, tm_y_train)
# Predicted probability of class 1 on the test set
y_probs = svm_model.predict_proba(tm_X_test_selected)[:, 1]
# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(tm_y_test, y_probs)
roc_auc = auc(fpr, tpr)
# Plot ROC curve using Plotly
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC curve (AUC = {roc_auc:.2f})', line=dict(color='#e8b0d3')))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Baseline', line=dict(dash='dash')))
fig.update_layout(
    title='Time Management ROC Curve',
    xaxis=dict(title='False Positive Rate (FPR)', range=[-0.02, 1.02]),
    yaxis=dict(title='True Positive Rate (TPR)'),
    legend=dict(x=0.43, y=0.03, traceorder='normal'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500
)
fig.show()
Perfectionism Model
Subset Data
# Perfectionism design matrix: drawing-behavior plus reminder features per participant.
perf_features = ["ratio_hit", "total_time_drawing", "redo", "color_switch", 'bowl_created_rem', 'bowl_washed_rem', 'order_collected_rem', 'order_given_rem']
# NOTE(review): column order follows each participant's dict iteration order,
# not the order of perf_features.
p_X = np.array([
    [val for var, val in data.items() if var in perf_features]
    for data in p_features.values()
])
p_X
# Target vector: the 'perfectionism' questionnaire score per participant.
p_y = np.array([
    val
    for data in q_scores.values()
    for var, val in data.items()
    if var == 'perfectionism'
])
p_y
Descriptive Statistics for y
# Descriptive statistics for the perfectionism scores.
mean_value, median_value = np.mean(p_y), np.median(p_y)
std_deviation = p_y.std()
min_value, max_value = p_y.min(), p_y.max()
q3 = np.percentile(p_y, 75)
summary = [
    f"Mean: {mean_value:.2f}",
    f"Median: {median_value}",
    f"Standard Deviation: {std_deviation:.2f}",
    f"Minimum Value: {min_value}",
    f"Maximum Value: {max_value}",
    f"Third Quartile: {q3}",
]
print("\n".join(summary))
# Wrap the scores in a DataFrame for plotting
scores2 = {'Scores': p_y}
scores2 = pd.DataFrame(scores2)
# Histogram of the score distribution via plotly.figure_factory
fig = ff.create_distplot([scores2['Scores']], group_labels=['distplot'], colors=['#d01c8b'], show_rug=False, show_curve=False)
# Add title
fig.update_layout(
    title_text='Distribution of Perfectionism Scores',
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Update x-axis range
fig.update_xaxes(range=[10, 70])
# Vertical line at the cutoff used later to binarize the scores (>= 52 -> class 1)
threshold = 52
fig.add_shape(
    go.layout.Shape(
        type='line',
        x0=threshold,
        x1=threshold,
        y0=0,
        y1=0.08,
        line=dict(color='red', width=2)
    )
)
# Display the plot
fig.show()
Correlation Matrix
# Pearson correlations between the perfectionism features (columns of p_X).
p_correlation_matrix = np.corrcoef(p_X, rowvar=False)
fig = go.Figure(data=go.Heatmap(
    z=p_correlation_matrix,
    x=perf_features,
    y=perf_features,
    colorscale="PiYG",
    zmin=-1,  # pin the color scale to the full correlation range
    zmax=1
))
# Customize the layout
fig.update_layout(
    title="Perfectionism Correlation Matrix of Features",
    xaxis=dict(title="Features"),
    yaxis=dict(title="Features"),
    height=500,
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Show the plot
fig.show()
Feature Selection
# Univariate feature screening via ANOVA F-test on a train split.
# NOTE(review): p_y is still the raw continuous score here (see note in the
# time-management section) — confirm pre-binarization screening is intended.
p_X_train, p_X_test, p_y_train, p_y_test = train_test_split(p_X, p_y, test_size=0.1, random_state=42)
selector = SelectKBest(f_classif, k=1)
selector.fit(p_X_train, p_y_train)
p_selector_scores = selector.scores_
p_selector_pvalues = selector.pvalues_
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [perf_features[i] for i in selected_feature_indices]
p_selector_results = pd.DataFrame({
    'Feature': selected_feature_names,
    'Score': p_selector_scores[selected_feature_indices],
    'P-value': p_selector_pvalues[selected_feature_indices]
})
p_selector_results = p_selector_results.sort_values(by='Score', ascending=False)
p_selector_results
SVM
# Binarize the scores at the cutoff: below 52 -> class 0, 52 and above -> class 1.
p_y = np.where(p_y < 52, 0, 1)
p_y
# Train/test split on the binarized labels.
# NOTE(review): test_size is 0.1 here but 0.2 in the other model sections — confirm.
p_X_train, p_X_test, p_y_train, p_y_test = train_test_split(p_X, p_y, test_size=0.1, random_state=42)
# Standardize the features by removing the mean and scaling to unit variance
scaler = StandardScaler()
p_X_train = scaler.fit_transform(p_X_train)
p_X_test = scaler.transform(p_X_test)
# Create an SVM model (RBF kernel)
p_svm = SVC(kernel='rbf')
# Train the model
p_svm.fit(p_X_train, p_y_train)
# Make predictions on the test set
p_y_pred = p_svm.predict(p_X_test)
# Evaluate the model
accuracy = accuracy_score(p_y_test, p_y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Display classification report
print("Classification Report:\n", classification_report(p_y_test, p_y_pred))
Confusion Matrix
# Confusion matrix for the perfectionism SVM on the held-out test set
conf_matrix2 = confusion_matrix(p_y_test, p_y_pred)
# Define class labels
class_labels = [0, 1]
# Create annotated heatmap using plotly.figure_factory
fig = ff.create_annotated_heatmap(conf_matrix2, x=class_labels, y=class_labels, colorscale='PiYG')
# Add title
fig.update_layout(title_text='Perfectionism Confusion Matrix')
# Add custom x/y axis titles as paper-anchored annotations
fig.add_annotation(dict(font=dict(color="black", size=14), x=0.5, y=-0.15, showarrow=False, text="Predicted label", xref="paper", yref="paper"))
fig.add_annotation(dict(font=dict(color="black", size=14),x=-0.15,y=0.5,showarrow=False,text="Actual label",textangle=-90,xref="paper",yref="paper"))
# Adjust margins to make room for y-axis title
fig.update_layout(
    margin=dict(t=50, l=200),
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Display the plot
fig.show()
Cross Validation
# Re-split the (now binarized) data
p_X_train, p_X_test, p_y_train, p_y_test = train_test_split(p_X, p_y, test_size=0.2, random_state=42)
# Sweep k (number of top features) and estimate accuracy with 10-fold CV
max_k = 8  # Set the maximum k value to test
avg_accuracies = []
for k in range(1, max_k + 1):
    selector = SelectKBest(f_classif, k=k)
    selector.fit(p_X_train, p_y_train)
    # Get selected feature indices
    selected_feature_indices = selector.get_support(indices=True)
    # Apply feature selection to training and test sets
    p_X_train_selected = p_X_train[:, selected_feature_indices]
    p_X_test_selected = p_X_test[:, selected_feature_indices]
    # Standardize the features
    scaler = StandardScaler()
    p_X_train_selected = scaler.fit_transform(p_X_train_selected)
    p_X_test_selected = scaler.transform(p_X_test_selected)
    # K-fold cross-validation.
    # NOTE(review): folds are drawn from the FULL p_X (held-out rows included) and
    # features were selected on the train split — mild leakage; confirm acceptable.
    n_splits = 10
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    avg_acc = 0
    for train_index, test_index in kfold.split(p_X):
        # Extract training and test sets
        X_train, X_test = p_X[train_index], p_X[test_index]
        y_train, y_test = p_y[train_index], p_y[test_index]
        # Apply feature selection
        X_train_selected = X_train[:, selected_feature_indices]
        X_test_selected = X_test[:, selected_feature_indices]
        # Standardize per fold (scaler refit on the fold's training part)
        X_train_selected = scaler.fit_transform(X_train_selected)
        X_test_selected = scaler.transform(X_test_selected)
        # Create and train the model
        svm_model = SVC(kernel='rbf')
        svm_model.fit(X_train_selected, y_train)
        # Make predictions on the test set
        y_pred = svm_model.predict(X_test_selected)
        # Measure performance on y_test and y_pred
        acc = accuracy_score(y_test, y_pred)
        avg_acc += acc
    avg_accuracy = avg_acc / n_splits
    avg_accuracies.append(avg_accuracy)
# Find k with best accuracy
max_accuracy_index = max(range(len(avg_accuracies)), key=avg_accuracies.__getitem__)
# Find the second-best accuracy index (currently unused beyond best_k_second)
second_best_k = None
for i, acc in enumerate(avg_accuracies):
    if i != max_accuracy_index and (second_best_k is None or acc > avg_accuracies[second_best_k]):
        second_best_k = i
# Selector for desired k
if second_best_k is not None:
    best_k_second = second_best_k + 1
selector = SelectKBest(f_classif, k=max_accuracy_index + 1) #max_accuracy_index + 1
selector.fit(p_X_train, p_y_train)
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [perf_features[i] for i in selected_feature_indices]
# NOTE(review): unscaled slices — the ROC section trains on them without scaling.
p_X_train_selected = p_X_train[:, selected_feature_indices]
p_X_test_selected = p_X_test[:, selected_feature_indices]
print(f"Average Accuracy: {round(avg_accuracies[max_accuracy_index], 2)} for k={max_accuracy_index + 1}")
print(f"Selected Features for k={max_accuracy_index + 1}: {selected_feature_names}")
# Plot average CV accuracy as a function of k
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(1, max_k + 1)), y=avg_accuracies, mode='lines+markers', line=dict(color='#e8b0d3')))
fig.update_layout(
    title='Perfectionism Avg Accuracy vs. Top-k Features',
    xaxis=dict(title='Top-k Features'),
    yaxis=dict(title='Average Accuracy'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500
)
fig.show()
ROC Curve
# Create and train the model with the best k
svm_model = SVC(kernel='rbf', probability=True)  # Enable probability estimates for ROC curve
svm_model.fit(p_X_train_selected, p_y_train)
# Predicted probability of class 1 on the test set
y_probs = svm_model.predict_proba(p_X_test_selected)[:, 1]
# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(p_y_test, y_probs)
roc_auc = auc(fpr, tpr)
# Plot ROC curve using Plotly
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC curve (AUC = {roc_auc:.2f})', line=dict(color='#e8b0d3')))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Baseline', line=dict(dash='dash')))
fig.update_layout(
    title='Perfectionism ROC Curve',
    xaxis=dict(title='False Positive Rate (FPR)', range=[-0.02, 1.02]),
    yaxis=dict(title='True Positive Rate (TPR)'),
    legend=dict(x=0.43, y=0.03, traceorder='normal'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500
)
fig.show()
Distractibility Model
Subset Data
# Distractibility design matrix: the three distraction-ratio features per participant.
d_features = ["distracted_ratio", "time_distracted_ratio", "cue_distracted_ratio"]
d_X = np.array([
    [val for var, val in data.items() if var in d_features]
    for data in p_features.values()
])
d_X
# Target vector: the 'distractibility' questionnaire score per participant.
d_y = np.array([
    val
    for data in q_scores.values()
    for var, val in data.items()
    if var == 'distractibility'
])
d_y
Descriptive Statistics for y
# Descriptive statistics for the distractibility scores.
mean_value, median_value = np.mean(d_y), np.median(d_y)
std_deviation = d_y.std()
min_value, max_value = d_y.min(), d_y.max()
q3 = np.percentile(d_y, 75)
summary = [
    f"Mean: {mean_value:.2f}",
    f"Median: {median_value}",
    f"Standard Deviation: {std_deviation:.2f}",
    f"Minimum Value: {min_value}",
    f"Maximum Value: {max_value}",
    f"Third Quartile: {q3}",
]
print("\n".join(summary))
# Wrap the scores in a DataFrame for plotting
scores3 = {'Scores': d_y}
scores3 = pd.DataFrame(scores3)
# Histogram of the score distribution via plotly.figure_factory
fig = ff.create_distplot([scores3['Scores']], group_labels=['distplot'], colors=['#d01c8b'], show_rug=False, show_curve=False)
# Add title ("Distractability" spelling kept as-is; it is a runtime string)
fig.update_layout(
    title_text='Distribution of Distractability Scores',
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Update x-axis range
fig.update_xaxes(range=[6, 31])
# Vertical line at the cutoff used later to binarize the scores (>= 21 -> class 1)
threshold = 21
fig.add_shape(
    go.layout.Shape(
        type='line',
        x0=threshold,
        x1=threshold,
        y0=0,
        y1=0.18,
        line=dict(color='red', width=2)
    )
)
# Display the plot
fig.show()
Correlation Matrix
# Pearson correlations between the distractibility features (columns of d_X).
d_correlation_matrix = np.corrcoef(d_X, rowvar=False)
fig = go.Figure(data=go.Heatmap(
    z=d_correlation_matrix,
    x=d_features,
    y=d_features,
    colorscale="PiYG",
    zmin=-1,  # pin the color scale to the full correlation range
    zmax=1
))
# Customize the layout
fig.update_layout(
    title="Distractability Correlation Matrix of Features",
    xaxis=dict(title="Features"),
    yaxis=dict(title="Features"),
    paper_bgcolor='rgba(0,0,0,0)',
    width=600,
    height=600,
)
# Show the plot
fig.show()
Feature Selection
# Univariate feature screening via ANOVA F-test on a train split.
# NOTE(review): k=3 equals the total feature count, so no feature is dropped here —
# this run only produces the score/p-value table.
d_X_train, d_X_test, d_y_train, d_y_test = train_test_split(d_X, d_y, test_size=0.1, random_state=42)
selector = SelectKBest(f_classif, k=3)
selector.fit(d_X_train, d_y_train)
d_selector_scores = selector.scores_
d_selector_pvalues = selector.pvalues_
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [d_features[i] for i in selected_feature_indices]
d_selector_results = pd.DataFrame({
    'Feature': selected_feature_names,
    'Score': d_selector_scores[selected_feature_indices],
    'P-value': d_selector_pvalues[selected_feature_indices]
})
d_selector_results = d_selector_results.sort_values(by='Score', ascending=False)
d_selector_results
SVM
# Binarize the scores at the cutoff: below 21 -> class 0, 21 and above -> class 1.
d_y = np.where(d_y < 21, 0, 1)
d_y
# Train/test split on the binarized labels
d_X_train, d_X_test, d_y_train, d_y_test = train_test_split(d_X, d_y, test_size=0.2, random_state=42)
# Standardize the features by removing the mean and scaling to unit variance
scaler = StandardScaler()
d_X_train = scaler.fit_transform(d_X_train)
d_X_test = scaler.transform(d_X_test)
# Create an SVM model (polynomial kernel)
d_svm = SVC(kernel='poly')
# Train the model
d_svm.fit(d_X_train, d_y_train)
# Make predictions on the test set
d_y_pred = d_svm.predict(d_X_test)
# Evaluate the model
accuracy = accuracy_score(d_y_test, d_y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Display classification report
print("Classification Report:\n", classification_report(d_y_test, d_y_pred))
Confusion Matrix
# Confusion matrix for the distractibility SVM on the held-out test set
conf_matrix3 = confusion_matrix(d_y_test, d_y_pred)
# Define class labels
class_labels = [0, 1]
# Create annotated heatmap using plotly.figure_factory
fig = ff.create_annotated_heatmap(conf_matrix3, x=class_labels, y=class_labels, colorscale='PiYG')
# Add title
fig.update_layout(title_text='Distractability Confusion Matrix')
# Add custom x/y axis titles as paper-anchored annotations
fig.add_annotation(dict(font=dict(color="black", size=14), x=0.5, y=-0.15, showarrow=False, text="Predicted label", xref="paper", yref="paper"))
fig.add_annotation(dict(font=dict(color="black", size=14),x=-0.15,y=0.5,showarrow=False,text="Actual label",textangle=-90,xref="paper",yref="paper"))
# Adjust margins to make room for y-axis title
fig.update_layout(
    margin=dict(t=50, l=200),
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Display the plot
fig.show()
Cross Validation
# Re-split the (now binarized) data
d_X_train, d_X_test, d_y_train, d_y_test = train_test_split(d_X, d_y, test_size=0.2, random_state=42)
# Sweep k (number of top features) and estimate accuracy with 10-fold CV
max_k = 3  # Set the maximum k value to test
avg_accuracies = []
for k in range(1, max_k + 1):
    selector = SelectKBest(f_classif, k=k)
    selector.fit(d_X_train, d_y_train)
    # Get selected feature indices
    selected_feature_indices = selector.get_support(indices=True)
    # Apply feature selection to training and test sets
    d_X_train_selected = d_X_train[:, selected_feature_indices]
    d_X_test_selected = d_X_test[:, selected_feature_indices]
    # Standardize the features
    scaler = StandardScaler()
    d_X_train_selected = scaler.fit_transform(d_X_train_selected)
    d_X_test_selected = scaler.transform(d_X_test_selected)
    # K-fold cross-validation.
    # NOTE(review): folds are drawn from the FULL d_X (held-out rows included) and
    # features were selected on the train split — mild leakage; confirm acceptable.
    n_splits = 10
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    avg_acc = 0
    for train_index, test_index in kfold.split(d_X):
        # Extract training and test sets
        X_train, X_test = d_X[train_index], d_X[test_index]
        y_train, y_test = d_y[train_index], d_y[test_index]
        # Apply feature selection
        X_train_selected = X_train[:, selected_feature_indices]
        X_test_selected = X_test[:, selected_feature_indices]
        # Standardize per fold (scaler refit on the fold's training part)
        X_train_selected = scaler.fit_transform(X_train_selected)
        X_test_selected = scaler.transform(X_test_selected)
        # Create and train the model
        svm_model = SVC(kernel='poly')
        svm_model.fit(X_train_selected, y_train)
        # Make predictions on the test set
        y_pred = svm_model.predict(X_test_selected)
        # Measure performance on y_test and y_pred
        acc = accuracy_score(y_test, y_pred)
        avg_acc += acc
    avg_accuracy = avg_acc / n_splits
    avg_accuracies.append(avg_accuracy)
# Find k with best accuracy
max_accuracy_index = max(range(len(avg_accuracies)), key=avg_accuracies.__getitem__)
# Find the second-best accuracy index (currently unused beyond best_k_second)
second_best_k = None
for i, acc in enumerate(avg_accuracies):
    if i != max_accuracy_index and (second_best_k is None or acc > avg_accuracies[second_best_k]):
        second_best_k = i
# Selector for desired k
if second_best_k is not None:
    best_k_second = second_best_k + 1
selector = SelectKBest(f_classif, k=max_accuracy_index + 1) # hardcode
selector.fit(d_X_train, d_y_train)
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [d_features[i] for i in selected_feature_indices]
# NOTE(review): unscaled slices — the ROC section trains on them without scaling.
d_X_train_selected = d_X_train[:, selected_feature_indices]
d_X_test_selected = d_X_test[:, selected_feature_indices]
print(f"Average Accuracy: {round(avg_accuracies[max_accuracy_index], 2)} for k={max_accuracy_index + 1}")
print(f"Selected Features for k={max_accuracy_index + 1}: {selected_feature_names}")
# Plot average CV accuracy as a function of k
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(1, max_k + 1)), y=avg_accuracies, mode='lines+markers', line=dict(color='#e8b0d3')))
fig.update_layout(
    title='Distractability Avg Accuracy vs. Top-k Features',
    xaxis=dict(title='Top-k Features'),
    yaxis=dict(title='Average Accuracy'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500
)
fig.show()
ROC Curve
# Create and train the model with the best k
svm_model = SVC(kernel='poly', probability=True)  # Enable probability estimates for ROC curve
svm_model.fit(d_X_train_selected, d_y_train)
# Predicted probability of class 1 on the test set
y_probs = svm_model.predict_proba(d_X_test_selected)[:, 1]
# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(d_y_test, y_probs)
roc_auc = auc(fpr, tpr)
# Plot ROC curve using Plotly
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC curve (AUC = {roc_auc:.2f})', line=dict(color='#e8b0d3')))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Baseline', line=dict(dash='dash')))
fig.update_layout(
    title='Distractability ROC Curve',
    xaxis=dict(title='False Positive Rate (FPR)', range=[-0.02, 1.02]),
    yaxis=dict(title='True Positive Rate (TPR)'),
    legend=dict(x=0.43, y=0.03, traceorder='normal'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500
)
fig.show()
Emotional Regulation Model
Subset Data
# Emotional-regulation design matrix: the "E"-prefixed variants of all game features.
er_features = ["Ebowl_created_rem", "Ebowl_washed_rem", "Eorder_collected_rem", "Eorder_given_rem", "Epre_post",
               "Eratio_hit", "Etotal_time_drawing", "Eredo", "Ecolor_switch", "Edistracted_ratio", "Etime_distracted_ratio", "Ecue_distracted_ratio"]
er_X = np.array([
    [val for var, val in data.items() if var in er_features]
    for data in p_features.values()
])
er_X
# Target vector: the 'emotional regulation' questionnaire score per participant.
er_y = np.array([
    val
    for data in q_scores.values()
    for var, val in data.items()
    if var == 'emotional regulation'
])
er_y
Descriptive Statistics for y
# Descriptive statistics for the emotional-regulation scores.
mean_value, median_value = np.mean(er_y), np.median(er_y)
std_deviation = er_y.std()
min_value, max_value = er_y.min(), er_y.max()
q3 = np.percentile(er_y, 75)
summary = [
    f"Mean: {mean_value:.2f}",
    f"Median: {median_value}",
    f"Standard Deviation: {std_deviation:.2f}",
    f"Minimum Value: {min_value}",
    f"Maximum Value: {max_value}",
    f"Third Quartile: {q3}",
]
print("\n".join(summary))
# Wrap the emotional-regulation scores in a DataFrame for plotting.
scores4 = {'Scores': er_y}
# BUG FIX: this previously read pd.DataFrame(scores2), plotting the
# perfectionism scores under the emotional-regulation title.
scores4 = pd.DataFrame(scores4)
# Histogram of the score distribution via plotly.figure_factory
fig = ff.create_distplot([scores4['Scores']], group_labels=['distplot'], colors=['#d01c8b'], show_rug=False, show_curve=False)
# Add title
fig.update_layout(
    title_text='Distribution of Emotional Regulation Scores',
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Update x-axis range
fig.update_xaxes(range=[10, 55])
# Threshold line.
# NOTE(review): drawn at 34, but the SVM section binarizes at < 33 —
# confirm which cutoff is intended.
threshold = 34
fig.add_shape(
    go.layout.Shape(
        type='line',
        x0=threshold,
        x1=threshold,
        y0=0,
        y1=0.08,
        line=dict(color='red', width=2)
    )
)
# Display the plot
fig.show()
Correlation Matrix
# Pearson correlations between the emotional-regulation features (columns of er_X).
er_correlation_matrix = np.corrcoef(er_X, rowvar=False)
fig = go.Figure(data=go.Heatmap(
    z=er_correlation_matrix,
    x=er_features,
    y=er_features,
    colorscale="PiYG",
    zmin=-1,  # pin the color scale to the full correlation range
    zmax=1
))
# Customize the layout
fig.update_layout(
    title="Emotional Regulation Correlation Matrix of Features",
    xaxis=dict(title="Features"),
    yaxis=dict(title="Features"),
    paper_bgcolor='rgba(0,0,0,0)',
    width=600,
    height=600,
)
# Show the plot
fig.show()
Feature Selection
# Univariate feature screening via ANOVA F-test on a train split.
# NOTE(review): er_y is still the raw continuous score here (see note in the
# time-management section) — confirm pre-binarization screening is intended.
er_X_train, er_X_test, er_y_train, er_y_test = train_test_split(er_X, er_y, test_size=0.1, random_state=42)
selector = SelectKBest(f_classif, k=4)
selector.fit(er_X_train, er_y_train)
er_selector_scores = selector.scores_
er_selector_pvalues = selector.pvalues_
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [er_features[i] for i in selected_feature_indices]
er_selector_results = pd.DataFrame({
    'Feature': selected_feature_names,
    'Score': er_selector_scores[selected_feature_indices],
    'P-value': er_selector_pvalues[selected_feature_indices]
})
er_selector_results = er_selector_results.sort_values(by='Score', ascending=False)
er_selector_results
SVM
# Binarize the scores at the cutoff: below 33 -> class 0, 33 and above -> class 1.
# NOTE(review): the distribution plot marks the threshold at 34 — confirm which
# cutoff is intended.
er_y = np.where(er_y < 33, 0, 1)
er_y
# Train/test split on the binarized labels
er_X_train, er_X_test, er_y_train, er_y_test = train_test_split(er_X, er_y, test_size=0.2, random_state=42)
# Standardize the features by removing the mean and scaling to unit variance
scaler = StandardScaler()
er_X_train = scaler.fit_transform(er_X_train)
er_X_test = scaler.transform(er_X_test)
# Create an SVM model (sigmoid kernel)
er_svm = SVC(kernel='sigmoid')
# Train the model
er_svm.fit(er_X_train, er_y_train)
# Make predictions on the test set
er_y_pred = er_svm.predict(er_X_test)
# Evaluate the model
accuracy = accuracy_score(er_y_test, er_y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Display classification report
print("Classification Report:\n", classification_report(er_y_test, er_y_pred))
Confusion Matrix
# Confusion matrix for the emotional-regulation SVM on the held-out test set
conf_matrix4 = confusion_matrix(er_y_test, er_y_pred)
# Define class labels
class_labels = [0, 1]
# Create annotated heatmap using plotly.figure_factory
fig = ff.create_annotated_heatmap(conf_matrix4, x=class_labels, y=class_labels, colorscale='PiYG')
# Add title
fig.update_layout(title_text='Emotional Regulation Confusion Matrix')
# Add custom x/y axis titles as paper-anchored annotations
fig.add_annotation(dict(font=dict(color="black", size=14), x=0.5, y=-0.15, showarrow=False, text="Predicted label", xref="paper", yref="paper"))
fig.add_annotation(dict(font=dict(color="black", size=14),x=-0.15,y=0.5,showarrow=False,text="Actual label",textangle=-90,xref="paper",yref="paper"))
# Adjust margins to make room for y-axis title
fig.update_layout(
    margin=dict(t=50, l=200),
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Display the plot
fig.show()
Cross Validation
# Sweep k = 1..max_k top features and score each candidate subset with
# 10-fold cross-validation, then refit a selector with the best k.
# Split the data into training and testing sets (this 90/10 split is reused
# by the ROC-curve cell below).
er_X_train, er_X_test, er_y_train, er_y_test = train_test_split(er_X, er_y, test_size=0.1, random_state=42)
# Feature selection
max_k = 12  # Set the maximum k value to test
avg_accuracies = []
for k in range(1, max_k + 1):
    # Rank features by ANOVA F-score on the training split only
    selector = SelectKBest(f_classif, k=k)
    selector.fit(er_X_train, er_y_train)
    # Get selected feature indices
    selected_feature_indices = selector.get_support(indices=True)
    # Apply feature selection to training and test sets
    er_X_train_selected = er_X_train[:, selected_feature_indices]
    er_X_test_selected = er_X_test[:, selected_feature_indices]
    # Standardize the features
    scaler = StandardScaler()
    er_X_train_selected = scaler.fit_transform(er_X_train_selected)
    er_X_test_selected = scaler.transform(er_X_test_selected)
    # K-fold cross-validation over the FULL data set.
    # NOTE(review): folds come from er_X/er_y, but the feature subset was
    # chosen on er_X_train above, which overlaps every fold's test portion —
    # possible selection leakage; confirm this is intended.
    n_splits = 10
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    avg_acc = 0
    for train_index, test_index in kfold.split(er_X):
        # Extract training and test sets
        X_train, X_test = er_X[train_index], er_X[test_index]
        y_train, y_test = er_y[train_index], er_y[test_index]
        # Apply feature selection
        X_train_selected = X_train[:, selected_feature_indices]
        X_test_selected = X_test[:, selected_feature_indices]
        # Standardize per fold (refits the same scaler object each iteration)
        X_train_selected = scaler.fit_transform(X_train_selected)
        X_test_selected = scaler.transform(X_test_selected)
        # Create and train the model
        svm_model = SVC(kernel='poly')
        svm_model.fit(X_train_selected, y_train)
        # Make predictions on the test set
        y_pred = svm_model.predict(X_test_selected)
        # Accumulate fold accuracy
        acc = accuracy_score(y_test, y_pred)
        avg_acc += acc
    # Mean accuracy across the folds for this k
    avg_accuracy = avg_acc / n_splits
    avg_accuracies.append(avg_accuracy)
# Find k with best accuracy (index i corresponds to k = i + 1)
max_accuracy_index = max(range(len(avg_accuracies)), key=avg_accuracies.__getitem__)
# Find the second max accuracy index
second_best_k = None
for i, acc in enumerate(avg_accuracies):
    if i != max_accuracy_index and (second_best_k is None or acc > avg_accuracies[second_best_k]):
        second_best_k = i
# NOTE(review): best_k_second is computed but never used below — the final
# selector is refit with the best k, not the runner-up.
if second_best_k is not None:
    best_k_second = second_best_k + 1
# Refit the selector at the best k found by the sweep
selector = SelectKBest(f_classif, k=max_accuracy_index + 1)
selector.fit(er_X_train, er_y_train)
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [er_features[i] for i in selected_feature_indices]
# NOTE(review): these selected matrices are NOT re-standardized here; the ROC
# cell below trains on them as-is — confirm that is intentional.
er_X_train_selected = er_X_train[:, selected_feature_indices]
er_X_test_selected = er_X_test[:, selected_feature_indices]
print(f"Average Accuracy: {round(avg_accuracies[max_accuracy_index], 2)} for k={max_accuracy_index + 1}")
print(f"Selected Features for k={max_accuracy_index + 1}: {selected_feature_names}")
# Line plot of mean cross-validation accuracy as a function of the number of
# selected top-k features.
k_values = list(range(1, max_k + 1))
fig = go.Figure(
    go.Scatter(x=k_values, y=avg_accuracies, mode='lines+markers', line=dict(color='#e8b0d3'))
)
fig.update_layout(
    title='Emotional Regulation Avg Accuracy vs. Top-k Features',
    xaxis=dict(title='Top-k Features'),
    yaxis=dict(title='Average Accuracy'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500,
)
fig.show()
ROC Curve
# Refit the poly-kernel SVM on the selected features with probability
# estimates enabled, then plot the ROC curve for the held-out split.
svm_model = SVC(kernel='poly', probability=True)
svm_model.fit(er_X_train_selected, er_y_train)

# Probability of the positive class (label 1) for each test sample
y_probs = svm_model.predict_proba(er_X_test_selected)[:, 1]

fpr, tpr, thresholds = roc_curve(er_y_test, y_probs)
roc_auc = auc(fpr, tpr)

# ROC curve plus the chance-level diagonal
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC curve (AUC = {roc_auc:.2f})', line=dict(color='#e8b0d3')))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Baseline', line=dict(dash='dash')))
fig.update_layout(
    title='Emotional Regulation ROC Curve',
    xaxis=dict(title='False Positive Rate (FPR)', range=[-0.02, 1.02]),
    yaxis=dict(title='True Positive Rate (TPR)'),
    legend=dict(x=0.43, y=0.03, traceorder='normal'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500,
)
fig.show()
Self-Discipline
Subset Data
# Build the self-discipline feature matrix: for every participant record,
# keep the values whose keys appear in sd_features, in the record's own order.
sd_features = ["Sbowl_created_rem", "Sbowl_washed_rem", "Sorder_collected_rem", "Sorder_given_rem", "Spre_post",
               "Sratio_hit", "Stotal_time_drawing", "Sredo", "Scolor_switch", "Sdistracted_ratio", "Stime_distracted_ratio", "Scue_distracted_ratio"]
sd_X = np.array([
    [val for var, val in data.items() if var in sd_features]
    for data in p_features.values()
])
sd_X
# Collect the 'self discipline' questionnaire score of every participant,
# in q_scores iteration order.
sd_y = np.array([
    val
    for data in q_scores.values()
    for var, val in data.items()
    if var == 'self discipline'
])
sd_y
Descriptive Statistics for y
# Summary statistics for the self-discipline questionnaire scores.
mean_value = np.mean(sd_y)
median_value = np.median(sd_y)
std_deviation = np.std(sd_y)
min_value = np.min(sd_y)
max_value = np.max(sd_y)
q3 = np.percentile(sd_y, 75)
summary_lines = [
    f"Mean: {mean_value:.2f}",
    f"Median: {median_value}",
    f"Standard Deviation: {std_deviation:.2f}",
    f"Minimum Value: {min_value}",
    f"Maximum Value: {max_value}",
    f"Third Quartile: {q3}",
]
print("\n".join(summary_lines))
# Wrap the scores in a DataFrame for the distplot helper
scores5 = {'Scores': sd_y}
scores5 = pd.DataFrame(scores5)
# Create a distplot using plotly.figure_factory (histogram only: rug and KDE
# curve suppressed)
fig = ff.create_distplot([scores5['Scores']], group_labels=['distplot'], colors=['#d01c8b'], show_rug=False, show_curve=False)
# Add title
fig.update_layout(
    title_text='Distribution of Self Discipline Scores',
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Constrain the x-axis (score) range
fig.update_xaxes(range=[13, 65])
# Vertical red line marking the score threshold used for binarization later
# (< 47 -> class 0, >= 47 -> class 1)
threshold = 47
fig.add_shape(
    go.layout.Shape(
        type='line',
        x0=threshold,
        x1=threshold,
        y0=0,
        y1=0.16,
        line=dict(color='red', width=2)
    )
)
# Display the plot
fig.show()
Correlation Matrix
# Pairwise Pearson correlations between the columns of sd_X, shown as a
# diverging heatmap fixed to the [-1, 1] range.
sd_correlation_matrix = np.corrcoef(sd_X, rowvar=False)
heatmap = go.Heatmap(
    z=sd_correlation_matrix,
    x=sd_features,
    y=sd_features,
    colorscale="PiYG",
    zmin=-1,
    zmax=1,
)
fig = go.Figure(data=heatmap)
fig.update_layout(
    title="Self Discipline Correlation Matrix of Features",
    xaxis=dict(title="Features"),
    yaxis=dict(title="Features"),
    paper_bgcolor='rgba(0,0,0,0)',
    width=600,
    height=600,
)
fig.show()
Feature Selection
# Univariate screening: rank features by ANOVA F-score on a 90% training
# split and report the single best one.
# NOTE(review): sd_y has not been binarized yet at this point, so f_classif
# treats each raw score as its own class — confirm this is intended.
sd_X_train, sd_X_test, sd_y_train, sd_y_test = train_test_split(sd_X, sd_y, test_size=0.1, random_state=42)

selector = SelectKBest(f_classif, k=1)
selector.fit(sd_X_train, sd_y_train)
sd_selector_scores = selector.scores_
sd_selector_pvalues = selector.pvalues_

# Restrict the report to the selected feature(s) only
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [sd_features[i] for i in selected_feature_indices]
sd_selector_results = pd.DataFrame({
    'Feature': selected_feature_names,
    'Score': sd_selector_scores[selected_feature_indices],
    'P-value': sd_selector_pvalues[selected_feature_indices],
}).sort_values(by='Score', ascending=False)
sd_selector_results
SVM
# Binarize self-discipline scores: below the threshold of 47 -> class 0,
# at or above it -> class 1.
sd_y = np.where(sd_y < 47, 0, 1)
sd_y
# Hold out 20% of the samples, standardize using training-set statistics only,
# then fit and score an RBF-kernel SVM classifier.
sd_X_train, sd_X_test, sd_y_train, sd_y_test = train_test_split(sd_X, sd_y, test_size=0.2, random_state=42)

scaler = StandardScaler()
sd_X_train = scaler.fit_transform(sd_X_train)
sd_X_test = scaler.transform(sd_X_test)

sd_svm = SVC(kernel='rbf')
sd_svm.fit(sd_X_train, sd_y_train)
sd_y_pred = sd_svm.predict(sd_X_test)

# Report overall accuracy plus per-class precision/recall/F1.
accuracy = accuracy_score(sd_y_test, sd_y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(sd_y_test, sd_y_pred))
Confusion Matrix
# Calculate confusion matrix
conf_matrix5 = confusion_matrix(sd_y_test, sd_y_pred)
# Define class labels
class_labels = [0, 1]
# Create annotated heatmap using plotly.figure_factory
fig = ff.create_annotated_heatmap(conf_matrix5, x=class_labels, y=class_labels, colorscale='PiYG')
# Replace the auto-generated cell annotations with the raw matrix values.
# NOTE(review): create_annotated_heatmap draws the matrix with its y-axis
# reversed, while this override places text at (x=j, y=i) directly — verify
# the numbers land on the intended cells.
fig.update_layout(annotations=[dict(text=str(val), x=j, y=i, showarrow=False)
                               for i, row in enumerate(conf_matrix5)
                               for j, val in enumerate(row)])
# Add title
fig.update_layout(title_text='Self Discipline Confusion Matrix')
# Add custom x-y-titles (paper-anchored, since the heatmap has no axis titles)
fig.add_annotation(dict(font=dict(color="black", size=14), x=0.5, y=-0.15, showarrow=False, text="Predicted label", xref="paper", yref="paper"))
fig.add_annotation(dict(font=dict(color="black", size=14),x=-0.15,y=0.5,showarrow=False,text="Actual label",textangle=-90,xref="paper",yref="paper"))
# Adjust margins to make room for the rotated y-axis title
fig.update_layout(
    margin=dict(t=50, l=200),
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Display the plot
fig.show()
Cross Validation
# Sweep k = 1..max_k top features and score each candidate subset with
# 10-fold cross-validation, then refit a selector with the best k.
# Split the data into training and testing sets (this 90/10 split is reused
# by the ROC-curve cell below).
sd_X_train, sd_X_test, sd_y_train, sd_y_test = train_test_split(sd_X, sd_y, test_size=0.1, random_state=42)
# Feature selection
max_k = 12  # Set the maximum k value to test
avg_accuracies = []
for k in range(1, max_k + 1):
    # Rank features by ANOVA F-score on the training split only
    selector = SelectKBest(f_classif, k=k)
    selector.fit(sd_X_train, sd_y_train)
    # Get selected feature indices
    selected_feature_indices = selector.get_support(indices=True)
    # Apply feature selection to training and test sets
    sd_X_train_selected = sd_X_train[:, selected_feature_indices]
    sd_X_test_selected = sd_X_test[:, selected_feature_indices]
    # Standardize the features
    scaler = StandardScaler()
    sd_X_train_selected = scaler.fit_transform(sd_X_train_selected)
    sd_X_test_selected = scaler.transform(sd_X_test_selected)
    # K-fold cross-validation over the FULL data set.
    # NOTE(review): folds come from sd_X/sd_y, but the feature subset was
    # chosen on sd_X_train above, which overlaps every fold's test portion —
    # possible selection leakage; confirm this is intended.
    n_splits = 10
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    avg_acc = 0
    for train_index, test_index in kfold.split(sd_X):
        # Extract training and test sets
        X_train, X_test = sd_X[train_index], sd_X[test_index]
        y_train, y_test = sd_y[train_index], sd_y[test_index]
        # Apply feature selection
        X_train_selected = X_train[:, selected_feature_indices]
        X_test_selected = X_test[:, selected_feature_indices]
        # Standardize per fold (refits the same scaler object each iteration)
        X_train_selected = scaler.fit_transform(X_train_selected)
        X_test_selected = scaler.transform(X_test_selected)
        # Create and train the model
        svm_model = SVC(kernel='rbf')
        svm_model.fit(X_train_selected, y_train)
        # Make predictions on the test set
        y_pred = svm_model.predict(X_test_selected)
        # Accumulate fold accuracy
        acc = accuracy_score(y_test, y_pred)
        avg_acc += acc
    # Mean accuracy across the folds for this k
    avg_accuracy = avg_acc / n_splits
    avg_accuracies.append(avg_accuracy)
# Find k with best accuracy (index i corresponds to k = i + 1)
max_accuracy_index = max(range(len(avg_accuracies)), key=avg_accuracies.__getitem__)
# Find the second max accuracy index
second_best_k = None
for i, acc in enumerate(avg_accuracies):
    if i != max_accuracy_index and (second_best_k is None or acc > avg_accuracies[second_best_k]):
        second_best_k = i
# NOTE(review): best_k_second is computed but never used below — the final
# selector is refit with the best k, not the runner-up.
if second_best_k is not None:
    best_k_second = second_best_k + 1
# Refit the selector at the best k found by the sweep
selector = SelectKBest(f_classif, k=max_accuracy_index + 1)
selector.fit(sd_X_train, sd_y_train)
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [sd_features[i] for i in selected_feature_indices]
# NOTE(review): these selected matrices are NOT re-standardized here; the ROC
# cell below trains on them as-is — confirm that is intentional.
sd_X_train_selected = sd_X_train[:, selected_feature_indices]
sd_X_test_selected = sd_X_test[:, selected_feature_indices]
print(f"Average Accuracy: {round(avg_accuracies[max_accuracy_index], 2)} for k={max_accuracy_index + 1}")
print(f"Selected Features for k={max_accuracy_index + 1}: {selected_feature_names}")
# Line plot of mean cross-validation accuracy as a function of the number of
# selected top-k features.
k_values = list(range(1, max_k + 1))
fig = go.Figure(
    go.Scatter(x=k_values, y=avg_accuracies, mode='lines+markers', line=dict(color='#e8b0d3'))
)
fig.update_layout(
    title='Self Discipline Avg Accuracy vs. Top-k Features',
    xaxis=dict(title='Top-k Features'),
    yaxis=dict(title='Average Accuracy'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500,
)
fig.show()
ROC Curve
# Refit the RBF-kernel SVM on the selected features with probability
# estimates enabled, then plot the ROC curve for the held-out split.
svm_model = SVC(kernel='rbf', probability=True)
svm_model.fit(sd_X_train_selected, sd_y_train)

# Probability of the positive class (label 1) for each test sample
y_probs = svm_model.predict_proba(sd_X_test_selected)[:, 1]

fpr, tpr, thresholds = roc_curve(sd_y_test, y_probs)
roc_auc = auc(fpr, tpr)

# ROC curve plus the chance-level diagonal
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC curve (AUC = {roc_auc:.2f})', line=dict(color='#e8b0d3')))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Baseline', line=dict(dash='dash')))
fig.update_layout(
    title='Self Discipline ROC Curve',
    xaxis=dict(title='False Positive Rate (FPR)', range=[-0.02, 1.02]),
    yaxis=dict(title='True Positive Rate (TPR)'),
    legend=dict(x=0.43, y=0.03, traceorder='normal'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500,
)
fig.show()
Procrastination
Subset Data
# Build the procrastination feature matrix: for every participant record,
# keep the values whose keys appear in pro_features, in the record's own order.
pro_features = ['pre_post', "ratio_hit", "total_time_drawing", "redo", "color_switch", 'bowl_created_rem', 'bowl_washed_rem', 'order_collected_rem', 'order_given_rem', "distracted_ratio", "time_distracted_ratio", "cue_distracted_ratio", "Ebowl_created_rem", "Ebowl_washed_rem", "Eorder_collected_rem", "Eorder_given_rem", "Epre_post", "Eratio_hit", "Etotal_time_drawing", "Eredo", "Ecolor_switch", "Edistracted_ratio", "Etime_distracted_ratio", "Ecue_distracted_ratio", "Sbowl_created_rem", "Sbowl_washed_rem", "Sorder_collected_rem", "Sorder_given_rem", "Spre_post", "Sratio_hit", "Stotal_time_drawing", "Sredo", "Scolor_switch", "Sdistracted_ratio", "Stime_distracted_ratio", "Scue_distracted_ratio"]
pro_X = np.array([
    [val for var, val in data.items() if var in pro_features]
    for data in p_features.values()
])
pro_X
# Collect the 'procrastination' questionnaire score of every participant,
# in q_scores iteration order.
pro_y = np.array([
    val
    for data in q_scores.values()
    for var, val in data.items()
    if var == 'procrastination'
])
pro_y
Descriptive Statistics for y
# Summary statistics for the procrastination questionnaire scores.
mean_value = np.mean(pro_y)
median_value = np.median(pro_y)
std_deviation = np.std(pro_y)
min_value = np.min(pro_y)
max_value = np.max(pro_y)
q3 = np.percentile(pro_y, 75)
summary_lines = [
    f"Mean: {mean_value:.2f}",
    f"Median: {median_value}",
    f"Standard Deviation: {std_deviation:.2f}",
    f"Minimum Value: {min_value}",
    f"Maximum Value: {max_value}",
    f"Third Quartile: {q3}",
]
print("\n".join(summary_lines))
# Wrap the scores in a DataFrame for the distplot helper
scores6 = {'Scores': pro_y}
scores6 = pd.DataFrame(scores6)
# Create a distplot using plotly.figure_factory (histogram only: rug and KDE
# curve suppressed)
fig = ff.create_distplot([scores6['Scores']], group_labels=['distplot'], colors=['#d01c8b'], show_rug=False, show_curve=False)
# Add title
fig.update_layout(
    title_text='Distribution of Procrastination Scores',
    paper_bgcolor='rgba(0,0,0,0)',
    width=600
)
# Constrain the x-axis (score) range
fig.update_xaxes(range=[12, 60])
# Vertical red line marking the score threshold used for binarization later
# (< 44 -> class 0, >= 44 -> class 1)
threshold = 44
fig.add_shape(
    go.layout.Shape(
        type='line',
        x0=threshold,
        x1=threshold,
        y0=0,
        y1=0.16,
        line=dict(color='red', width=2)
    )
)
# Display the plot
fig.show()
Correlation Matrix
# Pairwise Pearson correlations between the columns of pro_X, shown as a
# diverging heatmap fixed to the [-1, 1] range.
pro_correlation_matrix = np.corrcoef(pro_X, rowvar=False)
heatmap = go.Heatmap(
    z=pro_correlation_matrix,
    x=pro_features,
    y=pro_features,
    colorscale="PiYG",
    zmin=-1,
    zmax=1,
)
fig = go.Figure(data=heatmap)
fig.update_layout(
    title="Procrastination Correlation Matrix of Features",
    xaxis=dict(title="Features"),
    yaxis=dict(title="Features"),
    paper_bgcolor='rgba(0,0,0,0)',
    width=650,
    height=650,
)
fig.show()
Feature Selection
# Univariate screening: rank features by ANOVA F-score on a 90% training
# split and report the top 5.
# NOTE(review): pro_y has not been binarized yet at this point, so f_classif
# treats each raw score as its own class — confirm this is intended.
pro_X_train, pro_X_test, pro_y_train, pro_y_test = train_test_split(pro_X, pro_y, test_size=0.1, random_state=42)

selector = SelectKBest(f_classif, k=5)
selector.fit(pro_X_train, pro_y_train)
pro_selector_scores = selector.scores_
pro_selector_pvalues = selector.pvalues_

# Restrict the report to the selected features only
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [pro_features[i] for i in selected_feature_indices]
pro_selector_results = pd.DataFrame({
    'Feature': selected_feature_names,
    'Score': pro_selector_scores[selected_feature_indices],
    'P-value': pro_selector_pvalues[selected_feature_indices],
}).sort_values(by='Score', ascending=False)
pro_selector_results
SVM
# Binarize procrastination scores: below the threshold of 44 -> class 0,
# at or above it -> class 1.
pro_y = np.where(pro_y < 44, 0, 1)
pro_y
# Split the data into training and testing sets
pro_X_train, pro_X_test, pro_y_train, pro_y_test = train_test_split(pro_X, pro_y, test_size=0.2, random_state=42)
# Scaling is disabled below, so this model is fit on raw feature values.
# NOTE(review): the other two SVMs standardize their inputs and an RBF kernel
# is scale-sensitive — confirm leaving these lines commented out is intended.
# # Normalization
# min_max_scaler = MinMaxScaler()
# pro_X_train = min_max_scaler.fit_transform(pro_X_train) # X_train
# pro_X_test = min_max_scaler.fit_transform(pro_X_test) # X_test
# # Standardize the features by removing the mean and scaling to unit variance
# scaler = StandardScaler()
# pro_X_train = scaler.fit_transform(pro_X_train)
# pro_X_test = scaler.transform(pro_X_test)
# Create an SVM model (RBF kernel)
pro_svm = SVC(kernel='rbf')
# Train the model
pro_svm.fit(pro_X_train, pro_y_train)
# Make predictions on the held-out 20%
pro_y_pred = pro_svm.predict(pro_X_test)
# Evaluate the model
accuracy = accuracy_score(pro_y_test, pro_y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Display classification report
print("Classification Report:\n", classification_report(pro_y_test, pro_y_pred))
Target Distribution
# Pie chart of the binary procrastination class balance.
# (Fixes the stray "0" expression that was fused onto the first comment.)
# Count the occurrences of each class; convert to a list once instead of twice.
label_list = pro_y.tolist()
class_counts = [label_list.count(0), label_list.count(1)]
# Calculate percentages
total_samples = len(pro_y)
percentages = [count / total_samples * 100 for count in class_counts]
# Create a Pie chart
fig = px.pie(names=['Not a Procrastinator (0)', 'Procrastinator (1)'], values=percentages, title='Procrastination Scores Distribution',
             labels={'labels': 'Score', 'values': 'Percentage'},
             color_discrete_sequence=['#d01c8b', '#e8b0d3'])
fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)'
)
# Show percentage values inside pie slices
fig.update_traces(textinfo='percent+label')
# Hide the legend
fig.update_layout(showlegend=False)
# Show the plot
fig.show()
Confusion Matrix
# Confusion matrix for the procrastination classifier, rendered as an
# annotated Plotly heatmap.
conf_matrix6 = confusion_matrix(pro_y_test, pro_y_pred)
class_labels = [0, 1]
fig = ff.create_annotated_heatmap(conf_matrix6, x=class_labels, y=class_labels, colorscale='PiYG')
fig.update_layout(title_text='Procrastination Confusion Matrix')
# The annotated heatmap has no built-in axis titles, so add the
# predicted/actual labels as paper-anchored annotations.
fig.add_annotation(dict(font=dict(color="black", size=14), x=0.5, y=-0.15, showarrow=False, text="Predicted label", xref="paper", yref="paper"))
fig.add_annotation(dict(font=dict(color="black", size=14), x=-0.15, y=0.5, showarrow=False, text="Actual label", textangle=-90, xref="paper", yref="paper"))
# Widen the left margin so the rotated "Actual label" annotation fits.
fig.update_layout(margin=dict(t=50, l=200), paper_bgcolor='rgba(0,0,0,0)', width=600)
fig.show()
Cross Validation
# Sweep k = 1..max_k top features and score each candidate subset with
# 10-fold cross-validation, then refit a selector with the best k.
# Split the data into training and testing sets (this 90/10 split is reused
# by the ROC-curve cell below).
pro_X_train, pro_X_test, pro_y_train, pro_y_test = train_test_split(pro_X, pro_y, test_size=0.1, random_state=42)
# Feature selection
max_k = 36  # Set the maximum k value to test (all pro_features columns)
avg_accuracies = []
for k in range(1, max_k + 1):
    # Rank features by ANOVA F-score on the training split only
    selector = SelectKBest(f_classif, k=k)
    selector.fit(pro_X_train, pro_y_train)
    # Get selected feature indices
    selected_feature_indices = selector.get_support(indices=True)
    # Apply feature selection to training and test sets
    pro_X_train_selected = pro_X_train[:, selected_feature_indices]
    pro_X_test_selected = pro_X_test[:, selected_feature_indices]
    # Standardize the features
    scaler = StandardScaler()
    pro_X_train_selected = scaler.fit_transform(pro_X_train_selected)
    pro_X_test_selected = scaler.transform(pro_X_test_selected)
    # K-fold cross-validation over the FULL data set.
    # NOTE(review): folds come from pro_X/pro_y, but the feature subset was
    # chosen on pro_X_train above, which overlaps every fold's test portion —
    # possible selection leakage; confirm this is intended.
    n_splits = 10
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    avg_acc = 0
    for train_index, test_index in kfold.split(pro_X):
        # Extract training and test sets
        X_train, X_test = pro_X[train_index], pro_X[test_index]
        y_train, y_test = pro_y[train_index], pro_y[test_index]
        # Apply feature selection
        X_train_selected = X_train[:, selected_feature_indices]
        X_test_selected = X_test[:, selected_feature_indices]
        # Standardize per fold (refits the same scaler object each iteration)
        X_train_selected = scaler.fit_transform(X_train_selected)
        X_test_selected = scaler.transform(X_test_selected)
        # Create and train the model
        svm_model = SVC(kernel='sigmoid')
        svm_model.fit(X_train_selected, y_train)
        # Make predictions on the test set
        y_pred = svm_model.predict(X_test_selected)
        # Accumulate fold accuracy
        acc = accuracy_score(y_test, y_pred)
        avg_acc += acc
    # Mean accuracy across the folds for this k
    avg_accuracy = avg_acc / n_splits
    avg_accuracies.append(avg_accuracy)
# Find k with best accuracy (index i corresponds to k = i + 1)
max_accuracy_index = max(range(len(avg_accuracies)), key=avg_accuracies.__getitem__)
# Find the second max accuracy index
second_best_k = None
for i, acc in enumerate(avg_accuracies):
    if i != max_accuracy_index and (second_best_k is None or acc > avg_accuracies[second_best_k]):
        second_best_k = i
# NOTE(review): best_k_second is computed but never used below — the final
# selector is refit with the best k, not the runner-up.
if second_best_k is not None:
    best_k_second = second_best_k + 1
# Refit the selector at the best k found by the sweep
selector = SelectKBest(f_classif, k=max_accuracy_index + 1)
selector.fit(pro_X_train, pro_y_train)
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = [pro_features[i] for i in selected_feature_indices]
# NOTE(review): these selected matrices are NOT re-standardized here; the ROC
# cell below trains on them as-is — confirm that is intentional.
pro_X_train_selected = pro_X_train[:, selected_feature_indices]
pro_X_test_selected = pro_X_test[:, selected_feature_indices]
print(f"Average Accuracy: {round(avg_accuracies[max_accuracy_index], 2)} for k={max_accuracy_index + 1}")
print(f"Selected Features for k={max_accuracy_index + 1}: {selected_feature_names}")
# Line plot of mean cross-validation accuracy as a function of the number of
# selected top-k features.
k_values = list(range(1, max_k + 1))
fig = go.Figure(
    go.Scatter(x=k_values, y=avg_accuracies, mode='lines+markers', line=dict(color='#e8b0d3'))
)
fig.update_layout(
    title='Procrastination Avg Accuracy vs. Top-k Features',
    xaxis=dict(title='Top-k Features'),
    yaxis=dict(title='Average Accuracy'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500,
)
fig.show()
ROC Curve
# Refit the sigmoid-kernel SVM on the selected features with probability
# estimates enabled, then plot the ROC curve for the held-out split.
svm_model = SVC(kernel='sigmoid', probability=True)
svm_model.fit(pro_X_train_selected, pro_y_train)

# Probability of the positive class (label 1) for each test sample
y_probs = svm_model.predict_proba(pro_X_test_selected)[:, 1]

fpr, tpr, thresholds = roc_curve(pro_y_test, y_probs)
roc_auc = auc(fpr, tpr)

# ROC curve plus the chance-level diagonal
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC curve (AUC = {roc_auc:.2f})', line=dict(color='#e8b0d3')))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Baseline', line=dict(dash='dash')))
fig.update_layout(
    title='Procrastination ROC Curve',
    xaxis=dict(title='False Positive Rate (FPR)', range=[-0.02, 1.02]),
    yaxis=dict(title='True Positive Rate (TPR)'),
    legend=dict(x=0.43, y=0.03, traceorder='normal'),
    paper_bgcolor='rgba(0,0,0,0)',
    width=500,
)
fig.show()