import pandas as pd
# Read the example dataset directly from its hosted CSV file.
DATA_URL = 'https://raw.githubusercontent.com/Statology/Python-Guides/main/default.csv'
df = pd.read_csv(DATA_URL)
# Count the distinct values of the response column (nunique ignores missing
# values by default) and report the count.
unique_vals = df['default'].nunique()
print(f"Number of unique values in response: {unique_vals}")
# Run to view results
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Prepare data for the model: two predictors and the response column.
# NOTE(review): the residual arithmetic below assumes `default` is coded
# numerically (0/1) -- confirm against the CSV.
X = df[['balance', 'income']]
y = df['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the model
model = LogisticRegression().fit(X_train, y_train)
# Predicted probabilities: column 1 of predict_proba corresponds to the
# second entry of model.classes_ (the positive class for a 0/1 response).
y_pred_prob = model.predict_proba(X_train)[:, 1]
# Raw residuals (observed minus predicted probability). y_pred_prob is a
# plain array, so the subtraction is positional and the result keeps
# y_train's original row labels.
residuals = y_train - y_pred_prob
# train_test_split shuffles rows by default, so the positional order of the
# training set is random. To look for patterns over the order of measurement,
# put the residuals back into original row order and plot against the
# original row index (treated here as the time/order of measurement).
residuals_by_order = residuals.sort_index()
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x=residuals_by_order.index.to_numpy(),
    y=residuals_by_order.to_numpy(),
)
# Reference line at zero: residuals should scatter around it with no trend.
plt.axhline(y=0, color='red', linestyle='--')
plt.xlabel("Observation Order")
plt.ylabel("Residual")
plt.title("Residuals vs. Time/Order")
plt.show()
# Run to view results
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
# "features" holds only the predictor columns; a constant column is added so
# each VIF is computed for a model that includes an intercept.
features = df[['balance', 'income']]
X_vif = add_constant(features)
design = X_vif.values
# One row per design-matrix column (the added constant included), pairing the
# column name with its variance inflation factor.
vif_data = pd.DataFrame({
    'Feature': X_vif.columns,
    'VIF': [variance_inflation_factor(design, col) for col in range(design.shape[1])],
})
print(vif_data)
# Run to view results
# Pairwise correlations among the predictor columns, drawn as an annotated
# heatmap with values shown to two decimals.
corr_matrix = features.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap="coolwarm")
plt.title('Predictor Correlation Matrix')
plt.show()
# Run to view results
import statsmodels.api as sm
import numpy as np
# Refit the same model with statsmodels, which exposes influence diagnostics.
# The intercept column has to be added explicitly.
X_sm = sm.add_constant(X)
logit_model = sm.Logit(y, X_sm).fit(disp=0)
# Influence diagnostics; cooks_distance yields a (distances, p-values) pair.
influence = logit_model.get_influence()
cooks_d, p_value = influence.cooks_distance
# Rule-of-thumb cutoff of 4/n, used for both the plot and the flagging below.
cutoff = 4 / len(df)
obs_positions = np.arange(len(cooks_d))
# Stem plot of Cook's distance per observation with the cutoff drawn in red.
plt.figure(figsize=(10, 6))
plt.stem(obs_positions, cooks_d, markerfmt=",")
plt.axhline(y=cutoff, color="red", linestyle="--", label="4/n Threshold")
plt.title("Cook's Distance")
plt.xlabel("Observation Index")
plt.ylabel("Cook's Distance")
plt.legend()
plt.show()
# Positions of the observations whose Cook's distance exceeds the cutoff.
exceeds_cutoff = cooks_d > cutoff
influential_indices = np.where(exceeds_cutoff)[0]
print("Influential points (indices):", influential_indices)
# Run to view results
# Work on a copy so the extra columns do not leak into the original dataframe.
df_bt = df.copy()
continuous_vars = ['balance', 'income']
# Box-Tidwell term for each continuous predictor: x * log(x).
for col in continuous_vars:
    # Tiny offset keeps log() finite when a value is exactly zero.
    shifted = df_bt[col] + 1e-9
    df_bt[f'{col}_log_{col}'] = df_bt[col] * np.log(shifted)
# Design matrix: the original predictors, then their x*log(x) terms, plus an
# intercept column.
bt_terms = [f'{col}_log_{col}' for col in continuous_vars]
X_bt = sm.add_constant(df_bt[continuous_vars + bt_terms])
y_bt = df_bt['default']
model_bt = sm.Logit(y_bt, X_bt).fit(disp=0)
print(model_bt.summary())
# Run to view results
# Predicted probabilities from the statsmodels fit on the full data.
pred_prob = logit_model.predict(X_sm)
# Convert probabilities to log-odds: log(p / (1 - p)).
odds = pred_prob / (1 - pred_prob)
log_odds = np.log(odds)
# Scatter of balance against the fitted log-odds, overlaid with a LOWESS
# smoother in red.
balance = df['balance']
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=balance, y=log_odds, alpha=0.5, ax=ax)
sns.regplot(
    x=balance,
    y=log_odds,
    scatter=False,
    lowess=True,
    line_kws={'color': 'red'},
    ax=ax,
)
ax.set_xlabel('Balance')
ax.set_ylabel('Logit (log-odds)')
ax.set_title('Balance vs. Logit')
plt.show()
# Run to view results
# Events-per-variable (EPV) check: compare the number of observations in the
# less frequent outcome class with the number of predictors.
# Counting via value_counts() takes the true minority class whichever label
# it carries; summing the column would count the positive class even when it
# is the majority.
class_counts = df['default'].value_counts()
# With fewer than two observed classes there are no minority-class events.
events = class_counts.min() if len(class_counts) > 1 else 0
variables = X.shape[1]  # Number of predictor variables
events_per_variable = events / variables
print(f"Number of events (minority class): {events}")
print(f"Number of predictor variables: {variables}")
print(f"Events per variable: {events_per_variable:.2f}")
# Rule of thumb: at least 10 minority-class events per predictor.
if events_per_variable < 10:
    print("\nWarning: Sample size may be insufficient based on the 10 events per predictor rule.")
else:
    print("\nSample size appears to be sufficient.")
# Run to view results
from IPython.display import IFrame, display
# Embed the hosted slide deck inline in the notebook output as a 476x400 frame.
slides_frame = IFrame(
    src="https://www.slideshare.net/slideshow/embed_code/key/algQmXUrmDTzOt?hostedIn=slideshare&page=upload",
    width="476",
    height="400",
)
display(slides_frame)
# Run to view results