# import libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
# Global plot styling: dark grid background with seaborn's "deep" palette.
sns.set_theme(style="darkgrid", palette="deep")
# Read the streamer dataset from disk and preview the first rows.
df = pd.read_csv("twitch-streamers.csv")
df.head()
# Inspect the dtype pandas inferred for every column.
df.dtypes
# Recode the Partnered and Mature flags as 0/1 integers: entries equal to
# True become 1, every other entry becomes 0.
df['Partnered'] = df['Partnered'].isin([True]).astype(np.int64)
df['Mature'] = df['Mature'].isin([True]).astype(np.int64)
# Print column names, non-null counts and dtypes after the recoding.
df.info()
# Drop the columns the original author identified as non-numeric so the
# remaining frame can be fed to a correlation routine.
df_numeric = df.drop(['Channel', 'Language'], axis=1)
# Correlation matrix; rowvar=False treats each column as one variable.
corrCoeff = np.corrcoef(df_numeric, rowvar=False)
# Tick positions follow the actual number of columns instead of a fixed
# count, so the labels stay aligned if columns are added or removed.
# The +0.5 offset places each label at the centre of its heatmap cell.
tick_positions = np.arange(len(df_numeric.columns)) + 0.5
plt.figure(figsize=(10,6))
# Annotated heat map of the correlation matrix.
sns.heatmap(corrCoeff, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap", fontsize=15)
plt.xticks(tick_positions, df_numeric.columns, rotation=45, ha='right')
plt.yticks(tick_positions, df_numeric.columns, rotation=0)
plt.show()
# Independent copy holding the response and its three candidate predictors,
# so later edits do not touch the source DataFrame.
df_model = df[['Average viewers', 'Watch time', 'Stream time', 'Followers']].copy()
# Response variable.
y = df_model['Average viewers']
# Design matrix: every remaining column plus an intercept term.
X = sm.add_constant(df_model.drop(columns=['Average viewers']))
# Ordinary least squares on the untransformed values.
model1 = sm.OLS(y, X)
results = model1.fit()
# Regression summary table.
results.summary()
# Variance inflation factor for every column of the design matrix X
# (the intercept column is part of X and is therefore included).
vif_data = pd.DataFrame({
    "variable": X.columns,
    "VIF": [variance_inflation_factor(X.values, col_idx)
            for col_idx in range(X.shape[1])],
})
vif_data
# Pairwise scatter plots and per-variable distributions of the model columns.
sns.pairplot(df_model)
# Fresh copy of the modelling columns; natural-log versions of three of them
# are appended as new columns (Stream time is left untransformed).
df_log = df[['Average viewers', 'Watch time', 'Stream time', 'Followers']].copy()
df_log['log(Average viewers)'] = np.log(df_log['Average viewers'])
df_log['log(Watch time)'] = np.log(df_log['Watch time'])
df_log['log(Followers)'] = np.log(df_log['Followers'])
# Response on the log scale.
y_log = df_log['log(Average viewers)']
# Predictors (two logged, one raw) with an intercept term added.
X_log = sm.add_constant(df_log[['log(Watch time)', 'Stream time', 'log(Followers)']])
# Ordinary least squares on the transformed data; note that `results`
# is rebound here and no longer refers to the earlier fit.
model2 = sm.OLS(y_log, X_log)
results = model2.fit()
# Regression summary table.
results.summary()
# Predictors and response for the scikit-learn model.  These names were used
# by the split below without ever being assigned, which raised NameError.
# NOTE(review): the log-transformed columns are assumed to be the intended
# inputs (three predictors, predictions exponentiated afterwards) - confirm.
# No constant column is included: LinearRegression fits its own intercept
# by default.
X_new = df_log[['log(Watch time)', 'Stream time', 'log(Followers)']]
y_new = df_log['log(Average viewers)']
# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size=0.3, random_state=26)
# Linear regression with all coefficients constrained to be non-negative.
lr = LinearRegression(positive=True)
# Fit on the training rows only.
lr.fit(X_train, y_train)
# Predict the response for the held-out rows.
y_pred = lr.predict(X_test)
# Side-by-side table of actual and predicted responses.  This rebinds
# df_model, which previously held the raw modelling columns.
df_model = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# Exponentiate to move the first ten rows back from the log scale.
np.exp(df_model.head(10))
# Coefficient of determination on the held-out rows.
R2 = r2_score(y_test, y_pred)
# Sample size and predictor count are taken from the test data itself rather
# than hard-coded, so the adjustment stays correct if the dataset size,
# the test fraction or the predictor set changes.
n = len(y_test)
k = X_test.shape[1]
# Adjusted R-squared: penalises R2 for the number of predictors used.
print('Adjusted R-squared: ', (1 - ((1-R2) * (n-1) / (n-k-1))))
# Root mean squared error, on the same scale as y_test / y_pred.
# NOTE(review): the preceding step exponentiates these values for display,
# so this RMSE is presumably in log units - confirm that is intended.
print('RMSE : ', np.sqrt(mean_squared_error(y_test,y_pred)))