import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('/work/tord_v3_edited.csv')
df.info()
#df.replace(('yes', 'no'), (1, 0), inplace=True)
# recode the 'whitelist' Yes/No flag to 1/0; assigning back avoids pandas' chained-assignment pitfall
df['whitelist'] = df['whitelist'].replace({'Yes': 1, 'No': 0})
df.head(20)
df.describe()
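A quick scan of missing values motivates the cleaning below (a small supplementary check, not in the original workflow):
# count missing values per column, most-missing first
df.isna().sum().sort_values(ascending=False).head(10)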
Q1. What defines the success of an ICO (i.e., what is the Y)?
An ICO is defined as a success when the amount raised in USD ('raised_usd') reaches at least 500,000; the 'success' dummy takes the value 1 in that case and 0 otherwise.
# sanitize 'raised_usd': treat missing amounts as zero raised
df['raised_usd'] = df['raised_usd'].fillna(0)
# create outcome variable success
df['success'] = np.where(df['raised_usd'] >= 500000, 1, 0)
df['success'].describe()
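As a supplementary check of class balance before modelling (an addition to the original steps):
# share of successful vs. unsuccessful ICOs
df['success'].value_counts(normalize=True)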
sns.pairplot(df, vars = ['raised_usd','teamsize','success'])
# Potential multicollinearity issue: inspect pairwise correlations
corr = df.corr(numeric_only=True)  # numeric_only is required for mixed-type frames in pandas >= 2.0
sns.heatmap(corr)
plt.show()
!pip install statsmodels
Q2. What are the factors that determine the success of an ICO (i.e., what are the Xs)?
import statsmodels.api as sm
from statsmodels.formula.api import ols, logit
# dependent/target/outcome variable
y = df['success']
# creating interaction variables
# a larger team may benefit more from the potential for related app development that the ERC20 standard captures
df['teamsizeXERC20'] = df['teamsize'] * df['ERC20']
# being on a whitelist may attract more attention and amplify the effects of bonus and bounty
df['whitelistXbonus'] = df['whitelist'] * df['bonus']
df['whitelistXbounty'] = df['whitelist'] * df['bounty']
# squared term to allow a non-linear effect of team size
df['teamsize_sq'] = df['teamsize'] * df['teamsize']
# independent/predictor/explanatory variables
X = df[['is_ieo', 'is_sto', 'teamsize', 'rating', 'kyc', 'bonus', 'token_for_sale',
        'bounty', 'ERC20', 'whitelist', 'teamsizeXERC20', 'whitelistXbonus',
        'whitelistXbounty', 'teamsize_sq']]
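The heatmap above flags potential multicollinearity, and the interaction and squared terms are built directly from their components, so a variance inflation factor (VIF) check is a useful supplement (a sketch; constructed terms will mechanically show high VIFs):
from statsmodels.stats.outliers_influence import variance_inflation_factor
# VIF per regressor, computed on complete rows only
Xf = X.astype(float).dropna()
vif = pd.DataFrame({'variable': Xf.columns,
                    'VIF': [variance_inflation_factor(Xf.values, i) for i in range(Xf.shape[1])]})
print(vif.sort_values('VIF', ascending=False))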
# A. Logit regression
# add an intercept; sm.Logit does not include one by default
X = sm.add_constant(X)
# cast regressors to float (best practice); missing='drop' drops rows with missing values from the regression
logit_model = sm.Logit(y, X.astype(float), missing='drop')
# fit logit model into the data
result=logit_model.fit()
# summarize the logit model
print(result.summary2())
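Logit coefficients are on the log-odds scale; exponentiating them gives odds ratios, which are often easier to interpret (a small supplementary step):
# odds ratios: exp(beta) is the multiplicative change in the odds per one-unit change in X
print(np.exp(result.params))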
sns.regplot(x = "rating", y = "success", data = df,
logistic = True, y_jitter = .05)
plt.ylabel("success probability")
'''
options for "at"
1. 'overall' the average of the marginal effects at each observation
2. 'mean'    the marginal effects at the mean of each regressor
3. 'median'  the marginal effects at the median of each regressor
4. 'zero'    the marginal effects at zero for each regressor
5. 'all'     the marginal effects at each observation
options for "method"
1. 'dydx' no transformation is made and marginal effects are returned
2. 'eyex' estimate elasticities of variables in exog
3. 'dyex' estimate semi-elasticity -- dy / d(ln x)
4. 'eydx' estimate semi-elasticity -- d(ln y) / dx
'''
# at="mean" evaluates marginal effects at the mean of each regressor (MEM), not the average marginal effect
marginal_effect_at_mean = result.get_margeff(at="mean", method="dydx")
print(marginal_effect_at_mean.summary())
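For comparison, at="overall" (the default) averages the marginal effects across observations, i.e. the average marginal effect (AME), which is usually the preferred summary:
average_marginal_effect = result.get_margeff(at="overall", method="dydx")
print(average_marginal_effect.summary())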
# B. Linear Probability Model
# OLS regression on the binary outcome
X = sm.add_constant(X)  # no-op if the constant is already present
ols_model = sm.OLS(y, X.astype(float), missing='drop')
result=ols_model.fit()
print(result.summary2())
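Because the error term of a linear probability model is heteroskedastic by construction, heteroskedasticity-robust standard errors are advisable (a sketch using statsmodels' HC3 covariance option):
# refit the LPM with heteroskedasticity-robust (HC3) standard errors
result_robust = ols_model.fit(cov_type='HC3')
print(result_robust.summary2())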
# Use wrapper lazypredict
!pip install lazypredict
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
# load data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
# fit all models
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
models
predictions
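To go beyond the leaderboard, a single candidate can be fit and inspected directly (a sketch; the choice of a random forest here is illustrative, not part of the original analysis):
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
# impute any remaining missing values, then fit a forest on the training split
rf = make_pipeline(SimpleImputer(strategy='median'),
                   RandomForestClassifier(n_estimators=200, random_state=42))
rf.fit(X_train, y_train)
print(classification_report(y_test, rf.predict(X_test)))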