import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the ICO/token dataset and print a quick structural overview
# (column names, dtypes, non-null counts).
csv_path = '/work/tord_v3_edited.csv'
df = pd.read_csv(csv_path)
df.info()
# Count the number of tokens accepted as payment: 'accepting' holds a
# comma-separated list, so a non-missing entry contains (#commas + 1) tokens.
df = df.copy()
# Fix: the old row loop tested `df.loc[i,'accepting'] != 'nan'`, i.e. compared
# against the *string* 'nan'. A real float NaN is unequal to everything, so
# missing rows slipped through and were counted as 1 token. pd.Series.notna()
# is the correct missingness test, and the vectorized form replaces the
# per-row loop entirely.
df['accept_count'] = np.where(
    df['accepting'].notna(),
    df['accepting'].astype(str).str.count(',') + 1,
    0,
)
df['accept_count'].describe()
# Inspect the largest raises first.
df = df.sort_values('raised_usd', ascending=False)
df.head()
# Dummy: 1 if a minimum investment is specified, 0 otherwise.
# Fix: `x != np.nan` is ALWAYS True — NaN compares unequal to everything,
# including itself — so the old np.where produced 1 for every single row.
# notna() is the correct missingness test.
df['min_invest_dum'] = df['min_investment'].notna().astype(int)
df['min_invest_dum'].describe()
# Data cleaning: keep only projects that are listed as an ICO and whose
# ICO has actually finished (end date present).
df = df[df['is_ico'] == 1]
df = df[df['ico_end'].notnull()]
# Missing token / distribution figures are treated as zero.
for zero_col in ('token_for_sale', 'sold_tokens', 'distributed_in_ico', 'ERC20'):
    df[zero_col] = df[zero_col].fillna(0)
df['teamsize'].describe()
# Impute missing team sizes with the median value (11).
df['teamsize'] = df['teamsize'].fillna(11)
df['teamsize'].hist(bins=30)
plt.show()
# --- Outcome variable ---
# Sanitize "raised_usd": a missing raise amount counts as zero raised.
df['raised_usd'] = df['raised_usd'].fillna(0)
# Binary success flag: 1 when at least $500,000 was raised.
df['success'] = (df['raised_usd'] >= 500000).astype(int)
df['success'].describe()
# Eyeball pairwise relationships between outcome and candidate predictors.
sns.pairplot(df, vars=['raised_usd', 'teamsize', 'success', 'accept_count'])
# Check for potential multicollinearity among the regressors.
# Fix: df still contains object (string) columns; since pandas 2.0,
# DataFrame.corr() raises a TypeError unless non-numeric columns are
# excluded explicitly with numeric_only=True (older pandas only warned).
corr = df.corr(numeric_only=True)
sns.heatmap(corr)
plt.show()
# NOTE(review): IPython shell magic — runs only inside a notebook and
# re-installs on every execution; prefer pinning in a requirements file.
!pip install statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols, logit
# dependent/target/outcome variable
y = df['success']
# independent/predictor/explanatory variables
Xlist = ['token_for_sale', 'sold_tokens', 'teamsize', 'accept_count',
         'distributed_in_ico', 'ERC20']
X = df[Xlist]
# A. Logit regression
# Fix: sm.Logit does NOT add an intercept automatically, so the original
# model was forced through the origin, biasing every coefficient.
# add_constant() on a float copy keeps X itself unchanged for the later
# OLS/lazypredict blocks that reuse it.
# missing='drop' drops rows with missing values from the regression.
X_logit = sm.add_constant(X.astype(float))
logit_model = sm.Logit(y, X_logit, missing='drop')
# fit the logit model to the data
result = logit_model.fit()
# summarize the logit model
print(result.summary2())
# Fitted logistic curves: success probability against single predictors.
# y_jitter spreads the overlapping 0/1 outcome points vertically so the
# scatter is readable.
sns.regplot(x="sold_tokens", y="success", data=df,
            logistic=True, y_jitter=.05)
plt.ylabel("success probability")
sns.regplot(x="teamsize", y="success", data=df,
            logistic=True, y_jitter=.05)
plt.ylabel("success probability")
# get_margeff reference — options for "at":
#   'overall' : average of the marginal effects at each observation
#   'mean'    : marginal effects at the mean of each regressor
#   'median'  : marginal effects at the median of each regressor
#   'zero'    : marginal effects at zero for each regressor
#   'all'     : marginal effects at each observation
# options for "method":
#   'dydx' : no transformation; raw marginal effects are returned
#   'eyex' : elasticities of the variables in exog
#   'dyex' : semi-elasticity
#   'eydx' : semi-elasticity
# Marginal effects of the fitted logit, evaluated at the regressor means.
average_marginal_effect = result.get_margeff(at="mean", method="dydx")
print(average_marginal_effect.summary())
# B. Linear Probability Model — OLS of the binary outcome on the same
# regressors (note: this is OLS, not a logit, despite the original label).
# OLS needs the intercept column added explicitly.
X = sm.add_constant(X)
lpm = sm.OLS(y, X.astype(float), missing='drop')
result = lpm.fit()
print(result.summary2())
# Use wrapper lazypredict to benchmark many sklearn classifiers at once.
# NOTE(review): IPython shell magic — notebook-only line.
!pip install lazypredict
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.model_selection import train_test_split
# Hold out 20% of the data; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
# Fit the full battery of default sklearn classifiers; predictions=True
# also collects each model's test-set predictions.
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
# Bare expressions: notebook display of the leaderboard and predictions.
models
predictions