import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('/work/tord_v3_edited.csv')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6415 entries, 0 to 6414
Data columns (total 34 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 6415 non-null int64
1 name 6415 non-null object
2 token 6282 non-null object
3 country 6415 non-null object
4 is_ico 6415 non-null int64
5 is_ieo 6415 non-null int64
6 is_sto 6415 non-null int64
7 ico_start 5636 non-null object
8 ico_end 5497 non-null object
9 price_usd 5928 non-null object
10 raised_usd 2139 non-null float64
11 distributed_in_ico 4870 non-null float64
12 sold_tokens 192 non-null float64
13 token_for_sale 5122 non-null float64
14 whitelist 3882 non-null object
15 kyc 6415 non-null int64
16 bonus 6415 non-null int64
17 restricted_areas 2343 non-null object
18 min_investment 2079 non-null object
19 bounty 6415 non-null int64
20 mvp 1296 non-null object
21 pre_ico_start 2717 non-null object
22 pre_ico_end 2705 non-null object
23 pre_ico_price_usd 1733 non-null object
24 platform 6415 non-null int64
25 accepting 5545 non-null object
26 link_white_paper 5828 non-null object
27 linkedin_link 4355 non-null object
28 github_link 5649 non-null object
29 website 5649 non-null object
30 rating 5709 non-null float64
31 teamsize 4622 non-null float64
32 Coinmarketcap_identifier 1281 non-null float64
33 ERC20 5679 non-null float64
dtypes: float64(8), int64(8), object(18)
memory usage: 1.7+ MB
df.head(20)
df.describe()
# sanitize "raised_usd"
df['raised_usd'] = df['raised_usd'].fillna(0)
# create outcome variable success
df['success'] = np.where(df['raised_usd'] >= 500000, 1, 0)
df['success'].describe()
sns.pairplot(df, vars = ['raised_usd','teamsize','success'])
# Potential multicollinearity issue
corr = df.corr()
sns.heatmap(corr)
plt.show()
df['rating'] = df['rating'].fillna(0)
df['ERC20'] = df['ERC20'].fillna(0)
df['teamsize'] = df['teamsize'].fillna(1)
candidate_variables = ['teamsize','rating','bonus','sold_tokens','bounty','kyc']
current_variables = []
target = ['success']
df['ratxteamsize']=df['teamsize'] * df['rating']
df['ratxteamsize'].head(20)
df['ERC20'].fillna(0)
print(df['ERC20'].isna().sum())
df['whitelist']=df['whitelist'].map({'Yes': 1, 'No': 0})
print(df['whitelist'].head())
0
0 1.00
1 0.00
2 0.00
3 1.00
4 1.00
Name: whitelist, dtype: float64
df['whitelist'].fillna(0)
df['whitelist'].isna().sum()
df['whitelist'].head()
#df['kyc'].head()
df['whxky']=df['whitelist'] * df['kyc']
df['whxky'].head()
df['logsoldtokens']=np.log(df['sold_tokens'])
print(df['logsoldtokens'].head())
df['logsoldtokens'] = df['logsoldtokens'].fillna(0)
print(df['logsoldtokens'].head())
#for i in df['linkedin_link']:
# if i = 'TBD':
df.loc[df['linkedin_link']=="TBD", 'linkedin_link'] = 0
df['linkedin_link'].fillna(0)
df.loc[df['linkedin_link']!=0, "linkedin_link"] = 1
#df['linkedin_link']=df['linkedin_link'].map({'TBD': 0, 'No': 0})
print(df['linkedin_link'].sum())
0 18.42
1 NaN
2 NaN
3 NaN
4 NaN
Name: logsoldtokens, dtype: float64
0 18.42
1 0.00
2 0.00
3 0.00
4 0.00
Name: logsoldtokens, dtype: float64
5587
df['link_white_paper'].fillna(0, inplace=True)
df.loc[df['link_white_paper']!=0, "link_white_paper"] = 1
print(df['link_white_paper'].sum())
print(df['link_white_paper'].head())
df['linkxwhitep']=df['linkedin_link'] * df['link_white_paper']
print(df['linkxwhitep'].head())
5828
0 0
1 1
2 1
3 1
4 1
Name: link_white_paper, dtype: object
0 0
1 1
2 1
3 1
4 1
Name: linkxwhitep, dtype: object
df['github_link'].fillna(0, inplace=True)
df.loc[df['github_link']=="None", "github_link"] = 0
df.loc[df['github_link']!=0, "github_link"] = 1
print(df['github_link'].sum())
2871
df['ico_start'][0]
df['ico_start'].fillna(0, inplace=True)
print(df['ico_start'].head())
df['ico_end'].fillna(0, inplace=True)
print(df['ico_end'].head())
0 8/10/2020
1 8/1/2020
2 3/1/2019
3 6/25/2020
4 0
Name: ico_start, dtype: object
0 12/31/2020
1 12/31/2020
2 12/31/2020
3 1/31/2021
4 0
Name: ico_end, dtype: object
for i in range(len(df)):
if df['ico_end'][i] == 0 and df['ico_start'][i] != 0:
df['ico_end'][i] = "10/24/2021"
df['ico_start']=pd.to_datetime(df['ico_start'], errors='coerce',dayfirst=False)
df['ico_end']=pd.to_datetime(df['ico_end'], errors='coerce',dayfirst=False)
df['ico_length']=df['ico_end']-df['ico_start']
df['ico_length']=df['ico_length'].dt.days
print(df['ico_length'].head())
0 143
1 152
2 671
3 220
4 0
Name: ico_length, dtype: int64
for i in range(len(df)):
if df['ico_length'][i] < 0:
df['ico_length'][i] = 1
!pip install statsmodels
Collecting statsmodels
Downloading statsmodels-0.13.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
|████████████████████████████████| 9.8 MB 21.8 MB/s
Requirement already satisfied: scipy>=1.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.7.1)
Requirement already satisfied: numpy>=1.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.19.5)
Collecting patsy>=0.5.2
Downloading patsy-0.5.2-py2.py3-none-any.whl (233 kB)
|████████████████████████████████| 233 kB 42.7 MB/s
Requirement already satisfied: pandas>=0.25 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.2.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2021.3)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.2 statsmodels-0.13.0
WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
import statsmodels.api as sm
from statsmodels.formula.api import ols, logit
# dependent/target/outcome variable
y = df['success']
# independent/predictor/explanatory variable
X = df[['teamsize', 'rating']]
# A. Logit regression
# turn independent variables into floating type (best practice)
# 'missing='drop'' drops rows with missing values from the regression
logit_model=sm.Logit(y,X.astype(float), missing='drop' )
# fit logit model into the data
result=logit_model.fit()
# summarize the logit model
print(result.summary2())
print(result.tvalues)
Optimization terminated successfully.
Current function value: 0.634254
Iterations 5
Results: Logit
================================================================
Model: Logit Pseudo R-squared: -0.083
Dependent Variable: success AIC: 8141.4755
Date: 2021-10-24 09:49 BIC: 8155.0083
No. Observations: 6415 Log-Likelihood: -4068.7
Df Model: 1 LL-Null: -3755.4
Df Residuals: 6413 LLR p-value: 1.0000
Converged: 1.0000 Scale: 1.0000
No. Iterations: 5.0000
-----------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
-----------------------------------------------------------------
teamsize 0.0233 0.0038 6.1941 0.0000 0.0159 0.0307
rating -0.3148 0.0150 -21.0364 0.0000 -0.3441 -0.2855
================================================================
teamsize 6.19413
rating -21.03639
dtype: float64
sns.regplot(x = "rating", y = "success", data = df,
logistic = True, y_jitter = .05)
plt.ylabel("success probability")
'''
options for "at"
1. 'overall' The average of the marginal effects at each observation
2. 'mean' The marginal effects at the mean of each regressor
3. 'median' The marginal effects at the median of each regressor
4. 'zero' The marginal effects at zero for each regressor
5. 'all' The marginal effects at each observation.
options for "method"
1. 'dydx' No transformation is made and amrginal effects are returned
2. 'eyex' estimate elasticities of variables in exog
3. 'dyex' estimate semi-elasticity
4. 'eydx' estimate semi-elasticity
'''
average_marginal_effect = result.get_margeff(at = "mean", method = "dydx")
print(average_marginal_effect.summary())
Logit Marginal Effects
=====================================
Dep. Variable: success
Method: dydx
At: mean
==============================================================================
dy/dx std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
teamsize 0.0053 0.001 6.186 0.000 0.004 0.007
rating -0.0714 0.003 -23.165 0.000 -0.077 -0.065
==============================================================================
# B. Linear Probability Model
# logit regression
X = sm.add_constant(X)
ols_model=sm.OLS(y,X.astype(float), missing='drop')
result=ols_model.fit()
print(result.summary2())
#print(result.rsquared_adj)
Results: Ordinary least squares
==================================================================
Model: OLS Adj. R-squared: 0.070
Dependent Variable: success AIC: 7359.1216
Date: 2021-10-24 09:50 BIC: 7379.4208
No. Observations: 6415 Log-Likelihood: -3676.6
Df Model: 2 F-statistic: 241.5
Df Residuals: 6412 Prob (F-statistic): 7.41e-102
R-squared: 0.070 Scale: 0.18430
--------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
--------------------------------------------------------------------
const 0.0268 0.0132 2.0379 0.0416 0.0010 0.0526
teamsize 0.0077 0.0008 9.9486 0.0000 0.0062 0.0092
rating 0.0708 0.0050 14.1388 0.0000 0.0609 0.0806
------------------------------------------------------------------
Omnibus: 1252.131 Durbin-Watson: 1.721
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1080.438
Skew: 0.919 Prob(JB): 0.000
Kurtosis: 2.184 Condition No.: 29
==================================================================
# Use wrapper lazypredict
!pip install lazypredict
Collecting lazypredict
Downloading lazypredict-0.2.9-py2.py3-none-any.whl (12 kB)
Collecting lightgbm==2.3.1
Downloading lightgbm-2.3.1-py2.py3-none-manylinux1_x86_64.whl (1.2 MB)
|████████████████████████████████| 1.2 MB 20.1 MB/s
Collecting six==1.15.0
Downloading six-1.15.0-py2.py3-none-any.whl (10 kB)
Collecting PyYAML==5.3.1
Downloading PyYAML-5.3.1.tar.gz (269 kB)
|████████████████████████████████| 269 kB 23.8 MB/s
Collecting scipy==1.5.4
Downloading scipy-1.5.4-cp37-cp37m-manylinux1_x86_64.whl (25.9 MB)
|████████████████████████████████| 25.9 MB 42.7 MB/s
Collecting pandas==1.0.5
Downloading pandas-1.0.5-cp37-cp37m-manylinux1_x86_64.whl (10.1 MB)
|████████████████████████████████| 10.1 MB 39.7 MB/s
Collecting click==7.1.2
Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)
|████████████████████████████████| 82 kB 2.7 MB/s
Collecting xgboost==1.1.1
Downloading xgboost-1.1.1-py3-none-manylinux2010_x86_64.whl (127.6 MB)
|████████████████████████████████| 127.6 MB 54 kB/s
Collecting numpy==1.19.1
Downloading numpy-1.19.1-cp37-cp37m-manylinux2010_x86_64.whl (14.5 MB)
|████████████████████████████████| 14.5 MB 33.5 MB/s
Collecting joblib==1.0.0
Downloading joblib-1.0.0-py3-none-any.whl (302 kB)
|████████████████████████████████| 302 kB 28.2 MB/s
Collecting pytest==5.4.3
Downloading pytest-5.4.3-py3-none-any.whl (248 kB)
|████████████████████████████████| 248 kB 51.0 MB/s
Collecting scikit-learn==0.23.1
Downloading scikit_learn-0.23.1-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
|████████████████████████████████| 6.8 MB 34.5 MB/s
Collecting tqdm==4.56.0
Downloading tqdm-4.56.0-py2.py3-none-any.whl (72 kB)
|████████████████████████████████| 72 kB 1.6 MB/s
Requirement already satisfied: pytz>=2017.2 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas==1.0.5->lazypredict) (2021.3)
Requirement already satisfied: python-dateutil>=2.6.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas==1.0.5->lazypredict) (2.8.2)
Collecting pluggy<1.0,>=0.12
Downloading pluggy-0.13.1-py2.py3-none-any.whl (18 kB)
Collecting more-itertools>=4.0.0
Downloading more_itertools-8.10.0-py3-none-any.whl (51 kB)
|████████████████████████████████| 51 kB 632 kB/s
Requirement already satisfied: importlib-metadata>=0.12 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (4.8.1)
Requirement already satisfied: wcwidth in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (0.2.5)
Requirement already satisfied: attrs>=17.4.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (21.2.0)
Requirement already satisfied: py>=1.5.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (1.10.0)
Requirement already satisfied: packaging in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (21.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn==0.23.1->lazypredict) (3.0.0)
Requirement already satisfied: typing-extensions>=3.6.4 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from importlib-metadata>=0.12->pytest==5.4.3->lazypredict) (3.10.0.2)
Requirement already satisfied: zipp>=0.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from importlib-metadata>=0.12->pytest==5.4.3->lazypredict) (3.6.0)
Requirement already satisfied: pyparsing>=2.0.2 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from packaging->pytest==5.4.3->lazypredict) (2.4.7)
Building wheels for collected packages: PyYAML
Building wheel for PyYAML (setup.py) ... done
Created wheel for PyYAML: filename=PyYAML-5.3.1-cp37-cp37m-linux_x86_64.whl size=44635 sha256=7756ce74e267a8f02970ac2be10a236f3aa1e883066ef00db35326a3d0ad4819
Stored in directory: /root/.cache/pip/wheels/5e/03/1e/e1e954795d6f35dfc7b637fe2277bff021303bd9570ecea653
Successfully built PyYAML
Installing collected packages: numpy, six, scipy, joblib, scikit-learn, pluggy, more-itertools, xgboost, tqdm, PyYAML, pytest, pandas, lightgbm, click, lazypredict
Attempting uninstall: numpy
Found existing installation: numpy 1.19.5
Not uninstalling numpy at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'numpy'. No files were found to uninstall.
Attempting uninstall: six
Found existing installation: six 1.16.0
Not uninstalling six at /shared-libs/python3.7/py-core/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'six'. No files were found to uninstall.
Attempting uninstall: scipy
Found existing installation: scipy 1.7.1
Not uninstalling scipy at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'scipy'. No files were found to uninstall.
Attempting uninstall: joblib
Found existing installation: joblib 1.1.0
Not uninstalling joblib at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'joblib'. No files were found to uninstall.
Attempting uninstall: scikit-learn
Found existing installation: scikit-learn 1.0
Not uninstalling scikit-learn at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'scikit-learn'. No files were found to uninstall.
Attempting uninstall: pluggy
Found existing installation: pluggy 1.0.0
Not uninstalling pluggy at /shared-libs/python3.7/py-core/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'pluggy'. No files were found to uninstall.
Attempting uninstall: tqdm
Found existing installation: tqdm 4.62.3
Not uninstalling tqdm at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'tqdm'. No files were found to uninstall.
Attempting uninstall: PyYAML
Found existing installation: PyYAML 5.4.1
Not uninstalling pyyaml at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'PyYAML'. No files were found to uninstall.
Attempting uninstall: pytest
Found existing installation: pytest 6.2.5
Not uninstalling pytest at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'pytest'. No files were found to uninstall.
Attempting uninstall: pandas
Found existing installation: pandas 1.2.5
Not uninstalling pandas at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'pandas'. No files were found to uninstall.
Attempting uninstall: click
Found existing installation: click 8.0.3
Not uninstalling click at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'click'. No files were found to uninstall.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.4.1 requires numpy~=1.19.2, but you have numpy 1.19.1 which is incompatible.
tensorflow 2.4.1 requires typing-extensions~=3.7.4, but you have typing-extensions 3.10.0.2 which is incompatible.
Successfully installed PyYAML-5.3.1 click-7.1.2 joblib-1.0.0 lazypredict-0.2.9 lightgbm-2.3.1 more-itertools-8.10.0 numpy-1.19.1 pandas-1.0.5 pluggy-0.13.1 pytest-5.4.3 scikit-learn-0.23.1 scipy-1.5.4 six-1.15.0 tqdm-4.56.0 xgboost-1.1.1
WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.model_selection import train_test_split
/root/venv/lib/python3.7/site-packages/sklearn/utils/deprecation.py:143: FutureWarning: The sklearn.utils.testing module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.utils. Anything that cannot be imported from sklearn.utils is now part of the private API.
warnings.warn(message, FutureWarning)
# load data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
# fit all models
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
100%|██████████| 29/29 [00:08<00:00, 3.32it/s]
models
predictions
models['Accuracy']['LogisticRegression']
def prsq(variables, target, basetable):
X = basetable[variables]
y = basetable[target]
logreg = sm.Logit(y,X.astype(float), missing = 'drop')
result=logreg.fit()
#return(abs(result.tvalues))
return(result.prsquared)
def next_best(current_variables, candidate_variables, target, basetable):
best_rsq = -1
best_variable = None
for v in candidate_variables:
logrsq = prsq(current_variables+[v], target, basetable)
if logrsq >= best_rsq:
best_rsq = logrsq
best_variable = v
return best_variable
candidate_variables = ['bonus','sold_tokens','bounty','linkedin_link']
current_variables = ['teamsize','rating']
target = ['success']
next_variable = next_best(current_variables, candidate_variables,target, df)
print(next_variable)
Optimization terminated successfully.
Current function value: 0.615002
Iterations 7
Optimization terminated successfully.
Current function value: 0.642771
Iterations 24
Optimization terminated successfully.
Current function value: 0.631929
Iterations 5
Optimization terminated successfully.
Current function value: 0.624116
Iterations 5
sold_tokens
candidate_variables = ['teamsize','rating','bonus','sold_tokens','bounty','kyc']
#candidate_variables = ['teamsize','rating','bonus','bounty','kyc']
current_variables = []
target = ['success']
max_number_variables = 5
number_iterations = min(max_number_variables, len(candidate_variables))
for i in range(0,number_iterations):
next_var = next_best(current_variables, candidate_variables, target, df)
current_variables = current_variables + [next_var]
candidate_variables.remove(next_var)
print(current_variables)
Optimization terminated successfully.
Current function value: 0.670867
Iterations 4
Optimization terminated successfully.
Current function value: 0.637245
Iterations 4
Optimization terminated successfully.
Current function value: 0.656717
Iterations 7
Optimization terminated successfully.
Current function value: 0.688166
Iterations 1
Optimization terminated successfully.
Current function value: 0.666551
Iterations 5
Optimization terminated successfully.
Current function value: 0.655511
Iterations 5
Optimization terminated successfully.
Current function value: 0.650429
Iterations 24
Optimization terminated successfully.
Current function value: 0.643492
Iterations 24
Optimization terminated successfully.
Current function value: 0.633588
Iterations 24
Optimization terminated successfully.
Current function value: 0.651911
Iterations 24
Optimization terminated successfully.
Current function value: 0.652853
Iterations 24
Optimization terminated successfully.
Current function value: 0.627634
Iterations 24
Optimization terminated successfully.
Current function value: 0.615034
Iterations 24
Optimization terminated successfully.
Current function value: 0.633588
Iterations 24
Optimization terminated successfully.
Current function value: 0.631486
Iterations 24
Optimization terminated successfully.
Current function value: 0.614418
Iterations 24
Optimization terminated successfully.
Current function value: 0.611037
Iterations 24
Optimization terminated successfully.
Current function value: 0.614516
Iterations 24
Optimization terminated successfully.
Current function value: 0.610882
Iterations 24
Optimization terminated successfully.
Current function value: 0.611017
Iterations 24
['sold_tokens', 'bonus', 'rating', 'bounty', 'teamsize']
# dependent/target/outcome variable
y = df['success']
# independent/predictor/explanatory variable
X = df[current_variables]
# A. Logit regression
# turn independent variables into floating type (best practice)
# 'missing='drop'' drops rows with missing values from the regression
logit_model=sm.Logit(y,X.astype(float), missing='drop' )
# fit logit model into the data
result=logit_model.fit()
# summarize the logit model
print(result.summary2())
Optimization terminated successfully.
Current function value: 0.610882
Iterations 24
Results: Logit
=================================================================
Model: Logit Pseudo R-squared: 0.096
Dependent Variable: success AIC: 240.5785
Date: 2021-10-24 09:52 BIC: 250.3510
No. Observations: 192 Log-Likelihood: -117.29
Df Model: 2 LL-Null: -129.69
Df Residuals: 189 LLR p-value: 4.1188e-06
Converged: 1.0000 Scale: 1.0000
No. Iterations: 24.0000
------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
------------------------------------------------------------------
sold_tokens 0.0000 0.0000 1.6031 0.1089 -0.0000 0.0000
bonus -2.1902 0.8241 -2.6578 0.0079 -3.8054 -0.5751
rating 0.2028 0.0842 2.4088 0.0160 0.0378 0.3678
bounty -0.5241 0.4482 -1.1693 0.2423 -1.4026 0.3544
teamsize -0.0058 0.0236 -0.2446 0.8068 -0.0521 0.0405
=================================================================
# B. Linear Probability Model
# logit regression
X = sm.add_constant(X)
ols_model=sm.OLS(y,X.astype(float), missing='drop')
result=ols_model.fit()
print(result.summary2())
#print(result.rsquared_adj)
Results: Ordinary least squares
=================================================================
Model: OLS Adj. R-squared: 0.012
Dependent Variable: success AIC: 273.5059
Date: 2021-10-24 09:52 BIC: 283.2784
No. Observations: 192 Log-Likelihood: -133.75
Df Model: 2 F-statistic: 2.154
Df Residuals: 189 Prob (F-statistic): 0.119
R-squared: 0.022 Scale: 0.23958
------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------
const 0.0737 0.0059 12.5733 0.0000 0.0622 0.0853
sold_tokens 0.0000 0.0000 1.1848 0.2376 -0.0000 0.0000
bonus 0.0048 0.0004 12.6795 0.0000 0.0040 0.0055
rating 0.1991 0.0157 12.6540 0.0000 0.1681 0.2302
bounty 0.0095 0.0007 13.1892 0.0000 0.0081 0.0109
teamsize -0.0073 0.0051 -1.4092 0.1604 -0.0174 0.0029
-----------------------------------------------------------------
Omnibus: 2292.577 Durbin-Watson: 1.772
Prob(Omnibus): 0.000 Jarque-Bera (JB): 23.549
Skew: -0.338 Prob(JB): 0.000
Kurtosis: 1.423 Condition No.: 4417515078361991
=================================================================
* The condition number is large (4e+15). This might indicate
strong multicollinearity or other numerical problems.
# load data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
# fit all models
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
100%|██████████| 29/29 [00:09<00:00, 3.16it/s]
print(models['Accuracy']['LogisticRegression'])
print(models['ROC AUC']['LogisticRegression'])
0.7365549493374902
0.5572824266484709
def acctest(variables, target, basetable):
X = basetable[variables]
y = basetable[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
#logreg = sm.Logit(y,X.astype(float), missing = 'drop')
#result=logreg.fit()
#return(abs(result.tvalues))
return(models['ROC AUC']['LogisticRegression'])
def next_best2(current_variables, candidate_variables, target, basetable):
best_rsq = -1
best_variable = None
for v in candidate_variables:
logrsq = acctest(current_variables+[v], target, basetable)
if logrsq >= best_rsq:
best_rsq = logrsq
best_variable = v
return best_variable
candidate_variables = ['bonus','sold_tokens','bounty']
current_variables = ['teamsize','rating']
target = ['success']
next_variable = next_best2(current_variables, candidate_variables,target, df)
print(next_variable)
100%|██████████| 29/29 [00:08<00:00, 3.38it/s]
100%|██████████| 29/29 [00:08<00:00, 3.39it/s]
100%|██████████| 29/29 [00:08<00:00, 3.29it/s]bonus
print(next_variable)
bonus
candidate_variables = ['teamsize','rating','bonus','sold_tokens','bounty','kyc']
current_variables = []
target = ['success']
max_number_variables = 5
number_iterations = min(max_number_variables, len(candidate_variables))
for i in range(0,number_iterations):
next_var = next_best2(current_variables, candidate_variables, target, df)
current_variables = current_variables + [next_var]
candidate_variables.remove(next_var)
print(current_variables)
100%|██████████| 29/29 [00:08<00:00, 3.62it/s]
100%|██████████| 29/29 [00:07<00:00, 4.04it/s]
100%|██████████| 29/29 [00:05<00:00, 5.74it/s]
100%|██████████| 29/29 [00:06<00:00, 4.81it/s]
100%|██████████| 29/29 [00:05<00:00, 5.35it/s]
100%|██████████| 29/29 [00:05<00:00, 5.03it/s]
100%|██████████| 29/29 [00:09<00:00, 3.11it/s]
100%|██████████| 29/29 [00:08<00:00, 3.59it/s]
100%|██████████| 29/29 [00:08<00:00, 3.57it/s]
100%|██████████| 29/29 [00:08<00:00, 3.61it/s]
100%|██████████| 29/29 [00:07<00:00, 3.71it/s]
100%|██████████| 29/29 [00:09<00:00, 3.04it/s]
100%|██████████| 29/29 [00:08<00:00, 3.42it/s]
100%|██████████| 29/29 [00:08<00:00, 3.52it/s]
100%|██████████| 29/29 [00:07<00:00, 3.64it/s]
100%|██████████| 29/29 [00:09<00:00, 3.03it/s]
100%|██████████| 29/29 [00:09<00:00, 3.06it/s]
100%|██████████| 29/29 [00:09<00:00, 3.04it/s]
100%|██████████| 29/29 [00:09<00:00, 2.93it/s]
100%|██████████| 29/29 [00:09<00:00, 2.91it/s]['rating', 'bonus', 'teamsize', 'bounty', 'sold_tokens']
# dependent/target/outcome variable
y = df['success']
# independent/predictor/explanatory variable
X = df[current_variables]
# A. Logit regression
# turn independent variables into floating type (best practice)
# 'missing='drop'' drops rows with missing values from the regression
logit_model=sm.Logit(y,X.astype(float), missing='drop' )
# fit logit model into the data
result=logit_model.fit()
# summarize the logit model
print(result.summary2())
Optimization terminated successfully.
Current function value: 0.600325
Iterations 11
Results: Logit
================================================================
Model: Logit Pseudo R-squared: 0.113
Dependent Variable: success AIC: 136.0681
Date: 2021-10-23 16:43 BIC: 149.3379
No. Observations: 105 Log-Likelihood: -63.034
Df Model: 4 LL-Null: -71.052
Df Residuals: 100 LLR p-value: 0.0029717
Converged: 1.0000 Scale: 1.0000
No. Iterations: 11.0000
-----------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
-----------------------------------------------------------------
rating 0.2013 0.1302 1.5462 0.1221 -0.0539 0.4565
bonus -2.7624 1.4020 -1.9703 0.0488 -5.5103 -0.0145
teamsize -0.0045 0.0296 -0.1522 0.8790 -0.0624 0.0534
bounty -0.6432 0.5085 -1.2647 0.2060 -1.6399 0.3536
sold_tokens 0.0000 0.0000 1.3223 0.1861 -0.0000 0.0000
================================================================
# load data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
# fit all models
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
100%|██████████| 29/29 [00:09<00:00, 2.99it/s]
print(models['Accuracy']['LogisticRegression'])
print(models['ROC AUC']['LogisticRegression'])
0.7420109119251753
0.567486508281124
df.columns[2:]
df['name'].isnull().sum()
#candidate_variables = ['teamsize','rating','bonus','logsoldtokens','bounty','kyc','ratxteamsize','is_ico','ERC20','whitelist','whxky','linkedin_link','link_white_paper','linkxwhitep','github_link','ico_length','raised_usd']
candidate_variables = ['teamsize','rating','bonus','logsoldtokens','bounty','kyc','ratxteamsize','is_ico','ERC20','whitelist','whxky','link_white_paper','linkxwhitep','github_link','ico_length']
current_variables = []
target = ['success']
max_number_variables = 8
number_iterations = min(max_number_variables, len(candidate_variables))
for i in range(0,number_iterations):
next_var = next_best(current_variables, candidate_variables, target, df)
current_variables = current_variables + [next_var]
candidate_variables.remove(next_var)
print(next_var)
print(current_variables)
Optimization terminated successfully.
Current function value: 0.670867
Iterations 4
Optimization terminated successfully.
Current function value: 0.637245
Iterations 4
Optimization terminated successfully.
Current function value: 0.656717
Iterations 7
Optimization terminated successfully.
Current function value: 0.692039
Iterations 4
Optimization terminated successfully.
Current function value: 0.666551
Iterations 5
Optimization terminated successfully.
Current function value: 0.655511
Iterations 5
Optimization terminated successfully.
Current function value: 0.684098
Iterations 4
Optimization terminated successfully.
Current function value: 0.593962
Iterations 5
Optimization terminated successfully.
Current function value: 0.625762
Iterations 5
Optimization terminated successfully.
Current function value: 0.654769
Iterations 5
Optimization terminated successfully.
Current function value: 0.667061
Iterations 5
Optimization terminated successfully.
Current function value: 0.601703
Iterations 5
Optimization terminated successfully.
Current function value: 0.637722
Iterations 5
Optimization terminated successfully.
Current function value: 0.671335
Iterations 4
Optimization terminated successfully.
Current function value: 0.691685
Iterations 5
is_ico
Optimization terminated successfully.
Current function value: 0.582406
Iterations 5
Optimization terminated successfully.
Current function value: 0.583269
Iterations 5
Optimization terminated successfully.
Current function value: 0.579338
Iterations 7
Optimization terminated successfully.
Current function value: 0.585713
Iterations 5
Optimization terminated successfully.
Current function value: 0.593915
Iterations 5
Optimization terminated successfully.
Current function value: 0.593947
Iterations 5
Optimization terminated successfully.
Current function value: 0.573491
Iterations 5
Optimization terminated successfully.
Current function value: 0.593249
Iterations 5
Optimization terminated successfully.
Current function value: 0.570740
Iterations 5
Optimization terminated successfully.
Current function value: 0.569132
Iterations 5
Optimization terminated successfully.
Current function value: 0.592900
Iterations 5
Optimization terminated successfully.
Current function value: 0.589103
Iterations 5
Optimization terminated successfully.
Current function value: 0.587018
Iterations 5
Optimization terminated successfully.
Current function value: 0.593872
Iterations 5
ratxteamsize
Optimization terminated successfully.
Current function value: 0.569498
Iterations 6
Optimization terminated successfully.
Current function value: 0.573266
Iterations 5
Optimization terminated successfully.
Current function value: 0.554547
Iterations 7
Optimization terminated successfully.
Current function value: 0.564267
Iterations 5
Optimization terminated successfully.
Current function value: 0.570808
Iterations 5
Optimization terminated successfully.
Current function value: 0.569001
Iterations 5
Optimization terminated successfully.
Current function value: 0.573248
Iterations 5
Optimization terminated successfully.
Current function value: 0.549291
Iterations 5
Optimization terminated successfully.
Current function value: 0.549036
Iterations 5
Optimization terminated successfully.
Current function value: 0.569408
Iterations 5
Optimization terminated successfully.
Current function value: 0.571803
Iterations 5
Optimization terminated successfully.
Current function value: 0.572682
Iterations 5
Optimization terminated successfully.
Current function value: 0.573341
Iterations 5
bonus
Optimization terminated successfully.
Current function value: 0.550192
Iterations 7
Optimization terminated successfully.
Current function value: 0.554210
Iterations 7
Optimization terminated successfully.
Current function value: 0.545085
Iterations 7
Optimization terminated successfully.
Current function value: 0.552735
Iterations 7
Optimization terminated successfully.
Current function value: 0.550887
Iterations 7
Optimization terminated successfully.
Current function value: 0.554314
Iterations 7
Optimization terminated successfully.
Current function value: 0.528229
Iterations 7
Optimization terminated successfully.
Current function value: 0.527774
Iterations 7
Optimization terminated successfully.
Current function value: 0.550560
Iterations 7
Optimization terminated successfully.
Current function value: 0.554016
Iterations 7
Optimization terminated successfully.
Current function value: 0.553627
Iterations 7
Optimization terminated successfully.
Current function value: 0.554390
Iterations 7
whxky
Optimization terminated successfully.
Current function value: 0.524424
Iterations 7
Optimization terminated successfully.
Current function value: 0.527488
Iterations 7
Optimization terminated successfully.
Current function value: 0.524731
Iterations 7
Optimization terminated successfully.
Current function value: 0.527684
Iterations 7
Optimization terminated successfully.
Current function value: 0.527768
Iterations 7
Optimization terminated successfully.
Current function value: 0.527101
Iterations 7
Optimization terminated successfully.
Current function value: 0.527440
Iterations 7
Optimization terminated successfully.
Current function value: 0.527310
Iterations 7
Optimization terminated successfully.
Current function value: 0.526081
Iterations 7
Optimization terminated successfully.
Current function value: 0.527597
Iterations 7
Optimization terminated successfully.
Current function value: 0.527719
Iterations 7
teamsize
Optimization terminated successfully.
Current function value: 0.522754
Iterations 7
Optimization terminated successfully.
Current function value: 0.521284
Iterations 7
Optimization terminated successfully.
Current function value: 0.524256
Iterations 7
Optimization terminated successfully.
Current function value: 0.524401
Iterations 7
Optimization terminated successfully.
Current function value: 0.523621
Iterations 7
Optimization terminated successfully.
Current function value: 0.524197
Iterations 7
Optimization terminated successfully.
Current function value: 0.524117
Iterations 7
Optimization terminated successfully.
Current function value: 0.522756
Iterations 7
Optimization terminated successfully.
Current function value: 0.524417
Iterations 7
Optimization terminated successfully.
Current function value: 0.524358
Iterations 7
logsoldtokens
Optimization terminated successfully.
Current function value: 0.519546
Iterations 7
Optimization terminated successfully.
Current function value: 0.521093
Iterations 7
Optimization terminated successfully.
Current function value: 0.521254
Iterations 7
Optimization terminated successfully.
Current function value: 0.520414
Iterations 7
Optimization terminated successfully.
Current function value: 0.521059
Iterations 7
Optimization terminated successfully.
Current function value: 0.520967
Iterations 7
Optimization terminated successfully.
Current function value: 0.519669
Iterations 7
Optimization terminated successfully.
Current function value: 0.521278
Iterations 7
Optimization terminated successfully.
Current function value: 0.521215
Iterations 7
rating
Optimization terminated successfully.
Current function value: 0.519501
Iterations 7
Optimization terminated successfully.
Current function value: 0.519184
Iterations 7
Optimization terminated successfully.
Current function value: 0.519366
Iterations 7
Optimization terminated successfully.
Current function value: 0.519324
Iterations 7
Optimization terminated successfully.
Current function value: 0.519513
Iterations 7
Optimization terminated successfully.
Current function value: 0.514750
Iterations 7
Optimization terminated successfully.
Current function value: 0.519232
Iterations 7
Optimization terminated successfully.
Current function value: 0.519456
Iterations 7
linkxwhitep
['is_ico', 'ratxteamsize', 'bonus', 'whxky', 'teamsize', 'logsoldtokens', 'rating', 'linkxwhitep']
#candidate_variables = ['teamsize','rating','bonus','sold_tokens','bounty','kyc','ratxteamsize']
candidate_variables = ['teamsize','rating','bonus','logsoldtokens','bounty','kyc','ratxteamsize','is_ico','ERC20','whitelist','whxky','linkedin_link']
current_variables = []
target = ['success']
max_number_variables = 8
number_iterations = min(max_number_variables, len(candidate_variables))
for i in range(0,number_iterations):
next_var = next_best2(current_variables, candidate_variables, target, df)
current_variables = current_variables + [next_var]
candidate_variables.remove(next_var)
print(current_variables)
100%|██████████| 29/29 [00:07<00:00, 3.74it/s]
100%|██████████| 29/29 [00:07<00:00, 3.89it/s]
100%|██████████| 29/29 [00:05<00:00, 5.58it/s]
100%|██████████| 29/29 [00:06<00:00, 4.68it/s]
100%|██████████| 29/29 [00:05<00:00, 5.17it/s]
100%|██████████| 29/29 [00:05<00:00, 4.94it/s]
100%|██████████| 29/29 [00:08<00:00, 3.34it/s]
100%|██████████| 29/29 [00:05<00:00, 5.64it/s]
100%|██████████| 29/29 [00:05<00:00, 5.25it/s]
100%|██████████| 29/29 [00:05<00:00, 5.15it/s]
100%|██████████| 29/29 [00:05<00:00, 5.11it/s]
100%|██████████| 29/29 [00:06<00:00, 4.78it/s]
100%|██████████| 29/29 [00:08<00:00, 3.25it/s]
100%|██████████| 29/29 [00:09<00:00, 3.12it/s]
100%|██████████| 29/29 [00:08<00:00, 3.23it/s]
100%|██████████| 29/29 [00:08<00:00, 3.28it/s]
100%|██████████| 29/29 [00:09<00:00, 3.18it/s]
100%|██████████| 29/29 [00:08<00:00, 3.25it/s]
100%|██████████| 29/29 [00:08<00:00, 3.28it/s]
100%|██████████| 29/29 [00:08<00:00, 3.26it/s]
100%|██████████| 29/29 [00:08<00:00, 3.26it/s]
100%|██████████| 29/29 [00:09<00:00, 3.22it/s]
100%|██████████| 29/29 [00:08<00:00, 3.25it/s]
100%|██████████| 29/29 [00:08<00:00, 3.35it/s]
100%|██████████| 29/29 [00:09<00:00, 3.04it/s]
100%|██████████| 29/29 [00:09<00:00, 3.18it/s]
100%|██████████| 29/29 [00:09<00:00, 3.19it/s]
100%|██████████| 29/29 [00:08<00:00, 3.28it/s]
100%|██████████| 29/29 [00:09<00:00, 3.21it/s]
100%|██████████| 29/29 [00:09<00:00, 3.14it/s]
100%|██████████| 29/29 [00:09<00:00, 3.19it/s]
100%|██████████| 29/29 [00:09<00:00, 3.20it/s]
100%|██████████| 29/29 [00:09<00:00, 3.22it/s]
100%|██████████| 29/29 [00:10<00:00, 2.86it/s]
100%|██████████| 29/29 [00:09<00:00, 2.94it/s]
100%|██████████| 29/29 [00:09<00:00, 3.01it/s]
100%|██████████| 29/29 [00:09<00:00, 2.94it/s]
100%|██████████| 29/29 [00:09<00:00, 2.94it/s]
100%|██████████| 29/29 [00:09<00:00, 3.00it/s]
100%|██████████| 29/29 [00:09<00:00, 3.07it/s]
100%|██████████| 29/29 [00:09<00:00, 2.95it/s]
100%|██████████| 29/29 [00:10<00:00, 2.86it/s]
100%|██████████| 29/29 [00:10<00:00, 2.72it/s]
100%|██████████| 29/29 [00:09<00:00, 2.94it/s]
100%|██████████| 29/29 [00:10<00:00, 2.88it/s]
100%|██████████| 29/29 [00:10<00:00, 2.79it/s]
100%|██████████| 29/29 [00:15<00:00, 1.83it/s]
100%|██████████| 29/29 [00:12<00:00, 2.28it/s]
100%|██████████| 29/29 [00:10<00:00, 2.75it/s]
100%|██████████| 29/29 [00:10<00:00, 2.78it/s]
100%|██████████| 29/29 [00:11<00:00, 2.62it/s]
100%|██████████| 29/29 [00:10<00:00, 2.85it/s]
100%|██████████| 29/29 [00:10<00:00, 2.88it/s]
100%|██████████| 29/29 [00:10<00:00, 2.79it/s]
100%|██████████| 29/29 [00:10<00:00, 2.83it/s]
100%|██████████| 29/29 [00:10<00:00, 2.85it/s]
100%|██████████| 29/29 [00:10<00:00, 2.86it/s]
100%|██████████| 29/29 [00:15<00:00, 1.82it/s]
100%|██████████| 29/29 [00:17<00:00, 1.62it/s]
100%|██████████| 29/29 [00:15<00:00, 1.89it/s]
100%|██████████| 29/29 [00:10<00:00, 2.66it/s]
100%|██████████| 29/29 [00:10<00:00, 2.74it/s]
100%|██████████| 29/29 [00:10<00:00, 2.81it/s]
100%|██████████| 29/29 [00:11<00:00, 2.60it/s]
100%|██████████| 29/29 [00:10<00:00, 2.68it/s]
100%|██████████| 29/29 [00:10<00:00, 2.67it/s]
100%|██████████| 29/29 [00:10<00:00, 2.66it/s]
100%|██████████| 29/29 [00:10<00:00, 2.66it/s]['ratxteamsize', 'bonus', 'rating', 'linkedin_link', 'logsoldtokens', 'whxky', 'whitelist', 'teamsize']
# dependent/target/outcome variable
y = df['success']
# independent/predictor/explanatory variable
#test_variables = ['teamsize','rating','bonus','logsoldtokens','bounty','kyc','ratxteamsize','is_ico','ERC20','whitelist','whxky','linkedin_link','link_white_paper']
test_variables = ['teamsize','rating','bonus','logsoldtokens','bounty','kyc','ratxteamsize','is_ico','ERC20','whitelist','whxky','linkedin_link','link_white_paper','linkxwhitep','github_link','ico_length']
#X = df[current_variables]
X = df[test_variables]
# load data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
# fit all models
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
from sklearn.ensemble import RandomForestClassifier
#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)
X_train = np.nan_to_num(X_train.astype(np.float32))
clf.fit(X_train,y_train)
100%|██████████| 29/29 [00:14<00:00, 2.04it/s]
#df.current_variables
feature_imp = pd.Series(clf.feature_importances_,index=df[test_variables].columns).sort_values(ascending=False)
feature_imp
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Creating a bar plot
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.legend()
plt.show()
No handles with labels found to put in legend.
#chosen_variables = ['raised_usd','is_ico','rating','teamsize','ratxteamsize','linkedin_link']
chosen_variables = ['is_ico', 'ratxteamsize', 'rating', 'bonus', 'teamsize', 'kyc','whitelist','logsoldtokens']
#chosen_variables = ['is_ico', 'ratxteamsize', 'bonus', 'teamsize', 'logsoldtokens', 'rating', 'linkxwhitep','linkedin_link','link_white_paper']
# dependent/target/outcome variable
y = df['success']
# independent/predictor/explanatory variable
X = df[chosen_variables]
#X = df[current_variables]
# A. Logit regression
# turn independent variables into floating type (best practice)
# 'missing='drop'' drops rows with missing values from the regression
logit_model=sm.Logit(y,X.astype(float), missing='drop' )
# fit logit model into the data
result=logit_model.fit()
# summarize the logit model
print(result.summary2())
Optimization terminated successfully.
Current function value: 0.519402
Iterations 7
Results: Logit
=================================================================
Model: Logit Pseudo R-squared: 0.093
Dependent Variable: success AIC: 4048.6381
Date: 2021-10-24 11:23 BIC: 4098.7510
No. Observations: 3882 Log-Likelihood: -2016.3
Df Model: 7 LL-Null: -2224.3
Df Residuals: 3874 LLR p-value: 9.2401e-86
Converged: 1.0000 Scale: 1.0000
No. Iterations: 7.0000
-----------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
-----------------------------------------------------------------
is_ico -1.0620 0.1276 -8.3229 0.0000 -1.3121 -0.8119
ratxteamsize 0.0507 0.0057 8.8940 0.0000 0.0395 0.0619
rating -0.1944 0.0467 -4.1657 0.0000 -0.2858 -0.1029
bonus -2.6637 0.2958 -9.0046 0.0000 -3.2434 -2.0839
teamsize -0.1228 0.0205 -5.9808 0.0000 -0.1630 -0.0826
kyc 0.2291 0.0917 2.4978 0.0125 0.0493 0.4088
whitelist 0.0858 0.0838 1.0243 0.3057 -0.0784 0.2500
logsoldtokens 0.0773 0.0152 5.0848 0.0000 0.0475 0.1071
=================================================================
# B. Linear Probability Model
# logit regression
X = sm.add_constant(X)
ols_model=sm.OLS(y,X.astype(float), missing='drop')
result=ols_model.fit()
print(result.summary2())
#print(result.rsquared_adj)
Results: Ordinary least squares
==================================================================
Model: OLS Adj. R-squared: 0.114
Dependent Variable: success AIC: 4159.3884
Date: 2021-10-24 11:23 BIC: 4215.7654
No. Observations: 3882 Log-Likelihood: -2070.7
Df Model: 8 F-statistic: 63.19
Df Residuals: 3873 Prob (F-statistic): 1.28e-97
R-squared: 0.115 Scale: 0.17055
------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------
const 0.0799 0.0486 1.6432 0.1004 -0.0154 0.1752
is_ico -0.0582 0.0311 -1.8720 0.0613 -0.1192 0.0028
ratxteamsize 0.0039 0.0011 3.7106 0.0002 0.0019 0.0060
rating 0.0454 0.0128 3.5599 0.0004 0.0204 0.0704
bonus -0.2910 0.0235 -12.3879 0.0000 -0.3371 -0.2449
teamsize -0.0046 0.0036 -1.2771 0.2016 -0.0116 0.0025
kyc 0.0413 0.0155 2.6717 0.0076 0.0110 0.0717
whitelist 0.0270 0.0146 1.8534 0.0639 -0.0016 0.0555
logsoldtokens 0.0153 0.0029 5.3465 0.0000 0.0097 0.0210
------------------------------------------------------------------
Omnibus: 537.531 Durbin-Watson: 1.809
Prob(Omnibus): 0.000 Jarque-Bera (JB): 596.738
Skew: 0.909 Prob(JB): 0.000
Kurtosis: 2.378 Condition No.: 371
==================================================================
# load data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
# fit all models
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
print('The accuracy of our model is',models['Accuracy']['LogisticRegression'])
print('The ROC AUC of our model is',models['ROC AUC']['LogisticRegression'])
100%|██████████| 29/29 [00:18<00:00, 1.54it/s]The accuracy of our model is 0.7388932190179267
The ROC AUC of our model is 0.5709137150300849
models