# Start writing code here...
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('/work/tord_v3_edited.csv')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6415 entries, 0 to 6414
Data columns (total 34 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 6415 non-null int64
1 name 6415 non-null object
2 token 6282 non-null object
3 country 6415 non-null object
4 is_ico 6415 non-null int64
5 is_ieo 6415 non-null int64
6 is_sto 6415 non-null int64
7 ico_start 5636 non-null object
8 ico_end 5497 non-null object
9 price_usd 5928 non-null object
10 raised_usd 2139 non-null float64
11 distributed_in_ico 4870 non-null float64
12 sold_tokens 192 non-null float64
13 token_for_sale 5122 non-null float64
14 whitelist 3882 non-null object
15 kyc 6415 non-null int64
16 bonus 6415 non-null int64
17 restricted_areas 2343 non-null object
18 min_investment 2079 non-null object
19 bounty 6415 non-null int64
20 mvp 1296 non-null object
21 pre_ico_start 2717 non-null object
22 pre_ico_end 2705 non-null object
23 pre_ico_price_usd 1733 non-null object
24 platform 6415 non-null int64
25 accepting 5545 non-null object
26 link_white_paper 5828 non-null object
27 linkedin_link 4355 non-null object
28 github_link 5649 non-null object
29 website 5649 non-null object
30 rating 5709 non-null float64
31 teamsize 4622 non-null float64
32 Coinmarketcap_identifier 1281 non-null float64
33 ERC20 5679 non-null float64
dtypes: float64(8), int64(8), object(18)
memory usage: 1.7+ MB
df.head()
df['raised_usd'] = df['raised_usd'].fillna(0)
df['success'] = np.where(df['raised_usd'] >= 500000, 1, 0)
# Potential multicollinearity issue
corr = df.corr()
plt.figure(figsize=(21,16))
sns.heatmap(corr)
plt.show()
df.columns
corr.columns
!pip install statsmodels
Collecting statsmodels
Downloading statsmodels-0.13.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
|████████████████████████████████| 9.8 MB 25.8 MB/s
Requirement already satisfied: numpy>=1.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.19.5)
Requirement already satisfied: pandas>=0.25 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.2.5)
Requirement already satisfied: scipy>=1.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.7.1)
Collecting patsy>=0.5.2
Downloading patsy-0.5.2-py2.py3-none-any.whl (233 kB)
|████████████████████████████████| 233 kB 22.8 MB/s
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2021.3)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.2 statsmodels-0.13.0
WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
import statsmodels.api as sm
from statsmodels.formula.api import ols, logit
# dependent/target/outcome variable
y = df['success']
# independent/predictor/explanatory variable
X = df[['is_ico', 'is_sto', 'raised_usd',
'token_for_sale', 'bonus', 'bounty',
'rating', 'teamsize', 'ERC20']] # 'distributed_in_ico', 'platform','kyc',
# A. Logit regression
# turn independent variables into floating type (best practice)
# 'missing='drop'' drops rows with missing values from the regression
logit_model=sm.Logit(y,X.astype(float), missing='drop' )
# fit logit model into the data
result=logit_model.fit()
# summarize the logit model
print(result.summary2())
Warning: Maximum number of iterations has been exceeded.
Current function value: 0.009040
Iterations: 35
Results: Logit
=================================================================
Model: Logit Pseudo R-squared: 0.985
Dependent Variable: success AIC: 80.3925
Date: 2021-10-25 02:53 BIC: 135.7103
No. Observations: 3451 Log-Likelihood: -31.196
Df Model: 8 LL-Null: -2146.2
Df Residuals: 3442 LLR p-value: 0.0000
Converged: 0.0000 Scale: 1.0000
No. Iterations: 35.0000
-----------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
-----------------------------------------------------------------
is_ico -2.2177 0.8908 -2.4896 0.0128 -3.9636 -0.4718
is_sto -1.4810 207.2716 -0.0071 0.9943 -407.7260 404.7639
raised_usd 0.0000 0.0000 6.1641 0.0000 0.0000 0.0000
token_for_sale -0.0000 0.0000 -0.4233 0.6721 -0.0000 0.0000
bonus -0.9625 8.9224 -0.1079 0.9141 -18.4500 16.5250
bounty 0.7843 0.7585 1.0340 0.3012 -0.7024 2.2709
rating -2.5179 0.5922 -4.2519 0.0000 -3.6786 -1.3573
teamsize 0.0194 0.0550 0.3531 0.7240 -0.0884 0.1273
ERC20 -2.8153 1.2117 -2.3233 0.0202 -5.1902 -0.4403
=================================================================
/root/venv/lib/python3.7/site-packages/statsmodels/base/model.py:606: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
ConvergenceWarning)
import statsmodels.api as sm
from statsmodels.formula.api import ols, logit
# dependent/target/outcome variable
y = df['success']
# independent/predictor/explanatory variable
X = df[['is_ico', 'is_sto', 'kyc', 'bonus', 'bounty',
'rating', 'teamsize', 'ERC20','platform', 'token_for_sale', 'distributed_in_ico' ]] # ,'raised_usd',
# A. Logit regression
# turn independent variables into floating type (best practice)
# 'missing='drop'' drops rows with missing values from the regression
logit_model=sm.Logit(y,X.astype(float), missing='drop' )
# fit logit model into the data
result=logit_model.fit()
# summarize the logit model
print(result.summary2())
Optimization terminated successfully.
Current function value: 0.570506
Iterations 7
Results: Logit
==================================================================
Model: Logit Pseudo R-squared: 0.078
Dependent Variable: success AIC: 3522.6226
Date: 2021-10-25 02:53 BIC: 3588.9392
No. Observations: 3068 Log-Likelihood: -1750.3
Df Model: 10 LL-Null: -1897.7
Df Residuals: 3057 LLR p-value: 1.9113e-57
Converged: 1.0000 Scale: 1.0000
No. Iterations: 7.0000
------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
------------------------------------------------------------------
is_ico -1.1134 0.1298 -8.5811 0.0000 -1.3678 -0.8591
is_sto -1.6114 1.0776 -1.4954 0.1348 -3.7234 0.5006
kyc -0.2078 0.0887 -2.3432 0.0191 -0.3817 -0.0340
bonus -2.7504 0.3128 -8.7942 0.0000 -3.3634 -2.1374
bounty -0.2140 0.0882 -2.4277 0.0152 -0.3868 -0.0412
rating 0.1368 0.0500 2.7380 0.0062 0.0389 0.2348
teamsize 0.0566 0.0063 9.0194 0.0000 0.0443 0.0689
ERC20 -0.4591 0.1134 -4.0495 0.0001 -0.6813 -0.2369
platform -0.1076 0.0834 -1.2910 0.1967 -0.2710 0.0558
token_for_sale -0.0000 0.0000 -1.6445 0.1001 -0.0000 0.0000
distributed_in_ico -0.0000 0.0003 -0.0435 0.9653 -0.0007 0.0006
==================================================================
average_marginal_effect = result.get_margeff(at = "mean", method = "dydx")
print(average_marginal_effect.summary())
Logit Marginal Effects
=====================================
Dep. Variable: success
Method: dydx
At: mean
======================================================================================
dy/dx std err z P>|z| [0.025 0.975]
--------------------------------------------------------------------------------------
is_ico -0.2274 0.026 -8.602 0.000 -0.279 -0.176
is_sto -0.3292 0.220 -1.496 0.135 -0.760 0.102
kyc -0.0425 0.018 -2.343 0.019 -0.078 -0.007
bonus -0.5618 0.058 -9.613 0.000 -0.676 -0.447
bounty -0.0437 0.018 -2.427 0.015 -0.079 -0.008
rating 0.0280 0.010 2.721 0.007 0.008 0.048
teamsize 0.0116 0.001 9.040 0.000 0.009 0.014
ERC20 -0.0938 0.023 -4.067 0.000 -0.139 -0.049
platform -0.0220 0.017 -1.291 0.197 -0.055 0.011
token_for_sale -3.217e-14 1.95e-14 -1.651 0.099 -7.04e-14 6.02e-15
distributed_in_ico -2.881e-06 6.63e-05 -0.043 0.965 -0.000 0.000
======================================================================================
sns.regplot(x = "rating", y = "success", data = df,
logistic = True, y_jitter = .05)
plt.ylabel("success probability")
# B. Linear Probability Model
# logit regression
X = sm.add_constant(X)
ols_model=sm.OLS(y,X.astype(float), missing='drop')
result=ols_model.fit()
print(result.summary2())
Results: Ordinary least squares
===================================================================
Model: OLS Adj. R-squared: 0.122
Dependent Variable: success AIC: 3585.8829
Date: 2021-10-25 02:53 BIC: 3658.2282
No. Observations: 3068 Log-Likelihood: -1780.9
Df Model: 11 F-statistic: 39.66
Df Residuals: 3056 Prob (F-statistic): 1.01e-80
R-squared: 0.125 Scale: 0.18769
-------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
-------------------------------------------------------------------
const -0.2367 0.0585 -4.0464 0.0001 -0.3513 -0.1220
is_ico -0.0126 0.0323 -0.3886 0.6976 -0.0759 0.0508
is_sto -0.2694 0.1449 -1.8589 0.0631 -0.5535 0.0148
kyc -0.0472 0.0171 -2.7675 0.0057 -0.0806 -0.0138
bonus -0.3145 0.0263 -11.9755 0.0000 -0.3660 -0.2630
bounty -0.0610 0.0168 -3.6252 0.0003 -0.0939 -0.0280
rating 0.1555 0.0140 11.0774 0.0000 0.1280 0.1830
teamsize 0.0098 0.0012 8.2146 0.0000 0.0074 0.0121
ERC20 0.0244 0.0242 1.0100 0.3126 -0.0230 0.0717
platform -0.0002 0.0161 -0.0134 0.9893 -0.0318 0.0314
token_for_sale -0.0000 0.0000 -1.6517 0.0987 -0.0000 0.0000
distributed_in_ico 0.0000 0.0001 0.2162 0.8289 -0.0001 0.0001
-------------------------------------------------------------------
Omnibus: 1540.998 Durbin-Watson: 1.785
Prob(Omnibus): 0.000 Jarque-Bera (JB): 362.247
Skew: 0.629 Prob(JB): 0.000
Kurtosis: 1.883 Condition No.: 28979484379988
===================================================================
* The condition number is large (3e+13). This might indicate
strong multicollinearity or other numerical problems.
# C. Linear Probability Model with transformation
# dependent/target/outcome variable
y3 = df['success']
df['token_for_sale_log'] = np.log(df['token_for_sale'])
# independent/predictor/explanatory variable
X3 = df[['is_ico', 'is_sto', 'kyc', 'bonus', 'bounty',
'rating', 'teamsize', 'ERC20','platform', 'token_for_sale_log', 'distributed_in_ico' ]] # raised_usd
# turn independent variables into floating type (best practice)
# 'missing='drop'' drops rows with missing values from the regression
logit_model=sm.Logit(y3,X3.astype(float), missing='drop' )
# fit logit model into the data
result=logit_model.fit()
# summarize the logit model
print(result.summary2())
Optimization terminated successfully.
Current function value: 0.559364
Iterations 7
Results: Logit
==================================================================
Model: Logit Pseudo R-squared: 0.096
Dependent Variable: success AIC: 3454.2592
Date: 2021-10-25 02:55 BIC: 3520.5758
No. Observations: 3068 Log-Likelihood: -1716.1
Df Model: 10 LL-Null: -1897.7
Df Residuals: 3057 LLR p-value: 6.2577e-72
Converged: 1.0000 Scale: 1.0000
No. Iterations: 7.0000
------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
------------------------------------------------------------------
is_ico -0.6141 0.1428 -4.3016 0.0000 -0.8939 -0.3343
is_sto -1.6680 1.0799 -1.5445 0.1225 -3.7846 0.4487
kyc -0.1863 0.0901 -2.0677 0.0387 -0.3629 -0.0097
bonus -2.7722 0.3134 -8.8470 0.0000 -3.3864 -2.1581
bounty -0.2440 0.0892 -2.7341 0.0063 -0.4188 -0.0691
rating 0.4950 0.0663 7.4677 0.0000 0.3651 0.6250
teamsize 0.0565 0.0063 8.9384 0.0000 0.0441 0.0689
ERC20 -0.1622 0.1216 -1.3341 0.1822 -0.4004 0.0761
platform -0.0422 0.0850 -0.4964 0.6196 -0.2087 0.1243
token_for_sale_log -0.1042 0.0124 -8.4084 0.0000 -0.1285 -0.0799
distributed_in_ico -0.0001 0.0003 -0.2301 0.8180 -0.0007 0.0006
==================================================================
# Use wrapper lazypredict
!pip install lazypredict
Collecting lazypredict
Downloading lazypredict-0.2.9-py2.py3-none-any.whl (12 kB)
Collecting tqdm==4.56.0
Downloading tqdm-4.56.0-py2.py3-none-any.whl (72 kB)
|████████████████████████████████| 72 kB 1.7 MB/s
Collecting lightgbm==2.3.1
Downloading lightgbm-2.3.1-py2.py3-none-manylinux1_x86_64.whl (1.2 MB)
|████████████████████████████████| 1.2 MB 27.8 MB/s
Collecting pytest==5.4.3
Downloading pytest-5.4.3-py3-none-any.whl (248 kB)
|████████████████████████████████| 248 kB 42.8 MB/s
Collecting scikit-learn==0.23.1
Downloading scikit_learn-0.23.1-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
|████████████████████████████████| 6.8 MB 18.5 MB/s
Collecting joblib==1.0.0
Downloading joblib-1.0.0-py3-none-any.whl (302 kB)
|████████████████████████████████| 302 kB 37.2 MB/s
Collecting scipy==1.5.4
Downloading scipy-1.5.4-cp37-cp37m-manylinux1_x86_64.whl (25.9 MB)
|████████████████████████████████| 25.9 MB 26.6 MB/s
Collecting pandas==1.0.5
Downloading pandas-1.0.5-cp37-cp37m-manylinux1_x86_64.whl (10.1 MB)
|████████████████████████████████| 10.1 MB 29.5 MB/s
Collecting click==7.1.2
Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)
|████████████████████████████████| 82 kB 2.4 MB/s
Collecting xgboost==1.1.1
Downloading xgboost-1.1.1-py3-none-manylinux2010_x86_64.whl (127.6 MB)
|████████████████████████████████| 127.6 MB 44 kB/s
Collecting numpy==1.19.1
Downloading numpy-1.19.1-cp37-cp37m-manylinux2010_x86_64.whl (14.5 MB)
|████████████████████████████████| 14.5 MB 35.7 MB/s
Collecting PyYAML==5.3.1
Downloading PyYAML-5.3.1.tar.gz (269 kB)
|████████████████████████████████| 269 kB 44.8 MB/s
Collecting six==1.15.0
Downloading six-1.15.0-py2.py3-none-any.whl (10 kB)
Requirement already satisfied: python-dateutil>=2.6.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas==1.0.5->lazypredict) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas==1.0.5->lazypredict) (2021.3)
Requirement already satisfied: py>=1.5.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (1.10.0)
Requirement already satisfied: attrs>=17.4.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (21.2.0)
Requirement already satisfied: importlib-metadata>=0.12 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (4.8.1)
Requirement already satisfied: wcwidth in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (0.2.5)
Collecting more-itertools>=4.0.0
Downloading more_itertools-8.10.0-py3-none-any.whl (51 kB)
|████████████████████████████████| 51 kB 488 kB/s
Requirement already satisfied: packaging in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (21.0)
Collecting pluggy<1.0,>=0.12
Downloading pluggy-0.13.1-py2.py3-none-any.whl (18 kB)
Requirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn==0.23.1->lazypredict) (3.0.0)
Requirement already satisfied: typing-extensions>=3.6.4 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from importlib-metadata>=0.12->pytest==5.4.3->lazypredict) (3.10.0.2)
Requirement already satisfied: zipp>=0.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from importlib-metadata>=0.12->pytest==5.4.3->lazypredict) (3.6.0)
Requirement already satisfied: pyparsing>=2.0.2 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from packaging->pytest==5.4.3->lazypredict) (2.4.7)
Building wheels for collected packages: PyYAML
Building wheel for PyYAML (setup.py) ... done
Created wheel for PyYAML: filename=PyYAML-5.3.1-cp37-cp37m-linux_x86_64.whl size=44635 sha256=984f75a81c90e3758ce41ac315aa6182c9c724b26e14c31dff2a62095fed090c
Stored in directory: /root/.cache/pip/wheels/5e/03/1e/e1e954795d6f35dfc7b637fe2277bff021303bd9570ecea653
Successfully built PyYAML
Installing collected packages: numpy, six, scipy, joblib, scikit-learn, pluggy, more-itertools, xgboost, tqdm, PyYAML, pytest, pandas, lightgbm, click, lazypredict
Attempting uninstall: numpy
Found existing installation: numpy 1.19.5
Not uninstalling numpy at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'numpy'. No files were found to uninstall.
Attempting uninstall: six
Found existing installation: six 1.16.0
Not uninstalling six at /shared-libs/python3.7/py-core/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'six'. No files were found to uninstall.
Attempting uninstall: scipy
Found existing installation: scipy 1.7.1
Not uninstalling scipy at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'scipy'. No files were found to uninstall.
Attempting uninstall: joblib
Found existing installation: joblib 1.1.0
Not uninstalling joblib at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'joblib'. No files were found to uninstall.
Attempting uninstall: scikit-learn
Found existing installation: scikit-learn 1.0
Not uninstalling scikit-learn at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'scikit-learn'. No files were found to uninstall.
Attempting uninstall: pluggy
Found existing installation: pluggy 1.0.0
Not uninstalling pluggy at /shared-libs/python3.7/py-core/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'pluggy'. No files were found to uninstall.
Attempting uninstall: tqdm
Found existing installation: tqdm 4.62.3
Not uninstalling tqdm at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'tqdm'. No files were found to uninstall.
Attempting uninstall: PyYAML
Found existing installation: PyYAML 5.4.1
Not uninstalling pyyaml at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'PyYAML'. No files were found to uninstall.
Attempting uninstall: pytest
Found existing installation: pytest 6.2.5
Not uninstalling pytest at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'pytest'. No files were found to uninstall.
Attempting uninstall: pandas
Found existing installation: pandas 1.2.5
Not uninstalling pandas at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'pandas'. No files were found to uninstall.
Attempting uninstall: click
Found existing installation: click 8.0.3
Not uninstalling click at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'click'. No files were found to uninstall.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.4.1 requires numpy~=1.19.2, but you have numpy 1.19.1 which is incompatible.
tensorflow 2.4.1 requires typing-extensions~=3.7.4, but you have typing-extensions 3.10.0.2 which is incompatible.
Successfully installed PyYAML-5.3.1 click-7.1.2 joblib-1.0.0 lazypredict-0.2.9 lightgbm-2.3.1 more-itertools-8.10.0 numpy-1.19.1 pandas-1.0.5 pluggy-0.13.1 pytest-5.4.3 scikit-learn-0.23.1 scipy-1.5.4 six-1.15.0 tqdm-4.56.0 xgboost-1.1.1
WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.model_selection import train_test_split
/root/venv/lib/python3.7/site-packages/sklearn/utils/deprecation.py:143: FutureWarning: The sklearn.utils.testing module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.utils. Anything that cannot be imported from sklearn.utils is now part of the private API.
warnings.warn(message, FutureWarning)
# load data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
# fit all models
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
100%|██████████| 29/29 [00:23<00:00, 1.21it/s]
predictions
models