import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('/work/tord_v3_edited.csv')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6415 entries, 0 to 6414
Data columns (total 34 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 6415 non-null int64
1 name 6415 non-null object
2 token 6282 non-null object
3 country 6415 non-null object
4 is_ico 6415 non-null int64
5 is_ieo 6415 non-null int64
6 is_sto 6415 non-null int64
7 ico_start 5636 non-null object
8 ico_end 5497 non-null object
9 price_usd 5928 non-null object
10 raised_usd 2139 non-null float64
11 distributed_in_ico 4870 non-null float64
12 sold_tokens 192 non-null float64
13 token_for_sale 5122 non-null float64
14 whitelist 3882 non-null object
15 kyc 6415 non-null int64
16 bonus 6415 non-null int64
17 restricted_areas 2343 non-null object
18 min_investment 2079 non-null object
19 bounty 6415 non-null int64
20 mvp 1296 non-null object
21 pre_ico_start 2717 non-null object
22 pre_ico_end 2705 non-null object
23 pre_ico_price_usd 1733 non-null object
24 platform 6415 non-null int64
25 accepting 5545 non-null object
26 link_white_paper 5828 non-null object
27 linkedin_link 4355 non-null object
28 github_link 5649 non-null object
29 website 5649 non-null object
30 rating 5709 non-null float64
31 teamsize 4622 non-null float64
32 Coinmarketcap_identifier 1281 non-null float64
33 ERC20 5679 non-null float64
dtypes: float64(8), int64(8), object(18)
memory usage: 1.7+ MB
# counr the number of tokens accepting
df = df.copy()
df['accept_count'] = 0
for i in range(len(df['id'])):
if df.loc[i,'accepting'] != 'nan':
df.loc[i,'accept_count'] = str(df.loc[i,'accepting']).count(',')+1
df['accept_count'].describe()
df = df.sort_values(by = 'raised_usd', ascending= False)
df.head()
df['min_invest_dum'] = np.where(df['min_investment'] != np.nan, 1, 0)
df['min_invest_dum'].describe()
# data cleaning, choosing those is lised on ico and has finished
df = df[df['is_ico']== 1]
df = df[df['ico_end'].isnull() == False]
df['token_for_sale'] = df['token_for_sale'].fillna(0)
df['sold_tokens'] = df['sold_tokens'].fillna(0)
df['distributed_in_ico'] = df['distributed_in_ico'].fillna(0)
df['ERC20'] = df['ERC20'].fillna(0)
df['teamsize'].describe()
#replace the teamzie with median
df['teamsize'] = df['teamsize'].fillna(11)
df['teamsize'].hist(bins = 30)
plt.show()
#define Y
# sanitize "raised_usd"
df['raised_usd'] = df['raised_usd'].fillna(0)
# create outcome variable success
df['success'] = np.where(df['raised_usd'] >= 500000, 1, 0)
df['success'].describe()
sns.pairplot(df, vars = ['raised_usd','teamsize','success','accept_count',])
# Potential multicollinearity issue
corr = df.corr()
sns.heatmap(corr)
plt.show()
!pip install statsmodels
Requirement already satisfied: statsmodels in /root/venv/lib/python3.7/site-packages (0.13.0)
Requirement already satisfied: pandas>=0.25 in /root/venv/lib/python3.7/site-packages (from statsmodels) (1.0.5)
Requirement already satisfied: patsy>=0.5.2 in /root/venv/lib/python3.7/site-packages (from statsmodels) (0.5.2)
Requirement already satisfied: numpy>=1.17 in /root/venv/lib/python3.7/site-packages (from statsmodels) (1.19.1)
Requirement already satisfied: scipy>=1.3 in /root/venv/lib/python3.7/site-packages (from statsmodels) (1.5.4)
Requirement already satisfied: python-dateutil>=2.6.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2021.3)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)
WARNING: You are using pip version 21.2.4; however, version 21.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
import statsmodels.api as sm
from statsmodels.formula.api import ols, logit
# dependent/target/outcome variable
y = df['success']
# independent/predictor/explanatory variable
Xlist = ['token_for_sale','sold_tokens','teamsize','accept_count','distributed_in_ico','ERC20']
X = df[Xlist]
# A. Logit regression
# turn independent variables into floating type (best practice)
# 'missing='drop'' drops rows with missing values from the regression
logit_model=sm.Logit(y,X.astype(float), missing='drop' )
# fit logit model into the data
result=logit_model.fit()
# summarize the logit model
print(result.summary2())
Optimization terminated successfully.
Current function value: 0.641035
Iterations 25
Results: Logit
===================================================================
Model: Logit Pseudo R-squared: -0.022
Dependent Variable: success AIC: 6513.3734
Date: 2021-10-20 06:52 BIC: 6552.5612
No. Observations: 5071 Log-Likelihood: -3250.7
Df Model: 5 LL-Null: -3181.3
Df Residuals: 5065 LLR p-value: 1.0000
Converged: 1.0000 Scale: 1.0000
No. Iterations: 25.0000
-------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
-------------------------------------------------------------------
token_for_sale 0.0000 0.0000 0.3022 0.7625 -0.0000 0.0000
sold_tokens 0.0000 0.0000 2.4679 0.0136 0.0000 0.0000
teamsize 0.0139 0.0042 3.3034 0.0010 0.0056 0.0221
accept_count -0.1443 0.0180 -8.0253 0.0000 -0.1796 -0.1091
distributed_in_ico -0.0003 0.0003 -1.0027 0.3160 -0.0008 0.0003
ERC20 -0.5639 0.0555 -10.1572 0.0000 -0.6727 -0.4550
===================================================================
sns.regplot(x = "sold_tokens", y = "success", data = df,
# old plot was sns.regplot(x = "sold_tokens", y = "success", data = df,
logistic = True, y_jitter = .05)
plt.ylabel("success probability")
sns.regplot(x = "teamsize", y = "success", data = df,
# old plot was sns.regplot(x = "sold_tokens", y = "success", data = df,
logistic = True, y_jitter = .05)
plt.ylabel("success probability")
'''
options for "at"
1. 'overall' The average of the marginal effects at each observation
2. 'mean' The marginal effects at the mean of each regressor
3. 'median' The marginal effects at the median of each regressor
4. 'zero' The marginal effects at zero for each regressor
5. 'all' The marginal effects at each observation.
options for "method"
1. 'dydx' No transformation is made and amrginal effects are returned
2. 'eyex' estimate elasticities of variables in exog
3. 'dyex' estimate semi-elasticity
4. 'eydx' estimate semi-elasticity
'''
average_marginal_effect = result.get_margeff(at = "mean", method = "dydx")
print(average_marginal_effect.summary())
Logit Marginal Effects
=====================================
Dep. Variable: success
Method: dydx
At: mean
======================================================================================
dy/dx std err z P>|z| [0.025 0.975]
--------------------------------------------------------------------------------------
token_for_sale 0 0 nan nan 0 0
sold_tokens 0 0 nan nan 0 0
teamsize 0 0 nan nan 0 0
accept_count 0 0 nan nan 0 0
distributed_in_ico 0 0 nan nan 0 0
ERC20 0 0 nan nan 0 0
======================================================================================
/root/venv/lib/python3.7/site-packages/statsmodels/discrete/discrete_margins.py:435: RuntimeWarning: invalid value encountered in true_divide
return self.margeff / self.margeff_se
# B. Linear Probability Model
# logit regression
X = sm.add_constant(X)
ols_model=sm.OLS(y,X.astype(float), missing='drop')
result=ols_model.fit()
print(result.summary2())
Results: Ordinary least squares
==================================================================
Model: OLS Adj. R-squared: 0.038
Dependent Variable: success AIC: 6475.1084
Date: 2021-10-20 06:53 BIC: 6514.2962
No. Observations: 5071 Log-Likelihood: -3231.6
Df Model: 5 F-statistic: 40.63
Df Residuals: 5065 Prob (F-statistic): 4.14e-41
R-squared: 0.039 Scale: 0.20968
------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------
const 0.0748 0.0060 12.5125 0.0000 0.0631 0.0865
token_for_sale 0.0000 0.0000 1.5971 0.1103 -0.0000 0.0000
sold_tokens 0.0000 0.0000 1.7552 0.0793 -0.0000 0.0000
teamsize 0.0135 0.0010 13.5211 0.0000 0.0115 0.0154
accept_count 0.0082 0.0037 2.2346 0.0255 0.0010 0.0153
distributed_in_ico 0.0000 0.0001 0.0843 0.9328 -0.0001 0.0001
ERC20 0.1008 0.0083 12.1356 0.0000 0.0845 0.1170
------------------------------------------------------------------
Omnibus: 8621.085 Durbin-Watson: 0.079
Prob(Omnibus): 0.000 Jarque-Bera (JB): 786.640
Skew: 0.731 Prob(JB): 0.000
Kurtosis: 1.741 Condition No.: 1186621678309991
==================================================================
* The condition number is large (1e+15). This might indicate
strong multicollinearity or other numerical problems.
# Use wrapper lazypredict
!pip install lazypredict
Requirement already satisfied: lazypredict in /root/venv/lib/python3.7/site-packages (0.2.9)
Requirement already satisfied: scikit-learn==0.23.1 in /root/venv/lib/python3.7/site-packages (from lazypredict) (0.23.1)
Requirement already satisfied: pytest==5.4.3 in /root/venv/lib/python3.7/site-packages (from lazypredict) (5.4.3)
Requirement already satisfied: numpy==1.19.1 in /root/venv/lib/python3.7/site-packages (from lazypredict) (1.19.1)
Requirement already satisfied: PyYAML==5.3.1 in /root/venv/lib/python3.7/site-packages (from lazypredict) (5.3.1)
Requirement already satisfied: tqdm==4.56.0 in /root/venv/lib/python3.7/site-packages (from lazypredict) (4.56.0)
Requirement already satisfied: xgboost==1.1.1 in /root/venv/lib/python3.7/site-packages (from lazypredict) (1.1.1)
Collecting six==1.15.0
Using cached six-1.15.0-py2.py3-none-any.whl (10 kB)
Requirement already satisfied: lightgbm==2.3.1 in /root/venv/lib/python3.7/site-packages (from lazypredict) (2.3.1)
Requirement already satisfied: pandas==1.0.5 in /root/venv/lib/python3.7/site-packages (from lazypredict) (1.0.5)
Requirement already satisfied: scipy==1.5.4 in /root/venv/lib/python3.7/site-packages (from lazypredict) (1.5.4)
Requirement already satisfied: joblib==1.0.0 in /root/venv/lib/python3.7/site-packages (from lazypredict) (1.0.0)
Requirement already satisfied: click==7.1.2 in /root/venv/lib/python3.7/site-packages (from lazypredict) (7.1.2)
Requirement already satisfied: python-dateutil>=2.6.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas==1.0.5->lazypredict) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas==1.0.5->lazypredict) (2021.3)
Requirement already satisfied: attrs>=17.4.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (21.2.0)
Collecting pluggy<1.0,>=0.12
Using cached pluggy-0.13.1-py2.py3-none-any.whl (18 kB)
Requirement already satisfied: more-itertools>=4.0.0 in /root/venv/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (8.10.0)
Requirement already satisfied: py>=1.5.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (1.10.0)
Requirement already satisfied: packaging in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (21.0)
Requirement already satisfied: wcwidth in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (0.2.5)
Requirement already satisfied: importlib-metadata>=0.12 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (4.8.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn==0.23.1->lazypredict) (3.0.0)
Requirement already satisfied: typing-extensions>=3.6.4 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from importlib-metadata>=0.12->pytest==5.4.3->lazypredict) (3.10.0.2)
Requirement already satisfied: zipp>=0.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from importlib-metadata>=0.12->pytest==5.4.3->lazypredict) (3.6.0)
Requirement already satisfied: pyparsing>=2.0.2 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from packaging->pytest==5.4.3->lazypredict) (2.4.7)
Installing collected packages: six, pluggy
Attempting uninstall: six
Found existing installation: six 1.16.0
Not uninstalling six at /shared-libs/python3.7/py-core/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'six'. No files were found to uninstall.
Attempting uninstall: pluggy
Found existing installation: pluggy 1.0.0
Not uninstalling pluggy at /shared-libs/python3.7/py-core/lib/python3.7/site-packages, outside environment /root/venv
Can't uninstall 'pluggy'. No files were found to uninstall.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.4.1 requires numpy~=1.19.2, but you have numpy 1.19.1 which is incompatible.
tensorflow 2.4.1 requires typing-extensions~=3.7.4, but you have typing-extensions 3.10.0.2 which is incompatible.
Successfully installed pluggy-0.13.1 six-1.15.0
WARNING: You are using pip version 21.2.4; however, version 21.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.model_selection import train_test_split
/root/venv/lib/python3.7/site-packages/sklearn/utils/deprecation.py:143: FutureWarning: The sklearn.utils.testing module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.utils. Anything that cannot be imported from sklearn.utils is now part of the private API.
warnings.warn(message, FutureWarning)
# load data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
# fit all models
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
100%|██████████| 29/29 [00:07<00:00, 3.69it/s]
models
predictions