Lab 8 Live - Duplicate

import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns

# Load the ICO dataset and print its schema (dtype and non-null count per column).
df = pd.read_csv(filepath_or_buffer='/work/tord_v3_edited.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6415 entries, 0 to 6414
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        6415 non-null   int64  
 1   name                      6415 non-null   object 
 2   token                     6282 non-null   object 
 3   country                   6415 non-null   object 
 4   is_ico                    6415 non-null   int64  
 5   is_ieo                    6415 non-null   int64  
 6   is_sto                    6415 non-null   int64  
 7   ico_start                 5636 non-null   object 
 8   ico_end                   5497 non-null   object 
 9   price_usd                 5928 non-null   object 
 10  raised_usd                2139 non-null   float64
 11  distributed_in_ico        4870 non-null   float64
 12  sold_tokens               192 non-null    float64
 13  token_for_sale            5122 non-null   float64
 14  whitelist                 3882 non-null   object 
 15  kyc                       6415 non-null   int64  
 16  bonus                     6415 non-null   int64  
 17  restricted_areas          2343 non-null   object 
 18  min_investment            2079 non-null   object 
 19  bounty                    6415 non-null   int64  
 20  mvp                       1296 non-null   object 
 21  pre_ico_start             2717 non-null   object 
 22  pre_ico_end               2705 non-null   object 
 23  pre_ico_price_usd         1733 non-null   object 
 24  platform                  6415 non-null   int64  
 25  accepting                 5545 non-null   object 
 26  link_white_paper          5828 non-null   object 
 27  linkedin_link             4355 non-null   object 
 28  github_link               5649 non-null   object 
 29  website                   5649 non-null   object 
 30  rating                    5709 non-null   float64
 31  teamsize                  4622 non-null   float64
 32  Coinmarketcap_identifier  1281 non-null   float64
 33  ERC20                     5679 non-null   float64
dtypes: float64(8), int64(8), object(18)
memory usage: 1.7+ MB

# Count how many payment currencies each offering accepts.
# `accepting` is treated as a comma-separated list, so the number of entries
# is the comma count plus one.
# A missing value is a float NaN, which never equals the string 'nan'; the
# previous per-row `!= 'nan'` check therefore let missing rows through and
# scored them as 1.  Missing rows must count as 0, which `notna()` guarantees.
df = df.copy()
df['accept_count'] = (
    df['accepting']
    .astype(str)                         # same str() coercion as before
    .str.count(',')
    .add(1)
    .where(df['accepting'].notna(), 0)   # nothing listed -> 0 currencies
    .astype(int)
)
df['accept_count'].describe()

# Order offerings from the largest to the smallest amount raised,
# then preview the first rows.
df = df.sort_values('raised_usd', ascending=False)
df.head()

# Dummy: 1 when the offering states a minimum investment, 0 when it does not.
# NaN never compares equal to anything (itself included), so `!= np.nan` is
# True for every row and made this dummy constantly 1; `notna()` is the
# correct missing-value test.
df['min_invest_dum'] = df['min_investment'].notna().astype(int)
df['min_invest_dum'].describe()

# Data cleaning: keep only offerings that are flagged as an ICO and have an
# ICO end date recorded, and treat missing token quantities / ERC20 flag as 0.
# Filtering and filling are chained so the fills land on a fresh frame rather
# than being assigned column-by-column onto a boolean-filtered slice (which
# raises SettingWithCopyWarning).
df = (
    df[(df['is_ico'] == 1) & df['ico_end'].notna()]
    .fillna({
        'token_for_sale': 0,
        'sold_tokens': 0,
        'distributed_in_ico': 0,
        'ERC20': 0,
    })
)

# Summary statistics of team size (count, mean, quartiles, ...) on the cleaned rows.
df['teamsize'].describe()

# Replace missing team sizes with the median.
# The median is computed from the data instead of being hard-coded (it was
# the literal 11), so the fill value cannot go stale when the input file or
# the row filtering changes.
df['teamsize'] = df['teamsize'].fillna(df['teamsize'].median())
df['teamsize'].hist(bins=30)
plt.show()

# Define Y.
# Sanitize "raised_usd": offerings with no reported amount count as 0 raised.
df['raised_usd'] = df['raised_usd'].fillna(0)
# Outcome variable: success = 1 when at least 500000 USD was raised, else 0.
df['success'] = (df['raised_usd'] >= 500000).astype(int)

# Summary statistics of the 0/1 success indicator; its mean is the share of rows coded 1.
df['success'].describe()

# Pairwise plots of the outcome and a few candidate predictors.
sns.pairplot(
    data=df,
    vars=['raised_usd', 'teamsize', 'success', 'accept_count'],
)

# Potential multicollinearity issue: inspect pairwise correlations.
# The frame still holds many object (string) columns.  Older pandas dropped
# them from corr() silently, but pandas >= 2.0 raises on them, so the numeric
# columns are selected explicitly; this form works on both.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr)
plt.show()

!pip install statsmodels

Requirement already satisfied: statsmodels in /root/venv/lib/python3.7/site-packages (0.13.0)
Requirement already satisfied: pandas>=0.25 in /root/venv/lib/python3.7/site-packages (from statsmodels) (1.0.5)
Requirement already satisfied: patsy>=0.5.2 in /root/venv/lib/python3.7/site-packages (from statsmodels) (0.5.2)
Requirement already satisfied: numpy>=1.17 in /root/venv/lib/python3.7/site-packages (from statsmodels) (1.19.1)
Requirement already satisfied: scipy>=1.3 in /root/venv/lib/python3.7/site-packages (from statsmodels) (1.5.4)
Requirement already satisfied: python-dateutil>=2.6.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2021.3)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)
WARNING: You are using pip version 21.2.4; however, version 21.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.

import statsmodels.api as sm
from statsmodels.formula.api import ols, logit

# dependent/target/outcome variable
y = df['success']
# independent/predictor/explanatory variables
Xlist = [
    'token_for_sale',
    'sold_tokens',
    'teamsize',
    'accept_count',
    'distributed_in_ico',
    'ERC20',
]
X = df[Xlist]

# A. Logit regression
# sm.Logit does not add an intercept by itself.  Without one the model is
# forced to predict P(success) = 0.5 whenever every regressor is zero and can
# fit worse than the intercept-only null model (negative pseudo R-squared).
# add_constant supplies the intercept column for this fit only; X itself is
# left unchanged for the cells that follow.
# Regressors are cast to float (best practice); missing='drop' drops rows
# with missing values from the regression.
logit_model = sm.Logit(y, sm.add_constant(X.astype(float)), missing='drop')
# fit the logit model to the data
result = logit_model.fit()
# summarize the fitted model
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.641035
         Iterations 25
                          Results: Logit
===================================================================
Model:               Logit             Pseudo R-squared:  -0.022   
Dependent Variable:  success           AIC:               6513.3734
Date:                2021-10-20 06:52  BIC:               6552.5612
No. Observations:    5071              Log-Likelihood:    -3250.7  
Df Model:            5                 LL-Null:           -3181.3  
Df Residuals:        5065              LLR p-value:       1.0000   
Converged:           1.0000            Scale:             1.0000   
No. Iterations:      25.0000                                       
-------------------------------------------------------------------
                    Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
-------------------------------------------------------------------
token_for_sale      0.0000   0.0000   0.3022 0.7625 -0.0000  0.0000
sold_tokens         0.0000   0.0000   2.4679 0.0136  0.0000  0.0000
teamsize            0.0139   0.0042   3.3034 0.0010  0.0056  0.0221
accept_count       -0.1443   0.0180  -8.0253 0.0000 -0.1796 -0.1091
distributed_in_ico -0.0003   0.0003  -1.0027 0.3160 -0.0008  0.0003
ERC20              -0.5639   0.0555 -10.1572 0.0000 -0.6727 -0.4550
===================================================================

# Logistic regression curve of success against the number of tokens sold.
# y_jitter adds a little vertical noise to the plotted 0/1 outcomes.
sns.regplot(
    x="sold_tokens",
    y="success",
    data=df,
    logistic=True,
    y_jitter=.05,
)
plt.ylabel("success probability")

# Logistic regression curve of success against team size.
# y_jitter adds a little vertical noise to the plotted 0/1 outcomes.
sns.regplot(
    x="teamsize",
    y="success",
    data=df,
    logistic=True,
    y_jitter=.05,
)
plt.ylabel("success probability")

# Marginal effects of the fitted logit model.
#
# Options for "at":
#   1. 'overall' - the average of the marginal effects at each observation
#   2. 'mean'    - the marginal effects at the mean of each regressor
#   3. 'median'  - the marginal effects at the median of each regressor
#   4. 'zero'    - the marginal effects at zero for each regressor
#   5. 'all'     - the marginal effects at each observation
#
# Options for "method":
#   1. 'dydx' - no transformation is made and marginal effects are returned
#   2. 'eyex' - estimate elasticities of variables in exog
#   3. 'dyex' - estimate semi-elasticity
#   4. 'eydx' - estimate semi-elasticity
#
# at="overall" is the option that yields the *average* marginal effect this
# variable is named for.  at="mean" instead evaluates the effect at one single
# point (the regressor means); in the recorded run that point produced
# all-zero effects with NaN test statistics.
average_marginal_effect = result.get_margeff(at="overall", method="dydx")
print(average_marginal_effect.summary())

        Logit Marginal Effects       
=====================================
Dep. Variable:                success
Method:                          dydx
At:                              mean
======================================================================================
                        dy/dx    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
token_for_sale              0          0        nan        nan           0           0
sold_tokens                 0          0        nan        nan           0           0
teamsize                    0          0        nan        nan           0           0
accept_count                0          0        nan        nan           0           0
distributed_in_ico          0          0        nan        nan           0           0
ERC20                       0          0        nan        nan           0           0
======================================================================================
/root/venv/lib/python3.7/site-packages/statsmodels/discrete/discrete_margins.py:435: RuntimeWarning: invalid value encountered in true_divide
  return self.margeff / self.margeff_se

# B. Linear Probability Model
# Ordinary least squares on the 0/1 outcome, with an intercept column added
# to the regressors.
X = sm.add_constant(X)
ols_model = sm.OLS(endog=y, exog=X.astype(float), missing='drop')
result = ols_model.fit()
print(result.summary2())

                 Results: Ordinary least squares
==================================================================
Model:              OLS              Adj. R-squared:     0.038    
Dependent Variable: success          AIC:                6475.1084
Date:               2021-10-20 06:53 BIC:                6514.2962
No. Observations:   5071             Log-Likelihood:     -3231.6  
Df Model:           5                F-statistic:        40.63    
Df Residuals:       5065             Prob (F-statistic): 4.14e-41 
R-squared:          0.039            Scale:              0.20968  
------------------------------------------------------------------
                     Coef.  Std.Err.    t    P>|t|   [0.025 0.975]
------------------------------------------------------------------
const                0.0748   0.0060 12.5125 0.0000  0.0631 0.0865
token_for_sale       0.0000   0.0000  1.5971 0.1103 -0.0000 0.0000
sold_tokens          0.0000   0.0000  1.7552 0.0793 -0.0000 0.0000
teamsize             0.0135   0.0010 13.5211 0.0000  0.0115 0.0154
accept_count         0.0082   0.0037  2.2346 0.0255  0.0010 0.0153
distributed_in_ico   0.0000   0.0001  0.0843 0.9328 -0.0001 0.0001
ERC20                0.1008   0.0083 12.1356 0.0000  0.0845 0.1170
------------------------------------------------------------------
Omnibus:           8621.085   Durbin-Watson:      0.079           
Prob(Omnibus):     0.000      Jarque-Bera (JB):   786.640         
Skew:              0.731      Prob(JB):           0.000           
Kurtosis:          1.741      Condition No.:      1186621678309991
==================================================================
* The condition number is large (1e+15). This might indicate
strong multicollinearity or other numerical problems.

# Use wrapper lazypredict !pip install lazypredict

Requirement already satisfied: lazypredict in /root/venv/lib/python3.7/site-packages (0.2.9)
Requirement already satisfied: scikit-learn==0.23.1 in /root/venv/lib/python3.7/site-packages (from lazypredict) (0.23.1)
Requirement already satisfied: pytest==5.4.3 in /root/venv/lib/python3.7/site-packages (from lazypredict) (5.4.3)
Requirement already satisfied: numpy==1.19.1 in /root/venv/lib/python3.7/site-packages (from lazypredict) (1.19.1)
Requirement already satisfied: PyYAML==5.3.1 in /root/venv/lib/python3.7/site-packages (from lazypredict) (5.3.1)
Requirement already satisfied: tqdm==4.56.0 in /root/venv/lib/python3.7/site-packages (from lazypredict) (4.56.0)
Requirement already satisfied: xgboost==1.1.1 in /root/venv/lib/python3.7/site-packages (from lazypredict) (1.1.1)
Collecting six==1.15.0
  Using cached six-1.15.0-py2.py3-none-any.whl (10 kB)
Requirement already satisfied: lightgbm==2.3.1 in /root/venv/lib/python3.7/site-packages (from lazypredict) (2.3.1)
Requirement already satisfied: pandas==1.0.5 in /root/venv/lib/python3.7/site-packages (from lazypredict) (1.0.5)
Requirement already satisfied: scipy==1.5.4 in /root/venv/lib/python3.7/site-packages (from lazypredict) (1.5.4)
Requirement already satisfied: joblib==1.0.0 in /root/venv/lib/python3.7/site-packages (from lazypredict) (1.0.0)
Requirement already satisfied: click==7.1.2 in /root/venv/lib/python3.7/site-packages (from lazypredict) (7.1.2)
Requirement already satisfied: python-dateutil>=2.6.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas==1.0.5->lazypredict) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas==1.0.5->lazypredict) (2021.3)
Requirement already satisfied: attrs>=17.4.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (21.2.0)
Collecting pluggy<1.0,>=0.12
  Using cached pluggy-0.13.1-py2.py3-none-any.whl (18 kB)
Requirement already satisfied: more-itertools>=4.0.0 in /root/venv/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (8.10.0)
Requirement already satisfied: py>=1.5.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (1.10.0)
Requirement already satisfied: packaging in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (21.0)
Requirement already satisfied: wcwidth in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (0.2.5)
Requirement already satisfied: importlib-metadata>=0.12 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (4.8.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn==0.23.1->lazypredict) (3.0.0)
Requirement already satisfied: typing-extensions>=3.6.4 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from importlib-metadata>=0.12->pytest==5.4.3->lazypredict) (3.10.0.2)
Requirement already satisfied: zipp>=0.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from importlib-metadata>=0.12->pytest==5.4.3->lazypredict) (3.6.0)
Requirement already satisfied: pyparsing>=2.0.2 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from packaging->pytest==5.4.3->lazypredict) (2.4.7)
Installing collected packages: six, pluggy
  Attempting uninstall: six
    Found existing installation: six 1.16.0
    Not uninstalling six at /shared-libs/python3.7/py-core/lib/python3.7/site-packages, outside environment /root/venv
    Can't uninstall 'six'. No files were found to uninstall.
  Attempting uninstall: pluggy
    Found existing installation: pluggy 1.0.0
    Not uninstalling pluggy at /shared-libs/python3.7/py-core/lib/python3.7/site-packages, outside environment /root/venv
    Can't uninstall 'pluggy'. No files were found to uninstall.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.4.1 requires numpy~=1.19.2, but you have numpy 1.19.1 which is incompatible.
tensorflow 2.4.1 requires typing-extensions~=3.7.4, but you have typing-extensions 3.10.0.2 which is incompatible.
Successfully installed pluggy-0.13.1 six-1.15.0
WARNING: You are using pip version 21.2.4; however, version 21.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.

from lazypredict.Supervised import LazyClassifier, LazyRegressor from sklearn.model_selection import train_test_split

/root/venv/lib/python3.7/site-packages/sklearn/utils/deprecation.py:143: FutureWarning: The sklearn.utils.testing module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.utils. Anything that cannot be imported from sklearn.utils is now part of the private API.
  warnings.warn(message, FutureWarning)

# Split the regressors and outcome, holding out 20% of the rows as a test set
# (fixed random_state so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Fit all models and keep their predictions.
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

100%|██████████| 29/29 [00:07<00:00,  3.69it/s]

# Display the first object returned by clf.fit (the per-model results).
models

# Display the second object returned by clf.fit (kept because predictions=True was requested).
predictions