Lab 8 Live - Duplicate

import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns

df = pd.read_csv('/work/tord_v3_edited.csv') df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6415 entries, 0 to 6414
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        6415 non-null   int64  
 1   name                      6415 non-null   object 
 2   token                     6282 non-null   object 
 3   country                   6415 non-null   object 
 4   is_ico                    6415 non-null   int64  
 5   is_ieo                    6415 non-null   int64  
 6   is_sto                    6415 non-null   int64  
 7   ico_start                 5636 non-null   object 
 8   ico_end                   5497 non-null   object 
 9   price_usd                 5928 non-null   object 
 10  raised_usd                2139 non-null   float64
 11  distributed_in_ico        4870 non-null   float64
 12  sold_tokens               192 non-null    float64
 13  token_for_sale            5122 non-null   float64
 14  whitelist                 3882 non-null   object 
 15  kyc                       6415 non-null   int64  
 16  bonus                     6415 non-null   int64  
 17  restricted_areas          2343 non-null   object 
 18  min_investment            2079 non-null   object 
 19  bounty                    6415 non-null   int64  
 20  mvp                       1296 non-null   object 
 21  pre_ico_start             2717 non-null   object 
 22  pre_ico_end               2705 non-null   object 
 23  pre_ico_price_usd         1733 non-null   object 
 24  platform                  6415 non-null   int64  
 25  accepting                 5545 non-null   object 
 26  link_white_paper          5828 non-null   object 
 27  linkedin_link             4355 non-null   object 
 28  github_link               5649 non-null   object 
 29  website                   5649 non-null   object 
 30  rating                    5709 non-null   float64
 31  teamsize                  4622 non-null   float64
 32  Coinmarketcap_identifier  1281 non-null   float64
 33  ERC20                     5679 non-null   float64
dtypes: float64(8), int64(8), object(18)
memory usage: 1.7+ MB

df.head(20)

df.describe()

# sanitize "raised_usd" df['raised_usd'] = df['raised_usd'].fillna(0) # create outcome variable success df['success'] = np.where(df['raised_usd'] >= 500000, 1, 0)

df['success'].describe()

sns.pairplot(df, vars = ['raised_usd','teamsize','success'])

# Potential multicollinearity issue corr = df.corr() sns.heatmap(corr) plt.show()

df['rating'] = df['rating'].fillna(0) df['ERC20'] = df['ERC20'].fillna(0) df['teamsize'] = df['teamsize'].fillna(1)

candidate_variables = ['teamsize','rating','bonus','sold_tokens','bounty','kyc'] current_variables = [] target = ['success'] df['ratxteamsize']=df['teamsize'] * df['rating'] df['ratxteamsize'].head(20) df['ERC20'].fillna(0) print(df['ERC20'].isna().sum()) df['whitelist']=df['whitelist'].map({'Yes': 1, 'No': 0}) print(df['whitelist'].head())

0
0   1.00
1   0.00
2   0.00
3   1.00
4   1.00
Name: whitelist, dtype: float64

df['whitelist'].fillna(0) df['whitelist'].isna().sum() df['whitelist'].head()

#df['kyc'].head() df['whxky']=df['whitelist'] * df['kyc'] df['whxky'].head()

df['logsoldtokens']=np.log(df['sold_tokens']) print(df['logsoldtokens'].head()) df['logsoldtokens'] = df['logsoldtokens'].fillna(0) print(df['logsoldtokens'].head()) #for i in df['linkedin_link']: # if i = 'TBD': df.loc[df['linkedin_link']=="TBD", 'linkedin_link'] = 0 df['linkedin_link'].fillna(0) df.loc[df['linkedin_link']!=0, "linkedin_link"] = 1 #df['linkedin_link']=df['linkedin_link'].map({'TBD': 0, 'No': 0}) print(df['linkedin_link'].sum())

0   18.42
1     NaN
2     NaN
3     NaN
4     NaN
Name: logsoldtokens, dtype: float64
0   18.42
1    0.00
2    0.00
3    0.00
4    0.00
Name: logsoldtokens, dtype: float64
5587

df['link_white_paper'].fillna(0, inplace=True) df.loc[df['link_white_paper']!=0, "link_white_paper"] = 1 print(df['link_white_paper'].sum()) print(df['link_white_paper'].head()) df['linkxwhitep']=df['linkedin_link'] * df['link_white_paper'] print(df['linkxwhitep'].head())

5828
0    0
1    1
2    1
3    1
4    1
Name: link_white_paper, dtype: object
0    0
1    1
2    1
3    1
4    1
Name: linkxwhitep, dtype: object

df['github_link'].fillna(0, inplace=True) df.loc[df['github_link']=="None", "github_link"] = 0 df.loc[df['github_link']!=0, "github_link"] = 1 print(df['github_link'].sum())

df['ico_start'][0]

df['ico_start'].fillna(0, inplace=True) print(df['ico_start'].head()) df['ico_end'].fillna(0, inplace=True) print(df['ico_end'].head())

0    8/10/2020
1     8/1/2020
2     3/1/2019
3    6/25/2020
4            0
Name: ico_start, dtype: object
0    12/31/2020
1    12/31/2020
2    12/31/2020
3     1/31/2021
4             0
Name: ico_end, dtype: object

for i in range(len(df)): if df['ico_end'][i] == 0 and df['ico_start'][i] != 0: df['ico_end'][i] = "10/24/2021"

df['ico_start']=pd.to_datetime(df['ico_start'], errors='coerce',dayfirst=False) df['ico_end']=pd.to_datetime(df['ico_end'], errors='coerce',dayfirst=False)

df['ico_length']=df['ico_end']-df['ico_start'] df['ico_length']=df['ico_length'].dt.days print(df['ico_length'].head())

0    143
1    152
2    671
3    220
4      0
Name: ico_length, dtype: int64

for i in range(len(df)): if df['ico_length'][i] < 0: df['ico_length'][i] = 1

!pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.13.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
     |████████████████████████████████| 9.8 MB 21.8 MB/s 
Requirement already satisfied: scipy>=1.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.7.1)
Requirement already satisfied: numpy>=1.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.19.5)
Collecting patsy>=0.5.2
  Downloading patsy-0.5.2-py2.py3-none-any.whl (233 kB)
     |████████████████████████████████| 233 kB 42.7 MB/s 
Requirement already satisfied: pandas>=0.25 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.2.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2021.3)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.2 statsmodels-0.13.0
WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.

import statsmodels.api as sm from statsmodels.formula.api import ols, logit # dependent/target/outcome variable y = df['success'] # independent/predictor/explanatory variable X = df[['teamsize', 'rating']] # A. Logit regression # turn independent variables into floating type (best practice) # 'missing='drop'' drops rows with missing values from the regression logit_model=sm.Logit(y,X.astype(float), missing='drop' ) # fit logit model into the data result=logit_model.fit() # summarize the logit model print(result.summary2()) print(result.tvalues)

Optimization terminated successfully.
         Current function value: 0.634254
         Iterations 5
                         Results: Logit
================================================================
Model:              Logit            Pseudo R-squared: -0.083   
Dependent Variable: success          AIC:              8141.4755
Date:               2021-10-24 09:49 BIC:              8155.0083
No. Observations:   6415             Log-Likelihood:   -4068.7  
Df Model:           1                LL-Null:          -3755.4  
Df Residuals:       6413             LLR p-value:      1.0000   
Converged:          1.0000           Scale:            1.0000   
No. Iterations:     5.0000                                      
-----------------------------------------------------------------
             Coef.   Std.Err.     z      P>|z|    [0.025   0.975]
-----------------------------------------------------------------
teamsize     0.0233    0.0038    6.1941  0.0000   0.0159   0.0307
rating      -0.3148    0.0150  -21.0364  0.0000  -0.3441  -0.2855
================================================================

teamsize     6.19413
rating     -21.03639
dtype: float64

sns.regplot(x = "rating", y = "success", data = df, logistic = True, y_jitter = .05) plt.ylabel("success probability")

''' options for "at" 1. 'overall' The average of the marginal effects at each observation 2. 'mean' The marginal effects at the mean of each regressor 3. 'median' The marginal effects at the median of each regressor 4. 'zero' The marginal effects at zero for each regressor 5. 'all' The marginal effects at each observation. options for "method" 1. 'dydx' No transformation is made and amrginal effects are returned 2. 'eyex' estimate elasticities of variables in exog 3. 'dyex' estimate semi-elasticity 4. 'eydx' estimate semi-elasticity ''' average_marginal_effect = result.get_margeff(at = "mean", method = "dydx") print(average_marginal_effect.summary())

        Logit Marginal Effects       
=====================================
Dep. Variable:                success
Method:                          dydx
At:                              mean
==============================================================================
                dy/dx    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
teamsize       0.0053      0.001      6.186      0.000       0.004       0.007
rating        -0.0714      0.003    -23.165      0.000      -0.077      -0.065
==============================================================================

# B. Linear Probability Model # logit regression X = sm.add_constant(X) ols_model=sm.OLS(y,X.astype(float), missing='drop') result=ols_model.fit() print(result.summary2()) #print(result.rsquared_adj)

                 Results: Ordinary least squares
==================================================================
Model:              OLS              Adj. R-squared:     0.070    
Dependent Variable: success          AIC:                7359.1216
Date:               2021-10-24 09:50 BIC:                7379.4208
No. Observations:   6415             Log-Likelihood:     -3676.6  
Df Model:           2                F-statistic:        241.5    
Df Residuals:       6412             Prob (F-statistic): 7.41e-102
R-squared:          0.070            Scale:              0.18430  
--------------------------------------------------------------------
              Coef.    Std.Err.      t      P>|t|    [0.025   0.975]
--------------------------------------------------------------------
const         0.0268     0.0132    2.0379   0.0416   0.0010   0.0526
teamsize      0.0077     0.0008    9.9486   0.0000   0.0062   0.0092
rating        0.0708     0.0050   14.1388   0.0000   0.0609   0.0806
------------------------------------------------------------------
Omnibus:             1252.131      Durbin-Watson:         1.721   
Prob(Omnibus):       0.000         Jarque-Bera (JB):      1080.438
Skew:                0.919         Prob(JB):              0.000   
Kurtosis:            2.184         Condition No.:         29      
==================================================================

# Use wrapper lazypredict !pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.9-py2.py3-none-any.whl (12 kB)
Collecting lightgbm==2.3.1
  Downloading lightgbm-2.3.1-py2.py3-none-manylinux1_x86_64.whl (1.2 MB)
     |████████████████████████████████| 1.2 MB 20.1 MB/s 
Collecting six==1.15.0
  Downloading six-1.15.0-py2.py3-none-any.whl (10 kB)
Collecting PyYAML==5.3.1
  Downloading PyYAML-5.3.1.tar.gz (269 kB)
     |████████████████████████████████| 269 kB 23.8 MB/s 
Collecting scipy==1.5.4
  Downloading scipy-1.5.4-cp37-cp37m-manylinux1_x86_64.whl (25.9 MB)
     |████████████████████████████████| 25.9 MB 42.7 MB/s 
Collecting pandas==1.0.5
  Downloading pandas-1.0.5-cp37-cp37m-manylinux1_x86_64.whl (10.1 MB)
     |████████████████████████████████| 10.1 MB 39.7 MB/s 
Collecting click==7.1.2
  Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)
     |████████████████████████████████| 82 kB 2.7 MB/s 
Collecting xgboost==1.1.1
  Downloading xgboost-1.1.1-py3-none-manylinux2010_x86_64.whl (127.6 MB)
     |████████████████████████████████| 127.6 MB 54 kB/s 
Collecting numpy==1.19.1
  Downloading numpy-1.19.1-cp37-cp37m-manylinux2010_x86_64.whl (14.5 MB)
     |████████████████████████████████| 14.5 MB 33.5 MB/s 
Collecting joblib==1.0.0
  Downloading joblib-1.0.0-py3-none-any.whl (302 kB)
     |████████████████████████████████| 302 kB 28.2 MB/s 
Collecting pytest==5.4.3
  Downloading pytest-5.4.3-py3-none-any.whl (248 kB)
     |████████████████████████████████| 248 kB 51.0 MB/s 
Collecting scikit-learn==0.23.1
  Downloading scikit_learn-0.23.1-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
     |████████████████████████████████| 6.8 MB 34.5 MB/s 
Collecting tqdm==4.56.0
  Downloading tqdm-4.56.0-py2.py3-none-any.whl (72 kB)
     |████████████████████████████████| 72 kB 1.6 MB/s 
Requirement already satisfied: pytz>=2017.2 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas==1.0.5->lazypredict) (2021.3)
Requirement already satisfied: python-dateutil>=2.6.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas==1.0.5->lazypredict) (2.8.2)
Collecting pluggy<1.0,>=0.12
  Downloading pluggy-0.13.1-py2.py3-none-any.whl (18 kB)
Collecting more-itertools>=4.0.0
  Downloading more_itertools-8.10.0-py3-none-any.whl (51 kB)
     |████████████████████████████████| 51 kB 632 kB/s 
Requirement already satisfied: importlib-metadata>=0.12 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (4.8.1)
Requirement already satisfied: wcwidth in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (0.2.5)
Requirement already satisfied: attrs>=17.4.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (21.2.0)
Requirement already satisfied: py>=1.5.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (1.10.0)
Requirement already satisfied: packaging in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pytest==5.4.3->lazypredict) (21.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn==0.23.1->lazypredict) (3.0.0)
Requirement already satisfied: typing-extensions>=3.6.4 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from importlib-metadata>=0.12->pytest==5.4.3->lazypredict) (3.10.0.2)
Requirement already satisfied: zipp>=0.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from importlib-metadata>=0.12->pytest==5.4.3->lazypredict) (3.6.0)
Requirement already satisfied: pyparsing>=2.0.2 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from packaging->pytest==5.4.3->lazypredict) (2.4.7)
Building wheels for collected packages: PyYAML
  Building wheel for PyYAML (setup.py) ... done
  Created wheel for PyYAML: filename=PyYAML-5.3.1-cp37-cp37m-linux_x86_64.whl size=44635 sha256=7756ce74e267a8f02970ac2be10a236f3aa1e883066ef00db35326a3d0ad4819
  Stored in directory: /root/.cache/pip/wheels/5e/03/1e/e1e954795d6f35dfc7b637fe2277bff021303bd9570ecea653
Successfully built PyYAML
Installing collected packages: numpy, six, scipy, joblib, scikit-learn, pluggy, more-itertools, xgboost, tqdm, PyYAML, pytest, pandas, lightgbm, click, lazypredict
  Attempting uninstall: numpy
    Found existing installation: numpy 1.19.5
    Not uninstalling numpy at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
    Can't uninstall 'numpy'. No files were found to uninstall.
  Attempting uninstall: six
    Found existing installation: six 1.16.0
    Not uninstalling six at /shared-libs/python3.7/py-core/lib/python3.7/site-packages, outside environment /root/venv
    Can't uninstall 'six'. No files were found to uninstall.
  Attempting uninstall: scipy
    Found existing installation: scipy 1.7.1
    Not uninstalling scipy at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
    Can't uninstall 'scipy'. No files were found to uninstall.
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Not uninstalling joblib at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
    Can't uninstall 'joblib'. No files were found to uninstall.
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0
    Not uninstalling scikit-learn at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
    Can't uninstall 'scikit-learn'. No files were found to uninstall.
  Attempting uninstall: pluggy
    Found existing installation: pluggy 1.0.0
    Not uninstalling pluggy at /shared-libs/python3.7/py-core/lib/python3.7/site-packages, outside environment /root/venv
    Can't uninstall 'pluggy'. No files were found to uninstall.
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.62.3
    Not uninstalling tqdm at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
    Can't uninstall 'tqdm'. No files were found to uninstall.
  Attempting uninstall: PyYAML
    Found existing installation: PyYAML 5.4.1
    Not uninstalling pyyaml at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
    Can't uninstall 'PyYAML'. No files were found to uninstall.
  Attempting uninstall: pytest
    Found existing installation: pytest 6.2.5
    Not uninstalling pytest at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
    Can't uninstall 'pytest'. No files were found to uninstall.
  Attempting uninstall: pandas
    Found existing installation: pandas 1.2.5
    Not uninstalling pandas at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
    Can't uninstall 'pandas'. No files were found to uninstall.
  Attempting uninstall: click
    Found existing installation: click 8.0.3
    Not uninstalling click at /shared-libs/python3.7/py/lib/python3.7/site-packages, outside environment /root/venv
    Can't uninstall 'click'. No files were found to uninstall.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.4.1 requires numpy~=1.19.2, but you have numpy 1.19.1 which is incompatible.
tensorflow 2.4.1 requires typing-extensions~=3.7.4, but you have typing-extensions 3.10.0.2 which is incompatible.
Successfully installed PyYAML-5.3.1 click-7.1.2 joblib-1.0.0 lazypredict-0.2.9 lightgbm-2.3.1 more-itertools-8.10.0 numpy-1.19.1 pandas-1.0.5 pluggy-0.13.1 pytest-5.4.3 scikit-learn-0.23.1 scipy-1.5.4 six-1.15.0 tqdm-4.56.0 xgboost-1.1.1
WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.

from lazypredict.Supervised import LazyClassifier, LazyRegressor from sklearn.model_selection import train_test_split

/root/venv/lib/python3.7/site-packages/sklearn/utils/deprecation.py:143: FutureWarning: The sklearn.utils.testing module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.utils. Anything that cannot be imported from sklearn.utils is now part of the private API.
  warnings.warn(message, FutureWarning)

100%|██████████| 29/29 [00:08<00:00,  3.32it/s]

models

predictions

models['Accuracy']['LogisticRegression']

def prsq(variables, target, basetable): X = basetable[variables] y = basetable[target] logreg = sm.Logit(y,X.astype(float), missing = 'drop') result=logreg.fit() #return(abs(result.tvalues)) return(result.prsquared)

def next_best(current_variables, candidate_variables, target, basetable): best_rsq = -1 best_variable = None for v in candidate_variables: logrsq = prsq(current_variables+[v], target, basetable) if logrsq >= best_rsq: best_rsq = logrsq best_variable = v return best_variable candidate_variables = ['bonus','sold_tokens','bounty','linkedin_link'] current_variables = ['teamsize','rating'] target = ['success'] next_variable = next_best(current_variables, candidate_variables,target, df) print(next_variable)

Optimization terminated successfully.
         Current function value: 0.615002
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.642771
         Iterations 24
Optimization terminated successfully.
         Current function value: 0.631929
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.624116
         Iterations 5
sold_tokens

candidate_variables = ['teamsize','rating','bonus','sold_tokens','bounty','kyc'] #candidate_variables = ['teamsize','rating','bonus','bounty','kyc'] current_variables = [] target = ['success'] max_number_variables = 5 number_iterations = min(max_number_variables, len(candidate_variables)) for i in range(0,number_iterations): next_var = next_best(current_variables, candidate_variables, target, df) current_variables = current_variables + [next_var] candidate_variables.remove(next_var) print(current_variables)

Optimization terminated successfully.
         Current function value: 0.670867
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.637245
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.656717
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.688166
         Iterations 1
Optimization terminated successfully.
         Current function value: 0.666551
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.655511
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.650429
         Iterations 24
Optimization terminated successfully.
         Current function value: 0.643492
         Iterations 24
Optimization terminated successfully.
         Current function value: 0.633588
         Iterations 24
Optimization terminated successfully.
         Current function value: 0.651911
         Iterations 24
Optimization terminated successfully.
         Current function value: 0.652853
         Iterations 24
Optimization terminated successfully.
         Current function value: 0.627634
         Iterations 24
Optimization terminated successfully.
         Current function value: 0.615034
         Iterations 24
Optimization terminated successfully.
         Current function value: 0.633588
         Iterations 24
Optimization terminated successfully.
         Current function value: 0.631486
         Iterations 24
Optimization terminated successfully.
         Current function value: 0.614418
         Iterations 24
Optimization terminated successfully.
         Current function value: 0.611037
         Iterations 24
Optimization terminated successfully.
         Current function value: 0.614516
         Iterations 24
Optimization terminated successfully.
         Current function value: 0.610882
         Iterations 24
Optimization terminated successfully.
         Current function value: 0.611017
         Iterations 24
['sold_tokens', 'bonus', 'rating', 'bounty', 'teamsize']

# dependent/target/outcome variable y = df['success'] # independent/predictor/explanatory variable X = df[current_variables] # A. Logit regression # turn independent variables into floating type (best practice) # 'missing='drop'' drops rows with missing values from the regression logit_model=sm.Logit(y,X.astype(float), missing='drop' ) # fit logit model into the data result=logit_model.fit() # summarize the logit model print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.610882
         Iterations 24
                         Results: Logit
=================================================================
Model:              Logit            Pseudo R-squared: 0.096     
Dependent Variable: success          AIC:              240.5785  
Date:               2021-10-24 09:52 BIC:              250.3510  
No. Observations:   192              Log-Likelihood:   -117.29   
Df Model:           2                LL-Null:          -129.69   
Df Residuals:       189              LLR p-value:      4.1188e-06
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     24.0000                                      
------------------------------------------------------------------
               Coef.   Std.Err.     z     P>|z|    [0.025   0.975]
------------------------------------------------------------------
sold_tokens    0.0000    0.0000   1.6031  0.1089  -0.0000   0.0000
bonus         -2.1902    0.8241  -2.6578  0.0079  -3.8054  -0.5751
rating         0.2028    0.0842   2.4088  0.0160   0.0378   0.3678
bounty        -0.5241    0.4482  -1.1693  0.2423  -1.4026   0.3544
teamsize      -0.0058    0.0236  -0.2446  0.8068  -0.0521   0.0405
=================================================================

# B. Linear Probability Model # logit regression X = sm.add_constant(X) ols_model=sm.OLS(y,X.astype(float), missing='drop') result=ols_model.fit() print(result.summary2()) #print(result.rsquared_adj)

                 Results: Ordinary least squares
=================================================================
Model:              OLS              Adj. R-squared:     0.012   
Dependent Variable: success          AIC:                273.5059
Date:               2021-10-24 09:52 BIC:                283.2784
No. Observations:   192              Log-Likelihood:     -133.75 
Df Model:           2                F-statistic:        2.154   
Df Residuals:       189              Prob (F-statistic): 0.119   
R-squared:          0.022            Scale:              0.23958 
------------------------------------------------------------------
                Coef.   Std.Err.     t     P>|t|    [0.025  0.975]
------------------------------------------------------------------
const           0.0737    0.0059  12.5733  0.0000   0.0622  0.0853
sold_tokens     0.0000    0.0000   1.1848  0.2376  -0.0000  0.0000
bonus           0.0048    0.0004  12.6795  0.0000   0.0040  0.0055
rating          0.1991    0.0157  12.6540  0.0000   0.1681  0.2302
bounty          0.0095    0.0007  13.1892  0.0000   0.0081  0.0109
teamsize       -0.0073    0.0051  -1.4092  0.1604  -0.0174  0.0029
-----------------------------------------------------------------
Omnibus:          2292.577   Durbin-Watson:      1.772           
Prob(Omnibus):    0.000      Jarque-Bera (JB):   23.549          
Skew:             -0.338     Prob(JB):           0.000           
Kurtosis:         1.423      Condition No.:      4417515078361991
=================================================================
* The condition number is large (4e+15). This might indicate
strong multicollinearity or other numerical problems.

100%|██████████| 29/29 [00:09<00:00,  3.16it/s]

print(models['Accuracy']['LogisticRegression']) print(models['ROC AUC']['LogisticRegression'])

0.7365549493374902
0.5572824266484709

def acctest(variables, target, basetable): X = basetable[variables] y = basetable[target] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42) clf = LazyClassifier(predictions=True) models, predictions = clf.fit(X_train, X_test, y_train, y_test) #logreg = sm.Logit(y,X.astype(float), missing = 'drop') #result=logreg.fit() #return(abs(result.tvalues)) return(models['ROC AUC']['LogisticRegression'])

def next_best2(current_variables, candidate_variables, target, basetable): best_rsq = -1 best_variable = None for v in candidate_variables: logrsq = acctest(current_variables+[v], target, basetable) if logrsq >= best_rsq: best_rsq = logrsq best_variable = v return best_variable candidate_variables = ['bonus','sold_tokens','bounty'] current_variables = ['teamsize','rating'] target = ['success'] next_variable = next_best2(current_variables, candidate_variables,target, df) print(next_variable)

100%|██████████| 29/29 [00:08<00:00,  3.38it/s]
100%|██████████| 29/29 [00:08<00:00,  3.39it/s]
100%|██████████| 29/29 [00:08<00:00,  3.29it/s]bonus

print(next_variable)

bonus

candidate_variables = ['teamsize','rating','bonus','sold_tokens','bounty','kyc'] current_variables = [] target = ['success'] max_number_variables = 5 number_iterations = min(max_number_variables, len(candidate_variables)) for i in range(0,number_iterations): next_var = next_best2(current_variables, candidate_variables, target, df) current_variables = current_variables + [next_var] candidate_variables.remove(next_var) print(current_variables)

100%|██████████| 29/29 [00:08<00:00,  3.62it/s]
100%|██████████| 29/29 [00:07<00:00,  4.04it/s]
100%|██████████| 29/29 [00:05<00:00,  5.74it/s]
100%|██████████| 29/29 [00:06<00:00,  4.81it/s]
100%|██████████| 29/29 [00:05<00:00,  5.35it/s]
100%|██████████| 29/29 [00:05<00:00,  5.03it/s]
100%|██████████| 29/29 [00:09<00:00,  3.11it/s]
100%|██████████| 29/29 [00:08<00:00,  3.59it/s]
100%|██████████| 29/29 [00:08<00:00,  3.57it/s]
100%|██████████| 29/29 [00:08<00:00,  3.61it/s]
100%|██████████| 29/29 [00:07<00:00,  3.71it/s]
100%|██████████| 29/29 [00:09<00:00,  3.04it/s]
100%|██████████| 29/29 [00:08<00:00,  3.42it/s]
100%|██████████| 29/29 [00:08<00:00,  3.52it/s]
100%|██████████| 29/29 [00:07<00:00,  3.64it/s]
100%|██████████| 29/29 [00:09<00:00,  3.03it/s]
100%|██████████| 29/29 [00:09<00:00,  3.06it/s]
100%|██████████| 29/29 [00:09<00:00,  3.04it/s]
100%|██████████| 29/29 [00:09<00:00,  2.93it/s]
100%|██████████| 29/29 [00:09<00:00,  2.91it/s]['rating', 'bonus', 'teamsize', 'bounty', 'sold_tokens']

Optimization terminated successfully.
         Current function value: 0.600325
         Iterations 11
                         Results: Logit
================================================================
Model:              Logit            Pseudo R-squared: 0.113    
Dependent Variable: success          AIC:              136.0681 
Date:               2021-10-23 16:43 BIC:              149.3379 
No. Observations:   105              Log-Likelihood:   -63.034  
Df Model:           4                LL-Null:          -71.052  
Df Residuals:       100              LLR p-value:      0.0029717
Converged:          1.0000           Scale:            1.0000   
No. Iterations:     11.0000                                     
-----------------------------------------------------------------
              Coef.   Std.Err.     z     P>|z|    [0.025   0.975]
-----------------------------------------------------------------
rating        0.2013    0.1302   1.5462  0.1221  -0.0539   0.4565
bonus        -2.7624    1.4020  -1.9703  0.0488  -5.5103  -0.0145
teamsize     -0.0045    0.0296  -0.1522  0.8790  -0.0624   0.0534
bounty       -0.6432    0.5085  -1.2647  0.2060  -1.6399   0.3536
sold_tokens   0.0000    0.0000   1.3223  0.1861  -0.0000   0.0000
================================================================

100%|██████████| 29/29 [00:09<00:00,  2.99it/s]

print(models['Accuracy']['LogisticRegression']) print(models['ROC AUC']['LogisticRegression'])

0.7420109119251753
0.567486508281124

df.columns[2:]

df['name'].isnull().sum()

#candidate_variables = ['teamsize','rating','bonus','logsoldtokens','bounty','kyc','ratxteamsize','is_ico','ERC20','whitelist','whxky','linkedin_link','link_white_paper','linkxwhitep','github_link','ico_length','raised_usd'] candidate_variables = ['teamsize','rating','bonus','logsoldtokens','bounty','kyc','ratxteamsize','is_ico','ERC20','whitelist','whxky','link_white_paper','linkxwhitep','github_link','ico_length'] current_variables = [] target = ['success'] max_number_variables = 8 number_iterations = min(max_number_variables, len(candidate_variables)) for i in range(0,number_iterations): next_var = next_best(current_variables, candidate_variables, target, df) current_variables = current_variables + [next_var] candidate_variables.remove(next_var) print(next_var) print(current_variables)

Optimization terminated successfully.
         Current function value: 0.670867
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.637245
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.656717
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.692039
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.666551
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.655511
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.684098
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.593962
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.625762
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.654769
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.667061
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601703
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.637722
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.671335
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.691685
         Iterations 5
is_ico
Optimization terminated successfully.
         Current function value: 0.582406
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.583269
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.579338
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.585713
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593915
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593947
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.573491
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593249
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.570740
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.569132
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.592900
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.589103
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.587018
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593872
         Iterations 5
ratxteamsize
Optimization terminated successfully.
         Current function value: 0.569498
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.573266
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.554547
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.564267
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.570808
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.569001
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.573248
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.549291
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.549036
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.569408
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.571803
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.572682
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.573341
         Iterations 5
bonus
Optimization terminated successfully.
         Current function value: 0.550192
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.554210
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.545085
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.552735
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.550887
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.554314
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.528229
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.527774
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.550560
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.554016
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.553627
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.554390
         Iterations 7
whxky
Optimization terminated successfully.
         Current function value: 0.524424
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.527488
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.524731
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.527684
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.527768
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.527101
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.527440
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.527310
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.526081
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.527597
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.527719
         Iterations 7
teamsize
Optimization terminated successfully.
         Current function value: 0.522754
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.521284
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.524256
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.524401
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.523621
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.524197
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.524117
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.522756
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.524417
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.524358
         Iterations 7
logsoldtokens
Optimization terminated successfully.
         Current function value: 0.519546
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.521093
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.521254
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.520414
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.521059
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.520967
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.519669
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.521278
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.521215
         Iterations 7
rating
Optimization terminated successfully.
         Current function value: 0.519501
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.519184
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.519366
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.519324
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.519513
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.514750
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.519232
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.519456
         Iterations 7
linkxwhitep
['is_ico', 'ratxteamsize', 'bonus', 'whxky', 'teamsize', 'logsoldtokens', 'rating', 'linkxwhitep']

#candidate_variables = ['teamsize','rating','bonus','sold_tokens','bounty','kyc','ratxteamsize'] candidate_variables = ['teamsize','rating','bonus','logsoldtokens','bounty','kyc','ratxteamsize','is_ico','ERC20','whitelist','whxky','linkedin_link'] current_variables = [] target = ['success'] max_number_variables = 8 number_iterations = min(max_number_variables, len(candidate_variables)) for i in range(0,number_iterations): next_var = next_best2(current_variables, candidate_variables, target, df) current_variables = current_variables + [next_var] candidate_variables.remove(next_var) print(current_variables)

100%|██████████| 29/29 [00:07<00:00,  3.74it/s]
100%|██████████| 29/29 [00:07<00:00,  3.89it/s]
100%|██████████| 29/29 [00:05<00:00,  5.58it/s]
100%|██████████| 29/29 [00:06<00:00,  4.68it/s]
100%|██████████| 29/29 [00:05<00:00,  5.17it/s]
100%|██████████| 29/29 [00:05<00:00,  4.94it/s]
100%|██████████| 29/29 [00:08<00:00,  3.34it/s]
100%|██████████| 29/29 [00:05<00:00,  5.64it/s]
100%|██████████| 29/29 [00:05<00:00,  5.25it/s]
100%|██████████| 29/29 [00:05<00:00,  5.15it/s]
100%|██████████| 29/29 [00:05<00:00,  5.11it/s]
100%|██████████| 29/29 [00:06<00:00,  4.78it/s]
100%|██████████| 29/29 [00:08<00:00,  3.25it/s]
100%|██████████| 29/29 [00:09<00:00,  3.12it/s]
100%|██████████| 29/29 [00:08<00:00,  3.23it/s]
100%|██████████| 29/29 [00:08<00:00,  3.28it/s]
100%|██████████| 29/29 [00:09<00:00,  3.18it/s]
100%|██████████| 29/29 [00:08<00:00,  3.25it/s]
100%|██████████| 29/29 [00:08<00:00,  3.28it/s]
100%|██████████| 29/29 [00:08<00:00,  3.26it/s]
100%|██████████| 29/29 [00:08<00:00,  3.26it/s]
100%|██████████| 29/29 [00:09<00:00,  3.22it/s]
100%|██████████| 29/29 [00:08<00:00,  3.25it/s]
100%|██████████| 29/29 [00:08<00:00,  3.35it/s]
100%|██████████| 29/29 [00:09<00:00,  3.04it/s]
100%|██████████| 29/29 [00:09<00:00,  3.18it/s]
100%|██████████| 29/29 [00:09<00:00,  3.19it/s]
100%|██████████| 29/29 [00:08<00:00,  3.28it/s]
100%|██████████| 29/29 [00:09<00:00,  3.21it/s]
100%|██████████| 29/29 [00:09<00:00,  3.14it/s]
100%|██████████| 29/29 [00:09<00:00,  3.19it/s]
100%|██████████| 29/29 [00:09<00:00,  3.20it/s]
100%|██████████| 29/29 [00:09<00:00,  3.22it/s]
100%|██████████| 29/29 [00:10<00:00,  2.86it/s]
100%|██████████| 29/29 [00:09<00:00,  2.94it/s]
100%|██████████| 29/29 [00:09<00:00,  3.01it/s]
100%|██████████| 29/29 [00:09<00:00,  2.94it/s]
100%|██████████| 29/29 [00:09<00:00,  2.94it/s]
100%|██████████| 29/29 [00:09<00:00,  3.00it/s]
100%|██████████| 29/29 [00:09<00:00,  3.07it/s]
100%|██████████| 29/29 [00:09<00:00,  2.95it/s]
100%|██████████| 29/29 [00:10<00:00,  2.86it/s]
100%|██████████| 29/29 [00:10<00:00,  2.72it/s]
100%|██████████| 29/29 [00:09<00:00,  2.94it/s]
100%|██████████| 29/29 [00:10<00:00,  2.88it/s]
100%|██████████| 29/29 [00:10<00:00,  2.79it/s]
100%|██████████| 29/29 [00:15<00:00,  1.83it/s]
100%|██████████| 29/29 [00:12<00:00,  2.28it/s]
100%|██████████| 29/29 [00:10<00:00,  2.75it/s]
100%|██████████| 29/29 [00:10<00:00,  2.78it/s]
100%|██████████| 29/29 [00:11<00:00,  2.62it/s]
100%|██████████| 29/29 [00:10<00:00,  2.85it/s]
100%|██████████| 29/29 [00:10<00:00,  2.88it/s]
100%|██████████| 29/29 [00:10<00:00,  2.79it/s]
100%|██████████| 29/29 [00:10<00:00,  2.83it/s]
100%|██████████| 29/29 [00:10<00:00,  2.85it/s]
100%|██████████| 29/29 [00:10<00:00,  2.86it/s]
100%|██████████| 29/29 [00:15<00:00,  1.82it/s]
100%|██████████| 29/29 [00:17<00:00,  1.62it/s]
100%|██████████| 29/29 [00:15<00:00,  1.89it/s]
100%|██████████| 29/29 [00:10<00:00,  2.66it/s]
100%|██████████| 29/29 [00:10<00:00,  2.74it/s]
100%|██████████| 29/29 [00:10<00:00,  2.81it/s]
100%|██████████| 29/29 [00:11<00:00,  2.60it/s]
100%|██████████| 29/29 [00:10<00:00,  2.68it/s]
100%|██████████| 29/29 [00:10<00:00,  2.67it/s]
100%|██████████| 29/29 [00:10<00:00,  2.66it/s]
100%|██████████| 29/29 [00:10<00:00,  2.66it/s]['ratxteamsize', 'bonus', 'rating', 'linkedin_link', 'logsoldtokens', 'whxky', 'whitelist', 'teamsize']

# dependent/target/outcome variable y = df['success'] # independent/predictor/explanatory variable #test_variables = ['teamsize','rating','bonus','logsoldtokens','bounty','kyc','ratxteamsize','is_ico','ERC20','whitelist','whxky','linkedin_link','link_white_paper'] test_variables = ['teamsize','rating','bonus','logsoldtokens','bounty','kyc','ratxteamsize','is_ico','ERC20','whitelist','whxky','linkedin_link','link_white_paper','linkxwhitep','github_link','ico_length'] #X = df[current_variables] X = df[test_variables] # load data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42) # fit all models clf = LazyClassifier(predictions=True) models, predictions = clf.fit(X_train, X_test, y_train, y_test) from sklearn.ensemble import RandomForestClassifier #Create a Gaussian Classifier clf=RandomForestClassifier(n_estimators=100) X_train.replace([np.inf, -np.inf], np.nan, inplace=True) X_train.fillna(0, inplace=True) X_train = np.nan_to_num(X_train.astype(np.float32)) clf.fit(X_train,y_train)

100%|██████████| 29/29 [00:14<00:00,  2.04it/s]

#df.current_variables

feature_imp = pd.Series(clf.feature_importances_,index=df[test_variables].columns).sort_values(ascending=False) feature_imp

import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline # Creating a bar plot sns.barplot(x=feature_imp, y=feature_imp.index) # Add labels to your graph plt.xlabel('Feature Importance Score') plt.ylabel('Features') plt.title("Visualizing Important Features") plt.legend() plt.show()

No handles with labels found to put in legend.

#chosen_variables = ['raised_usd','is_ico','rating','teamsize','ratxteamsize','linkedin_link'] chosen_variables = ['is_ico', 'ratxteamsize', 'rating', 'bonus', 'teamsize', 'kyc','whitelist','logsoldtokens'] #chosen_variables = ['is_ico', 'ratxteamsize', 'bonus', 'teamsize', 'logsoldtokens', 'rating', 'linkxwhitep','linkedin_link','link_white_paper'] # dependent/target/outcome variable y = df['success'] # independent/predictor/explanatory variable X = df[chosen_variables] #X = df[current_variables] # A. Logit regression # turn independent variables into floating type (best practice) # 'missing='drop'' drops rows with missing values from the regression logit_model=sm.Logit(y,X.astype(float), missing='drop' ) # fit logit model into the data result=logit_model.fit() # summarize the logit model print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.519402
         Iterations 7
                         Results: Logit
=================================================================
Model:              Logit            Pseudo R-squared: 0.093     
Dependent Variable: success          AIC:              4048.6381 
Date:               2021-10-24 11:23 BIC:              4098.7510 
No. Observations:   3882             Log-Likelihood:   -2016.3   
Df Model:           7                LL-Null:          -2224.3   
Df Residuals:       3874             LLR p-value:      9.2401e-86
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     7.0000                                       
-----------------------------------------------------------------
                   Coef.  Std.Err.    z    P>|z|   [0.025  0.975]
-----------------------------------------------------------------
is_ico            -1.0620   0.1276 -8.3229 0.0000 -1.3121 -0.8119
ratxteamsize       0.0507   0.0057  8.8940 0.0000  0.0395  0.0619
rating            -0.1944   0.0467 -4.1657 0.0000 -0.2858 -0.1029
bonus             -2.6637   0.2958 -9.0046 0.0000 -3.2434 -2.0839
teamsize          -0.1228   0.0205 -5.9808 0.0000 -0.1630 -0.0826
kyc                0.2291   0.0917  2.4978 0.0125  0.0493  0.4088
whitelist          0.0858   0.0838  1.0243 0.3057 -0.0784  0.2500
logsoldtokens      0.0773   0.0152  5.0848 0.0000  0.0475  0.1071
=================================================================

# B. Linear Probability Model # logit regression X = sm.add_constant(X) ols_model=sm.OLS(y,X.astype(float), missing='drop') result=ols_model.fit() print(result.summary2()) #print(result.rsquared_adj)

                 Results: Ordinary least squares
==================================================================
Model:              OLS              Adj. R-squared:     0.114    
Dependent Variable: success          AIC:                4159.3884
Date:               2021-10-24 11:23 BIC:                4215.7654
No. Observations:   3882             Log-Likelihood:     -2070.7  
Df Model:           8                F-statistic:        63.19    
Df Residuals:       3873             Prob (F-statistic): 1.28e-97 
R-squared:          0.115            Scale:              0.17055  
------------------------------------------------------------------
                   Coef.  Std.Err.    t     P>|t|   [0.025  0.975]
------------------------------------------------------------------
const              0.0799   0.0486   1.6432 0.1004 -0.0154  0.1752
is_ico            -0.0582   0.0311  -1.8720 0.0613 -0.1192  0.0028
ratxteamsize       0.0039   0.0011   3.7106 0.0002  0.0019  0.0060
rating             0.0454   0.0128   3.5599 0.0004  0.0204  0.0704
bonus             -0.2910   0.0235 -12.3879 0.0000 -0.3371 -0.2449
teamsize          -0.0046   0.0036  -1.2771 0.2016 -0.0116  0.0025
kyc                0.0413   0.0155   2.6717 0.0076  0.0110  0.0717
whitelist          0.0270   0.0146   1.8534 0.0639 -0.0016  0.0555
logsoldtokens      0.0153   0.0029   5.3465 0.0000  0.0097  0.0210
------------------------------------------------------------------
Omnibus:             537.531       Durbin-Watson:          1.809  
Prob(Omnibus):       0.000         Jarque-Bera (JB):       596.738
Skew:                0.909         Prob(JB):               0.000  
Kurtosis:            2.378         Condition No.:          371    
==================================================================

# load data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42) # fit all models clf = LazyClassifier(predictions=True) models, predictions = clf.fit(X_train, X_test, y_train, y_test) print('The accuracy of our model is',models['Accuracy']['LogisticRegression']) print('The ROC AUC of our model is',models['ROC AUC']['LogisticRegression'])

100%|██████████| 29/29 [00:18<00:00,  1.54it/s]The accuracy of our model is 0.7388932190179267
The ROC AUC of our model is 0.5709137150300849

models