Lab 8 Live - Regression

# Objective: To predict successful ICOs (who are the winners in the cryptocurrency ICOs?)

!pip install statsmodels import statsmodels.api as sm import numpy as np # linear algebra import pandas as pd # data processing import seaborn as sns import matplotlib.pyplot as plt import numpy.ma as ma # deal with NaN import scipy.stats from scipy import stats import datetime as dt import plotly.express as px

Requirement already satisfied: statsmodels in /root/venv/lib/python3.7/site-packages (0.13.0)
Requirement already satisfied: pandas>=0.25 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.2.5)
Requirement already satisfied: numpy>=1.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.19.5)
Requirement already satisfied: patsy>=0.5.2 in /root/venv/lib/python3.7/site-packages (from statsmodels) (0.5.2)
Requirement already satisfied: scipy>=1.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.7.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2021.3)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)
WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.

Preparatory work and DataFrame Cleaning

# Import dataset
# This line solves the ParserError prompt
df = pd.read_csv("tord_v3.csv", delimiter=';', skiprows=0, low_memory=False)
# Filter out the relevant columns for analysis later.
# .copy() makes coin_ico an independent frame rather than a slice of df, so
# the column assignments performed on it afterwards do not raise pandas'
# SettingWithCopyWarning.
coin_ico = df[['name', 'country', 'is_ico', 'is_ieo', 'is_sto', 'ico_start',
               'ico_end', 'raised_usd', 'whitelist', 'kyc', 'bonus', 'rating',
               'teamsize', 'platform', 'distributed_in_ico']].copy()
df.head()

# Summary statistics of the raw dataset
df.describe()

# Work on an independent copy so the assignments below write to this frame
# itself instead of to a slice of df (source of the SettingWithCopyWarning).
coin_ico = coin_ico.copy()
# Converting the dates to datetime to minimise inconsistency
coin_ico['ico_start'] = pd.to_datetime(coin_ico['ico_start'])
coin_ico['ico_end'] = pd.to_datetime(coin_ico['ico_end'])
coin_ico['ico_duration'] = coin_ico['ico_end'] - coin_ico['ico_start']
# NEW: only keep the rows flagged with is_ico == 1; copy again because boolean
# filtering returns a new slice that is modified further below.
coin_ico = coin_ico[coin_ico['is_ico'] == 1].copy()
# Changing the duration from a timedelta to a number of days
coin_ico['ico_duration'] = coin_ico['ico_duration'].dt.days
# Getting rid of the NaT values in the ICO date variables: valid dates become
# 'YYYY-MM-DD' strings, missing dates become empty strings.
def _format_date(value):
    """Return `value` formatted as YYYY-MM-DD, or '' when it is missing."""
    return value.strftime('%Y-%m-%d') if not pd.isnull(value) else ''
coin_ico['ico_start'] = coin_ico['ico_start'].apply(_format_date)
coin_ico['ico_end'] = coin_ico['ico_end'].apply(_format_date)
# NEW: replace NaN values with fixed numbers (described as the column medians).
# Filling through the frame and re-assigning avoids `fillna(inplace=True)` on a
# selected column, which is chained assignment and may silently do nothing.
coin_ico = coin_ico.fillna({'ico_duration': 31, 'raised_usd': 0, 'distributed_in_ico': 0.5})
# Winsorize variables
def clip_series(s, lower, upper):
    """Clip the values of `s` to its `lower` and `upper` quantiles.

    `lower` and `upper` are quantile levels in [0, 1]; the clipped series is
    returned and `s` itself is left unchanged.
    """
    return s.clip(lower=s.quantile(lower), upper=s.quantile(upper))
# Manage list of features to be winsorized
winsorize_list = ['ico_duration', 'raised_usd', 'distributed_in_ico']
for var in winsorize_list:
    coin_ico[var + '_winsorised'] = clip_series(coin_ico[var], 0.005, 0.995)

/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.

# Print the median of each imputed column; these values inform the fillna
# constants used in the previous step.
for column_name in ('ico_duration', 'raised_usd', 'distributed_in_ico'):
    print(coin_ico[column_name].median())

31.0
0.0
0.5

# Creating a dummy variable indicating whether the Initial Coin Offering was successful.
# 500k USD is the prescribed yardstick of a successful ICO; rows at or above
# the threshold get 1, all others (including missing amounts) get 0.
SUCCESS_THRESHOLD_USD = 500000
# Vectorised comparison instead of a row-by-row loop; the result keeps the
# frame's index and is stored as int64, as before.
coin_ico['is_successful'] = (coin_ico['raised_usd_winsorised'] >= SUCCESS_THRESHOLD_USD).astype('int64')
print(coin_ico['is_successful'])
# The 'is_successful' column now indicates whether the respective tokens had successful ICOs.

0       1
1       0
2       1
3       0
4       0
       ..
6410    0
6411    0
6412    0
6413    0
6414    0
Name: is_successful, Length: 5978, dtype: int64

# Remove the raw (pre-winsorised) columns so they cannot introduce
# collinearity with their winsorised counterparts in later analysis.
pre_winsorised = ['ico_duration', 'raised_usd', 'distributed_in_ico']
coin_ico = coin_ico.drop(pre_winsorised, axis='columns')
coin_ico.head()

# Calculate the successful fund raising rate if the soft cap is set to $500000
# as stated in the reference paper.
successful_count = (coin_ico['raised_usd_winsorised'] >= 500000).sum()
print(successful_count / len(coin_ico))

0.27350284376045503

#Creating a dummy variable represented by 2 columns with regards to whether the respective stock is on whitelist coin_ico = pd.get_dummies(coin_ico, columns=['whitelist']) coin_ico.head()

# General visualisation of correlations amongst the variables created & cleaned
f, ax = plt.subplots(figsize=(10, 8))
# Select the numeric/boolean columns explicitly: the frame still holds text
# columns (e.g. name, country, the formatted dates). Older pandas dropped them
# silently inside corr(), newer pandas raises on them instead.
corr = coin_ico.select_dtypes(include=['number', 'bool']).corr()
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Plot heatmap
sns.heatmap(corr, cmap=cmap, square=True, ax=ax)

#g = sns.pairplot(coin_ico[['is_successful','ico_duration', 'is_ico', 'raised_usd', 'kyc','rating','bonus','teamsize','whitelist_No','whitelist_Yes','country']], hue='is_successful',palette='gnuplot2') #g.fig.suptitle("Pairplot, colour-coded with country of origin",y=1.1)

def successful_visual(dependent, independent, dataframe, colour):
    """Show a scatter plot of `dependent` (y) against `independent` (x).

    Args:
        dependent: column name plotted on the y-axis (violin marginal).
        independent: column name plotted on the x-axis (box marginal).
        dataframe: DataFrame holding the columns.
        colour: column name used to colour the points.

    Rows whose `independent` value is missing are excluded. The figure is
    displayed with `fig.show()`; nothing is returned.
    """
    # `!= None` is evaluated element-wise and is True for every row, NaN
    # included, so it never filtered anything; notna() drops missing values.
    plotted = dataframe[dataframe[independent].notna()]
    fig = px.scatter(
        plotted,
        x=independent,
        y=dependent,
        color=colour,
        title="Successfulness:" + dependent,
        marginal_y="violin",
        marginal_x="box",
    )
    fig.show()

# Scatter of raised_usd_winsorised (y) against rating (x), coloured by rating
successful_visual('raised_usd_winsorised', 'rating', coin_ico,'rating')

# Scatter of raised_usd_winsorised (y) against ico_duration_winsorised (x),
# coloured by distributed_in_ico_winsorised
successful_visual('raised_usd_winsorised', 'ico_duration_winsorised', coin_ico,'distributed_in_ico_winsorised')

# Here 'country' is passed as the dependent (y) column; raised_usd_winsorised
# is used both for the x-axis and for the colour scale
successful_visual('country', 'raised_usd_winsorised', coin_ico,'raised_usd_winsorised')

Experimenting on Regressions

# Regression trials: OLS of the winsorised amount raised on team size, rating,
# winsorised ICO duration and winsorised share distributed in the ICO.
ols_features = ['teamsize', 'rating', 'ico_duration_winsorised', 'distributed_in_ico_winsorised']
y = coin_ico['raised_usd_winsorised']
# add_constant supplies the intercept column for the regression
X = sm.add_constant(coin_ico[ols_features])
# Note the difference in argument order: dependent variable first, regressors second
model = sm.OLS(y.astype(float), X.astype(float), missing='drop').fit()
# Make the predictions by the model
predictions = model.predict(X.astype(float))
# Print out the statistics
print(model.summary())

                              OLS Regression Results                             
=================================================================================
Dep. Variable:     raised_usd_winsorised   R-squared:                       0.032
Model:                               OLS   Adj. R-squared:                  0.031
Method:                    Least Squares   F-statistic:                     32.34
Date:                   Sun, 24 Oct 2021   Prob (F-statistic):           1.47e-26
Time:                           15:13:36   Log-Likelihood:                -68498.
No. Observations:                   3914   AIC:                         1.370e+05
Df Residuals:                       3909   BIC:                         1.370e+05
Df Model:                              4                                         
Covariance Type:               nonrobust                                         
=================================================================================================
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const                         -1.481e+06   7.59e+05     -1.951      0.051   -2.97e+06    6941.113
teamsize                       1.431e+05   2.34e+04      6.109      0.000    9.71e+04    1.89e+05
rating                         1.366e+06   2.58e+05      5.291      0.000     8.6e+05    1.87e+06
ico_duration_winsorised       -1.066e+04   2107.527     -5.060      0.000   -1.48e+04   -6532.727
distributed_in_ico_winsorised  -709.5961   1405.685     -0.505      0.614   -3465.542    2046.350
==============================================================================
Omnibus:                     3490.286   Durbin-Watson:                   1.928
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           105137.312
Skew:                           4.312   Prob(JB):                         0.00
Kurtosis:                      26.881   Cond. No.                         590.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

                              OLS Regression Results                             
=================================================================================
Dep. Variable:     raised_usd_winsorised   R-squared:                       0.032
Model:                               OLS   Adj. R-squared:                  0.031
Method:                    Least Squares   F-statistic:                     32.34
Date:                   Sun, 24 Oct 2021   Prob (F-statistic):           1.47e-26
Time:                           15:13:36   Log-Likelihood:                -68498.
No. Observations:                   3914   AIC:                         1.370e+05
Df Residuals:                       3909   BIC:                         1.370e+05
Df Model:                              4                                         
Covariance Type:               nonrobust                                         
=================================================================================================
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const                         -1.481e+06   7.59e+05     -1.951      0.051   -2.97e+06    6941.113
teamsize                       1.431e+05   2.34e+04      6.109      0.000    9.71e+04    1.89e+05
rating                         1.366e+06   2.58e+05      5.291      0.000     8.6e+05    1.87e+06
ico_duration_winsorised       -1.066e+04   2107.527     -5.060      0.000   -1.48e+04   -6532.727
distributed_in_ico_winsorised  -709.5961   1405.685     -0.505      0.614   -3465.542    2046.350
==============================================================================
Omnibus:                     3490.286   Durbin-Watson:                   1.928
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           105137.312
Skew:                           4.312   Prob(JB):                         0.00
Kurtosis:                      26.881   Cond. No.                         590.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# Regression trials: logit model of is_successful
# log(0) evaluates to -inf, and missing='drop' only removes NaN rows, so an
# infinite log_rating would not be dropped; map infinities to NaN so such rows
# are excluded like any other missing value.
coin_ico['log_rating'] = np.log(coin_ico['rating']).replace([np.inf, -np.inf], np.nan)
X = coin_ico[['teamsize', 'log_rating', 'kyc', 'whitelist_No', 'ico_duration_winsorised']]
y = coin_ico['is_successful']
# add_constant supplies the intercept column for the regression
X = sm.add_constant(X)
# Note the difference in argument order: dependent variable first, regressors second
model = sm.Logit(y, X.astype(float), missing='drop').fit()
# Print out the statistics
print(model.summary2())

Optimization terminated successfully.
         Current function value: 0.567883
         Iterations 6
                             Results: Logit
========================================================================
Model:                 Logit              Pseudo R-squared:   0.087     
Dependent Variable:    is_successful      AIC:                4457.3908 
Date:                  2021-10-24 15:13   BIC:                4495.0247 
No. Observations:      3914               Log-Likelihood:     -2222.7   
Df Model:              5                  LL-Null:            -2435.5   
Df Residuals:          3908               LLR p-value:        8.5636e-90
Converged:             1.0000             Scale:              1.0000    
No. Iterations:        6.0000                                           
------------------------------------------------------------------------
                         Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
------------------------------------------------------------------------
const                   -3.9730   0.2270 -17.5052 0.0000 -4.4179 -3.5282
teamsize                 0.0394   0.0055   7.2132 0.0000  0.0287  0.0501
log_rating               2.9291   0.2125  13.7812 0.0000  2.5125  3.3456
kyc                     -0.6173   0.0806  -7.6603 0.0000 -0.7752 -0.4594
whitelist_No            -0.3649   0.0773  -4.7223 0.0000 -0.5164 -0.2135
ico_duration_winsorised -0.0030   0.0006  -5.2108 0.0000 -0.0041 -0.0019
========================================================================

# Marginal effects (dy/dx) of the fitted logit model's regressors.
# NOTE(review): at="mean" requests effects evaluated at the mean of the
# regressors, whereas the variable name says "average" (which would be
# at="overall" per the statsmodels docs) - confirm which one is intended.
average_marginal_effect = model.get_margeff(at = "mean", method = "dydx")
print(average_marginal_effect.summary())

        Logit Marginal Effects       
=====================================
Dep. Variable:          is_successful
Method:                          dydx
At:                              mean
===========================================================================================
                             dy/dx    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
teamsize                    0.0081      0.001      7.209      0.000       0.006       0.010
log_rating                  0.6043      0.043     14.198      0.000       0.521       0.688
kyc                        -0.1274      0.017     -7.713      0.000      -0.160      -0.095
whitelist_No               -0.0753      0.016     -4.732      0.000      -0.106      -0.044
ico_duration_winsorised    -0.0006      0.000     -5.236      0.000      -0.001      -0.000
===========================================================================================

# Logistic fit of is_successful against team size
sns.regplot(x = 'teamsize', y = "is_successful", data = coin_ico,logistic=True)
plt.ylabel("Probability of Successful ICO")
plt.xlabel("Team Size")

# Logistic fit of is_successful against the log of the rating
sns.regplot(x = 'log_rating', y = "is_successful", data = coin_ico,logistic=True)
plt.ylabel("Probability of Successful ICO")
plt.xlabel("Logarithmic form of Rating")

# Logistic fit of is_successful against the kyc column
sns.regplot(x = 'kyc', y = "is_successful", data = coin_ico,logistic=True)
plt.ylabel("Probability of Successful ICO")
plt.xlabel("Know Your Customer Dummy Variable")

# Logistic fit of is_successful against the whitelist_No dummy column
sns.regplot(x = 'whitelist_No', y = "is_successful", data = coin_ico,logistic=True)
plt.ylabel("Probability of Successful ICO")
plt.xlabel("Not being on White List Dummy Variable ")

# Logistic fit of is_successful against the winsorised ICO duration
sns.regplot(x = 'ico_duration_winsorised', y = "is_successful", data = coin_ico,logistic=True)
plt.ylabel("Probability of Successful ICO")
plt.xlabel("ICO Duration")