# Objective: To predict successful ICOs (who are the winners in the cryptocurrency ICOs?)
!pip install statsmodels
import statsmodels.api as sm
import numpy as np # linear algebra
import pandas as pd # data processing
import seaborn as sns
import matplotlib.pyplot as plt
import numpy.ma as ma # deal with NaN
import scipy.stats
from scipy import stats
import datetime as dt
import plotly.express as px
Requirement already satisfied: statsmodels in /root/venv/lib/python3.7/site-packages (0.13.0)
Requirement already satisfied: pandas>=0.25 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.2.5)
Requirement already satisfied: numpy>=1.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.19.5)
Requirement already satisfied: patsy>=0.5.2 in /root/venv/lib/python3.7/site-packages (from statsmodels) (0.5.2)
Requirement already satisfied: scipy>=1.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.7.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2021.3)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)
WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
Preparatory work and DataFrame Cleaning
# Import dataset.
# The CSV is read with ';' as the field separator; per the original author's
# note, these read options resolved a parser error when loading the file.
df = pd.read_csv("tord_v3.csv", delimiter=';', skiprows=0, low_memory=False)

# Keep only the columns used in the later analysis. The explicit .copy() makes
# coin_ico an independent DataFrame, so the column assignments below modify it
# directly instead of triggering SettingWithCopyWarning on a slice of df.
coin_ico = df[['name', 'country', 'is_ico', 'is_ieo', 'is_sto', 'ico_start',
               'ico_end', 'raised_usd', 'whitelist', 'kyc', 'bonus', 'rating',
               'teamsize', 'platform', 'distributed_in_ico']].copy()
df.head()
df.describe()

# Convert the ICO dates to datetime to minimise inconsistency, then derive the
# ICO length as (end - start).
coin_ico['ico_start'] = pd.to_datetime(coin_ico['ico_start'])
coin_ico['ico_end'] = pd.to_datetime(coin_ico['ico_end'])
coin_ico['ico_duration'] = coin_ico['ico_end'] - coin_ico['ico_start']

# Keep only the rows flagged as ICOs (is_ico == 1). Note this filters on the
# offering type flag, not on whether the fundraising was successful.
coin_ico = coin_ico[coin_ico['is_ico'] == 1].copy()

# Express the duration as a number of days instead of a timedelta.
coin_ico['ico_duration'] = coin_ico['ico_duration'].dt.days

# Store the dates as 'YYYY-MM-DD' strings; missing dates (NaT) become ''.
for date_column in ('ico_start', 'ico_end'):
    coin_ico[date_column] = coin_ico[date_column].apply(
        lambda x: x.strftime('%Y-%m-%d') if not pd.isnull(x) else ''
    )

# Replace missing values with fixed fill values (described by the original
# author as the column medians). Plain assignment is used instead of
# `coin_ico[col].fillna(..., inplace=True)`, which operates on a temporary
# column object and is not guaranteed to update coin_ico.
# NOTE(review): filling raised_usd with 0 treats ICOs with an unknown raised
# amount as having raised nothing -- confirm this is intended.
coin_ico['ico_duration'] = coin_ico['ico_duration'].fillna(31)
coin_ico['raised_usd'] = coin_ico['raised_usd'].fillna(0)
coin_ico['distributed_in_ico'] = coin_ico['distributed_in_ico'].fillna(0.5)
# Winsorize variables
def clip_series(s, lower, upper):
    """Return `s` with values clipped to its own quantile range.

    `lower` and `upper` are quantile levels (fractions between 0 and 1) passed
    to `s.quantile`; values below the lower quantile are raised to it and
    values above the upper quantile are lowered to it.
    """
    floor = s.quantile(lower)
    ceiling = s.quantile(upper)
    return s.clip(lower=floor, upper=ceiling)
# Features to be winsorised. Each one gets a companion '<name>_winsorised'
# column clipped at its 0.5% and 99.5% quantiles; the source column is kept.
winsorize_list = ['ico_duration', 'raised_usd', 'distributed_in_ico']
coin_ico = coin_ico.assign(**{
    f"{feature}_winsorised": clip_series(coin_ico[feature], 0.005, 0.995)
    for feature in winsorize_list
})
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
This is separate from the ipykernel package so we can avoid doing imports until
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
after removing the cwd from sys.path.
# Print the median of each column that had its missing values filled; these
# medians are what informed the fill values used in the previous fillna step.
for median_column in ('ico_duration', 'raised_usd', 'distributed_in_ico'):
    print(coin_ico[median_column].median())
31.0
0.0
0.5
# Create a dummy variable indicating whether the Initial Coin Offering was
# successful: 1 when the winsorised amount raised reaches the 500k USD
# threshold (the prescribed yardstick of a successful ICO), otherwise 0.
# The comparison is vectorised over the whole column; missing amounts compare
# as False and therefore map to 0, exactly as an element-by-element check would.
coin_ico['is_successful'] = (
    coin_ico['raised_usd_winsorised'] >= 500000
).astype('int64')
print(coin_ico['is_successful'])
# The 'is_successful' column now indicates whether the respective tokens had
# successful ICOs.
0 1
1 0
2 1
3 0
4 0
..
6410 0
6411 0
6412 0
6413 0
6414 0
Name: is_successful, Length: 5978, dtype: int64
# Drop the pre-winsorised columns so that only their winsorised counterparts
# remain, ruling out collinearity in the subsequent analysis.
pre_winsorised_columns = ['ico_duration', 'raised_usd', 'distributed_in_ico']
coin_ico = coin_ico.drop(columns=pre_winsorised_columns)
coin_ico.head()
# Share of ICOs that raised at least $500000, i.e. the success rate when the
# soft cap is set to $500000 as stated in the reference paper.
reached_soft_cap = coin_ico['raised_usd_winsorised'] >= 500000
print(reached_soft_cap.sum() / len(coin_ico))
0.27350284376045503
# One-hot encode the 'whitelist' column: it is replaced by one indicator column
# per distinct value, named 'whitelist_<value>' (the later models use
# 'whitelist_No').
dummy_source_columns = ['whitelist']
coin_ico = pd.get_dummies(data=coin_ico, columns=dummy_source_columns)
coin_ico.head()
# General visualisation of correlations amongst the variables created & cleaned
f, ax = plt.subplots(figsize=(10, 8))
# coin_ico still holds text columns (e.g. name, country, platform and the
# string-formatted dates). Select the numeric/boolean columns explicitly rather
# than relying on DataFrame.corr() to discard the rest silently, which newer
# pandas releases no longer do.
corr = coin_ico.select_dtypes(include=['number', 'bool']).corr()
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Plot heatmap
sns.heatmap(corr, cmap=cmap, square=True, ax=ax)
# NOTE(review): the disabled pairplot below references 'ico_duration' and
# 'raised_usd', which are dropped from coin_ico earlier in this file; update
# the column list before re-enabling it.
#g = sns.pairplot(coin_ico[['is_successful','ico_duration', 'is_ico', 'raised_usd', 'kyc','rating','bonus','teamsize','whitelist_No','whitelist_Yes','country']], hue='is_successful',palette='gnuplot2')
#g.fig.suptitle("Pairplot, colour-coded with country of origin",y=1.1)
def successful_visual(dependent, independent, dataframe, colour):
    """Show a plotly scatter of `dependent` against `independent`.

    Parameters
    ----------
    dependent : str
        Column of `dataframe` plotted on the y axis (with a violin marginal).
    independent : str
        Column of `dataframe` plotted on the x axis (with a box marginal).
        Rows where this column is missing are excluded from the plot.
    dataframe : pandas.DataFrame
        Data to plot.
    colour : str
        Column of `dataframe` used to colour the points.

    The figure is displayed via `fig.show()`; nothing is returned.
    """
    # `series != None` evaluates to True for every element in pandas (missing
    # values included), so it never filtered anything; notna() actually drops
    # the rows whose x value is missing.
    has_x_value = dataframe[independent].notna()
    fig = px.scatter(
        dataframe[has_x_value],
        x=independent,
        y=dependent,
        color=colour,
        title="Successfulness:" + dependent,
        marginal_y="violin",
        marginal_x="box",
    )
    fig.show()
# Scatter plots to draw, as (dependent / y, independent / x, colour) column
# names; each is rendered from coin_ico in the order listed.
scatter_specs = [
    ('raised_usd_winsorised', 'rating', 'rating'),
    ('raised_usd_winsorised', 'ico_duration_winsorised', 'distributed_in_ico_winsorised'),
    ('country', 'raised_usd_winsorised', 'raised_usd_winsorised'),
]
for dependent_col, independent_col, colour_col in scatter_specs:
    successful_visual(dependent_col, independent_col, coin_ico, colour_col)
Experimenting on Regressions
# Regression trial: OLS of the winsorised amount raised on team size, rating,
# winsorised ICO duration and winsorised share distributed in the ICO.
ols_regressors = ['teamsize', 'rating', 'ico_duration_winsorised', 'distributed_in_ico_winsorised']
y = coin_ico['raised_usd_winsorised']
# Add the intercept column to the regressors.
X = sm.add_constant(coin_ico[ols_regressors])
# statsmodels takes the dependent variable (endog) first and the regressors
# (exog) second; missing='drop' excludes rows with missing values from the fit.
ols_spec = sm.OLS(endog=y.astype(float), exog=X.astype(float), missing='drop')
model = ols_spec.fit()
# Model predictions evaluated on the regressor matrix.
predictions = model.predict(X.astype(float))
# Print out the statistics
print(model.summary())
OLS Regression Results
=================================================================================
Dep. Variable: raised_usd_winsorised R-squared: 0.032
Model: OLS Adj. R-squared: 0.031
Method: Least Squares F-statistic: 32.34
Date: Sun, 24 Oct 2021 Prob (F-statistic): 1.47e-26
Time: 15:13:36 Log-Likelihood: -68498.
No. Observations: 3914 AIC: 1.370e+05
Df Residuals: 3909 BIC: 1.370e+05
Df Model: 4
Covariance Type: nonrobust
=================================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------------------
const -1.481e+06 7.59e+05 -1.951 0.051 -2.97e+06 6941.113
teamsize 1.431e+05 2.34e+04 6.109 0.000 9.71e+04 1.89e+05
rating 1.366e+06 2.58e+05 5.291 0.000 8.6e+05 1.87e+06
ico_duration_winsorised -1.066e+04 2107.527 -5.060 0.000 -1.48e+04 -6532.727
distributed_in_ico_winsorised -709.5961 1405.685 -0.505 0.614 -3465.542 2046.350
==============================================================================
Omnibus: 3490.286 Durbin-Watson: 1.928
Prob(Omnibus): 0.000 Jarque-Bera (JB): 105137.312
Skew: 4.312 Prob(JB): 0.00
Kurtosis: 26.881 Cond. No. 590.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Regression trial: OLS of the winsorised amount raised on team size, rating,
# winsorised ICO duration and winsorised share distributed in the ICO.
# NOTE(review): this cell uses the same regressors and dependent variable as
# the OLS cell before it and prints an identical summary; consider removing
# one of the two.
ols_regressors = ['teamsize', 'rating', 'ico_duration_winsorised', 'distributed_in_ico_winsorised']
y = coin_ico['raised_usd_winsorised']
# Add the intercept column to the regressors.
X = sm.add_constant(coin_ico[ols_regressors])
# statsmodels takes the dependent variable (endog) first and the regressors
# (exog) second; missing='drop' excludes rows with missing values from the fit.
ols_spec = sm.OLS(endog=y.astype(float), exog=X.astype(float), missing='drop')
model = ols_spec.fit()
# Model predictions evaluated on the regressor matrix.
predictions = model.predict(X.astype(float))
# Print out the statistics
print(model.summary())
OLS Regression Results
=================================================================================
Dep. Variable: raised_usd_winsorised R-squared: 0.032
Model: OLS Adj. R-squared: 0.031
Method: Least Squares F-statistic: 32.34
Date: Sun, 24 Oct 2021 Prob (F-statistic): 1.47e-26
Time: 15:13:36 Log-Likelihood: -68498.
No. Observations: 3914 AIC: 1.370e+05
Df Residuals: 3909 BIC: 1.370e+05
Df Model: 4
Covariance Type: nonrobust
=================================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------------------
const -1.481e+06 7.59e+05 -1.951 0.051 -2.97e+06 6941.113
teamsize 1.431e+05 2.34e+04 6.109 0.000 9.71e+04 1.89e+05
rating 1.366e+06 2.58e+05 5.291 0.000 8.6e+05 1.87e+06
ico_duration_winsorised -1.066e+04 2107.527 -5.060 0.000 -1.48e+04 -6532.727
distributed_in_ico_winsorised -709.5961 1405.685 -0.505 0.614 -3465.542 2046.350
==============================================================================
Omnibus: 3490.286 Durbin-Watson: 1.928
Prob(Omnibus): 0.000 Jarque-Bera (JB): 105137.312
Skew: 4.312 Prob(JB): 0.00
Kurtosis: 26.881 Cond. No. 590.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Regression trial: logit model of the success dummy on team size, log rating,
# KYC, the "no whitelist" dummy and winsorised ICO duration.
# The log is only defined for positive ratings. Non-positive ratings are masked
# to NaN first so they are excluded by missing='drop' below; otherwise np.log
# would produce -inf / NaN-with-warning values, and infinities are not removed
# by missing='drop'.
rating = coin_ico['rating']
coin_ico['log_rating'] = np.log(rating.where(rating > 0))
X = coin_ico[['teamsize', 'log_rating', 'kyc', 'whitelist_No', 'ico_duration_winsorised']]
y = coin_ico['is_successful']
# Add the intercept column to the regressors.
X = sm.add_constant(X)
# statsmodels takes the dependent variable first and the regressors second.
model = sm.Logit(y, X.astype(float), missing='drop').fit()
# Print out the statistics
print(model.summary2())
Optimization terminated successfully.
Current function value: 0.567883
Iterations 6
Results: Logit
========================================================================
Model: Logit Pseudo R-squared: 0.087
Dependent Variable: is_successful AIC: 4457.3908
Date: 2021-10-24 15:13 BIC: 4495.0247
No. Observations: 3914 Log-Likelihood: -2222.7
Df Model: 5 LL-Null: -2435.5
Df Residuals: 3908 LLR p-value: 8.5636e-90
Converged: 1.0000 Scale: 1.0000
No. Iterations: 6.0000
------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
------------------------------------------------------------------------
const -3.9730 0.2270 -17.5052 0.0000 -4.4179 -3.5282
teamsize 0.0394 0.0055 7.2132 0.0000 0.0287 0.0501
log_rating 2.9291 0.2125 13.7812 0.0000 2.5125 3.3456
kyc -0.6173 0.0806 -7.6603 0.0000 -0.7752 -0.4594
whitelist_No -0.3649 0.0773 -4.7223 0.0000 -0.5164 -0.2135
ico_duration_winsorised -0.0030 0.0006 -5.2108 0.0000 -0.0041 -0.0019
========================================================================
# Marginal effects (dy/dx) of the fitted logit model.
# NOTE(review): despite the variable name, at="mean" requests the effects
# evaluated at the mean of the regressors (the printed summary reads
# "At: mean"), not effects averaged over observations -- confirm which one is
# intended.
margeff_options = {"method": "dydx", "at": "mean"}
average_marginal_effect = model.get_margeff(**margeff_options)
print(average_marginal_effect.summary())
Logit Marginal Effects
=====================================
Dep. Variable: is_successful
Method: dydx
At: mean
===========================================================================================
dy/dx std err z P>|z| [0.025 0.975]
-------------------------------------------------------------------------------------------
teamsize 0.0081 0.001 7.209 0.000 0.006 0.010
log_rating 0.6043 0.043 14.198 0.000 0.521 0.688
kyc -0.1274 0.017 -7.713 0.000 -0.160 -0.095
whitelist_No -0.0753 0.016 -4.732 0.000 -0.106 -0.044
ico_duration_winsorised -0.0006 0.000 -5.236 0.000 -0.001 -0.000
===========================================================================================
# Logistic fit of the success dummy against each predictor used in the logit
# model, one plot per predictor, as (column name, x-axis label) pairs.
logit_plot_specs = [
    ('teamsize', "Team Size"),
    ('log_rating', "Logarithmic form of Rating"),
    ('kyc', "Know Your Customer Dummy Variable"),
    ('whitelist_No', "Not being on White List Dummy Variable "),
    ('ico_duration_winsorised', "ICO Duration"),
]
for predictor, x_label in logit_plot_specs:
    # Give every plot its own figure and axes. Calling sns.regplot repeatedly
    # without an explicit axes draws onto the current axes, so plots run
    # together would be overlaid and only the last labels would survive.
    logit_fig, logit_ax = plt.subplots()
    sns.regplot(x=predictor, y="is_successful", data=coin_ico,
                logistic=True, ax=logit_ax)
    logit_ax.set_ylabel("Probability of Successful ICO")
    logit_ax.set_xlabel(x_label)
    plt.show()