import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the ICO/token dataset and print a quick structural overview
# (column names, dtypes, non-null counts).
csv_path = '/work/tord_v3_edited.csv'
df = pd.read_csv(csv_path)
df.info()
# Count the number of tokens accepted as payment: 'accepting' holds a
# comma-separated list, so a non-missing entry contains (#commas + 1) tokens.
df = df.copy()
# Fix: the old row loop tested `df.loc[i,'accepting'] != 'nan'`, i.e. compared
# against the *string* 'nan'. A real float NaN is unequal to everything, so
# missing rows slipped through and were counted as 1 token. pd.Series.notna()
# is the correct missingness test, and the vectorized form replaces the
# per-row loop entirely.
df['accept_count'] = np.where(
    df['accepting'].notna(),
    df['accepting'].astype(str).str.count(',') + 1,
    0,
)
df['accept_count'].describe()
# Inspect the largest raises first.
df = df.sort_values('raised_usd', ascending=False)
df.head()
# Dummy: 1 if a minimum investment is specified, 0 otherwise.
# Fix: `x != np.nan` is ALWAYS True — NaN compares unequal to everything,
# including itself — so the old np.where produced 1 for every single row.
# notna() is the correct missingness test.
df['min_invest_dum'] = df['min_investment'].notna().astype(int)
df['min_invest_dum'].describe()
# Data cleaning: keep only projects that are listed as an ICO and whose
# ICO has actually finished (end date present).
df = df[df['is_ico'] == 1]
df = df[df['ico_end'].notnull()]
# Missing token / distribution figures are treated as zero.
for zero_col in ('token_for_sale', 'sold_tokens', 'distributed_in_ico', 'ERC20'):
    df[zero_col] = df[zero_col].fillna(0)
df['teamsize'].describe()
# Impute missing team sizes with the median value (11).
df['teamsize'] = df['teamsize'].fillna(11)
df['teamsize'].hist(bins=30)
plt.show()
# --- Outcome variable ---
# Sanitize "raised_usd": a missing raise amount counts as zero raised.
df['raised_usd'] = df['raised_usd'].fillna(0)
# Binary success flag: 1 when at least $500,000 was raised.
df['success'] = (df['raised_usd'] >= 500000).astype(int)
df['success'].describe()
# Eyeball pairwise relationships between outcome and candidate predictors.
sns.pairplot(df, vars=['raised_usd', 'teamsize', 'success', 'accept_count'])
# Check for potential multicollinearity among the regressors.
# Fix: df still contains object (string) columns; since pandas 2.0,
# DataFrame.corr() raises a TypeError unless non-numeric columns are
# excluded explicitly with numeric_only=True (older pandas only warned).
corr = df.corr(numeric_only=True)
sns.heatmap(corr)
plt.show()
# NOTE(review): IPython shell magic — runs only inside a notebook and
# re-installs on every execution; prefer pinning in a requirements file.
!pip install statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols, logit
# dependent/target/outcome variable
y = df['success']
# independent/predictor/explanatory variables
Xlist = ['token_for_sale', 'sold_tokens', 'teamsize', 'accept_count',
         'distributed_in_ico', 'ERC20']
X = df[Xlist]
# A. Logit regression
# Fix: sm.Logit does NOT add an intercept automatically, so the original
# model was forced through the origin, biasing every coefficient.
# add_constant() on a float copy keeps X itself unchanged for the later
# OLS/lazypredict blocks that reuse it.
# missing='drop' drops rows with missing values from the regression.
X_logit = sm.add_constant(X.astype(float))
logit_model = sm.Logit(y, X_logit, missing='drop')
# fit the logit model to the data
result = logit_model.fit()
# summarize the logit model
print(result.summary2())
# Fitted logistic curves: success probability against single predictors.
# y_jitter spreads the overlapping 0/1 outcome points vertically so the
# scatter is readable.
sns.regplot(x="sold_tokens", y="success", data=df,
            logistic=True, y_jitter=.05)
plt.ylabel("success probability")
sns.regplot(x="teamsize", y="success", data=df,
            logistic=True, y_jitter=.05)
plt.ylabel("success probability")
# get_margeff reference — options for "at":
#   'overall' : average of the marginal effects at each observation
#   'mean'    : marginal effects at the mean of each regressor
#   'median'  : marginal effects at the median of each regressor
#   'zero'    : marginal effects at zero for each regressor
#   'all'     : marginal effects at each observation
# options for "method":
#   'dydx' : no transformation; raw marginal effects are returned
#   'eyex' : elasticities of the variables in exog
#   'dyex' : semi-elasticity
#   'eydx' : semi-elasticity
# Marginal effects of the fitted logit, evaluated at the regressor means.
average_marginal_effect = result.get_margeff(at="mean", method="dydx")
print(average_marginal_effect.summary())
# B. Linear Probability Model — OLS of the binary outcome on the same
# regressors (note: this is OLS, not a logit, despite the original label).
# OLS needs the intercept column added explicitly.
X = sm.add_constant(X)
lpm = sm.OLS(y, X.astype(float), missing='drop')
result = lpm.fit()
print(result.summary2())
# Use wrapper lazypredict to benchmark many sklearn classifiers at once.
# NOTE(review): IPython shell magic — notebook-only line.
!pip install lazypredict
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.model_selection import train_test_split
# Hold out 20% of the data; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
# Fit the full battery of default sklearn classifiers; predictions=True
# also collects each model's test-set predictions.
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
# Bare expressions: notebook display of the leaderboard and predictions.
models
predictions