import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the ICO dataset and run a first exploratory pass.
df = pd.read_csv('/work/tord_v3_edited.csv')
df.info()
df.head(20)
df.describe()

# Sanitize "raised_usd": treat missing amounts as zero raised.
df['raised_usd'] = df['raised_usd'].fillna(0)

# Outcome variable: a campaign is a "success" if it raised >= $500k.
df['success'] = np.where(df['raised_usd'] >= 500000, 1, 0)
df['success'].describe()

sns.pairplot(df, vars=['raised_usd', 'teamsize', 'success'])

# Potential multicollinearity issue: inspect pairwise correlations.
# BUG FIX: numeric_only=True — since pandas 2.0, corr() no longer silently
# drops non-numeric columns and raises a TypeError on this mixed-dtype frame.
corr = df.corr(numeric_only=True)
sns.heatmap(corr)
plt.show()

# Fill remaining missing predictors with conservative defaults.
df['rating'] = df['rating'].fillna(0)
df['ERC20'] = df['ERC20'].fillna(0)
df['teamsize'] = df['teamsize'].fillna(1)  # assume at least a one-person team

candidate_variables = ['teamsize', 'rating', 'bonus', 'sold_tokens', 'bounty', 'kyc']
current_variables = []
target = ['success']
# --- Feature engineering ---

# Interaction term: team size scaled by expert rating.
df['ratxteamsize'] = df['teamsize'] * df['rating']
df['ratxteamsize'].head(20)

# BUG FIX: fillna() returns a new Series; the original bare call discarded
# the result (a no-op). Assign it back.
df['ERC20'] = df['ERC20'].fillna(0)
print(df['ERC20'].isna().sum())

# Encode whitelist as a 0/1 dummy ('Yes'/'No'; anything else becomes NaN).
df['whitelist'] = df['whitelist'].map({'Yes': 1, 'No': 0})
print(df['whitelist'].head())
# BUG FIX: the original `df['whitelist'].fillna(0)` was a no-op, leaving NaNs
# that propagated into the whxky interaction below.
df['whitelist'] = df['whitelist'].fillna(0)
df['whitelist'].isna().sum()
df['whitelist'].head()

# Interaction: whitelisted AND KYC-verified.
df['whxky'] = df['whitelist'] * df['kyc']
df['whxky'].head()

# Log-transform sold tokens.
# BUG FIX: np.log(0) yields -inf, which fillna() does NOT replace (it only
# handles NaN). Mask non-positive counts to NaN first so they end up as 0,
# which is what the fillna(0) below clearly intended.
df['logsoldtokens'] = np.log(df['sold_tokens'].where(df['sold_tokens'] > 0))
print(df['logsoldtokens'].head())
df['logsoldtokens'] = df['logsoldtokens'].fillna(0)
print(df['logsoldtokens'].head())

# LinkedIn link -> 0/1 dummy: 'TBD' and missing both mean "no link".
df.loc[df['linkedin_link'] == "TBD", 'linkedin_link'] = 0
# BUG FIX: assign the fillna result; in the original the NaNs survived and the
# `!= 0` test below counted missing links as present (NaN != 0 is True).
df['linkedin_link'] = df['linkedin_link'].fillna(0)
df.loc[df['linkedin_link'] != 0, "linkedin_link"] = 1
print(df['linkedin_link'].sum())
# White-paper link -> 0/1 dummy.
# FIX: column-level `fillna(..., inplace=True)` is chained assignment
# (deprecated in pandas 2.x, removed in 3.0); assign the result instead.
df['link_white_paper'] = df['link_white_paper'].fillna(0)
df.loc[df['link_white_paper'] != 0, "link_white_paper"] = 1
print(df['link_white_paper'].sum())
print(df['link_white_paper'].head())

# Interaction: has a LinkedIn profile AND a white paper.
df['linkxwhitep'] = df['linkedin_link'] * df['link_white_paper']
print(df['linkxwhitep'].head())

# GitHub link -> 0/1 dummy (the literal string "None" also counts as missing).
df['github_link'] = df['github_link'].fillna(0)
df.loc[df['github_link'] == "None", "github_link"] = 0
df.loc[df['github_link'] != 0, "github_link"] = 1
print(df['github_link'].sum())

# ICO start/end dates.
df['ico_start'].iloc[0]
df['ico_start'] = df['ico_start'].fillna(0)
print(df['ico_start'].head())
df['ico_end'] = df['ico_end'].fillna(0)
print(df['ico_end'].head())

# A started ICO with no recorded end is treated as still running: cap it at
# the data-collection date.
# BUG FIX: the original per-row loop used chained assignment
# (df['ico_end'][i] = ...), which can silently write to a copy instead of the
# frame; a single vectorized .loc write is correct and runs in C.
df.loc[(df['ico_end'] == 0) & (df['ico_start'] != 0), 'ico_end'] = "10/24/2021"

# Parse to datetimes (unparseable / 0 entries become NaT) and derive the
# campaign length in days.
df['ico_start'] = pd.to_datetime(df['ico_start'], errors='coerce', dayfirst=False)
df['ico_end'] = pd.to_datetime(df['ico_end'], errors='coerce', dayfirst=False)
df['ico_length'] = (df['ico_end'] - df['ico_start']).dt.days
print(df['ico_length'].head())

# Negative lengths are data errors (end before start): clamp to one day.
# (NaN < 0 is False, so missing lengths are left untouched, as before.)
df.loc[df['ico_length'] < 0, 'ico_length'] = 1
!pip install statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols, logit
# dependent/target/outcome variable
y = df['success']
# independent/predictor/explanatory variable
X = df[['teamsize', 'rating']]
# A. Logit regression
# turn independent variables into floating type (best practice)
# 'missing='drop'' drops rows with missing values from the regression
logit_model=sm.Logit(y,X.astype(float), missing='drop' )
# fit logit model into the data
result=logit_model.fit()
# summarize the logit model
print(result.summary2())
print(result.tvalues)
sns.regplot(x = "rating", y = "success", data = df,
logistic = True, y_jitter = .05)
plt.ylabel("success probability")
'''
options for "at"
1. 'overall' The average of the marginal effects at each observation
2. 'mean' The marginal effects at the mean of each regressor
3. 'median' The marginal effects at the median of each regressor
4. 'zero' The marginal effects at zero for each regressor
5. 'all' The marginal effects at each observation.
options for "method"
1. 'dydx' No transformation is made and amrginal effects are returned
2. 'eyex' estimate elasticities of variables in exog
3. 'dyex' estimate semi-elasticity
4. 'eydx' estimate semi-elasticity
'''
average_marginal_effect = result.get_margeff(at = "mean", method = "dydx")
print(average_marginal_effect.summary())
# B. Linear Probability Model
# logit regression
X = sm.add_constant(X)
ols_model=sm.OLS(y,X.astype(float), missing='drop')
result=ols_model.fit()
print(result.summary2())
#print(result.rsquared_adj)
# Use wrapper lazypredict
!pip install lazypredict
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.model_selection import train_test_split
# load data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
# fit all models
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
models
predictions
models['Accuracy']['LogisticRegression']
def prsq(variables, target, basetable):
    """Fit a logit of `target` on `variables` over `basetable` and return
    McFadden's pseudo R-squared of the fitted model."""
    exog = basetable[variables]
    endog = basetable[target]
    # missing='drop' discards rows with missing values before fitting.
    fitted = sm.Logit(endog, exog.astype(float), missing='drop').fit()
    return fitted.prsquared
def next_best(current_variables, candidate_variables, target, basetable):
    """Return the candidate whose addition to `current_variables` yields the
    highest logit pseudo R-squared on `basetable`.

    Ties go to the later candidate (the comparison uses >=), matching the
    original search order.
    """
    best_score, best_candidate = -1, None
    for candidate in candidate_variables:
        score = prsq(current_variables + [candidate], target, basetable)
        if score >= best_score:
            best_score, best_candidate = score, candidate
    return best_candidate
# One forward-selection step, starting from the baseline pair of predictors.
candidate_variables = ['bonus', 'sold_tokens', 'bounty', 'linkedin_link']
current_variables = ['teamsize', 'rating']
target = ['success']
next_variable = next_best(current_variables, candidate_variables, target, df)
print(next_variable)

# Full forward selection by pseudo R-squared, keeping at most 5 predictors.
candidate_variables = ['teamsize', 'rating', 'bonus', 'sold_tokens', 'bounty', 'kyc']
current_variables = []
target = ['success']
max_number_variables = 5
number_iterations = min(max_number_variables, len(candidate_variables))
for _ in range(number_iterations):
    next_var = next_best(current_variables, candidate_variables, target, df)
    current_variables = current_variables + [next_var]
    candidate_variables.remove(next_var)
print(current_variables)

# Refit on the selected predictor set.
y = df['success']
X = df[current_variables]

# A. Logit regression — predictors cast to float, rows with missing
# values dropped before the fit.
logit_model = sm.Logit(y, X.astype(float), missing='drop')
result = logit_model.fit()
print(result.summary2())

# B. Linear Probability Model with an intercept.
X = sm.add_constant(X)
ols_model = sm.OLS(y, X.astype(float), missing='drop')
result = ols_model.fit()
print(result.summary2())

# Benchmark the selected predictors across classifiers.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
print(models['Accuracy']['LogisticRegression'])
print(models['ROC AUC']['LogisticRegression'])
def acctest(variables, target, basetable):
    """Score `variables` by the held-out ROC AUC of LazyClassifier's
    LogisticRegression on a fixed 80/20 split of `basetable`."""
    exog = basetable[variables]
    endog = basetable[target]
    exog_tr, exog_te, endog_tr, endog_te = train_test_split(
        exog, endog, test_size=.2, random_state=42)
    # LazyClassifier fits its whole model zoo; we only read one cell of the
    # resulting metrics table.
    scorer = LazyClassifier(predictions=True)
    metrics, _ = scorer.fit(exog_tr, exog_te, endog_tr, endog_te)
    return metrics['ROC AUC']['LogisticRegression']
def next_best2(current_variables, candidate_variables, target, basetable):
    """Forward-selection step scored by held-out ROC AUC (see acctest).

    Returns the candidate whose addition scores highest; ties go to the
    later candidate (the comparison uses >=).
    """
    best_score, best_candidate = -1, None
    for candidate in candidate_variables:
        score = acctest(current_variables + [candidate], target, basetable)
        if score >= best_score:
            best_score, best_candidate = score, candidate
    return best_candidate
# One AUC-scored forward-selection step from the baseline pair.
candidate_variables = ['bonus', 'sold_tokens', 'bounty']
current_variables = ['teamsize', 'rating']
target = ['success']
next_variable = next_best2(current_variables, candidate_variables, target, df)
# BUG FIX: the result was accidentally printed twice; once is enough.
print(next_variable)

# Full AUC-scored forward selection, keeping at most 5 predictors.
candidate_variables = ['teamsize', 'rating', 'bonus', 'sold_tokens', 'bounty', 'kyc']
current_variables = []
target = ['success']
max_number_variables = 5
number_iterations = min(max_number_variables, len(candidate_variables))
for i in range(0, number_iterations):
    next_var = next_best2(current_variables, candidate_variables, target, df)
    current_variables = current_variables + [next_var]
    candidate_variables.remove(next_var)
print(current_variables)

# Refit the logit on the AUC-selected predictors (cast to float; rows with
# missing values dropped).
y = df['success']
X = df[current_variables]
logit_model = sm.Logit(y, X.astype(float), missing='drop')
result = logit_model.fit()
print(result.summary2())

# Benchmark the AUC-selected predictors across classifiers.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
print(models['Accuracy']['LogisticRegression'])
print(models['ROC AUC']['LogisticRegression'])

# Quick sanity checks on the frame.
df.columns[2:]
df['name'].isnull().sum()

# Pseudo-R2-scored selection over the full engineered feature set,
# keeping at most 8 predictors.
candidate_variables = ['teamsize', 'rating', 'bonus', 'logsoldtokens', 'bounty',
                       'kyc', 'ratxteamsize', 'is_ico', 'ERC20', 'whitelist',
                       'whxky', 'link_white_paper', 'linkxwhitep', 'github_link',
                       'ico_length']
current_variables = []
target = ['success']
max_number_variables = 8
number_iterations = min(max_number_variables, len(candidate_variables))
for i in range(0, number_iterations):
    next_var = next_best(current_variables, candidate_variables, target, df)
    current_variables = current_variables + [next_var]
    candidate_variables.remove(next_var)
    print(next_var)
print(current_variables)

# AUC-scored selection over a trimmed feature set.
candidate_variables = ['teamsize', 'rating', 'bonus', 'logsoldtokens', 'bounty',
                       'kyc', 'ratxteamsize', 'is_ico', 'ERC20', 'whitelist',
                       'whxky', 'linkedin_link']
current_variables = []
target = ['success']
max_number_variables = 8
number_iterations = min(max_number_variables, len(candidate_variables))
for i in range(0, number_iterations):
    next_var = next_best2(current_variables, candidate_variables, target, df)
    current_variables = current_variables + [next_var]
    candidate_variables.remove(next_var)
print(current_variables)
# dependent/target/outcome variable
y = df['success']
# independent/predictor/explanatory variable
#test_variables = ['teamsize','rating','bonus','logsoldtokens','bounty','kyc','ratxteamsize','is_ico','ERC20','whitelist','whxky','linkedin_link','link_white_paper']
test_variables = ['teamsize','rating','bonus','logsoldtokens','bounty','kyc','ratxteamsize','is_ico','ERC20','whitelist','whxky','linkedin_link','link_white_paper','linkxwhitep','github_link','ico_length']
#X = df[current_variables]
X = df[test_variables]
# load data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
# fit all models
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
from sklearn.ensemble import RandomForestClassifier
#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)
X_train = np.nan_to_num(X_train.astype(np.float32))
clf.fit(X_train,y_train)
#df.current_variables
feature_imp = pd.Series(clf.feature_importances_,index=df[test_variables].columns).sort_values(ascending=False)
feature_imp
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Creating a bar plot
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.legend()
plt.show()
# Final specification: predictors chosen from the importance ranking above.
chosen_variables = ['is_ico', 'ratxteamsize', 'rating', 'bonus', 'teamsize',
                    'kyc', 'whitelist', 'logsoldtokens']

y = df['success']
X = df[chosen_variables]

# A. Logit regression — predictors cast to float, rows with missing
# values dropped before the fit.
logit_model = sm.Logit(y, X.astype(float), missing='drop')
result = logit_model.fit()
print(result.summary2())

# B. Linear Probability Model with an intercept.
X = sm.add_constant(X)
ols_model = sm.OLS(y, X.astype(float), missing='drop')
result = ols_model.fit()
print(result.summary2())

# Final benchmark of the chosen specification across classifiers.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
print('The accuracy of our model is', models['Accuracy']['LogisticRegression'])
print('The ROC AUC of our model is', models['ROC AUC']['LogisticRegression'])
models