import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
%pip install xlrd  # xlrd lets pandas read the legacy .xls workbook
m_wires = pd.read_excel('/work/BAB262-XLS-ENG.xls', sheet_name = 'Wires By Month modified')
m_wires
m_wires.info()
# extract the calendar month (1-12) from the datetime 'Month' column
m_wires['month in year'] = m_wires['Month'].dt.month
# reshape from wide (one column per year) to long format: one row per (year, month)
monthly = pd.melt(m_wires, id_vars = 'month in year', value_vars = ['2007','2008','2009','2010','2011','2012','2013'])
monthly.columns = ['month','year','wires']
monthly['ym'] = monthly['year'].map(str) + "/" + monthly['month'].map(str)
monthly['ym'] = pd.to_datetime(monthly['ym'],format = '%Y/%m')
monthly.head()
monthly['year'] = monthly['year'].astype(int)
# numeric time axis: the year plus the fraction of the year covered by the month
monthly['timeindex'] = monthly['year'] + monthly['month']/12
monthly['timeindex'].head()
monthly = monthly.dropna()
m, b = np.polyfit(monthly['timeindex'], monthly['wires'], 1)
m, b
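# Optional check (a quick sketch, not part of the original analysis): how much of the
# variance the straight-line trend explains, using the slope/intercept from np.polyfit.
trend = m * monthly['timeindex'] + b
ss_res = ((monthly['wires'] - trend) ** 2).sum()
ss_tot = ((monthly['wires'] - monthly['wires'].mean()) ** 2).sum()
print('R^2 of the linear trend:', 1 - ss_res / ss_tot)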
plt.figure(figsize = (16,9))
plt.scatter(monthly['timeindex'],monthly['wires'])
m, b = np.polyfit(monthly['timeindex'], monthly['wires'], 1)
plt.plot(monthly['timeindex'], m*monthly['timeindex'] + b)
plt.xlabel('year')
plt.ylabel('Amount of wires')
plt.show()
# drop one outlying observation (index label 11) before refitting the trend
monthly = monthly.drop(index = 11)
monthly.head()
%pip install statsmodels
import statsmodels.api as sm
from statsmodels import regression
X = monthly['timeindex']
y = monthly['wires']
# statsmodels OLS expects the endogenous variable first: sm.OLS(y, X)
X = sm.add_constant(X)
model = sm.OLS(y.astype(float), X.astype(float), missing='drop').fit()
predictions = model.predict(X.astype(float)) # fitted values from the model
# Print out the statistics
print(model.summary())
model.params
b1, m1 = model.params # intercept (const) and slope (timeindex)
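# Optional sketch: the OLS fit also gives a 95% confidence interval for the slope,
# which is a quick way to judge how precisely the trend is estimated.
print(model.conf_int().loc['timeindex'])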
plt.figure(figsize = (16,9))
plt.scatter(monthly['timeindex'], monthly['wires'])
plt.plot(monthly['timeindex'], m1*monthly['timeindex'] + b1)
plt.xlabel('year')
plt.ylabel('Amount of wires')
plt.show()
import seaborn as sns
sns.boxplot(monthly['wires'])
sns.stripplot(monthly['wires'])
plt.show()
np.percentile(monthly['wires'],1)
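# Optional sketch: an alternative way to flag unusual months is the 1.5*IQR rule
# that the box plot above uses.
q1, q3 = monthly['wires'].quantile([0.25, 0.75])
iqr = q3 - q1
print(monthly[(monthly['wires'] < q1 - 1.5 * iqr) | (monthly['wires'] > q3 + 1.5 * iqr)])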
nov = pd.read_excel('BAB262-XLS-ENG.xls', sheet_name = 'November 2010 Wires')
nov.head()
nov.info()
nov['date_n_time'] = pd.to_datetime(nov['date_n_time'],errors='coerce')
nov.info()
nov = nov.sort_values(by = 'date_n_time', ascending = True)
nov = nov.set_index('date_n_time')
nov.head()
nov['Transaction Amount'] = nov['Transaction Amount'].astype('float')
nov['Transaction Amount'].isnull().sum()
nov = nov.dropna()
nov.info()
plt.figure(figsize=(20,10))
nov['Transaction Amount'].plot()
plt.ylabel("transaction amount in USD")
plt.show()
nov['Transaction Amount'].describe()
nov['Transaction Amount'].sum()
# trim extremes: drop the 6 smallest and the 3 largest transactions
nov1 = nov.reset_index()
nov1 = nov1.sort_values(by = 'Transaction Amount', ascending = True)
nov1 = nov1.reset_index(drop = True)
nov1 = nov1.loc[6: ,:]
nov1 = nov1.sort_values(by = 'Transaction Amount', ascending = False)
nov1 = nov1.reset_index(drop = True)
nov1 = nov1.loc[3: ,:]
nov1.tail()
nov.sort_values(by = 'Transaction Amount', ascending = False).head(20)
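# Optional sketch: the same trimming (drop the 6 smallest and the 3 largest transactions)
# can be done more directly with nsmallest/nlargest; this assumes each row has a unique
# timestamp index, otherwise drop by position instead.
extremes = pd.concat([nov['Transaction Amount'].nsmallest(6),
                      nov['Transaction Amount'].nlargest(3)])
nov_trimmed = nov.drop(extremes.index)
print(len(nov), len(nov_trimmed))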
nov2 = nov1.set_index('date_n_time')
nov2 = nov2.resample("T").sum() # aggregate to per-minute totals ("T" = minute frequency)
nov2.head()
nov3 = nov2.reset_index()
nov3 = nov3.sort_values(by = 'date_n_time' , ascending = True)
nov3['Transaction Amount'].sum()
len(nov['Trans ID'].unique())
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
    # Determine rolling statistics
    rolmean = timeseries.rolling(window = 2).mean()
    rolstd = timeseries.rolling(window = 2).std()
    # Plot rolling statistics
    orig = plt.plot(timeseries, color='blue', label='Original')
    mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    std = plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    # Perform Dickey-Fuller test
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries.values, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
test_stationarity(nov2['Transaction Amount'])
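# Optional follow-up (sketch): if the ADF test above does not reject non-stationarity,
# first-order differencing of the per-minute totals is a common next step.
diffed = nov2['Transaction Amount'].diff().dropna()
test_stationarity(diffed)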
# Import the statsmodels module for regression and the adfuller function
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
def cointegration(dep, indep):
    # regress the dependent series on the independent one
    # (statsmodels OLS expects the endogenous variable first: sm.OLS(y, X))
    X = sm.add_constant(indep)
    y = dep
    model = sm.OLS(y.astype(float), X.astype(float), missing='drop').fit()
    predictions = model.predict(X.astype(float)) # fitted values from the model
    # Print out the statistics
    print(model.summary())
    b = model.params.iloc[1]
    # Run the ADF test on the regression residual (dep - b*indep)
    test_stationarity(dep - b*indep)
# both series come from the per-minute frame nov3; the timestamps are converted to
# integer nanoseconds so they can enter the regression as numbers
cointegration(nov3['date_n_time'].astype('int64'), nov3['Transaction Amount'])
%pip install pystan==2.19.1.1  # only needed for older prophet builds; recent prophet uses cmdstanpy
%pip install prophet
from prophet import Prophet
from prophet.plot import plot_plotly
nov = nov.reset_index()
# rebuild the per-minute frame and assemble the Prophet training dataframe (ds, y)
nov2 = nov1.set_index('date_n_time')
nov2 = nov2.resample("T").sum()
nov2.head()
nov3 = nov2.reset_index()
nov3 = nov3.sort_values(by = 'date_n_time' , ascending = True)
train_dataset= pd.DataFrame()
train_dataset['ds'] = nov3['date_n_time']
train_dataset['y']= nov3['Transaction Amount']
train_dataset= train_dataset.dropna()
train_dataset.head(2)
train_dataset['y'].hist(bins = 100)
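# Optional sketch: if the histogram above looks heavy-tailed, a log1p transform of the
# target is sometimes applied before fitting Prophet (purely illustrative, not used below).
train_log = train_dataset.copy()
train_log['y'] = np.log1p(train_log['y'])
train_log['y'].hist(bins = 100)
plt.show()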
# create a Prophet instance with default values to fit the dataset
prophet_basic = Prophet()
prophet_basic.fit(train_dataset)
# create a dataframe with ds (i.e., datetime stamp) holding the time series of dates we need for prediction
# periods * freq sets how far to extend into the future (here 1000 minutes, since freq = 'T')
future = prophet_basic.make_future_dataframe(periods=1000, freq = 'T')
future.tail()
# forecast per-minute wire transaction amounts
forecast = prophet_basic.predict(future)
# plot the predicted amounts
fig1 = prophet_basic.plot(forecast)
# plot the trend and seasonality
fig1 = prophet_basic.plot_components(forecast)
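# Optional sketch: Prophet ships time-series cross-validation helpers that can be used to
# sanity-check the fit; the initial/period/horizon windows below are illustrative values
# chosen for roughly one month of per-minute data.
from prophet.diagnostics import cross_validation, performance_metrics
df_cv = cross_validation(prophet_basic, initial='14 days', period='3 days', horizon='1 day')
print(performance_metrics(df_cv).head())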
# identify changepoints (i.e., datetime points where the time series experiences abrupt changes)
from prophet.plot import add_changepoints_to_plot
fig = prophet_basic.plot(forecast)
a = add_changepoints_to_plot(fig.gca(), prophet_basic, forecast)
prophet_basic.changepoints
# adjust trend sensitivity with the "changepoint_prior_scale" parameter
# default is 0.05: a lower value gives a less flexible trend, a higher value a more flexible one
pro_change= Prophet(changepoint_prior_scale=0.15)
forecast = pro_change.fit(train_dataset).predict(future)
fig= pro_change.plot(forecast)
a = add_changepoints_to_plot(fig.gca(), pro_change, forecast)
from sklearn import tree
from sklearn.model_selection import train_test_split
# classification target: the raw timestamps are almost all unique, so the day of the
# month is used as the class label here; the feature is the per-minute amount
y = nov3['date_n_time'].dt.day
x = nov3[['Transaction Amount']]
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size = 0.3)
Xtrain.index = range(Xtrain.shape[0])
clf = tree.DecisionTreeClassifier(criterion = "gini",
                                  random_state = 0,
                                  max_depth = 4,
                                  #min_samples_split =
                                  #min_samples_leaf =
                                  )
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest)
score
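# Optional sketch: visualize the top of the fitted tree to see which amount thresholds
# drive the splits (uses sklearn's plot_tree; clf is the classifier fitted above).
plt.figure(figsize = (16,9))
tree.plot_tree(clf, feature_names=['Transaction Amount'], filled=True, max_depth=2)
plt.show()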