import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
pip install xlrd
Collecting xlrd
Downloading xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
|████████████████████████████████| 96 kB 11.1 MB/s
Installing collected packages: xlrd
Successfully installed xlrd-2.0.1
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
# Load the 'Wires By Month modified' sheet of the workbook into a DataFrame.
m_wires = pd.read_excel('/work/BAB262-XLS-ENG.xls', sheet_name = 'Wires By Month modified')
# Bare expression: renders the frame when this is the last line of a notebook cell.
m_wires
# Print per-column dtypes and non-null counts.
m_wires.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Month 12 non-null datetime64[ns]
1 2007 12 non-null float64
2 2008 9 non-null float64
3 2009 11 non-null float64
4 2010 10 non-null float64
5 2011 12 non-null float64
6 2012 10 non-null float64
7 2013 12 non-null float64
dtypes: datetime64[ns](1), float64(7)
memory usage: 896.0 bytes
# Reshape the wide table (one column per year) into long form:
# one row per (month number, year label) with the value in 'wires'.
m_wires['month in year'] = m_wires['Month'].dt.month
monthly = pd.melt(
    m_wires,
    id_vars='month in year',
    value_vars=['2007', '2008', '2009', '2010', '2011', '2012', '2013'],
)
monthly.columns = ['month', 'year', 'wires']

# Timestamp for each row, parsed from "<year>/<month>".
monthly['ym'] = monthly['year'].map(str) + "/" + monthly['month'].map(str)
monthly['ym'] = pd.to_datetime(monthly['ym'], format='%Y/%m')
monthly.head()

# Numeric regressor in fractional years. Because month/12 is used (not
# (month - 1)/12), month 12 of year Y maps to exactly Y + 1.
monthly['year'] = monthly['year'].astype(int)
monthly['timeindex'] = monthly['year'] + monthly['month']/12
monthly['timeindex'].head()

# Rows with any missing value are removed before fitting; the original
# index labels are kept.
monthly = monthly.dropna()

# Degree-1 least-squares fit: m is the slope, b the intercept.
# The fit is computed once and reused for the plot below.
m, b = np.polyfit(monthly['timeindex'], monthly['wires'], 1)
m, b

plt.figure(figsize=(16, 9))
plt.scatter(monthly['timeindex'], monthly['wires'])
plt.plot(monthly['timeindex'], m*monthly['timeindex'] + b)
plt.xlabel('year')
plt.ylabel('Amount of wires')
plt.show()

# Remove the row labelled 11 (the twelfth row of the first melted year,
# '2007'). This happens after the polyfit above, so that fit still
# includes the row while anything computed later does not.
monthly = monthly.drop(index=11)
monthly.head()
pip install statsmodels
Requirement already satisfied: statsmodels in /usr/local/lib/python3.7/site-packages (0.13.0)
Requirement already satisfied: numpy>=1.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.19.5)
Requirement already satisfied: scipy>=1.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.7.1)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/site-packages (from statsmodels) (0.5.2)
Requirement already satisfied: pandas>=0.25 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.2.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2021.3)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
import statsmodels.api as sm
from statsmodels import regression
# Ordinary least squares of 'wires' on 'timeindex' using statsmodels.
X = monthly['timeindex']
y = monthly['wires']
# sm.OLS takes the response first and the regressors second, i.e. (y, X),
# whereas np.polyfit is called as (x, y).
# add_constant adds the intercept column so the model fits const + slope.
X = sm.add_constant(X)
# missing='drop' makes statsmodels discard observations containing NaN.
model = sm.OLS(y.astype(float), X.astype(float), missing='drop').fit()
predictions = model.predict(X.astype(float)) # fitted values for the same X used in the fit
# Print the regression summary table
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: wires R-squared: 0.002
Model: OLS Adj. R-squared: -0.012
Method: Least Squares F-statistic: 0.1287
Date: Mon, 01 Nov 2021 Prob (F-statistic): 0.721
Time: 12:59:25 Log-Likelihood: -379.78
No. Observations: 75 AIC: 763.6
Df Residuals: 73 BIC: 768.2
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -1176.9605 4424.457 -0.266 0.791 -9994.891 7640.970
timeindex 0.7896 2.201 0.359 0.721 -3.596 5.175
==============================================================================
Omnibus: 1.224 Durbin-Watson: 1.326
Prob(Omnibus): 0.542 Jarque-Bera (JB): 1.290
Skew: 0.259 Prob(JB): 0.525
Kurtosis: 2.620 Cond. No. 1.99e+06
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.99e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
model.params
# The fitted parameters are ordered (const, timeindex), so the first value
# is the intercept and the second the slope.
b1, m1 = model.params

# Scatter of the observations with the OLS regression line on top.
plt.figure(figsize=(16, 9))
plt.scatter(monthly['timeindex'], monthly['wires'])
plt.plot(monthly['timeindex'], m1*monthly['timeindex'] + b1)
plt.xlabel('year')
plt.ylabel('Amount of wires')
plt.show()

import seaborn as sns
# Pass the series as the keyword argument `x`: seaborn warns that positional
# data arguments other than `data` stop being accepted from version 0.12.
sns.boxplot(x=monthly['wires'])
sns.stripplot(x=monthly['wires'])
plt.show()
/shared-libs/python3.7/py/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
FutureWarning
/shared-libs/python3.7/py/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
FutureWarning
# 1st percentile of the monthly wire values.
np.percentile(monthly['wires'],1)
# Load the 'November 2010 Wires' sheet.
# NOTE(review): this path is relative while the earlier read_excel call uses
# '/work/BAB262-XLS-ENG.xls' — presumably the working directory is /work; confirm.
nov = pd.read_excel('BAB262-XLS-ENG.xls', sheet_name = 'November 2010 Wires')
nov.head()
# Print per-column dtypes and non-null counts.
nov.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17940 entries, 0 to 17939
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Transaction Amount 17908 non-null float64
1 Date of Transaction 17908 non-null datetime64[ns]
2 Time of Transaction 17908 non-null object
3 Trans ID 17908 non-null object
4 date_n_time 17928 non-null object
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 700.9+ KB
# Parse 'date_n_time' into datetimes; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
nov['date_n_time'] = pd.to_datetime(nov['date_n_time'],errors='coerce')
# Re-inspect dtypes and non-null counts after the conversion.
nov.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17940 entries, 0 to 17939
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Transaction Amount 17908 non-null float64
1 Date of Transaction 17908 non-null datetime64[ns]
2 Time of Transaction 17908 non-null object
3 Trans ID 17908 non-null object
4 date_n_time 17908 non-null datetime64[ns]
dtypes: datetime64[ns](2), float64(1), object(2)
memory usage: 700.9+ KB
# Order the transactions chronologically and use the timestamp as the index.
nov = nov.sort_values(by = 'date_n_time', ascending = True)
nov = nov.set_index('date_n_time')
nov.head()
# Ensure the amount column is float before counting missing values.
nov['Transaction Amount'] = nov['Transaction Amount'].astype('float')
# Number of rows with a missing amount.
nov['Transaction Amount'].isnull().sum()
# Drop every row that has a missing value in any column.
nov = nov.dropna()
nov.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17908 entries, 2010-11-01 01:02:00 to 2010-11-30 07:26:00
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Transaction Amount 17908 non-null float64
1 Date of Transaction 17908 non-null datetime64[ns]
2 Time of Transaction 17908 non-null object
3 Trans ID 17908 non-null object
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 699.5+ KB
# Line plot of every transaction amount against its timestamp index.
plt.figure(figsize=(20,10))
nov['Transaction Amount'].plot()
plt.ylabel("transaction amount in USD")
plt.show()
# Summary statistics and grand total of the amounts.
nov['Transaction Amount'].describe()
nov['Transaction Amount'].sum()
# Trim the extremes before aggregating: discard the 6 smallest and then the
# 3 largest transactions by amount.
# reset_index(drop=True) is used for the renumbering steps so that no
# leftover 'index' / 'level_0' bookkeeping columns are added to nov1.
nov1 = nov.reset_index()
nov1 = nov1.sort_values(by='Transaction Amount', ascending=True)
nov1 = nov1.reset_index(drop=True)
nov1 = nov1.iloc[6:]
nov1 = nov1.sort_values(by='Transaction Amount', ascending=False)
nov1 = nov1.reset_index(drop=True)
nov1 = nov1.iloc[3:]
nov1.tail()

# The 20 largest transactions in the untrimmed data, for comparison.
nov.sort_values(by='Transaction Amount', ascending=False).head(20)

# Total amount per one-minute bucket. Only 'Transaction Amount' is
# aggregated, so date/time/ID columns are never summed. "min" is the
# minute frequency alias (the single-letter "T" spelling is deprecated in
# recent pandas releases).
nov2 = nov1.set_index('date_n_time')[['Transaction Amount']]
nov2 = nov2.resample("min").sum()
nov2.head()

# Same data with the timestamp back as a regular column, in time order.
nov3 = nov2.reset_index()
nov3 = nov3.sort_values(by='date_n_time', ascending=True)
nov3['Transaction Amount'].sum()

# Number of distinct transaction IDs in the untrimmed data.
len(nov['Trans ID'].unique())
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries, window=2):
    """Plot rolling statistics of a series and print an ADF test report.

    Parameters
    ----------
    timeseries
        Series to examine. It must provide ``.rolling`` and ``.values``
        (a pandas Series is expected).
    window : int, default 2
        Number of observations in the rolling mean / rolling standard
        deviation window.

    Returns
    -------
    None
        The figure is shown and the test results are printed.
    """
    # Rolling statistics
    rolmean = timeseries.rolling(window=window).mean()
    rolstd = timeseries.rolling(window=window).std()

    # Plot the series together with its rolling mean and rolling std.
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Augmented Dickey-Fuller test; the lag length is chosen by AIC.
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries.values, autolag='AIC')
    # The first four entries of the result are labelled below; entry 4 is
    # the mapping of critical values keyed by significance level.
    dfoutput = pd.Series(
        dftest[0:4],
        index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'],
    )
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
# Check the per-minute transaction totals for stationarity.
test_stationarity(nov2['Transaction Amount'])
Results of Dickey-Fuller Test:
Test Statistic -21.083242
p-value 0.000000
#Lags Used 55.000000
Number of Observations Used 42089.000000
Critical Value (1%) -3.430505
Critical Value (5%) -2.861609
Critical Value (10%) -2.566807
dtype: float64
# Import the statsmodels module for regression and the adfuller function
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
def cointegration(dep, indep):
    """Regress ``dep`` on ``indep`` and test the residual spread for stationarity.

    An OLS model ``dep = const + b * indep`` is fitted and its summary is
    printed. The spread ``dep - b * indep`` is then passed to
    ``test_stationarity``, which plots it and prints an ADF test report.

    Parameters
    ----------
    dep
        Dependent (response) series; must be convertible to float.
    indep
        Independent (regressor) series; must be convertible to float and
        aligned with ``dep``.

    Returns
    -------
    None
    """
    # sm.OLS takes the response first and the regressors second.
    X = sm.add_constant(indep)
    model = sm.OLS(dep.astype(float), X.astype(float), missing='drop').fit()

    # Print out the statistics
    print(model.summary())

    # Parameters are ordered (const, slope). Convert to an array so the
    # slope is always picked by position, never by label.
    b = np.asarray(model.params)[1]

    # b is the coefficient of indep in the model for dep, so the residual
    # spread is dep - b*indep. Rows with missing values are dropped, matching
    # missing='drop' in the regression.
    spread = (dep - b*indep).dropna()
    test_stationarity(spread)
# NOTE(review): this call fails with the TypeError recorded below — the first
# argument is a datetime64 column, which cointegration() tries to cast to float.
# The intended pair of numeric series is unclear from here — TODO confirm.
cointegration(nov['date_n_time'], nov3['Transaction Amount'])
Execution error
TypeError: cannot astype a datetimelike from [datetime64[ns]] to [float64]
pip install pystan==2.19.1.1
Requirement already satisfied: pystan==2.19.1.1 in /usr/local/lib/python3.7/site-packages (2.19.1.1)
Requirement already satisfied: Cython!=0.25.1,>=0.22 in /usr/local/lib/python3.7/site-packages (from pystan==2.19.1.1) (0.29.24)
Requirement already satisfied: numpy>=1.7 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pystan==2.19.1.1) (1.19.5)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
pip install prophet
Requirement already satisfied: prophet in /usr/local/lib/python3.7/site-packages (1.0.1)
Requirement already satisfied: Cython>=0.22 in /usr/local/lib/python3.7/site-packages (from prophet) (0.29.24)
Requirement already satisfied: cmdstanpy==0.9.68 in /usr/local/lib/python3.7/site-packages (from prophet) (0.9.68)
Requirement already satisfied: pystan~=2.19.1.1 in /usr/local/lib/python3.7/site-packages (from prophet) (2.19.1.1)
Requirement already satisfied: numpy>=1.15.4 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from prophet) (1.19.5)
Requirement already satisfied: pandas>=1.0.4 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from prophet) (1.2.5)
Requirement already satisfied: matplotlib>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from prophet) (3.4.3)
Requirement already satisfied: LunarCalendar>=0.0.9 in /usr/local/lib/python3.7/site-packages (from prophet) (0.0.9)
Requirement already satisfied: convertdate>=2.1.2 in /usr/local/lib/python3.7/site-packages (from prophet) (2.3.2)
Requirement already satisfied: holidays>=0.10.2 in /usr/local/lib/python3.7/site-packages (from prophet) (0.11.3.1)
Requirement already satisfied: setuptools-git>=1.2 in /usr/local/lib/python3.7/site-packages (from prophet) (1.2)
Requirement already satisfied: python-dateutil>=2.8.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from prophet) (2.8.2)
Requirement already satisfied: tqdm>=4.36.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from prophet) (4.62.3)
Requirement already satisfied: ujson in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from cmdstanpy==0.9.68->prophet) (4.2.0)
Requirement already satisfied: pytz>=2014.10 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from convertdate>=2.1.2->prophet) (2021.3)
Requirement already satisfied: pymeeus<=1,>=0.3.13 in /usr/local/lib/python3.7/site-packages (from convertdate>=2.1.2->prophet) (0.5.11)
Requirement already satisfied: hijri-converter in /usr/local/lib/python3.7/site-packages (from holidays>=0.10.2->prophet) (2.2.2)
Requirement already satisfied: korean-lunar-calendar in /usr/local/lib/python3.7/site-packages (from holidays>=0.10.2->prophet) (0.2.1)
Requirement already satisfied: ephem>=3.7.5.3 in /usr/local/lib/python3.7/site-packages (from LunarCalendar>=0.0.9->prophet) (4.1)
Requirement already satisfied: pyparsing>=2.2.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib>=2.0.0->prophet) (2.4.7)
Requirement already satisfied: pillow>=6.2.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib>=2.0.0->prophet) (8.3.2)
Requirement already satisfied: kiwisolver>=1.0.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib>=2.0.0->prophet) (1.3.2)
Requirement already satisfied: cycler>=0.10 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib>=2.0.0->prophet) (0.10.0)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from cycler>=0.10->matplotlib>=2.0.0->prophet) (1.16.0)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
WARNING: You are using pip version 21.2.4; however, version 21.3.1 is available.
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
from prophet import Prophet
from prophet.plot import plot_plotly
# Move the 'date_n_time' index of nov back into a regular column.
nov = nov.reset_index()

# Total transaction amount per one-minute bucket, built from the trimmed
# transactions in nov1. Only 'Transaction Amount' is aggregated; "min" is
# the minute frequency alias (the single-letter "T" spelling is deprecated
# in recent pandas releases).
nov2 = nov1.set_index('date_n_time')[['Transaction Amount']]
nov2 = nov2.resample("min").sum()
nov2.head()
nov3 = nov2.reset_index()
nov3 = nov3.sort_values(by='date_n_time', ascending=True)

# Prophet expects a frame with the timestamp in 'ds' and the value in 'y'.
train_dataset = pd.DataFrame()
train_dataset['ds'] = nov3['date_n_time']
train_dataset['y'] = nov3['Transaction Amount']
train_dataset = train_dataset.dropna()
train_dataset.head(2)

# Distribution of the per-minute totals.
train_dataset['y'].hist(bins=100)

# create a Prophet instance with default values to fit the dataset
prophet_basic = Prophet()
prophet_basic.fit(train_dataset)
INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
# Create the frame of timestamps ('ds') to predict on.
# `periods` is counted in units of `freq`: here 1000 one-minute steps are
# appended after the end of the training data.
future = prophet_basic.make_future_dataframe(periods=1000, freq='min')
future.tail()

# Forecast the per-minute transaction amount.
forecast = prophet_basic.predict(future)

# Plot the forecast.
fig1 = prophet_basic.plot(forecast)

# Plot the trend and seasonality components.
fig1 = prophet_basic.plot_components(forecast)

# Identify changepoints (i.e., points in time where the trend of the series
# changes abruptly) and overlay them on the forecast plot.
from prophet.plot import add_changepoints_to_plot
fig = prophet_basic.plot(forecast)
a = add_changepoints_to_plot(fig.gca(), prophet_basic, forecast)
prophet_basic.changepoints

# Adjust trend sensitivity with the "changepoint_prior_scale" parameter.
# Default value is 0.05. Lower value, less flexible trend, and vice versa.
pro_change = Prophet(changepoint_prior_scale=0.15)
forecast = pro_change.fit(train_dataset).predict(future)
fig = pro_change.plot(forecast)
# Overlay the changepoints of the model that produced this figure and
# forecast (pro_change), not those of the default model.
a = add_changepoints_to_plot(fig.gca(), pro_change, forecast)
INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
# NOTE(review): the classification target is the per-minute timestamp and the
# single feature is the transaction amount. With a timestamp as the class
# label the accuracy below is unlikely to be meaningful — confirm the
# intended target and feature.
y = nov3['date_n_time']
x = nov3[['Transaction Amount']]

# Seed the split as well as the classifier so that the reported score is
# reproducible from run to run.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    x, y, test_size=0.3, random_state=0
)
# Renumber the training rows 0..n-1 (labels only; the row order is unchanged).
Xtrain.index = range(Xtrain.shape[0])

Clf = tree.DecisionTreeClassifier(
    criterion="gini",
    random_state=0,
    max_depth=4,
)
Clf = Clf.fit(Xtrain, Ytrain)

# Mean accuracy on the held-out 30 %.
score = Clf.score(Xtest, Ytest)
score