!pip install yfinance --upgrade --no-cache-dir

```
Requirement already satisfied: yfinance in /root/venv/lib/python3.7/site-packages (0.1.63)
Requirement already satisfied: pandas>=0.24 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from yfinance) (1.2.5)
Requirement already satisfied: requests>=2.20 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from yfinance) (2.26.0)
Requirement already satisfied: multitasking>=0.0.7 in /root/venv/lib/python3.7/site-packages (from yfinance) (0.0.9)
Requirement already satisfied: lxml>=4.5.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from yfinance) (4.6.3)
Requirement already satisfied: numpy>=1.15 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from yfinance) (1.19.5)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.24->yfinance) (2021.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.24->yfinance) (2.8.2)
Requirement already satisfied: six>=1.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas>=0.24->yfinance) (1.16.0)
Requirement already satisfied: charset-normalizer~=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from requests>=2.20->yfinance) (2.0.6)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from requests>=2.20->yfinance) (1.26.7)
Requirement already satisfied: certifi>=2017.4.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from requests>=2.20->yfinance) (2021.5.30)
Requirement already satisfied: idna<4,>=2.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from requests>=2.20->yfinance) (3.2)
```

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
import yfinance as yf

## Download daily price data for SPY from Yahoo Finance and print a summary
## of the resulting DataFrame (columns, dtypes, non-null counts).
symbols_list = ['SPY']
start = dt.datetime(year=2015, month=9, day=24)
end = dt.datetime(year=2021, month=9, day=24)
data = yf.download(tickers=symbols_list, start=start, end=end)
data.info()

```
[*********************100%***********************] 1 of 1 completed
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1511 entries, 2015-09-24 to 2021-09-23
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Open 1511 non-null float64
1 High 1511 non-null float64
2 Low 1511 non-null float64
3 Close 1511 non-null float64
4 Adj Close 1511 non-null float64
5 Volume 1511 non-null int64
dtypes: float64(5), int64(1)
memory usage: 82.6 KB
```

## Move the DatetimeIndex into an ordinary 'Date' column and fall back to a
## plain 0..n-1 row index.
df = data.reset_index(drop=False)
df.info()

```
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1511 entries, 0 to 1510
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 1511 non-null datetime64[ns]
1 Open 1511 non-null float64
2 High 1511 non-null float64
3 Low 1511 non-null float64
4 Close 1511 non-null float64
5 Adj Close 1511 non-null float64
6 Volume 1511 non-null int64
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 82.8 KB
```

## Keep only the columns the analysis uses: date, adjusted close and volume.
df = df.loc[:, ['Date', 'Adj Close', 'Volume']]
df.info()

```
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1511 entries, 0 to 1510
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 1511 non-null datetime64[ns]
1 Adj Close 1511 non-null float64
2 Volume 1511 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 35.5 KB
```

# Compare the U.S. stock market in December and January. We hypothesize that stock returns, trading volume, and volatility are higher in January than in December.

## Calendar month of every trading day.
df['months'] = df['Date'].dt.month
## Daily return: relative change of the adjusted close versus the previous row.
df['return'] = df['Adj Close'].pct_change(periods=1)
## Annualized volatility: standard deviation of the daily returns over a
## rolling 252-row window, scaled by sqrt(252).
df['annualized_volatility'] = df['return'].rolling(window=252).std() * 252 ** (1 / 2)
df.tail()

## Split out the December (12) and January (1) rows.
## NOTE(review): the trailing .iloc[1:] drops the first matching row of each
## subset (the printed December head starts at index 48, 2015-12-02). The
## reason is not evident from this cell -- confirm that it is intended.
Dec_returns = df.loc[df['months'] == 12].iloc[1:]
Jan_returns = df.loc[df['months'] == 1].iloc[1:]

print(Dec_returns.head())
Jan_returns.head()

```
Date Adj Close Volume months return annualized_volatility
48 2015-12-02 186.388840 108441300 12 -0.010205 NaN
49 2015-12-03 183.778885 166224200 12 -0.014003 NaN
50 2015-12-04 187.363113 192913900 12 0.019503 NaN
51 2015-12-07 186.227966 102027100 12 -0.006059 NaN
52 2015-12-08 184.976608 103372400 12 -0.006719 NaN
```

## Overlay the daily-return histograms of the two months for comparison
## (December in red, January in green, both half transparent).
Dec_returns['return'].hist(alpha=0.5, color='r', bins=100)
Jan_returns['return'].hist(alpha=0.5, color='g', bins=100)

## Summary statistics of the December daily returns.
Dec_returns['return'].describe()

## Summary statistics of the January daily returns.
Jan_returns['return'].describe()

## Welch's t-test (equal_var=False) on the daily returns, January vs. December.
import scipy.stats as stats

print("Difference in mean return: ")
## Mean difference is scaled by 100, i.e. reported in percentage points.
print(100 * (Jan_returns['return'].mean() - Dec_returns['return'].mean()))
stat, p = stats.ttest_ind(
    Jan_returns['return'],
    Dec_returns['return'],
    equal_var=False,
)
## Judge the p-value against the 5 % significance level.
alpha = 0.05
print("p value is " + str(p))
print(
    'The difference in mean return is significantly different (reject H0)'
    if p <= alpha
    else 'The difference in mean return is not significantly different (fail to reject H0)'
)

```
Difference in mean return:
0.102742472589344
p value is 0.40701230988476067
The difference in mean return is not significantly different (fail to reject H0)
```

## Plot the annualized volatility for December and January.
## Colours follow the other comparison plots in this notebook (December red,
## January green) so the unlabeled histograms can be read consistently.
Dec_returns['annualized_volatility'].hist(bins=100, color='r', alpha=0.5)
Jan_returns['annualized_volatility'].hist(bins=100, color='g', alpha=0.5)

## Summary statistics of the January annualized volatility.
Jan_returns['annualized_volatility'].describe()

## Summary statistics of the December annualized volatility.
Dec_returns['annualized_volatility'].describe()

## Welch's t-test (equal_var=False) on the annualized volatility,
## January vs. December.
import scipy.stats as stats

## The 252-row rolling window leaves missing values at the start of the
## series. dropna() removes them and keeps the real volatility values; a
## boolean notna() mask would instead compare the share of non-missing rows.
jan_volatility = Jan_returns['annualized_volatility'].dropna()
dec_volatility = Dec_returns['annualized_volatility'].dropna()

print("Difference in mean return volatility: ")
## Mean difference is scaled by 100, i.e. reported in percentage points.
print((jan_volatility.mean() - dec_volatility.mean())*100)
stat, p = stats.ttest_ind(jan_volatility, dec_volatility, equal_var=False)
# interpret p-value
alpha = 0.05
print("p value is " + str(p))
if p <= alpha:
    print('The difference in mean return volatility is significantly different (reject H0)')
else:
    print('The difference in mean return volatility is not significantly different (fail to reject H0)')

```
Difference in mean return volatility:
1.9354838709677358
p value is 0.6812635182133023
The difference in mean return volatility is not significantly different (fail to reject H0)
```

## Overlay the trading-volume histograms of the two months
## (December in red, January in green, both half transparent).
Dec_returns['Volume'].hist(alpha=0.5, color='r', bins=100)
Jan_returns['Volume'].hist(alpha=0.5, color='g', bins=100)

## Summary statistics of the December trading volume.
Dec_returns['Volume'].describe()

## Summary statistics of the January trading volume.
Jan_returns['Volume'].describe()

## Welch's t-test (equal_var=False) on the trading volume, January vs. December.
import scipy.stats as stats

print("Difference in mean trading volume: ")
print(Jan_returns['Volume'].mean() - Dec_returns['Volume'].mean())
stat, p = stats.ttest_ind(
    Jan_returns['Volume'],
    Dec_returns['Volume'],
    equal_var=False,
)
## Judge the p-value against the 5 % significance level.
alpha = 0.05
print("p value is " + str(p))
print(
    'The difference in mean trading volume is significantly different (reject H0)'
    if p <= alpha
    else 'The difference in mean trading volume is not significantly different (fail to reject H0)'
)

```
Difference in mean trading volume:
791944.7580645233
p value is 0.9045216746673639
The difference in mean trading volume is not significantly different (fail to reject H0)
```

# In conclusion, U.S. stock returns, trading volume, and volatility do not differ significantly between December and January.

# Chi-square test of the independence of stock returns in December and January.

## Define function that returns the contingency table.
def Contingency_Table(df1, df2, date="Date", month1 = 12, month2 = 1):
    """Pair the rows of two return tables by position and put them side by side.

    Each input frame is reduced to its date and ``return`` columns, gets
    ``Year`` and ``Month`` columns derived from the date, and is numbered
    1..n in row order (``row_num``). The two prepared frames are then
    inner-merged on ``row_num``: the i-th row of ``df1`` is paired with the
    i-th row of ``df2`` and surplus rows of the longer frame are dropped.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames containing the column named by ``date`` and a ``return``
        column. Columns taken from ``df1`` get the suffix ``_dec``, those
        from ``df2`` the suffix ``_jan``. Neither frame is modified.
    date : str, default "Date"
        Name of the date column in both frames.
    month1, month2 : int
        Not used by the computation; kept so existing calls keep working.

    Returns
    -------
    con_tab : pandas.DataFrame
        Indexed by ``row_num`` with the columns ``Year_dec``, ``Year_jan``,
        ``return_dec`` and ``return_jan``.
    dof : int
        ``(number of frames - 1) * (non-null Year_dec count - 1)``.
    """
    prepared = []
    for frame in (df1, df2):
        dates = pd.DatetimeIndex(frame[date])
        # Copy the selection so the caller's frame is left untouched and the
        # column assignments below do not raise SettingWithCopyWarning.
        subset = frame[[date, "return"]].copy()
        subset["Year"] = dates.year
        subset["Month"] = dates.month
        # 1-based position inside the frame; this is the merge key.
        subset["row_num"] = pd.RangeIndex(1, len(subset) + 1)
        prepared.append(subset)

    ## Merge to Contingency Table
    merged = prepared[0].merge(prepared[1], on="row_num", suffixes=("_dec", "_jan"))
    con_tab = merged[
        ["row_num", "Year_dec", "Year_jan", "return_dec", "return_jan"]
    ].set_index("row_num")
    dof = (len(prepared) - 1) * (con_tab["Year_dec"].count() - 1)
    return con_tab, dof

## Build the paired December/January table ("CT") together with its
## degrees of freedom ("dof"), then show the first rows.
CT, dof = Contingency_Table(df1=Dec_returns, df2=Jan_returns)
print('Degrees of Freedom: ' + str(dof))
CT.head(5)

```
Degrees of Freedom: 119
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:40: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
```

## Perform Chi²-Test
## Set the significance level the test result is judged against:
alpha = 0.05
## Calculate the test statistic: for every paired row, the squared difference
## between the December and the January return divided by the January return,
## summed over all rows.
## NOTE(review): the denominator is a raw daily return, which can be zero or
## negative, so the terms are not the non-negative contributions a chi-square
## statistic is normally built from. A zero January return makes a term
## infinite -- presumably why the printed result is inf. Confirm whether the
## test should be run on category counts instead.
CT['Chi_sq_i'] = ((CT['return_dec']-CT['return_jan'])**2)/CT['return_jan']
Chi_sq_emp = CT['Chi_sq_i'].sum()
print("The emperical Chi²-Value is: " + str(Chi_sq_emp))
## Get the theoretical Chi²-value (chisquare is imported here but not yet
## used in the visible part of this cell):
from scipy.stats import chisquare

```
The emperical Chi²-Value is: inf
```