!pip install pandas-datareader==0.10.0
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
from concurrent import futures
import numpy as np
from scipy.stats import gaussian_kde
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, set_link_color_palette
import pandas_datareader.data as web
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN, KMeans
data_dir = "./data/stock_data_for_clustering"
os.makedirs(data_dir, exist_ok=True)
# # tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
# # first_table = tables[0]
# # second_table = tables[1]
# # print(first_table.shape)
# # first_table["Symbol"] = first_table["Symbol"].map(lambda x: x.replace(".", "-")) # rename symbol to escape symbol error
# # first_table.to_csv("./data/SP500_20220327.csv", index=False)
# first_table = pd.read_csv("./data/SP500_20220327.csv")
# sp500_tickers = list(first_table["Symbol"])
# first_table.head()
# tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
# first_table = tables[0]
# second_table = tables[1]
# print(first_table.shape)
# first_table["Symbol"] = first_table["Symbol"].map(lambda x: x.replace(".", "-")) # rename symbol to escape symbol error
# first_table.to_csv("./data/SP500_20220716.csv", index=False)
first_table = pd.read_csv("./data/SP500_20220716.csv")
sp500_tickers = list(first_table["Symbol"])
first_table.head()
def download_stock(stock):
try:
print(stock)
stock_df = web.DataReader(stock,'yahoo', start_time, end_time)
stock_df['Name'] = stock
output_name = f"{data_dir}/{stock}.csv"
stock_df.to_csv(output_name)
except:
bad_names.append(stock)
print('bad: %s' % (stock))
""" set the download window """
start_time = dt.datetime(2021, 12, 1)
end_time = dt.datetime(2022, 3, 1)
bad_names =[] #to keep track of failed queries
#set the maximum thread number
max_workers = 20
now = dt.datetime.now()
path_failed_queries = f'{data_dir}/failed_queries.txt'
if os.path.exists(path_failed_queries):
with open(path_failed_queries) as f:
failed_queries = f.read().split("\n")[:-1]
sp500_tickers_ = failed_queries
else:
sp500_tickers_ = sp500_tickers
workers = min(max_workers, len(sp500_tickers_)) #in case a smaller number of stocks than threads was passed in
with futures.ThreadPoolExecutor(workers) as executor:
res = executor.map(download_stock, sp500_tickers_)
""" Save failed queries to a text file to retry """
if len(bad_names) > 0:
with open(f'{data_dir}/failed_queries.txt','w') as outfile:
for name in bad_names:
outfile.write(name+'\n')
finish_time = dt.datetime.now()
duration = finish_time - now
minutes, seconds = divmod(duration.seconds, 60)
print(f'The threaded script took {minutes} minutes and {seconds} seconds to run.')
print(f"{len(bad_names)} stocks failed: ", bad_names)
historical_stock_data_files = glob.glob(f"./{data_dir}/*.csv")
reference_day = "2021-12-31"
start_day = "2022-01-03"
midterm_day = "2022-01-18"
end_day = "2022-01-31"
price_change_list = []
tickers_to_ignore = []
for files in historical_stock_data_files:
df = pd.read_csv(files, index_col=["Date"])
ticker = os.path.splitext(os.path.basename(files))[0]
try:
price_close = df[reference_day: end_day][["Close"]]
price_change = (price_close / price_close.loc[reference_day, "Close"] - 1) * 100
price_change = price_change.iloc[1: ,:]
price_change = price_change.rename(columns={"Close": ticker})
price_change_list.append(price_change)
except KeyError as e:
# some stocks started trading after 2021-12-31
print(ticker)
tickers_to_ignore.append(ticker)
df = pd.concat(price_change_list, axis=1)
print(df.shape)
df.head()
data_1 = df.loc[[end_day], :]
display(data_1.head())
data_2 = df.loc[[start_day, midterm_day, end_day], :]
display(data_2.head())
data_3 = df
display(data_3.head())
first_table = first_table[~first_table["Symbol"].isin(tickers_to_ignore)]
industry_list = list(first_table["GICS Sector"].unique())
performance_by_industry = [data_1.loc[:, first_table[first_table["GICS Sector"]==x]["Symbol"]].values[0] for x in industry_list]
plt.boxplot(performance_by_industry, labels=industry_list)
plt.xticks(rotation=90)
plt.xlabel("sector")
plt.ylabel(f"performance(%) at {end_day}")
plt.show()
examples = ["AAPL", "TSLA", "BMY"]
for x in examples:
plt.plot(data_3[x], label=x)
plt.xticks(rotation=90)
plt.ylabel("performance(%)")
plt.legend(loc="upper right")
plt.show()
# Kernel density estimation
x = np.linspace(-35, 35, 1000)
kde = gaussian_kde(data_1.values)
y = kde(x)
plt.hist(data_1.loc[end_day, :], bins=20, density=True)
plt.plot(x, y)
plt.xlabel(f"performance(%) at {end_day}")
plt.show()
plt.hist(data_1[data_1 < 0].loc[end_day, :], bins=10)
plt.hist(data_1[data_1 >= 0].loc[end_day, :], bins=10)
plt.xlabel(f"performance(%) at {end_day}")
plt.show()
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(data_1.T.values)
for i in range(n_clusters):
plt.hist(data_1.loc[end_day, kmeans.labels_ == i], bins=8)
plt.xlabel(f"performance(%) at {end_day}")
plt.show()
dummy_zero = np.random.normal(0,1,100)
dummy_minus = np.random.normal(-8,1,100)
dummy_plus = np.random.normal(8,1,100)
dummy = np.concatenate([dummy_zero, dummy_minus, dummy_plus])
plt.hist(dummy, bins=25)
plt.xlabel("performance(%)")
plt.show()
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(dummy.reshape(-1,1))
for i in range(n_clusters):
plt.hist(dummy[kmeans.labels_ == i], bins=8)
plt.xlabel("performance(%)")
plt.show()
def scatter_hist(x, y, ax, ax_histx, ax_histy):
# no labels
ax_histx.tick_params(axis="x", labelbottom=False)
ax_histy.tick_params(axis="y", labelleft=False)
# the scatter plot:
ax.scatter(x, y, alpha=0.5)
# now determine nice limits by hand:
binwidth = 0.25
xymax = max(np.max(np.abs(x)), np.max(np.abs(y)))
lim = (int(xymax/binwidth) + 1) * binwidth
bins = np.arange(-lim, lim + binwidth, binwidth)
ax_histx.hist(x, bins=bins)
ax_histy.hist(y, bins=bins, orientation='horizontal')
fig = plt.figure(figsize=(8, 8))
gs = fig.add_gridspec(2, 2, width_ratios=(7, 2), height_ratios=(2, 7),
left=0.1, right=0.9, bottom=0.1, top=0.9,
wspace=0.05, hspace=0.05)
ax = fig.add_subplot(gs[1, 0])
ax_histx = fig.add_subplot(gs[0, 0], sharex=ax)
ax_histy = fig.add_subplot(gs[1, 1], sharey=ax)
scatter_hist(data_2.loc[midterm_day, :], data_2.loc[end_day, :], ax, ax_histx, ax_histy)
ax.set_xlabel(f"performance(%) at {midterm_day}")
ax.set_ylabel(f"performance(%) at {end_day}")
plt.show()
fig = plt.figure(figsize=(8, 8))
gs = fig.add_gridspec(2, 2, width_ratios=(7, 2), height_ratios=(2, 7),
left=0.1, right=0.9, bottom=0.1, top=0.9,
wspace=0.05, hspace=0.05)
ax = fig.add_subplot(gs[1, 0])
ax_histx = fig.add_subplot(gs[0, 0], sharex=ax)
ax_histy = fig.add_subplot(gs[1, 1], sharey=ax)
scatter_hist(data_2.loc[start_day, :], data_2.loc[end_day, :], ax, ax_histx, ax_histy)
ax.set_xlabel(f"performance(%) at {start_day}")
ax.set_ylabel(f"performance(%) at {end_day}")
plt.show()
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(data_2.T.values)
for i in range(n_clusters):
plt.scatter(data_2.loc[midterm_day, kmeans.labels_ == i], data_2.loc[end_day, kmeans.labels_ == i], alpha=0.5)
plt.xlabel(f"performance(%) at {midterm_day}")
plt.ylabel(f"performance(%) at {end_day}")
plt.show()
result = linkage(data_2.loc[midterm_day:, :].T.values, metric='euclidean', method='average')
color_list = ["orange", "green", "blue"]
set_link_color_palette(color_list)
dendrogram(result)
plt.title("Dedrogram")
plt.ylabel("Distance")
plt.show()
labels = fcluster(result, t=20, criterion="distance")
for i, x in enumerate(set(labels)):
plt.scatter(data_2.loc[midterm_day, labels == x],
data_2.loc[end_day, labels == x],
alpha=0.5, color=color_list[i], label=f"Cluster {x}")
plt.xlabel(f"performance at {midterm_day}")
plt.ylabel(f"performance at {end_day}")
plt.legend()
plt.show()
cluster_2 = data_2.loc[:, labels == 2].columns
sector_counts = first_table[first_table["Symbol"].isin(cluster_2)]["GICS Sector"].value_counts()
sectors = sector_counts.index
plt.figure(figsize=(5,3))
plt.bar(sectors, sector_counts)
plt.xticks(rotation=90)
plt.show()
sector_counts
cluster_3 = data_2.loc[:, labels == 3].columns
sector_counts = first_table[first_table["Symbol"].isin(cluster_3)]["GICS Sector"].value_counts()
sectors = sector_counts.index
plt.figure(figsize=(5,3))
plt.bar(sectors, sector_counts)
plt.xticks(rotation=90)
plt.show()
result = linkage(data_2.T.values, metric='euclidean', method='average')
color_list = ["orange", "green", "red"]
set_link_color_palette(color_list)
dendrogram(result)
plt.title("Dedrogram")
plt.ylabel("Distance")
plt.show()
labels = fcluster(result, t=20, criterion="distance")
for i, x in enumerate(set(labels)):
plt.scatter(data_2.loc[midterm_day, labels == x],
data_2.loc[end_day, labels == x],
alpha=0.5, color=color_list[i], label=f"Cluster {x}")
plt.xlabel(f"performance at {midterm_day}")
plt.ylabel(f"performance at {end_day}")
plt.legend()
plt.show()
centroids_list = []
for x in set(labels):
# print("Cluster:", x)
centroid = data_2.loc[:, labels == x].mean(axis=1)
# centroid.rename() = [x]
centroid = pd.DataFrame(centroid, columns=[x])
centroids_list.append(centroid)
centroids_df = pd.concat(centroids_list, axis=1)
print("Centroids of each cluster.")
display(centroids_df)
labels = fcluster(result, t=20, criterion="distance")
for i, x in enumerate(set(labels)):
plt.scatter(data_2.loc[midterm_day, labels == x],
data_2.loc[end_day, labels == x],
alpha=0.5, color=color_list[i], label=f"Cluster {x}")
plt.scatter(centroids_df.loc[midterm_day, x],
centroids_df.loc[end_day, x], color="black", marker="*")
plt.xlabel(f"performance at {midterm_day}")
plt.ylabel(f"performance at {end_day}")
plt.legend()
plt.show()
tsne = TSNE(n_components=2, random_state=0)
X_reduced = tsne.fit_transform(data_3.T)
print(X_reduced.shape)
plt.figure(figsize=(13, 7))
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], alpha=0.5)
plt.show()
plt.figure(figsize=(13, 7))
for x in set(labels):
plt.scatter(X_reduced[labels==x, 0], X_reduced[labels==x, 1], alpha=0.8, label=f"Cluster {x}", color=color_list[x-1])
plt.legend()
plt.show()
clustering = DBSCAN(eps=1.72, min_samples=3).fit(X_reduced)
labels_dbscan = clustering.labels_
print(set(labels_dbscan))
plt.figure(figsize=(13, 7))
for x in set(labels_dbscan):
plt.scatter(X_reduced[labels_dbscan==x, 0], X_reduced[labels_dbscan==x, 1], alpha=0.8, label=f"Cluster {x}")
plt.legend()
plt.show()
centroids_list = []
for x in set(labels_dbscan):
# print("Cluster:", x)
centroid = data_3.loc[:, labels_dbscan == x].mean(axis=1)
# centroid.rename() = [x]
centroid = pd.DataFrame(centroid, columns=[x])
centroids_list.append(centroid)
centroids_df = pd.concat(centroids_list, axis=1)
print("Centroids of each cluster.")
# For ease of thinking, only 3 days are shown.
display(centroids_df.loc[[start_day, midterm_day, end_day], :])
for x in range(5):
plt.plot(centroids_df.loc[:, x], label=f"Cluster {x}")
plt.legend()
plt.xticks(rotation=90)
plt.legend(loc="lower left")
plt.ylim(-20,20)
plt.ylabel("performance(%) of each cluster")
plt.show()
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15,12))
for i, x in enumerate(set(labels_dbscan)):
ax = axes[i%3][i//3]
if x == -1:
ax.axis('off')
continue
ax.plot(data_3.loc[:, labels_dbscan==x])
ax.set_title(f"Cluster {x}")
ax.set_ylim(-40, 40)
ax.get_xaxis().set_visible(False)
ax.grid(axis='y',linestyle='dotted', color='b')
for x in set(labels_dbscan):
if x == -1:
continue
print(f"Cluster {x}")
sample_list = list(data_3.loc[:, labels_dbscan==x].T.sample(5, replace=True).index)
sample_list = list(set(sample_list))
print(sample_list)