Data Clustering for S&P500 Stock Data

!pip install pandas-datareader==0.10.0

import os import glob import pandas as pd import matplotlib.pyplot as plt import datetime as dt from concurrent import futures import numpy as np from scipy.stats import gaussian_kde from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, set_link_color_palette import pandas_datareader.data as web from sklearn.manifold import TSNE from sklearn.cluster import DBSCAN, KMeans data_dir = "./data/stock_data_for_clustering" os.makedirs(data_dir, exist_ok=True)

# # tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies') # # first_table = tables[0] # # second_table = tables[1] # # print(first_table.shape) # # first_table["Symbol"] = first_table["Symbol"].map(lambda x: x.replace(".", "-")) # rename symbol to escape symbol error # # first_table.to_csv("./data/SP500_20220327.csv", index=False) # first_table = pd.read_csv("./data/SP500_20220327.csv") # sp500_tickers = list(first_table["Symbol"]) # first_table.head()

# tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies') # first_table = tables[0] # second_table = tables[1] # print(first_table.shape) # first_table["Symbol"] = first_table["Symbol"].map(lambda x: x.replace(".", "-")) # rename symbol to escape symbol error # first_table.to_csv("./data/SP500_20220716.csv", index=False) first_table = pd.read_csv("./data/SP500_20220716.csv") sp500_tickers = list(first_table["Symbol"]) first_table.head()

def download_stock(stock): try: print(stock) stock_df = web.DataReader(stock,'yahoo', start_time, end_time) stock_df['Name'] = stock output_name = f"{data_dir}/{stock}.csv" stock_df.to_csv(output_name) except: bad_names.append(stock) print('bad: %s' % (stock)) """ set the download window """ start_time = dt.datetime(2021, 12, 1) end_time = dt.datetime(2022, 3, 1) bad_names =[] #to keep track of failed queries #set the maximum thread number max_workers = 20 now = dt.datetime.now() path_failed_queries = f'{data_dir}/failed_queries.txt' if os.path.exists(path_failed_queries): with open(path_failed_queries) as f: failed_queries = f.read().split("\n")[:-1] sp500_tickers_ = failed_queries else: sp500_tickers_ = sp500_tickers workers = min(max_workers, len(sp500_tickers_)) #in case a smaller number of stocks than threads was passed in with futures.ThreadPoolExecutor(workers) as executor: res = executor.map(download_stock, sp500_tickers_) """ Save failed queries to a text file to retry """ if len(bad_names) > 0: with open(f'{data_dir}/failed_queries.txt','w') as outfile: for name in bad_names: outfile.write(name+'\n') finish_time = dt.datetime.now() duration = finish_time - now minutes, seconds = divmod(duration.seconds, 60) print(f'The threaded script took {minutes} minutes and {seconds} seconds to run.') print(f"{len(bad_names)} stocks failed: ", bad_names)

historical_stock_data_files = glob.glob(f"./{data_dir}/*.csv") reference_day = "2021-12-31" start_day = "2022-01-03" midterm_day = "2022-01-18" end_day = "2022-01-31" price_change_list = [] tickers_to_ignore = [] for files in historical_stock_data_files: df = pd.read_csv(files, index_col=["Date"]) ticker = os.path.splitext(os.path.basename(files))[0] try: price_close = df[reference_day: end_day][["Close"]] price_change = (price_close / price_close.loc[reference_day, "Close"] - 1) * 100 price_change = price_change.iloc[1: ,:] price_change = price_change.rename(columns={"Close": ticker}) price_change_list.append(price_change) except KeyError as e: # some stocks started trading after 2021-12-31 print(ticker) tickers_to_ignore.append(ticker) df = pd.concat(price_change_list, axis=1) print(df.shape) df.head()

data_1 = df.loc[[end_day], :] display(data_1.head()) data_2 = df.loc[[start_day, midterm_day, end_day], :] display(data_2.head()) data_3 = df display(data_3.head())

first_table = first_table[~first_table["Symbol"].isin(tickers_to_ignore)]

industry_list = list(first_table["GICS Sector"].unique()) performance_by_industry = [data_1.loc[:, first_table[first_table["GICS Sector"]==x]["Symbol"]].values[0] for x in industry_list] plt.boxplot(performance_by_industry, labels=industry_list) plt.xticks(rotation=90) plt.xlabel("sector") plt.ylabel(f"performance(%) at {end_day}") plt.show()

examples = ["AAPL", "TSLA", "BMY"] for x in examples: plt.plot(data_3[x], label=x) plt.xticks(rotation=90) plt.ylabel("performance(%)") plt.legend(loc="upper right") plt.show()

# Kernel density estimation x = np.linspace(-35, 35, 1000) kde = gaussian_kde(data_1.values) y = kde(x) plt.hist(data_1.loc[end_day, :], bins=20, density=True) plt.plot(x, y) plt.xlabel(f"performance(%) at {end_day}") plt.show()

plt.hist(data_1[data_1 < 0].loc[end_day, :], bins=10) plt.hist(data_1[data_1 >= 0].loc[end_day, :], bins=10) plt.xlabel(f"performance(%) at {end_day}") plt.show()

n_clusters = 3 kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(data_1.T.values) for i in range(n_clusters): plt.hist(data_1.loc[end_day, kmeans.labels_ == i], bins=8) plt.xlabel(f"performance(%) at {end_day}") plt.show()

dummy_zero = np.random.normal(0,1,100) dummy_minus = np.random.normal(-8,1,100) dummy_plus = np.random.normal(8,1,100) dummy = np.concatenate([dummy_zero, dummy_minus, dummy_plus]) plt.hist(dummy, bins=25) plt.xlabel("performance(%)") plt.show()

n_clusters = 3 kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(dummy.reshape(-1,1)) for i in range(n_clusters): plt.hist(dummy[kmeans.labels_ == i], bins=8) plt.xlabel("performance(%)") plt.show()

def scatter_hist(x, y, ax, ax_histx, ax_histy): # no labels ax_histx.tick_params(axis="x", labelbottom=False) ax_histy.tick_params(axis="y", labelleft=False) # the scatter plot: ax.scatter(x, y, alpha=0.5) # now determine nice limits by hand: binwidth = 0.25 xymax = max(np.max(np.abs(x)), np.max(np.abs(y))) lim = (int(xymax/binwidth) + 1) * binwidth bins = np.arange(-lim, lim + binwidth, binwidth) ax_histx.hist(x, bins=bins) ax_histy.hist(y, bins=bins, orientation='horizontal') fig = plt.figure(figsize=(8, 8)) gs = fig.add_gridspec(2, 2, width_ratios=(7, 2), height_ratios=(2, 7), left=0.1, right=0.9, bottom=0.1, top=0.9, wspace=0.05, hspace=0.05) ax = fig.add_subplot(gs[1, 0]) ax_histx = fig.add_subplot(gs[0, 0], sharex=ax) ax_histy = fig.add_subplot(gs[1, 1], sharey=ax) scatter_hist(data_2.loc[midterm_day, :], data_2.loc[end_day, :], ax, ax_histx, ax_histy) ax.set_xlabel(f"performance(%) at {midterm_day}") ax.set_ylabel(f"performance(%) at {end_day}") plt.show()

fig = plt.figure(figsize=(8, 8)) gs = fig.add_gridspec(2, 2, width_ratios=(7, 2), height_ratios=(2, 7), left=0.1, right=0.9, bottom=0.1, top=0.9, wspace=0.05, hspace=0.05) ax = fig.add_subplot(gs[1, 0]) ax_histx = fig.add_subplot(gs[0, 0], sharex=ax) ax_histy = fig.add_subplot(gs[1, 1], sharey=ax) scatter_hist(data_2.loc[start_day, :], data_2.loc[end_day, :], ax, ax_histx, ax_histy) ax.set_xlabel(f"performance(%) at {start_day}") ax.set_ylabel(f"performance(%) at {end_day}") plt.show()

n_clusters = 3 kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(data_2.T.values) for i in range(n_clusters): plt.scatter(data_2.loc[midterm_day, kmeans.labels_ == i], data_2.loc[end_day, kmeans.labels_ == i], alpha=0.5) plt.xlabel(f"performance(%) at {midterm_day}") plt.ylabel(f"performance(%) at {end_day}") plt.show()

result = linkage(data_2.loc[midterm_day:, :].T.values, metric='euclidean', method='average') color_list = ["orange", "green", "blue"] set_link_color_palette(color_list) dendrogram(result) plt.title("Dedrogram") plt.ylabel("Distance") plt.show()

labels = fcluster(result, t=20, criterion="distance") for i, x in enumerate(set(labels)): plt.scatter(data_2.loc[midterm_day, labels == x], data_2.loc[end_day, labels == x], alpha=0.5, color=color_list[i], label=f"Cluster {x}") plt.xlabel(f"performance at {midterm_day}") plt.ylabel(f"performance at {end_day}") plt.legend() plt.show()

cluster_2 = data_2.loc[:, labels == 2].columns sector_counts = first_table[first_table["Symbol"].isin(cluster_2)]["GICS Sector"].value_counts() sectors = sector_counts.index plt.figure(figsize=(5,3)) plt.bar(sectors, sector_counts) plt.xticks(rotation=90) plt.show()

sector_counts

cluster_3 = data_2.loc[:, labels == 3].columns sector_counts = first_table[first_table["Symbol"].isin(cluster_3)]["GICS Sector"].value_counts() sectors = sector_counts.index plt.figure(figsize=(5,3)) plt.bar(sectors, sector_counts) plt.xticks(rotation=90) plt.show()

result = linkage(data_2.T.values, metric='euclidean', method='average') color_list = ["orange", "green", "red"] set_link_color_palette(color_list) dendrogram(result) plt.title("Dedrogram") plt.ylabel("Distance") plt.show()

centroids_list = [] for x in set(labels): # print("Cluster:", x) centroid = data_2.loc[:, labels == x].mean(axis=1) # centroid.rename() = [x] centroid = pd.DataFrame(centroid, columns=[x]) centroids_list.append(centroid) centroids_df = pd.concat(centroids_list, axis=1) print("Centroids of each cluster.") display(centroids_df) labels = fcluster(result, t=20, criterion="distance") for i, x in enumerate(set(labels)): plt.scatter(data_2.loc[midterm_day, labels == x], data_2.loc[end_day, labels == x], alpha=0.5, color=color_list[i], label=f"Cluster {x}") plt.scatter(centroids_df.loc[midterm_day, x], centroids_df.loc[end_day, x], color="black", marker="*") plt.xlabel(f"performance at {midterm_day}") plt.ylabel(f"performance at {end_day}") plt.legend() plt.show()

tsne = TSNE(n_components=2, random_state=0) X_reduced = tsne.fit_transform(data_3.T) print(X_reduced.shape) plt.figure(figsize=(13, 7)) plt.scatter(X_reduced[:, 0], X_reduced[:, 1], alpha=0.5) plt.show()

plt.figure(figsize=(13, 7)) for x in set(labels): plt.scatter(X_reduced[labels==x, 0], X_reduced[labels==x, 1], alpha=0.8, label=f"Cluster {x}", color=color_list[x-1]) plt.legend() plt.show()

clustering = DBSCAN(eps=1.72, min_samples=3).fit(X_reduced) labels_dbscan = clustering.labels_ print(set(labels_dbscan)) plt.figure(figsize=(13, 7)) for x in set(labels_dbscan): plt.scatter(X_reduced[labels_dbscan==x, 0], X_reduced[labels_dbscan==x, 1], alpha=0.8, label=f"Cluster {x}") plt.legend() plt.show()

centroids_list = [] for x in set(labels_dbscan): # print("Cluster:", x) centroid = data_3.loc[:, labels_dbscan == x].mean(axis=1) # centroid.rename() = [x] centroid = pd.DataFrame(centroid, columns=[x]) centroids_list.append(centroid) centroids_df = pd.concat(centroids_list, axis=1) print("Centroids of each cluster.") # For ease of thinking, only 3 days are shown. display(centroids_df.loc[[start_day, midterm_day, end_day], :])

for x in range(5): plt.plot(centroids_df.loc[:, x], label=f"Cluster {x}") plt.legend() plt.xticks(rotation=90) plt.legend(loc="lower left") plt.ylim(-20,20) plt.ylabel("performance(%) of each cluster") plt.show()

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15,12)) for i, x in enumerate(set(labels_dbscan)): ax = axes[i%3][i//3] if x == -1: ax.axis('off') continue ax.plot(data_3.loc[:, labels_dbscan==x]) ax.set_title(f"Cluster {x}") ax.set_ylim(-40, 40) ax.get_xaxis().set_visible(False) ax.grid(axis='y',linestyle='dotted', color='b')

for x in set(labels_dbscan): if x == -1: continue print(f"Cluster {x}") sample_list = list(data_3.loc[:, labels_dbscan==x].T.sample(5, replace=True).index) sample_list = list(set(sample_list)) print(sample_list)