# Time Series Analysis: K-Means clustering of NIFTY 100 stock price movements

# Silence warnings before the heavier imports so import-time warnings are hidden too.
from warnings import filterwarnings
filterwarnings('ignore')

import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_datareader as pdr
import yfinance as yf
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# Load the index constituent list; the code below reads its
# 'Company Name' and 'Symbol' columns.
data = pd.read_csv('./ind_nifty100list.csv')
# NOTE(review): the comment that used to sit here said "remove entries
# without values", but no rows are filtered anywhere -- confirm intent.
data.head()

# Map each company name to its ticker with the '.NS' suffix appended
# (presumably the Yahoo Finance suffix for NSE listings -- confirm).
companies_dict = {
    name: symbol + '.NS'
    for name, symbol in zip(data['Company Name'], data['Symbol'])
}

# (name, ticker) pairs ordered by ticker.
# NOTE(review): later cells pair row i of the price arrays with
# companies[i]; this presumably relies on the downloader returning its
# ticker columns in the same sorted order -- confirm.
companies = sorted(companies_dict.items(), key=lambda pair: pair[1])

# The same pairs as a two-column table.
companies_df = pd.DataFrame(companies, columns=['Company Name', 'Symbol'])
companies_df.head()

# Date window (ISO strings) for the first price download.
start_date, end_date = '2019-08-01', '2020-02-01'

# Fetch price history for every ticker in companies_dict over the
# [start_date, end_date] window through pandas-datareader's Yahoo reader.
symbols = list(companies_dict.values())
panel_data = pdr.get_data_yahoo(symbols, start=start_date, end=end_date)

panel_data.head()

# Pull the 'Close' and 'Open' tables out of the download and transpose
# them (presumably giving one row per ticker and one column per trading
# day -- confirm against the downloader's column layout).
stock_close = np.array(panel_data.loc[:, 'Close']).T
stock_open = np.array(panel_data.loc[:, 'Open']).T

# Relative open-to-close change, (close - open) / open, written into a
# zero-initialised float array of the same shape as the price tables.
row, col = stock_close.shape
movements = np.zeros([row, col])
movements[...] = (stock_close - stock_open) / stock_open

# Plot the movement series of the first two entries of `companies`
# next to each other on a shared y-axis.
fig = plt.figure(figsize=(20, 5))

ax1 = fig.add_subplot(221)
ax1.plot(movements[0, :])
ax1.set_title(companies[0][0])

ax2 = fig.add_subplot(222, sharey=ax1)
ax2.plot(movements[1, :])
ax2.set_title(companies[1][0])

plt.show()

from sklearn.preprocessing import Normalizer

# Rescale every row of `movements` with sklearn's Normalizer (default
# settings) so rows are compared by shape rather than magnitude.
normalise = Normalizer().fit_transform(movements)

# Plot the normalised rows for the first two companies so this figure
# can be compared with the raw-movement figure; plotting `movements`
# here would just repeat that figure and leave `normalise` unused.
plt.figure(figsize=(20, 5))
ax1 = plt.subplot(221)
plt.plot(normalise[0, :])
plt.title(companies[0][0])
plt.subplot(222, sharey=ax1)
plt.plot(normalise[1, :])
plt.title(companies[1][0])
plt.show()

# Machine-learning building blocks used from here on.
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans

# Shared Normalizer instance; `new` keeps the normalised movements for
# the PCA step further down.
normalizer = Normalizer()
new = normalizer.fit_transform(movements)

# Sweep the cluster count from 2 to 89, fitting a Normalizer -> K-Means
# pipeline each time and recording the fitted K-Means inertia.
cluster_counts = range(2, 90)
inertia = np.empty(len(cluster_counts))
for slot, n_clusters in enumerate(cluster_counts):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, max_iter=1000)
    pipeline = make_pipeline(normalizer, kmeans)
    pipeline.fit(movements)
    inertia[slot] = kmeans.inertia_

# Plot how much the inertia falls when one more cluster is added:
# the point at x = k shows inertia(k) - inertia(k + 1), for k = 2..29.
inertia_drop = -1 * np.diff(inertia, 1)

plt.figure(figsize=(20, 5))
plt.plot(np.arange(2, 30), inertia_drop[:28])
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

# Final model for this window: 12 clusters, seeded for repeatability.
kmeans = KMeans(n_clusters=12, random_state=0, max_iter=1000)

# Chain the shared Normalizer with K-Means, fit on the movements and
# read back the cluster label assigned to each row.
pipeline = make_pipeline(normalizer, kmeans)
labels = pipeline.fit(movements).predict(movements)

# Pair every (name, ticker) entry with its cluster label, by position.
df = pd.DataFrame(dict(Company=list(companies), Cluster=labels))

# Same table ordered by cluster id so members of a cluster sit together.
df_sorted = df.sort_values('Cluster')

df_sorted

# PCA is used only to get a 2-D view of the normalised movements.
from sklearn.decomposition import PCA

# Project the normalised movements (`new`) onto two principal components.
reduced_data = PCA(n_components=2).fit_transform(new)

# Re-run K-Means on the 2-D projection. random_state=0 matches the other
# KMeans calls in this file so the labels and the decision-region plot
# are reproducible from run to run.
kmeans = KMeans(n_clusters=12, random_state=0)
kmeans.fit(reduced_data)
labels = kmeans.predict(reduced_data)

# Pair each label with its (name, ticker) entry, by position.
df = pd.DataFrame({'labels': labels, 'companies': companies})

# Display the table ordered by cluster label.
df.sort_values('labels')

# Step size of the mesh used to paint the K-Means decision regions.
h = 0.01

# Bounding box of the 2-D data, padded by 1 on every side.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in the mesh using the trained model,
# then fold them back into the mesh shape for imshow.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Colour map for the cluster regions.
cmap = plt.cm.Paired

# Open the figure directly: calling plt.clf() first would create (and
# later show) an empty extra figure when no figure is current.
plt.figure(figsize=(10, 10))
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=cmap, aspect='auto', origin='lower')

# One black dot per projected row.
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=5)

# Mark the centroid of each cluster with a white X drawn on top.
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidth=3, color='w', zorder=10)

plt.title('K-Means Clustering on Stock Market Movements (PCA-Reduced Data)')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.show()

# Date window (ISO strings) for the second price download.
start_date, end_date = '2020-02-01', '2020-09-01'

# Fetch price history again for the same tickers, now over the second
# [start_date, end_date] window.
symbols = list(companies_dict.values())
panel_data = pdr.get_data_yahoo(symbols, start=start_date, end=end_date)

panel_data.head()

# Pull the 'Close' and 'Open' tables out of the second download and
# echo the first row of closing prices as a quick look at the data.
stock_close = panel_data.loc[:, 'Close']
stock_open = panel_data.loc[:, 'Open']
print(stock_close.iloc[0])

# Switch to transposed plain arrays (presumably one row per ticker and
# one column per trading day -- confirm against the downloader's layout).
stock_close = np.array(stock_close).T
stock_open = np.array(stock_open).T

# Relative open-to-close change, (close - open) / open, written into a
# zero-initialised float array of the same shape as the price tables.
row, col = stock_close.shape
movements = np.zeros([row, col])
movements[...] = (stock_close - stock_open) / stock_open

# Print, for every company, the sum of its movement row over the window.
print('Company', '\t', 'Movement')
for idx, (name, _symbol) in enumerate(companies):
    print(name, '\t\t', sum(movements[idx, :]), '\n')

# Plot the second-window movement series of the first two entries of
# `companies` next to each other on a shared y-axis.
fig = plt.figure(figsize=(20, 5))

ax1 = fig.add_subplot(221)
ax1.plot(movements[0, :])
ax1.set_title(companies[0][0])

ax2 = fig.add_subplot(222, sharey=ax1)
ax2.plot(movements[1, :])
ax2.set_title(companies[1][0])

plt.show()

# Rescale every row of the second-window `movements` with sklearn's
# Normalizer (default settings).
normalise = Normalizer().fit_transform(movements)

# Plot the normalised rows for the first two companies so this figure
# can be compared with the raw-movement figure; plotting `movements`
# here would just repeat that figure and leave `normalise` unused.
plt.figure(figsize=(20, 5))
ax1 = plt.subplot(221)
plt.plot(normalise[0, :])
plt.title(companies[0][0])
plt.subplot(222, sharey=ax1)
plt.plot(normalise[1, :])
plt.title(companies[1][0])
plt.show()

# Fresh Normalizer for the second window; `new` keeps the normalised
# movements for the PCA step further down.
normalizer = Normalizer()
new = normalizer.fit_transform(movements)

# Sweep the cluster count from 2 to 89, fitting a Normalizer -> K-Means
# pipeline each time and recording the fitted K-Means inertia.
cluster_counts = range(2, 90)
inertia = np.empty(len(cluster_counts))
for slot, n_clusters in enumerate(cluster_counts):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, max_iter=1000)
    pipeline = make_pipeline(normalizer, kmeans)
    pipeline.fit(movements)
    inertia[slot] = kmeans.inertia_

# First differences of the inertia curve (change per added cluster).
np.diff(inertia, 1)

# Plot how much the inertia falls when one more cluster is added:
# the point at x = k shows inertia(k) - inertia(k + 1), for k = 2..29.
inertia_drop = -1 * np.diff(inertia, 1)

plt.figure(figsize=(20, 5))
plt.plot(np.arange(2, 30), inertia_drop[:28])
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

# Final model for the second window: 10 clusters, seeded for repeatability.
kmeans = KMeans(n_clusters=10, random_state=0, max_iter=1000)

# Chain the shared Normalizer with K-Means and fit on the movements.
pipeline = make_pipeline(normalizer, kmeans).fit(movements)

# Report the inertia of the fitted model.
print(kmeans.inertia_)

# Cluster label assigned to each row of `movements`.
labels = pipeline.predict(movements)

# Pair every (name, ticker) entry with its cluster label, by position.
df = pd.DataFrame(dict(Company=list(companies), Cluster=labels))

# Same table ordered by cluster id so members of a cluster sit together.
df_sorted = df.sort_values('Cluster')

df_sorted

# Project the normalised second-window movements (`new`) onto two
# principal components for plotting.
reduced_data = PCA(n_components=2).fit_transform(new)

# Re-run K-Means on the 2-D projection. random_state=0 matches the other
# KMeans calls in this file so the labels and the decision-region plot
# are reproducible from run to run.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(reduced_data)
labels = kmeans.predict(reduced_data)

# Pair each label with its (name, ticker) entry, by position.
df = pd.DataFrame({'labels': labels, 'companies': companies})

# Display the table ordered by cluster label.
df.sort_values('labels')

# Step size of the mesh used to paint the K-Means decision regions.
h = 0.01

# Bounding box of the 2-D data, padded by 1 on every side.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in the mesh using the trained model,
# then fold them back into the mesh shape for imshow.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Colour map for the cluster regions.
cmap = plt.cm.Paired

# Open the figure directly: calling plt.clf() first would create (and
# later show) an empty extra figure when no figure is current.
plt.figure(figsize=(10, 10))
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=cmap, aspect='auto', origin='lower')

# One black dot per projected row.
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=5)

# Mark the centroid of each cluster with a white X drawn on top.
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidth=3, color='w', zorder=10)

plt.title('K-Means Clustering on Stock Market Movements (PCA-Reduced Data)')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.show()