from warnings import filterwarnings
filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
import pandas as pd
import pandas_datareader as pdr
import datetime
# Load the constituent list; the CSV must provide the 'Company Name' and
# 'Symbol' columns used below.
data = pd.read_csv('./ind_nifty100list.csv')
# remove entries without values: a missing symbol cannot be turned into a
# ticker (NaN + '.NS' raises TypeError) and a missing name cannot label a row
data = data.dropna(subset=['Company Name', 'Symbol'])
data.head()
# dict from dataframe: company name -> symbol with '.NS' appended
# (for duplicate names the last row wins, as with item-by-item assignment)
companies_dict = dict(zip(data['Company Name'], data['Symbol'] + '.NS'))
# (name, ticker) pairs ordered by ticker
companies = sorted(companies_dict.items(), key=lambda x: x[1])
# dataframe of name and symbols
companies_df = pd.DataFrame(companies, columns=['Company Name', 'Symbol'])
companies_df.head()
# Date window for the first clustering run.
start_date = '2019-08-01'
end_date = '2020-02-01'
# Download the price data for every ticker in a single request.
# NOTE(review): yfinance is imported at the top of the file but not used in
# the visible code - confirm pdr.get_data_yahoo works in the target setup.
panel_data = pdr.get_data_yahoo(list(companies_dict.values()),
                                start=start_date,
                                end=end_date)
panel_data.head()
# Select the price columns in the order of `companies`, so that row i of the
# arrays below belongs to companies[i] whatever column order the download
# returned.  A ticker missing from the download raises KeyError here instead
# of silently shifting the labels.
symbols = [symbol for _, symbol in companies]
stock_close = panel_data.loc[:, 'Close'].loc[:, symbols]
stock_open = panel_data.loc[:, 'Open'].loc[:, symbols]
# Transpose: one row per company, one column per row of the downloaded frame.
stock_close = np.array(stock_close).T
stock_open = np.array(stock_open).T
row, col = stock_close.shape
# Relative open-to-close change, computed for all companies at once.
movements = (stock_close - stock_open) / stock_open
# Draw the movement series of the first two companies side by side on a
# shared y scale.
plt.figure(figsize=(20, 5))
ax1 = plt.subplot(2, 2, 1)
ax1.plot(movements[0])
ax1.set_title(companies[0][0])
ax2 = plt.subplot(2, 2, 2, sharey=ax1)
ax2.plot(movements[1])
ax2.set_title(companies[1][0])
plt.show()
from sklearn.preprocessing import Normalizer
# Rescale every company's movement series with sklearn's Normalizer
# (default settings).
normalise = Normalizer().fit_transform(movements)
# Plot the normalised series of the first two companies.  The rescaled array
# is drawn here; plotting `movements` again would only repeat the previous
# figure.
plt.figure(figsize=(20, 5))
ax1 = plt.subplot(221)
plt.plot(normalise[0, :])
plt.title(companies[0][0])
plt.subplot(222, sharey=ax1)
plt.plot(normalise[1, :])
plt.title(companies[1][0])
plt.show()
# machine learning libraries
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
# Shared Normalizer step; `new` keeps the normalised movements for later use.
normalizer = Normalizer()
new = normalizer.fit_transform(movements)
# K-Means sweep: fit one model per cluster count and record its inertia.
cluster_counts = range(2, 90)
inertia = np.empty(len(cluster_counts))
for slot, n_clusters in enumerate(cluster_counts):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, max_iter=1000)
    pipeline = make_pipeline(normalizer, kmeans)
    pipeline.fit(movements)
    # the pipeline fits this very estimator object, so its inertia is set now
    inertia[slot] = kmeans.inertia_
# plot of the inertia drop per added cluster
# np.diff yields inertia[k + 1] - inertia[k]; negated, the value drawn at
# x = n is the inertia lost when going from n to n + 1 clusters (the sweep
# starts at 2 clusters).  Only the first 28 steps are shown.
inertia_drop = -1 * np.diff(inertia, 1)[:28]
plt.figure(figsize=(20, 5))
# size the x axis from the data so a shorter sweep still plots
plt.plot(np.arange(2, 2 + len(inertia_drop)), inertia_drop)
plt.xlabel('Number of clusters')
# the curve is the decrease of inertia, not inertia itself
plt.ylabel('Decrease in inertia')
plt.show()
# Final model for this period: 12 clusters, fitted through a pipeline that
# normalises the movements before K-Means.
kmeans = KMeans(n_clusters=12, random_state=0, max_iter=1000)
pipeline = make_pipeline(normalizer, kmeans)
pipeline.fit(movements)
# cluster label of every company row
labels = pipeline.predict(movements)
# table pairing each (name, ticker) entry with its cluster
cluster_table = {'Company': list(companies), 'Cluster': labels}
df = pd.DataFrame(cluster_table)
# same table ordered by cluster id
df_sorted = df.sort_values(by='Cluster')
df_sorted
# PCA
from sklearn.decomposition import PCA
# visualize the results: project the normalised movements onto two PCA
# components
reduced_data = PCA(n_components=2).fit_transform(new)
# run kmeans on reduced data; random_state is pinned like for the other
# KMeans models in this script so the cluster ids (and the plot built from
# them) are reproducible between runs
kmeans = KMeans(n_clusters=12, random_state=0)
labels = kmeans.fit_predict(reduced_data)
# create DataFrame aligning labels & companies
df = pd.DataFrame({'labels': labels, 'companies': companies})
# Display df sorted by cluster labels
df.sort_values('labels')
# Define step size of mesh
h = 0.01
# plot the decision boundary: the mesh covers the projected points plus a
# margin of 1 on every side
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Obtain labels for each point in the mesh using our trained model
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
# define colorplot
cmap = plt.cm.Paired
# plot figure
# (no plt.clf() beforehand: the previous figure has already been shown, so
# clearing the "current" figure would only create an extra empty one)
plt.figure(figsize=(10, 10))
plt.imshow(Z,
           interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=cmap,
           aspect='auto',
           origin='lower')
# the companies as black dots in the reduced space
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=5)
# plot the centroid of each cluster as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0],
            centroids[:, 1],
            marker='x',
            s=169,
            linewidth=3,
            color='w',
            zorder=10)
plt.title('K-Means Clustering on Stock Market Movements (PCA-Reduced Data)')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.show()
# Date window for the second clustering run.
start_date = '2020-02-01'
end_date = '2020-09-01'
# Download the price data for every ticker in a single request.
panel_data = pdr.get_data_yahoo(list(companies_dict.values()),
                                start=start_date,
                                end=end_date)
panel_data.head()
# Select the price columns in the order of `companies`, so that row i of the
# arrays below belongs to companies[i] whatever column order the download
# returned.  A ticker missing from the download raises KeyError here instead
# of silently shifting the labels.
symbols = [symbol for _, symbol in companies]
stock_close = panel_data.loc[:, 'Close'].loc[:, symbols]
stock_open = panel_data.loc[:, 'Open'].loc[:, symbols]
# closing prices in the first downloaded row, as a quick sanity check
print(stock_close.iloc[0])
# Transpose: one row per company, one column per row of the downloaded frame.
stock_close = np.array(stock_close).T
stock_open = np.array(stock_open).T
row, col = stock_close.shape
# Relative open-to-close change, computed for all companies at once.
movements = (stock_close - stock_open) / stock_open
# Print, for every company, the sum of its movement values over the period.
print('Company', '\t', 'Movement')
for idx, (name, _symbol) in enumerate(companies):
    print(name, '\t\t', sum(movements[idx, :]), '\n')
# Draw the movement series of the first two companies side by side; the
# second panel shares the y axis of the first.
plt.figure(figsize=(20, 5))
ax1 = None
for idx in (0, 1):
    panel = plt.subplot(2, 2, idx + 1, sharey=ax1)
    if ax1 is None:
        # remember the first panel so the next one can share its y axis
        ax1 = panel
    panel.plot(movements[idx, :])
    panel.set_title(companies[idx][0])
plt.show()
# Rescale every company's movement series with sklearn's Normalizer
# (default settings).
normalise = Normalizer().fit_transform(movements)
# Plot the normalised series of the first two companies.  The rescaled array
# is drawn here; plotting `movements` again would only repeat the previous
# figure.
plt.figure(figsize=(20, 5))
ax1 = plt.subplot(221)
plt.plot(normalise[0, :])
plt.title(companies[0][0])
plt.subplot(222, sharey=ax1)
plt.plot(normalise[1, :])
plt.title(companies[1][0])
plt.show()
# Shared Normalizer step; `new` keeps the normalised movements for later use.
normalizer = Normalizer()
new = normalizer.fit_transform(movements)
# K-Means sweep over 2..89 clusters, collecting the inertia of each fit.
scores = []
for n_clusters in range(2, 90):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, max_iter=1000)
    # fitting the pipeline fits this very kmeans object
    pipeline = make_pipeline(normalizer, kmeans)
    pipeline.fit(movements)
    scores.append(kmeans.inertia_)
inertia = np.array(scores)
# change of inertia between consecutive cluster counts (displayed only)
np.diff(inertia, 1)
# plot the inertia drop per added cluster
# np.diff yields inertia[k + 1] - inertia[k]; negated, the value drawn at
# x = n is the inertia lost when going from n to n + 1 clusters (the sweep
# starts at 2 clusters).  Only the first 28 steps are shown.
inertia_drop = -1 * np.diff(inertia, 1)[:28]
plt.figure(figsize=(20, 5))
# size the x axis from the data so a shorter sweep still plots
plt.plot(np.arange(2, 2 + len(inertia_drop)), inertia_drop)
plt.xlabel('Number of clusters')
# the curve is the decrease of inertia, not inertia itself
plt.ylabel('Decrease in inertia')
plt.show()
# Final model for this period: 10 clusters, fitted through a pipeline that
# normalises the movements before K-Means.
kmeans = KMeans(n_clusters=10, random_state=0, max_iter=1000)
pipeline = make_pipeline(normalizer, kmeans)
pipeline.fit(movements)
# inertia of the fitted 10-cluster model
print(kmeans.inertia_)
# cluster label of every company row
labels = pipeline.predict(movements)
# table pairing each (name, ticker) entry with its cluster
cluster_table = {'Company': list(companies), 'Cluster': labels}
df = pd.DataFrame(cluster_table)
# same table ordered by cluster id
df_sorted = df.sort_values(by='Cluster')
df_sorted
# visualize the results: project the normalised movements onto two PCA
# components
reduced_data = PCA(n_components=2).fit_transform(new)
# run kmeans on reduced data; random_state is pinned like for the other
# KMeans models in this script so the cluster ids (and the plot built from
# them) are reproducible between runs
kmeans = KMeans(n_clusters=10, random_state=0)
labels = kmeans.fit_predict(reduced_data)
# create DataFrame aligning labels & companies
df = pd.DataFrame({'labels': labels, 'companies': companies})
# Display df sorted by cluster labels
df.sort_values('labels')
# Define step size of mesh
h = 0.01
# plot the decision boundary: the mesh covers the projected points plus a
# margin of 1 on every side
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Obtain labels for each point in the mesh using our trained model
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
# define colorplot
cmap = plt.cm.Paired
# plot figure
# (no plt.clf() beforehand: the previous figure has already been shown, so
# clearing the "current" figure would only create an extra empty one)
plt.figure(figsize=(10, 10))
plt.imshow(Z,
           interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=cmap,
           aspect='auto',
           origin='lower')
# the companies as black dots in the reduced space
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=5)
# plot the centroid of each cluster as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0],
            centroids[:, 1],
            marker='x',
            s=169,
            linewidth=3,
            color='w',
            zorder=10)
plt.title('K-Means Clustering on Stock Market Movements (PCA-Reduced Data)')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.show()