from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
%matplotlib inline
from os import listdir
from os.path import isfile, join
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import sent_tokenize , word_tokenize
from nltk.corpus import PlaintextCorpusReader
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings("ignore")
TD_data = {'bat': [1, 3, 0],
           'cat': [3, 0, 1],
           'fat': [1, 0, 1],
           'mat': [0, 1, 1],
           'pat': [0, 1, 1],
           'rat': [1, 1, 1],
           'sat': [0, 0, 1]}
TD = pd.DataFrame(TD_data, index=['Doc1', 'Doc2', 'Doc3'])
print("Term Document Matrix")
print(TD)
Term Document Matrix
bat cat fat mat pat rat sat
Doc1 1 3 1 0 0 1 0
Doc2 3 0 0 1 1 1 0
Doc3 0 1 1 1 1 1 1
TF_data = {'bat': [0.167, 0.5, 0],
           'cat': [0.5, 0, 0.167],
           'fat': [0.167, 0, 0.167],
           'mat': [0, 0.167, 0.167],
           'pat': [0, 0.167, 0.167],
           'rat': [0.167, 0.167, 0.167],
           'sat': [0, 0, 0.167]}
TF = pd.DataFrame(TF_data, index=['Doc1', 'Doc2', 'Doc3'])
IDF_data = {'bat': [0.176], 'cat': [0.176], 'fat': [0.176],
            'mat': [0.176], 'pat': [0.176], 'rat': [0], 'sat': [0.477]}
IDF = pd.DataFrame(IDF_data, index=['IDF Score'])
TF_IDF_data = {'bat': [0.029, 0.088, 0],
               'cat': [0.088, 0, 0.029],
               'fat': [0.029, 0, 0.029],
               'mat': [0, 0.029, 0.029],
               'pat': [0, 0.029, 0.029],
               'rat': [0, 0, 0],
               'sat': [0, 0, 0.08]}
TF_IDF = pd.DataFrame(TF_IDF_data, index=['Doc1', 'Doc2', 'Doc3'])
print("TF(t) = (Number of times term t appears in a document) / \
(Total number of terms in the document).")
print(TF)
print('-----------------------------------------------------------------------')
print("IDF(t) = log(Total number of documents / \
Number of documents with term t in it).")
print(IDF)
print('-----------------------------------------------------------------------')
print("TF-IDF Matrix")
print(TF_IDF)
TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).
bat cat fat mat pat rat sat
Doc1 0.167 0.500 0.167 0.000 0.000 0.167 0.000
Doc2 0.500 0.000 0.000 0.167 0.167 0.167 0.000
Doc3 0.000 0.167 0.167 0.167 0.167 0.167 0.167
-----------------------------------------------------------------------
IDF(t) = log(Total number of documents / Number of documents with term t in it).
bat cat fat mat pat rat sat
IDF Score 0.176 0.176 0.176 0.176 0.176 0 0.477
-----------------------------------------------------------------------
TF-IDF Matrix
bat cat fat mat pat rat sat
Doc1 0.029 0.088 0.029 0.000 0.000 0 0.00
Doc2 0.088 0.000 0.000 0.029 0.029 0 0.00
Doc3 0.000 0.029 0.029 0.029 0.029 0 0.08
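# Cross-check (a minimal sketch, not part of the original tables): the TF, IDF, and
# TF-IDF values above can be reproduced directly from the TD DataFrame, assuming the
# same conventions used here (row-wise term frequencies, base-10 log for IDF). Note
# that sklearn's TfidfTransformer uses a smoothed natural-log IDF and L2 normalization
# by default, so its output would not match these hand-computed tables.
TF_check = TD.div(TD.sum(axis=1), axis=0)              # term count / total terms in doc
IDF_check = np.log10(len(TD) / (TD > 0).sum(axis=0))   # log10(N / docs containing term)
TF_IDF_check = TF_check * IDF_check                    # element-wise, aligned on terms
print(TF_IDF_check.round(3))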
college = pd.read_csv('college_data.csv')
cnamesColl = college.columns[1:]
# college.info()
#Performing PCA
pca = PCA()
pca.fit(college.iloc[:, 3:21])
# Converting principal component directions to DataFrame
pc = pd.DataFrame(pca.components_)
pc.columns = college.columns[3:21]
pc.index = ['PC' + str(i) for i in range(1, 19)]
# pc
X2 = pca.transform(college.iloc[:, 3:21])
z = X2[:, 0]
y = X2[:, 1]
n = college.iloc[:, 2]
fig, ax = plt.subplots(figsize=(9, 9))
ax.scatter(z, y)
plt.xlabel('PC1')
plt.ylabel('PC2')
for i, txt in enumerate(n):
    ax.annotate(txt, (z[i], y[i]), fontsize=10, rotation=30)
# Plotting variance
plt.subplots(figsize=(8, 4))
# plt.grid(True)
plt.plot([str(i) for i in range(1, 19)],
         pca.explained_variance_ratio_.cumsum())
plt.xlabel("Principal Components")
plt.ylabel("Cumulative Proportion of Variance Explained")
plt.title("Cumulative Proportion of Variance");
# Loading in stock files
stock2020 = pd.read_csv("stock_data_2020.csv")
# Only considering columns after "Date"
cnames2020 = stock2020.columns[1:]
# pc for stock2020
pc2020 = PCA()
pc2020.fit(stock2020.iloc[:, 1:])
#Converting principal component directions to DataFrame
pc_2020 = pd.DataFrame(pc2020.components_)
pc_2020.columns = cnames2020
pc_2020.index = ['PC' + str(i) for i in range(1, 31)]
# pc_2020
#Finding PC scores
X2020 = pc2020.transform(stock2020.iloc[:, 1:])
#Plotting data in first two pc directions
plt.scatter(X2020[:,0], X2020[:,1])
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('2020 Stock PC Score Using First Two PC Directions');
# Used to visualize the principal component directions (a biplot) on the PC score plot
def myplot(score, coeff, labels=None, title='PC Score Using First Two PC Directions'):
    xs = score[:, 0]
    ys = score[:, 1]
    n = coeff.shape[0]
    scalex = 1.0 / (xs.max() - xs.min())
    scaley = 1.0 / (ys.max() - ys.min())
    plt.scatter(xs * scalex, ys * scaley)
    for i in range(n):
        plt.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='r', alpha=0.5)
        if labels is None:
            plt.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15, "Var" + str(i + 1),
                     color='g', ha='center', va='center')
        else:
            plt.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15, labels[i],
                     color='g', ha='center', va='center')
    plt.xlim(-1.0, 1.0)
    plt.ylim(-1.0, 1.0)
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    # title is a parameter so the same function labels the 2019 and 2020 plots correctly
    plt.title(title)
    plt.grid()
plt.figure(figsize=(10, 10))
myplot(X2020[:, 0:], np.transpose(pc2020.components_[0:, :]), cnames2020,
       title='2020 Stock PC Score Using First Two PC Directions')
# Loading in stock files
stock2019 = pd.read_csv("stock_data_2019.csv")
# Only considering columns after "Date"
cnames2019 = stock2019.columns[1:]
# pc for stock2019
pc2019 = PCA()
pc2019.fit(stock2019.iloc[:, 1:])
#Converting principal component directions to DataFrame
pc_2019 = pd.DataFrame(pc2019.components_)
pc_2019.columns = cnames2019
pc_2019.index = ['PC' + str(i) for i in range(1, 31)]
#pc_2019
X2019 = pc2019.transform(stock2019.iloc[:, 1:])
plt.scatter(X2019[:, 0], X2019[:, 1])
plt.xlabel('PC1')
plt.ylabel('PC2');
plt.title('2019 Stock PC Score Using First Two PC Directions');
plt.figure(figsize=(10, 10))
myplot(X2019[:, 0:], np.transpose(pc2019.components_[0:, :]), cnames2019,
       title='2019 Stock PC Score Using First Two PC Directions')
# Get the list of files, mainly to know how many speeches there are
listfiles = [f for f in listdir("sotu/files") if isfile(join("sotu/files", f))]
# Load the speeches in the correct order
speeches = []
for i in range(1, len(listfiles) + 1):
    filename = 'sotu/files/a%d.txt' % i
    with open(filename, 'r') as data:
        speeches.append(data.read().splitlines())
# len(speeches)
# speeches[-1]
# report size
print('Total Documents:', len(speeches))
print('Mean Words per Document:',
      sum([len(i) for i in speeches]) / len(speeches))
Total Documents: 231
Mean Words per Document: 7624.350649350649
#Import Party file as Vector
data_file = open('sotu/party.txt')
presidents = np.loadtxt(data_file, delimiter=", ", dtype='str' )
data_file.close()
# Getting names of the files in order
fname = []
for i in range(1, len(listfiles) + 1):
    fname.append(i)  # ('a%d.txt' % (i))
# Join each speech's list of lines into a single string
speech = []
for sp in speeches:
    speech.append(' '.join(sp))
# Create DataFrame containing speech metadata plus contents
p = {'Party': presidents[:, 0],
     'President': presidents[:, 1],
     'Year': presidents[:, 2],
     'Speech_Text': speech}
Pres_Speech = pd.DataFrame(p, index=fname)
# Pres_Speech.head()
# token_pattern is regex to select only alpha (non-numeric) tokens
TokenPattern = r'\b[a-zA-Z]{1,}\b'
# 'content' method (stored corpus object in memory)
vectorizer = CountVectorizer(input = 'content', token_pattern=TokenPattern)
X = vectorizer.fit_transform(speech)
# create Document-Term Matrix / DataFrame
Xframe = pd.DataFrame(X.toarray(),
                      index=fname,
                      columns=vectorizer.get_feature_names())
print('Xframe Shape:',Xframe.shape)
count = Xframe.sum()
fwords = count.sort_values(ascending=False)
fwords[0:5]
plt.barh(list(range(0,10)), width=fwords[0:10], tick_label=fwords.index[0:10])
plt.xlabel("Counts")
plt.ylabel("Frequent Words")
plt.title("Top 10 Frequent Words");
Xframe Shape: (231, 27822)
with open("sotu/stopwords.txt") as f:
stopwds = f.read().lower().splitlines()
f.close()
# 'content' method (stored corpus object in memory)
vectorizer = CountVectorizer(input='content', token_pattern=TokenPattern,
                             stop_words=stopwds)
X = vectorizer.fit_transform(speech)
# create Document-Term Matrix / DataFrame
Xframe = pd.DataFrame(X.toarray(),
                      index=fname,
                      columns=vectorizer.get_feature_names())
# Xframe.iloc[0:9,0:8]
print('Xframe Shape:',Xframe.shape)
count = Xframe.sum()
fwords = count.sort_values(ascending=False)
fwords[0:5]
plt.barh(list(range(0,10)), width=fwords[0:10], tick_label=fwords.index[0:10])
plt.xlabel("Counts")
plt.ylabel("Frequent Words")
plt.title("Top 10 Frequent Words");
Xframe Shape: (231, 27267)
#Limit Speeches and T-D to only Dem/Rep
parties = ('d', 'r')
DR = Pres_Speech.loc[(Pres_Speech['Party'].isin(parties))]
DRInd = list(DR.index)
Pres_Speech_DR = Pres_Speech.loc[DRInd]
Xframe_DR = Xframe.loc[DRInd]
speech_DR = Pres_Speech_DR['Speech_Text'].tolist()
# Change Party column to numeric labels: 'd' -> 1, 'r' -> 2
di = {"d": 1, "r": 2}
Pres_Speech_DR.Party = [di[item] for item in Pres_Speech_DR.Party]
# Rebuild the term-document matrix with only the top 3000 words
vectorizer_top = CountVectorizer(input='content', token_pattern=TokenPattern,
                                 stop_words=stopwds, max_features=3000)
X_top = vectorizer_top.fit_transform(speech_DR)
# create Document-Term Matrix / DataFrame
Xtop3K = pd.DataFrame(X_top.toarray(),
                      index=DRInd,
                      columns=vectorizer_top.get_feature_names())
Xtop3K.iloc[0:10,0:5]
# Create Training & Test sets
# Pres_Speech.head()
names = ('trump', 'obama', 'bush', 'clinton', 'kennedy')
years = ('2017', '2014', '2006', '1995', '1962')
# List of indexes for the chosen speeches (these form the test set)
Test = Pres_Speech.loc[(Pres_Speech['President'].isin(names))
                       & (Pres_Speech['Year'].isin(years))]
TestInd = list(Test.index)
# Testing set with the chosen speeches
y_test = Pres_Speech_DR.loc[TestInd, ['Party']]
X_test = Xtop3K.loc[TestInd]
# Training set excluding the chosen speeches
y_train = Pres_Speech_DR.loc[~Pres_Speech_DR.index.isin(TestInd), ['Party']]
X_train = Xtop3K.loc[~Xtop3K.index.isin(TestInd)]
y_testB = y_test
X_testB = X_test
y_trainB = y_train
X_trainB = X_train
modelB = BernoulliNB().fit(X_trainB, y_trainB)
predB = modelB.predict_proba(X_testB)
# store the probabilities in a DataFrame
y_pred_prob_B = pd.DataFrame(data=predB,
                             columns=['Prob of Democrat', 'Prob of Republican'])
y_pred_prob_B
y_testM = y_test
X_testM = X_test
y_trainM = y_train
X_trainM = X_train
modelM = MultinomialNB().fit(X_trainM, y_trainM)
predM = modelM.predict_proba(X_testM)
# store the probabilities in a DataFrame
y_pred_prob_M = pd.DataFrame(data=predM,
                             columns=['Prob of Democrat', 'Prob of Republican'])
y_pred_prob_M
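# Evaluation sketch (assumed follow-up, using the accuracy_score and confusion_matrix
# imports from the top of the notebook): turn each model's probabilities into hard
# class predictions (1 = Democrat, 2 = Republican) and score them on the held-out speeches.
for name, model in [('BernoulliNB', modelB), ('MultinomialNB', modelM)]:
    preds = model.predict(X_test)
    print(name, 'accuracy:', accuracy_score(y_test['Party'], preds))
    print(confusion_matrix(y_test['Party'], preds))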
# Loading in data
congress = pd.read_csv('congress/H116_votes.csv')
#Performing PCA
pcca = PCA()
pcca.fit(congress.iloc[:, 8:])
#Converting pc to DataFrame
pcc = pd.DataFrame(pcca.components_)
pcc.columns = congress.columns[8: ]
pcc.index = ['PC' + str(i) for i in range(1, 51)]
# pcc.head()
X2 = pcca.transform(congress.iloc[:, 8:])
z = X2[:, 0]
y = X2[:, 1]
n = congress.iloc[:, 8]
# Assigning a color for each value in PartyCode
color = []
affiliation = congress.iloc[:, 5]
for a in affiliation:
    if a == 100:
        color.append('blue')
    elif a == 200:
        color.append('red')
    else:
        color.append('green')
# Plotting
fig, ax = plt.subplots(figsize=(9, 9))
ax.scatter(z, y, c = color)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('PC Score Using First Two PC Directions');
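# Quick check (illustrative addition, not part of the original analysis): how much of
# the vote variation the first two principal components capture for this House.
print('Variance explained by PC1 + PC2:',
      round(pcca.explained_variance_ratio_[:2].sum(), 3))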
# Opening the vote files in congress order (glob would return them in arbitrary order,
# which would break the subplot titles below)
filelist = [pd.read_csv('congress/H%d_votes.csv' % i) for i in range(97, 117)]
print(len(filelist))
20
# Getting the subplot titles in the same order
hname = []
for i in range(97, 117):
    hname.append('H%d_votes' % i)
# Creating subplots
fig, axs = plt.subplots(5, 4, figsize=(15, 18))
row = 0
col = 0
# PCA and scatter plot for each congress
for f in filelist:
    pcca.fit(f.iloc[:, 8:])
    X2 = pcca.transform(f.iloc[:, 8:])
    z = X2[:, 0]
    y = X2[:, 1]
    n = f.iloc[:, 8]
    # Coloring for affiliation
    color = []
    affiliation = f.iloc[:, 5]
    for a in affiliation:
        if a == 100:
            color.append('blue')
        elif a == 200:
            color.append('red')
        else:
            color.append('green')
    axs[row][col].scatter(z, y, c=color)
    col += 1
    if col == 4:
        row += 1
        col = 0
for t, ax in enumerate(axs.flat):
    ax.set_title(hname[t])
plt.setp(axs[-1, :], xlabel = 'PC1')
plt.setp(axs[:, 0], ylabel = 'PC2')