Part A
import pandas as pd
import numpy as np
!pip install beautifulsoup4
from bs4 import BeautifulSoup
train = pd.read_csv('stack_stats_2020_train.csv')
test = pd.read_csv('stack_stats_2020_test.csv')
stack_stats = pd.concat([train, test], ignore_index=True)  #reset the index so train and test rows don't share labels
stack_stats.head()
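As a quick sanity check (a minimal sketch; the exact counts depend on the two CSVs), the combined frame should have as many rows as train and test together:
#sanity check: the combined frame should stack train on top of test
print(train.shape, test.shape, stack_stats.shape)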
#create a function that turns html text into plain text
def plain_text(table, column):
    plain_text = []
    for index, row in column.items():  #.iteritems() was removed in pandas 2.0
        each = BeautifulSoup(row, 'html.parser')  #name the parser explicitly to avoid a warning
        plain_text.append(each.get_text())
    table['Plain_Text'] = plain_text
    return table.head()

plain_text(stack_stats, stack_stats['Body'])
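An equivalent, somewhat tidier version (a sketch, producing the same 'Plain_Text' column) maps BeautifulSoup over the column with Series.apply instead of looping row by row:
#same result via apply: strip HTML from each post body in one pass
stack_stats['Plain_Text'] = stack_stats['Body'].apply(
    lambda html: BeautifulSoup(html, 'html.parser').get_text()
)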
#things to remove: '\n', '#', and text between '$'s (inline LaTeX)
#this is inefficient, but I couldn't figure out a better way, and I think it gets the job done
import re
new_column = []
for values in stack_stats['Plain_Text']:
    new_column.append(re.sub(r'\n', ' ', values))
stack_stats['Plainer_Text'] = new_column
new_column2 = []
for values in stack_stats['Plainer_Text']:
    new_column2.append(re.sub('#', ' ', values))
stack_stats['Plainer_Text'] = new_column2
new_column3 = []
for values in stack_stats['Plainer_Text']:
    new_column3.append(re.sub(r'\$.*?\$', ' ', values))
stack_stats['Plainer_Text'] = new_column3
#compare one post before and after the regex cleanup
stack_stats['Plain_Text'][19244]
stack_stats['Plainer_Text'][19244]
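The three loops above could also be collapsed into one chained, vectorized pass with Series.str.replace (a sketch that should yield the same 'Plainer_Text' column; newlines and '#' go first, matching the order above, so '$...$' spans that crossed a line break still get caught):
#one vectorized pass: drop newlines, '#', and anything between '$'s
stack_stats['Plainer_Text'] = (
    stack_stats['Plain_Text']
    .str.replace(r'[\n#]', ' ', regex=True)    #newlines and '#' first
    .str.replace(r'\$.*?\$', ' ', regex=True)  #then inline LaTeX
)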
Text Cleaning: Body
text_body = stack_stats['Plainer_Text']
#change to lowercase
text_body_low = text_body.str.lower()
#remove punctuation (from Lab 8b ipynb file)
from string import punctuation
def remove_punctuation(document):
    no_punct = ''.join([character for character in document if character not in punctuation])
    return no_punct
text_body_no_pnct = text_body_low.apply(remove_punctuation)
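For reference, str.translate does the same thing and is usually faster on a large corpus (a sketch with identical output):
#equivalent punctuation removal using a translation table
punct_table = str.maketrans('', '', punctuation)
text_body_no_pnct = text_body_low.str.translate(punct_table)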
#tokenization
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
text_body_tokenized = text_body_no_pnct.apply(word_tokenize)
text_body_tokenized.head()
#remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(document):
    words = [word for word in document if word not in stop_words]
    return words
text_body_no_stop = text_body_tokenized.apply(remove_stopwords)
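A quick way to see how much the stopword filter actually removes (a rough diagnostic, not part of the required pipeline):
#average tokens per post before and after stopword removal
print(text_body_tokenized.apply(len).mean())
print(text_body_no_stop.apply(len).mean())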
#stemming
from nltk.stem import PorterStemmer
porter = PorterStemmer()
def stemmer(document):
    stemmed_document = [porter.stem(word) for word in document]
    return stemmed_document
text_body_stemmed = text_body_no_stop.apply(stemmer)
text_body_stemmed
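To see what the Porter stemmer actually does, here is a tiny spot-check on hand-picked words (the printed stems are whatever porter.stem returns; roughly, suffixes like '-ing' and '-ion' get stripped):
#spot-check the stemmer on a few representative words
for word in ['regression', 'sampling', 'models', 'probabilities']:
    print(word, '->', porter.stem(word))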
Text Cleaning: Title
text_title = stack_stats['Title']
#change to lowercase
text_title_low = text_title.str.lower()
#remove punctuation (from Lab 8b ipynb file)
text_title_no_pnct = text_title_low.apply(remove_punctuation)
text_title_tokenized = text_title_no_pnct.apply(word_tokenize)
text_title_tokenized.head()
text_title_no_stop = text_title_tokenized.apply(remove_stopwords)
text_title_stemmed = text_title_no_stop.apply(stemmer)
text_title_stemmed
Text Cleaning: Tags
text_tags = stack_stats['Tags']
#change to lowercase
text_tags_low = text_tags.str.lower()
text_tags_low.head()
#remove "< >"
import re
new_tags = []
for values in text_tags_low:
    new_tags.append(re.findall('<(.*?)>', values))
stack_stats['Tags'] = new_tags
stack_stats['Tags']
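The same extraction can be done without an explicit loop via Series.str.findall (a sketch producing the same lists of tags):
#vectorized equivalent: pull every '<tag>' out of each row at once
stack_stats['Tags'] = text_tags_low.str.findall(r'<(.*?)>')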
DTM: Body
#detokenization
from nltk.tokenize.treebank import TreebankWordDetokenizer
text_body_detokenized = text_body_stemmed.apply(TreebankWordDetokenizer().detokenize)
#creating dtm
#set min_df to 0.05 to preserve as many words as possible while eliminating unrelated numbers
#'10' and '100' still remain (see below)
from sklearn.feature_extraction.text import CountVectorizer
countvec = CountVectorizer(min_df=0.05)
sparse_dtm_body = countvec.fit_transform(text_body_detokenized)
sparse_dtm_body
dtm_body = pd.DataFrame(sparse_dtm_body.toarray(), columns=countvec.get_feature_names_out(), index=stack_stats.index)
dtm_body.head()
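To confirm that numbers like '10' and '100' survived the min_df filter, the column sums give the most frequent terms (a quick inspection; results depend on the data):
#most frequent terms in the body DTM; '10' and '100' should show up here
dtm_body.sum().sort_values(ascending=False).head(10)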
DTM: Title
text_title_detokenized = text_title_stemmed.apply(TreebankWordDetokenizer().detokenize)
#creating dtm
#set min_df to 0.005 as advised on Piazza
countvec = CountVectorizer(min_df=0.005)
sparse_dtm_title = countvec.fit_transform(text_title_detokenized)
sparse_dtm_title
dtm_title = pd.DataFrame(sparse_dtm_title.toarray(), columns=countvec.get_feature_names_out(), index=stack_stats.index)
dtm_title.head()
DTM: Tags
text_tags_detokenized = stack_stats['Tags'].apply(TreebankWordDetokenizer().detokenize)
#creating dtm
countvec = CountVectorizer(min_df=0.005)
sparse_dtm_tags = countvec.fit_transform(text_tags_detokenized)
sparse_dtm_tags
dtm_tags = pd.DataFrame(sparse_dtm_tags.toarray(), columns=countvec.get_feature_names_out(), index=stack_stats.index)
dtm_tags.head()
Add suffixes to distinguish words from Body, Title, and Tags
dtm_body = dtm_body.add_suffix('_body')
dtm_title = dtm_title.add_suffix('_title')
dtm_tags = dtm_tags.add_suffix('_tags')
Combine Body, Title, and Tags along with score
body_title_joined = dtm_body.join(dtm_title)
body_title_tags_joined = body_title_joined.join(dtm_tags)
body_title_tags_joined
stack_stats_final = stack_stats[['Id','Score']].join(body_title_tags_joined)
stack_stats_final.head()
stack_stats_final.to_csv('stack_stats_final.csv')
The resulting 'stack_stats_final.csv' is saved to be further modified in Part B.
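Since Part B will presumably need the original train/test split back, one way to recover it (a sketch that relies on the ignore_index=True concat above, so the train rows occupy the first len(train) positions):
#recover the original split by position; valid because concat kept train first
train_final = stack_stats_final.iloc[:len(train)]
test_final = stack_stats_final.iloc[len(train):]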