Part A
import pandas as pd
import numpy as np
!pip install beautifulsoup4
from bs4 import BeautifulSoup
train = pd.read_csv('stack_stats_2020_train.csv')
test = pd.read_csv('stack_stats_2020_test.csv')
stack_stats = pd.concat([train, test], ignore_index=True)  #reset the index so train and test rows don't share labels
stack_stats.head()
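As a quick sanity check (a minimal sketch; the exact counts depend on the two CSVs), the combined frame should have as many rows as train and test together:
#sanity check: the combined frame should stack train on top of test
print(train.shape, test.shape, stack_stats.shape)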
#create a function that turns html text into plain text
def plain_text(table, column):
    plain_text = []
    for index, row in column.items():  #.iteritems() was removed in pandas 2.0
        each = BeautifulSoup(row, 'html.parser')  #name the parser explicitly to avoid a warning
        plain_text.append(each.get_text())
    table['Plain_Text'] = plain_text
    return table.head()

plain_text(stack_stats, stack_stats['Body'])
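An equivalent, somewhat tidier version (a sketch, producing the same 'Plain_Text' column) maps BeautifulSoup over the column with Series.apply instead of looping row by row:
#same result via apply: strip HTML from each post body in one pass
stack_stats['Plain_Text'] = stack_stats['Body'].apply(
    lambda html: BeautifulSoup(html, 'html.parser').get_text()
)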
#things to remove: '\n', '#', and text between '$'s (inline LaTeX)
#this is inefficient, but I couldn't figure out a better way, and I think it gets the job done
import re
new_column = []
for values in stack_stats['Plain_Text']:
    new_column.append(re.sub(r'\n', ' ', values))
stack_stats['Plainer_Text'] = new_column
new_column2 = []
for values in stack_stats['Plainer_Text']:
    new_column2.append(re.sub('#', ' ', values))
stack_stats['Plainer_Text'] = new_column2
new_column3 = []
for values in stack_stats['Plainer_Text']:
    new_column3.append(re.sub(r'\$.*?\$', ' ', values))
stack_stats['Plainer_Text'] = new_column3
#compare one post before and after the regex cleanup
stack_stats['Plain_Text'][19244]
stack_stats['Plainer_Text'][19244]
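The three loops above could also be collapsed into one chained, vectorized pass with Series.str.replace (a sketch that should yield the same 'Plainer_Text' column; newlines and '#' go first, matching the order above, so '$...$' spans that crossed a line break still get caught):
#one vectorized pass: drop newlines, '#', and anything between '$'s
stack_stats['Plainer_Text'] = (
    stack_stats['Plain_Text']
    .str.replace(r'[\n#]', ' ', regex=True)    #newlines and '#' first
    .str.replace(r'\$.*?\$', ' ', regex=True)  #then inline LaTeX
)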
Text Cleaning: Body
text_body = stack_stats['Plainer_Text']
#change to lowercase
text_body_low = text_body.str.lower()
#remove punctuation (from Lab 8b ipynb file)
from string import punctuation
def remove_punctuation(document):
    no_punct = ''.join([character for character in document if character not in punctuation])
    return no_punct
text_body_no_pnct = text_body_low.apply(remove_punctuation)
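For reference, str.translate does the same thing and is usually faster on a large corpus (a sketch with identical output):
#equivalent punctuation removal using a translation table
punct_table = str.maketrans('', '', punctuation)
text_body_no_pnct = text_body_low.str.translate(punct_table)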
#tokenization
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
text_body_tokenized = text_body_no_pnct.apply(word_tokenize)
text_body_tokenized.head()
#remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(document):
    words = [word for word in document if word not in stop_words]
    return words
text_body_no_stop = text_body_tokenized.apply(remove_stopwords)
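A quick way to see how much the stopword filter actually removes (a rough diagnostic, not part of the required pipeline):
#average tokens per post before and after stopword removal
print(text_body_tokenized.apply(len).mean())
print(text_body_no_stop.apply(len).mean())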
#stemming
from nltk.stem import PorterStemmer
porter = PorterStemmer()
def stemmer(document):
    stemmed_document = [porter.stem(word) for word in document]
    return stemmed_document
text_body_stemmed = text_body_no_stop.apply(stemmer)
text_body_stemmed
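To see what the Porter stemmer actually does, here is a tiny spot-check on hand-picked words (the printed stems are whatever porter.stem returns; roughly, suffixes like '-ing' and '-ion' get stripped):
#spot-check the stemmer on a few representative words
for word in ['regression', 'sampling', 'models', 'probabilities']:
    print(word, '->', porter.stem(word))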
Text Cleaning: Title
text_title = stack_stats['Title']
#change to lowercase
text_title_low = text_title.str.lower()
#remove punctuation (from Lab 8b ipynb file)
text_title_no_pnct = text_title_low.apply(remove_punctuation)
text_title_tokenized = text_title_no_pnct.apply(word_tokenize)
text_title_tokenized.head()
text_title_no_stop = text_title_tokenized.apply(remove_stopwords)
text_title_stemmed = text_title_no_stop.apply(stemmer)
text_title_stemmed
Text Cleaning: Tags
text_tags = stack_stats['Tags']
#change to lowercase
text_tags_low = text_tags.str.lower()
text_tags_low.head()
#remove "< >"
import re
new_tags = []
for values in text_tags_low:
    new_tags.append(re.findall('<(.*?)>', values))
stack_stats['Tags'] = new_tags
stack_stats['Tags']
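The same extraction can be done without an explicit loop via Series.str.findall (a sketch producing the same lists of tags):
#vectorized equivalent: pull every '<tag>' out of each row at once
stack_stats['Tags'] = text_tags_low.str.findall(r'<(.*?)>')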
DTM: Body
#detokenization
from nltk.tokenize.treebank import TreebankWordDetokenizer
text_body_detokenized = text_body_stemmed.apply(TreebankWordDetokenizer().detokenize)
#creating dtm
#set min_df to 0.05 to preserve as many words as possible while eliminating unrelated numbers
#'10' and '100' still remain (see below)
from sklearn.feature_extraction.text import CountVectorizer
countvec = CountVectorizer(min_df=0.05)
sparse_dtm_body = countvec.fit_transform(text_body_detokenized)
sparse_dtm_body
dtm_body = pd.DataFrame(sparse_dtm_body.toarray(), columns=countvec.get_feature_names_out(), index=stack_stats.index)
dtm_body.head()
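To confirm that numbers like '10' and '100' survived the min_df filter, the column sums give the most frequent terms (a quick inspection; results depend on the data):
#most frequent terms in the body DTM; '10' and '100' should show up here
dtm_body.sum().sort_values(ascending=False).head(10)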
DTM: Title
text_title_detokenized = text_title_stemmed.apply(TreebankWordDetokenizer().detokenize)
#creating dtm
#set min_df to 0.005 as advised on Piazza
countvec = CountVectorizer(min_df=0.005)
sparse_dtm_title = countvec.fit_transform(text_title_detokenized)
sparse_dtm_title
dtm_title = pd.DataFrame(sparse_dtm_title.toarray(), columns=countvec.get_feature_names_out(), index=stack_stats.index)
dtm_title.head()
DTM: Tags
text_tags_detokenized = stack_stats['Tags'].apply(TreebankWordDetokenizer().detokenize)
#creating dtm
countvec = CountVectorizer(min_df=0.005)
sparse_dtm_tags = countvec.fit_transform(text_tags_detokenized)
sparse_dtm_tags
dtm_tags = pd.DataFrame(sparse_dtm_tags.toarray(), columns=countvec.get_feature_names_out(), index=stack_stats.index)
dtm_tags.head()
Add suffixes to distinguish words from Body, Title, and Tags
dtm_body = dtm_body.add_suffix('_body')
dtm_title = dtm_title.add_suffix('_title')
dtm_tags = dtm_tags.add_suffix('_tags')
Combine Body, Title, and Tags along with score
body_title_joined = dtm_body.join(dtm_title)
body_title_tags_joined = body_title_joined.join(dtm_tags)
body_title_tags_joined
stack_stats_final = stack_stats[['Id','Score']].join(body_title_tags_joined)
stack_stats_final.head()
stack_stats_final.to_csv('stack_stats_final.csv')
The resulting 'stack_stats_final.csv' is saved to be further modified in Part B.
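Since Part B will presumably need the original train/test split back, one way to recover it (a sketch that relies on the ignore_index=True concat above, so the train rows occupy the first len(train) positions):
#recover the original split by position; valid because concat kept train first
train_final = stack_stats_final.iloc[:len(train)]
test_final = stack_stats_final.iloc[len(train):]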