# Install dependencies
!pip install feedparser
!pip install transformers
# Get feed contents (arXiv Computer Vision RSS feed)
import feedparser
d = feedparser.parse(r'http://arxiv.org/rss/cs.CV')
# Create data frame from feed entries (one row per entry)
import pandas as pd
df = pd.DataFrame(d.entries)
# Drop unnecessary columns from data frame.
# errors='ignore' skips any listed column the feed did not provide instead of
# raising KeyError, so a change in the feed's fields does not stop the run.
df = df.drop(
    columns=['title_detail', 'links', 'summary_detail', 'author_detail', 'authors'],
    errors='ignore',
)
# Helper method to remove HTML tags
from io import StringIO
from html.parser import HTMLParser
class MLStripper(HTMLParser):
    """HTML parser that discards markup and accumulates only the text content."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset is
        # needed here. convert_charrefs=True makes character references such
        # as &amp; reach handle_data already decoded.
        super().__init__(convert_charrefs=True)
        # Buffer that collects every text run seen by handle_data.
        self.text = StringIO()

    def handle_data(self, d):
        """Append a run of text found outside of tags to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return self.text.getvalue()
def strip_tags(html):
    """Return the text content of *html* with all markup removed.

    Args:
        html: String that may contain HTML tags and character references.

    Returns:
        The concatenated text found outside of tags.
    """
    s = MLStripper()
    s.feed(html)
    # feed() can hold back trailing text while it waits for more input, for
    # example after an '&' that might be the start of a character reference
    # (so "AT&T" would otherwise come back empty). close() flushes it.
    s.close()
    return s.get_data()
# Remove HTML tags from author list
# Series.map applies strip_tags to every value of the column in one pass;
# na_action='ignore' leaves missing authors (NaN) as they are instead of
# handing a non-string value to the HTML parser.
df['author'] = df['author'].map(strip_tags, na_action='ignore')
# Set up summarizer
import torch
from transformers import pipeline
# MPS devices currently don't appear to work correctly with this pipeline, so
# MPS is deliberately never selected: use CUDA when available, otherwise CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
summarizer = pipeline("summarization", "pszemraj/long-t5-tglobal-base-16384-book-summary", device=device)
# Remove HTML from summary, summarize it and then save to new column.
# The column is assigned in one step so that 'brief' exists even when the
# frame has no rows; otherwise the selection below would raise KeyError.
df['brief'] = [
    summarizer(strip_tags(html))[0]['summary_text']
    for html in df['summary']
]
# Display just the title and final summary
df[['title', 'brief']]