arXiv

# Install dependencies !pip install feedparser !pip install transformers

# Download and parse the arXiv RSS feed for the cs.CV category
import feedparser

feed_url = r'http://arxiv.org/rss/cs.CV'
d = feedparser.parse(feed_url)

# Build a data frame from the parsed feed's entries
import pandas as pd

df = pd.DataFrame(data=d.entries)

# Drop unnecessary columns from the data frame.
# The columns come straight from the feed entries, so any of these may be
# absent; errors='ignore' skips missing ones instead of raising KeyError.
unused_columns = ['title_detail', 'links', 'summary_detail', 'author_detail', 'authors']
df = df.drop(columns=unused_columns, errors='ignore')

# Helper to remove HTML tags
from io import StringIO
from html.parser import HTMLParser


class MLStripper(HTMLParser):
    """HTML parser that collects only the text content and discards markup."""

    def __init__(self):
        # HTMLParser.__init__ stores convert_charrefs and calls reset() itself,
        # so no separate reset() call is needed here.
        super().__init__(convert_charrefs=True)
        self.text = StringIO()

    def handle_data(self, d):
        """Append a run of text reported by the parser to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far as one string."""
        return self.text.getvalue()


def strip_tags(html):
    """Return *html* with its tags removed and character references decoded.

    Args:
        html: Markup string to clean.

    Returns:
        The text content of *html* as a plain string.
    """
    s = MLStripper()
    s.feed(html)
    # feed() may hold back trailing text (e.g. after a bare "&", in case a
    # character reference is split across chunks); close() flushes it so the
    # tail of the input is not silently dropped.
    s.close()
    return s.get_data()

# Strip HTML markup from the author field of every row
for idx in df.index:
    raw_author = df.loc[idx, 'author']
    df.loc[idx, 'author'] = strip_tags(raw_author)

# Set up summarizer
import torch
from transformers import pipeline

# Use CUDA when it is available, otherwise fall back to the CPU.
# MPS is deliberately not selected: MPS devices currently don't appear to work
# correctly with this pipeline. (torch.has_mps is deprecated and only reports
# that PyTorch was built with MPS support; if MPS is ever re-enabled, check
# torch.backends.mps.is_available() instead.)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

summarizer = pipeline(
    "summarization",
    "pszemraj/long-t5-tglobal-base-16384-book-summary",
    device=device,
)

# Remove HTML from each summary, summarize it, and save the results to a new
# 'brief' column.
# The column is built in one pass and assigned once, rather than growing it
# cell by cell. An empty frame is left untouched, as before.
if not df.empty:
    df['brief'] = [
        summarizer(strip_tags(raw_summary))[0]['summary_text']
        for raw_summary in df['summary']
    ]

# Display just the title and final summary df[['title', 'brief']]