# Add these two commands to the top of __init__.ipynb, directly beneath the
# %%bash line at the top of the cell.
# Refresh the apt package index so the install below sees current packages.
sudo apt-get update
# Install chromium-driver without prompting (-y). Presumably this provides the
# driver binary used by the Selenium Chrome session below - confirm for your image.
sudo apt-get install chromium-driver -y
import re

import datefinder
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# Build the Chrome options once; they are reused whenever a driver is created.
options = Options()
# The `Options.headless` setter is deprecated in Selenium 4.8+, and on releases
# without it the assignment would only set an unused attribute. Passing the
# flag directly works on every Selenium version.
options.add_argument("--headless")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")

# Start the browser and let element lookups wait up to 3 seconds before failing.
driver = Chrome(options=options)
driver.implicitly_wait(3)
# Load the news section and locate the headline link of the first story.
driver.get('https://thedailytexan.com/section/news')
# find_element_by_xpath was removed in Selenium 4.3; find_element(By.XPATH, ...)
# is the supported spelling and also exists in Selenium 3.
title = driver.find_element(By.XPATH, '//*[@id="section-news"]/div[1]/ul/li[1]/span/article/div[2]/h2/a')
title
# The link's text is the headline; its href is the article URL.
print(title.text)
print(title.get_attribute('href'))
Austin-Travis county mask mandate to remain in effect
https://thedailytexan.com/2021/03/26/austin-travis-county-mask-mandate-to-remain-in-effect
# The first <div> under the article body holds the combined author/date byline.
# find_element_by_xpath was removed in Selenium 4.3; use find_element(By.XPATH, ...).
author = driver.find_element(By.XPATH, '//*[@id="section-news"]/div[1]/ul/li[1]/span/article/div[2]/div[1]')
print(author.text)
BY SHERYL LAWRENCE - UPDATED ON MARCH 26, 2021 AT 3:30 PM
# Split the byline into ['', ' <authors>', '<date text>'].
# "BY" is anchored to the start of the text so that a name containing the
# letters "BY" (e.g. "ABBY") is not split in the middle, which would shift the
# author/date indices used below.
content = re.split(r"^BY\b| - ", author.text) # this creates a list
print(content)
['', ' SHERYL LAWRENCE', 'UPDATED ON MARCH 26, 2021 AT 3:30 PM']
# content[1] is the comma-separated author text; trim the padding around each name.
authors = [name.strip() for name in content[1].split(',')]
print(authors)
['SHERYL LAWRENCE']
# content[2] is the free-text timestamp; keep the first date datefinder finds in it.
date = list(datefinder.find_dates(content[2]))[0]
print(date)
2021-03-26 15:30:00
# The second <div> under the article body holds the teaser paragraph.
# find_element_by_xpath was removed in Selenium 4.3; use find_element(By.XPATH, ...).
teaser = driver.find_element(By.XPATH, '//*[@id="section-news"]/div[1]/ul/li[1]/span/article/div[2]/div[2]')
print(teaser.text)
Travis County 261st District Court held the hearing on Friday morning, and the announcement of the mask mandate remaining in effect came at around 2:30 p.m. Friday afternoon.
# Same XPath as the single-headline lookup, but with `li` unindexed so it
# matches every story in the list.
# find_elements_by_xpath was removed in Selenium 4.3; use find_elements(By.XPATH, ...).
titles = driver.find_elements(By.XPATH, '//*[@id="section-news"]/div[1]/ul/li/span/article/div[2]/h2/a')
title_list = [headline.text for headline in titles]
link_list = [headline.get_attribute('href') for headline in titles]
print(title_list)
print(link_list)
['Austin-Travis county mask mandate to remain in effect', 'The Daily Texan March 26, 2021', 'UT-Austin history professor releases contrary report on ‘The Eyes of Texas’ origins, says it should ‘not be the official song’', 'Bees practice social distancing in time of illness too, researchers led by UT alum discovered early this month', 'UT partners with Texas Department of State Health Services to sequence COVID-19 samples, detect variants', '‘I did not feel human’: UT-Austin students in COVID-19 isolation facility during winter storm received poor care', 'Cockrell School opens new additive manufacturing center to foster development in 3D printing technology', 'What you should, should not do once you’re fully vaccinated', 'Black Alumni Network’s Legacy Endowment HornRaiser aims to raise $1M to send 4 Black students to UT with full ride', 'How UT’s denial to remove The Eyes of Texas alma mater impacted Black students']
['https://thedailytexan.com/2021/03/26/austin-travis-county-mask-mandate-to-remain-in-effect', 'https://thedailytexan.com/2021/03/26/the-daily-texan-march-26-2021', 'https://thedailytexan.com/2021/03/24/UT-Austin-history-professor-contrary-report-The-Eyes-of-Texas-origins', 'https://thedailytexan.com/2021/03/24/bees-practice-social-distancing-in-time-of-illness-too-researchers-led-by-ut-alum', 'https://thedailytexan.com/2021/03/24/ut-partners-with-texas-department-of-state-health-services-to-sequence-covid-19-samples', 'https://thedailytexan.com/2021/03/24/UT-Austin-students-COVID-19-isolation-facility-winter-storm-poor-care', 'https://thedailytexan.com/2021/03/24/cockrell-school-opens-new-additive-manufacturing-center-to-foster-development-in-3d', 'https://thedailytexan.com/2021/03/24/what-to-do-once-fully-vaccinated', 'https://thedailytexan.com/2021/03/23/black-alumni-networks-legacy-endowment-hornraiser-aims-to-raise-1m-to-send-4-black', 'https://thedailytexan.com/2021/03/23/how-ut%E2%80%99s-denial-to-remove-the-eyes-of-texas-alma-mater-impacted-black-students']
# Collect every byline on the page.
# find_elements_by_xpath was removed in Selenium 4.3; use find_elements(By.XPATH, ...).
authors_and_dates = driver.find_elements(By.XPATH, '//*[@id="section-news"]/div[1]/ul/li/span/article/div[2]/div[1]')
# Each byline becomes ['', ' <authors>', '<date text>']. "BY" is anchored to the
# start so names that contain "BY" are not split apart.
string_list = [re.split(r"^BY\b| - ", byline.text) for byline in authors_and_dates]
# One list of trimmed author names per article.
author_list = [[name.strip() for name in item[1].split(',')] for item in string_list]
# First date datefinder recognises in each byline's timestamp text.
date_list = [list(datefinder.find_dates(item[2]))[0] for item in string_list]
print(author_list)
print(date_list)
[['SHERYL LAWRENCE'], ['THE DAILY TEXAN STAFF'], ['SKYE SEIPP'], ['KATY NELSON'], ['SAMANTHA GREYSON'], ['SKYE SEIPP'], ['KEVIN VU'], ['ANASTASIA GOODWIN'], ['KAUSHIKI ROY'], ['SKYE SEIPP']]
[datetime.datetime(2021, 3, 26, 15, 30), datetime.datetime(2021, 3, 26, 1, 6), datetime.datetime(2021, 3, 24, 21, 31), datetime.datetime(2021, 3, 24, 20, 15), datetime.datetime(2021, 3, 24, 19, 54), datetime.datetime(2021, 3, 24, 19, 49), datetime.datetime(2021, 3, 24, 19, 29), datetime.datetime(2021, 3, 24, 18, 52), datetime.datetime(2021, 3, 23, 22, 35), datetime.datetime(2021, 3, 23, 22, 5)]
# Step-by-step (explicit loop) version of the byline parsing.
# find_elements_by_xpath was removed in Selenium 4.3; use find_elements(By.XPATH, ...).
authors_and_dates = driver.find_elements(By.XPATH, '//*[@id="section-news"]/div[1]/ul/li/span/article/div[2]/div[1]')

# 1. Split each byline into ['', ' <authors>', '<date text>'].
#    "BY" is anchored to the start so names containing "BY" stay intact.
string_list = []
for item in authors_and_dates:
    new_item = re.split(r"^BY\b| - ", item.text)
    string_list.append(new_item)

# 2. Turn the comma-separated author text into a list of trimmed names.
author_list = []
for item in string_list:
    author = item[1]
    authors = author.split(',')
    authors = [author.strip() for author in authors]
    author_list.append(authors)

# 3. Parse the timestamp text, keeping the first date datefinder finds.
date_list = []
for item in string_list:
    date_string = item[2]
    dates = list(datefinder.find_dates(date_string))
    date = dates[0]
    date_list.append(date)
# Collect every teaser paragraph on the page, then keep only the text.
# find_elements_by_xpath was removed in Selenium 4.3; use find_elements(By.XPATH, ...).
teasers = driver.find_elements(By.XPATH, '//*[@id="section-news"]/div[1]/ul/li/span/article/div[2]/div[2]')
teasers = [teaser.text for teaser in teasers]
teasers
# One <article> node per story in the news listing.
_ARTICLE_XPATH = '//*[@id="section-news"]/div[1]/ul/li/span/article'
# Leading "BY" of a byline such as "BY A, B - UPDATED ON MARCH 26, 2021 AT 3:30 PM".
_BYLINE_PREFIX = re.compile(r"^\s*BY\b\s*")
_COLUMNS = ['Title', 'Authors', 'Article_Link', 'Date', 'Teaser']


def _find_or_none(parent, xpath):
    """Return the first element under *parent* matching *xpath*, or None."""
    try:
        return parent.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        return None


def _parse_byline(text):
    """Split byline text into (list of author names, first date found or None)."""
    # Only a leading "BY" is removed, so names that contain "BY" stay intact.
    body = _BYLINE_PREFIX.sub("", text, count=1)
    # Authors come before the first " - "; the timestamp text follows it.
    author_text, _, date_text = body.partition(" - ")
    names = [name.strip() for name in author_text.split(',') if name.strip()]
    # find_dates yields matches lazily; take the first, or None if there is none.
    published = next(iter(datefinder.find_dates(date_text)), None)
    return names, published


def scrape_page(link):
    """Scrape one listing page into a DataFrame with one row per article.

    Uses the module-level ``driver`` to load *link*.

    Args:
        link: URL of a news listing page.

    Returns:
        A DataFrame with columns Title, Authors (list of names),
        Article_Link, Date and Teaser. Date and Teaser are None/NaT when the
        page does not provide them; the frame is empty (but keeps its
        columns) when no articles are found.
    """
    # load initial site
    driver.get(link)

    rows = []
    # Walk the page article by article so the fields of each row always come
    # from the same story, even when a story lacks a byline or teaser.
    for article in driver.find_elements(By.XPATH, _ARTICLE_XPATH):
        headline = _find_or_none(article, './div[2]/h2/a')
        if headline is None:
            # Without a headline link there is no title or URL to record.
            continue

        byline = _find_or_none(article, './div[2]/div[1]')
        if byline is None:
            authors, published = [], None
        else:
            authors, published = _parse_byline(byline.text)

        teaser = _find_or_none(article, './div[2]/div[2]')

        rows.append({
            'Title': headline.text,
            'Authors': authors,
            'Article_Link': headline.get_attribute('href'),
            'Date': published,
            'Teaser': teaser.text if teaser is not None else None,
        })

    # dataframe creation
    return pd.DataFrame(rows, columns=_COLUMNS)
# Preview a single page of results with the browser that is already running.
scrape_page('https://thedailytexan.com/section/news')

# intializing driver
# Quit the previous browser first so its process is not leaked when `driver`
# is rebound, and give the new one the same implicit wait as the first driver.
driver.quit()
driver = Chrome(options=options)
driver.implicitly_wait(3)

# creating the initial dataframe
pages = [scrape_page('https://thedailytexan.com/section/news')]
# scraping the next 9 pages and adding their results to the dataframe
for i in range(1, 10):
    url = 'https://thedailytexan.com/section/news?page=' + str(i)
    pages.append(scrape_page(url))

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# call; concatenating once at the end gives the same result.
df = pd.concat(pages, ignore_index=True)
# Spot-check a few random rows. The sample size is capped at the row count
# because DataFrame.sample raises ValueError when asked for more rows than exist.
df.sample(min(5, len(df)))
# Save a CSV copy, then a pickle copy, and reload the frame from the pickle.
df.to_csv('daily_texan.csv', index=False)
df.to_pickle('daily_texan.df')
df = pd.read_pickle('daily_texan.df')
# Three side-by-side summary charts of the scraped articles.
sns.set_style('darkgrid')
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 6))

# Draw each panel first; titles and axis labels are applied afterwards.
sns.countplot(x=df.Authors.map(len), ax=ax1)
sns.histplot(x=df.Title.map(len), ax=ax2, bins=14)
sns.countplot(x=df.Title.map(lambda headline: 'UT' in headline), ax=ax3)

# Left panel: how many authors each article lists.
ax1.set_title('Number of authors per article', y=1.05, fontsize=15, fontweight='bold')
ax1.set_xlabel('Number of authors', labelpad=10,fontsize=12)
ax1.set_ylabel('Article count', labelpad=10, fontsize=12)

# Middle panel: headline length in characters.
ax2.set_title('Length of article titles', y=1.05, fontsize=15, fontweight='bold')
ax2.set_xlabel('Length of title (characters)', labelpad=10,fontsize=12)
ax2.set_ylabel('Article count', labelpad=10, fontsize=12)

# Right panel: whether the headline contains the substring "UT".
ax3.set_title('Proportion of titles containing "UT"', y=1.05, fontsize=15, fontweight='bold')
ax3.set_xlabel('Title contains "UT"?', labelpad=10,fontsize=12)
ax3.set_ylabel('Article count', labelpad=10, fontsize=12)