The goal is to extract each MLB team's run differential (RD) and compare it with that team's winning percentage (PCT). Both stats are scraped from teamrankings.com.
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import numpy as np
import requests
from bs4 import BeautifulSoup
import seaborn as sns
# Capture the current local date and print it.
date = datetime.date.today()
print(date)
def scrape_teamrankings(url, selector, timeout=10):
    """Scrape one HTML table from a teamrankings page into a DataFrame.

    Args:
        url: Address of the page to fetch.
        selector: CSS class of the ``<table>`` element to extract.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame whose columns are the texts of the table's ``<th>``
        cells and whose rows are the texts of each row's ``<td>`` cells
        (kept as strings), sorted ascending by the 'Team' column with a
        fresh 0..n-1 index.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page contains no table with the given class.
    """
    # Fail fast on a hung connection or an error page instead of parsing junk.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find("table", {"class": selector})
    if table is None:
        raise ValueError(
            f"No <table> with class {selector!r} found at {url}"
        )

    # Column titles come from every <th> in the table.
    headers = [th.text for th in table.find_all('th')]

    # Collect all rows first and build the frame once; appending with
    # .loc inside the loop re-allocates the frame on every row.
    rows = []
    for tr in table.find_all('tr')[1:]:  # first <tr> is the header row
        cells = [td.text for td in tr.find_all('td')]
        if cells:  # skip rows that carry no data cells
            rows.append(cells)

    teamrankings_data = pd.DataFrame(rows, columns=headers)

    # Sort alphabetically by team name so every scrape has the same order.
    return teamrankings_data.sort_values(
        by=['Team'], ascending=True, ignore_index=True
    )
To sanity-check the function, scrape the MLB run differential table and print its first few rows.
# Quick check: fetch the run-differential table and show its first rows.
print(
    scrape_teamrankings(
        url='https://www.teamrankings.com/mlb/stat/run-differential',
        selector='datatable',
    ).head()
)
def get_diff(season='2022'):
    """Build one table pairing each team's winning percentage with its RD.

    Scrapes the run-differential and win-percentage pages, attaches the
    run differential to the win-percentage rows by team name, and returns
    the result ordered from best to worst winning percentage.

    Args:
        season: Header of the column that holds the season's value in
            both scraped tables. Defaults to '2022'.

    Returns:
        The win-percentage DataFrame with the season column renamed to
        'PCT', a new trailing 'Diff' column, both converted to numbers,
        sorted descending by 'PCT' with a fresh index.
    """
    # Scrape RDs
    teamrankings_diff = scrape_teamrankings(
        'https://www.teamrankings.com/mlb/stat/run-differential', 'datatable'
    )
    # Scrape PCTs
    teamrankings_pct = scrape_teamrankings(
        'https://www.teamrankings.com/mlb/stat/win-pct-all-games', 'datatable'
    )

    # Join on the team name rather than on row position: positional
    # matching silently pairs the wrong teams if the two tables ever
    # differ in membership or ordering.
    diff_by_team = teamrankings_diff[['Team', season]].rename(
        columns={season: 'Diff'}
    )
    teamrankings_pct = teamrankings_pct.merge(
        diff_by_team, on='Team', how='left'
    )

    # Rename the season column to 'PCT' for clarification
    teamrankings_pct = teamrankings_pct.rename(columns={season: 'PCT'})

    # Scraped cells are strings; convert so sorting and plotting are
    # numeric. Cells that do not parse as numbers become NaN.
    for column in ('PCT', 'Diff'):
        teamrankings_pct[column] = pd.to_numeric(
            teamrankings_pct[column], errors='coerce'
        )

    # Sort by PCT value, best record first
    return teamrankings_pct.sort_values(
        by=['PCT'], ascending=False, ignore_index=True
    )
# Scrape once and reuse the result for both the preview and the plot;
# calling get_diff() a second time only repeats the same web requests.
mlb_standings = get_diff()
print(mlb_standings.head())

# Scatter each team's winning percentage against its run differential.
diff_plot = sns.scatterplot(data=mlb_standings, x="PCT", y="Diff")