import requests
import pandas as pd
response = requests.get('https://www.premierleague.com/stats/top/players/goals?se=418') # se=418 should load current season data
df = pd.read_html(response.text)
df[0] # Loads the all-time list, not the current season's data (/goals?se=363)... also, how are we going to get to the next page of data, since the table is paginated?
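# Why this approach falls short: the season-filtered, paginated stats table is most likely populated by JavaScript
# after the page loads, so a plain GET only ever sees the static HTML (the all-time list).
# A quick diagnostic sketch, reusing the objects above:
print(len(df))       # how many tables read_html found in the static HTML
print(df[0].head())  # the all-time top scorers, not the ?se=418 season table we asked for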
from nerodia.browser import Browser
import pandas as pd
import time
# Using Selenium's Chrome Options: run headless so Chrome's GUI doesn't have to be used, and disable the sandbox to avoid crashes on Deepnote
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox') # Remove if running outside deepnote
browser = Browser('chrome', options=options) # Create Browser
browser.goto('https://www.premierleague.com/stats/top/players/goals?se=418') # Now use the browser to navigate to the EPL Stats Page
time.sleep(4) # Allow data time to load into HTML
goals_df = pd.read_html(browser.html)[0] # Use Pandas to fetch all the tables within the browser html, select the first table it finds ([0])
# Note: On the EPL site, when you reach the end of the table, the Page Next element gets 'inactive' added to its class. Use your browser's dev tools to inspect the Page Next element on the last page of the goals table to see this for yourself.
# Note: Knowing this, we can keep clicking Page Next and scraping the table until that element becomes 'inactive'. In Python we can do this with a while not loop:
while not browser.div(class_name=['paginationBtn', 'paginationNextContainer', 'inactive']).exists:
    browser.div(class_name=['paginationBtn', 'paginationNextContainer']).fire_event('onClick') # fire the onClick event on the Page Next element. If it were a button element (not a div element), we could simply use .click()
    # print('Next Page')
    time.sleep(2) # give the next page of the table a moment to render before reading it (2 seconds is an assumption; adjust if pages load slowly)
    goals_df = pd.concat([goals_df, pd.read_html(browser.html)[0]], ignore_index=True) # concatenate this page's table onto the existing goals dataframe (DataFrame.append is deprecated in recent pandas)
browser.close() # Close Browser
goals_df = goals_df[goals_df['Stat'] > 0] # Drop the stray players at the end of the table with 0 goals
goals_df = goals_df.dropna(axis=1, how='all') # Drop the stray 'Unnamed' column (all NaN elements, so drop columns where all values are NaN)
goals_df.to_csv(r'data/epl_goals_20_21.csv', index=False) # Save dataframe to new csv file
goals_df
# Run the script version from a notebook (19/20 season)
# !python epl_web_scraper.py
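# A minimal sketch of what epl_web_scraper.py could contain (an assumption - the actual script isn't shown here):
# the same nerodia logic as above, wrapped in a function that takes the season id from the ?se= query string,
# so other seasons can be scraped by passing a different id.
import time
import pandas as pd
from nerodia.browser import Browser
from selenium.webdriver.chrome.options import Options

def scrape_goals(season_id, out_path):
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')  # Remove if running outside Deepnote
    browser = Browser('chrome', options=options)
    browser.goto(f'https://www.premierleague.com/stats/top/players/goals?se={season_id}')
    time.sleep(4)  # allow the season table time to load into the HTML
    goals = pd.read_html(browser.html)[0]
    # Keep clicking Page Next and scraping until the element becomes 'inactive' (same logic as the notebook cell above)
    while not browser.div(class_name=['paginationBtn', 'paginationNextContainer', 'inactive']).exists:
        browser.div(class_name=['paginationBtn', 'paginationNextContainer']).fire_event('onClick')
        time.sleep(2)  # give the next page of the table a moment to render
        goals = pd.concat([goals, pd.read_html(browser.html)[0]], ignore_index=True)
    browser.close()
    goals = goals[goals['Stat'] > 0]         # drop the 0-goal rows at the end of the table
    goals = goals.dropna(axis=1, how='all')  # drop the all-NaN 'Unnamed' column
    goals.to_csv(out_path, index=False)
    return goals

if __name__ == '__main__':
    scrape_goals(418, 'data/epl_goals_20_21.csv')  # 418 = the season id used in the notebook above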