#Importing useful libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import requests
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot
# Output: /usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
# Output:   import pandas.util.testing as tm
# Page whose tables are scraped below.
URL = 'http://www.mendosa.com/gilists.htm'


def request_and_parse(url):
    """Download *url* and return every ``<td>`` element found in the page.

    Args:
        url: Address of the HTML page to fetch.

    Returns:
        The result of ``soup.find_all('td')`` for the parsed page.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    # A timeout keeps the call from blocking indefinitely on a stalled server.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page as if it were the table.
    response.raise_for_status()
    soup = bs(response.text, 'lxml')
    return soup.find_all('td')


# The cells must be returned and bound here: a name assigned inside the
# function is local to it and is not visible at module level.
foods_gi = request_and_parse(URL)
# Hard-coded cut-off: only the first 137 cells are kept (presumably the extent
# of the first table -- verify against the live page).
products_gi = foods_gi[:137]
# Skip the first six cells of the slice and keep only the cells whose markup
# does not contain the text 'align'.
prod_gi = [cell for cell in products_gi[6:] if 'align' not in str(cell)]

# The remaining cells are read as alternating pairs: the text of the cells at
# even positions goes to `prods`, the text at odd positions goes to `gis`.
prods = [cell.text for cell in prod_gi[0::2]]
gis = [cell.text for cell in prod_gi[1::2]]
# Convert into a dataframe: one row per product name.
df = pd.DataFrame(prods, columns=['products'])

# Attach the GI strings; they are aligned to the product rows by position
# because both sides carry a default integer index.
gi_values = pd.Series(gis)
df['glucose_index(GI)'] = gi_values

# Drop the last two characters of every GI string (presumably trailing
# non-numeric residue from the page -- verify against the scraped text).
df['glucose_index(GI)'] = df['glucose_index(GI)'].str.slice(stop=-2)

# View the last rows
df.tail()
# Write the product/GI table to a CSV file and trigger a Colab download of it.
from google.colab import files

products_csv = 'Products_GI.csv'
df.to_csv(products_csv)
files.download(products_csv)
# Now scrape the other, larger table on the same page and turn it into a
# dataset.
reqs = requests.get(URL)
soup = bs(reqs.text, 'lxml')

# Hard-coded window of table rows (presumably the extent of the second table
# -- verify against the live page).
food_gi = soup.find_all('tr')[82:4574]

# Keep the text of the rows that contain a '±' sign but not the word 'mean'.
product_rows = [
    row.text
    for row in food_gi
    if '±' in row.text and 'mean' not in row.text
]

df1 = pd.Series(product_rows, name='all_columns').to_frame()
df1.head()

# Split each row once on '±': the text before the sign goes to 'prod_gin',
# the text after it goes to 'glo'.
around_sign = df1['all_columns'].str.split('±', expand=True)
df1['prod_gin'] = around_sign[0]
df1['glo'] = around_sign[1]
#Importing useful functions
# `%run` is an IPython magic, so this line only works inside a notebook/IPython
# session. It executes Utility_tools.ipynb; the helpers called below
# (values_stripper, duplicate_check_remove) are not defined in this file and
# presumably come from that notebook -- TODO confirm.
%run Utility_tools.ipynb
# The return value is discarded, so the helper presumably modifies df1 in
# place (stripping whitespace from string columns, judging by the message it
# prints) -- TODO confirm against Utility_tools.ipynb.
values_stripper(df1)
# Output: Categorical features' values are stripped
# Keep only the last three characters of the text that preceded the '±' sign,
# then run the stripping helper again on the shortened values.
df1['prod_gin'] = df1['prod_gin'].str.slice(start=-3)
values_stripper(df1)

# Hand-written corrections, applied in this order to the whole frame. A plain
# string passed to DataFrame.replace matches entire cell values, not
# substrings.
gi_corrections = (
    ('8 7', '7'),
    ('2 3', '3'),
    ('d 7', '7'),
    ('® 3', '3'),
    ('ean', np.nan),
    ('2 7', '7'),
    ('2 8', '8'),
    ('1 2', '2'),
    (') 6', '6'),
)
for stray, cleaned in gi_corrections:
    df1 = df1.replace(stray, cleaned)

# The cleaned values become the glycemic index column.
df1['glycemic_index(GI)'] = df1['prod_gin']
# Output: Categorical features' values are stripped
# The glycemic load is taken from the last two characters of the text that
# followed the '±' sign.
df1['glycemic_load'] = df1['glo'].str[-2:]
# Inspect the distinct values to spot entries that are not numbers.
df1['glycemic_load'].unique()
# Cells that are exactly 'd.' or '-' (anywhere in the frame) become missing
# values.
df1 = df1.replace('d.', np.nan).replace('-', np.nan)

# Drop the last 11 characters of the raw row text, then discard everything up
# to and including the first space (presumably a leading row number -- verify
# against the scraped rows).
df1['product_name'] = df1['all_columns'].str[:-11]
# `n` is passed by keyword: pandas 2.x accepts only `pat` positionally in
# Series.str.split, so the positional form raises TypeError there.
df1['product_name'] = df1['product_name'].str.split(' ', n=1, expand=True)[1]

# The intermediate columns are no longer needed.
df1.drop(['all_columns', 'prod_gin', 'glo'], axis=1, inplace=True)
# Count the missing values per column.
df1.isna().sum()
# Helper defined outside this file; its return value is discarded, so it
# presumably removes duplicate rows from df1 in place -- TODO confirm.
duplicate_check_remove(df1)
# Output: Number of duplicate rows before: 22
# Output: Number of duplicate rows now: 0
# Put the columns in their final order: name first, then GI, then load.
final_columns = ['product_name', 'glycemic_index(GI)', 'glycemic_load']
df1 = df1.loc[:, final_columns]

# Final view
df1.head()
# Write the GI/GL table to a CSV file and trigger a Colab download of it.
from google.colab import files

food_csv = 'Food_GI_GL.csv'
df1.to_csv(food_csv)
files.download(food_csv)