import pandas as pd
# Load the Amazon food reviews dataset; pandas reads the zipped CSV
# directly from the URL.
reviews = pd.read_csv(
    'https://raw.githubusercontent.com/umsi-data-science/data/main/amazon_food_reviews.zip'
)
# Work on the first ten rows so the examples below stay small and fast.
reviews_sample = reviews.head(n=10)
reviews_sample
# Vectorised string methods are reached through the `.str` accessor.
reviews_sample['ProfileName'].str.lower()
reviews_sample['ProfileName'].str.upper()
reviews_sample['Summary'].str.len()

# The column Index offers the same `.str` accessor as a Series.
reviews_sample.columns
reviews_sample
reviews_sample.columns.str.lower()
(
    reviews_sample.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)

# Store the normalised labels: trimmed, lower-cased, spaces -> underscores.
reviews_sample.columns = (
    reviews_sample.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)
reviews_sample
# Preview the product identifiers (column labels are lower-case by now).
reviews_sample['productid'].head()
# Splitting produces a list of pieces for every row.
reviews_sample['productid'].str.split('00')
# Two equivalent ways to pull the second piece out of each list.
reviews_sample['productid'].str.split('00').str.get(1)
reviews_sample['productid'].str.split('00').str[1]
reviews_sample.summary
# Lower-case first so the literal 'dog' also catches 'Dog', 'DOG', ...
reviews_sample.summary.str.lower().str.replace('dog', 'health')
# Replace any of several names with a placeholder.
# - regex=True: the '|' alternation is only honoured when the pattern is
#   treated as a regular expression (pandas >= 2.0 defaults to literal
#   matching, which would search for the whole string verbatim).
# - case=False: the text has already been lower-cased, so the capitalised
#   names in the pattern would otherwise never match.
reviews_sample.summary.str.lower().str.replace(
    'Chris|Sam|Sidra|Ye Chan', '_SOMETHING_', case=False, regex=True
)
reviews_sample['summary'].head(10)
# `extract` returns the first match of each capture group per row
# (NaN where the pattern does not match).
reviews_sample['summary'].str.extract(r'(Dog)')
reviews_sample['summary']
# Alternation inside the group: capture either word.
reviews_sample['summary'].str.extract(r'(Dog|Taffy)')
# A character class makes the first letter of "taffy" case-insensitive.
reviews_sample['summary'].str.extract(r'(Dog|[Tt]affy)')
# With expand=False a single capture group comes back as a Series
# instead of a one-column DataFrame.
reviews_sample['summary'].str.extract(r'(Dog|[Tt]affy)', expand=False)
# `extractall` returns every match, one row per match.
reviews_sample['summary'].str.extractall(r'(Dog|[Tt]affy)')
reviews_sample['summary']
reviews_sample['summary'].str.extractall(r'(a)')
# Look at the first review text in full.
reviews_sample['text'].iloc[0]
# Character class for the first letter; equivalent to r'Good|good'.
pattern = r'[Gg]ood'
# `contains` finds the pattern anywhere in the string ...
reviews_sample['text'].str.contains(pattern)
# ... whereas `match` only succeeds when the pattern matches at the start.
reviews_sample['text'].str.match(pattern)
# A leading `.*` lets `match` skip over the text before the word.
pattern = r'.*[Gg]ood.*'
reviews_sample['text'].str.match(pattern)
# Back on the full dataset: count reviews that mention "vegan"/"Vegan"
# as a whole word (\b marks word boundaries).
reviews['Text'].str.contains(r'\b[Vv]egan\b').sum()
# Count reviews that contain a '<' followed later by a '>' (tag-like text).
reviews['Text'].str.contains('<.*>').sum()
# Remove each '<...>' span; the lazy `+?` stops at the first closing '>'.
reviews['text_no_html'] = reviews['Text'].str.replace('<[^<]+?>', '', regex=True)
reviews
# insert your code here