import pandas as pd
# Read the zipped Amazon food reviews CSV straight from its URL.
reviews = pd.read_csv(
    'https://raw.githubusercontent.com/umsi-data-science/data/main/amazon_food_reviews.zip'
)
# Keep just the first ten rows as a small sample for the examples below.
reviews_sample = reviews.head(n=10)
reviews_sample
Idint64
1 - 10
ProductIdobject
B006K2ZZ7K40%
B001E4KFG010%
5 others50%
0
1
B001E4KFG0
1
2
B00813GRG4
2
3
B000LQOCH0
3
4
B000UA0QIQ
4
5
B006K2ZZ7K
5
6
B006K2ZZ7K
6
7
B006K2ZZ7K
7
8
B006K2ZZ7K
8
9
B000E7L2R4
9
10
B00171APVA
# Element-wise string methods are reached through the .str accessor.
# Profile names in lower case, then in upper case.
reviews_sample['ProfileName'].str.lower()
reviews_sample['ProfileName'].str.upper()
# Number of characters in each summary.
reviews_sample['Summary'].str.len()
# Current column labels, followed by the sample itself.
reviews_sample.columns
reviews_sample
Idint64
1 - 10
ProductIdobject
B006K2ZZ7K40%
B001E4KFG010%
5 others50%
0
1
B001E4KFG0
1
2
B00813GRG4
2
3
B000LQOCH0
3
4
B000UA0QIQ
4
5
B006K2ZZ7K
5
6
B006K2ZZ7K
6
7
B006K2ZZ7K
7
8
B006K2ZZ7K
8
9
B000E7L2R4
9
10
B00171APVA
# The column Index exposes the same .str accessor as a Series.
reviews_sample.columns.str.lower()
# Preview of the full clean-up chain: trim whitespace, lower-case,
# then turn spaces into underscores.
(
    reviews_sample.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)
# Same chain, this time stored back as the sample's column labels.
reviews_sample.columns = (
    reviews_sample.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)
reviews_sample
idint64
1 - 10
productidobject
B006K2ZZ7K40%
B001E4KFG010%
5 others50%
0
1
B001E4KFG0
1
2
B00813GRG4
2
3
B000LQOCH0
3
4
B000UA0QIQ
4
5
B006K2ZZ7K
5
6
B006K2ZZ7K
6
7
B006K2ZZ7K
7
8
B006K2ZZ7K
8
9
B000E7L2R4
9
10
B00171APVA
# First few product ids.
reviews_sample['productid'].head()
# Split every id on the substring '00'; each row becomes a list of pieces.
reviews_sample['productid'].str.split('00')
# Two equivalent ways of picking the piece at position 1 of each list.
reviews_sample['productid'].str.split('00').str.get(1)
reviews_sample['productid'].str.split('00').str[1]
# The review summaries used in the following examples.
reviews_sample['summary']
# Lower-case each summary, then swap the plain substring 'dog' for 'health'.
# regex=False states explicitly that 'dog' is literal text, so the result
# does not depend on the pandas version's default for `regex`.
reviews_sample.summary.str.lower().str.replace('dog', 'health', regex=False)
# Replace any of the listed names with a placeholder.
# - The pattern relies on `|` alternation, so it has to be interpreted as a
#   regular expression; regex=True makes that explicit instead of relying on
#   the default (which emits a FutureWarning and is changing to False).
# - The summaries are lower-cased before the replacement, so the capitalised
#   names could never match case-sensitively; case=False lets them match.
reviews_sample.summary.str.lower().str.replace(
    'Chris|Sam|Sidra|Ye Chan', '_SOMETHING_', case=False, regex=True
)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: The default value of regex will change from True to False in a future version.
"""Entry point for launching an IPython kernel.
# Show the ten summaries being searched.
reviews_sample['summary'].head(10)
# Capture the text 'Dog' from each summary; rows with no match come back as NaN.
reviews_sample['summary'].str.extract(r'(Dog)')
0object
Dog20%
Missing80%
0
Dog
1
nan
2
nan
3
nan
4
nan
5
nan
6
nan
7
nan
8
nan
9
Dog
# Summaries again, for reference.
reviews_sample['summary']
# Alternation inside the capture group: capture either 'Dog' or 'Taffy'.
reviews_sample['summary'].str.extract(r'(Dog|Taffy)')
0object
Dog20%
Taffy10%
Missing70%
0
Dog
1
nan
2
nan
3
nan
4
nan
5
Taffy
6
nan
7
nan
8
nan
9
Dog
# [Tt] accepts an upper- or lower-case first letter, so 'taffy' is captured too.
reviews_sample['summary'].str.extract(r'(Dog|[Tt]affy)')
0object
Dog20%
2 others30%
Missing50%
0
Dog
1
nan
2
nan
3
nan
4
taffy
5
Taffy
6
nan
7
taffy
8
nan
9
Dog
# With expand=False the single capture group is returned as a Series.
reviews_sample['summary'].str.extract(r'(Dog|[Tt]affy)', expand=False)
# extractall reports every match, one row per match, indexed by
# (original row, match number).
reviews_sample['summary'].str.extractall(r'(Dog|[Tt]affy)')
0object
(0, 0)
Dog
(4, 0)
taffy
(5, 0)
Taffy
(7, 0)
taffy
(9, 0)
Dog
# Summaries again, for reference.
reviews_sample['summary']
# Every occurrence of the letter 'a'; a summary with several 'a's yields several rows.
reviews_sample['summary'].str.extractall(r'(a)')
0object
a100%
(0, 0)
a
(1, 0)
a
(2, 0)
a
(2, 1)
a
(4, 0)
a
(4, 1)
a
(5, 0)
a
(6, 0)
a
(6, 1)
a
(6, 2)
a
# Full text of the first review in the sample.
reviews_sample['text'].iloc[0]
# Character class: accepts 'Good' or 'good' (equivalent to r'Good|good').
pattern = r'[Gg]ood'
# contains: does the pattern occur anywhere in the text?
reviews_sample['text'].str.contains(pattern)
# match: does the text start with the pattern?
reviews_sample['text'].str.match(pattern)
# Wrapping the word in .* lets match succeed when it appears later in the text.
pattern = r'.*[Gg]ood.*'
reviews_sample['text'].str.match(pattern)
# Count reviews in the full data set that mention 'vegan' / 'Vegan' as a whole word.
reviews['Text'].str.contains(r'\b[Vv]egan\b').sum()
# Count reviews whose text has a '<' followed later by a '>' (HTML-like markup).
reviews['Text'].str.contains(r'<.*>').sum()
# New column holding the review text with tag-like '<...>' spans removed.
reviews['text_no_html'] = reviews['Text'].str.replace(
    r'<[^<]+?>', '', regex=True
)
reviews
Idint64
1 - 99999
ProductIdobject
0
1
B001E4KFG0
1
2
B00813GRG4
2
3
B000LQOCH0
3
4
B000UA0QIQ
4
5
B006K2ZZ7K
5
6
B006K2ZZ7K
6
7
B006K2ZZ7K
7
8
B006K2ZZ7K
8
9
B000E7L2R4
9
10
B00171APVA
# insert your code here