# Install mrjob library. This package is for running MapReduce jobs with Python
# In Jupyter notebooks, a leading "!" runs the rest of the line as a terminal command
# Exercise 1
# use the pip package manager
# <your code here/>
! pip install mrjob
Collecting mrjob
Downloading mrjob-0.7.4-py2.py3-none-any.whl (439 kB)
|████████████████████████████████| 439 kB 8.1 MB/s
Requirement already satisfied: PyYAML>=3.10 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from mrjob) (5.4.1)
Installing collected packages: mrjob
Successfully installed mrjob-0.7.4
import random

# The song titles that samples are drawn from.
contenders = ['Kill The Love', 'Dynamite', 'Likey']

# Exercise 2: generate 10 songs chosen randomly from contenders
# use random.choices() to draw samples with replacement (a title may repeat)
# there is an argument 'k' that can be used to specify how many samples to draw
# <your code here/>
# The result is not assigned to anything; it is only there to be displayed.
random.choices(contenders, k=10)

# Exercise 3.1: Generate a list of 2000 songs
# <your code here/>
song_list = random.choices(contenders, k=2000)

# Exercise 3.2: write the song list to a file, each entry separated by newline
# <your code here/>
# The encoding is pinned to UTF-8 so the file's bytes do not depend on the
# platform's default text encoding.
with open('songs.txt', 'w', encoding='utf-8') as myfile:
    # One title per line; no newline is written after the last title.
    myfile.write('\n'.join(song_list))
%%file wordcount.py
# %%file is an IPython cell magic that saves the contents of the code cell to a file
from mrjob.job import MRJob # import the mrjob library
class MRSongCount(MRJob):
    """MapReduce job that counts how many times each song appears.

    Every input line is treated as one song name. The job yields one
    (song name, total count) pair per distinct song name.
    """

    # the map step: each line in the txt file is read as a key, value pair
    # in this case, each line in the txt file only contains a value but no key
    # _ means that in this case, there is no key for each line
    def mapper(self, _, song):
        """Yield (song, 1) for a single input line."""
        # output each line as a tuple of (song_names, 1)
        yield (song, 1)

    # the combine step: an optional optimisation that adds up the 1s for a
    # song before they reach the reducer, so fewer pairs have to be passed on
    # the reducer sums these partial sums again, so the totals are unchanged
    def combiner(self, key, values):
        """Yield (key, partial sum of values) for one song name."""
        yield (key, sum(values))

    # the reduce step: combine all tuples with the same key
    # in this case, the key is the song name
    # then sum all the values of the tuple, which will give the total song plays
    def reducer(self, key, values):
        """Yield (key, sum of values), i.e. the total plays for one song."""
        yield (key, sum(values))


# Only start the job when the file is executed as a script.
if __name__ == "__main__":
    MRSongCount.run()
Writing wordcount.py
# Exercise 4: run the code as a terminal command
# <your code here/>
! python wordcount.py songs.txt
No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/wordcount.root.20210609.080733.176204
Running step 1 of 1...
job output is in /tmp/wordcount.root.20210609.080733.176204/output
Streaming final output from /tmp/wordcount.root.20210609.080733.176204/output...
"Dynamite" 681
"Kill The Love" 638
"Likey" 681
Removing temp directory /tmp/wordcount.root.20210609.080733.176204...