# Import jams.tsv into a pandas dataframe
import pandas as pd
# Load the tab-separated jams dump. Rows with the wrong number of fields are
# skipped and reported rather than aborting the whole load.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is the equivalent (skip the row, emit a warning).
jams = pd.read_csv('jams.tsv', sep='\t', on_bad_lines='warn')
jams.head()
# NOTE(review): the bytes literals below are bare expression statements with no
# effect at runtime. They look like parser messages captured from the read_csv
# call above (rows skipped because they had more than the expected 7 fields) and
# pasted in from notebook output -- confirm before relying on them.
b'Skipping line 7872: expected 7 fields, saw 8\nSkipping line 11730: expected 7 fields, saw 9\nSkipping line 14131: expected 7 fields, saw 8\nSkipping line 58054: expected 7 fields, saw 8\nSkipping line 58754: expected 7 fields, saw 8\n'
b'Skipping line 847129: expected 7 fields, saw 8\n'
b'Skipping line 1091153: expected 7 fields, saw 8\nSkipping line 1175375: expected 7 fields, saw 8\n'
b'Skipping line 1225935: expected 7 fields, saw 8\nSkipping line 1255357: expected 7 fields, saw 8\nSkipping line 1279671: expected 7 fields, saw 8\n'
b'Skipping line 1330675: expected 7 fields, saw 8\n'
b'Skipping line 1448033: expected 7 fields, saw 8\nSkipping line 1543893: expected 7 fields, saw 8\n'
b'Skipping line 1579569: expected 7 fields, saw 8\nSkipping line 1612448: expected 7 fields, saw 8\n'
b'Skipping line 1784588: expected 7 fields, saw 8\n'
# Array of unique user_ids (displayed for inspection only; not used below)
unique = jams.user_id.unique()
unique
# Sample 1000 rows at random (no seed, so the sample differs on every run)
sample_jams = jams.sample(n=1000)
sample_jams
# Keep each sampled user_id once. Row sampling can draw the same user several
# times; without this, the left merge below would repeat that user's entire
# history once per sampled row and inflate the export with duplicates.
sample_user_ids = sample_jams['user_id'].drop_duplicates()
# Get all entries for user_ids in sample plus drop columns
jams_cleaned = pd.merge(sample_user_ids, jams[['user_id', 'title', 'artist']], on='user_id', how='left')
jams_cleaned
# Make new song column ("<title>, by <artist>"); a missing title or artist
# propagates as a missing song value
jams_cleaned['song'] = jams_cleaned['title'] + ', by ' + jams_cleaned['artist']
# Cut out more columns
jams_cleaned = jams_cleaned[['user_id', 'song']]
jams_cleaned.head()
# Make into csv file (the DataFrame index is written as the first, unnamed column)
jams_cleaned.to_csv("jams-sample.csv")