from csv import reader
def get_dataset(file_path, with_header=True):
    """Read a CSV file fully into memory as a list of rows.

    Args:
        file_path: Path of the CSV file to read.
        with_header: When true, the first row is returned separately from
            the remaining rows. When false, only the remaining rows are
            returned (the first row is still dropped).

    Returns:
        ``(first_row, remaining_rows)`` when ``with_header`` is true,
        otherwise just ``remaining_rows``. Every row is a list of strings.

    Raises:
        IndexError: If the file has no rows and ``with_header`` is true.
    """
    # The context manager closes the handle even if parsing fails, and
    # newline="" lets the csv reader do its own line-ending handling so that
    # newlines embedded in quoted fields are read correctly.
    with open(file_path, newline="") as opened_file:
        dataset = list(reader(opened_file))
    if with_header:
        return dataset[0], dataset[1:]
    return dataset[1:]
# Load both store exports, keeping each file's first row (the column names)
# separate from its data rows.
android_header, android_data = get_dataset("googleplaystore.csv")
ios_header, ios_data = get_dataset("AppleStore.csv")
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of rows and, optionally, the dataset's dimensions.

    Args:
        dataset: Sequence of rows, each row itself a sequence.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows
            and the number of columns (taken from the first row).

    Returns:
        None. All output goes to standard output.
    """
    for row in dataset[start:end]:
        print(row)
    if rows_and_columns:
        num_rows = len(dataset)
        # An empty dataset has no first row to measure; report 0 columns
        # instead of failing with IndexError.
        num_columns = len(dataset[0]) if num_rows else 0
        print("\n")
        print("Number of rows:", num_rows)
        print("Number of columns:", num_columns)
# Show the Android column names, then the first three data rows together
# with the dataset's row and column counts.
print(android_header, "\n")
explore_data(android_data, 0, 3, True)
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']
['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']
['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']
Number of rows: 10841
Number of columns: 13
# Show the iOS column names, then the first three data rows together with
# the dataset's row and column counts.
print(ios_header, "\n")
explore_data(ios_data, 0, 3, True)
['', 'id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
['1', '281656475', 'PAC-MAN Premium', '100788224', 'USD', '3.99', '21292', '26', '4', '4.5', '6.3.5', '4+', 'Games', '38', '5', '10', '1']
['2', '281796108', 'Evernote - stay organized', '158578688', 'USD', '0', '161065', '26', '4', '3.5', '8.2.2', '4+', 'Productivity', '37', '5', '23', '1']
['3', '281940292', 'WeatherBug - Local Weather, Radar, Maps, Alerts', '100524032', 'USD', '0', '188583', '2822', '3.5', '4.5', '5.0.0', '4+', 'Weather', '37', '5', '3', '1']
Number of rows: 7197
Number of columns: 17
# Inspect the Android row at index 10472; its recorded output has 12 fields
# while the header has 13.
print(android_data[10472])
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
# Print the header next to a well-formed row and row 10472 so the missing
# field in the latter can be seen by comparison.
print(android_header, "\n")
print(android_data[0]) # correct format
print(android_data[10472]) # incorrect format
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
# Remove the malformed row from the Android data.
# NOTE: the deletion is by position, so executing this statement a second
# time would remove whichever row has shifted into index 10472.
del android_data[10472]
def find_duplicate_apps(dataset, index_of_name_column):
    """Split the app names in a dataset into first sightings and repeats.

    Args:
        dataset: Iterable of rows.
        index_of_name_column: Position of the app name within each row.

    Returns:
        ``(duplicate_apps, unique_apps)``. ``unique_apps`` lists each
        distinct name once, in order of first appearance. ``duplicate_apps``
        holds one entry for every additional occurrence of a name, so a name
        seen three times appears there twice.
    """
    duplicate_apps = []
    unique_apps = []
    # Membership is checked against a set so the scan stays linear; the list
    # is kept alongside it because callers receive the names in order.
    seen_names = set()
    for app in dataset:
        name = app[index_of_name_column]
        if name in seen_names:
            duplicate_apps.append(name)
        else:
            seen_names.add(name)
            unique_apps.append(name)
    return duplicate_apps, unique_apps
# Column 2 is the iOS app name (track_name); column 0 is the Android app
# name (App).
ios_duplicate_apps, ios_unique_apps = find_duplicate_apps(ios_data, 2)
android_duplicate_apps, android_unique_apps = find_duplicate_apps(android_data, 0)
print("Number of duplicate iOS apps:", len(ios_duplicate_apps))
# Label corrected: this count covers the Android dataset only.
print("Number of duplicate Android apps:", len(android_duplicate_apps))
Number of duplicate iOS apps: 2
Number of duplicate android iOS apps: 1181
# Print every iOS row whose name (column 2) equals the first name recorded
# in ios_duplicate_apps, so the repeated entries can be compared.
print(ios_header)
for app in ios_data:
    if app[2] == ios_duplicate_apps[0]:
        print(app)
print("\n")
# Same comparison for Android, where the app name is column 0.
print(android_header)
for app in android_data:
    if app[0] == android_duplicate_apps[0]:
        print(app)
['', 'id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
['4000', '952877179', 'VR Roller Coaster', '169523200', 'USD', '0', '107', '102', '3.5', '3.5', '2.0.0', '4+', 'Games', '37', '5', '1', '1']
['7579', '1089824278', 'VR Roller Coaster', '240964608', 'USD', '0', '67', '44', '3.5', '4', '0.81', '4+', 'Games', '38', '0', '1', '1']
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80805', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80805', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80804', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']
def filter_duplicates_by_review(dataset, name_column_index, reviews_column_index):
    """Keep one row per app name: the one with the highest review count.

    Args:
        dataset: Sequence of rows; it is traversed twice.
        name_column_index: Position of the app name within each row.
        reviews_column_index: Position of the review count within each row;
            the value must be convertible with ``float``.

    Returns:
        A new list containing, for every distinct name, the first row whose
        review count equals that name's maximum. Rows keep their original
        relative order and the input is not modified.

    Raises:
        ValueError: If a review count cannot be converted to ``float``.
    """
    # Pass 1: highest review count seen for each name.
    reviews_max = {}
    for app in dataset:
        name = app[name_column_index]
        num_reviews = float(app[reviews_column_index])
        if name not in reviews_max or reviews_max[name] < num_reviews:
            reviews_max[name] = num_reviews

    # Pass 2: take the first row that reaches the maximum for its name.
    # A set keeps the "already taken" check constant-time; it also handles
    # several rows tying on the maximum.
    unique_data = []
    already_added = set()
    for app in dataset:
        name = app[name_column_index]
        if name in already_added:
            continue
        if float(app[reviews_column_index]) == reviews_max[name]:
            unique_data.append(app)
            already_added.add(name)
    return unique_data
# De-duplicate the Android rows (name in column 0, review count in column 3)
# and compare the resulting length with the one predicted from the list of
# duplicate names.
android_data_uniques = filter_duplicates_by_review(android_data, 0, 3)
print("Original Android Length:", len(android_data))
print("Expected Android Length:", len(android_data) - len(android_duplicate_apps))
print("Uniques Android length:", len(android_data_uniques))
print("\n")
# Same for iOS (name in column 2, rating_count_tot in column 6).
ios_data_uniques = filter_duplicates_by_review(ios_data, 2, 6)
print("Original iOS length:", len(ios_data))
print("Expected iOS length:", len(ios_data) - len(ios_duplicate_apps))
print("Uniques iOS length:", len(ios_data_uniques))
Original Android Length: 10840
Expected Android Length: 9659
Uniques Android length: 9659
Original iOS length: 7197
Expected iOS length: 7195
Uniques iOS length: 7195
def is_app_directed_at_english_speakers(app_name):
    """Tell whether a name has at most two characters outside ASCII.

    Args:
        app_name: The app name to examine.

    Returns:
        False when more than two characters have a code point above 127,
        True otherwise.
    """
    non_ascii_total = sum(1 for symbol in app_name if ord(symbol) > 127)
    return non_ascii_total <= 2
def filter_non_english_apps(dataset, name_index):
    """Return the rows whose name passes is_app_directed_at_english_speakers.

    Args:
        dataset: Iterable of rows.
        name_index: Position of the app name within each row.

    Returns:
        A new list with the accepted rows in their original order.
    """
    return [
        row
        for row in dataset
        if is_app_directed_at_english_speakers(row[name_index])
    ]
# Apply the English-name filter to the de-duplicated data (iOS name is
# column 2, Android name is column 0) and report how many rows survive.
ios_data_english = filter_non_english_apps(ios_data_uniques, 2)
android_data_english = filter_non_english_apps(android_data_uniques, 0)
print("Android unique apps:", len(android_data_uniques))
print("Android unique, English apps:", len(android_data_english))
print("iOS unique apps:", len(ios_data_uniques))
print("iOS unique, English apps:", len(ios_data_english))
Android unique apps: 9659
Android unique, English apps: 9597
iOS unique apps: 7195
iOS unique, English apps: 6153
def isolate_free_apps(dataset, price_column):
    """Return the rows whose price is zero.

    Args:
        dataset: Iterable of rows.
        price_column: Position of the price within each row. The value is a
            string; any "$" is removed before it is converted with float.

    Returns:
        A new list with the zero-priced rows in their original order.
    """
    def _costs_nothing(row):
        return float(row[price_column].replace("$", "")) == 0

    return [row for row in dataset if _costs_nothing(row)]
# Keep only the zero-priced apps (Android Price is column 7, iOS price is
# column 5) and report the counts before and after.
android_data_free = isolate_free_apps(android_data_english, 7)
ios_data_free = isolate_free_apps(ios_data_english, 5)
print("Android free apps:", len(android_data_free))
print("Android English apps:", len(android_data_english))
print("iOS free apps:", len(ios_data_free))
print("iOS English apps:", len(ios_data_english))
Android free apps: 8848
Android English apps: 9597
iOS free apps: 3201
iOS English apps: 6153
# functions we'll need for later
def freq_table(dataset, index):
    """Compute how often each value of one column occurs, as percentages.

    Args:
        dataset: Sized collection of rows.
        index: Position of the column to tabulate.

    Returns:
        A dict mapping each distinct column value to its share of all rows,
        expressed as a percentage rounded to two decimals. Keys appear in
        order of first occurrence.
    """
    occurrences = {}
    for row in dataset:
        value = row[index]
        occurrences[value] = occurrences.get(value, 0) + 1
    return {
        value: round((count / len(dataset)) * 100, 2)
        for value, count in occurrences.items()
    }
def display_table(dataset, index, limit=10):
    """Print the most frequent values of a column with their percentages.

    Args:
        dataset: Sized collection of rows.
        index: Position of the column to tabulate.
        limit: Maximum number of lines to print.

    Returns:
        None. One "value : percentage%" line is printed per entry, highest
        percentage first; equal percentages are ordered by value, descending.
    """
    shares = freq_table(dataset, index)
    # Putting the percentage first makes tuple ordering sort by it.
    ranked = sorted(
        ((share, value) for value, share in shares.items()),
        reverse=True,
    )
    for share, value in ranked[:limit]:
        print(value, ":", str(share) + "%")
def display_popularity_by_genre(dataset, genre_index, popularity_index, formatter_cb=None):
    """Print the average popularity metric of each genre, highest first.

    Args:
        dataset: Iterable of rows.
        genre_index: Position of the genre within each row.
        popularity_index: Position of the popularity metric within each row.
        formatter_cb: Optional callable that turns the raw metric into a
            number. When omitted, the raw value is converted with ``float``.

    Returns:
        None. One "genre : average | count apps" line is printed per genre,
        where the average is rounded to the nearest integer. Lines are
        sorted by average, then genre, then count, all descending.
    """
    to_number = float if formatter_cb is None else formatter_cb

    # One pass collects the running total and row count of every genre,
    # instead of rescanning the whole dataset once per genre.
    totals = {}
    counts = {}
    for app in dataset:
        genre = app[genre_index]
        totals[genre] = totals.get(genre, 0) + to_number(app[popularity_index])
        counts[genre] = counts.get(genre, 0) + 1

    popularity_display = [
        (round(totals[genre] / counts[genre]), genre, counts[genre])
        for genre in totals
    ]
    for average, genre, num_apps in sorted(popularity_display, reverse=True):
        print(genre, ":", average, "|", num_apps, "apps")
# Print the share of free iOS apps per prime_genre (column 12).
# NOTE: display_table has no return statement, so ios_genre_freq_table is
# always None; the call matters only for what it prints.
ios_genre_freq_table = display_table(ios_data_free, 12)
Games : 58.23%
Entertainment : 7.84%
Photo & Video : 5.0%
Education : 3.69%
Social Networking : 3.31%
Shopping : 2.59%
Utilities : 2.47%
Sports : 2.16%
Music : 2.06%
Health & Fitness : 2.03%
# Print the share of free Android apps per Genres value (column 9).
display_table(android_data_free, 9)
Tools : 8.44%
Entertainment : 6.08%
Education : 5.36%
Business : 4.6%
Productivity : 3.9%
Lifestyle : 3.88%
Finance : 3.71%
Medical : 3.54%
Sports : 3.46%
Personalization : 3.32%
# Print the share of free Android apps per Category value (column 1).
display_table(android_data_free, 1)
FAMILY : 18.94%
GAME : 9.7%
TOOLS : 8.45%
BUSINESS : 4.6%
PRODUCTIVITY : 3.9%
LIFESTYLE : 3.89%
FINANCE : 3.71%
MEDICAL : 3.54%
SPORTS : 3.39%
PERSONALIZATION : 3.32%
# Average rating_count_tot (column 6) of free iOS apps per prime_genre
# (column 12). No formatter is passed, so the counts are converted with float.
display_popularity_by_genre(ios_data_free, 12, 6)
Navigation : 86090 | 6 apps
Reference : 79350 | 17 apps
Social Networking : 71548 | 106 apps
Music : 57327 | 66 apps
Weather : 52280 | 28 apps
Book : 46385 | 12 apps
Food & Drink : 33334 | 26 apps
Finance : 32367 | 35 apps
Photo & Video : 28442 | 160 apps
Travel : 28244 | 40 apps
Shopping : 27231 | 83 apps
Health & Fitness : 23298 | 65 apps
Sports : 23009 | 69 apps
Games : 22911 | 1864 apps
News : 21248 | 43 apps
Productivity : 21028 | 56 apps
Utilities : 19156 | 79 apps
Lifestyle : 16815 | 50 apps
Entertainment : 14195 | 251 apps
Business : 7491 | 17 apps
Education : 7004 | 118 apps
Catalogs : 4004 | 4 apps
Medical : 612 | 6 apps
# Print the share of free Android apps per Installs bucket (column 5).
display_table(android_data_free, 5)
1,000,000+ : 15.75%
100,000+ : 11.54%
10,000,000+ : 10.57%
10,000+ : 10.19%
1,000+ : 8.4%
100+ : 6.93%
5,000,000+ : 6.83%
500,000+ : 5.56%
50,000+ : 4.77%
5,000+ : 4.49%
def format_installs(installs):
    """Convert an install-count label such as "1,000+" to a float.

    Args:
        installs: The label as a string.

    Returns:
        The numeric value after removing every "+" and "," character.

    Raises:
        ValueError: If what remains is not a valid float literal.
    """
    return float(installs.replace("+", "").replace(",", ""))
# Average Installs (column 5) of free Android apps per Genres value
# (column 9); format_installs turns labels such as "1,000+" into numbers.
display_popularity_by_genre(android_data_free, 9, 5, format_installs)
Communication : 38590581 | 286 apps
Adventure;Action & Adventure : 35333333 | 3 apps
Video Players & Editors : 24947336 | 157 apps
Social : 23253652 | 236 apps
Arcade : 23028171 | 163 apps
Casual : 19569222 | 156 apps
Puzzle;Action & Adventure : 18366667 | 3 apps
Photography : 17840110 | 261 apps
Educational;Action & Adventure : 17016667 | 3 apps
Productivity : 16787331 | 345 apps
Racing : 15910646 | 88 apps
Travel & Local : 14051476 | 206 apps
Casual;Action & Adventure : 12916667 | 12 apps
Action : 12467106 | 274 apps
Strategy : 11199903 | 81 apps
Tools : 10831363 | 747 apps
Tools;Education : 10000000 | 1 apps
Role Playing;Brain Games : 10000000 | 1 apps
Lifestyle;Pretend Play : 10000000 | 1 apps
Casual;Music & Video : 10000000 | 1 apps
Card;Action & Adventure : 10000000 | 1 apps
Adventure;Education : 10000000 | 1 apps
News & Magazines : 9549178 | 248 apps
Music : 9445583 | 18 apps
Educational;Pretend Play : 9375000 | 8 apps
Puzzle;Brain Games : 9280667 | 15 apps
Word : 9094459 | 23 apps
Racing;Action & Adventure : 8816667 | 15 apps
Books & Reference : 8814200 | 189 apps
Puzzle : 8302862 | 100 apps
Video Players & Editors;Music & Video : 7500000 | 2 apps
Shopping : 7036877 | 199 apps
Role Playing;Action & Adventure : 7000000 | 3 apps
Casual;Pretend Play : 6957143 | 21 apps
Entertainment;Music & Video : 6413333 | 15 apps
Action;Action & Adventure : 5888889 | 9 apps
Entertainment : 5602793 | 538 apps
Education;Brain Games : 5333333 | 3 apps
Casual;Creativity : 5333333 | 6 apps
Role Playing;Pretend Play : 5275000 | 4 apps
Personalization : 5201483 | 294 apps
Weather : 5145550 | 70 apps
Sports;Action & Adventure : 5050000 | 2 apps
Music;Music & Video : 5050000 | 2 apps
Video Players & Editors;Creativity : 5000000 | 1 apps
Simulation;Action & Adventure : 4857143 | 7 apps
Education;Education : 4759517 | 30 apps
Board : 4759209 | 34 apps
Sports : 4611702 | 306 apps
Educational;Brain Games : 4433333 | 6 apps
Health & Fitness : 4188822 | 273 apps
Adventure : 4158765 | 59 apps
Maps & Navigation : 4049275 | 123 apps
Entertainment;Creativity : 4000000 | 3 apps
Role Playing : 3965645 | 83 apps
Card : 3815462 | 40 apps
Casino : 3520422 | 37 apps
Trivia : 3475713 | 37 apps
Simulation : 3475484 | 181 apps
Entertainment;Brain Games : 3314286 | 7 apps
Arcade;Action & Adventure : 3190909 | 11 apps
Entertainment;Pretend Play : 3000000 | 2 apps
Board;Action & Adventure : 3000000 | 2 apps
Education;Creativity : 2875000 | 4 apps
Entertainment;Action & Adventure : 2333333 | 3 apps
Educational;Creativity : 2333333 | 3 apps
Art & Design : 2122851 | 53 apps
Education;Music & Video : 2033333 | 3 apps
Food & Drink : 1924898 | 110 apps
Education;Pretend Play : 1800000 | 5 apps
Educational;Education : 1737143 | 35 apps
Business : 1712290 | 407 apps
Casual;Brain Games : 1425917 | 12 apps
Lifestyle : 1421220 | 343 apps
Finance : 1387692 | 328 apps
House & Home : 1360598 | 71 apps
Parenting;Music & Video : 1118333 | 6 apps
Strategy;Creativity : 1000000 | 1 apps
Strategy;Action & Adventure : 1000000 | 1 apps
Racing;Pretend Play : 1000000 | 1 apps
Parenting;Brain Games : 1000000 | 1 apps
Health & Fitness;Action & Adventure : 1000000 | 1 apps
Entertainment;Education : 1000000 | 1 apps
Education;Action & Adventure : 1000000 | 3 apps
Casual;Education : 1000000 | 2 apps
Arcade;Pretend Play : 1000000 | 1 apps
Dating : 854029 | 165 apps
Comics : 847380 | 53 apps
Puzzle;Creativity : 750000 | 2 apps
Auto & Vehicles : 647318 | 82 apps
Libraries & Demo : 638504 | 83 apps
Education : 550185 | 474 apps
Simulation;Pretend Play : 550000 | 2 apps
Beauty : 513152 | 53 apps
Strategy;Education : 500000 | 1 apps
Music & Audio;Music & Video : 500000 | 1 apps
Communication;Creativity : 500000 | 1 apps
Art & Design;Pretend Play : 500000 | 1 apps
Parenting : 467978 | 44 apps
Parenting;Education : 452857 | 7 apps
Educational : 411185 | 33 apps
Board;Brain Games : 407143 | 7 apps
Art & Design;Creativity : 285000 | 6 apps
Events : 253542 | 63 apps
Medical : 120551 | 313 apps
Travel & Local;Action & Adventure : 100000 | 1 apps
Puzzle;Education : 100000 | 1 apps
Lifestyle;Education : 100000 | 1 apps
Health & Fitness;Education : 100000 | 1 apps
Art & Design;Action & Adventure : 100000 | 1 apps
Comics;Creativity : 50000 | 1 apps
Books & Reference;Education : 1000 | 1 apps
Simulation;Education : 500 | 1 apps
Trivia;Education : 100 | 1 apps
# Average Installs (column 5) of free Android apps per Category value
# (column 1); format_installs turns labels such as "1,000+" into numbers.
display_popularity_by_genre(android_data_free, 1, 5, format_installs)
COMMUNICATION : 38590581 | 286 apps
VIDEO_PLAYERS : 24727872 | 159 apps
SOCIAL : 23253652 | 236 apps
PHOTOGRAPHY : 17840110 | 261 apps
PRODUCTIVITY : 16787331 | 345 apps
GAME : 15544015 | 858 apps
TRAVEL_AND_LOCAL : 13984078 | 207 apps
ENTERTAINMENT : 11640706 | 85 apps
TOOLS : 10830252 | 748 apps
NEWS_AND_MAGAZINES : 9549178 | 248 apps
BOOKS_AND_REFERENCE : 8814200 | 189 apps
SHOPPING : 7036877 | 199 apps
PERSONALIZATION : 5201483 | 294 apps
WEATHER : 5145550 | 70 apps
HEALTH_AND_FITNESS : 4188822 | 273 apps
MAPS_AND_NAVIGATION : 4049275 | 123 apps
FAMILY : 3695642 | 1676 apps
SPORTS : 3650602 | 300 apps
ART_AND_DESIGN : 1986335 | 57 apps
FOOD_AND_DRINK : 1924898 | 110 apps
EDUCATION : 1833495 | 103 apps
BUSINESS : 1712290 | 407 apps
LIFESTYLE : 1446158 | 344 apps
FINANCE : 1387692 | 328 apps
HOUSE_AND_HOME : 1360598 | 71 apps
DATING : 854029 | 165 apps
COMICS : 832614 | 54 apps
AUTO_AND_VEHICLES : 647318 | 82 apps
LIBRARIES_AND_DEMO : 638504 | 83 apps
PARENTING : 542604 | 58 apps
BEAUTY : 513152 | 53 apps
EVENTS : 253542 | 63 apps
MEDICAL : 120551 | 313 apps