from csv import reader
# Load the Android (Google Play) dataset: first CSV row is the header, the
# remaining rows are the app records.
# The file is opened with an explicit UTF-8 encoding (app names contain
# non-ASCII characters) and newline="" as the csv module recommends; the
# `with` block guarantees the handle is closed once the rows are in memory.
with open(
    "/Users/midl/Documents/_Dataquest/mydatasets/project/googleplaystore.csv",
    encoding="utf8",
    newline="",
) as openfile:
    readfile = reader(openfile)
    playstore_tt = list(readfile)
playstore_header = playstore_tt[0]
play_data = playstore_tt[1:]

# Load the iOS (App Store) dataset the same way.
with open(
    "/Users/midl/Documents/_Dataquest/mydatasets/project/AppleStore.csv",
    encoding="utf8",
    newline="",
) as openfile:
    readfile = reader(openfile)
    istore_tt = list(readfile)
istore_header = istore_tt[0]
i_data = istore_tt[1:]
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a dataset and, optionally, its dimensions.

    Args:
        dataset: list of rows (each row a list), without the header row.
        start: index of the first row to print.
        end: index one past the last row to print.
        rows_and_columns: when true, also print the number of rows and the
            number of columns (taken from the first row).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # blank separator after every row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure; report 0 columns
        # instead of raising IndexError.
        print('Numner of columnns', len(dataset[0]) if dataset else 0)
# Preview the first four rows of each dataset together with its dimensions.
print("\n---Android apps: --- \n")
explore_data(play_data, 0, 4, rows_and_columns=True)
print("---iOS apps: --- \n")
explore_data(i_data, 0, 4, rows_and_columns=True)

# Show both headers to identify the columns that can help the analysis.
for label, header in (
    ("Android columns names: ", playstore_header),
    ("iOS columns names: ", istore_header),
):
    print(label, "\n", header, "\n")

# Spot-check, on the second row of each dataset, the price column
# (Android: 7, iOS: 4) and then the app-name column (Android: 0, iOS: 1),
# the latter to see whether names are in English.
for dataset, column in ((play_data, 7), (i_data, 4), (play_data, 0), (i_data, 1)):
    print(dataset[1][column])

# Compare the Android header with row 10472, both content and length.
print(playstore_header)
suspect_row = play_data[10472]
print(suspect_row)
print(len(playstore_header))
print(len(suspect_row))

# The row could be removed directly with `del play_data[10472]`, but that
# statement may only run once (running it again would delete a different
# row), so it is intentionally left out here. The row-length check that
# follows removes the row in a guarded way instead.
# Look for rows whose length differs from the expected column count (the
# counts come from the explore_data() output) and drop the offender when
# there is exactly one.

# Android: 13 columns expected.
i = []  # malformed rows
m = []  # indexes of the malformed rows
for j, each in enumerate(play_data):
    if len(each) != 13:
        i.append(each)
        m.append(j)
print(i, m)  # two empty lists mean no row has this problem
print(len(play_data))  # row count before the removal (initially 10841)
if len(m) == 1:
    del play_data[m[0]]
print(len(play_data))  # row count after: at most one row may have gone

# iOS: 16 columns expected.
i = []
m = []
for j, each in enumerate(i_data):
    if len(each) != 16:
        i.append(each)
        m.append(j)
print(i, m)  # two empty lists mean no row has this problem
print(len(i_data))  # row count before the removal (initially 7197)
if len(m) == 1:
    # The index was found in i_data, so the deletion must target i_data too
    # (the previous code deleted from play_data here).
    del i_data[m[0]]
print(len(i_data))  # row count after: at most one row may have gone
# Collect the row indexes of the non-free apps in each dataset (candidates
# for removal).

# Android: column 6 is compared against the literal "Free"
# (NOTE(review): presumably the Free/Paid type flag -- the price itself is
# printed from column 7 earlier; confirm against the header).
play_nonfree = [j for j, each in enumerate(play_data) if each[6] != "Free"]
print(len(play_nonfree))  # number of Android rows that are not free

# iOS: column 4 is the numeric price; anything other than 0 is non-free.
# (The previous condition was `== 0.0`, which collected the free apps.)
i_nonfree = [j for j, each in enumerate(i_data) if float(each[4]) != 0.0]
print(len(i_nonfree))  # number of iOS rows that are not free
# Inspect the duplicated Android entries of two sample apps.

# Instagram: print every entry and remember its highest review count
# (column 3); 0 when the app does not appear at all.
instagram_rows = [each for each in play_data if each[0] == "Instagram"]
for each in instagram_rows:
    print(each)
max_rev_instagram = max([0, *(int(each[3]) for each in instagram_rows)])
print("\n", "Max reviews for Instagram: ", max_rev_instagram)

# ZEDGE: print every entry to compare the duplicates by eye.
for each in play_data:
    if each[0] == "ZEDGE™ Ringtones & Wallpapers":
        print(each)
print("\n", "We have 3 entries with max reviews of this app, and those entries are completely equals")
# Android: count how often each app name occurs, list every repeated
# occurrence, and keep the highest review count seen per name.
play_name_freq = {}
duplicate_app = []
reviews_max = {}
for row in play_data:
    app_name = str(row[0])
    review_count = int(row[3])
    if app_name not in play_name_freq:
        # First time this name shows up.
        play_name_freq[app_name] = 1
        reviews_max[app_name] = review_count
        continue
    play_name_freq[app_name] += 1
    duplicate_app.append(app_name)
    reviews_max[app_name] = max(reviews_max[app_name], review_count)

print("\n", "ANDROID: ", "\n")
print("-> Examples of apps with dupplicate rows: ", duplicate_app[:5], "\n")
print("-> Total of duplicate apps' rows: ", len(duplicate_app), "\n")

# Sanity check: the dictionary must agree with the Instagram maximum that was
# computed by hand earlier.
instagram_matches = max_rev_instagram == reviews_max["Instagram"]
print("Does it works for Instagram example?", instagram_matches, "\n")

# Sanity check: one dictionary entry per app once duplicates are discounted.
expected_unique = len(play_data) - len(duplicate_app)
print("-> Expected total of apps without dupplicate rows: ", expected_unique)
print("-> Current total of apps without dupplicate rows: ", len(reviews_max), "\n")
print("Is the previsous expected number same as current number?", len(reviews_max) == expected_unique)
# iOS: count how often each app name occurs and list every repeated
# occurrence.
# The app name is read from column 1, as everywhere else in this script for
# the iOS dataset (the previous code read column 0 here, a leftover from the
# Android version of this loop).
i_name_freq = {}
duplicate_app = []
for each in i_data:
    name = str(each[1])
    if name in i_name_freq:
        i_name_freq[name] += 1
        duplicate_app.append(name)
    else:
        i_name_freq[name] = 1
print("\n", "iOS: ", "\n")
print("Examples of apps with dupplicate rows: ", duplicate_app[:5], "\n")
print("Total of duplicate apps' rows: ", len(duplicate_app), "\n")
# Build the de-duplicated Android dataset: for every app name keep the first
# row whose review count equals the maximum recorded in reviews_max.
# TODO: find out why some duplicates share the same review count and whether
# another tie-breaking criterion is needed.
android_cleaned = []
already_added = []  # names in the order they were kept
seen_names = set()  # same names, for constant-time membership tests
for each in play_data:
    name = str(each[0])
    n_reviews = int(each[3])
    if n_reviews == reviews_max[name] and name not in seen_names:
        android_cleaned.append(each)
        already_added.append(name)
        seen_names.add(name)
print(len(android_cleaned))

# Preview the first four rows of the cleaned dataset.
print("\n" + "---Android apps: --- "+'\n')
explore_data(android_cleaned,0, 4, True)
from pprint import pprint

# Rebuild the per-app statistics for the raw Android data, then look at one
# duplicated app in detail.
# The dictionaries used to be reset to {} and then indexed straight away,
# which always raised KeyError; they are now filled before being read.
play_name_freq = {}  # app name -> number of rows
duplicate_app = []   # one entry per repeated occurrence of a name
list_dup_rev = {}    # app name -> last row carrying the maximum review count
reviews_max = {}     # app name -> highest review count
for each in play_data:
    name = str(each[0])
    n_reviews = int(each[3])
    if name in play_name_freq:
        play_name_freq[name] += 1
        duplicate_app.append(name)
    else:
        play_name_freq[name] = 1
    # `>=` so that, among rows tied on the maximum, the last one is kept.
    if name not in reviews_max or n_reviews >= reviews_max[name]:
        reviews_max[name] = n_reviews
        list_dup_rev[name] = each

for each in play_data:
    if each[0] == "ZEDGE™ Ringtones & Wallpapers":
        print(each)
# .get() prints None instead of failing if the app is absent from the data.
print(play_name_freq.get('ZEDGE™ Ringtones & Wallpapers'))
print(reviews_max.get('ZEDGE™ Ringtones & Wallpapers'))
print(list_dup_rev.get('ZEDGE™ Ringtones & Wallpapers'))
# Print a few app names picked by row index from each dataset.
print("Android dataset")
android_sample = android_cleaned[4412]
print(android_sample[0])
android_sample = android_cleaned[7940]
print(android_sample[0], "\n")

print("iOS dataset")
ios_sample = i_data[813]
print(ios_sample[1])
ios_sample = i_data[6731]
print(ios_sample[1])

print("---First vesion of function to check if we have an english name:---", "\n")
def english_v1(string):
    """Return True when every character of ``string`` has a code point <= 127.

    A single character above that range makes the result False; an empty
    string yields True.
    """
    return all(ord(character) <= 127 for character in string)
# Demonstrate english_v1 on four names; each sample is paired with the
# explanation (and any extra values) printed after the result.
v1_samples = (
    ("Instagram",
     ("-> This result is beacause all characters are english and range number is less than 127",)),
    ("爱奇艺PPS -《欢乐颂2》电视剧热播",
     ("-> This result is beacause name contains foreign characters",)),
    ("Docs To Go™ Free Office Suite",
     ("-> This result is beacause '™'', which has range number: ", ord("™"))),
    ("Instachat 😜",
     ("-> This result is beacause emoji which has range number: ", ord("😜"), "\n")),
)
for sample, notes in v1_samples:
    print(english_v1(sample), *notes)

print("---Second vesion of function to check if we have an english name:---", "\n")
def english_v2(string):
    """Return True unless ``string`` has more than 3 characters above code point 127.

    Up to three such characters (e.g. a trademark sign or an emoji) are
    tolerated.
    """
    outside_ascii = sum(1 for character in string if ord(character) > 127)
    return outside_ascii <= 3
# Demonstrate english_v2 on the same four names used for the first version.
v2_samples = (
    ("Instagram",
     "-> This result is beacause all characters are english and range number is less than 127"),
    ("爱奇艺PPS -《欢乐颂2》电视剧热播",
     "-> This result is beacause name contains more than 3 foreign characters"),
    ("Docs To Go™ Free Office Suite",
     "-> This result is beacause we don't have more than 3 foreign characters"),
    ("Instachat 😜",
     "-> This result is beacause we don't have more than 3 foreign characters"),
)
for sample, note in v2_samples:
    print(english_v2(sample), note)
print("\n")
# Split the de-duplicated Android apps by whether the name (column 0) passes
# english_v2.
print("-> RESULTS: Android dataset")
play_en_cleaned = []
play_nonenglish = []
for each in android_cleaned:
    if english_v2(each[0]):
        play_en_cleaned.append(each)
    else:
        play_nonenglish.append(each)
print ("Apps total with foreign name: ",len(play_nonenglish), "\n")
print ("Apps total removing the ones with foreign name: " , len(play_en_cleaned), "\n")
# To eyeball the result, print each[0] for the rows of play_en_cleaned or
# play_nonenglish. (The loops that did this had their print disabled and no
# longer did anything, so they were removed.)
print("\n")
# Split the iOS apps by whether the name (column 1) passes english_v2.
print("-> RESULTS: iOS dataset")
i_en_cleaned = []
i_nonenglish = []
for each in i_data:
    if english_v2(each[1]):
        i_en_cleaned.append(each)
    else:
        i_nonenglish.append(each)
print ("Apps total with foreign name: ",len(i_nonenglish), "\n")
print ("Apps total removing the ones with foreign name: " , len(i_en_cleaned), "\n")
# To eyeball the result, print each[1] for the rows of i_en_cleaned or
# i_nonenglish. (The loops that did this had their print disabled and no
# longer did anything, so they were removed.)

print("-> EXPLORING DATASETS")
# Preview the first four rows of each English-only dataset.
print("\n" + "---Android apps: --- "+'\n')
explore_data(play_en_cleaned,0, 4, True)
print("---iOS apps: --- "+'\n')
explore_data(i_en_cleaned, 0, 4, True)
# Split the English-only datasets into free and non-free apps.
print("Android dataset")
play_en_free_cleaned = []
play_en_nonfree_cleaned = []
for each in play_en_cleaned:
    # Column 6 is compared against the literal "Free".
    if each[6] == "Free":
        play_en_free_cleaned.append(each)
    else:
        play_en_nonfree_cleaned.append(each)
print("\n")
print("\n")

print("iOS dataset")
i_en_free_cleaned = []
i_en_nonfree_cleaned = []
for each in i_en_cleaned:
    # Column 4 is the numeric price; 0 means free.
    if float(each[4]) == 0:
        i_en_free_cleaned.append(each)
    else:
        i_en_nonfree_cleaned.append(each)
print ("Free apps total in Android dataset: ",len(play_en_free_cleaned), "\n")
print ("Free apps total in iOS dataset: ", len(i_en_free_cleaned), "\n")
# To eyeball the result, print each[6] (Android) or each[4] (iOS) for the
# rows of the four lists above. (The loops that did this had their print
# disabled and no longer did anything, so they were removed.)

print("-> EXPLORING DATASETS")
# Preview the first four rows of each free, English-only dataset.
print("\n" + "---Android apps: --- "+'\n')
explore_data(play_en_free_cleaned,0, 4, True)
print("---iOS apps: --- "+'\n')
explore_data(i_en_free_cleaned, 0, 4, True)
print("MOST COMMON GENRES for each dataset:")
# First build both frequency tables by hand: genre -> number of apps.
# Android genre is column -4, iOS genre is column -5.
play_app_genre = {}
for each in play_en_free_cleaned:
    genre = each[-4]
    play_app_genre[genre] = play_app_genre.get(genre, 0) + 1

i_app_genre = {}
for each in i_en_free_cleaned:
    genre = each[-5]
    i_app_genre[genre] = i_app_genre.get(genre, 0) + 1

print("-> Android dataset - most common genre")
freq_play = 0
most_common = 0
most_common_list = []  # genres tied with the current leader
for each in play_app_genre:
    if play_app_genre[each] > freq_play:
        freq_play = play_app_genre[each]
        most_common = each
        # Ties recorded so far were with a lower count; forget them, otherwise
        # the "check!" branch below fires although there is a single winner.
        most_common_list = []
    elif play_app_genre[each] == freq_play:
        most_common_list.append(each)
if len(most_common_list) == 0:
    print("most common genre:", most_common," -> with frequence ", freq_play, " and percentage: ", (freq_play/len(play_en_free_cleaned)*100))
else:
    print("check! we have 2 parameteres with exactly same percentage: ",most_common_list)
print("\n")

print("-> iOS dataset - most common genre")
freq_i = 0
most_common = 0
most_common_list = []  # genres tied with the current leader
for each in i_app_genre:
    if i_app_genre[each] > freq_i:
        freq_i = i_app_genre[each]
        most_common = each
        most_common_list = []  # same reset as for Android
    elif i_app_genre[each] == freq_i:
        most_common_list.append(each)
if len(most_common_list) == 0:
    print("most common genre:", most_common," -> with frequence: ", freq_i, "and percentage: ", (freq_i/len(i_en_free_cleaned)*100))
else:
    print("check! we have 2 parameteres with exactly same percentage: ",most_common_list)
print("CREATING FREQENCY TABLES FOR ANY COLUMN AND IN PERCENTAGES:")
# A reusable function that replaces the manual frequency counting.
def freq_table(dataset, index):
    """Return the percentage of rows holding each value of one column.

    Args:
        dataset: list of rows (each row indexable).
        index: position of the column to tabulate.

    Returns:
        dict mapping every distinct value found at ``row[index]`` to its share
        of the rows, expressed as a percentage. Empty for an empty dataset.
    """
    occurrences = {}
    for row in dataset:
        value = row[index]
        occurrences[value] = occurrences.get(value, 0) + 1

    return {
        value: count / len(dataset) * 100
        for value, count in occurrences.items()
    }
# Reports the most common value of a column.
def most_common(dataset, index):
    """Print the most frequent value of a column and its percentage.

    The percentages come from ``freq_table(dataset, index)``. When other
    values share the top percentage, a warning listing those tied values is
    printed instead of the winner.

    Args:
        dataset: list of rows (each row indexable).
        index: position of the column to analyse.

    Returns:
        None; the result is only printed.
    """
    app_parameter_percentage = freq_table(dataset, index)
    freq = 0
    leader = 0
    tied = []  # values whose percentage equals the current leader's
    for value, percentage in app_parameter_percentage.items():
        if percentage > freq:
            freq = percentage
            leader = value
            # Ties recorded so far were with a lower percentage; drop them so
            # a clear winner is not reported as a tie.
            tied = []
        elif percentage == freq:
            tied.append(value)
    if tied:
        print("check! we have 2 parameteres with exactly same percentage: ", tied)
    else:
        print(leader, freq)
# Most common genre of the free English apps: Android (column -4), then iOS
# (column -5). freq_table() with the same arguments gives the full tables.
for free_apps, genre_column in ((play_en_free_cleaned, -4), (i_en_free_cleaned, -5)):
    most_common(free_apps, genre_column)
def display_table(dataset, index):
    """Print the frequency table of one column, highest percentage first.

    The table is obtained from ``freq_table(dataset, index)``; every line has
    the form ``value : percentage``. Entries are ordered as
    (percentage, value) tuples, descending.
    """
    percentages = freq_table(dataset, index)
    ranked = [(share, value) for value, share in percentages.items()]
    ranked.sort(reverse=True)
    for share, value in ranked:
        print(value, ':', share)
# Sorted frequency tables: iOS genre (column -5), then Android genre
# (column -4) and Android category (column 1).
print("**iOS data set: prime_genre**")
display_table(i_en_free_cleaned, index=-5)

print("**1- Android data set: Genres**")
display_table(play_en_free_cleaned, index=-4)

for separator in ("\n", "-----", "\n"):
    print(separator)

print("**2- Android data set: Category**")
display_table(play_en_free_cleaned, index=1)
# Average of column 5 per iOS genre (column -5), printed highest first.
# The dataset is traversed once, accumulating a total and a count per genre,
# instead of being rescanned for every genre.
print("iOS data set: prime_genre", "\n")
i_genre_dic = freq_table(i_en_free_cleaned, -5)
genre_totals = {}  # genre -> sum of column 5
genre_counts = {}  # genre -> number of apps
for each in i_en_free_cleaned:
    genre_app = each[-5]
    genre_totals[genre_app] = genre_totals.get(genre_app, 0) + float(each[5])
    genre_counts[genre_app] = genre_counts.get(genre_app, 0) + 1

table_display = [
    (genre_totals[genre] / genre_counts[genre], genre)
    for genre in i_genre_dic
]
table_sorted = sorted(table_display, reverse = True)
for entry in table_sorted:
    print(entry[1], ':', entry[0])  # genre : average
# For a selection of iOS genres, list every free English app of the genre
# with its column-5 value, highest first. One loop replaces the seven
# copy-pasted blocks; genres are processed in the original order.
for genre_name in (
    'Navigation',
    'Reference',
    'Social Networking',
    'Music',
    'Weather',
    'Book',
    'Food & Drink',
):
    table_display = [
        (int(each[5]), each[1])
        for each in i_en_free_cleaned
        if each[-5] == genre_name
    ]
    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])  # print name and number of ratings, ordered
# Average number of installs (column 5) per Android category (column 1),
# printed highest first. The same could be done per genre with column -4.
# Install counts are text such as "1,000+": the "+" and "," are stripped
# before converting to int. Each row is parsed once, accumulating a total and
# a count per category, instead of re-parsing every row for every category.
play_categ_dic = freq_table(play_en_free_cleaned, 1)
categ_totals = {}  # category -> sum of installs
categ_counts = {}  # category -> number of apps
for each in play_en_free_cleaned:
    installers = int(each[5].replace("+", "").replace(",", ""))
    categ_app = each[1]
    categ_totals[categ_app] = categ_totals.get(categ_app, 0) + installers
    categ_counts[categ_app] = categ_counts.get(categ_app, 0) + 1

table_display = [
    (categ_totals[categ] / categ_counts[categ], categ)
    for categ in play_categ_dic
]
table_sorted = sorted(table_display, reverse = True)
for entry in table_sorted:
    print(entry[1], ':', entry[0])  # category : average installs
# For a selection of Android categories, list every free English app of the
# category with its install count, highest first. One loop replaces the eight
# copy-pasted blocks; categories are processed in the original order.
for category_name in (
    'COMMUNICATION',
    'VIDEO_PLAYERS',
    'SOCIAL',
    'PHOTOGRAPHY',
    'PRODUCTIVITY',
    'GAME',
    'BOOKS_AND_REFERENCE',
    'FOOD_AND_DRINK',
):
    table_display = [
        # Install counts are text such as "1,000+"; strip "+" and ",".
        (int(each[5].replace("+", "").replace(",", "")), each[0])
        for each in play_en_free_cleaned
        if each[1] == category_name
    ]
    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])  # print name and number of installs, ordered