The Android App Market on Google Play

import pandas as pd

# Load the raw app listing from disk; repeated rows have not been removed yet.
apps_with_duplicates = pd.read_csv(
    '/work/The-Android-App-Market-on-Google-Play/datasets/apps.csv'
)

# Keep a single copy of every fully identical row.
apps = apps_with_duplicates.drop_duplicates()

# Report how many rows (apps) remain after de-duplication.
print('Total number of apps in the dataset = ', apps.shape[0])

# Peek at five randomly chosen rows.
print(apps.sample(n=5))

# Characters to strip out of the text columns listed below.
chars_to_remove = ['+', ',', '$']
# Columns whose values are cleaned.
cols_to_clean = ['Installs', 'Price']

for col in cols_to_clean:
    for char in chars_to_remove:
        # Vectorised string replace: missing values pass through as NaN
        # instead of raising AttributeError as a per-element
        # ``x.replace`` would. regex=False is required because '+' and '$'
        # are regex metacharacters and must be matched literally.
        apps[col] = apps[col].str.replace(char, '', regex=False)

# DataFrame.info() prints the summary itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None".
apps.info()

import numpy as np

# Cast each column to float, 'Installs' first and then 'Price'.
for numeric_col in ('Installs', 'Price'):
    apps[numeric_col] = apps[numeric_col].astype(float)

# Show the resulting dtype of every column.
print(apps.dtypes)

import plotly.graph_objs as go

# Number of distinct values in 'Category'. dropna=False keeps a missing
# category counted as one value, exactly like len(unique()) does.
num_categories = apps['Category'].nunique(dropna=False)
print('Number of categories = ', num_categories)

# Number of apps per category.
num_apps_in_category = apps['Category'].value_counts()

# Order the per-category counts from largest to smallest.
sorted_num_apps_in_category = num_apps_in_category.sort_values(ascending=False)

# One bar per category: the index holds the category name, the values hold
# the app count.
category_bars = go.Bar(
    x=sorted_num_apps_in_category.index,
    y=sorted_num_apps_in_category.values,
)
data = [category_bars]

fig = go.Figure(data=data)
fig.show()

# Mean of the 'Rating' column.
avg_app_rating = apps['Rating'].mean()
print('Average app rating = ', avg_app_rating)

# Histogram of the app ratings.
rating_histogram = go.Histogram(x=apps['Rating'])
data = [rating_histogram]

# Vertical dash-dot line drawn at the average rating, from y=0 up to y=1000.
average_marker = {
    'type': 'line',
    'x0': avg_app_rating,
    'y0': 0,
    'x1': avg_app_rating,
    'y1': 1000,
    'line': {'dash': 'dashdot'},
}
layout = {'shapes': [average_marker]}

fig = go.Figure(data=data, layout=layout)
fig.show()

%matplotlib inline import seaborn as sns sns.set_style("darkgrid") import warnings warnings.filterwarnings("ignore") # Select rows where both 'Rating' and 'Size' values are present (ie. the two values are not null) apps_with_size_and_rating_present = apps[(~apps['Rating'].isnull()) & (~apps['Size'].isnull())] # Subset for categories with at least 250 apps large_categories = apps_with_size_and_rating_present.groupby('Category').filter(lambda x: len(x) >= 250) # Plot size vs. rating plt1 = sns.jointplot(x=large_categories['Size'], y=large_categories['Rating']) # Select apps whose 'Type' is 'Paid' paid_apps = apps_with_size_and_rating_present[apps_with_size_and_rating_present['Type'] == 'Paid'] # Plot price vs. rating plt2 = sns.jointplot(x=paid_apps['Price'], y=paid_apps['Rating'])

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
fig.set_size_inches(15, 8)

# Hand-picked set of popular categories to compare.
popular_categories = [
    'GAME',
    'FAMILY',
    'PHOTOGRAPHY',
    'MEDICAL',
    'TOOLS',
    'FINANCE',
    'LIFESTYLE',
    'BUSINESS',
]
popular_app_cats = apps[apps['Category'].isin(popular_categories)]

# Strip plot of price per category to show the pricing trend.
ax = sns.stripplot(
    x='Price',
    y='Category',
    data=popular_app_cats,
    jitter=True,
    linewidth=1,
)
ax.set_title('App pricing trend across categories')

# Apps priced above 200; the bare expression on the last line is what the
# notebook cell displays.
apps_above_200 = apps.loc[apps['Price'] > 200]
apps_above_200[['Category', 'App', 'Price']]

# Keep only the popular-category apps priced below 100.
is_under_100 = popular_app_cats['Price'] < 100
apps_under_100 = popular_app_cats[is_under_100]

fig, ax = plt.subplots()
fig.set_size_inches(15, 8)

# Strip plot of price per category for the filtered apps (apps_under_100).
ax = sns.stripplot(
    x='Price',
    y='Category',
    data=apps_under_100,
    jitter=True,
    linewidth=1,
)
ax.set_title('App pricing trend across categories after filtering for junk apps')