The Android App Market on Google Play

# Read in dataset import pandas as pd apps_with_duplicates = pd.read_csv('datasets/apps.csv') # Drop duplicates from apps_with_duplicates apps = apps_with_duplicates.drop_duplicates(subset='App') # Print the total number of apps print('Total number of apps in the dataset = ', apps['App'].value_counts().sum()) apps

# List of characters to remove chars_to_remove = ['+', ',', '$'] # List of column names to clean cols_to_clean = ['Installs', 'Price'] # Loop for each column in cols_to_clean for col in cols_to_clean: # Loop for each char in chars_to_remove for char in chars_to_remove: # Replace the character with an empty string apps[col] = apps[col].apply(lambda x: x.replace(char, '')) # Print a summary of the apps dataframe print(apps.info())

import numpy as np # Convert Installs to float data type apps['Installs'] = apps['Installs'].astype('float64') # Convert Price to float data type apps['Price'] = apps['Price'].astype('float64') # Checking dtypes of the apps dataframe print(apps.dtypes)

import plotly plotly.offline.init_notebook_mode(connected=True) import plotly.graph_objs as go # Print the total number of unique categories num_categories = len(apps['Category'].unique()) print('Number of categories = ', num_categories) # Count the number of apps in each 'Category'. num_apps_in_category = apps['Category'].value_counts() # Sort num_apps_in_category in descending order based on the count of apps in each category sorted_num_apps_in_category = num_apps_in_category.sort_values(ascending=False) data = [go.Bar( x = num_apps_in_category.index, # index = category name y = num_apps_in_category.values, # value = count )] plotly.offline.iplot(data)

# Average rating of apps avg_app_rating = apps['Rating'].mean() print('Average app rating = ', avg_app_rating) # Distribution of apps according to their ratings data = [go.Histogram( x = apps['Rating'] )] # Vertical dashed line to indicate the average app rating layout = {'shapes': [{ 'type' :'line', 'x0': avg_app_rating, 'y0': 0, 'x1': avg_app_rating, 'y1': 1000, 'line': { 'dash': 'dashdot'} }] } plotly.offline.iplot({'data': data, 'layout': layout})

%matplotlib inline import seaborn as sns sns.set_style("darkgrid") import warnings warnings.filterwarnings("ignore") # Select rows where both 'Rating' and 'Size' values are present (ie. the two values are not null) apps_with_size_and_rating_present = apps.dropna() # Subset for categories with at least 250 apps large_categories = apps_with_size_and_rating_present.groupby('Category').filter(lambda x: len(x) >= 250) # Plot size vs. rating plt1 = sns.jointplot(x = large_categories['Size'], y = large_categories['Rating']) # Select apps whose 'Type' is 'Paid' paid_apps = apps_with_size_and_rating_present[apps_with_size_and_rating_present['Type'] == 'Paid'] # Plot price vs. rating plt2 = sns.jointplot(x = paid_apps['Price'], y = paid_apps['Rating'])

import matplotlib.pyplot as plt fig, ax = plt.subplots() fig.set_size_inches(15, 8) # Select a few popular app categories popular_app_cats = apps[apps.Category.isin(['GAME', 'FAMILY', 'PHOTOGRAPHY', 'MEDICAL', 'TOOLS', 'FINANCE', 'LIFESTYLE','BUSINESS'])] # Examine the price trend by plotting Price vs. Category ax = sns.stripplot(x = popular_app_cats['Price'], y = popular_app_cats['Category'], jitter=True, linewidth=1) ax.set_title('App pricing trend across categories') # Apps whose Price is greater than 200 apps_above_200 = apps[apps['Price'] > 200] apps_above_200[['Category', 'App', 'Price']]

# Select apps priced below $100 apps_under_100 = popular_app_cats[popular_app_cats['Price'] < 100] fig, ax = plt.subplots() fig.set_size_inches(15, 8) # Examine price vs. category with the authentic apps (apps_under_100) ax = sns.stripplot(x = 'Price', y = 'Category', data = apps_under_100, jitter = True, linewidth = 1) ax.set_title('App pricing trend across categories after filtering for junk apps')

trace0 = go.Box( # Data for paid apps y = apps[apps['Type'] == 'Paid']['Installs'], name = 'Paid' ) trace1 = go.Box( # Data for free apps y = apps[apps['Type'] == 'Free']['Installs'], name = 'Free' ) layout = go.Layout( title = "Number of downloads of paid apps vs. free apps", yaxis = dict(title = "Log number of downloads", type = 'log', autorange = True) ) # Add trace0 and trace1 to a list for plotting data = [trace0, trace1] plotly.offline.iplot({'data': data, 'layout': layout})

# Load user_reviews.csv reviews_df = pd.read_csv('datasets/user_reviews.csv') # Join the two dataframes merged_df = apps.merge(reviews_df) # Drop NA values from Sentiment and Review columns merged_df = merged_df.dropna(subset = ['Sentiment', 'Review']) sns.set_style('ticks') fig, ax = plt.subplots() fig.set_size_inches(11, 8) # User review sentiment polarity for paid vs. free apps ax = sns.boxplot(x = 'Type', y = 'Sentiment_Polarity', data = merged_df) ax.set_title('Sentiment Polarity Distribution')