# Read in dataset
import pandas as pd
# Load the raw app listing export from disk.
apps_with_duplicates = pd.read_csv(
    '/work/The-Android-App-Market-on-Google-Play/datasets/apps.csv'
)

# Keep only the first occurrence of each fully identical row.
apps = apps_with_duplicates.drop_duplicates(keep='first')

# Report how many rows remain after de-duplication.
print('Total number of apps in the dataset = ', apps.shape[0])

# Show five randomly chosen rows as a quick sanity check of the contents.
print(apps.sample(n=5))
Total number of apps in the dataset = 9659
Unnamed: 0 App Category Rating \
8470 9604 Texas Holdem Poker Pro GAME 4.5
7449 8544 DM Screen BOOKS_AND_REFERENCE 4.4
665 812 HomeWork EDUCATION 4.3
6669 7728 HR Team CQ Region Ed Qld BUSINESS NaN
1964 2487 Anthem BC Anywhere MEDICAL 2.6
Reviews Size Installs Type Price Content Rating Genres \
8470 114479 26.0 5,000,000+ Free 0 Teen Card
7449 283 5.4 10,000+ Free 0 Everyone 10+ Books & Reference
665 16195 5.2 1,000,000+ Free 0 Everyone Education
6669 0 4.1 500+ Free 0 Everyone Business
1964 496 24.0 100,000+ Free 0 Everyone Medical
Last Updated Current Ver Android Ver
8470 December 20, 2017 4.7.0 3.0 and up
7449 January 12, 2018 1.1.10a 4.0 and up
665 September 20, 2016 8.5.2 4.0 and up
6669 January 12, 2018 3.8 4.0.3 and up
1964 July 27, 2018 8.0.226 4.4 and up
# Formatting characters that keep 'Installs' and 'Price' from being parsed as
# numbers (e.g. an install count rendered as "5,000,000+").
chars_to_remove = ['+', ',', '$']
# Columns whose numeric values are stored as text
cols_to_clean = ['Installs', 'Price']

for col in cols_to_clean:
    for char in chars_to_remove:
        # Vectorised, literal replacement. regex=False is required because
        # '+' and '$' are regex metacharacters. Unlike a per-element
        # ``x.replace`` call, missing values pass through untouched instead
        # of raising AttributeError.
        apps[col] = apps[col].str.replace(char, '', regex=False)

# DataFrame.info() writes the summary itself and returns None, so call it
# directly; wrapping it in print() only appends a stray "None" line.
apps.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9659 entries, 0 to 9658
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 9659 non-null int64
1 App 9659 non-null object
2 Category 9659 non-null object
3 Rating 8196 non-null float64
4 Reviews 9659 non-null int64
5 Size 8432 non-null float64
6 Installs 9659 non-null object
7 Type 9659 non-null object
8 Price 9659 non-null object
9 Content Rating 9659 non-null object
10 Genres 9659 non-null object
11 Last Updated 9659 non-null object
12 Current Ver 9651 non-null object
13 Android Ver 9657 non-null object
dtypes: float64(2), int64(2), object(10)
memory usage: 1.1+ MB
None
import numpy as np
# Cast the cleaned text columns to float so they can be used numerically
for numeric_col in ('Installs', 'Price'):
    apps[numeric_col] = apps[numeric_col].astype(float)

# Show the resulting dtype of every column in the apps dataframe
print(apps.dtypes)
Unnamed: 0 int64
App object
Category object
Rating float64
Reviews int64
Size float64
Installs float64
Type object
Price float64
Content Rating object
Genres object
Last Updated object
Current Ver object
Android Ver object
dtype: object
import plotly.graph_objs as go
# Count the distinct values that appear in the 'Category' column
distinct_categories = apps['Category'].unique()
num_categories = len(distinct_categories)
print('Number of categories = ', num_categories)

# Tally the apps in each category, then order the tallies from largest to smallest
num_apps_in_category = apps['Category'].value_counts()
sorted_num_apps_in_category = num_apps_in_category.sort_values(ascending=False)

# One bar per category: category name on the x axis, app count as the bar height
category_bar = go.Bar(
    x=sorted_num_apps_in_category.index,
    y=sorted_num_apps_in_category.to_numpy(),
)
data = [category_bar]
fig = go.Figure(data=data)
fig.show()
Number of categories = 33
# Mean of the 'Rating' column
avg_app_rating = apps['Rating'].mean()
print('Average app rating = ', avg_app_rating)

# Distribution of apps according to their ratings
data = [go.Histogram(x=apps['Rating'])]

# Vertical dash-dot line marking the average rating. With yref='paper' the
# y0/y1 values are fractions of the plot height rather than bin counts, so
# the line always spans the full plot instead of stopping at a hard-coded
# count that may be shorter or taller than the tallest bar.
layout = {
    'shapes': [{
        'type': 'line',
        'xref': 'x',
        'yref': 'paper',
        'x0': avg_app_rating,
        'y0': 0,
        'x1': avg_app_rating,
        'y1': 1,
        'line': {'dash': 'dashdot'},
    }],
}
fig = go.Figure(data=data, layout=layout)
fig.show()
Average app rating = 4.173243045387994
%matplotlib inline
import seaborn as sns
# Use seaborn's "darkgrid" theme for the plots drawn after this point.
sns.set_style("darkgrid")
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from pandas/seaborn — consider narrowing the
# filter to the specific category that is actually noisy.
warnings.filterwarnings("ignore")
# Keep only the apps for which both a rating and a size are recorded
apps_with_size_and_rating_present = apps.dropna(subset=['Rating', 'Size'])

# Restrict to categories that contribute at least 250 such apps
large_categories = apps_with_size_and_rating_present.groupby('Category').filter(
    lambda category_rows: len(category_rows) >= 250
)

# Joint distribution of app size against rating
plt1 = sns.jointplot(x='Size', y='Rating', data=large_categories)

# Paid apps only (still limited to rows with both rating and size present)
paid_apps = apps_with_size_and_rating_present.loc[
    apps_with_size_and_rating_present['Type'] == 'Paid'
]

# Joint distribution of price against rating for the paid apps
plt2 = sns.jointplot(x='Price', y='Rating', data=paid_apps)
import matplotlib.pyplot as plt
# 15 x 8 inch canvas for the category pricing plot
fig, ax = plt.subplots(figsize=(15, 8))

# Restrict the data to a handful of popular app categories
popular_categories = [
    'GAME', 'FAMILY', 'PHOTOGRAPHY', 'MEDICAL',
    'TOOLS', 'FINANCE', 'LIFESTYLE', 'BUSINESS',
]
popular_app_cats = apps[apps['Category'].isin(popular_categories)]

# One point per app: price along the x axis, category along the y axis
ax = sns.stripplot(
    x='Price',
    y='Category',
    data=popular_app_cats,
    jitter=True,
    linewidth=1,
)
ax.set_title('App pricing trend across categories')

# Apps whose price exceeds 200, shown with their category and name
apps_above_200 = apps[apps['Price'] > 200]
apps_above_200[['Category', 'App', 'Price']]
# Keep only the popular-category apps priced below 100
apps_under_100 = popular_app_cats.loc[popular_app_cats['Price'] < 100]

# Fresh 15 x 8 inch canvas for the filtered plot
fig, ax = plt.subplots(figsize=(15, 8))

# Price against category again, this time without the apps priced at 100 or more
ax = sns.stripplot(
    x='Price',
    y='Category',
    data=apps_under_100,
    jitter=True,
    linewidth=1,
)
ax.set_title('App pricing trend across categories after filtering for junk apps')