# Necessary Imports
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("/home/lyrax/matplotlib-dracula/dracula.mplstyle")
import seaborn as sns
sns.set_palette("bright")
from tqdm import tqdm, trange
import warnings
warnings.filterwarnings("ignore")
# Load the phone listings from the CSV file in the working directory.
data = pd.read_csv("phonesdata.csv")
data
# data info: column names, non-null counts and dtypes
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1148 entries, 0 to 1147
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Phone Title 1148 non-null object
1 Specs 1148 non-null object
2 Price(Kshs) 357 non-null object
3 Rating 1148 non-null float64
4 Specs Score 1148 non-null object
5 Likes 1148 non-null int64
6 PriceJumia(Kshs) 1003 non-null object
dtypes: float64(1), int64(1), object(5)
memory usage: 62.9+ KB
# Use the PriceJumia column as the price column because it has far fewer
# missing entries than the original Price column.
data['Price(Kshs)'] = data['PriceJumia(Kshs)']
data.drop(columns='PriceJumia(Kshs)', inplace=True)
# Price is the only column with missing values, so drop those few rows and
# renumber the index from zero.
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)
def _spec_value(spec, position, label, unit):
    """Return the text between *label* and *unit* in comma-field *position* of *spec*."""
    return spec.split(',')[position].partition(label)[-1].split(unit)[0]


def _android_major_version(spec):
    """Return the major Android version found in *spec* as a float, or NaN.

    The version is the second whitespace-separated token of the fourth
    comma-separated field. NaN is returned when that token does not exist or
    when its part before the first '.' (with ';' removed) is not a number.
    """
    try:
        token = spec.split(',')[3].split()[1]
    except IndexError:
        return np.nan
    major = token.split('.')[0].replace(';', '')
    try:
        return float(major)
    except ValueError:
        return np.nan


def preprocess(df):
    """Derive feature columns from the raw text columns of *df*, in place.

    Columns added from ``Specs`` (a comma-separated description):
    ``screen_inches`` (float), ``pixels`` (raw resolution text), ``pixels_0``
    and ``pixels_1`` (int, first and last number of the resolution),
    ``front_cam(MP)``, ``storage(GB)``, ``ram(GB)``, ``battery(mAh)`` (left as
    strings) and ``android`` (float major version, NaN when it cannot be
    parsed).

    ``Price(Kshs)`` is converted to int by taking its second whitespace token
    and removing thousands separators. ``Specs Score`` is replaced by the int
    column ``Specs Score(%)``.

    The frame is modified in place and nothing is returned. Every step works
    row by row through ``apply``, so the result does not depend on the frame
    having a 0..n-1 index.

    Raises:
        IndexError, ValueError: if a ``Specs`` entry lacks one of the
            mandatory fields or a numeric field cannot be converted.
    """
    specs = df['Specs']

    # screen inches column
    df['screen_inches'] = specs.apply(
        lambda spec: float(spec.split(',')[0].split()[0].split(':')[1])
    )

    # pixels column
    df['pixels'] = specs.apply(lambda spec: spec.split(',')[1].partition('pixels')[0])
    df['pixels_0'] = df['pixels'].apply(lambda pix: int(pix.split()[0]))
    df['pixels_1'] = df['pixels'].apply(lambda pix: int(pix.split()[-1]))

    # front cam
    df['front_cam(MP)'] = specs.apply(lambda spec: _spec_value(spec, 2, 'Front Camera:', 'MP'))

    # android: parsed per row, so no positional .loc writes and no strings
    # written into a float column
    df['android'] = specs.apply(_android_major_version)

    # storage
    df['storage(GB)'] = specs.apply(lambda spec: _spec_value(spec, 4, 'Storage:', 'GB'))
    # ram
    df['ram(GB)'] = specs.apply(lambda spec: _spec_value(spec, 5, 'RAM:', 'GB'))
    # battery
    df['battery(mAh)'] = specs.apply(lambda spec: _spec_value(spec, 6, 'Battery:', 'mAh'))

    # Clean Other columns
    df['Price(Kshs)'] = (
        df['Price(Kshs)']
        .apply(lambda pr: str(pr.split()[1]))
        .str.replace(',', '')
        .astype(int)
    )
    df['Specs Score(%)'] = df['Specs Score'].str.replace('%', '').astype(int)
    df.drop('Specs Score', axis=1, inplace=True)
# Applying the function
# preprocess adds the derived columns to `data` in place and returns nothing.
preprocess(data)
data
# Column summary after preprocessing.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1003 entries, 0 to 1002
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Phone Title 1003 non-null object
1 Specs 1003 non-null object
2 Price(Kshs) 1003 non-null int64
3 Rating 1003 non-null float64
4 Likes 1003 non-null int64
5 screen_inches 1003 non-null float64
6 pixels 1003 non-null object
7 pixels_0 1003 non-null int64
8 pixels_1 1003 non-null int64
9 front_cam(MP) 1003 non-null object
10 android 873 non-null object
11 storage(GB) 1003 non-null object
12 ram(GB) 1003 non-null object
13 battery(mAh) 1003 non-null object
14 Specs Score(%) 1003 non-null int64
dtypes: float64(2), int64(5), object(8)
memory usage: 117.7+ KB
# Drop the rows that still contain a missing value (the ones without an
# android version), then renumber the index from zero.
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)
data
# Price distribution over the full data set, plus its extremes.
plt.figure(figsize=(9, 6))
price_kshs = data['Price(Kshs)']
sns.distplot(price_kshs)
print(f"Minimum price: {price_kshs.min()} KShs.\nMaximum price: {price_kshs.max()} KShs.")
Minimum price: 124 KShs.
Maximum price: 446342 KShs.
# Keep only the phones priced from 4000 to 150000 KShs, both bounds included.
# Rows outside that range are removed, not capped.
data_clipped = data[data['Price(Kshs)'].between(4000, 150000)]
data_clipped
# replotting price distribution
plt.figure(figsize=(9, 6))
sns.distplot(data_clipped['Price(Kshs)'])
# distribution and boxplot of phone Rating, side by side
fig, ax = plt.subplots(1, 2, figsize=(17, 6))
sns.distplot(data_clipped.Rating, ax=ax[0])
ax[0].set_title("Distribution of phone Ratings")
# Pass the ratings as x= explicitly: newer seaborn treats the first positional
# argument as `data`, which would change the orientation of the box.
sns.boxplot(x=data_clipped.Rating, ax=ax[1])
ax[1].set_title("Boxplot")
# Price against Rating. x and y are passed by keyword because newer seaborn
# releases no longer accept them positionally.
plt.figure(figsize=(12, 7))
sns.scatterplot(x=data_clipped['Price(Kshs)'], y=data_clipped.Rating)
sns.jointplot(x=data_clipped['Price(Kshs)'], y=data_clipped.Rating)
# Rating against Likes.
plt.figure(figsize=(12, 7))
sns.scatterplot(x=data_clipped.Rating, y=data_clipped.Likes)
sns.jointplot(x=data_clipped.Rating, y=data_clipped.Likes)
# Number of phones per screen size (horizontal bars).
plt.figure(figsize=(8, 15))
sns.countplot(y=data_clipped.screen_inches)
# Number of phones per Android major version. The column is passed as x=
# explicitly because newer seaborn treats the first positional argument as `data`.
plt.figure(figsize=(12, 7))
sns.countplot(x=data_clipped.android)
# Keep only the rows whose parsed Android version is at most 10.
data_clipped = data_clipped[data_clipped.android <= 10]
# Rating against Specs Score. x and y are passed by keyword because newer
# seaborn releases no longer accept them positionally.
plt.figure(figsize=(12, 7))
sns.scatterplot(x=data_clipped.Rating, y=data_clipped['Specs Score(%)'])
sns.jointplot(x=data_clipped.Rating, y=data_clipped['Specs Score(%)'])
# Likes against Specs Score.
plt.figure(figsize=(12, 7))
sns.scatterplot(x=data_clipped.Likes, y=data_clipped['Specs Score(%)'])
sns.jointplot(x=data_clipped.Likes, y=data_clipped['Specs Score(%)'])