# Necessary Imports
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("/home/lyrax/matplotlib-dracula/dracula.mplstyle")
import seaborn as sns
sns.set_palette("bright")
from tqdm import tqdm, trange
import warnings
warnings.filterwarnings("ignore")
# Load the raw phone listings from disk
data = pd.read_csv("phonesdata.csv")
data
# Column names, dtypes and non-null counts
data.info()
# The Jumia price becomes the single price column because it has few missing
# entries. Moving it with pop() stores it under 'Price(Kshs)' and removes the
# original 'PriceJumia(Kshs)' column in one step.
data['Price(Kshs)'] = data.pop('PriceJumia(Kshs)')
# Price is the only column with missing values, so the few incomplete rows are
# dropped and the index is renumbered from 0.
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)
def _parse_android_version(spec):
    """Return the major Android version found in a spec string, or NaN.

    The version is taken from the second whitespace-separated token of the
    fourth comma-separated field, e.g. ``" Android 10.0; MIUI"`` -> ``10.0``.
    NaN is returned when that token does not exist or is not a number.
    """
    try:
        token = spec.split(',')[3].split()[1]
    except IndexError:
        # Fewer than four fields, or nothing after the first word of the field.
        return np.nan
    # Keep only the part before the first '.', and drop any ';' separator.
    major = token.split('.')[0].replace(';', '')
    try:
        return float(major)
    except ValueError:
        # The token is not a version number.
        return np.nan


def preprocess(df):
    """Split the free-text ``Specs`` column into feature columns, in place.

    The comma-separated fields of ``Specs`` are read by position:

    0. first token of the form ``label:value`` -> ``screen_inches`` (float)
    1. text before ``'pixels'`` -> ``pixels`` (str), whose first and last
       tokens become ``pixels_0`` and ``pixels_1`` (int)
    2. text between ``'Front Camera:'`` and ``'MP'`` -> ``front_cam(MP)``
    3. second token, major version only -> ``android`` (float, NaN if absent)
    4. text between ``'Storage:'`` and ``'GB'`` -> ``storage(GB)``
    5. text between ``'RAM:'`` and ``'GB'`` -> ``ram(GB)``
    6. text between ``'Battery:'`` and ``'mAh'`` -> ``battery(mAh)``

    ``front_cam(MP)``, ``storage(GB)``, ``ram(GB)`` and ``battery(mAh)`` are
    left as strings exactly as sliced from the spec text; they are not
    converted to numbers here.

    ``Price(Kshs)`` is replaced by the integer value of its second
    whitespace-separated token with thousands separators removed, and
    ``Specs Score`` is replaced by an integer ``Specs Score(%)`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``Specs``, ``Price(Kshs)`` and
        ``Specs Score``. It is modified in place; any index is accepted.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If a ``Specs`` value has fewer than seven comma-separated fields.
    ValueError
        If a screen size, pixel count, price or score is not numeric.
    """
    # screen inches column
    df['screen_inches'] = df.Specs.apply(
        lambda spec: float(spec.split(',')[0].split()[0].split(':')[1])
    )

    # pixels columns
    df['pixels'] = df.Specs.apply(
        lambda spec: spec.split(',')[1].partition('pixels')[0]
    )
    df['pixels_0'] = df.pixels.apply(lambda pix: int(pix.split()[0]))
    df['pixels_1'] = df.pixels.apply(lambda pix: int(pix.split()[-1]))

    # front cam
    df['front_cam(MP)'] = df.Specs.apply(
        lambda spec: spec.split(',')[2].partition('Front Camera:')[2].split('MP')[0]
    )

    # android: parsed row by row through apply, so the result does not depend
    # on the index being 0..n-1 and no string is ever written into a float
    # column (silent upcasting on assignment is deprecated since pandas 2.1).
    df['android'] = df.Specs.apply(_parse_android_version).astype(float)

    # storage
    df['storage(GB)'] = df.Specs.apply(
        lambda spec: spec.split(',')[4].partition('Storage:')[-1].split('GB')[0]
    )
    # ram
    df['ram(GB)'] = df.Specs.apply(
        lambda spec: spec.split(',')[5].partition('RAM:')[-1].split('GB')[0]
    )
    # battery
    df['battery(mAh)'] = df.Specs.apply(
        lambda spec: spec.split(',')[6].partition('Battery:')[-1].split('mAh')[0]
    )

    # Clean other columns
    df['Price(Kshs)'] = df['Price(Kshs)'].apply(
        lambda pr: pr.split()[1].replace(',', '')
    ).astype(int)
    df['Specs Score(%)'] = df['Specs Score'].str.replace('%', '').astype(int)
    df.drop('Specs Score', axis=1, inplace=True)
# Run the feature extraction; preprocess() adds its columns to `data` in place
preprocess(data)
data
data.info()
# Drop the rows that still contain missing values (the instances without an
# Android version), then renumber the index from 0.
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)
data
# Price distribution over every remaining phone
plt.figure(figsize=(9, 6))
sns.distplot(data['Price(Kshs)'])
print(f"Minimum price: {data['Price(Kshs)'].min()} KShs.\nMaximum price: {data['Price(Kshs)'].max()} KShs.")
# Keep only the phones priced within 4,000 - 150,000 KShs (both ends included)
data_clipped = data[data['Price(Kshs)'].between(4000, 150000)]
data_clipped
# Price distribution again, now restricted to the clipped range
plt.figure(figsize=(9, 6))
sns.distplot(data_clipped['Price(Kshs)'])
# NOTE: data vectors are passed to seaborn as x=/y= keywords throughout this
# section. Seaborn 0.12+ no longer accepts them positionally (the first
# positional argument is `data`), while the keyword form works on older
# releases as well.

# Distribution and boxplot of phone Rating, side by side
fig, ax = plt.subplots(1, 2, figsize=(17, 6))
sns.distplot(data_clipped.Rating, ax=ax[0])
ax[0].set_title("Distribution of phone Ratings")
sns.boxplot(x=data_clipped.Rating, ax=ax[1])
ax[1].set_title("Boxplot")

# Price against Rating
plt.figure(figsize=(12, 7))
sns.scatterplot(x=data_clipped['Price(Kshs)'], y=data_clipped.Rating)
sns.jointplot(x=data_clipped['Price(Kshs)'], y=data_clipped.Rating)

# Rating against Likes
plt.figure(figsize=(12, 7))
sns.scatterplot(x=data_clipped.Rating, y=data_clipped.Likes)
sns.jointplot(x=data_clipped.Rating, y=data_clipped.Likes)

# Number of phones per screen size (horizontal bars)
plt.figure(figsize=(8, 15))
sns.countplot(y=data_clipped.screen_inches)

# Number of phones per Android major version
plt.figure(figsize=(12, 7))
sns.countplot(x=data_clipped.android)

# Keep only the phones whose parsed Android major version is at most 10
data_clipped = data_clipped[data_clipped.android <= 10]

# Rating against Specs Score
plt.figure(figsize=(12, 7))
sns.scatterplot(x=data_clipped.Rating, y=data_clipped['Specs Score(%)'])
sns.jointplot(x=data_clipped.Rating, y=data_clipped['Specs Score(%)'])

# Likes against Specs Score
plt.figure(figsize=(12, 7))
sns.scatterplot(x=data_clipped.Likes, y=data_clipped['Specs Score(%)'])
sns.jointplot(x=data_clipped.Likes, y=data_clipped['Specs Score(%)'])