# Let's import the packages
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
# Dataset scraped from the computer section of https://www.backmarket.com/refurbished-computers.html
# Scrap Date: 2022/04/06
df_comp = pd.read_csv('backmarket.csv')
df_comp.head(5)
# Data types
df_comp.dtypes
# There are null values in the column 'specs' but doesn't show up.
df_comp.isnull().any()
df_comp.info()
# How many null values are there?.
df_comp[df_comp['specs']=='(None,)']
df_comp.describe()
df_comp.nunique()
df_comp['warranty'] = df_comp['warranty'].astype('category')
df_comp.dtypes
df_comp['warranty'].value_counts()
df_comp = df_comp.applymap(lambda x: str(x).strip())
df_comp['warranty'].value_counts()
df_comp['warranty'] = df_comp['warranty'].map(lambda x: x.replace('Warranty: ', ''))
df_comp['warranty'] = df_comp['warranty'].map(lambda x: x.replace(' months', ''))
# Manual timestamp.
df_comp['date_scrap'] = pd.Timestamp(year=2022, month=4, day=6, hour=12)
df_comp
df_comp['date_scrap'] = pd.to_datetime(df_comp['date_scrap']).dt.date
# df_comp = df_comp.drop('date_scrap', axis = 1)
df_comp['specs'] = df_comp['specs'].map(lambda x: x.replace('(None,)', 'None'))
df_comp['specs'][df_comp['specs']=='None'] = np.nan
df_comp.isnull().sum()
df_comp['price_new'] = df_comp['price_new'].map(lambda x: x.replace('$', ''))
df_comp['price'] = df_comp['price'].map(lambda x: x.replace('$', ''))
# Using regular expression.
df_comp['rel_year'] = df_comp['name'].map(lambda x: re.findall('[2][0-9]{3}', x))
df_comp['rel_year'] = df_comp['rel_year'].map(lambda x: str(x).replace("['",''))
df_comp['rel_year'] = df_comp['rel_year'].map(lambda x: str(x).replace("']",''))
df_comp['rel_year'][df_comp['rel_year']=='[]'] = np.nan
df_comp['rel_year'][df_comp['rel_year']>= '2022'] = np.nan
df_comp.dtypes
df_comp.convert_dtypes().dtypes
# Fixing some values.
df_comp['price'] = df_comp['price'].map(lambda x: str(x).replace(",",''))
df_comp['price_new'] = df_comp['price_new'].map(lambda x: str(x).replace(",",''))
df_comp['rel_year'] = df_comp['rel_year'].map(lambda x: str(x).replace("nan",'0'))
df_comp['rel_year'][df_comp['rel_year'].str.contains(',')]
df_comp['rel_year'].iloc[149] = 2012
df_comp['rel_year'].iloc[178] = 2019
df_comp['rel_year'].iloc[299] = 2015
df_comp['name'] = df_comp['name'].astype('string')
df_comp['price'] = df_comp['price'].astype('float32')
df_comp['price_new'] = df_comp['price_new'].astype('float32')
df_comp['specs'] = df_comp['specs'].astype('string')
df_comp['warranty'] = df_comp['warranty'].astype('int16')
df_comp['rel_year'] = df_comp['rel_year'].astype('int32')
df_comp.dtypes
df_comp['rel_year'][df_comp['rel_year']==0] = np.nan
df_comp.head(20)
# Saving the clean dataset.
df_comp.to_csv('./backmarket_clean.csv', index=False)
# Lowest computer prices for release year.
df_comp.groupby('rel_year').min()
# Highest and lowest price for release year
df_comp.groupby('rel_year').agg({'price':['max', 'min']})
# Laptops quantity vs release year
df_comp.groupby('rel_year')['price'].count()
plt.hist(df_comp['rel_year'])
# All the laptop models and prices arrange for rel_year
pd.pivot_table(df_comp, index=['rel_year', 'name'])['price']
# Price histogram
plt.hist(df_comp['price'])