backmarket_data

# Let's import the packages import numpy as np import pandas as pd import re import matplotlib.pyplot as plt

# Dataset scraped from the computer section of https://www.backmarket.com/refurbished-computers.html # Scrap Date: 2022/04/06 df_comp = pd.read_csv('backmarket.csv')

df_comp.head(5)

# Data types df_comp.dtypes

# There are null values in the column 'specs' but doesn't show up. df_comp.isnull().any()

df_comp.info()

# How many null values are there?. df_comp[df_comp['specs']=='(None,)']

df_comp.describe()

df_comp.nunique()

df_comp['warranty'] = df_comp['warranty'].astype('category') df_comp.dtypes

df_comp['warranty'].value_counts()

df_comp = df_comp.applymap(lambda x: str(x).strip())

df_comp['warranty'].value_counts()

df_comp['warranty'] = df_comp['warranty'].map(lambda x: x.replace('Warranty: ', ''))

df_comp['warranty'] = df_comp['warranty'].map(lambda x: x.replace(' months', ''))

# Manual timestamp. df_comp['date_scrap'] = pd.Timestamp(year=2022, month=4, day=6, hour=12) df_comp

df_comp['date_scrap'] = pd.to_datetime(df_comp['date_scrap']).dt.date # df_comp = df_comp.drop('date_scrap', axis = 1)

df_comp['specs'] = df_comp['specs'].map(lambda x: x.replace('(None,)', 'None'))

df_comp['specs'][df_comp['specs']=='None'] = np.nan

df_comp.isnull().sum()

df_comp['price_new'] = df_comp['price_new'].map(lambda x: x.replace('$', ''))

df_comp['price'] = df_comp['price'].map(lambda x: x.replace('$', ''))

# Using regular expression. df_comp['rel_year'] = df_comp['name'].map(lambda x: re.findall('[2][0-9]{3}', x))

df_comp['rel_year'] = df_comp['rel_year'].map(lambda x: str(x).replace("['",''))

df_comp['rel_year'] = df_comp['rel_year'].map(lambda x: str(x).replace("']",''))

df_comp['rel_year'][df_comp['rel_year']=='[]'] = np.nan

df_comp['rel_year'][df_comp['rel_year']>= '2022'] = np.nan

df_comp.dtypes

df_comp.convert_dtypes().dtypes

# Fixing some values. df_comp['price'] = df_comp['price'].map(lambda x: str(x).replace(",",''))

df_comp['price_new'] = df_comp['price_new'].map(lambda x: str(x).replace(",",''))

df_comp['rel_year'] = df_comp['rel_year'].map(lambda x: str(x).replace("nan",'0'))

df_comp['rel_year'][df_comp['rel_year'].str.contains(',')]

df_comp['rel_year'].iloc[149] = 2012 df_comp['rel_year'].iloc[178] = 2019 df_comp['rel_year'].iloc[299] = 2015

df_comp['name'] = df_comp['name'].astype('string') df_comp['price'] = df_comp['price'].astype('float32') df_comp['price_new'] = df_comp['price_new'].astype('float32') df_comp['specs'] = df_comp['specs'].astype('string') df_comp['warranty'] = df_comp['warranty'].astype('int16') df_comp['rel_year'] = df_comp['rel_year'].astype('int32')

df_comp.dtypes

df_comp['rel_year'][df_comp['rel_year']==0] = np.nan

df_comp.head(20)

# Saving the clean dataset. df_comp.to_csv('./backmarket_clean.csv', index=False)

# Lowest computer prices for release year. df_comp.groupby('rel_year').min()

# Highest and lowest price for release year df_comp.groupby('rel_year').agg({'price':['max', 'min']})

# Laptops quantity vs release year df_comp.groupby('rel_year')['price'].count()

plt.hist(df_comp['rel_year'])

# All the laptop models and prices arrange for rel_year pd.pivot_table(df_comp, index=['rel_year', 'name'])['price']

# Price histogram plt.hist(df_comp['price'])