project_3- used cars eBay

import pandas as pd import numpy as np

autos=pd.read_csv("autos.csv")

autos

autos.head()

autos.info()

#list of columns names: autos.columns

#reanme following columns: autos.rename({ "dateCrawled":"date_crawled", "offerType":"offer_type", "vehicleType":"vehicle_type", "yearOfRegistration":"registration_year", "monthOfRegistration":"registration_month", "fuelType":"fuel_type", "notRepairedDamage":"unrepaired_damage", "dateCreated":"ad_created", "nrOfPictures":"pic_num", "postalCode":"postal_code", "lastSeen":"last_seen"}, axis=1, inplace=True) autos.columns

autos.head()

autos["price"]=autos["price"].str.replace("$", "") autos["price"]=autos["price"].str.replace(",", "").astype(float)

#print(autos["price"]) print(autos["price"].dtype)

#autos["price"]="{:,}".format(autos["price"]) #print(autos["price"])

autos.rename({"price":"price_dollar"}, axis=1, inplace=True) print(autos["price_dollar"])

print(autos['price_dollar'].describe())

pd.options.display.float_format = '{:,.3f}'.format

#rechecking price column: print(autos['price_dollar'].describe())

autos["odometer"]=autos["odometer"].str.replace("km", "") autos["odometer"]=autos["odometer"].str.replace(",", "").astype(float) #print(autos["odometer"]) print(autos["odometer"].dtype)

autos.rename({"odometer":"odometer_km"}, axis=1, inplace=True) print(autos["odometer_km"])

import datetime

autos[['date_crawled','ad_created','last_seen','registration_year','registration_month']][0:5]

#(autos['date_crawled'].str[:10]).value_counts(dropna=False).sort_index() autos['date_crawled']=pd.to_datetime(autos['date_crawled'], format='%Y-%m-%d %H:%M:%S')

#(autos['ad_created'].str[:10]).value_counts(dropna=False).sort_index() autos['ad_created']=pd.to_datetime(autos['ad_created'], format='%Y-%m-%d %H:%M:%S')

#(autos['last_seen'].str[:10]).value_counts(dropna=False).sort_index() autos['last_seen']=pd.to_datetime(autos['last_seen'], format='%Y-%m-%d %H:%M:%S')

autos['registration_year']=pd.to_datetime(autos['registration_year'], format='%Y',errors='ignore') #we cannot converter to datetime format while we have wrong information on this column

#as we have some rows without month information, we need to replace the zero by nan: #we can do it with the following: #autos.loc[autos['registration_month']==0, "registration_month"]=np.nan #or using error"coerce" autos['registration_month']=pd.to_datetime(autos['registration_month'], format='%m',errors='coerce').dt.month #we will not leave this column in datetime format because we want only see the month information, so it will be int84

autos.info()

autos.describe(include="datetime64", datetime_is_numeric=True)

autos.describe()

autos["price_dollar"].value_counts().sort_index()

autos["price_dollar"].value_counts().sort_index().head(25)

autos["price_dollar"].value_counts().sort_index(ascending=False).head(12)

autos.loc[autos["price_dollar"]<100,("price_dollar","registration_year")]

# Creating the empty dictionary price_dist = {'p_0':0,'p_until_50':0,'p_100_1000':0, 'p_1000_10000':0, 'p10000_350000':0, 'p_more_350000':0} # Populating it for row in autos['price_dollar']: if row==0: price_dist['p_0'] += 1 elif row <= 50: price_dist['p_until_50'] += 1 elif 100 < row <= 1000: price_dist['p_100_1000'] += 1 elif 1000 < row <= 10000: price_dist['p_1000_10000'] += 1 elif 10000 < row <= 350000: price_dist['p10000_350000'] += 1 else: price_dist['p_more_350000'] += 1 print(price_dist)

print("percentage of price lower than 50 is :{:.2f}%".format(((price_dist["p_0"]+price_dist["p_until_50"])/50000)*100)) print("percentage of price higher than 350000 is :{:.2f}%".format(((price_dist["p_more_350000"])/50000)*100))

autos["registration_year"].value_counts(dropna=False).sort_index().head(10)

autos["registration_year"].value_counts(dropna=False).sort_index(ascending=False).head(20)

#autos["registration_month"].dt.month.value_counts() autos["registration_month"].value_counts(dropna=False)

autos["postal_code"].value_counts().sort_index().head(10)

autos["odometer_km"].value_counts().sort_index()

#autos.describe(include="all") autos.describe(include="object")

print(autos["offer_type"].value_counts(dropna=False))

print(autos["seller"].value_counts(dropna=False))

print(autos["abtest"].value_counts(dropna=False))

print(autos["gearbox"].value_counts(dropna=False))

print(autos["unrepaired_damage"].value_counts(dropna=False))

print(autos["vehicle_type"].value_counts(dropna=False))

print(autos["fuel_type"].value_counts(dropna=False))

dic_gearbox={"manuell":"manual", "automatik":"automatic"} autos["gearbox"]=autos["gearbox"].map(dic_gearbox) print(autos["gearbox"].value_counts(dropna=False))

dic_unrepaired={"nein":"no", "ja":"yes"} autos["unrepaired_damage"]=autos["unrepaired_damage"].map(dic_unrepaired) print(autos["unrepaired_damage"].value_counts(dropna=False))

dic_vehicle_type=({ "limousine":"sedan", "kleinwagen":"small car", "kombi":"station wagon", "NaN":"NaN", "bus":"bus", "cabrio":"convertible", "coupe":"coupe", "suv":"suv", "andere":"other"}) autos["vehicle_type"]=autos["vehicle_type"].map(dic_vehicle_type) print(autos["vehicle_type"].value_counts(dropna=False))

dic_fuel_type=({ "benzin":"petrol", "diesel":"diesel", "NaN":"NaN", "lpg":"lpg", "cng":"cng", "hybrid":"hybrid", "andere":"other", "elektro":"electro"}) autos["fuel_type"]=autos["fuel_type"].map(dic_fuel_type) print(autos["fuel_type"].value_counts(dropna=False))

#autos["model"]=autos["name"].str.split("_").str[1] #autos["model"]

print("Dataset original shape:") print(autos.shape)

#from price_dollar- as we conclude we will remove some rows #autos_select=autos[autos["price_dollar"]==0].remove() autos=autos.loc[autos["price_dollar"].between(50,350000)] print("Shape after delete outliers of price:") print(autos.shape)

#from odometer_km - based on results that we got, we will not remove any rows #from registration_year - basde on our previous conclusions we will remove some rows: autos=autos.loc[autos["registration_year"].between(1910,2016)] print("Shape after delete outliers of registration_year:") print(autos.shape)

#autos_select["price_dollar"].value_counts(normalize=True).sort_index() autos.describe(include="all", datetime_is_numeric=True)

# Checking the unique brand values print("What are the uniques brandes in our dataset: \n", autos["brand"].unique(),"\n") print("How many brands do we have in our dataset? \n", autos["brand"].unique().shape[0])

print("TOP 15 of quantity of cars per brand:") print(autos["brand"].value_counts().head(15)) print("\n") print("TOP 15 percentage of cars per brand:") print((((autos["brand"].value_counts())/autos["brand"].shape)*100).head(15))

freq_perc=((autos["brand"].value_counts()/autos["brand"].shape)*100)

print(freq_perc["jaguar"])

#creating empty dictionary to strore our aggregate data brand_avg_price={} #populating our dictionary with average price per brand (because we think it can be useful, we will also add the brand freq) for each in autos["brand"].unique(): avg_price=autos.loc[autos["brand"]==each, "price_dollar"].mean() brand_avg_price[each]=[avg_price,freq_perc[each]] print(brand_avg_price)

import json print(json.dumps(brand_avg_price, indent=4))

#Transform the dictionary in the dataframe: df_brand_price=pd.DataFrame.from_dict(brand_avg_price,orient="index", columns=["mean_price_dol", "freq_percentage"]) #if we don't include the columns argument in DataFrame constructor, we can put the label in the columns in the following way: #df_brand_price.rename({0:"mean_price_dol", 1:"freq_percentage"},axis=1, inplace=True) print("TOP 5 cheaper brands:") df_brand_price.sort_values(by=["mean_price_dol"]).head(5)

print("TOP 5 more expensive brands:") df_brand_price.sort_values(by=["mean_price_dol"], ascending=False).head(5)

df_brand_price[freq_perc>5].sort_values(by=["mean_price_dol"], ascending=False)

print("How can we read the values in this table?: \n") print("The {:.2f}% of cars in this dataset are from volkswagen and its price in average is {:.2f}$.".format(df_brand_price.loc['volkswagen','freq_percentage'],df_brand_price.loc['volkswagen','mean_price_dol'] )) print("The {:.2f}% of cars in this dataset are from opel and its price in average is {:.2f}$.".format(df_brand_price.loc['opel','freq_percentage'],df_brand_price.loc['opel','mean_price_dol'] ))

#creating empty dictionary to strore our aggregate data brand_avg_odom={} #populating our dictionary with average odemeter per brand for each in autos["brand"].unique(): avg_odom=autos.loc[autos["brand"]==each, "odometer_km"].mean() brand_avg_odom[each]=avg_odom #print(brand_avg_odom) #converting dictionary in series and dataframe using serie and dataframe constructor: s_brand_odom=pd.Series(brand_avg_odom) df_brand_odom=pd.DataFrame(s_brand_odom, columns=["mean_odometer_km"]) #print(df_brand_odom) print("TOP 5 brands with km:") df_brand_odom.sort_values(by=["mean_odometer_km"]).head(5)

#As we think it can be useful check odometer information comparing with price and brand frequence, we will agregate the new values to our previous dataframe df_brand_price_odom=df_brand_price.copy() df_brand_price_odom["mean_odometer_km"]=df_brand_odom df_brand_price_odom.head(5)

print("TOP 5 brands with less km:") df_brand_price_odom.sort_values(by=["mean_odometer_km"], ascending=False).head(5)

print("Brands with more than 5% of frequence:") df_brand_price_odom[freq_perc>5].sort_values(by=["freq_percentage"], ascending=False)

print(autos['odometer_km'].describe())

price_by_odometer = {'odom_until_10k':0,'odom_10k_30k':0, 'odom_30k_50k':0, 'odom_50k_75k':0,'odom_75k_100k':0,'odom_100k_140k':0, 'odom_more_140k':0} # Populating it for each in autos['odometer_km']: if each<=10000: price_by_odometer['odom_until_10k'] += 1 elif 10000 < each <= 30000: price_by_odometer['odom_10k_30k'] += 1 elif 30000 < each <= 50000: price_by_odometer['odom_30k_50k'] += 1 elif 30000 < each <= 75000: price_by_odometer['odom_50k_75k'] += 1 elif 10000 < each <= 100000: price_by_odometer['odom_75k_100k'] += 1 elif 10000 < each <= 140000: price_by_odometer['odom_100k_140k'] += 1 else: price_by_odometer['odom_more_140k'] += 1 price_by_odometer df_price_by_odometer=pd.DataFrame.from_dict(price_by_odometer, orient="index", columns=["freq_odom_group"]) df_price_by_odometer

df_price_by_odometer["mean_price"]=0

df_price_by_odometer

df_price_by_odometer.iloc[0,1]=autos.loc[autos["odometer_km"] <=10000, "price_dollar"].mean() df_price_by_odometer.iloc[1,1]=autos.loc[autos["odometer_km"] <=30000, "price_dollar"].mean() df_price_by_odometer.iloc[2,1]=autos.loc[autos["odometer_km"] <=50000, "price_dollar"].mean() df_price_by_odometer.iloc[3,1]=autos.loc[autos["odometer_km"] <=75000, "price_dollar"].mean() df_price_by_odometer.iloc[4,1]=autos.loc[autos["odometer_km"] <=100000, "price_dollar"].mean() df_price_by_odometer.iloc[5,1]=autos.loc[autos["odometer_km"] <=140000, "price_dollar"].mean() df_price_by_odometer.iloc[6,1]=autos.loc[autos["odometer_km"] >140000, "price_dollar"].mean()

df_price_by_odometer

#creating empty dictionary to strore our aggregate data brand_model={} #populating our dictionary with average odemeter per brand for each in autos["brand"].unique(): brand_filter=autos.loc[autos["brand"]==each,"model"] model_freq=brand_filter.value_counts().head(3) #for each brand we will have the most common 3 models model_dic= model_freq.to_dict() brand_model[each]=model_dic s_brand_model=pd.Series(brand_model) df_brand_model=pd.DataFrame(s_brand_model, columns=["model"]) df_brand_model["freq_percentage"]=freq_perc #we will add the frequence of each brand df_brand_model[freq_perc>5].sort_values(by=["freq_percentage"], ascending=False)

"The cars with deffect not repaired cost in average {:,.2f}$, while the other cost in average {:,.2f}$ ".format(autos.loc[autos["unrepaired_damage"]=="yes", "price_dollar"].mean(),autos.loc[autos["unrepaired_damage"]=="no", "price_dollar"].mean())

"The cars with automatic gearbox cost in average {:,.2f}$, while the manual gearbox cost in average {:,.2f}$ ".format(autos.loc[autos["gearbox"]=="automatic", "price_dollar"].mean(),autos.loc[autos["gearbox"]=="manual", "price_dollar"].mean())

#creating empty dictionary to strore our aggregate data vehicle_avg_price={} #populating our dictionary with average price per brand (because we think it can be useful, we will also add the brand freq) for each in autos["vehicle_type"].unique(): avg_price=autos.loc[autos["vehicle_type"]==each, "price_dollar"].mean() vehicle_avg_price[each]=float(avg_price) #print(vehicle_avg_price) print(json.dumps(vehicle_avg_price, indent=4))