import pandas as pd
import numpy as np
autos=pd.read_csv("autos.csv")
autos
autos.head()
autos.info()
#list of columns names:
autos.columns
#reanme following columns:
autos.rename({
"dateCrawled":"date_crawled",
"offerType":"offer_type",
"vehicleType":"vehicle_type",
"yearOfRegistration":"registration_year",
"monthOfRegistration":"registration_month",
"fuelType":"fuel_type",
"notRepairedDamage":"unrepaired_damage",
"dateCreated":"ad_created",
"nrOfPictures":"pic_num",
"postalCode":"postal_code",
"lastSeen":"last_seen"},
axis=1, inplace=True)
autos.columns
autos.head()
autos["price"]=autos["price"].str.replace("$", "")
autos["price"]=autos["price"].str.replace(",", "").astype(float)
#print(autos["price"])
print(autos["price"].dtype)
#autos["price"]="{:,}".format(autos["price"])
#print(autos["price"])
autos.rename({"price":"price_dollar"}, axis=1, inplace=True)
print(autos["price_dollar"])
print(autos['price_dollar'].describe())
pd.options.display.float_format = '{:,.3f}'.format
#rechecking price column:
print(autos['price_dollar'].describe())
autos["odometer"]=autos["odometer"].str.replace("km", "")
autos["odometer"]=autos["odometer"].str.replace(",", "").astype(float)
#print(autos["odometer"])
print(autos["odometer"].dtype)
autos.rename({"odometer":"odometer_km"}, axis=1, inplace=True)
print(autos["odometer_km"])
import datetime
autos[['date_crawled','ad_created','last_seen','registration_year','registration_month']][0:5]
#(autos['date_crawled'].str[:10]).value_counts(dropna=False).sort_index()
autos['date_crawled']=pd.to_datetime(autos['date_crawled'], format='%Y-%m-%d %H:%M:%S')
#(autos['ad_created'].str[:10]).value_counts(dropna=False).sort_index()
autos['ad_created']=pd.to_datetime(autos['ad_created'], format='%Y-%m-%d %H:%M:%S')
#(autos['last_seen'].str[:10]).value_counts(dropna=False).sort_index()
autos['last_seen']=pd.to_datetime(autos['last_seen'], format='%Y-%m-%d %H:%M:%S')
autos['registration_year']=pd.to_datetime(autos['registration_year'], format='%Y',errors='ignore')
#we cannot converter to datetime format while we have wrong information on this column
#as we have some rows without month information, we need to replace the zero by nan:
#we can do it with the following:
#autos.loc[autos['registration_month']==0, "registration_month"]=np.nan
#or using error"coerce"
autos['registration_month']=pd.to_datetime(autos['registration_month'], format='%m',errors='coerce').dt.month
#we will not leave this column in datetime format because we want only see the month information, so it will be int84
autos.info()
autos.describe(include="datetime64", datetime_is_numeric=True)
autos.describe()
autos["price_dollar"].value_counts().sort_index()
autos["price_dollar"].value_counts().sort_index().head(25)
autos["price_dollar"].value_counts().sort_index(ascending=False).head(12)
autos.loc[autos["price_dollar"]<100,("price_dollar","registration_year")]
# Creating the empty dictionary
price_dist = {'p_0':0,'p_until_50':0,'p_100_1000':0, 'p_1000_10000':0, 'p10000_350000':0,
'p_more_350000':0}
# Populating it
for row in autos['price_dollar']:
if row==0:
price_dist['p_0'] += 1
elif row <= 50:
price_dist['p_until_50'] += 1
elif 100 < row <= 1000:
price_dist['p_100_1000'] += 1
elif 1000 < row <= 10000:
price_dist['p_1000_10000'] += 1
elif 10000 < row <= 350000:
price_dist['p10000_350000'] += 1
else:
price_dist['p_more_350000'] += 1
print(price_dist)
print("percentage of price lower than 50 is :{:.2f}%".format(((price_dist["p_0"]+price_dist["p_until_50"])/50000)*100))
print("percentage of price higher than 350000 is :{:.2f}%".format(((price_dist["p_more_350000"])/50000)*100))
autos["registration_year"].value_counts(dropna=False).sort_index().head(10)
autos["registration_year"].value_counts(dropna=False).sort_index(ascending=False).head(20)
#autos["registration_month"].dt.month.value_counts()
autos["registration_month"].value_counts(dropna=False)
autos["postal_code"].value_counts().sort_index().head(10)
autos["odometer_km"].value_counts().sort_index()
#autos.describe(include="all")
autos.describe(include="object")
print(autos["offer_type"].value_counts(dropna=False))
print(autos["seller"].value_counts(dropna=False))
print(autos["abtest"].value_counts(dropna=False))
print(autos["gearbox"].value_counts(dropna=False))
print(autos["unrepaired_damage"].value_counts(dropna=False))
print(autos["vehicle_type"].value_counts(dropna=False))
print(autos["fuel_type"].value_counts(dropna=False))
dic_gearbox={"manuell":"manual", "automatik":"automatic"}
autos["gearbox"]=autos["gearbox"].map(dic_gearbox)
print(autos["gearbox"].value_counts(dropna=False))
dic_unrepaired={"nein":"no", "ja":"yes"}
autos["unrepaired_damage"]=autos["unrepaired_damage"].map(dic_unrepaired)
print(autos["unrepaired_damage"].value_counts(dropna=False))
dic_vehicle_type=({
"limousine":"sedan",
"kleinwagen":"small car",
"kombi":"station wagon",
"NaN":"NaN",
"bus":"bus",
"cabrio":"convertible",
"coupe":"coupe",
"suv":"suv",
"andere":"other"})
autos["vehicle_type"]=autos["vehicle_type"].map(dic_vehicle_type)
print(autos["vehicle_type"].value_counts(dropna=False))
dic_fuel_type=({
"benzin":"petrol",
"diesel":"diesel",
"NaN":"NaN",
"lpg":"lpg",
"cng":"cng",
"hybrid":"hybrid",
"andere":"other",
"elektro":"electro"})
autos["fuel_type"]=autos["fuel_type"].map(dic_fuel_type)
print(autos["fuel_type"].value_counts(dropna=False))
#autos["model"]=autos["name"].str.split("_").str[1]
#autos["model"]
print("Dataset original shape:")
print(autos.shape)
#from price_dollar- as we conclude we will remove some rows
#autos_select=autos[autos["price_dollar"]==0].remove()
autos=autos.loc[autos["price_dollar"].between(50,350000)]
print("Shape after delete outliers of price:")
print(autos.shape)
#from odometer_km - based on results that we got, we will not remove any rows
#from registration_year - basde on our previous conclusions we will remove some rows:
autos=autos.loc[autos["registration_year"].between(1910,2016)]
print("Shape after delete outliers of registration_year:")
print(autos.shape)
#autos_select["price_dollar"].value_counts(normalize=True).sort_index()
autos.describe(include="all", datetime_is_numeric=True)
# Checking the unique brand values
print("What are the uniques brandes in our dataset: \n", autos["brand"].unique(),"\n")
print("How many brands do we have in our dataset? \n", autos["brand"].unique().shape[0])
print("TOP 15 of quantity of cars per brand:")
print(autos["brand"].value_counts().head(15))
print("\n")
print("TOP 15 percentage of cars per brand:")
print((((autos["brand"].value_counts())/autos["brand"].shape)*100).head(15))
freq_perc=((autos["brand"].value_counts()/autos["brand"].shape)*100)
print(freq_perc["jaguar"])
#creating empty dictionary to strore our aggregate data
brand_avg_price={}
#populating our dictionary with average price per brand (because we think it can be useful, we will also add the brand freq)
for each in autos["brand"].unique():
avg_price=autos.loc[autos["brand"]==each, "price_dollar"].mean()
brand_avg_price[each]=[avg_price,freq_perc[each]]
print(brand_avg_price)
import json
print(json.dumps(brand_avg_price, indent=4))
#Transform the dictionary in the dataframe:
df_brand_price=pd.DataFrame.from_dict(brand_avg_price,orient="index", columns=["mean_price_dol", "freq_percentage"])
#if we don't include the columns argument in DataFrame constructor, we can put the label in the columns in the following way:
#df_brand_price.rename({0:"mean_price_dol", 1:"freq_percentage"},axis=1, inplace=True)
print("TOP 5 cheaper brands:")
df_brand_price.sort_values(by=["mean_price_dol"]).head(5)
print("TOP 5 more expensive brands:")
df_brand_price.sort_values(by=["mean_price_dol"], ascending=False).head(5)
df_brand_price[freq_perc>5].sort_values(by=["mean_price_dol"], ascending=False)
print("How can we read the values in this table?: \n")
print("The {:.2f}% of cars in this dataset are from volkswagen and its price in average is {:.2f}$.".format(df_brand_price.loc['volkswagen','freq_percentage'],df_brand_price.loc['volkswagen','mean_price_dol'] ))
print("The {:.2f}% of cars in this dataset are from opel and its price in average is {:.2f}$.".format(df_brand_price.loc['opel','freq_percentage'],df_brand_price.loc['opel','mean_price_dol'] ))
#creating empty dictionary to strore our aggregate data
brand_avg_odom={}
#populating our dictionary with average odemeter per brand
for each in autos["brand"].unique():
avg_odom=autos.loc[autos["brand"]==each, "odometer_km"].mean()
brand_avg_odom[each]=avg_odom
#print(brand_avg_odom)
#converting dictionary in series and dataframe using serie and dataframe constructor:
s_brand_odom=pd.Series(brand_avg_odom)
df_brand_odom=pd.DataFrame(s_brand_odom, columns=["mean_odometer_km"])
#print(df_brand_odom)
print("TOP 5 brands with km:")
df_brand_odom.sort_values(by=["mean_odometer_km"]).head(5)
#As we think it can be useful check odometer information comparing with price and brand frequence, we will agregate the new values to our previous dataframe
df_brand_price_odom=df_brand_price.copy()
df_brand_price_odom["mean_odometer_km"]=df_brand_odom
df_brand_price_odom.head(5)
print("TOP 5 brands with less km:")
df_brand_price_odom.sort_values(by=["mean_odometer_km"], ascending=False).head(5)
print("Brands with more than 5% of frequence:")
df_brand_price_odom[freq_perc>5].sort_values(by=["freq_percentage"], ascending=False)
print(autos['odometer_km'].describe())
price_by_odometer = {'odom_until_10k':0,'odom_10k_30k':0, 'odom_30k_50k':0, 'odom_50k_75k':0,'odom_75k_100k':0,'odom_100k_140k':0,
'odom_more_140k':0}
# Populating it
for each in autos['odometer_km']:
if each<=10000:
price_by_odometer['odom_until_10k'] += 1
elif 10000 < each <= 30000:
price_by_odometer['odom_10k_30k'] += 1
elif 30000 < each <= 50000:
price_by_odometer['odom_30k_50k'] += 1
elif 30000 < each <= 75000:
price_by_odometer['odom_50k_75k'] += 1
elif 10000 < each <= 100000:
price_by_odometer['odom_75k_100k'] += 1
elif 10000 < each <= 140000:
price_by_odometer['odom_100k_140k'] += 1
else:
price_by_odometer['odom_more_140k'] += 1
price_by_odometer
df_price_by_odometer=pd.DataFrame.from_dict(price_by_odometer, orient="index", columns=["freq_odom_group"])
df_price_by_odometer
df_price_by_odometer["mean_price"]=0
df_price_by_odometer
df_price_by_odometer.iloc[0,1]=autos.loc[autos["odometer_km"] <=10000, "price_dollar"].mean()
df_price_by_odometer.iloc[1,1]=autos.loc[autos["odometer_km"] <=30000, "price_dollar"].mean()
df_price_by_odometer.iloc[2,1]=autos.loc[autos["odometer_km"] <=50000, "price_dollar"].mean()
df_price_by_odometer.iloc[3,1]=autos.loc[autos["odometer_km"] <=75000, "price_dollar"].mean()
df_price_by_odometer.iloc[4,1]=autos.loc[autos["odometer_km"] <=100000, "price_dollar"].mean()
df_price_by_odometer.iloc[5,1]=autos.loc[autos["odometer_km"] <=140000, "price_dollar"].mean()
df_price_by_odometer.iloc[6,1]=autos.loc[autos["odometer_km"] >140000, "price_dollar"].mean()
df_price_by_odometer
#creating empty dictionary to strore our aggregate data
brand_model={}
#populating our dictionary with average odemeter per brand
for each in autos["brand"].unique():
brand_filter=autos.loc[autos["brand"]==each,"model"]
model_freq=brand_filter.value_counts().head(3) #for each brand we will have the most common 3 models
model_dic= model_freq.to_dict()
brand_model[each]=model_dic
s_brand_model=pd.Series(brand_model)
df_brand_model=pd.DataFrame(s_brand_model, columns=["model"])
df_brand_model["freq_percentage"]=freq_perc #we will add the frequence of each brand
df_brand_model[freq_perc>5].sort_values(by=["freq_percentage"], ascending=False)
"The cars with deffect not repaired cost in average {:,.2f}$, while the other cost in average {:,.2f}$ ".format(autos.loc[autos["unrepaired_damage"]=="yes", "price_dollar"].mean(),autos.loc[autos["unrepaired_damage"]=="no", "price_dollar"].mean())
"The cars with automatic gearbox cost in average {:,.2f}$, while the manual gearbox cost in average {:,.2f}$ ".format(autos.loc[autos["gearbox"]=="automatic", "price_dollar"].mean(),autos.loc[autos["gearbox"]=="manual", "price_dollar"].mean())
#creating empty dictionary to strore our aggregate data
vehicle_avg_price={}
#populating our dictionary with average price per brand (because we think it can be useful, we will also add the brand freq)
for each in autos["vehicle_type"].unique():
avg_price=autos.loc[autos["vehicle_type"]==each, "price_dollar"].mean()
vehicle_avg_price[each]=float(avg_price)
#print(vehicle_avg_price)
print(json.dumps(vehicle_avg_price, indent=4))