import pandas as pd
import numpy as np
autos=pd.read_csv("autos.csv")
autos
autos.head()
autos.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 dateCrawled 50000 non-null object
1 name 50000 non-null object
2 seller 50000 non-null object
3 offerType 50000 non-null object
4 price 50000 non-null object
5 abtest 50000 non-null object
6 vehicleType 44905 non-null object
7 yearOfRegistration 50000 non-null int64
8 gearbox 47320 non-null object
9 powerPS 50000 non-null int64
10 model 47242 non-null object
11 odometer 50000 non-null object
12 monthOfRegistration 50000 non-null int64
13 fuelType 45518 non-null object
14 brand 50000 non-null object
15 notRepairedDamage 40171 non-null object
16 dateCreated 50000 non-null object
17 nrOfPictures 50000 non-null int64
18 postalCode 50000 non-null int64
19 lastSeen 50000 non-null object
dtypes: int64(5), object(15)
memory usage: 7.6+ MB
#list of columns names:
autos.columns
#reanme following columns:
autos.rename({
"dateCrawled":"date_crawled",
"offerType":"offer_type",
"vehicleType":"vehicle_type",
"yearOfRegistration":"registration_year",
"monthOfRegistration":"registration_month",
"fuelType":"fuel_type",
"notRepairedDamage":"unrepaired_damage",
"dateCreated":"ad_created",
"nrOfPictures":"pic_num",
"postalCode":"postal_code",
"lastSeen":"last_seen"},
axis=1, inplace=True)
autos.columns
autos.head()
autos["price"]=autos["price"].str.replace("$", "")
autos["price"]=autos["price"].str.replace(",", "").astype(float)
<ipython-input-9-48ac3c5553ea>:1: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.
autos["price"]=autos["price"].str.replace("$", "")
#print(autos["price"])
print(autos["price"].dtype)
float64
#autos["price"]="{:,}".format(autos["price"])
#print(autos["price"])
autos.rename({"price":"price_dollar"}, axis=1, inplace=True)
print(autos["price_dollar"])
0 5000.0
1 8500.0
2 8990.0
3 4350.0
4 1350.0
...
49995 24900.0
49996 1980.0
49997 13200.0
49998 22900.0
49999 1250.0
Name: price_dollar, Length: 50000, dtype: float64
print(autos['price_dollar'].describe())
count 5.000000e+04
mean 9.840044e+03
std 4.811044e+05
min 0.000000e+00
25% 1.100000e+03
50% 2.950000e+03
75% 7.200000e+03
max 1.000000e+08
Name: price_dollar, dtype: float64
pd.options.display.float_format = '{:,.3f}'.format
#rechecking price column:
print(autos['price_dollar'].describe())
count 50,000.000
mean 9,840.044
std 481,104.381
min 0.000
25% 1,100.000
50% 2,950.000
75% 7,200.000
max 99,999,999.000
Name: price_dollar, dtype: float64
autos["odometer"]=autos["odometer"].str.replace("km", "")
autos["odometer"]=autos["odometer"].str.replace(",", "").astype(float)
#print(autos["odometer"])
print(autos["odometer"].dtype)
float64
autos.rename({"odometer":"odometer_km"}, axis=1, inplace=True)
print(autos["odometer_km"])
0 150,000.000
1 150,000.000
2 70,000.000
3 70,000.000
4 150,000.000
...
49995 100,000.000
49996 150,000.000
49997 5,000.000
49998 40,000.000
49999 150,000.000
Name: odometer_km, Length: 50000, dtype: float64
import datetime
autos[['date_crawled','ad_created','last_seen','registration_year','registration_month']][0:5]
#(autos['date_crawled'].str[:10]).value_counts(dropna=False).sort_index()
autos['date_crawled']=pd.to_datetime(autos['date_crawled'], format='%Y-%m-%d %H:%M:%S')
#(autos['ad_created'].str[:10]).value_counts(dropna=False).sort_index()
autos['ad_created']=pd.to_datetime(autos['ad_created'], format='%Y-%m-%d %H:%M:%S')
#(autos['last_seen'].str[:10]).value_counts(dropna=False).sort_index()
autos['last_seen']=pd.to_datetime(autos['last_seen'], format='%Y-%m-%d %H:%M:%S')
autos['registration_year']=pd.to_datetime(autos['registration_year'], format='%Y',errors='ignore')
#we cannot converter to datetime format while we have wrong information on this column
#as we have some rows without month information, we need to replace the zero by nan:
#we can do it with the following:
#autos.loc[autos['registration_month']==0, "registration_month"]=np.nan
#or using error"coerce"
autos['registration_month']=pd.to_datetime(autos['registration_month'], format='%m',errors='coerce').dt.month
#we will not leave this column in datetime format because we want only see the month information, so it will be int84
autos.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 date_crawled 50000 non-null datetime64[ns]
1 name 50000 non-null object
2 seller 50000 non-null object
3 offer_type 50000 non-null object
4 price_dollar 50000 non-null float64
5 abtest 50000 non-null object
6 vehicle_type 44905 non-null object
7 registration_year 50000 non-null int64
8 gearbox 47320 non-null object
9 powerPS 50000 non-null int64
10 model 47242 non-null object
11 odometer_km 50000 non-null float64
12 registration_month 44925 non-null float64
13 fuel_type 45518 non-null object
14 brand 50000 non-null object
15 unrepaired_damage 40171 non-null object
16 ad_created 50000 non-null datetime64[ns]
17 pic_num 50000 non-null int64
18 postal_code 50000 non-null int64
19 last_seen 50000 non-null datetime64[ns]
dtypes: datetime64[ns](3), float64(3), int64(4), object(10)
memory usage: 7.6+ MB
autos.describe(include="datetime64", datetime_is_numeric=True)
autos.describe()
autos["price_dollar"].value_counts().sort_index()
autos["price_dollar"].value_counts().sort_index().head(25)
autos["price_dollar"].value_counts().sort_index(ascending=False).head(12)
autos.loc[autos["price_dollar"]<100,("price_dollar","registration_year")]
# Creating the empty dictionary
price_dist = {'p_0':0,'p_until_50':0,'p_100_1000':0, 'p_1000_10000':0, 'p10000_350000':0,
'p_more_350000':0}
# Populating it
for row in autos['price_dollar']:
if row==0:
price_dist['p_0'] += 1
elif row <= 50:
price_dist['p_until_50'] += 1
elif 100 < row <= 1000:
price_dist['p_100_1000'] += 1
elif 1000 < row <= 10000:
price_dist['p_1000_10000'] += 1
elif 10000 < row <= 350000:
price_dist['p10000_350000'] += 1
else:
price_dist['p_more_350000'] += 1
print(price_dist)
{'p_0': 1421, 'p_until_50': 267, 'p_100_1000': 10103, 'p_1000_10000': 29928, 'p10000_350000': 8059, 'p_more_350000': 222}
print("percentage of price lower than 50 is :{:.2f}%".format(((price_dist["p_0"]+price_dist["p_until_50"])/50000)*100))
print("percentage of price higher than 350000 is :{:.2f}%".format(((price_dist["p_more_350000"])/50000)*100))
percentage of price lower than 50 is :3.38%
percentage of price higher than 350000 is :0.44%
autos["registration_year"].value_counts(dropna=False).sort_index().head(10)
autos["registration_year"].value_counts(dropna=False).sort_index(ascending=False).head(20)
#autos["registration_month"].dt.month.value_counts()
autos["registration_month"].value_counts(dropna=False)
autos["postal_code"].value_counts().sort_index().head(10)
autos["odometer_km"].value_counts().sort_index()
#autos.describe(include="all")
autos.describe(include="object")
print(autos["offer_type"].value_counts(dropna=False))
Angebot 49999
Gesuch 1
Name: offer_type, dtype: int64
print(autos["seller"].value_counts(dropna=False))
privat 49999
gewerblich 1
Name: seller, dtype: int64
print(autos["abtest"].value_counts(dropna=False))
test 25756
control 24244
Name: abtest, dtype: int64
print(autos["gearbox"].value_counts(dropna=False))
manuell 36993
automatik 10327
NaN 2680
Name: gearbox, dtype: int64
print(autos["unrepaired_damage"].value_counts(dropna=False))
nein 35232
NaN 9829
ja 4939
Name: unrepaired_damage, dtype: int64
print(autos["vehicle_type"].value_counts(dropna=False))
limousine 12859
kleinwagen 10822
kombi 9127
NaN 5095
bus 4093
cabrio 3061
coupe 2537
suv 1986
andere 420
Name: vehicle_type, dtype: int64
print(autos["fuel_type"].value_counts(dropna=False))
benzin 30107
diesel 14567
NaN 4482
lpg 691
cng 75
hybrid 37
andere 22
elektro 19
Name: fuel_type, dtype: int64
dic_gearbox={"manuell":"manual", "automatik":"automatic"}
autos["gearbox"]=autos["gearbox"].map(dic_gearbox)
print(autos["gearbox"].value_counts(dropna=False))
manual 36993
automatic 10327
NaN 2680
Name: gearbox, dtype: int64
dic_unrepaired={"nein":"no", "ja":"yes"}
autos["unrepaired_damage"]=autos["unrepaired_damage"].map(dic_unrepaired)
print(autos["unrepaired_damage"].value_counts(dropna=False))
no 35232
NaN 9829
yes 4939
Name: unrepaired_damage, dtype: int64
dic_vehicle_type=({
"limousine":"sedan",
"kleinwagen":"small car",
"kombi":"station wagon",
"NaN":"NaN",
"bus":"bus",
"cabrio":"convertible",
"coupe":"coupe",
"suv":"suv",
"andere":"other"})
autos["vehicle_type"]=autos["vehicle_type"].map(dic_vehicle_type)
print(autos["vehicle_type"].value_counts(dropna=False))
sedan 12859
small car 10822
station wagon 9127
NaN 5095
bus 4093
convertible 3061
coupe 2537
suv 1986
other 420
Name: vehicle_type, dtype: int64
dic_fuel_type=({
"benzin":"petrol",
"diesel":"diesel",
"NaN":"NaN",
"lpg":"lpg",
"cng":"cng",
"hybrid":"hybrid",
"andere":"other",
"elektro":"electro"})
autos["fuel_type"]=autos["fuel_type"].map(dic_fuel_type)
print(autos["fuel_type"].value_counts(dropna=False))
petrol 30107
diesel 14567
NaN 4482
lpg 691
cng 75
hybrid 37
other 22
electro 19
Name: fuel_type, dtype: int64
#autos["model"]=autos["name"].str.split("_").str[1]
#autos["model"]
print("Dataset original shape:")
print(autos.shape)
Dataset original shape:
(50000, 20)
#from price_dollar- as we conclude we will remove some rows
#autos_select=autos[autos["price_dollar"]==0].remove()
autos=autos.loc[autos["price_dollar"].between(50,350000)]
print("Shape after delete outliers of price:")
print(autos.shape)
Shape after delete outliers of price:
(48347, 20)
#from odometer_km - based on results that we got, we will not remove any rows
#from registration_year - basde on our previous conclusions we will remove some rows:
autos=autos.loc[autos["registration_year"].between(1910,2016)]
print("Shape after delete outliers of registration_year:")
print(autos.shape)
Shape after delete outliers of registration_year:
(46473, 20)
#autos_select["price_dollar"].value_counts(normalize=True).sort_index()
autos.describe(include="all", datetime_is_numeric=True)
# Checking the unique brand values
print("What are the uniques brandes in our dataset: \n", autos["brand"].unique(),"\n")
print("How many brands do we have in our dataset? \n", autos["brand"].unique().shape[0])
What are the uniques brandes in our dataset:
['peugeot' 'bmw' 'volkswagen' 'smart' 'ford' 'chrysler' 'seat' 'renault'
'mercedes_benz' 'audi' 'sonstige_autos' 'opel' 'mazda' 'porsche' 'mini'
'toyota' 'dacia' 'nissan' 'jeep' 'saab' 'volvo' 'mitsubishi' 'jaguar'
'fiat' 'skoda' 'subaru' 'kia' 'citroen' 'chevrolet' 'hyundai' 'honda'
'daewoo' 'suzuki' 'trabant' 'land_rover' 'alfa_romeo' 'lada' 'rover'
'daihatsu' 'lancia']
How many brands do we have in our dataset?
40
print("TOP 15 of quantity of cars per brand:")
print(autos["brand"].value_counts().head(15))
print("\n")
print("TOP 15 percentage of cars per brand:")
print((((autos["brand"].value_counts())/autos["brand"].shape)*100).head(15))
TOP 15 of quantity of cars per brand:
volkswagen 9821
bmw 5110
opel 4987
mercedes_benz 4485
audi 4025
ford 3252
renault 2193
peugeot 1390
fiat 1195
seat 848
skoda 763
nissan 712
mazda 709
smart 659
citroen 653
Name: brand, dtype: int64
TOP 15 percentage of cars per brand:
volkswagen 21.133
bmw 10.996
opel 10.731
mercedes_benz 9.651
audi 8.661
ford 6.998
renault 4.719
peugeot 2.991
fiat 2.571
seat 1.825
skoda 1.642
nissan 1.532
mazda 1.526
smart 1.418
citroen 1.405
Name: brand, dtype: float64
freq_perc=((autos["brand"].value_counts()/autos["brand"].shape)*100)
print(freq_perc["jaguar"])
0.15708045531814172
#creating empty dictionary to strore our aggregate data
brand_avg_price={}
#populating our dictionary with average price per brand (because we think it can be useful, we will also add the brand freq)
for each in autos["brand"].unique():
avg_price=autos.loc[autos["brand"]==each, "price_dollar"].mean()
brand_avg_price[each]=[avg_price,freq_perc[each]]
print(brand_avg_price)
{'peugeot': [3100.6928057553955, 2.9909840122221505], 'bmw': [8376.805088062622, 10.99563187226992], 'volkswagen': [5424.9185419000105, 21.132700707937943], 'smart': [3591.0212443095597, 1.4180276719815805], 'ford': [3762.1165436654364, 6.997611516364341], 'chrysler': [3465.743902439024, 0.35289307770103073], 'seat': [4423.116745283019, 1.824715426161427], 'renault': [2483.889192886457, 4.718869020721709], 'mercedes_benz': [8663.059308807135, 9.65076496029953], 'audi': [9373.771428571428, 8.66094291308932], 'sonstige_autos': [12642.134228187919, 0.9618488154412239], 'opel': [2996.0850210547424, 10.730962063994147], 'mazda': [4112.596614950635, 1.525617024939212], 'porsche': [46455.89323843416, 0.604652163621888], 'mini': [10639.450980392157, 0.8779291201342715], 'toyota': [5167.091062394604, 1.2760097260775074], 'dacia': [5915.528455284553, 0.264669808275773], 'nissan': [4750.063202247191, 1.5320723861166698], 'jeep': [11650.5, 0.22808942827017836], 'saab': [3211.6493506493507, 0.16568760355475223], 'volvo': [4993.208037825059, 0.9102059260215609], 'mitsubishi': [3430.25, 0.817679082477998], 'jaguar': [11635.493150684932, 0.15708045531814172], 'fiat': [2818.456066945607, 2.5713855356873885], 'skoda': [6393.018348623853, 1.6418135261334537], 'subaru': [4033.7551020408164, 0.21087513179695738], 'kia': [5982.330303030303, 0.7100897295203666], 'citroen': [3784.924961715161, 1.405116949626665], 'chevrolet': [6709.358490566037, 0.5702235706754459], 'hyundai': [5376.740899357602, 1.0048845566242766], 'honda': [4119.109589041096, 0.7854022765907086], 'daewoo': [1064.0579710144928, 0.14847330708153123], 'suzuki': [4111.572463768116, 0.5938932283261249], 'trabant': [1818.609375, 0.1377143717857681], 'land_rover': [19108.091836734693, 0.21087513179695738], 'alfa_romeo': [4100.915857605178, 0.6649022012781616], 'lada': [2688.296296296296, 0.05809825059712091], 'rover': [1602.2903225806451, 0.13341079766746283], 'daihatsu': [1636.1965811965813, 0.2517590859208573], 'lancia': [3444.877551020408, 0.10543756589847869]}
import json
print(json.dumps(brand_avg_price, indent=4))
{
"peugeot": [
3100.6928057553955,
2.9909840122221505
],
"bmw": [
8376.805088062622,
10.99563187226992
],
"volkswagen": [
5424.9185419000105,
21.132700707937943
],
"smart": [
3591.0212443095597,
1.4180276719815805
],
"ford": [
3762.1165436654364,
6.997611516364341
],
"chrysler": [
3465.743902439024,
0.35289307770103073
],
"seat": [
4423.116745283019,
1.824715426161427
],
"renault": [
2483.889192886457,
4.718869020721709
],
"mercedes_benz": [
8663.059308807135,
9.65076496029953
],
"audi": [
9373.771428571428,
8.66094291308932
],
"sonstige_autos": [
12642.134228187919,
0.9618488154412239
],
"opel": [
2996.0850210547424,
10.730962063994147
],
"mazda": [
4112.596614950635,
1.525617024939212
],
"porsche": [
46455.89323843416,
0.604652163621888
],
"mini": [
10639.450980392157,
0.8779291201342715
],
"toyota": [
5167.091062394604,
1.2760097260775074
],
"dacia": [
5915.528455284553,
0.264669808275773
],
"nissan": [
4750.063202247191,
1.5320723861166698
],
"jeep": [
11650.5,
0.22808942827017836
],
"saab": [
3211.6493506493507,
0.16568760355475223
],
"volvo": [
4993.208037825059,
0.9102059260215609
],
"mitsubishi": [
3430.25,
0.817679082477998
],
"jaguar": [
11635.493150684932,
0.15708045531814172
],
"fiat": [
2818.456066945607,
2.5713855356873885
],
"skoda": [
6393.018348623853,
1.6418135261334537
],
"subaru": [
4033.7551020408164,
0.21087513179695738
],
"kia": [
5982.330303030303,
0.7100897295203666
],
"citroen": [
3784.924961715161,
1.405116949626665
],
"chevrolet": [
6709.358490566037,
0.5702235706754459
],
"hyundai": [
5376.740899357602,
1.0048845566242766
],
"honda": [
4119.109589041096,
0.7854022765907086
],
"daewoo": [
1064.0579710144928,
0.14847330708153123
],
"suzuki": [
4111.572463768116,
0.5938932283261249
],
"trabant": [
1818.609375,
0.1377143717857681
],
"land_rover": [
19108.091836734693,
0.21087513179695738
],
"alfa_romeo": [
4100.915857605178,
0.6649022012781616
],
"lada": [
2688.296296296296,
0.05809825059712091
],
"rover": [
1602.2903225806451,
0.13341079766746283
],
"daihatsu": [
1636.1965811965813,
0.2517590859208573
],
"lancia": [
3444.877551020408,
0.10543756589847869
]
}
#Transform the dictionary in the dataframe:
df_brand_price=pd.DataFrame.from_dict(brand_avg_price,orient="index", columns=["mean_price_dol", "freq_percentage"])
#if we don't include the columns argument in DataFrame constructor, we can put the label in the columns in the following way:
#df_brand_price.rename({0:"mean_price_dol", 1:"freq_percentage"},axis=1, inplace=True)
print("TOP 5 cheaper brands:")
df_brand_price.sort_values(by=["mean_price_dol"]).head(5)
TOP 5 cheaper brands:
print("TOP 5 more expensive brands:")
df_brand_price.sort_values(by=["mean_price_dol"], ascending=False).head(5)
TOP 5 more expensive brands:
df_brand_price[freq_perc>5].sort_values(by=["mean_price_dol"], ascending=False)
<ipython-input-64-204e483a8890>:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
df_brand_price[freq_perc>5].sort_values(by=["mean_price_dol"], ascending=False)
print("How can we read the values in this table?: \n")
print("The {:.2f}% of cars in this dataset are from volkswagen and its price in average is {:.2f}$.".format(df_brand_price.loc['volkswagen','freq_percentage'],df_brand_price.loc['volkswagen','mean_price_dol'] ))
print("The {:.2f}% of cars in this dataset are from opel and its price in average is {:.2f}$.".format(df_brand_price.loc['opel','freq_percentage'],df_brand_price.loc['opel','mean_price_dol'] ))
How can we read the values in this table?:
The 21.13% of cars in this dataset are from volkswagen and its price in average is 5424.92$.
The 10.73% of cars in this dataset are from opel and its price in average is 2996.09$.
#creating empty dictionary to strore our aggregate data
brand_avg_odom={}
#populating our dictionary with average odemeter per brand
for each in autos["brand"].unique():
avg_odom=autos.loc[autos["brand"]==each, "odometer_km"].mean()
brand_avg_odom[each]=avg_odom
#print(brand_avg_odom)
#converting dictionary in series and dataframe using serie and dataframe constructor:
s_brand_odom=pd.Series(brand_avg_odom)
df_brand_odom=pd.DataFrame(s_brand_odom, columns=["mean_odometer_km"])
#print(df_brand_odom)
print("TOP 5 brands with km:")
df_brand_odom.sort_values(by=["mean_odometer_km"]).head(5)
TOP 5 brands with km:
#As we think it can be useful check odometer information comparing with price and brand frequence, we will agregate the new values to our previous dataframe
df_brand_price_odom=df_brand_price.copy()
df_brand_price_odom["mean_odometer_km"]=df_brand_odom
df_brand_price_odom.head(5)
print("TOP 5 brands with less km:")
df_brand_price_odom.sort_values(by=["mean_odometer_km"], ascending=False).head(5)
TOP 5 brands with less km:
print("Brands with more than 5% of frequence:")
df_brand_price_odom[freq_perc>5].sort_values(by=["freq_percentage"], ascending=False)
Brands with more than 5% of frequence:
<ipython-input-69-0ff77f17394f>:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
df_brand_price_odom[freq_perc>5].sort_values(by=["freq_percentage"], ascending=False)
print(autos['odometer_km'].describe())
count 46,473.000
mean 125,688.787
std 39,687.056
min 5,000.000
25% 100,000.000
50% 150,000.000
75% 150,000.000
max 150,000.000
Name: odometer_km, dtype: float64
price_by_odometer = {'odom_until_10k':0,'odom_10k_30k':0, 'odom_30k_50k':0, 'odom_50k_75k':0,'odom_75k_100k':0,'odom_100k_140k':0,
'odom_more_140k':0}
# Populating it
for each in autos['odometer_km']:
if each<=10000:
price_by_odometer['odom_until_10k'] += 1
elif 10000 < each <= 30000:
price_by_odometer['odom_10k_30k'] += 1
elif 30000 < each <= 50000:
price_by_odometer['odom_30k_50k'] += 1
elif 30000 < each <= 75000:
price_by_odometer['odom_50k_75k'] += 1
elif 10000 < each <= 100000:
price_by_odometer['odom_75k_100k'] += 1
elif 10000 < each <= 140000:
price_by_odometer['odom_100k_140k'] += 1
else:
price_by_odometer['odom_more_140k'] += 1
price_by_odometer
df_price_by_odometer=pd.DataFrame.from_dict(price_by_odometer, orient="index", columns=["freq_odom_group"])
df_price_by_odometer
df_price_by_odometer["mean_price"]=0
df_price_by_odometer
df_price_by_odometer.iloc[0,1]=autos.loc[autos["odometer_km"] <=10000, "price_dollar"].mean()
df_price_by_odometer.iloc[1,1]=autos.loc[autos["odometer_km"] <=30000, "price_dollar"].mean()
df_price_by_odometer.iloc[2,1]=autos.loc[autos["odometer_km"] <=50000, "price_dollar"].mean()
df_price_by_odometer.iloc[3,1]=autos.loc[autos["odometer_km"] <=75000, "price_dollar"].mean()
df_price_by_odometer.iloc[4,1]=autos.loc[autos["odometer_km"] <=100000, "price_dollar"].mean()
df_price_by_odometer.iloc[5,1]=autos.loc[autos["odometer_km"] <=140000, "price_dollar"].mean()
df_price_by_odometer.iloc[6,1]=autos.loc[autos["odometer_km"] >140000, "price_dollar"].mean()
df_price_by_odometer
#creating empty dictionary to strore our aggregate data
brand_model={}
#populating our dictionary with average odemeter per brand
for each in autos["brand"].unique():
brand_filter=autos.loc[autos["brand"]==each,"model"]
model_freq=brand_filter.value_counts().head(3) #for each brand we will have the most common 3 models
model_dic= model_freq.to_dict()
brand_model[each]=model_dic
s_brand_model=pd.Series(brand_model)
df_brand_model=pd.DataFrame(s_brand_model, columns=["model"])
df_brand_model["freq_percentage"]=freq_perc #we will add the frequence of each brand
df_brand_model[freq_perc>5].sort_values(by=["freq_percentage"], ascending=False)
<ipython-input-76-98d7d4861509>:16: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
df_brand_model[freq_perc>5].sort_values(by=["freq_percentage"], ascending=False)
"The cars with deffect not repaired cost in average {:,.2f}$, while the other cost in average {:,.2f}$ ".format(autos.loc[autos["unrepaired_damage"]=="yes", "price_dollar"].mean(),autos.loc[autos["unrepaired_damage"]=="no", "price_dollar"].mean())
"The cars with automatic gearbox cost in average {:,.2f}$, while the manual gearbox cost in average {:,.2f}$ ".format(autos.loc[autos["gearbox"]=="automatic", "price_dollar"].mean(),autos.loc[autos["gearbox"]=="manual", "price_dollar"].mean())
#creating empty dictionary to strore our aggregate data
vehicle_avg_price={}
#populating our dictionary with average price per brand (because we think it can be useful, we will also add the brand freq)
for each in autos["vehicle_type"].unique():
avg_price=autos.loc[autos["vehicle_type"]==each, "price_dollar"].mean()
vehicle_avg_price[each]=float(avg_price)
#print(vehicle_avg_price)
print(json.dumps(vehicle_avg_price, indent=4))