# Hey! Heading here.
# remove (matplotlib import)
import matplotlib as mlt
import pandas as pd
import seaborn as sns

# Load the raw listings; every later step starts from this frame.
df = pd.read_csv("vehicles.csv")

# remove
df.head()
# remove
# Moved below the load: this call used to run before `df` was assigned.
df.tail()
df.nunique(axis=0)  # Returns number of unique values for each variable
# The nested apply runs format(x, 'f') on every summary statistic, which
# suppresses scientific notation in the describe() output.
df.describe().apply(lambda s: s.apply(lambda x: format(x, 'f')))
# remove
df
df.condition.unique()  # Wanting to focus on condition

# Reclassify condition column
# Several condition labels are redundant, so we want to clean them out
# Redundant condition labels and the category each one is folded into.
# Built once at import time instead of on every per-row call.
_CONDITION_GROUPS = {
    "good": "good",
    "fair": "good",
    "excellent": "excellent",
    "like new": "excellent",
}


def condition_cleaning(row):
    """Return the reclassified condition label for a single row.

    "good" and "fair" both become "good"; "excellent" and "like new" both
    become "excellent". Any other value is returned unchanged.

    Args:
        row: Object exposing a ``condition`` attribute, such as the row
            passed in by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The merged label, or the original ``row.condition`` when it is not
        one of the four labels listed above.
    """
    label = row.condition
    # Fall back to the label itself so unknown values pass through untouched.
    return _CONDITION_GROUPS.get(label, label)


# Clean dataframe
def clean_df(dataframe):
    """Return a copy of ``dataframe`` with its 'condition' column reclassified.

    Every row is passed through ``condition_cleaning`` and the results replace
    the 'condition' column of the copy. The frame passed in is not modified.

    Args:
        dataframe: pandas DataFrame containing a 'condition' column.

    Returns:
        A new DataFrame with the reclassified 'condition' column.
    """
    # Copy the argument itself. The earlier version copied the module-level
    # `df`, so whatever frame the caller passed in was silently ignored.
    cleaned_df = dataframe.copy()
    cleaned_df["condition"] = cleaned_df.apply(condition_cleaning, axis=1)
    return cleaned_df


# Get df with reclassified 'condition' column
# Reclassify the condition labels, then print the categories that remain.
df_cleaned = clean_df(df)
print(df_cleaned['condition'].unique())

# Drop the three link columns from a copy of the frame.
# Remove city url, it's region url!
df_cleaned = df_cleaned.copy().drop(columns=['url', 'image_url', 'region_url'])

# Count of missing values per remaining column.
NA_val = df_cleaned.isna().sum(axis=0)
def na_filter(na, threshold=.4, total_rows=None):
    """Return the columns whose share of missing values is below ``threshold``.

    Args:
        na: Mapping from column name to its count of missing values, e.g.
            the Series produced by ``DataFrame.isna().sum()``.
        threshold: Exclusive upper bound on the missing-value fraction; a
            column is kept only when its fraction is strictly smaller.
        total_rows: Number of rows the counts were taken over. When omitted,
            the row count of the module-level ``df_cleaned`` is used, which
            is what this function always did before the parameter existed.

    Returns:
        List of the column names that pass, in the order ``na.keys()``
        yields them.
    """
    if total_rows is None:
        # Backward-compatible default: fall back to the global frame.
        total_rows = df_cleaned.shape[0]
    return [col for col in na.keys() if na[col] / total_rows < threshold]
# Keep only the columns that na_filter lets through.
df_cleaned = df_cleaned.loc[:, na_filter(NA_val)]
df_cleaned.columns

# Row filters, applied one after another: price within [999.99, 99999.00],
# year after 1990, odometer below 899999.00.
df_cleaned = (
    df_cleaned
    .loc[lambda frame: frame['price'].between(999.99, 99999.00)]
    .loc[lambda frame: frame['year'] > 1990]
    .loc[lambda frame: frame['odometer'] < 899999.00]
)

# Summary statistics rendered with format(value, 'f') instead of scientific notation.
df_cleaned.describe().apply(
    lambda column: column.apply(lambda value: format(value, 'f'))
)

# Drop every row that still has a missing value.
df_cleaned = df_cleaned.dropna(axis='index')
# NOTE(review): this shows the raw `df`, not `df_cleaned` -- confirm which one was intended.
df
# Calculate the correlation matrix over the numeric columns only.
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (older
# releases skipped them silently), so select the numeric columns explicitly;
# this works the same way on every pandas version.
corr = df_cleaned.select_dtypes(include='number').corr()

# Plot the heatmap
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,
    cmap=sns.diverging_palette(220, 20, as_cmap=True),
)

# Price against model year, then pairwise plots across the frame.
df_cleaned.plot(kind='scatter', x='year', y='price')
sns.pairplot(df_cleaned)

# Distributions of odometer readings and model years.
df_cleaned['odometer'].plot(kind='hist', bins=50, figsize=(12, 6), facecolor='grey', edgecolor='black')
df_cleaned['year'].plot(kind='hist', bins=20, figsize=(12, 6), facecolor='grey', edgecolor='black')
# NOTE(review): exact repeat of the histogram above, kept as-is -- confirm
# whether a different column was intended or drop the duplicate.
df_cleaned['year'].plot(kind='hist', bins=20, figsize=(12, 6), facecolor='grey', edgecolor='black')

df_cleaned.boxplot(column='price')