Untitled project

Hey! Heading here.

#remove df.tail()

import pandas as pd
import seaborn as sns

# Load "vehicles.csv" (relative path, resolved against the working directory)
# into the raw frame that the rest of the analysis starts from.
df = pd.read_csv("vehicles.csv")

#remove df.head()

# Number of unique values for each column.
df.nunique(axis=0)

# Summary statistics, with every cell rendered through the fixed-point 'f'
# format spec so the table is not shown in scientific notation.
df.describe().apply(
    lambda column: column.apply(lambda value: format(value, 'f'))
)

#remove import matplotlib as mlt #remove df

# List the distinct labels in 'condition', the column this analysis focuses on.
df["condition"].unique()

# Reclassify the 'condition' column: some labels are redundant, so they are
# merged into a smaller set of categories.
def _merge_condition_label(label):
    """Map one raw condition label onto its merged category.

    "good" and "fair" become "good"; "excellent" and "like new" become
    "excellent"; every other value is returned unchanged.
    """
    if label in ("good", "fair"):
        return "good"
    if label in ("excellent", "like new"):
        return "excellent"
    return label


def condition_cleaning(row):
    """Return the merged condition label for a single row.

    Args:
        row: Any object exposing a ``condition`` attribute, such as one row
            of the vehicles DataFrame.

    Returns:
        The merged label for ``row.condition``.
    """
    return _merge_condition_label(row.condition)


def clean_df(dataframe):
    """Return a copy of ``dataframe`` with its 'condition' column reclassified.

    Args:
        dataframe: DataFrame that has a 'condition' column.

    Returns:
        A new DataFrame; the input is left unmodified.
    """
    # Copy the argument itself. The previous version copied the module-level
    # ``df`` here, so the ``dataframe`` parameter was silently ignored.
    cleaned_df = dataframe.copy()
    # Only one column is needed, so map that column directly instead of
    # running a Python-level function over every full row.
    cleaned_df["condition"] = cleaned_df["condition"].map(_merge_condition_label)
    return cleaned_df


# Get df with reclassified 'condition' column
df_cleaned = clean_df(df)
print(df_cleaned.condition.unique())

# Drop the link columns. Per the original note, the regional link column is
# named 'region_url' (not 'city_url').
link_columns = ['url', 'image_url', 'region_url']
df_cleaned = df_cleaned.copy().drop(columns=link_columns)

# Missing-value count for each column of df_cleaned.
NA_val = df_cleaned.isna().sum()


def na_filter(na, threshold=.4):
    """Return the labels whose share of missing values is below ``threshold``.

    Args:
        na: Mapping-like object (e.g. the Series in ``NA_val``) from label to
            missing-value count.
        threshold: Exclusive upper bound on count / row count.

    Returns:
        A list of the passing labels, in the order of ``na.keys()``.

    Note:
        The row count is read from the module-level ``df_cleaned``, not from
        an argument.
    """
    total_rows = df_cleaned.shape[0]
    return [label for label in na.keys() if na[label] / total_rows < threshold]


# Keep only the columns that pass the missing-value threshold.
df_cleaned = df_cleaned[na_filter(NA_val)]
df_cleaned.columns

# Keep only rows inside all three ranges: price within [999.99, 99999.00]
# (inclusive), year after 1990, and odometer below 899999.00.
price_in_range = df_cleaned['price'].between(999.99, 99999.00)
year_in_range = df_cleaned['year'] > 1990
odometer_in_range = df_cleaned['odometer'] < 899999.00
df_cleaned = df_cleaned[price_in_range & year_in_range & odometer_in_range]

# Summary statistics of the filtered frame, rendered with the fixed-point 'f'
# format spec instead of scientific notation.
df_cleaned.describe().apply(
    lambda column: column.apply(lambda value: format(value, 'f'))
)

# Drop every row that still contains a missing value.
df_cleaned = df_cleaned.dropna(axis="index")

# NOTE(review): this echoes the raw ``df``, not ``df_cleaned`` - confirm which
# frame was meant to be inspected here.
df

# Calculate the correlation matrix over the numeric (and boolean) columns
# only. df_cleaned may still hold text columns (e.g. 'condition'), and since
# pandas 2.0 a bare DataFrame.corr() raises on those instead of dropping them.
# Selecting the dtypes first works the same way on older and newer pandas.
corr = df_cleaned.select_dtypes(include=["number", "bool"]).corr()

# Plot the heatmap, labelling both axes with the column names and annotating
# each cell with its coefficient.
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,
    cmap=sns.diverging_palette(220, 20, as_cmap=True),
)

# Scatter plot of price against year.
df_cleaned.plot.scatter(x='year', y='price')

# Pairwise relationship grid for the cleaned frame.
sns.pairplot(data=df_cleaned)

# Styling shared by both histograms.
hist_style = dict(figsize=(12, 6), facecolor='grey', edgecolor='black')

# Distribution of odometer readings (50 bins) followed by model years (20 bins).
# NOTE(review): the two plots are issued back-to-back; if they end up on the
# same axes the very different value ranges will make the year bars hard to
# read - confirm whether separate figures were intended.
df_cleaned['odometer'].plot.hist(bins=50, **hist_style)
df_cleaned['year'].plot.hist(bins=20, **hist_style)

# Distribution of model years, 20 bins.
df_cleaned['year'].plot.hist(
    bins=20,
    figsize=(12, 6),
    facecolor='grey',
    edgecolor='black',
)

# Box plot of the price column.
df_cleaned.boxplot(column='price')

Hey! Heading here.

Hey! Heading here.