import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# 1 - create a dataframe from a csv file
vfault = pd.read_csv('base_files/vehicle_fault_alarm.csv')
vfault.head()
# X plotting data from faults with the coordinates
plt.figure(figsize=(14,10))
plt.scatter(x=vfault.LONGITUDE, y=vfault.LATITUDE, c='black', edgecolor = 'white')
plt.title('Trucks faults in 2020')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid()
plt.show()
# 2 - selecting the data for only the 49 US states
us_codes= ['WV', 'FL', 'IL', 'MN', 'MD', 'RI', 'ID', 'NH', 'NC', 'VT', 'CT','DE', 'NM',
'CA', 'NJ', 'WI', 'OR', 'NE', 'PA', 'WA', 'LA', 'GA','AL', 'UT', 'OH', 'TX',
'CO', 'SC', 'OK', 'TN', 'WY', 'ND', 'KY','ME', 'NY', 'NV', 'MI', 'AR', 'MS',
'MO', 'MT', 'KS', 'IN', 'SD', 'MA', 'VA', 'DC', 'IA', 'AZ']
vfault= vfault.loc[vfault['STATE_CD'].isin(us_codes)]
# X plotting data from faults with the coordinates
plt.figure(figsize=(14,10))
plt.scatter(x=vfault.LONGITUDE, y=vfault.LATITUDE, c='black', edgecolor = 'white')
plt.title('Trucks faults in 2020')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid()
plt.show()
# 3 - Removing the mexican cities
vfault= vfault.loc[vfault['LATITUDE']> 22.0]
odds = vfault.loc[(vfault['LATITUDE'] < 28.0)&(vfault['LONGITUDE'] < -100.0)]
odds_cities = odds['CITY_NAME'].unique()
odds_cities
vfault = vfault.loc[~vfault['CITY_NAME'].isin(odds_cities)]
# X plotting data from faults with the coordinates
plt.figure(figsize=(10,6))
plt.scatter(x=vfault.LONGITUDE, y=vfault.LATITUDE, c='black', edgecolor = 'white')
plt.title('Trucks faults in 2020')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid()
plt.show()
# 4- fixing the date column (optional - this takes time to run)
# vfault['MESSAGE_DATE_TIME']= pd.to_datetime(vfault['MESSAGE_DATE_TIME'])
# Histogram of vehicles faults
sns.histplot(data= vfault, x='VEHICLE_ID', bins=40)
# 5- REmoving the outliers from the file
# 5.1 Create a dataframe with the faults grouped by vehicle
df= vfault.groupby('VEHICLE_ID')['MESSAGE_DATE_TIME'].count().to_frame(name = 'faults_count').reset_index()
df.describe()
# 5.2 plotting a ECDF graph
import numpy as np
x = np.sort(df['faults_count'])
y = np.arange(1, len(x)+1) / len(x)
_ = plt.plot(x, y, marker='.', linestyle='none')
_ = plt.xlabel('Count of faults per vehicle')
_ = plt.ylabel('ECDF')
plt.margins(0.02) # Keeps data off plot edges
plt.show()
# 5.3 Finding the IQR
q1 = df['faults_count'].quantile(0.25) #percentile25
q3 = df['faults_count'].quantile(0.75) #percentile75
iqr = q1 + q3
# 5.3 Finding upper and lower limit
upper_limit = q3 + 1.5 * iqr
lower_limit = q1 - 1.5 * iqr
# 5.4 Trimming the df file
new_df = df[(df['faults_count'] < upper_limit)&(df['faults_count']> lower_limit)]
new_df.describe()
#plotting a ECDF graph for cleaned data
x = np.sort(new_df['faults_count'])
y = np.arange(1, len(x)+1) / len(x)
_ = plt.plot(x, y, marker='.', linestyle='none')
_ = plt.xlabel('Count of faults per vehicle')
_ = plt.ylabel('ECDF')
plt.margins(0.02) # Keeps data off plot edges
plt.show()
# 5.5 creating lists to trim the fault file
keep_rows= new_df['VEHICLE_ID'].tolist() #list of the vehicles
outliers = df[(df['faults_count'] > upper_limit) | (df['faults_count']< lower_limit)]
drop_rows= outliers['VEHICLE_ID'].tolist()
#just checking if its correctly done (if keep_rows+drop_rows = total number of vehicles)
print(len(keep_rows))
print(len(drop_rows))
print(df.VEHICLE_ID.count())
# 5.7 triming the vfault file based on the IQR
vfault= vfault.loc[vfault['VEHICLE_ID'].isin(keep_rows)]
vfault.shape
# 6 Saving the file (optional, but encouraged)
# vfault.to_csv('vfault.csv', index= False)
# creating a dataset just for the outliers (optional)
faults_outliers= fault.loc[fault['VEHICLE_ID'].isin(drop_rows)]
faults_outliers.shape
faults_outliers.to_csv('faults_outliers.csv', index= False)