Descriptive Analysis of trucks and faults

import pandas as pd import matplotlib.pyplot as plt import numpy as np import seaborn as sn

fault= pd.read_csv('faults_clean.csv') vehicle= pd.read_csv('vehicle_clean.csv')

# getting the total faults per month per state fault_table = pd.pivot_table(fault2, values='FAULT_ALARM_ID', index=['STATE_CD'], # margins= True columns=['MONTH'], aggfunc='count') fault_table.head()

# getting the number of vehicles per month per state vehicle_table = pd.pivot_table(vehicle2, values='VEHICLE_ID', index=['STATE'], # margins=True columns=['MONTH'], aggfunc=pd.Series.nunique) vehicle_table.head()

# removing the influence of change in number of vehicles: ratio df1=fault_table.div(vehicle_table) df1.head()

#normalizing the table so we can get the seasonal changes: division of month by state total df2= df1.div(df1.sum(axis=1), axis=0) df2.head()

df1_totals= df1.sum(axis=1).to_frame(name = 'fault_per_vehicles') df_states= pd.merge(df1_totals,df2, left_index=True, right_index=True) df_states=df_states.rename(columns={1 : "JAN", 2 : "FEB", 3 : "MAR", 4 : "APR", 5 : "MAY", 6 : "JUN", 7 : "JUL", 8 : "AUG", 9 : "SEP", 10 : "OCT", 11 : "NOV", 12 : "DEC"}) df_states.head()

df_states.to_csv('fault_per_states.csv')

# Getting the max and min values of df2 to set the limits of our scale print("df2 min value = ", df2.min().min()) print("df2 max value = ", df2.max().max())

# heatmap of the correlations among values of the database) import seaborn as sns sns.set(rc = {'figure.figsize':(12,12)}) cmap = sns.diverging_palette(220, 20, as_cmap=True) sns.heatmap(df2, vmin=0.025, vmax=0.24, center=0.085, cmap= cmap, # square=True, annot=False) plt.show()

total_state= fault_table.sum(axis=1) total_state.describe()

top_states= fault_table.loc[fault_table.sum(axis=1) > 37276].index.values.tolist() top_states

# The 12 ranked states considering the total number of faults df3= df2.loc[top_states] df3

print("df2 min value = ", df3.min().min()) print("df2 max value = ", df3.max().max())

sns.set(rc = {'figure.figsize':(12,12)}) cmap = sns.diverging_palette(220, 20, as_cmap=True) sns.heatmap(df3, vmin=0.06, vmax=0.125, center=0.08, cmap= cmap, # square=True, annot=False)

# just checking if the coding is correct df3.sum(axis=1)

fault_out.head()

df_out= pd.pivot_table(fault_out, values='FAULT_ALARM_ID', index=['VEHICLE_ID'], # margins=True, columns=['MONTH'], aggfunc=pd.Series.nunique)

print("df_out min value = ", df_out.min().min()) print("df_out max value = ", df_out.max().max()) print("df_out mean value = ", df_out.mean().mean())

sns.set(rc = {'figure.figsize':(12,12)}) cmap = sns.diverging_palette(220, 20, as_cmap=True) sns.heatmap(df_out, vmin=1, vmax=1000, center=100, cmap= cmap, # square=True, annot=False)

# uploading the files fault= pd.read_csv('vfault.csv') vehicle= pd.read_csv('vlocation.csv')

fault.dtypes

df_m = fault.groupby('MODEL')['DATE'].count().to_frame(name = 'models_count').reset_index() df_m.head()

fault.groupby('COMPONENT')['DATE'].count()..to_frame(name = 'faults_count').reset_index() df.describe()