import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
fault= pd.read_csv('faults_clean.csv')
vehicle= pd.read_csv('vehicle_clean.csv')
# getting the total faults per month per state
fault_table = pd.pivot_table(fault2, values='FAULT_ALARM_ID', index=['STATE_CD'], # margins= True
columns=['MONTH'], aggfunc='count')
fault_table.head()
# getting the number of vehicles per month per state
vehicle_table = pd.pivot_table(vehicle2, values='VEHICLE_ID', index=['STATE'], # margins=True
columns=['MONTH'], aggfunc=pd.Series.nunique)
vehicle_table.head()
# removing the influence of change in number of vehicles: ratio
df1=fault_table.div(vehicle_table)
df1.head()
#normalizing the table so we can get the seasonal changes: division of month by state total
df2= df1.div(df1.sum(axis=1), axis=0)
df2.head()
df1_totals= df1.sum(axis=1).to_frame(name = 'fault_per_vehicles')
df_states= pd.merge(df1_totals,df2, left_index=True, right_index=True)
df_states=df_states.rename(columns={1 : "JAN", 2 : "FEB", 3 : "MAR", 4 : "APR", 5 : "MAY",
6 : "JUN", 7 : "JUL", 8 : "AUG", 9 : "SEP", 10 : "OCT",
11 : "NOV", 12 : "DEC"})
df_states.head()
df_states.to_csv('fault_per_states.csv')
# Getting the max and min values of df2 to set the limits of our scale
print("df2 min value = ", df2.min().min())
print("df2 max value = ", df2.max().max())
# heatmap of the correlations among values of the database)
import seaborn as sns
sns.set(rc = {'figure.figsize':(12,12)})
cmap = sns.diverging_palette(220, 20, as_cmap=True)
sns.heatmap(df2,
vmin=0.025, vmax=0.24, center=0.085,
cmap= cmap,
# square=True,
annot=False)
plt.show()
total_state= fault_table.sum(axis=1)
total_state.describe()
top_states= fault_table.loc[fault_table.sum(axis=1) > 37276].index.values.tolist()
top_states
# The 12 ranked states considering the total number of faults
df3= df2.loc[top_states]
df3
print("df2 min value = ", df3.min().min())
print("df2 max value = ", df3.max().max())
sns.set(rc = {'figure.figsize':(12,12)})
cmap = sns.diverging_palette(220, 20, as_cmap=True)
sns.heatmap(df3,
vmin=0.06, vmax=0.125, center=0.08,
cmap= cmap,
# square=True,
annot=False)
# just checking if the coding is correct
df3.sum(axis=1)
fault_out.head()
df_out= pd.pivot_table(fault_out, values='FAULT_ALARM_ID', index=['VEHICLE_ID'], # margins=True,
columns=['MONTH'], aggfunc=pd.Series.nunique)
print("df_out min value = ", df_out.min().min())
print("df_out max value = ", df_out.max().max())
print("df_out mean value = ", df_out.mean().mean())
sns.set(rc = {'figure.figsize':(12,12)})
cmap = sns.diverging_palette(220, 20, as_cmap=True)
sns.heatmap(df_out,
vmin=1, vmax=1000, center=100,
cmap= cmap,
# square=True,
annot=False)
# uploading the files
fault= pd.read_csv('vfault.csv')
vehicle= pd.read_csv('vlocation.csv')
fault.dtypes
df_m = fault.groupby('MODEL')['DATE'].count().to_frame(name = 'models_count').reset_index()
df_m.head()
fault.groupby('COMPONENT')['DATE'].count()..to_frame(name = 'faults_count').reset_index()
df.describe()