import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sn
import plotly.express as px
# Use seaborn's white-grid theme for the matplotlib figures drawn below.
sn.set_style('whitegrid')

# Load the mass-shooting records straight from the GitHub-hosted CSV.
url = 'https://raw.githubusercontent.com/Qalinle1996/DS-portofolio/main/History_of_Mass_Shootings_in_the_USA.csv'
data = pd.read_csv(url)

# First look at the raw table: leading rows, column names, dtypes.
data.head()
data.columns
data.info()

# Parse 'Date' into datetime values so the .dt accessor becomes available,
# then re-check the dtypes to confirm the conversion.
data['Date'] = pd.to_datetime(data['Date'])
data.info()

# Derive the calendar year of every incident; later sections group on it.
data['year'] = data['Date'].dt.year
data

# How many distinct cities and states appear, and which states they are.
data['City'].nunique()
data['State'].nunique()
data['State'].unique()

# Time span covered by the records.
data['Date'].min()
data['Date'].max()

# Summary statistics of the numeric columns.
data.describe()
# Histogram of the 'Dead' column, restricted to incidents with at least
# one fatality.
death = data[data['Dead'] > 0]
px.histogram(
    death,
    x='Dead',
    nbins=100,
    title='The distribution of the death cases from every mass-shooting from 1924 to 2022',
    template='plotly_dark',
)

# Histogram of the 'Injured' column over all incidents.
px.histogram(
    data,
    x='Injured',
    nbins=100,
    title='The distribution of the Injured cases from every mass-shooting from 1924 to 2022',
    template='plotly_dark',
)

# Histogram of the 'Total' column over all incidents.
px.histogram(
    data,
    x='Total',
    nbins=100,
    title='The distribution of the Total cases from every mass-shooting from 1924 to 2022',
    template='plotly_dark',
)
# Line chart of mass-shooting fatalities per year.
#
# set_index returns a new frame indexed by year; `data` itself is untouched.
dead_data = data.set_index('year')

# Collapse the incidents to one total per year so the curve matches the
# "per year" title and legend label. Plotting the raw rows would draw several
# points for every year that has more than one incident.
dead_per_year = dead_data.groupby(level='year')['Dead'].sum()

plt.figure(figsize=(20, 15))
plt.plot(
    dead_per_year.index,
    dead_per_year.values,
    color='r',
    lw=5,
    alpha=1,
    linestyle=':',
    label='Dead per year',
)
plt.title('Mass shooting fatal cases per year from 1924. US', fontsize=20)
# The x axis carries the year and the y axis the number of deaths; the
# labels now say so.
plt.xlabel('Year', fontsize=20)
plt.ylabel('Dead People', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
# Without this call the `label=` passed to plt.plot is never displayed.
plt.legend(fontsize=18)
plt.show()
# Line chart of mass-shooting injuries per year.
#
# set_index returns a new frame indexed by year; `data` itself is untouched.
injured_data = data.set_index('year')

# Collapse the incidents to one total per year so the curve matches the
# "per year" title. Plotting the raw rows would draw several points for
# every year that has more than one incident.
injured_per_year = injured_data.groupby(level='year')['Injured'].sum()

plt.figure(figsize=(20, 15))
plt.plot(
    injured_per_year.index,
    injured_per_year.values,
    color='b',
    lw=2,
    alpha=1,
    linestyle='-',
)
plt.title('Mass shooting injured cases per year from 1924. US', fontsize=20)
# The x axis carries the year and the y axis the number of injured people;
# the labels now say so, with the same font size as the x label.
plt.xlabel('Year', fontsize=20)
plt.ylabel('Injured People', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.show()
# Line chart of total mass-shooting casualties (dead + injured) per year.
#
# set_index returns a new frame indexed by year; `data` itself is untouched.
total_data = data.set_index('year')

# Collapse the incidents to one total per year so the curve matches the
# "per year" title. Plotting the raw rows would draw several points for
# every year that has more than one incident.
total_per_year = total_data.groupby(level='year')['Total'].sum()

plt.figure(figsize=(20, 15))
plt.plot(
    total_per_year.index,
    total_per_year.values,
    color='g',
    lw=3,
    alpha=0.7,
    linestyle='-',
)
plt.title('Mass shooting Total cases - Dead and Injured - per year from 1924. US', fontsize=20)
# The x axis carries the year and the y axis the casualty count; the labels
# now say so.
plt.xlabel('Year', fontsize=20)
plt.ylabel('Dead + Injured People', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.show()
# Bar chart of total fatalities per year, tallest bars first in the table.
#
# Sum the 'Dead' column per year; the group keys (the years) become the
# index of the resulting one-column frame, in ascending order.
grouped_by_year = data.groupby('year')['Dead'].sum().to_frame()

# Take the year labels from the grouped result itself instead of a hand-typed
# list: a hard-coded list has to match the index exactly in length and order,
# and any change to the CSV would either raise or mislabel the bars.
years = grouped_by_year.index.tolist()  # ascending
Years = years[::-1]  # descending; kept so the name stays defined
grouped_by_year['Years'] = years

grouped_by_year = grouped_by_year.sort_values(by='Dead', ascending=False)
grouped_by_year
px.bar(
    data_frame=grouped_by_year,
    x='Years',
    y='Dead',
    color='Dead',
    template='plotly_dark',
    title='Bar char, comparison of mass shooting fatal cases from 1924 to 2022 in the USA',
)
# Total fatalities per state, largest first, trimmed to the ten states with
# the most deaths.
grouped_by_states_dead_cases = (
    data.groupby('State')['Dead']
    .sum()
    .to_frame()
    .sort_values('Dead', ascending=False)
    .reset_index()
    .head(10)
)
px.bar(
    grouped_by_states_dead_cases,
    x='State',
    y='Dead',
    template='plotly_dark',
    title='Bar char, comparison of mass shooting fatal cases, 1924 to 2022. top 10 states in the USA.',
)
# Sum of the 'Total' column per state, largest first, trimmed to the top
# twenty states.
grouped_by_state_total_cases = (
    data.groupby('State')['Total']
    .sum()
    .to_frame()
    .sort_values('Total', ascending=False)
    .reset_index()
    .head(20)
)
px.bar(
    grouped_by_state_total_cases,
    x='State',
    y='Total',
    title='States with the highest mass shooting victims both fatal and non-fatal. top 20 ',
    template='plotly_dark',
    color='Total',
)
# Keep only the incidents with four or more deaths.
temp_df = data[data['Dead'] >= 4]

# Count those incidents per state (non-null 'Date' entries), rename the
# count column, order from most to fewest and keep the first twenty states.
mass_shooting_per_state = (
    temp_df.groupby('State')['Date']
    .count()
    .to_frame()
    .reset_index()
    .rename(columns={'Date': 'num_of_mass-shooting'})
    .sort_values(by='num_of_mass-shooting', ascending=False)
    .head(20)
)
px.bar(
    mass_shooting_per_state,
    x='State',
    y='num_of_mass-shooting',
    color='num_of_mass-shooting',
    title='Number of mass shootings in the United States between 1924 and July 2022. Top 20 States',
    template='plotly_dark',
)
# Keep only the incidents with four or more deaths.
temp = data[data['Dead'] >= 4]

# Count those incidents per year (non-null 'Date' entries), rename the
# count column, order from most to fewest and keep the first twenty years.
mass_shooting_per_year = (
    temp.groupby('year')['Date']
    .count()
    .to_frame()
    .reset_index()
    .rename(columns={'Date': 'num_of_mass-shooting_per_year'})
    .sort_values(by='num_of_mass-shooting_per_year', ascending=False)
    .head(20)
)
px.bar(
    mass_shooting_per_year,
    x='year',
    y='num_of_mass-shooting_per_year',
    template='plotly_dark',
    color='num_of_mass-shooting_per_year',
    title='Bar char, number of mass shooting per year from 1924 to 2022 in the USA. top 20',
)
# Select the incidents in which at least one person died and at least three
# people were injured.
has_fatality = data['Dead'] > 0
has_three_injured = data['Injured'] >= 3
dead_injured_at_least_four = data[has_fatality & has_three_injured]

# Treemap of those incidents grouped by state: tile size is taken from the
# 'Total' column and tile color from the 'Dead' column.
fig = px.treemap(
    dead_injured_at_least_four,
    path=['State'],
    values='Total',
    color='Dead',
)
fig.show()
!pip install wordcloud
from wordcloud import WordCloud
text = data['Description'].values
wordcloud = WordCloud().generate(str(text))
plt.figure(figsize =(40,30))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
Conclusion:
This is a simple exploratory data analysis of US mass-shooting data from 1924 to 2022. I have tried to uncover as many interesting insights as possible, and I hope it helps.