# System
import warnings
# Data manipulation
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap
# Suppress warnings
# Silence pandas/seaborn FutureWarning chatter so notebook output stays readable.
warnings.filterwarnings("ignore", category=FutureWarning)
# Set the default Seaborn style
# whitegrid + serif is the look used by every chart below.
sns.set_theme(style="whitegrid", font="serif")
def clean_zip(zip_code):
    """Normalize a ZIP code to its 5-digit form.

    Accepts plain 5-digit ZIPs, 9-digit ZIP+4 strings, and hyphenated
    ZIP+4 values ("12345-6789"); surrounding whitespace is ignored.
    Returns None when no valid 5-digit ZIP can be recovered (this also
    covers NaN, which stringifies to "nan").
    """
    # Ensure the ZIP code is a string; drop whitespace and the ZIP+4 hyphen
    zip_code = str(zip_code).strip().replace('-', '')
    if len(zip_code) == 9:
        # ZIP+4 format: keep only the base 5-digit ZIP
        return zip_code[:5]
    elif len(zip_code) == 5:
        return zip_code
    else:
        # Anything else is treated as invalid
        return None
def clean_name(name):
    """Normalize a person's name: collapse whitespace, convert to title case.

    Non-string values (e.g. NaN from a column with missing data) are
    returned unchanged, so the function is safe to `.apply()` on columns
    that were never NaN-filled.
    """
    # Guard: NaN is a float and has no .split()
    if not isinstance(name, str):
        return name
    # Collapse runs of whitespace into single spaces
    name = ' '.join(name.split())
    # Convert to title case
    return name.title()
def clean_email(email):
    """Normalize an e-mail address: trim whitespace and lowercase.

    Non-string values (e.g. NaN) pass through unchanged so the function
    can be applied to columns that still contain missing data.
    """
    # Guard: NaN is a float and has no .strip()
    if not isinstance(email, str):
        return email
    # Trim surrounding spaces, then lowercase
    return email.strip().lower()
def extract_address_and_coordinates(location):
    """Split a geocoded location string into (address, coordinates).

    The input is a multi-line string whose last line holds the
    coordinate pair and whose preceding lines hold the street address.
    Returns (None, None) for missing values (None/NaN) and for the
    'Unknown' placeholder.
    """
    # NaN is a float and would crash on .split(); treat it as missing too
    if not isinstance(location, str) or location == 'Unknown':
        return None, None
    # Split once and reuse -- the original split the string twice
    lines = location.split('\n')
    # Address: everything before the last line, joined with spaces
    address = ' '.join(lines[:-1]).strip()
    # Coordinates: the last line
    coordinates = lines[-1].strip()
    return address, coordinates
# Load the dataset (Kaggle input path; adjust when running elsewhere)
data = pd.read_csv("/kaggle/input/toxics-release-inventory/toxics_release_inventory.csv")
# Preview the dataset, first 5 rows
data.head()
# Preview the dataset, last 5 records
data.tail()
# Dataset information: column dtypes and non-null counts
data.info()
# Preview number of unique values per column
data.nunique().to_frame(name="Unique Values")
# Dataset shape; (rows, columns)
data.shape
# Preview the number of duplicated records
data.duplicated().sum()
# Preview the number of NULL values per column
data.isnull().sum()
# Dataset's statistical summary (transposed: one row per numeric column)
data.describe().T
# Rename all columns by replacing whitespaces and hyphens with underscores and transforming to uppercase
data.columns = data.columns.str.replace(' ', '_').str.replace('-', '_').str.upper()
# Preview cleaned columns
data.columns
# Calculate the percentage of records with null values
null_percentage = (data.isnull().sum() / len(data)) * 100
# Create a DataFrame with the results
null_percentage_df = null_percentage.reset_index()
null_percentage_df.columns = ['Column', 'Null Percentage']
# Display the DataFrame
null_percentage_df
# Drop columns with over 50% NULL values
columns_to_drop = null_percentage[null_percentage > 50].index
data = data.drop(columns=columns_to_drop)
# Recalculate the percentage of records with null values
null_percentage = (data.isnull().sum() / len(data)) * 100
# Create a DataFrame with the results
null_percentage_df = null_percentage.reset_index()
null_percentage_df.columns = ['Column', 'Null Percentage']
# Display the DataFrame
null_percentage_df
# Fill categorical columns with the mode (most frequent value)
categorical_columns = ['CONT_E_ADDRESS', 'CERT_TITLE', 'FUGITIVEBASIS',
                       'STACKBASIS', 'GEOCODED_FACILITY_LOCATION']
for col in categorical_columns:
    # mode() returns an empty Series when the column is entirely NaN;
    # guard against IndexError in that case
    mode_values = data[col].mode()
    if not mode_values.empty:
        # Assign back instead of chained `fillna(..., inplace=True)`,
        # which is unreliable under pandas copy-on-write semantics
        data[col] = data[col].fillna(mode_values[0])
# Fill numerical columns with 0
# NOTE(review): FUGITIVEBASIS and STACKBASIS also appear in the
# categorical list above, so they are already mode-filled here and the
# 0-fill is a no-op for them -- confirm which treatment is intended.
numerical_columns = ['MAX_ONSITE', 'METAL', 'METAL_TO_POTW', 'POTW2', 'OFF_TRE',
                     'TOTAL_TO_AIR', 'TOTAL_TO_LAND', 'TOTAL_TO_WATER',
                     'FUGITIVEBASIS', 'STACKBASIS']
data[numerical_columns] = data[numerical_columns].fillna(0)
# Re-preview the number of NULL values
data.isnull().sum()
# Preview the dataset, again
data.head()
# Convert all facility names to uppercase so duplicates differing only
# in case collapse together when grouping
data['FAC_NAME'] = data['FAC_NAME'].str.upper()
# Preview all unique values in the NAICS column
unique_naics_values = data['NAICS'].unique()
unique_naics_values
# Preview all unique values in the CHEM_NAME column
unique_chem_name_values = data['CHEM_NAME'].unique()
unique_chem_name_values
# Mapping of similar chemical names to standardized names.
# Duplicate keys removed: the original literal listed 'Hydrazine sulfate'
# and 'Titanium tetrachloride' twice (later entry silently won).
chem_name_mapping = {
    'n-Hexane': 'N-Hexane',
    'Copper compounds': 'Copper Compounds',
    'Carbonyl sulfide': 'Carbonyl Sulfide',
    'Manganese compounds': 'Manganese Compounds',
    'Zinc compounds': 'Zinc Compounds',
    'Nitrate compounds': 'Nitrate Compounds',
    'Peracetic acid': 'Peracetic Acid',
    'Lead compounds': 'Lead Compounds',
    'Mercury compounds': 'Mercury Compounds',
    'Cobalt compounds': 'Cobalt Compounds',
    'Nickel compounds': 'Nickel Compounds',
    'Antimony compounds': 'Antimony Compounds',
    'Arsenic compounds': 'Arsenic Compounds',
    'Certain glycol ethers': 'Certain Glycol Ethers',
    'Ethylene glycol': 'Ethylene Glycol',
    'Ethylene oxide': 'Ethylene Oxide',
    'Carbon disulfide': 'Carbon Disulfide',
    'Cyanide compounds': 'Cyanide Compounds',
    'Dioxin and dioxin-like compounds': 'Dioxin and Dioxin-like Compounds',
    'Hydrochloric acid': 'Hydrochloric Acid',
    'Hydrogen cyanide': 'Hydrogen Cyanide',
    'Hydrogen sulfide': 'Hydrogen Sulfide',
    'Molybdenum trioxide': 'Molybdenum Trioxide',
    'Sulfuric acid': 'Sulfuric Acid',
    'Boron trifluoride': 'Boron Trifluoride',
    'Hydrogen fluoride': 'Hydrogen Fluoride',
    'Selenium compounds': 'Selenium Compounds',
    "4,4'-Methylenebis(2-chloroaniline)": "4,4'-Methylenebis(2-Chloroaniline)",
    'Titanium tetrachloride': 'Titanium Tetrachloride',
    'Hydrazine sulfate': 'Hydrazine Sulfate',
    'TOLUENE DIISOCYANATE (MIXED ISOMERS)': 'Toluene diisocyanate (mixed isomers)',
    'CRESOL (MIXED ISOMERS)': 'Cresol (mixed isomers)',
    'XYLENE (MIXED ISOMERS)': 'Xylene (mixed isomers)',
    'CYCLOHEXANE': 'Cyclohexane',
    'METHANOL': 'Methanol',
    'Chromium Compounds (except for chromite ore mined in the Transvaal Region)': 'Chromium Compounds',
    'Hydrochloric acid (1995 and after "Acid Aerosols" only)': 'Hydrochloric Acid',
    'Sulfuric acid - (1994 and after "Acid Aerosols" only)': 'Sulfuric Acid',
    'Chlorodifluoromethane (HCFC-22)': 'Chlorodifluoromethane',
    'Hydrazine sulfate (1:1)': 'Hydrazine Sulfate',
    'Polycyclic aromatic Compounds': 'Polycyclic Aromatic Compounds',
    'Vinyl acetate': 'Vinyl Acetate',
    'Vinyl chloride': 'Vinyl Chloride',
    'Polychlorinated biphenyls': 'Polychlorinated Biphenyls',
    'Butyl acrylate': 'Butyl Acrylate',
    'Formic acid': 'Formic Acid',
    'Ethyl acrylate': 'Ethyl Acrylate',
    'Phthalic anhydride': 'Phthalic Anhydride',
}
# Apply the mapping to standardize names (unmapped names are left as-is)
data['CHEM_NAME'] = data['CHEM_NAME'].replace(chem_name_mapping)
# Apply the cleaning function to the FAC_ZIP column
# (normalizes ZIP+4 values to 5 digits; invalid ZIPs become None)
data['FAC_ZIP'] = data['FAC_ZIP'].apply(clean_zip)
# Preview all unique values in the FAC_ZIP column
unique_values = data['FAC_ZIP'].unique()
unique_values
# Apply the cleaning function to the TECH_NAME and CONT_NAME columns
# (collapses whitespace and title-cases each name)
data['TECH_NAME'] = data['TECH_NAME'].apply(clean_name)
data['CONT_NAME'] = data['CONT_NAME'].apply(clean_name)
# Mapping of similar person names to standardized names.
# Duplicate key removed: 'Daniel C Leandri' appeared twice in the
# original literal (identical value, later entry silently won).
name_mapping = {
    'Russell W. Davis, Chmm, Csp': 'Russell W. Davis',
    'Russell W. Davis, Chmm': 'Russell W. Davis',
    'Russell Davis': 'Russell W. Davis',
    'Daniel C Leandri': 'Daniel C. Leandri',
    'Daniel Leandri': 'Daniel C. Leandri',
    'Edward M. Short, President': 'Edward M. Short',
    'Richard M. Foreman Csp': 'Richard M. Foreman',
    'Sharon.M.Adams': 'Sharon M. Adams',
    'Sharon Adams': 'Sharon M. Adams',
    'Paul H Swarm': 'Paul H. Swarm',
    'Jeffrey S Chavanne': 'Jeffrey S. Chavanne',
    'Sgt Sean Maynard': 'Sgt. Sean Maynard',
    'John R Hall': 'John R. Hall'
}
# Apply the mapping to standardize names in both columns
data['TECH_NAME'] = data['TECH_NAME'].replace(name_mapping)
data['CONT_NAME'] = data['CONT_NAME'].replace(name_mapping)
# Apply the cleaning function to the TECH_E_ADDRESS and CONT_E_ADDRESS columns
data['TECH_E_ADDRESS'] = data['TECH_E_ADDRESS'].apply(clean_email)
data['CONT_E_ADDRESS'] = data['CONT_E_ADDRESS'].apply(clean_email)
# Split GEOCODED_FACILITY_LOCATION into address and coordinates.
# Apply once and unpack the (address, coordinates) tuples with .str[i]
# instead of building a pd.Series per row, which is far slower.
extracted = data['GEOCODED_FACILITY_LOCATION'].apply(extract_address_and_coordinates)
data['GEOCODED_FACILITY_ADDRESS'] = extracted.str[0]
data['GEOCODED_FACILITY_COORDINATES'] = extracted.str[1]
# Drop the original combined column now that it has been split
data = data.drop(columns=['GEOCODED_FACILITY_LOCATION'])
# Display the new columns
data[['GEOCODED_FACILITY_ADDRESS', 'GEOCODED_FACILITY_COORDINATES']].head()
# Set the figure size
plt.figure(figsize=(10, 6))
# Total releases per reporting year, one column per release medium
yearly_releases = (
    data.groupby('REPYR')[['TOTAL_TO_AIR', 'TOTAL_TO_LAND', 'TOTAL_TO_WATER']]
    .sum()
    .reset_index()  # keep REPYR as a column for sns.lineplot
)
# Column -> legend label, in plotting order
media_labels = {
    'TOTAL_TO_AIR': 'Air',
    'TOTAL_TO_LAND': 'Land',
    'TOTAL_TO_WATER': 'Water',
}
# One marked line per release medium
for column, label in media_labels.items():
    sns.lineplot(data=yearly_releases, x='REPYR', y=column, label=label, marker='o')
# Annotate each medium's peak value at its peak year
for column in media_labels:
    peak_value = yearly_releases[column].max()
    peak_year = yearly_releases.loc[yearly_releases[column].idxmax(), 'REPYR']
    plt.text(peak_year, peak_value, f'{peak_value:.2f}',
             ha='center', va='bottom', fontsize=10, fontweight='bold')
# Title and axis labels
plt.title('Trends in Total Releases Over Time', fontsize=16, fontweight='bold')
plt.xlabel('Year', fontsize=14)
plt.ylabel('Total Releases', fontsize=14)
# Grid lines for readability
plt.grid(True, linestyle='--', alpha=0.7)
# Legend describing the release medium
plt.legend(title='Release Medium', fontsize=12, title_fontsize=14)
# Render the plot
plt.tight_layout()
plt.show()
# Set the figure size
plt.figure(figsize=(12, 8))
# Calculate total releases per facility (sum across the three media)
facility_releases = data.groupby('FAC_NAME')[['TOTAL_TO_AIR', 'TOTAL_TO_LAND', 'TOTAL_TO_WATER']].sum().sum(axis=1)
# Get the top 10 polluters
top_polluters = facility_releases.sort_values(ascending=False).head(10)
# Bar plot; pass hue=y with legend=False because seaborn >= 0.13
# deprecates `palette=` without an explicit `hue` assignment
sns.barplot(x=top_polluters.values, y=top_polluters.index,
            hue=top_polluters.index, palette='viridis', legend=False)
# Set title and labels
plt.title('Top 10 Polluters by Total Releases', fontsize=16, fontweight='bold')
plt.xlabel('Total Releases', fontsize=14)
plt.ylabel('Facility', fontsize=14)
# Add grid lines for better readability
plt.grid(axis='x', linestyle='--', alpha=0.7)
# Display the plot
plt.tight_layout()
plt.show()
# Set the figure size
plt.figure(figsize=(12, 8))
# Calculate total releases per chemical (sum across the three media)
chemical_releases = data.groupby('CHEM_NAME')[['TOTAL_TO_AIR', 'TOTAL_TO_LAND', 'TOTAL_TO_WATER']].sum().sum(axis=1)
# Get the top 10 chemicals
top_chemicals = chemical_releases.sort_values(ascending=False).head(10)
# Bar plot; pass hue=y with legend=False because seaborn >= 0.13
# deprecates `palette=` without an explicit `hue` assignment
sns.barplot(x=top_chemicals.values, y=top_chemicals.index,
            hue=top_chemicals.index, palette='viridis', legend=False)
# Set the title of the plot
plt.title('Top 10 Chemicals by Total Releases', fontsize=16, fontweight='bold')
# Set labels
plt.xlabel('Total Releases', fontsize=14)
plt.ylabel('Chemical', fontsize=14)
# Add grid lines for better readability
plt.grid(axis='x', linestyle='--', alpha=0.7)
# Display the plot
plt.tight_layout()
plt.show()
# Set the figure size
plt.figure(figsize=(10, 6))
# Calculate total waste management quantities per method
waste_management = data[['LANDFILL', 'LANDFARM', 'POTW', 'OFF_TRE']].sum()
# Bar plot (one bar per method); pass hue=x with legend=False because
# seaborn >= 0.13 deprecates `palette=` without an explicit `hue`
sns.barplot(x=waste_management.index, y=waste_management.values,
            hue=waste_management.index, palette='viridis', legend=False)
# Set the title and labels
plt.title('Waste Management Practices', fontsize=16, fontweight='bold')
plt.xlabel('Method', fontsize=14)
plt.ylabel('Total Quantity', fontsize=14)
# Add grid lines for better readability
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Display the plot
plt.tight_layout()
plt.show()
# Extract latitude and longitude from the "(lat, lon)" coordinate string
# in one pass, and convert to numeric up front (errors='coerce' turns
# unparseable coordinates into NaN instead of crashing float() later)
coords = data['GEOCODED_FACILITY_COORDINATES'].str.extract(r'\(\s*([^,]+?)\s*,\s*([^)]+?)\s*\)')
data['LATITUDE'] = pd.to_numeric(coords[0], errors='coerce')
data['LONGITUDE'] = pd.to_numeric(coords[1], errors='coerce')
# Calculate total releases for each facility
data['TOTAL_RELEASES'] = data['TOTAL_TO_AIR'] + data['TOTAL_TO_LAND'] + data['TOTAL_TO_WATER']
# Create a map centered on Delaware
map_de = folium.Map(location=[39.0, -75.5], zoom_start=8, tiles='OpenStreetMap')
# Only rows with parseable coordinates can be placed on the map
mappable = data.dropna(subset=['LATITUDE', 'LONGITUDE'])
# Add markers for each facility
for idx, row in mappable.iterrows():
    folium.Marker(
        location=[row['LATITUDE'], row['LONGITUDE']],
        popup=f"{row['FAC_NAME']}<br>Total Releases: {row['TOTAL_RELEASES']}",
        icon=folium.Icon(color='blue')
    ).add_to(map_de)
# Add a heatmap to visualize the concentration of facilities and their
# releases (weighted by TOTAL_RELEASES)
heat_data = mappable[['LATITUDE', 'LONGITUDE', 'TOTAL_RELEASES']].values.tolist()
HeatMap(heat_data, radius=15, blur=10).add_to(map_de)
# Save the map to an HTML file (optional)
map_de.save('facility_releases_map.html')
# Display the map
map_de
# Map NAICS codes to industry names
# NOTE(review): keys are 3-digit NAICS sector codes -- assumes the
# NAICS column holds 3-digit codes (unmapped codes become NaN).
naics_mapping = {
    311: 'Food Manufacturing',
    928: 'National Security and International Affairs',
    326: 'Plastics and Rubber Products Manufacturing',
    325: 'Chemical Manufacturing',
    324: 'Petroleum and Coal Products Manufacturing',
    323: 'Printing and Related Support Activities',
    333: 'Machinery Manufacturing',
    335: 'Electrical Equipment, Appliance, and Component Manufacturing',
    332: 'Fabricated Metal Product Manufacturing',
    339: 'Miscellaneous Manufacturing',
    221: 'Utilities',
    424: 'Merchant Wholesalers, Nondurable Goods',
    331: 'Primary Metal Manufacturing',
    327: 'Nonmetallic Mineral Product Manufacturing',
    212: 'Mining (except Oil and Gas)',
    313: 'Textile Mills',
    334: 'Computer and Electronic Product Manufacturing',
    337: 'Furniture and Related Product Manufacturing',
    454: 'Nonstore Retailers'
}
# Add a new column for facility type
data['FACILITY_TYPE'] = data['NAICS'].map(naics_mapping)
# Calculate total releases per facility type
facility_type_releases = data.groupby('FACILITY_TYPE')[['TOTAL_TO_AIR', 'TOTAL_TO_LAND', 'TOTAL_TO_WATER']].sum().sum(axis=1)
# Sort the facility types by total releases in descending order
facility_type_releases = facility_type_releases.sort_values(ascending=False)
# Set the figure size
plt.figure(figsize=(12, 8))
# Bar plot; pass hue=x with legend=False because seaborn >= 0.13
# deprecates `palette=` without an explicit `hue` assignment
sns.barplot(x=facility_type_releases.index, y=facility_type_releases.values,
            hue=facility_type_releases.index, palette='viridis', legend=False)
# Set the title and labels
plt.title('Total Releases by Facility Type', fontsize=16, fontweight='bold')
plt.xlabel('Facility Type', fontsize=14)
# Values are plotted unscaled, so the label must not claim millions
plt.ylabel('Total Releases', fontsize=14)
# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')
# Add grid lines for better readability
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Display the plot
plt.tight_layout()
plt.show()
# Set the figure size
plt.figure(figsize=(12, 8))
# Box plot of the P_INDEX distribution per chemical; pass hue=x with
# legend=False because seaborn >= 0.13 deprecates `palette=` without
# an explicit `hue` assignment
sns.boxplot(x='CHEM_NAME', y='P_INDEX', data=data,
            hue='CHEM_NAME', palette='viridis', legend=False)
# Set the title and labels
plt.title('Toxicity Index by Chemical', fontsize=16, fontweight='bold')
plt.xlabel('Chemical', fontsize=14)
plt.ylabel('Toxicity Index', fontsize=14)
# Rotate x-axis labels for better readability
plt.xticks(rotation=90)
# Add grid lines for better readability
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Display the plot
plt.tight_layout()
plt.show()
# Identify the top chemicals -- note the ranking is by TOTAL_TO_AIR
# only, not by combined releases across all media
top_chemicals = data.groupby('CHEM_NAME')['TOTAL_TO_AIR'].sum().sort_values(ascending=False).head(10).index
# Filter the data for the top chemicals
top_chemicals_data = data[data['CHEM_NAME'].isin(top_chemicals)]
# Group by year and chemical, and calculate total air releases
yearly_releases = top_chemicals_data.groupby(['REPYR', 'CHEM_NAME'])['TOTAL_TO_AIR'].sum().reset_index()
# Set the figure size
plt.figure(figsize=(14, 8))
# Use Seaborn to create a line plot with markers, one line per chemical
sns.lineplot(x='REPYR', y='TOTAL_TO_AIR', hue='CHEM_NAME', data=yearly_releases, marker='o', palette='viridis')
# Set the title and labels
plt.title('Trends in Releases of Top Chemicals Over Time', fontsize=16, fontweight='bold')
plt.xlabel('Year', fontsize=14)
plt.ylabel('Total Air Releases', fontsize=14)
# Add grid lines for better readability
plt.grid(True, linestyle='--', alpha=0.7)
# Display the legend
plt.legend(title='Chemical', loc='upper right', fontsize=12)
# Display the plot
plt.tight_layout()
plt.show()