# System
import warnings
# Data manipulation
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap
# Suppress warnings
# Silence pandas/seaborn FutureWarning chatter so notebook output stays readable.
warnings.filterwarnings("ignore", category=FutureWarning)
# Set the default Seaborn style
# whitegrid + serif is the look used by every chart below.
sns.set_theme(style="whitegrid", font="serif")
def clean_zip(zip_code):
    """Normalize a ZIP code to its 5-digit form.

    Accepts plain 5-digit ZIPs, 9-digit ZIP+4 strings, and hyphenated
    ZIP+4 values ("12345-6789"); surrounding whitespace is ignored.
    Returns None when no valid 5-digit ZIP can be recovered (this also
    covers NaN, which stringifies to "nan").
    """
    # Ensure the ZIP code is a string; drop whitespace and the ZIP+4 hyphen
    zip_code = str(zip_code).strip().replace('-', '')
    if len(zip_code) == 9:
        # ZIP+4 format: keep only the base 5-digit ZIP
        return zip_code[:5]
    elif len(zip_code) == 5:
        return zip_code
    else:
        # Anything else is treated as invalid
        return None
def clean_name(name):
    """Normalize a person's name: collapse whitespace, convert to title case.

    Non-string values (e.g. NaN from a column with missing data) are
    returned unchanged, so the function is safe to `.apply()` on columns
    that were never NaN-filled.
    """
    # Guard: NaN is a float and has no .split()
    if not isinstance(name, str):
        return name
    # Collapse runs of whitespace into single spaces
    name = ' '.join(name.split())
    # Convert to title case
    return name.title()
def clean_email(email):
    """Normalize an e-mail address: trim whitespace and lowercase.

    Non-string values (e.g. NaN) pass through unchanged so the function
    can be applied to columns that still contain missing data.
    """
    # Guard: NaN is a float and has no .strip()
    if not isinstance(email, str):
        return email
    # Trim surrounding spaces, then lowercase
    return email.strip().lower()
def extract_address_and_coordinates(location):
    """Split a geocoded location string into (address, coordinates).

    The input is a multi-line string whose last line holds the
    coordinate pair and whose preceding lines hold the street address.
    Returns (None, None) for missing values (None/NaN) and for the
    'Unknown' placeholder.
    """
    # NaN is a float and would crash on .split(); treat it as missing too
    if not isinstance(location, str) or location == 'Unknown':
        return None, None
    # Split once and reuse -- the original split the string twice
    lines = location.split('\n')
    # Address: everything before the last line, joined with spaces
    address = ' '.join(lines[:-1]).strip()
    # Coordinates: the last line
    coordinates = lines[-1].strip()
    return address, coordinates
# Load the dataset (Kaggle input path; adjust when running elsewhere)
data = pd.read_csv("/kaggle/input/toxics-release-inventory/toxics_release_inventory.csv")
# Preview the dataset, first 5 rows
data.head()
# Preview the dataset, last 5 records
data.tail()
# Dataset information: column dtypes and non-null counts
data.info()
# Preview number of unique values per column
data.nunique().to_frame(name="Unique Values")
# Dataset shape; (rows, columns)
data.shape
# Preview the number of duplicated records
data.duplicated().sum()
# Preview the number of NULL values per column
data.isnull().sum()
# Dataset's statistical summary (transposed: one row per numeric column)
data.describe().T
# Rename all columns by replacing whitespaces and hyphens with underscores and transforming to uppercase
data.columns = data.columns.str.replace(' ', '_').str.replace('-', '_').str.upper()
# Preview cleaned columns
data.columns
# Calculate the percentage of records with null values
null_percentage = (data.isnull().sum() / len(data)) * 100
# Create a DataFrame with the results
null_percentage_df = null_percentage.reset_index()
null_percentage_df.columns = ['Column', 'Null Percentage']
# Display the DataFrame
null_percentage_df
# Drop columns with over 50% NULL values
columns_to_drop = null_percentage[null_percentage > 50].index
data = data.drop(columns=columns_to_drop)
# Recalculate the percentage of records with null values
null_percentage = (data.isnull().sum() / len(data)) * 100
# Create a DataFrame with the results
null_percentage_df = null_percentage.reset_index()
null_percentage_df.columns = ['Column', 'Null Percentage']
# Display the DataFrame
null_percentage_df
# Fill categorical columns with the mode (most frequent value)
categorical_columns = ['CONT_E_ADDRESS', 'CERT_TITLE', 'FUGITIVEBASIS',
                       'STACKBASIS', 'GEOCODED_FACILITY_LOCATION']
for col in categorical_columns:
    # mode() returns an empty Series when the column is entirely NaN;
    # guard against IndexError in that case
    mode_values = data[col].mode()
    if not mode_values.empty:
        # Assign back instead of chained `fillna(..., inplace=True)`,
        # which is unreliable under pandas copy-on-write semantics
        data[col] = data[col].fillna(mode_values[0])
# Fill numerical columns with 0
# NOTE(review): FUGITIVEBASIS and STACKBASIS also appear in the
# categorical list above, so they are already mode-filled here and the
# 0-fill is a no-op for them -- confirm which treatment is intended.
numerical_columns = ['MAX_ONSITE', 'METAL', 'METAL_TO_POTW', 'POTW2', 'OFF_TRE',
                     'TOTAL_TO_AIR', 'TOTAL_TO_LAND', 'TOTAL_TO_WATER',
                     'FUGITIVEBASIS', 'STACKBASIS']
data[numerical_columns] = data[numerical_columns].fillna(0)
# Re-preview the number of NULL values
data.isnull().sum()
# Preview the dataset, again
data.head()
# Convert all facility names to uppercase so duplicates differing only
# in case collapse together when grouping
data['FAC_NAME'] = data['FAC_NAME'].str.upper()
# Preview all unique values in the NAICS column
unique_naics_values = data['NAICS'].unique()
unique_naics_values
# Preview all unique values in the CHEM_NAME column
unique_chem_name_values = data['CHEM_NAME'].unique()
unique_chem_name_values
# Mapping of similar chemical names to standardized names.
# Duplicate keys removed: the original literal listed 'Hydrazine sulfate'
# and 'Titanium tetrachloride' twice (later entry silently won).
chem_name_mapping = {
    'n-Hexane': 'N-Hexane',
    'Copper compounds': 'Copper Compounds',
    'Carbonyl sulfide': 'Carbonyl Sulfide',
    'Manganese compounds': 'Manganese Compounds',
    'Zinc compounds': 'Zinc Compounds',
    'Nitrate compounds': 'Nitrate Compounds',
    'Peracetic acid': 'Peracetic Acid',
    'Lead compounds': 'Lead Compounds',
    'Mercury compounds': 'Mercury Compounds',
    'Cobalt compounds': 'Cobalt Compounds',
    'Nickel compounds': 'Nickel Compounds',
    'Antimony compounds': 'Antimony Compounds',
    'Arsenic compounds': 'Arsenic Compounds',
    'Certain glycol ethers': 'Certain Glycol Ethers',
    'Ethylene glycol': 'Ethylene Glycol',
    'Ethylene oxide': 'Ethylene Oxide',
    'Carbon disulfide': 'Carbon Disulfide',
    'Cyanide compounds': 'Cyanide Compounds',
    'Dioxin and dioxin-like compounds': 'Dioxin and Dioxin-like Compounds',
    'Hydrochloric acid': 'Hydrochloric Acid',
    'Hydrogen cyanide': 'Hydrogen Cyanide',
    'Hydrogen sulfide': 'Hydrogen Sulfide',
    'Molybdenum trioxide': 'Molybdenum Trioxide',
    'Sulfuric acid': 'Sulfuric Acid',
    'Boron trifluoride': 'Boron Trifluoride',
    'Hydrogen fluoride': 'Hydrogen Fluoride',
    'Selenium compounds': 'Selenium Compounds',
    "4,4'-Methylenebis(2-chloroaniline)": "4,4'-Methylenebis(2-Chloroaniline)",
    'Titanium tetrachloride': 'Titanium Tetrachloride',
    'Hydrazine sulfate': 'Hydrazine Sulfate',
    'TOLUENE DIISOCYANATE (MIXED ISOMERS)': 'Toluene diisocyanate (mixed isomers)',
    'CRESOL (MIXED ISOMERS)': 'Cresol (mixed isomers)',
    'XYLENE (MIXED ISOMERS)': 'Xylene (mixed isomers)',
    'CYCLOHEXANE': 'Cyclohexane',
    'METHANOL': 'Methanol',
    'Chromium Compounds (except for chromite ore mined in the Transvaal Region)': 'Chromium Compounds',
    'Hydrochloric acid (1995 and after "Acid Aerosols" only)': 'Hydrochloric Acid',
    'Sulfuric acid - (1994 and after "Acid Aerosols" only)': 'Sulfuric Acid',
    'Chlorodifluoromethane (HCFC-22)': 'Chlorodifluoromethane',
    'Hydrazine sulfate (1:1)': 'Hydrazine Sulfate',
    'Polycyclic aromatic Compounds': 'Polycyclic Aromatic Compounds',
    'Vinyl acetate': 'Vinyl Acetate',
    'Vinyl chloride': 'Vinyl Chloride',
    'Polychlorinated biphenyls': 'Polychlorinated Biphenyls',
    'Butyl acrylate': 'Butyl Acrylate',
    'Formic acid': 'Formic Acid',
    'Ethyl acrylate': 'Ethyl Acrylate',
    'Phthalic anhydride': 'Phthalic Anhydride',
}
# Apply the mapping to standardize names (unmapped names are left as-is)
data['CHEM_NAME'] = data['CHEM_NAME'].replace(chem_name_mapping)
# Apply the cleaning function to the FAC_ZIP column
# (normalizes ZIP+4 values to 5 digits; invalid ZIPs become None)
data['FAC_ZIP'] = data['FAC_ZIP'].apply(clean_zip)
# Preview all unique values in the FAC_ZIP column
unique_values = data['FAC_ZIP'].unique()
unique_values
# Apply the cleaning function to the TECH_NAME and CONT_NAME columns
# (collapses whitespace and title-cases each name)
data['TECH_NAME'] = data['TECH_NAME'].apply(clean_name)
data['CONT_NAME'] = data['CONT_NAME'].apply(clean_name)
# Mapping of similar person names to standardized names.
# Duplicate key removed: 'Daniel C Leandri' appeared twice in the
# original literal (identical value, later entry silently won).
name_mapping = {
    'Russell W. Davis, Chmm, Csp': 'Russell W. Davis',
    'Russell W. Davis, Chmm': 'Russell W. Davis',
    'Russell Davis': 'Russell W. Davis',
    'Daniel C Leandri': 'Daniel C. Leandri',
    'Daniel Leandri': 'Daniel C. Leandri',
    'Edward M. Short, President': 'Edward M. Short',
    'Richard M. Foreman Csp': 'Richard M. Foreman',
    'Sharon.M.Adams': 'Sharon M. Adams',
    'Sharon Adams': 'Sharon M. Adams',
    'Paul H Swarm': 'Paul H. Swarm',
    'Jeffrey S Chavanne': 'Jeffrey S. Chavanne',
    'Sgt Sean Maynard': 'Sgt. Sean Maynard',
    'John R Hall': 'John R. Hall'
}
# Apply the mapping to standardize names in both columns
data['TECH_NAME'] = data['TECH_NAME'].replace(name_mapping)
data['CONT_NAME'] = data['CONT_NAME'].replace(name_mapping)
# Apply the cleaning function to the TECH_E_ADDRESS and CONT_E_ADDRESS columns
data['TECH_E_ADDRESS'] = data['TECH_E_ADDRESS'].apply(clean_email)
data['CONT_E_ADDRESS'] = data['CONT_E_ADDRESS'].apply(clean_email)
# Split GEOCODED_FACILITY_LOCATION into address and coordinates.
# Apply once and unpack the (address, coordinates) tuples with .str[i]
# instead of building a pd.Series per row, which is far slower.
extracted = data['GEOCODED_FACILITY_LOCATION'].apply(extract_address_and_coordinates)
data['GEOCODED_FACILITY_ADDRESS'] = extracted.str[0]
data['GEOCODED_FACILITY_COORDINATES'] = extracted.str[1]
# Drop the original combined column now that it has been split
data = data.drop(columns=['GEOCODED_FACILITY_LOCATION'])
# Display the new columns
data[['GEOCODED_FACILITY_ADDRESS', 'GEOCODED_FACILITY_COORDINATES']].head()
# Set the figure size
plt.figure(figsize=(10, 6))
# Total releases per reporting year, one column per release medium
yearly_releases = (
    data.groupby('REPYR')[['TOTAL_TO_AIR', 'TOTAL_TO_LAND', 'TOTAL_TO_WATER']]
    .sum()
    .reset_index()  # keep REPYR as a column for sns.lineplot
)
# Column -> legend label, in plotting order
media_labels = {
    'TOTAL_TO_AIR': 'Air',
    'TOTAL_TO_LAND': 'Land',
    'TOTAL_TO_WATER': 'Water',
}
# One marked line per release medium
for column, label in media_labels.items():
    sns.lineplot(data=yearly_releases, x='REPYR', y=column, label=label, marker='o')
# Annotate each medium's peak value at its peak year
for column in media_labels:
    peak_value = yearly_releases[column].max()
    peak_year = yearly_releases.loc[yearly_releases[column].idxmax(), 'REPYR']
    plt.text(peak_year, peak_value, f'{peak_value:.2f}',
             ha='center', va='bottom', fontsize=10, fontweight='bold')
# Title and axis labels
plt.title('Trends in Total Releases Over Time', fontsize=16, fontweight='bold')
plt.xlabel('Year', fontsize=14)
plt.ylabel('Total Releases', fontsize=14)
# Grid lines for readability
plt.grid(True, linestyle='--', alpha=0.7)
# Legend describing the release medium
plt.legend(title='Release Medium', fontsize=12, title_fontsize=14)
# Render the plot
plt.tight_layout()
plt.show()
# Set the figure size
plt.figure(figsize=(12, 8))
# Calculate total releases per facility (sum across the three media)
facility_releases = data.groupby('FAC_NAME')[['TOTAL_TO_AIR', 'TOTAL_TO_LAND', 'TOTAL_TO_WATER']].sum().sum(axis=1)
# Get the top 10 polluters
top_polluters = facility_releases.sort_values(ascending=False).head(10)
# Bar plot; pass hue=y with legend=False because seaborn >= 0.13
# deprecates `palette=` without an explicit `hue` assignment
sns.barplot(x=top_polluters.values, y=top_polluters.index,
            hue=top_polluters.index, palette='viridis', legend=False)
# Set title and labels
plt.title('Top 10 Polluters by Total Releases', fontsize=16, fontweight='bold')
plt.xlabel('Total Releases', fontsize=14)
plt.ylabel('Facility', fontsize=14)
# Add grid lines for better readability
plt.grid(axis='x', linestyle='--', alpha=0.7)
# Display the plot
plt.tight_layout()
plt.show()
# Set the figure size
plt.figure(figsize=(12, 8))
# Calculate total releases per chemical (sum across the three media)
chemical_releases = data.groupby('CHEM_NAME')[['TOTAL_TO_AIR', 'TOTAL_TO_LAND', 'TOTAL_TO_WATER']].sum().sum(axis=1)
# Get the top 10 chemicals
top_chemicals = chemical_releases.sort_values(ascending=False).head(10)
# Bar plot; pass hue=y with legend=False because seaborn >= 0.13
# deprecates `palette=` without an explicit `hue` assignment
sns.barplot(x=top_chemicals.values, y=top_chemicals.index,
            hue=top_chemicals.index, palette='viridis', legend=False)
# Set the title of the plot
plt.title('Top 10 Chemicals by Total Releases', fontsize=16, fontweight='bold')
# Set labels
plt.xlabel('Total Releases', fontsize=14)
plt.ylabel('Chemical', fontsize=14)
# Add grid lines for better readability
plt.grid(axis='x', linestyle='--', alpha=0.7)
# Display the plot
plt.tight_layout()
plt.show()
# Set the figure size
plt.figure(figsize=(10, 6))
# Calculate total waste management quantities per method
waste_management = data[['LANDFILL', 'LANDFARM', 'POTW', 'OFF_TRE']].sum()
# Bar plot (one bar per method); pass hue=x with legend=False because
# seaborn >= 0.13 deprecates `palette=` without an explicit `hue`
sns.barplot(x=waste_management.index, y=waste_management.values,
            hue=waste_management.index, palette='viridis', legend=False)
# Set the title and labels
plt.title('Waste Management Practices', fontsize=16, fontweight='bold')
plt.xlabel('Method', fontsize=14)
plt.ylabel('Total Quantity', fontsize=14)
# Add grid lines for better readability
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Display the plot
plt.tight_layout()
plt.show()
# Extract latitude and longitude from the "(lat, lon)" coordinate string
# in one pass, and convert to numeric up front (errors='coerce' turns
# unparseable coordinates into NaN instead of crashing float() later)
coords = data['GEOCODED_FACILITY_COORDINATES'].str.extract(r'\(\s*([^,]+?)\s*,\s*([^)]+?)\s*\)')
data['LATITUDE'] = pd.to_numeric(coords[0], errors='coerce')
data['LONGITUDE'] = pd.to_numeric(coords[1], errors='coerce')
# Calculate total releases for each facility
data['TOTAL_RELEASES'] = data['TOTAL_TO_AIR'] + data['TOTAL_TO_LAND'] + data['TOTAL_TO_WATER']
# Create a map centered on Delaware
map_de = folium.Map(location=[39.0, -75.5], zoom_start=8, tiles='OpenStreetMap')
# Only rows with parseable coordinates can be placed on the map
mappable = data.dropna(subset=['LATITUDE', 'LONGITUDE'])
# Add markers for each facility
for idx, row in mappable.iterrows():
    folium.Marker(
        location=[row['LATITUDE'], row['LONGITUDE']],
        popup=f"{row['FAC_NAME']}<br>Total Releases: {row['TOTAL_RELEASES']}",
        icon=folium.Icon(color='blue')
    ).add_to(map_de)
# Add a heatmap to visualize the concentration of facilities and their
# releases (weighted by TOTAL_RELEASES)
heat_data = mappable[['LATITUDE', 'LONGITUDE', 'TOTAL_RELEASES']].values.tolist()
HeatMap(heat_data, radius=15, blur=10).add_to(map_de)
# Save the map to an HTML file (optional)
map_de.save('facility_releases_map.html')
# Display the map
map_de
# Map NAICS codes to industry names
# NOTE(review): keys are 3-digit NAICS sector codes -- assumes the
# NAICS column holds 3-digit codes (unmapped codes become NaN).
naics_mapping = {
    311: 'Food Manufacturing',
    928: 'National Security and International Affairs',
    326: 'Plastics and Rubber Products Manufacturing',
    325: 'Chemical Manufacturing',
    324: 'Petroleum and Coal Products Manufacturing',
    323: 'Printing and Related Support Activities',
    333: 'Machinery Manufacturing',
    335: 'Electrical Equipment, Appliance, and Component Manufacturing',
    332: 'Fabricated Metal Product Manufacturing',
    339: 'Miscellaneous Manufacturing',
    221: 'Utilities',
    424: 'Merchant Wholesalers, Nondurable Goods',
    331: 'Primary Metal Manufacturing',
    327: 'Nonmetallic Mineral Product Manufacturing',
    212: 'Mining (except Oil and Gas)',
    313: 'Textile Mills',
    334: 'Computer and Electronic Product Manufacturing',
    337: 'Furniture and Related Product Manufacturing',
    454: 'Nonstore Retailers'
}
# Add a new column for facility type
data['FACILITY_TYPE'] = data['NAICS'].map(naics_mapping)
# Calculate total releases per facility type
facility_type_releases = data.groupby('FACILITY_TYPE')[['TOTAL_TO_AIR', 'TOTAL_TO_LAND', 'TOTAL_TO_WATER']].sum().sum(axis=1)
# Sort the facility types by total releases in descending order
facility_type_releases = facility_type_releases.sort_values(ascending=False)
# Set the figure size
plt.figure(figsize=(12, 8))
# Bar plot; pass hue=x with legend=False because seaborn >= 0.13
# deprecates `palette=` without an explicit `hue` assignment
sns.barplot(x=facility_type_releases.index, y=facility_type_releases.values,
            hue=facility_type_releases.index, palette='viridis', legend=False)
# Set the title and labels
plt.title('Total Releases by Facility Type', fontsize=16, fontweight='bold')
plt.xlabel('Facility Type', fontsize=14)
# Values are plotted unscaled, so the label must not claim millions
plt.ylabel('Total Releases', fontsize=14)
# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')
# Add grid lines for better readability
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Display the plot
plt.tight_layout()
plt.show()
# Set the figure size
plt.figure(figsize=(12, 8))
# Box plot of the P_INDEX distribution per chemical; pass hue=x with
# legend=False because seaborn >= 0.13 deprecates `palette=` without
# an explicit `hue` assignment
sns.boxplot(x='CHEM_NAME', y='P_INDEX', data=data,
            hue='CHEM_NAME', palette='viridis', legend=False)
# Set the title and labels
plt.title('Toxicity Index by Chemical', fontsize=16, fontweight='bold')
plt.xlabel('Chemical', fontsize=14)
plt.ylabel('Toxicity Index', fontsize=14)
# Rotate x-axis labels for better readability
plt.xticks(rotation=90)
# Add grid lines for better readability
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Display the plot
plt.tight_layout()
plt.show()
# Identify the top chemicals -- note the ranking is by TOTAL_TO_AIR
# only, not by combined releases across all media
top_chemicals = data.groupby('CHEM_NAME')['TOTAL_TO_AIR'].sum().sort_values(ascending=False).head(10).index
# Filter the data for the top chemicals
top_chemicals_data = data[data['CHEM_NAME'].isin(top_chemicals)]
# Group by year and chemical, and calculate total air releases
yearly_releases = top_chemicals_data.groupby(['REPYR', 'CHEM_NAME'])['TOTAL_TO_AIR'].sum().reset_index()
# Set the figure size
plt.figure(figsize=(14, 8))
# Use Seaborn to create a line plot with markers, one line per chemical
sns.lineplot(x='REPYR', y='TOTAL_TO_AIR', hue='CHEM_NAME', data=yearly_releases, marker='o', palette='viridis')
# Set the title and labels
plt.title('Trends in Releases of Top Chemicals Over Time', fontsize=16, fontweight='bold')
plt.xlabel('Year', fontsize=14)
plt.ylabel('Total Air Releases', fontsize=14)
# Add grid lines for better readability
plt.grid(True, linestyle='--', alpha=0.7)
# Display the legend
plt.legend(title='Chemical', loc='upper right', fontsize=12)
# Display the plot
plt.tight_layout()
plt.show()