List of Substances Detected in NC
Code demonstration: a Python notebook that lists the substances detected in North Carolina samples, counted both at any abundance (primary or trace) and at primary-only abundance.
import pandas as pd
# Load the NC lab-detail dataset from the public repo (Stata .dta format)
url = 'https://github.com/opioiddatalab/drugchecking/raw/main/datasets/nc/nc_lab_detail.dta'
df = pd.read_stata(url)
# Keep only rows whose date_complete is neither missing nor an empty string.
# The mask is applied once and the result is copied, so the column assignment
# below writes to an independent frame rather than to a filtered slice
# (which can trigger pandas' SettingWithCopyWarning).
has_date = df['date_complete'].notna() & (df['date_complete'] != '')
df = df[has_date].copy()
# Convert date_complete to datetime. Passing an explicit format avoids the
# format-inference UserWarning; with errors='coerce', values that do not match
# the format become NaT and remain in the frame.
# NOTE(review): '%Y-%m-%d' is an assumption -- confirm against the dataset.
df['date_complete'] = pd.to_datetime(df['date_complete'], format='%Y-%m-%d', errors='coerce')
# Count rows per substance at any abundance (primary or trace).
# rename_axis + reset_index(name=...) label both columns explicitly instead of
# overwriting whatever default labels value_counts/reset_index produced.
substance_counts = (
    df['substance']
    .value_counts()
    .rename_axis('substance')
    .reset_index(name='total')
)
# Count rows per substance again, this time excluding rows whose abundance
# is "trace"
is_primary = df['abundance'] != 'trace'
primary_counts = (
    df.loc[is_primary, 'substance']
    .value_counts()
    .rename_axis('substance')
    .reset_index(name='primary')
)
# Left-merge so every detected substance keeps its row, even when it has no
# non-trace detections
merged_df = pd.merge(substance_counts, primary_counts, on='substance', how='left')
# A substance seen only at trace abundance has no match in primary_counts; the
# left merge leaves NaN there and turns the column into floats. Report those
# as 0 and restore an integer count column.
merged_df['primary'] = merged_df['primary'].fillna(0).astype('int64')
# Print the number of distinct sampleid values in df
print(f'Number of unique NC samples: {df["sampleid"].nunique()}')
# Print the latest date_complete in "Day, Month DD, YYYY" form.
# max() yields NaT when df has no rows or every date was coerced to NaT, and
# NaT does not support strftime, so that case gets its own message.
most_recent_date = df['date_complete'].max()
if pd.isna(most_recent_date):
    print('Most recent date analyzed: not available')
else:
    print(f"Most recent date analyzed: {most_recent_date.strftime('%A, %B %d, %Y')}")
# Last expression of the cell: displays the merged counts table in a notebook
merged_df