-- previewing the first few rows of the data
SELECT *
FROM "paytondhubbard/census_data"."2017_agg_census_data"
LIMIT 5
SELECT *
FROM "paytondhubbard/census_data"."2017_agg_census_data"
WHERE census_chapter = 1
AND census_table = 1
SELECT *
FROM "paytondhubbard/census_data"."2017_agg_census_data"
WHERE census_chapter = 2
AND census_table IN (46, 47)
AND agg_level_desc = 'STATE'
SELECT *
FROM "paytondhubbard/census_data"."2017_agg_census_data"
WHERE census_chapter = 2
AND census_table IN (48, 49, 50, 51, 52, 53, 54)
AND agg_level_desc = 'STATE'
table_1_df_raw.to_csv("data/table_1.csv", index=False)
farmer_gender_table_df_raw.to_csv("data/farmer_gender_table.csv", index=False)
farmer_race_table_df_raw.to_csv("data/farmer_race_table.csv", index=False)
import geopandas as gpd
from shapely.geometry import Point, Polygon
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
table_1_df_raw = pd.read_csv("data/table_1.csv")
def clean_data_frame(df):
cleaned_df = df.copy()
# Replace non numeric values with an empty string
cleaned_df["value"] = cleaned_df["value"].str.replace(r"[^0-9]+", '', regex=True)
# Coercing is necessary for the blank string values
cleaned_df["value"] = pd.to_numeric(cleaned_df["value"], errors='coerce')
# Convert from float to integer
cleaned_df["value"] = cleaned_df["value"].fillna(0).astype(int)
# Converting the state name to be sentence case
cleaned_df["state_name"] = cleaned_df["state_name"].str.title()
return cleaned_df
table_1_df = clean_data_frame(table_1_df_raw)
def plot_map(df, state_col, intensity_col, title=False, cmap="Blues", legend=False):
usa = gpd.read_file("./cb_2018_us_state_20m/cb_2018_us_state_20m.shp")
# Filtering to have a simple continental map
continental_us = usa.loc[~usa["NAME"].isin(["Alaska", "Hawaii", "Puerto Rico"])].copy()
merged_df = continental_us.merge(df, how="left", left_on="NAME", right_on=state_col)
merged_df.plot(column=intensity_col, cmap=cmap, legend=legend)
plt.axis('off')
if title:
plt.title(title)
return plt.show()
def style_dataframe(df, title=None, column_names=None, percent_column_indexes=None,
highlighting=None, highlight_subset=None, heatmap=None, heatmap_subset=None):
new_df = df.copy()
percent_formatting = {}
if column_names:
column_conversions = {old: new for old, new in zip(new_df.columns, column_names)}
new_df.columns = column_names
styler = new_df.style
if title:
styler.set_caption(title)
if percent_column_indexes:
percent_columns = new_df.columns[percent_column_indexes]
styler.format({column_name: '{:,.2%}'.format for column_name in percent_columns})
if highlighting:
if highlight_subset:
if column_names:
subset_columns = [column_conversions[column_name] for column_name in highlight_subset]
else:
subset_columns = highlight_subset
styler.highlight_max(color='#86BCE9', axis=1, subset=subset_columns)
else:
styler.highlight_max(color = '#86BCE9', axis = 1)
if heatmap:
if heatmap_subset:
subset_columns = [column_conversions[column_name] for column_name in heatmap_subset]
styler.background_gradient(cmap='Blues', axis=None, subset=subset_columns)
else:
styler.background_gradient(cmap='Blues', axis=None)
return styler
def print_as_currency(value, postface=""):
print("${:,}".format(value) + postface)
# filter to only data pertaining to the number of farms in each state and only the columns we need
num_operations_filter = table_1_df["short_desc"] == "FARM OPERATIONS - NUMBER OF OPERATIONS"
num_farms_df = table_1_df.loc[num_operations_filter].copy()
state_filter = num_farms_df["state_name"] == "Arkansas"
num_farms_ark_df = num_farms_df.loc[state_filter]
num_farms_ark_df
no_domaincat_desc = num_farms_ark_df["domaincat_desc"].isnull()
total_farms_w_domaincat_desc = num_farms_ark_df.loc[no_domaincat_desc, "value"].sum()
total_farms_no_domaincat_desc = num_farms_ark_df.loc[~no_domaincat_desc, "value"].sum()
print("Total number of farms that have a domaincat_desc value: " + str(total_farms_w_domaincat_desc))
print("Total number of farms that don't have a domaincat_desc value: " + str(total_farms_no_domaincat_desc))
only_totals_rows = num_farms_df["domaincat_desc"].isnull()
num_farms_totals = num_farms_df.loc[only_totals_rows, [ "state_name", "value"]].sort_values(["value"], ascending=False)
num_farms_totals = num_farms_totals.iloc[1:]
num_farms_totals.columns = ["State", "Number_of_Farms"]
num_farms_totals.reset_index(inplace=True, drop=True)
style_dataframe(
num_farms_totals,
title="Total # of Farms by State",
column_names=["State", "Number of Farms"],
heatmap=True
)
num_farms_totals["Percent_US_Total"] = (
num_farms_totals["Number_of_Farms"] / num_farms_totals["Number_of_Farms"].sum()
)
top_five_states = num_farms_totals.iloc[0:5]
style_dataframe(
top_five_states,
title="Top 5 U.S. states - Number of Farms",
column_names=["State", "Number of Farms", "Percent of U.S. Total"],
percent_column_indexes=[2]
)
plot_map(num_farms_totals, "State", "Number_of_Farms", title="Total # of Farms per State")
# Reading in census data on population estimates
state_pop_df_raw = pd.read_excel(
"https://www2.census.gov/programs-surveys/popest/tables/2010-2019/state/totals/nst-est2019-01.xlsx"
)
rows_to_drop = [0 , 1, 2] + list(range(3,8)) + list(range(59, 66))
new_header_row = state_pop_df_raw.iloc[2].copy()
new_header_row[0] = "State"
state_pop_df = state_pop_df_raw.drop(rows_to_drop)
state_pop_df.columns = new_header_row
state_pop_df["State"] = state_pop_df["State"].str.replace(".", "", regex=True)
state_pop_df.reset_index(drop=True, inplace=True)
population_2017 = state_pop_df[["State", 2017]]
population_2017.columns = ["State", "Population"]
# Previewing the first 5 rows of the df
population_2017.head()
farms_per_capita = num_farms_totals.merge(population_2017, how="left", left_on="State", right_on="State")
farms_per_capita["Farms_per_1000_people"] = farms_per_capita["Number_of_Farms"] / (farms_per_capita["Population"]/1000)
farms_per_capita.sort_values("Farms_per_1000_people", inplace=True, ascending=False)
farms_per_capita["Population"] = farms_per_capita["Population"].astype(int)
farms_per_capita.reset_index(drop=True, inplace=True)
style_dataframe(
farms_per_capita,
title="Number of Farms per capita by State",
column_names=["State", "Number of Farms", "Percent of U.S. Total", "State Population", "Farms per 1000 people"],
percent_column_indexes=[2],
heatmap=True,
heatmap_subset=["Farms_per_1000_people"]
)
plot_map(farms_per_capita, "State", "Farms_per_1000_people", title="# of Farms per Capita per State")
farm_size_conversions = {
"AREA OPERATED: (500 TO 999 ACRES)": ["500-999", 5],
"AREA OPERATED: (50 TO 179 ACRES)": ["50-179", 3],
"AREA OPERATED: (180 TO 499 ACRES)": ["180-599", 4],
"AREA OPERATED: (10.0 TO 49.9 ACRES)": ["10-49.9", 2],
"AREA OPERATED: (1.0 TO 9.9 ACRES)": ["1-9.9", 1],
"AREA OPERATED: (1,000 TO 1,999 ACRES)": ["1k-1,999", 6],
"AREA OPERATED: (2,000 OR MORE ACRES)": ["2k+", 7],
}
farm_sizes_df = num_farms_df.loc[
(num_farms_df["agg_level_desc"] == "NATIONAL") &
(~num_farms_df["domaincat_desc"].isnull()),
["domaincat_desc", "value"]
].copy()
farm_sizes_df["farm_size_value"] = farm_sizes_df["domaincat_desc"].apply(lambda x: farm_size_conversions.get(x)[1])
farm_sizes_df = farm_sizes_df.sort_values("farm_size_value").reset_index(drop=True)
farm_sizes_df = farm_sizes_df.drop(columns=["farm_size_value"])
farm_sizes_df.columns = ["Farm_Size_Acres", "Number_of_Farms"]
farm_sizes_df["Farm_Size_Acres"] = farm_sizes_df["Farm_Size_Acres"].apply(lambda x: farm_size_conversions.get(x)[0])
style_dataframe(
farm_sizes_df,
title="Number of Farms by Size in Acres",
column_names=["Farm Size in Acres", "Number of Farms"],
heatmap=True
)
plt.bar(farm_sizes_df["Farm_Size_Acres"], farm_sizes_df["Number_of_Farms"])
plt.title("Number of Farms by Size in Acres")
plt.xlabel("Farm Size (Acres)")
plt.ylabel("Number of Farms")
plt.show()
large_farms_filter = (
(num_farms_df["domaincat_desc"] == "AREA OPERATED: (2,000 OR MORE ACRES)") &
(num_farms_df["agg_level_desc"] != "NATIONAL")
)
large_farms = num_farms_df.loc[large_farms_filter, ["short_desc", "state_name", "domaincat_desc", "value"]]
top_ten_states_large_farms = large_farms.groupby(["state_name"])["value"].sum().sort_values(ascending=False)[:10].reset_index()
style_dataframe(
top_ten_states_large_farms,
title="Number of farms with over 2k acres",
column_names=["State", "Number of Farms >= 2k Acres"]
)
# Filtering down to the data we need
desired_descriptions = ["CROP TOTALS - SALES, MEASURED IN $", "ANIMAL TOTALS, INCL PRODUCTS - SALES, MEASURED IN $"]
crops_livestock_filter = (
(table_1_df["short_desc"].isin(desired_descriptions)) &
(table_1_df["state_name"] != "Us Total")
)
crops_livestock_df = table_1_df.loc[
crops_livestock_filter, ["commodity_desc", "state_name", "value"]
].copy()
crops_livestock_df = crops_livestock_df.pivot(index="state_name", columns="commodity_desc", values="value")
crops_livestock_df = crops_livestock_df.reset_index()
crops_livestock_df.columns = ["State_Name", "Livestock_Revenue", "Crop_Revenue"]
crops_livestock_df.head()
crops_livestock_df["Dominant_Industry"] = np.where(
crops_livestock_df["Livestock_Revenue"] > crops_livestock_df["Crop_Revenue"],
"Livestock", "Crops"
)
style_dataframe(
crops_livestock_df,
title="Livestock vs. Crop Revenue by State",
column_names=["State", "Revenue from Livestock", "Revenue from Crops", "Dominant Industry"]
)
total_livestock_revenue = crops_livestock_df["Livestock_Revenue"].sum()
total_crop_revenue = crops_livestock_df["Crop_Revenue"].sum()
print_as_currency(total_livestock_revenue, postface=" - US Total Livestock Revenue")
print_as_currency(total_crop_revenue, postface=" - US Total Crop Revenue")
import matplotlib.colors
my_cmap = matplotlib.colors.ListedColormap(['#4670AA', '#D9A979'])
plot_map(crops_livestock_df, "State_Name", "Dominant_Industry", cmap=my_cmap, legend=True)
# filtering to only livestock and previewing the first 5 rows
desired_descriptions = [
"CATTLE, COWS, BEEF - OPERATIONS WITH INVENTORY",
"CATTLE, COWS, MILK - OPERATIONS WITH INVENTORY",
"HOGS - OPERATIONS WITH INVENTORY",
"CHICKENS, LAYERS - OPERATIONS WITH INVENTORY"
]
livestock_details_filter = (
(table_1_df["short_desc"].isin(desired_descriptions)) &
(table_1_df["state_name"] != "Us Total")
)
livestock_details_df_raw = table_1_df.loc[
livestock_details_filter,
["short_desc", "state_name", "value"]
]
livestock_details_df_raw.head()
# Pivoting the data and calculating percentages
livestock_details_df = livestock_details_df_raw.pivot(index="state_name", columns="short_desc", values="value")
livestock_percentages = livestock_details_df.div(livestock_details_df.sum(axis=1), axis=0)
percentage_columns = ["Percent_Beef_Cow_Farms", "Percent_Milk_Cow_Farms", "Percent_Chicken_Farms", "Percent_Hog_Farms"]
livestock_percentages.columns = percentage_columns
livestock_details_df = livestock_details_df.join(livestock_percentages)
livestock_percentages = livestock_percentages.reset_index()
livestock_details_df = livestock_details_df.reset_index()
livestock_details_df.columns = [
"State_Name", "Num_Beef_Cow_Farms", "Num_Milk_Cow_Farms",
"Num_Chicken_Farms", "Num_Hog_Farms"] + percentage_columns
style_dataframe(
livestock_percentages,
title="Livestock Percentages by State",
column_names=["State", "% Beef Cow Farms", "% Milk Cow Farms", "% Chicken Farms", "% Hog Farms"],
percent_column_indexes=[1,2,3,4],
heatmap=True
)
non_beef_majority_filter = livestock_percentages["Percent_Beef_Cow_Farms"] <= 0.5
non_beef_majority = livestock_percentages.loc[non_beef_majority_filter]
style_dataframe(
non_beef_majority,
column_names=["State", "% Beef Cow Farms", "% Milk Cow Farms", "% Chicken Farms", "% Hog Farms"],
heatmap=True,
heatmap_subset=["Percent_Milk_Cow_Farms", "Percent_Chicken_Farms", "Percent_Hog_Farms"],
percent_column_indexes=[1,2,3,4]
)
livestock_types = ["Beef_Cow", "Milk_Cow", "Chicken", "Hog"]
top_5_states = {}
def print_top_five_states(column_prefix, print_postface):
for livestock_type in livestock_types:
column_name = column_prefix + "_" + livestock_type + "_Farms"
states = livestock_details_df[
["State_Name", column_name]
].sort_values(column_name, ascending=False).iloc[:5].reset_index(drop=True)
top_5_states[livestock_type] = states["State_Name"]
title = "Top 5 " + livestock_type.replace('_', ' ') + " states " + print_postface
display(style_dataframe(
states,
title=title
))
print("\n")
print_top_five_states("Num", "(Total # of Farms)")
print("---------------\n")
print_top_five_states("Percent", "(Percentage of Farms)")
livestock_details_df["top_five_beef_cow"] = livestock_details_df["State_Name"].isin(top_5_states["Beef_Cow"])
for livestock_type in livestock_types:
column_name = "top_five_" + livestock_type
livestock_details_df[column_name] = ~livestock_details_df["State_Name"].isin(top_5_states[livestock_type])
title = "Top 5 " + livestock_type.replace('_', ' ') + " States (Percentage)"
plot_map(livestock_details_df, "State_Name", column_name, title=title, cmap="tab20c")
# Read in the data we had queried earlier and filter to only principal producer info
farmer_gender_df_raw = pd.read_csv("data/farmer_gender_table.csv")
farmer_gender_df = clean_data_frame(farmer_gender_df_raw)
producer_gender_filter = farmer_gender_df["short_desc"].isin(
["PRODUCERS, PRINCIPAL, MALE - NUMBER OF PRODUCERS", "PRODUCERS, PRINCIPAL, FEMALE - NUMBER OF PRODUCERS"]
)
farmer_gender_df = farmer_gender_df.loc[producer_gender_filter, ["short_desc", "state_name", "value"]]
farmer_gender_df["gender"] = np.where(
farmer_gender_df["short_desc"].str.contains("FEMALE"),
"Female", "Male"
)
national_gender_breakdown = pd.DataFrame(farmer_gender_df.groupby(["gender"])["value"].sum()).reset_index()
national_gender_breakdown["percentage"] = national_gender_breakdown["value"] / national_gender_breakdown["value"].sum()
style_dataframe(
national_gender_breakdown,
title="Gender of U.S. Farmers",
column_names=["Gender", "Number of Farmers", "Percent of Farmers"],
percent_column_indexes=[2]
)
state_gender_breakdown = farmer_gender_df.pivot(index="state_name", columns="gender", values="value").reset_index()
state_gender_breakdown.index.name = None
state_gender_breakdown["percent_female"] = state_gender_breakdown["Female"]/(state_gender_breakdown["Female"] + state_gender_breakdown["Male"])
style_dataframe(
state_gender_breakdown,
title="Gender of U.S. Farmers by State",
column_names=["State", "Number of Female Farmers", "Number of Male Farmers", "Percent of Farmers who are Female"],
heatmap=True,
heatmap_subset=["percent_female"],
percent_column_indexes=[3]
)
top_five_female_states = state_gender_breakdown.sort_values("percent_female", ascending=False)
top_five_female_states = top_five_female_states.iloc[:5, [0, 3]].reset_index(drop=True)
style_dataframe(
top_five_female_states,
title="Top 5 States - % of Female Farmers",
column_names=["State", "Percent of Farmers who are Female"],
percent_column_indexes=[1]
)
bottom_five_female_states = state_gender_breakdown.sort_values("percent_female")
bottom_five_female_states = bottom_five_female_states.iloc[:5, [0, 3]].reset_index(drop=True)
style_dataframe(
bottom_five_female_states,
title="Bottom 5 States - % of Female Farmers",
column_names=["State", "Percent of Farmers who are Female"],
percent_column_indexes=[1]
)
farmer_race_df_raw = pd.read_csv("data/farmer_race_table.csv")
farmer_race_df_raw = clean_data_frame(farmer_race_df_raw)
# Here I just provide alternative, shorter column names
race_conversion = {
"PRODUCERS, PRINCIPAL, HISPANIC - NUMBER OF PRODUCERS": "hispanic",
"PRODUCERS, PRINCIPAL, AMERICAN INDIAN OR ALASKA NATIVE - NUMBER OF PRODUCERS": "am_indian_alaska_native",
"PRODUCERS, PRINCIPAL, ASIAN - NUMBER OF PRODUCERS": "asian",
"PRODUCERS, PRINCIPAL, BLACK OR AFRICAN AMERICAN - NUMBER OF PRODUCERS": "black",
"PRODUCERS, PRINCIPAL, NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER - NUMBER OF PRODUCERS": "hawaiian_pacific_islander",
"PRODUCERS, PRINCIPAL, WHITE - NUMBER OF PRODUCERS": "white",
"PRODUCERS, PRINCIPAL, MULTI-RACE - NUMBER OF PRODUCERS": "multi_race"
}
# Filtering down the dataframe to only the desired values
race_description_filter = (farmer_race_df_raw["short_desc"].isin(race_conversion.keys()))
farmer_race_df = farmer_race_df_raw.loc[
race_description_filter, ["short_desc", "state_name", "value"]]
farmer_race_df["race"] = farmer_race_df["short_desc"].apply(lambda x: race_conversion[x])
# Grouping the data on a national level
us_race_breakdown = farmer_race_df.groupby(["race"])["value"].sum().reset_index().sort_values("value", ascending=False)
us_race_breakdown.columns = ["race", "num_farmers"]
us_race_breakdown["percent_farmers"] = round(us_race_breakdown["num_farmers"]/us_race_breakdown["num_farmers"].sum(), 2)
style_dataframe(
us_race_breakdown,
column_names=["Race", "Number of Farmers", "Percent of all Farmers"],
title="U.S. Farmer Race Demographics:",
percent_column_indexes=[2]
)
state_race_breakdown_totals = farmer_race_df.groupby(["state_name", "race"])["value"].sum().reset_index().sort_values("value", ascending=False)
state_race_breakdown_totals = state_race_breakdown_totals.pivot(index="state_name", columns="race", values="value")
state_race_breakdown = state_race_breakdown_totals.div(state_race_breakdown_totals.sum(axis=1), axis=0)
state_race_breakdown.columns = [column_name + "_perc" for column_name in state_race_breakdown.columns]
state_race_breakdown = state_race_breakdown.sort_values("white_perc")
not_white_majority_filter = state_race_breakdown.idxmax(axis=1) != "white_perc" # return the index of the maximum column for each row
style_dataframe(
state_race_breakdown.loc[not_white_majority_filter],
title="States where white farmers are not the majority",
column_names=["American Indian/Alaska Native", "Asian", "Black", "Hawaiian/Pacific Islander", "Hispanic", "Multi-Racial", "White"],
percent_column_indexes=[0,1,2,3,4,5,6],
highlighting=True
)
white_perc_filter = state_race_breakdown["white_perc"] < .9
state_race_breakdown_perc = state_race_breakdown.loc[white_perc_filter]
style_dataframe(
state_race_breakdown_perc,
title="States with white-farmer percentages less than 90%",
column_names=["American Indian/Alaska Native", "Asian", "Black", "Hawaiian/Pacific Islander", "Hispanic", "Multi-Racial", "White"],
percent_column_indexes=[0,1,2,3,4,5,6],
highlighting=True,
highlight_subset=["am_indian_alaska_native_perc", "asian_perc", "black_perc", "hawaiian_pacific_islander_perc", "hispanic_perc", "multi_race_perc"]
)