import pandas as pd
import re
import unicodedata
# Collect every DataFrame currently bound in the module (notebook) namespace.
# The items are copied into a list first so the scan works on a snapshot and
# cannot fail if the namespace is modified while it is being iterated.
dfs = {
    frame_name: obj
    for frame_name, obj in list(globals().items())
    if isinstance(obj, pd.DataFrame)
}

print("Dataframes found:")
# Iterate over the names only. A module-level ``for name, df in ...`` would
# rebind the globals ``name`` and ``df``: that clobbers a user DataFrame that
# happens to be called ``df`` and leaves a stray DataFrame alias behind that
# this very scan would pick up on the next run.
for _frame_name in dfs:
    print(f"{_frame_name}: {dfs[_frame_name].columns.tolist()}")

# Fail with an actionable message (same style as the GDP check further down)
# instead of a bare NameError when a required input frame is absent.
_missing_inputs = [
    required for required in ("isocodes", "pop") if required not in dfs
]
if _missing_inputs:
    raise ValueError(
        "Expected DataFrames named 'isocodes' and 'pop'; missing or not a "
        f"DataFrame: {_missing_inputs}. Re-run the cells that create them."
    )

# Work on copies so the original input frames are left untouched.
iso_df = dfs["isocodes"].copy()
pop_df = dfs["pop"].copy()
def looks_like_gdp_df(df):
    """Heuristically decide whether ``df`` could be a GDP table.

    A frame qualifies when some column label mentions a country/name keyword
    AND either some label mentions a code keyword or some label is exactly a
    four-digit string (a year column). Matching is done on the string form of
    each label; keyword matching is case-insensitive.

    Args:
        df: Object exposing a ``columns`` iterable (a pandas DataFrame).

    Returns:
        bool: True if the column labels satisfy the heuristic.
    """
    labels = [str(label) for label in df.columns]
    lowered = [label.lower() for label in labels]

    def mentions(keywords):
        # True when any keyword occurs as a substring of any lowered label.
        return any(keyword in label for label in lowered for keyword in keywords)

    if not mentions(("country name", "country", "name")):
        return False
    if mentions(("country code", "code")):
        return True
    # No code-like column: fall back to looking for a year-named column.
    return any(re.fullmatch(r"\d{4}", label) is not None for label in labels)
# Names that must never be treated as the GDP source: the two named inputs
# plus the frames this script itself binds at module level. Without this, a
# re-run of the script would find its own intermediate/output frames in
# ``dfs`` and could pick one of them as "the GDP dataframe".
_NON_GDP_NAMES = {
    "pop",
    "isocodes",
    "iso_df",
    "pop_df",
    "iso_small",
    "pop_small",
    "gdp_small",
    "countrydata",
}
# Also skip any other name that is merely an alias of one of those objects
# (for example a leftover loop variable bound to the same DataFrame).
_non_gdp_ids = {
    id(dfs[reserved]) for reserved in _NON_GDP_NAMES if reserved in dfs
}

gdp_candidates = {
    frame_name: frame
    for frame_name, frame in dfs.items()
    if frame_name not in _NON_GDP_NAMES
    and id(frame) not in _non_gdp_ids
    and looks_like_gdp_df(frame)
}

print("\nPossible GDP dataframes:")
# Iterate over names only so no module-level variable ends up bound to a
# DataFrame as a side effect of printing.
for _candidate_name in gdp_candidates:
    print(f"{_candidate_name}: {gdp_candidates[_candidate_name].columns.tolist()}")

if not gdp_candidates:
    raise ValueError("No likely GDP dataframe found. Re-run the cell where you created/downloaded GDP per capita data.")

# The first candidate (in the order the names appear in ``dfs``) is used.
gdp_name = next(iter(gdp_candidates))
gdp_df = gdp_candidates[gdp_name].copy()
print(f"\nUsing GDP dataframe: {gdp_name}")
def flatten_cols(df):
    """Return a copy of ``df`` whose column labels are flat, stripped strings.

    Tuple labels (as produced by multi-level columns) are joined with a single
    space after dropping levels whose string form is ``"nan"``; every other
    label is converted with ``str`` and stripped. The input is not modified.

    Args:
        df: DataFrame whose columns should be flattened.

    Returns:
        A new DataFrame with a plain list of string column labels.
    """

    def as_label(col):
        if not isinstance(col, tuple):
            return str(col).strip()
        levels = [str(level) for level in col]
        return " ".join(level for level in levels if level != "nan").strip()

    flattened = df.copy()
    flattened.columns = list(map(as_label, flattened.columns))
    return flattened
# Flatten the column labels of all three working frames in one pass.
iso_df, pop_df, gdp_df = (
    flatten_cols(frame) for frame in (iso_df, pop_df, gdp_df)
)
# Cleaned alias -> canonical short name. Keys are written exactly as they look
# AFTER the normalisation steps in ``clean_country`` (lower case, ASCII only,
# no ``. , ' ( )``). Kept at module level so the table is built once rather
# than on every call (the function is applied row by row).
_COUNTRY_ALIASES = {
    "united states of america": "united states",
    "usa": "united states",
    "uk": "united kingdom",
    "united kingdom of great britain and northern ireland": "united kingdom",
    "russian federation": "russia",
    "iran islamic republic of": "iran",
    "syrian arab republic": "syria",
    "lao peoples democratic republic": "laos",
    "viet nam": "vietnam",
    "bolivia plurinational state of": "bolivia",
    "venezuela bolivarian republic of": "venezuela",
    "moldova republic of": "moldova",
    "tanzania united republic of": "tanzania",
    "micronesia federated states of": "micronesia",
    "palestine state of": "palestine",
    "republic of korea": "south korea",
    "korea republic of": "south korea",
    "democratic peoples republic of korea": "north korea",
    "korea democratic peoples republic of": "north korea",
    "democratic republic of the congo": "dr congo",
    "congo the democratic republic of the": "dr congo",
    "republic of the congo": "congo",
    "congo republic of the": "congo",
    "czech republic": "czechia",
    "turkiye": "turkey",
    "brunei darussalam": "brunei",
    "cabo verde": "cape verde",
    "timor-leste": "east timor",
    "holy see": "vatican city",
    # World Bank style short forms ("Korea, Rep.", "Congo, Dem. Rep.", ...),
    # mapped onto the canonical names already used above.
    "korea rep": "south korea",
    "korea dem peoples rep": "north korea",
    "congo dem rep": "dr congo",
    "congo rep": "congo",
    "iran islamic rep": "iran",
    "venezuela rb": "venezuela",
    "lao pdr": "laos",
    "micronesia fed sts": "micronesia",
}

_FOOTNOTE_RE = re.compile(r"\[[^\]]*\]")
_PUNCTUATION_RE = re.compile(r"[.,'()]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_country(x):
    """Normalise a country name into a key suitable for joining tables.

    Steps, in order: strip; NFKD-decompose and drop every non-ASCII character
    (so accents disappear); lower-case; remove bracketed markers such as
    ``[a]``; remove the characters ``. , ' ( )``; replace ``&`` with ``and``;
    collapse runs of whitespace; finally map known aliases to one canonical
    short name via ``_COUNTRY_ALIASES``.

    Args:
        x: Raw country name. Any value is accepted and converted with ``str``.

    Returns:
        The cleaned name as ``str``, or ``pd.NA`` when ``x`` is missing
        according to ``pd.isna``.
    """
    if pd.isna(x):
        return pd.NA

    text = unicodedata.normalize("NFKD", str(x).strip())
    # After an ASCII encode with errors ignored the bytes are pure ASCII.
    text = text.encode("ascii", "ignore").decode("ascii").lower()
    text = _FOOTNOTE_RE.sub("", text)
    text = _PUNCTUATION_RE.sub("", text)
    text = text.replace("&", "and")
    text = _WHITESPACE_RE.sub(" ", text).strip()
    return _COUNTRY_ALIASES.get(text, text)
def find_name_col(df, priority=None):
    """Pick the column of ``df`` that holds country names.

    Labels listed in ``priority`` are tried first, in order, using an exact
    match. If none is present, the first column whose lower-cased label
    contains one of ``country``, ``name``, ``location`` or ``iso 3166`` is
    returned.

    Args:
        df: DataFrame to inspect.
        priority: Optional iterable of preferred column labels.

    Returns:
        The matching column label, exactly as it appears in ``df.columns``.

    Raises:
        ValueError: If no column matches either rule.
    """
    for preferred in priority or ():
        if preferred in df.columns:
            return preferred

    keywords = ("country", "name", "location", "iso 3166")
    for col in df.columns:
        label = str(col).lower()
        if any(word in label for word in keywords):
            return col

    raise ValueError(f"Could not find a country/name column in: {df.columns.tolist()}")
# Locate the country-name column in each frame, preferring known labels.
iso_name_col = find_name_col(iso_df, priority=["ISO 3166", "Sovereignty"])
pop_name_col = find_name_col(pop_df, priority=["country", "Location"])
gdp_name_col = find_name_col(gdp_df, priority=["Country Name", "country", "Country", "name"])

# Derive the shared join key from each frame's own name column.
iso_df["country_clean"] = iso_df[iso_name_col].apply(clean_country)
pop_df["country_clean"] = pop_df[pop_name_col].apply(clean_country)
gdp_df["country_clean"] = gdp_df[gdp_name_col].apply(clean_country)

# Rows without a usable key are dropped before merging: pandas matches null
# join keys against each other, which would glue unrelated nameless rows
# (for example footer/notes rows) from different tables together.
iso_keep = [c for c in ["ISO 3166", "A-2", "A-3", "Num.", "country_clean"] if c in iso_df.columns]
iso_small = (
    iso_df[iso_keep]
    .dropna(subset=["country_clean"])
    .drop_duplicates(subset="country_clean")
)

pop_keep = [c for c in ["country", "Location", "population", "pct_world", "date", "country_clean"] if c in pop_df.columns]
pop_small = (
    pop_df[pop_keep]
    .dropna(subset=["country_clean"])
    .drop_duplicates(subset="country_clean")
)

# The GDP frame keeps all of its columns except the raw name column; it is
# not de-duplicated, so several GDP rows per country are all preserved.
gdp_small = (
    gdp_df.drop(columns=[gdp_name_col], errors="ignore")
    .dropna(subset=["country_clean"])
)

# Outer joins keep countries that appear in only some of the tables.
countrydata = iso_small.merge(pop_small, on="country_clean", how="outer")
countrydata = countrydata.merge(gdp_small, on="country_clean", how="outer")

print("\nMerged dataframe preview:")
print(countrydata.head())
print("\nShape:")
print(countrydata.shape)