1. Introduction to Data Analysis with Pandas

# Let's import pandas and some other basic packages we will use from __future__ import division %matplotlib inline import os import matplotlib.pyplot as plt import numpy as np np.random.seed(123456) import pandas as pd

Run to view results

# A Series built from a plain list gets the default 0..n-1 integer index.
first_names = pd.Series(
    data=['Taylor', 'John', 'Fela', 'Manu', 'Şebnem'],
    name='First Name',
)
first_names

Run to view results

# A Series built from a mapping uses the keys as index labels; enumerate()
# yields the same 0..4 integer keys as the explicit dictionary.
last_names = pd.Series(
    dict(enumerate(['Swift', 'Lennon', 'Kuti', 'Chau', 'Ferrah'])),
    name='Last Name',
)
last_names

Run to view results

# Five random integer ages drawn from [33, 85) (upper bound exclusive).
age = pd.Series(np.random.randint(low=33, high=85, size=5), name='Age')
age

Run to view results

# Build the frame from a {column name: Series} mapping; each Series already
# carries its column label in `.name`.
musicians = pd.DataFrame(
    {series.name: series for series in (first_names, last_names, age)}
)
musicians

Run to view results

musicians.dtypes

Run to view results

# Make sure the Age column is stored as integers.
musicians = musicians.assign(Age=musicians['Age'].astype(int))

Run to view results

musicians.dtypes

Run to view results

# A list of Series becomes one ROW per Series, so transpose to get one
# column per Series instead.
musicians = pd.DataFrame([first_names, last_names, age]).transpose()
musicians

Run to view results

musicians.dtypes

Run to view results

# Convert the Age column to integers.
musicians = musicians.astype({'Age': int})

Run to view results

musicians.dtypes

Run to view results

# 5x3 table of standard-normal draws with numbered column labels.
random_data = pd.DataFrame(
    np.random.normal(size=(5, 3)),
    columns=[f'Random Series {i}' for i in (1, 2, 3)],
)
random_data

Run to view results

# Output folder for the exported files.
pathout = './data/'

# exist_ok=True replaces the check-then-create pattern: it cannot fail if the
# directory appears between the check and the call, and the cell stays
# re-runnable.
os.makedirs(pathout, exist_ok=True)

# Export the same table as CSV and as an Excel workbook, without the index.
musicians.to_csv(os.path.join(pathout, 'musicians.csv'), encoding='utf8', index=False)
musicians.to_excel(os.path.join(pathout, 'musicians.xlsx'), index=False)

Run to view results

# Read the CSV export back in.
pathout = './data/'
musicians_csv = pd.read_csv(os.path.join(pathout, 'musicians.csv'), encoding='utf8')
musicians_csv

Run to view results

# Read the Excel export back in.
musicians_excel = pd.read_excel(os.path.join(pathout, 'musicians.xlsx'))
musicians_excel

Run to view results

# Add a column from a plain list; the list length must match the row count.
musicians = musicians.assign(Gender=['Female', 'Male', 'Male', 'Male', 'Female'])
musicians

Run to view results

# New column: element-wise product of the first two random series.
random_data['New Series'] = random_data['Random Series 1'].mul(
    random_data['Random Series 2']
)
random_data

Run to view results

# Boolean flag: under 35 AND female. `&` is the element-wise logical AND for
# boolean Series; the original multiplied the masks with `*`, which only
# works because True/False behave like 1/0.
is_young_female = (musicians['Age'] < 35) & (musicians['Gender'] == 'Female')
musicians['Young Female'] = is_young_female

# Same flag as 0/1 integers. Casting the combined mask avoids the precedence
# trap in `a * b.astype(int)`, where the cast applies to `b` only.
musicians['Young Female (as integer)'] = is_young_female.astype(int)
musicians

Run to view results

# Column-wise subtraction; gives the same values as applying
# `lambda row: row['New Series'] - row['Random Series 3']` with axis=1.
random_data['Some Transformation'] = random_data['New Series'].sub(
    random_data['Random Series 3']
)
random_data

Run to view results

musicians['First Name']

Run to view results

musicians[['First Name', 'Age']]

Run to view results

musicians.iloc[0]

Run to view results

musicians['First Name'].loc[musicians['Age']<50]

Run to view results

musicians.loc[(musicians['Age']>=50) & (musicians['Gender']=='Female') , ['First Name', 'Last Name']]

Run to view results

musicians.describe()

Run to view results

# A scalar assigned to a column is broadcast to every row.
musicians = musicians.assign(**{'Mean Age': musicians['Age'].mean()})
musicians

Run to view results

# Average of every numeric column within each gender group.
by_gender = musicians.groupby('Gender')
musicians_gender = by_gender.mean(numeric_only=True)
musicians_gender

Run to view results

musicians_gender.loc['Female']

Run to view results

# Move the group label out of the index and back into a regular column.
musicians_gender = musicians_gender.reset_index(drop=False)
musicians_gender

Run to view results

random_data.plot.scatter(x='New Series', y='Some Transformation', color='r', s=50, label='Very Important Relation!')

Run to view results

# Import display options for showing websites from IPython.display import IFrame url = 'https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes' #IFrame(url+'?printable=yes', width=800, height=400) IFrame(url, width=800, height=400)

Run to view results

from io import StringIO

import requests

# Define a User-Agent to mimic a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Fetch the content. The timeout keeps the cell from hanging indefinitely.
response = requests.get(url, headers=headers, timeout=30)

# Fail loudly on an HTTP error. The original only printed a message, so the
# final `isocodes` line then raised NameError on a first run, or silently
# showed stale data on a re-run.
response.raise_for_status()

# Parse the HTML with pandas and keep the first table. The text is wrapped in
# StringIO because passing literal HTML to read_html is deprecated.
isocodes = pd.read_html(StringIO(response.text))[0]
isocodes

Run to view results

isocodes.columns

Run to view results

# The header has two levels; discard the outer one and keep the inner labels.
isocodes.columns = isocodes.columns.droplevel(0)
isocodes.head()

Run to view results

# Strip the Wikipedia footnote marker ("[a]", "[5]", ...) from each header.
# split('[', 1)[0] keeps the whole label when there is no '['. The original
# `c[:c.find('[')]` sliced with -1 in that case and silently dropped the
# label's last character.
mycols = [c.split('[', 1)[0].strip() for c in isocodes.columns]
mycols

Run to view results

# Install the cleaned header labels.
isocodes = isocodes.set_axis(mycols, axis='columns')
isocodes.head()

Run to view results

We also need to eliminate the first row

# Keep the rows from index label 1 onward. `.loc` is label-based, so
# re-running the cell does not drop further rows. `.copy()` makes the result
# an independent frame, so adding columns to it later does not trigger
# SettingWithCopyWarning.
isocodes = isocodes.loc[1:].copy()
isocodes.head()

Run to view results

# Work on an independent frame so the assignments below never write into a
# slice of another DataFrame.
isocodes = isocodes.copy()

# Preserve the scraped A-2 value once; re-running the cell must not overwrite
# it with the already-derived code.
if 'Alpha-2 code original' not in isocodes.columns:
    isocodes['Alpha-2 code original'] = isocodes['A-2']

# Take the text after the first ':' in the ISO 3166-2 column (the whole value
# when there is no ':'). The .str accessor passes missing values through
# instead of raising like the original per-row lambda.
isocodes['A-2'] = isocodes['ISO 3166-2'].str.split(':', n=1).str[-1]
isocodes.head()

Run to view results

# Wikipedia page with the GDP (PPP) per capita tables; `url` is also the
# address the next cell downloads.
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(PPP)_per_capita'
# Embed the page in the notebook output.
IFrame(url, width=800, height=400)

Run to view results

from io import StringIO

# Fetch the content with the browser-like `headers`.
response = requests.get(url, headers=headers, timeout=30)

# Stop here on an HTTP error instead of printing and then failing on an
# undefined (or stale) `gdppc_wiki`.
response.raise_for_status()

# Parse the HTML that was just downloaded and keep the first table. The
# original called read_html(url), which issued a second request WITHOUT the
# custom headers and ignored `response` entirely.
gdppc_wiki = pd.read_html(StringIO(response.text))[0]
gdppc_wiki

Run to view results

# Flat column names: the country label, then a (value, year) pair for each
# of the three sources in table order.
gdppc_wiki.columns = ['Country/Territory'] + [
    f'{field}_{source}'
    for source in ('IMF', 'WB', 'CIA')
    for field in ('gdppc', 'year')
]
gdppc_wiki.head()

Run to view results

# Clean country label: remove literal asterisks and surrounding whitespace.
# regex=False is required because '*' on its own is not a valid regular
# expression ("nothing to repeat").
gdppc_wiki['country_name'] = (
    gdppc_wiki['Country/Territory']
    .str.replace('*', '', regex=False)
    .str.strip()
)
gdppc_wiki.head()

Run to view results

gdppc_wiki.dtypes

Run to view results

# Every column except the first and the last is a value or year column.
for column in gdppc_wiki.columns[1:-1]:
    values = gdppc_wiki[column]
    if values.dtype == 'O':
        # Text column: dashes and anything else unparseable become NaN.
        values = pd.to_numeric(values.str.replace('—', 'nan'), errors='coerce')
    if column.startswith('year'):
        # Nullable integers keep years whole even when some are missing.
        values = values.astype('Int64')
    gdppc_wiki[column] = values

Run to view results

gdppc_wiki.dtypes

Run to view results

isocodes.head(2)

Run to view results

gdppc_wiki.head(1)

Run to view results

# Inner join on the country name; only names present in both tables survive.
merged = pd.merge(
    isocodes,
    gdppc_wiki,
    how='inner',
    left_on='ISO 3166',
    right_on='country_name',
)
merged

Run to view results

merged.shape

Run to view results

# Distinct country names on each side, for set-difference diagnostics.
isocodes_names = set(isocodes['ISO 3166'].tolist())
gdppc_wiki_names = set(gdppc_wiki['country_name'].tolist())

Run to view results

isocodes_names.difference(gdppc_wiki_names)

Run to view results

gdppc_wiki_names.difference(isocodes_names)

Run to view results

# One shared axis; overlay a kernel density estimate per data source.
fig, ax = plt.subplots(figsize=(10, 6))
for source in ('CIA', 'IMF', 'WB'):
    merged[f'gdppc_{source}'].plot.kde(ax=ax, label=source)
ax.legend()

Run to view results

# One shared axis; later histograms are drawn more transparent so the
# earlier ones stay visible underneath.
fig, ax = plt.subplots(figsize=(10, 6))
for source, extra in (('CIA', {}), ('IMF', {'alpha': 0.6}), ('WB', {'alpha': 0.3})):
    merged[f'gdppc_{source}'].plot.hist(ax=ax, label=source, **extra)
ax.legend()

Run to view results

# Compare the World Bank figure (x) against each other source (y).
fig, ax = plt.subplots(figsize=(10, 6))
for other, colour in (('CIA', 'r'), ('IMF', 'b')):
    merged.plot.scatter(
        x='gdppc_WB',
        y=f'gdppc_{other}',
        ax=ax,
        label=f'WB-{other}',
        c=colour,
    )
ax.set_xlabel('World Bank')
ax.set_ylabel('Other Source')
ax.legend(loc='lower right')

Run to view results

# Small example Series of country names.
countries = pd.Series(
    data=['Colombia', 'Turkey', 'United States', 'Germany', 'Chile'],
    name='country',
)
countries

Run to view results

# Report how many entries the Series holds.
n_countries = len(countries)
print('\n', 'There are ', n_countries, 'countries in this series.')

Run to view results

countries.apply(len)

Run to view results

# Re-seed so the draws are reproducible, then sample one standard-normal
# value per country.
np.random.seed(123456)
data = pd.Series(np.random.normal(size=len(countries)), name='noise')
data

Run to view results

# The mean at full precision first, then summary statistics at two decimals.
print('\n', 'The average in this sample is ', data.mean())
print('\n', 'The average in this sample is ', f"{data.mean():.2f}")
print('\n', 'The maximum in this sample is ', f"{data.max():.2f}")
print('\n', 'The standard deviation in this sample is ', f"{data.std():.2f}")

Run to view results

data.apply(np.exp)

Run to view results

# A list of Series produces one row per Series.
df = pd.DataFrame(data=[countries, data])
df

Run to view results

# Swap rows and columns so each Series becomes a column.
df = df.transpose()
df

Run to view results

# Place the Series side by side; each becomes a column named after it.
df = pd.concat([countries, data], axis='columns')
df

Run to view results

# Build the frame from a {column name: Series} mapping; both Series are
# already named 'country' and 'noise'.
df = pd.DataFrame({series.name: series for series in (countries, data)})
df

Run to view results

# Derived columns: the squared noise, noise plus its square, and the number
# of characters in each country name.
df['noise_sq'] = df['noise'].pow(2)
df['noise and its square'] = df['noise'].add(df['noise_sq'])
df['name length'] = df['country'].map(len)
df

Run to view results

# Country names that the following cells flag as South American.
south_america = ['Colombia', 'Chile']

Run to view results

# True where the country appears in the `south_america` list.
df['South America Logical'] = df['country'].isin(south_america)
df

Run to view results

# Translate the boolean flag to 1/0 through an explicit lookup table.
mydict = {flag: int(flag) for flag in (True, False)}
df['South America Dict'] = df['South America Logical'].map(mydict)
df

Run to view results

# Membership test and integer cast in one step: 1 for South American
# countries, 0 otherwise.
df['South America'] = df['country'].isin(south_america).astype(int)
df

Run to view results

import re
from io import StringIO

import pandas as pd
import requests

url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    )
}

# Download page HTML first to avoid 403 from direct read_html(url)
resp = requests.get(url, headers=headers, timeout=30)
resp.raise_for_status()

# Read tables from downloaded HTML
tables = pd.read_html(StringIO(resp.text))


def flatten_columns(cols):
    """Return the column labels as single, whitespace-normalised strings.

    Tuple labels (multi-level headers) are joined with spaces, skipping any
    level whose string form is "nan".
    """
    out = []
    for c in cols:
        if isinstance(c, tuple):
            c = " ".join(str(x) for x in c if str(x) != "nan")
        out.append(re.sub(r"\s+", " ", str(c)).strip())
    return out


# Find the population table: the first one whose headers mention both
# "location" and "population". The loop variable is deliberately not named
# `df`, so the notebook's existing `df` frame is not overwritten.
target = None
for table in tables:
    table.columns = flatten_columns(table.columns)
    cols = " | ".join(table.columns).lower()
    if "location" in cols and "population" in cols:
        target = table.copy()
        break

if target is None:
    raise ValueError("Population table not found on the Wikipedia page.")

# Rename columns. Rules are tried in order and the first match wins. The
# "% of ... world" rule comes before "population" so that a header such as
# "% of world population" is not mistaken for the population count.
rename_rules = [
    (("% of", "world"), "pct_world"),
    (("location",), "country"),
    (("population",), "population"),
    (("date",), "date"),
    (("source",), "source"),
    (("notes",), "notes"),
]
rename_map = {}
for c in target.columns:
    lc = c.lower()
    for keywords, new_name in rename_rules:
        if all(k in lc for k in keywords):
            # Only the first column claiming a name gets it; duplicate column
            # names would make pop["population"] return a DataFrame.
            if new_name not in rename_map.values():
                rename_map[c] = new_name
            break

pop = target.rename(columns=rename_map)

# Keep useful columns
keep = [
    c
    for c in ["country", "population", "pct_world", "date", "source", "notes"]
    if c in pop.columns
]
pop = pop[keep].copy()


def clean_text(x):
    """Strip bracketed footnotes and extra whitespace from one cell.

    Returns pd.NA for missing input and for text that is empty after cleaning.
    """
    if pd.isna(x):
        return pd.NA
    x = re.sub(r"\[[^\]]*\]", "", str(x))  # remove [1], [a], etc.
    x = re.sub(r"\s+", " ", x).strip()
    return x if x else pd.NA


# Clean text / footnotes. Numeric columns are left alone: str() of a float
# such as 1417492000.0 keeps the ".0", and the digit-only filter below would
# then turn it into a number ten times too large.
for col in pop.columns:
    if not pd.api.types.is_numeric_dtype(pop[col]):
        pop[col] = pop[col].map(clean_text)

# Clean numeric columns
if not pd.api.types.is_numeric_dtype(pop["population"]):
    pop["population"] = pop["population"].str.replace(r"[^\d]", "", regex=True)
pop["population"] = pd.to_numeric(pop["population"], errors="coerce").astype("Int64")

if "pct_world" in pop.columns:
    if not pd.api.types.is_numeric_dtype(pop["pct_world"]):
        pop["pct_world"] = pop["pct_world"].str.replace(r"[^\d.]", "", regex=True)
    pop["pct_world"] = pd.to_numeric(pop["pct_world"], errors="coerce")

if "date" in pop.columns:
    pop["date"] = pd.to_datetime(pop["date"], errors="coerce")

# Drop aggregate row and empty rows
pop = pop.loc[pop["country"].notna()]
pop = pop.loc[pop["country"] != "World"]

# Final sort
pop = pop.sort_values("population", ascending=False, na_position="last").reset_index(drop=True)

print(pop.head())
# info() prints its report itself and returns None, so it is not wrapped in
# print() (that printed a stray "None").
pop.info()

Run to view results

import re
import unicodedata

import pandas as pd

# -------- 1. Check columns --------
print("pop columns:", pop.columns.tolist())
print("isocodes columns:", isocodes.columns.tolist())

# -------- 2. Pick correct name columns --------
if "country" in pop.columns:
    pop_name_col = "country"
elif "Location" in pop.columns:
    pop_name_col = "Location"
else:
    name_like = [
        c
        for c in pop.columns
        if any(k in c.lower() for k in ("country", "location", "name"))
    ]
    if not name_like:
        # Explicit error instead of the bare IndexError from indexing [0]
        raise ValueError(f"Could not find a country/name column in: {pop.columns.tolist()}")
    pop_name_col = name_like[0]

# For the ISO table, this is the country-name column
iso_name_col = "ISO 3166"

# -------- 3. Clean names --------
# Built once instead of on every clean_country() call. Keys are names AFTER
# normalisation (lower-case ASCII, punctuation removed); values are the
# canonical spelling used for matching.
COUNTRY_ALIASES = {
    "united states of america": "united states",
    "united states": "united states",
    "usa": "united states",
    "united kingdom of great britain and northern ireland": "united kingdom",
    "uk": "united kingdom",
    "russian federation": "russia",
    "iran islamic republic of": "iran",
    "syrian arab republic": "syria",
    "lao peoples democratic republic": "laos",
    "viet nam": "vietnam",
    "bolivia plurinational state of": "bolivia",
    "venezuela bolivarian republic of": "venezuela",
    "moldova republic of": "moldova",
    "tanzania united republic of": "tanzania",
    "micronesia federated states of": "micronesia",
    "palestine state of": "palestine",
    "korea republic of": "south korea",
    "republic of korea": "south korea",
    "korea democratic peoples republic of": "north korea",
    "democratic peoples republic of korea": "north korea",
    "congo the democratic republic of the": "dr congo",
    "democratic republic of the congo": "dr congo",
    # Canonical short form "congo", matching the alias table of the later
    # three-way merge cell, so a plain "Congo" on one side can match.
    "congo republic of the": "congo",
    "republic of the congo": "congo",
    "czech republic": "czechia",
    "turkiye": "turkey",
    "brunei darussalam": "brunei",
    "cabo verde": "cape verde",
    "timor-leste": "east timor",
    "holy see": "vatican city",
}


def clean_country(x):
    """Normalise a country name into a key suitable for joining tables.

    Missing input returns pd.NA. Otherwise the name is reduced to lower-case
    ASCII, bracketed footnotes and the characters . , ' ( ) are removed,
    "&" becomes "and", whitespace is collapsed, and known long-form names
    are mapped to a short canonical spelling.
    """
    if pd.isna(x):
        return pd.NA
    x = str(x).strip()
    # Decompose accented characters, then drop everything non-ASCII
    x = unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("utf-8")
    x = x.lower()
    # remove wikipedia footnotes
    x = re.sub(r"\[[^\]]*\]", "", x)
    # remove punctuation
    x = re.sub(r"[.,'()]", "", x)
    x = x.replace("&", "and")
    x = re.sub(r"\s+", " ", x).strip()
    return COUNTRY_ALIASES.get(x, x)


# assign() returns a new frame, so this never writes into a slice of another
# DataFrame (which would raise SettingWithCopyWarning).
pop = pop.assign(country_clean=pop[pop_name_col].apply(clean_country))
isocodes = isocodes.assign(country_clean=isocodes[iso_name_col].apply(clean_country))

# -------- 4. First merge --------
# Outer join; indicator=True adds a `_merge` column telling which side each
# row came from.
merged = pop.merge(
    isocodes,
    on="country_clean",
    how="outer",
    indicator=True,
    suffixes=("_pop", "_iso"),
)

# -------- 5. Show mismatches --------
pop_keys = set(pop["country_clean"].dropna())
iso_keys = set(isocodes["country_clean"].dropna())

print("\nOnly in pop:")
print(sorted(pop_keys - iso_keys))

print("\nOnly in isocodes:")
print(sorted(iso_keys - pop_keys))

# -------- 6. Final matched merge --------
merged_full = pop.merge(
    isocodes,
    on="country_clean",
    how="inner",
    suffixes=("_pop", "_iso"),
)

print("\nMerged rows:", len(merged_full))
print(merged_full.head())

Run to view results

import re
import unicodedata

import pandas as pd

# -----------------------------
# 1. Find all dataframes in memory
# -----------------------------
# list(...) snapshots the namespace so it cannot change size mid-iteration.
dfs = {
    name: obj
    for name, obj in list(globals().items())
    if isinstance(obj, pd.DataFrame)
}

print("Dataframes found:")
for name, df in dfs.items():
    print(f"{name}: {df.columns.tolist()}")

# -----------------------------
# 2. Use your existing pop and isocodes dataframes
# -----------------------------
iso_df = isocodes.copy()
pop_df = pop.copy()

# -----------------------------
# 3. Automatically detect the GDP dataframe
# -----------------------------
def looks_like_gdp_df(df):
    """Heuristically decide whether `df` could hold country-level GDP data.

    True when some column name mentions a country/name AND the frame also has
    a code column, a four-digit year column, or a column mentioning "gdp".
    """
    cols = [str(c).lower() for c in df.columns]
    has_country = any(k in c for c in cols for k in ("country name", "country", "name"))
    has_code = any(k in c for c in cols for k in ("country code", "code"))
    has_year = any(re.fullmatch(r"\d{4}", str(c)) for c in df.columns)
    has_gdp = any("gdp" in c for c in cols)
    return has_country and (has_code or has_year or has_gdp)


def _gdp_rank(df):
    """Sort key: lower means stronger evidence that `df` is the GDP table."""
    cols = [str(c).lower() for c in df.columns]
    if "gdp_per_capita" in cols:
        return 0  # exactly the column the plotting cells require
    if any("gdp" in c for c in cols):
        return 1
    return 2


# Inputs and outputs of this very cell must never be picked as the GDP
# source; without this, a re-run could select its own `countrydata`.
not_gdp_sources = {
    "pop", "isocodes",
    "iso_df", "pop_df", "gdp_df",
    "iso_small", "pop_small", "gdp_small",
    "countrydata",
}

gdp_candidates = {
    name: df
    for name, df in dfs.items()
    if name not in not_gdp_sources and looks_like_gdp_df(df)
}
# Best evidence first. sorted() is stable, so ties keep their original order.
gdp_candidates = dict(sorted(gdp_candidates.items(), key=lambda item: _gdp_rank(item[1])))

print("\nPossible GDP dataframes:")
for name, df in gdp_candidates.items():
    print(f"{name}: {df.columns.tolist()}")

if len(gdp_candidates) == 0:
    raise ValueError("No likely GDP dataframe found. Re-run the cell where you created/downloaded GDP per capita data.")

# Take the best-ranked GDP dataframe
gdp_name = list(gdp_candidates.keys())[0]
gdp_df = gdp_candidates[gdp_name].copy()

print(f"\nUsing GDP dataframe: {gdp_name}")

# -----------------------------
# 4. Flatten column names
# -----------------------------
def flatten_cols(df):
    """Return a copy of `df` whose column labels are plain stripped strings.

    Tuple labels are joined with spaces, skipping levels that print as "nan".
    """
    df = df.copy()
    df.columns = [
        " ".join(str(x) for x in col if str(x) != "nan").strip()
        if isinstance(col, tuple)
        else str(col).strip()
        for col in df.columns
    ]
    return df


iso_df = flatten_cols(iso_df)
pop_df = flatten_cols(pop_df)
gdp_df = flatten_cols(gdp_df)

# -----------------------------
# 5. Clean country names
# -----------------------------
# Built once instead of on every clean_country() call. Keys are names AFTER
# normalisation; values are the canonical spelling used for matching.
COUNTRY_NAME_ALIASES = {
    "united states of america": "united states",
    "usa": "united states",
    "uk": "united kingdom",
    "united kingdom of great britain and northern ireland": "united kingdom",
    "russian federation": "russia",
    "iran islamic republic of": "iran",
    "syrian arab republic": "syria",
    "lao peoples democratic republic": "laos",
    "viet nam": "vietnam",
    "bolivia plurinational state of": "bolivia",
    "venezuela bolivarian republic of": "venezuela",
    "moldova republic of": "moldova",
    "tanzania united republic of": "tanzania",
    "micronesia federated states of": "micronesia",
    "palestine state of": "palestine",
    "republic of korea": "south korea",
    "korea republic of": "south korea",
    "democratic peoples republic of korea": "north korea",
    "korea democratic peoples republic of": "north korea",
    "democratic republic of the congo": "dr congo",
    "congo the democratic republic of the": "dr congo",
    "republic of the congo": "congo",
    "congo republic of the": "congo",
    "czech republic": "czechia",
    "turkiye": "turkey",
    "brunei darussalam": "brunei",
    "cabo verde": "cape verde",
    "timor-leste": "east timor",
    "holy see": "vatican city",
}


def clean_country(x):
    """Normalise a country name into a key suitable for joining tables.

    Missing input returns pd.NA. Otherwise the name is reduced to lower-case
    ASCII, bracketed footnotes and the characters . , ' ( ) are removed,
    "&" becomes "and", whitespace is collapsed, and known long-form names
    are mapped to a short canonical spelling.
    """
    if pd.isna(x):
        return pd.NA
    x = str(x).strip()
    x = unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("utf-8")
    x = x.lower()
    x = re.sub(r"\[[^\]]*\]", "", x)
    x = re.sub(r"[.,'()]", "", x)
    x = x.replace("&", "and")
    x = re.sub(r"\s+", " ", x).strip()
    return COUNTRY_NAME_ALIASES.get(x, x)


# -----------------------------
# 6. Detect country columns
# -----------------------------
def find_name_col(df, priority=None):
    """Return the label of the column holding country names.

    Labels in `priority` are tried first, in order. Otherwise the first
    column whose name mentions "country", "name", "location" or "iso 3166"
    is returned.

    Raises:
        ValueError: if no column qualifies.
    """
    if priority:
        for col in priority:
            if col in df.columns:
                return col
    candidates = [
        c
        for c in df.columns
        if any(w in str(c).lower() for w in ["country", "name", "location", "iso 3166"])
    ]
    if not candidates:
        raise ValueError(f"Could not find a country/name column in: {df.columns.tolist()}")
    return candidates[0]


iso_name_col = find_name_col(iso_df, priority=["ISO 3166", "Sovereignty"])
pop_name_col = find_name_col(pop_df, priority=["country", "Location"])
gdp_name_col = find_name_col(gdp_df, priority=["Country Name", "country", "Country", "name"])

iso_df["country_clean"] = iso_df[iso_name_col].apply(clean_country)
pop_df["country_clean"] = pop_df[pop_name_col].apply(clean_country)
gdp_df["country_clean"] = gdp_df[gdp_name_col].apply(clean_country)

# -----------------------------
# 7. Keep useful columns
# -----------------------------
iso_keep = [c for c in ["ISO 3166", "A-2", "A-3", "Num.", "country_clean"] if c in iso_df.columns]
iso_small = iso_df[iso_keep].drop_duplicates(subset="country_clean")

pop_keep = [
    c
    for c in ["country", "Location", "population", "pct_world", "date", "country_clean"]
    if c in pop_df.columns
]
pop_small = pop_df[pop_keep].drop_duplicates(subset="country_clean")

# The raw GDP name column is dropped; `country_clean` is the join key.
gdp_small = gdp_df.drop(columns=[gdp_name_col], errors="ignore").copy()

# -----------------------------
# 8. Merge all three
# -----------------------------
countrydata = iso_small.merge(pop_small, on="country_clean", how="outer")
countrydata = countrydata.merge(gdp_small, on="country_clean", how="outer")

print("\nMerged dataframe preview:")
print(countrydata.head())
print("\nShape:")
print(countrydata.shape)

Run to view results

import os

# Input data, exported data, and figure folders.
path = "./data/"
pathout = "./data/"
pathgraphs = "./graphs/"

# Create whichever folders are missing; existing ones are left untouched.
for folder in (path, pathout, pathgraphs):
    os.makedirs(folder, exist_ok=True)

Run to view results

import os

# Export the merged table under one base name in three formats.
filename = "Wiki_Data"
csv_path = os.path.join(pathout, f"{filename}.csv")
xlsx_path = os.path.join(pathout, f"{filename}.xlsx")
dta_path = os.path.join(pathout, f"{filename}.dta")

countrydata.to_csv(csv_path, index=False)
countrydata.to_excel(xlsx_path, index=False)
countrydata.to_stata(dta_path, write_index=False, version=118)

Run to view results

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# ----------------------------
# Make sure the columns exist
# ----------------------------
for required in ("population", "gdp_per_capita"):
    if required not in countrydata.columns:
        raise ValueError(f"countrydata does not contain a '{required}' column.")

# ----------------------------
# Clean plotting data
# ----------------------------
plotdf = countrydata[["population", "gdp_per_capita"]].copy()
for col in ("population", "gdp_per_capita"):
    plotdf[col] = pd.to_numeric(plotdf[col], errors="coerce")

plotdf = plotdf.dropna(subset=["population", "gdp_per_capita"])
# Logs require strictly positive values
plotdf = plotdf[(plotdf["population"] > 0) & (plotdf["gdp_per_capita"] > 0)].copy()

for col in ("population", "gdp_per_capita"):
    plotdf[col] = plotdf[col].astype(float)
    plotdf[f"log_{col}"] = np.log(plotdf[col])

# ----------------------------
# Save helper
# ----------------------------
def save_plot(fig, name):
    """Save `fig` into `pathgraphs` as <name>.png, <name>.pdf and <name>.jpg."""
    # Create the folder on demand so saving does not depend on an earlier cell
    os.makedirs(pathgraphs, exist_ok=True)
    fig.savefig(os.path.join(pathgraphs, f"{name}.png"), dpi=300, bbox_inches="tight")
    fig.savefig(os.path.join(pathgraphs, f"{name}.pdf"), bbox_inches="tight")
    fig.savefig(os.path.join(pathgraphs, f"{name}.jpg"), dpi=300, bbox_inches="tight")

# ----------------------------
# Fit-line helper
# ----------------------------
def add_fit_line(ax, x, y):
    """Draw the least-squares straight line of y on x onto `ax`.

    Nothing is drawn when a line is undetermined (fewer than two points, or
    all x equal); the original crashed or warned in those cases.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.size < 2 or x.min() == x.max():
        return
    m, b = np.polyfit(x, y, 1)
    xline = np.linspace(x.min(), x.max(), 200)
    ax.plot(xline, m * xline + b)

# ----------------------------
# The four specifications: (x column, y column, x label, y label, title, file stem)
# ----------------------------
plot_specs = [
    # 1. Levels-Levels: y = GDP per capita, x = Population
    ("population", "gdp_per_capita", "Population", "GDP per capita",
     "GDP per capita vs Population (Levels-Levels)", "gdp_pop_levels_levels"),
    # 2. Levels-Logs: y = GDP per capita, x = log(Population)
    ("log_population", "gdp_per_capita", "log(Population)", "GDP per capita",
     "GDP per capita vs log(Population) (Levels-Logs)", "gdp_pop_levels_logs"),
    # 3. Logs-Levels: y = log(GDP per capita), x = Population
    ("population", "log_gdp_per_capita", "Population", "log(GDP per capita)",
     "log(GDP per capita) vs Population (Logs-Levels)", "gdp_pop_logs_levels"),
    # 4. Logs-Logs: y = log(GDP per capita), x = log(Population)
    ("log_population", "log_gdp_per_capita", "log(Population)", "log(GDP per capita)",
     "log(GDP per capita) vs log(Population) (Logs-Logs)", "gdp_pop_logs_logs"),
]

for xcol, ycol, xlabel, ylabel, title, stem in plot_specs:
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(plotdf[xcol], plotdf[ycol])
    add_fit_line(ax, plotdf[xcol], plotdf[ycol])
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    save_plot(fig, stem)
    # Close the figure once it is saved so figures do not accumulate
    plt.close(fig)

print("Plots saved in:", pathgraphs)
print(sorted(os.listdir(pathgraphs)))

Run to view results

import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

# -------------------------------------------------
# 1. Make sure GDP per capita exists
# -------------------------------------------------
if "gdp_per_capita" not in countrydata.columns:
    raise ValueError("countrydata does not contain 'gdp_per_capita'. Run the GDP merge step first.")

# -------------------------------------------------
# 2. Find or create population growth
# -------------------------------------------------
popg_candidates = [
    "population_growth",
    "pop_growth",
    "Population growth (annual %)",
    "SP.POP.GROW",
]
popg_col = next((c for c in popg_candidates if c in countrydata.columns), None)

# If not already present, download latest population growth from World Bank
if popg_col is None:
    if "A-3" not in countrydata.columns:
        raise ValueError("countrydata does not contain an 'A-3' ISO column, so population growth cannot be merged automatically.")

    url = "https://api.worldbank.org/v2/country/all/indicator/SP.POP.GROW?format=json&per_page=20000"
    resp = requests.get(url, timeout=60)
    resp.raise_for_status()

    # The code below reads the records from element [1] of the JSON payload.
    # Validate that shape first so an API error message gives a clear
    # exception instead of an IndexError/TypeError.
    payload = resp.json()
    if not isinstance(payload, list) or len(payload) < 2 or not payload[1]:
        raise ValueError("Unexpected World Bank API response: no data records returned.")
    wb = payload[1]

    popgrowth = pd.DataFrame(wb)
    popgrowth = popgrowth[["country", "countryiso3code", "date", "value"]].copy()
    # `country` may arrive as a nested dict; keep its "value" entry
    popgrowth["country"] = popgrowth["country"].apply(
        lambda x: x["value"] if isinstance(x, dict) else x
    )
    popgrowth = popgrowth.rename(columns={
        "country": "country_wb",
        "countryiso3code": "A-3",
        "date": "pop_growth_year",
        "value": "population_growth",
    })

    popgrowth["pop_growth_year"] = pd.to_numeric(popgrowth["pop_growth_year"], errors="coerce")
    popgrowth["population_growth"] = pd.to_numeric(popgrowth["population_growth"], errors="coerce")
    popgrowth = popgrowth[popgrowth["A-3"].notna() & (popgrowth["A-3"] != "")]
    popgrowth = popgrowth[popgrowth["population_growth"].notna()]

    # keep latest available population growth by country
    popgrowth = popgrowth.sort_values(["A-3", "pop_growth_year"], ascending=[True, False])
    popgrowth = popgrowth.drop_duplicates(subset="A-3").reset_index(drop=True)

    # drop old population growth columns if they exist
    drop_existing = [c for c in ["population_growth", "pop_growth_year"] if c in countrydata.columns]
    countrydata = countrydata.drop(columns=drop_existing, errors="ignore")

    countrydata = countrydata.merge(
        popgrowth[["A-3", "population_growth", "pop_growth_year"]],
        on="A-3",
        how="left",
    )
    popg_col = "population_growth"

print("Using GDP per capita column:", "gdp_per_capita")
print("Using population growth column:", popg_col)

# -------------------------------------------------
# 3. Clean plotting data
# -------------------------------------------------
plotdf = countrydata[["gdp_per_capita", popg_col]].copy()
plotdf.columns = ["gdp_per_capita", "population_growth"]
plotdf["gdp_per_capita"] = pd.to_numeric(plotdf["gdp_per_capita"], errors="coerce")
plotdf["population_growth"] = pd.to_numeric(plotdf["population_growth"], errors="coerce")
plotdf = plotdf.dropna(subset=["gdp_per_capita", "population_growth"]).astype(float)

# Logs require positive values. Each log column is NaN only where ITS OWN
# variable is non-positive, so a country with negative population growth
# still appears in plots that use growth in levels.
plotdf["log_gdp_per_capita"] = np.log(
    plotdf["gdp_per_capita"].where(plotdf["gdp_per_capita"] > 0)
)
plotdf["log_population_growth"] = np.log(
    plotdf["population_growth"].where(plotdf["population_growth"] > 0)
)

# Rows where both variables are positive (same rows as the original plotdf_pos)
plotdf_pos = plotdf.dropna(subset=["log_gdp_per_capita", "log_population_growth"]).copy()

# -------------------------------------------------
# 4. Helper functions
# -------------------------------------------------
def save_plot(fig, name):
    """Save `fig` into `pathgraphs` as <name>.png, <name>.pdf and <name>.jpg."""
    # Create the folder on demand so saving does not depend on an earlier cell
    os.makedirs(pathgraphs, exist_ok=True)
    fig.savefig(os.path.join(pathgraphs, f"{name}.png"), dpi=300, bbox_inches="tight")
    fig.savefig(os.path.join(pathgraphs, f"{name}.pdf"), bbox_inches="tight")
    fig.savefig(os.path.join(pathgraphs, f"{name}.jpg"), dpi=300, bbox_inches="tight")


def add_fit_line(ax, x, y):
    """Draw the least-squares straight line of y on x onto `ax`.

    Nothing is drawn when a line is undetermined (fewer than two points, or
    all x equal).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.size < 2 or x.min() == x.max():
        return
    m, b = np.polyfit(x, y, 1)
    xline = np.linspace(x.min(), x.max(), 200)
    ax.plot(xline, m * xline + b)

# -------------------------------------------------
# 5-8. The four specifications:
#      (x column, y column, x label, y label, title, file stem)
# -------------------------------------------------
plot_specs = [
    # Levels-Levels: y = GDP per capita, x = Population Growth
    ("population_growth", "gdp_per_capita", "Population Growth", "GDP per capita",
     "GDP per capita vs Population Growth (Levels-Levels)", "gdp_popgrowth_levels_levels"),
    # Levels-Logs: y = GDP per capita, x = log(Population Growth)
    ("log_population_growth", "gdp_per_capita", "log(Population Growth)", "GDP per capita",
     "GDP per capita vs log(Population Growth) (Levels-Logs)", "gdp_popgrowth_levels_logs"),
    # Logs-Levels: y = log(GDP per capita), x = Population Growth
    ("population_growth", "log_gdp_per_capita", "Population Growth", "log(GDP per capita)",
     "log(GDP per capita) vs Population Growth (Logs-Levels)", "gdp_popgrowth_logs_levels"),
    # Logs-Logs: y = log(GDP per capita), x = log(Population Growth)
    ("log_population_growth", "log_gdp_per_capita", "log(Population Growth)", "log(GDP per capita)",
     "log(GDP per capita) vs log(Population Growth) (Logs-Logs)", "gdp_popgrowth_logs_logs"),
]

for xcol, ycol, xlabel, ylabel, title, stem in plot_specs:
    # Use every row for which BOTH plotted quantities are defined
    sample = plotdf[[xcol, ycol]].dropna()
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(sample[xcol], sample[ycol])
    add_fit_line(ax, sample[xcol], sample[ycol])
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    save_plot(fig, stem)
    plt.close(fig)

print("Plots saved in:", pathgraphs)
print(sorted(os.listdir(pathgraphs)))

Run to view results

We also need to eliminate the first row

We also need to eliminate the first row