Lesson 14 - Merging datasets

import pandas as pd url="https://raw.githubusercontent.com/austinlasseter/hosting_some_files/main/pandas_files/eu-govt-bonds.tsv" df = pd.read_csv(url, sep="\t") df.head()

"2020M01" in df.columns

print(df.columns)

# you can use a list comprehension to trim whitespace for all column names df.columns = [c.strip() for c in df.columns] df["2020M01"].head()

df["country_code"] = df["int_rt,geo\\time"].str[-2:] df.drop("int_rt,geo\\time", axis=1, inplace=True) df.head()

df.dtypes

df["2019M02"].sort_values().head()

df["2019M02"].sort_values().tail()

# extract that last value (so we don't have to guess how many spaces there are in it) offending_value = df["2019M02"].sort_values().values[-1] print(offending_value)

df.loc[df["2019M02"] == offending_value, "country_code"]

df.drop(df[df["country_code"] == "EE"].index, axis=0, inplace=True) # use a list comprehension to find the column names that should be floats numeric_cols = [c for c in df.columns if c.startswith("20")] for c in numeric_cols: # convert to float df[c] = df[c].astype(float) # verify df.dtypes

url="https://raw.githubusercontent.com/austinlasseter/hosting_some_files/main/pandas_files/country-codes.csv"

df_codes = pd.read_csv(url, encoding="latin-1") df_codes.head()

print(len(df)) df_merged = df.merge(df_codes, left_on=["country_code"], right_on=["Alpha-2 code"], how="left") print(len(df_merged)) # remove some unnecessary columns df_merged.drop(["Alpha-2 code", "Alpha-3 code"], axis=1, inplace=True) df_merged.head()

df_merged.isnull().sum()

df_merged[df_merged["Country"].isnull()]

df_merged.loc[df_merged["country_code"] == "UK", "Country"] = "United Kingdom"

# Option 1: sort and get head() and tail() jan_rates = df_merged[["Country", "2020M01"]].sort_values("2020M01") print(jan_rates.head(1)) print(jan_rates.tail(1))

# Option 2 # another option is to make the country name the index (the unique/primary key) # and use .idxmax to find "the index of the highest value" (and idxmin for the minimum) # we can also combine the two by using .agg and passing a list of aggregations we want to perform together jan_rates.set_index("Country").agg(["idxmin", "idxmax"])

df_merged["diff"] = abs(df_merged["2020M01"] - df_merged["2019M02"]) df_merged.sort_values("diff").head(1)

df_merged.drop("diff", axis=1, inplace=True)

df_merged.to_csv("eu-govt-bonds-cleaned.csv", index=False)

# Set the index to country AND country code (to allow for codes that didn't match to a country) # and calculate the mean, but along axis=1 meaning per row, not per column df_merged.set_index(["country_code", "Country"]).mean(axis=1)

# agg takes a list of valid aggregations df_merged.set_index(["country_code", "Country"]).agg(["mean", "median"], axis=1)

# Python trick: put brackets around your entire expression and you can then lay it out over multiple lines df_stacked = ( df_merged .set_index(["country_code", "Country"]) .stack() # this puts all the month columns in one column .reset_index() # convert the resulting Series to a DataFrame .rename(columns={"level_2": "month", 0: "rate"}) # rename the resulting default column names ) df_stacked.head()

df_stacked.groupby("Country")["rate"].agg(["mean", "median"])