import pandas as pd
url="https://raw.githubusercontent.com/austinlasseter/hosting_some_files/main/pandas_files/eu-govt-bonds.tsv"
df = pd.read_csv(url, sep="\t")
df.head()
Run to view results
"2020M01" in df.columns
Run to view results
print(df.columns)
Run to view results
# you can use a list comprehension to trim whitespace for all column names
df.columns = [c.strip() for c in df.columns]
df["2020M01"].head()
Run to view results
df["country_code"] = df["int_rt,geo\\time"].str[-2:]
df.drop("int_rt,geo\\time", axis=1, inplace=True)
df.head()
Run to view results
df.dtypes
Run to view results
df["2019M02"].sort_values().head()
Run to view results
df["2019M02"].sort_values().tail()
Run to view results
# extract that last value (so we don't have to guess how many spaces there are in it)
offending_value = df["2019M02"].sort_values().values[-1]
print(offending_value)
Run to view results
df.loc[df["2019M02"] == offending_value, "country_code"]
Run to view results
df.drop(df[df["country_code"] == "EE"].index, axis=0, inplace=True)
# use a list comprehension to find the column names that should be floats
numeric_cols = [c for c in df.columns if c.startswith("20")]
for c in numeric_cols:
# convert to float
df[c] = df[c].astype(float)
# verify
df.dtypes
Run to view results
url="https://raw.githubusercontent.com/austinlasseter/hosting_some_files/main/pandas_files/country-codes.csv"
Run to view results
df_codes = pd.read_csv(url, encoding="latin-1")
df_codes.head()
Run to view results
print(len(df))
df_merged = df.merge(df_codes, left_on=["country_code"], right_on=["Alpha-2 code"], how="left")
print(len(df_merged))
# remove some unnecessary columns
df_merged.drop(["Alpha-2 code", "Alpha-3 code"], axis=1, inplace=True)
df_merged.head()
Run to view results
df_merged.isnull().sum()
Run to view results
df_merged[df_merged["Country"].isnull()]
Run to view results
df_merged.loc[df_merged["country_code"] == "UK", "Country"] = "United Kingdom"
Run to view results
# Option 1: sort and get head() and tail()
jan_rates = df_merged[["Country", "2020M01"]].sort_values("2020M01")
print(jan_rates.head(1))
print(jan_rates.tail(1))
Run to view results
# Option 2
# another option is to make the country name the index (the unique/primary key)
# and use .idxmax to find "the index of the highest value" (and idxmin for the minimum)
# we can also combine the two by using .agg and passing a list of aggregations we want to perform together
jan_rates.set_index("Country").agg(["idxmin", "idxmax"])
Run to view results
df_merged["diff"] = abs(df_merged["2020M01"] - df_merged["2019M02"])
df_merged.sort_values("diff").head(1)
Run to view results
df_merged.drop("diff", axis=1, inplace=True)
Run to view results
df_merged.to_csv("eu-govt-bonds-cleaned.csv", index=False)
Run to view results
# Set the index to country AND country code (to allow for codes that didn't match to a country)
# and calculate the mean, but along axis=1 meaning per row, not per column
df_merged.set_index(["country_code", "Country"]).mean(axis=1)
Run to view results
# agg takes a list of valid aggregations
df_merged.set_index(["country_code", "Country"]).agg(["mean", "median"], axis=1)
Run to view results
# Python trick: put brackets around your entire expression and you can then lay it out over multiple lines
df_stacked = (
df_merged
.set_index(["country_code", "Country"])
.stack() # this puts all the month columns in one column
.reset_index() # convert the resulting Series to a DataFrame
.rename(columns={"level_2": "month", 0: "rate"}) # rename the resulting default column names
)
df_stacked.head()
Run to view results
df_stacked.groupby("Country")["rate"].agg(["mean", "median"])
Run to view results