from datetime import datetime

import numpy as np
import pandas as pd
from IPython.display import clear_output, display  # notebook output helpers
# ---- ADS: load the yearly courier CSVs (2017.csv .. 2019.csv) ----
number_of_files = 3  # one pipe-delimited file per year
df_ADS_list = []
for file_num in range(number_of_files):  # adjust the path to suit your environment
    df_ADS_list.append(
        pd.read_csv(
            f"C:\\Users\\emilf\\Downloads\\POSTNORD\\ADS\\{2017+file_num}.csv",
            delimiter="|",
        )
    )
df_ADS_original = pd.concat(df_ADS_list)
df_ADS = df_ADS_original.copy()  # keep the raw concat untouched for reference
df_ADS["Product"] = "C"  # ("C" stands for Courier)

# Inspect raw country values, then map spelled-out names to ISO codes.
df_ADS.ConsignorCountryCode.unique()
df_ADS.ConsigneeCountryCode.unique()
_country_map = {"DANMARK": "DK", "SWEDEN": "SE", "NORWAY": "NO"}
df_ADS["ConsignorCountryCode"] = df_ADS["ConsignorCountryCode"].str.upper().replace(_country_map)
df_ADS["ConsigneeCountryCode"] = df_ADS["ConsigneeCountryCode"].str.upper().replace(_country_map)
df_ADS.head()
df_ADS.info()
# ---- ADS: repair missing/partial ShipmentDate values ----
# Sorting by tracking number groups related shipments so date gaps are
# easy to eyeball in the preview below.
check_date = df_ADS.sort_values(by=["Shipment Tracking Number"])
check_date.head(20)

# A missing date falls back to the source file name (e.g. "2017"); a bare
# four-character year is then padded to January 1st so the whole column
# parses uniformly as YYYY-MM-DD.
df_ADS["ShipmentDate"] = df_ADS["ShipmentDate"].fillna(df_ADS["FileName"].astype(str))
is_year_only = df_ADS["ShipmentDate"].str.len() == 4
df_ADS["ShipmentDate"] = np.where(is_year_only,
                                  df_ADS["ShipmentDate"] + "-01-01",
                                  df_ADS["ShipmentDate"])
df_ADS["ShipmentDate"] = pd.to_datetime(df_ADS["ShipmentDate"], format='%Y-%m-%d')
# ---- ADS: quantify revenue tied to missing consignee locations ----
total = df_ADS["Price_Paid"].sum()
partial = df_ADS.loc[df_ADS["Consignee Location Nr"].isna(), "Price_Paid"].sum()
print("The percentage of total turnover that refers to missing Consignee Location Nr is " + str((partial/total)*100) + "%")

# Spot-check one specific customer's known locations.
df_ADS["Consignee Location Nr"][df_ADS["Involved Party ID"] == 271498323].unique()

# Per-country look at location numbers and their frequencies.
for code in ("DK", "SE", "NO"):
    subset = df_ADS.loc[df_ADS["ConsigneeCountryCode"] == code, ["Consignee Location Nr"]]
    print(subset)
    print(subset.value_counts())

# BUG FIX: '&' binds tighter than '==', so the original expression
# `isna() & df[...] == "DK"` compared the AND-result to "DK" instead of
# filtering DK rows with a missing location. Parentheses restore the intent.
df_ADS[df_ADS["Consignee Location Nr"].isna() & (df_ADS["ConsigneeCountryCode"] == "DK")]
df_ADS[df_ADS.duplicated()]  # full-row duplicate check
df_ADS.info()
# ---- ADS: attach customer master data (sheet "ADS") ----
df_master_ADS = pd.read_excel(
    "C:\\Users\\emilf\\Downloads\\POSTNORD\\Master_Data\\ADS_DDS_RF_CustomerNumbers_MasterData.xlsx",
    sheet_name="ADS",
)
df_master_ADS
df_master_ADS[df_master_ADS.duplicated()]  # duplicate check before the join
# Left-join: every ADS row is kept, master columns added where the ID matches.
df_ADS = df_ADS.merge(df_master_ADS, how="left", on="Involved Party ID")
df_ADS.info()
# ---- DDS Groupage: load monthly Excel dumps (Jan 2017 - Aug 2019) ----
years = 3
df_DDS_groupage_list = []
# Danish month abbreviations, matching the file-name convention.
months = ['jan', 'feb', 'mar', 'apr', 'maj', 'jun', 'jul', 'aug', 'sep', 'okt', 'nov', 'dec']
for offset in range(years):
    year = 2017 + offset  # derive the year instead of mutating the loop variable
    for month in months:
        if year == 2019 and month == 'sep':
            break  # data stops at Aug 2019; adjust as new months arrive
        df_DDS_groupage_list.append(pd.read_excel(
            f"C:\\Users\\emilf\\Downloads\\POSTNORD\\DDS\\Groupage\\DatadumpProduct G_{year}_{month}.xls"
        ))
df_DDS_groupage_original = pd.concat(df_DDS_groupage_list)
df_DDS_groupage = df_DDS_groupage_original.copy()
df_DDS_groupage.head()
df_DDS_groupage.info()  # checking for nulls in the parcel/pallet column
df_DDS_groupage["Parcel/Pallet type"].unique()  # quick check for elements to substitute

# Blank out every pallet-type code in one replace (instead of five separate
# calls) and avoid chained `inplace=True`, which is deprecated in pandas 2.x.
# `np.nan` replaces the `np.NaN` alias removed in NumPy 2.0.
_pallet_codes = ["PKK", "HPL", "PLL", "QPL", "SPL"]
df_DDS_groupage["Parcel/Pallet type"] = df_DDS_groupage["Parcel/Pallet type"].replace(_pallet_codes, np.nan)
df_DDS_groupage["ConsignorWeight"] = df_DDS_groupage["ConsignorWeight"].fillna(0)

total = df_DDS_groupage["Price_Paid"].sum()
partial = df_DDS_groupage.loc[df_DDS_groupage["Customer_ID"].isna(), "Price_Paid"].sum()
print("The percentage of total turnover that refers to missing Customer_ID is " + str((partial/total)*100) + "%")
df_DDS_groupage.groupby(by=["Customer_ID"])["Price_Paid"].sum().sort_values()
# Missing IDs are attributed to this product's dominant customer code.
df_DDS_groupage["Customer_ID"] = df_DDS_groupage["Customer_ID"].fillna("GAAAABEAB")

# Duplicate inspection: full-row dupes, repeated tracking numbers, and one
# specific tracking number examined by hand before dropping exact duplicates.
df_DDS_groupage[df_DDS_groupage.duplicated()]
df_DDS_groupage[df_DDS_groupage["Shipment Tracking Number"].duplicated()]
df_DDS_groupage[df_DDS_groupage["Shipment Tracking Number"] == "DODA648Z44DOOOA68"]
df_DDS_groupage.drop_duplicates(inplace=True)
df_DDS_groupage.info()
# ---- DDS Innight: load monthly Excel dumps (Jan 2017 - Aug 2019) ----
years = 3
df_DDS_innight_list = []
# Danish month abbreviations, matching the file-name convention.
months = ['jan', 'feb', 'mar', 'apr', 'maj', 'jun', 'jul', 'aug', 'sep', 'okt', 'nov', 'dec']
for offset in range(years):
    year = 2017 + offset  # derive the year instead of mutating the loop variable
    for month in months:
        if year == 2019 and month == 'sep':
            break  # data stops at Aug 2019
        df_DDS_innight_list.append(pd.read_excel(
            f"C:\\Users\\emilf\\Downloads\\POSTNORD\\DDS\\Innight\\DatadumpProduct E_{year}_{month}.xls"
        ))
df_DDS_innight_original = pd.concat(df_DDS_innight_list)
df_DDS_innight = df_DDS_innight_original.copy()
df_DDS_innight.head()
df_DDS_innight.info()

# Blank out all pallet-type codes in one replace; avoid chained inplace
# replace (deprecated in pandas 2.x) and the removed `np.NaN` alias.
_pallet_codes = ["PKK", "HPL", "PLL", "QPL", "SPL"]
df_DDS_innight["Parcel/Pallet type"] = df_DDS_innight["Parcel/Pallet type"].replace(_pallet_codes, np.nan)
df_DDS_innight["ConsignorWeight"] = df_DDS_innight["ConsignorWeight"].fillna(0)

total = df_DDS_innight["Price_Paid"].sum()
partial = df_DDS_innight.loc[df_DDS_innight["Customer_ID"].isna(), "Price_Paid"].sum()
print("The percentage of total turnover that refers to missing Customer_ID is " + str((partial/total)*100) + "%")
df_DDS_innight.groupby(by=["Customer_ID"])["Price_Paid"].sum().sort_values()
# Missing IDs are attributed to this product's dominant customer code.
df_DDS_innight["Customer_ID"] = df_DDS_innight["Customer_ID"].fillna("EAABAAAB")
df_DDS_innight[df_DDS_innight.duplicated()]  # full-row duplicate check
df_DDS_innight.info()
# ---- DDS Pallet: load monthly Excel dumps (Jan 2017 - Aug 2019) ----
years = 3
df_DDS_pallet_list = []
# Danish month abbreviations, matching the file-name convention.
months = ['jan', 'feb', 'mar', 'apr', 'maj', 'jun', 'jul', 'aug', 'sep', 'okt', 'nov', 'dec']
for offset in range(years):
    year = 2017 + offset  # derive the year instead of mutating the loop variable
    for month in months:
        if year == 2019 and month == 'sep':
            break  # data stops at Aug 2019
        df_DDS_pallet_list.append(pd.read_excel(
            f"C:\\Users\\emilf\\Downloads\\POSTNORD\\DDS\\Pallet\\DatadumpProduct P_{year}_{month}.xls"
        ))
df_DDS_pallet_original = pd.concat(df_DDS_pallet_list)
df_DDS_pallet = df_DDS_pallet_original.copy()
df_DDS_pallet.head()
df_DDS_pallet.info()

# For the pallet product a missing type is treated as a standard pallet
# ("PLL"); avoid chained inplace replace (deprecated in pandas 2.x).
df_DDS_pallet["Parcel/Pallet type"] = df_DDS_pallet["Parcel/Pallet type"].fillna("PLL")
df_DDS_pallet["ConsignorWeight"] = df_DDS_pallet["ConsignorWeight"].fillna(0)

total = df_DDS_pallet["Price_Paid"].sum()
partial = df_DDS_pallet.loc[df_DDS_pallet["Customer_ID"].isna(), "Price_Paid"].sum()
print("The percentage of total turnover that refers to missing Customer_ID is " + str((partial/total)*100) + "%")
df_DDS_pallet.groupby(by=["Customer_ID"])["Price_Paid"].sum().sort_values()
# Missing IDs are attributed to this product's dominant customer code.
df_DDS_pallet["Customer_ID"] = df_DDS_pallet["Customer_ID"].fillna("CGCAAAEGD")
df_DDS_pallet[df_DDS_pallet.duplicated()]  # inspect before dropping
df_DDS_pallet.drop_duplicates(inplace=True)
df_DDS_pallet.info()
# ---- DDS Road Freight: monthly dumps exist only from May 2019 onward ----
year = 2019
df_DDS_roadfreight_list = []
months = ['maj', 'jun', 'jul', 'aug', 'sep', 'okt', 'nov', 'dec']
for month in months:
    if month == 'sep':
        break  # data stops at Aug 2019; only maj-aug are loaded
    df_DDS_roadfreight_list.append(pd.read_excel(
        f"C:\\Users\\emilf\\Downloads\\POSTNORD\\DDS\\Road_Freight\\DatadumpProduct R_{year}_{month}.xls"
    ))
df_DDS_roadfreight_original = pd.concat(df_DDS_roadfreight_list)
df_DDS_roadfreight = df_DDS_roadfreight_original.copy()
df_DDS_roadfreight.head()
df_DDS_roadfreight.info()

# Blank out all pallet-type codes in one replace; avoid chained inplace
# replace (deprecated in pandas 2.x) and the removed `np.NaN` alias.
_pallet_codes = ["PKK", "HPL", "PLL", "QPL", "SPL"]
df_DDS_roadfreight["Parcel/Pallet type"] = df_DDS_roadfreight["Parcel/Pallet type"].replace(_pallet_codes, np.nan)
df_DDS_roadfreight["ConsignorWeight"] = df_DDS_roadfreight["ConsignorWeight"].fillna(0)

total = df_DDS_roadfreight["Price_Paid"].sum()
partial = df_DDS_roadfreight.loc[df_DDS_roadfreight["Customer_ID"].isna(), "Price_Paid"].sum()
print("The percentage of total turnover that refers to missing Customer_ID is " + str((partial/total)*100) + "%")
df_DDS_roadfreight.groupby(by=["Customer_ID"])["Price_Paid"].sum().sort_values()
# Missing IDs are attributed to this product's dominant customer code.
df_DDS_roadfreight["Customer_ID"] = df_DDS_roadfreight["Customer_ID"].fillna("HAAHAAAEC")
df_DDS_roadfreight[df_DDS_roadfreight.duplicated()]  # full-row duplicate check
df_DDS_roadfreight.info()
# ---- DDS: combine the four product frames and attach customer master data ----
df_DDS = pd.concat([df_DDS_groupage, df_DDS_innight, df_DDS_pallet, df_DDS_roadfreight])
df_DDS.info(show_counts=True)  # `null_counts` was removed in pandas 2.0
df_master_DDS = pd.read_excel(
    "C:\\Users\\emilf\\Downloads\\POSTNORD\\Master_Data\\ADS_DDS_RF_CustomerNumbers_MasterData.xlsx",
    sheet_name="DDS",
)
df_master_DDS
# Align the key column name with the DDS frames before joining.
df_master_DDS.rename(columns={"Customer ID": "Customer_ID"}, inplace=True)
df_master_DDS[df_master_DDS.duplicated()]  # duplicate check before the join
df_DDS = df_DDS.merge(df_master_DDS, how="left", on="Customer_ID")
df_DDS.info(show_counts=True)
# ---- Combine ADS + DDS into one frame and sanity-check row counts ----
df = pd.concat([df_DDS, df_ADS])
df_DDS_groupage.shape[0] + df_DDS_innight.shape[0] \
    + df_DDS_pallet.shape[0] + df_DDS_roadfreight.shape[0] == df_DDS.shape[0]
df_DDS.shape[0] + df_ADS.shape[0] == df.shape[0]  # global sanity check
print(df.ConsignorCountryCode.unique())
print(df.ConsigneeCountryCode.unique())
print(df.Product.unique())
print(df["Parcel/Pallet type"].unique())
df.info(show_counts=True)  # `null_counts` was removed in pandas 2.0

# Impute missing consignee locations with each country's modal location
# number (the same rule was previously copy-pasted once per country).
for code in ("DK", "SE", "NO"):
    modal_loc = df[df["ConsigneeCountryCode"] == code].value_counts("Consignee Location Nr").index[0]
    df.loc[(df["Consignee Location Nr"].isna()) & (df["ConsigneeCountryCode"] == code),
           "Consignee Location Nr"] = modal_loc
df.info(show_counts=True)
# ---- EasyForward: historical road-freight data from before May 2019 ----
df_DDS_roadfreight_old = pd.read_excel(f"C:\\Users\\emilf\\Downloads\\POSTNORD\\EasyForward\\Historical_Before_May_2019.xls")
df_DDS_roadfreight_old.head()
df.head()
df_DDS_roadfreight_old["Product"] = "R" # product column implemented ("R" stands for Road Freight)
# Build a date from the Year/Week pair: year start + week*7 days, i.e. the
# last day of the specified week.
df_DDS_roadfreight_old["ShipmentDate"] = pd.to_datetime(df_DDS_roadfreight_old.Year.astype(str), format='%Y') + \
pd.to_timedelta(df_DDS_roadfreight_old.Week.mul(7).astype(str) + " days") # it counts the last day of the specified week
df_DDS_roadfreight_old.drop(columns={"Week", "Month", "Year"}, inplace=True) # remove the unnecessary clutter
df_DDS_roadfreight_old
df_DDS_roadfreight_old[df_DDS_roadfreight_old.duplicated()] # full-row duplicate check
df_DDS_roadfreight_old.info()
# Master sheet for road freight. The sheet name "Road Frieght" reproduces a
# typo in the workbook itself — do not "fix" it here.
df_master_EasyForward = pd.read_excel(f"C:\\Users\\emilf\\Downloads\\POSTNORD\\Master_Data\\ADS_DDS_RF_CustomerNumbers_MasterData.xlsx",
sheet_name="Road Frieght") # typo in the original document
df_master_EasyForward
df_master_EasyForward[df_master_EasyForward.duplicated()]
df_master_EasyForward.drop_duplicates(inplace=True) # exact duplicate rows
df_master_EasyForward
# After dropping exact duplicates, some customers still appear twice with
# differing rows. Each extra occurrence is removed by positional index.
# NOTE(review): positions 47/310/336 were found by manual inspection of the
# filters below — this is fragile if the master file ever changes; verify
# the positions still match the intended "Customer 374/2486/3685" rows.
df_master_EasyForward[df_master_EasyForward["Main Customer"].duplicated()]
df_master_EasyForward.reset_index(inplace=True)
df_master_EasyForward[df_master_EasyForward["Main Customer"] == "Customer 374"]
df_master_EasyForward.drop(df_master_EasyForward.index[47], inplace=True) # one of the "Customer 374" rows
df_master_EasyForward.drop(columns=["index"], inplace=True)
df_master_EasyForward.reset_index(inplace=True)
df_master_EasyForward[df_master_EasyForward["Main Customer"] == "Customer 374"]
df_master_EasyForward[df_master_EasyForward["Main Customer"] == "Customer 2486"]
df_master_EasyForward.drop(df_master_EasyForward.index[310], inplace=True) # one of the "Customer 2486" rows
df_master_EasyForward.drop(columns=["index"], inplace=True)
df_master_EasyForward.reset_index(inplace=True)
df_master_EasyForward[df_master_EasyForward["Main Customer"] == "Customer 2486"]
df_master_EasyForward[df_master_EasyForward["Main Customer"] == "Customer 3685"]
df_master_EasyForward.drop(df_master_EasyForward.index[336], inplace=True) # one of the "Customer 3685" rows
df_master_EasyForward.drop(columns=["index"], inplace=True)
df_master_EasyForward.reset_index(inplace=True)
df_master_EasyForward[df_master_EasyForward["Main Customer"] == "Customer 3685"]
df_master_EasyForward.drop(columns=["index"], inplace=True)
# Left-join the cleaned master data onto the historical rows, then drop any
# duplicates the join produced.
df_DDS_roadfreight_old = df_DDS_roadfreight_old.merge(df_master_EasyForward, how="left", on="Main Customer")
df_DDS_roadfreight_old = df_DDS_roadfreight_old.drop_duplicates()
df_DDS_roadfreight_old.info()
df_DDS_roadfreight_old["FileName"] = "Historical_Before_May_2019"
# ---- Enrich the historical road-freight frame with typical recent values ----
print(df_DDS_roadfreight["ConsignorParcelCount"].mode())
print(df_DDS_roadfreight["ConsignorParcelCount"].median())
df_DDS_roadfreight["ConsignorParcelCount"].describe()
# Historical rows carry no parcel count; use the median of the recent data.
df_DDS_roadfreight_old["ConsignorParcelCount"] = int(df_DDS_roadfreight["ConsignorParcelCount"].median())

# Borrow per-customer location/country details from recent road-freight rows,
# keeping one row per customer so the left-join cannot fan out.
new = df[df["Product"] == "R"].copy()  # Roadfreight
new = new[["Main Customer", "Consignor Location Nr", "ConsigneeCountryCode", "Consignee Location Nr"]]
single = new.copy()
single.drop_duplicates(subset="Main Customer", inplace=True)
df_DDS_roadfreight_old = pd.merge(df_DDS_roadfreight_old, single, how="left", on=["Main Customer"])
df_DDS_roadfreight_old.info()
# Customers absent from the recent data get a combined "SE/NO" marker rather
# than a modal-value imputation (a rejected alternative imputed the modal
# location/country instead). `fillna` avoids the deprecated chained
# inplace replace and the `np.NaN` alias removed in NumPy 2.0.
df_DDS_roadfreight_old["ConsigneeCountryCode"] = df_DDS_roadfreight_old["ConsigneeCountryCode"].fillna("SE/NO")
df_DDS_roadfreight_old.info()
df_DDS_roadfreight_old
df = pd.concat([df, df_DDS_roadfreight_old])
def _fill_zero_and_nan_with_median(series):
    """Return *series* with NaNs AND zeros replaced by the median of the
    remaining (non-zero, non-null) values.

    Replaces the original three-step dance (NaN -> 0, 0 -> NaN,
    NaN -> median): since the median there was computed after zeros had been
    turned into NaN, zeros were effectively excluded from it too.
    """
    invalid = series.isna() | (series == 0)
    return series.mask(invalid, series[~invalid].median())


# ---- Global cleanup of the combined frame ----
df.info(show_counts=True)  # `null_counts` was removed in pandas 2.0
# Per-source customer keys are superseded by the merged master-data columns.
df.drop(columns=["Customer_ID", "Involved Party ID", "Allocated SAP Number"], inplace=True)
df["Main Customer"] = df["Main Customer"].fillna("Unknown")
# Collapse both flavours of "no industry known" into a single label.
for col in ("Industry Level 1", "Industry Level 2"):
    df[col] = df[col].fillna("Unclassified").replace("Missing SNI", "Unclassified")
# A missing base price is assumed equal to the price actually paid.
df["Base_Price"] = np.where(df["Base_Price"].isna(), df["Price_Paid"], df["Base_Price"])
df["ConsignorWeight"] = _fill_zero_and_nan_with_median(df["ConsignorWeight"])
df.drop(columns=["System"], inplace=True)
print(df["ConsignorParcelCount"].mode())
print(df["ConsignorParcelCount"].median())
df["ConsignorParcelCount"].describe()
print(df["Price_Paid"][df["ConsignorParcelCount"] == 0])
# A shipment always carries at least one parcel: NaN and 0 both become 1.
df["ConsignorParcelCount"] = df["ConsignorParcelCount"].fillna(0).replace(0, 1)
df.drop(columns=["Freight calc_ status"], inplace=True)
df["Parcel/Pallet type"] = df["Parcel/Pallet type"].fillna("Product is not a pallet")
# A missing freight component is assumed equal to the full price paid.
df["Freight"] = np.where(df["Freight"].isna(), df["Price_Paid"], df["Freight"])
# For every surcharge column, a missing value simply means "no surcharge".
for col in ("Surcharge", "Dangerous goods surcharge",
            "Delivery outside of core area surcharge", "Energy surcharge",
            "Infrastructure surcharge", "Multi-parcel surcharge",
            "Private delivery surcharge", "Road toll", "Special goods surcharge"):
    df[col] = df[col].fillna(0)
df.info(show_counts=True)
# Location numbers: unknown -> 0, then cast to int for clean ISOCAP keys.
df["Consignor Location Nr"] = df["Consignor Location Nr"].fillna(0).astype(int)
df["Consignee Location Nr"] = df["Consignee Location Nr"].fillna(0).astype(int)
df["ConsignorParcelCount"] = df["ConsignorParcelCount"].astype(int)
# ---- Load postcode master files for DK, NO and SE ----
dk_postcodes = pd.read_excel("C:\\Users\\emilf\\Downloads\\POSTNORD\\Master_Data\\Postcodes\\DK_new.xls", header=1)
no_postcodes = pd.read_excel("C:\\Users\\emilf\\Downloads\\POSTNORD\\Master_Data\\Postcodes\\NO_new.xlsx")
# The Swedish CSV has a ragged number of fields per row, which read_csv
# cannot infer on its own: scan the file once to find the widest row, then
# name the columns 0..max-1 explicitly.
with open("C:\\Users\\emilf\\Downloads\\POSTNORD\\Master_Data\\Postcodes\\SE_new.csv", 'r') as temp_f:
    col_count = [len(line.split(",")) for line in temp_f.readlines()]
column_names = list(range(max(col_count)))
se_postcodes = pd.read_csv("C:\\Users\\emilf\\Downloads\\POSTNORD\\Master_Data\\Postcodes\\SE_new.csv", delimiter=",", names=column_names)
# ---- Harmonise the three postcode tables to columns: Postal_Code, Place, ISO ----
dk_postcodes
dk_postcodes = (
    dk_postcodes
    .drop(columns=["Street", "Firm", "Region", "Country"])
    .rename(columns={"Postcode": "Postal_Code", "City/town name": "Place"})
)
dk_postcodes["ISO"] = "DK"
dk_postcodes

se_postcodes
se_postcodes = (
    se_postcodes
    .drop(columns=[1, 2, 3, 5])
    .drop(index=0)  # first data row repeats the header
    .rename(columns={0: "Postal_Code", 4: "Place"})
)
se_postcodes["ISO"] = "SE"
se_postcodes = se_postcodes.drop_duplicates()
se_postcodes

no_postcodes
no_postcodes = no_postcodes.drop(columns=["Municipality_number", "Municipality_name", "Category"])
no_postcodes["Postal_name"] = no_postcodes["Postal_name"].str.title()
no_postcodes = no_postcodes.rename(columns={"Postcode": "Postal_Code", "Postal_name": "Place"})
no_postcodes["ISO"] = "NO"
no_postcodes

# Stack all three countries, strip the internal space Swedish codes carry
# ("123 45"), and build the combined ISO+code lookup key.
df_postcodes = pd.concat([dk_postcodes, se_postcodes, no_postcodes])
df_postcodes
df_postcodes.info()
df_postcodes["Postal_Code"] = df_postcodes["Postal_Code"].astype(str).str.replace(" ", "").astype(int)
df_postcodes["ISOCAP"] = df_postcodes["ISO"] + df_postcodes["Postal_Code"].astype(str) # postcodes dataframe
# ---- Join human-readable locations onto the main frame via ISO+postcode keys ----
df["ISOCAP_Consignor"] = df["ConsignorCountryCode"] + df["Consignor Location Nr"].astype(str)  # main dataframe
df["ISOCAP_Consignee"] = df["ConsigneeCountryCode"] + df["Consignee Location Nr"].astype(str)  # main dataframe
# `drop=True` discards the old index directly instead of a reset + column drop.
df.reset_index(drop=True, inplace=True)
df_postcodes.reset_index(drop=True, inplace=True)
print(df.shape[0], df_postcodes.shape[0])  # sanity check
# Keep one row per ISOCAP key so the left-merges below cannot fan out.
df_postcodes.drop_duplicates(subset=["ISOCAP"], inplace=True)
df = pd.merge(df, df_postcodes, how="left", left_on=["ISOCAP_Consignor"], right_on=["ISOCAP"])
df.rename(columns={"Place": "ConsignorLocation"}, inplace=True)
df = pd.merge(df, df_postcodes, how="left", left_on=["ISOCAP_Consignee"], right_on=["ISOCAP"])
df.rename(columns={"Place": "ConsigneeLocation"}, inplace=True)
# Drop the join keys and the _x/_y helper columns left by the two merges.
df.drop(columns=["ISOCAP_Consignor", "ISOCAP_Consignee",
                 "ISO_x", "ISO_y", "Postal_Code_x", "Postal_Code_y",
                 "ISOCAP_x", "ISOCAP_y"], inplace=True)
# Share of turnover whose postcode could not be resolved to a place name.
print(df.loc[df["ConsignorLocation"].isna(), ["Price_Paid"]].sum() / df["Price_Paid"].sum())
print(df.loc[df["ConsigneeLocation"].isna(), ["Price_Paid"]].sum() / df["Price_Paid"].sum())
df["Route"] = df["ConsignorLocation"] + " - " + df["ConsigneeLocation"]
df
with pd.option_context('display.max_rows', 200, 'display.max_columns', None):
    display(df)  # `display` is now imported at the top of the file
df.info(show_counts=True)  # `null_counts` was removed in pandas 2.0
df.to_csv("C:\\Users\\emilf\\Downloads\\POSTNORD\\postnord_tableau_dataset.csv")