from datetime import datetime

import numpy as np
import pandas as pd
from IPython.display import clear_output, display  # notebook output helpers
# ---- ADS: load the yearly courier CSVs (2017.csv .. 2019.csv) ----
number_of_files = 3  # one pipe-delimited file per year
df_ADS_list = []
for file_num in range(number_of_files):  # adjust the path to suit your environment
    df_ADS_list.append(
        pd.read_csv(
            f"C:\\Users\\emilf\\Downloads\\POSTNORD\\ADS\\{2017+file_num}.csv",
            delimiter="|",
        )
    )
df_ADS_original = pd.concat(df_ADS_list)
df_ADS = df_ADS_original.copy()  # keep the raw concat untouched for reference
df_ADS["Product"] = "C"  # ("C" stands for Courier)

# Inspect raw country values, then map spelled-out names to ISO codes.
df_ADS.ConsignorCountryCode.unique()
df_ADS.ConsigneeCountryCode.unique()
_country_map = {"DANMARK": "DK", "SWEDEN": "SE", "NORWAY": "NO"}
df_ADS["ConsignorCountryCode"] = df_ADS["ConsignorCountryCode"].str.upper().replace(_country_map)
df_ADS["ConsigneeCountryCode"] = df_ADS["ConsigneeCountryCode"].str.upper().replace(_country_map)
df_ADS.head()
df_ADS.info()
# ---- ADS: repair missing/partial ShipmentDate values ----
# Sorting by tracking number groups related shipments so date gaps are
# easy to eyeball in the preview below.
check_date = df_ADS.sort_values(by=["Shipment Tracking Number"])
check_date.head(20)

# A missing date falls back to the source file name (e.g. "2017"); a bare
# four-character year is then padded to January 1st so the whole column
# parses uniformly as YYYY-MM-DD.
df_ADS["ShipmentDate"] = df_ADS["ShipmentDate"].fillna(df_ADS["FileName"].astype(str))
is_year_only = df_ADS["ShipmentDate"].str.len() == 4
df_ADS["ShipmentDate"] = np.where(is_year_only,
                                  df_ADS["ShipmentDate"] + "-01-01",
                                  df_ADS["ShipmentDate"])
df_ADS["ShipmentDate"] = pd.to_datetime(df_ADS["ShipmentDate"], format='%Y-%m-%d')
# ---- ADS: quantify revenue tied to missing consignee locations ----
total = df_ADS["Price_Paid"].sum()
partial = df_ADS.loc[df_ADS["Consignee Location Nr"].isna(), "Price_Paid"].sum()
print("The percentage of total turnover that refers to missing Consignee Location Nr is " + str((partial/total)*100) + "%")

# Spot-check one specific customer's known locations.
df_ADS["Consignee Location Nr"][df_ADS["Involved Party ID"] == 271498323].unique()

# Per-country look at location numbers and their frequencies.
for code in ("DK", "SE", "NO"):
    subset = df_ADS.loc[df_ADS["ConsigneeCountryCode"] == code, ["Consignee Location Nr"]]
    print(subset)
    print(subset.value_counts())

# BUG FIX: '&' binds tighter than '==', so the original expression
# `isna() & df[...] == "DK"` compared the AND-result to "DK" instead of
# filtering DK rows with a missing location. Parentheses restore the intent.
df_ADS[df_ADS["Consignee Location Nr"].isna() & (df_ADS["ConsigneeCountryCode"] == "DK")]
df_ADS[df_ADS.duplicated()]  # full-row duplicate check
df_ADS.info()
# ---- ADS: attach customer master data (sheet "ADS") ----
df_master_ADS = pd.read_excel(
    "C:\\Users\\emilf\\Downloads\\POSTNORD\\Master_Data\\ADS_DDS_RF_CustomerNumbers_MasterData.xlsx",
    sheet_name="ADS",
)
df_master_ADS
df_master_ADS[df_master_ADS.duplicated()]  # duplicate check before the join
# Left-join: every ADS row is kept, master columns added where the ID matches.
df_ADS = df_ADS.merge(df_master_ADS, how="left", on="Involved Party ID")
df_ADS.info()
# ---- DDS Groupage: load monthly Excel dumps (Jan 2017 - Aug 2019) ----
years = 3
df_DDS_groupage_list = []
# Danish month abbreviations, matching the file-name convention.
months = ['jan', 'feb', 'mar', 'apr', 'maj', 'jun', 'jul', 'aug', 'sep', 'okt', 'nov', 'dec']
for offset in range(years):
    year = 2017 + offset  # derive the year instead of mutating the loop variable
    for month in months:
        if year == 2019 and month == 'sep':
            break  # data stops at Aug 2019; adjust as new months arrive
        df_DDS_groupage_list.append(pd.read_excel(
            f"C:\\Users\\emilf\\Downloads\\POSTNORD\\DDS\\Groupage\\DatadumpProduct G_{year}_{month}.xls"
        ))
df_DDS_groupage_original = pd.concat(df_DDS_groupage_list)
df_DDS_groupage = df_DDS_groupage_original.copy()
df_DDS_groupage.head()
df_DDS_groupage.info()  # checking for nulls in the parcel/pallet column
df_DDS_groupage["Parcel/Pallet type"].unique()  # quick check for elements to substitute

# Blank out every pallet-type code in one replace (instead of five separate
# calls) and avoid chained `inplace=True`, which is deprecated in pandas 2.x.
# `np.nan` replaces the `np.NaN` alias removed in NumPy 2.0.
_pallet_codes = ["PKK", "HPL", "PLL", "QPL", "SPL"]
df_DDS_groupage["Parcel/Pallet type"] = df_DDS_groupage["Parcel/Pallet type"].replace(_pallet_codes, np.nan)
df_DDS_groupage["ConsignorWeight"] = df_DDS_groupage["ConsignorWeight"].fillna(0)

total = df_DDS_groupage["Price_Paid"].sum()
partial = df_DDS_groupage.loc[df_DDS_groupage["Customer_ID"].isna(), "Price_Paid"].sum()
print("The percentage of total turnover that refers to missing Customer_ID is " + str((partial/total)*100) + "%")
df_DDS_groupage.groupby(by=["Customer_ID"])["Price_Paid"].sum().sort_values()
# Missing IDs are attributed to this product's dominant customer code.
df_DDS_groupage["Customer_ID"] = df_DDS_groupage["Customer_ID"].fillna("GAAAABEAB")

# Duplicate inspection: full-row dupes, repeated tracking numbers, and one
# specific tracking number examined by hand before dropping exact duplicates.
df_DDS_groupage[df_DDS_groupage.duplicated()]
df_DDS_groupage[df_DDS_groupage["Shipment Tracking Number"].duplicated()]
df_DDS_groupage[df_DDS_groupage["Shipment Tracking Number"] == "DODA648Z44DOOOA68"]
df_DDS_groupage.drop_duplicates(inplace=True)
df_DDS_groupage.info()
# ---- DDS Innight: load monthly Excel dumps (Jan 2017 - Aug 2019) ----
years = 3
df_DDS_innight_list = []
# Danish month abbreviations, matching the file-name convention.
months = ['jan', 'feb', 'mar', 'apr', 'maj', 'jun', 'jul', 'aug', 'sep', 'okt', 'nov', 'dec']
for offset in range(years):
    year = 2017 + offset  # derive the year instead of mutating the loop variable
    for month in months:
        if year == 2019 and month == 'sep':
            break  # data stops at Aug 2019
        df_DDS_innight_list.append(pd.read_excel(
            f"C:\\Users\\emilf\\Downloads\\POSTNORD\\DDS\\Innight\\DatadumpProduct E_{year}_{month}.xls"
        ))
df_DDS_innight_original = pd.concat(df_DDS_innight_list)
df_DDS_innight = df_DDS_innight_original.copy()
df_DDS_innight.head()
df_DDS_innight.info()

# Blank out all pallet-type codes in one replace; avoid chained inplace
# replace (deprecated in pandas 2.x) and the removed `np.NaN` alias.
_pallet_codes = ["PKK", "HPL", "PLL", "QPL", "SPL"]
df_DDS_innight["Parcel/Pallet type"] = df_DDS_innight["Parcel/Pallet type"].replace(_pallet_codes, np.nan)
df_DDS_innight["ConsignorWeight"] = df_DDS_innight["ConsignorWeight"].fillna(0)

total = df_DDS_innight["Price_Paid"].sum()
partial = df_DDS_innight.loc[df_DDS_innight["Customer_ID"].isna(), "Price_Paid"].sum()
print("The percentage of total turnover that refers to missing Customer_ID is " + str((partial/total)*100) + "%")
df_DDS_innight.groupby(by=["Customer_ID"])["Price_Paid"].sum().sort_values()
# Missing IDs are attributed to this product's dominant customer code.
df_DDS_innight["Customer_ID"] = df_DDS_innight["Customer_ID"].fillna("EAABAAAB")
df_DDS_innight[df_DDS_innight.duplicated()]  # full-row duplicate check
df_DDS_innight.info()
# ---- DDS Pallet: load monthly Excel dumps (Jan 2017 - Aug 2019) ----
years = 3
df_DDS_pallet_list = []
# Danish month abbreviations, matching the file-name convention.
months = ['jan', 'feb', 'mar', 'apr', 'maj', 'jun', 'jul', 'aug', 'sep', 'okt', 'nov', 'dec']
for offset in range(years):
    year = 2017 + offset  # derive the year instead of mutating the loop variable
    for month in months:
        if year == 2019 and month == 'sep':
            break  # data stops at Aug 2019
        df_DDS_pallet_list.append(pd.read_excel(
            f"C:\\Users\\emilf\\Downloads\\POSTNORD\\DDS\\Pallet\\DatadumpProduct P_{year}_{month}.xls"
        ))
df_DDS_pallet_original = pd.concat(df_DDS_pallet_list)
df_DDS_pallet = df_DDS_pallet_original.copy()
df_DDS_pallet.head()
df_DDS_pallet.info()

# For the pallet product a missing type is treated as a standard pallet
# ("PLL"); avoid chained inplace replace (deprecated in pandas 2.x).
df_DDS_pallet["Parcel/Pallet type"] = df_DDS_pallet["Parcel/Pallet type"].fillna("PLL")
df_DDS_pallet["ConsignorWeight"] = df_DDS_pallet["ConsignorWeight"].fillna(0)

total = df_DDS_pallet["Price_Paid"].sum()
partial = df_DDS_pallet.loc[df_DDS_pallet["Customer_ID"].isna(), "Price_Paid"].sum()
print("The percentage of total turnover that refers to missing Customer_ID is " + str((partial/total)*100) + "%")
df_DDS_pallet.groupby(by=["Customer_ID"])["Price_Paid"].sum().sort_values()
# Missing IDs are attributed to this product's dominant customer code.
df_DDS_pallet["Customer_ID"] = df_DDS_pallet["Customer_ID"].fillna("CGCAAAEGD")
df_DDS_pallet[df_DDS_pallet.duplicated()]  # inspect before dropping
df_DDS_pallet.drop_duplicates(inplace=True)
df_DDS_pallet.info()
# ---- DDS Road Freight: monthly dumps exist only from May 2019 onward ----
year = 2019
df_DDS_roadfreight_list = []
months = ['maj', 'jun', 'jul', 'aug', 'sep', 'okt', 'nov', 'dec']
for month in months:
    if month == 'sep':
        break  # data stops at Aug 2019; only maj-aug are loaded
    df_DDS_roadfreight_list.append(pd.read_excel(
        f"C:\\Users\\emilf\\Downloads\\POSTNORD\\DDS\\Road_Freight\\DatadumpProduct R_{year}_{month}.xls"
    ))
df_DDS_roadfreight_original = pd.concat(df_DDS_roadfreight_list)
df_DDS_roadfreight = df_DDS_roadfreight_original.copy()
df_DDS_roadfreight.head()
df_DDS_roadfreight.info()

# Blank out all pallet-type codes in one replace; avoid chained inplace
# replace (deprecated in pandas 2.x) and the removed `np.NaN` alias.
_pallet_codes = ["PKK", "HPL", "PLL", "QPL", "SPL"]
df_DDS_roadfreight["Parcel/Pallet type"] = df_DDS_roadfreight["Parcel/Pallet type"].replace(_pallet_codes, np.nan)
df_DDS_roadfreight["ConsignorWeight"] = df_DDS_roadfreight["ConsignorWeight"].fillna(0)

total = df_DDS_roadfreight["Price_Paid"].sum()
partial = df_DDS_roadfreight.loc[df_DDS_roadfreight["Customer_ID"].isna(), "Price_Paid"].sum()
print("The percentage of total turnover that refers to missing Customer_ID is " + str((partial/total)*100) + "%")
df_DDS_roadfreight.groupby(by=["Customer_ID"])["Price_Paid"].sum().sort_values()
# Missing IDs are attributed to this product's dominant customer code.
df_DDS_roadfreight["Customer_ID"] = df_DDS_roadfreight["Customer_ID"].fillna("HAAHAAAEC")
df_DDS_roadfreight[df_DDS_roadfreight.duplicated()]  # full-row duplicate check
df_DDS_roadfreight.info()
# ---- DDS: combine the four product frames and attach customer master data ----
df_DDS = pd.concat([df_DDS_groupage, df_DDS_innight, df_DDS_pallet, df_DDS_roadfreight])
df_DDS.info(show_counts=True)  # `null_counts` was removed in pandas 2.0
df_master_DDS = pd.read_excel(
    "C:\\Users\\emilf\\Downloads\\POSTNORD\\Master_Data\\ADS_DDS_RF_CustomerNumbers_MasterData.xlsx",
    sheet_name="DDS",
)
df_master_DDS
# Align the key column name with the DDS frames before joining.
df_master_DDS.rename(columns={"Customer ID": "Customer_ID"}, inplace=True)
df_master_DDS[df_master_DDS.duplicated()]  # duplicate check before the join
df_DDS = df_DDS.merge(df_master_DDS, how="left", on="Customer_ID")
df_DDS.info(show_counts=True)
# ---- Combine ADS + DDS into one frame and sanity-check row counts ----
df = pd.concat([df_DDS, df_ADS])
df_DDS_groupage.shape[0] + df_DDS_innight.shape[0] \
    + df_DDS_pallet.shape[0] + df_DDS_roadfreight.shape[0] == df_DDS.shape[0]
df_DDS.shape[0] + df_ADS.shape[0] == df.shape[0]  # global sanity check
print(df.ConsignorCountryCode.unique())
print(df.ConsigneeCountryCode.unique())
print(df.Product.unique())
print(df["Parcel/Pallet type"].unique())
df.info(show_counts=True)  # `null_counts` was removed in pandas 2.0

# Impute missing consignee locations with each country's modal location
# number (the same rule was previously copy-pasted once per country).
for code in ("DK", "SE", "NO"):
    modal_loc = df[df["ConsigneeCountryCode"] == code].value_counts("Consignee Location Nr").index[0]
    df.loc[(df["Consignee Location Nr"].isna()) & (df["ConsigneeCountryCode"] == code),
           "Consignee Location Nr"] = modal_loc
df.info(show_counts=True)
# ---- EasyForward: historical road-freight data from before May 2019 ----
df_DDS_roadfreight_old = pd.read_excel(f"C:\\Users\\emilf\\Downloads\\POSTNORD\\EasyForward\\Historical_Before_May_2019.xls")
df_DDS_roadfreight_old.head()
df.head()
df_DDS_roadfreight_old["Product"] = "R" # product column implemented ("R" stands for Road Freight)
# Build a date from the Year/Week pair: year start + week*7 days, i.e. the
# last day of the specified week.
df_DDS_roadfreight_old["ShipmentDate"] = pd.to_datetime(df_DDS_roadfreight_old.Year.astype(str), format='%Y') + \
pd.to_timedelta(df_DDS_roadfreight_old.Week.mul(7).astype(str) + " days") # it counts the last day of the specified week
df_DDS_roadfreight_old.drop(columns={"Week", "Month", "Year"}, inplace=True) # remove the unnecessary clutter
df_DDS_roadfreight_old
df_DDS_roadfreight_old[df_DDS_roadfreight_old.duplicated()] # full-row duplicate check
df_DDS_roadfreight_old.info()
# Master sheet for road freight. The sheet name "Road Frieght" reproduces a
# typo in the workbook itself — do not "fix" it here.
df_master_EasyForward = pd.read_excel(f"C:\\Users\\emilf\\Downloads\\POSTNORD\\Master_Data\\ADS_DDS_RF_CustomerNumbers_MasterData.xlsx",
sheet_name="Road Frieght") # typo in the original document
df_master_EasyForward
df_master_EasyForward[df_master_EasyForward.duplicated()]
df_master_EasyForward.drop_duplicates(inplace=True) # exact duplicate rows
df_master_EasyForward
# After dropping exact duplicates, some customers still appear twice with
# differing rows. Each extra occurrence is removed by positional index.
# NOTE(review): positions 47/310/336 were found by manual inspection of the
# filters below — this is fragile if the master file ever changes; verify
# the positions still match the intended "Customer 374/2486/3685" rows.
df_master_EasyForward[df_master_EasyForward["Main Customer"].duplicated()]
df_master_EasyForward.reset_index(inplace=True)
df_master_EasyForward[df_master_EasyForward["Main Customer"] == "Customer 374"]
df_master_EasyForward.drop(df_master_EasyForward.index[47], inplace=True) # one of the "Customer 374" rows
df_master_EasyForward.drop(columns=["index"], inplace=True)
df_master_EasyForward.reset_index(inplace=True)
df_master_EasyForward[df_master_EasyForward["Main Customer"] == "Customer 374"]
df_master_EasyForward[df_master_EasyForward["Main Customer"] == "Customer 2486"]
df_master_EasyForward.drop(df_master_EasyForward.index[310], inplace=True) # one of the "Customer 2486" rows
df_master_EasyForward.drop(columns=["index"], inplace=True)
df_master_EasyForward.reset_index(inplace=True)
df_master_EasyForward[df_master_EasyForward["Main Customer"] == "Customer 2486"]
df_master_EasyForward[df_master_EasyForward["Main Customer"] == "Customer 3685"]
df_master_EasyForward.drop(df_master_EasyForward.index[336], inplace=True) # one of the "Customer 3685" rows
df_master_EasyForward.drop(columns=["index"], inplace=True)
df_master_EasyForward.reset_index(inplace=True)
df_master_EasyForward[df_master_EasyForward["Main Customer"] == "Customer 3685"]
df_master_EasyForward.drop(columns=["index"], inplace=True)
# Left-join the cleaned master data onto the historical rows, then drop any
# duplicates the join produced.
df_DDS_roadfreight_old = df_DDS_roadfreight_old.merge(df_master_EasyForward, how="left", on="Main Customer")
df_DDS_roadfreight_old = df_DDS_roadfreight_old.drop_duplicates()
df_DDS_roadfreight_old.info()
df_DDS_roadfreight_old["FileName"] = "Historical_Before_May_2019"
# ---- Enrich the historical road-freight frame with typical recent values ----
print(df_DDS_roadfreight["ConsignorParcelCount"].mode())
print(df_DDS_roadfreight["ConsignorParcelCount"].median())
df_DDS_roadfreight["ConsignorParcelCount"].describe()
# Historical rows carry no parcel count; use the median of the recent data.
df_DDS_roadfreight_old["ConsignorParcelCount"] = int(df_DDS_roadfreight["ConsignorParcelCount"].median())

# Borrow per-customer location/country details from recent road-freight rows,
# keeping one row per customer so the left-join cannot fan out.
new = df[df["Product"] == "R"].copy()  # Roadfreight
new = new[["Main Customer", "Consignor Location Nr", "ConsigneeCountryCode", "Consignee Location Nr"]]
single = new.copy()
single.drop_duplicates(subset="Main Customer", inplace=True)
df_DDS_roadfreight_old = pd.merge(df_DDS_roadfreight_old, single, how="left", on=["Main Customer"])
df_DDS_roadfreight_old.info()
# Customers absent from the recent data get a combined "SE/NO" marker rather
# than a modal-value imputation (a rejected alternative imputed the modal
# location/country instead). `fillna` avoids the deprecated chained
# inplace replace and the `np.NaN` alias removed in NumPy 2.0.
df_DDS_roadfreight_old["ConsigneeCountryCode"] = df_DDS_roadfreight_old["ConsigneeCountryCode"].fillna("SE/NO")
df_DDS_roadfreight_old.info()
df_DDS_roadfreight_old
df = pd.concat([df, df_DDS_roadfreight_old])
def _fill_zero_and_nan_with_median(series):
    """Return *series* with NaNs AND zeros replaced by the median of the
    remaining (non-zero, non-null) values.

    Replaces the original three-step dance (NaN -> 0, 0 -> NaN,
    NaN -> median): since the median there was computed after zeros had been
    turned into NaN, zeros were effectively excluded from it too.
    """
    invalid = series.isna() | (series == 0)
    return series.mask(invalid, series[~invalid].median())


# ---- Global cleanup of the combined frame ----
df.info(show_counts=True)  # `null_counts` was removed in pandas 2.0
# Per-source customer keys are superseded by the merged master-data columns.
df.drop(columns=["Customer_ID", "Involved Party ID", "Allocated SAP Number"], inplace=True)
df["Main Customer"] = df["Main Customer"].fillna("Unknown")
# Collapse both flavours of "no industry known" into a single label.
for col in ("Industry Level 1", "Industry Level 2"):
    df[col] = df[col].fillna("Unclassified").replace("Missing SNI", "Unclassified")
# A missing base price is assumed equal to the price actually paid.
df["Base_Price"] = np.where(df["Base_Price"].isna(), df["Price_Paid"], df["Base_Price"])
df["ConsignorWeight"] = _fill_zero_and_nan_with_median(df["ConsignorWeight"])
df.drop(columns=["System"], inplace=True)
print(df["ConsignorParcelCount"].mode())
print(df["ConsignorParcelCount"].median())
df["ConsignorParcelCount"].describe()
print(df["Price_Paid"][df["ConsignorParcelCount"] == 0])
# A shipment always carries at least one parcel: NaN and 0 both become 1.
df["ConsignorParcelCount"] = df["ConsignorParcelCount"].fillna(0).replace(0, 1)
df.drop(columns=["Freight calc_ status"], inplace=True)
df["Parcel/Pallet type"] = df["Parcel/Pallet type"].fillna("Product is not a pallet")
# A missing freight component is assumed equal to the full price paid.
df["Freight"] = np.where(df["Freight"].isna(), df["Price_Paid"], df["Freight"])
# For every surcharge column, a missing value simply means "no surcharge".
for col in ("Surcharge", "Dangerous goods surcharge",
            "Delivery outside of core area surcharge", "Energy surcharge",
            "Infrastructure surcharge", "Multi-parcel surcharge",
            "Private delivery surcharge", "Road toll", "Special goods surcharge"):
    df[col] = df[col].fillna(0)
df.info(show_counts=True)
# Location numbers: unknown -> 0, then cast to int for clean ISOCAP keys.
df["Consignor Location Nr"] = df["Consignor Location Nr"].fillna(0).astype(int)
df["Consignee Location Nr"] = df["Consignee Location Nr"].fillna(0).astype(int)
df["ConsignorParcelCount"] = df["ConsignorParcelCount"].astype(int)
# ---- Load postcode master files for DK, NO and SE ----
dk_postcodes = pd.read_excel("C:\\Users\\emilf\\Downloads\\POSTNORD\\Master_Data\\Postcodes\\DK_new.xls", header=1)
no_postcodes = pd.read_excel("C:\\Users\\emilf\\Downloads\\POSTNORD\\Master_Data\\Postcodes\\NO_new.xlsx")
# The Swedish CSV has a ragged number of fields per row, which read_csv
# cannot infer on its own: scan the file once to find the widest row, then
# name the columns 0..max-1 explicitly.
with open("C:\\Users\\emilf\\Downloads\\POSTNORD\\Master_Data\\Postcodes\\SE_new.csv", 'r') as temp_f:
    col_count = [len(line.split(",")) for line in temp_f.readlines()]
column_names = list(range(max(col_count)))
se_postcodes = pd.read_csv("C:\\Users\\emilf\\Downloads\\POSTNORD\\Master_Data\\Postcodes\\SE_new.csv", delimiter=",", names=column_names)
# ---- Harmonise the three postcode tables to columns: Postal_Code, Place, ISO ----
dk_postcodes
dk_postcodes = (
    dk_postcodes
    .drop(columns=["Street", "Firm", "Region", "Country"])
    .rename(columns={"Postcode": "Postal_Code", "City/town name": "Place"})
)
dk_postcodes["ISO"] = "DK"
dk_postcodes

se_postcodes
se_postcodes = (
    se_postcodes
    .drop(columns=[1, 2, 3, 5])
    .drop(index=0)  # first data row repeats the header
    .rename(columns={0: "Postal_Code", 4: "Place"})
)
se_postcodes["ISO"] = "SE"
se_postcodes = se_postcodes.drop_duplicates()
se_postcodes

no_postcodes
no_postcodes = no_postcodes.drop(columns=["Municipality_number", "Municipality_name", "Category"])
no_postcodes["Postal_name"] = no_postcodes["Postal_name"].str.title()
no_postcodes = no_postcodes.rename(columns={"Postcode": "Postal_Code", "Postal_name": "Place"})
no_postcodes["ISO"] = "NO"
no_postcodes

# Stack all three countries, strip the internal space Swedish codes carry
# ("123 45"), and build the combined ISO+code lookup key.
df_postcodes = pd.concat([dk_postcodes, se_postcodes, no_postcodes])
df_postcodes
df_postcodes.info()
df_postcodes["Postal_Code"] = df_postcodes["Postal_Code"].astype(str).str.replace(" ", "").astype(int)
df_postcodes["ISOCAP"] = df_postcodes["ISO"] + df_postcodes["Postal_Code"].astype(str) # postcodes dataframe
# ---- Join human-readable locations onto the main frame via ISO+postcode keys ----
df["ISOCAP_Consignor"] = df["ConsignorCountryCode"] + df["Consignor Location Nr"].astype(str)  # main dataframe
df["ISOCAP_Consignee"] = df["ConsigneeCountryCode"] + df["Consignee Location Nr"].astype(str)  # main dataframe
# `drop=True` discards the old index directly instead of a reset + column drop.
df.reset_index(drop=True, inplace=True)
df_postcodes.reset_index(drop=True, inplace=True)
print(df.shape[0], df_postcodes.shape[0])  # sanity check
# Keep one row per ISOCAP key so the left-merges below cannot fan out.
df_postcodes.drop_duplicates(subset=["ISOCAP"], inplace=True)
df = pd.merge(df, df_postcodes, how="left", left_on=["ISOCAP_Consignor"], right_on=["ISOCAP"])
df.rename(columns={"Place": "ConsignorLocation"}, inplace=True)
df = pd.merge(df, df_postcodes, how="left", left_on=["ISOCAP_Consignee"], right_on=["ISOCAP"])
df.rename(columns={"Place": "ConsigneeLocation"}, inplace=True)
# Drop the join keys and the _x/_y helper columns left by the two merges.
df.drop(columns=["ISOCAP_Consignor", "ISOCAP_Consignee",
                 "ISO_x", "ISO_y", "Postal_Code_x", "Postal_Code_y",
                 "ISOCAP_x", "ISOCAP_y"], inplace=True)
# Share of turnover whose postcode could not be resolved to a place name.
print(df.loc[df["ConsignorLocation"].isna(), ["Price_Paid"]].sum() / df["Price_Paid"].sum())
print(df.loc[df["ConsigneeLocation"].isna(), ["Price_Paid"]].sum() / df["Price_Paid"].sum())
df["Route"] = df["ConsignorLocation"] + " - " + df["ConsigneeLocation"]
df
with pd.option_context('display.max_rows', 200, 'display.max_columns', None):
    display(df)  # `display` is now imported at the top of the file
df.info(show_counts=True)  # `null_counts` was removed in pandas 2.0
df.to_csv("C:\\Users\\emilf\\Downloads\\POSTNORD\\postnord_tableau_dataset.csv")