Notebook

! pip install --quiet kaggle # API to load Kaggle datasets directly into Google Colab ! pip install --quiet geopandas # library that provides a dataframe-like interface for manipulating and analysing geospatial data

# DATA PROCESSING import pandas as pd # read csv file as a dataframe and wrangle the data import numpy as np # linear algebra import missingno as msno # identify missing data from collections import Counter # count duplicates from tqdm.notebook import tqdm # Progress bar and API request details # GEOSPATIAL DATA import geopandas as gpd # main GIS library from geopy.geocoders import Nominatim # reverse geocoding from shapely.geometry import Point, Polygon # generate spatial data # DATA VISUALISATION import matplotlib.pyplot as plt # multiple visualisations import matplotlib.ticker as ticker # visual customisation import seaborn as sns # correlation heatmap

! mkdir ~/.kaggle # Making a directory named “.kaggle” ! cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json # Copying the “kaggle.json” from my mounted cloud drive into this new directory ! chmod 600 ~/.kaggle/kaggle.json # Allocating the required permission for this file ! kaggle datasets download -d leekahwin/ukraine-waste-water-treatment-plants # Downloading the dataset (which comes with a zip extension) ! unzip /content/ukraine-waste-water-treatment-plants.zip # Unzipping the file to extract the data

# Read the extracted CSV file into a pandas DataFrame named 'RawDataframe'.
RawDataframe = pd.read_csv('/content/HydroWASTE_Ukraine_extract.csv')

# Preview the first 3 and last 2 rows of the data frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the two
# slices are stacked with pd.concat, which yields the same rows and index.
pd.concat([RawDataframe.head(3), RawDataframe.tail(2)])

# Read the GeoJSON basemap into a GeoPandas GeoDataFrame named 'Basemap'.
Basemap = gpd.read_file('/content/drive/MyDrive/Academic/IT/Projects/MASTERProjects/UkraineProject/UkraineBasemap.geojson')

# Preview the first 3 and last 2 rows.
# The .append method (inherited from pandas) was removed in pandas 2.0;
# pd.concat stacks the two slices and preserves the GeoDataFrame type.
pd.concat([Basemap.head(3), Basemap.tail(2)])

# Check for duplicate records by counting how often each treatment plant ID
# ('WASTE_ID') occurs in the raw data.
counts = dict(Counter(RawDataframe['WASTE_ID']))

# Keep only the IDs that occur more than once (count > 1), i.e. the duplicated
# ones. An empty dict means every WASTE_ID is unique.
duplicates = {
    plant_id: occurrences
    for plant_id, occurrences in counts.items()
    if occurrences > 1
}

# Display the result as the cell output.
duplicates