! pip install --quiet kaggle # API to load Kaggle datasets directly into Google Colab
! pip install --quiet geopandas # library that provides a dataframe-like interface for manipulating and analysing geospatial data
# DATA PROCESSING
import pandas as pd # read csv file as a dataframe and wrangle the data
import numpy as np # linear algebra
import missingno as msno # identify missing data
from collections import Counter # count duplicates
from tqdm.notebook import tqdm # Progress bar and API request details
# GEOSPATIAL DATA
import geopandas as gpd # main GIS library
from geopy.geocoders import Nominatim # reverse geocoding
from shapely.geometry import Point, Polygon # generate spatial data
# DATA VISUALISATION
import matplotlib.pyplot as plt # multiple visualisations
import matplotlib.ticker as ticker # visual customisation
import seaborn as sns # correlation heatmap
! mkdir ~/.kaggle # Making a directory named “.kaggle”
! cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json # Copying the “kaggle.json” from my mounted cloud drive into this new directory
! chmod 600 ~/.kaggle/kaggle.json # Allocating the required permission for this file
! kaggle datasets download -d leekahwin/ukraine-waste-water-treatment-plants # Downloading the dataset (which comes with a zip extension)
! unzip /content/ukraine-waste-water-treatment-plants.zip # Unzipping the file to extract the data
# Reading the extracted csv file as a pandas DataFrame and assigning it to a variable called 'RawDataframe'
RawDataframe = pd.read_csv('/content/HydroWASTE_Ukraine_extract.csv')
# Previewing the first 3 and last 2 rows of the data frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
pd.concat([RawDataframe.head(3), RawDataframe.tail(2)])
# Reading the GeoJSON file as a GeoPandas GeoDataFrame and assigning it to a variable called 'Basemap'
Basemap = gpd.read_file('/content/drive/MyDrive/Academic/IT/Projects/MASTERProjects/UkraineProject/UkraineBasemap.geojson')
# Same first-3 / last-2 preview for the basemap, again via pd.concat instead of the removed .append
pd.concat([Basemap.head(3), Basemap.tail(2)])
# Duplicate check: tally how many rows carry each treatment plant ID
counts = dict(Counter(RawDataframe['WASTE_ID']))
# Keep only the IDs that occur more than once (i.e. in two or more rows)
duplicates = dict(
    (plant_id, occurrences)
    for plant_id, occurrences in counts.items()
    if occurrences > 1
)
duplicates