Pre-requisites
1. Download desired variables .zip file from Geoquery (http://geo.aiddata.org/query/#!/)
2. Extract zip file, upload .geojson and .csv into the Files section
[R] Tutorial on how to clean data from Geoquery
Load libraries
library(ggplot2)
library(sf)
library(tidyverse)
library(skimr)
Load geojson
chn_sh <- st_read("/work/CHN_ADM3.geojson")
Reading layer `CHN_ADM3' from data source `/work/CHN_ADM3.geojson' using driver `GeoJSON'
Simple feature collection with 2852 features and 10 fields
geometry type: MULTIPOLYGON
dimension: XY
bbox: xmin: 73.49973 ymin: 15.78066 xmax: 134.7755 ymax: 53.56082
geographic CRS: WGS 84
Select ID code for merging
chn_sh <- chn_sh %>% select(gbid)
Load csv
chn_csv <- read_csv("CHN_SAT.csv")
── Column specification ────────────────────────────────────────────────────────
cols(
asdf_id = col_double(),
viirs_vcmcfg_dnb_composites_v10_yearly_max.2016.mean = col_double(),
viirs_vcmcfg_dnb_composites_v10_yearly_max.2012.mean = col_double(),
viirs_vcmcfg_dnb_composites_v10_yearly_max.2013.mean = col_double(),
viirs_vcmcfg_dnb_composites_v10_yearly_max.2015.mean = col_double(),
viirs_vcmcfg_dnb_composites_v10_yearly_max.2014.mean = col_double(),
viirs_vcmcfg_dnb_composites_v10_yearly_max.2017.mean = col_double(),
viirs_vcmcfg_dnb_composites_v10_yearly_max.2018.mean = col_double(),
adm = col_character(),
gbid = col_character(),
NAME = col_character(),
Level = col_character(),
adm_int = col_double(),
feature_id = col_double(),
iso = col_character(),
gqid = col_double()
)
Merge
chn <- merge(chn_sh, chn_csv, by.x="gbid", by.y="gbid")
Compute coordinates
coord <- st_coordinates(st_centroid(chn))
coordinates <- as.data.frame(coord)
chn$X <- coordinates$X
chn$Y <- coordinates$Y
Warning message in st_centroid.sf(chn):
“st_centroid assumes attributes are constant over geometries of x”
Warning message in st_centroid.sfc(st_geometry(x), of_largest_polygon = of_largest_polygon):
“st_centroid does not give correct centroids for longitude/latitude data”
Create ID vector
nrow(chn)
chn <- chn %>% mutate(id = c(1:2852))
Select, rename and rearrange variables
chn <- chn %>% select(id, NAME, starts_with("viirs"), geometry)
colnames(chn) <- c("id", "county", "sat16", "sat12","sat13","sat15","sat14","sat17","sat18","geometry" )
chn <- chn %>% select(id, county, sat12, sat13, sat14, sat15, sat16, sat17, sat18, geometry)
Save file
st_write(chn,"chn_sat.gpkg", append=FALSE)
chn_df <- st_drop_geometry(chn)
write.csv(chn_df, "chn_df.csv")
Writing layer `chn_sat' to data source `chn_sat.gpkg' using driver `GPKG'
Writing 2852 features with 19 fields and geometry type Multi Polygon.
Skim file
skim(chn_df)
── Data Summary ────────────────────────
Values
Name chn_df
Number of rows 2852
Number of columns 19
_______________________
Column type frequency:
character 5
numeric 14
________________________
Group variables None
── Variable type: character ────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max empty n_unique whitespace
1 gbid 0 1 16 19 0 2852 0
2 adm 0 1 4 4 0 1 0
3 NAME 0 1 2 54 0 2709 0
4 Level 0 1 4 4 0 1 0
5 iso 0 1 3 3 0 1 0
── Variable type: numeric ──────────────────────────────────────────────────────
skim_variable n_missing complete_rate
1 asdf_id 0 1
2 viirs_vcmcfg_dnb_composites_v10_yearly_max.2016.mean 0 1
3 viirs_vcmcfg_dnb_composites_v10_yearly_max.2012.mean 0 1
4 viirs_vcmcfg_dnb_composites_v10_yearly_max.2013.mean 0 1
5 viirs_vcmcfg_dnb_composites_v10_yearly_max.2015.mean 0 1
6 viirs_vcmcfg_dnb_composites_v10_yearly_max.2014.mean 0 1
7 viirs_vcmcfg_dnb_composites_v10_yearly_max.2017.mean 0 1
8 viirs_vcmcfg_dnb_composites_v10_yearly_max.2018.mean 0 1
9 adm_int 0 1
10 feature_id 0 1
11 gqid 0 1
12 X 0 1
13 Y 0 1
14 id 0 1
mean sd p0 p25 p50 p75 p100 hist
1 1426. 823. 0 713. 1426. 2138. 2851 ▇▇▇▇▇
2 4.33 13.8 0.143 0.425 0.864 2.78 534. ▇▁▁▁▁
3 3.77 10.4 0.167 0.397 0.761 2.29 306. ▇▁▁▁▁
4 4.30 10.9 0.212 0.511 0.968 2.76 277. ▇▁▁▁▁
5 4.19 10.7 0.174 0.484 0.921 2.76 287. ▇▁▁▁▁
6 4.17 10.3 0.157 0.515 0.947 2.76 243. ▇▁▁▁▁
7 4.85 15.7 0.332 0.651 1.14 3.16 646. ▇▁▁▁▁
8 4.93 13.1 0.332 0.628 1.16 3.34 420. ▇▁▁▁▁
9 3 0 3 3 3 3 3 ▁▁▇▁▁
10 1426. 823. 0 713. 1426. 2138. 2851 ▇▇▇▇▇
11 1426. 823. 0 713. 1426. 2138. 2851 ▇▇▇▇▇
12 112. 9.52 74.9 107. 113. 118. 134. ▁▁▅▇▁
13 33.2 6.74 16.5 28.1 32.7 37.9 52.9 ▂▇▇▅▁
14 1426. 823. 1 714. 1426. 2139. 2852 ▇▇▇▇▇
skim(chn_df)
── Data Summary ────────────────────────
Values
Name chn_df
Number of rows 2852
Number of columns 19
_______________________
Column type frequency:
character 5
numeric 14
________________________
Group variables None
── Variable type: character ────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max empty n_unique whitespace
1 gbid 0 1 16 19 0 2852 0
2 adm 0 1 4 4 0 1 0
3 NAME 0 1 2 54 0 2709 0
4 Level 0 1 4 4 0 1 0
5 iso 0 1 3 3 0 1 0
── Variable type: numeric ──────────────────────────────────────────────────────
skim_variable n_missing complete_rate
1 asdf_id 0 1
2 viirs_vcmcfg_dnb_composites_v10_yearly_max.2016.mean 0 1
3 viirs_vcmcfg_dnb_composites_v10_yearly_max.2012.mean 0 1
4 viirs_vcmcfg_dnb_composites_v10_yearly_max.2013.mean 0 1
5 viirs_vcmcfg_dnb_composites_v10_yearly_max.2015.mean 0 1
6 viirs_vcmcfg_dnb_composites_v10_yearly_max.2014.mean 0 1
7 viirs_vcmcfg_dnb_composites_v10_yearly_max.2017.mean 0 1
8 viirs_vcmcfg_dnb_composites_v10_yearly_max.2018.mean 0 1
9 adm_int 0 1
10 feature_id 0 1
11 gqid 0 1
12 X 0 1
13 Y 0 1
14 id 0 1
mean sd p0 p25 p50 p75 p100 hist
1 1426. 823. 0 713. 1426. 2138. 2851 ▇▇▇▇▇
2 4.33 13.8 0.143 0.425 0.864 2.78 534. ▇▁▁▁▁
3 3.77 10.4 0.167 0.397 0.761 2.29 306. ▇▁▁▁▁
4 4.30 10.9 0.212 0.511 0.968 2.76 277. ▇▁▁▁▁
5 4.19 10.7 0.174 0.484 0.921 2.76 287. ▇▁▁▁▁
6 4.17 10.3 0.157 0.515 0.947 2.76 243. ▇▁▁▁▁
7 4.85 15.7 0.332 0.651 1.14 3.16 646. ▇▁▁▁▁
8 4.93 13.1 0.332 0.628 1.16 3.34 420. ▇▁▁▁▁
9 3 0 3 3 3 3 3 ▁▁▇▁▁
10 1426. 823. 0 713. 1426. 2138. 2851 ▇▇▇▇▇
11 1426. 823. 0 713. 1426. 2138. 2851 ▇▇▇▇▇
12 112. 9.52 74.9 107. 113. 118. 134. ▁▁▅▇▁
13 33.2 6.74 16.5 28.1 32.7 37.9 52.9 ▂▇▇▅▁
14 1426. 823. 1 714. 1426. 2139. 2852 ▇▇▇▇▇
plot(chn$geometry)