sf
package.Recommended textbooks:
Other sources:
sf
, sp
tend to be well-documented, and with worked examples.EPSG:5513
)..shp
), geodatabase (.gdb
), geopackage (.gpk
), geoJSON (.geojson
)..shp
) format. The portal allows downloading the maps at different resolutions. In this guide, we will be working with the most precise one, which is referred to as ‘Základná úroveň/ ZBGIS - Administratívne hranice’.sf
packageInstall the package using:
st_read()
to load mapsst_read(dsn, layer)
dsn
: file name of the map file.layer
: certain file formats allow saving multiple layers, we will work with single layer files (it does not mean we can’t have multiple indicators/statistic for each location).Load the map of Slovakia at the municipality level:
To see what coordinate system it uses, st_crs()
function can be used:
## Coordinate Reference System:
## No EPSG code
## proj4string: "+proj=krovak +lat_0=49.5 +lon_0=24.83333333333333 +alpha=30.28813975277778 +k=0.9999 +x_0=0 +y_0=0 +ellps=bessel +units=m +no_defs"
Due to the missing code, we do a transformation to the official code using st_transform()
function.
## DOW AUT ACH SOI FACC IDN4 NM4 IDN3
## Min. :2019-04-30 Min. :1 Min. :1 Min. :9 FA004:2927 Min. :500011 Lúčka : 4 Min. :101.0
## 1st Qu.:2019-04-30 1st Qu.:1 1st Qu.:1 1st Qu.:9 1st Qu.:508989 Porúbka : 4 1st Qu.:402.0
## Median :2019-04-30 Median :1 Median :1 Median :9 Median :518204 Dúbrava : 3 Median :606.0
## Mean :2019-04-30 Mean :1 Mean :1 Mean :9 Mean :521206 Lúčky : 3 Mean :547.7
## 3rd Qu.:2019-04-30 3rd Qu.:1 3rd Qu.:1 3rd Qu.:9 3rd Qu.:526138 Petrovce: 3 3rd Qu.:708.0
## Max. :2019-04-30 Max. :1 Max. :1 Max. :9 Max. :599972 Píla : 3 Max. :811.0
## (Other) :2907
## NM3 IDN2 NM2 VYMERA Shape_Leng Shape_Area
## Košice - okolie: 114 Min. :1.000 Prešovský :665 Min. : 361255 Min. : 3211 Min. : 354208
## Rimavská Sobota: 107 1st Qu.:4.000 Banskobystrický:516 1st Qu.: 7207245 1st Qu.: 14031 1st Qu.: 7207035
## Prešov : 91 Median :6.000 Košický :461 Median : 11655767 Median : 18582 Median : 11647728
## Levice : 89 Mean :5.415 Nitriansky :354 Mean : 16752332 Mean : 21367 Mean : 16747701
## Bardejov : 86 3rd Qu.:7.000 Žilinský :315 3rd Qu.: 19769322 3rd Qu.: 25503 3rd Qu.: 19794868
## Trebišov : 82 Max. :8.000 Trenčiansky :276 Max. :359784254 Max. :204284 Max. :359936992
## (Other) :2358 (Other) :340
## geometry
## MULTIPOLYGON :2927
## epsg:5513 : 0
## +proj=krov...: 0
##
##
##
##
From the columns included in the shapefiles, we see that:
IDN4
is a unique identifier for each municipality.NM4
is the name of the municipality.IDN3
is the identifier for okres.NM3
is the name for okres.IDN2
is the identifier for kraj.NM2
is the name for kraj.Shape_Leng
and Shape_Area
are the perimeter and the area of a municipality.VYMERA
is given that the values slightly deviate from the Shape_Area
.The other columns are not relevant for this tutorial.
Before, we proceed, we rename the columns to more meaningful names:
Some of the most popular options are:
plot()
method in the sf
package,ggplot
,tmap
.We will give examples of all throughout the rest of this tutorial.
Load migration and population data
migration.district.df <- read_excel(file.path(getwd(), 'data', "raw", "migration_by_district.xlsx"),
skip=8, col_names=FALSE)
colnames(migration.district.df) <- c('district', 'inflow_sk', 'inflow_abroad', 'outflow_sk', 'outflow_abroad')
migration.district.df <- migration.district.df %>%
dplyr::mutate(district = gsub("^District of (.+)$", "\\1", district))
# load population data and clean them
population.municipality.df <- read_excel(file.path(getwd(), 'data', "raw", "population_municipality.xlsx"),
skip=7, col_names=FALSE)
colnames(population.municipality.df) <- c('district', 'municipality', 'population')
population.district.df <- population.municipality.df %>%
dplyr::mutate(district = gsub("^District of (.+)$", "\\1", district)) %>%
dplyr::group_by(district) %>%
dplyr::summarise(population = sum(population))
Merge the data with maps
library(stringi) # for removing diacritics for joins
pop.migration.district.df <- migration.district.df %>%
dplyr::full_join(population.district.df, by = 'district') %>%
dplyr::mutate(district_ascii = stri_trans_general(str = district, id = "Latin-ASCII"))
pop.migration.district.sf <- mapa.okres.sf %>%
dplyr::mutate(district_ascii = stri_trans_general(str = okres, id = "Latin-ASCII")) %>%
dplyr::left_join(pop.migration.district.df, by = "district_ascii")
pop.migration.district.sf <- pop.migration.district.sf %>%
dplyr::mutate(outflow_abroad_percent = 100*(outflow_abroad / population)) %>%
dplyr::mutate(outflow_sk_percent = 100*(outflow_sk / population)) %>%
dplyr::mutate(inflow_abroad_percent = 100*(inflow_abroad / population)) %>%
dplyr::mutate(inflow_sk_percent = 100*(inflow_sk / population))
Plot using the default plot()
method
Plotting side-by-side using the default method is possible, but has limitations such as not supporting legends. tmap
package can handle those very well.
tmap
allows integration with Open Street Map data and interactive maps on top of regular plotting. What we show here is only scratching the surface, for more examples, please visit the tmap’s github page.
pop.migration.sf.tall <- pop.migration.district.sf %>%
tidyr::gather("type", "percent", c("outflow_abroad_percent", "outflow_sk_percent",
"inflow_abroad_percent", "inflow_sk_percent")) %>%
dplyr::mutate(type = gsub("_", " ", gsub("(.+)\\_percent", "\\1", type)))
sk.migration <- pop.migration.sf.tall %>%
dplyr::filter(grepl('.*sk', type))
tm_shape(sk.migration) +
tm_fill(col = "percent", style = "cont", title="% of population") +
tm_style('gray', title = 'Migration within Slovakia') +
tm_facets('type')
Exercise: do the same for migration to/out of Slovakia.
death.causes.df <- read_excel(file.path(getwd(), 'data', "raw", "deaths_causes.xlsx"), skip=7, col_names=FALSE)
colnames(death.causes.df) <- c('district', 'group', 'type', 'y2014', 'y2015', 'y2016', 'y2017', 'y2018')
# sum over the years, merge with population counts and group by disease group
respiratory.causes.df <- death.causes.df %>%
dplyr::mutate(district = gsub("^District of (.+)$", "\\1", district)) %>%
dplyr::mutate(counts_5year = (y2014 + y2015 + y2016 + y2017 + y2018) / 5) %>%
dplyr::group_by(district, group) %>%
dplyr::summarise(death_count = sum(counts_5year)) %>%
dplyr::filter(group == 'X Diseases of the respiratory system') %>%
dplyr::full_join(population.district.df, by = 'district') %>%
dplyr::mutate(death_rate = 100 * (death_count / population)) %>%
dplyr::mutate(district_ascii = stri_trans_general(str = district, id = "Latin-ASCII"))
# merge with the map
respiratory.causes.sf <- mapa.okres.sf %>%
dplyr::mutate(district_ascii = stri_trans_general(str = okres, id = "Latin-ASCII")) %>%
dplyr::left_join(respiratory.causes.df, by = "district_ascii")
For a change, we will use ggplot
:
\[ \begin{equation} I=\frac{m \sum_{i=1}^{m} \sum_{j=1}^{m} w_{i j}\left(Z_{i}-\overline{Z}\right)\left(Z_{j}-\overline{Z}\right)}{\left(\sum_{i=1}^{m} \sum_{j=1}^{m} w_{i j}\right)\left(\sum_{i=1}^{m}\left(Z_{i}-\overline{Z}\right)^{2}\right)} \end{equation} \]
library(spatstat)
library(sp)
library(spdep)
death.rates <- unlist(st_drop_geometry(respiratory.causes.sf['death_rate']))
nb.list <- poly2nb(pl = respiratory.causes.sf, snap = 1)
nb.weights <- nb2listw(nb.list)
spdep::moran.test(death.rates, nb.weights)
##
## Moran I test under randomisation
##
## data: death.rates
## weights: nb.weights
##
## Moran I statistic standard deviate = 2.6527, p-value = 0.003993
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.177311258 -0.012820513 0.005137317
Next, we look at presidential elections in Slovakia from 2019.
Load the data:
library(readxl)
results.obec.file.path <- file.path(getwd(), 'data', "raw",
"PRE_2019_KOLO1_xlsx",
"PRE_2019_KOLO1_tab03e.xlsx")
results.obec <- read_excel(results.obec.file.path)
results.obec <- results.obec[-(1:4), ]
names(results.obec) <- c('kraj_id', 'kraj', 'uzemny_obvod_id', 'uzemny_obvod',
'okres_id', 'okres', 'obec_id', 'obec',
'candidate_id', 'candidate_fname', 'candidate_sname',
'platne_hlasy_count', 'platne_hlasy_percent',
'pullouts')
results.obec <- results.obec %>%
dplyr::mutate_at(c("platne_hlasy_count", "platne_hlasy_percent"), as.numeric)
Join the data with the map:
Since we are working with ‘obec’ level, we focus only on one ‘kraj’ region, and we only compare top 4 candidates in that region.
zilinsky.kraj.results.sf <- results.obec.sf %>%
dplyr::filter(kraj_id == 5) # zilinsky kraj
# select top 4 candidates in Zilinsky kraj
zilinsky.kraj.top.candidates <- zilinsky.kraj.results.sf %>%
dplyr::group_by(name) %>%
dplyr::summarise(vote_total = sum(platne_hlasy_count)) %>%
dplyr::arrange(vote_total) %>%
dplyr::top_n(4, vote_total) %>%
dplyr::pull(name)
# filter out only the 4 candidates
zilinsky.kraj.results.top.candidates.sf <- zilinsky.kraj.results.sf %>%
dplyr::filter(name %in% zilinsky.kraj.top.candidates)
As we can see this visualisation is not ideal, so we go for an interactive map where we can zoom in/out as we please.
library(tidyr)
zilinsky.kraj.results.top.2.sf <- zilinsky.kraj.results.top.candidates.sf %>%
dplyr::filter(name %in% c('Maroš Šefčovič', 'Zuzana Čaputová'))
zilinsky.results.wide.sf <- zilinsky.kraj.results.top.2.sf %>%
dplyr::select(name, platne_hlasy_percent, obec) %>%
tidyr::spread(name, platne_hlasy_percent)
tmap_mode("view")
tm <- tm_shape(zilinsky.results.wide.sf) +
tm_polygons(c('Maroš Šefčovič', 'Zuzana Čaputová')) +
tm_facets(sync = TRUE, ncol = 2) +
tm_layout(legend.text.size = 0.4)
tm
We will:
mapa.roads.sf <- st_read(dsn = file.path(getwd(), 'maps', 'slovakia-roads-shape', 'roads.shp'))
mapa.roads.sf <- st_transform(x = mapa.roads.sf, crs = crsCodeSkCadastreOffice)
mapa.places.sf <- st_read(dsn = file.path(getwd(), 'maps', 'slovakia-places-shape', 'places.shp'))
mapa.places.sf <- st_transform(x = mapa.places.sf, crs = crsCodeSkCadastreOffice)
# motorways, primary roads, secondary roads
slovakia.motorways.sf <- mapa.roads.sf %>%
dplyr::filter(type %in% c('motorway', 'primary', 'secondary')) %>%
dplyr::select(name, type)
# 8 cities that we have in Slovakia (extracted as points on the map)
cities.points.sf <- mapa.places.sf %>%
dplyr::filter(type == 'city') %>%
dplyr::select(name, type)
slovakia.motorways.df <- as.data.frame(slovakia.motorways.sf) %>% droplevels()
slovakia.motorways.sf <- st_sf(slovakia.motorways.df,st_geometry(slovakia.motorways.sf))
tmap_mode("plot")
tm_shape(mapa.kraj.sf) +
tm_borders(col='black') +
tm_shape(slovakia.motorways.sf) +
tm_lines(col='type', palette = c('red', 'green', 'blue')) +
tm_shape(cities.points.sf) +
tm_dots(size=0.1) +
tm_text('name', size=0.7, auto.placement = T)