
mlspatial: Spatial Machine Learning Workflow

# Vignette setup: work in a temporary directory and print fewer digits;
# olddir and oldopt are restored at the end of the vignette
olddir <- getwd()
setwd(tempdir())
oldopt <- options(digits = 4)
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(mlspatial)
library(dplyr)
library(ggplot2)
library(tmap)
library(sf)
library(spdep)
library(randomForest)
library(xgboost) 
library(e1071) 
library(caret)
library(tidyr)
# Quick thematic map with country labels
plot(africa_shp)

qtm(africa_shp)

qtm(africa_shp, text = "NAME")  # Replace with actual field name


tm_shape(africa_shp) +
  tm_polygons() +
  tm_text("NAME", size = 0.5) +  # Replace with correct column
  tm_title("Africa Countries")


ggplot(africa_shp) +
  geom_sf(fill = "lightgray", color = "black") +
  geom_sf_text(aes(label = NAME), size = 2) +  # Replace as needed
  ggtitle("Africa countries") +
  theme_minimal()

# Join data
mapdata <- join_data(africa_shp, panc_incidence, by = "NAME")

## Alternatively, join/merge the data and shapefile with dplyr
mapdata <- inner_join(africa_shp, panc_incidence, by = "NAME")
## or, e.g., mapdata <- left_join(nat, codata, by = "DISTRICT_N")
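
# Optional sanity check (illustrative, not part of the original workflow):
# list shapefile rows that fail to match on NAME before joining
anti_join(st_drop_geometry(africa_shp), panc_incidence, by = "NAME")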
str(mapdata)
#> Classes 'sf' and 'data.frame':   53 obs. of  26 variables:
#>  $ OBJECTID  : int  2 3 5 6 7 8 9 10 11 12 ...
#>  $ FIPS_CNTRY: chr  "UV" "CV" "GA" "GH" ...
#>  $ ISO_2DIGIT: chr  "BF" "CV" "GM" "GH" ...
#>  $ ISO_3DIGIT: chr  "BFA" "CPV" "GMB" "GHA" ...
#>  $ NAME      : chr  "Burkina Faso" "Cabo Verde" "Gambia" "Ghana" ...
#>  $ COUNTRYAFF: chr  "Burkina Faso" "Cabo Verde" "Gambia" "Ghana" ...
#>  $ CONTINENT : chr  "Africa" "Africa" "Africa" "Africa" ...
#>  $ TOTPOP    : int  20107509 560899 2051363 27499924 12413867 1792338 4689021 17885245 3758571 33986655 ...
#>  $ incidence : num  330.4 53.4 31.4 856.3 163.1 ...
#>  $ female    : num  1683 362 140 4566 375 ...
#>  $ male      : num  1869 211 197 4640 1378 ...
#>  $ ageb      : num  669.7 93.7 68.7 2047 336.7 ...
#>  $ agec      : num  2878 480 268 7147 1414 ...
#>  $ agea      : num  4.597 0.265 0.718 11.888 2.13 ...
#>  $ fageb     : num  250.3 40.2 23.1 782 59.1 ...
#>  $ fagec     : num  1429 322 116 3775 315 ...
#>  $ fagea     : num  3.413 0.146 0.548 8.816 1.228 ...
#>  $ mageb     : num  419.5 53.5 45.6 1265 277.6 ...
#>  $ magec     : num  1448 158 152 3372 1100 ...
#>  $ magea     : num  1.184 0.12 0.17 3.073 0.902 ...
#>  $ yra       : num  182.4 30.2 16.6 524.7 73.1 ...
#>  $ yrb       : num  187.2 34.1 17.1 552.6 74.9 ...
#>  $ yrc       : num  193.1 35 18 578.5 76.9 ...
#>  $ yrd       : num  198.5 35.9 18.3 602.7 78.6 ...
#>  $ yre       : num  204.3 36.5 18.7 621.5 79.4 ...
#>  $ geometry  :sfc_MULTIPOLYGON of length 53; first list element: List of 1
#>   ..$ :List of 1
#>   .. ..$ : num [1:317, 1:2] 102188 90385 80645 74151 70224 ...
#>   ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
#>  - attr(*, "sf_column")= chr "geometry"
#>  - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
#>   ..- attr(*, "names")= chr [1:25] "OBJECTID" "FIPS_CNTRY" "ISO_2DIGIT" "ISO_3DIGIT" ...
# Visualize pancreatic cancer incidence by country
# Basic quantile maps with labels
# Helper: quantile choropleth with a shared layout, to avoid repeating the
# same tm_shape()/tm_fill() block for all twelve panels
make_inc_map <- function(var, title) {
  tm_shape(mapdata) + 
    tm_fill(var, fill.scale = tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
            fill.legend = tm_legend(title = title)) + tm_borders(fill_alpha = .3) + tm_compass() + 
    tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
}

p1  <- make_inc_map("incidence", "Incidence")
p2  <- make_inc_map("female", "Female")
p3  <- make_inc_map("male", "Male")
p4  <- make_inc_map("ageb", "Age:20-54yrs")
p5  <- make_inc_map("agec", "Age:55+yrs")
p6  <- make_inc_map("agea", "Age:<20yrs")
p7  <- make_inc_map("fageb", "Female:20-54yrs")
p8  <- make_inc_map("fagec", "Female:55+yrs")
p9  <- make_inc_map("fagea", "Female:<20yrs")
p10 <- make_inc_map("mageb", "Male:20-54yrs")
p11 <- make_inc_map("magec", "Male:55+yrs")
p12 <- make_inc_map("magea", "Male:<20yrs")

current.mode <- tmap_mode("plot")
tmap_arrange(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, 
             widths = c(.75, .75))

tmap_mode(current.mode)
## Machine Learning Model building
# 1. Random Forest Regression
# Train Random Forest
set.seed(123)
rf_model <- randomForest(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
                           magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata, ntree = 500, 
                         importance = TRUE)

# View model results
print(rf_model)
#> 
#> Call:
#>  randomForest(formula = incidence ~ female + male + agea + ageb +      agec + fagea + fageb + fagec + magea + mageb + magec + yrb +      yrc + yrd + yre, data = mapdata, ntree = 500, importance = TRUE) 
#>                Type of random forest: regression
#>                      Number of trees: 500
#> No. of variables tried at each split: 5
#> 
#>           Mean of squared residuals: 76299.33
#>                     % Var explained: 91.35
varImpPlot(rf_model)


# Plot variable importance
library(ggplot2)
importance_df <- data.frame(
  Variable = rownames(importance(rf_model)),
  Importance = importance(rf_model)[, "IncNodePurity"])

ggplot(importance_df, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Variable Importance (IncNodePurity)", x = "Variable", y = "Importance")


# 2. Gradient Boosting Machine (XGBoost)
# Prepare the data
x_vars <- model.matrix(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
                         magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata)[,-1]
y <- mapdata$incidence

# Convert to DMatrix
dtrain <- xgb.DMatrix(data = x_vars, label = y)

# Train the model with xgb.train()
params <- list(
  objective = "reg:squarederror",
  max_depth = 4,
  learning_rate = 0.1,
  verbosity = 0
)

xgb_model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 100
)
# Feature importance
xgb.importance(model = xgb_model)
#>     Feature         Gain       Cover   Frequency
#>      <char>        <num>       <num>       <num>
#>  1:  female 9.907517e-01 0.604198127 0.614457831
#>  2:    agec 5.419284e-03 0.084314910 0.105421687
#>  3:    ageb 2.267636e-03 0.113560858 0.105421687
#>  4:    male 1.553178e-03 0.114869626 0.106927711
#>  5:   mageb 5.295916e-06 0.013188362 0.013554217
#>  6:   fagec 1.610951e-06 0.019832880 0.016566265
#>  7:     yrd 4.180083e-07 0.007449914 0.006024096
#>  8:    agea 4.159760e-07 0.018222088 0.013554217
#>  9:   magea 2.949594e-07 0.015201852 0.010542169
#> 10:   fageb 7.749886e-08 0.003976644 0.003012048
#> 11:     yrb 4.104050e-08 0.003070573 0.003012048
#> 12:   fagea 3.490967e-09 0.002114165 0.001506024
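
# Optional (not in the original workflow): xgboost's built-in importance
# barplot, here limited to the ten highest-gain features
imp <- xgb.importance(model = xgb_model)
xgb.plot.importance(imp, top_n = 10, measure = "Gain")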

# 3. Support Vector Regression (SVR)
# Train SVR model
svr_model <- svm(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
                   magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata, 
                 type = "eps-regression", 
                 kernel = "radial")

# Summary and predictions
summary(svr_model)
#> 
#> Call:
#> svm(formula = incidence ~ female + male + agea + ageb + agec + fagea + 
#>     fageb + fagec + magea + mageb + magec + yrb + yrc + yrd + yre, 
#>     data = mapdata, type = "eps-regression", kernel = "radial")
#> 
#> 
#> Parameters:
#>    SVM-Type:  eps-regression 
#>  SVM-Kernel:  radial 
#>        cost:  1 
#>       gamma:  0.06666667 
#>     epsilon:  0.1 
#> 
#> 
#> Number of Support Vectors:  7
# mapdata$pred_svr <- predict(svr_model)

# Model evaluation (in-sample predictions):
# evaluate each model step by step
# 1. Random Forest Evaluation
rf_preds <- predict(rf_model, newdata = mapdata)
rf_metrics <- postResample(pred = rf_preds, obs = mapdata$incidence)
print(rf_metrics)
#>       RMSE   Rsquared        MAE 
#> 98.2197100  0.9976235 30.2558297

# 2. XGBoost Evaluation
xgb_preds <- predict(xgb_model, newdata = x_vars)
xgb_metrics <- postResample(pred = xgb_preds, obs = mapdata$incidence)
print(xgb_metrics)
#>      RMSE  Rsquared       MAE 
#> 2.1004543 0.9999983 0.7290456

# 3. SVR Evaluation
svr_preds <- predict(svr_model, newdata = mapdata)
svr_metrics <- postResample(pred = svr_preds, obs = mapdata$incidence)
print(svr_metrics)
#>        RMSE    Rsquared         MAE 
#> 445.9372342   0.9178101 127.7105534

# Compare All Models
model_eval <- data.frame(
  Model = c("Random Forest", "XGBoost", "SVR"),
  RMSE = c(rf_metrics["RMSE"], xgb_metrics["RMSE"], svr_metrics["RMSE"]),
  MAE = c(rf_metrics["MAE"], xgb_metrics["MAE"], svr_metrics["MAE"]),
  Rsquared = c(rf_metrics["Rsquared"], xgb_metrics["Rsquared"], svr_metrics["Rsquared"]))

print(model_eval)
#>           Model       RMSE         MAE  Rsquared
#> 1 Random Forest  98.219710  30.2558297 0.9976235
#> 2       XGBoost   2.100454   0.7290456 0.9999983
#> 3           SVR 445.937234 127.7105534 0.9178101

# Choosing the best model
# Lowest RMSE and MAE = most accurate predictions
# Highest R² = most variance explained
# Caveat: these are in-sample metrics, so flexible models (here XGBoost) can
# look near-perfect; compare with the cross-validation results further below
# Sort and interpret:
model_eval[order(model_eval$RMSE), ]  # Best = Top row
#>           Model       RMSE         MAE  Rsquared
#> 2       XGBoost   2.100454   0.7290456 0.9999983
#> 1 Random Forest  98.219710  30.2558297 0.9976235
#> 3           SVR 445.937234 127.7105534 0.9178101
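
# A minimal hold-out sketch (the 80/20 split is an assumption, not part of the
# original workflow), shown for Random Forest, to gauge out-of-sample accuracy
set.seed(123)
train_idx <- sample(seq_len(nrow(mapdata)), size = floor(0.8 * nrow(mapdata)))
rf_hold <- randomForest(incidence ~ female + male + agea + ageb + agec + fagea +
                          fageb + fagec + magea + mageb + magec + yrb + yrc + yrd + yre,
                        data = mapdata[train_idx, ], ntree = 500)
hold_preds <- predict(rf_hold, newdata = mapdata[-train_idx, ])
postResample(pred = hold_preds, obs = mapdata$incidence[-train_idx])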
#### Model prediction plots
# Combine the three prediction vectors into one data frame
model_preds <- data.frame(rf_preds, xgb_preds, svr_preds)
# Add observation ID
model_preds$ID <- 1:nrow(model_preds)

# Reshape for plotting
model_long <- model_preds %>%
  tidyr::pivot_longer(cols = c("rf_preds", "xgb_preds", "svr_preds"), names_to = "Model", values_to = "Predicted")

# Plot
ggplot(model_long, aes(x = ID, y = Predicted, color = Model)) +
  geom_line(linewidth = 0.5) +
  labs(title = "Model Predictions Over Observations",
       x = "Observation", y = "Predicted Incidence") +
  theme_minimal()


## Plot actual vs predicted (uses the rf_preds/xgb_preds/svr_preds vectors computed above)
oldpar <- par(mfrow = c(1, 3)) # 3 plots side-by-side

# Random Forest
plot(mapdata$incidence, rf_preds,
     xlab = "Observed", ylab = "Predicted",
     main = "Random Forest", pch = 19, col = "steelblue")
abline(0, 1, col = "red", lwd = 2)

# XGBoost
plot(mapdata$incidence, xgb_preds,
     xlab = "Observed", ylab = "Predicted",
     main = "XGBoost", pch = 19, col = "darkgreen")
abline(0, 1, col = "red", lwd = 2)

# SVR
plot(mapdata$incidence, svr_preds,
     xlab = "Observed", ylab = "Predicted",
     main = "SVR", pch = 19, col = "purple")
abline(0, 1, col = "red", lwd = 2)


par(oldpar)

## Actual vs predicted plot with correlations

library(ggplot2)
library(ggpubr)  # For stat_cor

mapdata$rf_pred <- predict(rf_model)
mapdata$xgb_pred <- predict(xgb_model, newdata = x_vars)
mapdata$svr_pred <- predict(svr_model, newdata = mapdata)

# Random Forest Plot
p1 <- ggplot(mapdata, aes(x = incidence, y = rf_pred)) +
  geom_point(color = "steelblue", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  stat_cor(aes(label = after_stat(rr.label)), method = "pearson", label.x = 0) +
  labs(title = "Random Forest", x = "Observed Incidence", y = "Predicted Incidence") +
  theme_minimal()

# XGBoost Plot
p2 <- ggplot(mapdata, aes(x = incidence, y = xgb_pred)) +
  geom_point(color = "darkgreen", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  stat_cor(aes(label = after_stat(rr.label)), method = "pearson", label.x = 0) +
  labs(title = "XGBoost", x = "Observed Incidence", y = "Predicted Incidence") +
  theme_minimal()

# SVR Plot
p3 <- ggplot(mapdata, aes(x = incidence, y = svr_pred)) +
  geom_point(color = "purple", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  stat_cor(aes(label = after_stat(rr.label)), method = "pearson", label.x = 0) +
  labs(title = "SVR", x = "Observed Incidence", y = "Predicted Incidence") +
  theme_minimal()

combined_plot <- ggarrange(p1, p2, p3, ncol = 3, nrow = 1, common.legend = FALSE)
combined_plot
## CROSS-VALIDATION
# Step 1: Prepare common setup
# Set seed for reproducibility
set.seed(123)
library(caret)
# Define 3-fold cross-validation
cv_control <- trainControl(method = "cv", number = 3)

# 1. Random Forest
library(randomForest)

rf_cv <- train(
  incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
    magea + mageb + magec + yrb + yrc + yrd + yre,
  data = mapdata,
  method = "rf",
  trControl = cv_control,
  tuneLength = 3,
  importance = TRUE
)
print(rf_cv)
#> Random Forest 
#> 
#> 53 samples
#> 16 predictors
#> 
#> No pre-processing
#> Resampling: Cross-Validated (3 fold) 
#> Summary of sample sizes: 36, 34, 36 
#> Resampling results across tuning parameters:
#> 
#>   mtry  RMSE      Rsquared   MAE     
#>    2    444.8157  0.8580149  161.2283
#>    8    450.7475  0.8548550  163.6862
#>   15    454.0229  0.8542215  165.2292
#> 
#> RMSE was used to select the optimal model using the smallest value.
#> The final value used for the model was mtry = 2.
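
# Optional: caret's plot method shows how the CV RMSE varies with mtry
plot(rf_cv)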

# 2. XGBoost
library(xgboost)
# Drop the sf geometry for tabular modelling (it is re-joined later for mapping)
mapdata <- st_drop_geometry(mapdata)
mapdata$incidence <- as.numeric(mapdata$incidence)

# Redefine as 5-fold CV (also used for the SVR model below)
cv_control <- trainControl(
  method = "cv",
  number = 5
)

# Cross-validate XGBoost with xgb.cv()
xgb_cv <- xgb.cv(
  params = params,
  data = dtrain,
  nrounds = 100,
  nfold = 5,           # 5-fold CV
  early_stopping_rounds = 10, # stop if no improvement in 10 rounds
  metrics = list("rmse"),
  verbose = 0
)
print(xgb_cv)
#> ##### xgb.cv 5-folds
#>   iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
#>  <int>           <num>          <num>          <num>         <num>
#>      1      869.239393    127.3707020       790.2016      556.6500
#>      2      811.895853    113.6967223       753.6544      548.5258
#>      3      758.695958    101.1327812       719.3175      541.3886
#>      4      709.258868     89.5985897       687.8282      534.0328
#>      5      663.309677     79.0397616       658.7448      527.2620
#>      6      620.545640     69.4028269       632.3349      520.5086
#>      7      580.734477     60.6017161       608.0614      514.3051
#>      8      543.630439     52.5890340       585.6143      508.4693
#>      9      509.061361     45.2924036       565.3273      502.8183
#>     10      476.811169     38.6662221       545.6182      498.5917
#>     11      446.728563     32.6587860       527.4934      495.3552
#>     12      418.645545     27.2396970       510.8609      491.8151
#>     13      392.428753     22.3383309       495.3064      488.7539
#>     14      367.936479     17.9487507       480.7644      485.8850
#>     15      345.050737     14.0417407       467.7380      482.9319
#>     16      323.657820     10.5989823       455.5772      480.3232
#>     17      303.658221      7.6468438       444.2999      478.0095
#>     18      284.949110      5.3008448       433.7681      476.0269
#>     19      267.446830      3.9041583       423.8386      474.4026
#>     20      251.072552      3.8834340       414.6807      472.9013
#>     21      235.748441      4.8799317       406.5349      471.3381
#>     22      221.402044      6.2001606       398.7159      470.1287
#>     23      207.972113      7.5265587       391.4943      469.2266
#>     24      195.395832      8.7500043       384.7359      468.2956
#>     25      183.618137      9.8448664       378.4595      467.4727
#>     26      172.579433     10.8036044       372.8544      466.6354
#>     27      162.233649     11.6317973       367.5725      465.6864
#>     28      152.531762     12.3298351       362.3877      464.9543
#>     29      143.434085     12.9128515       357.7527      464.1551
#>     30      134.899409     13.3869674       353.4512      463.4164
#>     31      126.894472     13.7625645       349.4280      462.7497
#>     32      119.383493     14.0499628       345.6474      462.1675
#>     33      112.336085     14.2548206       342.1012      461.6551
#>     34      105.721359     14.3870126       338.8096      461.1720
#>     35       99.516489     14.4496212       335.7164      460.7433
#>     36       93.683744     14.4612375       332.8313      460.3522
#>     37       88.207532     14.4191607       330.1307      460.0082
#>     38       83.066339     14.3300654       327.5992      459.7049
#>     39       78.239213     14.2001212       325.2061      459.4532
#>     40       73.706952     14.0344257       322.9586      459.2320
#>     41       69.453772     13.8329782       320.8131      459.0585
#>     42       65.455340     13.6042703       318.8138      458.8997
#>     43       61.694388     13.3553540       316.8836      458.7848
#>     44       58.161545     13.0847456       315.0938      458.6745
#>     45       54.843352     12.7942086       313.4106      458.5800
#>     46       51.727013     12.4870438       311.8410      458.4893
#>     47       48.800109     12.1668617       310.3461      458.4239
#>     48       46.050852     11.8341907       308.9519      458.3600
#>     49       43.470091     11.4911414       307.6304      458.3118
#>     50       41.045067     11.1414634       306.3959      458.2640
#>     51       38.766216     10.7874114       305.2391      458.2214
#>     52       36.628218     10.4270416       304.1478      458.1830
#>     53       34.619590     10.0648898       303.1232      458.1492
#>     54       32.735149      9.7000239       302.1662      458.1140
#>     55       30.967443      9.3330799       301.2729      458.0778
#>     56       29.310898      8.9641345       300.4338      458.0435
#>     57       27.756956      8.5961935       299.6458      458.0095
#>     58       26.298972      8.2304126       298.9090      457.9752
#>     59       24.933027      7.8658598       298.2208      457.9401
#>     60       23.653849      7.5020737       297.5853      457.8992
#>     61       22.451003      7.1460083       296.9914      457.8580
#>     62       21.309321      6.8065028       296.4383      457.8143
#>     63       20.226651      6.4820813       295.9174      457.7738
#>     64       19.202495      6.1707325       295.3967      457.7511
#>     65       18.228768      5.8757562       294.9482      457.7059
#>     66       17.310864      5.5886756       294.4956      457.6777
#>     67       16.437824      5.3168546       294.0978      457.6350
#>     68       15.614714      5.0534595       293.7217      457.5942
#>     69       14.832464      4.8022642       293.3740      457.5513
#>     70       14.086469      4.5660667       293.0471      457.5093
#>     71       13.381798      4.3394510       292.7398      457.4681
#>     72       12.712208      4.1234158       292.4562      457.4248
#>     73       12.076774      3.9185987       292.1869      457.3840
#>     74       11.472889      3.7233422       291.9417      457.3399
#>     75       10.899541      3.5380380       291.7111      457.2972
#>     76       10.355206      3.3622312       291.4919      457.2569
#>     77        9.837891      3.1947687       291.2877      457.2165
#>     78        9.344513      3.0377197       291.1026      457.1739
#>     79        8.878307      2.8866616       290.9245      457.1346
#>     80        8.435075      2.7427897       290.7651      457.0920
#>     81        8.014227      2.6064334       290.6128      457.0525
#>     82        7.614482      2.4769421       290.4711      457.0131
#>     83        7.234784      2.3536897       290.3374      456.9753
#>     84        6.873823      2.2364538       290.2146      456.9371
#>     85        6.531273      2.1254459       290.0973      456.9014
#>     86        6.205946      2.0200509       289.9841      456.8685
#>     87        5.896936      1.9198917       289.8765      456.8371
#>     88        5.603172      1.8247213       289.7767      456.8061
#>     89        5.324408      1.7343520       289.6757      456.7801
#>     90        5.059483      1.6487059       289.5910      456.7492
#>     91        4.808006      1.5671828       289.5027      456.7241
#>     92        4.569061      1.4898766       289.4191      456.7002
#>     93        4.342314      1.4165729       289.3420      456.6764
#>     94        4.126835      1.3469663       289.2688      456.6538
#>     95        3.922361      1.2809817       289.1989      456.6325
#>     96        3.728132      1.2182728       289.1344      456.6112
#>     97        3.543669      1.1588310       289.0731      456.5911
#>     98        3.368373      1.1020500       289.0163      456.5712
#>     99        3.202143      1.0484843       288.9618      456.5526
#>    100        3.044323      0.9976821       288.9115      456.5340
#>   iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
#> Best iteration:
#>   iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
#>  <int>           <num>          <num>          <num>         <num>
#>    100        3.044323      0.9976821       288.9115       456.534
best_nrounds <- xgb_cv$best_iteration
cat("Best number of rounds:", best_nrounds, "\n")
#> Best number of rounds:
# Extract mean RMSE
mean_rmse <- min(xgb_cv$evaluation_log$test_rmse_mean)
cat("XGBoost CV RMSE:", mean_rmse, "\n")
#> XGBoost CV RMSE: 288.9115
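
# A hedged follow-up sketch (not in the original workflow): refit on all data
# with the CV-selected round count, falling back to the iteration with the
# lowest test RMSE when best_iteration is empty
n_final <- if (length(best_nrounds)) best_nrounds else
  which.min(xgb_cv$evaluation_log$test_rmse_mean)
final_xgb <- xgb.train(params = params, data = dtrain, nrounds = n_final)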


# 3. Support Vector Regression (SVR)
library(e1071)
library(kernlab)

svr_cv <- train(
  incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
    magea + mageb + magec + yrb + yrc + yrd + yre,
  data = mapdata,
  method = "svmRadial",
  trControl = cv_control,
  preProcess = c("center", "scale"),  # SVR often benefits from scaling
  tuneLength = 3
)
print(svr_cv)
#> Support Vector Machines with Radial Basis Function Kernel 
#> 
#> 53 samples
#> 15 predictors
#> 
#> Pre-processing: centered (15), scaled (15) 
#> Resampling: Cross-Validated (5 fold) 
#> Summary of sample sizes: 43, 42, 42, 43, 42 
#> Resampling results across tuning parameters:
#> 
#>   C     RMSE      Rsquared   MAE     
#>   0.25  713.0466  0.4939345  363.8929
#>   0.50  704.6990  0.4971961  355.2001
#>   1.00  701.0369  0.5012774  350.6448
#> 
#> Tuning parameter 'sigma' was held constant at a value of 17.63942
#> RMSE was used to select the optimal model using the smallest value.
#> The final values used for the model were sigma = 17.63942 and C = 1.
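
# A rough side-by-side of the cross-validated RMSEs (an added summary; note the
# RF result used 3-fold CV while XGBoost and SVR used 5-fold, so it is only
# indicative). All are far larger than the in-sample RMSEs reported earlier.
cv_compare <- data.frame(
  Model = c("Random Forest", "XGBoost", "SVR"),
  CV_RMSE = c(min(rf_cv$results$RMSE),
              min(xgb_cv$evaluation_log$test_rmse_mean),
              min(svr_cv$results$RMSE)))
print(cv_compare)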
## Spatial maps of predicted values from each model
# 1. Random Forest Spatial Map
# Re-join to restore the sf geometry that was dropped before cross-validation
mapdata <- inner_join(africa_shp, panc_incidence, by = "NAME")
mapdata$pred_rf <- predict(rf_model, newdata = mapdata)

map_pred_rf <- tm_shape(mapdata) + 
  tm_fill("pred_rf", fill.scale = tm_scale_intervals(values = "brewer.greens", style = "quantile"), 
          fill.legend = tm_legend(title = "Inci_pred_rf")) + tm_borders(fill_alpha = .2) + 
  tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
                           frame = TRUE, component.autoscale = FALSE)
map_pred_rf


# 2. XGBoost Spatial Map
# Use the same model matrix (x_vars) that was used in training
mapdata$pred_xgb <- predict(xgb_model, newdata = x_vars)

map_pred_xgb <- tm_shape(mapdata) + 
  tm_fill("pred_xgb", fill.scale = tm_scale_intervals(values = "brewer.purples", style = "quantile"), 
          fill.legend = tm_legend(title = "Inci_pred_xgb")) + tm_borders(fill_alpha = .2) + 
  tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
                           frame = TRUE, component.autoscale = FALSE)
map_pred_xgb


# 3. Support Vector Regression (SVR) Spatial Map
mapdata$pred_svr <- predict(svr_model, newdata = mapdata)

map_pred_svr <- tm_shape(mapdata) + 
  tm_fill("pred_svr", fill.scale = tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
          fill.legend = tm_legend(title = "Inci_pred_svr")) + tm_borders(fill_alpha = .2) + 
  tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
                           frame = TRUE, component.autoscale = FALSE)
map_pred_svr


# Compare side by side, re-using the map objects defined above
tmap_arrange(map_pred_rf, map_pred_xgb, map_pred_svr, nrow = 1)

# Predicted residuals
# All three models are trained and mapdata$incidence holds the observed values.
### Step 1: Generate predictions and residuals for each model
# Random Forest
rf_preds <- predict(rf_model, newdata = mapdata)
rf_resid <- mapdata$incidence - rf_preds

# XGBoost
xgb_preds <- predict(xgb_model, newdata = x_vars)  # x_vars = model.matrix(...)
xgb_resid <- mapdata$incidence - xgb_preds

# SVR
svr_preds <- predict(svr_model, newdata = mapdata)
svr_resid <- mapdata$incidence - svr_preds

### Step 2: Combine into a single data frame
residuals_df <- data.frame(
  actual = mapdata$incidence,
  rf_pred = rf_preds,
  rf_resid = rf_resid,
  xgb_pred = xgb_preds,
  xgb_resid = xgb_resid,
  svr_pred = svr_preds,
  svr_resid = svr_resid
)

# Export the residuals (the file name is illustrative)
library(writexl)
write_xlsx(residuals_df, "model_residuals.xlsx")

### Compare residual distributions

boxplot(residuals_df$rf_resid, residuals_df$xgb_resid, residuals_df$svr_resid,
        names = c("RF", "XGB", "SVR"),
        main = "Model Residuals",
        ylab = "Prediction Error (Residual)")
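
# Numeric counterpart to the boxplot (an added check)
sapply(residuals_df[c("rf_resid", "xgb_resid", "svr_resid")], summary)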

## Spatial maps of residual values from each model
# Add residuals to mapdata
# These come from the previous steps:
mapdata$rf_resid <- residuals_df$rf_resid
mapdata$xgb_resid <- residuals_df$xgb_resid
mapdata$svr_resid <- residuals_df$svr_resid

# Set tmap mode to plot (static map)
tmap_mode("plot")

# Create individual residual maps
map_rf <- tm_shape(mapdata) +
  tm_fill("rf_resid", fill.scale = tm_scale_intervals(values = "brewer.greens", style = "quantile", 
                                                      midpoint = 0), fill.legend = tm_legend(title = "Inci_rf_resid")) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
                                          frame = TRUE, component.autoscale = FALSE)

map_xgb <- tm_shape(mapdata) + 
  tm_fill("xgb_resid", fill.scale = tm_scale_intervals(values = "brewer.purples", style = "quantile", 
                                                       midpoint = 0), fill.legend = tm_legend(title = "Inci_xgb_resid")) + tm_borders(fill_alpha = .2) + 
  tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
            frame = TRUE, component.autoscale = FALSE)
map_svr <- tm_shape(mapdata) + 
  tm_fill("svr_resid", fill.scale = tm_scale_intervals(values = "brewer.reds", style = "quantile"), 
          fill.legend = tm_legend(title = "Inci_svr_resid")) + tm_borders(fill_alpha = .2) + 
  tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), 
            frame = TRUE, component.autoscale = FALSE)

# Step 3: Combine maps in a grid layout
tmap_arrange(map_rf, map_xgb, map_svr, nrow = 1)

## Barplot and spatial maps for RMSE and MAE
# Step 1: Calculate RMSE and MAE for each model
# Random Forest
rf_metrics <- postResample(pred = rf_preds, obs = mapdata$incidence)

# XGBoost
xgb_metrics <- postResample(pred = xgb_preds, obs = mapdata$incidence)

# SVR
svr_metrics <- postResample(pred = svr_preds, obs = mapdata$incidence)

# Step 2: Combine into a summary data frame
model_eval <- data.frame(
  Model = c("Random Forest", "XGBoost", "SVR"),
  RMSE = c(rf_metrics["RMSE"], xgb_metrics["RMSE"], svr_metrics["RMSE"]),
  MAE = c(rf_metrics["MAE"], xgb_metrics["MAE"], svr_metrics["MAE"]),
  Rsquared = c(rf_metrics["Rsquared"], xgb_metrics["Rsquared"], svr_metrics["Rsquared"])
)

print(model_eval)
#>           Model       RMSE         MAE  Rsquared
#> 1 Random Forest  98.219710  30.2558297 0.9976235
#> 2       XGBoost   2.100454   0.7290456 0.9999983
#> 3           SVR 445.937234 127.7105534 0.9178101

# Visualize RMSE, MAE and R-squared
oldpar <- par(mfrow = c(1, 3))

# Barplot of RMSE
barplot(model_eval$RMSE, names.arg = model_eval$Model, 
        col = "skyblue", las = 1, main = "Model RMSE", ylab = "RMSE")

# Barplot of MAE
barplot(model_eval$MAE, names.arg = model_eval$Model, 
        col = "lightgreen", las = 1, main = "Model MAE", ylab = "MAE")
# Barplot of R-squared
barplot(model_eval$Rsquared, names.arg = model_eval$Model, 
        col = "grey", las = 1, main = "Model Rsquared", ylab = "R-squared")


par(oldpar)

# Add metrics to the residual maps as legend titles
map_rf <- tm_shape(mapdata) +
  tm_fill("rf_resid", fill.scale = tm_scale_intervals(values = "brewer.greens", midpoint = 0),
          fill.legend = tm_legend(title = paste0("rf_resid\nRMSE: ", round(rf_metrics["RMSE"], 2),
                                                 "\nMAE: ", round(rf_metrics["MAE"], 2)))) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))

map_xgb <- tm_shape(mapdata) +
  tm_fill("xgb_resid", fill.scale = tm_scale_intervals(values = "-RdBu", midpoint = 0),
          fill.legend = tm_legend(title = paste0("xgb_resid\nRMSE: ", round(xgb_metrics["RMSE"], 2),
                                                 "\nMAE: ", round(xgb_metrics["MAE"], 2)))) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))

map_svr <- tm_shape(mapdata) +
  tm_fill("svr_resid", fill.scale = tm_scale_intervals(values = "-RdBu", midpoint = 0),
          fill.legend = tm_legend(title = paste0("svr_resid\nRMSE: ", round(svr_metrics["RMSE"], 2),
                                                 "\nMAE: ", round(svr_metrics["MAE"], 2)))) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))

tmap_arrange(map_rf, map_xgb, map_svr, nrow = 1)

# Global Moran's I, Local Moran's I (LISA), cluster categories (e.g., High-High,
# Low-Low) and maps (LISA clusters, High-High clusters) using the predicted
# values from the three machine learning models
# Assumptions: the spatial data are in mapdata (an sf object) and the predicted
# values are stored in rf_preds, xgb_preds and svr_preds
# STEP 1: Create spatial weights (if not done yet)
neighbors <- poly2nb(mapdata) # if this warns, run the lines below instead
mapdata <- st_as_sf(mapdata)  # in case it is not already sf
mapdata <- st_make_valid(mapdata)  # fix any invalid geometries
neighbors <- poly2nb(mapdata, snap = 1e-15) # increase snap incrementally (e.g. 1e-6, 1e-5) until the warnings disappear

listw <- nb2listw(neighbors, style = "W", zero.policy = TRUE)
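
# Optional sanity check: summary() on the nb object reports the link structure
# (number of regions, average links, any no-neighbour observations)
summary(neighbors)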

# STEP 2: Define a function to compute Moran's I and cluster categories
analyze_spatial_autocorrelation <- function(mapdata, values, listw, model_name, signif_level = 0.05) {
  # Standardize predicted values
  mapdata$val_st <- scale(values)[, 1]
  # Compute lag
  mapdata$lag_val <- lag.listw(listw, mapdata$val_st, zero.policy = TRUE)
  # Global Moran's I
  global_moran <- moran.test(values, listw, zero.policy = TRUE)
  # Local Moran's I (LISA)
  lisa <- localmoran(values, listw, zero.policy = TRUE)
  lisa_df <- as.data.frame(lisa)
  # rename the p-value column (column 5 of the localmoran output)
  colnames(lisa_df)[5] <- "Pr_z"
  # Add to mapdata
  mapdata$Ii <- lisa_df$Ii
  mapdata$Z_Ii <- lisa_df$Z.Ii
  mapdata$Pr_z <- lisa_df$Pr_z
  #mapdata$Pr_z <- lisa_df[, "Pr(z > 0)"]
  # Classify clusters
  mapdata <- mapdata %>%
    mutate(
      cluster = case_when(
        val_st > 0 & lag_val > 0 & Pr_z <= signif_level ~ "High-High",
        val_st < 0 & lag_val < 0 & Pr_z <= signif_level ~ "Low-Low",
        val_st < 0 & lag_val > 0 & Pr_z <= signif_level ~ "Low-High",
        val_st > 0 & lag_val < 0 & Pr_z <= signif_level ~ "High-Low",
        TRUE ~ "Not Significant"
      )
    )
  return(list(
    map = mapdata,
    global_moran = global_moran
  ))
}

# STEP 3: Run the function for each model
rf_result <- analyze_spatial_autocorrelation(mapdata, rf_preds, listw, "Random Forest")
xgb_result <- analyze_spatial_autocorrelation(mapdata, xgb_preds, listw, "XGBoost")
svr_result <- analyze_spatial_autocorrelation(mapdata, svr_preds, listw, "SVR")

# STEP 4: Mapping LISA clusters and High-High areas for Random Forest
tmap_mode("plot")
# LISA cluster map
tm_rf <- tm_shape(rf_result$map) +
  tm_fill(
    "cluster",
    fill.scale = tm_scale(values = c(
      "High-High" = "red",
      "Low-Low" = "blue",
      "Low-High" = "lightblue",
      "High-Low" = "pink",
      "Not Significant" = "gray90")),
    fill.legend = tm_legend(title = "LISA Clusters - RF")) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom")) 

# Mapping LISA Clusters and High-High Areas for XGBoost
tmap_mode("plot")
# LISA Cluster Map
tm_xgb <- tm_shape(xgb_result$map) +
  tm_fill("cluster", 
          fill.scale = tm_scale(values = c(
            "High-High" = "red", 
            "Low-Low" = "blue",
            "Low-High" = "lightblue", 
            "High-Low" = "pink",
            "Not Significant" = "gray90")),
          fill.legend = tm_legend(title = "LISA Clusters - XGBoost")) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom")) 

# Mapping LISA clusters and High-High areas for SVR
tmap_mode("plot")
# LISA Cluster Map
tm_svr <- tm_shape(svr_result$map) +
  tm_fill("cluster", 
          fill.scale = tm_scale(values = c(
            "High-High" = "red", 
            "Low-Low" = "blue",
            "Low-High" = "lightblue", 
            "High-Low" = "pink",
            "Not Significant" = "gray90")),
          fill.legend = tm_legend(title = "LISA Clusters - SVR")) +
  tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))

tmap_arrange(tm_rf, tm_xgb, tm_svr, nrow = 1)


# View Global Moran's I results
# These print the test statistic and p-value for global spatial autocorrelation of the predictions.
rf_result$global_moran
#> 
#>  Moran I test under randomisation
#> 
#> data:  values  
#> weights: listw  
#> n reduced by no-neighbour observations  
#> 
#> Moran I statistic standard deviate = -0.24595, p-value = 0.5971
#> alternative hypothesis: greater
#> sample estimates:
#> Moran I statistic       Expectation          Variance 
#>      -0.044681255      -0.022727273       0.007967958
xgb_result$global_moran
#> 
#>  Moran I test under randomisation
#> 
#> data:  values  
#> weights: listw  
#> n reduced by no-neighbour observations  
#> 
#> Moran I statistic standard deviate = -0.32513, p-value = 0.6275
#> alternative hypothesis: greater
#> sample estimates:
#> Moran I statistic       Expectation          Variance 
#>      -0.051250814      -0.022727273       0.007696578
svr_result$global_moran
#> 
#>  Moran I test under randomisation
#> 
#> data:  values  
#> weights: listw  
#> n reduced by no-neighbour observations  
#> 
#> Moran I statistic standard deviate = -0.28034, p-value = 0.6104
#> alternative hypothesis: greater
#> sample estimates:
#> Moran I statistic       Expectation          Variance 
#>       -0.05111299       -0.02272727        0.01025262
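
# Interpretation: all three Moran's I statistics sit close to the expectation
# (-0.0227) with p-values well above 0.05, so none of the models' predictions
# show significant positive global spatial autocorrelation.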

# Restore the working directory and options changed in the setup chunk
setwd(olddir)
options(oldopt)