knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(mlspatial)
library(dplyr)
library(ggplot2)
library(tmap)
library(sf)
library(spdep)
library(randomForest)
library(xgboost)
library(e1071)
library(caret)
library(tidyr)
tm_shape(africa_shp) +
tm_polygons() +
tm_text("NAME", size = 0.5) + # Replace with correct column
tm_title("Africa Countries")
ggplot(africa_shp) +
geom_sf(fill = "lightgray", color = "black") +
geom_sf_text(aes(label = NAME), size = 2) + # Replace as needed
ggtitle("Africa countries") +
theme_minimal()
# Join data
mapdata <- join_data(africa_shp, panc_incidence, by = "NAME")
## OR join/merge the attribute data and shapefile with dplyr:
mapdata <- inner_join(africa_shp, panc_incidence, by = "NAME")
## OR, for other data sets: mapdata <- left_join(nat, codata, by = "DISTRICT_N")
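# Sanity check (a sketch, not part of the original workflow): inner_join() silently
# drops and left_join() NA-fills countries whose names do not match, so it is worth
# flagging unmatched keys before mapping:
anti_join(st_drop_geometry(africa_shp), panc_incidence, by = "NAME")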
str(mapdata)
#> Classes 'sf' and 'data.frame': 53 obs. of 26 variables:
#> $ OBJECTID : int 2 3 5 6 7 8 9 10 11 12 ...
#> $ FIPS_CNTRY: chr "UV" "CV" "GA" "GH" ...
#> $ ISO_2DIGIT: chr "BF" "CV" "GM" "GH" ...
#> $ ISO_3DIGIT: chr "BFA" "CPV" "GMB" "GHA" ...
#> $ NAME : chr "Burkina Faso" "Cabo Verde" "Gambia" "Ghana" ...
#> $ COUNTRYAFF: chr "Burkina Faso" "Cabo Verde" "Gambia" "Ghana" ...
#> $ CONTINENT : chr "Africa" "Africa" "Africa" "Africa" ...
#> $ TOTPOP : int 20107509 560899 2051363 27499924 12413867 1792338 4689021 17885245 3758571 33986655 ...
#> $ incidence : num 330.4 53.4 31.4 856.3 163.1 ...
#> $ female : num 1683 362 140 4566 375 ...
#> $ male : num 1869 211 197 4640 1378 ...
#> $ ageb : num 669.7 93.7 68.7 2047 336.7 ...
#> $ agec : num 2878 480 268 7147 1414 ...
#> $ agea : num 4.597 0.265 0.718 11.888 2.13 ...
#> $ fageb : num 250.3 40.2 23.1 782 59.1 ...
#> $ fagec : num 1429 322 116 3775 315 ...
#> $ fagea : num 3.413 0.146 0.548 8.816 1.228 ...
#> $ mageb : num 419.5 53.5 45.6 1265 277.6 ...
#> $ magec : num 1448 158 152 3372 1100 ...
#> $ magea : num 1.184 0.12 0.17 3.073 0.902 ...
#> $ yra : num 182.4 30.2 16.6 524.7 73.1 ...
#> $ yrb : num 187.2 34.1 17.1 552.6 74.9 ...
#> $ yrc : num 193.1 35 18 578.5 76.9 ...
#> $ yrd : num 198.5 35.9 18.3 602.7 78.6 ...
#> $ yre : num 204.3 36.5 18.7 621.5 79.4 ...
#> $ geometry :sfc_MULTIPOLYGON of length 53; first list element: List of 1
#> ..$ :List of 1
#> .. ..$ : num [1:317, 1:2] 102188 90385 80645 74151 70224 ...
#> ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
#> - attr(*, "sf_column")= chr "geometry"
#> - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
#> ..- attr(*, "names")= chr [1:25] "OBJECTID" "FIPS_CNTRY" "ISO_2DIGIT" "ISO_3DIGIT" ...
# Visualize pancreatic cancer incidence by country
#Basic map with labels
# quantile map
p1 <- tm_shape(mapdata) +
tm_fill("incidence", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Incidence")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p2 <- tm_shape(mapdata) +
tm_fill("female", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Female")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p3 <- tm_shape(mapdata) +
tm_fill("male", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Male")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p4 <- tm_shape(mapdata) +
tm_fill("ageb", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Age:20-54yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p5 <- tm_shape(mapdata) +
tm_fill("agec", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Age:55+yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p6 <- tm_shape(mapdata) +
tm_fill("agea", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Age:<20yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p7 <- tm_shape(mapdata) +
tm_fill("fageb", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Female:20-54yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p8 <- tm_shape(mapdata) +
tm_fill("fagec", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Female:55+yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p9 <- tm_shape(mapdata) +
tm_fill("fagea", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Female:<20yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p10 <- tm_shape(mapdata) +
tm_fill("mageb", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Male:20-54yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p11 <- tm_shape(mapdata) +
tm_fill("magec", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Male:55+yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p12 <- tm_shape(mapdata) +
tm_fill("magea", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Male:<20yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
current.mode <- tmap_mode("plot")
tmap_arrange(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12,
widths = c(.75, .75))
## Machine Learning Model building
# 1. Random Forest Regression
# Train Random Forest
set.seed(123)
rf_model <- randomForest(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata, ntree = 500,
importance = TRUE)
# View model results
print(rf_model)
#>
#> Call:
#> randomForest(formula = incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec + magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata, ntree = 500, importance = TRUE)
#> Type of random forest: regression
#> Number of trees: 500
#> No. of variables tried at each split: 5
#>
#> Mean of squared residuals: 76299.33
#> % Var explained: 91.35
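# Quick sanity check (a sketch): predict() on a randomForest object without
# newdata returns out-of-bag predictions, which give a more honest error
# estimate than re-predicting the training data:
oob_preds <- predict(rf_model)
postResample(pred = oob_preds, obs = mapdata$incidence)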
varImpPlot(rf_model)
#Plot Variable Importance
library(ggplot2)
importance_df <- data.frame(
Variable = rownames(importance(rf_model)),
Importance = importance(rf_model)[, "IncNodePurity"])
ggplot(importance_df, aes(x = reorder(Variable, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Variable Importance (IncNodePurity)", x = "Variable", y = "Importance")
# 2. Gradient Boosting Machine (XGBoost)
# Prepare the data
x_vars <- model.matrix(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata)[,-1]
y <- mapdata$incidence
# Convert to DMatrix
dtrain <- xgb.DMatrix(data = x_vars, label = y)
# Train model
# Train model using xgb.train()
params <- list(
objective = "reg:squarederror",
max_depth = 4,
learning_rate = 0.1,
verbosity = 0
)
xgb_model <- xgb.train(
params = params,
data = dtrain,
nrounds = 100
)
# Feature importance
xgb.importance(model = xgb_model)
#> Feature Gain Cover Frequency
#> <char> <num> <num> <num>
#> 1: female 9.907517e-01 0.604198127 0.614457831
#> 2: agec 5.419284e-03 0.084314910 0.105421687
#> 3: ageb 2.267636e-03 0.113560858 0.105421687
#> 4: male 1.553178e-03 0.114869626 0.106927711
#> 5: mageb 5.295916e-06 0.013188362 0.013554217
#> 6: fagec 1.610951e-06 0.019832880 0.016566265
#> 7: yrd 4.180083e-07 0.007449914 0.006024096
#> 8: agea 4.159760e-07 0.018222088 0.013554217
#> 9: magea 2.949594e-07 0.015201852 0.010542169
#> 10: fageb 7.749886e-08 0.003976644 0.003012048
#> 11: yrb 4.104050e-08 0.003070573 0.003012048
#> 12: fagea 3.490967e-09 0.002114165 0.001506024
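# To visualize the importance table above, xgboost provides xgb.plot.importance();
# a minimal sketch (top_n kept small for readability):
imp <- xgb.importance(model = xgb_model)
xgb.plot.importance(imp, top_n = 10, measure = "Gain")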
# 3. Support Vector Regression (SVR)
# Train SVR model
svr_model <- svm(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata,
type = "eps-regression",
kernel = "radial")
# Summary and predictions
summary(svr_model)
#>
#> Call:
#> svm(formula = incidence ~ female + male + agea + ageb + agec + fagea +
#> fageb + fagec + magea + mageb + magec + yrb + yrc + yrd + yre,
#> data = mapdata, type = "eps-regression", kernel = "radial")
#>
#>
#> Parameters:
#> SVM-Type: eps-regression
#> SVM-Kernel: radial
#> cost: 1
#> gamma: 0.06666667
#> epsilon: 0.1
#>
#>
#> Number of Support Vectors: 7
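# The default cost and gamma are rarely optimal. A minimal grid search with
# e1071::tune() might look like this (the grid values are illustrative, not
# recommendations):
set.seed(123)
svr_tuned <- tune(
svm,
incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
magea + mageb + magec + yrb + yrc + yrd + yre,
data = mapdata,
ranges = list(cost = 2^(0:4), gamma = 2^(-6:-2))
)
summary(svr_tuned) # svr_tuned$best.model holds the refit SVR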
#mapdata$pred_svr <- predict(svr_model)
# Model Evaluation (predictions)
# Evaluate each model step by step:
# 1. Random Forest Evaluation
rf_preds <- predict(rf_model, newdata = mapdata)
rf_metrics <- postResample(pred = rf_preds, obs = mapdata$incidence)
print(rf_metrics)
#> RMSE Rsquared MAE
#> 98.2197100 0.9976235 30.2558297
# 2. XGBoost Evaluation
xgb_preds <- predict(xgb_model, newdata = x_vars)
xgb_metrics <- postResample(pred = xgb_preds, obs = mapdata$incidence)
print(xgb_metrics)
#> RMSE Rsquared MAE
#> 2.1004543 0.9999983 0.7290456
# 3. SVR Evaluation
svr_preds <- predict(svr_model, newdata = mapdata)
svr_metrics <- postResample(pred = svr_preds, obs = mapdata$incidence)
print(svr_metrics)
#> RMSE Rsquared MAE
#> 445.9372342 0.9178101 127.7105534
# Compare All Models
model_eval <- data.frame(
Model = c("Random Forest", "XGBoost", "SVR"),
RMSE = c(rf_metrics["RMSE"], xgb_metrics["RMSE"], svr_metrics["RMSE"]),
MAE = c(rf_metrics["MAE"], xgb_metrics["MAE"], svr_metrics["MAE"]),
Rsquared = c(rf_metrics["Rsquared"], xgb_metrics["Rsquared"], svr_metrics["Rsquared"]))
print(model_eval)
#> Model RMSE MAE Rsquared
#> 1 Random Forest 98.219710 30.2558297 0.9976235
#> 2 XGBoost 2.100454 0.7290456 0.9999983
#> 3 SVR 445.937234 127.7105534 0.9178101
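# Note: these metrics are computed on the training data, so they reward
# overfitting (XGBoost's near-perfect fit is the clearest symptom). A small
# hold-out split (a sketch; with only 53 countries the estimate is noisy, and
# the cross-validation below is the better guide) shows the gap:
set.seed(123)
idx <- sample(seq_len(nrow(mapdata)), size = floor(0.7 * nrow(mapdata)))
rf_holdout <- randomForest(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata[idx, ], ntree = 500)
postResample(pred = predict(rf_holdout, newdata = mapdata[-idx, ]), obs = mapdata$incidence[-idx])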
#Choosing the Best Model
#Lowest RMSE and MAE = most accurate predictions
#Highest R² = most variance explained
#Sort and interpret:
model_eval[order(model_eval$RMSE), ] # Best = Top row
#> Model RMSE MAE Rsquared
#> 2 XGBoost 2.100454 0.7290456 0.9999983
#> 1 Random Forest 98.219710 30.2558297 0.9976235
#> 3 SVR 445.937234 127.7105534 0.9178101
#### Model prediction plots
# Combine the three prediction vectors into a data frame
model_preds <- data.frame(rf_preds, xgb_preds, svr_preds)
# Add observation ID
model_preds$ID <- 1:nrow(model_preds)
# Reshape for plotting
model_long <- model_preds %>%
tidyr::pivot_longer(cols = c("rf_preds", "xgb_preds", "svr_preds"), names_to = "Model", values_to = "Predicted")
# Plot
ggplot(model_long, aes(x = ID, y = Predicted, color = Model)) +
geom_line(linewidth = 0.5) +
labs(title = "Model Predictions Over Observations",
x = "Observation", y = "Predicted Incidence") +
theme_minimal()
## plot actual vs predicted
oldpar <- par(mfrow = c(1, 3)) # 3 plots side-by-side
# Random Forest (using the prediction vectors computed above;
# mapdata$rf_pred etc. are not created until the next step)
plot(mapdata$incidence, rf_preds,
xlab = "Observed", ylab = "Predicted",
main = "Random Forest", pch = 19, col = "steelblue")
abline(0, 1, col = "red", lwd = 2)
# XGBoost
plot(mapdata$incidence, xgb_preds,
xlab = "Observed", ylab = "Predicted",
main = "XGBoost", pch = 19, col = "darkgreen")
abline(0, 1, col = "red", lwd = 2)
# SVR
plot(mapdata$incidence, svr_preds,
xlab = "Observed", ylab = "Predicted",
main = "SVR", pch = 19, col = "purple")
abline(0, 1, col = "red", lwd = 2)
par(oldpar)
## Actual vs predicted plot with correlations
library(ggplot2)
library(ggpubr) # For stat_cor
mapdata$rf_pred <- predict(rf_model, newdata = mapdata) # in-sample predictions, consistent with the other models
mapdata$xgb_pred <- predict(xgb_model, newdata = x_vars)
mapdata$svr_pred <- predict(svr_model, newdata = mapdata)
# Random Forest Plot
p1 <- ggplot(mapdata, aes(x = incidence, y = rf_pred)) +
geom_point(color = "steelblue", alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
stat_cor(aes(label = after_stat(rr.label)), method = "pearson", label.x = 0) +
labs(title = "Random Forest", x = "Observed Incidence", y = "Predicted Incidence") +
theme_minimal()
# XGBoost Plot
p2 <- ggplot(mapdata, aes(x = incidence, y = xgb_pred)) +
geom_point(color = "darkgreen", alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
stat_cor(aes(label = after_stat(rr.label)), method = "pearson", label.x = 0) +
labs(title = "XGBoost", x = "Observed Incidence", y = "Predicted Incidence") +
theme_minimal()
# SVR Plot
p3 <- ggplot(mapdata, aes(x = incidence, y = svr_pred)) +
geom_point(color = "purple", alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
stat_cor(aes(label = after_stat(rr.label)), method = "pearson", label.x = 0) +
labs(title = "SVR", x = "Observed Incidence", y = "Predicted Incidence") +
theme_minimal()
combined_plot <- ggarrange(p1, p2, p3, ncol = 3, nrow = 1, common.legend = FALSE)
combined_plot
## CROSS-VALIDATION
#Step 1: Prepare common setup
# Set seed for reproducibility
set.seed(123)
library(caret)
# Define 3-fold cross-validation
cv_control <- trainControl(method = "cv", number = 3)
# 1. Random Forest
library(randomForest)
rf_cv <- train(
incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
magea + mageb + magec + yrb + yrc + yrd + yre,
data = mapdata,
method = "rf",
trControl = cv_control,
tuneLength = 3,
importance = TRUE
)
print(rf_cv)
#> Random Forest
#>
#> 53 samples
#> 16 predictors
#>
#> No pre-processing
#> Resampling: Cross-Validated (3 fold)
#> Summary of sample sizes: 36, 34, 36
#> Resampling results across tuning parameters:
#>
#> mtry RMSE Rsquared MAE
#> 2 444.8157 0.8580149 161.2283
#> 8 450.7475 0.8548550 163.6862
#> 15 454.0229 0.8542215 165.2292
#>
#> RMSE was used to select the optimal model using the smallest value.
#> The final value used for the model was mtry = 2.
# 2. XGBoost
library(xgboost)
mapdata <- st_drop_geometry(mapdata)
mapdata$incidence <- as.numeric(mapdata$incidence)
# Redefine the CV control with 5 folds (used by caret for the SVR model below)
cv_control <- trainControl(
method = "cv",
number = 5
)
# XGBoost cross-validation with xgb.cv()
xgb_cv <- xgb.cv(
params = params,
data = dtrain,
nrounds = 100,
nfold = 5, # 5-fold CV
early_stopping_rounds = 10, # stop if no improvement in 10 rounds
metrics = list("rmse"),
verbose = 0
)
print(xgb_cv)
#> ##### xgb.cv 5-folds
#> iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
#> <int> <num> <num> <num> <num>
#> 1 869.239393 127.3707020 790.2016 556.6500
#> 2 811.895853 113.6967223 753.6544 548.5258
#> 3 758.695958 101.1327812 719.3175 541.3886
#> 4 709.258868 89.5985897 687.8282 534.0328
#> 5 663.309677 79.0397616 658.7448 527.2620
#> 6 620.545640 69.4028269 632.3349 520.5086
#> 7 580.734477 60.6017161 608.0614 514.3051
#> 8 543.630439 52.5890340 585.6143 508.4693
#> 9 509.061361 45.2924036 565.3273 502.8183
#> 10 476.811169 38.6662221 545.6182 498.5917
#> 11 446.728563 32.6587860 527.4934 495.3552
#> 12 418.645545 27.2396970 510.8609 491.8151
#> 13 392.428753 22.3383309 495.3064 488.7539
#> 14 367.936479 17.9487507 480.7644 485.8850
#> 15 345.050737 14.0417407 467.7380 482.9319
#> 16 323.657820 10.5989823 455.5772 480.3232
#> 17 303.658221 7.6468438 444.2999 478.0095
#> 18 284.949110 5.3008448 433.7681 476.0269
#> 19 267.446830 3.9041583 423.8386 474.4026
#> 20 251.072552 3.8834340 414.6807 472.9013
#> 21 235.748441 4.8799317 406.5349 471.3381
#> 22 221.402044 6.2001606 398.7159 470.1287
#> 23 207.972113 7.5265587 391.4943 469.2266
#> 24 195.395832 8.7500043 384.7359 468.2956
#> 25 183.618137 9.8448664 378.4595 467.4727
#> 26 172.579433 10.8036044 372.8544 466.6354
#> 27 162.233649 11.6317973 367.5725 465.6864
#> 28 152.531762 12.3298351 362.3877 464.9543
#> 29 143.434085 12.9128515 357.7527 464.1551
#> 30 134.899409 13.3869674 353.4512 463.4164
#> 31 126.894472 13.7625645 349.4280 462.7497
#> 32 119.383493 14.0499628 345.6474 462.1675
#> 33 112.336085 14.2548206 342.1012 461.6551
#> 34 105.721359 14.3870126 338.8096 461.1720
#> 35 99.516489 14.4496212 335.7164 460.7433
#> 36 93.683744 14.4612375 332.8313 460.3522
#> 37 88.207532 14.4191607 330.1307 460.0082
#> 38 83.066339 14.3300654 327.5992 459.7049
#> 39 78.239213 14.2001212 325.2061 459.4532
#> 40 73.706952 14.0344257 322.9586 459.2320
#> 41 69.453772 13.8329782 320.8131 459.0585
#> 42 65.455340 13.6042703 318.8138 458.8997
#> 43 61.694388 13.3553540 316.8836 458.7848
#> 44 58.161545 13.0847456 315.0938 458.6745
#> 45 54.843352 12.7942086 313.4106 458.5800
#> 46 51.727013 12.4870438 311.8410 458.4893
#> 47 48.800109 12.1668617 310.3461 458.4239
#> 48 46.050852 11.8341907 308.9519 458.3600
#> 49 43.470091 11.4911414 307.6304 458.3118
#> 50 41.045067 11.1414634 306.3959 458.2640
#> 51 38.766216 10.7874114 305.2391 458.2214
#> 52 36.628218 10.4270416 304.1478 458.1830
#> 53 34.619590 10.0648898 303.1232 458.1492
#> 54 32.735149 9.7000239 302.1662 458.1140
#> 55 30.967443 9.3330799 301.2729 458.0778
#> 56 29.310898 8.9641345 300.4338 458.0435
#> 57 27.756956 8.5961935 299.6458 458.0095
#> 58 26.298972 8.2304126 298.9090 457.9752
#> 59 24.933027 7.8658598 298.2208 457.9401
#> 60 23.653849 7.5020737 297.5853 457.8992
#> 61 22.451003 7.1460083 296.9914 457.8580
#> 62 21.309321 6.8065028 296.4383 457.8143
#> 63 20.226651 6.4820813 295.9174 457.7738
#> 64 19.202495 6.1707325 295.3967 457.7511
#> 65 18.228768 5.8757562 294.9482 457.7059
#> 66 17.310864 5.5886756 294.4956 457.6777
#> 67 16.437824 5.3168546 294.0978 457.6350
#> 68 15.614714 5.0534595 293.7217 457.5942
#> 69 14.832464 4.8022642 293.3740 457.5513
#> 70 14.086469 4.5660667 293.0471 457.5093
#> 71 13.381798 4.3394510 292.7398 457.4681
#> 72 12.712208 4.1234158 292.4562 457.4248
#> 73 12.076774 3.9185987 292.1869 457.3840
#> 74 11.472889 3.7233422 291.9417 457.3399
#> 75 10.899541 3.5380380 291.7111 457.2972
#> 76 10.355206 3.3622312 291.4919 457.2569
#> 77 9.837891 3.1947687 291.2877 457.2165
#> 78 9.344513 3.0377197 291.1026 457.1739
#> 79 8.878307 2.8866616 290.9245 457.1346
#> 80 8.435075 2.7427897 290.7651 457.0920
#> 81 8.014227 2.6064334 290.6128 457.0525
#> 82 7.614482 2.4769421 290.4711 457.0131
#> 83 7.234784 2.3536897 290.3374 456.9753
#> 84 6.873823 2.2364538 290.2146 456.9371
#> 85 6.531273 2.1254459 290.0973 456.9014
#> 86 6.205946 2.0200509 289.9841 456.8685
#> 87 5.896936 1.9198917 289.8765 456.8371
#> 88 5.603172 1.8247213 289.7767 456.8061
#> 89 5.324408 1.7343520 289.6757 456.7801
#> 90 5.059483 1.6487059 289.5910 456.7492
#> 91 4.808006 1.5671828 289.5027 456.7241
#> 92 4.569061 1.4898766 289.4191 456.7002
#> 93 4.342314 1.4165729 289.3420 456.6764
#> 94 4.126835 1.3469663 289.2688 456.6538
#> 95 3.922361 1.2809817 289.1989 456.6325
#> 96 3.728132 1.2182728 289.1344 456.6112
#> 97 3.543669 1.1588310 289.0731 456.5911
#> 98 3.368373 1.1020500 289.0163 456.5712
#> 99 3.202143 1.0484843 288.9618 456.5526
#> 100 3.044323 0.9976821 288.9115 456.5340
#> iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
#> Best iteration:
#> iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
#> <int> <num> <num> <num> <num>
#> 100 3.044323 0.9976821 288.9115 456.534
best_nrounds <- xgb_cv$best_iteration
cat("Best number of rounds:", best_nrounds, "\n")
#> Best number of rounds:
# Extract mean RMSE
mean_rmse <- min(xgb_cv$evaluation_log$test_rmse_mean)
cat("XGBoost CV RMSE:", mean_rmse, "\n")
#> XGBoost CV RMSE: 288.9115
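# Depending on the xgboost version, best_iteration can be NULL when early
# stopping never triggers (the empty cat() output above suggests that here).
# A fallback (a sketch) is to take the iteration with the lowest mean test RMSE:
best_nrounds <- which.min(xgb_cv$evaluation_log$test_rmse_mean)
cat("Best number of rounds (from evaluation log):", best_nrounds, "\n")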
# 3. Support Vector Regression (SVR)
library(e1071)
library(kernlab)
svr_cv <- train(
incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
magea + mageb + magec + yrb + yrc + yrd + yre,
data = mapdata,
method = "svmRadial",
trControl = cv_control,
preProcess = c("center", "scale"), # SVR often benefits from scaling
tuneLength = 3
)
print(svr_cv)
#> Support Vector Machines with Radial Basis Function Kernel
#>
#> 53 samples
#> 15 predictors
#>
#> Pre-processing: centered (15), scaled (15)
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 43, 42, 42, 43, 42
#> Resampling results across tuning parameters:
#>
#> C RMSE Rsquared MAE
#> 0.25 713.0466 0.4939345 363.8929
#> 0.50 704.6990 0.4971961 355.2001
#> 1.00 701.0369 0.5012774 350.6448
#>
#> Tuning parameter 'sigma' was held constant at a value of 17.63942
#> RMSE was used to select the optimal model using the smallest value.
#> The final values used for the model were sigma = 17.63942 and C = 1.
## Spatial maps of predicted values of each model
# 1. Random Forest Spatial Map
mapdata <- inner_join(africa_shp, panc_incidence, by = "NAME")
mapdata$pred_rf <- predict(rf_model, newdata = mapdata)
tm_shape(mapdata) +
tm_fill("pred_rf", fill.scale =tm_scale_intervals(values = "brewer.greens", style = "quantile"),
fill.legend = tm_legend(title = "Inci_pred_rf")) + tm_borders(fill_alpha = .2) +
tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE)
# 2. XGBoost Spatial Map
# Reuse the model matrix from training (its rows align with mapdata after the re-join)
mapdata$pred_xgb <- predict(xgb_model, newdata = x_vars)
tm_shape(mapdata) +
tm_fill("pred_xgb", fill.scale =tm_scale_intervals(values = "brewer.purples", style = "quantile"),
fill.legend = tm_legend(title = "Inci_pred_xgb")) + tm_borders(fill_alpha = .2) +
tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE)
# 3. Support Vector Regression (SVR) Spatial Map
mapdata$pred_svr <- predict(svr_model, newdata = mapdata)
tm_shape(mapdata) +
tm_fill("pred_svr", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Inci_pred_svr")) + tm_borders(fill_alpha = .2) +
tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE)
# Compare Side by Side
tmap_arrange(
tm_shape(mapdata) +
tm_fill("pred_rf", fill.scale =tm_scale_intervals(values = "brewer.greens", style = "quantile"),
fill.legend = tm_legend(title = "Inci_pred_rf")) + tm_borders(fill_alpha = .2) +
tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE),
tm_shape(mapdata) +
tm_fill("pred_xgb", fill.scale =tm_scale_intervals(values = "brewer.purples", style = "quantile"),
fill.legend = tm_legend(title = "Inci_pred_xgb")) + tm_borders(fill_alpha = .2) +
tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE),
tm_shape(mapdata) +
tm_fill("pred_svr", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Inci_pred_svr")) + tm_borders(fill_alpha = .2) +
tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE),
nrow = 1)
# Predicted Residuals
# This assumes all three models are trained and mapdata$incidence holds the observed values.
### Step 1: Generate predictions and residuals for each model
# Random Forest
rf_preds <- predict(rf_model, newdata = mapdata)
rf_resid <- mapdata$incidence - rf_preds
# XGBoost
xgb_preds <- predict(xgb_model, newdata = x_vars) # x_vars = model.matrix(...)
xgb_resid <- mapdata$incidence - xgb_preds
# SVR
svr_preds <- predict(svr_model, newdata = mapdata)
svr_resid <- mapdata$incidence - svr_preds
### Step 2: Combine into a single data frame
residuals_df <- data.frame(
actual = mapdata$incidence,
rf_pred = rf_preds,
rf_resid = rf_resid,
xgb_pred = xgb_preds,
xgb_resid = xgb_resid,
svr_pred = svr_preds,
svr_resid = svr_resid
)
# Export the residuals table (the file name is just an example)
library(writexl)
write_xlsx(residuals_df, "model_residuals.xlsx")
### Compare residual distributions
boxplot(residuals_df$rf_resid, residuals_df$xgb_resid, residuals_df$svr_resid,
names = c("RF", "XGB", "SVR"),
main = "Model Residuals",
ylab = "Prediction Error (Residual)")## Spatial maps of residual values from each model
#Add residuals to mapdata
#You should already have these from the previous steps:
mapdata$rf_resid <- residuals_df$rf_resid
mapdata$xgb_resid <- residuals_df$xgb_resid
mapdata$svr_resid <- residuals_df$svr_resid
# Set tmap mode to plot (static map)
tmap_mode("plot")
# Create individual residual maps
map_rf <- tm_shape(mapdata) +
tm_fill("rf_resid", fill.scale =tm_scale_intervals(values = "brewer.greens", style = "quantile",
midpoint = 0), fill.legend = tm_legend(title = "Inci_rf_resid")) +
tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE)
map_xgb <- tm_shape(mapdata) +
tm_fill("xgb_resid", fill.scale =tm_scale_intervals(values = "brewer.purples", style = "quantile",
midpoint = 0), fill.legend = tm_legend(title = "Inci_xgb_resid")) + tm_borders(fill_alpha = .2) +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE)
map_svr <- tm_shape(mapdata) +
tm_fill("svr_resid", fill.scale = tm_scale_intervals(values = "brewer.reds", style = "quantile",
midpoint = 0), fill.legend = tm_legend(title = "Inci_svr_resid")) + tm_borders(fill_alpha = .2) +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE)
#Step 3: Combine maps in a grid
# Combine maps in a grid layout
tmap_arrange(map_rf, map_xgb, map_svr, nrow = 1)
## Barplot and Spatial maps for RMSE and MAE
#Step 1: Calculate RMSE and MAE for each model
# Random Forest
rf_metrics <- postResample(pred = rf_preds, obs = mapdata$incidence)
# XGBoost
xgb_metrics <- postResample(pred = xgb_preds, obs = mapdata$incidence)
# SVR
svr_metrics <- postResample(pred = svr_preds, obs = mapdata$incidence)
#Step 2: Combine into a summary data frame
model_eval <- data.frame(
Model = c("Random Forest", "XGBoost", "SVR"),
RMSE = c(rf_metrics["RMSE"], xgb_metrics["RMSE"], svr_metrics["RMSE"]),
MAE = c(rf_metrics["MAE"], xgb_metrics["MAE"], svr_metrics["MAE"]),
Rsquared = c(rf_metrics["Rsquared"], xgb_metrics["Rsquared"], svr_metrics["Rsquared"])
)
print(model_eval)
#> Model RMSE MAE Rsquared
#> 1 Random Forest 98.219710 30.2558297 0.9976235
#> 2 XGBoost 2.100454 0.7290456 0.9999983
#> 3 SVR 445.937234 127.7105534 0.9178101
#Visualize MAE and RMSE
oldpar <- par(mfrow = c(1, 3))
#Barplot of RMSE
barplot(model_eval$RMSE, names.arg = model_eval$Model,
col = "skyblue", las = 1, main = "Model RMSE", ylab = "RMSE")
#Barplot of MAE
barplot(model_eval$MAE, names.arg = model_eval$Model,
col = "lightgreen", las = 1, main = "Model MAE", ylab = "MAE")
#Barplot of Rsquared
barplot(model_eval$Rsquared, names.arg = model_eval$Model,
col = "grey", las = 1, main = "Model Rsquared", ylab = "Rsquared")
par(oldpar)
#Add metrics to residual maps as captions
map_rf <- tm_shape(mapdata) +
tm_fill("rf_resid", fill.scale = tm_scale_intervals(values = "brewer.greens", midpoint = 0),
fill.legend = tm_legend(title = paste0("rf_resid\nRMSE: ", round(rf_metrics["RMSE"], 2),
"\nMAE: ", round(rf_metrics["MAE"], 2)))) +
tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))
map_xgb <- tm_shape(mapdata) +
tm_fill("xgb_resid", fill.scale = tm_scale_intervals(values = "-RdBu", midpoint = 0),
fill.legend = tm_legend(title = paste0("xgb_resid\nRMSE: ", round(xgb_metrics["RMSE"], 2),
"\nMAE: ", round(xgb_metrics["MAE"], 2)))) +
tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))
map_svr <- tm_shape(mapdata) +
tm_fill("svr_resid", fill.scale = tm_scale_intervals(values = "-RdBu", midpoint = 0),
fill.legend = tm_legend(title = paste0("svr_resid\nRMSE: ", round(svr_metrics["RMSE"], 2),
"\nMAE: ", round(svr_metrics["MAE"], 2)))) +
tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))
tmap_arrange(map_rf, map_xgb, map_svr, nrow = 1)
# Global Moran's I, Local Moran's I (LISA), cluster categories (e.g., High-High, Low-Low), and maps:
# Moran's I map, LISA clusters, and High-High clusters, using the predicted values from the three machine learning models
# Assumptions: the spatial data is in mapdata (an sf object).
# Predicted values for each model are already stored in: rf_preds, xgb_preds, svr_preds
#STEP 1: Create Spatial Weights (if not done yet)
neighbors <- poly2nb(mapdata) # if this gives a warning, use the code below
mapdata <- st_as_sf(mapdata) # ensure it is an sf object
mapdata <- st_make_valid(mapdata) # fix any invalid geometries
neighbors <- poly2nb(mapdata, snap = 1e-15) ## try 1e-6, 1e-5, or higher if needed; increase snap incrementally until the warnings disappear
listw <- nb2listw(neighbors, style = "W", zero.policy = TRUE)
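# It also helps to confirm how many regions ended up with no neighbours
# (island states such as Cabo Verde typically do); spdep::card() returns
# each region's neighbour count. These are the "no-neighbour observations"
# mentioned in the Moran test output below:
table(card(neighbors) == 0)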
#STEP 2: Define a function to compute Moran’s I and cluster categories
analyze_spatial_autocorrelation <- function(mapdata, values, listw, model_name, signif_level = 0.05) {
# Standardize predicted values
mapdata$val_st <- scale(values)[, 1]
# Compute lag
mapdata$lag_val <- lag.listw(listw, mapdata$val_st, zero.policy = TRUE)
# Global Moran's I
global_moran <- moran.test(values, listw, zero.policy = TRUE)
# Local Moran's I (LISA)
lisa <- localmoran(values, listw, zero.policy = TRUE)
lisa_df <- as.data.frame(lisa)
#rename p-value column
colnames(lisa_df)[5] <- "Pr_z"
# Add to mapdata
mapdata$Ii <- lisa_df$Ii
mapdata$Z_Ii <- lisa_df$Z.Ii
mapdata$Pr_z <- lisa_df$Pr_z
#mapdata$Pr_z <- lisa_df[, "Pr(z > 0)"]
# Classify clusters
mapdata <- mapdata %>%
mutate(
cluster = case_when(
val_st > 0 & lag_val > 0 & Pr_z <= signif_level ~ "High-High",
val_st < 0 & lag_val < 0 & Pr_z <= signif_level ~ "Low-Low",
val_st < 0 & lag_val > 0 & Pr_z <= signif_level ~ "Low-High",
val_st > 0 & lag_val < 0 & Pr_z <= signif_level ~ "High-Low",
TRUE ~ "Not Significant"
)
)
return(list(
map = mapdata,
global_moran = global_moran
))
}
#STEP 3: Run the function for each model
rf_result <- analyze_spatial_autocorrelation(mapdata, rf_preds, listw, "Random Forest")
xgb_result <- analyze_spatial_autocorrelation(mapdata, xgb_preds, listw, "XGBoost")
svr_result <- analyze_spatial_autocorrelation(mapdata, svr_preds, listw, "SVR")
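# A quick tabulation (optional) shows how many countries fall in each
# cluster category before mapping:
table(rf_result$map$cluster)
table(xgb_result$map$cluster)
table(svr_result$map$cluster)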
#STEP 4: Mapping LISA Clusters and High-High Areas for Random Forest
tmap_mode("plot")
# LISA Cluster Map
tm_rf <- tm_shape(rf_result$map) +
tm_fill(
"cluster",
fill.scale = tm_scale(values = c(
"High-High" = "red",
"Low-Low" = "blue",
"Low-High" = "lightblue",
"High-Low" = "pink",
"Not Significant" = "gray90")),
fill.legend = tm_legend(title = "LISA Clusters - RF")) +
tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))
# Mapping LISA Clusters and High-High Areas for XGBoost
tmap_mode("plot")
# LISA Cluster Map
tm_xgb <- tm_shape(xgb_result$map) +
tm_fill("cluster",
fill.scale = tm_scale(values = c(
"High-High" = "red",
"Low-Low" = "blue",
"Low-High" = "lightblue",
"High-Low" = "pink",
"Not Significant" = "gray90")),
fill.legend = tm_legend(title = "LISA Clusters - XGBoost")) +
tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))
#Mapping LISA Clusters and High-High Areas for SVR
tmap_mode("plot")
# LISA Cluster Map
tm_svr <- tm_shape(svr_result$map) +
tm_fill("cluster",
fill.scale = tm_scale(values = c(
"High-High" = "red",
"Low-Low" = "blue",
"Low-High" = "lightblue",
"High-Low" = "pink",
"Not Significant" = "gray90")),
fill.legend = tm_legend(title = "LISA Clusters - SVR")) +
tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))
tmap_arrange(tm_rf, tm_xgb, tm_svr, nrow = 1)
# The three LISA cluster maps are arranged side by side with tmap_arrange().
#View Global Moran’s I Results
#These print the test statistic and p-values for global spatial autocorrelation of predictions.
rf_result$global_moran
#>
#> Moran I test under randomisation
#>
#> data: values
#> weights: listw
#> n reduced by no-neighbour observations
#>
#> Moran I statistic standard deviate = -0.24595, p-value = 0.5971
#> alternative hypothesis: greater
#> sample estimates:
#> Moran I statistic Expectation Variance
#> -0.044681255 -0.022727273 0.007967958
xgb_result$global_moran
#>
#> Moran I test under randomisation
#>
#> data: values
#> weights: listw
#> n reduced by no-neighbour observations
#>
#> Moran I statistic standard deviate = -0.32513, p-value = 0.6275
#> alternative hypothesis: greater
#> sample estimates:
#> Moran I statistic Expectation Variance
#> -0.051250814 -0.022727273 0.007696578
svr_result$global_moran
#>
#> Moran I test under randomisation
#>
#> data: values
#> weights: listw
#> n reduced by no-neighbour observations
#>
#> Moran I statistic standard deviate = -0.28034, p-value = 0.6104
#> alternative hypothesis: greater
#> sample estimates:
#> Moran I statistic Expectation Variance
#> -0.05111299 -0.02272727 0.01025262