The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.
library(graphicalExtremes)
library(ggplot2)
library(igraph)
C:4cd061984267.R
The flights
dataset contains daily total delays of major U.S. airlines.
For details, see the corresponding documentation page.
Below, we plot all airports in the dataset.
plotFlights(plotConnections = FALSE, map = "world", xyRatio = 2)
C:4cd061984267.R
To perform an example application, we follow Section 6 of Hentschel et al. (2022).
The subset of airports analyzed there is obtained through a previous clustering step,
whose results are available through the function getFlightDelayData()
.
For more detailed explanations of the individual methods see Vignette “applicationDanube” and the help pages of applied functions.
Note: Due to size restrictions, the CRAN version of this package contains only a part of the full dataset.
A version of this package with the complete dataset is available on GitHub.
# Get IATAs from Texas cluster
<- getFlightDelayData('IATAs', 'tcCluster')
IATAs
# Plot airports + connections (indicating number of flights by thickness)
plotFlights(
IATAs,useAirportNFlights = TRUE,
useConnectionNFlights = TRUE
)
C:4cd061984267.R
For hyperparameter tuning and model comparison purposes, we will use a train-test-split, utilizing some data to estimate the graph structure and parameter matrix, and the remaining data to compute a likelihood value.
# Check whether all dates from the train-test-split are available
# (due to size restrictions, the CRAN version of this package does not contain all dates)
<- tryCatch({
allDatesAvailable getFlightDelayData('dates', dateFilter = c('tcAll'))
TRUE
error = function(...) FALSE)
}, cat('All dates avilable:', allDatesAvailable, '\n')
#> All dates avilable: FALSE
# Create train and test data sets
if(allDatesAvailable){
# Use train-test-split and threshold p from article
<- drop(getFlightDelayData('delays', 'tcCluster', 'tcTrain'))
matEst <- drop(getFlightDelayData('delays', 'tcCluster', 'tcTest'))
matVal <- 0.95
p else {
} # Take all available dates that do not contain NAs
<- drop(getFlightDelayData('delays', 'tcCluster', 'all'))
mat <- apply(is.na(mat), 1, any)
rowHasNA <- mat[!rowHasNA, ]
mat
# Split dates in half
<- floor(nrow(mat)/2)
splitInd <- mat[1:splitInd,]
matEst <- mat[(splitInd+1):nrow(mat),]
matVal
# Use a lower threshold to compensate for the smaller dataset
<- 0.8
p }
C:4cd061984267.R
This function will be used later to compare fitted parameters to empirical ones.
# Compute the empirical extremal correlation matrix
<- emp_chi(matEst, p = p)
emp_chi_mat
# Utility function to plot fitted parameters against true/empirical ones
<- function(G0, G1, xlab = 'Empirical', ylab = 'Fitted'){
plot_fitted_params return(
ggplot()
+ geom_point(aes(
x = G0[upper.tri(G0)],
y = G1[upper.tri(G1)]
))+ geom_abline(slope = 1, intercept = 0)
+ xlab(xlab)
+ ylab(ylab)
) }
C:4cd061984267.R
As a baseline for graphical modelling, we consider the graph with edges representing at least monthly connections between airports.
# Compute undirected flights per connection between selected airports
<- dim(flights$flightCounts)[3]
nYears <- apply(flights$flightCounts[IATAs,IATAs,], c(1, 2), sum)
flightsPerConnection <- flightsPerConnection + t(flightsPerConnection)
flightsPerConnectionUD
# Make flight graph from adjacency matrix
<- 1 * (flightsPerConnectionUD > nYears * 12)
A <- graph_from_adjacency_matrix(A, diag = FALSE, mode = "undirected")
flight_graph
# Plot flight graph
plotFlights(IATAs, graph = flight_graph, clipMap = 1.3, xyRatio = 1)
C:4cd061984267.R
Given the flight_graph
object, we fit a Hüsler–Reiss model with that graphical structure.
# Fit the model
<- fmpareto_graph_HR(
model_fit data = matEst,
graph = flight_graph,
p = p,
method = "vario"
)
# Compute likelihood/ICs
<- loglik_HR(
flights_loglik_graph data = matVal,
p = p,
graph = flight_graph,
Gamma = model_fit
)cat("Flight graph test-loglikelihood =", round(flights_loglik_graph['loglik'], 2), "\n")
#> Flight graph test-loglikelihood = -7831.91
# Plot fitted parameters
plot_fitted_params(emp_chi_mat, Gamma2chi(model_fit))
C:4cd061984267.R
Next, we fit an extremal tree model to the flight delays using emst()
.
# Fit the model
<- emst(data = matEst, p = p, method = "vario")
flights_emst_fit
# Compute likelihood/ICs
<- loglik_HR(
flights_loglik_tree data = matVal,
p = p,
Gamma = flights_emst_fit$Gamma,
graph = flights_emst_fit$graph
)cat("Tree test-loglikelihood =", round(flights_loglik_tree['likelihood'], 2), "\n")
#> Tree test-loglikelihood = NA
# Plot fitted graph, parameters
plotFlights(
IATAs,graph = flights_emst_fit$graph,
xyRatio = 1,
clipMap = 1.3
)plot_fitted_params(emp_chi_mat, Gamma2chi(flights_emst_fit$Gamma))
C:4cd061984267.R
Lastly, we fit a general graphical model with eglearn()
, using a suitable list of penalization parameters.
# Fit the model
<- seq(0, 0.50, length.out = 11)
rholist <- eglearn(matEst, p = p, rholist = rholist, complete_Gamma = TRUE)
flights_eglearn_fit
# Compute likelihood/ICs
<- sapply(seq_along(rholist), FUN = function(j) {
flights_loglik loglik_HR(
data = matVal,
p = p,
Gamma = flights_eglearn_fit$Gamma[[j]],
graph = flights_eglearn_fit$graph[[j]]
) })
C:4cd061984267.R
The “best” penalization parameter can be chosen e.g. such that the test-likelihood is maximized
ggplot(
mapping = aes(x = rholist, y = flights_loglik['loglik', ])) +
geom_line() +
geom_point(shape = 21, size = 3, stroke = 1, fill = "white") +
geom_hline(aes(yintercept = flights_loglik_graph['loglik']), lty = "dashed") +
geom_hline(aes(yintercept = flights_loglik_tree['loglik']), lty = "dotted") +
xlab("rho") +
ylab("Log-likelihood") +
scale_x_continuous(
breaks = rholist,
labels = round(rholist, 3),
sec.axis = sec_axis(
trans = ~., breaks = rholist,
labels = sapply(
$graph,
flights_eglearn_fit::gsize
igraph
),name = "Number of edges"
) )
<- which.max(flights_loglik['loglik',])
best_index cat('Best rho =', rholist[best_index], '\n')
#> Best rho = 0.1
cat('Corresponding test-loglikelihood =', flights_loglik['loglik', best_index])
#> Corresponding test-loglikelihood = -7263.573
C:4cd061984267.R
We plot the estimated graph and parameters of the best fitted model.
<- flights_eglearn_fit$graph[[best_index]]
best_graph <- flights_eglearn_fit$Gamma[[best_index]]
best_Gamma plotFlights(IATAs, graph = best_graph, clipMap = 1.3, xyRatio = 1)
plot_fitted_params(emp_chi_mat, Gamma2chi(best_Gamma))
C:4cd061984267.R
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.