ggmlR supports multi-GPU inference through the backend scheduler API. This allows you to distribute computations across multiple GPUs for improved performance with large models.
First, check how many GPUs are available:
library(ggmlR)
if (ggml_vulkan_available()) {
  n_gpus <- ggml_vulkan_device_count()
  cat("Available GPUs:", n_gpus, "\n\n")

  for (i in seq_len(n_gpus)) {
    cat("GPU", i - 1, ":", ggml_vulkan_device_description(i - 1), "\n")
    mem_gb <- ggml_vulkan_device_memory(i - 1) / 1024^3
    cat(" Memory:", round(mem_gb, 2), "GB\n")
  }
}

The scheduler automatically distributes work across backends:
if (ggml_vulkan_available() && ggml_vulkan_device_count() >= 2) {
  # Initialize multiple GPU backends
  gpu0 <- ggml_vulkan_init(0)
  gpu1 <- ggml_vulkan_init(1)

  # Create scheduler with multiple backends
  # Order matters: the first backend is preferred for supported operations
  sched <- ggml_backend_sched_new(list(gpu0, gpu1))
  cat("Scheduler created with", ggml_backend_sched_get_n_backends(sched), "backends\n")

  # Check backends
  for (i in seq_len(ggml_backend_sched_get_n_backends(sched))) {
    backend <- ggml_backend_sched_get_backend(sched, i - 1)
    cat("Backend", i - 1, ":", ggml_backend_name(backend), "\n")
  }

  # Cleanup
  ggml_backend_sched_free(sched)
  ggml_vulkan_free(gpu0)
  ggml_vulkan_free(gpu1)
}

A common pattern is to use the GPU with the CPU as a fallback for unsupported operations:
if (ggml_vulkan_available()) {
  # Initialize backends
  gpu <- ggml_vulkan_init(0)
  cpu <- ggml_backend_cpu_init()
  ggml_backend_cpu_set_n_threads(cpu, 4)

  # GPU first, CPU as fallback
  sched <- ggml_backend_sched_new(list(gpu, cpu))

  ctx <- ggml_init(64 * 1024 * 1024)

  # Create computation
  a <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1000, 1000)
  b <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1000, 1000)
  c <- ggml_mul_mat(ctx, a, b)
  graph <- ggml_build_forward_expand(ctx, c)

  ggml_backend_sched_reserve(sched, graph)
  ggml_backend_sched_alloc_graph(sched, graph)

  # Check which backend handles each tensor
  cat("\nTensor backend assignment:\n")
  cat(" a:", ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, a)), "\n")
  cat(" b:", ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, b)), "\n")
  cat(" c:", ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, c)), "\n")

  # Cleanup
  ggml_backend_sched_free(sched)
  ggml_vulkan_free(gpu)
  ggml_backend_free(cpu)
  ggml_free(ctx)
}

You can explicitly assign tensors to specific backends:
if (ggml_vulkan_available() && ggml_vulkan_device_count() >= 2) {
  gpu0 <- ggml_vulkan_init(0)
  gpu1 <- ggml_vulkan_init(1)
  sched <- ggml_backend_sched_new(list(gpu0, gpu1))

  ctx <- ggml_init(128 * 1024 * 1024)

  # Create tensors for two parallel computations
  a1 <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 512, 512)
  b1 <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 512, 512)
  c1 <- ggml_mul_mat(ctx, a1, b1)

  a2 <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 512, 512)
  b2 <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 512, 512)
  c2 <- ggml_mul_mat(ctx, a2, b2)

  # Combine results
  result <- ggml_add(ctx, c1, c2)
  graph <- ggml_build_forward_expand(ctx, result)

  # Manually assign tensors to different GPUs
  ggml_backend_sched_set_tensor_backend(sched, a1, gpu0)
  ggml_backend_sched_set_tensor_backend(sched, b1, gpu0)
  ggml_backend_sched_set_tensor_backend(sched, c1, gpu0)
  ggml_backend_sched_set_tensor_backend(sched, a2, gpu1)
  ggml_backend_sched_set_tensor_backend(sched, b2, gpu1)
  ggml_backend_sched_set_tensor_backend(sched, c2, gpu1)

  ggml_backend_sched_reserve(sched, graph)
  ggml_backend_sched_alloc_graph(sched, graph)

  # Set data and compute
  ggml_set_f32(a1, rnorm(512 * 512))
  ggml_set_f32(b1, rnorm(512 * 512))
  ggml_set_f32(a2, rnorm(512 * 512))
  ggml_set_f32(b2, rnorm(512 * 512))
  ggml_backend_sched_graph_compute(sched, graph)

  cat("Multi-GPU computation completed\n")
  cat("Result shape:", ggml_tensor_shape(result), "\n")

  # Cleanup
  ggml_backend_sched_free(sched)
  ggml_vulkan_free(gpu0)
  ggml_vulkan_free(gpu1)
  ggml_free(ctx)
}

For maximum performance, use asynchronous operations:
if (ggml_vulkan_available() && ggml_vulkan_device_count() >= 2) {
  gpu0 <- ggml_vulkan_init(0)
  gpu1 <- ggml_vulkan_init(1)
  sched <- ggml_backend_sched_new(list(gpu0, gpu1))

  ctx <- ggml_init(64 * 1024 * 1024)

  # Build graph
  a <- ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 100000)
  b <- ggml_relu(ctx, a)
  c <- ggml_sum(ctx, b)
  graph <- ggml_build_forward_expand(ctx, c)

  ggml_backend_sched_reserve(sched, graph)
  ggml_backend_sched_alloc_graph(sched, graph)
  ggml_set_f32(a, rnorm(100000))

  # Async compute - returns immediately
  ggml_backend_sched_graph_compute_async(sched, graph)

  # Do other work here while the GPUs compute...
  cat("Computing asynchronously...\n")

  # Wait for completion
  ggml_backend_sched_synchronize(sched)
  cat("Result:", ggml_get_f32(c), "\n")

  # Cleanup
  ggml_backend_sched_free(sched)
  ggml_vulkan_free(gpu0)
  ggml_vulkan_free(gpu1)
  ggml_free(ctx)
}

Tips:

- Call ggml_backend_sched_reserve() before allocation to optimize memory layout
- ggml_vulkan_device_memory() reports how much memory each device has, which is useful when deciding where to place tensors
- ggml_backend_sched_get_n_splits() reports how the scheduler partitioned the graph across backends
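As a rough sketch of the last two tips, the example below picks the Vulkan device that reports the most memory and, after allocating a small graph on a GPU-plus-CPU scheduler, prints the number of graph splits. It only uses calls shown elsewhere in this vignette, plus ggml_backend_sched_get_n_splits(), which is assumed here to take the scheduler and return an integer count:

if (ggml_vulkan_available()) {
  # Pick the Vulkan device that reports the most memory
  n_gpus <- ggml_vulkan_device_count()
  mem <- sapply(seq_len(n_gpus) - 1, ggml_vulkan_device_memory)
  best <- which.max(mem) - 1
  cat("Using device", best, "with", round(mem[best + 1] / 1024^3, 2), "GB\n")

  gpu <- ggml_vulkan_init(best)
  cpu <- ggml_backend_cpu_init()
  sched <- ggml_backend_sched_new(list(gpu, cpu))

  ctx <- ggml_init(16 * 1024 * 1024)
  a <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 256, 256)
  b <- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 256, 256)
  graph <- ggml_build_forward_expand(ctx, ggml_mul_mat(ctx, a, b))

  # Reserve first, then allocate (see tip above)
  ggml_backend_sched_reserve(sched, graph)
  ggml_backend_sched_alloc_graph(sched, graph)

  # More splits generally means more data movement between backends
  cat("Graph splits:", ggml_backend_sched_get_n_splits(sched), "\n")

  ggml_backend_sched_free(sched)
  ggml_vulkan_free(gpu)
  ggml_backend_free(cpu)
  ggml_free(ctx)
}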
See also:

- vignette("vulkan-backend") for single-GPU usage
- vignette("quantization") for memory-efficient models