The hardware and bandwidth for this mirror are donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our WordPress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.
ggmlR includes a PyTorch-style dynamic autograd engine built on top
of ggml tensors. Gradients are computed via reverse-mode AD on a tape
recorded inside with_grad_tape({}).
Tensor layout: all autograd tensors are
column-major — shape is [features, batch],
matching ggml’s native C layout.
ag_tensor and ag_param
with_grad_tape({
h <- ag_relu(ag_add(ag_matmul(W, x), b)) # [2, 3]
loss <- ag_mean(ag_mul(h, h)) # scalar MSE-like
})
grads <- backward(loss) # returns named list of gradients
cat("dL/dW:\n"); print(grads[["W"]])
cat("dL/db:\n"); print(grads[["b"]])
backward() returns gradients keyed by the parameter’s
variable name.
| Function | Description |
|---|---|
| `ag_matmul(A, B)` | Matrix multiply |
| `ag_add(A, B)` | Element-wise add (broadcast supported) |
| `ag_mul(A, B)` | Element-wise multiply |
| `ag_relu(x)` | ReLU activation |
| `ag_sigmoid(x)` | Sigmoid activation |
| `ag_tanh(x)` | Tanh activation |
| `ag_gelu(x)` | GELU activation |
| `ag_softmax(x)` | Softmax (column-wise) |
| `ag_sum(x)` | Sum all elements → scalar |
| `ag_mean(x)` | Mean all elements → scalar |
| `ag_transpose(x)` | Transpose [m,n] → [n,m] |
| `ag_reshape(x, dims)` | Reshape tensor |
| `ag_softmax_cross_entropy_loss(logits, y)` | Fused softmax + cross-entropy |
ag_sequential — module API
ag_sequential stacks layers into a callable module with
.forward() and .parameters().
# Build a reproducible 120/30 train/validation split of iris in
# column-major layout (one column per sample).
data(iris)
set.seed(42)
x_all <- t(scale(as.matrix(iris[, seq_len(4L)])))  # [4, 150]
y_oh <- model.matrix(~ Species - 1, data = iris)   # one-hot rows, [150, 3]
y_all <- t(y_oh)                                   # [3, 150]
idx <- sample(150L)
x_tr <- x_all[, head(idx, 120L)]
x_vl <- x_all[, tail(idx, 30L)]
y_tr <- y_all[, head(idx, 120L)]
y_vl <- y_all[, tail(idx, 30L)]
model <- ag_sequential(
ag_linear(4L, 64L, activation = "relu"),
ag_batch_norm(64L),
ag_dropout(0.3),
ag_linear(64L, 32L, activation = "relu"),
ag_linear(32L, 3L)
)
params <- model$parameters()
cat("Parameter tensors:", length(params), "\n")
# Adam (default lr = 1e-3)
opt <- optimizer_adam(params, lr = 1e-3)
# SGD with momentum
opt_sgd <- optimizer_sgd(params, lr = 0.05, momentum = 0.9)
BS <- 32L
n <- ncol(x_tr)
ag_train(model) # set training mode (enables dropout, batch norm train)
set.seed(1)
for (ep in seq_len(150L)) {
perm <- sample(n)
for (b in seq_len(ceiling(n / BS))) {
idx <- perm[seq((b-1L)*BS + 1L, min(b*BS, n))]
xb <- ag_tensor(x_tr[, idx, drop = FALSE])
yb <- y_tr[, idx, drop = FALSE]
with_grad_tape({
loss <- ag_softmax_cross_entropy_loss(model$forward(xb), yb)
})
grads <- backward(loss)
opt$step(grads)
opt$zero_grad()
}
if (ep %% 50L == 0L)
cat(sprintf("epoch %3d loss %.4f\n", ep, loss$data[1]))
}
opt2 <- optimizer_adam(params, lr = 1e-3)
# Cosine annealing: lr goes from lr_max to lr_min over T_max epochs
sch_cos <- lr_scheduler_cosine(opt2, T_max = 150L, lr_min = 1e-5)
# Step decay: multiply lr by gamma every step_size epochs
sch_step <- lr_scheduler_step(opt2, step_size = 30L, gamma = 0.5)
# Call after each epoch:
# sch_cos$step()
with_grad_tape({
loss <- ag_softmax_cross_entropy_loss(model$forward(ag_tensor(x_tr)), y_tr)
})
grads <- backward(loss)
# Clip global gradient norm to max_norm
clip_grad_norm(params, grads, max_norm = 5.0)
opt$step(grads)
opt$zero_grad()
ag_dataloader shuffles and batches column-major data
matrices:
dl <- ag_dataloader(x_tr, y_tr, batch_size = BS, shuffle = TRUE)
ag_train(model)
for (ep in seq_len(100L)) {
for (batch in dl$epoch()) {
with_grad_tape({
loss <- ag_softmax_cross_entropy_loss(model$forward(batch$x), batch$y$data)
})
grads <- backward(loss)
opt$step(grads); opt$zero_grad()
}
}
ag_eval(model) # disables dropout, switches batch norm to inference stats
# Forward in chunks to avoid memory pressure
#' Predict class probabilities in column chunks
#'
#' Runs `mod$forward()` on successive column slices of `x_cm` and applies a
#' numerically stable softmax to every column of the returned logits.
#'
#' @param mod Model object exposing `forward()`; the `$data` element of its
#'   result is used as a logits matrix with one column per sample.
#' @param x_cm Column-major input matrix (one sample per column).
#' @param chunk Maximum number of columns forwarded per call.
#' @return Matrix of probabilities with one column per column of `x_cm`
#'   (each column sums to 1), or `NULL` when `x_cm` has no columns.
predict_cm <- function(mod, x_cm, chunk = 64L) {
  n <- ncol(x_cm)
  if (n == 0L) {
    return(NULL)
  }
  starts <- seq(1L, n, by = chunk)
  pieces <- lapply(starts, function(s) {
    e <- min(s + chunk - 1L, n)
    lg <- mod$forward(ag_tensor(x_cm[, s:e, drop = FALSE]))$data
    # Shift by each column's own max before exp() for stability. sweep() is
    # needed here: `matrix - vector` recycles the vector down the rows, which
    # would pair elements with another column's max (same for the division).
    ev <- exp(sweep(lg, 2L, apply(lg, 2L, max), "-"))
    sweep(ev, 2L, colSums(ev), "/")
  })
  # Bind all chunks at once instead of growing the result inside a loop.
  do.call(cbind, pieces)
}
probs <- predict_cm(model, x_vl) # [3, 30]
# Data is column-major (one column per sample), so the arg-max is taken down
# each column (MARGIN = 2) for both the predictions and the one-hot labels.
preds <- apply(probs, 2L, which.max)
truth <- apply(y_vl, 2L, which.max)
cat(sprintf("Val accuracy: %.4f\n", mean(preds == truth)))
ag_param — full manual control
For complete flexibility, build the network from scratch:
set.seed(7)
W1 <- ag_param(matrix(rnorm(64*4) * sqrt(2/4), 64, 4))
b1 <- ag_param(matrix(0.0, 64, 1))
W2 <- ag_param(matrix(rnorm(3*64) * sqrt(2/64), 3, 64))
b2 <- ag_param(matrix(0.0, 3, 1))
forward <- function(x)
ag_add(ag_matmul(W2, ag_relu(ag_add(ag_matmul(W1, x), b1))), b2)
opt_raw <- optimizer_adam(list(W1=W1, b1=b1, W2=W2, b2=b2), lr = 1e-3)
for (ep in seq_len(200L)) {
perm <- sample(n)
for (b in seq_len(ceiling(n / BS))) {
idx <- perm[seq((b-1L)*BS+1L, min(b*BS, n))]
xb <- ag_tensor(x_tr[, idx, drop = FALSE])
yb <- y_tr[, idx, drop = FALSE]
with_grad_tape({ loss_r <- ag_softmax_cross_entropy_loss(forward(xb), yb) })
gr <- backward(loss_r)
opt_raw$step(gr); opt_raw$zero_grad()
}
}
# f16 on GPU, f32 on CPU
device <- tryCatch({ ag_device("gpu"); "gpu" }, error = function(e) "cpu")
ag_dtype(if (device == "gpu") "f16" else "f32")
# All subsequent ag_param / ag_tensor use the selected dtype
See also vignette("gpu-vulkan", package = "ggmlR") for
device management.
For multi-GPU or faster single-GPU training see
dp_train():
See also
vignette("data-parallel-training", package = "ggmlR").
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.