## ----include = FALSE----------------------------------------------------------
# Document-wide chunk defaults: merge source and output into a single block,
# prefix printed output with "#>", and skip evaluation of every chunk.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE
)

## ----setup--------------------------------------------------------------------
# library(crawlee)

## -----------------------------------------------------------------------------
# pages <- crawler("https://books.toscrape.com/") |>
#   cr_options(max_requests = 100) |>
#   cr_on_html(function(ctx) {
#     ctx$push_data(list(
#       url   = ctx$request$url,
#       title = ctx$page |> rvest::html_element("title") |> rvest::html_text2(),
#       text  = ctx$page |> rvest::html_element("body") |> rvest::html_text2()
#     ))
#     ctx$enqueue_links(glob = "*/catalogue/*")
#   }) |>
#   cr_run() |>
#   cr_collect()

## -----------------------------------------------------------------------------
# chunks <- cr_chunk(pages, text = text, size = 1000, overlap = 200, by = "char")
# chunks
# #> columns: doc_id, chunk_id, chunk, text, n_chars, url, title

## -----------------------------------------------------------------------------
# # A real embedder typically calls an HTTP API (any provider) with httr2:
# embed_fn <- function(texts) {
#   # Returns a numeric matrix: one row per element of `texts`, d columns
#   # (d = length of each embedding vector).
#   resp <- httr2::request("https://api.example.com/v1/embeddings") |>
#     httr2::req_auth_bearer_token(Sys.getenv("EMBEDDINGS_API_KEY")) |>
#     httr2::req_body_json(list(input = texts)) |>
#     httr2::req_perform()
#   do.call(rbind, lapply(httr2::resp_body_json(resp)$data, \(x) unlist(x$embedding)))
# }
# 
# embedded <- cr_embed(chunks, embed_fn, batch_size = 32)

## -----------------------------------------------------------------------------
# # Stand-in embedder: a one-column matrix holding each text's character count.
# fake_embed <- function(x) matrix(nchar(x), nrow = length(x), ncol = 1)
# embedded <- cr_embed(chunks, fake_embed)

## -----------------------------------------------------------------------------
# cr_export(embedded, "corpus.parquet", format = "parquet")
# cr_export(embedded, "corpus.jsonl", format = "jsonl")
# cr_export(embedded, "corpus.duckdb", format = "duckdb", table = "chunks")

## -----------------------------------------------------------------------------
# crawler("https://books.toscrape.com/") |>
#   cr_options(max_requests = 100) |>
#   cr_on_html(function(ctx) {
#     ctx$push_data(list(
#       url  = ctx$request$url,
#       text = ctx$page |> rvest::html_element("body") |> rvest::html_text2()
#     ))
#     ctx$enqueue_links(glob = "*/catalogue/*")
#   }) |>
#   cr_run() |>
#   cr_collect() |>
#   cr_chunk(text = text, size = 1000, overlap = 200) |>
#   cr_embed(embed_fn) |>
#   cr_export("corpus.parquet", format = "parquet")

