The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.
An API-first R package for accessing machine learning models,
embeddings, and datasets on the Hugging
Face Hub via the Hugging Face Inference API. No Python required. The
Inference API serves a curated subset of the Hub’s 500,000+ models – use
hf_check_inference() to verify model availability.
install.packages("huggingfaceR")
# install.packages("devtools")
devtools::install_github("farach/huggingfaceR")Get a free API token by following the Hugging Face access tokens documentation, then configure it in R:
library(huggingfaceR)
hf_set_token("hf_your_token_here", store = TRUE)
hf_whoami()# Sentiment analysis
hf_classify("I love using R for data science!")
#> # A tibble: 1 × 3
#> text label score
#> <chr> <chr> <dbl>
#> 1 I love using R for data science! POSITIVE 1.000
# Zero-shot classification with custom labels
hf_classify_zero_shot(
"I just bought a new laptop for coding",
labels = c("technology", "sports", "politics", "food")
)
#> # A tibble: 4 × 3
#> text label score
#> <chr> <chr> <dbl>
#> 1 I just bought a new laptop for coding technology 0.985
#> 2 I just bought a new laptop for coding sports 0.00661
#> 3 I just bought a new laptop for coding food 0.00485
#> 4 I just bought a new laptop for coding politics 0.00391sentences <- c(
"The cat sat on the mat",
"A feline rested on the rug",
"The dog played in the park"
)
embeddings <- hf_embed(sentences)
embeddings
#> # A tibble: 3 × 3
#> text embedding n_dims
#> <chr> <list> <int>
#> 1 The cat sat on the mat <dbl [384]> 384
#> 2 A feline rested on the rug <dbl [384]> 384
#> 3 The dog played in the park <dbl [384]> 384
hf_similarity(embeddings)
#> # A tibble: 3 × 3
#> text_1 text_2 similarity
#> <chr> <chr> <dbl>
#> 1 The cat sat on the mat A feline rested on the rug 0.748
#> 2 The cat sat on the mat The dog played in the park 0.516
#> 3 A feline rested on the rug The dog played in the park 0.555hf_chat("What is the tidyverse?", max_tokens = 60)
#> # A tibble: 1 × 5
#> role content model tokens_used tool_calls
#> <chr> <chr> <chr> <int> <list>
#> 1 assistant "**What is the Tidyverse?**\n\nThe Tid… meta… 60 <list [0]>
# With a system prompt
hf_chat(
"Explain logistic regression in two sentences.",
system = "You are a statistics instructor. Use plain language.",
max_tokens = 80
)
#> # A tibble: 1 × 5
#> role content model tokens_used tool_calls
#> <chr> <chr> <chr> <int> <list>
#> 1 assistant Logistic regression is a type of stati… meta… 80 <list [0]>
# Multi-turn conversation
convo <- hf_conversation(system = "You are a helpful R tutor.")
convo <- chat(convo, "How do I read a CSV file?", max_tokens = 60, temperature = 0)
convo <- chat(
convo,
"What about Excel files? Reply in one short sentence.",
max_tokens = 40,
temperature = 0
)
convo$history[[length(convo$history)]]$content
#> [1] "You can use the `readxl` package in R to read Excel files, specifically using the `read_excel()` function."hf_generate("Once upon a time in a land far away,", max_new_tokens = 40)
#> # A tibble: 1 × 2
#> prompt generated_text
#> <chr> <chr>
#> 1 Once upon a time in a land far away, ...there was a beautiful and magical kin…
hf_fill_mask("The capital of France is [MASK].")
#> # A tibble: 5 × 4
#> text token score filled
#> <chr> <chr> <dbl> <chr>
#> 1 The capital of France is [MASK]. paris 0.417 The capital of France is pa…
#> 2 The capital of France is [MASK]. lille 0.0714 The capital of France is li…
#> 3 The capital of France is [MASK]. lyon 0.0634 The capital of France is ly…
#> 4 The capital of France is [MASK]. marseille 0.0444 The capital of France is ma…
#> 5 The capital of France is [MASK]. tours 0.0303 The capital of France is to…# Turn unstructured text into typed columns
hf_extract(
"Amelie is a chef in Paris who mentions burnout.",
c(name = "string", occupation = "string", city = "string", theme = "string")
)
#> # A tibble: 1 × 4
#> name occupation city theme
#> <chr> <chr> <chr> <chr>
#> 1 Amelie chef Paris burnout
# Tool calling with a tool-capable model
add_tool <- hf_tool("add", "Add two numbers.", c(x = "number", y = "number"))
convo <- hf_conversation(model = "Qwen/Qwen2.5-72B-Instruct")
convo <- chat(
convo,
"Use the add tool to add x=2 and y=3, then tell me the answer.",
tools = list(add_tool),
tool_choice = "auto"
)
convo <- hf_run_tools(convo, list(add = function(x, y) x + y))
convo$history[[length(convo$history)]]$content
#> [1] "The answer to adding \\( x = 2 \\) and \\( y = 3 \\) is \\( 5 \\)."# Audio transcription
audio <- "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac"
transcript <- hf_transcribe(audio, return_timestamps = "word")
substr(transcript$text, 1, 120)
#> [1] " I have a dream that one day this nation will rise up and live out the true meaning of its creed."
head(transcript$chunks[[1]], 3)
#> [[1]]
#> [[1]]$text
#> [1] " I"
#>
#> [[1]]$timestamp
#> [[1]]$timestamp[[1]]
#> [1] 0
#>
#> [[1]]$timestamp[[2]]
#> [1] 1.1
#>
#>
#>
#> [[2]]
#> [[2]]$text
#> [1] " have"
#>
#> [[2]]$timestamp
#> [[2]]$timestamp[[1]]
#> [1] 1.1
#>
#> [[2]]$timestamp[[2]]
#> [1] 1.44
#>
#>
#>
#> [[3]]
#> [[3]]$text
#> [1] " a"
#>
#> [[3]]$timestamp
#> [[3]]$timestamp[[1]]
#> [1] 1.44
#>
#> [[3]]$timestamp[[2]]
#> [1] 1.62
# Image generation writes a file and returns raw bytes
img <- hf_text_to_image(
"a small red cube on a white background",
output = "man/figures/README-red-cube.jpg",
seed = 1,
num_inference_steps = 2,
guidance_scale = 0,
overwrite = TRUE
)
tibble::tibble(
content_type = img$content_type,
bytes = length(img$image[[1]]),
file_exists = file.exists(img$path)
)
#> # A tibble: 1 × 3
#> content_type bytes file_exists
#> <chr> <int> <lgl>
#> 1 image/jpeg 26383 TRUE
knitr::include_graphics("man/figures/README-red-cube.jpg")
# Image understanding
image <- "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png"
hf_classify_image(image, top_k = 3)
#> # A tibble: 3 × 3
#> image label score
#> <chr> <chr> <dbl>
#> 1 https://huggingface.co/datasets/huggingface/documentation-images/… tabb… 0.277
#> 2 https://huggingface.co/datasets/huggingface/documentation-images/… tige… 0.276
#> 3 https://huggingface.co/datasets/huggingface/documentation-images/… Egyp… 0.140
hf_detect_objects(image, threshold = 0.5) |>
dplyr::filter(label == "cat")
#> # A tibble: 2 × 7
#> image label score xmin ymin xmax ymax
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 https://huggingface.co/datasets/huggingfa… cat 0.997 156 31 385 146
#> 2 https://huggingface.co/datasets/huggingfa… cat 0.999 145 132 429 341All functions accept character vectors and return tibbles.
library(dplyr)
library(tidyr)
reviews <- tibble(
id = 1:3,
text = c(
"This product is amazing!",
"Terrible experience.",
"It's okay, nothing special."
)
)
reviews |>
mutate(sentiment = hf_classify(text)) |>
unnest(sentiment, names_sep = "_") |>
select(id, text, sentiment_label, sentiment_score)
#> # A tibble: 3 × 4
#> id text sentiment_label sentiment_score
#> <int> <chr> <chr> <dbl>
#> 1 1 This product is amazing! POSITIVE 1.000
#> 2 2 Terrible experience. NEGATIVE 1.000
#> 3 3 It's okay, nothing special. NEGATIVE 0.819Use embeddings as features in machine learning workflows:
library(tidymodels)
rec <- recipe(sentiment ~ text, data = train_data) |>
step_hf_embed(text)
wf <- workflow() |>
add_recipe(rec) |>
add_model(logistic_reg()) |>
fit(data = train_data)Semantic search and document clustering:
docs |>
hf_embed_text(text) |>
hf_nearest_neighbors("machine learning", k = 5)
docs |>
hf_embed_text(text) |>
hf_cluster_texts(k = 3) |>
hf_extract_topics(text_col = "text", k = 3)# Search models
hf_search_models(task = "text-classification", limit = 5)
#> # A tibble: 5 × 7
#> model_id author task downloads likes tags library
#> <chr> <chr> <chr> <int> <int> <lis> <chr>
#> 1 BAAI/bge-reranker-v2-m3 <NA> text… 16443234 1053 <chr> senten…
#> 2 ProsusAI/finbert <NA> text… 7648889 1184 <chr> transf…
#> 3 BAAI/bge-reranker-base <NA> text… 4167279 238 <chr> senten…
#> 4 cardiffnlp/twitter-roberta-base-se… <NA> text… 3953164 813 <chr> transf…
#> 5 distilbert/distilbert-base-uncased… <NA> text… 3644729 910 <chr> transf…
# Load datasets into tibbles (no Python needed)
imdb <- hf_load_dataset("imdb", split = "train", limit = 5)
imdb |>
dplyr::mutate(text = substr(text, 1, 80))
#> # A tibble: 5 × 4
#> text label .dataset .split
#> <chr> <int> <chr> <chr>
#> 1 "I rented I AM CURIOUS-YELLOW from my video store becau… 0 stanfor… train
#> 2 "\"I Am Curious: Yellow\" is a risible and pretentious … 0 stanfor… train
#> 3 "If only to avoid making this type of film in the futur… 0 stanfor… train
#> 4 "This film was probably inspired by Godard's Masculin, … 0 stanfor… train
#> 5 "Oh, brother...after hearing about this ridiculous film… 0 stanfor… train
# Hugging Face split slices work too
imdb_sample <- hf_load_dataset("imdb", split = "train[:10%]", limit = 5)
imdb_sample |>
dplyr::mutate(text = substr(text, 1, 80))
#> # A tibble: 5 × 4
#> text label .dataset .split
#> <chr> <int> <chr> <chr>
#> 1 "I rented I AM CURIOUS-YELLOW from my video store becau… 0 stanfor… train…
#> 2 "\"I Am Curious: Yellow\" is a risible and pretentious … 0 stanfor… train…
#> 3 "If only to avoid making this type of film in the futur… 0 stanfor… train…
#> 4 "This film was probably inspired by Godard's Masculin, … 0 stanfor… train…
#> 5 "Oh, brother...after hearing about this ridiculous film… 0 stanfor… train…
# Download files and inspect provider options
hf_list_repo_files("BAAI/bge-small-en-v1.5", recursive = FALSE)
#> # A tibble: 14 × 4
#> path type size oid
#> <chr> <chr> <int> <chr>
#> 1 1_Pooling directory 0 b9d00290ba7577bd1709db…
#> 2 onnx directory 0 8e8db8abdebe9ba4820469…
#> 3 .gitattributes file 1519 a6344aac8c09253b3b630f…
#> 4 README.md file 94783 8b8567d75ffa619486d959…
#> 5 config.json file 743 3992bf890728a92476c700…
#> 6 config_sentence_transformers.json file 124 dcb0c0d97d09b930d13600…
#> 7 model.safetensors file 133466304 a4fef68c99b10468206a9b…
#> 8 modules.json file 349 952a9b81c0bfd99800fabf…
#> 9 pytorch_model.bin file 133508397 b6e2a796fcfd4513c609fc…
#> 10 sentence_bert_config.json file 52 ea85692bff64b0d1917833…
#> 11 special_tokens_map.json file 125 a8b3208c2884c4efb86e49…
#> 12 tokenizer.json file 711396 688882a79f44442ddc1f60…
#> 13 tokenizer_config.json file 366 37fca74771bc76a8e01178…
#> 14 vocab.txt file 231508 fb140275c155a9c7c5a3b3…
readme <- hf_hub_download("BAAI/bge-small-en-v1.5", "README.md")
readLines(readme, n = 3)
#> [1] "---" "tags:"
#> [3] "- sentence-transformers"
hf_list_providers("Qwen/Qwen2.5-72B-Instruct")
#> # A tibble: 2 × 12
#> model_id provider status context_length input_price output_price is_free
#> <chr> <chr> <chr> <int> <dbl> <dbl> <lgl>
#> 1 Qwen/Qwen2.5-… novita live 32000 0.38 0.4 FALSE
#> 2 Qwen/Qwen2.5-… feather… live NA NA NA FALSE
#> # ℹ 5 more variables: supports_tools <lgl>, supports_structured_output <lgl>,
#> # first_token_latency_ms <dbl>, throughput <dbl>, is_model_author <lgl># Guarded dataset publishing requires a write-scoped token and confirm = TRUE
hf_push_dataset(mtcars, "your-username/mtcars-small", confirm = TRUE)vignette("getting-started") – setup and first
examplesvignette("text-classification") – sentiment analysis
and zero-shot labelingvignette("embeddings-and-similarity") – semantic
search, clustering, visualizationvignette("llm-chat-and-generation") – conversations and
text generationvignette("structured-extraction-and-tools") – typed
extraction, streaming, and tool callsvignette("multimodal-images-audio-speech") –
transcription, image generation, and visionvignette("hub-datasets-and-modeling") – Hub discovery
and tidymodels pipelinesvignette("hub-download-upload-share") – downloads,
providers, and guarded Hub writesvignette("anthropic-economic-index") – AI productivity
research with the Anthropic Economic IndexMIT
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.