README

The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

huggingfaceR

An API-first R package for accessing machine learning models, embeddings, and datasets on the Hugging Face Hub via the Hugging Face Inference API. No Python required. The Inference API serves a curated subset of the Hub’s 500,000+ models – use hf_check_inference() to verify model availability.

Installation

install.packages("huggingfaceR")

# install.packages("devtools")
devtools::install_github("farach/huggingfaceR")

Setup

library(huggingfaceR)

hf_set_token("hf_your_token_here", store = TRUE)
hf_whoami()

Text Classification

# Sentiment analysis
hf_classify("I love using R for data science!")
#> # A tibble: 1 × 3
#>   text                             label    score
#>   <chr>                            <chr>    <dbl>
#> 1 I love using R for data science! POSITIVE 1.000

# Zero-shot classification with custom labels
hf_classify_zero_shot(
  "I just bought a new laptop for coding",
  labels = c("technology", "sports", "politics", "food")
)
#> # A tibble: 4 × 3
#>   text                                  label        score
#>   <chr>                                 <chr>        <dbl>
#> 1 I just bought a new laptop for coding technology 0.985  
#> 2 I just bought a new laptop for coding sports     0.00661
#> 3 I just bought a new laptop for coding food       0.00485
#> 4 I just bought a new laptop for coding politics   0.00391

Embeddings and Similarity

sentences <- c(
  "The cat sat on the mat",
  "A feline rested on the rug",
  "The dog played in the park"
)

embeddings <- hf_embed(sentences)
embeddings
#> # A tibble: 3 × 3
#>   text                       embedding   n_dims
#>   <chr>                      <list>       <int>
#> 1 The cat sat on the mat     <dbl [384]>    384
#> 2 A feline rested on the rug <dbl [384]>    384
#> 3 The dog played in the park <dbl [384]>    384

hf_similarity(embeddings)
#> # A tibble: 3 × 3
#>   text_1                     text_2                     similarity
#>   <chr>                      <chr>                           <dbl>
#> 1 The cat sat on the mat     A feline rested on the rug      0.748
#> 2 The cat sat on the mat     The dog played in the park      0.516
#> 3 A feline rested on the rug The dog played in the park      0.555

Chat with Open-Source LLMs

hf_chat("What is the tidyverse?", max_tokens = 60)
#> # A tibble: 1 × 5
#>   role      content                                 model tokens_used tool_calls
#>   <chr>     <chr>                                   <chr>       <int> <list>    
#> 1 assistant "**What is the Tidyverse?**\n\nThe Tid… meta…          60 <list [0]>

# With a system prompt
hf_chat(
  "Explain logistic regression in two sentences.",
  system = "You are a statistics instructor. Use plain language.",
  max_tokens = 80
)
#> # A tibble: 1 × 5
#>   role      content                                 model tokens_used tool_calls
#>   <chr>     <chr>                                   <chr>       <int> <list>    
#> 1 assistant Logistic regression is a type of stati… meta…          80 <list [0]>

# Multi-turn conversation
convo <- hf_conversation(system = "You are a helpful R tutor.")
convo <- chat(convo, "How do I read a CSV file?", max_tokens = 60, temperature = 0)
convo <- chat(
  convo,
  "What about Excel files? Reply in one short sentence.",
  max_tokens = 40,
  temperature = 0
)
convo$history[[length(convo$history)]]$content
#> [1] "You can use the `readxl` package in R to read Excel files, specifically using the `read_excel()` function."

Text Generation

hf_generate("Once upon a time in a land far away,", max_new_tokens = 40)
#> # A tibble: 1 × 2
#>   prompt                               generated_text                           
#>   <chr>                                <chr>                                    
#> 1 Once upon a time in a land far away, ...there was a beautiful and magical kin…

hf_fill_mask("The capital of France is [MASK].")
#> # A tibble: 5 × 4
#>   text                             token      score filled                      
#>   <chr>                            <chr>      <dbl> <chr>                       
#> 1 The capital of France is [MASK]. paris     0.417  The capital of France is pa…
#> 2 The capital of France is [MASK]. lille     0.0714 The capital of France is li…
#> 3 The capital of France is [MASK]. lyon      0.0634 The capital of France is ly…
#> 4 The capital of France is [MASK]. marseille 0.0444 The capital of France is ma…
#> 5 The capital of France is [MASK]. tours     0.0303 The capital of France is to…

Structured Extraction and Tools

# Turn unstructured text into typed columns
hf_extract(
  "Amelie is a chef in Paris who mentions burnout.",
  c(name = "string", occupation = "string", city = "string", theme = "string")
)
#> # A tibble: 1 × 4
#>   name   occupation city  theme  
#>   <chr>  <chr>      <chr> <chr>  
#> 1 Amelie chef       Paris burnout

# Tool calling with a tool-capable model
add_tool <- hf_tool("add", "Add two numbers.", c(x = "number", y = "number"))

convo <- hf_conversation(model = "Qwen/Qwen2.5-72B-Instruct")
convo <- chat(
  convo,
  "Use the add tool to add x=2 and y=3, then tell me the answer.",
  tools = list(add_tool),
  tool_choice = "auto"
)
convo <- hf_run_tools(convo, list(add = function(x, y) x + y))
convo$history[[length(convo$history)]]$content
#> [1] "The answer to adding \\( x = 2 \\) and \\( y = 3 \\) is \\( 5 \\)."

Multimodal

# Audio transcription
audio <- "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac"
transcript <- hf_transcribe(audio, return_timestamps = "word")
substr(transcript$text, 1, 120)
#> [1] " I have a dream that one day this nation will rise up and live out the true meaning of its creed."
head(transcript$chunks[[1]], 3)
#> [[1]]
#> [[1]]$text
#> [1] " I"
#> 
#> [[1]]$timestamp
#> [[1]]$timestamp[[1]]
#> [1] 0
#> 
#> [[1]]$timestamp[[2]]
#> [1] 1.1
#> 
#> 
#> 
#> [[2]]
#> [[2]]$text
#> [1] " have"
#> 
#> [[2]]$timestamp
#> [[2]]$timestamp[[1]]
#> [1] 1.1
#> 
#> [[2]]$timestamp[[2]]
#> [1] 1.44
#> 
#> 
#> 
#> [[3]]
#> [[3]]$text
#> [1] " a"
#> 
#> [[3]]$timestamp
#> [[3]]$timestamp[[1]]
#> [1] 1.44
#> 
#> [[3]]$timestamp[[2]]
#> [1] 1.62

# Image generation writes a file and returns raw bytes
img <- hf_text_to_image(
  "a small red cube on a white background",
  output = "man/figures/README-red-cube.jpg",
  seed = 1,
  num_inference_steps = 2,
  guidance_scale = 0,
  overwrite = TRUE
)
tibble::tibble(
  content_type = img$content_type,
  bytes = length(img$image[[1]]),
  file_exists = file.exists(img$path)
)
#> # A tibble: 1 × 3
#>   content_type bytes file_exists
#>   <chr>        <int> <lgl>      
#> 1 image/jpeg   26383 TRUE
knitr::include_graphics("man/figures/README-red-cube.jpg")


# Image understanding
image <- "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png"
hf_classify_image(image, top_k = 3)
#> # A tibble: 3 × 3
#>   image                                                              label score
#>   <chr>                                                              <chr> <dbl>
#> 1 https://huggingface.co/datasets/huggingface/documentation-images/… tabb… 0.277
#> 2 https://huggingface.co/datasets/huggingface/documentation-images/… tige… 0.276
#> 3 https://huggingface.co/datasets/huggingface/documentation-images/… Egyp… 0.140
hf_detect_objects(image, threshold = 0.5) |>
  dplyr::filter(label == "cat")
#> # A tibble: 2 × 7
#>   image                                      label score  xmin  ymin  xmax  ymax
#>   <chr>                                      <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 https://huggingface.co/datasets/huggingfa… cat   0.997   156    31   385   146
#> 2 https://huggingface.co/datasets/huggingfa… cat   0.999   145   132   429   341

Tidyverse Integration

library(dplyr)
library(tidyr)

reviews <- tibble(
  id = 1:3,
  text = c(
    "This product is amazing!",
    "Terrible experience.",
    "It's okay, nothing special."
  )
)

reviews |>
  mutate(sentiment = hf_classify(text)) |>
  unnest(sentiment, names_sep = "_") |>
  select(id, text, sentiment_label, sentiment_score)
#> # A tibble: 3 × 4
#>      id text                        sentiment_label sentiment_score
#>   <int> <chr>                       <chr>                     <dbl>
#> 1     1 This product is amazing!    POSITIVE                  1.000
#> 2     2 Terrible experience.        NEGATIVE                  1.000
#> 3     3 It's okay, nothing special. NEGATIVE                  0.819

Tidymodels

library(tidymodels)

rec <- recipe(sentiment ~ text, data = train_data) |>
  step_hf_embed(text)

wf <- workflow() |>
  add_recipe(rec) |>
  add_model(logistic_reg()) |>
  fit(data = train_data)

Tidytext

docs |>
  hf_embed_text(text) |>
  hf_nearest_neighbors("machine learning", k = 5)

docs |>
  hf_embed_text(text) |>
  hf_cluster_texts(k = 3) |>
  hf_extract_topics(text_col = "text", k = 3)

Hub and Datasets

# Search models
hf_search_models(task = "text-classification", limit = 5)
#> # A tibble: 5 × 7
#>   model_id                            author task  downloads likes tags  library
#>   <chr>                               <chr>  <chr>     <int> <int> <lis> <chr>  
#> 1 BAAI/bge-reranker-v2-m3             <NA>   text…  16443234  1053 <chr> senten…
#> 2 ProsusAI/finbert                    <NA>   text…   7648889  1184 <chr> transf…
#> 3 BAAI/bge-reranker-base              <NA>   text…   4167279   238 <chr> senten…
#> 4 cardiffnlp/twitter-roberta-base-se… <NA>   text…   3953164   813 <chr> transf…
#> 5 distilbert/distilbert-base-uncased… <NA>   text…   3644729   910 <chr> transf…

# Load datasets into tibbles (no Python needed)
imdb <- hf_load_dataset("imdb", split = "train", limit = 5)
imdb |>
  dplyr::mutate(text = substr(text, 1, 80))
#> # A tibble: 5 × 4
#>   text                                                     label .dataset .split
#>   <chr>                                                    <int> <chr>    <chr> 
#> 1 "I rented I AM CURIOUS-YELLOW from my video store becau…     0 stanfor… train 
#> 2 "\"I Am Curious: Yellow\" is a risible and pretentious …     0 stanfor… train 
#> 3 "If only to avoid making this type of film in the futur…     0 stanfor… train 
#> 4 "This film was probably inspired by Godard's Masculin, …     0 stanfor… train 
#> 5 "Oh, brother...after hearing about this ridiculous film…     0 stanfor… train

# Hugging Face split slices work too
imdb_sample <- hf_load_dataset("imdb", split = "train[:10%]", limit = 5)
imdb_sample |>
  dplyr::mutate(text = substr(text, 1, 80))
#> # A tibble: 5 × 4
#>   text                                                     label .dataset .split
#>   <chr>                                                    <int> <chr>    <chr> 
#> 1 "I rented I AM CURIOUS-YELLOW from my video store becau…     0 stanfor… train…
#> 2 "\"I Am Curious: Yellow\" is a risible and pretentious …     0 stanfor… train…
#> 3 "If only to avoid making this type of film in the futur…     0 stanfor… train…
#> 4 "This film was probably inspired by Godard's Masculin, …     0 stanfor… train…
#> 5 "Oh, brother...after hearing about this ridiculous film…     0 stanfor… train…

# Download files and inspect provider options
hf_list_repo_files("BAAI/bge-small-en-v1.5", recursive = FALSE)
#> # A tibble: 14 × 4
#>    path                              type           size oid                    
#>    <chr>                             <chr>         <int> <chr>                  
#>  1 1_Pooling                         directory         0 b9d00290ba7577bd1709db…
#>  2 onnx                              directory         0 8e8db8abdebe9ba4820469…
#>  3 .gitattributes                    file           1519 a6344aac8c09253b3b630f…
#>  4 README.md                         file          94783 8b8567d75ffa619486d959…
#>  5 config.json                       file            743 3992bf890728a92476c700…
#>  6 config_sentence_transformers.json file            124 dcb0c0d97d09b930d13600…
#>  7 model.safetensors                 file      133466304 a4fef68c99b10468206a9b…
#>  8 modules.json                      file            349 952a9b81c0bfd99800fabf…
#>  9 pytorch_model.bin                 file      133508397 b6e2a796fcfd4513c609fc…
#> 10 sentence_bert_config.json         file             52 ea85692bff64b0d1917833…
#> 11 special_tokens_map.json           file            125 a8b3208c2884c4efb86e49…
#> 12 tokenizer.json                    file         711396 688882a79f44442ddc1f60…
#> 13 tokenizer_config.json             file            366 37fca74771bc76a8e01178…
#> 14 vocab.txt                         file         231508 fb140275c155a9c7c5a3b3…
readme <- hf_hub_download("BAAI/bge-small-en-v1.5", "README.md")
readLines(readme, n = 3)
#> [1] "---"                     "tags:"                  
#> [3] "- sentence-transformers"
hf_list_providers("Qwen/Qwen2.5-72B-Instruct")
#> # A tibble: 2 × 12
#>   model_id       provider status context_length input_price output_price is_free
#>   <chr>          <chr>    <chr>           <int>       <dbl>        <dbl> <lgl>  
#> 1 Qwen/Qwen2.5-… novita   live            32000        0.38          0.4 FALSE  
#> 2 Qwen/Qwen2.5-… feather… live               NA       NA            NA   FALSE  
#> # ℹ 5 more variables: supports_tools <lgl>, supports_structured_output <lgl>,
#> #   first_token_latency_ms <dbl>, throughput <dbl>, is_model_author <lgl>

# Guarded dataset publishing requires a write-scoped token and confirm = TRUE
hf_push_dataset(mtcars, "your-username/mtcars-small", confirm = TRUE)

Learn More

License

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.