The hardware and bandwidth for this mirror are donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our WordPress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

urlparse

CRAN status R-CMD-check Codecov test coverage urlparse status badge

Fast and simple URL parser for R. Initially developed for the paws.common package.

urlparse::url_parse("https://user:pass@host.com:8000/path?query=1#fragment")
#> $scheme
#> [1] "https"
#> 
#> $user
#> [1] "user"
#> 
#> $password
#> [1] "pass"
#> 
#> $host
#> [1] "host.com"
#> 
#> $port
#> [1] "8000"
#> 
#> $path
#> [1] "/path"
#> 
#> $raw_path
#> [1] ""
#> 
#> $query
#> $query$query
#> [1] "1"
#> 
#> 
#> $raw_query
#> [1] "query=1"
#> 
#> $fragment
#> [1] "fragment"

Installation

You can install the development version of urlparse like so:

remotes::install_github("dyfanjones/urlparse")

r-universe installation:

install.packages("urlparse", repos = c("https://dyfanjones.r-universe.dev", "https://cloud.r-project.org"))

Example

This is a basic example which shows you how to solve a common problem:

library(urlparse)
url_encoder("foo = bar + 5")
#> [1] "foo%20%3D%20bar%20%2B%205"

url_decoder(url_encoder("foo = bar + 5"))
#> [1] "foo = bar + 5"

Similar to quote from Python’s urllib.parse module, urlparse::url_encoder supports the safe parameter, which specifies additional ASCII characters that should not be encoded.

from urllib.parse import quote
quote("foo = bar + 5", safe = "+")
#> 'foo%20%3D%20bar%20+%205'
url_encoder("foo = bar + 5", safe = "+")
#> [1] "foo%20%3D%20bar%20+%205"

url <- "http://example.com"
set_scheme(url, "https") |>
  set_port(1234L) |>
  set_path("foo/bar") |>
  set_query("baz") |>
  set_fragment("quux")
#> [1] "https://example.com:1234/foo/bar?baz#quux"

url_modify(url, scheme = "https", port = 1234, path = "foo/bar", query = "baz", fragment = "quux")
#> [1] "https://example.com:1234/foo/bar?baz#quux"

Note: it is faster to use url_modify than to pipe the set_* functions, because each set_* call has to parse the URL again before modifying it.

url <- "http://example.com"
bench::mark(
  piping = {set_scheme(url, "https") |>
  set_port(1234L) |>
  set_path("foo/bar") |>
  set_query("baz") |>
  set_fragment("quux")},
  single_function = url_modify(url, scheme = "https", port = 1234, path = "foo/bar", query = "baz", fragment = "quux")
)
#> # A tibble: 2 × 6
#>   expression           min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>      <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 piping            5.29µs   5.86µs   169576.        0B        0
#> 2 single_function   1.64µs    1.8µs   507863.        0B        0

Benchmark:

Parsing URL:

url <- "https://user:pass@host.com:8000/path?query=1#fragment"
(bm <- bench::mark(
  urlparse = urlparse::url_parse(url),
  httr2 = httr2::url_parse(url),
  curl = curl::curl_parse_url(url),
  urltools = urltools::url_parse(url),
  check = F
))
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 urlparse     1.68µs   1.84µs   503156.        0B      0  
#> 2 httr2       64.86µs  68.59µs    14312.   560.9KB     17.4
#> 3 curl        27.22µs  28.54µs    34390.   48.78KB     13.8
#> 4 urltools   124.35µs 129.03µs     7604.    2.17MB     20.9

show_relative(bm)
#> # A tibble: 4 × 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 urlparse     1      1       66.2        NaN      NaN
#> 2 httr2       38.6   37.2      1.88       Inf      Inf
#> 3 curl        16.2   15.5      4.52       Inf      Inf
#> 4 urltools    74.0   69.9      1          Inf      Inf

ggplot2::autoplot(bm)
#> Loading required namespace: tidyr

Encoding URL:

Note: urltools encodes special characters using lowercase hex digits, e.g. “?” -> “%3f” instead of “%3F”.

string <- "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~`!@#$%^&*()=+[{]}\\|;:'\",<>/? "
(bm <- bench::mark(
  urlparse = urlparse::url_encoder(string),
  curl = curl::curl_escape(string),
  urltools = urltools::url_encode(string),
  base = URLencode(string, reserved = T),
  check = F
))
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 urlparse     1.48µs   1.56µs   623378.      208B     0   
#> 2 curl          2.3µs   2.42µs   399842.    3.06KB     0   
#> 3 urltools     2.42µs   2.67µs   370964.    2.48KB     0   
#> 4 base        79.09µs  83.15µs    11703.   28.59KB     8.24

show_relative(bm)
#> # A tibble: 4 × 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 urlparse    1      1         53.3       1        NaN
#> 2 curl        1.56   1.55      34.2      15.0      NaN
#> 3 urltools    1.64   1.71      31.7      12.2      NaN
#> 4 base       53.6   53.4        1       141.       Inf

ggplot2::autoplot(bm)

string <- "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~`!@#$%^&*()=+[{]}\\|;:'\",<>/? "
url <- paste0(sample(strsplit(string, "")[[1]], 1e4, replace = TRUE), collapse = "")
(bm <- bench::mark(
  urlparse = urlparse::url_encoder(url),
  curl = curl::curl_escape(url),
  urltools = urltools::url_encode(url),
  base = URLencode(url, reserved = T, repeated = T),
  check = F,
  filter_gc = F
))
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 urlparse    86.06µs  87.41µs    11291.    15.8KB     0   
#> 2 curl        92.95µs  94.26µs    10209.        0B     0   
#> 3 urltools    238.7µs 244.16µs     3950.    15.8KB     0   
#> 4 base         6.72ms   6.84ms      141.   333.2KB     9.91

show_relative(bm)
#> # A tibble: 4 × 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 urlparse    1      1         80.2       Inf      NaN
#> 2 curl        1.08   1.08      72.5       NaN      NaN
#> 3 urltools    2.77   2.79      28.1       Inf      NaN
#> 4 base       78.1   78.2        1         Inf      Inf

ggplot2::autoplot(bm)

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.