The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

urlparse

CRAN status R-CMD-check Codecov test coverage urlparse status badge

Fast and simple url parser for R. Initially developed for the paws.common package.

urlparse::url_parse("https://user:pass@host.com:8000/path?query=1#fragment")
#> $scheme
#> [1] "https"
#> 
#> $user
#> [1] "user"
#> 
#> $password
#> [1] "pass"
#> 
#> $host
#> [1] "host.com"
#> 
#> $port
#> [1] "8000"
#> 
#> $path
#> [1] "/path"
#> 
#> $raw_path
#> [1] ""
#> 
#> $query
#> $query$query
#> [1] "1"
#> 
#> 
#> $raw_query
#> [1] "query=1"
#> 
#> $fragment
#> [1] "fragment"

Installation

You can install the development version of urlparse like so:

remotes::install_github("dyfanjones/urlparse")

r-universe installation:

install.packages("urlparse", repos = c("https://dyfanjones.r-universe.dev", "https://cloud.r-project.org"))

Example

This is a basic example which shows you how to solve a common problem:

library(urlparse)
url_encoder("foo = bar + 5")
#> [1] "foo%20%3D%20bar%20%2B%205"

url_decoder(url_encoder("foo = bar + 5"))
#> [1] "foo = bar + 5"

Similar to Python’s quote function (from urllib.parse import quote), urlparse::url_encoder supports the safe parameter, which specifies additional ASCII characters that should not be encoded.

from urllib.parse import quote
quote("foo = bar + 5", safe = "+")
#> 'foo%20%3D%20bar%20+%205'
url_encoder("foo = bar + 5", safe = "+")
#> [1] "foo%20%3D%20bar%20+%205"

Modify a URL through piping using the set_* functions, or by using the standalone url_modify function.


url <- "http://example.com"
set_scheme(url, "https") |>
  set_port(1234L) |>
  set_path("foo/bar") |>
  set_query("baz") |>
  set_fragment("quux")
#> [1] "https://example.com:1234/foo/bar?baz#quux"

url_modify(url, scheme = "https", port = 1234, path = "foo/bar", query = "baz", fragment = "quux")
#> [1] "https://example.com:1234/foo/bar?baz#quux"

Note: it is faster to use url_modify rather than piping the set_* functions. This is because urlparse has to parse the url within each set_* to modify the url.

url <- "http://example.com"
bench::mark(
  piping = {set_scheme(url, "https") |>
  set_port(1234L) |>
  set_path("foo/bar") |>
  set_query("baz") |>
  set_fragment("quux")},
  single_function = url_modify(url, scheme = "https", port = 1234, path = "foo/bar", query = "baz", fragment = "quux")
)
#> # A tibble: 2 × 6
#>   expression           min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>      <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 piping            5.29µs    5.9µs   162854.        0B     16.3
#> 2 single_function    1.6µs    1.8µs   517151.        0B     51.7

Benchmark:

Parsing URL:

url <- "https://user:pass@host.com:8000/path?query=1#fragment"
(bm <- bench::mark(
  urlparse = urlparse::url_parse(url),
  httr2 = httr2::url_parse(url),
  curl = curl::curl_parse_url(url),
  urltools = urltools::url_parse(url),
  check = F
))
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 urlparse     1.72µs   1.93µs   488712.        0B      0  
#> 2 httr2       22.39µs  23.86µs    40406.  571.07KB     24.3
#> 3 curl        27.06µs  29.64µs    29324.        0B     14.7
#> 4 urltools   124.44µs 132.68µs     7119.    2.18MB     23.2

show_relative(bm)
#> # A tibble: 4 × 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 urlparse     1      1       68.6        NaN      NaN
#> 2 httr2       13.0   12.4      5.68       Inf      Inf
#> 3 curl        15.7   15.4      4.12       NaN      Inf
#> 4 urltools    72.3   68.9      1          Inf      Inf

ggplot2::autoplot(bm)

Since urlparse v0.1.999 you can use the vectorised URL parser url_parse_v2:

urls <- c(
  "https://www.example.com",
  "https://www.google.com/maps/place/Pennsylvania+Station/@40.7519848,-74.0015045,14.7z/data=!4m5!3m4!1s0x89c259ae15b2adcb:0x7955420634fd7eba!8m2!3d40.750568!4d-73.993519",
  "https://user_1:password_1@example.org:8080/dir/../api?q=1#frag",
  "https://user:password@example.com",
  "https://www.example.com:8080/search%3D1%2B3",
  "https://www.google.co.jp/search?q=\u30c9\u30a4\u30c4",
  "https://www.example.com:8080?var1=foo&var2=ba%20r&var3=baz+larry",
  "https://user:password@example.com:8080",
  "https://user:password@example.com",
  "https://user@example.com:8080",
  "https://user@example.com"
)
(bm <- bench::mark(
  urlparse = lapply(urls, urlparse::url_parse),
  urlparse_v2 = urlparse::url_parse_v2(urls),
  httr2 =  lapply(urls, httr2::url_parse),
  curl = lapply(urls, curl::curl_parse_url),
  urltools = urltools::url_parse(urls),
  check = F
))
#> # A tibble: 5 × 6
#>   expression       min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>  <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 urlparse      19.4µs   21.3µs    46214.      200B    13.9 
#> 2 urlparse_v2   10.5µs     11µs    87963.     4.3KB     0   
#> 3 httr2        171.6µs  181.6µs     5232.        0B    10.2 
#> 4 curl         188.7µs  198.4µs     4895.        0B     8.14
#> 5 urltools       130µs  142.1µs     6569.        0B    10.2

show_relative(bm)
#> # A tibble: 5 × 6
#>   expression    min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>  <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 urlparse     1.85   1.94      9.44       Inf      Inf
#> 2 urlparse_v2  1      1        18.0        Inf      NaN
#> 3 httr2       16.4   16.5       1.07       NaN      Inf
#> 4 curl        18.0   18.1       1          NaN      Inf
#> 5 urltools    12.4   12.9       1.34       NaN      Inf

ggplot2::autoplot(bm)

Note: url_parse_v2 returns the parsed URLs as a data.frame; this is similar behaviour to urltools and adaR:

urlparse::url_parse_v2(urls)
#>                                                                                                                                                                       href
#> 1                                                                                                                                                  https://www.example.com
#> 2  https://www.google.com/maps/place/Pennsylvania+Station/@40.7519848,-74.0015045,14.7z/data=!4m5!3m4!1s0x89c259ae15b2adcb:0x7955420634fd7eba!8m2!3d40.750568!4d-73.993519
#> 3                                                                                                           https://user_1:password_1@example.org:8080/dir/../api?q=1#frag
#> 4                                                                                                                                        https://user:password@example.com
#> 5                                                                                                                              https://www.example.com:8080/search%3D1%2B3
#> 6                                                                                                                                 https://www.google.co.jp/search?q=ドイツ
#> 7                                                                                                         https://www.example.com:8080?var1=foo&var2=ba%20r&var3=baz+larry
#> 8                                                                                                                                   https://user:password@example.com:8080
#> 9                                                                                                                                        https://user:password@example.com
#> 10                                                                                                                                           https://user@example.com:8080
#> 11                                                                                                                                                https://user@example.com
#>    scheme   user   password             host port
#> 1   https                    www.example.com     
#> 2   https                     www.google.com     
#> 3   https user_1 password_1      example.org 8080
#> 4   https   user   password      example.com     
#> 5   https                    www.example.com 8080
#> 6   https                   www.google.co.jp     
#> 7   https                    www.example.com 8080
#> 8   https   user   password      example.com 8080
#> 9   https   user   password      example.com     
#> 10  https   user                 example.com 8080
#> 11  https   user                 example.com     
#>                                                                                                                                                 path
#> 1                                                                                                                                                   
#> 2  /maps/place/Pennsylvania+Station/@40.7519848,-74.0015045,14.7z/data=!4m5!3m4!1s0x89c259ae15b2adcb:0x7955420634fd7eba!8m2!3d40.750568!4d-73.993519
#> 3                                                                                                                                        /dir/../api
#> 4                                                                                                                                                   
#> 5                                                                                                                                        /search=1+3
#> 6                                                                                                                                            /search
#> 7                                                                                                                                                   
#> 8                                                                                                                                                   
#> 9                                                                                                                                                   
#> 10                                                                                                                                                  
#> 11                                                                                                                                                  
#>                                                                                                                                             raw_path
#> 1                                                                                                                                                   
#> 2  /maps/place/Pennsylvania+Station/@40.7519848,-74.0015045,14.7z/data=!4m5!3m4!1s0x89c259ae15b2adcb:0x7955420634fd7eba!8m2!3d40.750568!4d-73.993519
#> 3                                                                                                                                                   
#> 4                                                                                                                                                   
#> 5                                                                                                                                    /search%3D1%2B3
#> 6                                                                                                                                                   
#> 7                                                                                                                                                   
#> 8                                                                                                                                                   
#> 9                                                                                                                                                   
#> 10                                                                                                                                                  
#> 11                                                                                                                                                  
#>                                raw_query fragment
#> 1                                                
#> 2                                                
#> 3                                    q=1     frag
#> 4                                                
#> 5                                                
#> 6          q=%E3%83%89%E3%82%A4%E3%83%84         
#> 7  var1=foo&var2=ba%20r&var3=baz%2Blarry         
#> 8                                                
#> 9                                                
#> 10                                               
#> 11

Encoding URL:

Note: urltools encodes special characters to lower-case hex, i.e. “?” -> “%3f” instead of “%3F”.

string <- "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~`!@#$%^&*()=+[{]}\\|;:'\",<>/? "
(bm <- bench::mark(
  urlparse = urlparse::url_encoder(string),
  curl = curl::curl_escape(string),
  urltools = urltools::url_encode(string),
  base = URLencode(string, reserved = T),
  check = F
))
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 urlparse     1.48µs   1.64µs   581380.      208B     0   
#> 2 curl         2.25µs   2.58µs   349595.    3.03KB     0   
#> 3 urltools     2.34µs   2.54µs   381930.    2.48KB     0   
#> 4 base        78.84µs  82.33µs    11746.   28.59KB     8.25

show_relative(bm)
#> # A tibble: 4 × 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 urlparse    1      1         49.5       1        NaN
#> 2 curl        1.53   1.57      29.8      14.9      NaN
#> 3 urltools    1.58   1.55      32.5      12.2      NaN
#> 4 base       53.4   50.2        1       141.       Inf

ggplot2::autoplot(bm)

string <- "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~`!@#$%^&*()=+[{]}\\|;:'\",<>/? "
url <- paste0(sample(strsplit(string, "")[[1]], 1e4, replace = TRUE), collapse = "")
(bm <- bench::mark(
  urlparse = urlparse::url_encoder(url),
  curl = curl::curl_escape(url),
  urltools = urltools::url_encode(url),
  base = URLencode(url, reserved = T, repeated = T),
  check = F,
  filter_gc = F
))
#> # A tibble: 4 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 urlparse     86.4µs   88.3µs    10599.    15.8KB     0   
#> 2 curl         91.6µs   94.9µs    10306.        0B     0   
#> 3 urltools    241.8µs  247.8µs     3943.    15.8KB     0   
#> 4 base          6.7ms      7ms      138.   333.5KB     8.00

show_relative(bm)
#> # A tibble: 4 × 6
#>   expression   min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <dbl>  <dbl>     <dbl>     <dbl>    <dbl>
#> 1 urlparse    1      1         76.8       Inf      NaN
#> 2 curl        1.06   1.08      74.7       NaN      NaN
#> 3 urltools    2.80   2.81      28.6       Inf      NaN
#> 4 base       77.6   79.3        1         Inf      Inf

ggplot2::autoplot(bm)

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.