Let’s take a look at a few HTTP APIs that transmit data in JSON format, and then get that data into tidy tibbles with tidyjson.
library(dplyr)
library(tidyr)
library(jsonlite)
library(tidyjson)
library(ggplot2)
library(lubridate)
The tidyverse is used heavily for data cleansing, so let's explore some tidyverse repository data through GitHub's API. We are going to grab the data directly and then explore the structure of the JSON with json_schema().
baseurl <- 'https://api.github.com/repos/tidyverse/dplyr/issues'
dplyr_issues <- as.tbl_json(baseurl)
dplyr_issues %>% json_schema %>% prettify
#> [
#>     {
#>         "assignee": "null",
#>         "assignees": [
#> 
#>         ],
#>         "author_association": "string",
#>         "body": "string",
#>         "closed_at": "null",
#>         "comments": "number",
#>         "comments_url": "string",
#>         "created_at": "string",
#>         "events_url": "string",
#>         "html_url": "string",
#>         "id": "number",
#>         "labels": [
#>             {
#>                 "color": "string",
#>                 "default": "logical",
#>                 "description": "string",
#>                 "id": "number",
#>                 "name": "string",
#>                 "node_id": "string",
#>                 "url": "string"
#>             }
#>         ],
#>         "labels_url": "string",
#>         "locked": "logical",
#>         "milestone": {
#>             "closed_at": "null",
#>             "closed_issues": "number",
#>             "created_at": "string",
#>             "creator": {
#>                 "avatar_url": "string",
#>                 "events_url": "string",
#>                 "followers_url": "string",
#>                 "following_url": "string",
#>                 "gists_url": "string",
#>                 "gravatar_id": "string",
#>                 "html_url": "string",
#>                 "id": "number",
#>                 "login": "string",
#>                 "node_id": "string",
#>                 "organizations_url": "string",
#>                 "received_events_url": "string",
#>                 "repos_url": "string",
#>                 "site_admin": "logical",
#>                 "starred_url": "string",
#>                 "subscriptions_url": "string",
#>                 "type": "string",
#>                 "url": "string"
#>             },
#>             "description": "string",
#>             "due_on": "null",
#>             "html_url": "string",
#>             "id": "number",
#>             "labels_url": "string",
#>             "node_id": "string",
#>             "number": "number",
#>             "open_issues": "number",
#>             "state": "string",
#>             "title": "string",
#>             "updated_at": "string",
#>             "url": "string"
#>         },
#>         "node_id": "string",
#>         "number": "number",
#>         "repository_url": "string",
#>         "state": "string",
#>         "title": "string",
#>         "updated_at": "string",
#>         "url": "string",
#>         "user": {
#>             "avatar_url": "string",
#>             "events_url": "string",
#>             "followers_url": "string",
#>             "following_url": "string",
#>             "gists_url": "string",
#>             "gravatar_id": "string",
#>             "html_url": "string",
#>             "id": "number",
#>             "login": "string",
#>             "node_id": "string",
#>             "organizations_url": "string",
#>             "received_events_url": "string",
#>             "repos_url": "string",
#>             "site_admin": "logical",
#>             "starred_url": "string",
#>             "subscriptions_url": "string",
#>             "type": "string",
#>             "url": "string"
#>         }
#>     }
#> ]
#>
After exploring the structure of the data, we decide we want a high-level overview of the issues we have. Note that we can grab nested object detail by declaring a more complex path like jstring('assignee','login'). This avoids reaching for enter_object() where it is not necessary (a comparison follows the output below).
highlevel <- dplyr_issues %>% gather_array('index') %>%
  spread_values(id=jnumber('id')
                , assignee=jstring('assignee','login')
                , comments=jnumber('comments')
                , title=jstring('title')
                , state=jstring('state')
                , number=jnumber('number')
  )
print(highlevel)
#> # A tbl_json: 30 x 8 tibble with a "JSON" attribute
#> `attr(., "JSON"… document.id index id assignee comments title state
#> <chr> <int> <int> <dbl> <chr> <dbl> <chr> <chr>
#> 1 "{\"url\":\"htt… 1 1 5.29e8 <NA> 0 Rewo… open
#> 2 "{\"url\":\"htt… 1 2 5.28e8 <NA> 0 Use … open
#> 3 "{\"url\":\"htt… 1 3 5.27e8 <NA> 0 muta… open
#> 4 "{\"url\":\"htt… 1 4 5.27e8 <NA> 0 Add … open
#> # … with 26 more rows, and 1 more variable: number <dbl>
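For comparison, the more verbose enter_object() route might look like the sketch below. Note that enter_object('assignee') also silently drops issues that lack the key entirely, which is usually not what you want for a broad overview, whereas the path form simply yields NA.
## Verbose alternative (sketch): enter the nested object explicitly
dplyr_issues %>% gather_array('index') %>%
  enter_object('assignee') %>%
  spread_values(assignee=jstring('login'))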
And perhaps we want to look at a few different summaries. We notice that there are only 30 issues here, but anyone familiar with dplyr will know that the repo has far more issues than that. GitHub's API is paginated, so we only got the first 30 issues back from the API.
highlevel %>% group_by(assignee) %>% summarize(nissues=n())
#> # A tibble: 1 x 2
#> assignee nissues
#> <chr> <int>
#> 1 <NA> 30
highlevel %>% group_by(comments) %>% summarize(nissues=n(), issues=paste(number,collapse=',')) %>%
ungroup() %>% arrange(desc(comments))
#> # A tibble: 7 x 3
#> comments nissues issues
#> <dbl> <int> <chr>
#> 1 11 1 4595
#> 2 10 1 4586
#> 3 5 2 4603,4574
#> 4 3 1 4615
#> 5 2 5 4614,4598,4590,4589,4561
#> 6 1 1 4572
#> 7 0 19 4636,4631,4629,4628,4627,4617,4612,4605,4602,4601,4597,…
highlevel %>% group_by(state) %>% summarize(nissues=n())
#> # A tibble: 1 x 2
#> state nissues
#> <chr> <int>
#> 1 open 30
Let's aggregate a few more API calls. By default the issues endpoint returns only open issues, 30 at a time, so we pass state=all along with per_page and page query parameters. Documentation can be found in GitHub's REST API docs, in particular for the issues endpoint.
manyissues <- lapply(1:7, function(x){as.tbl_json(paste0(baseurl,'?state=all&per_page=50&page=',x))})
## Collapse into one tbl_json
manyissues <- tidyjson::bind_rows(manyissues)
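As a quick sanity check (a sketch; json_lengths() appends the length of each document's JSON array), we can confirm that we now have seven documents with up to 50 issues each:
## Sanity check (sketch): one array per page, each holding up to 50 issues
manyissues %>% json_lengths()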
## Summarize status & users that create issues
manyissues %>% gather_array('issue') %>%
  spread_values(login=jstring('user','login')
                , comments=jnumber('comments')
                , issuenum=jnumber('number')
                , state=jstring('state')
  ) %>%
  group_by(login, state) %>% summarize(issuecount=n()) %>% ungroup() %>%
  spread(state, issuecount, fill=0) %>%
  mutate(total=closed+open) %>%
  arrange(desc(total), desc(open)) %>% head(10)
#> # A tibble: 10 x 4
#> login closed open total
#> <chr> <dbl> <dbl> <dbl>
#> 1 romainfrancois 69 9 78
#> 2 hadley 8 6 14
#> 3 krlmlr 4 6 10
#> 4 lionel- 3 3 6
#> 5 batpigandme 5 1 6
#> 6 jennybc 0 4 4
#> 7 IndrajeetPatil 3 1 4
#> 8 iago-pssjd 3 1 4
#> 9 jzadra 3 1 4
#> 10 dhicks 0 3 3
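As an aside, tidyr's spread() has since been superseded by pivot_wider(); the reshaping step above could be written equivalently like this (a sketch):
## Equivalent reshaping with the newer tidyr interface (sketch)
manyissues %>% gather_array('issue') %>%
  spread_values(login=jstring('user','login'), state=jstring('state')) %>%
  group_by(login, state) %>% summarize(issuecount=n()) %>% ungroup() %>%
  pivot_wider(names_from=state, values_from=issuecount, values_fill=0) %>%
  mutate(total=closed+open) %>%
  arrange(desc(total), desc(open)) %>% head(10)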
Next, let's look at a static public API that shows location, status, and current availability for bikes in New York City's bike-sharing program.
citibike <- as.tbl_json("http://citibikenyc.com/stations/json")
## We see what we have is an object
citibike %>% json_types()
#> # A tbl_json: 1 x 2 tibble with a "JSON" attribute
#> `attr(., "JSON")` document.id type
#> <chr> <int> <fct>
#> 1 "{\"executionTime..." 1 object
## So let's explore that object
citibike %>% gather_object()
#> # A tbl_json: 2 x 2 tibble with a "JSON" attribute
#> `attr(., "JSON")` document.id name
#> <chr> <int> <chr>
#> 1 "\"2019-11-28 10:..." 1 executionTime
#> 2 "[{\"id\":281,\"sta..." 1 stationBeanList
Let’s explore the array, but store executionTime for later reference:
citibike_list <- citibike %>%
spread_values(execution=jstring(executionTime)) %>%
enter_object('stationBeanList') %>% gather_array('arrayid')
citibike_list %>%
filter(arrayid==1) %>%
json_schema() %>% prettify()
#> {
#>     "altitude": "string",
#>     "availableBikes": "number",
#>     "availableDocks": "number",
#>     "city": "string",
#>     "id": "number",
#>     "landMark": "string",
#>     "lastCommunicationTime": "string",
#>     "latitude": "number",
#>     "location": "string",
#>     "longitude": "number",
#>     "postalCode": "string",
#>     "stAddress1": "string",
#>     "stAddress2": "string",
#>     "stationName": "string",
#>     "statusKey": "number",
#>     "statusValue": "string",
#>     "testStation": "logical",
#>     "totalDocks": "number"
#> }
#>
The percentage availability of bikes and docks should be linearly related, since each dock either holds a bike or stands open: 25% of bikes available should mean roughly 75% of docks available.
citibike_available <- citibike_list %>%
  spread_values(id=jnumber(id)
                , location=jstring(location)
                , lastCommunication=jstring(lastCommunicationTime)
                , availableBikes=jnumber(availableBikes)
                , availableDocks=jnumber(availableDocks)
                , totalDocks=jnumber(totalDocks)) %>%
  mutate(openDockPct=availableDocks / totalDocks
         , bikeDockPct=availableBikes / totalDocks
         ## explicit units avoid ambiguity in raw datetime subtraction
         , timeSinceUpdateMinutes=as.numeric(difftime(as_datetime(execution)
                                                      , as_datetime(lastCommunication)
                                                      , units='mins'))
         , timeSinceUpdateBin=cut(timeSinceUpdateMinutes
                                  , c(0,1,15,60,6*60,24*60,Inf)
                                  , labels=c('0-1 Min','1-15 Min'
                                             , '15 Min - 1 Hr'
                                             , '1-6 Hr'
                                             , '6-24 Hr'
                                             , '24+ Hr'))
  )
## Expect generally linear behavior
ggplot(citibike_available, aes(openDockPct, bikeDockPct)) + geom_point()
#> Warning: Removed 7 rows containing missing values (geom_point).
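The warning tells us a handful of stations produced missing percentages; presumably (an assumption worth verifying) these are stations reporting totalDocks of zero, where the division yields NaN. We can inspect them like so (a sketch):
## Inspect the rows dropped from the plot (sketch)
citibike_available %>%
  filter(!is.finite(openDockPct) | !is.finite(bikeDockPct)) %>%
  select(id, location, totalDocks, availableBikes, availableDocks)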
And if we are in the process of exploring New York City, we probably care about how many actual bikes / docks are available, and how up-to-date that information is.
ggplot(citibike_available, aes(availableBikes, availableDocks, col=timeSinceUpdateBin)) +
geom_point()
Remember that our object is still a tbl_json object, so we can go back and grab additional keys if necessary. What if we wanted to map the data for easier use while we explore the city?
citibike_map <- citibike_available %>%
spread_values(lat=jnumber(latitude)
, long=jnumber(longitude))
citibike_map %>% group_by(is.na(lat),is.na(long)) %>% summarize(n())
#> # A tibble: 1 x 3
#> # Groups: is.na(lat) [1]
#> `is.na(lat)` `is.na(long)` `n()`
#> <lgl> <lgl> <int>
#> 1 FALSE FALSE 922
It looks like the coordinates are fully populated, so we should be good to go! An actual map is a feature we plan to add to this vignette in the future; data analysis is always more fun with quality visualizations.
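In the meantime, a rough coordinate scatter (a sketch with no basemap) gives a quick sense of station coverage:
## Quick spatial sketch (no basemap): color stations by bike availability
ggplot(citibike_map, aes(long, lat, col=availableBikes > 0)) +
  geom_point(alpha=0.5)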
One last point of note: what if our pipeline above were automated and we received a bad response?
citibike_list_0 <- '{}' %>%
spread_values(execution=jstring(executionTime)) %>%
enter_object('stationBeanList') %>% gather_array('arrayid')
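Because '{}' contains no stationBeanList key, enter_object() silently drops the document, leaving a zero-row tbl_json rather than throwing an error:
## We expect zero rows: there was no 'stationBeanList' to enter
nrow(citibike_list_0)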
citibike_available_0 <- citibike_list_0 %>%
  spread_values(id=jnumber(id)
                , location=jstring(location)
                , lastCommunication=jstring(lastCommunicationTime)
                , availableBikes=jnumber(availableBikes)
                , availableDocks=jnumber(availableDocks)
                , totalDocks=jnumber(totalDocks)) %>%
  mutate(openDockPct=availableDocks / totalDocks
         , bikeDockPct=availableBikes / totalDocks
         ## same explicit-units fix as above
         , timeSinceUpdateMinutes=as.numeric(difftime(as_datetime(execution)
                                                      , as_datetime(lastCommunication)
                                                      , units='mins'))
         , timeSinceUpdateBin=cut(timeSinceUpdateMinutes
                                  , c(0,1,15,60,6*60,24*60,Inf)
                                  , labels=c('0-1 Min','1-15 Min'
                                             , '15 Min - 1 Hr'
                                             , '1-6 Hr'
                                             , '6-24 Hr'
                                             , '24+ Hr'))
  )
ggplot(citibike_available_0, aes(availableBikes, availableDocks, col=timeSinceUpdateBin)) +
geom_point()
While some may prefer an error (and it would be easy enough to check for one and raise our own using a package like assertthat), this is a powerful feature of the tidyjson package: it allows us to be sure of the structure of the data we get from parsing the JSON object.
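For example, a guard along these lines (a sketch using assertthat; the check and message are our own) would fail fast on an empty result:
## Hypothetical guard (sketch): stop an automated pipeline if no stations came back
library(assertthat)
assert_that(nrow(citibike_list_0) > 0, msg='API returned no stations')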
So if the API changes its schema, or if the response you receive does not have sufficient data, you can rest assured that the resulting data structure will conform to the specifications you provide and stay tidy. For further information, see the documentation on spread_values() (which explicitly defines the data structure you will create) and spread_all() (which is easier to use when exploring interactively).