library(dataset)
library(declared)
You need the latest development version of declared.
# Install the development version of declared from GitHub.
remotes::install_github('dusadrian/declared')
The `survey` class will be derived from the `dataset` class.
# Build a tiny example survey of five respondents, one vector per column.

# Respondent identifiers (plain character vector).
obs_id <- c(
  "Saschia Iemand", "Jane Doe", "Jack Doe", "Pim Iemand", "Matti Virtanen"
)

# Sex, with -1 (DK) declared as a missing value.
sex <- declared(
  c(1, 1, 0, -1, 1),
  labels = c(Male = 0, Female = 1, DK = -1),
  na_values = -1
)

# Geographical codes (plain character vector).
geo <- c("NL-ZH", "IE-05", "GB-NIR", "NL-ZH", "FI1C")

# NOTE(review): DK = -1 is labelled below for difficulty_bills and age_exact
# but is not listed in na_values, so -1 is treated as a regular value (the
# summary() output for these columns reports Min. -1) -- confirm this is
# intended.
difficulty_bills <- declared(
  c(0, 1, 2, -1, 0),
  labels = c(Never = 0, Time_to_time = 1, Always = 2, DK = -1)
)

age_exact <- declared(
  c(34, 45, 21, 55, -1),
  labels = c(A = 34, A = 45, A = 21, A = 55, DK = -1)
)

# Spotify listening, with 9 (Inap) declared as a missing value.
listen_spotify <- declared(
  c(0, 1, 9, 0, 1),
  labels = c(No = 0, Yes = 1, Inap = 9),
  na_values = 9
)

# Assemble the columns into a regular data frame.
raw_survey <- data.frame(
  obs_id = obs_id,
  geo = geo,
  listen_spotify = listen_spotify,
  sex = sex,
  age_exact = age_exact,
  difficulty_bills = difficulty_bills
)
# Create the dataset object from the raw survey data frame: "geo" is the only
# dimension and the four declared columns are the measures.
survey_dataset <- dataset(
  x = raw_survey,
  Dimensions = "geo",
  Measures = c("listen_spotify", "sex", "age_exact", "difficulty_bills"),
  Attributes = NULL,
  sdmx_attributes = "geo",
  Title = "Tiny Survey",
  Creator = person("Jane", "Doe")
)

# Print the metadata recorded on the dataset object.
datacite(survey_dataset)
#> $names
#> [1] "obs_id" "geo" "listen_spotify" "sex"
#> [5] "age_exact" "difficulty_bills"
#>
#> $dimensions
#> names class
#> geo geo character
#> isDefinedBy
#> geo https://purl.org/linked-data/cube|https://raw.githubusercontent.com/UKGovLD/publishing-statistical-data/master/specs/src/main/vocab/sdmx-attribute.ttl
#> codeList
#> geo not yet defined
#>
#> $measures
#> names class
#> listen_spotify listen_spotify declared|integer
#> sex sex declared|integer
#> age_exact age_exact declared|integer
#> difficulty_bills difficulty_bills declared|integer
#> isDefinedBy codeListe
#> listen_spotify https://purl.org/linked-data/cube not yet defined
#> sex https://purl.org/linked-data/cube not yet defined
#> age_exact https://purl.org/linked-data/cube not yet defined
#> difficulty_bills https://purl.org/linked-data/cube not yet defined
#>
#> $attributes
#> [1] names class isDefinedBy codeListe
#> <0 rows> (or 0-length row.names)
#>
#> $Type
#> resourceType resourceTypeGeneral
#> 1 DCMITYPE:Dataset Dataset
#>
#> $Title
#> Title titleType
#> 1 Tiny Survey Title
#>
#> $Identifier
#> [1] NA
#>
#> $Creator
#> [1] "Jane Doe"
#>
#> $Source
#> [1] NA
#>
#> $Publisher
#> [1] NA
#>
#> $Rights
#> [1] NA
#>
#> $Description
#> [1] NA
#>
#> $Size
#> [1] "16.01 kB [15.63 KiB]"
#>
#> $Date
#> [1] "2022-12-01"
It is good practice to define labels that are valid but not yet present in
`declared`, because in the retrospective harmonization workflow the data may
be concatenated (bound) together with further observations that do use the
currently unused label. In this example, the `DK` or declined label is not in
use.
# This is not valid in declared
listen_spotify <- declared(
  c(0, 1, 9, 0, 1),
  labels = c(No = 0, Yes = 1, Inap = 9, DK = -1),
  na_values = c(9, -1)
)
print(listen_spotify)
#> <declared<integer>[5]>
#> [1] 0 1 NA(9) 0 1
#> Missing values: 9
#>
#> Labels:
#> value label
#> 0 No
#> 1 Yes
#> 9 Inap
# Concatenate with three further observations that do use the DK label.
c(
  listen_spotify,
  declared(
    c(-1, -1, -1),
    labels = c(No = 0, Yes = 1, Inap = 9, DK = -1)
  )
)
#> <declared<integer>[8]>
#> [1] 0 1 NA(9) 0 1 -1 -1 -1
#> Missing values: 9
#>
#> Labels:
#> value label
#> -1 DK
#> 0 No
#> 1 Yes
#> 9 Inap
# Summary statistics of the declared vector; the output below reports one NA.
summary(listen_spotify)
#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#> 0.0 0.0 0.5 0.5 1.0 1.0 1
# Add descriptive metadata to the dataset object and store the result back
# in survey_dataset.
survey_dataset <- datacite_add(
  survey_dataset,
  Title = "Tiny Survey",
  Creator = person("Daniel", "Antal"),
  Identifier = "https://doi.org/xxxx.yyyyy",
  Publisher = "Reprex",
  PublicationYear = 2022,
  Subject = "Surveys",
  Language = "en"
)
The `survey` class inherits elements of the `dataset` class, but it will be
more strictly defined. I am considering making every single column
`declared`, except for `obs_id`. Even numeric types with `Inap` and `DK`
would map nicely to the `CL_OBS_STATUS` SDMX codes, which make missing
observations explicit and try to categorize them.
# Print the metadata again to see the values set with datacite_add().
datacite(survey_dataset)
#> $names
#> [1] "obs_id" "geo" "listen_spotify" "sex"
#> [5] "age_exact" "difficulty_bills"
#>
#> $dimensions
#> names class
#> geo geo character
#> isDefinedBy
#> geo https://purl.org/linked-data/cube|https://raw.githubusercontent.com/UKGovLD/publishing-statistical-data/master/specs/src/main/vocab/sdmx-attribute.ttl
#> codeList
#> geo not yet defined
#>
#> $measures
#> names class
#> listen_spotify listen_spotify declared|integer
#> sex sex declared|integer
#> age_exact age_exact declared|integer
#> difficulty_bills difficulty_bills declared|integer
#> isDefinedBy codeListe
#> listen_spotify https://purl.org/linked-data/cube not yet defined
#> sex https://purl.org/linked-data/cube not yet defined
#> age_exact https://purl.org/linked-data/cube not yet defined
#> difficulty_bills https://purl.org/linked-data/cube not yet defined
#>
#> $attributes
#> [1] names class isDefinedBy codeListe
#> <0 rows> (or 0-length row.names)
#>
#> $Type
#> resourceType resourceTypeGeneral
#> 1 Dataset Dataset
#>
#> $Title
#> Title titleType
#> 1 Tiny Survey Title
#>
#> $Identifier
#> [1] "https://doi.org/xxxx.yyyyy"
#>
#> $Creator
#> [1] "Daniel Antal"
#>
#> $Source
#> [1] NA
#>
#> $Publisher
#> [1] "Reprex"
#>
#> $Size
#> [1] "18.59 kB [18.16 KiB]"
#>
#> $Date
#> [1] "2022-12-01"
#>
#> $Subject
#> term subjectScheme schemeURI valueURI
#> 1 Surveys <NA> <NA> <NA>
#>
#> $Issued
#> [1] 2022
#>
#> $publication_year
#> [1] 2022
#>
#> $Geolocation
#> [1] NA
#>
#> $Language
#> [1] "eng"
Is the `summary` method implemented for `declared`? Both `dataset` and
`survey` will need new `print` and `summary` methods.
# Column-by-column summary of the survey dataset.
summary(survey_dataset)
#> obs_id geo listen_spotify sex
#> Length:5 Length:5 Min. :0.0 Min. :0.00
#> Class :character Class :character 1st Qu.:0.0 1st Qu.:0.75
#> Mode :character Mode :character Median :0.5 Median :1.00
#> Mean :0.5 Mean :0.75
#> 3rd Qu.:1.0 3rd Qu.:1.00
#> Max. :1.0 Max. :1.00
#> NA's :1 NA's :1
#> age_exact difficulty_bills
#> Min. :-1.0 Min. :-1.0
#> 1st Qu.:21.0 1st Qu.: 0.0
#> Median :34.0 Median : 0.0
#> Mean :30.8 Mean : 0.4
#> 3rd Qu.:45.0 3rd Qu.: 1.0
#> Max. :55.0 Max. : 2.0
#>
The `survey` (should) contain the entire processing history from creation,
and optionally the DataCite schema for publication created with
`datacite_add()`. A similar `dublincore_add` function uses the Dublin Core
metadata definitions.
Eventually, a connection to the zen4R package will make sure that a correctly described dataset can get a Zenodo record, receive a DOI, have that DOI recorded in the object, and be uploaded to Zenodo.