The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.
This package allows reading large text
tables in chunks, using a fast C++ backend. Text files can be imported
as data frames (with an option for automatic column type detection) or
matrices. The program is designed to be simple and user-friendly.
chunkR is based on three basic functions: chunker(), to create a connection to a text file; next_chunk(), to read the next chunk; and get_table(), to retrieve the corresponding data chunk.
These functions can be easily included in loops and other source code by using the return value of next_chunk(), which is TRUE when a new chunk is available and FALSE when the file has been read completely. The get_table() function returns an empty data frame/matrix when next_chunk() has returned FALSE. See the examples below.
# Install the development version of chunkR from GitHub.
library(devtools)
install_github("leandroroser/chunkR")

# Attach the package so that chunker(), next_chunk(), get_table(),
# get_completed() and matrix2df() used in the examples below are available.
library(chunkR)
data(iris)

# Write iris to a temporary text file. write.table() uses a single space as
# its default separator. Note that quote is set to FALSE.
tmp_path <- file.path(tempdir(), "iris.txt")
write.table(iris, tmp_path, quote = FALSE)
#-----------------------------------------------------------------#
#--- Reading a data frame with automatic column-type detection ---#
#-----------------------------------------------------------------#

# Create a 'chunker' object, passing the path of the input file.
# Each call to next_chunk() will read up to 30 lines.
my_chunker_object <- chunker(tmp_path, chunksize = 30)

# Read a chunk (returns TRUE while a new chunk is available).
next_chunk(my_chunker_object)

# Get the chunk that was just read.
get_table(my_chunker_object)

# Read another chunk.
next_chunk(my_chunker_object)

# Get the number of lines already read.
get_completed(my_chunker_object)
#---- Quoted data --------#

# Rewrite the file with quoted fields and tell the chunker to expect quotes.
write.table(iris, tmp_path, quote = TRUE)
my_chunker_object <- chunker(tmp_path, quoted = TRUE, chunksize = 30)

next_chunk(my_chunker_object)
get_table(my_chunker_object)
#---- Data without rownames and/or colnames ----#

# Write the table without row names or a header line.
tmp_path <- file.path(tempdir(), "iris.txt")
write.table(iris, tmp_path, row.names = FALSE, col.names = FALSE)

# Tell the chunker that the file carries neither row names nor column names.
my_chunker_object2 <- chunker(
  tmp_path,
  quoted = TRUE,
  chunksize = 30,
  has_rownames = FALSE,
  has_colnames = FALSE
)

next_chunk(my_chunker_object2)
get_table(my_chunker_object2) # automatic generation of rownames and/or colnames
#--- read a csv file ---#

tmp_path_csv <- file.path(tempdir(), "iris.csv")

write.table(iris, tmp_path_csv, quote = FALSE, sep = ",")

# Read the csv, indicating the value of the sep parameter.
my_chunker_object3 <- chunker(tmp_path_csv, chunksize = 30, sep = ",")

# The file can then be processed in the same way as in the previous examples.
next_chunk(my_chunker_object3)
get_table(my_chunker_object3)

# Remove the temporary file.
file.remove(tmp_path_csv)
#--------------------------------------------------------#
#--- Reading a data frame using column types argument ---#
#--------------------------------------------------------#

## Four types can be passed : "character", "numeric" (aka "double"), "integer", "logical"

# Create a 'chunker' object, passing the path of the input file and one
# class per column instead of relying on automatic type detection.
my_chunker_object4 <- chunker(
  tmp_path,
  chunksize = 120,
  columns_classes = c("numeric", "numeric", "numeric", "numeric", "character")
)

# Read a chunk.
next_chunk(my_chunker_object4)

# Get the chunk.
get_table(my_chunker_object4)

# Read another chunk.
next_chunk(my_chunker_object4)

# Get the number of lines already read.
get_completed(my_chunker_object4)
#-------------------------#
#--- Reading a matrix ----#
#-------------------------#

my_chunker_object5 <- chunker(tmp_path, chunksize = 30, data_format = "matrix")

# Read a chunk.
next_chunk(my_chunker_object5)

# Store the chunk as a character matrix in R.
this_data <- get_table(my_chunker_object5)

# The package provides a fast generic C++ function for conversion from
# matrix (any R type) to data frame.
this_data_as_df2 <- matrix2df(this_data)

# Remove the temporary file.
file.remove(tmp_path)
#----------------------------------#
#--- Example with a big table -----#
#----------------------------------#

### Example with a data frame

# Create a large data frame (one million rows, one column per basic type)
# and write it to a file in the temporary directory.
tmp_path <- file.path(tempdir(), "big_table.txt")

out <- data.frame(
  numeric_data = runif(1000000),
  character_data = sample(c("a", "t", "c", "g"), 1000000, replace = TRUE),
  integer_data = sample(1000000),
  bool_data = sample(c(TRUE, FALSE), 1000000, replace = TRUE)
)

write.table(out, tmp_path, quote = FALSE)

# Create a chunker object, reading in chunks of 10000 lines.
my_chunker_object6 <- chunker(tmp_path, chunksize = 10000)

next_chunk(my_chunker_object6)
data <- get_table(my_chunker_object6)

# Check the storage type detected for each column.
lapply(data, typeof)

# Remove the temporary file.
file.remove(tmp_path)
### Example with a matrix

# Create a large character matrix and write it to a temporary file.
my_table <- tempfile()
write.table(
  matrix(sample(c("a", "t", "c", "g"), 1000000, replace = TRUE), 100000, 1000),
  my_table,
  quote = FALSE
)

# Create a chunker object, reading in chunks of 10000 lines.
my_chunker_object7 <- chunker(my_table, chunksize = 10000, data_format = "matrix")

# Loop over the whole file: next_chunk() returns TRUE while a new chunk is
# available and FALSE once the file has been read completely.
lines <- 0
while (next_chunk(my_chunker_object7)) {
  data <- get_table(my_chunker_object7)

  # Do something with data, e.g., convert to data frame first.
  data <- matrix2df(data)

  # Keep a running total of the rows processed so far.
  lines <- lines + nrow(data)
  cat("Processed ", lines, "lines\n")
}

# Remove the temporary file.
file.remove(my_table)
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.