2018-07-24

DelayedDataFrame class

  • Extends: DataFrame
library(DelayedDataFrame)
(ddf <- DelayedDataFrame())
## DelayedDataFrame with 0 rows and 0 columns
  • lazyIndex slot
    Saves the mapping indexes for each column.
lazyIndex(ddf)
## LazyIndex of length 0
## list()
## index of each column: 
## integer(0)

Constructor:

Each argument in "..." is coerced to a DataFrame and combined column-wise.

DelayedDataFrame(..., row.names=FALSE, check.names=TRUE)
(obj <- DelayedDataFrame(letters, LETTERS, row.names=LETTERS))
## DelayedDataFrame with 26 rows and 2 columns
##         letters     LETTERS
##     <character> <character>
## A             a           A
## B             b           B
## C             c           C
## ...         ...         ...
## X             x           X
## Y             y           Y
## Z             z           Z

lazyIndex(obj)
## LazyIndex of length 1
## [[1]]
## NULL
## 
## index of each column: 
## [1] 1 1

On-disk data representation in DataFrame format

  • Genomic Data Structure (GDS)

CoreArray Genomic Data Structure (GDS) is designed for large-scale datasets, especially data that are much larger than the available random-access memory. The Bioconductor package gdsfmt provides a high-level R interface to GDS.

file <- SeqArray::seqExampleFileName("gds")
f <- gdsfmt::openfn.gds(file)
f
## File: /home/qian/R/x86_64-pc-linux-gnu-library/3.5/SeqArray/extdata/CEU_Exon.gds (299.1K)
## +    [  ] *
## |--+ description   [  ] *
## |--+ sample.id   { Str8 90 LZMA_ra(35.8%), 258B } *
## |--+ variant.id   { Int32 1348 LZMA_ra(16.8%), 906B } *
## |--+ position   { Int32 1348 LZMA_ra(64.6%), 3.4K } *
## |--+ chromosome   { Str8 1348 LZMA_ra(4.63%), 158B } *
## |--+ allele   { Str8 1348 LZMA_ra(16.7%), 902B } *
## |--+ genotype   [  ] *
## |  |--+ data   { Bit2 2x90x1348 LZMA_ra(26.3%), 15.6K } *
## |  |--+ ~data   { Bit2 2x1348x90 LZMA_ra(29.3%), 17.3K }
## |  |--+ extra.index   { Int32 3x0 LZMA_ra, 19B } *
## |  \--+ extra   { Int16 0 LZMA_ra, 19B }
## |--+ phase   [  ]
## |  |--+ data   { Bit1 90x1348 LZMA_ra(0.91%), 138B } *
## |  |--+ ~data   { Bit1 1348x90 LZMA_ra(0.91%), 138B }
## |  |--+ extra.index   { Int32 3x0 LZMA_ra, 19B } *
## |  \--+ extra   { Bit1 0 LZMA_ra, 19B }
## |--+ annotation   [  ]
## |  |--+ id   { Str8 1348 LZMA_ra(38.4%), 5.5K } *
## |  |--+ qual   { Float32 1348 LZMA_ra(2.26%), 122B } *
## |  |--+ filter   { Int32,factor 1348 LZMA_ra(2.26%), 122B } *
## |  |--+ info   [  ]
## |  |  |--+ AA   { Str8 1348 LZMA_ra(25.6%), 690B } *
## |  |  |--+ AC   { Int32 1348 LZMA_ra(24.2%), 1.3K } *
## |  |  |--+ AN   { Int32 1348 LZMA_ra(19.8%), 1.0K } *
## |  |  |--+ DP   { Int32 1348 LZMA_ra(47.9%), 2.5K } *
## |  |  |--+ HM2   { Bit1 1348 LZMA_ra(150.3%), 254B } *
## |  |  |--+ HM3   { Bit1 1348 LZMA_ra(150.3%), 254B } *
## |  |  |--+ OR   { Str8 1348 LZMA_ra(20.1%), 342B } *
## |  |  |--+ GP   { Str8 1348 LZMA_ra(24.4%), 3.8K } *
## |  |  \--+ BN   { Int32 1348 LZMA_ra(20.9%), 1.1K } *
## |  \--+ format   [  ]
## |     \--+ DP   [  ] *
## |        |--+ data   { Int32 90x1348 LZMA_ra(25.1%), 118.8K } *
## |        \--+ ~data   { Int32 1348x90 LZMA_ra(24.1%), 114.2K }
## \--+ sample.annotation   [  ]
##    \--+ family   { Str8 90 LZMA_ra(57.1%), 222B }

  • GDSArray

GDSArray is an R/Bioconductor package that represents GDS files as DelayedArray instances.

library(GDSArray)
gdsnodes(file)
##  [1] "description"                "sample.id"                 
##  [3] "variant.id"                 "position"                  
##  [5] "chromosome"                 "allele"                    
##  [7] "genotype/data"              "genotype/~data"            
##  [9] "genotype/extra.index"       "genotype/extra"            
## [11] "phase/data"                 "phase/~data"               
## [13] "phase/extra.index"          "phase/extra"               
## [15] "annotation/id"              "annotation/qual"           
## [17] "annotation/filter"          "annotation/info/AA"        
## [19] "annotation/info/AC"         "annotation/info/AN"        
## [21] "annotation/info/DP"         "annotation/info/HM2"       
## [23] "annotation/info/HM3"        "annotation/info/OR"        
## [25] "annotation/info/GP"         "annotation/info/BN"        
## [27] "annotation/format/DP/data"  "annotation/format/DP/~data"
## [29] "sample.annotation/family"

Use GDSArray to represent the GDS nodes for variant annotation.

varid <- GDSArray(file, "annotation/id")
AA <- GDSArray(file, "annotation/info/AA")
varid
## <1348> GDSArray object of type "character":
##          1             2             3             .             1347 
## "rs111751804" "rs114390380" "rs1320571"               . "rs116581756" 
##          1348 
## "rs5771206"

seed(varid)
## GDSArraySeed
## gds file: /home/qian/R/x86_64-pc-linux-gnu-library/3.5/SeqArray/extdata/CEU_Exon.gds
## array data: annotation/id
## dim: 1348

Construct a DelayedDataFrame object with GDSArray columns.

(ddf <- DelayedDataFrame(varid, AA))
## DelayedDataFrame with 1348 rows and 2 columns
##            varid         AA
##       <GDSArray> <GDSArray>
## 1    rs111751804          T
## 2    rs114390380          G
## 3      rs1320571          A
## ...          ...        ...
## 1346   rs8135982          C
## 1347 rs116581756          G
## 1348   rs5771206          G

Subsetting

  • listData: unchanged
(ddf1 <- ddf[1:20,])
## DelayedDataFrame with 20 rows and 2 columns
##              varid             AA
##     <DelayedArray> <DelayedArray>
## 1      rs111751804              T
## 2      rs114390380              G
## 3        rs1320571              A
## ...            ...            ...
## 18     rs115614983              T
## 19      rs61751002              C
## 20       rs6691840              C
identical(ddf@listData, ddf1@listData)
## [1] TRUE

  • lazyIndex: updated
  • nrows: updated
  • rownames: updated (if not NULL)
lazyIndex(ddf1)
## LazyIndex of length 1
## [[1]]
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
## 
## index of each column: 
## [1] 1 1
nrow(ddf1)
## [1] 20

lazyIndex realization

as(ddf1, "DataFrame")
## DataFrame with 20 rows and 2 columns
##              varid             AA
##     <DelayedArray> <DelayedArray>
## 1      rs111751804              T
## 2      rs114390380              G
## 3        rs1320571              A
## ...            ...            ...
## 18     rs115614983              T
## 19      rs61751002              C
## 20       rs6691840              C

Availability

The development version is available for download from GitHub:

devtools::install_github("Bioconductor/DelayedDataFrame")