Generate parallel random numbers on GPU

The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

library('clrng')

runifGpu

get_system_info()

## $`1. Operating System`
## [1] "Linux"
## 
## $`2. CPU`
## [1] "Intel(R) Xeon(R) W-2145 CPU @ 3.70GHz"
## 
## $`3. GPU`
## [1] "\020n\r\xaa9\177"       "\020n\r\xaa9\177"       "gfx906:sramecc+:xnack-"
## 
## $`4. OpenCL Version`
## [1] "OpenCL 3.0 LINUX"                                         
## [2] "OpenCL 1.2 Intel(R) FPGA SDK for OpenCL(TM), Version 20.3"
## [3] "OpenCL 2.1 AMD-APP (3614.0)"

if (detectGPUs()) {

  ## show all the available platforms
  gpuR::listContexts()[,'platform']
  
  ## set the context and show the device we are using
  setContext(grep("gpu", listContexts()$device_type)[1])
  currentDevice()
  
  ## set the initial seed of the package/first stream
  setBaseCreator(rep(12235,6))
  
  ## check the size of work items and GPU precision type at the moment
  getOption('clrng.Nglobal')
  getOption('clrng.type')
  
  ## create default number of streams on GPU
  myStreamsGpu = clrng::createStreamsGpu()
  ## show its dimention
  dim(myStreamsGpu)
  ## create 10 uniform random numbers using the streams and show detailed backend code
  as.vector(clrng::runifGpu(10, myStreamsGpu, verbose=2))
  
  # generating a 100 x 100 matrix of random uniforms
  b<-clrng::runifGpu(c(100,100), myStreamsGpu)
  bvector<-as.vector(as.matrix(b))
  
  # plot the histogram of the uniform random numbers
  hist(bvector, breaks=40)
  # check its quantiles
  quantile(bvector)

  } else {
  message("No GPU detected. Skipping GPU-dependent code.")
}

## 
## #pragma OPENCL EXTENSION cl_khr_fp64 : enable
## #define PI_2 M_PI_2
## #define TWOPI 6.283185307179586231996
## #define mrg31k3p_NORM_cl 4.656612873077392578125e-10
## //TWOPI * mrg31k3p_NORM_cl
## #define TWOPI_mrg31k3p_NORM_cl 2.925836158534319248049e-09
## 
## 
## #define Nrow 10
## #define Ncol 1
## #define NpadStreams 128
## #define NpadCol 128
## #define mrg31k3p_M1 2147483647
## #define mrg31k3p_M2 2147462579
## #define mrg31k3p_MASK12 511  
## #define mrg31k3p_MASK13 16777215  
## #define mrg31k3p_MASK2 65535     
## #define mrg31k3p_MULT2 21069
## 
## void streamsToPrivate(__global int* streams, uint* g1, uint* g2, const int start){
##   int Drow, Dcol, DrowStart;  for(Drow = 0, DrowStart = start, Dcol = DrowStart + 3;
##     Drow < 3; Drow++, DrowStart++, Dcol++){
##     g1[Drow] = streams[DrowStart];
##     g2[Drow] = streams[Dcol];
##   }
## }
## 
## void streamsFromPrivate(__global int* streams, uint* g1, uint* g2,  const int start){
##  int Drow, Dcol, DrowStart; for(Drow = 0,DrowStart = start, Dcol = DrowStart + 3;
##      Drow < 3; Drow++, DrowStart++, Dcol++){
##    streams[DrowStart] = g1[Drow];
##    streams[Dcol] = g2[Drow];
##  }
## }
## 
## uint clrngMrg31k3pNextState(uint *g1, uint *g2) {
## 	uint y1, y2;
## 	y1 = ((g1[1] & mrg31k3p_MASK12) << 22) + (g1[1] >> 9)
## 		+ ((g1[2] & mrg31k3p_MASK13) << 7) + (g1[2] >> 24);
## 	if (y1 >= mrg31k3p_M1)
## 		y1 -= mrg31k3p_M1;
## 	y1 += g1[2];
## 	if (y1 >= mrg31k3p_M1)
## 		y1 -= mrg31k3p_M1;
## 	g1[2] = g1[1];
## 	g1[1] = g1[0];
## 	g1[0] = y1;
## 	y1 = ((g2[0] & mrg31k3p_MASK2) << 15) + (mrg31k3p_MULT2 * (g2[0] >> 16));
## 	if (y1 >= mrg31k3p_M2)
## 		y1 -= mrg31k3p_M2;
## 	y2 = ((g2[2] & mrg31k3p_MASK2) << 15) + (mrg31k3p_MULT2 * (g2[2] >> 16));
## 	if (y2 >= mrg31k3p_M2)
## 		y2 -= mrg31k3p_M2;
## 	y2 += g2[2];
## 	if (y2 >= mrg31k3p_M2)
## 		y2 -= mrg31k3p_M2;
## 	y2 += y1;
## 	if (y2 >= mrg31k3p_M2)
## 		y2 -= mrg31k3p_M2;
## 	g2[2] = g2[1];
## 	g2[1] = g2[0];
## 	g2[0] = y2;
## 	if (g1[0] <= g2[0]){
## 		return (g1[0] - g2[0] + mrg31k3p_M1);
## 	} else {
## 		return(g1[0] - g2[0]);
##  }
## }
## 
## 
## 
## __kernel void mrg31k3pMatrix(
##   __global int* streams,
##   __global double* out){
## 
## const int index = get_global_id(0)*get_global_size(1) + get_global_id(1);
## int Drow, Dcol, DrowStart, Dentry, DrowBlock, DcolBlock, DrowInBounds;
## const int DrowStartInc = get_global_size(0) * NpadCol;
## uint g1[3], g2[3];
## const int startvalue=index * NpadStreams;
## double temp;
## const double fact = mrg31k3p_NORM_cl;
## streamsToPrivate(streams,g1,g2,startvalue);
## for(DrowBlock = 0, Drow=get_global_id(0), DrowStart = Drow * NpadCol;
##     DrowBlock < Nrow;
##     Drow += get_global_size(0), DrowBlock +=get_global_size(0), DrowStart += DrowStartInc) {
##     DrowInBounds = Drow < Nrow;
##     for(DcolBlock = 0, Dcol=get_global_id(1), Dentry = DrowStart + Dcol;
##         DcolBlock < Ncol;
##         DcolBlock += get_global_size(1), Dentry += get_global_size(1) ) {
##       temp = fact * clrngMrg31k3pNextState(g1, g2);
##         if(DrowInBounds) out[Dentry] = temp;
##   }//Dcol
## }//Drow
## streamsFromPrivate(streams,g1,g2,startvalue);
## }//kernel

plot of chunk unif

##           0%          25%          50%          75%         100% 
## 4.066154e-06 2.527364e-01 5.037022e-01 7.535758e-01 9.999677e-01

rnormGpu

if (detectGPUs()) {
  
  setContext(grep("gpu", listContexts()$device_type)[1])
  ## configure the size of work items
  options(clrng.Nglobal = c(128,64))
  
  ## create default (here is 128*64) number of streams on GPU
  myStreamsGpu = clrng::createStreamsGpu()
  ## check its dimension
  dim(myStreamsGpu)
  
  ## generate a vector of 10 random normal numbers on GPU, and print out the kernel code
  as.vector(clrng::rnormGpu(10, myStreamsGpu, verbose=2))
  ## generate a 4x4 matrix of random normal numbers on GPU, still using the ctreated streams
  as.matrix(clrng::rnormGpu(c(4, 4), myStreamsGpu))
  
  ## create many new streams and generate a 1024x512 matrix of normal random numbers, with specified Nglobal
  streamsGpu2 <- createStreamsGpu(n =512*128)
  a<-clrng::rnormGpu(c(1024,512), streams=streamsGpu2, Nglobal=c(512,128))
  
  ## see the histogram of the produced normal random numbers
  avector<-as.vector(as.matrix(a))
  hist(avector,breaks=40)
  
  # do a Q-Q plot with quantiles computed on GPU by clrng package
  clrng::qqnormGpu(avector)
  # can also do the Q-Q plot calculation on CPU by stats::qqnorm
  stats::qqnorm(avector)

} else {
  message("No GPU detected. Skipping GPU-dependent code.")
}

## 
## #pragma OPENCL EXTENSION cl_khr_fp64 : enable
## #define PI_2 M_PI_2
## #define TWOPI 6.283185307179586231996
## #define mrg31k3p_NORM_cl 4.656612873077392578125e-10
## //TWOPI * mrg31k3p_NORM_cl
## #define TWOPI_mrg31k3p_NORM_cl 2.925836158534319248049e-09
## 
## 
## #define Nrow 1
## #define Ncol 10
## #define NpadStreams 128
## #define NpadCol 128
## #define mrg31k3p_M1 2147483647
## #define mrg31k3p_M2 2147462579
## #define mrg31k3p_MASK12 511  
## #define mrg31k3p_MASK13 16777215  
## #define mrg31k3p_MASK2 65535     
## #define mrg31k3p_MULT2 21069
## 
## void streamsToPrivate(__global int* streams, uint* g1, uint* g2, const int start){
##   int Drow, Dcol, DrowStart;  for(Drow = 0, DrowStart = start, Dcol = DrowStart + 3;
##     Drow < 3; Drow++, DrowStart++, Dcol++){
##     g1[Drow] = streams[DrowStart];
##     g2[Drow] = streams[Dcol];
##   }
## }
## 
## void streamsFromPrivate(__global int* streams, uint* g1, uint* g2,  const int start){
##  int Drow, Dcol, DrowStart; for(Drow = 0,DrowStart = start, Dcol = DrowStart + 3;
##      Drow < 3; Drow++, DrowStart++, Dcol++){
##    streams[DrowStart] = g1[Drow];
##    streams[Dcol] = g2[Drow];
##  }
## }
## 
## uint clrngMrg31k3pNextState(uint *g1, uint *g2) {
## 	uint y1, y2;
## 	y1 = ((g1[1] & mrg31k3p_MASK12) << 22) + (g1[1] >> 9)
## 		+ ((g1[2] & mrg31k3p_MASK13) << 7) + (g1[2] >> 24);
## 	if (y1 >= mrg31k3p_M1)
## 		y1 -= mrg31k3p_M1;
## 	y1 += g1[2];
## 	if (y1 >= mrg31k3p_M1)
## 		y1 -= mrg31k3p_M1;
## 	g1[2] = g1[1];
## 	g1[1] = g1[0];
## 	g1[0] = y1;
## 	y1 = ((g2[0] & mrg31k3p_MASK2) << 15) + (mrg31k3p_MULT2 * (g2[0] >> 16));
## 	if (y1 >= mrg31k3p_M2)
## 		y1 -= mrg31k3p_M2;
## 	y2 = ((g2[2] & mrg31k3p_MASK2) << 15) + (mrg31k3p_MULT2 * (g2[2] >> 16));
## 	if (y2 >= mrg31k3p_M2)
## 		y2 -= mrg31k3p_M2;
## 	y2 += g2[2];
## 	if (y2 >= mrg31k3p_M2)
## 		y2 -= mrg31k3p_M2;
## 	y2 += y1;
## 	if (y2 >= mrg31k3p_M2)
## 		y2 -= mrg31k3p_M2;
## 	g2[2] = g2[1];
## 	g2[1] = g2[0];
## 	g2[0] = y2;
## 	if (g1[0] <= g2[0]){
## 		return (g1[0] - g2[0] + mrg31k3p_M1);
## 	} else {
## 		return(g1[0] - g2[0]);
##  }
## }
## 
## 
## 
## __kernel void mrg31k3pMatrix(
##   __global int* streams,
##   __global double* out){
## 
## const int index = get_global_id(0)*get_global_size(1) + get_global_id(1);
## int Drow, Dcol, DrowStart, Dentry, DrowBlock, DcolBlock, DrowInBounds;
## const int DrowStartInc = get_global_size(0) * NpadCol;
## uint g1[3], g2[3];
## const int startvalue=index * NpadStreams;
## double temp;
## const double fact[2] = { mrg31k3p_NORM_cl, TWOPI * mrg31k3p_NORM_cl };
## const double addForSine[2] = { 0.0, - PI_2 };
## local double part[2];
## streamsToPrivate(streams,g1,g2,startvalue);
## for(DrowBlock = 0, Drow=get_global_id(0), DrowStart = Drow * NpadCol;
##     DrowBlock < Nrow;
##     Drow += get_global_size(0), DrowBlock +=get_global_size(0), DrowStart += DrowStartInc) {
##     DrowInBounds = Drow < Nrow;
##     for(DcolBlock = 0, Dcol=get_global_id(1), Dentry = DrowStart + Dcol;
##         DcolBlock < Ncol;
##         DcolBlock += get_global_size(1), Dentry += get_global_size(1) ) {
##       part[get_local_id(1)] = fact[get_local_id(1)] * clrngMrg31k3pNextState(g1, g2);
##       barrier(CLK_LOCAL_MEM_FENCE);
##       temp = sqrt( -2.0*log(part[0]) ) * cos(part[1] + addForSine[get_local_id(1)] );// is cos for local0, sine for local1
##         if(DrowInBounds) out[Dentry] = temp;
##       barrier(CLK_LOCAL_MEM_FENCE);
##   }//Dcol
## }//Drow
## streamsFromPrivate(streams,g1,g2,startvalue);
## }//kernel

plot of chunk rnormGpu

rexpGpu

if (detectGPUs()) {
    ## generate a 200x100 matrix of exponential random numbers of mean = 1, 
    ## using the supplied streams, and specify the size of Nglobal to be equal to the number of strams 
    b<-clrng::rexpGpu(c(200,100), rate=1, streams = myStreamsGpu, Nglobal=c(64,16))
    
    # convert GPU matrix to an R vector on CPU, plot its histogram
    bGpu<-as.vector(as.matrix(b))
    hist(bGpu, freq=TRUE, breaks=40)
    
    # generate on CPU a vector of exponential random numbers of mean = 1, plot its histogram
    bCpu <- stats::rexp(20000, rate=1)
    hist(bCpu, freq=TRUE, breaks = 40)
    
    # compare the two sequence of random exponential numbers by looking at their distribution quantiles
    quantile(bGpu)
    quantile(bCpu)

    
   } else {
  message("No GPU detected. Skipping GPU-dependent code.")
}

plot of chunk exp

##           0%          25%          50%          75%         100% 
## 9.343121e-05 2.847599e-01 6.849064e-01 1.386177e+00 9.095221e+00

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.