I have a program which uses many circular buffers in an identical fashion on a CPU and a GPU (C and C/C++ CUDA). I essentially require many queues; however, because this runs on a GPU, I have limited their length so that memory can be set up once at the beginning of the program. I therefore have the code below implementing a circular buffer/queue. Device and host code are similar, with only minor changes for memory efficiency on the GPU, described where necessary. I implemented this myself so it can be used inside a hand-written kernel on the GPU, and so I can compare results between the CPU and GPU for verification purposes. I omit the full code which allocates/frees memory on the host and device for conciseness (a rough sketch follows the data structure below).
Worth noting: for my purposes, pop does not need to return the value. Instead, I first peek and compare against some condition, and if it holds, I pop and discard the data. I could also include a peek_tail, but I currently do not need it. During a typical program run, I am continuously pushing new data (for all i) while at the same time checking whether the oldest value (via peek) is ready to be discarded (again for all i). While this is happening, I periodically iterate over all items (j) within each buffer (i). Lastly, when compiling I assume that no pointers alias and use strict pointer-aliasing flags for both GCC and nvcc where appropriate.
Data structure (note power of 2 for capacity):
typedef struct
{
    // Parameters (only the host copies are shown here; identical values are
    // placed in the device's constant memory for access from kernels)
    int N;        // multiple of warp size
    int capacity; // power of 2
    // Host data
    int* head_h;
    int* size_h;
    float* peek_head_h;
    float* data_h;
    // Device data
    int* head_d;
    int* size_d;
    float* peek_head_d;
    float* data_d;
} ring;
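For reference, here is a rough sketch of the set-up I otherwise omit (error checking dropped; the helper name Alloc_ring is only illustrative), mainly to make the layout explicit: head, size and peek_head hold one entry per buffer, while data holds capacity entries per buffer, interleaved so that element j of buffer i sits at data[(j * N) + i]:

static ring
Alloc_ring(const int N, const int capacity)
{
    ring r;
    r.N = N;
    r.capacity = capacity;
    // one entry per buffer
    r.head_h = (int*)calloc(N, sizeof(int));
    r.size_h = (int*)calloc(N, sizeof(int));
    r.peek_head_h = (float*)calloc(N, sizeof(float));
    // capacity entries per buffer, interleaved across buffers
    r.data_h = (float*)calloc((size_t)N * capacity, sizeof(float));
    // device mirrors of the above
    cudaMalloc((void**)&r.head_d, N * sizeof(int));
    cudaMalloc((void**)&r.size_d, N * sizeof(int));
    cudaMalloc((void**)&r.peek_head_d, N * sizeof(float));
    cudaMalloc((void**)&r.data_d, (size_t)N * capacity * sizeof(float));
    cudaMemset(r.head_d, 0, N * sizeof(int));
    cudaMemset(r.size_d, 0, N * sizeof(int));
    return r;
}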
Host code (I will explain later why the indexing into data is offset), with i = {0,...,N-1} and j = {0,...,capacity-1}, where j is used in a for loop to iterate over the contents of each buffer (i):
static inline void
Push_ring(ring* in, const int i, const float val)
{
    // compute (wrapped) tail index
    int x = in->head_h[i] + in->size_h[i];
    x &= in->capacity - 1;
    in->data_h[(x * in->N) + i] = val;
    // increase size
    in->size_h[i]++;
    if (in->size_h[i] > in->capacity) // ring was full: oldest was overwritten
    {
#ifdef WITH_WARN
        printf("Ring full.\n");
#endif // WITH_WARN
        in->size_h[i] = in->capacity;
        // wrap head if needed
        in->head_h[i]++;
        if (in->head_h[i] == in->capacity)
            in->head_h[i] = 0;
        // the old head was overwritten, so refresh the peek value
        in->peek_head_h[i] = in->data_h[(in->head_h[i] * in->N) + i];
    }
    // if first element, set peek head
    if (in->size_h[i] == 1)
        in->peek_head_h[i] = val;
}

static inline void
Pop_ring(ring* in, const int i)
{
    // if empty
    if (in->size_h[i] == 0)
        return;
    in->head_h[i]++;
    // wrap head if needed
    if (in->head_h[i] == in->capacity)
        in->head_h[i] = 0;
    // update size and peek head
    in->size_h[i]--;
    if (in->size_h[i] < 0) // defensive; cannot happen after the empty check
        in->size_h[i] = 0;
    if (in->size_h[i] > 0)
        in->peek_head_h[i] = in->data_h[(in->head_h[i] * in->N) + i];
    else
        in->peek_head_h[i] = -1.f;
}

static inline float
Peek_ring(ring* in, const int i)
{
    return in->peek_head_h[i];
}

static inline int
Size_ring(ring* in, const int i)
{
    return in->size_h[i];
}

static inline float
Iterate_ring(ring* in, const int i, const int j)
{
    // wrap the index
    int x = in->head_h[i] + j;
    x &= in->capacity - 1;
    // return the j-th oldest value
    return in->data_h[(x * in->N) + i];
}
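To tie these together, a typical host-side step as described above looks roughly like this (new_vals and threshold are placeholders for my actual data and discard condition, and the accumulation stands in for whatever per-item work the iteration does):

static void
Step_host(ring* r, const float* new_vals, const float threshold)
{
    // push new data for all i, then discard the oldest value if it qualifies
    for (int i = 0; i < r->N; i++)
    {
        Push_ring(r, i, new_vals[i]);
        // buffer is non-empty right after a push, so peek is valid
        if (Peek_ring(r, i) < threshold)
            Pop_ring(r, i);
    }
}

static void
Scan_host(ring* r, float* out)
{
    // periodically iterate over all items (j) within each (i) buffer
    for (int i = 0; i < r->N; i++)
    {
        float acc = 0.f;
        for (int j = 0; j < Size_ring(r, i); j++)
            acc += Iterate_ring(r, i, j);
        out[i] = acc;
    }
}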
Device code (same usage of i and j). Here, instead of passing a pointer to the structure (which lives in host memory), pointers to head, size, peek_head and data are passed (which live in device memory); these are the ring.XXX_d members, where XXX is the variable name. Lastly, N and capacity are __constant__ variables, broadcast to all threads in a warp.
__device__ static __forceinline__ void
Push_ring_GPU(int* head, int* size, float* peek_head,
              float* data, const int i, const int capacity,
              const int N, const float val)
{
    // use temp variables
    int headTemp = head[i];
    int sizeTemp = size[i];
    // compute (wrapped) tail index
    int x = headTemp + sizeTemp;
    x &= capacity - 1;
    data[(x * N) + i] = val;
    // increase size
    sizeTemp++;
    if (sizeTemp > capacity) // ring was full: oldest was overwritten
    {
#ifdef WITH_WARN
        printf("Ring full.\n");
#endif // WITH_WARN
        sizeTemp = capacity;
        // wrap headTemp if needed
        headTemp++;
        if (headTemp == capacity)
            headTemp = 0;
        head[i] = headTemp;
        // the old head was overwritten, so refresh the peek value
        peek_head[i] = data[(headTemp * N) + i];
    }
    // if first element, set peek head
    if (sizeTemp == 1)
        peek_head[i] = val;
    // write back from temp variables
    size[i] = sizeTemp;
}

__device__ static __forceinline__ void
Push_ring_Loading_GPU(int* headTemp, int* sizeTemp, int* head,
                      int* size, float* peek_head,
                      float* data, const int i,
                      const int capacity, const int N,
                      const float val)
{
    // load into the caller's temp variables
    *headTemp = head[i];
    *sizeTemp = size[i];
    // compute (wrapped) tail index
    int x = *headTemp + *sizeTemp;
    x &= capacity - 1;
    data[(x * N) + i] = val;
    // increase size
    (*sizeTemp)++;
    if (*sizeTemp > capacity) // ring was full: oldest was overwritten
    {
#ifdef WITH_WARN
        printf("Ring full.\n");
#endif // WITH_WARN
        *sizeTemp = capacity;
        // wrap headTemp if needed
        (*headTemp)++;
        if (*headTemp == capacity)
            *headTemp = 0;
        // the old head was overwritten, so refresh the peek value
        peek_head[i] = data[(*headTemp * N) + i];
    }
    // if first element, set peek head
    if (*sizeTemp == 1)
        peek_head[i] = val;
    // note: head[i] and size[i] are written back later via Update_ring_GPU
}

__device__ static __forceinline__ void
Pop_ring_GPU(int* head, int* size, float* peek_head,
             float* data, const int i, const int capacity,
             const int N)
{
    // use temp variables
    int sizeTemp = size[i];
    int headTemp = head[i];
    // if empty
    if (sizeTemp == 0)
        return;
    headTemp++;
    // wrap head if needed
    if (headTemp == capacity)
        headTemp = 0;
    // update size and peek head
    sizeTemp--;
    if (sizeTemp < 0) // defensive; cannot happen after the empty check
        sizeTemp = 0;
    if (sizeTemp > 0) // if/else cheaper than trying to do it in one
        peek_head[i] = data[(headTemp * N) + i];
    else
        peek_head[i] = -1.f;
    // write back from temp variables
    head[i] = headTemp;
    size[i] = sizeTemp;
}

__device__ static __forceinline__ void
Pop_ring_Loaded_GPU(int* headTemp, int* sizeTemp,
                    float* peek_head, float* data,
                    const int i, const int capacity, const int N)
{
    // if empty
    if (*sizeTemp == 0)
        return;
    (*headTemp)++;
    // wrap head if needed
    if (*headTemp == capacity)
        *headTemp = 0;
    // update size and peek head
    (*sizeTemp)--;
    if (*sizeTemp < 0) // defensive; cannot happen after the empty check
        *sizeTemp = 0;
    if (*sizeTemp > 0) // if/else cheaper than trying to do it in one
        peek_head[i] = data[(*headTemp * N) + i];
    else
        peek_head[i] = -1.f;
}

__device__ static __forceinline__ void
Update_ring_GPU(const int* headTemp, const int* sizeTemp, int* head,
                int* size, const int i)
{
    head[i] = *headTemp;
    size[i] = *sizeTemp;
}

__device__ static __forceinline__ float
Peek_ring_GPU(float* peek_head, const int i)
{
    return peek_head[i];
}

__device__ static __forceinline__ int
Size_ring_GPU(int* size, const int i)
{
    return size[i];
}

__device__ static __forceinline__ float
Iterate_ring_GPU(int* head, float* data, const int i,
                 const int x, const int capacity, const int N)
{
    // wrap the index
    int temp = head[i] + x;
    temp &= capacity - 1;
    // return the x-th oldest value
    return data[(temp * N) + i];
}

__device__ static __forceinline__ float
Iterate_ring_Loaded_GPU(const int headTemp, float* data,
                        const int i, const int x,
                        const int capacity, const int N)
{
    // wrap the index
    int temp = headTemp + x;
    temp &= capacity - 1;
    // return the x-th oldest value
    return data[(temp * N) + i];
}
On the device I have included additional Loading and Loaded versions of the functions. These reduce redundant memory accesses to head and size, and are used as follows:
int headTemp = 0, sizeTemp = 0;
Push_ring_Loading_GPU(&headTemp, &sizeTemp, head, size, peek_head,
                      data, i, capacity, N, 1.234f);
if (Peek_ring_GPU(peek_head, i) > 0.5f)
    Pop_ring_Loaded_GPU(&headTemp, &sizeTemp, peek_head, data, i,
                        capacity, N);
Update_ring_GPU(&headTemp, &sizeTemp, head, size, i);
and
int headTemp = head[i];
int y = 0;
float temp = 0.f;
for (y = 0; y < Size_ring_GPU(size, i); y++)
    temp = Iterate_ring_Loaded_GPU(headTemp, data, i, y, capacity, N);
Now, throughout the device code, i can be thought of as blockIdx.x * blockDim.x + threadIdx.x. With this in mind, and given the need for memory coalescing for performance, this should explain the indexing into data_h and data_d (which I keep identical on the host to facilitate copying between host and device). Moreover, since N is a multiple of the warp size, if head and size are identical for all i then memory accesses are coalesced and fast. During execution, however, head and size will differ across i, so accesses become fragmented and less coalesced.
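To make the access pattern concrete, here is roughly how the Loading/Loaded variants sit inside a kernel (N_c and CAPACITY_c stand in for the __constant__ copies of N and capacity mentioned above; new_vals and threshold are placeholders):

__constant__ int N_c;        // __constant__ copy of N
__constant__ int CAPACITY_c; // __constant__ copy of capacity

__global__ void
Step_ring_kernel(int* head, int* size, float* peek_head, float* data,
                 const float* new_vals, const float threshold)
{
    // one thread per buffer: adjacent threads i and i+1 touch adjacent
    // floats in data[(x * N) + i], so accesses coalesce whenever x
    // (i.e. head + size) matches across the warp
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N_c)
        return;
    int headTemp = 0, sizeTemp = 0;
    Push_ring_Loading_GPU(&headTemp, &sizeTemp, head, size, peek_head,
                          data, i, CAPACITY_c, N_c, new_vals[i]);
    // buffer is non-empty right after a push, so peek is valid
    if (Peek_ring_GPU(peek_head, i) < threshold)
        Pop_ring_Loaded_GPU(&headTemp, &sizeTemp, peek_head, data, i,
                            CAPACITY_c, N_c);
    Update_ring_GPU(&headTemp, &sizeTemp, head, size, i);
}

This is launched with a configuration covering all N buffers, one thread per buffer.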
This leads me to my question(s):
- Is it possible to extend my implementation to mitigate this effect? (For example, when size == 0 I could reset head = 0 so that, with low activity, buffers realign towards data[i]. Perhaps I should implement some sort of defrag and run it periodically on the buffers?) A sketch of the reset idea follows below.
- Are there any other modifications that could be made to increase performance (mainly in the device code), stability, etc.? (General comments are welcome as well.)
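For concreteness, the reset-on-empty idea in the first question would only require a small change to the pop, along these lines (untested sketch):

__device__ static __forceinline__ void
Pop_ring_Realign_GPU(int* headTemp, int* sizeTemp, float* peek_head,
                     float* data, const int i, const int capacity,
                     const int N)
{
    // if empty
    if (*sizeTemp == 0)
        return;
    (*headTemp)++;
    // wrap head if needed
    if (*headTemp == capacity)
        *headTemp = 0;
    // update size and peek head
    (*sizeTemp)--;
    if (*sizeTemp > 0)
        peek_head[i] = data[(*headTemp * N) + i];
    else
    {
        // buffer just drained: snap head back so the next push lands at
        // data[i], realigning idle buffers across the warp
        *headTemp = 0;
        peek_head[i] = -1.f;
    }
}

The trade-off is an occasional extra write to headTemp, but after quiet periods every empty buffer's next push would land at data[i], restoring coalesced access across the warp.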