I have a program which uses many circular buffers in an identical fashion on a CPU and a GPU (C and C/C++ CUDA). I essentially require many queues; however, because this runs on a GPU, I have limited their length so that memory can be set up once at the beginning of the program. I therefore have the code below implementing a circular buffer/queue. Device and host code are similar, with only minor changes for memory efficiency on the GPU, described where necessary. I implemented this myself so it can be used inside a hand-written kernel on the GPU, and so I can compare results between the CPU and GPU for verification purposes. I omit the full code which allocates/frees memory on the host and device for conciseness (a rough sketch follows the data structure below).
Worth noting: for my purposes, pop does not need to return the value. Instead, I first peek and compare against some condition, and if it holds, I pop and discard the data. I could also include a peek_tail, but I currently do not need it. During a typical program run, I am continuously pushing new data (for all i) while at the same time checking whether the oldest value (via peek) is ready to be discarded (again for all i). While this is happening, I periodically iterate over all items (j) within each buffer (i). Lastly, when compiling I assume that no pointers alias and use strict pointer-aliasing flags for both GCC and nvcc where appropriate.
Data structure (note power of 2 for capacity):
typedef struct
{
    // Parameters (only the host copies are shown here; identical values are
    // placed in the device's constant memory for access from kernels)
    int N;        // multiple of warp size
    int capacity; // power of 2
    // Host data
    int* head_h;
    int* size_h;
    float* peek_head_h;
    float* data_h;
    // Device data
    int* head_d;
    int* size_d;
    float* peek_head_d;
    float* data_d;
} ring;
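For reference, here is a rough sketch of the set-up I otherwise omit (error checking dropped; the helper name Alloc_ring is only illustrative), mainly to make the layout explicit: head, size and peek_head hold one entry per buffer, while data holds capacity entries per buffer, interleaved so that element j of buffer i sits at data[(j * N) + i]:

static ring
Alloc_ring(const int N, const int capacity)
{
    ring r;
    r.N = N;
    r.capacity = capacity;
    // one entry per buffer
    r.head_h = (int*)calloc(N, sizeof(int));
    r.size_h = (int*)calloc(N, sizeof(int));
    r.peek_head_h = (float*)calloc(N, sizeof(float));
    // capacity entries per buffer, interleaved across buffers
    r.data_h = (float*)calloc((size_t)N * capacity, sizeof(float));
    // device mirrors of the above
    cudaMalloc((void**)&r.head_d, N * sizeof(int));
    cudaMalloc((void**)&r.size_d, N * sizeof(int));
    cudaMalloc((void**)&r.peek_head_d, N * sizeof(float));
    cudaMalloc((void**)&r.data_d, (size_t)N * capacity * sizeof(float));
    cudaMemset(r.head_d, 0, N * sizeof(int));
    cudaMemset(r.size_d, 0, N * sizeof(int));
    return r;
}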
Host code (I will explain later why the indexing into data is offset), with i = {0,...,N-1} and j = {0,...,capacity-1}, where j is used in a for loop to iterate over the contents of each buffer (i):
static inline void
Push_ring(ring* in, const int i, const float val)
{
    // compute (wrapped) tail index
    int x = in->head_h[i] + in->size_h[i];
    x &= in->capacity - 1;
    in->data_h[(x * in->N) + i] = val;
    // increase size
    in->size_h[i]++;
    if (in->size_h[i] > in->capacity) // ring was full: oldest was overwritten
    {
#ifdef WITH_WARN
        printf("Ring full.\n");
#endif // WITH_WARN
        in->size_h[i] = in->capacity;
        // wrap head if needed
        in->head_h[i]++;
        if (in->head_h[i] == in->capacity)
            in->head_h[i] = 0;
        // the old head was overwritten, so refresh the peek value
        in->peek_head_h[i] = in->data_h[(in->head_h[i] * in->N) + i];
    }
    // if first element, set peek head
    if (in->size_h[i] == 1)
        in->peek_head_h[i] = val;
}

static inline void
Pop_ring(ring* in, const int i)
{
    // if empty
    if (in->size_h[i] == 0)
        return;
    in->head_h[i]++;
    // wrap head if needed
    if (in->head_h[i] == in->capacity)
        in->head_h[i] = 0;
    // update size and peek head
    in->size_h[i]--;
    if (in->size_h[i] < 0) // defensive; cannot happen after the empty check
        in->size_h[i] = 0;
    if (in->size_h[i] > 0)
        in->peek_head_h[i] = in->data_h[(in->head_h[i] * in->N) + i];
    else
        in->peek_head_h[i] = -1.f;
}

static inline float
Peek_ring(ring* in, const int i)
{
    return in->peek_head_h[i];
}

static inline int
Size_ring(ring* in, const int i)
{
    return in->size_h[i];
}

static inline float
Iterate_ring(ring* in, const int i, const int j)
{
    // wrap the index
    int x = in->head_h[i] + j;
    x &= in->capacity - 1;
    // return the j-th oldest value
    return in->data_h[(x * in->N) + i];
}
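To tie these together, a typical host-side step as described above looks roughly like this (new_vals and threshold are placeholders for my actual data and discard condition, and the accumulation stands in for whatever per-item work the iteration does):

static void
Step_host(ring* r, const float* new_vals, const float threshold)
{
    // push new data for all i, then discard the oldest value if it qualifies
    for (int i = 0; i < r->N; i++)
    {
        Push_ring(r, i, new_vals[i]);
        // buffer is non-empty right after a push, so peek is valid
        if (Peek_ring(r, i) < threshold)
            Pop_ring(r, i);
    }
}

static void
Scan_host(ring* r, float* out)
{
    // periodically iterate over all items (j) within each (i) buffer
    for (int i = 0; i < r->N; i++)
    {
        float acc = 0.f;
        for (int j = 0; j < Size_ring(r, i); j++)
            acc += Iterate_ring(r, i, j);
        out[i] = acc;
    }
}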
Device code (same usage of i and j). Here, instead of passing a pointer to the structure (which lives in host memory), pointers to head, size, peek_head and data are passed (which live in device memory); these are the ring.XXX_d members, where XXX is the variable name. Lastly, N and capacity are __constant__ variables, broadcast to all threads in a warp.
__device__ static __forceinline__ void
Push_ring_GPU(int* head, int* size, float* peek_head,
              float* data, const int i, const int capacity,
              const int N, const float val)
{
    // use temp variables
    int headTemp = head[i];
    int sizeTemp = size[i];
    // compute (wrapped) tail index
    int x = headTemp + sizeTemp;
    x &= capacity - 1;
    data[(x * N) + i] = val;
    // increase size
    sizeTemp++;
    if (sizeTemp > capacity) // ring was full: oldest was overwritten
    {
#ifdef WITH_WARN
        printf("Ring full.\n");
#endif // WITH_WARN
        sizeTemp = capacity;
        // wrap headTemp if needed
        headTemp++;
        if (headTemp == capacity)
            headTemp = 0;
        head[i] = headTemp;
        // the old head was overwritten, so refresh the peek value
        peek_head[i] = data[(headTemp * N) + i];
    }
    // if first element, set peek head
    if (sizeTemp == 1)
        peek_head[i] = val;
    // write back from temp variables
    size[i] = sizeTemp;
}

__device__ static __forceinline__ void
Push_ring_Loading_GPU(int* headTemp, int* sizeTemp, int* head,
                      int* size, float* peek_head,
                      float* data, const int i,
                      const int capacity, const int N,
                      const float val)
{
    // load into the caller's temp variables
    *headTemp = head[i];
    *sizeTemp = size[i];
    // compute (wrapped) tail index
    int x = *headTemp + *sizeTemp;
    x &= capacity - 1;
    data[(x * N) + i] = val;
    // increase size
    (*sizeTemp)++;
    if (*sizeTemp > capacity) // ring was full: oldest was overwritten
    {
#ifdef WITH_WARN
        printf("Ring full.\n");
#endif // WITH_WARN
        *sizeTemp = capacity;
        // wrap headTemp if needed
        (*headTemp)++;
        if (*headTemp == capacity)
            *headTemp = 0;
        // the old head was overwritten, so refresh the peek value
        peek_head[i] = data[(*headTemp * N) + i];
    }
    // if first element, set peek head
    if (*sizeTemp == 1)
        peek_head[i] = val;
    // note: head[i] and size[i] are written back later via Update_ring_GPU
}

__device__ static __forceinline__ void
Pop_ring_GPU(int* head, int* size, float* peek_head,
             float* data, const int i, const int capacity,
             const int N)
{
    // use temp variables
    int sizeTemp = size[i];
    int headTemp = head[i];
    // if empty
    if (sizeTemp == 0)
        return;
    headTemp++;
    // wrap head if needed
    if (headTemp == capacity)
        headTemp = 0;
    // update size and peek head
    sizeTemp--;
    if (sizeTemp < 0) // defensive; cannot happen after the empty check
        sizeTemp = 0;
    if (sizeTemp > 0) // if/else cheaper than trying to do it in one
        peek_head[i] = data[(headTemp * N) + i];
    else
        peek_head[i] = -1.f;
    // write back from temp variables
    head[i] = headTemp;
    size[i] = sizeTemp;
}

__device__ static __forceinline__ void
Pop_ring_Loaded_GPU(int* headTemp, int* sizeTemp,
                    float* peek_head, float* data,
                    const int i, const int capacity, const int N)
{
    // if empty
    if (*sizeTemp == 0)
        return;
    (*headTemp)++;
    // wrap head if needed
    if (*headTemp == capacity)
        *headTemp = 0;
    // update size and peek head
    (*sizeTemp)--;
    if (*sizeTemp < 0) // defensive; cannot happen after the empty check
        *sizeTemp = 0;
    if (*sizeTemp > 0) // if/else cheaper than trying to do it in one
        peek_head[i] = data[(*headTemp * N) + i];
    else
        peek_head[i] = -1.f;
}

__device__ static __forceinline__ void
Update_ring_GPU(const int* headTemp, const int* sizeTemp, int* head,
                int* size, const int i)
{
    head[i] = *headTemp;
    size[i] = *sizeTemp;
}

__device__ static __forceinline__ float
Peek_ring_GPU(float* peek_head, const int i)
{
    return peek_head[i];
}

__device__ static __forceinline__ int
Size_ring_GPU(int* size, const int i)
{
    return size[i];
}

__device__ static __forceinline__ float
Iterate_ring_GPU(int* head, float* data, const int i,
                 const int x, const int capacity, const int N)
{
    // wrap the index
    int temp = head[i] + x;
    temp &= capacity - 1;
    // return the x-th oldest value
    return data[(temp * N) + i];
}

__device__ static __forceinline__ float
Iterate_ring_Loaded_GPU(const int headTemp, float* data,
                        const int i, const int x,
                        const int capacity, const int N)
{
    // wrap the index
    int temp = headTemp + x;
    temp &= capacity - 1;
    // return the x-th oldest value
    return data[(temp * N) + i];
}
On the device I have included additional Loading and Loaded versions of the functions. These reduce redundant memory accesses to head and size, and are used as follows:
int headTemp = 0, sizeTemp = 0;
Push_ring_Loading_GPU(&headTemp, &sizeTemp, head, size, peek_head,
                      data, i, capacity, N, 1.234f);
if (Peek_ring_GPU(peek_head, i) > 0.5f)
    Pop_ring_Loaded_GPU(&headTemp, &sizeTemp, peek_head, data, i,
                        capacity, N);
Update_ring_GPU(&headTemp, &sizeTemp, head, size, i);
and
int headTemp = head[i];
int y = 0;
float temp = 0.f;
for (y = 0; y < Size_ring_GPU(size, i); y++)
    temp = Iterate_ring_Loaded_GPU(headTemp, data, i, y, capacity, N);
Now, throughout the device code, i can be thought of as blockIdx.x * blockDim.x + threadIdx.x. With this in mind, and given the need for memory coalescing for performance, this should explain the indexing into data_h and data_d (which I keep identical on the host to facilitate copying between host and device). Moreover, since N is a multiple of the warp size, if head and size are identical for all i then memory accesses are coalesced and fast. During execution, however, head and size will differ across i, so accesses become fragmented and less coalesced.
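To make the access pattern concrete, here is roughly how the Loading/Loaded variants sit inside a kernel (N_c and CAPACITY_c stand in for the __constant__ copies of N and capacity mentioned above; new_vals and threshold are placeholders):

__constant__ int N_c;        // __constant__ copy of N
__constant__ int CAPACITY_c; // __constant__ copy of capacity

__global__ void
Step_ring_kernel(int* head, int* size, float* peek_head, float* data,
                 const float* new_vals, const float threshold)
{
    // one thread per buffer: adjacent threads i and i+1 touch adjacent
    // floats in data[(x * N) + i], so accesses coalesce whenever x
    // (i.e. head + size) matches across the warp
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N_c)
        return;
    int headTemp = 0, sizeTemp = 0;
    Push_ring_Loading_GPU(&headTemp, &sizeTemp, head, size, peek_head,
                          data, i, CAPACITY_c, N_c, new_vals[i]);
    // buffer is non-empty right after a push, so peek is valid
    if (Peek_ring_GPU(peek_head, i) < threshold)
        Pop_ring_Loaded_GPU(&headTemp, &sizeTemp, peek_head, data, i,
                            CAPACITY_c, N_c);
    Update_ring_GPU(&headTemp, &sizeTemp, head, size, i);
}

This is launched with a configuration covering all N buffers, one thread per buffer.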
This leads me to my question(s):
- Is it possible to extend my implementation to mitigate this effect? (For example, when size == 0 I could reset head = 0 so that, with low activity, buffers realign towards data[i]. Perhaps I should implement some sort of defrag and run it periodically on the buffers?) A sketch of the reset idea follows below.
- Are there any other modifications that could be made to increase performance (mainly in the device code), stability, etc.? (General comments are welcome as well.)
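For concreteness, the reset-on-empty idea in the first question would only require a small change to the pop, along these lines (untested sketch):

__device__ static __forceinline__ void
Pop_ring_Realign_GPU(int* headTemp, int* sizeTemp, float* peek_head,
                     float* data, const int i, const int capacity,
                     const int N)
{
    // if empty
    if (*sizeTemp == 0)
        return;
    (*headTemp)++;
    // wrap head if needed
    if (*headTemp == capacity)
        *headTemp = 0;
    // update size and peek head
    (*sizeTemp)--;
    if (*sizeTemp > 0)
        peek_head[i] = data[(*headTemp * N) + i];
    else
    {
        // buffer just drained: snap head back so the next push lands at
        // data[i], realigning idle buffers across the warp
        *headTemp = 0;
        peek_head[i] = -1.f;
    }
}

The trade-off is an occasional extra write to headTemp, but after quiet periods every empty buffer's next push would land at data[i], restoring coalesced access across the warp.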