
I've implemented a feature in my C++ fractal explorer application to switch between the CUDA Runtime API and NVRTC with the Driver API. The main reason for the NVRTC/Driver API context is to support runtime compilation of custom CUDA kernels. My approach is to free all GPU memory associated with the current context and then reallocate it for the newly selected context.

Macro implementations:

#pragma once
#include "FractalClass.cuh"
#include "HardCodedVars.h"
#include <iostream>

/// Macro to wrap NVRTC API calls for error checking.
/// If an NVRTC call fails, it prints an error message and throws a runtime_error.
#define NVRTC_SAFE_CALL(x)                                        \
  do {                                                            \
    nvrtcResult result = x;                                       \
    if (result != NVRTC_SUCCESS) {                                \
      std::cerr << "\nerror: " #x " failed with error "           \
                << nvrtcGetErrorString(result) << '\n';           \
      throw std::runtime_error("ERR");                                                     \
    }                                                             \
} while(0)

/// Macro to wrap CUDA Driver API calls for error checking.
/// If a Driver API call fails, it prints an error message and throws a runtime_error.
#define CU_SAFE_CALL(x)                                           \
  do {                                                            \
    CUresult result = x;                                          \
    if (result != CUDA_SUCCESS) {                                 \
      const char *msg;                                            \
      cuGetErrorName(result, &msg);                               \
      std::cerr << "\nerror: " #x " failed with error "           \
                << msg << '\n';                                   \
      throw std::runtime_error(#x " failed with error " + std::string(msg));                                                     \
    }                                                             \
} while(0)


/// Macro to wrap CUDA Runtime API calls for error checking.
/// If a Runtime API call fails, it prints an error message and throws a runtime_error.
#define CUDA_SAFE_CALL(x) \
  do {                    \
    cudaError_t result = x;                                         \
    if (result != cudaSuccess) {                                   \
      const char *msg =  cudaGetErrorName(result);                  \
      std::cerr << "\nerror: " #x " failed with error "             \
                << msg << '\n';                                     \
      throw std::runtime_error(#x " failed with error " + std::string(msg));                                                       \
    }                                                               \
  } while(0)


/// Macro to abstract operations that differ between CUDA Runtime and Driver APIs.
/// It executes 'x' if the current context is CUDA Runtime, and 'y' if it's NVRTC (Driver API).
/// This allows the rest of the code to use a single macro call regardless of the active CUDA API.
#define MAKE_CURR_CONTEXT_OPERATION(x, y, ctx)                      \
  do {                                                              \
    if (context == context_type::CUDA){                             \
      CUDA_SAFE_CALL(x);                                            \
    }                                                               \
    else{                                                           \
      CU_SAFE_CALL(y);                                              \
    }                                                               \
} while(0)


/// Macro to copy the color palette from host to device memory.
/// Uses MAKE_CURR_CONTEXT_OPERATION to handle both CUDA Runtime and Driver APIs.
#define COPY_PALETTE_TO_DEVICE(host, d, cu, ctx)                                                       \
  do {                                                                                          \
    if (context == context_type::CUDA){                                                         \
      CUDA_SAFE_CALL(cudaMemcpy(d, host, sizeof(Color) * paletteSize, cudaMemcpyHostToDevice));    \
    }                                                                                           \
    else{                                                                                       \
      CU_SAFE_CALL(cuMemcpyHtoD(cu, host, sizeof(Color) * paletteSize));                            \
    }                                                                                           \
} while(0)

/// Macro to allocate all necessary GPU and host memory for image data.
/// This includes device buffers for the main render target (`d_pixels`/`cu_d_pixels`)
/// and the anti-aliasing buffer (`ssaa_buffer`/`CUssaa_buffer`),
/// and host (CPU) pinned memory for transferring data (`pixels`, `compressed`).
/// The allocation size for `d_pixels` and `pixels` is 2x the basic resolution in each dimension,
/// anticipating 4x SSAA rendering.
#define ALLOCATE_ALL_IMAGE_MEMORY()                                                                                                                                                                 \
  do{                                                                                                                                                                                               \
    MAKE_CURR_CONTEXT_OPERATION(cudaMalloc(&d_pixels, basic_width * 2 * basic_height * 2 * 4 * sizeof(unsigned char)), cuMemAlloc(&cu_d_pixels, sizeof(unsigned char) * basic_width * 2 * basic_height * 2 * 4), context);                  \
    MAKE_CURR_CONTEXT_OPERATION(cudaMallocHost(&pixels, basic_width * 2 * basic_height * 2 * 4 * sizeof(unsigned char)), cuMemHostAlloc((void**)&pixels, sizeof(unsigned char) * basic_width * 2 * basic_height * 2 * 4, 0), context);      \
    MAKE_CURR_CONTEXT_OPERATION(cudaMalloc(&ssaa_buffer, basic_width * basic_height * 4 * sizeof(unsigned char)), cuMemAlloc(&CUssaa_buffer, basic_width * basic_height * 4 * sizeof(unsigned char)), context);                             \
    MAKE_CURR_CONTEXT_OPERATION(cudaMallocHost(&compressed, basic_width * basic_height * 4 * sizeof(unsigned char)), cuMemHostAlloc((void**)&compressed, basic_width * basic_height * 4 * sizeof(unsigned char), 0), context);                                                                      \
  } while(0)

/// Macro to free all GPU and host memory allocated for image data.
/// Uses MAKE_CURR_CONTEXT_OPERATION to handle both CUDA Runtime and Driver APIs.
#define FREE_ALL_IMAGE_MEMORY()                                                                 \
  do {                                                                                          \
    MAKE_CURR_CONTEXT_OPERATION(cudaFree(d_pixels), cuMemFree(cu_d_pixels), context);           \
    MAKE_CURR_CONTEXT_OPERATION(cudaFree(ssaa_buffer), cuMemFree(CUssaa_buffer), context);      \
    MAKE_CURR_CONTEXT_OPERATION(cudaFreeHost(pixels), cuMemFreeHost(pixels), context);          \
    MAKE_CURR_CONTEXT_OPERATION(cudaFreeHost(compressed), cuMemFreeHost(compressed), context);  \
  } while(0)

/// Macro to allocate necessary GPU and host memory for non-image data.
/// This includes device memory for the color palette (`d_palette`/`cu_palette`)
/// and the total iteration counter (`d_total_iterations`/`cu_d_total_iterations`),
/// host pinned memory for the total iteration counter (`h_total_iterations`),
/// and CUDA streams for asynchronous operations (`stream`/`CUss`, `dataStream`/`CUssData`).
/// It also copies the initial palette data to the device.
#define ALLOCATE_ALL_NON_IMAGE_MEMORY() \
  do {                                    \
    unsigned int zero = 0; \
    MAKE_CURR_CONTEXT_OPERATION(cudaMalloc(&d_palette, palette.size() * sizeof(Color)), cuMemAlloc(&cu_palette, sizeof(Color) * paletteSize), context);\
    MAKE_CURR_CONTEXT_OPERATION(cudaMemcpy(d_palette, palette.data(), palette.size() * sizeof(Color), cudaMemcpyHostToDevice), cuMemcpyHtoD(cu_palette, palette.data(), sizeof(Color) * paletteSize), context);\
    MAKE_CURR_CONTEXT_OPERATION(cudaStreamCreate(&stream), cuStreamCreate(&CUss, 0), context);\
    MAKE_CURR_CONTEXT_OPERATION(cudaMalloc(&d_total_iterations, sizeof(unsigned int)), cuMemAlloc(&cu_d_total_iterations, sizeof(unsigned int)), context);\
    MAKE_CURR_CONTEXT_OPERATION(cudaMemset(d_total_iterations, 0, sizeof(unsigned int)), cuMemcpyHtoD(cu_d_total_iterations, &zero, sizeof(unsigned int)), context);\
    MAKE_CURR_CONTEXT_OPERATION(cudaMallocHost(&h_total_iterations , sizeof(unsigned int)), cuMemHostAlloc((void**)&h_total_iterations, sizeof(unsigned int), 0), context);\
    MAKE_CURR_CONTEXT_OPERATION(cudaStreamCreate(&dataStream), cuStreamCreate(&CUssData, 0), context);\
  } while(0)

/// Macro to free all GPU and host memory allocated for non-image data.
/// Uses MAKE_CURR_CONTEXT_OPERATION to handle both CUDA Runtime and Driver APIs.
#define FREE_ALL_NON_IMAGE_MEMORY() \
  do {                               \
    MAKE_CURR_CONTEXT_OPERATION(cudaStreamDestroy(stream), cuStreamDestroy(CUss), context);\
    MAKE_CURR_CONTEXT_OPERATION(cudaFreeHost(h_total_iterations), cuMemFreeHost(h_total_iterations), context);\
    MAKE_CURR_CONTEXT_OPERATION(cudaStreamDestroy(dataStream), cuStreamDestroy(CUssData), context);\
    MAKE_CURR_CONTEXT_OPERATION(cudaFree(d_total_iterations), cuMemFree(cu_d_total_iterations), context);\
    MAKE_CURR_CONTEXT_OPERATION(cudaFree(d_palette), cuMemFree(cu_palette), context);\
  } while(0)

Class header:

/// Namespace to define structs representing different fractal types.
/// These are used as template parameters for `FractalBase`.
namespace fractals {
    struct mandelbrot{};
    struct julia{};
};

/// Enum for rendering quality states.
/// 'good' is typically faster for interactive use, 'best' uses higher
/// resolution and anti-aliasing for final output.
enum class render_state {
    good,
    best
};

/// Enum for the CUDA computation context.
/// 'CUDA' refers to the CUDA Runtime API.
/// 'NVRTC' refers to using NVRTC for runtime compilation, implying the CUDA Driver API.
enum class context_type {
    CUDA,
    NVRTC
};

static bool initialized_nvrtc;

/// Base class for fractal rendering, providing common functionality.
/// It is templated on a `Derived` type (e.g., `fractals::mandelbrot`) to allow
/// for specific fractal logic.
template <typename Derived>
class FractalBase : public sf::Transformable, public sf::Drawable {
protected:

    // Custom Formula Properties (NVRTC related)
    std::thread compile_thread;          // Thread for asynchronous NVRTC compilation
    std::string compute_capability;      // GPU compute capability string (e.g., "compute_75")
    std::future<std::string> current_compile_future; // Future to get compilation result/log
    std::atomic<unsigned int> progress_compiling_percentage = 0; // Compilation progress
    std::string log_buffer;              // Buffer for NVRTC compilation log
    context_type context = context_type::CUDA; // Current CUDA context (Runtime or NVRTC/Driver)
    bool custom_formula = false;         // Flag indicating if a custom formula is active
    std::string kernel_code;             // Stores the custom kernel code string (not explicitly used in provided snippets)
    CUcontext ctx;                       // CUDA Driver API context
    CUdevice device;                     // CUDA Driver API device
    CUmodule module;                     // CUDA Driver API module (for NVRTC compiled code)
    std::atomic<bool> is_compiling = false; // Flag indicating if compilation is in progress
    bool module_loaded = false;          // Flag indicating if an NVRTC module is loaded
    bool created_context;                // Flag indicating if a CUDA Driver API context was created by this instance
    CUfunction kernelFloat;              // CUDA Driver API function pointer (float precision custom kernel)
    CUfunction kernelDouble;             // CUDA Driver API function pointer (double precision custom kernel)
    CUfunction kernelAntialiasing;       // CUDA Driver API function pointer (anti-aliasing kernel)
    CUdeviceptr cu_d_total_iterations;   // Device pointer for total iterations (Driver API)
    CUdeviceptr cu_d_pixels;             // Device pointer for pixel data (Driver API)
    CUdeviceptr cu_palette;              // Device pointer for palette (Driver API)
    CUdeviceptr CUssaa_buffer;           // Device pointer for SSAA buffer (Driver API)
    CUstream CUss;                       // CUDA stream for NVRTC rendering (Driver API)
    CUstream CUssData;                   // CUDA stream for NVRTC data transfers (Driver API)


    /// Flag indicating if a CUDA-capable GPU is available.
    bool isCudaAvailable = false;

    // Palette properties
    Color* d_palette;                    // Device pointer for the color palette (CUDA Runtime)
    std::vector<Color> palette;          // Host-side color palette
    int paletteSize;

    // Pixel buffers
    unsigned char* d_pixels;             // Device pointer for pixel data (CUDA Runtime, main render target)
    unsigned char* pixels;               // Host (pinned) memory for pixel data (transfer from d_pixels or for CPU rendering)
    unsigned char* ssaa_buffer;          // Device pointer for anti-aliasing buffer (CUDA Runtime)
    unsigned char* compressed;           // Host (pinned) memory for anti-aliased pixel data

    // Rendering properties
    render_state state = render_state::good; // Current rendering quality state

    // CUDA properties (Runtime API)
    dim3 dimGrid;                        // Grid dimensions for kernel launch
    dim3 dimBlock;                       // Block dimensions for kernel launch
    unsigned int basic_width;            // Base rendering width (before SSAA scaling)
    unsigned int basic_height;           // Base rendering height
    unsigned int width;                  // Current rendering width (can be basic_width or 2*basic_width for SSAA)
    unsigned int height;                 // Current rendering height
    cudaStream_t stream;                 // Main CUDA stream for rendering (Runtime API)
    cudaStream_t dataStream;             // Separate CUDA stream for data transfers (Runtime API)
    unsigned int* d_total_iterations;    // Device pointer for total iterations (CUDA Runtime)
    unsigned int* h_total_iterations;    // Host (pinned) memory for total iterations

public:
    FractalBase();
    ~FractalBase();
    bool get_isCudaAvailable() {return isCudaAvailable;}
    void set_context(context_type ctx);
    void set_grid(dim3 block);

    void post_processing();

    std::shared_future<std::string> set_custom_formula(const std::string& formula);
    context_type get_context();

    void render(render_state quality);
    void render(
            render_state quality,
            double mouse_x, double mouse_y
    );
    void draw(sf::RenderTarget& target, sf::RenderStates states) const override;
};

Here is the function that handles this logic:

/// Sets the CUDA context for fractal rendering (CUDA Runtime API vs. NVRTC/Driver API).
/// This involves freeing existing GPU resources associated with the old context
/// and allocating/initializing resources for the new context.
template <typename Derived>
void FractalBase<Derived>::set_context(context_type contx) {
    if(contx == context) return; // No change if the context is already the same

    // FREE_ALL_IMAGE_MEMORY() and FREE_ALL_NON_IMAGE_MEMORY() are macros
    // that handle freeing GPU memory. They check for null pointers before attempting to free.
    // They also choose which API to free with based on the current context; if something goes wrong inside them, they throw an exception.
    FREE_ALL_IMAGE_MEMORY();
    FREE_ALL_NON_IMAGE_MEMORY();

    if(contx == context_type::NVRTC) { // Switching TO NVRTC/Driver API
        // initialized_nvrtc is a global static bool, ensuring cuInit(0) is called only once.
        if(!initialized_nvrtc){ 
            CU_SAFE_CALL(cuInit(0));
            initialized_nvrtc = true;
        }
        // 'device' is a CUdevice member; 'ctx' is a CUcontext member.
        CU_SAFE_CALL(cuDeviceGet(&device, 0));
        CU_SAFE_CALL(cuCtxCreate(&ctx, 0, device)); // Create a Driver API context
        CU_SAFE_CALL(cuCtxSetCurrent(ctx));        // Set it as current
        created_context = true;  // bool member: tracks if this instance created 'ctx'.
    }
    else { // Switching TO CUDA Runtime API
        if (ctx && created_context) { // If this instance created a Driver API context, destroy it
            CU_SAFE_CALL(cuCtxDestroy(ctx));
            ctx = nullptr;
            created_context = false;
        }
        if (device) {
            device = 0; // Reset CUdevice handle, as it's not directly used by Runtime API in this way.
        }
        /// For CUDA Runtime API, `cudaSetDevice` is typically sufficient for context management.
        CUDA_SAFE_CALL(cudaSetDevice(0));
    }
    // 'context' is a member variable of type context_type (enum).
    context = contx; // Update the active context type

    /// Re-allocate resources for the new context.
    /// ALLOCATE_ALL_IMAGE_MEMORY() and ALLOCATE_ALL_NON_IMAGE_MEMORY() are macros
    /// that handle GPU memory allocation. They include error checking (e.g., throw/exit on failure).
    ALLOCATE_ALL_IMAGE_MEMORY();
    ALLOCATE_ALL_NON_IMAGE_MEMORY();
}

I would appreciate your opinions on what I could improve or add to this set_context function. Specifically:

  • Are there any potential pitfalls or edge cases I might have missed?
  • Could the logic for initializing/destroying contexts and managing device handles be made more robust or idiomatic?
  • Are there any general best practices for this kind of operation that I should consider?

Any suggestions would be highly appreciated.

Also, if you want to try out the full app yourself, you can download the archive from the release page. Thanks!

  • What version of C++ are you using? (Commented May 16 at 14:35)
  • C++20, and CUDA17 (Commented May 16 at 14:41)

1 Answer


Get rid of all the macros

Macros are problematic for various reasons, among them:

  • the parameters are expanded in unexpected ways
  • they can confuse IDEs and debuggers
  • they are less readable

So avoid them where possible. In your case, you can avoid all of them. For some macros it's trivial. For example, instead of ALLOCATE_ALL_IMAGE_MEMORY() you could just have defined:

void FractalBase::allocate_all_image_memory() {
    MAKE_CURR_CONTEXT_OPERATION(cudaMalloc(&d_pixels, basic_width * 2 * basic_height * 2 * 4 * sizeof(unsigned char)), cuMemAlloc(&cu_d_pixels, sizeof(unsigned char) * basic_width * 2 * basic_height * 2 * 4), context);
    MAKE_CURR_CONTEXT_OPERATION(cudaMallocHost(&pixels, basic_width * 2 * basic_height * 2 * 4 * sizeof(unsigned char)), cuMemHostAlloc((void**)&pixels, sizeof(unsigned char) * basic_width * 2 * basic_height * 2 * 4, 0), context);
    MAKE_CURR_CONTEXT_OPERATION(cudaMalloc(&ssaa_buffer, basic_width * basic_height * 4 * sizeof(unsigned char)), cuMemAlloc(&CUssaa_buffer, basic_width * basic_height * 4 * sizeof(unsigned char)), context);
    MAKE_CURR_CONTEXT_OPERATION(cudaMallocHost(&compressed, basic_width * basic_height * 4 * sizeof(unsigned char)), cuMemHostAlloc((void**)&compressed, basic_width * basic_height * 4 * sizeof(unsigned char), 0), context);
}

Of course the real issue is MAKE_CURR_CONTEXT_OPERATION(x, y, ctx), which should evaluate either x or y depending on ctx. There are various options here. One is to delay evaluation of x and y by passing them as lambdas instead:

template<typename X, typename Y>
void make_curr_context_operation(X x, Y y, context_type ctx) {
    if (ctx == context_type::CUDA) {
        CUDA_SAFE_CALL(x());
    } else {
        CU_SAFE_CALL(y());
    }
}

That makes calling it a bit more cumbersome, since each lambda now has to return the API's status code so the wrapper can check it; for example, you'd have to write:

void FractalBase::allocate_all_image_memory() {
    make_curr_context_operation([&]{ return cudaMalloc(&d_pixels, basic_width * 2 * basic_height * 2 * 4 * sizeof(unsigned char)); }, [&]{ return cuMemAlloc(&cu_d_pixels, sizeof(unsigned char) * basic_width * 2 * basic_height * 2 * 4); }, context);
    …
}

A completely different way to approach this is to make a base class Context that has virtual member functions to allocate and free memory, and then derive CudaContext and CuContext from it:

class Context {
public:
    virtual ~Context() = default;
    virtual void alloc(void** ptr, size_t size) = 0;
    virtual void alloc_host(void** ptr, size_t size) = 0;
    virtual void free(void* ptr) = 0;
    …
};

class CudaContext: public Context {
public:
    void alloc(void** ptr, size_t size) override {
        CUDA_SAFE_CALL(cudaMalloc(ptr, size));
    }
    …
};

Then in FractalBase you'd have a member std::unique_ptr<Context> context, which is assigned either a CudaContext or a CuContext, and you would write:

void FractalBase::allocate_all_image_memory() {
    context->alloc(&d_pixels, basic_width * 2 * basic_height * 2 * 4);
    context->alloc_host(&pixels, basic_width * 2 * basic_height * 2 * 4);
    context->alloc(&ssaa_buffer, basic_width * basic_height * 4);
    context->alloc_host(&compressed, basic_width * basic_height * 4);
}

Given how the latter approach makes the code much more readable, I would go for that one. Also consider adding a constructor and destructor to the derived classes, and have them initialize the device and create the actual context.
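To make that last point concrete, here is a rough sketch of what the Driver API flavour could look like, using a cu_safe_call helper like the one shown further down. The member names and the void*/CUdeviceptr casts are my own choices here, so treat this as an outline rather than a drop-in implementation:

    #include <cuda.h>

    class CuContext : public Context {
    public:
        CuContext() {
            // cuInit(0) must precede any other driver API call; repeated calls are harmless.
            cu_safe_call(cuInit(0));
            cu_safe_call(cuDeviceGet(&device_, 0));
            cu_safe_call(cuCtxCreate(&ctx_, 0, device_));
        }

        ~CuContext() override {
            // Don't throw from a destructor; just destroy the context and ignore the result.
            if (ctx_) cuCtxDestroy(ctx_);
        }

        void alloc(void** ptr, size_t size) override {
            CUdeviceptr p;
            cu_safe_call(cuMemAlloc(&p, size));
            *ptr = reinterpret_cast<void*>(p);  // CUdeviceptr is an integer wide enough to hold a pointer
        }

        void alloc_host(void** ptr, size_t size) override {
            cu_safe_call(cuMemHostAlloc(ptr, size, 0));
        }

        void free(void* ptr) override {
            cu_safe_call(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr)));
        }

    private:
        CUdevice device_ = 0;
        CUcontext ctx_ = nullptr;
    };

With that in place, set_context() essentially shrinks to resetting the std::unique_ptr<Context> with a new CudaContext or CuContext and reallocating the buffers, and the context can no longer leak if an allocation throws halfway through.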

Finally, for CUDA_SAFE_CALL() and CU_SAFE_CALL(), consider whether printing the command that failed is really that useful. For an end-user, it's not. For a programmer who wants to debug things, I would just print the error string and std::abort(), and use a debugger to tell you exactly where the error happened.

void cu_safe_call(CUresult result) {
    if (result != CUDA_SUCCESS) {
        const char *msg;
        cuGetErrorName(result, &msg);
        std::cerr << "CU call failed with error " << msg << '\n';
        std::abort();
    }
}

However, if your compiler supports them, utilities like std::source_location (C++20) and std::stacktrace (C++23) might be interesting to use.
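For example, a sketch of the std::source_location variant, assuming C++20 is available (the exact message format is just an illustration):

    #include <cuda.h>
    #include <cstdlib>
    #include <iostream>
    #include <source_location>

    // The defaulted source_location argument is evaluated at the call site,
    // so the message points at the failing cu_safe_call(...) line, not at this helper.
    void cu_safe_call(CUresult result,
                      std::source_location loc = std::source_location::current()) {
        if (result != CUDA_SUCCESS) {
            const char* msg = nullptr;
            cuGetErrorName(result, &msg);
            std::cerr << loc.file_name() << ':' << loc.line() << " ("
                      << loc.function_name() << "): CU call failed with error "
                      << (msg ? msg : "unknown") << '\n';
            std::abort();
        }
    }

Usage stays exactly the same: cu_safe_call(cuCtxCreate(&ctx, 0, device)); will report the caller's file and line on failure.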

  • Yes, I agree with all of that advice, especially the one about macros, thanks! (Commented May 17 at 6:32)
