I've implemented a feature in my C++ fractal explorer application to switch between the CUDA Runtime API and NVRTC (with the Driver API). The main reason for the NVRTC/Driver API context is to support runtime compilation of custom CUDA kernels. My approach is to free all GPU memory associated with the current context and then reallocate it for the newly selected context.
Macro implementations:
#pragma once
#include "FractalClass.cuh"
#include "HardCodedVars.h"
#include <iostream>
/// Macro to wrap NVRTC API calls for error checking.
/// If an NVRTC call fails, it prints an error message and throws a runtime_error.
#define NVRTC_SAFE_CALL(x) \
do { \
nvrtcResult result = x; \
if (result != NVRTC_SUCCESS) { \
std::cerr << "\nerror: " #x " failed with error " \
<< nvrtcGetErrorString(result) << '\n'; \
throw std::runtime_error("ERR"); \
} \
} while(0)
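// Illustrative usage only (hypothetical 'prog' and 'kernel_source' names, not from the real code base):
//   NVRTC_SAFE_CALL(nvrtcCreateProgram(&prog, kernel_source, "custom_formula.cu", 0, nullptr, nullptr));
//   NVRTC_SAFE_CALL(nvrtcCompileProgram(prog, 0, nullptr));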
/// Macro to wrap CUDA Driver API calls for error checking.
/// If a Driver API call fails, it prints an error message and throws a runtime_error.
#define CU_SAFE_CALL(x) \
do { \
CUresult result = x; \
if (result != CUDA_SUCCESS) { \
const char *msg = nullptr; \
if (cuGetErrorName(result, &msg) != CUDA_SUCCESS || msg == nullptr) msg = "unknown CUDA driver error"; \
std::cerr << "\nerror: " #x " failed with error " \
<< msg << '\n'; \
throw std::runtime_error(#x " failed with error " + std::string(msg)); \
} \
} while(0)
/// Macro to wrap CUDA Runtime API calls for error checking.
/// If a Runtime API call fails, it prints an error message and throws a runtime_error.
#define CUDA_SAFE_CALL(x) \
do { \
cudaError_t result = x; \
if (result != cudaSuccess) { \
const char *msg = cudaGetErrorName(result); \
std::cerr << "\nerror: " #x " failed with error " \
<< msg << '\n'; \
throw std::runtime_error(#x " failed with error " + std::string(msg)); \
} \
} while(0)
/// Macro to abstract operations that differ between CUDA Runtime and Driver APIs.
/// It executes 'x' if 'ctx' is the CUDA Runtime context, and 'y' if it is NVRTC (Driver API).
/// This allows the rest of the code to use a single macro call regardless of the active CUDA API.
#define MAKE_CURR_CONTEXT_OPERATION(x, y, ctx) \
do { \
if ((ctx) == context_type::CUDA){ \
CUDA_SAFE_CALL(x); \
} \
else{ \
CU_SAFE_CALL(y); \
} \
} while(0)
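// Illustrative usage only (a sketch assuming the FractalBase members declared in the class header below):
//   MAKE_CURR_CONTEXT_OPERATION(
//       cudaMemsetAsync(d_pixels, 0, width * height * 4, stream),
//       cuMemsetD8Async(cu_d_pixels, 0, width * height * 4, CUss),
//       context);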
/// Macro to copy the color palette from host to device memory.
/// Branches on 'ctx' to use cudaMemcpy (Runtime API) or cuMemcpyHtoD (Driver API).
#define COPY_PALETTE_TO_DEVICE(host, d, cu, ctx) \
do { \
if ((ctx) == context_type::CUDA){ \
CUDA_SAFE_CALL(cudaMemcpy(d, host, sizeof(Color) * paletteSize, cudaMemcpyHostToDevice)); \
} \
else{ \
CU_SAFE_CALL(cuMemcpyHtoD(cu, host, sizeof(Color) * paletteSize)); \
} \
} while(0)
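// Illustrative usage only (hypothetical call site):
//   COPY_PALETTE_TO_DEVICE(palette.data(), d_palette, cu_palette, context);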
/// Macro to allocate all necessary GPU and host memory for image data.
/// This includes device buffers for the main render target (`d_pixels`/`cu_d_pixels`)
/// and the anti-aliasing buffer (`ssaa_buffer`/`CUssaa_buffer`),
/// and host (CPU) pinned memory for transferring data (`pixels`, `compressed`).
/// The allocation size for `d_pixels` and `pixels` is 2x the basic resolution in each dimension,
/// anticipating 4x SSAA rendering.
#define ALLOCATE_ALL_IMAGE_MEMORY() \
do{ \
MAKE_CURR_CONTEXT_OPERATION(cudaMalloc(&d_pixels, basic_width * 2 * basic_height * 2 * 4 * sizeof(unsigned char)), cuMemAlloc(&cu_d_pixels, sizeof(unsigned char) * basic_width * 2 * basic_height * 2 * 4), context); \
MAKE_CURR_CONTEXT_OPERATION(cudaMallocHost(&pixels, basic_width * 2 * basic_height * 2 * 4 * sizeof(unsigned char)), cuMemHostAlloc((void**)&pixels, sizeof(unsigned char) * basic_width * 2 * basic_height * 2 * 4, 0), context); \
MAKE_CURR_CONTEXT_OPERATION(cudaMalloc(&ssaa_buffer, basic_width * basic_height * 4 * sizeof(unsigned char)), cuMemAlloc(&CUssaa_buffer, basic_width * basic_height * 4 * sizeof(unsigned char)), context); \
MAKE_CURR_CONTEXT_OPERATION(cudaMallocHost(&compressed, basic_width * basic_height * 4 * sizeof(unsigned char)), cuMemHostAlloc((void**)&compressed, basic_width * basic_height * 4 * sizeof(unsigned char), 0), context); \
} while(0)
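// Size sanity check (illustrative, assuming a hypothetical 1920x1080 base resolution):
//   d_pixels / pixels:        3840 * 2160 * 4 bytes = 33,177,600 bytes (~31.6 MiB) each
//   ssaa_buffer / compressed: 1920 * 1080 * 4 bytes =  8,294,400 bytes (~7.9 MiB) each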
/// Macro to free all GPU and host memory allocated for image data.
/// Uses MAKE_CURR_CONTEXT_OPERATION to handle both CUDA Runtime and Driver APIs.
#define FREE_ALL_IMAGE_MEMORY() \
do { \
MAKE_CURR_CONTEXT_OPERATION(cudaFree(d_pixels), cuMemFree(cu_d_pixels), context); \
MAKE_CURR_CONTEXT_OPERATION(cudaFree(ssaa_buffer), cuMemFree(CUssaa_buffer), context); \
MAKE_CURR_CONTEXT_OPERATION(cudaFreeHost(pixels), cuMemFreeHost(pixels), context); \
MAKE_CURR_CONTEXT_OPERATION(cudaFreeHost(compressed), cuMemFreeHost(compressed), context); \
} while(0)
/// Macro to allocate necessary GPU and host memory for non-image data.
/// This includes device memory for the color palette (`d_palette`/`cu_palette`)
/// and the total iteration counter (`d_total_iterations`/`cu_d_total_iterations`),
/// host pinned memory for the total iteration counter (`h_total_iterations`),
/// and CUDA streams for asynchronous operations (`stream`/`CUss`, `dataStream`/`CUssData`).
/// It also copies the initial palette data to the device.
#define ALLOCATE_ALL_NON_IMAGE_MEMORY() \
do { \
unsigned int zero = 0; \
MAKE_CURR_CONTEXT_OPERATION(cudaMalloc(&d_palette, palette.size() * sizeof(Color)), cuMemAlloc(&cu_palette, sizeof(Color) * paletteSize), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaMemcpy(d_palette, palette.data(), palette.size() * sizeof(Color), cudaMemcpyHostToDevice), cuMemcpyHtoD(cu_palette, palette.data(), sizeof(Color) * paletteSize), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaStreamCreate(&stream), cuStreamCreate(&CUss, 0), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaMalloc(&d_total_iterations, sizeof(unsigned int)), cuMemAlloc(&cu_d_total_iterations, sizeof(unsigned int)), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaMemset(d_total_iterations, 0, sizeof(unsigned int)), cuMemcpyHtoD(cu_d_total_iterations, &zero, sizeof(unsigned int)), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaMallocHost(&h_total_iterations , sizeof(unsigned int)), cuMemHostAlloc((void**)&h_total_iterations, sizeof(unsigned int), 0), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaStreamCreate(&dataStream), cuStreamCreate(&CUssData, 0), context);\
} while(0)
/// Macro to free all GPU and host memory allocated for non-image data.
/// Uses MAKE_CURR_CONTEXT_OPERATION to handle both CUDA Runtime and Driver APIs.
#define FREE_ALL_NON_IMAGE_MEMORY() \
do { \
MAKE_CURR_CONTEXT_OPERATION(cudaStreamDestroy(stream), cuStreamDestroy(CUss), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaFreeHost(h_total_iterations), cuMemFreeHost(h_total_iterations), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaStreamDestroy(dataStream), cuStreamDestroy(CUssData), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaFree(d_total_iterations), cuMemFree(cu_d_total_iterations), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaFree(d_palette), cuMemFree(cu_palette), context);\
} while(0)
Class header:
/// Namespace to define structs representing different fractal types.
/// These are used as template parameters for `FractalBase`.
namespace fractals {
struct mandelbrot{};
struct julia{};
};
/// Enum for rendering quality states.
/// 'good' is typically faster for interactive use; 'best' uses higher
/// resolution and anti-aliasing for final output.
enum class render_state {
good,
best
};
/// Enum for the CUDA computation context.
/// 'CUDA' refers to the CUDA Runtime API.
/// 'NVRTC' refers to using NVRTC for runtime compilation, implying the CUDA Driver API.
enum class context_type {
CUDA,
NVRTC
};
static bool initialized_nvrtc;
/// Base class for fractal rendering, providing common functionality.
/// It is templated on a `Derived` type (e.g., `fractals::mandelbrot`) to allow
/// for specific fractal logic.
template <typename Derived>
class FractalBase : public sf::Transformable, public sf::Drawable {
protected:
// Custom Formula Properties (NVRTC related)
std::thread compile_thread; // Thread for asynchronous NVRTC compilation
std::string compute_capability; // GPU compute capability string (e.g., "compute_75")
std::future<std::string> current_compile_future; // Future to get compilation result/log
std::atomic<unsigned int> progress_compiling_percentage = 0; // Compilation progress
std::string log_buffer; // Buffer for NVRTC compilation log
context_type context = context_type::CUDA; // Current CUDA context (Runtime or NVRTC/Driver)
bool custom_formula = false; // Flag indicating if a custom formula is active
std::string kernel_code; // Stores the custom kernel code string (not explicitly used in provided snippets)
CUcontext ctx; // CUDA Driver API context
CUdevice device; // CUDA Driver API device
CUmodule module; // CUDA Driver API module (for NVRTC compiled code)
std::atomic<bool> is_compiling = false; // Flag indicating if compilation is in progress
bool module_loaded = false; // Flag indicating if an NVRTC module is loaded
bool created_context; // Flag indicating if a CUDA Driver API context was created by this instance
CUfunction kernelFloat; // CUDA Driver API function pointer (float precision custom kernel)
CUfunction kernelDouble; // CUDA Driver API function pointer (double precision custom kernel)
CUfunction kernelAntialiasing; // CUDA Driver API function pointer (anti-aliasing kernel)
CUdeviceptr cu_d_total_iterations; // Device pointer for total iterations (Driver API)
CUdeviceptr cu_d_pixels; // Device pointer for pixel data (Driver API)
CUdeviceptr cu_palette; // Device pointer for palette (Driver API)
CUdeviceptr CUssaa_buffer; // Device pointer for SSAA buffer (Driver API)
CUstream CUss; // CUDA stream for NVRTC rendering (Driver API)
CUstream CUssData; // CUDA stream for NVRTC data transfers (Driver API)
/// Flag indicating if a CUDA-capable GPU is available.
bool isCudaAvailable = false;
// Palette properties
Color* d_palette; // Device pointer for the color palette (CUDA Runtime)
std::vector<Color> palette; // Host-side color palette
int paletteSize;
// Pixel buffers
unsigned char* d_pixels; // Device pointer for pixel data (CUDA Runtime, main render target)
unsigned char* pixels; // Host (pinned) memory for pixel data (transfer from d_pixels or for CPU rendering)
unsigned char* ssaa_buffer; // Device pointer for anti-aliasing buffer (CUDA Runtime)
unsigned char* compressed; // Host (pinned) memory for anti-aliased pixel data
// Rendering properties
render_state state = render_state::good; // Current rendering quality state
// CUDA properties (Runtime API)
dim3 dimGrid; // Grid dimensions for kernel launch
dim3 dimBlock; // Block dimensions for kernel launch
unsigned int basic_width; // Base rendering width (before SSAA scaling)
unsigned int basic_height; // Base rendering height
unsigned int width; // Current rendering width (can be basic_width or 2*basic_width for SSAA)
unsigned int height; // Current rendering height
cudaStream_t stream; // Main CUDA stream for rendering (Runtime API)
cudaStream_t dataStream; // Separate CUDA stream for data transfers (Runtime API)
unsigned int* d_total_iterations; // Device pointer for total iterations (CUDA Runtime)
unsigned int* h_total_iterations; // Host (pinned) memory for total iterations
public:
FractalBase();
~FractalBase();
bool get_isCudaAvailable() {return isCudaAvailable;}
void set_context(context_type ctx);
void set_grid(dim3 block);
void post_processing();
std::shared_future<std::string> set_custom_formula(const std::string& formula);
context_type get_context();
void render(render_state quality);
void render(
render_state quality,
double mouse_x, double mouse_y
);
void draw(sf::RenderTarget& target, sf::RenderStates states) const override;
};
Here is the function that handles this logic:
/// Sets the CUDA context for fractal rendering (CUDA Runtime API vs. NVRTC/Driver API).
/// This involves freeing existing GPU resources associated with the old context
/// and allocating/initializing resources for the new context.
template <typename Derived>
void FractalBase<Derived>::set_context(context_type contx) {
if(contx == context) return; // No change if the context is already the same
// FREE_ALL_IMAGE_MEMORY() and FREE_ALL_NON_IMAGE_MEMORY() are macros
// that free the GPU memory belonging to the currently active context.
// They check for null pointers before freeing and throw an exception if anything goes wrong.
FREE_ALL_IMAGE_MEMORY();
FREE_ALL_NON_IMAGE_MEMORY();
if(contx == context_type::NVRTC) { // Switching TO NVRTC/Driver API
// initialized_nvrtc is a global static bool, ensuring cuInit(0) is called only once.
if(!initialized_nvrtc){
CU_SAFE_CALL(cuInit(0));
initialized_nvrtc = true;
}
// 'device' is a CUdevice member; 'ctx' is a CUcontext member.
CU_SAFE_CALL(cuDeviceGet(&device, 0));
CU_SAFE_CALL(cuCtxCreate(&ctx, 0, device)); // Create a Driver API context
CU_SAFE_CALL(cuCtxSetCurrent(ctx)); // Set it as current
created_context = true; // bool member: tracks if this instance created 'ctx'.
}
else { // Switching TO CUDA Runtime API
if (ctx && created_context) { // If this instance created a Driver API context, destroy it
CU_SAFE_CALL(cuCtxDestroy(ctx));
ctx = nullptr;
created_context = false;
}
device = 0; // Reset the CUdevice handle; the Runtime API manages device selection via cudaSetDevice below.
/// For CUDA Runtime API, `cudaSetDevice` is typically sufficient for context management.
CUDA_SAFE_CALL(cudaSetDevice(0));
}
// 'context' is a member variable of type context_type (enum).
context = contx; // Update the active context type
/// Re-allocate resources for the new context.
/// ALLOCATE_ALL_IMAGE_MEMORY() and ALLOCATE_ALL_NON_IMAGE_MEMORY() are macros
/// that handle GPU memory allocation. They include error checking (e.g., throw/exit on failure).
ALLOCATE_ALL_IMAGE_MEMORY();
ALLOCATE_ALL_NON_IMAGE_MEMORY();
}
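For reference, a typical call sequence around this function might look like the following sketch (the variable name `explorer` and the formula string are hypothetical; everything else uses the class interface above):
FractalBase<fractals::mandelbrot> explorer;
explorer.set_context(context_type::NVRTC);                 // switch to the NVRTC/Driver API path
auto compile_log = explorer.set_custom_formula("z*z + c"); // runtime-compile a custom kernel
explorer.render(render_state::good);
explorer.set_context(context_type::CUDA);                  // switch back to the built-in Runtime API kernels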
I would appreciate your opinions on what I could improve or add to this set_context function.
Specifically:
- Are there any potential pitfalls or edge cases I might not have noticed?
- Could the logic for initializing/destroying contexts and managing device handles be made more robust or idiomatic?
- Are there any general best practices for this kind of operation that I should consider?
Any suggestions would be highly appreciated.
Also, if you want to try out the full app yourself, you can download the archive from the release page. Thanks!