// Macro implementations:
#pragma once
#include "FractalClass.cuh"
#include "HardCodedVars.h"
#include <atomic>
#include <future>
#include <iostream>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>
/// Macro to wrap NVRTC API calls for error checking.
/// If an NVRTC call fails, it prints an error message to stderr and throws a
/// std::runtime_error whose message embeds the failing call and the NVRTC
/// error string, matching the behavior of CU_SAFE_CALL / CUDA_SAFE_CALL below
/// (previously this threw an uninformative "ERR").
#define NVRTC_SAFE_CALL(x) \
do { \
nvrtcResult result = x; \
if (result != NVRTC_SUCCESS) { \
const char *msg = nvrtcGetErrorString(result); \
std::cerr << "\nerror: " #x " failed with error " \
<< msg << '\n'; \
throw std::runtime_error(#x " failed with error " + std::string(msg)); \
} \
} while(0)
/// Macro to wrap CUDA Driver API calls for error checking.
/// If a Driver API call fails, it prints an error message to stderr and throws
/// a std::runtime_error embedding the failing call and the error name.
/// cuGetErrorName does not set its out-pointer when it fails (e.g. for an
/// unrecognized CUresult), so 'msg' is initialized and guarded to avoid
/// reading an indeterminate pointer.
#define CU_SAFE_CALL(x) \
do { \
CUresult result = x; \
if (result != CUDA_SUCCESS) { \
const char *msg = nullptr; \
if (cuGetErrorName(result, &msg) != CUDA_SUCCESS || msg == nullptr) { \
msg = "UNKNOWN_CUDA_DRIVER_ERROR"; \
} \
std::cerr << "\nerror: " #x " failed with error " \
<< msg << '\n'; \
throw std::runtime_error(#x " failed with error " + std::string(msg)); \
} \
} while(0)
/// Macro to wrap CUDA Runtime API calls for error checking.
/// On failure, prints the failing expression and the error name to stderr,
/// then throws a std::runtime_error carrying the same information.
#define CUDA_SAFE_CALL(x) \
do { \
cudaError_t result = (x); \
if (result != cudaSuccess) { \
const char *errName = cudaGetErrorName(result); \
std::cerr << "\nerror: " #x " failed with error " << errName << '\n'; \
throw std::runtime_error(#x " failed with error " + std::string(errName)); \
} \
} while(0)
/// Macro to abstract operations that differ between CUDA Runtime and Driver APIs.
/// Executes 'x' (wrapped in CUDA_SAFE_CALL) when 'ctx' is context_type::CUDA,
/// and 'y' (wrapped in CU_SAFE_CALL) otherwise.
/// The 'ctx' parameter is now honored instead of silently reading the member
/// 'context'; every call site in this file passes the member, so this change
/// is backward-compatible while making the macro usable with any context value.
#define MAKE_CURR_CONTEXT_OPERATION(x, y, ctx) \
do { \
if ((ctx) == context_type::CUDA){ \
CUDA_SAFE_CALL(x); \
} \
else{ \
CU_SAFE_CALL(y); \
} \
} while(0)
/// Macro to copy the color palette from host to device memory.
/// 'host' is the source palette on the CPU, 'd' the Runtime-API device pointer,
/// 'cu' the Driver-API device pointer; sizeof(Color) * paletteSize bytes are
/// copied. Now genuinely delegates to MAKE_CURR_CONTEXT_OPERATION (as the
/// original comment claimed) instead of duplicating the dispatch inline,
/// and forwards 'ctx' rather than ignoring it.
#define COPY_PALETTE_TO_DEVICE(host, d, cu, ctx) \
MAKE_CURR_CONTEXT_OPERATION( \
cudaMemcpy(d, host, sizeof(Color) * paletteSize, cudaMemcpyHostToDevice), \
cuMemcpyHtoD(cu, host, sizeof(Color) * paletteSize), \
ctx)
/// Macro to allocate all GPU and host memory needed for image data, dispatching
/// per call to the CUDA Runtime or Driver API via MAKE_CURR_CONTEXT_OPERATION:
///  - d_pixels / cu_d_pixels: device render target, sized (2*basic_width) x
///    (2*basic_height) x 4 bytes (RGBA), anticipating 4x SSAA rendering.
///  - pixels: pinned host buffer of the same size for device->host transfers.
///  - ssaa_buffer / CUssaa_buffer: device buffer for the downsampled
///    (anti-aliased) image at basic resolution (4 bytes per pixel).
///  - compressed: pinned host buffer for the downsampled image.
/// NOTE(review): each *_SAFE_CALL throws on failure, so a failure partway
/// through this sequence leaks the allocations already made — confirm callers
/// treat such a failure as fatal to the process.
#define ALLOCATE_ALL_IMAGE_MEMORY() \
do{ \
MAKE_CURR_CONTEXT_OPERATION(cudaMalloc(&d_pixels, basic_width * 2 * basic_height * 2 * 4 * sizeof(unsigned char)), cuMemAlloc(&cu_d_pixels, sizeof(unsigned char) * basic_width * 2 * basic_height * 2 * 4), context); \
MAKE_CURR_CONTEXT_OPERATION(cudaMallocHost(&pixels, basic_width * 2 * basic_height * 2 * 4 * sizeof(unsigned char)), cuMemHostAlloc((void**)&pixels, sizeof(unsigned char) * basic_width * 2 * basic_height * 2 * 4, 0), context); \
MAKE_CURR_CONTEXT_OPERATION(cudaMalloc(&ssaa_buffer, basic_width * basic_height * 4 * sizeof(unsigned char)), cuMemAlloc(&CUssaa_buffer, basic_width * basic_height * 4 * sizeof(unsigned char)), context); \
MAKE_CURR_CONTEXT_OPERATION(cudaMallocHost(&compressed, basic_width * basic_height * 4 * sizeof(unsigned char)), cuMemHostAlloc((void**)&compressed, basic_width * basic_height * 4 * sizeof(unsigned char), 0), context); \
} while(0)
/// Macro to free all GPU and host memory allocated by ALLOCATE_ALL_IMAGE_MEMORY:
/// the device render target, the device SSAA buffer, and both pinned host
/// buffers. Dispatches to the Runtime or Driver API per the 'context' member
/// via MAKE_CURR_CONTEXT_OPERATION.
/// NOTE(review): pointers are not nulled after freeing and are not checked for
/// null before freeing — a double invocation would double-free; confirm this
/// macro is called exactly once per matching allocation.
#define FREE_ALL_IMAGE_MEMORY() \
do { \
MAKE_CURR_CONTEXT_OPERATION(cudaFree(d_pixels), cuMemFree(cu_d_pixels), context); \
MAKE_CURR_CONTEXT_OPERATION(cudaFree(ssaa_buffer), cuMemFree(CUssaa_buffer), context); \
MAKE_CURR_CONTEXT_OPERATION(cudaFreeHost(pixels), cuMemFreeHost(pixels), context); \
MAKE_CURR_CONTEXT_OPERATION(cudaFreeHost(compressed), cuMemFreeHost(compressed), context); \
} while(0)
/// Macro to allocate GPU and host memory for non-image data and initialize it:
///  - d_palette / cu_palette: device copy of the color palette, immediately
///    filled from the host-side 'palette' vector.
///  - stream / CUss and dataStream / CUssData: streams for rendering and for
///    data transfers respectively.
///  - d_total_iterations / cu_d_total_iterations: device iteration counter,
///    zeroed (cudaMemset on the Runtime path; an HtoD copy of a local zero on
///    the Driver path, since the Driver API has no direct memset-to-zero here).
///  - h_total_iterations: pinned host mirror of the iteration counter.
/// NOTE(review): the Runtime path sizes the palette with palette.size() while
/// the Driver path uses the 'paletteSize' member — confirm these are always
/// equal, otherwise the two code paths allocate/copy different byte counts.
#define ALLOCATE_ALL_NON_IMAGE_MEMORY() \
do { \
unsigned int zero = 0; \
MAKE_CURR_CONTEXT_OPERATION(cudaMalloc(&d_palette, palette.size() * sizeof(Color)), cuMemAlloc(&cu_palette, sizeof(Color) * paletteSize), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaMemcpy(d_palette, palette.data(), palette.size() * sizeof(Color), cudaMemcpyHostToDevice), cuMemcpyHtoD(cu_palette, palette.data(), sizeof(Color) * paletteSize), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaStreamCreate(&stream), cuStreamCreate(&CUss, 0), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaMalloc(&d_total_iterations, sizeof(unsigned int)), cuMemAlloc(&cu_d_total_iterations, sizeof(unsigned int)), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaMemset(d_total_iterations, 0, sizeof(unsigned int)), cuMemcpyHtoD(cu_d_total_iterations, &zero, sizeof(unsigned int)), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaMallocHost(&h_total_iterations , sizeof(unsigned int)), cuMemHostAlloc((void**)&h_total_iterations, sizeof(unsigned int), 0), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaStreamCreate(&dataStream), cuStreamCreate(&CUssData, 0), context);\
} while(0)
/// Macro to release everything created by ALLOCATE_ALL_NON_IMAGE_MEMORY:
/// both streams, the pinned host iteration counter, the device iteration
/// counter, and the device palette. Dispatches to the Runtime or Driver API
/// per the 'context' member via MAKE_CURR_CONTEXT_OPERATION.
/// NOTE(review): streams are destroyed without a prior synchronize; confirm
/// no asynchronous work can still be in flight when this runs.
#define FREE_ALL_NON_IMAGE_MEMORY() \
do { \
MAKE_CURR_CONTEXT_OPERATION(cudaStreamDestroy(stream), cuStreamDestroy(CUss), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaFreeHost(h_total_iterations), cuMemFreeHost(h_total_iterations), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaStreamDestroy(dataStream), cuStreamDestroy(CUssData), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaFree(d_total_iterations), cuMemFree(cu_d_total_iterations), context);\
MAKE_CURR_CONTEXT_OPERATION(cudaFree(d_palette), cuMemFree(cu_palette), context);\
} while(0)
// Class header:
/// Namespace of empty tag structs representing the supported fractal types.
/// They carry no data or behavior; they exist only to select a specialization
/// when used as the template parameter of `FractalBase`.
namespace fractals {
    struct mandelbrot{};
    struct julia{};
};
/// Enum for rendering quality states.
/// 'good' is typically faster and intended for interactive use;
/// 'best' uses higher resolution and anti-aliasing for final output.
enum class render_state {
    good,
    best
};
/// Enum for the CUDA computation context.
/// 'CUDA' selects the CUDA Runtime API (precompiled kernels).
/// 'NVRTC' selects runtime compilation via NVRTC, which implies using the
/// CUDA Driver API for module loading and kernel launches.
enum class context_type {
    CUDA,
    NVRTC
};
/// Flag recording whether NVRTC has been initialized (zero-initialized to false).
/// NOTE(review): 'static' at namespace scope in a header gives every translation
/// unit that includes this file its own independent copy of the flag; if a
/// single shared instance is intended, 'inline' (C++17) would be needed —
/// confirm how this is used across translation units.
static bool initialized_nvrtc;
/// Base class for fractal rendering, providing common functionality.
/// Templated on a `Derived` tag type (e.g., `fractals::mandelbrot` or
/// `fractals::julia`) to select the fractal-specific logic.
/// Inherits sf::Transformable/sf::Drawable so instances can be positioned and
/// drawn like any other SFML entity.
/// Maintains parallel state for two execution paths: the CUDA Runtime API
/// (default) and the Driver API used when a custom formula is compiled with
/// NVRTC at runtime.
template <typename Derived>
class FractalBase : public sf::Transformable, public sf::Drawable {
protected:
    // Custom Formula Properties (NVRTC related)
    std::thread compile_thread;  // Thread for asynchronous NVRTC compilation
    std::string compute_capability;  // GPU compute capability string (e.g., "compute_75")
    std::future<std::string> current_compile_future;  // Future delivering the compilation result/log
    std::atomic<unsigned int> progress_compiling_percentage = 0;  // Compilation progress (0-100, presumably)
    std::string log_buffer;  // Buffer for the NVRTC compilation log
    context_type context = context_type::CUDA;  // Active API path (Runtime or NVRTC/Driver)
    bool custom_formula = false;  // True when a custom formula is active
    std::string kernel_code;  // Custom kernel source string (usage not visible in this file)
    CUcontext ctx;  // CUDA Driver API context
    CUdevice device;  // CUDA Driver API device handle
    CUmodule module;  // Driver API module holding NVRTC-compiled code
    std::atomic<bool> is_compiling = false;  // True while a compilation is in progress
    bool module_loaded = false;  // True when an NVRTC module is currently loaded
    bool created_context;  // True if this instance created the Driver API context (note: not initialized here)
    CUfunction kernelFloat;  // Driver API handle: float-precision custom kernel
    CUfunction kernelDouble;  // Driver API handle: double-precision custom kernel
    CUfunction kernelAntialiasing;  // Driver API handle: anti-aliasing kernel
    CUdeviceptr cu_d_total_iterations;  // Device pointer for total iterations (Driver API)
    CUdeviceptr cu_d_pixels;  // Device pointer for pixel data (Driver API)
    CUdeviceptr cu_palette;  // Device pointer for the palette (Driver API)
    CUdeviceptr CUssaa_buffer;  // Device pointer for the SSAA buffer (Driver API)
    CUstream CUss;  // Driver API stream for rendering
    CUstream CUssData;  // Driver API stream for data transfers
    /// Flag indicating if a CUDA-capable GPU is available.
    bool isCudaAvailable = false;
    // Palette properties
    Color* d_palette;  // Device pointer for the color palette (CUDA Runtime)
    std::vector<Color> palette;  // Host-side color palette
    int paletteSize;  // Palette element count; NOTE(review): kept separately from palette.size() — confirm they stay in sync
    // Pixel buffers
    unsigned char* d_pixels;  // Device render target (CUDA Runtime)
    unsigned char* pixels;  // Pinned host buffer mirroring d_pixels (also used for CPU rendering)
    unsigned char* ssaa_buffer;  // Device anti-aliasing buffer (CUDA Runtime)
    unsigned char* compressed;  // Pinned host buffer for anti-aliased pixel data
    // Rendering properties
    render_state state = render_state::good;  // Current rendering quality state
    // CUDA properties (Runtime API)
    dim3 dimGrid;  // Grid dimensions for kernel launches
    dim3 dimBlock;  // Block dimensions for kernel launches
    unsigned int basic_width;  // Base rendering width (before SSAA scaling)
    unsigned int basic_height;  // Base rendering height
    unsigned int width;  // Current rendering width (basic_width, or 2*basic_width during SSAA)
    unsigned int height;  // Current rendering height
    cudaStream_t stream;  // Main rendering stream (Runtime API)
    cudaStream_t dataStream;  // Separate stream for data transfers (Runtime API)
    unsigned int* d_total_iterations;  // Device pointer for total iterations (CUDA Runtime)
    unsigned int* h_total_iterations;  // Pinned host mirror of the iteration counter
public:
    FractalBase();
    ~FractalBase();
    /// Returns whether a CUDA-capable GPU was detected.
    bool get_isCudaAvailable() {return isCudaAvailable;}
    // NOTE(review): the parameter name shadows the CUcontext member 'ctx';
    // consider renaming one of them for clarity.
    void set_context(context_type ctx);
    void set_grid(dim3 block);
    void post_processing();
    /// Starts (or queues) compilation of a custom formula; the returned future
    /// yields the compilation result/log.
    std::shared_future<std::string> set_custom_formula(const std::string& formula);
    context_type get_context();
    void render(render_state quality);
    /// Overload taking a point (e.g., the Julia-set seed or a focus point —
    /// confirm against the implementation).
    void render(
        render_state quality,
        double mouse_x, double mouse_y
    );
    void draw(sf::RenderTarget& target, sf::RenderStates states) const override;
};