I'm looking for feedback and suggestions on improving the performance and quality of my CUDA kernel for rendering the Mandelbrot set. I've implemented a "ping-pong" style coloring and smoothing technique. I have a few questions:
- Are there any obvious bottlenecks or areas where I could improve speed?
- Are there alternative algorithms for smoothing or coloring that might be more efficient or prettier?
Context:
The kernel calculates the Mandelbrot set on the GPU, coloring each pixel based on the number of iterations it takes for the complex number to escape. I'm using a color palette and a smoothing technique to create a more visually pleasing result. Although the C++ standard library includes a complex number class, std::complex is not usable inside CUDA device code (its member functions are not __device__-qualified), so the kernel does the complex arithmetic by hand.
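For reference, libcu++ (which ships with the CUDA Toolkit) provides cuda::std::complex, which does work in device code. Below is a minimal sketch of the escape loop written with it, purely for comparison; I have not benchmarked it against the hand-rolled arithmetic in the kernel, and the helper name is illustrative:

#include <cuda/std/complex>

// Escape-time loop using cuda::std::complex instead of manual arithmetic.
// norm(z) returns |z|^2, so comparing against 4.0 matches the |z| < 2 test.
__device__ double escape_iterations(const double real, const double imag, const double maxIterations) {
    const cuda::std::complex<double> c(real, imag);
    cuda::std::complex<double> z(0.0, 0.0);
    double iteration = 0.0;
    while (cuda::std::norm(z) < 4.0 && iteration < maxIterations) {
        z = z * z + c;
        iteration += 1.0;
    }
    return iteration;
}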
Current Implementation:
/**
 * @brief CUDA kernel function to calculate and render the Mandelbrot set.
 * This kernel is executed in parallel by many GPU threads;
 * each thread calculates the color of a single pixel based on its position.
 *
 * @param pixels Pointer to the pixel data buffer in device memory.
 * @param size_of_pixels Size of the pixel data buffer.
 * @param width Image width.
 * @param height Image height.
 * @param zoom_x Zoom level along the x-axis.
 * @param zoom_y Zoom level along the y-axis.
 * @param x_offset Offset in the x-direction to move the view.
 * @param y_offset Offset in the y-direction to move the view.
 * @param d_palette Color palette in device memory to color the Mandelbrot set.
 * @param paletteSize Size of the color palette.
 * @param maxIterations Maximum iterations for Mandelbrot calculation.
 * @param d_total_iterations Pointer to store the total number of iterations performed.
 */
__global__ void fractal_rendering(
    unsigned char* pixels, const size_t size_of_pixels, const int width, const int height,
    const double zoom_x, const double zoom_y, const double x_offset, const double y_offset,
    sf::Color* d_palette, const int paletteSize, const double maxIterations, unsigned int* d_total_iterations) {
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int id = threadIdx.y * blockDim.x + threadIdx.x;
    if (x == 0 && y == 0) {
        *d_total_iterations = 0;
    }
    const size_t expected_size = width * height * 4;
    const double scale_factor = static_cast<double>(size_of_pixels) / static_cast<double>(expected_size);
    if (x < width && y < height) {
        __shared__ unsigned int total_iterations[1024];
        const double real = x / zoom_x - x_offset;
        const double imag = y / zoom_y - y_offset;
        double new_real = 0.0;
        double z_real = 0.0;
        double z_imag = 0.0;
        double current_iteration = 0;
        double z_comp = z_real * z_real + z_imag * z_imag;
        // Escape-time iteration: z = z^2 + c until |z|^2 >= 4 or the cap is reached.
        while (z_comp < 4 && current_iteration < maxIterations) {
            new_real = (z_real * z_real - z_imag * z_imag) + real;
            z_imag = 2 * z_real * z_imag + imag;
            z_real = new_real;
            z_comp = z_real * z_real + z_imag * z_imag;
            current_iteration++;
        }
        total_iterations[id] = static_cast<unsigned int>(current_iteration);
        __syncthreads();
        // Block-level tree reduction of the per-thread iteration counts.
        for (unsigned int s = blockDim.x * blockDim.y / 2; s > 0; s >>= 1) {
            if (id < s) {
                total_iterations[id] += total_iterations[id + s];
            }
            __syncthreads();
        }
        if (id == 0) {
            atomicAdd(d_total_iterations, total_iterations[0]);
        }
        unsigned char r, g, b;
        if (current_iteration == maxIterations) {
            r = g = b = 0;
        }
        else {
            //modulus = hypot(z_real, z_imag);
            //double escape_radius = 2.0;
            //if (modulus > escape_radius) {
            //    double nu = log2(log2(modulus) - log2(escape_radius));
            //    current_iteration = current_iteration + 1 - nu;
            //}
            float smooth_iteration = current_iteration + 1.0f - log2f(log2f(sqrtf(z_real * z_real + z_imag * z_imag)));
            const float cycle_scale_factor = 25.0f;
            float virtual_pos = smooth_iteration * cycle_scale_factor;
            // Ping-pong: wrap the position onto [0, 2), then fold values above 1
            // back down so the palette is traversed forwards, then backwards.
            float normalized_pingpong = fmodf(virtual_pos / static_cast<float>(paletteSize - 1), 2.0f);
            if (normalized_pingpong < 0.0f) {
                normalized_pingpong += 2.0f;
            }
            float t_interp;
            if (normalized_pingpong <= 1.0f) {
                t_interp = normalized_pingpong;
            } else {
                t_interp = 2.0f - normalized_pingpong;
            }
            float float_index = t_interp * (paletteSize - 1);
            int index1 = static_cast<int>(floorf(float_index));
            int index2 = min(paletteSize - 1, index1 + 1);
            index1 = max(0, index1);
            float t_local = fmodf(float_index, 1.0f);
            if (t_local < 0.0f) t_local += 1.0f;
            sf::Color color1 = getPaletteColor(index1, paletteSize, d_palette);
            sf::Color color2 = getPaletteColor(index2, paletteSize, d_palette);
            // Linear interpolation between the two neighboring palette entries.
            float r_f = static_cast<float>(color1.r) + t_local * (static_cast<float>(color2.r) - static_cast<float>(color1.r));
            float g_f = static_cast<float>(color1.g) + t_local * (static_cast<float>(color2.g) - static_cast<float>(color1.g));
            float b_f = static_cast<float>(color1.b) + t_local * (static_cast<float>(color2.b) - static_cast<float>(color1.b));
            r = static_cast<unsigned char>(max(0.0f, min(255.0f, r_f)));
            g = static_cast<unsigned char>(max(0.0f, min(255.0f, g_f)));
            b = static_cast<unsigned char>(max(0.0f, min(255.0f, b_f)));
        }
        const unsigned int base_index = (y * width + x) * 4;
        // Write the color into scale_factor consecutive RGBA quads
        // (covers a buffer larger than width * height * 4).
        for (int i = 0; i < scale_factor * 4; i += 4) {
            const unsigned int index = base_index + i;
            pixels[index] = r;
            pixels[index + 1] = g;
            pixels[index + 2] = b;
            pixels[index + 3] = 255;
        }
    }
}
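For completeness, the kernel is launched along these lines (a simplified sketch rather than my exact host code; the wrapper name is illustrative). The shared total_iterations[1024] array and the tree reduction assume a power-of-two block of at most 1024 threads:

void render_frame(unsigned char* d_pixels, const size_t size_of_pixels, const int width, const int height,
                  const double zoom_x, const double zoom_y, const double x_offset, const double y_offset,
                  sf::Color* d_palette, const int paletteSize, const double maxIterations,
                  unsigned int* d_total_iterations) {
    dim3 block(32, 32);  // 1024 threads: a power of two, matching the shared array size
    dim3 grid((width + block.x - 1) / block.x,
              (height + block.y - 1) / block.y);
    fractal_rendering<<<grid, block>>>(d_pixels, size_of_pixels, width, height,
                                       zoom_x, zoom_y, x_offset, y_offset,
                                       d_palette, paletteSize, maxIterations,
                                       d_total_iterations);
    cudaDeviceSynchronize();  // wait for the frame before reading back results
}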
System Information:
OS: Arch Linux
GPU: GTX 1660 Ti (compute capability 7.5)
CUDA version: 12.8
These are the metrics reported by nvprof at 800x600 resolution with 2k iterations:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 44.29% 4.33782s 36 120.49ms 31.461ms 521.08ms fractal_rendering(unsigned char*, unsigned long, int, int, double, double, double, double, sf::Color*, int, double, unsigned int*)
29.14% 2.85385s 2806 1.0171ms 238.95us 7.5175ms fractal_rendering(unsigned char*, unsigned long, int, int, float, float, float, float, sf::Color*, int, float, unsigned int*)

Comments:
- maxIterations is a double... Jus' sayin'... Running this on a '286, when those were the cat's pyjamas, I found reasonable success deriving the max iterations to use from the degree of zooming (magnification <=> scaling), using a logarithmic relationship. Renders took literal hours, for lower-res displays, but were worth waiting for. Enjoy the exploration.
- ...double calcs. Here is Part 1 of an enthusiastic intro to the technique... (No idea how much it might slow things down...) Cheers!
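A minimal sketch of that logarithmic idea (hypothetical: the helper name, base_iterations, and the scale constant k are illustrative tuning values, not taken from the comment):

#include <algorithm>
#include <cmath>

// Hypothetical helper: grow the iteration cap with the log of the zoom level.
double adaptive_max_iterations(const double zoom, const double base_iterations = 100.0, const double k = 64.0) {
    return base_iterations + k * std::log2(std::max(zoom, 1.0));
}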