N      64 128  256  512  1024  2048
naive  90 305 1360 2700  9500   TLE
v1     19  45  170  340  1460  7900
v2     18  40  170  245  1030  4300
v3     22  44   85  150   380   950
v4     17  35   70  150   310   750
v5     18  35   70  140   275   550
ideal  16  32   64  128   256   512
size_t jb = std::min(512u, N);
size_t kb = std::min(24u, N);
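// jb and kb are the tile sizes for the j and k loops; the i loop is not tiled here, only unrolled by two rows.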

for (size_t jj = 0; jj < N; jj += jb)
{
    for (size_t kk = 0; kk < N; kk += kb)
    {
        for (size_t i = 0; i < N; i += 2) {
            for (size_t j = jj; j < jj + jb; j += 16) {
                __m256i sumA_1, sumB_1, sumA_2, sumB_2;
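                // The first k-tile starts the accumulators at zero; later k-tiles resume from the partial sums already stored in result.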
                if (kk == 0) {
                    sumA_1 = sumB_1 = sumA_2 = sumB_2 = _mm256_setzero_si256();
                }
                else {
                    sumA_1 = _mm256_load_si256((__m256i*)&result[i][j]);
                    sumB_1 = _mm256_load_si256((__m256i*)&result[i][j + 8]);
                    sumA_2 = _mm256_load_si256((__m256i*)&result[i + 1][j]);
                    sumB_2 = _mm256_load_si256((__m256i*)&result[i + 1][j + 8]);
                }
                size_t limit = std::min(N, kk + kb);
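                // Broadcast one element of mat1 per row and multiply-add it with 16 consecutive columns of mat2; limit handles a partial final k-tile.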
                for (size_t k = kk; k < limit; k++) {
                    auto bc_mat1_1 = _mm256_set1_epi32(mat1[i][k]);
                    auto vecA_mat2 = _mm256_loadu_si256((__m256i*)&mat2[k][j]);
                    auto vecB_mat2 = _mm256_loadu_si256((__m256i*)&mat2[k][j + 8]);
                    sumA_1 = _mm256_add_epi32(sumA_1, _mm256_mullo_epi32(bc_mat1_1, vecA_mat2));
                    sumB_1 = _mm256_add_epi32(sumB_1, _mm256_mullo_epi32(bc_mat1_1, vecB_mat2));
                    auto bc_mat1_2 = _mm256_set1_epi32(mat1[i + 1][k]);
                    sumA_2 = _mm256_add_epi32(sumA_2, _mm256_mullo_epi32(bc_mat1_2, vecA_mat2));
                    sumB_2 = _mm256_add_epi32(sumB_2, _mm256_mullo_epi32(bc_mat1_2, vecB_mat2));
                }
                _mm256_storeu_si256((__m256i*)&result[i][j], sumA_1);
                _mm256_storeu_si256((__m256i*)&result[i][j + 8], sumB_1);
                _mm256_storeu_si256((__m256i*)&result[i + 1][j], sumA_2);
                _mm256_storeu_si256((__m256i*)&result[i + 1][j + 8], sumB_2);
            }
        }
    }
}

Yet another version, with even more tiling and with matrix 2 rearranged into the order in which the kernel reads it. Added to the table of times above. Of course, some time can be saved if that matrix can be assumed to already be in that order, but I counted the rearranging in the benchmarks. That overhead scales as O(N²) while the meat of the algorithm scales as O(N³), so for a large matrix it does not represent a significant cost anyway (for N = 2048 it is roughly 4 million copied elements against roughly 8.6 billion multiply-adds).

It seems to behave well now, staying around 110% of the theoretical optimum for any size I test (e.g. 550 against the ideal 512 at N = 2048). Perhaps some small tweaks are still possible. For example, unrolling the i loop by 4 instead of 2 improved it slightly in my tests, but the difference is fairly small; a sketch of such a kernel is shown after the v5 code below.

v5:

size_t ib = std::min(256, (int)N);
size_t jb = std::min(512, (int)N);
size_t kb = std::min(16, (int)N);

int *mat2 = (int*)_aligned_malloc(N * N * sizeof(int), 32);
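// Repack mat2in into this tile-major scratch buffer so the multiplication kernel below reads it strictly sequentially.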
size_t m2idx = 0;
for (size_t jj = 0; jj < N; jj += jb)
{
    for (size_t kk = 0; kk < N; kk += kb)
    {
        for (size_t j = jj; j < jj + jb; j += 16)
        {
            for (size_t k = kk; k < kk + kb; k++)
            {
                auto vecA_mat2 = _mm256_loadu_si256((__m256i*)&mat2in[k][j]);
                auto vecB_mat2 = _mm256_loadu_si256((__m256i*)&mat2in[k][j + 8]);
                _mm256_storeu_si256((__m256i*)&mat2[m2idx], vecA_mat2);
                _mm256_storeu_si256((__m256i*)&mat2[m2idx + 8], vecB_mat2);
                m2idx += 16;
            }
        }
    }
}


for (size_t ii = 0; ii < N; ii += ib) {
    for (size_t jj = 0; jj < N; jj += jb) {
        for (size_t kk = 0; kk < N; kk += kb) {
            for (size_t i = ii; i < ii + ib; i += 2) {
                for (size_t j = jj; j < jj + jb; j += 16) {
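                    // Offset of this 16-column strip within the repacked mat2:
                    // jj * N skips earlier j-tiles, kk * jb skips earlier k-tiles inside this j-tile, and (j - jj) * kb skips earlier strips inside this (jj, kk) tile.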
                    size_t m2idx = (j - jj) * kb + kk * jb + jj * N;
                    __m256i sumA_1, sumB_1, sumA_2, sumB_2;
                    if (kk == 0) {
                        sumA_1 = sumB_1 = sumA_2 = sumB_2 = _mm256_setzero_si256();
                    }
                    else {
                        sumA_1 = _mm256_load_si256((__m256i*)&result[i][j]);
                        sumB_1 = _mm256_load_si256((__m256i*)&result[i][j + 8]);
                        sumA_2 = _mm256_load_si256((__m256i*)&result[i + 1][j]);
                        sumB_2 = _mm256_load_si256((__m256i*)&result[i + 1][j + 8]);
                    }
                    
                    for (size_t k = kk; k < kk + kb; k++) {
                        auto bc_mat1_1 = _mm256_set1_epi32(mat1[i][k]);
                        auto vecA_mat2 = _mm256_loadu_si256((__m256i*)(mat2 + m2idx));
                        auto vecB_mat2 = _mm256_loadu_si256((__m256i*)(mat2 + m2idx + 8));
                        sumA_1 = _mm256_add_epi32(sumA_1, _mm256_mullo_epi32(bc_mat1_1, vecA_mat2));
                        sumB_1 = _mm256_add_epi32(sumB_1, _mm256_mullo_epi32(bc_mat1_1, vecB_mat2));
                        auto bc_mat1_2 = _mm256_set1_epi32(mat1[i + 1][k]);
                        sumA_2 = _mm256_add_epi32(sumA_2, _mm256_mullo_epi32(bc_mat1_2, vecA_mat2));
                        sumB_2 = _mm256_add_epi32(sumB_2, _mm256_mullo_epi32(bc_mat1_2, vecB_mat2));
                        m2idx += 16;
                    }
                    _mm256_storeu_si256((__m256i*)&result[i][j], sumA_1);
                    _mm256_storeu_si256((__m256i*)&result[i][j + 8], sumB_1);
                    _mm256_storeu_si256((__m256i*)&result[i + 1][j], sumA_2);
                    _mm256_storeu_si256((__m256i*)&result[i + 1][j + 8], sumB_2);
                }
            }
        }
    }
}

_aligned_free(mat2);
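
A minimal sketch of the 4-way i unroll mentioned above, reusing the names and the repacked mat2 layout of v5 (illustrative only, not the exact code behind the timings in the table):

// Sketch: i stepped by 4, so four result rows share every load from the repacked mat2.
// Drop-in replacement for the i/j/k loops of v5; assumes ib (and N) is a multiple of 4.
for (size_t i = ii; i < ii + ib; i += 4) {
    for (size_t j = jj; j < jj + jb; j += 16) {
        size_t m2idx = (j - jj) * kb + kk * jb + jj * N;
        __m256i sumA[4], sumB[4];
        for (size_t r = 0; r < 4; r++) {
            if (kk == 0) {
                sumA[r] = sumB[r] = _mm256_setzero_si256();
            }
            else {
                sumA[r] = _mm256_loadu_si256((__m256i*)&result[i + r][j]);
                sumB[r] = _mm256_loadu_si256((__m256i*)&result[i + r][j + 8]);
            }
        }
        for (size_t k = kk; k < kk + kb; k++) {
            auto vecA_mat2 = _mm256_loadu_si256((__m256i*)(mat2 + m2idx));
            auto vecB_mat2 = _mm256_loadu_si256((__m256i*)(mat2 + m2idx + 8));
            for (size_t r = 0; r < 4; r++) {
                auto bc_mat1 = _mm256_set1_epi32(mat1[i + r][k]);
                sumA[r] = _mm256_add_epi32(sumA[r], _mm256_mullo_epi32(bc_mat1, vecA_mat2));
                sumB[r] = _mm256_add_epi32(sumB[r], _mm256_mullo_epi32(bc_mat1, vecB_mat2));
            }
            m2idx += 16;
        }
        for (size_t r = 0; r < 4; r++) {
            _mm256_storeu_si256((__m256i*)&result[i + r][j], sumA[r]);
            _mm256_storeu_si256((__m256i*)&result[i + r][j + 8], sumB[r]);
        }
    }
}

With four rows and two vectors per row there are eight accumulators, which still fits comfortably in the sixteen YMM registers, and each pair of loads from mat2 is now shared by four rows instead of two.
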
The biggest performance problem (for small matrices), and a common mistake, is doing this:
