提交记录 13574


用户 题目 状态 得分 用时 内存 语言 代码长度
saffah mmmd1k. 测测你的双精度矩阵乘法-1k Accepted 100 142.416 ms 8200 KB C++ 3.55 KB
提交时间 评测时间
2020-08-05 15:54:57 2020-08-05 15:55:01
#pragma GCC optimize("Ofast,unroll-loops,no-stack-protector")
#pragma GCC target("avx2,fma")
#include <string.h>
#include <x86intrin.h>

#define n 1024
#define idx(i, j) ((i) * n + (j))

static void gao(int s, int x, int y, int z, int dx, int dy, int dz, int dx2, int dy2, int dz2, int dx3, int dy3, int dz3, const double* __restrict__ A, const double* __restrict__ B, double* __restrict__ C) {
    if (s == 5) {
        const int m = 32;
        for (int i = 0; i < m; ++i) {
            __m256d c0 = *(__m256d*) &C[idx(x + i, z + 4 * 0)];
            __m256d c1 = *(__m256d*) &C[idx(x + i, z + 4 * 1)];
            __m256d c2 = *(__m256d*) &C[idx(x + i, z + 4 * 2)];
            __m256d c3 = *(__m256d*) &C[idx(x + i, z + 4 * 3)];
            __m256d c4 = *(__m256d*) &C[idx(x + i, z + 4 * 4)];
            __m256d c5 = *(__m256d*) &C[idx(x + i, z + 4 * 5)];
            __m256d c6 = *(__m256d*) &C[idx(x + i, z + 4 * 6)];
            __m256d c7 = *(__m256d*) &C[idx(x + i, z + 4 * 7)];
            for (int j = 0; j < m; ++j) {
                const double a = A[idx(x + i, y + j)];
                c0 += a * *(__m256d*) &B[idx(y + j, z + 4 * 0)];
                c1 += a * *(__m256d*) &B[idx(y + j, z + 4 * 1)];
                c2 += a * *(__m256d*) &B[idx(y + j, z + 4 * 2)];
                c3 += a * *(__m256d*) &B[idx(y + j, z + 4 * 3)];
                c4 += a * *(__m256d*) &B[idx(y + j, z + 4 * 4)];
                c5 += a * *(__m256d*) &B[idx(y + j, z + 4 * 5)];
                c6 += a * *(__m256d*) &B[idx(y + j, z + 4 * 6)];
                c7 += a * *(__m256d*) &B[idx(y + j, z + 4 * 7)];
            }
            *(__m256d*) &C[idx(x + i, z + 4 * 0)] = c0;
            *(__m256d*) &C[idx(x + i, z + 4 * 1)] = c1;
            *(__m256d*) &C[idx(x + i, z + 4 * 2)] = c2;
            *(__m256d*) &C[idx(x + i, z + 4 * 3)] = c3;
            *(__m256d*) &C[idx(x + i, z + 4 * 4)] = c4;
            *(__m256d*) &C[idx(x + i, z + 4 * 5)] = c5;
            *(__m256d*) &C[idx(x + i, z + 4 * 6)] = c6;
            *(__m256d*) &C[idx(x + i, z + 4 * 7)] = c7;
        }
        return;
    }
    --s;
    if (dx < 0) x -= dx << s;
    if (dy < 0) y -= dy << s;
    if (dz < 0) z -= dz << s;
    if (dx2 < 0) x -= dx2 << s;
    if (dy2 < 0) y -= dy2 << s;
    if (dz2 < 0) z -= dz2 << s;
    if (dx3 < 0) x -= dx3 << s;
    if (dy3 < 0) y -= dy3 << s;
    if (dz3 < 0) z -= dz3 << s;
    gao(s, x, y, z, dx2, dy2, dz2, dx3, dy3, dz3, dx, dy, dz, A, B, C);
    gao(s, x + (dx << s), y + (dy << s), z + (dz << s), dx3, dy3, dz3, dx, dy, dz, dx2, dy2, dz2, A, B, C);
    gao(s, x + (dx << s) + (dx2 << s), y + (dy << s) + (dy2 << s), z + (dz << s) + (dz2 << s), dx3, dy3, dz3, dx, dy, dz, dx2, dy2, dz2, A, B, C);
    gao(s, x + (dx2 << s), y + (dy2 << s), z + (dz2 << s), -dx, -dy, -dz, -dx2, -dy2, -dz2, dx3, dy3, dz3, A, B, C);
    gao(s, x + (dx2 << s) + (dx3 << s), y + (dy2 << s) + (dy3 << s), z + (dz2 << s) + (dz3 << s), -dx, -dy, -dz, -dx2, -dy2, -dz2, dx3, dy3, dz3, A, B, C);
    gao(s, x + (dx << s) + (dx2 << s) + (dx3 << s), y + (dy << s) + (dy2 << s) + (dy3 << s), z + (dz << s) + (dz2 << s) + (dz3 << s), -dx3, -dy3, -dz3, dx, dy, dz, -dx2, -dy2, -dz2, A, B, C);
    gao(s, x + (dx << s) + (dx3 << s), y + (dy << s) + (dy3 << s), z + (dz << s) + (dz3 << s), -dx3, -dy3, -dz3, dx, dy, dz, -dx2, -dy2, -dz2, A, B, C);
    gao(s, x + (dx3 << s), y + (dy3 << s), z + (dz3 << s), dx2, dy2, dz2, -dx3, -dy3, -dz3, -dx, -dy, -dz, A, B, C);
}
void matrix_multiply(int, const double* A, const double* B, double* C) {
    memset(C, 0, 1024 * 1024 * sizeof(double));
    gao(10, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, A, B, C);
}

CompilationN/AN/ACompile OKScore: N/A

Testcase #1142.416 ms8 MB + 8 KBAcceptedScore: 100


Judge Duck Online | 评测鸭在线
Server Time: 2026-03-24 03:38:39 | Loaded in 1 ms | Server Status
个人娱乐项目,仅供学习交流使用 | 捐赠