#pragma GCC optimize("Ofast,no-stack-protector")
#pragma GCC target("avx2,fma")
#include <string.h>
#include <x86intrin.h>
// Presumably the matrix dimension. NOTE(review): nothing in the visible code
// references this macro (the copy helpers hard-code 1024), and because it is a
// macro it rewrites every later identifier spelled `n`.
#define n 1024
// Row stride, in doubles, of the padded working buffers: 1024 data columns
// plus 32 columns that the visible code never reads or writes.
// NOTE(review): the padding presumably avoids a power-of-two stride — confirm.
#define lda 1056
// Flat offset of element (row i, column j) in a row-major buffer whose rows
// are lda doubles apart.
#define idx(i, j) (((i) * lda) + (j))
// Multiply-accumulate one 32x32x32 tile:
//   C[x+i][z+j] += sum over k of A[x+i][y+k] * B[y+k][z+j],  0 <= i, j, k < 32.
//
// All three matrices are row-major with a row stride of lda doubles.
// The 32 output columns are handled in two passes of 16 columns; within a
// pass, a 4x16 slab of B (four rows, four 4-wide vectors each) is kept in
// locals while all 32 rows of the tile stream through it.
//
// B and C are accessed with aligned 256-bit loads/stores, so &B[idx(y, z)] and
// &C[idx(x, z)] must be 32-byte aligned. A is only read one scalar at a time.
static void kernel32(long x, long y, long z, const double* __restrict__ A, const double* __restrict__ B, double* __restrict__ C) {
    enum { TILE = 32, PASS_COLS = 16, K_STEP = 4, VECS = 4 };

    for (long col = z; col < z + TILE; col += PASS_COLS) {
        for (long kk = y; kk < y + TILE; kk += K_STEP) {
            // Slab of B: rows kk..kk+3, columns col..col+15.
            __m256d b_slab[K_STEP][VECS];
            for (int r = 0; r < K_STEP; ++r) {
                for (int v = 0; v < VECS; ++v) {
                    b_slab[r][v] = _mm256_load_pd(&B[idx(kk + r, col + 4 * v)]);
                }
            }

            for (long row = x; row < x + TILE; ++row) {
                const double* a_row = &A[idx(row, kk)];
                double* c_row = &C[idx(row, col)];

                __m256d acc0 = _mm256_load_pd(c_row + 0);
                __m256d acc1 = _mm256_load_pd(c_row + 4);
                __m256d acc2 = _mm256_load_pd(c_row + 8);
                __m256d acc3 = _mm256_load_pd(c_row + 12);

                // Same accumulation order as a hand-unrolled version:
                // k-offset 0, 1, 2, 3, and within each offset vectors 0..3.
                for (int r = 0; r < K_STEP; ++r) {
                    const __m256d a_bcast = _mm256_set1_pd(a_row[r]);
                    acc0 += a_bcast * b_slab[r][0];
                    acc1 += a_bcast * b_slab[r][1];
                    acc2 += a_bcast * b_slab[r][2];
                    acc3 += a_bcast * b_slab[r][3];
                }

                _mm256_store_pd(c_row + 0, acc0);
                _mm256_store_pd(c_row + 4, acc1);
                _mm256_store_pd(c_row + 8, acc2);
                _mm256_store_pd(c_row + 12, acc3);
            }
        }
    }
}
// Recursively covers the cube of (x, y, z) index triples with side 2^s and
// minimum corner (x, y, z), calling kernel32 once for every 32x32x32 sub-cube.
//
// (dx, dy, dz), (dx2, dy2, dz2) and (dx3, dy3, dz3) are three direction
// vectors; the top-level call passes the unit axes, and the recursion only
// permutes and negates them. A negative component means the cube is walked
// from its far side along that axis.
//
// The eight octants are visited in an order where consecutive octants differ
// along exactly one direction vector (0, d1, d1+d2, d2, d2+d3, d1+d2+d3,
// d1+d3, d3). NOTE(review): this looks like a 3-D Hilbert-style space-filling
// curve, presumably chosen for cache locality — confirm.
//
// Precondition: s >= 5. The recursion bottoms out only at s == 5 (2^5 == 32,
// the tile size kernel32 handles).
static void gao(int s, int x, int y, int z, int dx, int dy, int dz, int dx2, int dy2, int dz2, int dx3, int dy3, int dz3, const double* __restrict__ A, const double* __restrict__ B, double* __restrict__ C) {
    if (s == 5) {
        kernel32(x, y, z, A, B, C);
        return;
    }
    --s;

    // Octant side length. The direction components can be -1, and
    // left-shifting a negative value is undefined in C11 (and in C++ before
    // C++20), so scale by multiplication instead of `d << s`.
    const int half = 1 << s;
    const int ux = dx * half, uy = dy * half, uz = dz * half;
    const int vx = dx2 * half, vy = dy2 * half, vz = dz2 * half;
    const int wx = dx3 * half, wy = dy3 * half, wz = dz3 * half;

    // For every negative direction component, move the origin to the upper
    // half along that axis so that adding the (negative) step lands back in
    // the lower half.
    if (dx < 0) x -= ux;
    if (dy < 0) y -= uy;
    if (dz < 0) z -= uz;
    if (dx2 < 0) x -= vx;
    if (dy2 < 0) y -= vy;
    if (dz2 < 0) z -= vz;
    if (dx3 < 0) x -= wx;
    if (dy3 < 0) y -= wy;
    if (dz3 < 0) z -= wz;

    // Octant visit order and the direction frames handed to each child are
    // order-sensitive; do not reorder.
    gao(s, x, y, z, dx2, dy2, dz2, dx3, dy3, dz3, dx, dy, dz, A, B, C);
    gao(s, x + ux, y + uy, z + uz, dx3, dy3, dz3, dx, dy, dz, dx2, dy2, dz2, A, B, C);
    gao(s, x + ux + vx, y + uy + vy, z + uz + vz, dx3, dy3, dz3, dx, dy, dz, dx2, dy2, dz2, A, B, C);
    gao(s, x + vx, y + vy, z + vz, -dx, -dy, -dz, -dx2, -dy2, -dz2, dx3, dy3, dz3, A, B, C);
    gao(s, x + vx + wx, y + vy + wy, z + vz + wz, -dx, -dy, -dz, -dx2, -dy2, -dz2, dx3, dy3, dz3, A, B, C);
    gao(s, x + ux + vx + wx, y + uy + vy + wy, z + uz + vz + wz, -dx3, -dy3, -dz3, dx, dy, dz, -dx2, -dy2, -dz2, A, B, C);
    gao(s, x + ux + wx, y + uy + wy, z + uz + wz, -dx3, -dy3, -dz3, dx, dy, dz, -dx2, -dy2, -dz2, A, B, C);
    gao(s, x + wx, y + wy, z + wz, dx2, dy2, dz2, -dx3, -dy3, -dz3, -dx, -dy, -dz, A, B, C);
}
// Copies a dense 1024x1024 row-major matrix (row stride 1024) from src into
// dst, whose rows are lda doubles apart. The trailing lda - 1024 doubles of
// each destination row are left untouched.
static void convert(const double* __restrict__ src, double* __restrict__ dst) {
    enum { DIM = 1024 };
    const size_t row_bytes = sizeof(double) * DIM;
    const double* in_row = src;
    double* out_row = dst;
    for (int row = 0; row < DIM; ++row) {
        memcpy(out_row, in_row, row_bytes);
        in_row += DIM;
        out_row += lda;
    }
}
// Inverse of the padded copy: reads 1024 rows of 1024 doubles from src, whose
// rows are lda doubles apart, and packs them densely (row stride 1024) into
// dst. The padding columns of src are skipped.
static void iconvert(const double* __restrict__ src, double* __restrict__ dst) {
    enum { DIM = 1024 };
    const size_t row_bytes = sizeof(double) * DIM;
    const double* in_row = src;
    double* out_row = dst;
    for (int row = 0; row < DIM; ++row) {
        memcpy(out_row, in_row, row_bytes);
        in_row += lda;
        out_row += DIM;
    }
}
// Computes _C = _A * _B for dense, row-major 1024x1024 matrices of doubles.
//
// The first (unnamed) argument is ignored; the implementation is hard-wired
// to 1024x1024. _A and _B are read only; _C receives the 1024x1024 result and
// its previous contents are not used.
//
// The inputs are copied into working buffers with a padded row stride of lda
// doubles, the product is accumulated tile by tile, and the result is copied
// back without the padding.
//
// The working buffers have static storage, so this function is not reentrant
// and must not be called from several threads at once.
void matrix_multiply(int, const double* _A, const double* _B, double* _C) {
    // Each buffer is about 8.5 MiB; three of them as automatic variables
    // would exceed the common 8 MiB default stack limit. The alignment
    // attribute is written on every declaration because, in a
    // multi-declarator declaration, a trailing attribute binds only to the
    // last declarator — and the kernel uses aligned 256-bit accesses on both
    // B and C.
    static double A[lda * lda] __attribute__((aligned(4096)));
    static double B[lda * lda] __attribute__((aligned(4096)));
    static double C[lda * lda] __attribute__((aligned(4096)));

    convert(_A, A);
    convert(_B, B);
    // The kernel accumulates into C, so it has to start from zero on every
    // call (static storage is only zeroed once, at program start).
    memset(C, 0, sizeof(C));
    // s = 10: cover the full 2^10 = 1024 index cube, starting at the origin
    // with the three unit axes as direction vectors.
    gao(10, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, A, B, C);
    iconvert(C, _C);
}
// Compilation | N/A | N/A | Compile OK | Score: N/A | Show more |
// Testcase #1 | 78.65 ms | 33 MB + 552 KB | Accepted | Score: 100 | Show more |