// 32 KiB
#pragma GCC target("avx2,fma")
#include <string.h>
#include <x86intrin.h>
// dst += src * k
static inline void vector_mul_add(int n, double *dst, const double *src, double k) {
__m256d K = _mm256_set1_pd(k);
#define DST(i) (* (__m256d *) (dst + (i)))
#define SRC(i) (* (__m256d *) (src + (i)))
// assume 4096-aligned
for (int i = 0; i < n; i += 32) {
DST(i + 4 * 0) = _mm256_fmadd_pd(SRC(i + 4 * 0), K, DST(i + 4 * 0));
DST(i + 4 * 1) = _mm256_fmadd_pd(SRC(i + 4 * 1), K, DST(i + 4 * 1));
DST(i + 4 * 2) = _mm256_fmadd_pd(SRC(i + 4 * 2), K, DST(i + 4 * 2));
DST(i + 4 * 3) = _mm256_fmadd_pd(SRC(i + 4 * 3), K, DST(i + 4 * 3));
DST(i + 4 * 4) = _mm256_fmadd_pd(SRC(i + 4 * 4), K, DST(i + 4 * 4));
DST(i + 4 * 5) = _mm256_fmadd_pd(SRC(i + 4 * 5), K, DST(i + 4 * 5));
DST(i + 4 * 6) = _mm256_fmadd_pd(SRC(i + 4 * 6), K, DST(i + 4 * 6));
DST(i + 4 * 7) = _mm256_fmadd_pd(SRC(i + 4 * 7), K, DST(i + 4 * 7));
}
}
void matrix_multiply(int n, const double *A, const double *B, double *C) {
const int i_step = 256;
const int k_step = 16;
const int j_step = 256;
memset(C, 0, n * n * sizeof(double));
for (int i_start = 0; i_start < n; i_start += i_step) {
int i_end = i_start + i_step;
for (int k_start = 0; k_start < n; k_start += k_step) {
int k_end = k_start + k_step <= n ? k_start + k_step : n;
for (int j_start = 0; j_start < n; j_start += j_step) {
for (int i = i_start; i < i_end; i++) {
const double *ai = A + i * n;
double *ci = C + i * n;
// bk[] footprint: 8 * 16 * 256 = 32 KiB
for (int k = k_start; k < k_end; k++) {
const double *bk = B + k * n;
const double aik = ai[k];
vector_mul_add(j_step, ci + j_start, bk + j_start, aik);
}
}
}
}
}
}
| Compilation | N/A | N/A | Compile OK | Score: N/A | 显示更多 |
| Testcase #1 | 131.986 ms | 8 MB + 8 KB | Accepted | Score: 100 | 显示更多 |