提交记录 13597
| 提交时间 | 评测时间 |
| --- | --- |
| 2020-08-05 17:35:07 | 2020-08-05 17:35:11 |
// ikj registers
#pragma GCC target("avx2,fma")
#include <string.h>
#include <x86intrin.h>
/* Per-function target so the AVX2/FMA intrinsics below compile even where a
 * file-level "#pragma GCC target" is not in effect. */
#define MM_TARGET __attribute__((target("avx2,fma")))

/* Expands f(0) ... f(14): one expansion per accumulator of a 60-column panel
 * (15 vectors of 4 doubles). Kept as macros so each accumulator is a distinct
 * named local rather than an array element. */
#define MM_FOR15(f) \
    f(0) f(1) f(2) f(3) f(4) f(5) f(6) f(7) \
    f(8) f(9) f(10) f(11) f(12) f(13) f(14)
#define MM_LOAD(v) __m256d acc##v = _mm256_loadu_pd(c_seg + (v) * 4);
#define MM_FMA(v) \
    acc##v = _mm256_fmadd_pd(_mm256_loadu_pd(b_k + (v) * 4), a_ik, acc##v);
#define MM_STORE(v) _mm256_storeu_pd(c_seg + (v) * 4, acc##v);

/* c_seg[0..59] += sum over k in [k_start, k_end) of a_row[k] * b_col[k*ld + 0..59]. */
static inline MM_TARGET void mm_row_panel60(const double *a_row,
                                            const double *b_col,
                                            double *c_seg, size_t ld,
                                            int k_start, int k_end)
{
    MM_FOR15(MM_LOAD)
    for (int k = k_start; k < k_end; k++) {
        const double *b_k = b_col + (size_t)k * ld;
        const __m256d a_ik = _mm256_set1_pd(a_row[k]);
        MM_FOR15(MM_FMA)
    }
    MM_FOR15(MM_STORE)
}

/* c_seg[0..3] += sum over k in [k_start, k_end) of a_row[k] * b_col[k*ld + 0..3]. */
static inline MM_TARGET void mm_row_vec4(const double *a_row,
                                         const double *b_col,
                                         double *c_seg, size_t ld,
                                         int k_start, int k_end)
{
    __m256d acc = _mm256_loadu_pd(c_seg);
    for (int k = k_start; k < k_end; k++) {
        const __m256d a_ik = _mm256_set1_pd(a_row[k]);
        acc = _mm256_fmadd_pd(_mm256_loadu_pd(b_col + (size_t)k * ld), a_ik, acc);
    }
    _mm256_storeu_pd(c_seg, acc);
}

/* Same update as mm_row_vec4 but only for the first `rem` (1..3) columns.
 * Masked loads/stores are used so memory past the end of the row is never
 * read or written. */
static inline MM_TARGET void mm_row_tail(const double *a_row,
                                         const double *b_col,
                                         double *c_seg, size_t ld,
                                         int k_start, int k_end, int rem)
{
    /* Lane l is active (all bits set) iff l < rem. */
    const __m256i mask = _mm256_cmpgt_epi64(_mm256_set1_epi64x(rem),
                                            _mm256_setr_epi64x(0, 1, 2, 3));
    __m256d acc = _mm256_maskload_pd(c_seg, mask);
    for (int k = k_start; k < k_end; k++) {
        const __m256d a_ik = _mm256_set1_pd(a_row[k]);
        const __m256d b_v = _mm256_maskload_pd(b_col + (size_t)k * ld, mask);
        acc = _mm256_fmadd_pd(b_v, a_ik, acc);
    }
    _mm256_maskstore_pd(c_seg, mask, acc);
}

/**
 * Computes C = A * B for dense, row-major n x n matrices of doubles.
 *
 * C is zeroed first and then accumulated block by block; for every element
 * the k index is visited in increasing order using fused multiply-adds.
 * The matrices need not be 32-byte aligned and n may be any non-negative
 * value; n <= 0 is a no-op.
 *
 * @param n  matrix dimension (rows == columns)
 * @param A  left operand, n*n doubles, read only
 * @param B  right operand, n*n doubles, read only
 * @param C  output, n*n doubles, overwritten
 */
MM_TARGET
void matrix_multiply(int n, const double *A, const double *B, double *C)
{
    /* Blocking factors carried over unchanged from the original kernel.
     * J_STEP is 15 vectors of VEC doubles. */
    enum { I_STEP = 256, K_STEP = 23, J_STEP = 60, VEC = 4 };

    if (n <= 0) {
        return;
    }

    const size_t ld = (size_t)n; /* row stride; size_t avoids int overflow in i*n */
    memset(C, 0, ld * ld * sizeof(double));

    const int j_panel_end = n - n % J_STEP; /* columns covered by full 60-wide panels */
    const int j_vec_end = n - n % VEC;      /* columns covered by whole 4-wide vectors */
    const int j_rem = n - j_vec_end;        /* 0..3 leftover columns */

    for (int i_start = 0, i_end; i_start < n; i_start = i_end) {
        /* Clamp without computing i_start + I_STEP past n. */
        i_end = (n - i_start < I_STEP) ? n : i_start + I_STEP;

        for (int k_start = 0, k_end; k_start < n; k_start = k_end) {
            k_end = (n - k_start < K_STEP) ? n : k_start + K_STEP;

            /* Full panels: 15 accumulators held across the whole k block. */
            for (int j = 0; j < j_panel_end; j += J_STEP) {
                for (int i = i_start; i < i_end; i++) {
                    mm_row_panel60(A + (size_t)i * ld, B + j,
                                   C + (size_t)i * ld + j, ld, k_start, k_end);
                }
            }

            /* Remaining whole vectors, one accumulator each. */
            for (int j = j_panel_end; j < j_vec_end; j += VEC) {
                for (int i = i_start; i < i_end; i++) {
                    mm_row_vec4(A + (size_t)i * ld, B + j,
                                C + (size_t)i * ld + j, ld, k_start, k_end);
                }
            }

            /* Final 1..3 columns when n is not a multiple of 4. */
            if (j_rem != 0) {
                for (int i = i_start; i < i_end; i++) {
                    mm_row_tail(A + (size_t)i * ld, B + j_vec_end,
                                C + (size_t)i * ld + j_vec_end, ld,
                                k_start, k_end, j_rem);
                }
            }
        }
    }
}

#undef MM_STORE
#undef MM_FMA
#undef MM_LOAD
#undef MM_FOR15
#undef MM_TARGET
| Compilation | N/A | N/A | Compile OK | Score: N/A | 显示更多 |
| Testcase #1 | 106.8 ms | 8 MB + 8 KB | Accepted | Score: 100 | 显示更多 |
Judge Duck Online | 评测鸭在线
Server Time: 2026-03-24 02:06:39 | Loaded in 1 ms | Server Status
个人娱乐项目,仅供学习交流使用 | 捐赠