提交记录 17687


用户 题目 状态 得分 用时 内存 语言 代码长度
platelet mmmd1k. 测测你的双精度矩阵乘法-1k Runtime Error 0 24.46 us 32 KB C++ 1.82 KB
提交时间 评测时间
2022-05-07 22:50:42 2022-05-07 22:50:44
#include <algorithm>
#pragma GCC target("avx2,fma")

// Nominal matrix dimension of the judged problem. Kept for compatibility;
// matrix_multiply itself works from its runtime N argument.
const int n = 1024;

// Four packed doubles (GCC/Clang vector extension), used for register
// accumulators.
using vector = __attribute((vector_size(32))) double;

// Same vector type, but only required to be aligned like a double and
// allowed to alias double storage. All memory accesses go through this type
// so the caller's buffers do not have to be 32-byte aligned.
typedef double unaligned_vector __attribute__((vector_size(32), aligned(8), may_alias));

// Register-tiled micro-kernel: adds the product of a 6 x (kEnd - kBegin)
// strip of A and a (kEnd - kBegin) x 8 strip of B into a 6 x 8 tile of C.
//   stride : number of doubles between consecutive matrix rows
//   a      : first element of the tile's top row in A
//   b      : element of B's row 0 in the tile's first column
//   c      : top-left element of the tile in C
// The caller must guarantee that all 6 rows and 8 columns exist.
static inline void kernel_6x8(long stride, const double *a, const double *b, double *c,
                              int kBegin, int kEnd) {
	unaligned_vector *const c0 = reinterpret_cast<unaligned_vector *>(c);
	unaligned_vector *const c1 = reinterpret_cast<unaligned_vector *>(c + stride);
	unaligned_vector *const c2 = reinterpret_cast<unaligned_vector *>(c + 2 * stride);
	unaligned_vector *const c3 = reinterpret_cast<unaligned_vector *>(c + 3 * stride);
	unaligned_vector *const c4 = reinterpret_cast<unaligned_vector *>(c + 4 * stride);
	unaligned_vector *const c5 = reinterpret_cast<unaligned_vector *>(c + 5 * stride);

	// Twelve accumulators: six rows by two 4-wide column halves.
	vector t00 = c0[0], t01 = c0[1];
	vector t10 = c1[0], t11 = c1[1];
	vector t20 = c2[0], t21 = c2[1];
	vector t30 = c3[0], t31 = c3[1];
	vector t40 = c4[0], t41 = c4[1];
	vector t50 = c5[0], t51 = c5[1];

	for (int k = kBegin; k < kEnd; k++) {
		const unaligned_vector *const bk =
			reinterpret_cast<const unaligned_vector *>(b + k * stride);
		const vector b0 = bk[0], b1 = bk[1];
		// `vector{} + scalar` broadcasts the scalar into all four lanes.
		const vector a0 = vector{} + a[0 * stride + k];
		t00 += a0 * b0, t01 += a0 * b1;
		const vector a1 = vector{} + a[1 * stride + k];
		t10 += a1 * b0, t11 += a1 * b1;
		const vector a2 = vector{} + a[2 * stride + k];
		t20 += a2 * b0, t21 += a2 * b1;
		const vector a3 = vector{} + a[3 * stride + k];
		t30 += a3 * b0, t31 += a3 * b1;
		const vector a4 = vector{} + a[4 * stride + k];
		t40 += a4 * b0, t41 += a4 * b1;
		const vector a5 = vector{} + a[5 * stride + k];
		t50 += a5 * b0, t51 += a5 * b1;
	}

	c0[0] = t00, c0[1] = t01;
	c1[0] = t10, c1[1] = t11;
	c2[0] = t20, c2[1] = t21;
	c3[0] = t30, c3[1] = t31;
	c4[0] = t40, c4[1] = t41;
	c5[0] = t50, c5[1] = t51;
}

// Scalar fallback for the parts of C that do not fill a whole 6x8 tile:
// adds A[rows, kBegin..kEnd) * B[kBegin..kEnd), cols] into C[rows, cols].
// Empty row or column ranges are a no-op.
static inline void accumulate_edge(long stride, const double *a, const double *b, double *c,
                                   int rowBegin, int rowEnd, int colBegin, int colEnd,
                                   int kBegin, int kEnd) {
	for (int row = rowBegin; row < rowEnd; row++)
		for (int k = kBegin; k < kEnd; k++) {
			const double scale = a[row * stride + k];
			for (int col = colBegin; col < colEnd; col++)
				c[row * stride + col] += scale * b[k * stride + col];
		}
}

// Accumulates the matrix product into c: c += a * b.
//   N       : dimension of the square matrices; N <= 0 is a no-op
//   a, b, c : row-major N x N arrays of doubles; no particular alignment is
//             required
// The product is ADDED to the existing contents of c, so the caller must
// zero c beforehand to obtain a plain product.
//
// The work is blocked into column panels of 96 and depth panels of 40, with a
// 6x8 register tile inside each panel. Rows and columns that do not fill a
// whole tile (for N = 1024 the last 1024 % 6 = 4 rows) are handled by the
// scalar fallback instead of running past the end of the arrays.
void matrix_multiply(int N, const double *a, const double *b, double *c) {
	if (N <= 0)
		return;

	const int colBlock = 96, depthBlock = 40;
	const int rowTile = 6, colTile = 8;
	const long stride = N;
	// Rows [0, tiledRows) are covered by whole 6-row tiles.
	const int tiledRows = N - N % rowTile;

	for (int i = 0; i < N; i += colBlock) {
		const int colEnd = std::min(i + colBlock, N);
		// Columns [i, tiledColEnd) are covered by whole 8-column tiles.
		const int tiledColEnd = colEnd - (colEnd - i) % colTile;
		for (int l = 0; l < N; l += depthBlock) {
			const int kEnd = std::min(l + depthBlock, N);
			for (int x = 0; x < tiledRows; x += rowTile)
				for (int y = i; y < tiledColEnd; y += colTile)
					kernel_6x8(stride, a + x * stride, b + y, c + x * stride + y, l, kEnd);
			// Leftover rows beneath the tiled area.
			accumulate_edge(stride, a, b, c, tiledRows, N, i, tiledColEnd, l, kEnd);
			// Leftover columns to the right of the tiled area, for every row.
			accumulate_edge(stride, a, b, c, 0, N, tiledColEnd, colEnd, l, kEnd);
		}
	}
}

Compilation | N/A | N/A | Compile OK | Score: N/A

Testcase #1 | 24.46 us | 32 KB | Runtime Error | Score: 0


Judge Duck Online | 评测鸭在线
Server Time: 2025-07-02 08:56:14 | Loaded in 0 ms | Server Status
个人娱乐项目,仅供学习交流使用 | 捐赠