提交记录 17689


用户 题目 状态 得分 用时 内存 语言 代码长度
platelet mmmd1k. 测测你的双精度矩阵乘法-1k Accepted 100 120.413 ms 8200 KB C++ 2.84 KB
提交时间 评测时间
2022-05-07 23:03:41 2022-05-07 23:03:44
#include <algorithm>
#pragma GCC optimize("Ofast")
#pragma GCC target("avx2,fma")

// Problem size the blocked kernel in matrix_multiply is specialised for.
const int n = 1024;
// GCC vector extension type: 32 bytes, i.e. four doubles per vector.
using vector = __attribute((vector_size(32))) double;

// The blocked kernel walks rows in groups of 6 followed by one 4-row tail,
// and columns in groups of 8 (two vectors). Fail loudly if n stops fitting.
static_assert(n % 6 == 4, "row blocking expects groups of 6 plus a 4-row tail");
static_assert(n % 8 == 0, "column blocking expects a multiple of 8 columns");

// Lvalue for the four-double vector that starts at row i, column j of the
// row-major array `c` (a variable named `c` must be in scope at the use site).
#define loadC(i, j) *(vector*)&c[(i) * n + (j)]

// Fallback for any size other than n: straightforward c += a * b on
// row-major N x N arrays. Uses only scalar accesses, so it places no
// alignment requirement on the buffers. Does nothing when N <= 0.
static void matrix_multiply_generic(int N, const double *a, const double *b, double *c) {
	for (int i = 0; i < N; i++) {
		// 64-bit row offsets so i * N cannot overflow int for large N.
		const long long row = static_cast<long long>(i) * N;
		for (int k = 0; k < N; k++) {
			const double aik = a[row + k];
			const long long brow = static_cast<long long>(k) * N;
			for (int j = 0; j < N; j++)
				c[row + j] += aik * b[brow + j];
		}
	}
}

// Accumulates the product of two row-major N x N matrices into c:
// c[x][y] += sum over k of a[x][k] * b[k][y].
//
// The existing contents of c are read and added to, never cleared, so a
// caller that wants the plain product presumably passes a zero-filled c
// (TODO confirm against the caller).
//
// N == n (1024) takes the hand-blocked vector kernel below. That path
// dereferences b and c through vector pointers, so both are assumed to be
// 32-byte aligned -- NOTE(review): confirm the caller guarantees this.
// Every other N is handled by matrix_multiply_generic; previously N was
// ignored and such calls indexed the arrays as if they were 1024 x 1024.
void matrix_multiply(int N, const double *a, const double *b, double *c) {
	if (N != n) {
		matrix_multiply_generic(N, a, b, c);
		return;
	}

	// View b as an array of four-double vectors (read-only).
	const auto B = reinterpret_cast<const vector*>(b);
	// s2: width of a column tile of b/c; s1: depth of a k tile.
	// Presumably tuned for the judge machine's caches -- not derivable here.
	const int s2 = 96, s1 = 40;
	static_assert(s2 % 8 == 0, "column tiles must hold whole 8-column groups");
	// i is the first column of the current column tile, [i, lim).
	for (int i = 0; i < n; i += s2)
		// l is the first k index of the current depth tile, [l, r).
		for (int l = 0, lim = std::min(i + s2, n); l < n; l += s1) {
			int r = std::min(l + s1, n);
			// Main part: rows 0 .. n-5 in groups of 6.
			for (int x = 0; x < n - 4; x += 6)
				for (int y = i; y < lim; y += 8) {
					// 6 rows x 8 columns of c held in 12 vector accumulators.
					vector t00, t01, t10, t11, t20, t21, t30, t31, t40, t41, t50, t51;
					t00 = loadC(x + 0, y), t01 = loadC(x + 0, y + 4);
					t10 = loadC(x + 1, y), t11 = loadC(x + 1, y + 4);
					t20 = loadC(x + 2, y), t21 = loadC(x + 2, y + 4);
					t30 = loadC(x + 3, y), t31 = loadC(x + 3, y + 4);
					t40 = loadC(x + 4, y), t41 = loadC(x + 4, y + 4);
					t50 = loadC(x + 5, y), t51 = loadC(x + 5, y + 4);
					for (int k = l; k < r; k++) {
						// Two adjacent vectors of row k of b (columns y .. y+7).
						// k * n + y is a multiple of 4, so the division is exact.
						vector b0 = B[(k * n + y) / 4], b1 = B[(k * n + y) / 4 + 1];
						// vector{} + scalar broadcasts a[row][k] to all four lanes.
						vector a0 = vector{} + a[(x + 0) * n + k];
						t00 += a0 * b0, t01 += a0 * b1;
						vector a1 = vector{} + a[(x + 1) * n + k];
						t10 += a1 * b0, t11 += a1 * b1;
						vector a2 = vector{} + a[(x + 2) * n + k];
						t20 += a2 * b0, t21 += a2 * b1;
						vector a3 = vector{} + a[(x + 3) * n + k];
						t30 += a3 * b0, t31 += a3 * b1;
						vector a4 = vector{} + a[(x + 4) * n + k];
						t40 += a4 * b0, t41 += a4 * b1;
						vector a5 = vector{} + a[(x + 5) * n + k];
						t50 += a5 * b0, t51 += a5 * b1;
					}
					// Write the updated 6 x 8 block back to c.
					loadC(x + 0, y) = t00, loadC(x + 0, y + 4) = t01;
					loadC(x + 1, y) = t10, loadC(x + 1, y + 4) = t11;
					loadC(x + 2, y) = t20, loadC(x + 2, y + 4) = t21;
					loadC(x + 3, y) = t30, loadC(x + 3, y + 4) = t31;
					loadC(x + 4, y) = t40, loadC(x + 4, y + 4) = t41;
					loadC(x + 5, y) = t50, loadC(x + 5, y + 4) = t51;
				}
			// Tail: the last 4 rows (n - 4 .. n - 1) left over by the groups
			// of 6, processed with the same scheme on a 4 x 8 block.
			const int x = n - 4;
			for (int y = i; y < lim; y += 8) {
				vector t00, t01, t10, t11, t20, t21, t30, t31;
				t00 = loadC(x + 0, y), t01 = loadC(x + 0, y + 4);
				t10 = loadC(x + 1, y), t11 = loadC(x + 1, y + 4);
				t20 = loadC(x + 2, y), t21 = loadC(x + 2, y + 4);
				t30 = loadC(x + 3, y), t31 = loadC(x + 3, y + 4);
				for (int k = l; k < r; k++) {
					vector b0 = B[(k * n + y) / 4], b1 = B[(k * n + y) / 4 + 1];
					vector a0 = vector{} + a[(x + 0) * n + k];
					t00 += a0 * b0, t01 += a0 * b1;
					vector a1 = vector{} + a[(x + 1) * n + k];
					t10 += a1 * b0, t11 += a1 * b1;
					vector a2 = vector{} + a[(x + 2) * n + k];
					t20 += a2 * b0, t21 += a2 * b1;
					vector a3 = vector{} + a[(x + 3) * n + k];
					t30 += a3 * b0, t31 += a3 * b1;
				}
				loadC(x + 0, y) = t00, loadC(x + 0, y + 4) = t01;
				loadC(x + 1, y) = t10, loadC(x + 1, y + 4) = t11;
				loadC(x + 2, y) = t20, loadC(x + 2, y + 4) = t21;
				loadC(x + 3, y) = t30, loadC(x + 3, y + 4) = t31;
			}
		}
}

Compilation N/A N/A Compile OK Score: N/A

Testcase #1 120.413 ms 8 MB + 8 KB Accepted Score: 100


Judge Duck Online | 评测鸭在线
Server Time: 2024-04-23 23:35:53 | Loaded in 1 ms | Server Status
个人娱乐项目,仅供学习交流使用