#include <algorithm>
#pragma GCC optimize("Ofast")
#pragma GCC target("avx2,fma")
// Matrix dimension the vectorised kernel in matrix_multiply is specialised for.
const int n = 1024;
// Four packed doubles (32 bytes), via the GCC/Clang vector_size extension.
using vector = __attribute((vector_size(32))) double;
// Lvalue view of the four doubles c[i][j .. j+3] of an n x n row-major matrix.
// Relies on a pointer named `c` being in scope where the macro is expanded.
#define loadC(i, j) *(vector*)&c[(i) * n + j]

// Returns true when `p` may be dereferenced as an aligned `vector`.
static bool is_vector_aligned(const void *p) {
    return reinterpret_cast<unsigned long long>(p) % alignof(vector) == 0;
}

// Scalar fallback: c += a * b for row-major N x N matrices of any size and
// alignment. A non-positive N leaves c untouched.
static void multiply_accumulate_scalar(int N, const double *a, const double *b, double *c) {
    for (int row = 0; row < N; ++row) {
        const double *a_row = a + static_cast<long long>(row) * N;
        double *c_row = c + static_cast<long long>(row) * N;
        for (int k = 0; k < N; ++k) {
            const double a_rk = a_row[k];
            const double *b_row = b + static_cast<long long>(k) * N;
            for (int col = 0; col < N; ++col)
                c_row[col] += a_rk * b_row[col];
        }
    }
}

/// Accumulates the product of two row-major N x N matrices into c (c += a * b).
///
/// The existing contents of c are read and added to, so the caller must zero
/// c beforehand to obtain a plain product.
///
/// @param N  matrix dimension
/// @param a  left operand, N * N doubles
/// @param b  right operand, N * N doubles
/// @param c  accumulator / result, N * N doubles
///
/// When N equals the compile-time size `n` and both b and c are aligned for
/// `vector` access, a blocked kernel with a 6-row x 8-column accumulator tile
/// is used. Any other input is handled by a straightforward scalar loop
/// instead of indexing with the hard-coded size.
void matrix_multiply(int N, const double *a, const double *b, double *c) {
    if (N != n || !is_vector_aligned(b) || !is_vector_aligned(c)) {
        multiply_accumulate_scalar(N, a, b, c);
        return;
    }

    // s2: width of a column panel of c; s1: depth of a k-slab.
    // NOTE(review): values look tuned empirically (presumably for cache
    // residency) - confirm before changing.
    const int s2 = 96, s1 = 40;
    static_assert(n % 8 == 0, "each row must split into whole 8-column tiles");
    static_assert(s2 % 8 == 0, "column panels must start on an 8-column boundary");
    static_assert((n - 4) % 6 == 0, "rows must split into 6-row tiles plus one 4-row tail");

    const vector *B = reinterpret_cast<const vector *>(b);
    for (int i = 0; i < n; i += s2)
        for (int l = 0, lim = std::min(i + s2, n); l < n; l += s1) {
            const int r = std::min(l + s1, n);

            // Main part: 6 rows x 8 columns of c held in twelve vectors.
            for (int x = 0; x < n - 4; x += 6)
                for (int y = i; y < lim; y += 8) {
                    vector t00 = loadC(x + 0, y), t01 = loadC(x + 0, y + 4);
                    vector t10 = loadC(x + 1, y), t11 = loadC(x + 1, y + 4);
                    vector t20 = loadC(x + 2, y), t21 = loadC(x + 2, y + 4);
                    vector t30 = loadC(x + 3, y), t31 = loadC(x + 3, y + 4);
                    vector t40 = loadC(x + 4, y), t41 = loadC(x + 4, y + 4);
                    vector t50 = loadC(x + 5, y), t51 = loadC(x + 5, y + 4);
                    for (int k = l; k < r; k++) {
                        // Eight consecutive elements of row k of b.
                        vector b0 = B[(k * n + y) / 4], b1 = B[(k * n + y) / 4 + 1];
                        // `vector{} + scalar` broadcasts one element of a to all lanes.
                        vector a0 = vector{} + a[(x + 0) * n + k];
                        t00 += a0 * b0, t01 += a0 * b1;
                        vector a1 = vector{} + a[(x + 1) * n + k];
                        t10 += a1 * b0, t11 += a1 * b1;
                        vector a2 = vector{} + a[(x + 2) * n + k];
                        t20 += a2 * b0, t21 += a2 * b1;
                        vector a3 = vector{} + a[(x + 3) * n + k];
                        t30 += a3 * b0, t31 += a3 * b1;
                        vector a4 = vector{} + a[(x + 4) * n + k];
                        t40 += a4 * b0, t41 += a4 * b1;
                        vector a5 = vector{} + a[(x + 5) * n + k];
                        t50 += a5 * b0, t51 += a5 * b1;
                    }
                    loadC(x + 0, y) = t00, loadC(x + 0, y + 4) = t01;
                    loadC(x + 1, y) = t10, loadC(x + 1, y + 4) = t11;
                    loadC(x + 2, y) = t20, loadC(x + 2, y + 4) = t21;
                    loadC(x + 3, y) = t30, loadC(x + 3, y + 4) = t31;
                    loadC(x + 4, y) = t40, loadC(x + 4, y + 4) = t41;
                    loadC(x + 5, y) = t50, loadC(x + 5, y + 4) = t51;
                }

            // Tail: the last four rows, which do not fill a 6-row tile.
            const int x = n - 4;
            for (int y = i; y < lim; y += 8) {
                vector t00 = loadC(x + 0, y), t01 = loadC(x + 0, y + 4);
                vector t10 = loadC(x + 1, y), t11 = loadC(x + 1, y + 4);
                vector t20 = loadC(x + 2, y), t21 = loadC(x + 2, y + 4);
                vector t30 = loadC(x + 3, y), t31 = loadC(x + 3, y + 4);
                for (int k = l; k < r; k++) {
                    vector b0 = B[(k * n + y) / 4], b1 = B[(k * n + y) / 4 + 1];
                    vector a0 = vector{} + a[(x + 0) * n + k];
                    t00 += a0 * b0, t01 += a0 * b1;
                    vector a1 = vector{} + a[(x + 1) * n + k];
                    t10 += a1 * b0, t11 += a1 * b1;
                    vector a2 = vector{} + a[(x + 2) * n + k];
                    t20 += a2 * b0, t21 += a2 * b1;
                    vector a3 = vector{} + a[(x + 3) * n + k];
                    t30 += a3 * b0, t31 += a3 * b1;
                }
                loadC(x + 0, y) = t00, loadC(x + 0, y + 4) = t01;
                loadC(x + 1, y) = t10, loadC(x + 1, y + 4) = t11;
                loadC(x + 2, y) = t20, loadC(x + 2, y + 4) = t21;
                loadC(x + 3, y) = t30, loadC(x + 3, y + 4) = t31;
            }
        }
}
// Compilation | N/A | N/A | Compile OK | Score: N/A | Show more |
// Testcase #1 | 120.413 ms | 8 MB + 8 KB | Accepted | Score: 100 | Show more |