#ifdef memset0
# include <bits/stdc++.h>
// Side length of the square matrices used by this local test harness.
const int n = 1024;
// Largest absolute element-wise difference between the two results that is
// still reported as "Accepted" by main().
const double eps = 3 * n * n * DBL_EPSILON;
// Implementation under test; its definition appears later in this file.
void matrix_multiply(int, const double*, const double*, double*);
// Plain triple-loop implementation used as the reference result and as the
// timing baseline.
void simple_matrix_multiply(int, const double*, const double*, double*);
// Shared random engine. The seed is a fixed constant XORed with the current
// steady_clock tick count.
std::mt19937 rng(20040602 /*❤️*/ ^ std::chrono::steady_clock::now().time_since_epoch().count());
// Draws one integer of type T uniformly from the closed range [l, r],
// advancing the shared engine `rng`.
template<class T>
inline T rand(T l, T r) {
    std::uniform_int_distribution<T> dist(l, r);
    return dist(rng);
}
// Maps the (row i, column j) pair to its offset in a row-major array whose
// rows are n elements long (n being the file-level constant).
inline int id(int i, int j) {
    const int row_start = i * n;
    return row_start + j;
}
/**
 * Reference matrix product: C = A * B, computed with the textbook triple loop.
 *
 * All three matrices are dense, square, row-major arrays of n * n doubles.
 * The row stride is taken from the parameter n (which shadows the file-level
 * constant of the same name), so the loop bounds and the indexing always
 * agree, whatever size is passed in.
 *
 * @param n  Number of rows and columns; values <= 0 leave C untouched.
 * @param A  Left operand (read only).
 * @param B  Right operand (read only).
 * @param C  Output; every one of its n * n elements is overwritten.
 */
void simple_matrix_multiply(int n, const double* A, const double* B, double* C) {
    if (n <= 0) return;
    const size_t dim = static_cast<size_t>(n);
    for (size_t i = 0; i < dim; i++) {
        const double* a_row = A + i * dim;
        for (size_t j = 0; j < dim; j++) {
            // Accumulate in k order starting from zero, exactly like the
            // element-wise "+=" formulation, so the rounding is unchanged.
            double sum = 0;
            for (size_t k = 0; k < dim; k++) sum += a_row[k] * B[k * dim + j];
            C[i * dim + j] = sum;
        }
    }
}
int main() {
double A[n * n], B[n * n], C[n * n], D[n * n];
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++) {
A[id(i, j)] = rng() / (double)rng.max();
B[id(i, j)] = rng() / (double)rng.max();
}
uint32_t simple_timer = 0;
simple_timer -= clock();
simple_matrix_multiply(n, A, B, D);
simple_timer += clock();
uint32_t timer = 0;
timer -= clock();
matrix_multiply(n, A, B, C);
timer += clock();
double delta = 0;
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++) {
double t = C[id(i, j)] - D[id(i, j)];
if (t < 0) {
delta = std::max(delta, -t);
} else {
delta = std::max(delta, t);
}
}
if (delta > eps) {
printf("Wrong Answer (%.12lf)\n", delta);
return 0;
}
printf("Accepted (time=%.12lf, %.2lf%%)\n", timer / (double)CLOCKS_PER_SEC, timer / (double)simple_timer);
return 0;
}
#endif
#pragma GCC target("avx")
#pragma GCC target("popcnt")
#pragma GCC optimize("unroll-loops")
#pragma GCC optimize("inline-functions")
#pragma GCC optimize("no-stack-protector")
#include <stdint.h>
#include <string.h>

#include <vector>
// Matrix side length as a preprocessor constant. From this line on, every
// identifier token `n` in the file is replaced by 1024.
#define n 1024
// Row-major offset of element (i, j) in a matrix with n columns.
// NOTE(review): no use of this macro is visible in this file.
#define idx(i, j) ((i)*n + (j))
/**
 * Computes the matrix product C = A * _B for dense, square, row-major
 * matrices of doubles.
 *
 * The right operand is first copied into a transposed scratch buffer so that
 * the innermost loop walks both operands with unit stride. Each dot product
 * is accumulated in eight independent partial sums (element k goes to partial
 * sum k mod 8) which are added left to right at the end; for side lengths
 * that are not a multiple of 8 the remaining elements are added afterwards.
 *
 * @param _   Number of rows and columns of every matrix. Values <= 0 make the
 *            call a no-op.
 * @param A   Left operand, _ * _ elements (read only).
 * @param _B  Right operand, _ * _ elements (read only; copied before any
 *            output is written).
 * @param C   Output, _ * _ elements, all overwritten. Must not overlap A,
 *            because rows of A are re-read after elements of C are stored.
 */
void matrix_multiply(int _, const double* A, const double* _B, double* C) {
    if (_ <= 0) return;
    const size_t dim = static_cast<size_t>(_);

    // Heap-allocated transposed copy of _B: row j of `bt` is column j of _B.
    // A local array of this size (8 MiB for 1024 x 1024) could exhaust the
    // stack, so the buffer is owned by a vector instead.
    std::vector<double> transposed(dim * dim);
    double* const bt = transposed.data();
    for (size_t r = 0; r < dim; r++) {
        const double* src = _B + r * dim;
        for (size_t c = 0; c < dim; c++) bt[c * dim + r] = src[c];
    }

    // Length of the prefix that can be processed eight elements at a time.
    const size_t blocked = dim - dim % 8;

    for (size_t i = 0; i < dim; i++) {
        const double* const a_row = A + i * dim;
        double* const c_row = C + i * dim;
        for (size_t j = 0; j < dim; j++) {
            const double* a = a_row;
            const double* b = bt + j * dim;

            // Eight independent accumulators break the dependency chain of a
            // single running sum.
            double s0 = 0, s1 = 0, s2 = 0, s3 = 0;
            double s4 = 0, s5 = 0, s6 = 0, s7 = 0;
            for (size_t k = 0; k < blocked; k += 8, a += 8, b += 8) {
                s0 += a[0] * b[0];
                s1 += a[1] * b[1];
                s2 += a[2] * b[2];
                s3 += a[3] * b[3];
                s4 += a[4] * b[4];
                s5 += a[5] * b[5];
                s6 += a[6] * b[6];
                s7 += a[7] * b[7];
            }
            double total = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7;

            // Leftover elements when the side length is not a multiple of 8.
            for (size_t k = blocked; k < dim; k++, a++, b++) total += (*a) * (*b);

            c_row[j] = total;
        }
    }
}
// Compilation | N/A | N/A | Compile OK | Score: N/A | Show more |
// Testcase #1 | 575.528 ms | 16 MB + 8 KB | Accepted | Score: 100 | Show more |