#include <cstdlib>
#define idx(i, j) ((i) * n + (j))
void matrix_multiply_small(int n, const double *A, const double *B, double *C) {
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
C[idx(i, j)] = 0;
}
}
for (int i = 0; i < n; i++) {
for (int k = 0; k < n; k++) {
for (int j = 0; j < n; j++) {
C[idx(i, j)] += A[idx(i, k)] * B[idx(k, j)];
}
}
}
}
void construct_sub_matrix(int n, const double *A, double **X) {
int m = n >> 1;
for (int i = 0, id = 0; i < m; i++)
for (int j = 0; j < m; j++)
X[0][id++] = A[idx(i, j)];
for (int i = 0, id = 0; i < m; i++)
for (int j = m; j < n; j++)
X[1][id++] = A[idx(i, j)];
for (int i = m, id = 0; i < n; i++)
for (int j = 0; j < m; j++)
X[2][id++] = A[idx(i, j)];
for (int i = m, id = 0; i < n; i++)
for (int j = m; j < n; j++)
X[3][id++] = A[idx(i, j)];
}
void matrix_multiply(int n, const double *A, const double *B, double *C) {
if (n <= 32) {
matrix_multiply_small(n, A, B, C);
return;
}
double *X[8], *M[8];
int m = n >> 1;
for (int index = 0; index < 8; index++) {
X[index] = (double*) malloc(sizeof(double) * m * m);
M[index] = (double*) malloc(sizeof(double) * m * m);
}
double **Y = X + 4;
construct_sub_matrix(n, A, X);
construct_sub_matrix(n, B, Y);
matrix_multiply(m, X[0], Y[0], M[0]);
matrix_multiply(m, X[0], Y[1], M[1]);
matrix_multiply(m, X[1], Y[2], M[2]);
matrix_multiply(m, X[1], Y[3], M[3]);
matrix_multiply(m, X[2], Y[0], M[4]);
matrix_multiply(m, X[2], Y[1], M[5]);
matrix_multiply(m, X[3], Y[2], M[6]);
matrix_multiply(m, X[3], Y[3], M[7]);
for (int i = 0, id = 0; i < m; i++)
for (int j = 0; j < m; j++)
C[idx(i, j)] = M[0][id] + M[2][id], ++id;
for (int i = 0, id = 0; i < m; i++)
for (int j = m; j < n; j++)
C[idx(i, j)] = M[1][id] + M[3][id], ++id;
for (int i = m, id = 0; i < n; i++)
for (int j = 0; j < m; j++)
C[idx(i, j)] = M[4][id] + M[6][id], ++id;
for (int i = m, id = 0; i < n; i++)
for (int j = m; j < n; j++)
C[idx(i, j)] = M[5][id] + M[7][id], ++id;
for (int i = 0; i < 8; i++)
free(X[i]), free(M[i]);
}