void matmul_naive (const double *a, const double *b, double *c, const int m,
    const int n, const int p, const int lda, const int ldb, const int ldc);
/*
 * Naive implementation of matrix multiplication c += ab, 
 * where a is m x n, b is n x p, and c is m x p, in column-major order. 
 *
 * The physical sizes of a, b, and c are lda x n, ldb x p, and ldc x p, 
 * but only the first m/n/m rows are used, respectively. 
 *
 *   c_{ij} += \sum_{k=0}^{n-1} a_{ik} b_{kj} 
 *
 *   a_{ij} <-> a[i + j*lda] <- column-major order
 */
void matmul_naive (const double *a, const double *b, double *c, const int m,
    const int n, const int p, const int lda, const int ldb, const int ldc)
{
    for (int i = 0; i < m; i++)
    {
        for (int j = 0; j < p; j++)
        {
            double sum = 0.0;

            for (int k = 0; k < n; k++)
            {
                sum += a[i + lda * k] * b[k + ldb * j];
            }
            c[i + ldc * j] += sum;
        }
    }
}