Refactoring: isolating the matrix-vector product into a gemm_accum() helper
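
The dense layer and all three GRU gates duplicated the same inner loop over
the column-major 8-bit weights, with element (i,j) at weights[j*stride + i].
This change pulls that loop into a single gemm_accum() helper that
accumulates a matrix-vector product into its output buffer, leaving one
place to optimize (e.g. with a vectorized kernel later) instead of four.

The arithmetic is unchanged: biases are written into the output/gate buffers
first, gemm_accum() accumulates on top, and the WEIGHTS_SCALE factor and the
activation are applied afterwards. With .* denoting the element-wise
product, the GRU still computes

    z = sigmoid(WEIGHTS_SCALE*(bz + Wz*x + Uz*state))
    r = sigmoid(WEIGHTS_SCALE*(br + Wr*x + Ur*state))
    h = z .* state + (1 - z) .* tanh(WEIGHTS_SCALE*(bh + Wh*x + Uh*(r .* state)))

The only new storage is tmp[], which materializes r .* state so that the
candidate state's recurrent term is a plain matrix-vector product as well.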
diff --git a/src/mlp.c b/src/mlp.c
index f43a704..964c6a9 100644
--- a/src/mlp.c
+++ b/src/mlp.c
@@ -69,22 +69,31 @@
    return .5f + .5f*tansig_approx(.5f*x);
 }
 
-void compute_dense(const DenseLayer *layer, float *output, const float *input)
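+/* Accumulates a matrix-vector product: out += weights*x, with the weight
+   matrix stored column-major and a stride of col_stride between columns. */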
+static void gemm_accum(float *out, const opus_int8 *weights, int rows, int cols, int col_stride, const float *x)
 {
    int i, j;
+   for (i=0;i<rows;i++)
+   {
+      for (j=0;j<cols;j++)
+         out[i] += weights[j*col_stride + i]*x[j];
+   }
+}
+
+void compute_dense(const DenseLayer *layer, float *output, const float *input)
+{
+   int i;
    int N, M;
    int stride;
    M = layer->nb_inputs;
    N = layer->nb_neurons;
    stride = N;
    for (i=0;i<N;i++)
-   {
-      /* Compute update gate. */
-      float sum = layer->bias[i];
-      for (j=0;j<M;j++)
-         sum += layer->input_weights[j*stride + i]*input[j];
-      output[i] = WEIGHTS_SCALE*sum;
-   }
+      output[i] = layer->bias[i];
+   gemm_accum(output, layer->input_weights, N, M, stride, input);
+   for (i=0;i<N;i++)
+      output[i] *= WEIGHTS_SCALE;
    if (layer->sigmoid) {
       for (i=0;i<N;i++)
          output[i] = sigmoid_approx(output[i]);
@@ -96,45 +105,42 @@
 
 void compute_gru(const GRULayer *gru, float *state, const float *input)
 {
-   int i, j;
+   int i;
    int N, M;
    int stride;
+   float tmp[MAX_NEURONS];
    float z[MAX_NEURONS];
    float r[MAX_NEURONS];
    float h[MAX_NEURONS];
    M = gru->nb_inputs;
    N = gru->nb_neurons;
    stride = 3*N;
+   /* Compute update gate. */
    for (i=0;i<N;i++)
-   {
-      /* Compute update gate. */
-      float sum = gru->bias[i];
-      for (j=0;j<M;j++)
-         sum += gru->input_weights[j*stride + i]*input[j];
-      for (j=0;j<N;j++)
-         sum += gru->recurrent_weights[j*stride + i]*state[j];
-      z[i] = sigmoid_approx(WEIGHTS_SCALE*sum);
-   }
+      z[i] = gru->bias[i];
+   gemm_accum(z, gru->input_weights, N, M, stride, input);
+   gemm_accum(z, gru->recurrent_weights, N, N, stride, state);
    for (i=0;i<N;i++)
-   {
-      /* Compute reset gate. */
-      float sum = gru->bias[N + i];
-      for (j=0;j<M;j++)
-         sum += gru->input_weights[N + j*stride + i]*input[j];
-      for (j=0;j<N;j++)
-         sum += gru->recurrent_weights[N + j*stride + i]*state[j];
-      r[i] = sigmoid_approx(WEIGHTS_SCALE*sum);
-   }
+      z[i] = sigmoid_approx(WEIGHTS_SCALE*z[i]);
+
+   /* Compute reset gate. */
    for (i=0;i<N;i++)
-   {
-      /* Compute output. */
-      float sum = gru->bias[2*N + i];
-      for (j=0;j<M;j++)
-         sum += gru->input_weights[2*N + j*stride + i]*input[j];
-      for (j=0;j<N;j++)
-         sum += gru->recurrent_weights[2*N + j*stride + i]*state[j]*r[j];
-      h[i] = z[i]*state[i] + (1-z[i])*tansig_approx(WEIGHTS_SCALE*sum);
-   }
+      r[i] = gru->bias[N + i];
+   gemm_accum(r, &gru->input_weights[N], N, M, stride, input);
+   gemm_accum(r, &gru->recurrent_weights[N], N, N, stride, state);
+   for (i=0;i<N;i++)
+      r[i] = sigmoid_approx(WEIGHTS_SCALE*r[i]);
+
+   /* Compute output. */
+   for (i=0;i<N;i++)
+      h[i] = gru->bias[2*N + i];
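+   /* Apply the reset gate to the previous state. */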
+   for (i=0;i<N;i++)
+      tmp[i] = state[i] * r[i];
+   gemm_accum(h, &gru->input_weights[2*N], N, M, stride, input);
+   gemm_accum(h, &gru->recurrent_weights[2*N], N, N, stride, tmp);
+   for (i=0;i<N;i++)
+      h[i] = z[i]*state[i] + (1-z[i])*tansig_approx(WEIGHTS_SCALE*h[i]);
    for (i=0;i<N;i++)
       state[i] = h[i];
 }
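
As a sanity check, the helper can be exercised in isolation. Below is a
minimal, hypothetical harness (not part of this patch) that compares
gemm_accum() against the inline loops it replaces, on a toy dense layer;
the opus_int8 typedef is an assumption standing in for the real Opus header.

    #include <stdio.h>
    #include <math.h>

    typedef signed char opus_int8; /* assumption: stands in for the Opus typedef */

    /* Copy of the new helper under test. */
    static void gemm_accum(float *out, const opus_int8 *weights, int rows,
                           int cols, int col_stride, const float *x)
    {
       int i, j;
       for (i=0;i<rows;i++)
       {
          for (j=0;j<cols;j++)
             out[i] += weights[j*col_stride + i]*x[j];
       }
    }

    int main(void)
    {
       /* Toy dense layer: N=3 neurons (rows), M=4 inputs (cols), stride == N. */
       static const opus_int8 w[4*3] = {
           1, -2,  3,   /* column 0: weights feeding neurons 0..2 from input 0 */
          -4,  5, -6,   /* column 1 */
           7, -8,  9,   /* column 2 */
          -1,  2, -3    /* column 3 */
       };
       const float x[4] = {.5f, -1.f, .25f, 2.f};
       const float bias[3] = {1.f, 0.f, -1.f};
       float out[3];
       int i, j;

       /* Refactored path: initialize with the bias, then accumulate. */
       for (i=0;i<3;i++)
          out[i] = bias[i];
       gemm_accum(out, w, 3, 4, 3, x);

       /* Pre-refactor path: the inline loop from the old compute_dense(). */
       for (i=0;i<3;i++)
       {
          float sum = bias[i];
          for (j=0;j<4;j++)
             sum += w[j*3 + i]*x[j];
          if (fabsf(sum - out[i]) > 1e-6f)
          {
             printf("mismatch at %d: %f vs %f\n", i, sum, out[i]);
             return 1;
          }
       }
       printf("gemm_accum() matches the inline loops\n");
       return 0;
    }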