Changes to facilitate multi-threading of encoding stage

Modified the encoding stage to have row level entry points with relevant
initializations and to access the token information at row level

Change-Id: Ife10e55a7c1a420ee906d711caf75002688d9e39
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 49aea69..43c5eae 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -484,23 +484,31 @@
 }
 
 static void write_modes(VP9_COMP *cpi, MACROBLOCKD *const xd,
-                        const TileInfo *const tile, vpx_writer *w,
-                        TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
-                        unsigned int *const max_mv_magnitude,
+                        const TileInfo *const tile, vpx_writer *w, int tile_row,
+                        int tile_col, unsigned int *const max_mv_magnitude,
                         int interp_filter_selected[MAX_REF_FRAMES]
                                                   [SWITCHABLE]) {
   const VP9_COMMON *const cm = &cpi->common;
-  int mi_row, mi_col;
+  int mi_row, mi_col, tile_sb_row;
+  TOKENEXTRA *tok = NULL;
+  TOKENEXTRA *tok_end = NULL;
 
   set_partition_probs(cm, xd);
 
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
+    tile_sb_row = mi_cols_aligned_to_sb(mi_row - tile->mi_row_start) >>
+                  MI_BLOCK_SIZE_LOG2;
+    tok = cpi->tplist[tile_row][tile_col][tile_sb_row].start;
+    tok_end = tok + cpi->tplist[tile_row][tile_col][tile_sb_row].count;
+
     vp9_zero(xd->left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE)
-      write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col,
+      write_modes_sb(cpi, xd, tile, w, &tok, tok_end, mi_row, mi_col,
                      BLOCK_64X64, max_mv_magnitude, interp_filter_selected);
+
+    assert(tok == cpi->tplist[tile_row][tile_col][tile_sb_row].stop);
   }
 }
 
@@ -919,9 +927,8 @@
   MACROBLOCKD *const xd = &data->xd;
   vpx_start_encode(&data->bit_writer, data->dest);
   write_modes(cpi, xd, &cpi->tile_data[data->tile_idx].tile_info,
-              &data->bit_writer, &data->tok, data->tok_end,
-              &data->max_mv_magnitude, data->interp_filter_selected);
-  assert(data->tok == data->tok_end);
+              &data->bit_writer, 0, data->tile_idx, &data->max_mv_magnitude,
+              data->interp_filter_selected);
   vpx_stop_encode(&data->bit_writer);
   return 1;
 }
@@ -978,8 +985,6 @@
       // Populate the worker data.
       data->xd = cpi->td.mb.e_mbd;
       data->tile_idx = tile_col;
-      data->tok = cpi->tile_tok[0][tile_col];
-      data->tok_end = cpi->tile_tok[0][tile_col] + cpi->tok_count[0][tile_col];
       data->max_mv_magnitude = cpi->max_mv_magnitude;
       memset(data->interp_filter_selected, 0,
              sizeof(data->interp_filter_selected[0][0]) * SWITCHABLE);
@@ -1039,7 +1044,6 @@
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   vpx_writer residual_bc;
   int tile_row, tile_col;
-  TOKENEXTRA *tok_end;
   size_t total_size = 0;
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
@@ -1058,10 +1062,6 @@
   for (tile_row = 0; tile_row < tile_rows; tile_row++) {
     for (tile_col = 0; tile_col < tile_cols; tile_col++) {
       int tile_idx = tile_row * tile_cols + tile_col;
-      TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
-
-      tok_end = cpi->tile_tok[tile_row][tile_col] +
-                cpi->tok_count[tile_row][tile_col];
 
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
         vpx_start_encode(&residual_bc, data_ptr + total_size + 4);
@@ -1069,9 +1069,9 @@
         vpx_start_encode(&residual_bc, data_ptr + total_size);
 
       write_modes(cpi, xd, &cpi->tile_data[tile_idx].tile_info, &residual_bc,
-                  &tok, tok_end, &cpi->max_mv_magnitude,
+                  tile_row, tile_col, &cpi->max_mv_magnitude,
                   cpi->interp_filter_selected);
-      assert(tok == tok_end);
+
       vpx_stop_encode(&residual_bc);
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
         // size of this tile
diff --git a/vp9/encoder/vp9_bitstream.h b/vp9/encoder/vp9_bitstream.h
index 044a3bb..339c3fe 100644
--- a/vp9/encoder/vp9_bitstream.h
+++ b/vp9/encoder/vp9_bitstream.h
@@ -20,8 +20,6 @@
 typedef struct VP9BitstreamWorkerData {
   uint8_t *dest;
   int dest_size;
-  TOKENEXTRA *tok;
-  TOKENEXTRA *tok_end;
   vpx_writer bit_writer;
   int tile_idx;
   unsigned int max_mv_magnitude;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index bf32389..c169019 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -4078,7 +4078,9 @@
   const int tile_rows = 1 << cm->log2_tile_rows;
   int tile_col, tile_row;
   TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
+  TOKENLIST *tplist = cpi->tplist[0][0];
   int tile_tok = 0;
+  int tplist_count = 0;
 
   if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
     if (cpi->tile_data != NULL) vpx_free(cpi->tile_data);
@@ -4109,17 +4111,50 @@
       cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
       pre_tok = cpi->tile_tok[tile_row][tile_col];
       tile_tok = allocated_tokens(*tile_info);
+
+      cpi->tplist[tile_row][tile_col] = tplist + tplist_count;
+      tplist = cpi->tplist[tile_row][tile_col];
+      tplist_count = get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2);
     }
   }
 }
 
+void vp9_encode_sb_row(VP9_COMP *cpi, ThreadData *td, int tile_row,
+                       int tile_col, int mi_row) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+  const TileInfo *const tile_info = &this_tile->tile_info;
+  TOKENEXTRA *tok = NULL;
+  int tile_sb_row;
+  int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 1) >> 1;
+
+  tile_sb_row = mi_cols_aligned_to_sb(mi_row - tile_info->mi_row_start) >>
+                MI_BLOCK_SIZE_LOG2;
+  get_start_tok(cpi, tile_row, tile_col, mi_row, &tok);
+  cpi->tplist[tile_row][tile_col][tile_sb_row].start = tok;
+
+  if (cpi->sf.use_nonrd_pick_mode)
+    encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok);
+  else
+    encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+
+  cpi->tplist[tile_row][tile_col][tile_sb_row].stop = tok;
+  cpi->tplist[tile_row][tile_col][tile_sb_row].count =
+      (unsigned int)(cpi->tplist[tile_row][tile_col][tile_sb_row].stop -
+                     cpi->tplist[tile_row][tile_col][tile_sb_row].start);
+  assert(tok - cpi->tplist[tile_row][tile_col][tile_sb_row].start <=
+         get_token_alloc(MI_BLOCK_SIZE >> 1, tile_mb_cols));
+
+  (void)tile_mb_cols;
+}
+
 void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td, int tile_row,
                      int tile_col) {
   VP9_COMMON *const cm = &cpi->common;
   const int tile_cols = 1 << cm->log2_tile_cols;
   TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
   const TileInfo *const tile_info = &this_tile->tile_info;
-  TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
   const int mi_row_start = tile_info->mi_row_start;
   const int mi_row_end = tile_info->mi_row_end;
   int mi_row;
@@ -4130,16 +4165,8 @@
   td->mb.m_search_count_ptr = &this_tile->m_search_count;
   td->mb.ex_search_count_ptr = &this_tile->ex_search_count;
 
-  for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE) {
-    if (cpi->sf.use_nonrd_pick_mode)
-      encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok);
-    else
-      encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
-  }
-  cpi->tok_count[tile_row][tile_col] =
-      (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]);
-  assert(tok - cpi->tile_tok[tile_row][tile_col] <=
-         allocated_tokens(*tile_info));
+  for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE)
+    vp9_encode_sb_row(cpi, td, tile_row, tile_col, mi_row);
 }
 
 static void encode_tiles(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 806f07a..0dce44c 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -495,6 +495,9 @@
   vpx_free(cpi->tile_tok[0][0]);
   cpi->tile_tok[0][0] = 0;
 
+  vpx_free(cpi->tplist[0][0]);
+  cpi->tplist[0][0] = NULL;
+
   vp9_free_pc_tree(&cpi->td);
 
   for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
@@ -829,6 +832,7 @@
 
 static void alloc_compressor_data(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
+  int sb_rows;
 
   vp9_alloc_context_buffers(cm, cm->width, cm->height);
 
@@ -842,6 +846,12 @@
                     vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
   }
 
+  sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  vpx_free(cpi->tplist[0][0]);
+  CHECK_MEM_ERROR(
+      cm, cpi->tplist[0][0],
+      vpx_calloc(sb_rows * 4 * (1 << 6), sizeof(*cpi->tplist[0][0])));
+
   vp9_setup_pc_tree(&cpi->common, &cpi->td);
 }
 
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index e1046f1..9182539 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -284,6 +284,12 @@
 #endif
 } RowMTInfo;
 
+typedef struct {
+  TOKENEXTRA *start;
+  TOKENEXTRA *stop;
+  unsigned int count;
+} TOKENLIST;
+
 typedef struct MultiThreadHandle {
   int allocated_tile_rows;
   int allocated_tile_cols;
@@ -470,6 +476,7 @@
 
   TOKENEXTRA *tile_tok[4][1 << 6];
   uint32_t tok_count[4][1 << 6];
+  TOKENLIST *tplist[4][1 << 6];
 
   // Ambient reconstruction err target for force key frames
   int64_t ambient_err;
@@ -777,6 +784,20 @@
   return get_token_alloc(tile_mb_rows, tile_mb_cols);
 }
 
+static INLINE void get_start_tok(VP9_COMP *cpi, int tile_row, int tile_col,
+                                 int mi_row, TOKENEXTRA **tok) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+  const TileInfo *const tile_info = &this_tile->tile_info;
+
+  int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 1) >> 1;
+  const int mb_row = (mi_row - tile_info->mi_row_start) >> 1;
+
+  *tok =
+      cpi->tile_tok[tile_row][tile_col] + get_token_alloc(mb_row, tile_mb_cols);
+}
+
 int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
 #if CONFIG_VP9_HIGHBITDEPTH
 int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,