/* libavcodec/vvc/thread.c - third_party/ffmpeg - Git at Google */

 /*
  * VVC thread logic
  *
  * Copyright (C) 2023 Nuo Mi
  *
  * This file is part of FFmpeg.
  *
  * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */

 #include <stdatomic.h>

 #include "libavutil/executor.h"
 #include "libavutil/mem.h"
 #include "libavutil/thread.h"

 #include "thread.h"
 #include "ctu.h"
 #include "filter.h"
 #include "inter.h"
 #include "intra.h"
 #include "refs.h"

 /*
  * Progress listener owned by a task.
  * The generic VVCProgressListener is the first member so that the
  * callback can cast the pointer it receives back to ProgressListener.
  */
 typedef struct ProgressListener {
     VVCProgressListener l;          // generic part handed to ff_vvc_add_progress_listener()
     struct VVCTask *task;           // task whose score is raised when the callback fires
     VVCContext *s;                  // decoder context forwarded to the scheduler by the callback
 } ProgressListener;

 /*
  * Pipeline stages of a CTU task, in the order they are executed.
  * VVC_TASK_STAGE_LAST is the number of stages, not a runnable stage.
  */
 typedef enum VVCTaskStage {
     VVC_TASK_STAGE_PARSE = 0,
     VVC_TASK_STAGE_INTER,
     VVC_TASK_STAGE_RECON,
     VVC_TASK_STAGE_LMCS,
     VVC_TASK_STAGE_DEBLOCK_V,
     VVC_TASK_STAGE_DEBLOCK_H,
     VVC_TASK_STAGE_SAO,
     VVC_TASK_STAGE_ALF,
     VVC_TASK_STAGE_LAST
 } VVCTaskStage;

 /*
  * Scheduling state of one CTU. A single VVCTask is advanced through the
  * VVCTaskStage values by task_run().
  */
 typedef struct VVCTask {
     union {
         struct VVCTask *next;                //for executor debug only
         AVTask task;                         // handle given to av_executor_execute()
     } u;

     VVCTaskStage stage;                      // stage that will run next

     // ctu x, y, and raster scan order
     int rx, ry, rs;
     VVCFrameContext *fc;

     // listener on the collocated reference (VVC_PROGRESS_MV), see check_colocation()
     ProgressListener col_listener;
     // listeners on reference pictures (VVC_PROGRESS_PIXEL), indexed [list][ref idx]
     ProgressListener listener[2][VVC_MAX_REF_ENTRIES];

     // for parse task only
     SliceContext *sc;
     EntryPoint *ep;
     int ctu_idx;                    //ctu idx in the current slice

     // tasks with target scores met are ready for scheduling
     atomic_uchar score[VVC_TASK_STAGE_LAST];
     // incremented once per pixel listener set up by listener_init()
     atomic_uchar target_inter_score;
 } VVCTask;

 /*
  * Per CTU row: for each progress type, how many CTUs of the row have
  * reported it (see report_frame_progress()).
  */
 typedef struct VVCRowThread {
     atomic_int col_progress[VVC_PROGRESS_LAST];
 } VVCRowThread;

 /* Per-frame scheduling state shared by all CTU tasks of the frame. */
 typedef struct VVCFrameThread {
     // error return for tasks
     atomic_int ret;

     VVCRowThread *rows;                 // ctu_height entries
     VVCTask *tasks;                     // ctu_count entries, raster scan order

     int ctu_size;
     int ctu_width;
     int ctu_height;
     int ctu_count;

     // outstanding work counters; ff_vvc_frame_wait() sleeps on cond (holding lock)
     // until both are zero
     atomic_int nb_scheduled_tasks;
     atomic_int nb_scheduled_listeners;

     // number of leading fully reported rows per progress type;
     // read and written under lock in report_frame_progress()
     int row_progress[VVC_PROGRESS_LAST];

     AVMutex lock;
     AVCond  cond;
 } VVCFrameThread;

 /*
  * Count the task as in flight for its frame, then queue it on the executor.
  * The counter is decremented again by task_run() through sheduled_done().
  */
 static void add_task(VVCContext *s, VVCTask *t)
 {
     atomic_fetch_add(&t->fc->ft->nb_scheduled_tasks, 1);
     av_executor_execute(s->executor, &t->u.task);
 }

 /*
  * Reset a task to a pristine state for CTU (rx, ry) of frame fc and set
  * its first stage. fc->ft must already be valid: the raster index is
  * derived from fc->ft->ctu_width.
  */
 static void task_init(VVCTask *t, VVCTaskStage stage, VVCFrameContext *fc, const int rx, const int ry)
 {
     const int stride = fc->ft->ctu_width;

     memset(t, 0, sizeof(*t));

     t->fc    = fc;
     t->stage = stage;
     t->rx    = rx;
     t->ry    = ry;
     t->rs    = ry * stride + rx;

     atomic_store(&t->target_inter_score, 0);
     for (int k = 0; k < FF_ARRAY_ELEMS(t->score); k++)
         atomic_store(&t->score[k], 0);
 }

 /*
  * Bind a task to the slice, entry point and in-slice CTU index it will
  * parse. Returns AVERROR_INVALIDDATA if the task was already bound,
  * which means the bitstream assigned the same CTU twice; 0 otherwise.
  */
 static int task_init_parse(VVCTask *t, SliceContext *sc, EntryPoint *ep, const int ctu_idx)
 {
     if (t->sc)
         return AVERROR_INVALIDDATA;     // the task already inited, error bitstream

     t->ctu_idx = ctu_idx;
     t->ep      = ep;
     t->sc      = sc;
     return 0;
 }

 /* Atomically increment the score of the given stage and return the new value. */
 static uint8_t task_add_score(VVCTask *t, const VVCTaskStage stage)
 {
     const uint8_t before = atomic_fetch_add(t->score + stage, 1);

     return before + 1;
 }

 /* Read the current score of the given stage. */
 static uint8_t task_get_score(VVCTask *t, const VVCTaskStage stage)
 {
     const uint8_t current = atomic_load(t->score + stage);

     return current;
 }

 /*
  * Returns non-zero when CTU row ry is the first row of its tile or slice
  * at column rx: either ry equals pps->ctb_to_row_bd[ry], or the CTU right
  * above belongs to a different slice.
  */
 static int is_first_row(const VVCFrameContext *fc, const int rx, const int ry)
 {
     const int stride = fc->ft->ctu_width;
     int rs;

     if (ry == fc->ps.pps->ctb_to_row_bd[ry])
         return 1;

     rs = ry * stride + rx;
     return fc->tab.slice_idx[rs - stride] != fc->tab.slice_idx[rs];
 }

 /*
  * Returns non-zero when `score` is exactly the score at which `stage` of
  * task t becomes runnable. Asserts that the score never overshoots.
  */
 static int task_has_target_score(VVCTask *t, const VVCTaskStage stage, const uint8_t score)
 {
     // dependencies contributed by neighbouring CTUs, starting at RECON
     // l:left, r:right, t: top, b: bottom
     static const uint8_t target_score[] =
     {
         2,          //VVC_TASK_STAGE_RECON,     need l + rt recon
         3,          //VVC_TASK_STAGE_LMCS,      need r + b + rb recon
         1,          //VVC_TASK_STAGE_DEBLOCK_V, need l deblock v
         2,          //VVC_TASK_STAGE_DEBLOCK_H, need r deblock v + t deblock h
         5,          //VVC_TASK_STAGE_SAO,       need l + r + lb + b + rb deblock h
         8,          //VVC_TASK_STAGE_ALF,       need sao around the ctu
     };
     VVCFrameContext *fc = t->fc;
     uint8_t target;

     switch (stage) {
     case VVC_TASK_STAGE_PARSE: {
         const H266RawSPS *rsps = fc->ps.sps->r;
         const int wpp = rsps->sps_entropy_coding_sync_enabled_flag && !is_first_row(fc, t->rx, t->ry);
         //left parse + colocation + wpp - no previous stage
         target = 2 + wpp - 1;
         break;
     }
     case VVC_TASK_STAGE_INTER:
         // one point per pixel listener registered for this task
         target = atomic_load(&t->target_inter_score);
         break;
     default:
         target = target_score[stage - VVC_TASK_STAGE_RECON];
         break;
     }

     //+1 for previous stage
     av_assert0(score <= target + 1);
     return score == target + 1;
 }

 /*
  * Add one point to the score of CTU (rx, ry) for `stage` and submit the
  * task to the executor once its target score is reached.
  *
  * Coordinates outside the frame are ignored; callers rely on this to
  * signal neighbours without clipping. `s` may be NULL only when no task
  * can become ready as a result of the call (asserted).
  */
 static void frame_thread_add_score(VVCContext *s, VVCFrameThread *ft,
     const int rx, const int ry, const VVCTaskStage stage)
 {
     VVCTask *t;
     uint8_t score;

     if (rx < 0 || rx >= ft->ctu_width || ry < 0 || ry >= ft->ctu_height)
         return;

     // form the pointer only after the bounds check: pointer arithmetic
     // outside the tasks array is undefined even if never dereferenced
     t = ft->tasks + ft->ctu_width * ry + rx;

     score = task_add_score(t, stage);
     if (task_has_target_score(t, stage, score)) {
         av_assert0(s);
         av_assert0(stage == t->stage);
         add_task(s, t);
     }
 }

 /*
  * Decrement one of the frame's outstanding-work counters and, when it
  * drops to zero, signal ft->cond under ft->lock.
  */
 static void sheduled_done(VVCFrameThread *ft, atomic_int *scheduled)
 {
     const int remaining_before = atomic_fetch_sub(scheduled, 1);

     if (remaining_before != 1)
         return;

     ff_mutex_lock(&ft->lock);
     ff_cond_signal(&ft->cond);
     ff_mutex_unlock(&ft->lock);
 }

 /*
  * Common listener callback: credit the waiting task with one point for
  * stage `type`, then mark the listener as no longer outstanding.
  */
 static void progress_done(VVCProgressListener *_l, const int type)
 {
     // _l is the first member of a ProgressListener
     const ProgressListener *listener = (ProgressListener *)_l;
     const VVCTask *waiter            = listener->task;
     VVCFrameThread *ft               = waiter->fc->ft;

     frame_thread_add_score(listener->s, ft, waiter->rx, waiter->ry, type);
     sheduled_done(ft, &ft->nb_scheduled_listeners);
 }

 /* Listener callback for VVC_PROGRESS_PIXEL: credits the INTER stage of the waiting task. */
 static void pixel_done(VVCProgressListener *l)
 {
     progress_done(l, VVC_TASK_STAGE_INTER);
 }

 /* Listener callback for non-pixel (MV) progress: credits the PARSE stage of the waiting task. */
 static void mv_done(VVCProgressListener *l)
 {
     progress_done(l, VVC_TASK_STAGE_PARSE);
 }

 /*
  * Fill in a listener waiting for progress `vp` to reach line y.
  * Pixel listeners also raise the task's INTER target score by one, so
  * the INTER stage waits for every registered reference.
  */
 static void listener_init(ProgressListener *l,  VVCTask *t, VVCContext *s, const VVCProgress vp, const int y)
 {
     l->task = t;
     l->s    = s;
     l->l.vp = vp;
     l->l.y  = y;

     if (vp == VVC_PROGRESS_PIXEL) {
         l->l.progress_done = pixel_done;
         atomic_fetch_add(&t->target_inter_score, 1);
     } else {
         l->l.progress_done = mv_done;
     }
 }

 /*
  * Register listener l of task t on reference frame ref.
  * The outstanding-listener counter is raised before the listener is
  * handed over, and lowered again in progress_done().
  */
 static void add_progress_listener(VVCFrame *ref, ProgressListener *l,
     VVCTask *t, VVCContext *s, const VVCProgress vp, const int y)
 {
     atomic_fetch_add(&t->fc->ft->nb_scheduled_listeners, 1);

     listener_init(l, t, s, vp, y);
     ff_vvc_add_progress_listener(ref, &l->l);
 }

 /*
  * Called after CTU t has been parsed: release the parse tasks that were
  * waiting on it.
  *  - With entropy coding sync enabled, the CTU below gets a point unless
  *    it starts a new tile/slice row; when t sits on a tile column
  *    boundary the CABAC state is also copied to the next entry point.
  *  - The next CTU of the same entry point (if any) gets a point.
  */
 static void schedule_next_parse(VVCContext *s, VVCFrameContext *fc, const SliceContext *sc, const VVCTask *t)
 {
     VVCFrameThread *ft = fc->ft;
     EntryPoint *ep     = t->ep;
     const VVCSPS *sps  = fc->ps.sps;
     const int below    = t->ry + 1;
     const int next_idx = t->ctu_idx + 1;

     if (sps->r->sps_entropy_coding_sync_enabled_flag) {
         const int on_col_boundary = t->rx == fc->ps.pps->ctb_to_col_bd[t->rx];

         if (on_col_boundary) {
             EntryPoint *next = ep + 1;
             if (next < sc->eps + sc->nb_eps && !is_first_row(fc, t->rx, below)) {
                 memcpy(next->cabac_state, ep->cabac_state, sizeof(next->cabac_state));
                 ff_vvc_ep_init_stat_coeff(next, sps->bit_depth, sps->r->sps_persistent_rice_adaptation_enabled_flag);
             }
         }
         if (below < ft->ctu_height && !is_first_row(fc, t->rx, below))
             frame_thread_add_score(s, ft, t->rx, below, VVC_TASK_STAGE_PARSE);
     }

     if (next_idx < ep->ctu_end) {
         const int next_rs = sc->sh.ctb_addr_in_curr_slice[next_idx];

         frame_thread_add_score(s, ft, next_rs % ft->ctu_width, next_rs / ft->ctu_width, VVC_TASK_STAGE_PARSE);
     }
 }

 /*
  * For a non-intra slice, register one pixel-progress listener per active
  * reference picture that CTU rs actually reads (max_y >= 0). The line
  * waited for is max_y (rescaled for scaled references) plus
  * LUMA_EXTRA_AFTER.
  */
 static void schedule_inter(VVCContext *s, VVCFrameContext *fc, const SliceContext *sc, VVCTask *t, const int rs)
 {
     const VVCSH *sh = &sc->sh;
     CTU *ctu;

     if (IS_I(sh->r))
         return;

     ctu = fc->tab.ctus + rs;
     for (int list = 0; list < 2; list++) {
         for (int idx = 0; idx < sh->r->num_ref_idx_active[list]; idx++) {
             int y           = ctu->max_y[list][idx];
             VVCRefPic *refp = sc->rpl[list].refs + idx;
             VVCFrame *ref   = refp->ref;

             if (!ref || y < 0)
                 continue;
             if (refp->is_scaled)
                 y = y * refp->scale[1] >> 14;
             add_progress_listener(ref, &t->listener[list][idx], t, s, VVC_PROGRESS_PIXEL, y + LUMA_EXTRA_AFTER);
         }
     }
 }

 /*
  * Follow-up work once CTU (rx, ry) has been parsed: wake dependent parse
  * tasks, then register the inter-prediction listeners of this CTU.
  */
 static void parse_task_done(VVCContext *s, VVCFrameContext *fc, const int rx, const int ry)
 {
     VVCFrameThread *ft     = fc->ft;
     const int rs           = ry * ft->ctu_width + rx;
     VVCTask *task          = &ft->tasks[rs];
     const SliceContext *sc = fc->slices[fc->tab.slice_idx[rs]];

     schedule_next_parse(s, fc, sc, task);
     schedule_inter(s, fc, sc, task, rs);
 }

 /*
  * Propagate completion of t->stage for CTU (t->rx, t->ry) to the tasks
  * that depend on it. Each ADD() gives one point to a neighbouring CTU for
  * the named stage; out-of-frame neighbours are ignored by
  * frame_thread_add_score(). Stages without a branch below (INTER, LMCS,
  * ALF) notify nobody here.
  */
 static void task_stage_done(const VVCTask *t, VVCContext *s)
 {
     VVCFrameContext *fc      = t->fc;
     VVCFrameThread *ft       = fc->ft;
     const VVCTaskStage stage = t->stage;

     // dx/dy are offsets, in CTUs, from the finished CTU to the one being credited
 #define ADD(dx, dy, stage) frame_thread_add_score(s, ft, t->rx + (dx), t->ry + (dy), stage)

     // this is the reverse map of the target scores in task_has_target_score(), ordered by zigzag
     if (stage == VVC_TASK_STAGE_PARSE) {
         parse_task_done(s, fc, t->rx, t->ry);
     } else if (stage == VVC_TASK_STAGE_RECON) {
         ADD(-1,  1, VVC_TASK_STAGE_RECON);
         ADD( 1,  0, VVC_TASK_STAGE_RECON);
         ADD(-1, -1, VVC_TASK_STAGE_LMCS);
         ADD( 0, -1, VVC_TASK_STAGE_LMCS);
         ADD(-1,  0, VVC_TASK_STAGE_LMCS);
     } else if (stage == VVC_TASK_STAGE_DEBLOCK_V) {
         ADD( 1,  0,  VVC_TASK_STAGE_DEBLOCK_V);
         ADD(-1,  0,  VVC_TASK_STAGE_DEBLOCK_H);
     } else if (stage == VVC_TASK_STAGE_DEBLOCK_H) {
         ADD( 0,  1,  VVC_TASK_STAGE_DEBLOCK_H);
         ADD(-1, -1,  VVC_TASK_STAGE_SAO);
         ADD( 0, -1,  VVC_TASK_STAGE_SAO);
         ADD(-1,  0,  VVC_TASK_STAGE_SAO);
         ADD( 1, -1,  VVC_TASK_STAGE_SAO);
         ADD( 1,  0,  VVC_TASK_STAGE_SAO);
     } else if (stage == VVC_TASK_STAGE_SAO) {
         ADD(-1, -1,  VVC_TASK_STAGE_ALF);
         ADD( 0, -1,  VVC_TASK_STAGE_ALF);
         ADD(-1,  0,  VVC_TASK_STAGE_ALF);
         ADD( 1, -1,  VVC_TASK_STAGE_ALF);
         ADD(-1,  1,  VVC_TASK_STAGE_ALF);
         ADD( 1,  0,  VVC_TASK_STAGE_ALF);
         ADD( 0,  1,  VVC_TASK_STAGE_ALF);
         ADD( 1,  1,  VVC_TASK_STAGE_ALF);
     }
 }

 /*
  * Returns non-zero when the task's current stage would be runnable if
  * `add` extra points were credited to it. Always 0 once the task has
  * moved past the last stage.
  */
 static int task_is_stage_ready(VVCTask *t, int add)
 {
     const VVCTaskStage stage = t->stage;

     if (stage <= VVC_TASK_STAGE_ALF) {
         const uint8_t score = task_get_score(t, stage) + add;
         return task_has_target_score(t, stage, score);
     }
     return 0;
 }

 /*
  * Executor "ready" callback: a task is runnable when its current stage
  * has reached its target score. user_data is unused.
  */
 static int task_ready(const AVTask *_t, void *user_data)
 {
     // AVTask is embedded at the start of VVCTask (union u is its first member)
     VVCTask *t = (VVCTask*)_t;

     return task_is_stage_ready(t, 0);
 }

 /*
  * Comparison helper for task_priority_higher(): when the two keys differ,
  * return from the enclosing function with (a) < (b); otherwise fall
  * through to the next key. Both arguments are evaluated more than once,
  * so they must be free of side effects.
  */
 #define CHECK(a, b)                         \
     do {                                    \
         if ((a) != (b))                     \
             return (a) < (b);               \
     } while (0)

 /*
  * Executor ordering callback: non-zero when task _a should run before _b.
  * Keys, in order: frame decode order; then, if either task is parsing,
  * stage, row and column; otherwise the zigzag diagonal plus stage, the
  * diagonal alone, and finally the row.
  */
 static int task_priority_higher(const AVTask *_a, const AVTask *_b)
 {
     const VVCTask *lhs = (const VVCTask*)_a;
     const VVCTask *rhs = (const VVCTask*)_b;
     const int lhs_diag = lhs->rx + lhs->ry;
     const int rhs_diag = rhs->rx + rhs->ry;

     CHECK(lhs->fc->decode_order, rhs->fc->decode_order);            //decode order

     if (lhs->stage == VVC_TASK_STAGE_PARSE || rhs->stage == VVC_TASK_STAGE_PARSE) {
         CHECK(lhs->stage, rhs->stage);
         CHECK(lhs->ry, rhs->ry);
         return lhs->rx < rhs->rx;
     }

     CHECK(lhs_diag + lhs->stage, rhs_diag + rhs->stage);            //zigzag with type
     CHECK(lhs_diag, rhs_diag);                                      //zigzag
     return lhs->ry < rhs->ry;
 }

 /*
  * Record that one more CTU of row ry finished progress type idx.
  * The call that completes a row then advances, under ft->lock, the count
  * of leading completed rows and reports the new progress in luma lines
  * (INT_MAX once every row is done).
  */
 static void report_frame_progress(VVCFrameContext *fc,
    const int ry, const VVCProgress idx)
 {
     VVCFrameThread *ft = fc->ft;
     int first, row;

     // only the caller that fills the row goes on
     if (atomic_fetch_add(&ft->rows[ry].col_progress[idx], 1) != ft->ctu_width - 1)
         return;

     ff_mutex_lock(&ft->lock);
     first = ft->row_progress[idx];
     for (row = first; row < ft->ctu_height; row++) {
         if (atomic_load(&ft->rows[row].col_progress[idx]) != ft->ctu_width)
             break;
     }
     if (row != first) {
         const int progress = row == ft->ctu_height ? INT_MAX : row * ft->ctu_size;
         ft->row_progress[idx] = row;
         ff_vvc_report_progress(fc->ref, idx, progress);
     }
     ff_mutex_unlock(&ft->lock);
 }

 /*
  * PARSE stage: decode the syntax of one CTU. On success, MV progress is
  * reported here unless the CTU uses DMVR (then run_inter() reports it).
  * Returns a negative error code from ff_vvc_coding_tree_unit(), else 0.
  */
 static int run_parse(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
 {
     VVCFrameContext *fc = lc->fc;
     const CTU *ctu      = &fc->tab.ctus[t->rs];
     int err;

     lc->ep = t->ep;

     err = ff_vvc_coding_tree_unit(lc, t->ctu_idx, t->rs, t->rx, t->ry);
     if (err < 0)
         return err;

     if (ctu->has_dmvr)
         return 0;

     report_frame_progress(fc, t->ry, VVC_PROGRESS_MV);
     return 0;
 }

 /*
  * INTER stage: run inter prediction for the CTU. For CTUs using DMVR the
  * MV progress is reported here instead of in run_parse(). Always returns 0.
  */
 static int run_inter(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
 {
     VVCFrameContext *fc = lc->fc;
     const CTU *ctu      = &fc->tab.ctus[t->rs];

     ff_vvc_predict_inter(lc, t->rs);

     if (!ctu->has_dmvr)
         return 0;

     report_frame_progress(fc, t->ry, VVC_PROGRESS_MV);
     return 0;
 }

 /* RECON stage: reconstruct the CTU. s is unused; always returns 0. */
 static int run_recon(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
 {
     ff_vvc_reconstruct(lc, t->rs, t->rx, t->ry);

     return 0;
 }

 /* LMCS stage: run the LMCS filter at the CTU's top-left luma position. Always returns 0. */
 static int run_lmcs(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
 {
     const int ctu_size = lc->fc->ft->ctu_size;

     ff_vvc_lmcs_filter(lc, t->rx * ctu_size, t->ry * ctu_size);
     return 0;
 }

 /*
  * DEBLOCK_V stage: deblock the CTU's vertical edges, unless the slice
  * header disables deblocking. Always returns 0.
  */
 static int run_deblock_v(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
 {
     const int ctu_size = lc->fc->ft->ctu_size;
     const int x0       = t->rx * ctu_size;
     const int y0       = t->ry * ctu_size;

     if (lc->sc->sh.r->sh_deblocking_filter_disabled_flag)
         return 0;

     ff_vvc_decode_neighbour(lc, x0, y0, t->rx, t->ry, t->rs);
     ff_vvc_deblock_vertical(lc, x0, y0, t->rs);
     return 0;
 }

 /*
  * DEBLOCK_H stage: deblock the CTU's horizontal edges (unless disabled in
  * the slice header), then, if SAO is enabled in the SPS, copy the CTB
  * borders SAO needs. Always returns 0.
  */
 static int run_deblock_h(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
 {
     VVCFrameContext *fc  = lc->fc;
     VVCFrameThread *ft   = fc->ft;
     const int x0         = t->rx * ft->ctu_size;
     const int y0         = t->ry * ft->ctu_size;
     const int deblock_on = !lc->sc->sh.r->sh_deblocking_filter_disabled_flag;

     if (deblock_on) {
         ff_vvc_decode_neighbour(lc, x0, y0, t->rx, t->ry, t->rs);
         ff_vvc_deblock_horizontal(lc, x0, y0, t->rs);
     }

     if (fc->ps.sps->r->sps_sao_enabled_flag) {
         const int last_row = t->ry == ft->ctu_height - 1;
         ff_vvc_sao_copy_ctb_to_hv(lc, t->rx, t->ry, last_row);
     }

     return 0;
 }

 /*
  * SAO stage: apply SAO to the CTU when enabled in the SPS, then, when ALF
  * is enabled, copy the CTU borders ALF needs. Always returns 0.
  */
 static int run_sao(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
 {
     VVCFrameContext *fc    = lc->fc;
     const H266RawSPS *rsps = fc->ps.sps->r;
     const int ctu_size     = fc->ft->ctu_size;
     const int x0           = t->rx * ctu_size;
     const int y0           = t->ry * ctu_size;

     if (rsps->sps_sao_enabled_flag) {
         ff_vvc_decode_neighbour(lc, x0, y0, t->rx, t->ry, t->rs);
         ff_vvc_sao_filter(lc, x0, y0);
     }

     if (rsps->sps_alf_enabled_flag)
         ff_vvc_alf_copy_ctu_to_hv(lc, x0, y0);

     return 0;
 }

 /*
  * ALF stage (last stage): apply ALF to the CTU when enabled in the SPS and
  * report pixel progress for the CTU's row either way. Always returns 0.
  */
 static int run_alf(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
 {
     VVCFrameContext *fc = lc->fc;
     const int ctu_size  = fc->ft->ctu_size;

     if (fc->ps.sps->r->sps_alf_enabled_flag) {
         const int x0 = t->rx * ctu_size;
         const int y0 = t->ry * ctu_size;

         ff_vvc_decode_neighbour(lc, x0, y0, t->rx, t->ry, t->rs);
         ff_vvc_alf_filter(lc, x0, y0);
     }

     report_frame_progress(fc, t->ry, VVC_PROGRESS_PIXEL);
     return 0;
 }

 // Enables the per-task and per-frame AV_LOG_DEBUG traces in this file.
 #define VVC_THREAD_DEBUG

 // One-letter stage names for log output, indexed by VVCTaskStage.
 // Defined unconditionally: the error log in task_run_stage() uses the
 // table even when VVC_THREAD_DEBUG is not defined.
 static const char *const task_name[] = {
     "P",    // VVC_TASK_STAGE_PARSE
     "I",    // VVC_TASK_STAGE_INTER
     "R",    // VVC_TASK_STAGE_RECON
     "L",    // VVC_TASK_STAGE_LMCS
     "V",    // VVC_TASK_STAGE_DEBLOCK_V
     "H",    // VVC_TASK_STAGE_DEBLOCK_H
     "S",    // VVC_TASK_STAGE_SAO
     "A"     // VVC_TASK_STAGE_ALF
 };

 // signature shared by all run_*() stage handlers
 typedef int (*run_func)(VVCContext *s, VVCLocalContext *lc, VVCTask *t);

 /*
  * Execute the task's current stage and notify dependants.
  * If an earlier task of the frame already failed, the handler is skipped,
  * but the stage is still reported as done so the frame can drain. The
  * first failing handler's return value is stored in ft->ret.
  */
 static void task_run_stage(VVCTask *t, VVCContext *s, VVCLocalContext *lc)
 {
     // handlers, indexed by VVCTaskStage
     static const run_func handlers[] = {
         run_parse,
         run_inter,
         run_recon,
         run_lmcs,
         run_deblock_v,
         run_deblock_h,
         run_sao,
         run_alf,
     };
     VVCFrameContext *fc      = t->fc;
     VVCFrameThread *ft       = fc->ft;
     const VVCTaskStage stage = t->stage;

 #ifdef VVC_THREAD_DEBUG
     av_log(s->avctx, AV_LOG_DEBUG, "frame %5d, %s(%3d, %3d)\r\n", (int)t->fc->decode_order, task_name[stage], t->rx, t->ry);
 #endif

     lc->sc = t->sc;

     if (!atomic_load(&ft->ret)) {
         const int ret = handlers[stage](s, lc, t);

         if (ret < 0) {
             // only the first error is kept: the exchange succeeds while ret is still 0
 #ifdef COMPAT_ATOMICS_WIN32_STDATOMIC_H
             intptr_t zero = 0;
 #else
             int zero = 0;
 #endif
             atomic_compare_exchange_strong(&ft->ret, &zero, ret);
             av_log(s->avctx, AV_LOG_ERROR,
                 "frame %5d, %s(%3d, %3d) failed with %d\r\n",
                 (int)fc->decode_order, task_name[stage], t->rx, t->ry, ret);
         }
     }

     task_stage_done(t, s);
 }

 /*
  * Executor "run" callback. Runs the task's current stage and keeps going
  * with the following stages for as long as each one only lacks the point
  * for "previous stage done". If stages remain, that point is then
  * credited so the task is resubmitted once its other dependencies arrive.
  * Always returns 0; stage errors are collected in ft->ret.
  */
 static int task_run(AVTask *_t, void *local_context, void *user_data)
 {
     VVCTask *task       = (VVCTask*)_t;
     VVCContext *ctx     = (VVCContext *)user_data;
     VVCLocalContext *lc = local_context;
     VVCFrameThread *ft  = task->fc->ft;

     lc->fc = task->fc;

     for (;;) {
         task_run_stage(task, ctx, lc);
         task->stage++;
         if (!task_is_stage_ready(task, 1))
             break;
     }

     if (task->stage != VVC_TASK_STAGE_LAST)
         frame_thread_add_score(ctx, ft, task->rx, task->ry, task->stage);

     sheduled_done(ft, &ft->nb_scheduled_tasks);

     return 0;
 }

 /*
  * Create the executor that runs VVC CTU tasks on thread_count threads.
  * Returns whatever av_executor_alloc() returns.
  */
 AVExecutor* ff_vvc_executor_alloc(VVCContext *s, const int thread_count)
 {
     // NOTE(review): positional initializer; the order must match the member
     // order of AVTaskCallbacks in libavutil/executor.h - confirm when that
     // struct changes.
     AVTaskCallbacks callbacks = {
         s,                          // passed back to the callbacks as user_data
         sizeof(VVCLocalContext),    // task_run() receives a VVCLocalContext as local_context
         task_priority_higher,
         task_ready,
         task_run,
     };
     return av_executor_alloc(&callbacks, thread_count);
 }

 /* Destroy an executor created by ff_vvc_executor_alloc(); forwards to av_executor_free(). */
 void ff_vvc_executor_free(AVExecutor **e)
 {
     av_executor_free(e);
 }

 /*
  * Release the frame's threading state, if any, and reset fc->ft to NULL
  * so the function can safely be called again.
  */
 void ff_vvc_frame_thread_free(VVCFrameContext *fc)
 {
     VVCFrameThread *ft = fc->ft;

     if (!ft)
         return;

     ff_mutex_destroy(&ft->lock);
     ff_cond_destroy(&ft->cond);
     av_freep(&ft->rows);
     av_freep(&ft->tasks);
     // free through fc->ft, not the local copy, so no dangling pointer is left behind
     av_freep(&fc->ft);
 }

 /*
  * Pre-credit the scores of CTUs on the frame border.
  * A scratch task is placed on every position just outside the frame and
  * reported as "done" for each stage from RECON on, so border CTUs get the
  * points their missing neighbours would otherwise have contributed.
  * s is passed as NULL, so no task may reach its target score here
  * (frame_thread_add_score() asserts this).
  */
 static void frame_thread_init_score(VVCFrameContext *fc)
 {
     const VVCFrameThread *ft = fc->ft;
     VVCTask task;

     // stage and position are overwritten below; this only sets fc and clears the rest
     task_init(&task, VVC_TASK_STAGE_RECON, fc, 0, 0);

     for (int i = VVC_TASK_STAGE_RECON; i < VVC_TASK_STAGE_LAST; i++) {
         task.stage = i;

         // rows above and below the frame, including the four corners
         for (task.rx = -1; task.rx <= ft->ctu_width; task.rx++) {
             task.ry = -1;                           //top
             task_stage_done(&task, NULL);
             task.ry = ft->ctu_height;               //bottom
             task_stage_done(&task, NULL);
         }

         // columns left and right of the frame
         for (task.ry = 0; task.ry < ft->ctu_height; task.ry++) {
             task.rx = -1;                           //left
             task_stage_done(&task, NULL);
             task.rx = ft->ctu_width;                //right
             task_stage_done(&task, NULL);
         }
     }
 }

 /*
  * Prepare the frame's threading state for decoding a new picture.
  * The state is reallocated when missing or when the CTU grid (width,
  * height, CTU size) differs from the active SPS/PPS; otherwise it is
  * reused. In both cases all counters, tasks and border scores are reset.
  *
  * Returns 0 on success, AVERROR(ENOMEM) on failure. On failure fc->ft is
  * NULL.
  */
 int ff_vvc_frame_thread_init(VVCFrameContext *fc)
 {
     const VVCSPS *sps  = fc->ps.sps;
     const VVCPPS *pps  = fc->ps.pps;
     VVCFrameThread *ft = fc->ft;

     if (!ft || ft->ctu_width != pps->ctb_width ||
         ft->ctu_height != pps->ctb_height ||
         ft->ctu_size != sps->ctb_size_y) {

         ff_vvc_frame_thread_free(fc);
         // the old state is gone; make sure no stale pointer survives a failure below
         fc->ft = NULL;

         ft = av_calloc(1, sizeof(*ft));
         if (!ft)
             return AVERROR(ENOMEM);

         ft->ctu_width  = pps->ctb_width;
         ft->ctu_height = pps->ctb_height;
         ft->ctu_count  = pps->ctb_count;
         ft->ctu_size   = sps->ctb_size_y;

         ft->rows = av_calloc(ft->ctu_height, sizeof(*ft->rows));
         if (!ft->rows)
             goto fail;

         // av_calloc() checks the count * size multiplication for overflow;
         // every task is fully initialized by task_init() below
         ft->tasks = av_calloc(ft->ctu_count, sizeof(*ft->tasks));
         if (!ft->tasks)
             goto fail;

         if (ff_cond_init(&ft->cond, NULL))
             goto fail;

         if (ff_mutex_init(&ft->lock, NULL)) {
             ff_cond_destroy(&ft->cond);
             goto fail;
         }
     }

     // must be set before task_init(), which reads fc->ft->ctu_width
     fc->ft = ft;
     ft->ret = 0;
     for (int y = 0; y < ft->ctu_height; y++) {
         VVCRowThread *row = ft->rows + y;
         memset(row->col_progress, 0, sizeof(row->col_progress));
     }

     for (int rs = 0; rs < ft->ctu_count; rs++) {
         VVCTask *t = ft->tasks + rs;
         task_init(t, VVC_TASK_STAGE_PARSE, fc, rs % ft->ctu_width, rs / ft->ctu_width);
     }

     memset(&ft->row_progress[0], 0, sizeof(ft->row_progress));

     frame_thread_init_score(fc);

     return 0;

 fail:
     // only reached with a freshly allocated ft whose cond/lock are not (or no longer) initialized
     av_freep(&ft->rows);
     av_freep(&ft->tasks);
     av_freep(&ft);

     return AVERROR(ENOMEM);
 }

 /*
  * Resolve the "colocation" dependency of a parse task.
  * When temporal MVP or SbTMVP is enabled, a collocated reference exists
  * and the CTU is the first column of its tile, the task waits for the MV
  * progress of the collocated picture at the CTU's top luma row. In every
  * other case the point is credited right away.
  */
 static void check_colocation(VVCContext *s, VVCTask *t)
 {
     const VVCFrameContext *fc = t->fc;
     const int uses_col_mvs    = fc->ps.ph.r->ph_temporal_mvp_enabled_flag ||
                                 fc->ps.sps->r->sps_sbtmvp_enabled_flag;
     VVCFrame *col             = uses_col_mvs ? fc->ref->collocated_ref : NULL;

     if (col && t->rx == fc->ps.pps->ctb_to_col_bd[t->rx]) {
         //we depend on bottom and right boundary, do not - 1 for y
         const int y = (t->ry << fc->ps.sps->ctb_log2_size_y);
         add_progress_listener(col, &t->col_listener, t, s, VVC_PROGRESS_MV, y);
         return;
     }

     frame_thread_add_score(s, fc->ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE);
 }

 /*
  * Credit the first CTU of an entry point with the "left parse" point it
  * would otherwise wait for, since nothing precedes it in the entry point.
  */
 static void submit_entry_point(VVCContext *s, VVCFrameThread *ft, SliceContext *sc, EntryPoint *ep)
 {
     const int first_rs   = sc->sh.ctb_addr_in_curr_slice[ep->ctu_start];
     const VVCTask *first = &ft->tasks[first_rs];

     frame_thread_add_score(s, ft, first->rx, first->ry, VVC_TASK_STAGE_PARSE);
 }

 /*
  * Submit all CTU tasks of a frame for decoding.
  * Returns 0 on success or the error from task_init_parse() when a CTU is
  * referenced by more than one slice/entry point; in that case nothing has
  * been scheduled yet.
  */
 int ff_vvc_frame_submit(VVCContext *s, VVCFrameContext *fc)
 {
     VVCFrameThread *ft = fc->ft;

     // Pass 0: bind every task to its slice and entry point; this detects
     // bitstream errors before any task is scheduled.
     for (int i = 0; i < fc->nb_slices; i++) {
         SliceContext *sc = fc->slices[i];
         for (int j = 0; j < sc->nb_eps; j++) {
             EntryPoint *ep = sc->eps + j;
             for (int k = ep->ctu_start; k < ep->ctu_end; k++) {
                 const int rs  = sc->sh.ctb_addr_in_curr_slice[k];
                 const int ret = task_init_parse(ft->tasks + rs, sc, ep, k);
                 if (ret < 0)
                     return ret;
             }
         }
     }

     // Pass 1: schedule the colocation check of every CTU, then submit
     // each entry point.
     for (int i = 0; i < fc->nb_slices; i++) {
         SliceContext *sc = fc->slices[i];
         for (int j = 0; j < sc->nb_eps; j++) {
             EntryPoint *ep = sc->eps + j;
             for (int k = ep->ctu_start; k < ep->ctu_end; k++) {
                 const int rs = sc->sh.ctb_addr_in_curr_slice[k];
                 check_colocation(s, ft->tasks + rs);
             }
             submit_entry_point(s, ft, sc, ep);
         }
     }

     return 0;
 }

 /*
  * Block until the frame has no scheduled tasks and no pending listeners,
  * then mark its reference frame as finished.
  * Returns the first error recorded by a task of the frame, or 0.
  */
 int ff_vvc_frame_wait(VVCContext *s, VVCFrameContext *fc)
 {
     VVCFrameThread *ft = fc->ft;

     ff_mutex_lock(&ft->lock);
     for (;;) {
         const int busy = atomic_load(&ft->nb_scheduled_tasks) || atomic_load(&ft->nb_scheduled_listeners);
         if (!busy)
             break;
         ff_cond_wait(&ft->cond, &ft->lock);
     }
     ff_mutex_unlock(&ft->lock);

     ff_vvc_report_frame_finished(fc->ref);

 #ifdef VVC_THREAD_DEBUG
     av_log(s->avctx, AV_LOG_DEBUG, "frame %5d done\r\n", (int)fc->decode_order);
 #endif
     return ft->ret;
 }
	/*
	* VVC thread logic
	*
	* Copyright (C) 2023 Nuo Mi
	*
	* This file is part of FFmpeg.
	*
	* FFmpeg is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.
	*
	* FFmpeg is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Lesser General Public License for more details.
	*
	* You should have received a copy of the GNU Lesser General Public
	* License along with FFmpeg; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	*/

	#include <stdatomic.h>

	#include "libavutil/executor.h"
	#include "libavutil/mem.h"
	#include "libavutil/thread.h"

	#include "thread.h"
	#include "ctu.h"
	#include "filter.h"
	#include "inter.h"
	#include "intra.h"
	#include "refs.h"

	/*
	 * Progress listener owned by a task; the generic listener is the first
	 * member so a pointer to it can be cast back to ProgressListener.
	 */
	typedef struct ProgressListener {
	    VVCProgressListener l;
	    struct VVCTask *task;
	    VVCContext *s;
	} ProgressListener;

	/* Pipeline stages of a CTU task in execution order; LAST is the stage count. */
	typedef enum VVCTaskStage {
	    VVC_TASK_STAGE_PARSE = 0,
	    VVC_TASK_STAGE_INTER,
	    VVC_TASK_STAGE_RECON,
	    VVC_TASK_STAGE_LMCS,
	    VVC_TASK_STAGE_DEBLOCK_V,
	    VVC_TASK_STAGE_DEBLOCK_H,
	    VVC_TASK_STAGE_SAO,
	    VVC_TASK_STAGE_ALF,
	    VVC_TASK_STAGE_LAST
	} VVCTaskStage;

	/* Scheduling state of one CTU; advanced through all VVCTaskStage values. */
	typedef struct VVCTask {
	    union {
	        struct VVCTask *next; //for executor debug only
	        AVTask task;
	    } u;

	    VVCTaskStage stage;

	    // ctu x, y, and raster scan order
	    int rx, ry, rs;
	    VVCFrameContext *fc;

	    ProgressListener col_listener;
	    ProgressListener listener[2][VVC_MAX_REF_ENTRIES];

	    // for parse task only
	    SliceContext *sc;
	    EntryPoint *ep;
	    int ctu_idx; //ctu idx in the current slice

	    // tasks with target scores met are ready for scheduling
	    atomic_uchar score[VVC_TASK_STAGE_LAST];
	    atomic_uchar target_inter_score;
	} VVCTask;

	/* Per CTU row: count of CTUs that reported each progress type. */
	typedef struct VVCRowThread {
	    atomic_int col_progress[VVC_PROGRESS_LAST];
	} VVCRowThread;

	/* Per-frame scheduling state shared by all CTU tasks of the frame. */
	typedef struct VVCFrameThread {
	    // error return for tasks
	    atomic_int ret;

	    VVCRowThread *rows;
	    VVCTask *tasks;

	    int ctu_size;
	    int ctu_width;
	    int ctu_height;
	    int ctu_count;

	    //protected by lock
	    atomic_int nb_scheduled_tasks;
	    atomic_int nb_scheduled_listeners;

	    int row_progress[VVC_PROGRESS_LAST];

	    AVMutex lock;
	    AVCond cond;
	} VVCFrameThread;

	/*
	 * Count the task as in flight for its frame, then queue it on the executor.
	 * (Pointer declarators restored: both parameters are used with "->".)
	 */
	static void add_task(VVCContext *s, VVCTask *t)
	{
	    VVCFrameThread *ft = t->fc->ft;

	    atomic_fetch_add(&ft->nb_scheduled_tasks, 1);

	    av_executor_execute(s->executor, &t->u.task);
	}

	/*
	 * Reset a task for CTU (rx, ry) of frame fc and set its first stage.
	 * (Pointer declarators restored: t and fc are used with "->".)
	 */
	static void task_init(VVCTask *t, VVCTaskStage stage, VVCFrameContext *fc, const int rx, const int ry)
	{
	    memset(t, 0, sizeof(*t));
	    t->stage = stage;
	    t->fc = fc;
	    t->rx = rx;
	    t->ry = ry;
	    t->rs = ry * fc->ft->ctu_width + rx;
	    for (int i = 0; i < FF_ARRAY_ELEMS(t->score); i++)
	        atomic_store(t->score + i, 0);
	    atomic_store(&t->target_inter_score, 0);
	}

	/*
	 * Bind a task to its slice, entry point and in-slice CTU index.
	 * Returns AVERROR_INVALIDDATA if the task is already bound, 0 otherwise.
	 * (Pointer declarators restored: t is used with "->" and t->sc is tested
	 * as a pointer.)
	 */
	static int task_init_parse(VVCTask *t, SliceContext *sc, EntryPoint *ep, const int ctu_idx)
	{
	    if (t->sc) {
	        // the task already inited, error bitstream
	        return AVERROR_INVALIDDATA;
	    }
	    t->sc = sc;
	    t->ep = ep;
	    t->ctu_idx = ctu_idx;

	    return 0;
	}

	/* Atomically increment the score of the given stage and return the new value. */
	static uint8_t task_add_score(VVCTask *t, const VVCTaskStage stage)
	{
	    const uint8_t before = atomic_fetch_add(t->score + stage, 1);

	    return before + 1;
	}

	/* Read the current score of the given stage. */
	static uint8_t task_get_score(VVCTask *t, const VVCTaskStage stage)
	{
	    const uint8_t current = atomic_load(t->score + stage);

	    return current;
	}

	/*
	 * Returns non-zero when CTU row ry is the first row of its tile or slice
	 * at column rx: either ry equals pps->ctb_to_row_bd[ry], or the CTU right
	 * above belongs to a different slice.
	 */
	static int is_first_row(const VVCFrameContext *fc, const int rx, const int ry)
	{
	    const int stride = fc->ft->ctu_width;
	    int rs;

	    if (ry == fc->ps.pps->ctb_to_row_bd[ry])
	        return 1;

	    rs = ry * stride + rx;
	    return fc->tab.slice_idx[rs - stride] != fc->tab.slice_idx[rs];
	}

	/*
	 * Returns non-zero when `score` is exactly the score at which `stage` of
	 * task t becomes runnable. Asserts that the score never overshoots.
	 */
	static int task_has_target_score(VVCTask *t, const VVCTaskStage stage, const uint8_t score)
	{
	    // l:left, r:right, t: top, b: bottom
	    static const uint8_t target_score[] =
	    {
	        2, //VVC_TASK_STAGE_RECON, need l + rt recon
	        3, //VVC_TASK_STAGE_LMCS, need r + b + rb recon
	        1, //VVC_TASK_STAGE_DEBLOCK_V, need l deblock v
	        2, //VVC_TASK_STAGE_DEBLOCK_H, need r deblock v + t deblock h
	        5, //VVC_TASK_STAGE_SAO, need l + r + lb + b + rb deblock h
	        8, //VVC_TASK_STAGE_ALF, need sao around the ctu
	    };
	    VVCFrameContext *fc = t->fc;
	    uint8_t target;

	    switch (stage) {
	    case VVC_TASK_STAGE_PARSE: {
	        const H266RawSPS *rsps = fc->ps.sps->r;
	        const int wpp = rsps->sps_entropy_coding_sync_enabled_flag && !is_first_row(fc, t->rx, t->ry);
	        //left parse + colocation + wpp - no previous stage
	        target = 2 + wpp - 1;
	        break;
	    }
	    case VVC_TASK_STAGE_INTER:
	        target = atomic_load(&t->target_inter_score);
	        break;
	    default:
	        target = target_score[stage - VVC_TASK_STAGE_RECON];
	        break;
	    }

	    //+1 for previous stage
	    av_assert0(score <= target + 1);
	    return score == target + 1;
	}

	/*
	 * Add one point to the score of CTU (rx, ry) for `stage` and submit the
	 * task to the executor once its target score is reached. Coordinates
	 * outside the frame are ignored.
	 * (Restored from a damaged copy: pointer declarators, the "*" in the
	 * raster index and the "||" operators.)
	 */
	static void frame_thread_add_score(VVCContext *s, VVCFrameThread *ft,
	    const int rx, const int ry, const VVCTaskStage stage)
	{
	    VVCTask *t;
	    uint8_t score;

	    if (rx < 0 || rx >= ft->ctu_width || ry < 0 || ry >= ft->ctu_height)
	        return;

	    // form the pointer only after the bounds check: pointer arithmetic
	    // outside the tasks array is undefined even if never dereferenced
	    t = ft->tasks + ft->ctu_width * ry + rx;

	    score = task_add_score(t, stage);
	    if (task_has_target_score(t, stage, score)) {
	        av_assert0(s);
	        av_assert0(stage == t->stage);
	        add_task(s, t);
	    }
	}

	/*
	 * Decrement one of the frame's outstanding-work counters and, when it
	 * drops to zero, signal ft->cond under ft->lock.
	 * (Pointer declarators restored: ft is used with "->" and scheduled is
	 * passed to atomic_fetch_sub().)
	 */
	static void sheduled_done(VVCFrameThread *ft, atomic_int *scheduled)
	{
	    if (atomic_fetch_sub(scheduled, 1) == 1) {
	        ff_mutex_lock(&ft->lock);
	        ff_cond_signal(&ft->cond);
	        ff_mutex_unlock(&ft->lock);
	    }
	}

	/*
	 * Common listener callback: credit the waiting task with one point for
	 * stage `type`, then mark the listener as no longer outstanding.
	 * (Pointer declarator and cast restored: l is used with "->".)
	 */
	static void progress_done(VVCProgressListener *_l, const int type)
	{
	    // _l is the first member of a ProgressListener
	    const ProgressListener *l = (ProgressListener *)_l;
	    const VVCTask *t = l->task;
	    VVCFrameThread *ft = t->fc->ft;

	    frame_thread_add_score(l->s, ft, t->rx, t->ry, type);
	    sheduled_done(ft, &ft->nb_scheduled_listeners);
	}

	/* Listener callback for VVC_PROGRESS_PIXEL: credits the INTER stage of the waiting task. */
	static void pixel_done(VVCProgressListener *l)
	{
	progress_done(l, VVC_TASK_STAGE_INTER);
	}

	/* Listener callback for non-pixel (MV) progress: credits the PARSE stage of the waiting task. */
	static void mv_done(VVCProgressListener *l)
	{
	progress_done(l, VVC_TASK_STAGE_PARSE);
	}

	/*
	 * Fill in a listener waiting for progress `vp` to reach line y. Pixel
	 * listeners also raise the task's INTER target score by one.
	 * (Pointer declarators restored: l and t are used with "->".)
	 */
	static void listener_init(ProgressListener *l, VVCTask *t, VVCContext *s, const VVCProgress vp, const int y)
	{
	    const int is_inter = vp == VVC_PROGRESS_PIXEL;

	    l->task = t;
	    l->s = s;
	    l->l.vp = vp;
	    l->l.y = y;
	    l->l.progress_done = is_inter ? pixel_done : mv_done;
	    if (is_inter)
	        atomic_fetch_add(&t->target_inter_score, 1);
	}

	/*
	 * Register listener l of task t on reference frame ref. The
	 * outstanding-listener counter is raised before the listener is handed over.
	 * (Pointer declarators restored on ref, l, t and s.)
	 */
	static void add_progress_listener(VVCFrame *ref, ProgressListener *l,
	    VVCTask *t, VVCContext *s, const VVCProgress vp, const int y)
	{
	    VVCFrameThread *ft = t->fc->ft;

	    atomic_fetch_add(&ft->nb_scheduled_listeners, 1);
	    listener_init(l, t, s, vp, y);
	    ff_vvc_add_progress_listener(ref, (VVCProgressListener*)l);
	}

	/*
	 * Called after CTU t has been parsed: release the parse tasks waiting on it.
	 *  - With entropy coding sync enabled, the CTU below gets a point unless
	 *    it starts a new tile/slice row; when t sits on a tile column
	 *    boundary the CABAC state is also copied to the next entry point.
	 *  - The next CTU of the same entry point (if any) gets a point.
	 * (Pointer declarators restored on s, fc, sc and t.)
	 */
	static void schedule_next_parse(VVCContext *s, VVCFrameContext *fc, const SliceContext *sc, const VVCTask *t)
	{
	    VVCFrameThread *ft = fc->ft;
	    EntryPoint *ep = t->ep;
	    const VVCSPS *sps = fc->ps.sps;

	    if (sps->r->sps_entropy_coding_sync_enabled_flag) {
	        if (t->rx == fc->ps.pps->ctb_to_col_bd[t->rx]) {
	            EntryPoint *next = ep + 1;
	            if (next < sc->eps + sc->nb_eps && !is_first_row(fc, t->rx, t->ry + 1)) {
	                memcpy(next->cabac_state, ep->cabac_state, sizeof(next->cabac_state));
	                ff_vvc_ep_init_stat_coeff(next, sps->bit_depth, sps->r->sps_persistent_rice_adaptation_enabled_flag);
	            }
	        }
	        if (t->ry + 1 < ft->ctu_height && !is_first_row(fc, t->rx, t->ry + 1))
	            frame_thread_add_score(s, ft, t->rx, t->ry + 1, VVC_TASK_STAGE_PARSE);
	    }

	    if (t->ctu_idx + 1 < t->ep->ctu_end) {
	        const int next_rs = sc->sh.ctb_addr_in_curr_slice[t->ctu_idx + 1];
	        const int next_rx = next_rs % ft->ctu_width;
	        const int next_ry = next_rs / ft->ctu_width;
	        frame_thread_add_score(s, ft, next_rx, next_ry, VVC_TASK_STAGE_PARSE);
	    }
	}

	/*
	 * Register pixel-progress listeners for every reference picture the CTU at
	 * raster index rs depends on.
	 *
	 * Nothing is done for I slices. For each active reference in both lists,
	 * ctu->max_y[lx][i] is the last row needed from that reference (negative
	 * means unused); for scaled references it is rescaled with the vertical
	 * scale factor (14 fractional bits) before LUMA_EXTRA_AFTER is added.
	 */
	static void schedule_inter(VVCContext *s, VVCFrameContext *fc, const SliceContext *sc, VVCTask *t, const int rs)
	{
	    const VVCSH *sh = &sc->sh;

	    if (!IS_I(sh->r)) {
	        CTU *ctu = fc->tab.ctus + rs;
	        for (int lx = 0; lx < 2; lx++) {
	            for (int i = 0; i < sh->r->num_ref_idx_active[lx]; i++) {
	                int y            = ctu->max_y[lx][i];
	                VVCRefPic *refp  = sc->rpl[lx].refs + i;
	                VVCFrame *ref    = refp->ref;
	                if (ref && y >= 0) {
	                    if (refp->is_scaled)
	                        y = y * refp->scale[1] >> 14;
	                    add_progress_listener(ref, &t->listener[lx][i], t, s, VVC_PROGRESS_PIXEL, y + LUMA_EXTRA_AFTER);
	                }
	            }
	        }
	    }
	}

	/*
	 * Follow-up work after the parse stage of CTU (rx, ry) has run: look up the
	 * task and its slice from the raster index, then schedule the dependent
	 * parse tasks and the inter-prediction reference listeners.
	 */
	static void parse_task_done(VVCContext *s, VVCFrameContext *fc, const int rx, const int ry)
	{
	    VVCFrameThread *ft     = fc->ft;
	    const int rs           = ry * ft->ctu_width + rx;
	    const int slice_idx    = fc->tab.slice_idx[rs];
	    VVCTask *t             = ft->tasks + rs;
	    const SliceContext *sc = fc->slices[slice_idx];

	    schedule_next_parse(s, fc, sc, t);
	    schedule_inter(s, fc, sc, t, rs);
	}

	/*
	 * Propagate the completion of t->stage to the neighbouring tasks that depend
	 * on it, by adding one score point per dependency.
	 *
	 * t: task whose current stage just finished; only fc, rx, ry and stage are
	 *    read here
	 * s: decoder context forwarded to the scoring code; frame_thread_init_score()
	 *    calls this with NULL
	 *
	 * Stages without an entry below (inter, LMCS, ALF) credit nobody here.
	 */
	static void task_stage_done(const VVCTask *t, VVCContext *s)
	{
	    VVCFrameContext *fc      = t->fc;
	    VVCFrameThread *ft       = fc->ft;
	    const VVCTaskStage stage = t->stage;

	#define ADD(dx, dy, stage) frame_thread_add_score(s, ft, t->rx + (dx), t->ry + (dy), stage)

	    //this is a reverse map of ready_score, ordered by zigzag
	    if (stage == VVC_TASK_STAGE_PARSE) {
	        parse_task_done(s, fc, t->rx, t->ry);
	    } else if (stage == VVC_TASK_STAGE_RECON) {
	        ADD(-1,  1, VVC_TASK_STAGE_RECON);
	        ADD( 1,  0, VVC_TASK_STAGE_RECON);
	        ADD(-1, -1, VVC_TASK_STAGE_LMCS);
	        ADD( 0, -1, VVC_TASK_STAGE_LMCS);
	        ADD(-1,  0, VVC_TASK_STAGE_LMCS);
	    } else if (stage == VVC_TASK_STAGE_DEBLOCK_V) {
	        ADD( 1,  0, VVC_TASK_STAGE_DEBLOCK_V);
	        ADD(-1,  0, VVC_TASK_STAGE_DEBLOCK_H);
	    } else if (stage == VVC_TASK_STAGE_DEBLOCK_H) {
	        ADD( 0,  1, VVC_TASK_STAGE_DEBLOCK_H);
	        ADD(-1, -1, VVC_TASK_STAGE_SAO);
	        ADD( 0, -1, VVC_TASK_STAGE_SAO);
	        ADD(-1,  0, VVC_TASK_STAGE_SAO);
	        ADD( 1, -1, VVC_TASK_STAGE_SAO);
	        ADD( 1,  0, VVC_TASK_STAGE_SAO);
	    } else if (stage == VVC_TASK_STAGE_SAO) {
	        ADD(-1, -1, VVC_TASK_STAGE_ALF);
	        ADD( 0, -1, VVC_TASK_STAGE_ALF);
	        ADD(-1,  0, VVC_TASK_STAGE_ALF);
	        ADD( 1, -1, VVC_TASK_STAGE_ALF);
	        ADD(-1,  1, VVC_TASK_STAGE_ALF);
	        ADD( 1,  0, VVC_TASK_STAGE_ALF);
	        ADD( 0,  1, VVC_TASK_STAGE_ALF);
	        ADD( 1,  1, VVC_TASK_STAGE_ALF);
	    }
	}

	/*
	 * Tell whether the current stage of t would be runnable once `add` extra
	 * score points are counted on top of its present score.
	 *
	 * Returns 0 for any stage past VVC_TASK_STAGE_ALF, otherwise the result of
	 * task_has_target_score(). The sum is deliberately held in a uint8_t, so it
	 * wraps exactly like the stored score type.
	 */
	static int task_is_stage_ready(VVCTask *t, int add)
	{
	    const VVCTaskStage cur = t->stage;

	    if (cur <= VVC_TASK_STAGE_ALF) {
	        const uint8_t total = task_get_score(t, cur) + add;

	        return task_has_target_score(t, cur, total);
	    }

	    return 0;
	}

	static int task_ready(const AVTask _t, void user_data)
	{
	VVCTask t = (VVCTask)_t;

	return task_is_stage_ready(t, 0);
	}

	#define CHECK(a, b) \
	do { \
	if ((a) != (b)) \
	return (a) < (b); \
	} while (0)

	static int task_priority_higher(const AVTask _a, const AVTask _b)
	{
	const VVCTask a = (const VVCTask)_a;
	const VVCTask b = (const VVCTask)_b;

	CHECK(a->fc->decode_order, b->fc->decode_order); //decode order

	if (a->stage == VVC_TASK_STAGE_PARSE \|\| b->stage == VVC_TASK_STAGE_PARSE) {
	CHECK(a->stage, b->stage);
	CHECK(a->ry, b->ry);
	return a->rx < b->rx;
	}

	CHECK(a->rx + a->ry + a->stage, b->rx + b->ry + b->stage); //zigzag with type
	CHECK(a->rx + a->ry, b->rx + b->ry); //zigzag
	return a->ry < b->ry;
	}

	/*
	 * Count one more finished CTU in row ry for progress kind idx and, when that
	 * completes the row, publish how far the frame has progressed.
	 *
	 * Only the call that brings the row counter from ctu_width - 1 to ctu_width
	 * goes on. Under ft->lock it advances ft->row_progress[idx] over every
	 * consecutive fully finished row and reports the new position in samples,
	 * or INT_MAX once all rows are done.
	 */
	static void report_frame_progress(VVCFrameContext *fc,
	    const int ry, const VVCProgress idx)
	{
	    VVCFrameThread *ft = fc->ft;
	    int first, done;

	    if (atomic_fetch_add(&ft->rows[ry].col_progress[idx], 1) != ft->ctu_width - 1)
	        return;

	    ff_mutex_lock(&ft->lock);

	    first = ft->row_progress[idx];
	    for (done = first; done < ft->ctu_height; done++) {
	        if (atomic_load(&ft->rows[done].col_progress[idx]) != ft->ctu_width)
	            break;
	    }

	    if (done != first) {
	        int progress = INT_MAX;

	        if (done != ft->ctu_height)
	            progress = done * ft->ctu_size;
	        ft->row_progress[idx] = done;
	        ff_vvc_report_progress(fc->ref, idx, progress);
	    }

	    ff_mutex_unlock(&ft->lock);
	}

	/*
	 * Parse stage: decode the coding tree unit of task t using its entry point.
	 *
	 * Returns a negative error code from ff_vvc_coding_tree_unit(), or 0.
	 * Motion-vector progress for the row is reported here only when the CTU has
	 * no DMVR; otherwise run_inter() reports it.
	 */
	static int run_parse(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
	{
	    int ret;
	    VVCFrameContext *fc = lc->fc;
	    const int rs        = t->rs;
	    const CTU *ctu      = fc->tab.ctus + rs;

	    lc->ep = t->ep;

	    ret = ff_vvc_coding_tree_unit(lc, t->ctu_idx, rs, t->rx, t->ry);
	    if (ret < 0)
	        return ret;

	    if (!ctu->has_dmvr)
	        report_frame_progress(lc->fc, t->ry, VVC_PROGRESS_MV);

	    return 0;
	}

	/*
	 * Inter stage: run inter prediction for the CTU of task t. When the CTU uses
	 * DMVR, motion-vector progress is reported here (after prediction) instead
	 * of in run_parse(). Always returns 0.
	 */
	static int run_inter(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
	{
	    VVCFrameContext *fc = lc->fc;
	    const CTU *ctu      = fc->tab.ctus + t->rs;

	    ff_vvc_predict_inter(lc, t->rs);

	    if (ctu->has_dmvr)
	        report_frame_progress(fc, t->ry, VVC_PROGRESS_MV);

	    return 0;
	}

	// Reconstruction stage: reconstruct the CTU of task t. Always returns 0.
	static int run_recon(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
	{
	    ff_vvc_reconstruct(lc, t->rs, t->rx, t->ry);

	    return 0;
	}

	// LMCS stage: apply the LMCS filter at the CTU's top-left sample position
	// (CTU coordinates scaled by the CTU size). Always returns 0.
	static int run_lmcs(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
	{
	    VVCFrameContext *fc = lc->fc;
	    VVCFrameThread *ft  = fc->ft;
	    const int ctu_size  = ft->ctu_size;
	    const int x0        = t->rx * ctu_size;
	    const int y0        = t->ry * ctu_size;

	    ff_vvc_lmcs_filter(lc, x0, y0);

	    return 0;
	}

	// Vertical deblocking stage for the CTU of task t; skipped entirely when the
	// slice header disables deblocking. Always returns 0.
	static int run_deblock_v(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
	{
	    VVCFrameContext *fc = lc->fc;
	    VVCFrameThread *ft  = fc->ft;
	    const int ctb_size  = ft->ctu_size;
	    const int x0        = t->rx * ctb_size;
	    const int y0        = t->ry * ctb_size;

	    if (!lc->sc->sh.r->sh_deblocking_filter_disabled_flag) {
	        ff_vvc_decode_neighbour(lc, x0, y0, t->rx, t->ry, t->rs);
	        ff_vvc_deblock_vertical(lc, x0, y0, t->rs);
	    }

	    return 0;
	}

	/*
	 * Horizontal deblocking stage for the CTU of task t (skipped when the slice
	 * header disables deblocking). When SAO is enabled in the SPS, the CTB is
	 * then copied for SAO; the last argument flags the bottom CTU row.
	 * Always returns 0.
	 */
	static int run_deblock_h(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
	{
	    VVCFrameContext *fc = lc->fc;
	    VVCFrameThread *ft  = fc->ft;
	    const int ctb_size  = ft->ctu_size;
	    const int x0        = t->rx * ctb_size;
	    const int y0        = t->ry * ctb_size;

	    if (!lc->sc->sh.r->sh_deblocking_filter_disabled_flag) {
	        ff_vvc_decode_neighbour(lc, x0, y0, t->rx, t->ry, t->rs);
	        ff_vvc_deblock_horizontal(lc, x0, y0, t->rs);
	    }
	    if (fc->ps.sps->r->sps_sao_enabled_flag)
	        ff_vvc_sao_copy_ctb_to_hv(lc, t->rx, t->ry, t->ry == ft->ctu_height - 1);

	    return 0;
	}

	/*
	 * SAO stage for the CTU of task t: run the SAO filter when SAO is enabled in
	 * the SPS, then, when ALF is enabled, copy the CTU for ALF.
	 * Always returns 0.
	 */
	static int run_sao(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
	{
	    VVCFrameContext *fc = lc->fc;
	    VVCFrameThread *ft  = fc->ft;
	    const int ctb_size  = ft->ctu_size;
	    const int x0        = t->rx * ctb_size;
	    const int y0        = t->ry * ctb_size;

	    if (fc->ps.sps->r->sps_sao_enabled_flag) {
	        ff_vvc_decode_neighbour(lc, x0, y0, t->rx, t->ry, t->rs);
	        ff_vvc_sao_filter(lc, x0, y0);
	    }

	    if (fc->ps.sps->r->sps_alf_enabled_flag)
	        ff_vvc_alf_copy_ctu_to_hv(lc, x0, y0);

	    return 0;
	}

	/*
	 * ALF stage for the CTU of task t: run the ALF filter when ALF is enabled in
	 * the SPS. Pixel progress for the row is reported whether or not ALF ran.
	 * Always returns 0.
	 */
	static int run_alf(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
	{
	    VVCFrameContext *fc = lc->fc;
	    VVCFrameThread *ft  = fc->ft;
	    const int ctu_size  = ft->ctu_size;
	    const int x0        = t->rx * ctu_size;
	    const int y0        = t->ry * ctu_size;

	    if (fc->ps.sps->r->sps_alf_enabled_flag) {
	        ff_vvc_decode_neighbour(lc, x0, y0, t->rx, t->ry, t->rs);
	        ff_vvc_alf_filter(lc, x0, y0);
	    }
	    report_frame_progress(fc, t->ry, VVC_PROGRESS_PIXEL);

	    return 0;
	}

	#define VVC_THREAD_DEBUG

	// One-letter label per VVCTaskStage, indexed by the stage value.
	// Defined unconditionally: the error log in task_run_stage() uses it even
	// when VVC_THREAD_DEBUG is not defined.
	static const char *const task_name[] = {
	    "P",    // VVC_TASK_STAGE_PARSE
	    "I",    // VVC_TASK_STAGE_INTER
	    "R",    // VVC_TASK_STAGE_RECON
	    "L",    // VVC_TASK_STAGE_LMCS
	    "V",    // VVC_TASK_STAGE_DEBLOCK_V
	    "H",    // VVC_TASK_STAGE_DEBLOCK_H
	    "S",    // VVC_TASK_STAGE_SAO
	    "A"     // VVC_TASK_STAGE_ALF
	};

	// Signature shared by all per-stage workers (run_parse ... run_alf).
	typedef int (*run_func)(VVCContext *s, VVCLocalContext *lc, VVCTask *t);

	/*
	 * Execute the current stage of task t and propagate its completion.
	 *
	 * The stage worker is skipped once any task of the frame has recorded an
	 * error in ft->ret. On failure the first error code wins (compare-exchange
	 * against 0) and the failure is logged. task_stage_done() is called in every
	 * case, so dependent tasks are still credited.
	 */
	static void task_run_stage(VVCTask *t, VVCContext *s, VVCLocalContext *lc)
	{
	    int ret;
	    VVCFrameContext *fc      = t->fc;
	    VVCFrameThread *ft       = fc->ft;
	    const VVCTaskStage stage = t->stage;
	    // dispatch table indexed by VVCTaskStage; immutable, so built only once
	    static const run_func run[] = {
	        run_parse,
	        run_inter,
	        run_recon,
	        run_lmcs,
	        run_deblock_v,
	        run_deblock_h,
	        run_sao,
	        run_alf,
	    };

	#ifdef VVC_THREAD_DEBUG
	    av_log(s->avctx, AV_LOG_DEBUG, "frame %5d, %s(%3d, %3d)\r\n", (int)t->fc->decode_order, task_name[stage], t->rx, t->ry);
	#endif

	    lc->sc = t->sc;

	    if (!atomic_load(&ft->ret)) {
	        if ((ret = run[stage](s, lc, t)) < 0) {
	#ifdef COMPAT_ATOMICS_WIN32_STDATOMIC_H
	            intptr_t zero = 0;
	#else
	            int zero = 0;
	#endif
	            atomic_compare_exchange_strong(&ft->ret, &zero, ret);
	            av_log(s->avctx, AV_LOG_ERROR,
	                "frame %5d, %s(%3d, %3d) failed with %d\r\n",
	                (int)fc->decode_order, task_name[stage], t->rx, t->ry, ret);
	        }
	    }

	    task_stage_done(t, s);
	    return;
	}

	/*
	 * Executor "run" callback.
	 *
	 * _t:            the task (an AVTask embedded at the start of VVCTask)
	 * local_context: per-thread VVCLocalContext
	 * user_data:     the VVCContext given to ff_vvc_executor_alloc()
	 *
	 * Runs the task's current stage and keeps running following stages on the
	 * same thread for as long as the next stage would be ready with one more
	 * score point. If the task stopped before VVC_TASK_STAGE_LAST, that point is
	 * then added through frame_thread_add_score(). Finally the task is retired
	 * from ft->nb_scheduled_tasks. Always returns 0; stage errors are recorded
	 * in ft->ret by task_run_stage().
	 */
	static int task_run(AVTask *_t, void *local_context, void *user_data)
	{
	    VVCTask *t          = (VVCTask*)_t;
	    VVCContext *s       = (VVCContext *)user_data;
	    VVCLocalContext *lc = local_context;
	    VVCFrameThread *ft  = t->fc->ft;

	    lc->fc = t->fc;

	    do {
	        task_run_stage(t, s, lc);
	        t->stage++;
	    } while (task_is_stage_ready(t, 1));

	    if (t->stage != VVC_TASK_STAGE_LAST)
	        frame_thread_add_score(s, ft, t->rx, t->ry, t->stage);

	    sheduled_done(ft, &ft->nb_scheduled_tasks);

	    return 0;
	}

	/*
	 * Create the executor used to run VVC tasks.
	 *
	 * s:            decoder context, stored as the first callback-table entry
	 * thread_count: forwarded unchanged to av_executor_alloc()
	 *
	 * The callback table is filled positionally with s, the size of
	 * VVCLocalContext, task_priority_higher, task_ready and task_run.
	 * Returns whatever av_executor_alloc() returns.
	 */
	AVExecutor* ff_vvc_executor_alloc(VVCContext *s, const int thread_count)
	{
	    AVTaskCallbacks cb = {
	        s, sizeof(VVCLocalContext), task_priority_higher, task_ready, task_run,
	    };
	    AVExecutor *executor = av_executor_alloc(&cb, thread_count);

	    return executor;
	}

	// Release an executor obtained from ff_vvc_executor_alloc(); e is passed
	// straight through to av_executor_free().
	void ff_vvc_executor_free(AVExecutor **e)
	{
	av_executor_free(e);
	}

	/*
	 * Destroy the frame thread state of fc, if any.
	 *
	 * Destroys the lock and condition variable, frees the row and task arrays
	 * and the state itself, and resets fc->ft to NULL so that the freed state
	 * cannot be reached (or freed again) through fc afterwards.
	 */
	void ff_vvc_frame_thread_free(VVCFrameContext *fc)
	{
	    VVCFrameThread *ft = fc->ft;

	    if (!ft)
	        return;

	    ff_mutex_destroy(&ft->lock);
	    ff_cond_destroy(&ft->cond);
	    av_freep(&ft->rows);
	    av_freep(&ft->tasks);
	    // free through fc->ft, not the local copy, so no dangling pointer remains
	    av_freep(&fc->ft);
	}

	/*
	 * Pre-credit the tasks at the frame edges.
	 *
	 * For every stage from RECON onwards, a scratch task is placed on each
	 * position of the one-CTU-wide ring just outside the frame (the rows above
	 * and below including corners, then the columns left and right) and reported
	 * as done, so that in-frame tasks receive the score points those
	 * non-existent neighbours would have contributed. The context argument is
	 * NULL for all of these calls.
	 */
	static void frame_thread_init_score(VVCFrameContext *fc)
	{
	    const VVCFrameThread *ft = fc->ft;
	    VVCTask border;

	    task_init(&border, VVC_TASK_STAGE_RECON, fc, 0, 0);

	    for (int stage = VVC_TASK_STAGE_RECON; stage < VVC_TASK_STAGE_LAST; stage++) {
	        border.stage = stage;

	        for (int x = -1; x <= ft->ctu_width; x++) {
	            border.rx = x;

	            border.ry = -1;                 //top
	            task_stage_done(&border, NULL);
	            border.ry = ft->ctu_height;     //bottom
	            task_stage_done(&border, NULL);
	        }

	        for (int y = 0; y < ft->ctu_height; y++) {
	            border.ry = y;

	            border.rx = -1;                 //left
	            task_stage_done(&border, NULL);
	            border.rx = ft->ctu_width;      //right
	            task_stage_done(&border, NULL);
	        }
	    }
	}

	/*
	 * Prepare the frame thread state of fc for decoding a new frame.
	 *
	 * The existing state is reused when its CTU geometry (width, height, CTU
	 * size) matches the active PPS/SPS; otherwise it is freed and rebuilt.
	 * In both cases the error code, the per-row column progress, all tasks, the
	 * row progress and the border scores are reset.
	 *
	 * Returns 0 on success or AVERROR(ENOMEM) on failure; after a failed rebuild
	 * fc->ft is NULL.
	 */
	int ff_vvc_frame_thread_init(VVCFrameContext *fc)
	{
	    const VVCSPS *sps  = fc->ps.sps;
	    const VVCPPS *pps  = fc->ps.pps;
	    VVCFrameThread *ft = fc->ft;

	    if (!ft || ft->ctu_width != pps->ctb_width ||
	        ft->ctu_height != pps->ctb_height ||
	        ft->ctu_size != sps->ctb_size_y) {

	        ff_vvc_frame_thread_free(fc);
	        // the old state is gone: never leave it reachable through fc, even
	        // if one of the allocations below fails
	        fc->ft = NULL;

	        ft = av_calloc(1, sizeof(*fc->ft));
	        if (!ft)
	            return AVERROR(ENOMEM);

	        ft->ctu_width  = fc->ps.pps->ctb_width;
	        ft->ctu_height = fc->ps.pps->ctb_height;
	        ft->ctu_count  = fc->ps.pps->ctb_count;
	        ft->ctu_size   = fc->ps.sps->ctb_size_y;

	        ft->rows = av_calloc(ft->ctu_height, sizeof(*ft->rows));
	        if (!ft->rows)
	            goto fail;

	        // overflow-checked count * size; contents are set up by task_init()
	        ft->tasks = av_malloc_array(ft->ctu_count, sizeof(*ft->tasks));
	        if (!ft->tasks)
	            goto fail;

	        if (ff_cond_init(&ft->cond, NULL))
	            goto fail;

	        if (ff_mutex_init(&ft->lock, NULL)) {
	            ff_cond_destroy(&ft->cond);
	            goto fail;
	        }
	    }
	    fc->ft  = ft;
	    ft->ret = 0;
	    for (int y = 0; y < ft->ctu_height; y++) {
	        VVCRowThread *row = ft->rows + y;
	        memset(row->col_progress, 0, sizeof(row->col_progress));
	    }

	    for (int rs = 0; rs < ft->ctu_count; rs++) {
	        VVCTask *t = ft->tasks + rs;
	        task_init(t, VVC_TASK_STAGE_PARSE, fc, rs % ft->ctu_width, rs / ft->ctu_width);
	    }

	    memset(&ft->row_progress[0], 0, sizeof(ft->row_progress));

	    frame_thread_init_score(fc);

	    return 0;

	fail:
	    // ft is always non-NULL here; it was never published in fc->ft
	    av_freep(&ft->rows);
	    av_freep(&ft->tasks);
	    av_freep(&ft);

	    return AVERROR(ENOMEM);
	}

	/*
	 * Give task t its initial parse score point, waiting for the collocated
	 * picture first when needed.
	 *
	 * When temporal MVP or SbTMVP is enabled, a collocated reference exists and
	 * the CTU sits at its column boundary (ctb_to_col_bd[rx] == rx), an MV
	 * progress listener is registered on the collocated frame; the point is then
	 * added when that listener fires. In every other case the point is added
	 * right away.
	 */
	static void check_colocation(VVCContext *s, VVCTask *t)
	{
	    const VVCFrameContext *fc = t->fc;

	    if (fc->ps.ph.r->ph_temporal_mvp_enabled_flag || fc->ps.sps->r->sps_sbtmvp_enabled_flag) {
	        VVCFrame *col       = fc->ref->collocated_ref;
	        const int first_col = t->rx == fc->ps.pps->ctb_to_col_bd[t->rx];
	        if (col && first_col) {
	            //we depend on bottom and right boundary, do not - 1 for y
	            const int y = (t->ry << fc->ps.sps->ctb_log2_size_y);
	            add_progress_listener(col, &t->col_listener, t, s, VVC_PROGRESS_MV, y);
	            return;
	        }
	    }
	    frame_thread_add_score(s, fc->ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE);
	}

	// Add one parse score point to the task of the first CTU of entry point ep
	// (looked up through the slice's ctb_addr_in_curr_slice table).
	static void submit_entry_point(VVCContext *s, VVCFrameThread *ft, SliceContext *sc, EntryPoint *ep)
	{
	    const int rs = sc->sh.ctb_addr_in_curr_slice[ep->ctu_start];
	    VVCTask *t   = ft->tasks + rs;

	    frame_thread_add_score(s, ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE);
	}

	/*
	 * Submit all CTU tasks of frame fc for decoding.
	 *
	 * Walks every CTU of every entry point of every slice twice. Returns 0 on
	 * success or the negative error code from task_init_parse(); such an error
	 * can only occur during the first pass, i.e. before any score point has
	 * been handed out by this function.
	 */
	int ff_vvc_frame_submit(VVCContext *s, VVCFrameContext *fc)
	{
	    VVCFrameThread *ft = fc->ft;

	    // We'll handle this in two passes:
	    // Pass 0 to initialize tasks with parser, this will help detect bit stream error
	    // Pass 1 to schedule the collocation check and submit the entry point
	    for (int pass = 0; pass < 2; pass++) {
	        for (int i = 0; i < fc->nb_slices; i++) {
	            SliceContext *sc = fc->slices[i];
	            for (int j = 0; j < sc->nb_eps; j++) {
	                EntryPoint *ep = sc->eps + j;
	                for (int k = ep->ctu_start; k < ep->ctu_end; k++) {
	                    const int rs = sc->sh.ctb_addr_in_curr_slice[k];
	                    VVCTask *t   = ft->tasks + rs;
	                    if (pass) {
	                        check_colocation(s, t);
	                    } else {
	                        const int ret = task_init_parse(t, sc, ep, k);
	                        if (ret < 0)
	                            return ret;
	                    }
	                }
	                if (pass)
	                    submit_entry_point(s, ft, sc, ep);
	            }
	        }
	    }
	    return 0;
	}

	/*
	 * Block until frame fc has no scheduled task and no pending progress
	 * listener left, then report the frame as finished.
	 *
	 * The wait uses ft->cond under ft->lock; sheduled_done() signals it whenever
	 * one of the two counters drops to zero. Returns ft->ret, the first error
	 * recorded by a stage of this frame, or 0.
	 */
	int ff_vvc_frame_wait(VVCContext *s, VVCFrameContext *fc)
	{
	    VVCFrameThread *ft = fc->ft;

	    ff_mutex_lock(&ft->lock);

	    while (atomic_load(&ft->nb_scheduled_tasks) || atomic_load(&ft->nb_scheduled_listeners))
	        ff_cond_wait(&ft->cond, &ft->lock);

	    ff_mutex_unlock(&ft->lock);
	    ff_vvc_report_frame_finished(fc->ref);

	#ifdef VVC_THREAD_DEBUG
	    av_log(s->avctx, AV_LOG_DEBUG, "frame %5d done\r\n", (int)fc->decode_order);
	#endif
	    return ft->ret;
	}