libavcodec/magicyuvenc.c - third_party/ffmpeg - Git at Google

 /*
  * MagicYUV encoder
  * Copyright (c) 2017 Paul B Mahol
  *
  * This file is part of FFmpeg.
  *
  * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */

 #include <stdlib.h>
 #include <string.h>

 #include "libavutil/cpu.h"
 #include "libavutil/mem.h"
 #include "libavutil/opt.h"
 #include "libavutil/pixdesc.h"
 #include "libavutil/qsort.h"

 #include "avcodec.h"
 #include "bytestream.h"
 #include "codec_internal.h"
 #include "encode.h"
 #include "put_bits.h"
 #include "lossless_videoencdsp.h"

 #define MAGICYUV_EXTRADATA_SIZE 32

 typedef enum Prediction {
     LEFT = 1,
     GRADIENT,
     MEDIAN,
 } Prediction;

 typedef struct HuffEntry {
     uint8_t  len;
     uint32_t code;
 } HuffEntry;

 typedef struct PTable {
     int     value;  ///< input value
     int64_t prob;   ///< number of occurences of this value in input
 } PTable;

 typedef struct Slice {
     unsigned pos;
     unsigned size;
     uint8_t *slice;
     uint8_t *bitslice;
     PTable counts[256];
 } Slice;

 typedef struct MagicYUVContext {
     const AVClass       *class;
     int                  frame_pred;
     int                  planes;
     uint8_t              format;
     int                  slice_height;
     int                  nb_slices;
     int                  correlate;
     int                  hshift[4];
     int                  vshift[4];
     unsigned             bitslice_size;
     uint8_t             *decorrelate_buf[2];
     Slice               *slices;
     HuffEntry            he[4][256];
     LLVidEncDSPContext   llvidencdsp;
     void (*predict)(struct MagicYUVContext *s, const uint8_t *src, uint8_t *dst,
                     ptrdiff_t stride, int width, int height);
 } MagicYUVContext;

 static void left_predict(MagicYUVContext *s,
                          const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                          int width, int height)
 {
     uint8_t prev = 0;
     int i, j;

     for (i = 0; i < width; i++) {
         dst[i] = src[i] - prev;
         prev   = src[i];
     }
     dst += width;
     src += stride;
     for (j = 1; j < height; j++) {
         prev = src[-stride];
         for (i = 0; i < width; i++) {
             dst[i] = src[i] - prev;
             prev   = src[i];
         }
         dst += width;
         src += stride;
     }
 }

 static void gradient_predict(MagicYUVContext *s,
                              const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                              int width, int height)
 {
     int left = 0, top, lefttop;
     int i, j;

     for (i = 0; i < width; i++) {
         dst[i] = src[i] - left;
         left   = src[i];
     }
     dst += width;
     src += stride;
     for (j = 1; j < height; j++) {
         top = src[-stride];
         left = src[0] - top;
         dst[0] = left;
         for (i = 1; i < width; i++) {
             top = src[i - stride];
             lefttop = src[i - (stride + 1)];
             left = src[i-1];
             dst[i] = (src[i] - top) - left + lefttop;
         }
         dst += width;
         src += stride;
     }
 }

 static void median_predict(MagicYUVContext *s,
                            const uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                            int width, int height)
 {
     int left = 0, lefttop;
     int i, j;

     for (i = 0; i < width; i++) {
         dst[i] = src[i] - left;
         left   = src[i];
     }
     dst += width;
     src += stride;
     for (j = 1; j < height; j++) {
         left = lefttop = src[-stride];
         s->llvidencdsp.sub_median_pred(dst, src - stride, src, width, &left, &lefttop);
         dst += width;
         src += stride;
     }
 }

 static av_cold int magy_encode_init(AVCodecContext *avctx)
 {
     MagicYUVContext *s = avctx->priv_data;
     PutByteContext pb;

     switch (avctx->pix_fmt) {
     case AV_PIX_FMT_GBRP:
         avctx->codec_tag = MKTAG('M', '8', 'R', 'G');
         s->correlate = 1;
         s->format = 0x65;
         break;
     case AV_PIX_FMT_GBRAP:
         avctx->codec_tag = MKTAG('M', '8', 'R', 'A');
         s->correlate = 1;
         s->format = 0x66;
         break;
     case AV_PIX_FMT_YUV420P:
         avctx->codec_tag = MKTAG('M', '8', 'Y', '0');
         s->hshift[1] =
         s->vshift[1] =
         s->hshift[2] =
         s->vshift[2] = 1;
         s->format = 0x69;
         break;
     case AV_PIX_FMT_YUV422P:
         avctx->codec_tag = MKTAG('M', '8', 'Y', '2');
         s->hshift[1] =
         s->hshift[2] = 1;
         s->format = 0x68;
         break;
     case AV_PIX_FMT_YUV444P:
         avctx->codec_tag = MKTAG('M', '8', 'Y', '4');
         s->format = 0x67;
         break;
     case AV_PIX_FMT_YUVA444P:
         avctx->codec_tag = MKTAG('M', '8', 'Y', 'A');
         s->format = 0x6a;
         break;
     case AV_PIX_FMT_GRAY8:
         avctx->codec_tag = MKTAG('M', '8', 'G', '0');
         s->format = 0x6b;
         break;
     }

     ff_llvidencdsp_init(&s->llvidencdsp);

     s->planes = av_pix_fmt_count_planes(avctx->pix_fmt);

     s->nb_slices = (avctx->slices <= 0) ? av_cpu_count() : avctx->slices;
     s->nb_slices = FFMIN(s->nb_slices, avctx->height >> s->vshift[1]);
     s->nb_slices = FFMAX(1, s->nb_slices);
     s->slice_height = FFALIGN((avctx->height + s->nb_slices - 1) / s->nb_slices, 1 << s->vshift[1]);
     s->nb_slices = (avctx->height + s->slice_height - 1) / s->slice_height;
     s->slices = av_calloc(s->nb_slices * s->planes, sizeof(*s->slices));
     if (!s->slices)
         return AVERROR(ENOMEM);

     if (s->correlate) {
         size_t max_align = av_cpu_max_align();
         size_t aligned_width = FFALIGN(avctx->width, max_align);
         s->decorrelate_buf[0] = av_calloc(2U * (s->nb_slices * s->slice_height),
                                           aligned_width);
         if (!s->decorrelate_buf[0])
             return AVERROR(ENOMEM);
         s->decorrelate_buf[1] = s->decorrelate_buf[0] + (s->nb_slices * s->slice_height) * aligned_width;
     }

     s->bitslice_size = avctx->width * s->slice_height + 2;
     for (int n = 0; n < s->nb_slices; n++) {
         for (int i = 0; i < s->planes; i++) {
             Slice *sl = &s->slices[n * s->planes + i];

             sl->bitslice = av_malloc(s->bitslice_size + AV_INPUT_BUFFER_PADDING_SIZE);
             sl->slice = av_malloc(avctx->width * (s->slice_height + 2) +
                                                      AV_INPUT_BUFFER_PADDING_SIZE);
             if (!sl->slice || !sl->bitslice) {
                 av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer.\n");
                 return AVERROR(ENOMEM);
             }
         }
     }

     switch (s->frame_pred) {
     case LEFT:     s->predict = left_predict;     break;
     case GRADIENT: s->predict = gradient_predict; break;
     case MEDIAN:   s->predict = median_predict;   break;
     }

     avctx->extradata_size = MAGICYUV_EXTRADATA_SIZE;

     avctx->extradata = av_mallocz(avctx->extradata_size +
                                   AV_INPUT_BUFFER_PADDING_SIZE);

     if (!avctx->extradata) {
         av_log(avctx, AV_LOG_ERROR, "Could not allocate extradata.\n");
         return AVERROR(ENOMEM);
     }

     bytestream2_init_writer(&pb, avctx->extradata, MAGICYUV_EXTRADATA_SIZE);
     bytestream2_put_le32(&pb, MKTAG('M', 'A', 'G', 'Y'));
     bytestream2_put_le32(&pb, 32);
     bytestream2_put_byte(&pb, 7);
     bytestream2_put_byte(&pb, s->format);
     bytestream2_put_byte(&pb, 12);
     bytestream2_put_byte(&pb, 0);

     bytestream2_put_byte(&pb, 0);
     bytestream2_put_byte(&pb, 0);
     bytestream2_put_byte(&pb, 32);
     bytestream2_put_byte(&pb, 0);

     bytestream2_put_le32(&pb, avctx->width);
     bytestream2_put_le32(&pb, avctx->height);
     bytestream2_put_le32(&pb, avctx->width);
     bytestream2_put_le32(&pb, avctx->height);

     return 0;
 }

 static void calculate_codes(HuffEntry *he, uint16_t codes_count[33])
 {
     for (unsigned i = 32, nb_codes = 0; i > 0; i--) {
         uint16_t curr = codes_count[i];   // # of leafs of length i
         codes_count[i] = nb_codes / 2;    // # of non-leaf nodes on level i
         nb_codes = codes_count[i] + curr; // # of nodes on level i
     }

     for (unsigned i = 0; i < 256; i++) {
         he[i].code = codes_count[he[i].len];
         codes_count[he[i].len]++;
     }
 }

 static void count_usage(const uint8_t *src, int width,
                         int height, PTable *counts)
 {
     for (int j = 0; j < height; j++) {
         for (int i = 0; i < width; i++)
             counts[src[i]].prob++;
         src += width;
     }
 }

 typedef struct PackageMergerList {
     int nitems;             ///< number of items in the list and probability      ex. 4
     int item_idx[515];      ///< index range for each item in items                   0, 2, 5, 9, 13
     int probability[514];   ///< probability of each item                             3, 8, 18, 46
     int items[257 * 16];    ///< chain of all individual values that make up items    A, B, A, B, C, A, B, C, D, C, D, D, E
 } PackageMergerList;

 static int compare_by_prob(const void *a, const void *b)
 {
     const PTable *a2 = a;
     const PTable *b2 = b;
     return a2->prob - b2->prob;
 }

 static void magy_huffman_compute_bits(PTable *prob_table, HuffEntry *distincts,
                                       uint16_t codes_counts[33],
                                       int size, int max_length)
 {
     PackageMergerList list_a, list_b, *to = &list_a, *from = &list_b, *temp;
     int times, i, j, k;
     int nbits[257] = {0};
     int min;

     av_assert0(max_length > 0);

     to->nitems = 0;
     from->nitems = 0;
     to->item_idx[0] = 0;
     from->item_idx[0] = 0;
     AV_QSORT(prob_table, size, PTable, compare_by_prob);

     for (times = 0; times <= max_length; times++) {
         to->nitems = 0;
         to->item_idx[0] = 0;

         j = 0;
         k = 0;

         if (times < max_length) {
             i = 0;
         }
         while (i < size || j + 1 < from->nitems) {
             to->nitems++;
             to->item_idx[to->nitems] = to->item_idx[to->nitems - 1];
             if (i < size &&
                 (j + 1 >= from->nitems ||
                  prob_table[i].prob <
                      from->probability[j] + from->probability[j + 1])) {
                 to->items[to->item_idx[to->nitems]++] = prob_table[i].value;
                 to->probability[to->nitems - 1] = prob_table[i].prob;
                 i++;
             } else {
                 for (k = from->item_idx[j]; k < from->item_idx[j + 2]; k++) {
                     to->items[to->item_idx[to->nitems]++] = from->items[k];
                 }
                 to->probability[to->nitems - 1] =
                     from->probability[j] + from->probability[j + 1];
                 j += 2;
             }
         }
         temp = to;
         to = from;
         from = temp;
     }

     min = (size - 1 < from->nitems) ? size - 1 : from->nitems;
     for (i = 0; i < from->item_idx[min]; i++) {
         nbits[from->items[i]]++;
     }

     for (i = 0; i < size; i++) {
         distincts[i].len = nbits[i];
         codes_counts[nbits[i]]++;
     }
 }

 static int count_plane_slice(AVCodecContext *avctx, int n, int plane)
 {
     MagicYUVContext *s = avctx->priv_data;
     Slice *sl = &s->slices[n * s->planes + plane];
     const uint8_t *dst = sl->slice;
     PTable *counts = sl->counts;

     memset(counts, 0, sizeof(sl->counts));

     count_usage(dst, AV_CEIL_RSHIFT(avctx->width, s->hshift[plane]),
                 AV_CEIL_RSHIFT(s->slice_height, s->vshift[plane]), counts);

     return 0;
 }

 static int encode_table(AVCodecContext *avctx,
                         PutBitContext *pb, HuffEntry *he, int plane)
 {
     MagicYUVContext *s = avctx->priv_data;
     PTable counts[256] = { {0} };
     uint16_t codes_counts[33] = { 0 };

     for (int n = 0; n < s->nb_slices; n++) {
         Slice *sl = &s->slices[n * s->planes + plane];
         PTable *slice_counts = sl->counts;

         for (int i = 0; i < 256; i++)
             counts[i].prob = slice_counts[i].prob;
     }

     for (int i = 0; i < 256; i++) {
         counts[i].prob++;
         counts[i].value = i;
     }

     magy_huffman_compute_bits(counts, he, codes_counts, 256, 12);

     calculate_codes(he, codes_counts);

     for (int i = 0; i < 256; i++) {
         put_bits(pb, 1, 0);
         put_bits(pb, 7, he[i].len);
     }

     return 0;
 }

 static int encode_plane_slice_raw(const uint8_t *src, uint8_t *dst, unsigned dst_size,
                                   int width, int height, int prediction)
 {
     unsigned count = width * height;

     dst[0] = 1;
     dst[1] = prediction;

     memcpy(dst + 2, src, count);
     count += 2;
     AV_WN32(dst + count, 0);
     if (count & 3)
         count += 4 - (count & 3);

     return count;
 }

 static int encode_plane_slice(const uint8_t *src, uint8_t *dst, unsigned dst_size,
                               int width, int height, HuffEntry *he, int prediction)
 {
     const uint8_t *osrc = src;
     PutBitContext pb;
     int count;

     init_put_bits(&pb, dst, dst_size);

     put_bits(&pb, 8, 0);
     put_bits(&pb, 8, prediction);

     for (int j = 0; j < height; j++) {
         for (int i = 0; i < width; i++) {
             const int idx = src[i];
             const int len = he[idx].len;
             if (put_bits_left(&pb) < len + 32)
                 return encode_plane_slice_raw(osrc, dst, dst_size, width, height, prediction);
             put_bits(&pb, len, he[idx].code);
         }

         src += width;
     }

     count = put_bits_count(&pb) & 0x1F;

     if (count)
         put_bits(&pb, 32 - count, 0);

     flush_put_bits(&pb);

     return put_bytes_output(&pb);
 }

 static int encode_slice(AVCodecContext *avctx, void *tdata,
                         int n, int threadnr)
 {
     MagicYUVContext *s = avctx->priv_data;
     const int slice_height = s->slice_height;
     const int last_height = FFMIN(slice_height, avctx->height - n * slice_height);
     const int height = (n < (s->nb_slices - 1)) ? slice_height : last_height;

     for (int i = 0; i < s->planes; i++) {
         Slice *sl = &s->slices[n * s->planes + i];

         sl->size =
             encode_plane_slice(sl->slice,
                                sl->bitslice,
                                s->bitslice_size,
                                AV_CEIL_RSHIFT(avctx->width, s->hshift[i]),
                                AV_CEIL_RSHIFT(height, s->vshift[i]),
                                s->he[i], s->frame_pred);
     }

     return 0;
 }

 static int predict_slice(AVCodecContext *avctx, void *tdata,
                          int n, int threadnr)
 {
     size_t max_align = av_cpu_max_align();
     const int aligned_width = FFALIGN(avctx->width, max_align);
     MagicYUVContext *s = avctx->priv_data;
     const int slice_height = s->slice_height;
     const int last_height = FFMIN(slice_height, avctx->height - n * slice_height);
     const int height = (n < (s->nb_slices - 1)) ? slice_height : last_height;
     const int width = avctx->width;
     AVFrame *frame = tdata;

     if (s->correlate) {
         uint8_t *decorrelated[2] = { s->decorrelate_buf[0] + n * slice_height * aligned_width,
                                      s->decorrelate_buf[1] + n * slice_height * aligned_width };
         const int decorrelate_linesize = aligned_width;
         const uint8_t *const data[4] = { decorrelated[0], frame->data[0] + n * slice_height * frame->linesize[0],
                                          decorrelated[1], s->planes == 4 ? frame->data[3] + n * slice_height * frame->linesize[3] : NULL };
         const uint8_t *r, *g, *b;
         const int linesize[4]  = { decorrelate_linesize, frame->linesize[0],
                                    decorrelate_linesize, frame->linesize[3] };

         g = frame->data[0] + n * slice_height * frame->linesize[0];
         b = frame->data[1] + n * slice_height * frame->linesize[1];
         r = frame->data[2] + n * slice_height * frame->linesize[2];

         for (int i = 0; i < height; i++) {
             s->llvidencdsp.diff_bytes(decorrelated[0], b, g, width);
             s->llvidencdsp.diff_bytes(decorrelated[1], r, g, width);
             g += frame->linesize[0];
             b += frame->linesize[1];
             r += frame->linesize[2];
             decorrelated[0] += decorrelate_linesize;
             decorrelated[1] += decorrelate_linesize;
         }

         for (int i = 0; i < s->planes; i++) {
             Slice *sl = &s->slices[n * s->planes + i];

             s->predict(s, data[i], sl->slice, linesize[i],
                        frame->width, height);
         }
     } else {
         for (int i = 0; i < s->planes; i++) {
             Slice *sl = &s->slices[n * s->planes + i];

             s->predict(s, frame->data[i] + n * (slice_height >> s->vshift[i]) * frame->linesize[i],
                        sl->slice,
                        frame->linesize[i],
                        AV_CEIL_RSHIFT(frame->width, s->hshift[i]),
                        AV_CEIL_RSHIFT(height, s->vshift[i]));
         }
     }

     for (int p = 0; p < s->planes; p++)
         count_plane_slice(avctx, n, p);

     return 0;
 }

 static int magy_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                              const AVFrame *frame, int *got_packet)
 {
     MagicYUVContext *s = avctx->priv_data;
     const int width = avctx->width, height = avctx->height;
     const int slice_height = s->slice_height;
     unsigned tables_size;
     PutBitContext pbit;
     PutByteContext pb;
     int pos, ret = 0;

     ret = ff_alloc_packet(avctx, pkt, (256 + 4 * s->nb_slices + width * height) *
                           s->planes + 256);
     if (ret < 0)
         return ret;

     bytestream2_init_writer(&pb, pkt->data, pkt->size);
     bytestream2_put_le32(&pb, MKTAG('M', 'A', 'G', 'Y'));
     bytestream2_put_le32(&pb, 32); // header size
     bytestream2_put_byte(&pb, 7);  // version
     bytestream2_put_byte(&pb, s->format);
     bytestream2_put_byte(&pb, 12); // max huffman length
     bytestream2_put_byte(&pb, 0);

     bytestream2_put_byte(&pb, 0);
     bytestream2_put_byte(&pb, 0);
     bytestream2_put_byte(&pb, 32); // coder type
     bytestream2_put_byte(&pb, 0);

     bytestream2_put_le32(&pb, avctx->width);
     bytestream2_put_le32(&pb, avctx->height);
     bytestream2_put_le32(&pb, avctx->width);
     bytestream2_put_le32(&pb, slice_height);
     bytestream2_put_le32(&pb, 0);

     for (int i = 0; i < s->planes; i++) {
         bytestream2_put_le32(&pb, 0);
         for (int j = 1; j < s->nb_slices; j++)
             bytestream2_put_le32(&pb, 0);
     }

     bytestream2_put_byte(&pb, s->planes);

     for (int i = 0; i < s->planes; i++) {
         for (int n = 0; n < s->nb_slices; n++)
             bytestream2_put_byte(&pb, n * s->planes + i);
     }

     avctx->execute2(avctx, predict_slice, (void *)frame, NULL, s->nb_slices);

     init_put_bits(&pbit, pkt->data + bytestream2_tell_p(&pb), bytestream2_get_bytes_left_p(&pb));

     for (int i = 0; i < s->planes; i++)
         encode_table(avctx, &pbit, s->he[i], i);

     tables_size = put_bytes_count(&pbit, 1);
     bytestream2_skip_p(&pb, tables_size);

     avctx->execute2(avctx, encode_slice, NULL, NULL, s->nb_slices);

     for (int n = 0; n < s->nb_slices; n++) {
         for (int i = 0; i < s->planes; i++) {
             Slice *sl = &s->slices[n * s->planes + i];

             sl->pos = bytestream2_tell_p(&pb);

             bytestream2_put_buffer(&pb, sl->bitslice, sl->size);
         }
     }

     pos = bytestream2_tell_p(&pb);
     bytestream2_seek_p(&pb, 32, SEEK_SET);
     bytestream2_put_le32(&pb, s->slices[0].pos - 32);
     for (int i = 0; i < s->planes; i++) {
         for (int n = 0; n < s->nb_slices; n++) {
             Slice *sl = &s->slices[n * s->planes + i];

             bytestream2_put_le32(&pb, sl->pos - 32);
         }
     }
     bytestream2_seek_p(&pb, pos, SEEK_SET);

     pkt->size   = bytestream2_tell_p(&pb);

     *got_packet = 1;

     return 0;
 }

 static av_cold int magy_encode_close(AVCodecContext *avctx)
 {
     MagicYUVContext *s = avctx->priv_data;

     for (int i = 0; i < s->planes * s->nb_slices && s->slices; i++) {
         Slice *sl = &s->slices[i];

         av_freep(&sl->slice);
         av_freep(&sl->bitslice);
     }
     av_freep(&s->slices);
     av_freep(&s->decorrelate_buf);

     return 0;
 }

 #define OFFSET(x) offsetof(MagicYUVContext, x)
 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 static const AVOption options[] = {
     { "pred", "Prediction method", OFFSET(frame_pred), AV_OPT_TYPE_INT, {.i64=LEFT}, LEFT, MEDIAN, VE, .unit = "pred" },
     { "left",     NULL, 0, AV_OPT_TYPE_CONST, { .i64 = LEFT },     0, 0, VE, .unit = "pred" },
     { "gradient", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = GRADIENT }, 0, 0, VE, .unit = "pred" },
     { "median",   NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MEDIAN },   0, 0, VE, .unit = "pred" },
     { NULL},
 };

 static const AVClass magicyuv_class = {
     .class_name = "magicyuv",
     .item_name  = av_default_item_name,
     .option     = options,
     .version    = LIBAVUTIL_VERSION_INT,
 };

 const FFCodec ff_magicyuv_encoder = {
     .p.name           = "magicyuv",
     CODEC_LONG_NAME("MagicYUV video"),
     .p.type           = AVMEDIA_TYPE_VIDEO,
     .p.id             = AV_CODEC_ID_MAGICYUV,
     .p.capabilities   = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
                         AV_CODEC_CAP_SLICE_THREADS |
                         AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE,
     .priv_data_size   = sizeof(MagicYUVContext),
     .p.priv_class     = &magicyuv_class,
     .init             = magy_encode_init,
     .close            = magy_encode_close,
     FF_CODEC_ENCODE_CB(magy_encode_frame),
     .p.pix_fmts       = (const enum AVPixelFormat[]) {
                           AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRAP, AV_PIX_FMT_YUV422P,
                           AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUVA444P, AV_PIX_FMT_GRAY8,
                           AV_PIX_FMT_NONE
                       },
     .caps_internal    = FF_CODEC_CAP_INIT_CLEANUP,
 };
	/*
	* MagicYUV encoder
	* Copyright (c) 2017 Paul B Mahol
	*
	* This file is part of FFmpeg.
	*
	* FFmpeg is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.
	*
	* FFmpeg is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Lesser General Public License for more details.
	*
	* You should have received a copy of the GNU Lesser General Public
	* License along with FFmpeg; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	*/

	#include <stdlib.h>
	#include <string.h>

	#include "libavutil/cpu.h"
	#include "libavutil/mem.h"
	#include "libavutil/opt.h"
	#include "libavutil/pixdesc.h"
	#include "libavutil/qsort.h"

	#include "avcodec.h"
	#include "bytestream.h"
	#include "codec_internal.h"
	#include "encode.h"
	#include "put_bits.h"
	#include "lossless_videoencdsp.h"

	#define MAGICYUV_EXTRADATA_SIZE 32

	typedef enum Prediction {
	LEFT = 1,
	GRADIENT,
	MEDIAN,
	} Prediction;

	typedef struct HuffEntry {
	uint8_t len;
	uint32_t code;
	} HuffEntry;

	typedef struct PTable {
	int value; ///< input value
	int64_t prob; ///< number of occurences of this value in input
	} PTable;

	typedef struct Slice {
	unsigned pos;
	unsigned size;
	uint8_t *slice;
	uint8_t *bitslice;
	PTable counts[256];
	} Slice;

	typedef struct MagicYUVContext {
	const AVClass *class;
	int frame_pred;
	int planes;
	uint8_t format;
	int slice_height;
	int nb_slices;
	int correlate;
	int hshift[4];
	int vshift[4];
	unsigned bitslice_size;
	uint8_t *decorrelate_buf[2];
	Slice *slices;
	HuffEntry he[4][256];
	LLVidEncDSPContext llvidencdsp;
	void (predict)(struct MagicYUVContext s, const uint8_t src, uint8_t dst,
	ptrdiff_t stride, int width, int height);
	} MagicYUVContext;

	static void left_predict(MagicYUVContext *s,
	const uint8_t src, uint8_t dst, ptrdiff_t stride,
	int width, int height)
	{
	uint8_t prev = 0;
	int i, j;

	for (i = 0; i < width; i++) {
	dst[i] = src[i] - prev;
	prev = src[i];
	}
	dst += width;
	src += stride;
	for (j = 1; j < height; j++) {
	prev = src[-stride];
	for (i = 0; i < width; i++) {
	dst[i] = src[i] - prev;
	prev = src[i];
	}
	dst += width;
	src += stride;
	}
	}

	static void gradient_predict(MagicYUVContext *s,
	const uint8_t src, uint8_t dst, ptrdiff_t stride,
	int width, int height)
	{
	int left = 0, top, lefttop;
	int i, j;

	for (i = 0; i < width; i++) {
	dst[i] = src[i] - left;
	left = src[i];
	}
	dst += width;
	src += stride;
	for (j = 1; j < height; j++) {
	top = src[-stride];
	left = src[0] - top;
	dst[0] = left;
	for (i = 1; i < width; i++) {
	top = src[i - stride];
	lefttop = src[i - (stride + 1)];
	left = src[i-1];
	dst[i] = (src[i] - top) - left + lefttop;
	}
	dst += width;
	src += stride;
	}
	}

	static void median_predict(MagicYUVContext *s,
	const uint8_t src, uint8_t dst, ptrdiff_t stride,
	int width, int height)
	{
	int left = 0, lefttop;
	int i, j;

	for (i = 0; i < width; i++) {
	dst[i] = src[i] - left;
	left = src[i];
	}
	dst += width;
	src += stride;
	for (j = 1; j < height; j++) {
	left = lefttop = src[-stride];
	s->llvidencdsp.sub_median_pred(dst, src - stride, src, width, &left, &lefttop);
	dst += width;
	src += stride;
	}
	}

	static av_cold int magy_encode_init(AVCodecContext *avctx)
	{
	MagicYUVContext *s = avctx->priv_data;
	PutByteContext pb;

	switch (avctx->pix_fmt) {
	case AV_PIX_FMT_GBRP:
	avctx->codec_tag = MKTAG('M', '8', 'R', 'G');
	s->correlate = 1;
	s->format = 0x65;
	break;
	case AV_PIX_FMT_GBRAP:
	avctx->codec_tag = MKTAG('M', '8', 'R', 'A');
	s->correlate = 1;
	s->format = 0x66;
	break;
	case AV_PIX_FMT_YUV420P:
	avctx->codec_tag = MKTAG('M', '8', 'Y', '0');
	s->hshift[1] =
	s->vshift[1] =
	s->hshift[2] =
	s->vshift[2] = 1;
	s->format = 0x69;
	break;
	case AV_PIX_FMT_YUV422P:
	avctx->codec_tag = MKTAG('M', '8', 'Y', '2');
	s->hshift[1] =
	s->hshift[2] = 1;
	s->format = 0x68;
	break;
	case AV_PIX_FMT_YUV444P:
	avctx->codec_tag = MKTAG('M', '8', 'Y', '4');
	s->format = 0x67;
	break;
	case AV_PIX_FMT_YUVA444P:
	avctx->codec_tag = MKTAG('M', '8', 'Y', 'A');
	s->format = 0x6a;
	break;
	case AV_PIX_FMT_GRAY8:
	avctx->codec_tag = MKTAG('M', '8', 'G', '0');
	s->format = 0x6b;
	break;
	}

	ff_llvidencdsp_init(&s->llvidencdsp);

	s->planes = av_pix_fmt_count_planes(avctx->pix_fmt);

	s->nb_slices = (avctx->slices <= 0) ? av_cpu_count() : avctx->slices;
	s->nb_slices = FFMIN(s->nb_slices, avctx->height >> s->vshift[1]);
	s->nb_slices = FFMAX(1, s->nb_slices);
	s->slice_height = FFALIGN((avctx->height + s->nb_slices - 1) / s->nb_slices, 1 << s->vshift[1]);
	s->nb_slices = (avctx->height + s->slice_height - 1) / s->slice_height;
	s->slices = av_calloc(s->nb_slices * s->planes, sizeof(*s->slices));
	if (!s->slices)
	return AVERROR(ENOMEM);

	if (s->correlate) {
	size_t max_align = av_cpu_max_align();
	size_t aligned_width = FFALIGN(avctx->width, max_align);
	s->decorrelate_buf[0] = av_calloc(2U * (s->nb_slices * s->slice_height),
	aligned_width);
	if (!s->decorrelate_buf[0])
	return AVERROR(ENOMEM);
	s->decorrelate_buf[1] = s->decorrelate_buf[0] + (s->nb_slices * s->slice_height) * aligned_width;
	}

	s->bitslice_size = avctx->width * s->slice_height + 2;
	for (int n = 0; n < s->nb_slices; n++) {
	for (int i = 0; i < s->planes; i++) {
	Slice sl = &s->slices[n s->planes + i];

	sl->bitslice = av_malloc(s->bitslice_size + AV_INPUT_BUFFER_PADDING_SIZE);
	sl->slice = av_malloc(avctx->width * (s->slice_height + 2) +
	AV_INPUT_BUFFER_PADDING_SIZE);
	if (!sl->slice \|\| !sl->bitslice) {
	av_log(avctx, AV_LOG_ERROR, "Cannot allocate temporary buffer.\n");
	return AVERROR(ENOMEM);
	}
	}
	}

	switch (s->frame_pred) {
	case LEFT: s->predict = left_predict; break;
	case GRADIENT: s->predict = gradient_predict; break;
	case MEDIAN: s->predict = median_predict; break;
	}

	avctx->extradata_size = MAGICYUV_EXTRADATA_SIZE;

	avctx->extradata = av_mallocz(avctx->extradata_size +
	AV_INPUT_BUFFER_PADDING_SIZE);

	if (!avctx->extradata) {
	av_log(avctx, AV_LOG_ERROR, "Could not allocate extradata.\n");
	return AVERROR(ENOMEM);
	}

	bytestream2_init_writer(&pb, avctx->extradata, MAGICYUV_EXTRADATA_SIZE);
	bytestream2_put_le32(&pb, MKTAG('M', 'A', 'G', 'Y'));
	bytestream2_put_le32(&pb, 32);
	bytestream2_put_byte(&pb, 7);
	bytestream2_put_byte(&pb, s->format);
	bytestream2_put_byte(&pb, 12);
	bytestream2_put_byte(&pb, 0);

	bytestream2_put_byte(&pb, 0);
	bytestream2_put_byte(&pb, 0);
	bytestream2_put_byte(&pb, 32);
	bytestream2_put_byte(&pb, 0);

	bytestream2_put_le32(&pb, avctx->width);
	bytestream2_put_le32(&pb, avctx->height);
	bytestream2_put_le32(&pb, avctx->width);
	bytestream2_put_le32(&pb, avctx->height);

	return 0;
	}

	static void calculate_codes(HuffEntry *he, uint16_t codes_count[33])
	{
	for (unsigned i = 32, nb_codes = 0; i > 0; i--) {
	uint16_t curr = codes_count[i]; // # of leafs of length i
	codes_count[i] = nb_codes / 2; // # of non-leaf nodes on level i
	nb_codes = codes_count[i] + curr; // # of nodes on level i
	}

	for (unsigned i = 0; i < 256; i++) {
	he[i].code = codes_count[he[i].len];
	codes_count[he[i].len]++;
	}
	}

	static void count_usage(const uint8_t *src, int width,
	int height, PTable *counts)
	{
	for (int j = 0; j < height; j++) {
	for (int i = 0; i < width; i++)
	counts[src[i]].prob++;
	src += width;
	}
	}

	typedef struct PackageMergerList {
	int nitems; ///< number of items in the list and probability ex. 4
	int item_idx[515]; ///< index range for each item in items 0, 2, 5, 9, 13
	int probability[514]; ///< probability of each item 3, 8, 18, 46
	int items[257 * 16]; ///< chain of all individual values that make up items A, B, A, B, C, A, B, C, D, C, D, D, E
	} PackageMergerList;

	static int compare_by_prob(const void a, const void b)
	{
	const PTable *a2 = a;
	const PTable *b2 = b;
	return a2->prob - b2->prob;
	}

	static void magy_huffman_compute_bits(PTable prob_table, HuffEntry distincts,
	uint16_t codes_counts[33],
	int size, int max_length)
	{
	PackageMergerList list_a, list_b, to = &list_a, from = &list_b, *temp;
	int times, i, j, k;
	int nbits[257] = {0};
	int min;

	av_assert0(max_length > 0);

	to->nitems = 0;
	from->nitems = 0;
	to->item_idx[0] = 0;
	from->item_idx[0] = 0;
	AV_QSORT(prob_table, size, PTable, compare_by_prob);

	for (times = 0; times <= max_length; times++) {
	to->nitems = 0;
	to->item_idx[0] = 0;

	j = 0;
	k = 0;

	if (times < max_length) {
	i = 0;
	}
	while (i < size \|\| j + 1 < from->nitems) {
	to->nitems++;
	to->item_idx[to->nitems] = to->item_idx[to->nitems - 1];
	if (i < size &&
	(j + 1 >= from->nitems \|\|
	prob_table[i].prob <
	from->probability[j] + from->probability[j + 1])) {
	to->items[to->item_idx[to->nitems]++] = prob_table[i].value;
	to->probability[to->nitems - 1] = prob_table[i].prob;
	i++;
	} else {
	for (k = from->item_idx[j]; k < from->item_idx[j + 2]; k++) {
	to->items[to->item_idx[to->nitems]++] = from->items[k];
	}
	to->probability[to->nitems - 1] =
	from->probability[j] + from->probability[j + 1];
	j += 2;
	}
	}
	temp = to;
	to = from;
	from = temp;
	}

	min = (size - 1 < from->nitems) ? size - 1 : from->nitems;
	for (i = 0; i < from->item_idx[min]; i++) {
	nbits[from->items[i]]++;
	}

	for (i = 0; i < size; i++) {
	distincts[i].len = nbits[i];
	codes_counts[nbits[i]]++;
	}
	}

	static int count_plane_slice(AVCodecContext *avctx, int n, int plane)
	{
	MagicYUVContext *s = avctx->priv_data;
	Slice sl = &s->slices[n s->planes + plane];
	const uint8_t *dst = sl->slice;
	PTable *counts = sl->counts;

	memset(counts, 0, sizeof(sl->counts));

	count_usage(dst, AV_CEIL_RSHIFT(avctx->width, s->hshift[plane]),
	AV_CEIL_RSHIFT(s->slice_height, s->vshift[plane]), counts);

	return 0;
	}

	static int encode_table(AVCodecContext *avctx,
	PutBitContext pb, HuffEntry he, int plane)
	{
	MagicYUVContext *s = avctx->priv_data;
	PTable counts[256] = { {0} };
	uint16_t codes_counts[33] = { 0 };

	for (int n = 0; n < s->nb_slices; n++) {
	Slice sl = &s->slices[n s->planes + plane];
	PTable *slice_counts = sl->counts;

	for (int i = 0; i < 256; i++)
	counts[i].prob = slice_counts[i].prob;
	}

	for (int i = 0; i < 256; i++) {
	counts[i].prob++;
	counts[i].value = i;
	}

	magy_huffman_compute_bits(counts, he, codes_counts, 256, 12);

	calculate_codes(he, codes_counts);

	for (int i = 0; i < 256; i++) {
	put_bits(pb, 1, 0);
	put_bits(pb, 7, he[i].len);
	}

	return 0;
	}

	static int encode_plane_slice_raw(const uint8_t src, uint8_t dst, unsigned dst_size,
	int width, int height, int prediction)
	{
	unsigned count = width * height;

	dst[0] = 1;
	dst[1] = prediction;

	memcpy(dst + 2, src, count);
	count += 2;
	AV_WN32(dst + count, 0);
	if (count & 3)
	count += 4 - (count & 3);

	return count;
	}

	static int encode_plane_slice(const uint8_t src, uint8_t dst, unsigned dst_size,
	int width, int height, HuffEntry *he, int prediction)
	{
	const uint8_t *osrc = src;
	PutBitContext pb;
	int count;

	init_put_bits(&pb, dst, dst_size);

	put_bits(&pb, 8, 0);
	put_bits(&pb, 8, prediction);

	for (int j = 0; j < height; j++) {
	for (int i = 0; i < width; i++) {
	const int idx = src[i];
	const int len = he[idx].len;
	if (put_bits_left(&pb) < len + 32)
	return encode_plane_slice_raw(osrc, dst, dst_size, width, height, prediction);
	put_bits(&pb, len, he[idx].code);
	}

	src += width;
	}

	count = put_bits_count(&pb) & 0x1F;

	if (count)
	put_bits(&pb, 32 - count, 0);

	flush_put_bits(&pb);

	return put_bytes_output(&pb);
	}

	static int encode_slice(AVCodecContext avctx, void tdata,
	int n, int threadnr)
	{
	MagicYUVContext *s = avctx->priv_data;
	const int slice_height = s->slice_height;
	const int last_height = FFMIN(slice_height, avctx->height - n * slice_height);
	const int height = (n < (s->nb_slices - 1)) ? slice_height : last_height;

	for (int i = 0; i < s->planes; i++) {
	Slice sl = &s->slices[n s->planes + i];

	sl->size =
	encode_plane_slice(sl->slice,
	sl->bitslice,
	s->bitslice_size,
	AV_CEIL_RSHIFT(avctx->width, s->hshift[i]),
	AV_CEIL_RSHIFT(height, s->vshift[i]),
	s->he[i], s->frame_pred);
	}

	return 0;
	}

	static int predict_slice(AVCodecContext avctx, void tdata,
	int n, int threadnr)
	{
	size_t max_align = av_cpu_max_align();
	const int aligned_width = FFALIGN(avctx->width, max_align);
	MagicYUVContext *s = avctx->priv_data;
	const int slice_height = s->slice_height;
	const int last_height = FFMIN(slice_height, avctx->height - n * slice_height);
	const int height = (n < (s->nb_slices - 1)) ? slice_height : last_height;
	const int width = avctx->width;
	AVFrame *frame = tdata;

	if (s->correlate) {
	uint8_t decorrelated[2] = { s->decorrelate_buf[0] + n slice_height * aligned_width,
	s->decorrelate_buf[1] + n * slice_height * aligned_width };
	const int decorrelate_linesize = aligned_width;
	const uint8_t const data[4] = { decorrelated[0], frame->data[0] + n slice_height * frame->linesize[0],
	decorrelated[1], s->planes == 4 ? frame->data[3] + n * slice_height * frame->linesize[3] : NULL };
	const uint8_t r, g, *b;
	const int linesize[4] = { decorrelate_linesize, frame->linesize[0],
	decorrelate_linesize, frame->linesize[3] };

	g = frame->data[0] + n * slice_height * frame->linesize[0];
	b = frame->data[1] + n * slice_height * frame->linesize[1];
	r = frame->data[2] + n * slice_height * frame->linesize[2];

	for (int i = 0; i < height; i++) {
	s->llvidencdsp.diff_bytes(decorrelated[0], b, g, width);
	s->llvidencdsp.diff_bytes(decorrelated[1], r, g, width);
	g += frame->linesize[0];
	b += frame->linesize[1];
	r += frame->linesize[2];
	decorrelated[0] += decorrelate_linesize;
	decorrelated[1] += decorrelate_linesize;
	}

	for (int i = 0; i < s->planes; i++) {
	Slice sl = &s->slices[n s->planes + i];

	s->predict(s, data[i], sl->slice, linesize[i],
	frame->width, height);
	}
	} else {
	for (int i = 0; i < s->planes; i++) {
	Slice sl = &s->slices[n s->planes + i];

	s->predict(s, frame->data[i] + n * (slice_height >> s->vshift[i]) * frame->linesize[i],
	sl->slice,
	frame->linesize[i],
	AV_CEIL_RSHIFT(frame->width, s->hshift[i]),
	AV_CEIL_RSHIFT(height, s->vshift[i]));
	}
	}

	for (int p = 0; p < s->planes; p++)
	count_plane_slice(avctx, n, p);

	return 0;
	}

	static int magy_encode_frame(AVCodecContext avctx, AVPacket pkt,
	const AVFrame frame, int got_packet)
	{
	MagicYUVContext *s = avctx->priv_data;
	const int width = avctx->width, height = avctx->height;
	const int slice_height = s->slice_height;
	unsigned tables_size;
	PutBitContext pbit;
	PutByteContext pb;
	int pos, ret = 0;

	ret = ff_alloc_packet(avctx, pkt, (256 + 4 * s->nb_slices + width * height) *
	s->planes + 256);
	if (ret < 0)
	return ret;

	bytestream2_init_writer(&pb, pkt->data, pkt->size);
	bytestream2_put_le32(&pb, MKTAG('M', 'A', 'G', 'Y'));
	bytestream2_put_le32(&pb, 32); // header size
	bytestream2_put_byte(&pb, 7); // version
	bytestream2_put_byte(&pb, s->format);
	bytestream2_put_byte(&pb, 12); // max huffman length
	bytestream2_put_byte(&pb, 0);

	bytestream2_put_byte(&pb, 0);
	bytestream2_put_byte(&pb, 0);
	bytestream2_put_byte(&pb, 32); // coder type
	bytestream2_put_byte(&pb, 0);

	bytestream2_put_le32(&pb, avctx->width);
	bytestream2_put_le32(&pb, avctx->height);
	bytestream2_put_le32(&pb, avctx->width);
	bytestream2_put_le32(&pb, slice_height);
	bytestream2_put_le32(&pb, 0);

	for (int i = 0; i < s->planes; i++) {
	bytestream2_put_le32(&pb, 0);
	for (int j = 1; j < s->nb_slices; j++)
	bytestream2_put_le32(&pb, 0);
	}

	bytestream2_put_byte(&pb, s->planes);

	for (int i = 0; i < s->planes; i++) {
	for (int n = 0; n < s->nb_slices; n++)
	bytestream2_put_byte(&pb, n * s->planes + i);
	}

	avctx->execute2(avctx, predict_slice, (void *)frame, NULL, s->nb_slices);

	init_put_bits(&pbit, pkt->data + bytestream2_tell_p(&pb), bytestream2_get_bytes_left_p(&pb));

	for (int i = 0; i < s->planes; i++)
	encode_table(avctx, &pbit, s->he[i], i);

	tables_size = put_bytes_count(&pbit, 1);
	bytestream2_skip_p(&pb, tables_size);

	avctx->execute2(avctx, encode_slice, NULL, NULL, s->nb_slices);

	for (int n = 0; n < s->nb_slices; n++) {
	for (int i = 0; i < s->planes; i++) {
	Slice sl = &s->slices[n s->planes + i];

	sl->pos = bytestream2_tell_p(&pb);

	bytestream2_put_buffer(&pb, sl->bitslice, sl->size);
	}
	}

	pos = bytestream2_tell_p(&pb);
	bytestream2_seek_p(&pb, 32, SEEK_SET);
	bytestream2_put_le32(&pb, s->slices[0].pos - 32);
	for (int i = 0; i < s->planes; i++) {
	for (int n = 0; n < s->nb_slices; n++) {
	Slice sl = &s->slices[n s->planes + i];

	bytestream2_put_le32(&pb, sl->pos - 32);
	}
	}
	bytestream2_seek_p(&pb, pos, SEEK_SET);

	pkt->size = bytestream2_tell_p(&pb);

	*got_packet = 1;

	return 0;
	}

	static av_cold int magy_encode_close(AVCodecContext *avctx)
	{
	MagicYUVContext *s = avctx->priv_data;

	for (int i = 0; i < s->planes * s->nb_slices && s->slices; i++) {
	Slice *sl = &s->slices[i];

	av_freep(&sl->slice);
	av_freep(&sl->bitslice);
	}
	av_freep(&s->slices);
	av_freep(&s->decorrelate_buf);

	return 0;
	}

	#define OFFSET(x) offsetof(MagicYUVContext, x)
	#define VE AV_OPT_FLAG_VIDEO_PARAM \| AV_OPT_FLAG_ENCODING_PARAM
	static const AVOption options[] = {
	{ "pred", "Prediction method", OFFSET(frame_pred), AV_OPT_TYPE_INT, {.i64=LEFT}, LEFT, MEDIAN, VE, .unit = "pred" },
	{ "left", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = LEFT }, 0, 0, VE, .unit = "pred" },
	{ "gradient", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = GRADIENT }, 0, 0, VE, .unit = "pred" },
	{ "median", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = MEDIAN }, 0, 0, VE, .unit = "pred" },
	{ NULL},
	};

	static const AVClass magicyuv_class = {
	.class_name = "magicyuv",
	.item_name = av_default_item_name,
	.option = options,
	.version = LIBAVUTIL_VERSION_INT,
	};

	const FFCodec ff_magicyuv_encoder = {
	.p.name = "magicyuv",
	CODEC_LONG_NAME("MagicYUV video"),
	.p.type = AVMEDIA_TYPE_VIDEO,
	.p.id = AV_CODEC_ID_MAGICYUV,
	.p.capabilities = AV_CODEC_CAP_DR1 \| AV_CODEC_CAP_FRAME_THREADS \|
	AV_CODEC_CAP_SLICE_THREADS \|
	AV_CODEC_CAP_ENCODER_REORDERED_OPAQUE,
	.priv_data_size = sizeof(MagicYUVContext),
	.p.priv_class = &magicyuv_class,
	.init = magy_encode_init,
	.close = magy_encode_close,
	FF_CODEC_ENCODE_CB(magy_encode_frame),
	.p.pix_fmts = (const enum AVPixelFormat[]) {
	AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRAP, AV_PIX_FMT_YUV422P,
	AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUVA444P, AV_PIX_FMT_GRAY8,
	AV_PIX_FMT_NONE
	},
	.caps_internal = FF_CODEC_CAP_INIT_CLEANUP,
	};