Packman Build Service PMBS

We truncated the diff of some files because they were too big. If you want to see the full diff for every file, click here.

Changes of Revision 18

kvazaar.changes Changed

@@ -1,4 +1,40 @@
 -------------------------------------------------------------------
+Wed Jan  4 11:29:30 UTC 2023 - Luigi Baldoni <aloisio@gmx.com>
+
+- Update to version 2.2.0
+  Features:
+  * Updated Region of Interest (ROI) functionality to allow
+    separate ROI map for each frame
+  * Improve inter search
+  * Update cabac context during search to improve the accuracy
+    of bit cost estimation
+  * Move intra chroma search option from --rd 3 to its own
+    option --(no-)intra-chroma-search and fast bipred to
+    --(no-)fast-bipred
+  * Change maximum rd level to 4, where 3 performs more rd
+    search for inter and 4 performs full intra search
+  * Add --(no-)combine-intra-cus for controlling whether the
+    larger intra blocks are tried even when search at current
+    depth is disabled
+  * Add --force-inter for debugging purposes to force all PUs in
+    inter slices to use best inter mode
+  Optimizations:
+  * AVX2 implementations of bidirectional blending
+  Fixes:
+  * Make sure the dpb is more than max_num_reorder_pics 899c672
+  * Compute proper count of buffered frames for vps and sps. Use
+    common function d4880be
+  * Fix some strategy function pointer signatures a400504
+  External contributors:
+  * build: fix automake warning by @bradh
+  * cli: add missing newlines in usage by @bradh
+  * refactor SEI by @bradh
+  * cli: minor api doc fix by @bradh
+  * add sudo ldconfig by @binbinzhm
+  * Enable -mpopcnt and -mlzcnt on AVX2 by @klondi
+- Bump sover to 7
+
+-------------------------------------------------------------------
 Wed Oct 13 15:32:42 UTC 2021 - Luigi Baldoni <aloisio@gmx.com>
 
 - Update to version 2.1.0

kvazaar.spec Changed

kvazaar-2.1.0.tar.gz/LICENSE -> kvazaar-2.2.0.tar.gz/LICENSE Changed

kvazaar-2.2.0.tar.gz/LICENSE.EXT.greatest Changed

kvazaar-2.2.0.tar.gz/LICENSE.EXT.x264asm Changed

kvazaar-2.1.0.tar.gz/Makefile.am -> kvazaar-2.2.0.tar.gz/Makefile.am Changed

kvazaar-2.1.0.tar.gz/README.md -> kvazaar-2.2.0.tar.gz/README.md Changed

@@ -2,7 +2,7 @@
 =======
 An open-source HEVC encoder licensed under 3-clause BSD
 
-Join channel [#ultravideo](https://web.libera.chat/#ultravideo) in [Libera.Chat](https://libera.chat/) IRC network to contact us.
+Join channel [#ultravideo](https://web.libera.chat/#ultravideo) in [Libera.Chat](https://libera.chat/) IRC network to contact us or come to our Discord [![Discord](https://img.shields.io/discord/973260924288901140?style=plastic)](https://discord.gg/fZpub7BPUA)
 
 Kvazaar is still under development. Speed and RD-quality will continue to improve.
 
@@ -49,7 +49,7 @@
 
 ### Parameters
 
-[comment]: # (BEGIN KVAZAAR HELP MESSAGE)
+[comment]: # "BEGIN KVAZAAR HELP MESSAGE"
 ```
 Usage:
 kvazaar -i <input> --input-res <width>x<height> -o <output>
@@ -156,11 +156,20 @@
                                    - frametile: Constrain within the tile.
                                    - frametilemargin: Constrain even more.
       --roi <filename>       : Use a delta QP map for region of interest.
-                               Reads an array of delta QP values from a text
-                               file. The file format is: width and height of
-                               the QP delta map followed by width*height delta
-                               QP values in raster order. The map can be of any
-                               size and will be scaled to the video size.
+                               Reads an array of delta QP values from a file.
+                               Text and binary files are supported and detected
+                               from the file extension (.txt/.bin). If a known
+                               extension is not found, the file is treated as
+                               a text file. The file can include one or many
+                               ROI frames each in the following format:
+                               width and height of the QP delta map followed
+                               by width * height delta QP values in raster
+                               order. In binary format, width and height are
+                               32-bit integers whereas the delta QP values are
+                               signed 8-bit values. The map can be of any size
+                               and will be scaled to the video size. The file
+                               reading will loop if end of the file is reached.
+                               See roi.txt in the examples folder.
       --set-qp-in-cu         : Set QP at CU level keeping pic_init_qp_minus26.
                                in PPS and slice_qp_delta in slice header zero.
       --(no-)erp-aqp         : Use adaptive QP for 360 degree video with
@@ -191,18 +200,22 @@
       --(no-)signhide        : Sign hiding disabled
       --(no-)smp             : Symmetric motion partition disabled
       --(no-)amp             : Asymmetric motion partition disabled
-      --rd <integer>         : Intra mode search complexity 0
+      --rd <integer>         : Mode search complexity 0
                                    - 0: Skip intra if inter is good enough.
                                    - 1: Rough intra mode search with SATD.
-                                   - 2: Refine intra mode search with SSE.
-                                   - 3: Try all intra modes and enable intra
-                                        chroma mode search.
+                                   - 2: Refine mode search with SSE.
+                                   - 3: More SSE candidates for inter and
+                                        chroma mode search for 4x4 intra.
+                                   - 4: Even more SSE candidates for both.
+                                   - 5: Try all intra modes.
       --(no-)mv-rdo          : Rate-distortion optimized motion vector costs
                                disabled
       --(no-)zero-coeff-rdo  : If a CU is set inter, check if forcing zero
                                residual improves the RD cost. enabled
       --(no-)full-intra-search : Try all intra modes during rough search.
                                disabled
+      --(no-)intra-chroma-search : Test non-derived intra chroma modes.
+                                   disabled
       --(no-)transform-skip  : Try transform skip disabled
       --me <string>          : Integer motion estimation algorithm hexbs
                                    - hexbs: Hexagon Based Search
@@ -218,6 +231,7 @@
                                    - 2: + 1/2-pixel diagonal
                                    - 3: + 1/4-pixel horizontal and vertical
                                    - 4: + 1/4-pixel diagonal
+      --(no-)fast-bipred     : Only perform fast bipred search. enabled
       --pu-depth-inter <int>-<int> : Inter prediction units sizes 0-3
                                    - 0, 1, 2, 3: from 64x64 to 8x8
                                    - Accepts a list of values separated by ','
@@ -233,6 +247,16 @@
       --ml-pu-depth-intra    : Predict the pu-depth-intra using machine
                                 learning trees, overrides the
                                 --pu-depth-intra parameter. disabled
+      --(no-)combine-intra-cus: Whether the encoder tries to code a cu
+                                   on lower depth even when search is not
+                                   performed on said depth. Should only
+                                   be disabled if cus absolutely must not
+                                   be larger than limited by the search.
+                                   enabled
+      --force-inter          : Force the encoder to use inter always.
+                               This is mostly for debugging and is not
+                               guaranteed to produce sensible bitstream or
+                               work at all. disabled
       --tr-depth-intra <int> : Transform split depth for intra blocks 0
       --(no-)bipred          : Bi-prediction disabled
       --cu-split-termination <string> : CU split search termination zero
@@ -287,8 +311,8 @@
                                    - u<int>: Number of tile columns of uniform
                                              width.
       --tiles-height-split <string>|u<int> :
-                                   - <string>: A comma-separated list of tile row
-                                               column pixel coordinates.
+                                   - <string>: A comma-separated list of tile
+                                               row column pixel coordinates.
                                    - u<int>: Number of tile rows of uniform
                                              height.
       --slices <string>      : Control how slices are used.
@@ -328,7 +352,7 @@
   -w, --width <integer>       : Use --input-res.
   -h, --height <integer>      : Use --input-res.
 ```
-[comment]: # (END KVAZAAR HELP MESSAGE)
+[comment]: # "END KVAZAAR HELP MESSAGE"
 
 
 ### LP-GOP syntax
@@ -410,6 +434,7 @@
     ./configure
     make
     sudo make install
+    sudo ldconfig
 
 See `./configure --help` for more options.

kvazaar-2.1.0.tar.gz/appveyor.yml -> kvazaar-2.2.0.tar.gz/appveyor.yml Changed

kvazaar-2.1.0.tar.gz/configure.ac -> kvazaar-2.2.0.tar.gz/configure.ac Changed

kvazaar-2.1.0.tar.gz/doc/kvazaar.1 -> kvazaar-2.2.0.tar.gz/doc/kvazaar.1 Changed

@@ -1,4 +1,4 @@
-.TH KVAZAAR "1" "October 2021" "kvazaar v2.1.0" "User Commands"
+.TH KVAZAAR "1" "January 2023" "kvazaar v2.2.0" "User Commands"
 .SH NAME
 kvazaar \- open source HEVC encoder
 .SH SYNOPSIS
@@ -180,11 +180,20 @@
 .TP
 \fB\-\-roi <filename>      
 Use a delta QP map for region of interest.
-Reads an array of delta QP values from a text
-file. The file format is: width and height of
-the QP delta map followed by width*height delta
-QP values in raster order. The map can be of any
-size and will be scaled to the video size.
+Reads an array of delta QP values from a file.
+Text and binary files are supported and detected
+from the file extension (.txt/.bin). If a known
+extension is not found, the file is treated as
+a text file. The file can include one or many
+ROI frames each in the following format:
+width and height of the QP delta map followed
+by width * height delta QP values in raster
+order. In binary format, width and height are
+32\-bit integers whereas the delta QP values are
+signed 8\-bit values. The map can be of any size
+and will be scaled to the video size. The file
+reading will loop if end of the file is reached.
+See roi.txt in the examples folder.
 .TP
 \fB\-\-set\-qp\-in\-cu        
 Set QP at CU level keeping pic_init_qp_minus26.
@@ -243,12 +252,14 @@
 Asymmetric motion partition disabled
 .TP
 \fB\-\-rd <integer>        
-Intra mode search complexity 0
+Mode search complexity 0
     \- 0: Skip intra if inter is good enough.
     \- 1: Rough intra mode search with SATD.
-    \- 2: Refine intra mode search with SSE.
-    \- 3: Try all intra modes and enable intra
-         chroma mode search.
+    \- 2: Refine mode search with SSE.
+    \- 3: More SSE candidates for inter and
+         chroma mode search for 4x4 intra.
+    \- 4: Even more SSE candidates for both.
+    \- 5: Try all intra modes.
 .TP
 \fB\-\-(no\-)mv\-rdo         
 Rate\-distortion optimized motion vector costs
@@ -262,6 +273,10 @@
 Try all intra modes during rough search.
 disabled
 .TP
+\fB\-\-(no\-)intra\-chroma\-search
+Test non\-derived intra chroma modes.
+    disabled
+.TP
 \fB\-\-(no\-)transform\-skip 
 Try transform skip disabled
 .TP
@@ -285,6 +300,9 @@
     \- 3: + 1/4\-pixel horizontal and vertical
     \- 4: + 1/4\-pixel diagonal
 .TP
+\fB\-\-(no\-)fast\-bipred    
+Only perform fast bipred search. enabled
+.TP
 \fB\-\-pu\-depth\-inter <int>\-<int>
 Inter prediction units sizes 0\-3
     \- 0, 1, 2, 3: from 64x64 to 8x8
@@ -306,6 +324,19 @@
  learning trees, overrides the
  \-\-pu\-depth\-intra parameter. disabled
 .TP
+\fB\-\-(no\-)combine\-intra\-cus: Whether the encoder tries to code a cu
+    on lower depth even when search is not
+    performed on said depth. Should only
+    be disabled if cus absolutely must not
+    be larger than limited by the search.
+    enabled
+.TP
+\fB\-\-force\-inter         
+Force the encoder to use inter always.
+This is mostly for debugging and is not
+guaranteed to produce sensible bitstream or
+work at all. disabled
+.TP
 \fB\-\-tr\-depth\-intra <int>
 Transform split depth for intra blocks 0
 .TP
@@ -398,8 +429,8 @@
               width.
 .TP
 \fB\-\-tiles\-height\-split <string>|u<int>
-    \- <string>: A comma\-separated list of tile row
-                column pixel coordinates.
+    \- <string>: A comma\-separated list of tile
+                row column pixel coordinates.
     \- u<int>: Number of tile rows of uniform
               height.
 .TP

kvazaar-2.1.0.tar.gz/src/Makefile.am -> kvazaar-2.2.0.tar.gz/src/Makefile.am Changed

kvazaar-2.1.0.tar.gz/src/bitstream.c -> kvazaar-2.2.0.tar.gz/src/bitstream.c Changed

kvazaar-2.1.0.tar.gz/src/cabac.c -> kvazaar-2.2.0.tar.gz/src/cabac.c Changed

@@ -95,6 +95,7 @@
   data->num_buffered_bytes = 0;
   data->buffered_byte = 0xff;
   data->only_count = 0; // By default, write bits out
+  data->update = 0; 
 }
 
 /**
@@ -103,8 +104,7 @@
 void kvz_cabac_encode_bin(cabac_data_t * const data, const uint32_t bin_value)
 {
   uint32_t lps;
-
-
+  
   lps = kvz_g_auc_lpst_tableCTX_STATE(data->cur_ctx)(data->range >> 6) & 3;
   data->range -= lps;
 
@@ -272,15 +272,19 @@
  * \param symbol Value of coeff_abs_level_minus3.
  * \param r_param Reference to Rice parameter.
  */
-void kvz_cabac_write_coeff_remain(cabac_data_t * const cabac, const uint32_t symbol, const uint32_t r_param)
+int kvz_cabac_write_coeff_remain(cabac_data_t* const cabac, const uint32_t symbol, const uint32_t r_param)
 {
   int32_t code_number = symbol;
   uint32_t length;
 
+  int bits = 0;
+
   if (code_number < (3 << r_param)) {
     length = code_number >> r_param;
     CABAC_BINS_EP(cabac, (1 << (length + 1)) - 2 , length + 1, "coeff_abs_level_remaining");
+    bits += length + 1;
     CABAC_BINS_EP(cabac, (code_number % (1 << r_param)), r_param, "coeff_abs_level_remaining");
+    bits += r_param;
   } else {
     length = r_param;
     code_number = code_number - (3 << r_param);
@@ -289,8 +293,11 @@
       ++length;
     }
     CABAC_BINS_EP(cabac, (1 << (3 + length + 1 - r_param)) - 2, 3 + length + 1 - r_param, "coeff_abs_level_remaining");
+    bits += 3 + length + 1 - r_param;
     CABAC_BINS_EP(cabac, code_number, length, "coeff_abs_level_remaining");
+    bits += length;
   }
+  return bits;
 }
 
 void kvz_cabac_write_coeff_remain_encry(struct encoder_state_t * const state, cabac_data_t * const cabac,const uint32_t symbol, const uint32_t r_param, int32_t base_level)
@@ -488,26 +495,28 @@
 /**
  * \brief
  */
-void kvz_cabac_write_unary_max_symbol(cabac_data_t * const data, cabac_ctx_t * const ctx, uint32_t symbol, const int32_t offset, const uint32_t max_symbol)
+void kvz_cabac_write_unary_max_symbol(cabac_data_t * const data, 
+  cabac_ctx_t * const ctx, 
+  uint32_t symbol,
+  const int32_t offset,
+  const uint32_t max_symbol, 
+  double* bits_out)
 {
   int8_t code_last = max_symbol > symbol;
 
   assert(symbol <= max_symbol);
 
   if (!max_symbol) return;
-
-  data->cur_ctx = &ctx0;
-  CABAC_BIN(data, symbol, "ums");
+  
+  CABAC_FBITS_UPDATE(data, &ctx0, symbol, *bits_out, "ums");
 
   if (!symbol) return;
 
   while (--symbol) {
-    data->cur_ctx = &ctxoffset;
-    CABAC_BIN(data, 1, "ums");
+    CABAC_FBITS_UPDATE(data, &ctxoffset, 1, *bits_out, "ums");
   }
   if (code_last) {
-    data->cur_ctx = &ctxoffset;
-    CABAC_BIN(data, 0, "ums");
+    CABAC_FBITS_UPDATE(data, &ctxoffset, 0,*bits_out, "ums");
   }
 }
 
@@ -544,7 +553,7 @@
 /**
  * \brief
  */
-void kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state,
+uint32_t kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state,
                                   cabac_data_t * const data,
                                   uint32_t symbol,
                                   uint32_t count)
@@ -572,5 +581,6 @@
       bins                     = ( (bins >> (num_bins >>1) ) << (num_bins >>1) ) | state->crypto_prev_pos;
     }
   }
-  kvz_cabac_encode_bins_ep(data, bins, num_bins);
+  CABAC_BINS_EP(data, bins, num_bins, "ep_ex_golomb");
+  return num_bins;
 }

kvazaar-2.1.0.tar.gz/src/cabac.h -> kvazaar-2.2.0.tar.gz/src/cabac.h Changed

@@ -58,7 +58,8 @@
   uint32_t   buffered_byte;
   int32_t    num_buffered_bytes;
   int32_t    bits_left;
-  int8_t     only_count;
+  int8_t     only_count : 4;
+  int8_t     update : 4;
   bitstream_t *stream;
 
   // CONTEXTS
@@ -115,17 +116,27 @@
 void kvz_cabac_encode_bin_trm(cabac_data_t *data, uint8_t bin_value);
 void kvz_cabac_write(cabac_data_t *data);
 void kvz_cabac_finish(cabac_data_t *data);
-void kvz_cabac_write_coeff_remain(cabac_data_t *cabac, uint32_t symbol,
-                              uint32_t r_param);
+int kvz_cabac_write_coeff_remain(cabac_data_t* cabac, uint32_t symbol,
+                                 uint32_t r_param);
 void kvz_cabac_write_coeff_remain_encry(struct encoder_state_t * const state, cabac_data_t * const cabac, const uint32_t symbol,
                                         const uint32_t r_param, int32_t base_level);
-void kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data,
+uint32_t kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data,
                                   uint32_t symbol, uint32_t count);
 void kvz_cabac_write_unary_max_symbol(cabac_data_t *data, cabac_ctx_t *ctx,
-                                  uint32_t symbol, int32_t offset,
-                                  uint32_t max_symbol);
+                                      uint32_t symbol, int32_t offset,
+                                      uint32_t max_symbol, double* bits_out);
 void kvz_cabac_write_unary_max_symbol_ep(cabac_data_t *data, unsigned int symbol, unsigned int max_symbol);
 
+extern const float kvz_f_entropy_bits128;
+#define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits(ctx)->uc_state ^ (val)
+
+#define CABAC_FBITS_UPDATE(cabac, ctx, val, bits, name) do { \
+  if((cabac)->only_count) (bits) += kvz_f_entropy_bits(ctx)->uc_state ^ (val); \
+  if((cabac)->update) {\
+    (cabac)->cur_ctx = ctx;\
+    CABAC_BIN((cabac), (val), (name));\
+  } \
+} while(0)
 
 // Macros
 #define CTX_STATE(ctx) ((ctx)->uc_state >> 1)
@@ -133,24 +144,25 @@
 #define CTX_UPDATE_LPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_lps (ctx)->uc_state ; }
 #define CTX_UPDATE_MPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_mps (ctx)->uc_state ; }
 
+
 #ifdef VERBOSE
   #define CABAC_BIN(data, value, name) { \
-    uint32_t prev_state = (data)->ctx->uc_state; \
-    kvz_cabac_encode_bin((data), (value)) \
-    printf("%s = %u, state = %u -> %u\n", \
-           (name), (uint32_t)(value), prev_state, (data)->ctx->uc_state); }
+    uint32_t prev_state = (data)->cur_ctx->uc_state; \
+    kvz_cabac_encode_bin((data), (value)); \
+    if(!(data)->only_count)  printf("%s = %u, state = %u -> %u MPS = %u\n", \
+           (name), (uint32_t)(value), prev_state, (data)->cur_ctx->uc_state, CTX_MPS((data)->cur_ctx)); }
 
   #define CABAC_BINS_EP(data, value, bins, name) { \
-    uint32_t prev_state = (data)->ctx->uc_state; \
+    uint32_t prev_state = (data)->cur_ctx->uc_state; \
     kvz_cabac_encode_bins_ep((data), (value), (bins)); \
-    printf("%s = %u(%u bins), state = %u -> %u\n", \
-           (name), (uint32_t)(value), (bins), prev_state, (data)->ctx->uc_state); }
+    if(!(data)->only_count) printf("%s = %u(%u bins), state = %u -> %u\n", \
+           (name), (uint32_t)(value), (bins), prev_state, (data)->cur_ctx->uc_state); }
 
   #define CABAC_BIN_EP(data, value, name) { \
-    uint32_t prev_state = (data)->ctx->uc_state; \
+    uint32_t prev_state = (data)->cur_ctx->uc_state; \
     kvz_cabac_encode_bin_ep((data), (value)); \
-    printf("%s = %u, state = %u -> %u\n", \
-           (name), (uint32_t)(value), prev_state, (data)->ctx->uc_state); }
+    if(!(data)->only_count) printf("%s = %u, state = %u -> %u\n", \
+           (name), (uint32_t)(value), prev_state, (data)->cur_ctx->uc_state); }
 #else
   #define CABAC_BIN(data, value, name) \
     kvz_cabac_encode_bin((data), (value));

kvazaar-2.1.0.tar.gz/src/cfg.c -> kvazaar-2.2.0.tar.gz/src/cfg.c Changed

@@ -139,9 +139,9 @@
   cfg->gop_lp_definition.t = 1;
   cfg->open_gop = true;
 
-  cfg->roi.width = 0;
-  cfg->roi.height = 0;
-  cfg->roi.dqps = NULL;
+  cfg->roi.file_path = NULL;
+  cfg->roi.format = KVZ_ROI_TXT;
+
   cfg->set_qp_in_cu = false;
 
   cfg->erp_aqp = false;
@@ -183,6 +183,11 @@
   cfg->fastrd_sampling_on = 0;
   cfg->fastrd_accuracy_check_on = 0;
   cfg->fastrd_learning_outdir_fn = NULL;
+
+  cfg->combine_intra_cus = 1;
+  cfg->force_inter = 0;
+  cfg->intra_chroma_search = 0;
+  cfg->fast_bipred = 1;
   return 1;
 }
 
@@ -190,11 +195,11 @@
 {
   if (cfg) {
     FREE_POINTER(cfg->cqmfile);
+    FREE_POINTER(cfg->roi.file_path);
     FREE_POINTER(cfg->fast_coeff_table_fn);
     FREE_POINTER(cfg->tiles_width_split);
     FREE_POINTER(cfg->tiles_height_split);
     FREE_POINTER(cfg->slice_addresses_in_ts);
-    FREE_POINTER(cfg->roi.dqps);
     FREE_POINTER(cfg->optional_key);
     FREE_POINTER(cfg->fastrd_learning_outdir_fn);
   }
@@ -700,7 +705,7 @@
       },
       {
         "veryslow",
-        "rd", "2",
+        "rd", "3",
         "pu-depth-intra", "1-4",
         "pu-depth-inter", "0-3",
         "me", "tz",
@@ -728,7 +733,7 @@
       },
       {
         "placebo",
-        "rd", "2",
+        "rd", "3",
         "pu-depth-intra", "1-4",
         "pu-depth-inter", "0-3",
         "me", "tz",
@@ -1241,60 +1246,29 @@
   }
   else if OPT("implicit-rdpcm")
     cfg->implicit_rdpcm = (bool)atobool(value);
-  else if OPT("roi") {
-    // The ROI description is as follows:
-    // First number is width, second number is height,
-    // then follows width * height number of dqp values.
-    FILE* f = fopen(value, "rb");
-    if (!f) {
-      fprintf(stderr, "Could not open ROI file.\n");
-      return 0;
-    }
-
-    int width = 0;
-    int height = 0;
-    if (!fscanf(f, "%d", &width) || !fscanf(f, "%d", &height)) {
-      fprintf(stderr, "Failed to read ROI size.\n");
-      fclose(f);
-      return 0;
-    }
-
-    if (width <= 0 || height <= 0) {
-      fprintf(stderr, "Invalid ROI size: %dx%d.\n", width, height);
-      fclose(f);
-      return 0;
-    }
 
-    if (width > 10000 || height > 10000) {
-      fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n");
-      fclose(f);
-      return 0;
-    }
+  else if OPT("roi") {
+    static enum kvz_roi_format const formats = { KVZ_ROI_TXT, KVZ_ROI_BIN };
+    static const char * const format_names = { "txt", "bin", NULL };
 
-    const unsigned size = width * height;
-    int8_t *dqp_array  = calloc((size_t)size, sizeof(cfg->roi.dqps0));
-    if (!dqp_array) {
-      fprintf(stderr, "Failed to allocate memory for ROI table.\n");
-      fclose(f);
+    char *roi_file = strdup(value);
+    if (!roi_file) {
+      fprintf(stderr, "Failed to allocate memory for ROI file name.\n");
       return 0;
     }
+    FREE_POINTER(cfg->roi.file_path);
+    cfg->roi.file_path = roi_file;
 
-    FREE_POINTER(cfg->roi.dqps);
-    cfg->roi.dqps   = dqp_array;
-    cfg->roi.width  = width;
-    cfg->roi.height = height;
-
-    for (int i = 0; i < size; ++i) {
-      int number; // Need a pointer to int for fscanf
-      if (fscanf(f, "%d", &number) != 1) {
-        fprintf(stderr, "Reading ROI file failed.\n");
-        fclose(f);
-        return 0;
-      }
-      dqp_arrayi = CLIP(-51, 51, number);
+    // Get file extension or the substring after the last dot
+    char *maybe_extension = strrchr(cfg->roi.file_path, '.');
+    if (!maybe_extension) {
+      cfg->roi.format = KVZ_ROI_TXT;
+    } else {
+      maybe_extension++;
+      int8_t format;
+      bool unknown_format = !parse_enum(maybe_extension, format_names, &format);
+      cfg->roi.format = unknown_format ? KVZ_ROI_TXT : formatsformat;
     }
-
-    fclose(f);
   }
   else if OPT("set-qp-in-cu") {
     cfg->set_qp_in_cu = (bool)atobool(value);
@@ -1421,6 +1395,18 @@
   else if OPT("stats-file-prefix") {
     cfg->stats_file_prefix = strdup(value);
   }
+  else if OPT("combine-intra-cus") {
+    cfg->combine_intra_cus = atobool(value);
+  }
+  else if OPT("force-inter") {
+    cfg->force_inter = atobool(value);
+  }
+  else if OPT("intra-chroma-search") {
+    cfg->intra_chroma_search = atobool(value);
+  }
+  else if OPT("fast-bipred") {
+    cfg->fast_bipred = atobool(value);
+  }
   else {
     return 0;
   }
@@ -1612,8 +1598,8 @@
     error = 1;
   }
 
-  if (cfg->rdo < 0 || cfg->rdo > 3) {
-    fprintf(stderr, "Input error: --rd parameter out of range 0..3\n");
+  if (cfg->rdo < 0 || cfg->rdo > 6) {
+    fprintf(stderr, "Input error: --rd parameter out of range 0..5\n");
     error = 1;
   }

kvazaar-2.1.0.tar.gz/src/cli.c -> kvazaar-2.2.0.tar.gz/src/cli.c Changed

@@ -141,6 +141,7 @@
   { "force-level",        required_argument, NULL, 0 },
   { "high-tier",                no_argument, NULL, 0 },
   { "me-steps",           required_argument, NULL, 0 },
+  { "roi-file",           required_argument, NULL, 0 },
   { "fast-residual-cost", required_argument, NULL, 0 },
   { "set-qp-in-cu",             no_argument, NULL, 0 },
   { "open-gop",                 no_argument, NULL, 0 },
@@ -167,6 +168,14 @@
   { "fastrd-sampling",          no_argument, NULL, 0 },
   { "fastrd-accuracy-check",    no_argument, NULL, 0 },
   { "fastrd-outdir",      required_argument, NULL, 0 },
+  { "combine-intra-cus",        no_argument, NULL, 0 },
+  { "no-combine-intra-cus",     no_argument, NULL, 0 },
+  { "force-inter",              no_argument, NULL, 0 },
+  { "no-force-inter",           no_argument, NULL, 0 },
+  { "intra-chroma-search",      no_argument, NULL, 0 },
+  { "no-intra-chroma-search",   no_argument, NULL, 0 },
+  { "fast-bipred",              no_argument, NULL, 0 },
+  { "no-fast-bipred",           no_argument, NULL, 0 },
   {0, 0, 0, 0}
 };
 
@@ -500,11 +509,20 @@
     "                                   - frametile: Constrain within the tile.\n"
     "                                   - frametilemargin: Constrain even more.\n"
     "      --roi <filename>       : Use a delta QP map for region of interest.\n"
-    "                               Reads an array of delta QP values from a text\n"
-    "                               file. The file format is: width and height of\n"
-    "                               the QP delta map followed by width*height delta\n"
-    "                               QP values in raster order. The map can be of any\n"
-    "                               size and will be scaled to the video size.\n"
+    "                               Reads an array of delta QP values from a file.\n"
+    "                               Text and binary files are supported and detected\n"
+    "                               from the file extension (.txt/.bin). If a known\n"
+    "                               extension is not found, the file is treated as\n"
+    "                               a text file. The file can include one or many\n"
+    "                               ROI frames each in the following format:\n"
+    "                               width and height of the QP delta map followed\n"
+    "                               by width * height delta QP values in raster\n"
+    "                               order. In binary format, width and height are\n"
+    "                               32-bit integers whereas the delta QP values are\n"
+    "                               signed 8-bit values. The map can be of any size\n"
+    "                               and will be scaled to the video size. The file\n"
+    "                               reading will loop if end of the file is reached.\n"
+    "                               See roi.txt in the examples folder.\n"
     "      --set-qp-in-cu         : Set QP at CU level keeping pic_init_qp_minus26.\n"
     "                               in PPS and slice_qp_delta in slize header zero.\n"
     "      --(no-)erp-aqp         : Use adaptive QP for 360 degree video with\n"
@@ -536,18 +554,22 @@
     "      --(no-)signhide        : Sign hiding disabled\n"
     "      --(no-)smp             : Symmetric motion partition disabled\n"
     "      --(no-)amp             : Asymmetric motion partition disabled\n"
-    "      --rd <integer>         : Intra mode search complexity 0\n"
+    "      --rd <integer>         : Mode search complexity 0\n"
     "                                   - 0: Skip intra if inter is good enough.\n"
     "                                   - 1: Rough intra mode search with SATD.\n"
-    "                                   - 2: Refine intra mode search with SSE.\n"
-    "                                   - 3: Try all intra modes and enable intra\n"
-    "                                        chroma mode search.\n"
+    "                                   - 2: Refine mode search with SSE.\n"
+    "                                   - 3: More SSE candidates for inter and\n"
+    "                                        chroma mode search for 4x4 intra.\n"
+    "                                   - 4: Even more SSE candidates for both.\n"
+    "                                   - 5: Try all intra modes.\n"
     "      --(no-)mv-rdo          : Rate-distortion optimized motion vector costs\n"
     "                               disabled\n"
     "      --(no-)zero-coeff-rdo  : If a CU is set inter, check if forcing zero\n"
     "                               residual improves the RD cost. enabled\n"
     "      --(no-)full-intra-search : Try all intra modes during rough search.\n"
     "                               disabled\n"
+    "      --(no-)intra-chroma-search : Test non-derived intra chroma modes.\n"
+    "                                   disabled\n"
     "      --(no-)transform-skip  : Try transform skip disabled\n"
     "      --me <string>          : Integer motion estimation algorithm hexbs\n"
     "                                   - hexbs: Hexagon Based Search\n"
@@ -563,6 +585,7 @@
     "                                   - 2: + 1/2-pixel diagonal\n"
     "                                   - 3: + 1/4-pixel horizontal and vertical\n"
     "                                   - 4: + 1/4-pixel diagonal\n"
+    "      --(no-)fast-bipred     : Only perform fast bipred search. enabled\n"
     "      --pu-depth-inter <int>-<int> : Inter prediction units sizes 0-3\n"
     "                                   - 0, 1, 2, 3: from 64x64 to 8x8\n"
     "                                   - Accepts a list of values separated by ','\n"
@@ -578,6 +601,16 @@
     "      --ml-pu-depth-intra    : Predict the pu-depth-intra using machine\n"
     "                                learning trees, overrides the\n"
     "                                --pu-depth-intra parameter. disabled\n"
+    "      --(no-)combine-intra-cus: Whether the encoder tries to code a cu\n"
+    "                                   on lower depth even when search is not\n"
+    "                                   performed on said depth. Should only\n"
+    "                                   be disabled if cus absolutely must not\n"
+    "                                   be larger than limited by the search.\n"
+    "                                   enabled\n"
+    "      --force-inter          : Force the encoder to use inter always.\n"
+    "                               This is mostly for debugging and is not\n"
+    "                               guaranteed to produce sensible bitstream or\n"
+    "                               work at all. disabled\n"
     "      --tr-depth-intra <int> : Transform split depth for intra blocks 0\n"
     "      --(no-)bipred          : Bi-prediction disabled\n"
     "      --cu-split-termination <string> : CU split search termination zero\n"
@@ -633,8 +666,8 @@
     "                                   - u<int>: Number of tile columns of uniform\n"
     "                                             width.\n"
     "      --tiles-height-split <string>|u<int> :\n"
-    "                                   - <string>: A comma-separated list of tile row\n"
-    "                                               column pixel coordinates.\n"
+    "                                   - <string>: A comma-separated list of tile\n"
+    "                                               row column pixel coordinates.\n"
     "                                   - u<int>: Number of tile rows of uniform\n"
     "                                             height.\n"
     "      --slices <string>      : Control how slices are used.\n"

kvazaar-2.1.0.tar.gz/src/cli.h -> kvazaar-2.2.0.tar.gz/src/cli.h Changed

kvazaar-2.1.0.tar.gz/src/encmain.c -> kvazaar-2.2.0.tar.gz/src/encmain.c Changed

kvazaar-2.1.0.tar.gz/src/encode_coding_tree.c -> kvazaar-2.2.0.tar.gz/src/encode_coding_tree.c Changed

@@ -63,11 +63,12 @@
 void kvz_encode_last_significant_xy(cabac_data_t * const cabac,
                                     uint8_t lastpos_x, uint8_t lastpos_y,
                                     uint8_t width, uint8_t height,
-                                    uint8_t type, uint8_t scan)
+                                    uint8_t type, uint8_t scan, double* bits_out)
 {
   const int index = kvz_math_floor_log2(width) - 2;
   uint8_t ctx_offset = type ? 0 : (index * 3 + (index + 1) / 4);
   uint8_t shift = type ? index : (index + 3) / 4;
+  double bits = 0;
 
   cabac_ctx_t *base_ctx_x = (type ? cabac->ctx.cu_ctx_last_x_chroma : cabac->ctx.cu_ctx_last_x_luma);
   cabac_ctx_t *base_ctx_y = (type ? cabac->ctx.cu_ctx_last_y_chroma : cabac->ctx.cu_ctx_last_y_luma);
@@ -81,37 +82,36 @@
 
   // x prefix
   for (int last_x = 0; last_x < group_idx_x; last_x++) {
-    cabac->cur_ctx = &base_ctx_x[ctx_offset + (last_x >> shift)];
-    CABAC_BIN(cabac, 1, "last_sig_coeff_x_prefix");
+    CABAC_FBITS_UPDATE(cabac, &base_ctx_x[ctx_offset + (last_x >> shift)], 1, bits, "last_sig_coeff_x_prefix");
   }
   if (group_idx_x < g_group_idx[width - 1]) {
-    cabac->cur_ctx = &base_ctx_x[ctx_offset + (group_idx_x >> shift)];
-    CABAC_BIN(cabac, 0, "last_sig_coeff_x_prefix");
+    CABAC_FBITS_UPDATE(cabac, &base_ctx_x[ctx_offset + (group_idx_x >> shift)], 0, bits, "last_sig_coeff_x_prefix");
   }
 
   // y prefix
   for (int last_y = 0; last_y < group_idx_y; last_y++) {
-    cabac->cur_ctx = &base_ctx_y[ctx_offset + (last_y >> shift)];
-    CABAC_BIN(cabac, 1, "last_sig_coeff_y_prefix");
+    CABAC_FBITS_UPDATE(cabac, &base_ctx_y[ctx_offset + (last_y >> shift)], 1, bits, "last_sig_coeff_y_prefix");
   }
   if (group_idx_y < g_group_idx[height - 1]) {
-    cabac->cur_ctx = &base_ctx_y[ctx_offset + (group_idx_y >> shift)];
-    CABAC_BIN(cabac, 0, "last_sig_coeff_y_prefix");
+    CABAC_FBITS_UPDATE(cabac, &base_ctx_y[ctx_offset + (group_idx_y >> shift)], 0, bits, "last_sig_coeff_y_prefix");
   }
 
   // last_sig_coeff_x_suffix
   if (group_idx_x > 3) {
     const int suffix = lastpos_x - g_min_in_group[group_idx_x];
-    const int bits = (group_idx_x - 2) / 2;
-    CABAC_BINS_EP(cabac, suffix, bits, "last_sig_coeff_x_suffix");
+    const int write_bits = (group_idx_x - 2) / 2;
+    CABAC_BINS_EP(cabac, suffix, write_bits, "last_sig_coeff_x_suffix");
+    if (cabac->only_count) bits += write_bits;
   }
 
   // last_sig_coeff_y_suffix
   if (group_idx_y > 3) {
     const int suffix = lastpos_y - g_min_in_group[group_idx_y];
-    const int bits = (group_idx_y - 2) / 2;
-    CABAC_BINS_EP(cabac, suffix, bits, "last_sig_coeff_y_suffix");
+    const int write_bits = (group_idx_y - 2) / 2;
+    CABAC_BINS_EP(cabac, suffix, write_bits, "last_sig_coeff_y_suffix");
+    if (cabac->only_count) bits += write_bits;
   }
+  if (cabac->only_count && bits_out) *bits_out += bits;
 }
 
 static void encode_transform_unit(encoder_state_t * const state,
@@ -142,7 +142,7 @@
                          width,
                          0,
                          scan_idx,
-                         cur_pu->tr_skip);
+                         cur_pu->tr_skip, NULL);
   }
 
   if (depth == MAX_DEPTH + 1) {
@@ -172,11 +172,11 @@
     const coeff_t *coeff_v = &state->coeff->v[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)];
 
     if (cbf_is_set(cur_pu->cbf, depth, COLOR_U)) {
-      kvz_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, 2, scan_idx, 0);
+      kvz_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, 2, scan_idx, 0, NULL);
     }
 
     if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) {
-      kvz_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, 2, scan_idx, 0);
+      kvz_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, 2, scan_idx, 0, NULL);
     }
   }
 }
@@ -290,7 +290,7 @@
 
       // cu_qp_delta_abs prefix
       cabac->cur_ctx = &cabac->ctx.cu_qp_delta_abs[0];
-      kvz_cabac_write_unary_max_symbol(cabac, cabac->ctx.cu_qp_delta_abs, MIN(qp_delta_abs, 5), 1, 5);
+      kvz_cabac_write_unary_max_symbol(cabac, cabac->ctx.cu_qp_delta_abs, MIN(qp_delta_abs, 5), 1, 5, NULL);
 
       if (qp_delta_abs >= 5) {
         // cu_qp_delta_abs suffix
@@ -308,16 +308,18 @@
   }
 }
 
-static void encode_inter_prediction_unit(encoder_state_t * const state,
-                                         cabac_data_t * const cabac,
-                                         const cu_info_t * const cur_cu,
-                                         int x, int y, int width, int height,
-                                         int depth)
+void kvz_encode_inter_prediction_unit(encoder_state_t * const state,
+                                      cabac_data_t * const cabac,
+                                      const cu_info_t * const cur_cu,
+                                      int x, int y, int width, int height,
+                                      int depth, lcu_t* lcu, double* bits_out)
 {
   // Mergeflag
   int16_t num_cand = 0;
-  cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model);
-  CABAC_BIN(cabac, cur_cu->merged, "MergeFlag");
+  double bits = 0;
+
+  CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), cur_cu->merged, bits, "MergeFlag");
+
   num_cand = state->encoder_control->cfg.max_merge;
   if (cur_cu->merged) { //merge
     if (num_cand > 1) {
@@ -325,10 +327,10 @@
       for (ui = 0; ui < num_cand - 1; ui++) {
         int32_t symbol = (ui != cur_cu->merge_idx);
         if (ui == 0) {
-          cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model);
-          CABAC_BIN(cabac, symbol, "MergeIndex");
+          CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_idx_ext_model), symbol, bits, "MergeIndex");
         } else {
           CABAC_BIN_EP(cabac,symbol,"MergeIndex");
+          if(cabac->only_count) bits += 1;
         }
         if (symbol == 0) break;
       }
@@ -339,12 +341,10 @@
       uint8_t inter_dir = cur_cu->inter.mv_dir-1;
 
       if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 8) {
-        cabac->cur_ctx = &(cabac->ctx.inter_dir[depth]);
-        CABAC_BIN(cabac, (inter_dir == 2), "inter_pred_idc");
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[depth]), inter_dir == 2, bits, "inter_pred_idc");
       }
       if (inter_dir < 2) {
-        cabac->cur_ctx = &(cabac->ctx.inter_dir[4]);
-        CABAC_BIN(cabac, inter_dir, "inter_pred_idc");
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.inter_dir[4]), inter_dir, bits, "inter_pred_idc");
       }
     }
 
@@ -359,9 +359,8 @@
       if (ref_LX_size > 1) {
         // parseRefFrmIdx
         int32_t ref_frame = cur_cu->inter.mv_ref[ref_list_idx];
-
-        cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]);
-        CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX");
+        
+        CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_ref_pic_model[0]), (ref_frame != 0), bits, "ref_idx_lX");
 
         if (ref_frame > 0) {
           ref_frame--;
@@ -373,9 +372,10 @@
 
             if (i == 0) {
               cabac->cur_ctx = &cabac->ctx.cu_ref_pic_model[1];
-              CABAC_BIN(cabac, symbol, "ref_idx_lX");
+              CABAC_FBITS_UPDATE(cabac, &cabac->ctx.cu_ref_pic_model[1], symbol, bits, "ref_idx_lX");
             } else {
               CABAC_BIN_EP(cabac, symbol, "ref_idx_lX");
+              if (cabac->only_count) bits += 1;
             }
             if (symbol == 0) break;
           }
@@ -385,16 +385,26 @@
       if (state->frame->ref_list != REF_PIC_LIST_1 || cur_cu->inter.mv_dir != 3) {
 
         int16_t mv_cand[2][2];
-        kvz_inter_get_mv_cand_cua(
+        if (lcu) {
+          kvz_inter_get_mv_cand(
+            state, 
+            x, y, width, height,
+            mv_cand, cur_cu, 
+            lcu, ref_list_idx);
+        }
+        else {
+          kvz_inter_get_mv_cand_cua(
             state,
             x, y, width, height,
-            mv_cand, cur_cu, ref_list_idx);
+            mv_cand, cur_cu, ref_list_idx
+          );
+        }
 
         uint8_t cu_mv_cand = CU_GET_MV_CAND(cur_cu, ref_list_idx);
         const int32_t mvd_hor = cur_cu->inter.mv[ref_list_idx][0] - mv_cand[cu_mv_cand][0];
         const int32_t mvd_ver = cur_cu->inter.mv[ref_list_idx][1] - mv_cand[cu_mv_cand][1];
 
-        kvz_encode_mvd(state, cabac, mvd_hor, mvd_ver);
+        kvz_encode_mvd(state, cabac, mvd_hor, mvd_ver, bits_out);

kvazaar-2.1.0.tar.gz/src/encode_coding_tree.h -> kvazaar-2.2.0.tar.gz/src/encode_coding_tree.h Changed

@@ -49,11 +49,31 @@
 void kvz_encode_mvd(encoder_state_t * const state,
                     cabac_data_t *cabac,
                     int32_t mvd_hor,
-                    int32_t mvd_ver);
+                    int32_t mvd_ver,
+                    double* bits_out);
+
+double kvz_mock_encode_coding_unit(
+  encoder_state_t* const state,
+  cabac_data_t* cabac,
+  int x, int y, int depth,
+  lcu_t* lcu, cu_info_t* cur_cu);
+
+double kvz_encode_part_mode(encoder_state_t* const state,
+  cabac_data_t* const cabac,
+  const cu_info_t* const cur_cu,
+  int depth);
+
+void kvz_encode_inter_prediction_unit(encoder_state_t* const state,
+                                      cabac_data_t* const cabac,
+                                      const cu_info_t* const cur_cu,
+                                      int x, int y, int width, int height,
+                                      int depth, 
+                                      lcu_t* lcu,
+                                      double* bits_out);
 
 void kvz_encode_last_significant_xy(cabac_data_t * const cabac,
                                     uint8_t lastpos_x, uint8_t lastpos_y,
                                     uint8_t width, uint8_t height,
-                                    uint8_t type, uint8_t scan);
+                                    uint8_t type, uint8_t scan, double* bits_out);
 
 #endif // ENCODE_CODING_TREE_H_

kvazaar-2.1.0.tar.gz/src/encoder.c -> kvazaar-2.2.0.tar.gz/src/encoder.c Changed

@@ -32,9 +32,6 @@
 
 #include "encoder.h"
 
-// This define is required for M_PI on Windows.
-#define _USE_MATH_DEFINES
-#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 
@@ -45,14 +42,6 @@
 #include "kvz_math.h"
 #include "fast_coeff_cost.h"
 
-/**
- * \brief Strength of QP adjustments when using adaptive QP for 360 video.
- *
- * Determined empirically.
- */
-static const double ERP_AQP_STRENGTH = 3.0;
-
-
 static int encoder_control_init_gop_layer_weights(encoder_control_t * const);
 
 static unsigned cfg_num_threads(void)
@@ -137,82 +126,6 @@
 
 
 /**
- * \brief Return weight for 360 degree ERP video
- *
- * Returns the scaling factor of area from equirectangular projection to
- * spherical surface.
- *
- * \param y   y-coordinate of the pixel
- * \param h   height of the picture
- */
-static double ws_weight(int y, int h)
-{
-  return cos((y - 0.5 * h + 0.5) * (M_PI / h));
-}
-
-
-
-/**
- * \brief Update ROI QPs for 360 video with equirectangular projection.
- *
- * Writes updated ROI parameters to encoder->cfg.roi.
- *
- * \param encoder       encoder control
- * \param orig_roi      original delta QPs or NULL
- * \param orig_width    width of orig_roi
- * \param orig_height   height of orig_roi
- */
-static void init_erp_aqp_roi(encoder_control_t* encoder,
-                             int8_t *orig_roi,
-                             int32_t orig_width,
-                             int32_t orig_height)
-{
-  // Update ROI with WS-PSNR delta QPs.
-  int height = encoder->in.height_in_lcu;
-  int width  = orig_roi ? orig_width : 1;
-
-  int frame_height = encoder->in.real_height;
-
-  encoder->cfg.roi.width  = width;
-  encoder->cfg.roi.height = height;
-  encoder->cfg.roi.dqps   = calloc(width * height, sizeof(orig_roi[0]));
-
-  double total_weight = 0.0;
-  for (int y = 0; y < frame_height; y++) {
-    total_weight += ws_weight(y, frame_height);
-  }
-
-  for (int y_lcu = 0; y_lcu < height; y_lcu++) {
-    int y_orig = LCU_WIDTH * y_lcu;
-    int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig);
-
-    double lcu_weight = 0.0;
-    for (int y = y_orig; y < y_orig + lcu_height; y++) {
-      lcu_weight += ws_weight(y, frame_height);
-    }
-    // Normalize.
-    lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height);
-
-    int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight));
-
-    if (orig_roi) {
-      // If a ROI array already exists, we copy the existing values to the
-      // new array while adding qp_delta to each.
-      int y_roi = y_lcu * orig_height / height;
-      for (int x = 0; x < width; x++) {
-        encoder->cfg.roi.dqps[x + y_lcu * width] =
-          CLIP(-51, 51, orig_roi[x + y_roi * width] + qp_delta);
-      }
-
-    } else {
-      // Otherwise, simply write qp_delta to the ROI array.
-      encoder->cfg.roi.dqps[y_lcu] = qp_delta;
-    }
-  }
-}
-
-
-/**
  * \brief Allocate and initialize an encoder control structure.
  *
  * \param cfg   encoder configuration
@@ -353,6 +266,16 @@
     encoder->scaling_list.use_default_list = 1;
   }
 
+  // ROI / delta QP
+  if (cfg->roi.file_path) {
+    const char *mode[2] = { "r", "rb" };
+    encoder->roi_file = fopen(cfg->roi.file_path, mode[cfg->roi.format]);
+    if (!encoder->roi_file) {
+      fprintf(stderr, "Could not open ROI file.\n");
+      goto init_failed;
+    }
+  }
+
   if (cfg->fast_coeff_table_fn) {
     FILE *fast_coeff_table_f = fopen(cfg->fast_coeff_table_fn, "rb");
     if (fast_coeff_table_f == NULL) {
@@ -396,32 +319,10 @@
     goto init_failed;
   }
 
-  if (cfg->erp_aqp) {
-    init_erp_aqp_roi(encoder,
-                     cfg->roi.dqps,
-                     cfg->roi.width,
-                     cfg->roi.height);
-
-  } else if (cfg->roi.dqps) {
-    // Copy delta QP array for ROI coding.
-    const size_t roi_size = encoder->cfg.roi.width * encoder->cfg.roi.height;
-    encoder->cfg.roi.dqps = calloc(roi_size, sizeof(cfg->roi.dqps[0]));
-    memcpy(encoder->cfg.roi.dqps,
-           cfg->roi.dqps,
-           roi_size * sizeof(*cfg->roi.dqps));
-
-  }
-
   // NOTE: When tr_depth_inter is equal to 0, the transform is still split
   // for SMP and AMP partition units.
   encoder->tr_depth_inter = 0;
 
-  if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) {
-    encoder->max_qp_delta_depth = 0;
-  } else {
-    encoder->max_qp_delta_depth = -1;
-  }
-
   //Tiles
   encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 ||
                           encoder->cfg.tiles_height_count > 1;
@@ -724,7 +625,7 @@
 
   FREE_POINTER(encoder->tiles_tile_id);
 
-  FREE_POINTER(encoder->cfg.roi.dqps);
+  FREE_POINTER(encoder->cfg.roi.file_path);
   FREE_POINTER(encoder->cfg.optional_key);
 
   kvz_scalinglist_destroy(&encoder->scaling_list);
@@ -734,6 +635,10 @@
 
   kvz_close_rdcost_outfiles();
 
+  if (encoder->roi_file) {
+    fclose(encoder->roi_file);
+  }
+
   free(encoder);
 }

kvazaar-2.1.0.tar.gz/src/encoder.h -> kvazaar-2.2.0.tar.gz/src/encoder.h Changed

kvazaar-2.1.0.tar.gz/src/encoder_state-bitstream.c -> kvazaar-2.2.0.tar.gz/src/encoder_state-bitstream.c Changed

@@ -9,15 +9,15 @@
  * 
  * * Redistributions of source code must retain the above copyright notice, this
  *   list of conditions and the following disclaimer.
- * 
+ *
  * * Redistributions in binary form must reproduce the above copyright notice, this
  *   list of conditions and the following disclaimer in the documentation and/or
  *   other materials provided with the distribution.
- * 
+ *
  * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -48,43 +48,44 @@
 #include "kvz_math.h"
 #include "nal.h"
 #include "scalinglist.h"
+#include "sei.h"
 #include "tables.h"
 #include "threadqueue.h"
 #include "videoframe.h"
 #include "rate_control.h"
 
 
-static void encoder_state_write_bitstream_aud(encoder_state_t * const state)
+static void encoder_state_write_bitstream_aud(encoder_state_t *const state)
 {
-  bitstream_t * const stream = &state->stream;
+  bitstream_t *const stream = &state->stream;
   kvz_nal_write(stream, KVZ_NAL_AUD_NUT, 0, 1);
 
   uint8_t pic_type = state->frame->slicetype == KVZ_SLICE_I ? 0
-                   : state->frame->slicetype == KVZ_SLICE_P ? 1
-                   :                                       2;
+    : state->frame->slicetype == KVZ_SLICE_P ? 1
+    : 2;
   WRITE_U(stream, pic_type, 3, "pic_type");
 
   kvz_bitstream_add_rbsp_trailing_bits(stream);
 }
 
 static void encoder_state_write_bitstream_PTL(bitstream_t *stream,
-                                              encoder_state_t * const state)
+  encoder_state_t *const state)
 {
   // PTL
   // Profile Tier
   WRITE_U(stream, 0, 2, "general_profile_space");
   WRITE_U(stream, state->encoder_control->cfg.high_tier, 1, "general_tier_flag");
   // Main Profile == 1,  Main 10 profile == 2
-  WRITE_U(stream, (state->encoder_control->bitdepth == 8)?1:2, 5, "general_profile_idc");
+  WRITE_U(stream, (state->encoder_control->bitdepth == 8) ? 1 : 2, 5, "general_profile_idc");
   /* Compatibility flags should be set at general_profile_idc
    *  (so with general_profile_idc = 1, compatibility_flag1 should be 1)
    * According to specification, when compatibility_flag1 is set,
    *  compatibility_flag2 should be set too.
    */
-  WRITE_U(stream, 3<<29, 32, "general_profile_compatibility_flag");
+  WRITE_U(stream, 3 << 29, 32, "general_profile_compatibility_flag");
 
   WRITE_U(stream, 1, 1, "general_progressive_source_flag");
-  WRITE_U(stream, state->encoder_control->in.source_scan_type!= 0, 1, "general_interlaced_source_flag");
+  WRITE_U(stream, state->encoder_control->in.source_scan_type != 0, 1, "general_interlaced_source_flag");
   WRITE_U(stream, 0, 1, "general_non_packed_constraint_flag");
   WRITE_U(stream, 0, 1, "general_frame_only_constraint_flag");
 
@@ -106,6 +107,25 @@
   // end PTL
 }
 
+static uint8_t max_required_dpb_size(const encoder_control_t * const encoder)
+{
+  int max_buffer = 1;
+  for (int g = 0; g < encoder->cfg.gop_len; ++g) {
+    int neg_refs = encoder->cfg.gop[g].ref_neg_count;
+    int pos_refs = encoder->cfg.gop[g].ref_pos_count;
+    if (neg_refs + pos_refs + 1 > max_buffer) max_buffer = neg_refs + pos_refs + 1;
+  }
+
+  if (encoder->cfg.gop_len == 0) max_buffer = encoder->cfg.ref_frames + 1;
+
+  return max_buffer;
+}
+
+static uint8_t max_num_reorder_pics(const encoder_control_t * const encoder)
+{
+  return encoder->cfg.gop_lowdelay ? 0 : MAX(encoder->cfg.gop_len - 1, 0);
+}
+
 static void encoder_state_write_bitstream_vid_parameter_set(bitstream_t* stream,
                                                             encoder_state_t * const state)
 {
@@ -125,17 +145,12 @@
 
   WRITE_U(stream, 0, 1, "vps_sub_layer_ordering_info_present_flag");
 
-  if (encoder->cfg.gop_lowdelay) {
-    const int dpb = encoder->cfg.ref_frames;
-    WRITE_UE(stream, dpb - 1, "vps_max_dec_pic_buffering_minus1");
-    WRITE_UE(stream, 0, "vps_max_num_reorder_pics");
-  }
-  else {
-    // Clip to non-negative values to prevent problems with GOP=0
-    const int dpb = MIN(16, encoder->cfg.gop_len);
-    WRITE_UE(stream, MAX(dpb - 1, 0), "vps_max_dec_pic_buffering_minus1");
-    WRITE_UE(stream, MAX(encoder->cfg.gop_len - 1, 0), "vps_max_num_reorder_pics");
-  }
+  int max_buffer  = max_required_dpb_size(encoder);
+  int max_reorder = max_num_reorder_pics(encoder);
+  if (max_buffer - 1 < max_reorder) max_buffer = max_reorder + 1;
+  WRITE_UE(stream, max_buffer - 1, "vps_max_dec_pic_buffering_minus1");
+  WRITE_UE(stream, max_reorder, "vps_max_num_reorder_pics");
+
   WRITE_UE(stream, 0, "vps_max_latency_increase");
 
   WRITE_U(stream, 0, 6, "vps_max_nuh_reserved_zero_layer_id");
@@ -402,16 +417,12 @@
   WRITE_U(stream, 0, 1, "sps_sub_layer_ordering_info_present_flag");
 
   //for each layer
-  if (encoder->cfg.gop_lowdelay) {
-    const int dpb = encoder->cfg.ref_frames;
-    WRITE_UE(stream, dpb - 1, "sps_max_dec_pic_buffering_minus1");
-    WRITE_UE(stream, 0, "sps_max_num_reorder_pics");
-  } else {
-    // Clip to non-negative values to prevent problems with GOP=0
-    const int dpb = MIN(16, encoder->cfg.gop_len);
-    WRITE_UE(stream, MAX(dpb - 1, 0), "sps_max_dec_pic_buffering_minus1");
-    WRITE_UE(stream, MAX(encoder->cfg.gop_len - 1, 0), "sps_max_num_reorder_pics");
-  }
+  int max_buffer  = max_required_dpb_size(encoder);
+  int max_reorder = max_num_reorder_pics(encoder);
+  if (max_buffer - 1 < max_reorder) max_buffer = max_reorder + 1;
+  WRITE_UE(stream, max_buffer - 1, "sps_max_dec_pic_buffering_minus1");
+  WRITE_UE(stream, max_reorder, "sps_max_num_reorder_pics");
+
   WRITE_UE(stream, 0, "sps_max_latency_increase_plus1");
   //end for
 
@@ -493,10 +504,10 @@
   WRITE_U(stream, 0, 1, "constrained_intra_pred_flag");
   WRITE_U(stream, encoder->cfg.trskip_enable, 1, "transform_skip_enabled_flag");
 
-  if (encoder->max_qp_delta_depth >= 0) {
+  if (state->frame->max_qp_delta_depth >= 0) {
     // Use separate QP for each LCU when rate control is enabled.
     WRITE_U(stream, 1, 1, "cu_qp_delta_enabled_flag");
-    WRITE_UE(stream, encoder->max_qp_delta_depth, "diff_cu_qp_delta_depth");
+    WRITE_UE(stream, state->frame->max_qp_delta_depth, "diff_cu_qp_delta_depth");
   } else {
     WRITE_U(stream, 0, 1, "cu_qp_delta_enabled_flag");
   }
@@ -560,22 +571,47 @@
   kvz_bitstream_add_rbsp_trailing_bits(stream);
 }
 
+static void sei_write_payload_type(bitstream_t *stream, const int payloadType)
+{
+  int i;
+  for (i = 0; i <= payloadType - 255; i += 255) {
+    WRITE_U(stream, FF_BYTE, 8, "ff_byte");
+  }
+  WRITE_U(stream, payloadType - i, 8, "last_payload_type_byte");
+}
+
+static void sei_write_payload_size(bitstream_t *stream, const int payloadSize)
+{
+  int i;
+  for (i = 0; i <= payloadSize - 255; i += 255) {
+    WRITE_U(stream, FF_BYTE, 8, "ff_byte");
+  }
+  WRITE_U(stream, payloadSize - i, 8, "last_payload_size_byte");
+}
+
+static void sei_write_user_defined_unregistered(bitstream_t *stream, const uint8_t * const uuid, const uint8_t * const user_data_payload_byte, const int length)
+{
+  int i;
+  sei_write_payload_type(stream, SEI_PAYLOAD_TYPE_USER_DATA_UNREGISTERED);
+  sei_write_payload_size(stream, (sizeof encoder_info_uuid) + length);
+  for (i = 0; i < 16; i++) {
+    WRITE_U(stream, uuid[i], 8, "uuid_iso_iec_11578");
+  }
+  for (i = 0; i < length; i++) {
+    WRITE_U(stream, user_data_payload_byte[i], 8, "user_data_payload_byte");
+  }
+  kvz_bitstream_align(stream);
+}
+
 static void encoder_state_write_bitstream_prefix_sei_version(encoder_state_t * const state)
 {
 #define STR_BUF_LEN 1000
   bitstream_t * const stream = &state->stream;
-  int i, length;
+  int length;

kvazaar-2.1.0.tar.gz/src/encoderstate.c -> kvazaar-2.2.0.tar.gz/src/encoderstate.c Changed

@@ -32,6 +32,9 @@
 
 #include "encoderstate.h"
 
+ // This define is required for M_PI on Windows.
+#define _USE_MATH_DEFINES
+#include <ctype.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -51,6 +54,13 @@
 
 #include "strategies/strategies-picture.h"
 
+/**
+ * \brief Strength of QP adjustments when using adaptive QP for 360 video.
+ *
+ * Determined empirically.
+ */
+static const double ERP_AQP_STRENGTH = 3.0;
+
 
 int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) {
   int i;
@@ -570,7 +580,7 @@
   cu_info_t *cu = kvz_cu_array_at(state->tile->frame->cu_array, x, y);
   const int cu_width = LCU_WIDTH >> depth;
 
-  if (depth <= state->encoder_control->max_qp_delta_depth) {
+  if (depth <= state->frame->max_qp_delta_depth) {
     *prev_qp = -1;
   }
 
@@ -650,7 +660,7 @@
 
   encoder_state_recdata_to_bufs(state, lcu, state->tile->hor_buf_search, state->tile->ver_buf_search);
 
-  if (encoder->max_qp_delta_depth >= 0) {
+  if (state->frame->max_qp_delta_depth >= 0) {
     int last_qp = state->last_qp;
     int prev_qp = -1;
     set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp);
@@ -675,6 +685,7 @@
   const uint64_t existing_bits = kvz_bitstream_tell(&state->stream);
 
   //Encode SAO
+  state->cabac.update = 1;
   if (encoder->cfg.sao_type) {
     encode_sao(state, lcu->position.x, lcu->position.y, &frame->sao_luma[lcu->position.y * frame->width_in_lcu + lcu->position.x], &frame->sao_chroma[lcu->position.y * frame->width_in_lcu + lcu->position.x]);
   }
@@ -725,6 +736,7 @@
       kvz_crypto_delete(&state->crypto_hdl);
     }
   }
+  state->cabac.update = 0;
 
   pthread_mutex_lock(&state->frame->rc_lock);
   const uint32_t bits = kvz_bitstream_tell(&state->stream) - existing_bits;
@@ -1252,6 +1264,154 @@
   }
 }
 
+
+/**
+ * \brief Return weight for 360 degree ERP video
+ *
+ * Returns the scaling factor of area from equirectangular projection to
+ * spherical surface.
+ *
+ * \param y   y-coordinate of the pixel
+ * \param h   height of the picture
+ */
+static double ws_weight(int y, int h)
+{
+  return cos((y - 0.5 * h + 0.5) * (M_PI / h));
+}
+
+
+/**
+ * \brief Update ROI QPs for 360 video with equirectangular projection.
+ *
+ * Updates the ROI parameters in frame->roi.
+ *
+ * \param encoder       encoder control
+ * \param frame         frame that will have the ROI map
+ */
+static void init_erp_aqp_roi(const encoder_control_t *encoder, kvz_picture *frame)
+{
+  int8_t *orig_roi    = frame->roi.roi_array;
+  int32_t orig_width  = frame->roi.width;
+  int32_t orig_height = frame->roi.height;
+
+  // Update ROI with WS-PSNR delta QPs.
+  int new_height = encoder->in.height_in_lcu;
+  int new_width = orig_roi ? orig_width : 1;
+  int8_t *new_array = calloc(new_width * new_height, sizeof(orig_roi[0]));
+
+  int frame_height = encoder->in.real_height;
+
+  double total_weight = 0.0;
+  for (int y = 0; y < frame_height; y++) {
+    total_weight += ws_weight(y, frame_height);
+  }
+
+  for (int y_lcu = 0; y_lcu < new_height; y_lcu++) {
+    int y_orig = LCU_WIDTH * y_lcu;
+    int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig);
+
+    double lcu_weight = 0.0;
+    for (int y = y_orig; y < y_orig + lcu_height; y++) {
+      lcu_weight += ws_weight(y, frame_height);
+    }
+    // Normalize.
+    lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height);
+
+    int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight));
+
+    if (orig_roi) {
+      // If a ROI array already exists, we copy the existing values to the
+      // new array while adding qp_delta to each.
+      int y_roi = y_lcu * orig_height / new_height;
+      for (int x = 0; x < new_width; x++) {
+        new_array[x + y_lcu * new_width] =
+          CLIP(-51, 51, orig_roi[x + y_roi * new_width] + qp_delta);
+      }
+
+    } else {
+      // Otherwise, simply write qp_delta to the ROI array.
+      new_array[y_lcu] = qp_delta;
+    }
+  }
+
+  // Update new values
+  frame->roi.width = new_width;
+  frame->roi.height = new_height;
+  frame->roi.roi_array = new_array;
+  FREE_POINTER(orig_roi);
+}
+
+
+static void next_roi_frame_from_file(kvz_picture *frame, FILE *file, enum kvz_roi_format format) {
+  // The ROI description is as follows:
+  // First number is width, second number is height,
+  // then follows width * height number of dqp values.
+
+  // Rewind the (seekable) ROI file when end of file is reached.
+  // Allows a single ROI frame to be used for a whole sequence
+  // and looping with --loop-input. Skips possible whitespace.
+  if (ftell(file) != -1L) {
+    int c = fgetc(file);
+    while (format == KVZ_ROI_TXT && isspace(c)) c = fgetc(file);
+    ungetc(c, file);
+    if (c == EOF) rewind(file);
+  }
+
+  int *width  = &frame->roi.width;
+  int *height = &frame->roi.height;
+
+  bool failed = false;
+
+  if (format == KVZ_ROI_TXT) failed = !fscanf(file, "%d", width) || !fscanf(file, "%d", height);
+  if (format == KVZ_ROI_BIN) failed = fread(&frame->roi, 4, 2, file) != 2;
+  
+  if (failed) {
+    fprintf(stderr, "Failed to read ROI size.\n");
+    fclose(file);
+    assert(0);
+  }
+
+  if (*width <= 0 || *height <= 0) {
+    fprintf(stderr, "Invalid ROI size: %dx%d.\n", *width, *height);
+    fclose(file);
+    assert(0);
+  }
+
+  if (*width > 10000 || *height > 10000) {
+    fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n");
+    fclose(file);
+    assert(0);
+  }
+
+  const unsigned size = (*width) * (*height);
+  int8_t *dqp_array = calloc((size_t)size, sizeof(frame->roi.roi_array[0]));
+  if (!dqp_array) {
+    fprintf(stderr, "Failed to allocate memory for ROI table.\n");
+    fclose(file);
+    assert(0);
+  }
+
+  FREE_POINTER(frame->roi.roi_array);
+  frame->roi.roi_array = dqp_array;
+
+  if (format == KVZ_ROI_TXT) {
+    for (int i = 0; i < size; ++i) {
+      int number; // Need a pointer to int for fscanf
+      if (fscanf(file, "%d", &number) != 1) {
+        fprintf(stderr, "Reading ROI file failed.\n");
+        fclose(file);
+        assert(0);
+      }

kvazaar-2.1.0.tar.gz/src/encoderstate.h -> kvazaar-2.2.0.tar.gz/src/encoderstate.h Changed

kvazaar-2.1.0.tar.gz/src/fast_coeff_cost.c -> kvazaar-2.2.0.tar.gz/src/fast_coeff_cost.c Changed

kvazaar-2.1.0.tar.gz/src/fast_coeff_cost.h -> kvazaar-2.2.0.tar.gz/src/fast_coeff_cost.h Changed

@@ -45,60 +45,61 @@
 
 // Weights for 4 buckets (coeff 0, coeff 1, coeff 2, coeff >= 3), for QPs from
 // 0 to MAX_FAST_COEFF_COST_QP
-static const float default_fast_coeff_cost_wts4 = {
-  // Just extend it by stretching the first actual values..
-  {0.164240, 4.161530, 3.509033, 6.928047},
-  {0.164240, 4.161530, 3.509033, 6.928047},
-  {0.164240, 4.161530, 3.509033, 6.928047},
-  {0.164240, 4.161530, 3.509033, 6.928047},
-  {0.164240, 4.161530, 3.509033, 6.928047},
-  {0.164240, 4.161530, 3.509033, 6.928047},
-  {0.164240, 4.161530, 3.509033, 6.928047},
-  {0.164240, 4.161530, 3.509033, 6.928047},
-  {0.164240, 4.161530, 3.509033, 6.928047},
-  {0.164240, 4.161530, 3.509033, 6.928047},
-  // up to here
-  {0.164240, 4.161530, 3.509033, 6.928047},
-  {0.162844, 4.055940, 3.564467, 6.861493},
-  {0.128729, 4.311973, 3.942837, 6.935403},
-  {0.110956, 4.433190, 3.945753, 6.877697},
-  {0.095026, 4.483547, 4.194173, 6.781540},
-  {0.075046, 4.633703, 4.084193, 6.698600},
-  {0.052426, 4.967223, 4.027210, 6.549197},
-  {0.040219, 5.141820, 3.982650, 6.461557},
-  {0.035090, 5.192493, 3.830950, 6.418477},
-  {0.029845, 5.211647, 3.815457, 6.345440},
-  {0.023522, 5.322213, 3.816537, 6.360677},
-  {0.021305, 5.225923, 3.842700, 6.325787},
-  {0.015878, 5.183090, 3.956003, 6.329680},
-  {0.010430, 5.099230, 4.176803, 6.305400},
-  {0.008433, 5.030257, 4.237587, 6.270133},
-  {0.006500, 4.969247, 4.339397, 6.217827},
-  {0.004929, 4.923500, 4.442413, 6.183523},
-  {0.003715, 4.915583, 4.429090, 6.125320},
-  {0.003089, 4.883907, 4.562790, 6.156447},
-  {0.002466, 4.881063, 4.629883, 6.142643},
-  {0.002169, 4.882493, 4.646313, 6.127663},
-  {0.002546, 4.793337, 4.837413, 6.199270},
-  {0.001314, 4.808853, 4.828337, 6.243437},
-  {0.001154, 4.862603, 4.846883, 6.205523},
-  {0.000984, 4.866403, 4.859330, 6.240893},
-  {0.000813, 4.856633, 4.924527, 6.293413},
-  {0.001112, 4.789260, 5.009880, 6.433540},
-  {0.000552, 4.760747, 5.090447, 6.599380},
-  {0.000391, 4.961447, 5.111033, 6.756370},
-  {0.000332, 4.980953, 5.138127, 6.867420},
-  {0.000201, 5.181957, 4.740160, 6.460997},
-  {0.000240, 5.185390, 4.874840, 6.819093},
-  {0.000130, 5.270350, 4.734213, 6.826240},
-  {0.000104, 5.371937, 4.595087, 6.659253},
-  {0.000083, 5.362000, 4.617470, 6.837770},
-  {0.000069, 5.285997, 4.754993, 7.159043},
-  {0.000049, 5.488470, 4.396107, 6.727357},
-  {0.000058, 4.958940, 4.580460, 6.477740},
-  {0.000028, 5.521253, 4.440493, 7.205017},
-  {0.000000, 0.000000, 0.000000, 0.000000},
-  {0.000019, 5.811260, 4.399110, 7.336310},
+static const double default_fast_coeff_cost_wts4 = {
+{0.162000, 4.126087, 3.499517, 6.969847},
+{0.162000, 4.126087, 3.499517, 6.969847},
+{0.162000, 4.126087, 3.499517, 6.969847},
+{0.162000, 4.126087, 3.499517, 6.969847},
+{0.162000, 4.126087, 3.499517, 6.969847},
+{0.162000, 4.126087, 3.499517, 6.969847},
+{0.162000, 4.126087, 3.499517, 6.969847},
+{0.162000, 4.126087, 3.499517, 6.969847},
+{0.162000, 4.126087, 3.499517, 6.969847},
+{0.162000, 4.126087, 3.499517, 6.969847},
+{0.162000, 4.126087, 3.499517, 6.969847},
+{0.157760, 4.037673, 3.558663, 6.895640},
+{0.127943, 4.308060, 3.916680, 6.962907},
+{0.110555, 4.422860, 3.944640, 6.898343},
+{0.094532, 4.479287, 4.161790, 6.804273},
+{0.074032, 4.629857, 4.042727, 6.722910},
+{0.051644, 4.960970, 4.001523, 6.556783},
+{0.039513, 5.133963, 3.951247, 6.472487},
+{0.034188, 5.185183, 3.805350, 6.418810},
+{0.028981, 5.203517, 3.785043, 6.351090},
+{0.022543, 5.315690, 3.796553, 6.347457},
+{0.020300, 5.221910, 3.817927, 6.322733},
+{0.015400, 5.170127, 3.937963, 6.326643},
+{0.010147, 5.088577, 4.143093, 6.293030},
+{0.008239, 5.017160, 4.204780, 6.267220},
+{0.006386, 4.956723, 4.303120, 6.208533},
+{0.004876, 4.912990, 4.400863, 6.175370},
+{0.003707, 4.905997, 4.388617, 6.134007},
+{0.003089, 4.872320, 4.521937, 6.153827},
+{0.002479, 4.864330, 4.591423, 6.152587},
+{0.002180, 4.864427, 4.607133, 6.141223},
+{0.002556, 4.771863, 4.793583, 6.232397},
+{0.001316, 4.793543, 4.787927, 6.272543},
+{0.001169, 4.845383, 4.787190, 6.235333},
+{0.001000, 4.849327, 4.805003, 6.273347},
+{0.000830, 4.839947, 4.866000, 6.346927},
+{0.001131, 4.772140, 4.969497, 6.448050},
+{0.000553, 4.743423, 5.050670, 6.663760},
+{0.000466, 4.800883, 5.034373, 6.601250},
+{0.000400, 4.797313, 5.079183, 6.743547},
+{0.000333, 4.783170, 5.142737, 6.869933},
+{0.000355, 4.915657, 5.217510, 7.225673},
+{0.000186, 4.973477, 5.151287, 7.280497},
+{0.000113, 5.316010, 4.509893, 6.585287},
+{0.000091, 5.304703, 4.553107, 6.773803},
+{0.000076, 5.263460, 4.689990, 6.962153},
+{0.000064, 5.190947, 4.733550, 7.100820},
+{0.000053, 5.180677, 4.833283, 7.340667},
+{0.000047, 5.182963, 4.829380, 7.338863},
+{0.000032, 5.389257, 4.518127, 7.265003},
+{0.000020, 5.970297, 3.981997, 7.201180},
+{0.000000, 0.000000, 0.000000, 0.000000},
+
+
 };
 
 typedef struct encoder_state_t encoder_state_t;

kvazaar-2.1.0.tar.gz/src/filter.c -> kvazaar-2.2.0.tar.gz/src/filter.c Changed

kvazaar-2.1.0.tar.gz/src/global.h -> kvazaar-2.2.0.tar.gz/src/global.h Changed

kvazaar-2.1.0.tar.gz/src/image.c -> kvazaar-2.2.0.tar.gz/src/image.c Changed

@@ -100,6 +100,10 @@
 
   im->interlacing = KVZ_INTERLACING_NONE;
 
+  im->roi.roi_array = NULL;
+  im->roi.width = 0;
+  im->roi.height = 0;
+
   return im;
 }
 
@@ -126,6 +130,7 @@
     kvz_image_free(im->base_image);
   } else {
     free(im->fulldata_buf);
+    if (im->roi.roi_array) FREE_POINTER(im->roi.roi_array);
   }
 
   // Make sure freed data won't be used.
@@ -186,6 +191,8 @@
   im->pts = 0;
   im->dts = 0;
 
+  im->roi = orig_image->roi;
+
   return im;
 }
 
@@ -218,27 +225,6 @@
   FREE_POINTER(yuv);
 }
 
-hi_prec_buf_t * kvz_hi_prec_buf_t_alloc(int luma_size)
-{
-  // Get buffers with separate mallocs in order to take advantage of
-  // automatic buffer overrun checks.
-  hi_prec_buf_t *yuv = (hi_prec_buf_t *)malloc(sizeof(*yuv));
-  yuv->y = (int16_t *)malloc(luma_size * sizeof(*yuv->y));
-  yuv->u = (int16_t *)malloc(luma_size / 2 * sizeof(*yuv->u));
-  yuv->v = (int16_t *)malloc(luma_size / 2 * sizeof(*yuv->v));
-  yuv->size = luma_size;
-
-  return yuv;
-}
-
-void kvz_hi_prec_buf_t_free(hi_prec_buf_t * yuv)
-{
-  free(yuv->y);
-  free(yuv->u);
-  free(yuv->v);
-  free(yuv);
-}
-
 static INLINE uint32_t reg_sad_maybe_optimized(const kvz_pixel * const data1, const kvz_pixel * const data2,
                                   const int32_t width, const int32_t height, const uint32_t stride1,
                                   const uint32_t stride2, optimized_sad_func_ptr_t optimized_sad)

kvazaar-2.1.0.tar.gz/src/image.h -> kvazaar-2.2.0.tar.gz/src/image.h Changed

kvazaar-2.1.0.tar.gz/src/inter.c -> kvazaar-2.2.0.tar.gz/src/inter.c Changed

@@ -52,14 +52,15 @@
 } merge_candidates_t;
 
 
-static void inter_recon_frac_luma(const encoder_state_t *const state,
-  const kvz_picture *const ref,
-  int32_t xpos,
-  int32_t ypos,
-  int32_t block_width,
-  int32_t block_height,
-  const int16_t mv_param2,
-  lcu_t *lcu)
+static void inter_recon_frac_luma(const encoder_state_t * const state,
+                                  const kvz_picture * const ref,
+                                  int32_t xpos,
+                                  int32_t ypos,
+                                  int32_t block_width,
+                                  int32_t block_height,
+                                  const int16_t mv_param2,
+                                  yuv_t *out,
+                                  unsigned out_stride)
 {
   int mv_frac_x = (mv_param0 & 3);
   int mv_frac_y = (mv_param1 & 3);
@@ -100,8 +101,8 @@
     ext_s,
     block_width,
     block_height,
-    lcu->rec.y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
-    LCU_WIDTH,
+    out->y,
+    out_stride,
     mv_frac_x,
     mv_frac_y,
     mv_param);
@@ -114,7 +115,8 @@
   int32_t block_width,
   int32_t block_height,
   const int16_t mv_param2,
-  hi_prec_buf_t *hi_prec_out)
+  yuv_im_t *out,
+  const unsigned out_stride)
 {
   int mv_frac_x = (mv_param0 & 3);
   int mv_frac_y = (mv_param1 & 3);
@@ -155,8 +157,8 @@
     ext_s,
     block_width,
     block_height,
-    hi_prec_out->y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH),
-    LCU_WIDTH,
+    out->y,
+    out_stride,
     mv_frac_x,
     mv_frac_y,
     mv_param);
@@ -164,16 +166,21 @@
 
 static void inter_recon_frac_chroma(const encoder_state_t *const state,
   const kvz_picture *const ref,
-  int32_t xpos,
-  int32_t ypos,
-  int32_t block_width,
-  int32_t block_height,
+  int32_t pu_x,
+  int32_t pu_y,
+  int32_t pu_w,
+  int32_t pu_h,
   const int16_t mv_param2,
-  lcu_t *lcu)
+  yuv_t *out,
+  const unsigned out_stride)
 {
   int mv_frac_x = (mv_param0 & 7);
   int mv_frac_y = (mv_param1 & 7);
 
+  // Take into account chroma subsampling
+  unsigned pb_w = pu_w / 2;
+  unsigned pb_h = pu_h / 2;
+
   // Space for extrapolated pixels and the part from the picture.
   // Some extra for AVX2.
   // The extrapolation function will set the pointers and stride.
@@ -189,10 +196,10 @@
     .src_w = ref->width / 2,
     .src_h = ref->height / 2,
     .src_s = ref->stride / 2,
-    .blk_x = (state->tile->offset_x + xpos) / 2 + (mv_param0 >> 3),
-    .blk_y = (state->tile->offset_y + ypos) / 2 + (mv_param1 >> 3),
-    .blk_w = block_width / 2,
-    .blk_h = block_height / 2,
+    .blk_x = (state->tile->offset_x + pu_x) / 2 + (mv_param0 >> 3),
+    .blk_y = (state->tile->offset_y + pu_y) / 2 + (mv_param1 >> 3),
+    .blk_w = pb_w,
+    .blk_h = pb_h,
     .pad_l = KVZ_CHROMA_FILTER_OFFSET,
     .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
     .pad_t = KVZ_CHROMA_FILTER_OFFSET,
@@ -211,10 +218,10 @@
   kvz_sample_octpel_chroma(state->encoder_control,
     ext_origin,
     ext_s,
-    block_width / 2,
-    block_height / 2,
-    lcu->rec.u + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
-    LCU_WIDTH_C,
+    pb_w,
+    pb_h,
+    out->u,
+    out_stride,
     mv_frac_x,
     mv_frac_y,
     mv_param);
@@ -225,10 +232,10 @@
   kvz_sample_octpel_chroma(state->encoder_control,
     ext_origin,
     ext_s,
-    block_width / 2,
-    block_height / 2,
-    lcu->rec.v + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
-    LCU_WIDTH_C,
+    pb_w,
+    pb_h,
+    out->v,
+    out_stride,
     mv_frac_x,
     mv_frac_y,
     mv_param);
@@ -236,16 +243,21 @@
 
 static void inter_recon_frac_chroma_hi(const encoder_state_t *const state,
   const kvz_picture *const ref,
-  int32_t xpos,
-  int32_t ypos,
-  int32_t block_width,
-  int32_t block_height,
+  int32_t pu_x,
+  int32_t pu_y,
+  int32_t pu_w,
+  int32_t pu_h,
   const int16_t mv_param2,
-  hi_prec_buf_t *hi_prec_out)
+  yuv_im_t *out,
+  const unsigned out_stride)
 {
   int mv_frac_x = (mv_param0 & 7);
   int mv_frac_y = (mv_param1 & 7);
 
+  // Take into account chroma subsampling
+  unsigned pb_w = pu_w / 2;
+  unsigned pb_h = pu_h / 2;
+
   // Space for extrapolated pixels and the part from the picture.
   // Some extra for AVX2.
   // The extrapolation function will set the pointers and stride.
@@ -261,10 +273,10 @@
     .src_w = ref->width / 2,
     .src_h = ref->height / 2,
     .src_s = ref->stride / 2,
-    .blk_x = (state->tile->offset_x + xpos) / 2 + (mv_param0 >> 3),
-    .blk_y = (state->tile->offset_y + ypos) / 2 + (mv_param1 >> 3),
-    .blk_w = block_width / 2,
-    .blk_h = block_height / 2,
+    .blk_x = (state->tile->offset_x + pu_x) / 2 + (mv_param0 >> 3),
+    .blk_y = (state->tile->offset_y + pu_y) / 2 + (mv_param1 >> 3),
+    .blk_w = pb_w,
+    .blk_h = pb_h,
     .pad_l = KVZ_CHROMA_FILTER_OFFSET,
     .pad_r = KVZ_EXT_PADDING_CHROMA - KVZ_CHROMA_FILTER_OFFSET,
     .pad_t = KVZ_CHROMA_FILTER_OFFSET,
@@ -283,10 +295,10 @@
   kvz_sample_octpel_chroma_hi(state->encoder_control,
     ext_origin,
     ext_s,
-    block_width / 2,
-    block_height / 2,
-    hi_prec_out->u + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
-    LCU_WIDTH_C,
+    pb_w,
+    pb_h,
+    out->u,
+    out_stride,
     mv_frac_x,
     mv_frac_y,
     mv_param);
@@ -297,10 +309,10 @@
   kvz_sample_octpel_chroma_hi(state->encoder_control,
     ext_origin,
     ext_s,
-    block_width / 2,
-    block_height / 2,
-    hi_prec_out->v + ((ypos / 2) % LCU_WIDTH_C) * LCU_WIDTH_C + ((xpos / 2) % LCU_WIDTH_C),
-    LCU_WIDTH_C,
+    pb_w,
+    pb_h,
+    out->v,
+    out_stride,
     mv_frac_x,
     mv_frac_y,
     mv_param);

kvazaar-2.1.0.tar.gz/src/inter.h -> kvazaar-2.2.0.tar.gz/src/inter.h Changed

kvazaar-2.1.0.tar.gz/src/kvazaar.h -> kvazaar-2.2.0.tar.gz/src/kvazaar.h Changed

@@ -97,6 +97,8 @@
 typedef uint16_t kvz_pixel;
 #endif
 
+typedef int16_t kvz_pixel_im;  // For intermediate precision (interpolation/bipred).
+
 /**
  * \brief Opaque data structure representing one instance of the encoder.
  */
@@ -248,6 +250,11 @@
   KVZ_FORMAT_YUV = 2
 };
 
+enum kvz_roi_format
+{
+  KVZ_ROI_TXT = 0,
+  KVZ_ROI_BIN = 1
+};
 
 // Map from input format to chroma format.
 #define KVZ_FORMAT2CSP(format) ((enum kvz_chroma_format)"\0\1\2\3"format)
@@ -386,10 +393,9 @@
   int32_t implicit_rdpcm; /*!< \brief Enable implicit residual DPCM. */
 
   struct {
-    int32_t width;
-    int32_t height;
-    int8_t *dqps;
-  } roi; /*!< \since 3.14.0 \brief Map of delta QPs for region of interest coding. */
+    char *file_path;
+    enum kvz_roi_format format;
+  } roi; /*!< \brief Specify delta QPs for region of interest coding. */
 
   unsigned slices; /*!< \since 3.15.0 \brief How to map slices to frame. */
 
@@ -477,6 +483,15 @@
 
   char *fastrd_learning_outdir_fn;
 
+  /** \brief whether to try combining intra cus at the lower depth when search
+   *         is not performed at said depth*/
+  uint8_t combine_intra_cus;
+
+  uint8_t force_inter;
+
+  uint8_t intra_chroma_search;
+
+  uint8_t fast_bipred;
 } kvz_config;
 
 /**
@@ -508,6 +523,14 @@
   enum kvz_chroma_format chroma_format;
 
   int32_t ref_pocs16;
+
+  struct
+  {
+    int width;
+    int height;
+    int8_t *roi_array;
+  } roi;
+
 } kvz_picture;
 
 /**
@@ -752,6 +775,9 @@
    * the bitstream, length of the bitstream, the reconstructed frame, the
    * original frame and frame info in data_out, len_out, pic_out, src_out and
    * info_out, respectively. Otherwise, set the output parameters to NULL.
+   * 
+   * Region of interest (ROI) / delta QP map can be specified in the input
+   * picture's ROI field but only when a ROI file is not used.
    *
    * After passing all of the input frames, the caller should keep calling this
    * function with pic_in set to NULL, until no more data is returned in the

kvazaar-2.1.0.tar.gz/src/rate_control.c -> kvazaar-2.2.0.tar.gz/src/rate_control.c Changed

@@ -1085,21 +1085,23 @@
   const encoder_control_t * const ctrl = state->encoder_control;
   lcu_stats_t *lcu = kvz_get_lcu_stats(state, pos.x, pos.y);
 
-  if (ctrl->cfg.roi.dqps != NULL) {
-    vector2d_t lcu = {
+  if (state->tile->frame->source->roi.roi_array) {
+    vector2d_t lcu_vec = {
       pos.x + state->tile->lcu_offset_x,
       pos.y + state->tile->lcu_offset_y
     };
     vector2d_t roi = {
-      lcu.x * ctrl->cfg.roi.width / ctrl->in.width_in_lcu,
-      lcu.y * ctrl->cfg.roi.height / ctrl->in.height_in_lcu
+      lcu_vec.x * state->tile->frame->source->roi.width / ctrl->in.width_in_lcu,
+      lcu_vec.y * state->tile->frame->source->roi.height / ctrl->in.height_in_lcu
     };
-    int roi_index = roi.x + roi.y * ctrl->cfg.roi.width;
-    int dqp = ctrl->cfg.roi.dqpsroi_index;
+    int roi_index = roi.x + roi.y * state->tile->frame->source->roi.width;
+    int dqp = state->tile->frame->source->roi.roi_arrayroi_index;
+    if(dqp != 0) {
+      pos.x = 0;
+    }
     state->qp = CLIP_TO_QP(state->frame->QP + dqp);
     state->lambda = qp_to_lambda(state, state->qp);
     state->lambda_sqrt = sqrt(state->lambda);
-
   }
   else if (ctrl->cfg.target_bitrate > 0) {
     const uint32_t pixels    = MIN(LCU_WIDTH, state->tile->frame->width  - LCU_WIDTH * pos.x) *

kvazaar-2.1.0.tar.gz/src/rdo.c -> kvazaar-2.2.0.tar.gz/src/rdo.c Changed

@@ -165,17 +165,17 @@
 {
 #define RD_SAMPLING_MAX_FN_LENGTH 4095
   static const char *basename_tmpl = "/%02i.txt";
-  char fn_templateRD_SAMPLING_MAX_FN_LENGTH + 1;
+  char fn_templateRD_SAMPLING_MAX_FN_LENGTH + 1 = {0};
   char fnRD_SAMPLING_MAX_FN_LENGTH + 1;
   int rv = 0, qp;
 
   // As long as QP is a two-digit number, template and produced string should
   // be equal in length ("%i" -> "22")
   assert(RD_SAMPLING_MAX_LAST_QP <= 99);
-  assert(strlen(fn_template) <= RD_SAMPLING_MAX_FN_LENGTH);
 
   strncpy(fn_template, dir_path, RD_SAMPLING_MAX_FN_LENGTH);
   strncat(fn_template, basename_tmpl, RD_SAMPLING_MAX_FN_LENGTH - strlen(dir_path));
+  assert(strlen(fn_template) <= RD_SAMPLING_MAX_FN_LENGTH);
 
   for (qp = 0; qp <= RD_SAMPLING_MAX_LAST_QP; qp++) {
     pthread_mutex_t *curr = outfile_mutex + qp;
@@ -233,7 +233,7 @@
  *
  * \returns bits needed to code input coefficients
  */
-static INLINE uint32_t get_coeff_cabac_cost(
+static INLINE double get_coeff_cabac_cost(
     const encoder_state_t * const state,
     const coeff_t *coeff,
     int32_t width,
@@ -253,12 +253,11 @@
   // Take a copy of the CABAC so that we don't overwrite the contexts when
   // counting the bits.
   cabac_data_t cabac_copy;
-  memcpy(&cabac_copy, &state->cabac, sizeof(cabac_copy));
+  memcpy(&cabac_copy, &state->search_cabac, sizeof(cabac_copy));
 
   // Clear bytes and bits and set mode to "count"
   cabac_copy.only_count = 1;
-  cabac_copy.num_buffered_bytes = 0;
-  cabac_copy.bits_left = 23;
+  double bits = 0;
 
   // Execute the coding function.
   // It is safe to drop the const modifier since state won't be modified
@@ -269,12 +268,15 @@
                        width,
                        type,
                        scan_mode,
-                       0);
-
-  return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3);
+                       0,
+                       &bits);
+  if(cabac_copy.update) {
+    memcpy((cabac_data_t *)&state->search_cabac, &cabac_copy, sizeof(cabac_copy));
+  }
+  return bits;
 }
 
-static INLINE void save_ccc(int qp, const coeff_t *coeff, int32_t size, uint32_t ccc)
+static INLINE void save_ccc(int qp, const coeff_t *coeff, int32_t size, double ccc)
 {
   pthread_mutex_t *mtx = outfile_mutex + qp;
 
@@ -290,14 +292,14 @@
   pthread_mutex_unlock(mtx);
 }
 
-static INLINE void save_accuracy(int qp, uint32_t ccc, uint32_t fast_cost)
+static INLINE void save_accuracy(int qp, double ccc, double fast_cost)
 {
   pthread_mutex_t *mtx = outfile_mutex + qp;
 
   assert(qp <= RD_SAMPLING_MAX_LAST_QP);
 
   pthread_mutex_lock(mtx);
-  fprintf(fastrd_learning_outfileqp, "%u %u\n", fast_cost, ccc);
+  fprintf(fastrd_learning_outfileqp, "%f %f\n", fast_cost, ccc);
   pthread_mutex_unlock(mtx);
 }
 
@@ -310,7 +312,7 @@
  *
  * \returns       number of bits needed to code coefficients
  */
-uint32_t kvz_get_coeff_cost(const encoder_state_t * const state,
+double kvz_get_coeff_cost(const encoder_state_t * const state,
                             const coeff_t *coeff,
                             int32_t width,
                             int32_t type,
@@ -329,15 +331,15 @@
       return UINT32_MAX; // Hush little compiler don't you cry, not really gonna return anything after assert(0)
     } else {
       uint64_t weights = kvz_fast_coeff_get_weights(state);
-      uint32_t fast_cost = kvz_fast_coeff_cost(coeff, width, weights);
+      double fast_cost = kvz_fast_coeff_cost(coeff, width, weights);
       if (check_accuracy) {
-        uint32_t ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
+        double ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
         save_accuracy(state->qp, ccc, fast_cost);
       }
       return fast_cost;
     }
   } else {
-    uint32_t ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
+    double ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
     if (save_cccs) {
       save_ccc(state->qp, coeff, width * width, ccc);
     }
@@ -1007,37 +1009,33 @@
 /**
  * Calculate cost of actual motion vectors using CABAC coding
  */
-uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state,
-                                       const cabac_data_t* cabac,
-                                       const int32_t mvd_hor,
-                                       const int32_t mvd_ver)
+double kvz_get_mvd_coding_cost_cabac(const encoder_state_t* state,
+                                     const cabac_data_t* cabac,
+                                     const int32_t mvd_hor,
+                                     const int32_t mvd_ver)
 {
   cabac_data_t cabac_copy = *cabac;
   cabac_copy.only_count = 1;
-
+  double bits = 0;
   // It is safe to drop const here because cabac->only_count is set.
-  kvz_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver);
-
-  uint32_t bitcost =
-    ((23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3)) -
-    ((23 - cabac->bits_left)     + (cabac->num_buffered_bytes << 3));
+  kvz_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver, &bits);
 
-  return bitcost;
+  return bits;
 }
 
 /** MVD cost calculation with CABAC
 * \returns int
 * Calculates Motion Vector cost and related costs using CABAC coding
 */
-uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
-                                 int x,
-                                 int y,
-                                 int mv_shift,
-                                 int16_t mv_cand22,
-                                 inter_merge_cand_t merge_candMRG_MAX_NUM_CANDS,
-                                 int16_t num_cand,
-                                 int32_t ref_idx,
-                                 uint32_t *bitcost)
+double kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
+                               int x,
+                               int y,
+                               int mv_shift,
+                               int16_t mv_cand22,
+                               inter_merge_cand_t merge_candMRG_MAX_NUM_CANDS,
+                               int16_t num_cand,
+                               int32_t ref_idx,
+                               double* bitcost)
 {
   cabac_data_t state_cabac_copy;
   cabac_data_t* cabac;
@@ -1064,14 +1062,13 @@
   }
 
   // Store cabac state and contexts
-  memcpy(&state_cabac_copy, &state->cabac, sizeof(cabac_data_t));
+  memcpy(&state_cabac_copy, &state->search_cabac, sizeof(cabac_data_t));
 
   // Clear bytes and bits and set mode to "count"
   state_cabac_copy.only_count = 1;
-  state_cabac_copy.num_buffered_bytes = 0;
-  state_cabac_copy.bits_left = 23;
 
   cabac = &state_cabac_copy;
+  double bits = 0;
 
   if (!merged) {
     vector2d_t mvd1 = {
@@ -1082,8 +1079,8 @@
       x - mv_cand10,
       y - mv_cand11,
     };
-    uint32_t cand1_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y);
-    uint32_t cand2_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y);
+    double cand1_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y);
+    double cand2_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y);
 
     // Select candidate 1 if it has lower cost
     if (cand2_cost < cand1_cost) {
@@ -1096,7 +1093,7 @@
 
   cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model);
 
-  CABAC_BIN(cabac, merged, "MergeFlag");
+  CABAC_FBITS_UPDATE(cabac, &(cabac->ctx.cu_merge_flag_ext_model), merged, bits, "MergeFlag");
   num_cand = state->encoder_control->cfg.max_merge;
   if (merged) {
     if (num_cand > 1) {

kvazaar-2.1.0.tar.gz/src/rdo.h -> kvazaar-2.2.0.tar.gz/src/rdo.h Changed

@@ -54,7 +54,7 @@
 void  kvz_rdoq(encoder_state_t *state, coeff_t *coef, coeff_t *dest_coeff, int32_t width,
            int32_t height, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth);
 
-uint32_t kvz_get_coeff_cost(const encoder_state_t * const state,
+double kvz_get_coeff_cost(const encoder_state_t * const state,
                             const coeff_t *coeff,
                             int32_t width,
                             int32_t type,
@@ -71,10 +71,10 @@
 
 kvz_mvd_cost_func kvz_calc_mvd_cost_cabac;
 
-uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state,
-                                       const cabac_data_t* cabac,
-                                       int32_t mvd_hor,
-                                       int32_t mvd_ver);
+double kvz_get_mvd_coding_cost_cabac(const encoder_state_t* state,
+                                     const cabac_data_t* cabac,
+                                     int32_t mvd_hor,
+                                     int32_t mvd_ver);
 
 // Number of fixed point fractional bits used in the fractional bit table.
 #define CTX_FRAC_BITS 15
@@ -85,7 +85,5 @@
 #define CTX_ENTROPY_BITS(ctx, val) kvz_entropy_bits(ctx)->uc_state ^ (val)
 
 // Floating point fractional bits, derived from kvz_entropy_bits
-extern const float kvz_f_entropy_bits128;
-#define CTX_ENTROPY_FBITS(ctx, val) kvz_f_entropy_bits(ctx)->uc_state ^ (val)
 
 #endif

kvazaar-2.1.0.tar.gz/src/sao.c -> kvazaar-2.2.0.tar.gz/src/sao.c Changed

@@ -49,63 +49,64 @@
 }
 
 
-static float sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left)
+static double sao_mode_bits_none(const encoder_state_t * const state, sao_info_t *sao_top, sao_info_t *sao_left)
 {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
   // FL coded merges.
   if (sao_left != NULL) {
     ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
   if (sao_top != NULL) {    
     ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
 
   // TR coded type_idx_, none = 0
   ctx = &(cabac->ctx.sao_type_idx_model);
-  mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+  CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_type");
 
   return mode_bits;
 }
 
-static float sao_mode_bits_merge(const encoder_state_t * const state,
+static double sao_mode_bits_merge(const encoder_state_t * const state,
                                  int8_t merge_cand) {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
   // FL coded merges.
   ctx = &(cabac->ctx.sao_merge_flag_model);
 
-  mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 1);
+  CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 1, mode_bits, "sao_merge_flag");
   if (merge_cand == 1) return mode_bits;
-  mode_bits += CTX_ENTROPY_FBITS(ctx, merge_cand == 2);
+  CABAC_FBITS_UPDATE(cabac, ctx, merge_cand == 2, mode_bits, "sao_merge_flag");
   return mode_bits;
 }
 
 
-static float sao_mode_bits_edge(const encoder_state_t * const state,
+static double sao_mode_bits_edge(const encoder_state_t * const state,
                               int edge_class, int offsetsNUM_SAO_EDGE_CATEGORIES,
                               sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt)
 {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
   // FL coded merges.
   if (sao_left != NULL) {
-    ctx = &(cabac->ctx.sao_merge_flag_model);   
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    ctx = &(cabac->ctx.sao_merge_flag_model);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
   if (sao_top != NULL) {
     ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
 
   // TR coded type_idx_, edge = 2 = cMax
   ctx = &(cabac->ctx.sao_type_idx_model);
-  mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0;
+  CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type");
+  mode_bits += 1.0;
 
   // TR coded offsets.
   for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++) {
@@ -126,26 +127,27 @@
 }
 
 
-static float sao_mode_bits_band(const encoder_state_t * const state,
+static double sao_mode_bits_band(const encoder_state_t * const state,
                               int band_position2, int offsets10,
                               sao_info_t *sao_top, sao_info_t *sao_left, unsigned buf_cnt)
 {
-  float mode_bits = 0.0;
-  const cabac_data_t * const cabac = &state->cabac;
-  const cabac_ctx_t *ctx = NULL;
+  double mode_bits = 0.0;
+  cabac_data_t * cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = NULL;
   // FL coded merges.
   if (sao_left != NULL) {
     ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
   if (sao_top != NULL) {
     ctx = &(cabac->ctx.sao_merge_flag_model);
-    mode_bits += CTX_ENTROPY_FBITS(ctx, 0);
+    CABAC_FBITS_UPDATE(cabac, ctx, 0, mode_bits, "sao_merge_flag");
   }
 
   // TR coded sao_type_idx_, band = 1
   ctx = &(cabac->ctx.sao_type_idx_model);
-  mode_bits += CTX_ENTROPY_FBITS(ctx, 1) + 1.0;
+  CABAC_FBITS_UPDATE(cabac, ctx, 1, mode_bits, "sao_type");
+  mode_bits += 1.0;
 
   // TR coded offsets and possible FL coded offset signs.
   for (unsigned buf_index = 0; buf_index < buf_cnt; buf_index++)
@@ -552,7 +554,8 @@
   // Choose between SAO and doing nothing, taking into account the
   // rate-distortion cost of coding do nothing.
   {
-    int cost_of_nothing = (int)(sao_mode_bits_none(state, sao_top, sao_left) * state->lambda + 0.5);
+    float mode_bits_none = sao_mode_bits_none(state, sao_top, sao_left);
+    int cost_of_nothing = (int)(mode_bits_none * state->lambda + 0.5);
     if (sao_out->ddistortion >= cost_of_nothing) {
       sao_out->type = SAO_TYPE_NONE;
       merge_cost0 = cost_of_nothing;

kvazaar-2.1.0.tar.gz/src/search.c -> kvazaar-2.2.0.tar.gz/src/search.c Changed

@@ -37,6 +37,7 @@
 
 #include "cabac.h"
 #include "encoder.h"
+#include "encode_coding_tree.h"
 #include "imagelist.h"
 #include "inter.h"
 #include "intra.h"
@@ -59,14 +60,6 @@
 // Cost threshold for doing intra search in inter frames with --rd=0.
 static const int INTRA_THRESHOLD = 8;
 
-// Modify weight of luma SSD.
-#ifndef LUMA_MULT
-# define LUMA_MULT 0.8
-#endif
-// Modify weight of chroma SSD.
-#ifndef CHROMA_MULT
-# define CHROMA_MULT 1.5
-#endif
 
 static INLINE void copy_cu_info(int x_local, int y_local, int width, lcu_t *from, lcu_t *to)
 {
@@ -215,16 +208,16 @@
   const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2);
 
   double ssd = 0.0;
-  ssd += LUMA_MULT * kvz_pixels_calc_ssd(
+  ssd += KVZ_LUMA_MULT * kvz_pixels_calc_ssd(
     &lcu->ref.yluma_index, &lcu->rec.yluma_index,
     LCU_WIDTH, LCU_WIDTH, cu_width
     );
   if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != KVZ_CSP_400) {
-    ssd += CHROMA_MULT * kvz_pixels_calc_ssd(
+    ssd += KVZ_CHROMA_MULT * kvz_pixels_calc_ssd(
       &lcu->ref.uchroma_index, &lcu->rec.uchroma_index,
       LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
       );
-    ssd += CHROMA_MULT * kvz_pixels_calc_ssd(
+    ssd += KVZ_CHROMA_MULT * kvz_pixels_calc_ssd(
       &lcu->ref.vchroma_index, &lcu->rec.vchroma_index,
       LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2
       );
@@ -246,11 +239,12 @@
 * prediction unit data needs to be coded.
 */
 double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
-                       const int x_px, const int y_px, const int depth,
-                       const cu_info_t *const pred_cu,
-                       lcu_t *const lcu)
+                           const int x_px, const int y_px, const int depth,
+                           const cu_info_t *const pred_cu,
+                           lcu_t *const lcu)
 {
   const int width = LCU_WIDTH >> depth;
+  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0);
 
   // cur_cu is used for TU parameters.
   cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
@@ -264,14 +258,25 @@
 
   const uint8_t tr_depth = tr_cu->tr_depth - depth;
 
+  cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac;
+
   // Add transform_tree split_transform_flag bit cost.
   bool intra_split_flag = pred_cu->type == CU_INTRA && pred_cu->part_size == SIZE_NxN && depth == 3;
+  int max_tr_depth;
+  if (pred_cu->type == CU_INTRA) {
+    max_tr_depth = state->encoder_control->cfg.tr_depth_intra + intra_split_flag;
+  }
+  else {
+    max_tr_depth = state->encoder_control->tr_depth_inter;
+  }
   if (width <= TR_MAX_WIDTH
       && width > TR_MIN_WIDTH
-      && !intra_split_flag)
+      && !intra_split_flag
+      && MIN(tr_cu->tr_depth, depth) - tr_cu->depth < max_tr_depth
+      && !skip_residual_coding)
   {
-    const cabac_ctx_t *ctx = &(state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]);
-    tr_tree_bits += CTX_ENTROPY_FBITS(ctx, tr_depth > 0);
+    cabac_ctx_t *ctx = &(cabac->ctx.trans_subdiv_model[5 - (6 - depth)]);
+    CABAC_FBITS_UPDATE(cabac, ctx, tr_depth > 0, tr_tree_bits, "tr_split_search");
   }
 
   if (tr_depth > 0) {
@@ -286,14 +291,35 @@
     return sum + tr_tree_bits * state->lambda;
   }
 
+
+  if (cabac->update && tr_cu->tr_depth == tr_cu->depth && !skip_residual_coding) {
+    // Because these need to be coded before the luma cbf they also need to be counted
+    // before the cabac state changes. However, since this branch is only executed when
+    // calculating the last RD cost it is not problem to include the chroma cbf costs in
+    // luma, because the chroma cost is calculated right after the luma cost.
+    // However, if we have different tr_depth, the bits cannot be written in correct
+    // order anyways so do not touch the chroma cbf here.
+    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+      cabac_ctx_t* cr_ctx = &(cabac->ctx.qt_cbf_model_chroma[depth - tr_cu->depth]);
+      cabac->cur_ctx = cr_ctx;
+      int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
+      int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V);
+      CABAC_FBITS_UPDATE(cabac, cr_ctx, u_is_set, tr_tree_bits, "cbf_cb_search");
+      CABAC_FBITS_UPDATE(cabac, cr_ctx, v_is_set, tr_tree_bits, "cbf_cb_search");
+    }
+  }
+
   // Add transform_tree cbf_luma bit cost.
+  const int is_tr_split = tr_cu->tr_depth - tr_cu->depth;
+  int is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_Y);
   if (pred_cu->type == CU_INTRA ||
-      tr_depth > 0 ||
+      is_tr_split ||
       cbf_is_set(tr_cu->cbf, depth, COLOR_U) ||
       cbf_is_set(tr_cu->cbf, depth, COLOR_V))
   {
-    const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_luma[!tr_depth]);
-    tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_Y));
+    cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_luma[!is_tr_split]);
+
+    CABAC_FBITS_UPDATE(cabac, ctx, is_set, tr_tree_bits, "cbf_y_search");
   }
 
   // SSD between reconstruction and original
@@ -305,26 +331,29 @@
                                         width);
   }
 
-  {
+
+  if (!skip_residual_coding) {
     int8_t luma_scan_mode = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
     const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];
 
-    coeff_bits += kvz_get_coeff_cost(state, coeffs, width, 0, luma_scan_mode);
+    if(is_set)
+      coeff_bits += kvz_get_coeff_cost(state, coeffs, width, 0, luma_scan_mode);
   }
 
   double bits = tr_tree_bits + coeff_bits;
-  return (double)ssd * LUMA_MULT + bits * state->lambda;
+  return (double)ssd * KVZ_LUMA_MULT + bits * state->lambda;
 }
 
 
 double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
-                         const int x_px, const int y_px, const int depth,
-                         const cu_info_t *const pred_cu,
-                         lcu_t *const lcu)
+                             const int x_px, const int y_px, const int depth,
+                             const cu_info_t *const pred_cu,
+                             lcu_t *const lcu)
 {
   const vector2d_t lcu_px = { x_px / 2, y_px / 2 };
   const int width = (depth <= MAX_DEPTH) ? LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth;
   cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px);
+  const int skip_residual_coding = pred_cu->skipped || (pred_cu->type == CU_INTER && pred_cu->cbf == 0);
 
   double tr_tree_bits = 0;
   double coeff_bits = 0;
@@ -338,20 +367,25 @@
     return 0;
   }
 
-  if (depth < MAX_PU_DEPTH) {
+  int u_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_U);
+  int v_is_set = cbf_is_set(pred_cu->cbf, depth, COLOR_V);
+  // See luma for why the second condition
+  if (depth < MAX_PU_DEPTH && (!state->search_cabac.update || tr_cu->tr_depth != tr_cu->depth) && !skip_residual_coding) {
     const int tr_depth = depth - pred_cu->depth;
-    const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_chroma[tr_depth]);
+    cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;
+    cabac_ctx_t *ctx = &(cabac->ctx.qt_cbf_model_chroma[tr_depth]);
+    cabac->cur_ctx = ctx;
     if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
-      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U));
+      CABAC_FBITS_UPDATE(cabac, ctx, u_is_set, tr_tree_bits, "cbf_cb_search");
     }
     if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
-      tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V));
+      CABAC_FBITS_UPDATE(cabac, ctx, v_is_set, tr_tree_bits, "cbf_cb_search");
     }
   }
 
   if (tr_cu->tr_depth > depth) {
     int offset = LCU_WIDTH >> (depth + 1);
-    int sum = 0;
+    double sum = 0;
 
     sum += kvz_cu_rd_cost_chroma(state, x_px, y_px, depth + 1, pred_cu, lcu);
     sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px, depth + 1, pred_cu, lcu);
@@ -374,16 +408,147 @@
     ssd = ssd_u + ssd_v;
   }
 
+  if (!skip_residual_coding)
   {

kvazaar-2.1.0.tar.gz/src/search.h -> kvazaar-2.2.0.tar.gz/src/search.h Changed

@@ -44,18 +44,48 @@
 #include "image.h"
 #include "constraint.h"
 
+#define MAX_UNIT_STATS_MAP_SIZE MAX(MAX_REF_PIC_COUNT, MRG_MAX_NUM_CANDS)
+
+ // Modify weight of luma SSD.
+#ifndef KVZ_LUMA_MULT
+# define KVZ_LUMA_MULT 0.8
+#endif
+// Modify weight of chroma SSD.
+#ifndef KVZ_CHROMA_MULT
+# define KVZ_CHROMA_MULT 1.5
+#endif
+
+ /**
+  *  \brief Data collected during search processes.
+  * 
+  *         The intended use is to collect statistics of the
+  *         searched coding/prediction units. Data related to
+  *         a specific unit is found at index i. The arrays
+  *         should be indexed by elements of the "keys" array
+  *         that will be sorted by the RD costs of the units.         
+  */
+typedef struct unit_stats_map_t {
+
+  cu_info_t unit[MAX_UNIT_STATS_MAP_SIZE]; //!< list of searched units
+  double    cost[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching RD costs
+  double    bits[MAX_UNIT_STATS_MAP_SIZE]; //!< list of matching bit costs  
+  int8_t    keys[MAX_UNIT_STATS_MAP_SIZE]; //!< list of keys (indices) to elements in the other arrays
+  int       size;                    //!< number of active elements in the lists
+} unit_stats_map_t;
+
 void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length);
+void kvz_sort_keys_by_cost(unit_stats_map_t *__restrict map);
 
 void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf);
 
 double kvz_cu_rd_cost_luma(const encoder_state_t *const state,
-                       const int x_px, const int y_px, const int depth,
-                       const cu_info_t *const pred_cu,
-                       lcu_t *const lcu);
+                           const int x_px, const int y_px, const int depth,
+                           const cu_info_t *const pred_cu,
+                           lcu_t *const lcu);
 double kvz_cu_rd_cost_chroma(const encoder_state_t *const state,
-                         const int x_px, const int y_px, const int depth,
-                         const cu_info_t *const pred_cu,
-                         lcu_t *const lcu);
+                             const int x_px, const int y_px, const int depth,
+                             const cu_info_t *const pred_cu,
+                             lcu_t *const lcu);
 void kvz_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth);
 
 void kvz_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);

kvazaar-2.1.0.tar.gz/src/search_inter.c -> kvazaar-2.2.0.tar.gz/src/search_inter.c Changed

@@ -37,6 +37,7 @@
 
 #include "cabac.h"
 #include "encoder.h"
+#include "encode_coding_tree.h"
 #include "image.h"
 #include "imagelist.h"
 #include "inter.h"
@@ -68,7 +69,7 @@
   /**
    * \brief Top-left corner of the PU
    */
-  const vector2d_t origin;
+  vector2d_t origin;
   int32_t width;
   int32_t height;
 
@@ -79,19 +80,6 @@
   kvz_mvd_cost_func *mvd_cost_func;
 
   /**
-   * \brief Best motion vector among the ones tested so far
-   */
-  vector2d_t best_mv;
-  /**
-   * \brief Cost of best_mv
-   */
-  uint32_t best_cost;
-  /**
-   * \brief Bit cost of best_mv
-   */
-  uint32_t best_bitcost;
-
-  /**
    * \brief Possible optimized SAD implementation for the width, leave as
    *        NULL for arbitrary-width blocks
    */
@@ -203,20 +191,25 @@
 /**
  * \brief Calculate cost for an integer motion vector.
  *
- * Updates info->best_mv, info->best_cost and info->best_bitcost to the new
+ * Updates best_mv, best_cost and best_bitcost to the new
  * motion vector if it yields a lower cost than the current one.
  *
  * If the motion vector violates the MV constraints for tiles or WPP, the
  * cost is not set.
  *
- * \return true if info->best_mv was changed, false otherwise
+ * \return true if best_mv was changed, false otherwise
  */
-static bool check_mv_cost(inter_search_info_t *info, int x, int y)
+static bool check_mv_cost(inter_search_info_t *info,
+                          int x,
+                          int y,
+                          double *best_cost,
+                          double* best_bits,
+                          vector2d_t *best_mv)
 {
   if (!intmv_within_tile(info, x, y)) return false;
 
-  uint32_t bitcost = 0;
-  uint32_t cost = kvz_image_calc_sad(
+  double bitcost = 0;
+  double cost = kvz_image_calc_sad(
       info->pic,
       info->ref,
       info->origin.x,
@@ -228,25 +221,25 @@
       info->optimized_sad
   );
 
-  if (cost >= info->best_cost) return false;
+  if (cost >= *best_cost) return false;
 
   cost += info->mvd_cost_func(
       info->state,
       x, y, 2,
       info->mv_cand,
-      info->merge_cand,
-      info->num_merge_cand,
+      NULL,
+      0,
       info->ref_idx,
       &bitcost
   );
 
-  if (cost >= info->best_cost) return false;
+  if (cost >= *best_cost) return false;
 
   // Set to motion vector in quarter pixel precision.
-  info->best_mv.x = x * 4;
-  info->best_mv.y = y * 4;
-  info->best_cost = cost;
-  info->best_bitcost = bitcost;
+  best_mv->x = x * 4;
+  best_mv->y = y * 4;
+  *best_cost = cost;
+  *best_bits = bitcost;
 
   return true;
 }
@@ -254,10 +247,10 @@
 
 static unsigned get_ep_ex_golomb_bitcost(unsigned symbol)
 {
-  // Calculate 2 * log2(symbol + 2)
+  // Calculate 2 * log2(symbol )
 
   unsigned bins = 0;
-  symbol += 2;
+  symbol += 0;
   if (symbol >= 1 << 8) { bins += 16; symbol >>= 8; }
   if (symbol >= 1 << 4) { bins += 8; symbol >>= 4; }
   if (symbol >= 1 << 2) { bins += 4; symbol >>= 2; }
@@ -297,12 +290,16 @@
  * \brief Select starting point for integer motion estimation search.
  *
  * Checks the zero vector, extra_mv and merge candidates and updates
- * info->best_mv to the best one.
+ * best_mv to the best one.
  */
-static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv)
+static void select_starting_point(inter_search_info_t *info,
+                                  vector2d_t extra_mv,
+                                  double *best_cost,
+                                  double* best_bits,
+                                  vector2d_t *best_mv)
 {
   // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list.
-  check_mv_cost(info, 0, 0);
+  check_mv_cost(info, 0, 0, best_cost, best_bits, best_mv);
 
   // Change to integer precision.
   extra_mv.x >>= 2;
@@ -310,7 +307,7 @@
 
   // Check mv_in if it's not one of the merge candidates.
   if ((extra_mv.x != 0 || extra_mv.y != 0) && !mv_in_merge(info, extra_mv)) {
-    check_mv_cost(info, extra_mv.x, extra_mv.y);
+    check_mv_cost(info, extra_mv.x, extra_mv.y, best_cost, best_bits, best_mv);
   }
 
   // Go through candidates
@@ -322,24 +319,26 @@
 
     if (x == 0 && y == 0) continue;
 
-    check_mv_cost(info, x, y);
+    check_mv_cost(info, x, y, best_cost, best_bits, best_mv);
   }
 }
 
 
-static uint32_t get_mvd_coding_cost(const encoder_state_t *state,
-                                    const cabac_data_t* cabac,
-                                    const int32_t mvd_hor,
-                                    const int32_t mvd_ver)
+static double get_mvd_coding_cost(const encoder_state_t* state,
+  const cabac_data_t* cabac,
+  const int32_t mvd_hor,
+  const int32_t mvd_ver)
 {
-  unsigned bitcost = 0;
+  double bitcost = 4 << CTX_FRAC_BITS;
   const vector2d_t abs_mvd = { abs(mvd_hor), abs(mvd_ver) };
+  bitcost += abs_mvd.x == 1 ? 1 << CTX_FRAC_BITS : (0 * (1 << CTX_FRAC_BITS));
+  bitcost += abs_mvd.y == 1 ? 1 << CTX_FRAC_BITS : (0 * (1 << CTX_FRAC_BITS));
 
   bitcost += get_ep_ex_golomb_bitcost(abs_mvd.x) << CTX_FRAC_BITS;
   bitcost += get_ep_ex_golomb_bitcost(abs_mvd.y) << CTX_FRAC_BITS;
 
   // Round and shift back to integer bits.
-  return (bitcost + CTX_FRAC_HALF_BIT) >> CTX_FRAC_BITS;
+  return bitcost / (1 << CTX_FRAC_BITS);
 }
 
 
@@ -347,7 +346,7 @@
                           int16_t mv_cand[2][2],
                           int32_t mv_x,
                           int32_t mv_y,
-                          uint32_t *cost_out)
+                          double*cost_out)
 {
   const bool same_cand =
     (mv_cand[0][0] == mv_cand[1][0] && mv_cand[0][1] == mv_cand[1][1]);
@@ -357,7 +356,7 @@
     return 0;
   }
 
-  uint32_t (*mvd_coding_cost)(const encoder_state_t * const state,
+  double (*mvd_coding_cost)(const encoder_state_t * const state,
                               const cabac_data_t*,
                               int32_t, int32_t);
   if (state->encoder_control->cfg.mv_rdo) {
@@ -366,12 +365,12 @@
     mvd_coding_cost = get_mvd_coding_cost;
   }

kvazaar-2.1.0.tar.gz/src/search_inter.h -> kvazaar-2.2.0.tar.gz/src/search_inter.h Changed

@@ -64,20 +64,20 @@
   HPEL_POS_DIA = 2
 };
 
-typedef uint32_t kvz_mvd_cost_func(const encoder_state_t *state,
+typedef double kvz_mvd_cost_func(const encoder_state_t *state,
                                   int x, int y,
                                   int mv_shift,
                                   int16_t mv_cand[2][2],
                                   inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
                                   int16_t num_cand,
                                   int32_t ref_idx,
-                                  uint32_t *bitcost);
+                                  double *bitcost);
 
 void kvz_search_cu_inter(encoder_state_t * const state,
                          int x, int y, int depth,
                          lcu_t *lcu,
                          double *inter_cost,
-                         uint32_t *inter_bitcost);
+                         double* inter_bitcost);
 
 void kvz_search_cu_smp(encoder_state_t * const state,
                        int x, int y,
@@ -85,12 +85,20 @@
                        part_mode_t part_mode,
                        lcu_t *lcu,
                        double *inter_cost,
-                       uint32_t *inter_bitcost);
+                       double* inter_bitcost);
 
 
 unsigned kvz_inter_satd_cost(const encoder_state_t* state,
                              const lcu_t *lcu,
                              int x,
                              int y);
+void kvz_cu_cost_inter_rd2(encoder_state_t* const state,
+  int x, int y, int depth,
+  cu_info_t* cur_cu,
+  lcu_t* lcu,
+  double* inter_cost,
+  double* inter_bitcost);
+
+int kvz_get_skip_context(int x, int y, lcu_t* const lcu, cu_array_t* const cu_a);
 
 #endif // SEARCH_INTER_H_

kvazaar-2.1.0.tar.gz/src/search_intra.c -> kvazaar-2.2.0.tar.gz/src/search_intra.c Changed

@@ -98,11 +98,11 @@
 
     // Add the offset bit costs of signaling 'luma and chroma use trskip',
     // versus signaling 'luma and chroma don't use trskip' to the SAD cost.
-    const cabac_ctx_t *ctx = &state->cabac.ctx.transform_skip_model_luma;
+    const cabac_ctx_t *ctx = &state->search_cabac.ctx.transform_skip_model_luma;
     double trskip_bits = CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0);
 
     if (state->encoder_control->chroma_format != KVZ_CSP_400) {
-      ctx = &state->cabac.ctx.transform_skip_model_chroma;
+      ctx = &state->search_cabac.ctx.transform_skip_model_chroma;
       trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0));
     }
 
@@ -248,7 +248,7 @@
   //     max_depth.
   // - Min transform size hasn't been reached (MAX_PU_DEPTH).
   if (depth < max_depth && depth < MAX_PU_DEPTH) {
-    split_cost = 3 * state->lambda;
+    split_cost = 0;
 
     split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu);
     if (split_cost < nosplit_cost) {
@@ -267,8 +267,8 @@
     // Add bits for split_transform_flag = 1, because transform depth search bypasses
     // the normal recursion in the cost functions.
     if (depth >= 1 && depth <= 3) {
-      const cabac_ctx_t *ctx = &(state->cabac.ctx.trans_subdiv_model[5 - (6 - depth)]);
-      tr_split_bit += CTX_ENTROPY_FBITS(ctx, 1);
+      cabac_ctx_t *ctx = &(state->search_cabac.ctx.trans_subdiv_model[5 - (6 - depth)]);
+      CABAC_FBITS_UPDATE(&state->search_cabac, ctx, 1, tr_split_bit, "tr_split");
     }
 
     // Add cost of cbf chroma bits on transform tree.
@@ -280,12 +280,12 @@
     if (state->encoder_control->chroma_format != KVZ_CSP_400) {
       const uint8_t tr_depth = depth - pred_cu->depth;
 
-      const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_chroma[tr_depth]);
+      cabac_ctx_t *ctx = &(state->search_cabac.ctx.qt_cbf_model_chroma[tr_depth]);
       if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
-        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U));
+        CABAC_FBITS_UPDATE(&state->search_cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U), cbf_bits, "cbf_cb");
       }
       if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
-        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V));
+        CABAC_FBITS_UPDATE(&state->search_cabac, ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V), cbf_bits, "cbf_cr");
       }
     }
 
@@ -524,9 +524,8 @@
 
   // Add prediction mode coding cost as the last thing. We don't want this
   // affecting the halving search.
-  int lambda_cost = (int)(state->lambda_sqrt + 0.5);
   for (int mode_i = 0; mode_i < modes_selected; ++mode_i) {
-    costs[mode_i] += lambda_cost * kvz_luma_mode_bits(state, modes[mode_i], intra_preds);
+    costs[mode_i] += state->lambda_sqrt * kvz_luma_mode_bits(state, modes[mode_i], intra_preds);
   }
 
   #undef PARALLEL_BLKS
@@ -594,21 +593,23 @@
   }
 
   for(int rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode ++) {
-    int rdo_bitcost = kvz_luma_mode_bits(state, modes[rdo_mode], intra_preds);
-    costs[rdo_mode] = rdo_bitcost * (int)(state->lambda + 0.5);
+    double rdo_bitcost = kvz_luma_mode_bits(state, modes[rdo_mode], intra_preds);
+    costs[rdo_mode] = rdo_bitcost * state->lambda;
 
     // Perform transform split search and save mode RD cost for the best one.
     cu_info_t pred_cu;
     pred_cu.depth = depth;
     pred_cu.type = CU_INTRA;
     pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N);
+    pred_cu.skipped = 0;
+    pred_cu.merged = 0;
     pred_cu.intra.mode = modes[rdo_mode];
     pred_cu.intra.mode_chroma = modes[rdo_mode];
     FILL(pred_cu.cbf, 0);
 
     // Reset transform split data in lcu.cu for this area.
     kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth);
-
+    
     double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu);
     costs[rdo_mode] += mode_cost;
 
@@ -622,6 +623,7 @@
   // Update order according to new costs
   kvz_sort_modes(modes, costs, modes_to_check);
 
+
   // The best transform split hierarchy is not saved anywhere, so to get the
   // transform split hierarchy the search has to be performed again with the
   // best mode.
@@ -642,7 +644,8 @@
 
 double kvz_luma_mode_bits(const encoder_state_t *state, int8_t luma_mode, const int8_t *intra_preds)
 {
-  double mode_bits;
+  cabac_data_t* cabac = (cabac_data_t *)&state->search_cabac;
+  double mode_bits = 0;
 
   bool mode_in_preds = false;
   for (int i = 0; i < 3; ++i) {
@@ -651,8 +654,21 @@
     }
   }
 
-  const cabac_ctx_t *ctx = &(state->cabac.ctx.intra_mode_model);
-  mode_bits = CTX_ENTROPY_FBITS(ctx, mode_in_preds);
+  cabac_ctx_t *ctx = &(cabac->ctx.intra_mode_model);
+  CABAC_FBITS_UPDATE(cabac, ctx, mode_in_preds, mode_bits, "prev_intra_luma_pred_flag_search");
+  if (state->search_cabac.update) {
+    if(mode_in_preds) {
+      CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[0]), "mpm_idx");
+      if(luma_mode != intra_preds[0]) {
+        CABAC_BIN_EP(cabac, !(luma_mode == intra_preds[1]), "mpm_idx");        
+      }
+    }
+    else {
+      // This value should be transformed for actual coding,
+      // but here the value does not actually matter, just that we write 5 bits
+      CABAC_BINS_EP(cabac, luma_mode, 5, "rem_intra_luma_pred_mode");
+    }
+  }
 
   if (mode_in_preds) {
     mode_bits += ((luma_mode == intra_preds[0]) ? 1 : 2);
@@ -666,12 +682,20 @@
 
 double kvz_chroma_mode_bits(const encoder_state_t *state, int8_t chroma_mode, int8_t luma_mode)
 {
-  const cabac_ctx_t *ctx = &(state->cabac.ctx.chroma_pred_model[0]);
-  double mode_bits;
-  if (chroma_mode == luma_mode) {
-    mode_bits = CTX_ENTROPY_FBITS(ctx, 0);
-  } else {
-    mode_bits = 2.0 + CTX_ENTROPY_FBITS(ctx, 1);
+  cabac_data_t* cabac = (cabac_data_t*)&state->search_cabac;
+  cabac_ctx_t *ctx = &(cabac->ctx.chroma_pred_model[0]);
+
+  double mode_bits = 0;
+  CABAC_FBITS_UPDATE(cabac, ctx, chroma_mode != luma_mode, mode_bits, "intra_chroma_pred_mode");
+  if (chroma_mode != luma_mode) {
+    mode_bits += 2.0;
+  }
+
+  if(cabac->update) {
+    if(chroma_mode != luma_mode) {
+      // Again it does not matter what we actually write here
+      CABAC_BINS_EP(cabac, 0, 2, "intra_chroma_pred_mode");      
+    }
   }
 
   return mode_bits;
@@ -706,9 +730,11 @@
                          depth,
                          -1, chroma.mode, // skip luma
                          NULL, lcu);
+      double bits = 0;
       chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu);
 
       double mode_bits = kvz_chroma_mode_bits(state, chroma.mode, intra_mode);
+      bits += mode_bits;
       chroma.cost += mode_bits * state->lambda;
 
       if (chroma.cost < best_chroma.cost) {
@@ -742,10 +768,11 @@
   // is always one of the modes, so 2 means the final decision is made
   // between luma mode and one other mode that looks the best
   // according to search_intra_chroma_rough.
-  const int8_t modes_in_depth[5] = { 1, 1, 1, 1, 2 };
-  int num_modes = modes_in_depth[depth];
-
-  if (state->encoder_control->cfg.rdo == 3) {
+  int num_modes = 2;
+  if(state->encoder_control->cfg.rdo >= 4 && depth == 4) {
+    num_modes = 5;
+  }
+  if (state->encoder_control->cfg.rdo >= 5) {
     num_modes = 5;
   }
 
@@ -792,7 +819,6 @@
                          int8_t *mode_out, double *cost_out)
 {
   const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
-  const int8_t cu_width = LCU_WIDTH >> depth;
   const int_fast8_t log2_width = LOG2_LCU_WIDTH - depth;
 
   cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
@@ -827,7 +853,7 @@
   kvz_pixel *ref_pixels = &lcu->ref.y[lcu_px.x + lcu_px.y * LCU_WIDTH];
 
   int8_t number_of_modes;
-  bool skip_rough_search = (depth == 0 || state->encoder_control->cfg.rdo >= 3);
+  bool skip_rough_search = (depth == 0 || state->encoder_control->cfg.rdo >= 5);
   if (!skip_rough_search) {

kvazaar-2.2.0.tar.gz/src/sei.h Added

@@ -0,0 +1,55 @@
+#ifndef UDU_SEI_H_
+#define UDU_SEI_H_
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (c) 2022, Tampere University, ITU/ISO/IEC, project contributors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ *
+ * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
+ *   contributors may be used to endorse or promote products derived from
+ *   this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
+ ****************************************************************************/
+
+/**
+ * \file
+ * Supplemental Enhancement Information (SEI)
+ */
+
+#include "global.h" // IWYU pragma: keep
+
+#define SEI_PAYLOAD_TYPE_PIC_TIMING 1
+#define SEI_PAYLOAD_TYPE_USER_DATA_UNREGISTERED 5
+#define SEI_PAYLOAD_TYPE_DECODED_PICTURE_HASH 132
+
+// Flag value used for length / value extension.
+#define FF_BYTE 0xFF
+
+// UUID for the encoder info user defined unregistered message
+// random uuid_iso_iec_11578 generated with www.famkruithof.net/uuid/uuidgen
+static const uint8_t encoder_info_uuid[16] = {
+    0x32, 0xfe, 0x46, 0x6c, 0x98, 0x41, 0x42, 0x69,
+    0xae, 0x35, 0x6a, 0x91, 0x54, 0x9e, 0xf3, 0xf1};
+
+#endif // UDU_SEI_H_

kvazaar-2.1.0.tar.gz/src/strategies/avx2/encode_coding_tree-avx2.c -> kvazaar-2.2.0.tar.gz/src/strategies/avx2/encode_coding_tree-avx2.c Changed

@@ -252,7 +252,8 @@
                                uint8_t width,
                                uint8_t type,
                                int8_t scan_mode,
-                               int8_t tr_skip)
+                               int8_t tr_skip,
+                               double* bits_out)
 {
   const encoder_control_t * const encoder = state->encoder_control;
   int c1 = 1;
@@ -260,6 +261,7 @@
   uint8_t last_coeff_y = 0;
   int32_t i;
   uint32_t sig_coeffgroup_nzs[8 * 8] = { 0 };
+  double bits = 0;
 
   int8_t be_valid = encoder->cfg.signhide_enable;
   int32_t scan_pos_sig;
@@ -361,7 +363,7 @@
   // transform skip flag
   if(width == 4 && encoder->cfg.trskip_enable) {
     cabac->cur_ctx = (type == 0) ? &(cabac->ctx.transform_skip_model_luma) : &(cabac->ctx.transform_skip_model_chroma);
-    CABAC_BIN(cabac, tr_skip, "transform_skip_flag");
+    CABAC_FBITS_UPDATE(cabac, cabac->cur_ctx, tr_skip, bits, "transform_skip_flag");
   }
 
   last_coeff_x = pos_last & (width - 1);
@@ -374,7 +376,8 @@
                                  width,
                                  width,
                                  type,
-                                 scan_mode);
+                                 scan_mode,
+                                 bits_out);
 
   scan_pos_sig = scan_pos_last;
 
@@ -406,8 +409,7 @@
       uint32_t sig_coeff_group   = (sig_coeffgroup_nzs[cg_blk_pos] != 0);
       uint32_t ctx_sig  = kvz_context_get_sig_coeff_group(sig_coeffgroup_nzs, cg_pos_x,
                                                       cg_pos_y, width);
-      cabac->cur_ctx = &base_coeff_group_ctx[ctx_sig];
-      CABAC_BIN(cabac, sig_coeff_group, "coded_sub_block_flag");
+      CABAC_FBITS_UPDATE(cabac, &base_coeff_group_ctx[ctx_sig], sig_coeff_group, bits, "coded_sub_block_flag");
     }
 
     if (sig_coeffgroup_nzs[cg_blk_pos]) {
@@ -464,8 +466,7 @@
 
         if (curr_esc_flag | num_non_zero) {
           ctx_sig = ctx_sig_buf[id];
-          cabac->cur_ctx = &baseCtx[ctx_sig];
-          CABAC_BIN(cabac, curr_sig, "sig_coeff_flag");
+          CABAC_FBITS_UPDATE(cabac, &baseCtx[ctx_sig], curr_sig, bits, "sig_coeff_flag");
         }
 
         if (curr_sig) {
@@ -519,8 +520,7 @@
         uint32_t shift = idx << 1;
         uint32_t symbol = (coeffs_gt1_bits >> shift) & 1;
 
-        cabac->cur_ctx = &base_ctx_mod[c1];
-        CABAC_BIN(cabac, symbol, "coeff_abs_level_greater1_flag");
+        CABAC_FBITS_UPDATE(cabac, &base_ctx_mod[c1], symbol, bits, "coeff_abs_level_greater1_flag");
 
         c1 = (c1s_nextiter >> shift) & 3;
       }
@@ -532,9 +532,7 @@
         if (first_c2_flag_idx != -1) {
           uint32_t shift = (first_c2_flag_idx << 1) + 1;
           uint8_t symbol = (coeffs_gt2_bits >> shift) & 1;
-          cabac->cur_ctx = &base_ctx_mod[0];
-
-          CABAC_BIN(cabac, symbol, "coeff_abs_level_greater2_flag");
+          CABAC_FBITS_UPDATE(cabac, &base_ctx_mod[0], symbol, bits, "coeff_abs_level_greater2_flag");
         }
       }
       int32_t shiftamt = (be_valid && sign_hidden) ? 1 : 0;
@@ -546,6 +544,7 @@
         }
       }
       CABAC_BINS_EP(cabac, coeff_signs, nnz, "coeff_sign_flag");
+      if (cabac->only_count) bits += nnz;
 
       if (c1 == 0 || num_non_zero > C1FLAG_NUMBER) {
 
@@ -586,7 +585,7 @@
             if (!cabac->only_count && (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFFS)) {
               kvz_cabac_write_coeff_remain_encry(state, cabac, level_diff, go_rice_param, base_level);
             } else {
-              kvz_cabac_write_coeff_remain(cabac, level_diff, go_rice_param);
+              bits += kvz_cabac_write_coeff_remain(cabac, level_diff, go_rice_param);
             }
 
             if (curr_abs_coeff > 3 * (1 << go_rice_param)) {
@@ -602,6 +601,7 @@
     num_non_zero = 0;
     coeff_signs = 0;
   }
+  if (cabac->only_count) *bits_out += bits;
 }
 #endif // COMPILE_INTEL_AVX2

kvazaar-2.1.0.tar.gz/src/strategies/avx2/encode_coding_tree-avx2.h -> kvazaar-2.2.0.tar.gz/src/strategies/avx2/encode_coding_tree-avx2.h Changed

kvazaar-2.1.0.tar.gz/src/strategies/avx2/picture-avx2.c -> kvazaar-2.2.0.tar.gz/src/strategies/avx2/picture-avx2.c Changed

@@ -769,238 +769,704 @@
   }
 }
 
-static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0,
- const int hi_prec_luma_rec1,
- const int hi_prec_chroma_rec0,
- const int hi_prec_chroma_rec1,
- const int height,
- const int width,
- const int ypos,
- const int xpos,
- const hi_prec_buf_t*high_precision_rec0,
- const hi_prec_buf_t*high_precision_rec1,
- lcu_t* lcu,
- uint8_t* temp_lcu_y,
- uint8_t* temp_lcu_u,
- uint8_t* temp_lcu_v,
-bool predict_luma,
-bool predict_chroma)
+static INLINE void scatter_ymm_4x8_8bit(kvz_pixel * dst, __m256i ymm, unsigned dst_stride)
 {
-  int y_in_lcu, x_in_lcu;
-  int shift = 15 - KVZ_BIT_DEPTH;
-  int offset = 1 << (shift - 1);
-  __m256i temp_epi8, temp_y_epi32, sample0_epi32, sample1_epi32, temp_epi16;
-  int32_t * pointer = 0;
-  __m256i offset_epi32 = _mm256_set1_epi32(offset);
-
-  for (int temp_y = 0; temp_y < height; ++temp_y) {
-
-   y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
-
-   for (int temp_x = 0; temp_x < width; temp_x += 8) {
-    x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
+  __m128i ymm_lo = _mm256_castsi256_si128(ymm);
+  __m128i ymm_hi = _mm256_extracti128_si256(ymm, 1);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(ymm_lo); dst += dst_stride;
+  *(uint32_t *)dst = _mm_extract_epi32(ymm_lo, 1); dst += dst_stride;
+  *(uint32_t *)dst = _mm_extract_epi32(ymm_lo, 2); dst += dst_stride;
+  *(uint32_t *)dst = _mm_extract_epi32(ymm_lo, 3); dst += dst_stride;
+  *(uint32_t *)dst = _mm_cvtsi128_si32(ymm_hi); dst += dst_stride;
+  *(uint32_t *)dst = _mm_extract_epi32(ymm_hi, 1); dst += dst_stride;
+  *(uint32_t *)dst = _mm_extract_epi32(ymm_hi, 2); dst += dst_stride;
+  *(uint32_t *)dst = _mm_extract_epi32(ymm_hi, 3);
+}
 
-    if (predict_luma) {
-      bool use_8_elements = ((temp_x + 8) <= width);
+static INLINE void scatter_ymm_8x4_8bit(kvz_pixel *dst, __m256i ymm, unsigned dst_stride)
+{
+  __m256d ymm_as_m256d = _mm256_castsi256_pd(ymm);
+  __m128d ymm_lo = _mm256_castpd256_pd128(ymm_as_m256d);
+  __m128d ymm_hi = _mm256_extractf128_pd(ymm_as_m256d, 1);
+  _mm_storel_pd((double*)dst, ymm_lo); dst += dst_stride;
+  _mm_storeh_pd((double*)dst, ymm_lo); dst += dst_stride;
+  _mm_storel_pd((double*)dst, ymm_hi); dst += dst_stride;
+  _mm_storeh_pd((double*)dst, ymm_hi);
+}
 
-      if (!use_8_elements) {
-        if (width < 4) {
-          // If width is smaller than 4 there's no need to use SIMD
-          for (int temp_i = 0; temp_i < width; ++temp_i) {
-            x_in_lcu = ((xpos + temp_i) & ((LCU_WIDTH)-1));
+static INLINE void scatter_ymm_16x2_8bit(kvz_pixel *dst, __m256i ymm, unsigned dst_stride)
+{
+  __m128i ymm_lo = _mm256_castsi256_si128(ymm);
+  __m128i ymm_hi = _mm256_extracti128_si256(ymm, 1);
+  _mm_storeu_si128((__m128i *)dst, ymm_lo); dst += dst_stride;
+  _mm_storeu_si128((__m128i *)dst, ymm_hi);
+}
 
-            int sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-            int sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+static INLINE void scatter_ymm_12x2_8bit(kvz_pixel *dst, __m256i ymm, unsigned dst_stride)
+{
+  __m256i mask_a = _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0);
+  __m256i mask_b = _mm256_setr_epi32(0, 0, 0, -1, -1, -1, 0, 0);
+  _mm256_maskstore_epi32((int32_t*)dst, mask_a, ymm); dst += dst_stride - 3 * 4;
+  _mm256_maskstore_epi32((int32_t*)dst, mask_b, ymm);
+}
 
-            lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);
-          }
+static INLINE void bipred_average_px_px_template_avx2(kvz_pixel *dst,
+  kvz_pixel *px_L0,
+  kvz_pixel *px_L1,
+  unsigned pu_w,
+  unsigned pu_h,
+  unsigned dst_stride)
+{
+  bool has_pow2_width = _mm_popcnt_u32(pu_w) == 1;
+  bool area_mod_32 = (pu_w * pu_h) % 32;
+  assert(!(pu_w == 4 && pu_h == 4) && "Branch for 4x4 not yet implemented.");
+  assert(!(pu_w == 2 && pu_h == 8) && "Branch for 2x8 not yet implemented.");
+
+  if (has_pow2_width && area_mod_32 == 0) {
+    for (int i = 0; i < pu_w * pu_h; i += 32) {
+
+      int y = i / pu_w;
+      int x = i % pu_w;
+
+      __m256i sample_L0 = _mm256_loadu_si256((__m256i*)&px_L0[i]);
+      __m256i sample_L1 = _mm256_loadu_si256((__m256i*)&px_L1[i]);
+      __m256i avg       = _mm256_avg_epu8(sample_L0, sample_L1);
+
+      switch (pu_w) {
+        case  4: scatter_ymm_4x8_8bit( &dst[y * dst_stride + x], avg, dst_stride); break;
+        case  8: scatter_ymm_8x4_8bit( &dst[y * dst_stride + x], avg, dst_stride); break;
+        case 16: scatter_ymm_16x2_8bit(&dst[y * dst_stride + x], avg, dst_stride); break;
+        case 32: // Same as case 64
+        case 64: _mm256_storeu_si256((__m256i *)&dst[y * dst_stride + x], avg); break;
+        default:
+          assert(0 && "Unexpected block width.");
+          break;
+      }
+    }
+  } else if (area_mod_32 == 0) {
+    for (int i = 0; i < pu_w * pu_h; i += 24) {
+
+      int y = i / pu_w;
+      int x = i % pu_w;
+
+      // Last 64 bits of the 256 are not used to simplify the loop
+      __m256i mask      = _mm256_setr_epi64x(-1, -1, -1, 0);
+      __m256i sample_L0 = _mm256_maskload_epi64((const long long*)&px_L0[i], mask);
+      __m256i sample_L1 = _mm256_maskload_epi64((const long long*)&px_L1[i], mask);
+      __m256i avg       = _mm256_avg_epu8(sample_L0, sample_L1);
+
+      switch (pu_w) {
+        case 12: scatter_ymm_12x2_8bit(&dst[y * dst_stride + x], avg, dst_stride); break;
+        case 24: // Same as case 48
+        case 48: _mm256_maskstore_epi64((long long*)&dst[y * dst_stride + x], mask, avg); break;
+        default:
+          assert(0 && "Unexpected block width.");
+          break;
+      }
+    }
+  } else {
+    // 8x2, 8x6, 6x8 blocks (and maybe 2x8 in the future)
+    switch (pu_w) {
+      __m128i sample_L0, sample_L1, avg;
+      case 8: // 8x2, 8x6
+        for (int i = 0; i < pu_w * pu_h; i += 16) {
+
+          int y = i / pu_w;
+
+          sample_L0 = _mm_loadu_si128((__m128i*)&px_L0[i]);
+          sample_L1 = _mm_loadu_si128((__m128i*)&px_L1[i]);
+          avg       = _mm_avg_epu8(sample_L0, sample_L1);
+          _mm_storel_epi64((__m128i*)&dst[y * dst_stride], avg);
+          _mm_storeh_pd((double*)&dst[(y + 1) * dst_stride], _mm_castsi128_pd(avg));
         }
-
-        else {
-          // Load total of 4 elements from memory to vector
-          sample0_epi32 = hi_prec_luma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
-            _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)&(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
-
-
-          sample1_epi32 = hi_prec_luma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) :
-            _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH);
-
-
-          // (sample1 + sample2 + offset)>>shift 
-          temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32);
-          temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32);
-          temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift);
-
-          // Pack the bits from 32-bit to 8-bit
-          temp_epi16 = _mm256_packs_epi32(temp_y_epi32, temp_y_epi32);
-          temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0));
-          temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16);
-
-          pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]);
-          *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8));
-
-
-
-          for (int temp_i = temp_x + 4; temp_i < width; ++temp_i) {
-            x_in_lcu = ((xpos + temp_i) & ((LCU_WIDTH)-1));
-
-            int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-            int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-
-            lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (uint8_t)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);
-          }
-
+        break;
+      case 6: // 6x8
+        for (int i = 0; i < pu_w * pu_h; i += 12) {
+
+          int y = i / pu_w;
+
+          __m128i mask      = _mm_setr_epi32(-1, -1, -1, 0);
+          __m128i sample_L0 = _mm_maskload_epi32((const int*)(&px_L0[i]), mask);
+          __m128i sample_L1 = _mm_maskload_epi32((const int*)(&px_L1[i]), mask);
+          __m128i avg       = _mm_avg_epu8(sample_L0, sample_L1);
+

kvazaar-2.1.0.tar.gz/src/strategies/avx2/quant-avx2.c -> kvazaar-2.2.0.tar.gz/src/strategies/avx2/quant-avx2.c Changed

kvazaar-2.1.0.tar.gz/src/strategies/generic/encode_coding_tree-generic.c -> kvazaar-2.2.0.tar.gz/src/strategies/generic/encode_coding_tree-generic.c Changed

@@ -43,9 +43,11 @@
                                   uint8_t width,
                                   uint8_t type,
                                   int8_t scan_mode,
-                                  int8_t tr_skip)
+                                  int8_t tr_skip,
+                                  double* bits_out)
 {
   const encoder_control_t * const encoder = state->encoder_control;
+  double bits = 0;
   int c1 = 1;
   uint8_t last_coeff_x = 0;
   uint8_t last_coeff_y = 0;
@@ -111,7 +113,7 @@
   // transform skip flag
   if(width == 4 && encoder->cfg.trskip_enable) {
     cabac->cur_ctx = (type == 0) ? &(cabac->ctx.transform_skip_model_luma) : &(cabac->ctx.transform_skip_model_chroma);
-    CABAC_BIN(cabac, tr_skip, "transform_skip_flag");
+    CABAC_FBITS_UPDATE(cabac, cabac->cur_ctx, tr_skip, bits, "transform_skip_flag");
   }
 
   last_coeff_x = pos_last & (width - 1);
@@ -124,7 +126,8 @@
                                  width,
                                  width,
                                  type,
-                                 scan_mode);
+                                 scan_mode,
+                                 bits_out);
 
   scan_pos_sig  = scan_pos_last;
 
@@ -157,8 +160,7 @@
       uint32_t sig_coeff_group   = (sig_coeffgroup_flag[cg_blk_pos] != 0);
       uint32_t ctx_sig  = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
                                                       cg_pos_y, width);
-      cabac->cur_ctx = &base_coeff_group_ctx[ctx_sig];
-      CABAC_BIN(cabac, sig_coeff_group, "coded_sub_block_flag");
+      CABAC_FBITS_UPDATE(cabac, &base_coeff_group_ctx[ctx_sig], sig_coeff_group, bits, "coded_sub_block_flag");
     }
 
     if (sig_coeffgroup_flag[cg_blk_pos]) {
@@ -174,8 +176,7 @@
         if (scan_pos_sig > sub_pos || i == 0 || num_non_zero) {
           ctx_sig  = kvz_context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y,
                                              log2_block_size, type);
-          cabac->cur_ctx = &baseCtx[ctx_sig];
-          CABAC_BIN(cabac, sig, "sig_coeff_flag");
+          CABAC_FBITS_UPDATE(cabac, &baseCtx[ctx_sig], sig, bits, "sig_coeff_flag");
         }
 
         if (sig) {
@@ -214,8 +215,7 @@
 
       for (idx = 0; idx < num_c1_flag; idx++) {
         uint32_t symbol = (abs_coeff[idx] > 1) ? 1 : 0;
-        cabac->cur_ctx = &base_ctx_mod[c1];
-        CABAC_BIN(cabac, symbol, "coeff_abs_level_greater1_flag");
+        CABAC_FBITS_UPDATE(cabac, &base_ctx_mod[c1], symbol, bits, "coeff_abs_level_greater1_flag");
 
         if (symbol) {
           c1 = 0;
@@ -234,8 +234,7 @@
 
         if (first_c2_flag_idx != -1) {
           uint8_t symbol = (abs_coeff[first_c2_flag_idx] > 2) ? 1 : 0;
-          cabac->cur_ctx      = &base_ctx_mod[0];
-          CABAC_BIN(cabac, symbol, "coeff_abs_level_greater2_flag");
+          CABAC_FBITS_UPDATE(cabac, &base_ctx_mod[0], symbol, bits, "coeff_abs_level_greater2_flag");
         }
       }
       if (be_valid && sign_hidden) {
@@ -245,11 +244,13 @@
             coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero-1);
           }
         CABAC_BINS_EP(cabac, coeff_signs , (num_non_zero - 1), "coeff_sign_flag");
+        if (cabac->only_count) bits += num_non_zero - 1;
       } else {
         if (!cabac->only_count)
           if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS)
             coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero);
         CABAC_BINS_EP(cabac, coeff_signs, num_non_zero, "coeff_sign_flag");
+        if (cabac->only_count) bits += num_non_zero;
       }
 
       if (c1 == 0 || num_non_zero > C1FLAG_NUMBER) {
@@ -263,9 +264,9 @@
               if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFFS)
                 kvz_cabac_write_coeff_remain_encry(state, cabac, abs_coeff[idx] - base_level, go_rice_param, base_level);
               else
-                kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param);
+                bits += kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param);
             } else
-              kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param);
+              bits += kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param);
 
             if (abs_coeff[idx] > 3 * (1 << go_rice_param)) {
               go_rice_param = MIN(go_rice_param + 1, 4);
@@ -279,6 +280,7 @@
       }
     }
   }
+  if (cabac->only_count) *bits_out += bits;
 }
 
 int kvz_strategy_register_encode_generic(void* opaque, uint8_t bitdepth)

kvazaar-2.1.0.tar.gz/src/strategies/generic/encode_coding_tree-generic.h -> kvazaar-2.2.0.tar.gz/src/strategies/generic/encode_coding_tree-generic.h Changed

kvazaar-2.1.0.tar.gz/src/strategies/generic/ipol-generic.c -> kvazaar-2.2.0.tar.gz/src/strategies/generic/ipol-generic.c Changed

@@ -131,7 +131,16 @@
   return temp;
 }
 
-void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv2)
+void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder,
+  kvz_pixel *src,
+  int16_t src_stride,
+  int width,
+  int height,
+  kvz_pixel *dst,
+  int16_t dst_stride,
+  int8_t hor_flag,
+  int8_t ver_flag,
+  const int16_t mv2)
 {
   //TODO: horizontal and vertical only filtering
   int32_t x, y;
@@ -669,7 +678,16 @@
   }
 }
 
-void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height,kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv2)
+void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder,
+  kvz_pixel *src,
+  int16_t src_stride,
+  int width,
+  int height,
+  kvz_pixel *dst,
+  int16_t dst_stride,
+  int8_t hor_flag,
+  int8_t ver_flag,
+  const int16_t mv2)
 {
   //TODO: horizontal and vertical only filtering
   int32_t x, y;

kvazaar-2.1.0.tar.gz/src/strategies/generic/picture-generic.c -> kvazaar-2.2.0.tar.gz/src/strategies/generic/picture-generic.c Changed

@@ -547,60 +547,121 @@
   return ssd >> (2*(KVZ_BIT_DEPTH-8));
 }
 
-static void inter_recon_bipred_generic(const int hi_prec_luma_rec0,
-  const int hi_prec_luma_rec1,
-  const int hi_prec_chroma_rec0,
-  const int hi_prec_chroma_rec1,
-  int32_t height,
-  int32_t width,
-  int32_t ypos,
-  int32_t xpos,
-  const hi_prec_buf_t*high_precision_rec0,
-  const hi_prec_buf_t*high_precision_rec1,
-  lcu_t* lcu,
-  kvz_pixel* temp_lcu_y,
-  kvz_pixel* temp_lcu_u,
-  kvz_pixel* temp_lcu_v,
-  bool predict_luma,
-  bool predict_chroma) {
-
-  int shift = 15 - KVZ_BIT_DEPTH;
-  int offset = 1 << (shift - 1);
-
-  int y_in_lcu;
-  int x_in_lcu;
-
-  //After reconstruction, merge the predictors by taking an average of each pixel
-  for (int temp_y = 0; temp_y < height; ++temp_y) {
-
+static void bipred_average_px_px(kvz_pixel *dst,
+  kvz_pixel *px_L0,
+  kvz_pixel *px_L1,
+  unsigned pu_w,
+  unsigned pu_h,
+  unsigned dst_stride)
+{
+  int32_t shift = 15 - KVZ_BIT_DEPTH; // TODO: defines
+  int32_t offset = 1 << (shift - 1);
+
+  for (int i = 0; i < pu_w * pu_h; ++i)
+  {
+    int y = i / pu_w;
+    int x = i % pu_w;
+    int16_t sample_L0 = px_L0[i] << (14 - KVZ_BIT_DEPTH);
+    int16_t sample_L1 = px_L1[i] << (14 - KVZ_BIT_DEPTH);
+    int32_t rounded = (sample_L0 + sample_L1 + offset) >> shift;
+    dst[y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(rounded);
+  }
+}
 
-    for (int temp_x = 0; temp_x < width; ++temp_x) {
-      y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1));
-      x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1));
+static void bipred_average_im_im(kvz_pixel *dst,
+  kvz_pixel_im *im_L0,
+  kvz_pixel_im *im_L1,
+  unsigned pu_w,
+  unsigned pu_h,
+  unsigned dst_stride)
+{
+  int32_t shift = 15 - KVZ_BIT_DEPTH; // TODO: defines
+  int32_t offset = 1 << (shift - 1);
+
+  for (int i = 0; i < pu_w * pu_h; ++i)
+  {
+    int y = i / pu_w;
+    int x = i % pu_w;
+    int16_t sample_L0 = im_L0[i];
+    int16_t sample_L1 = im_L1[i];
+    int32_t rounded = (sample_L0 + sample_L1 + offset) >> shift;
+    dst[y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(rounded);
+  }
+}
 
-      if (predict_luma) {
-        int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-        int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
+static void bipred_average_px_im(kvz_pixel *dst,
+  kvz_pixel *px,
+  kvz_pixel_im *im,
+  unsigned pu_w,
+  unsigned pu_h,
+  unsigned dst_stride)
+{
+  int32_t shift = 15 - KVZ_BIT_DEPTH; // TODO: defines
+  int32_t offset = 1 << (shift - 1);
+
+  for (int i = 0; i < pu_w * pu_h; ++i)
+  {
+    int y = i / pu_w;
+    int x = i % pu_w;
+    int16_t sample_px = px[i] << (14 - KVZ_BIT_DEPTH);
+    int16_t sample_im = im[i];
+    int32_t rounded = (sample_px + sample_im + offset) >> shift;
+    dst[y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(rounded);
+  }
+}
 
-        lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift);
-      }
+static void bipred_average_generic(lcu_t *const lcu,
+  const yuv_t *const px_L0,
+  const yuv_t *const px_L1,
+  const yuv_im_t *const im_L0,
+  const yuv_im_t *const im_L1,
+  const unsigned pu_x,
+  const unsigned pu_y,
+  const unsigned pu_w,
+  const unsigned pu_h,
+  const unsigned im_flags_L0,
+  const unsigned im_flags_L1,
+  const bool predict_luma,
+  const bool predict_chroma) {
 
-      if (predict_chroma && (temp_x < width >> 1 && temp_y < height >> 1)) {
+  //After reconstruction, merge the predictors by taking an average of each pixel
+  if (predict_luma) {
+    unsigned pb_offset = SUB_SCU(pu_y) * LCU_WIDTH + SUB_SCU(pu_x);
 
-        y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1));
-        x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1));
+    if (!(im_flags_L0 & 1) && !(im_flags_L1 & 1)) {
+      bipred_average_px_px(lcu->rec.y + pb_offset, px_L0->y, px_L1->y, pu_w, pu_h, LCU_WIDTH);
 
-        int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-        int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-        lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift);
+    } else if ((im_flags_L0 & 1) && (im_flags_L1 & 1)) {
+      bipred_average_im_im(lcu->rec.y + pb_offset, im_L0->y, im_L1->y, pu_w, pu_h, LCU_WIDTH);
 
-        int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-        int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH)));
-        lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift);
-      }
+    } else {
+      kvz_pixel    *src_px = (im_flags_L0 & 1) ? px_L1->y : px_L0->y;
+      kvz_pixel_im *src_im = (im_flags_L0 & 1) ? im_L0->y : im_L1->y;
+      bipred_average_px_im(lcu->rec.y + pb_offset, src_px, src_im, pu_w, pu_h, LCU_WIDTH);
+    }
+  }
+  if (predict_chroma) {
+    unsigned pb_offset = SUB_SCU(pu_y) / 2 * LCU_WIDTH_C + SUB_SCU(pu_x) / 2;
+    unsigned pb_w = pu_w / 2;
+    unsigned pb_h = pu_h / 2;
+
+    if (!(im_flags_L0 & 2) && !(im_flags_L1 & 2)) {
+      bipred_average_px_px(lcu->rec.u + pb_offset, px_L0->u, px_L1->u, pb_w, pb_h, LCU_WIDTH_C);
+      bipred_average_px_px(lcu->rec.v + pb_offset, px_L0->v, px_L1->v, pb_w, pb_h, LCU_WIDTH_C);
+
+    } else if ((im_flags_L0 & 2) && (im_flags_L1 & 2)) {
+      bipred_average_im_im(lcu->rec.u + pb_offset, im_L0->u, im_L1->u, pb_w, pb_h, LCU_WIDTH_C);
+      bipred_average_im_im(lcu->rec.v + pb_offset, im_L0->v, im_L1->v, pb_w, pb_h, LCU_WIDTH_C);
+
+    } else {
+      kvz_pixel    *src_px_u = (im_flags_L0 & 2) ? px_L1->u : px_L0->u;
+      kvz_pixel_im *src_im_u = (im_flags_L0 & 2) ? im_L0->u : im_L1->u;
+      kvz_pixel    *src_px_v = (im_flags_L0 & 2) ? px_L1->v : px_L0->v;
+      kvz_pixel_im *src_im_v = (im_flags_L0 & 2) ? im_L0->v : im_L1->v;
+      bipred_average_px_im(lcu->rec.u + pb_offset, src_px_u, src_im_u, pb_w, pb_h, LCU_WIDTH_C);
+      bipred_average_px_im(lcu->rec.v + pb_offset, src_px_v, src_im_v, pb_w, pb_h, LCU_WIDTH_C);
     }
   }
-
 }
 
 
@@ -746,7 +807,7 @@
   success &= kvz_strategyselector_register(opaque, "satd_any_size_quad", "generic", 0, &satd_any_size_quad_generic);
 
   success &= kvz_strategyselector_register(opaque, "pixels_calc_ssd", "generic", 0, &pixels_calc_ssd_generic);
-  success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "generic", 0, &inter_recon_bipred_generic);
+  success &= kvz_strategyselector_register(opaque, "bipred_average", "generic", 0, &bipred_average_generic);
 
   success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "generic", 0, &get_optimized_sad_generic);
   success &= kvz_strategyselector_register(opaque, "ver_sad", "generic", 0, &ver_sad_generic);

kvazaar-2.1.0.tar.gz/src/strategies/generic/quant-generic.c -> kvazaar-2.2.0.tar.gz/src/strategies/generic/quant-generic.c Changed

kvazaar-2.1.0.tar.gz/src/strategies/strategies-dct.h -> kvazaar-2.2.0.tar.gz/src/strategies/strategies-dct.h Changed

kvazaar-2.1.0.tar.gz/src/strategies/strategies-encode.h -> kvazaar-2.2.0.tar.gz/src/strategies/strategies-encode.h Changed

kvazaar-2.1.0.tar.gz/src/strategies/strategies-ipol.h -> kvazaar-2.2.0.tar.gz/src/strategies/strategies-ipol.h Changed

@@ -43,15 +43,15 @@
 #include "kvazaar.h"
 #include "search_inter.h"
 
-// AVX2 implementation of horizontal filter reads and
-// writes two rows for luma and four for chroma at a time.
-// Extra vertical padding is added to prevent segfaults.
-// Horizontal padding is not needed even if one extra byte
-// is read because kvz_image_alloc adds enough padding.
-#define KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * KVZ_EXT_BLOCK_W_LUMA)
-#define KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD ((KVZ_EXT_BLOCK_W_CHROMA + 3) * KVZ_EXT_BLOCK_W_CHROMA)
-#define KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD ((KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH)
-#define KVZ_IPOL_MAX_IM_SIZE_CHROMA_SIMD ((KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C)
+ // AVX2 implementation of horizontal filter reads and
+ // writes two rows for luma and four for chroma at a time.
+ // Extra vertical padding is added to prevent segfaults.
+ // Needs one extra byte for input buffer to prevent ASAN
+ // error because AVX2 reads one extra byte in the end.
+#define KVZ_IPOL_MAX_INPUT_SIZE_LUMA_SIMD   ((KVZ_EXT_BLOCK_W_LUMA   + 1) * KVZ_EXT_BLOCK_W_LUMA   + 1)
+#define KVZ_IPOL_MAX_INPUT_SIZE_CHROMA_SIMD ((KVZ_EXT_BLOCK_W_CHROMA + 3) * KVZ_EXT_BLOCK_W_CHROMA + 1)
+#define KVZ_IPOL_MAX_IM_SIZE_LUMA_SIMD      ((KVZ_EXT_BLOCK_W_LUMA   + 1) * LCU_WIDTH)
+#define KVZ_IPOL_MAX_IM_SIZE_CHROMA_SIMD    ((KVZ_EXT_BLOCK_W_CHROMA + 3) * LCU_WIDTH_C)
 
 // On top of basic interpolation, FME needs one extra
 // column and row for ME (left and up). Adding the
@@ -101,6 +101,28 @@
 typedef void(kvz_sample_quarterpel_luma_hi_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv2);
 typedef void(kvz_sample_octpel_chroma_hi_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv2);
 
+typedef void(kvz_sample_14bit_quarterpel_luma_func)(const encoder_control_t * const encoder,
+  kvz_pixel *src,
+  int16_t src_stride,
+  int width,
+  int height,
+  int16_t *dst,
+  int16_t dst_stride,
+  int8_t hor_flag,
+  int8_t ver_flag,
+  const int16_t mv2);
+
+typedef void(kvz_sample_14bit_octpel_chroma_func)(const encoder_control_t *const encoder,
+  kvz_pixel *src,
+  int16_t src_stride,
+  int width,
+  int height,
+  int16_t *dst,
+  int16_t dst_stride,
+  int8_t hor_flag,
+  int8_t ver_flag,
+  const int16_t mv2);
+
 // Declare function pointers.
 extern ipol_blocks_func * kvz_filter_hpel_blocks_hor_ver_luma;
 extern ipol_blocks_func * kvz_filter_hpel_blocks_diag_luma;

kvazaar-2.1.0.tar.gz/src/strategies/strategies-picture.c -> kvazaar-2.2.0.tar.gz/src/strategies/strategies-picture.c Changed

kvazaar-2.1.0.tar.gz/src/strategies/strategies-picture.h -> kvazaar-2.2.0.tar.gz/src/strategies/strategies-picture.h Changed

@@ -133,22 +133,19 @@
                                 int32_t width, int32_t height, uint32_t pic_stride,
                                 uint32_t ref_stride, uint32_t left, uint32_t right);
 
-typedef void (inter_recon_bipred_func)(const int hi_prec_luma_rec0,
-    const int hi_prec_luma_rec1,
-    const int hi_prec_chroma_rec0,
-    const int hi_prec_chroma_rec1,
-    int height,
-    int width,
-    int ypos,
-    int xpos,
-    const hi_prec_buf_t*high_precision_rec0,
-    const hi_prec_buf_t*high_precision_rec1,
-    lcu_t* lcu,
-    kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH],
-    kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C],
-    kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C],
-    bool predict_luma,
-    bool predict_chroma);  
+typedef void (inter_recon_bipred_func)(lcu_t * const lcu,
+  const yuv_t *const px_L0,
+  const yuv_t *const px_L1,
+  const yuv_im_t *const im_L0,
+  const yuv_im_t *const im_L1,
+  const unsigned pu_x,
+  const unsigned pu_y,
+  const unsigned pu_w,
+  const unsigned pu_h,
+  const unsigned im_flags_L0,
+  const unsigned im_flags_L1,
+  const bool predict_luma,
+  const bool predict_chroma);
 
 typedef double (pixel_var_func)(const kvz_pixel *buf, const uint32_t len);
 
@@ -184,7 +181,7 @@
 
 extern pixels_calc_ssd_func *kvz_pixels_calc_ssd;
 
-extern inter_recon_bipred_func * kvz_inter_recon_bipred_blend;
+extern inter_recon_bipred_func * kvz_bipred_average;
 
 extern get_optimized_sad_func *kvz_get_optimized_sad;
 extern ver_sad_func *kvz_ver_sad;
@@ -223,7 +220,7 @@
   {"satd_64x64_dual", (void**) &kvz_satd_64x64_dual}, \
   {"satd_any_size_quad", (void**) &kvz_satd_any_size_quad}, \
   {"pixels_calc_ssd", (void**) &kvz_pixels_calc_ssd}, \
-  {"inter_recon_bipred", (void**) &kvz_inter_recon_bipred_blend}, \
+  {"bipred_average", (void**) &kvz_bipred_average}, \
   {"get_optimized_sad", (void**) &kvz_get_optimized_sad}, \
   {"ver_sad", (void**) &kvz_ver_sad}, \
   {"hor_sad", (void**) &kvz_hor_sad}, \

kvazaar-2.1.0.tar.gz/src/strategies/strategies-quant.h -> kvazaar-2.2.0.tar.gz/src/strategies/strategies-quant.h Changed

@@ -45,18 +45,18 @@
 #include "tables.h"
 
 // Declare function pointers.
-typedef unsigned (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
+typedef void (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
   int32_t height, int8_t type, int8_t scan_idx, int8_t block_type);
-typedef unsigned (quant_residual_func)(encoder_state_t *const state,
+typedef int32_t (quant_residual_func)(encoder_state_t *const state,
   const cu_info_t *const cur_cu, const int width, const color_t color,
   const coeff_scan_order_t scan_order, const int use_trskip,
   const int in_stride, const int out_stride,
   const kvz_pixel *const ref_in, const kvz_pixel *const pred_in,
   kvz_pixel *rec_out, coeff_t *coeff_out,
   bool early_skip);
-typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width,
+typedef void (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width,
   int32_t height, int8_t type, int8_t block_type);
-typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights);
+typedef double (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, uint64_t weights);
 
 typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length);

kvazaar-2.1.0.tar.gz/src/transform.c -> kvazaar-2.2.0.tar.gz/src/transform.c Changed

@@ -250,25 +250,23 @@
   struct {
     kvz_pixel rec[4*4];
     coeff_t coeff[4*4];
-    uint32_t cost;
+    double cost;
     int has_coeffs;
   } skip, noskip, *best;
-
-  const int bit_cost = (int)(state->lambda + 0.5);
   
   noskip.has_coeffs = kvz_quantize_residual(
       state, cur_cu, width, color, scan_order,
       0, in_stride, 4,
       ref_in, pred_in, noskip.rec, noskip.coeff, false);
   noskip.cost = kvz_pixels_calc_ssd(ref_in, noskip.rec, in_stride, 4, 4);
-  noskip.cost += kvz_get_coeff_cost(state, noskip.coeff, 4, 0, scan_order) * bit_cost;
+  noskip.cost += kvz_get_coeff_cost(state, noskip.coeff, 4, 0, scan_order) * state->lambda;
 
   skip.has_coeffs = kvz_quantize_residual(
     state, cur_cu, width, color, scan_order,
     1, in_stride, 4,
     ref_in, pred_in, skip.rec, skip.coeff, false);
   skip.cost = kvz_pixels_calc_ssd(ref_in, skip.rec, in_stride, 4, 4);
-  skip.cost += kvz_get_coeff_cost(state, skip.coeff, 4, 0, scan_order) * bit_cost;
+  skip.cost += kvz_get_coeff_cost(state, skip.coeff, 4, 0, scan_order) * state->lambda;
 
   if (noskip.cost <= skip.cost) {
     *trskip_out = 0;

kvazaar-2.1.0.tar.gz/tools/appveyor-install.sh -> kvazaar-2.2.0.tar.gz/tools/appveyor-install.sh Changed