Packman Build Service PMBS

Changes of Revision 13

kvazaar.changes Changed

@@ -1,4 +1,70 @@
 -------------------------------------------------------------------
+Fri Nov 17 14:01:40 UTC 2017 - aloisio@gmx.com
+
+- Update to version 1.2.0
+  Features:
+  * Intra prediction mode encryption with
+    --crypto=intra_pred_modes (2b8ce5e)
+  * Adaptive QP for 360° video with --erp-aqp (26adef4)
+  * New selection algorithm for --owf=auto and --threads=auto
+    (8c4a347)
+  * Added an option to set the encryption key using --key (2e13091)
+  * Added an option to limit SAO to band offset or edge offset
+    only with --sao=band and --sao=edge (8674c0f)
+  Optimization:
+  * Reduced number of intra modes checked when using --rd=2
+    (2cad317)
+  * Reduced inter-frame CTU dependencies caused by SAO (050e90d)
+  * Changed to a faster calculation for coefficient costs when
+    using --rd=0 (1ead9c0)
+  Fixes:
+  * Fixed long motion vectors not getting clipped (#158, 85e2a40)
+  * Fixed order of pictures in reconstruction debug output when
+    --gop=8 is used (#101, aae141f)
+  * Fixed a use-after-free when encoding very few frames with
+    --gop=8 (#161, 2991962)
+  * Fixed a crash when video size is not a multiple of the
+    smallest CU size (2f2405d)
+  * Fixed invalid bitstream when QP is too large (382636d)
+  * Fixed a race condition causing a deadlock (5f8e17d)
+  * Fixed a memory leak in encryption (8654b48)
+  * Fixed I-frames not being IRAP frames when using GOP (00c9f52,
+    841597e)
+  * Fixed computing inter and intra costs with different metrics
+    (afc13f1)
+  * Fixed reliance on undefined behavior (b41f0fa, 924cf85)
+  * Fixed --mv-constraint=frametilemargin constraining motion
+    vectors too much (409d211)
+  * Fixed using --bipred with --tmvp (#160, 9974380)
+  User Interface:
+  * Changed type of kvz_config.roi.dqps from uint8_t* to int8_t*.
+    Delta QP values for --roi may now be negative. (79cb3a2)
+  * Changed PSNR display format (20d6444)
+  Building:
+  * Default to no -Werror. Run configure with --enable-werror to
+    enable it. (033bc6b)
+  * make check now runs valgrind tests that used to only run on
+    Travis. The programs ffmpeg, valgrind and TAppDecoderStatic
+    should be available in $PATH (6bbe5e1)
+  Refactoring:
+  * Removed duplicate code in inter MVP and merge candidate
+    selection (4fb0783)
+  * Removed duplicate code in intra reconstruction for luma and
+    chroma (e944416)
+  * Changed functions for writing the CU tree bitstream to use
+    luma pixel coordinates (610c91b, f5eef7f)
+  * Removed duplicate code in functions for writing intra CU
+    bitstream with and without encryption (525a518)
+  * Removed duplicate code in helper functions in search.c
+    (2c73476)
+  * Gathered function parameters for inter search functions into a
+    single struct (2fa3d82)
+
+- Refreshed kvazaar.memset.patch
+
+- Bumped library version to 4
+
+-------------------------------------------------------------------
 Wed Feb 22 12:34:40 UTC 2017 - scarabeus@opensuse.org
 
 - Bit of spec cleanup

kvazaar.spec Changed

kvazaar.memset.patch Changed

@@ -1,10 +1,8 @@
-gcc7-7.1.1+r248152-1.2
-  112s rdo.c: In function 'kvz_rdoq':
-  112s rdo.c:563:14: error: 'memset' used with length equal to number of elements without multiplication by element size -Werror=memset-elt-size
-  112s      case 16: memset(sig_coeffgroup_flag, 0, 16 * sizeof(sig_coeffgroup_flag0)); break;
---- a/src/rdo.c
-+++ b/src/rdo.c
-@@ -555,6 +555,7 @@ void kvz_rdoq(encoder_state_t * const st
+Index: kvazaar-1.2.0/src/rdo.c
+===================================================================
+--- kvazaar-1.2.0.orig/src/rdo.c
++++ kvazaar-1.2.0/src/rdo.c
+@@ -593,6 +593,7 @@ void kvz_rdoq(encoder_state_t * const st
  
    uint32_t cg_num = width * height >> 4;
  
@@ -12,8 +10,8 @@
    // Explicitly tell the only possible numbers of elements to be zeroed.
    // Hope the compiler is able to utilize this information.
    switch (cg_num) {
-@@ -564,6 +565,9 @@ void kvz_rdoq(encoder_state_t * const st
-     case 64: memset(sig_coeffgroup_flag, 0, 64 * sizeof(sig_coeffgroup_flag0)); break;
+@@ -602,6 +603,9 @@ void kvz_rdoq(encoder_state_t * const st
+     case 64: FILL_ARRAY(sig_coeffgroup_flag, 0, 64); break;
      default: assert(0 && "There should be 1, 4, 16 or 64 coefficient groups");
    }
 +#else

kvazaar-1.1.0.tar.gz/.travis-install.sh Deleted

kvazaar-1.1.0.tar.gz/.travis-script.sh Deleted

kvazaar-1.1.0.tar.gz/.gitignore -> kvazaar-1.2.0.tar.gz/.gitignore Changed

kvazaar-1.2.0.tar.gz/.travis-install.bash Added

kvazaar-1.1.0.tar.gz/.travis.yml -> kvazaar-1.2.0.tar.gz/.travis.yml Changed

@@ -1,137 +1,43 @@
 language: c
 
-env:
-  global:
-  - TEST_DIM=264x130
-  - TEST_FRAMES=10
-
-# Use container based infrastructure
+# Use container based infrastructure.
 sudo: false
 
-# Use this the global requirements list for valgrind tests, because those are the most numerous.
 addons:
   apt:
     sources:
-    - ubuntu-toolchain-r-test
+      - ubuntu-toolchain-r-test
     packages:
-    - autoconf
-    - libtool
-    - p7zip-full  # to uncompress our own ffmpeg binary
-    - valgrind
-    - yasm
+      - autoconf
+      - gcc-4.8
+      - libtool
+      - valgrind
+      - yasm
 
 matrix:
   fast_finish: true
-  
+
   include:
     - compiler: clang
-      addons:
-        apt:
-          sources:
-          - ubuntu-toolchain-r-test
-          packages:
-          - autoconf
-          - libtool
-          - yasm
-    
     - compiler: gcc-4.8
-      addons:
-        apt:
-          sources:
-          - ubuntu-toolchain-r-test
-          packages:
-          - autoconf
-          - gcc-4.8
-          - libtool
-          - yasm
 
     # We have some Mac specific code and Mac sometimes has odd build issues.
     - os: osx
       compiler: clang  # gcc is actually clang on Travis OS X
-
-    # Check for external symbols without kvz_ prefix.
-    - compiler: gcc-4.8
+      install: true
       script:
         - ./autogen.sh
-        - ./configure && make
-        - (! nm -go --defined-only src/.libs/libkvazaar.a | grep -v ' kvz_') || (echo 'ERROR Only symbols prefixed with kvz_ should be exported from libkvazaar.'; false)
-      addons:
-        apt:
-          sources:
-          - ubuntu-toolchain-r-test
-          packages:
-          - autoconf
-          - gcc-4.8
-          - libtool
-          - yasm
-
-    # Tests trying to use invalid input dimensions
-    - env: EXPECTED_STATUS=1 PARAMS="-i src/kvazaar --input-res=1x65 -o /dev/null"
-      addons:
-        apt:
-          sources:
-          - ubuntu-toolchain-r-test
-          packages:
-          - autoconf
-          - libtool
-          - yasm
+        - ./configure --enable-werror
+        - make --jobs=2 V=1
 
-    # These valgrind tests are slow, so they are performed with the minimum
-    # number of small frames and fast settings.
-    
-    # Tests for interlace
-    - env: VALGRIND_TEST="--source-scan-type=tff -p0 --preset=ultrafast --threads=2 --owf=1 --wpp"
-    
-    # Tests for owf, wpp and tiles. There is lots of separate branches of
-    # code related to owf=0 and owf!=0, which is why all permutations are
-    # tried.
-    - env: VALGRIND_TEST="-p4 -r1 --owf=1 --threads=0 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
-    - env: VALGRIND_TEST="-p4 -r1 --owf=0 --threads=0 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
-    - env: VALGRIND_TEST="-p4 -r2 --owf=1 --threads=2 --wpp --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
-    - env: VALGRIND_TEST="-p4 -r2 --owf=0 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
-    - env: VALGRIND_TEST="-p4 -r2 --owf=1 --threads=2 --tiles-height-split=u2 --no-wpp --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
-    - env: VALGRIND_TEST="-p4 -r2 --owf=0 --threads=2 --tiles-height-split=u2 --no-wpp --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
-    
-    # Tests for rdoq, sao, deblock and signhide and subme.
-    - env: VALGRIND_TEST="-p0 -r1 --threads=2 --wpp --owf=1 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=1 --pu-depth-intra=2-3"
-    - env: VALGRIND_TEST="-p0 -r1 --threads=2 --wpp --owf=1 --rd=0 --no-rdoq --no-signhide --subme=0"
-    - env: VALGRIND_TEST="-p0 -r1 --threads=2 --wpp --owf=1 --rd=0 --no-deblock --no-sao --subme=0"
-    
-    # Tests for all-intra.
-    - env: VALGRIND_TEST="-p1 --threads=2 --owf=1 --rd=1 --no-rdoq --no-deblock --no-sao --no-signhide"
-    - env: VALGRIND_TEST="-p1 --threads=2 --owf=1 --rd=2 --no-rdoq --no-deblock --no-sao --no-signhide --no-transform-skip"
-
-    # Tests for SMP and AMP blocks.
-    - env: TEST_FRAMES=4 VALGRIND_TEST="--threads=2 --owf=1 --wpp --smp"
-    - env: TEST_FRAMES=4 VALGRIND_TEST="--threads=2 --owf=1 --wpp --amp"
-    - env: TEST_FRAMES=4 VALGRIND_TEST="--threads=2 --owf=1 --wpp --smp --amp"
-
-    # Tests for rate control
-    - env: VALGRIND_TEST="--bitrate=500000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
-    
-    # Tests for GOP, with and without OWF.
-    - env: TEST_FRAMES=20 VALGRIND_TEST="--gop=8 -p0 --threads=2 --wpp --owf=1 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
-    - env: TEST_FRAMES=10 VALGRIND_TEST="--gop=8 -p0 --threads=2 --wpp --owf=4 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
-    - env: TEST_FRAMES=20 VALGRIND_TEST="--gop=8 -p0 --threads=2 --wpp --owf=0 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
-    
-    # Tests for --mv-constraint
-    - env: VALGRIND_TEST="--threads=2 --owf=1 --preset=ultrafast --pu-depth-inter=0-3 --mv-constraint=frametilemargin"
-    - env: VALGRIND_TEST="--threads=2 --owf=1 --preset=ultrafast --subme=4 --mv-constraint=frametilemargin"
-    
-    # Tests for --slices
-    - env: TEST_DIM=512x256 VALGRIND_TEST="--threads=2 --owf=1 --preset=ultrafast --tiles=2x2 --slices=tiles"
-    - env: VALGRIND_TEST="--threads=2 --owf=1 --preset=ultrafast --slices=wpp"
-    
-    # Test weird shapes.
-    - env: TEST_DIM=16x16 VALGRIND_TEST="--threads=2 --owf=1 --preset=veryslow"
-    - env: TEST_DIM=256x16 VALGRIND_TEST="--threads=2 --owf=1 --preset=veryslow"
-    - env: TEST_DIM=16x256 VALGRIND_TEST="--threads=2 --owf=1 --preset=veryslow"
-
-install:
-  - source .travis-install.sh
+install: bash .travis-install.bash
 
 script:
-  - source .travis-script.sh
-  
+  - ./autogen.sh
+  - ./configure --enable-werror
+  - make --jobs=2 V=1
+  - make check VERBOSE=1
+
 after_script:
-  - set +e # Disable errors to work around Travis not knowing how to fix their stuff.
+  # Disable errors to work around Travis not knowing how to fix their stuff.
+  - set +e

kvazaar-1.1.0.tar.gz/README.md -> kvazaar-1.2.0.tar.gz/README.md Changed

@@ -100,6 +100,8 @@
                                    delta QP values in raster order.
                                    The delta QP map can be any size or aspect
                                    ratio, and will be mapped to LCU's.
+      --(no-)erp-aqp         : Use adaptive QP for 360 video with
+                               equirectangular projection
 
 Compression tools:
       --deblock <beta:tc>  : Deblocking
@@ -226,26 +228,26 @@
 placebo. The effects of the presets are listed in the following table,
 where the names have been abbreviated to fit the layout in GitHub.
 
-                     | 0-uf  | 1-sf  | 2-vf  | 3-fr  | 4-f   | 5-m   | 6-s   | 7-sr  | 8-vs  | 9-p
--------------------- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | -----
-rd                   | 0     | 0     | 0     | 1     | 1     | 1     | 1     | 1     | 1     | 1
-pu-depth-intra       | 2-3   | 2-3   | 2-3   | 2-3   | 2-3   | 1-3   | 1-3   | 1-3   | 1-4   | 1-4
-pu-depth-inter       | 2-3   | 2-3   | 2-3   | 1-3   | 1-3   | 1-3   | 1-3   | 0-3   | 0-3   | 0-3
-me                   | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | tz
-ref                  | 1     | 1     | 1     | 1     | 1     | 1     | 2     | 2     | 3     | 4
-deblock              | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1
-signhide             | 0     | 0     | 0     | 0     | 0     | 0     | 1     | 1     | 1     | 1
-subme                | 0     | 0     | 2     | 2     | 4     | 4     | 4     | 4     | 4     | 4
-sao                  | 0     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1
-rdoq                 | 0     | 0     | 0     | 0     | 0     | 1     | 1     | 1     | 1     | 1
-rdoq-skip            | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 0
-transform-skip       | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1
-mv-rdo               | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1
-full-intra-search    | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0
-smp                  | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1
-amp                  | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1
-cu-split-termination | zero  | zero  | zero  | zero  | zero  | zero  | zero  | zero  | zero  | off
-me-early-termination | sens. | sens. | sens. | sens. | on    | on    | on    | on    | on    | off
+|                      | 0-uf  | 1-sf  | 2-vf  | 3-fr  | 4-f   | 5-m   | 6-s   | 7-sr  | 8-vs  | 9-p   |
+| -------------------- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- |
+| rd                   | 0     | 0     | 0     | 1     | 1     | 1     | 1     | 1     | 1     | 1     |
+| pu-depth-intra       | 2-3   | 2-3   | 2-3   | 2-3   | 2-3   | 1-3   | 1-3   | 1-3   | 1-4   | 1-4   |
+| pu-depth-inter       | 2-3   | 2-3   | 2-3   | 1-3   | 1-3   | 1-3   | 1-3   | 0-3   | 0-3   | 0-3   |
+| me                   | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | tz    |
+| ref                  | 1     | 1     | 1     | 1     | 1     | 1     | 2     | 2     | 3     | 4     |
+| deblock              | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     |
+| signhide             | 0     | 0     | 0     | 0     | 0     | 0     | 1     | 1     | 1     | 1     |
+| subme                | 0     | 0     | 2     | 2     | 4     | 4     | 4     | 4     | 4     | 4     |
+| sao                  | 0     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     |
+| rdoq                 | 0     | 0     | 0     | 0     | 0     | 1     | 1     | 1     | 1     | 1     |
+| rdoq-skip            | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 0     |
+| transform-skip       | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1     |
+| mv-rdo               | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1     |
+| full-intra-search    | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     |
+| smp                  | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1     |
+| amp                  | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1     |
+| cu-split-termination | zero  | zero  | zero  | zero  | zero  | zero  | zero  | zero  | zero  | off   |
+| me-early-termination | sens. | sens. | sens. | sens. | on    | on    | on    | on    | on    | off   |
 
 
 ## Kvazaar library

kvazaar-1.1.0.tar.gz/build/kvazaar_tests/kvazaar_tests.vcxproj -> kvazaar-1.2.0.tar.gz/build/kvazaar_tests/kvazaar_tests.vcxproj Changed

kvazaar-1.1.0.tar.gz/build/kvazaar_tests/kvazaar_tests.vcxproj.filters -> kvazaar-1.2.0.tar.gz/build/kvazaar_tests/kvazaar_tests.vcxproj.filters Changed

kvazaar-1.1.0.tar.gz/build/yasm/vsyasm.props -> kvazaar-1.2.0.tar.gz/build/yasm/vsyasm.props Changed

@@ -9,16 +9,23 @@
     <YASMDependsOn
       Condition="'$(ConfigurationType)' != 'Makefile'">_SelectedFiles;$(YASMDependsOn)</YASMDependsOn>
   </PropertyGroup>
+  
+  <PropertyGroup Condition="'$(Platform)' == 'Win32'">
+    <YASMFormat>win32</YASMFormat>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)' == 'x64'">
+    <YASMFormat>win64</YASMFormat>
+  </PropertyGroup>
   <ItemDefinitionGroup>
     <YASM>
       <Debug>False</Debug>
       <ObjectFile>$(IntDir)</ObjectFile>
       <PreProc>0</PreProc>
       <Parser>0</Parser>
-      <CommandLineTemplate>vsyasm.exe -Xvc -f $(Platform) AllOptions AdditionalOptions Inputs</CommandLineTemplate>
+      <CommandLineTemplate>vsyasm.exe -Xvc -f $(YASMFormat) AllOptions AdditionalOptions Inputs</CommandLineTemplate>
       <Outputs>%(ObjectFile)</Outputs>
       <ExecutionDescription>Assembling %(Filename)%(Extension)</ExecutionDescription>
       <ShowOnlyRuleProperties>false</ShowOnlyRuleProperties>
     </YASM>
   </ItemDefinitionGroup>
-</Project>
\ No newline at end of file
+</Project>

kvazaar-1.1.0.tar.gz/configure.ac -> kvazaar-1.2.0.tar.gz/configure.ac Changed

@@ -22,8 +22,8 @@
 #   - Increment when making new releases and major or minor was not changed since last release.
 #
 # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html
-ver_major=3
-ver_minor=15
+ver_major=4
+ver_minor=0
 ver_release=0
 
 # Prevents configure from adding a lot of defines to the CFLAGS
@@ -32,7 +32,7 @@
 AC_CONFIG_MACRO_DIR(m4)
 AC_CONFIG_AUX_DIR(build-aux)
 
-AM_INIT_AUTOMAKE(-Wall -Werror dist-bzip2 dist-xz foreign subdir-objects)
+AM_INIT_AUTOMAKE(-Wall dist-bzip2 dist-xz foreign subdir-objects)
 AM_SILENT_RULES(yes)
 
 AC_PROG_CC
@@ -56,6 +56,10 @@
 KVZ_CFLAGS="-Wall -Wtype-limits -Wvla -I$srcdir/src -I$srcdir/src/extras -ftree-vectorize -fvisibility=hidden"
 CFLAGS="$KVZ_CFLAGS $CFLAGS"
 
+AC_SEARCH_LIBS(log, m c, , exit 1)
+AC_SEARCH_LIBS(pow, m c, , exit 1)
+AC_SEARCH_LIBS(sqrt, m c, , exit 1)
+
 AC_ARG_WITH(cryptopp,
     AS_HELP_STRING(--with-cryptopp,
         Build with cryptopp Enables selective encryption.))
@@ -76,21 +80,24 @@
 
 CPPFLAGS="-DKVZ_DLL_EXPORTS $CPPFLAGS"
 
-AC_SEARCH_LIBS(log, m c, , exit 1)
-AC_SEARCH_LIBS(pow, m c, , exit 1)
-AC_SEARCH_LIBS(sqrt, m c, , exit 1)
-
 
+# We need to force AX_PTHREAD to check -pthread -lpthread since otherwise
+# it only outputs -pthread for GCC. Without -lpthread GCC does not link the
+# shared library against the pthread library (even though it does link the
+# executable).
+PTHREAD_CFLAGS=-pthread
+PTHREAD_LIBS=-lpthread
 
 # This does workarounds for pthreads on various compilers.
-AX_PTHREAD
+AX_PTHREAD(,AC_MSG_ERROR(POSIX threads not found))
+
 CFLAGS="$PTHREAD_CFLAGS $CFLAGS"
 LIBS="$PTHREAD_LIBS $LIBS"
 CC="$PTHREAD_CC"
 
-# --disable-werror
-AC_ARG_ENABLE(werror, AS_HELP_STRING(--disable-werror, don't treat warnings as errors no),
-              , CFLAGS="-Werror $CFLAGS"
+# --enable-werror
+AC_ARG_ENABLE(werror, AS_HELP_STRING(--enable-werror, treat warnings as errors no),
+              CFLAGS="-Werror $CFLAGS", 
 )

kvazaar-1.1.0.tar.gz/doc/kvazaar.1 -> kvazaar-1.2.0.tar.gz/doc/kvazaar.1 Changed

kvazaar-1.1.0.tar.gz/src/Makefile.am -> kvazaar-1.2.0.tar.gz/src/Makefile.am Changed

@@ -29,10 +29,21 @@
 	cli.c \
 	yuv_io.c \
 	yuv_io.h
+
 kvazaar_LDADD = libkvazaar.la $(LIBS)
 
 kvazaar_CPPFLAGS = -DKVZ_VERSION="`$(srcdir)/../tools/version.sh`"
 
+if USE_CRYPTOPP
+kvazaar_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) \
+	$(AM_LDFLAGS) $(LDFLAGS) -o $@
+else
+kvazaar_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(AM_LDFLAGS) $(LDFLAGS) -o $@
+endif
+
 libkvazaar_la_SOURCES = \
 	bitstream.c \
 	bitstream.h \
@@ -144,15 +155,21 @@
 	libsse2.la \
 	libsse41.la
 
+libkvazaar_la_LDFLAGS = $(AM_LDFLAGS) -no-undefined -version-number $(KVZ_API_VERSION)
+
 if USE_CRYPTOPP
 libkvazaar_la_SOURCES += \
 	extras/crypto.h \
 	extras/crypto.cpp
+libkvazaar_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) \
+	$(libkvazaar_la_LDFLAGS) $(LDFLAGS) -o $@
+else
+libkvazaar_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(libkvazaar_la_LDFLAGS) $(LDFLAGS) -o $@
 endif
 
-libkvazaar_la_LDFLAGS = $(AM_LDFLAGS) -no-undefined -version-number $(KVZ_API_VERSION)
-
-
 libaltivec_la_SOURCES = \
 	strategies/altivec/picture-altivec.c \
 	strategies/altivec/picture-altivec.h
@@ -170,7 +187,6 @@
 	strategies/avx2/quant-avx2.h \
 	strategies/avx2/sao-avx2.c \
 	strategies/avx2/sao-avx2.h
-	
 
 libsse2_la_SOURCES = \
 	strategies/sse2/picture-sse2.c \

kvazaar-1.1.0.tar.gz/src/cabac.c -> kvazaar-1.2.0.tar.gz/src/cabac.c Changed

@@ -297,9 +297,9 @@
     //m_pcBinIf->encodeBinsEP(Suffix, r_param);
    if(r_param==1) {
      if(!(( base_level ==2 )&& (codeNumber==4 || codeNumber==5) ) ) {
-       uint32_t key    = ff_get_key(&state->tile->dbs_g, 1);
-       state->tile->m_prev_pos  = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 1;
-       CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 1, "coeff_abs_level_remaining");
+       uint32_t key = kvz_crypto_get_key(state->crypto_hdl, 1);
+       state->crypto_prev_pos  = ( Suffix + ( state->crypto_prev_pos^key ) ) & 1;
+       CABAC_BINS_EP(cabac, state->crypto_prev_pos, 1, "coeff_abs_level_remaining");
        //m_pcBinIf->encodeBinsEP(m_prev_pos, 1);
      } else {
        CABAC_BINS_EP(cabac, Suffix, 1, "coeff_abs_level_remaining");
@@ -309,65 +309,65 @@
    else
     if(r_param==2) {
        if( base_level ==1) {
-    	 uint32_t key    =ff_get_key(&state->tile->dbs_g, 2);
-         state->tile->m_prev_pos  = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 3;
-         CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 2, "coeff_abs_level_remaining");
+    	 uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 2);
+         state->crypto_prev_pos  = ( Suffix + ( state->crypto_prev_pos^key ) ) & 3;
+         CABAC_BINS_EP(cabac, state->crypto_prev_pos, 2, "coeff_abs_level_remaining");
          //m_pcBinIf->encodeBinsEP(m_prev_pos, 2);
        } else
          if( base_level ==2) {
            if(codeNumber<=7 || codeNumber>=12) {
-        	 uint32_t key    = ff_get_key(&state->tile->dbs_g, 2);
-             state->tile->m_prev_pos  = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 3;
-             CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 2, "coeff_abs_level_remaining");
+        	 uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 2);
+             state->crypto_prev_pos  = ( Suffix + ( state->crypto_prev_pos^key ) ) & 3;
+             CABAC_BINS_EP(cabac, state->crypto_prev_pos, 2, "coeff_abs_level_remaining");
              //m_pcBinIf->encodeBinsEP(m_prev_pos, 2);
            }
            else
              if(codeNumber<10) {
-                uint32_t key    = ff_get_key(&state->tile->dbs_g, 1);
-                state->tile->m_prev_pos  = (( (Suffix&1) + ( state->tile->m_prev_pos^key )) & 1);
-                CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 2, "coeff_abs_level_remaining");
+                uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 1);
+                state->crypto_prev_pos  = (( (Suffix&1) + ( state->crypto_prev_pos^key )) & 1);
+                CABAC_BINS_EP(cabac, state->crypto_prev_pos, 2, "coeff_abs_level_remaining");
                 //m_pcBinIf->encodeBinsEP(m_prev_pos, 2);
              } else
                CABAC_BINS_EP(cabac, Suffix, 2, "coeff_abs_level_remaining");
                //m_pcBinIf->encodeBinsEP(Suffix, 2);
          } else { //base_level=3
            if(codeNumber<=7 || codeNumber>11) {
-             uint32_t key    = ff_get_key(&state->tile->dbs_g, 2);
-             state->tile->m_prev_pos  = (Suffix + ( state->tile->m_prev_pos^key ) ) & 3;
-             CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 2, "coeff_abs_level_remaining");
+             uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 2);
+             state->crypto_prev_pos  = (Suffix + ( state->crypto_prev_pos^key ) ) & 3;
+             CABAC_BINS_EP(cabac, state->crypto_prev_pos, 2, "coeff_abs_level_remaining");
              //m_pcBinIf->encodeBinsEP(m_prev_pos, 2);
            } else {
-             uint32_t key    = ff_get_key(&state->tile->dbs_g, 1);
-             state->tile->m_prev_pos  = ((Suffix&2))+(( (Suffix&1) + ( state->tile->m_prev_pos^key)) & 1);
-             CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 2, "coeff_abs_level_remaining");
+             uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 1);
+             state->crypto_prev_pos  = ((Suffix&2))+(( (Suffix&1) + ( state->crypto_prev_pos^key)) & 1);
+             CABAC_BINS_EP(cabac, state->crypto_prev_pos, 2, "coeff_abs_level_remaining");
              //m_pcBinIf->encodeBinsEP(m_prev_pos, 2);
            }
          }
      } else
        if(r_param==3) {
          if( base_level ==1) {
-           uint32_t key    = ff_get_key(&state->tile->dbs_g, 3);
-           state->tile->m_prev_pos  = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 7;
-           CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining");
+           uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 3);
+           state->crypto_prev_pos  = ( Suffix + ( state->crypto_prev_pos^key ) ) & 7;
+           CABAC_BINS_EP(cabac, state->crypto_prev_pos, 3, "coeff_abs_level_remaining");
            //m_pcBinIf->encodeBinsEP(m_prev_pos, 3);
          }
          else if( base_level ==2) {
            if(codeNumber<=15 || codeNumber>23) {
-             uint32_t key    = ff_get_key(&state->tile->dbs_g, 3);
-             state->tile->m_prev_pos  = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 7;
-             CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining");
+             uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 3);
+             state->crypto_prev_pos  = ( Suffix + ( state->crypto_prev_pos^key ) ) & 7;
+             CABAC_BINS_EP(cabac, state->crypto_prev_pos, 3, "coeff_abs_level_remaining");
              //m_pcBinIf->encodeBinsEP(m_prev_pos, 3);
            } else
              if(codeNumber<=19){
-               uint32_t key    = ff_get_key(&state->tile->dbs_g, 2);
-               state->tile->m_prev_pos  = ((Suffix&4))+(( (Suffix&3) + (state->tile->m_prev_pos^key )) & 3);
-               CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining");
+               uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 2);
+               state->crypto_prev_pos  = ((Suffix&4))+(( (Suffix&3) + (state->crypto_prev_pos^key )) & 3);
+               CABAC_BINS_EP(cabac, state->crypto_prev_pos, 3, "coeff_abs_level_remaining");
                //m_pcBinIf->encodeBinsEP(m_prev_pos, 3);
              } else
                if(codeNumber<=21){
-            	 uint32_t key    = ff_get_key(&state->tile->dbs_g, 1);
-                 state->tile->m_prev_pos  = 4+(( (Suffix&1) + ( state->tile->m_prev_pos^key )) & 1);
-                 CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining");
+            	 uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 1);
+                 state->crypto_prev_pos  = 4+(( (Suffix&1) + ( state->crypto_prev_pos^key )) & 1);
+                 CABAC_BINS_EP(cabac, state->crypto_prev_pos, 3, "coeff_abs_level_remaining");
                  //m_pcBinIf->encodeBinsEP(m_prev_pos, 3);
                } else
                  CABAC_BINS_EP(cabac, Suffix, 3, "coeff_abs_level_remaining");
@@ -376,82 +376,82 @@
            CABAC_BINS_EP(cabac, Suffix, 3, "coeff_abs_level_remaining");
            //m_pcBinIf->encodeBinsEP(Suffix, 3);
            if(codeNumber<=15 || codeNumber>23) {
-             uint32_t key    = ff_get_key(&state->tile->dbs_g, 3);
-             state->tile->m_prev_pos  = (Suffix + ( state->tile->m_prev_pos^key ) ) & 7;
-             CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining");
+             uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 3);
+             state->crypto_prev_pos  = (Suffix + ( state->crypto_prev_pos^key ) ) & 7;
+             CABAC_BINS_EP(cabac, state->crypto_prev_pos, 3, "coeff_abs_level_remaining");
              //m_pcBinIf->encodeBinsEP(m_prev_pos, 3);
            } else
              if(codeNumber<=19) {
-               uint32_t key    = ff_get_key(&state->tile->dbs_g, 2);
-               state->tile->m_prev_pos  = (( (Suffix&3) + ( state->tile->m_prev_pos^key )) &3);
-               CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining");
+               uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 2);
+               state->crypto_prev_pos  = (( (Suffix&3) + ( state->crypto_prev_pos^key )) &3);
+               CABAC_BINS_EP(cabac, state->crypto_prev_pos, 3, "coeff_abs_level_remaining");
                //m_pcBinIf->encodeBinsEP(m_prev_pos, 3);
              } else
                if(codeNumber<=23) {
-                 uint32_t key    = ff_get_key(&state->tile->dbs_g, 1);
-                 state->tile->m_prev_pos  = (Suffix&6)+(( (Suffix&1) + (state->tile->m_prev_pos^key )) & 1);
-                 CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining");
+                 uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 1);
+                 state->crypto_prev_pos  = (Suffix&6)+(( (Suffix&1) + (state->crypto_prev_pos^key )) & 1);
+                 CABAC_BINS_EP(cabac, state->crypto_prev_pos, 3, "coeff_abs_level_remaining");
                  //m_pcBinIf->encodeBinsEP(m_prev_pos, 3);
                }
          }
        } else
          if(r_param==4) {
            if( base_level ==1) {
-             uint32_t key    = ff_get_key(&state->tile->dbs_g, 4);
-             state->tile->m_prev_pos  = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 15;
-             CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining");
+             uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 4);
+             state->crypto_prev_pos  = ( Suffix + ( state->crypto_prev_pos^key ) ) & 15;
+             CABAC_BINS_EP(cabac, state->crypto_prev_pos, 4, "coeff_abs_level_remaining");
              //m_pcBinIf->encodeBinsEP(m_prev_pos, 4);
            } else
              if( base_level ==2) {
                if(codeNumber<=31 || codeNumber>47) {
-                 uint32_t key    = ff_get_key(&state->tile->dbs_g, 4);
-                 state->tile->m_prev_pos  = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 15;
-                 CABAC_BINS_EP(cabac, state->tile->m_prev_pos, r_param, "coeff_abs_level_remaining");
+                 uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 4);
+                 state->crypto_prev_pos  = ( Suffix + ( state->crypto_prev_pos^key ) ) & 15;
+                 CABAC_BINS_EP(cabac, state->crypto_prev_pos, r_param, "coeff_abs_level_remaining");
                  //m_pcBinIf->encodeBinsEP(m_prev_pos, r_param);
                } else
                  if(codeNumber<=39) {
-                   uint32_t key    = ff_get_key(&state->tile->dbs_g, 3);
-                   state->tile->m_prev_pos  = (( (Suffix&7) + ( state->tile->m_prev_pos^key )) & 7);
-                   CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining");
+                   uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 3);
+                   state->crypto_prev_pos  = (( (Suffix&7) + ( state->crypto_prev_pos^key )) & 7);
+                   CABAC_BINS_EP(cabac, state->crypto_prev_pos, 4, "coeff_abs_level_remaining");
                    //m_pcBinIf->encodeBinsEP(m_prev_pos, 4);
                  } else
                    if(codeNumber<=43) {
-                     uint32_t key    = ff_get_key(&state->tile->dbs_g, 2);
-                     state->tile->m_prev_pos  = 8+(( (Suffix&3) + ( state->tile->m_prev_pos^key )) & 3);
-                     CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining");
+                     uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 2);
+                     state->crypto_prev_pos  = 8+(( (Suffix&3) + ( state->crypto_prev_pos^key )) & 3);
+                     CABAC_BINS_EP(cabac, state->crypto_prev_pos, 4, "coeff_abs_level_remaining");
                      //m_pcBinIf->encodeBinsEP(m_prev_pos, 4);
                    } else
                      if(codeNumber<=45){
-                       uint32_t key    = ff_get_key(&state->tile->dbs_g, 1);
-                       state->tile->m_prev_pos  = 12+(( (Suffix&1) + ( state->tile->m_prev_pos^key )) & 1);
-                       CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining");
+                       uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 1);
+                       state->crypto_prev_pos  = 12+(( (Suffix&1) + ( state->crypto_prev_pos^key )) & 1);
+                       CABAC_BINS_EP(cabac, state->crypto_prev_pos, 4, "coeff_abs_level_remaining");
                        //m_pcBinIf->encodeBinsEP(m_prev_pos, 4);
                      } else
                        CABAC_BINS_EP(cabac, Suffix, 4, "coeff_abs_level_remaining");
                        //m_pcBinIf->encodeBinsEP(Suffix, 4);
              } else {//base_level=3
                if(codeNumber<=31 || codeNumber>47) {
-                 uint32_t key    = ff_get_key(&state->tile->dbs_g, 4);
-                 state->tile->m_prev_pos  = (Suffix + ( state->tile->m_prev_pos^key ) ) & 15;
-                 CABAC_BINS_EP(cabac, state->tile->m_prev_pos, r_param, "coeff_abs_level_remaining");
+                 uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 4);
+                 state->crypto_prev_pos  = (Suffix + ( state->crypto_prev_pos^key ) ) & 15;
+                 CABAC_BINS_EP(cabac, state->crypto_prev_pos, r_param, "coeff_abs_level_remaining");
                  //m_pcBinIf->encodeBinsEP(m_prev_pos, r_param);
                } else
                  if(codeNumber<=39) {
-                   uint32_t key    = ff_get_key(&state->tile->dbs_g, 3);
-                   state->tile->m_prev_pos  = (( (Suffix&7) + ( state->tile->m_prev_pos^key )) & 7);
-                   CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining");
+                   uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 3);
+                   state->crypto_prev_pos  = (( (Suffix&7) + ( state->crypto_prev_pos^key )) & 7);
+                   CABAC_BINS_EP(cabac, state->crypto_prev_pos, 4, "coeff_abs_level_remaining");
                    //m_pcBinIf->encodeBinsEP(m_prev_pos, 4);
                  } else
                    if(codeNumber<=43) {
-                     uint32_t key    = ff_get_key(&state->tile->dbs_g, 2);
-                     state->tile->m_prev_pos  = 8+(( (Suffix&3) + ( state->tile->m_prev_pos^key )) & 3);
-                     CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining");
+                     uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 2);
+                     state->crypto_prev_pos  = 8+(( (Suffix&3) + ( state->crypto_prev_pos^key )) & 3);
+                     CABAC_BINS_EP(cabac, state->crypto_prev_pos, 4, "coeff_abs_level_remaining");
                      //m_pcBinIf->encodeBinsEP(m_prev_pos, 4);
                    } else
                      if(codeNumber<=47) {
-                       uint32_t key    = ff_get_key(&state->tile->dbs_g, 1);
-                       state->tile->m_prev_pos  = (Suffix&14)+(( (Suffix&1) + (state->tile->m_prev_pos^key )) & 1);
-                       CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining");
+                       uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, 1);
+                       state->crypto_prev_pos  = (Suffix&14)+(( (Suffix&1) + (state->crypto_prev_pos^key )) & 1);
+                       CABAC_BINS_EP(cabac, state->crypto_prev_pos, 4, "coeff_abs_level_remaining");
                        //m_pcBinIf->encodeBinsEP(m_prev_pos, 4);
                      }
              }
@@ -466,10 +466,10 @@
     CABAC_BINS_EP(cabac, (1 << (3 + length + 1 - r_param)) - 2, 3 + length + 1 - r_param, "coeff_abs_level_remaining");
     //m_pcBinIf->encodeBinsEP((1<<(COEF_REMAIN_BIN_REDUCTION+length+1-r_param))-2,COEF_REMAIN_BIN_REDUCTION+length+1-r_param);
     uint32_t Suffix = codeNumber;
-    uint32_t key    = ff_get_key(&state->tile->dbs_g, length);
+    uint32_t key    = kvz_crypto_get_key(state->crypto_hdl, length);
     uint32_t mask   = ( (1<<length ) -1 );
-    state->tile->m_prev_pos  = ( Suffix + ( state->tile->m_prev_pos^key ) ) & mask;
-    CABAC_BINS_EP(cabac, state->tile->m_prev_pos, length, "coeff_abs_level_remaining");
+    state->crypto_prev_pos  = ( Suffix + ( state->crypto_prev_pos^key ) ) & mask;
+    CABAC_BINS_EP(cabac, state->crypto_prev_pos, length, "coeff_abs_level_remaining");
     //m_pcBinIf->encodeBinsEP(m_prev_pos,length);
   }
 }
@@ -532,7 +532,10 @@
 /**
  * \brief
  */
-void kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t * const data, uint32_t symbol, uint32_t count)
+void kvz_cabac_write_ep_ex_golomb(encoder_state_t * const state,
+                                  cabac_data_t * const data,
+                                  uint32_t symbol,
+                                  uint32_t count)
 {
   uint32_t bins = 0;
   int32_t num_bins = 0;
@@ -548,13 +551,13 @@
 
   bins      = (bins << count) | symbol;
   num_bins += count;
-  if (!state->cabac.only_count) {
+  if (!data->only_count) {
     if (state->encoder_control->cfg.crypto_features & KVZ_CRYPTO_MVs) {
       uint32_t key, mask;
-      key                      = ff_get_key(&state->tile->dbs_g, num_bins>>1);
+      key                      = kvz_crypto_get_key(state->crypto_hdl, num_bins>>1);
       mask                     = ( (1<<(num_bins >>1) ) -1 );
-      state->tile->m_prev_pos  = ( bins + ( state->tile->m_prev_pos^key ) ) & mask;
-      bins                     = ( (bins >> (num_bins >>1) ) << (num_bins >>1) ) | state->tile->m_prev_pos;
+      state->crypto_prev_pos  = ( bins + ( state->crypto_prev_pos^key ) ) & mask;
+      bins                     = ( (bins >> (num_bins >>1) ) << (num_bins >>1) ) | state->crypto_prev_pos;
     }
   }
   kvz_cabac_encode_bins_ep(data, bins, num_bins);

kvazaar-1.1.0.tar.gz/src/cfg.c -> kvazaar-1.2.0.tar.gz/src/cfg.c Changed

@@ -44,7 +44,7 @@
   cfg->deblock_enable  = 1;
   cfg->deblock_beta    = 0;
   cfg->deblock_tc      = 0;
-  cfg->sao_enable      = 1;
+  cfg->sao_type        = 3;
   cfg->rdoq_enable     = 1;
   cfg->rdoq_skip       = 1;
   cfg->signhide_enable = true;
@@ -119,8 +119,12 @@
   cfg->roi.height = 0;
   cfg->roi.dqps = NULL;
 
+  cfg->erp_aqp = false;
+
   cfg->slices = KVZ_SLICES_NONE;
 
+  cfg->optional_key = NULL;
+
   return 1;
 }
 
@@ -132,6 +136,7 @@
     FREE_POINTER(cfg->tiles_height_split);
     FREE_POINTER(cfg->slice_addresses_in_ts);
     FREE_POINTER(cfg->roi.dqps);
+    FREE_POINTER(cfg->optional_key);
   }
   free(cfg);
 
@@ -228,6 +233,54 @@
   return 1;
 }
 
+static int parse_uint8(const char *numstr,uint8_t* number,int min, int max)                               
+{
+  char *tail;
+  int d = strtol(numstr, &tail, 10);
+  if (*tail || d < min || d > max){
+    fprintf(stderr, "Expected number between %d and %d\n", min, max);
+    if(number)
+      *number = 0;
+    return 0;
+  } else{
+    if (number)
+      *number = (uint8_t) d;
+    return 1;
+  }
+}
+
+static int parse_array(const char *array, uint8_t *coeff_key, int size,
+                            int min, int max)
+{
+  char *key = strdup(array);
+  const char delim[] = ",;:";
+  char *token;
+  int i = 0;
+
+  token = strtok(key, delim);
+  while(token!=NULL&&i<size){
+    if (!parse_uint8(token, &coeff_key[i], min, max))
+    {
+      free(key);
+      return 0;
+    }
+    i++;
+    token = strtok(NULL, delim);
+  }
+  if(i>=size && (token != NULL)){
+    fprintf(stderr, "parsing failed : too many members.\n");
+    free(key);
+    return 0;
+  }
+  else if (i<size){
+    fprintf(stderr, "parsing failed : too few members.\n");
+    free(key);
+    return 0;
+  }
+  free(key);
+  return 1;
+}
+
 static int parse_slice_specification(const char* const arg, int32_t * const nslices, int32_t** const array) {
   const char* current_arg = NULL;
   int32_t current_value;
@@ -309,10 +362,12 @@
 
  static const char * const cu_split_termination_names[] = { "zero", "off", NULL };
  static const char * const crypto_toggle_names[] = { "off", "on", NULL };
-  static const char * const crypto_feature_names[] = { "mvs", "mv_signs", "trans_coeffs", "trans_coeff_signs", NULL };
+  static const char * const crypto_feature_names[] = { "mvs", "mv_signs", "trans_coeffs", "trans_coeff_signs", "intra_pred_modes", NULL };
 
  static const char * const me_early_termination_names[] = { "off", "on", "sensitive", NULL };
 
+  static const char * const sao_names[] = { "off", "edge", "band", "full", NULL };
+
  static const char * const preset_values[11][20*2] = {
       { 
         "ultrafast", 
@@ -324,7 +379,7 @@
         "deblock", "0:0",
         "signhide", "0",
         "subme", "0",
-        "sao", "0",
+        "sao", "off",
         "rdoq", "0",
         "rdoq-skip", "1",
         "transform-skip", "0", 
@@ -347,7 +402,7 @@
         "deblock", "0:0",
         "signhide", "0",
         "subme", "0",
-        "sao", "1",
+        "sao", "full",
         "rdoq", "0",
         "rdoq-skip", "1",
         "transform-skip", "0",
@@ -370,7 +425,7 @@
         "deblock", "0:0",
         "signhide", "0",
         "subme", "2",
-        "sao", "1",
+        "sao", "full",
         "rdoq", "0",
         "rdoq-skip", "1",
         "transform-skip", "0",
@@ -393,7 +448,7 @@
         "deblock", "0:0",
         "signhide", "0",
         "subme", "2",
-        "sao", "1",
+        "sao", "full",
         "rdoq", "0",
         "rdoq-skip", "1",
         "transform-skip", "0",
@@ -416,7 +471,7 @@
         "deblock", "0:0",
         "signhide", "0",
         "subme", "4",
-        "sao", "1",
+        "sao", "full",
         "rdoq", "0",
         "rdoq-skip", "1",
         "transform-skip", "0",
@@ -439,7 +494,7 @@
         "deblock", "0:0",
         "signhide", "0",
         "subme", "4",
-        "sao", "1",
+        "sao", "full",
         "rdoq", "1",
         "rdoq-skip", "1",
         "transform-skip", "0",
@@ -462,7 +517,7 @@
         "deblock", "0:0",
         "signhide", "1",
         "subme", "4",
-        "sao", "1",
+        "sao", "full",
         "rdoq", "1",
         "rdoq-skip", "1",
         "transform-skip", "0",
@@ -485,7 +540,7 @@
         "deblock", "0:0",
         "signhide", "1",
         "subme", "4",
-        "sao", "1",
+        "sao", "full",
         "rdoq", "1",
         "rdoq-skip", "1",
         "transform-skip", "0",
@@ -508,7 +563,7 @@
         "deblock", "0:0",
         "signhide", "1",
         "subme", "4",
-        "sao", "1",
+        "sao", "full",
         "rdoq", "1",
         "rdoq-skip", "1",
         "transform-skip", "0",
@@ -531,7 +586,7 @@
         "deblock", "0:0",
         "signhide", "1",
         "subme", "4",
-        "sao", "1",
+        "sao", "full",
         "rdoq", "1",
         "rdoq-skip", "0",
         "transform-skip", "1",
@@ -599,8 +654,11 @@
       cfg->deblock_enable = atobool(value);
     }
   }
-  else if OPT("sao")
-    cfg->sao_enable = atobool(value);
+  else if OPT("sao") {
+    int8_t sao_type = 0;
+    if (!parse_enum(value, sao_names, &sao_type)) sao_type = atobool(value) ? 3 : 0;
+    cfg->sao_type = sao_type;
+  }
   else if OPT("rdoq")
     cfg->rdoq_enable = atobool(value);
   else if OPT("signhide")
@@ -945,6 +1003,12 @@
 
     return 1;
   }
+  else if OPT("key"){
+    int size_key = 16;
+    FREE_POINTER(cfg->optional_key);
+    cfg->optional_key = (uint8_t *)malloc(sizeof(uint8_t)*size_key);
+    return parse_array(value, cfg->optional_key, size_key, 0, 255);
+  }
   else if OPT("me-early-termination"){
     int8_t mode = 0;
     int result = parse_enum(value, me_early_termination_names, &mode);
@@ -1021,7 +1085,7 @@
     }
 
     const unsigned size = width * height;
-    uint8_t *dqp_array  = calloc((size_t)size, sizeof(cfg->roi.dqps[0]));
+    int8_t *dqp_array  = calloc((size_t)size, sizeof(cfg->roi.dqps[0]));
     if (!dqp_array) {
       fprintf(stderr, "Failed to allocate memory for ROI table.\n");
       fclose(f);
@@ -1040,11 +1104,13 @@
         fclose(f);
         return 0;
       }
-      dqp_array[i] = (uint8_t)number;
+      dqp_array[i] = CLIP(-51, 51, number);
     }
 
     fclose(f);
   }
+  else if OPT("erp-aqp")
+    cfg->erp_aqp = (bool)atobool(value);
   else
     return 0;
 #undef OPT
@@ -1251,6 +1317,11 @@
     error = 1;
   }
 
+  if (cfg->qp != CLIP_TO_QP(cfg->qp)) {
+      fprintf(stderr, "Input error: --qp parameter out of range 0..51\n");
+      error = 1;
+  }
+
   if (cfg->target_bitrate < 0) {
       fprintf(stderr, "Input error: --bitrate must be nonnegative\n");
       error = 1;

kvazaar-1.1.0.tar.gz/src/cli.c -> kvazaar-1.2.0.tar.gz/src/cli.c Changed

@@ -47,7 +47,7 @@
   { "input-fps",          required_argument, NULL, 0 },
   { "deblock",            required_argument, NULL, 0 },
   { "no-deblock",               no_argument, NULL, 0 },
-  { "sao",                      no_argument, NULL, 0 },
+  { "sao",                optional_argument, NULL, 0 },
   { "no-sao",                   no_argument, NULL, 0 },
   { "rdoq",                     no_argument, NULL, 0 },
   { "no-rdoq",                  no_argument, NULL, 0 },
@@ -107,6 +107,7 @@
   { "hash",               required_argument, NULL, 0 },
   {"cu-split-termination",required_argument, NULL, 0 },
   { "crypto",             required_argument, NULL, 0 },
+  { "key",                required_argument, NULL, 0 },
   { "me-early-termination",required_argument, NULL, 0 },
   { "lossless",                 no_argument, NULL, 0 },
   { "no-lossless",              no_argument, NULL, 0 },
@@ -119,6 +120,8 @@
   { "implicit-rdpcm",           no_argument, NULL, 0 },
   { "no-implicit-rdpcm",        no_argument, NULL, 0 },
   { "roi",                required_argument, NULL, 0 },
+  { "erp-aqp",                  no_argument, NULL, 0 },
+  { "no-erp-aqp",               no_argument, NULL, 0 },
   {0, 0, 0, 0}
 };
 
@@ -388,6 +391,8 @@
     "                                   delta QP values in raster order.\n"
     "                                   The delta QP map can be any size or aspect\n"
     "                                   ratio, and will be mapped to LCU's.\n"
+    "      --(no-)erp-aqp         : Use adaptive QP for 360 video with\n"
+    "                               equirectangular projection\n"
     "\n"
     /* Word wrap to this width to stay under 80 characters (including ") ************/
     "Compression tools:\n"
@@ -497,19 +502,23 @@
 
 void print_frame_info(const kvz_frame_info *const info,
                       const double frame_psnr[3],
-                      const uint32_t bytes)
+                      const uint32_t bytes,
+                      const bool print_psnr)
 {
-  fprintf(stderr, "POC %4d QP %2d (%c-frame) %10d bits PSNR: %2.4f %2.4f %2.4f",
+  fprintf(stderr, "POC %4d QP %2d (%c-frame) %10d bits",
           info->poc,
           info->qp,
           "BPI"[info->slice_type % 3],
-          bytes << 3,
-          frame_psnr[0], frame_psnr[1], frame_psnr[2]);
+          bytes << 3);
+  if (print_psnr) {
+    fprintf(stderr, " PSNR Y %2.4f U %2.4f V %2.4f",
+            frame_psnr[0], frame_psnr[1], frame_psnr[2]);
+  }
 
   if (info->slice_type != KVZ_SLICE_I) {
     // Print reference picture lists
     fprintf(stderr, " L0 ");
-    for (int j = info->ref_list_len[0] - 1; j >= 0; j--) {
+    for (int j = 0; j < info->ref_list_len[0]; j++) {
       fprintf(stderr, "%d ", info->ref_list[0][j]);
     }
     fprintf(stderr, " L1 ");

kvazaar-1.1.0.tar.gz/src/cli.h -> kvazaar-1.2.0.tar.gz/src/cli.h Changed

kvazaar-1.1.0.tar.gz/src/cu.c -> kvazaar-1.2.0.tar.gz/src/cu.c Changed

@@ -78,33 +78,6 @@
 };
 
 
-#define BLIT_COEFF_CASE(n) case n:\
-  for (y = 0; y < n; ++y) {\
-    memcpy(&dst[y*dst_stride], &orig[y*orig_stride], n * sizeof(coeff_t));\
-  }\
-  break;
-
-void kvz_coefficients_blit(const coeff_t * const orig, coeff_t * const dst,
-                         const unsigned width, const unsigned height,
-                         const unsigned orig_stride, const unsigned dst_stride)
-{
-  unsigned y;
-  
-  int nxn_width = (width == height) ? width : 0;
-  switch (nxn_width) {
-    BLIT_COEFF_CASE(4)
-    BLIT_COEFF_CASE(8)
-    BLIT_COEFF_CASE(16)
-    BLIT_COEFF_CASE(32)
-    BLIT_COEFF_CASE(64)
-  default:
-    for (y = 0; y < height; ++y) {
-      memcpy(&dst[y*dst_stride], &orig[y*orig_stride], width * sizeof(coeff_t));
-    }
-    break;
-  }
-}
-
 cu_info_t* kvz_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px)
 {
   return (cu_info_t*) kvz_cu_array_at_const(cua, x_px, y_px);
@@ -115,7 +88,7 @@
 {
   assert(x_px < cua->width);
   assert(y_px < cua->height);
-  return &(cua)->data[(x_px >> 2) + (y_px >> 2) * ((cua)->width >> 2)];
+  return &(cua)->data[(x_px >> 2) + (y_px >> 2) * ((cua)->stride >> 2)];
 }
 
 
@@ -125,82 +98,99 @@
  * \param width   width of the array in luma pixels
  * \param height  height of the array in luma pixels
  */
-cu_array_t * kvz_cu_array_alloc(const int width, const int height) {
+cu_array_t * kvz_cu_array_alloc(const int width, const int height)
+{
   cu_array_t *cua = MALLOC(cu_array_t, 1);
 
-  // Round up to a multiple of cell width and divide by cell width.
-  const int width_scu  = (width  + 15) >> 2;
-  const int height_scu = (height + 15) >> 2;
-  assert(width_scu  * 16 >= width);
-  assert(height_scu * 16 >= height);
+  // Round up to a multiple of LCU width and divide by cell width.
+  const int width_scu  = CEILDIV(width,  LCU_WIDTH) * LCU_WIDTH / SCU_WIDTH;
+  const int height_scu = CEILDIV(height, LCU_WIDTH) * LCU_WIDTH / SCU_WIDTH;
   const unsigned cu_array_size = width_scu * height_scu;
-  cua->data = calloc(cu_array_size, sizeof(cu_info_t));
-  cua->width  = width_scu  << 2;
-  cua->height = height_scu << 2;
+
+  cua->base     = NULL;
+  cua->data     = calloc(cu_array_size, sizeof(cu_info_t));
+  cua->width    = width_scu  * SCU_WIDTH;
+  cua->height   = height_scu * SCU_WIDTH;
+  cua->stride   = cua->width;
   cua->refcount = 1;
 
   return cua;
 }
 
 
-int kvz_cu_array_free(cu_array_t * const cua)
+cu_array_t * kvz_cu_subarray(cu_array_t *base,
+                             const unsigned x_offset,
+                             const unsigned y_offset,
+                             const unsigned width,
+                             const unsigned height)
+{
+  assert(x_offset + width <= base->width);
+  assert(y_offset + height <= base->height);
+
+  if (x_offset == 0 &&
+      y_offset == 0 &&
+      width == base->width &&
+      height == base->height)
+  {
+    return kvz_cu_array_copy_ref(base);
+  }
+
+  cu_array_t *cua = MALLOC(cu_array_t, 1);
+
+  // Find the real base array.
+  cu_array_t *real_base = base;
+  while (real_base->base) {
+    real_base = real_base->base;
+  }
+  cua->base     = kvz_cu_array_copy_ref(real_base);
+  cua->data     = kvz_cu_array_at(base, x_offset, y_offset);
+  cua->width    = width;
+  cua->height   = height;
+  cua->stride   = base->stride;
+  cua->refcount = 1;
+
+  return cua;
+}
+
+void kvz_cu_array_free(cu_array_t **cua_ptr)
 {
-  int32_t new_refcount;
-  if (!cua) return 1;
+  cu_array_t *cua = *cua_ptr;
+  if (cua == NULL) return;
+  *cua_ptr = NULL;
+
+  int new_refcount = KVZ_ATOMIC_DEC(&cua->refcount);
+  if (new_refcount > 0) {
+    // Still we have some references, do nothing.
+    return;
+  }
 
-  new_refcount = KVZ_ATOMIC_DEC(&(cua->refcount));
-  //Still we have some references, do nothing
-  if (new_refcount > 0) return 1;
+  assert(new_refcount == 0);
 
-  FREE_POINTER(cua->data);
-  free(cua);
+  if (!cua->base) {
+    FREE_POINTER(cua->data);
+  } else {
+    kvz_cu_array_free(&cua->base);
+    cua->data = NULL;
+  }
 
-  return 1;
+  FREE_POINTER(cua);
 }
 
 
 /**
- * \brief Copy part of a cu array to another cu array.
- *
- * All values are in luma pixels.
+ * \brief Get a new pointer to a cu array.
  *
- * \param dst     destination array
- * \param dst_x   x-coordinate of the left edge of the copied area in dst
- * \param dst_y   y-coordinate of the top edge of the copied area in dst
- * \param src     source array
- * \param src_x   x-coordinate of the left edge of the copied area in src
- * \param src_y   y-coordinate of the top edge of the copied area in src
- * \param width   width of the area to copy
- * \param height  height of the area to copy
+ * Increment reference count and return the cu array.
  */
-void kvz_cu_array_copy(cu_array_t* dst,       int dst_x, int dst_y,
-                       const cu_array_t* src, int src_x, int src_y,
-                       int width, int height)
+cu_array_t * kvz_cu_array_copy_ref(cu_array_t* cua)
 {
-  // Convert values from pixel coordinates to array indices.
-  int src_stride = src->width >> 2;
-  int dst_stride = dst->width >> 2;
-  const cu_info_t* src_ptr = &src->data[(src_x >> 2) + (src_y >> 2) * src_stride];
-  cu_info_t* dst_ptr       = &dst->data[(dst_x >> 2) + (dst_y >> 2) * dst_stride];
-
-  // Number of bytes to copy per row.
-  const size_t row_size = sizeof(cu_info_t) * (width >> 2);
-
-  width = MIN(width,   MIN(src->width  - src_x, dst->width  - dst_x));
-  height = MIN(height, MIN(src->height - src_y, dst->height - dst_y));
-
-  assert(src_x + width  <= src->width);
-  assert(src_y + height <= src->height);
-  assert(dst_x + width  <= dst->width);
-  assert(dst_y + height <= dst->height);
-
-  for (int i = 0; i < (height >> 2); ++i) {
-    memcpy(dst_ptr, src_ptr, row_size);
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  }
+  // The caller should have had another reference.
+  assert(cua->refcount > 0);
+  KVZ_ATOMIC_INC(&cua->refcount);
+  return cua;
 }
 
+
 /**
  * \brief Copy an lcu to a cu array.
  *
@@ -213,7 +203,7 @@
  */
 void kvz_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src)
 {
-  const int dst_stride = dst->width >> 2;
+  const int dst_stride = dst->stride >> 2;
   for (int y = 0; y < LCU_WIDTH; y += SCU_WIDTH) {
     for (int x = 0; x < LCU_WIDTH; x += SCU_WIDTH) {
       const cu_info_t *from_cu = LCU_GET_CU_AT_PX(src, x, y);

kvazaar-1.1.0.tar.gz/src/cu.h -> kvazaar-1.2.0.tar.gz/src/cu.h Changed

@@ -138,10 +138,13 @@
       int8_t mode;
       int8_t mode_chroma;
       int8_t tr_skip;    //!< \brief transform skip flag
+#if KVZ_SEL_ENCRYPTION
+      int8_t mode_encry;
+#endif
     } intra;
     struct {
       int16_t mv[2][2];  // \brief Motion vectors for L0 and L1
-      uint8_t mv_ref[2]; // \brief Index of the encoder_control.ref array.
+      uint8_t mv_ref[2]; // \brief Index of the L0 and L1 array.
       uint8_t mv_cand0 : 3; // \brief selected MV candidate
       uint8_t mv_cand1 : 3; // \brief selected MV candidate
       uint8_t mv_dir   : 2; // \brief Probably describes if mv_ref is L0, L1 or both (bi-pred)
@@ -178,20 +181,26 @@
   (cu).inter.cost, (cu).inter.bitcost, (cu).inter.mv0, (cu).inter.mv1, (cu).inter.mvd0, (cu).inter.mvd1, \
   (cu).inter.mv_cand, (cu).inter.mv_ref, (cu).inter.mv_dir, (cu).inter.mode)
 
-typedef struct {
-  cu_info_t *data; //!< \brief cu array
+typedef struct cu_array_t {
+  struct cu_array_t *base; //!< \brief base cu array or NULL
+  cu_info_t *data;  //!< \brief cu array
   int32_t width;    //!< \brief width of the array in pixels
   int32_t height;   //!< \brief height of the array in pixels
+  int32_t stride;   //!< \brief stride of the array in pixels
   int32_t refcount; //!< \brief number of references to this cu_array
 } cu_array_t;
 
-cu_array_t * kvz_cu_array_alloc(int width, int height);
-int kvz_cu_array_free(cu_array_t *cua);
 cu_info_t* kvz_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px);
 const cu_info_t* kvz_cu_array_at_const(const cu_array_t *cua, unsigned x_px, unsigned y_px);
-void kvz_cu_array_copy(cu_array_t* dst,       int dst_x, int dst_y,
-                       const cu_array_t* src, int src_x, int src_y,
-                       int width, int height);
+
+cu_array_t * kvz_cu_array_alloc(const int width, const int height);
+cu_array_t * kvz_cu_subarray(cu_array_t *base,
+                             const unsigned x_offset,
+                             const unsigned y_offset,
+                             const unsigned width,
+                             const unsigned height);
+void kvz_cu_array_free(cu_array_t **cua_ptr);
+cu_array_t * kvz_cu_array_copy_ref(cu_array_t* cua);
 
 
 /**
@@ -221,7 +230,54 @@
   kvz_pixel v[LCU_REF_PX_WIDTH / 2 + 1];
 } lcu_ref_px_t;
 
-typedef struct {
+/**
+ * \brief Coefficients of an LCU
+ *
+ * Coefficients inside a single TU are stored in row-major order. TUs
+ * themselves are stored in a zig-zag order, so that the coefficients of
+ * a TU are contiguous in memory.
+ *
+ * Example storage order for a 32x32 pixel TU tree
+ *
+ \verbatim
+
+   +------+------+------+------+---------------------------+
+   |   0  |  16  |  64  |  80  |                           |
+   |   -  |   -  |   -  |   -  |                           |
+   |  15  |  31  |  79  |  95  |                           |
+   +------+------+------+------+                           |
+   |  32  |  48  |  96  | 112  |                           |
+   |   -  |   -  |   -  |   -  |                           |
+   |  47  |  63  | 111  | 127  |                           |
+   +------+------+------+------+         256 - 511         |
+   | 128  | 144  | 192  | 208  |                           |
+   |   -  |   -  |   -  |   -  |                           |
+   | 143  | 159  | 207  | 223  |                           |
+   +------+------+------+------+                           |
+   | 160  | 176  | 224  | 240  |                           |
+   |   -  |   -  |   -  |   -  |                           |
+   | 175  | 191  | 239  | 255  |                           |
+   +------+------+------+------+-------------+------+------+
+   | 512  | 528  |             |             | 832  | 848  |
+   |   -  |   -  |             |             |   -  |   -  |
+   | 527  | 543  |             |             | 847  | 863  |
+   +------+------+  576 - 639  |  768 - 831  +------+------+
+   | 544  | 560  |             |             | 864  | 880  |
+   |   -  |   -  |             |             |   -  |   -  |
+   | 559  | 575  |             |             | 879  | 895  |
+   +------+------+-------------+-------------+------+------+
+   |             |             |             |             |
+   |             |             |             |             |
+   |             |             |             |             |
+   |  640 - 703  |  704 - 767  |  896 - 959  |  960 - 1023 |
+   |             |             |             |             |
+   |             |             |             |             |
+   |             |             |             |             |
+   +-------------+-------------+-------------+-------------+
+
+ \endverbatim
+ */
+typedef ALIGNED(8) struct {
   coeff_t y[LCU_LUMA_SIZE];
   coeff_t u[LCU_CHROMA_SIZE];
   coeff_t v[LCU_CHROMA_SIZE];
@@ -287,6 +343,72 @@
 #define LCU_GET_CU_AT_PX(lcu, x_px, y_px) \
   (&(lcu)->cu[LCU_CU_OFFSET + ((x_px) >> 2) + ((y_px) >> 2) * LCU_T_CU_WIDTH])
 
+
+/**
+ * \brief  Copy a part of a coeff_t array to another.
+ *
+ * \param width  Size of the block to be copied in pixels.
+ * \param src    Pointer to the source array.
+ * \param dest   Pointer to the destination array.
+ */
+static INLINE void copy_coeffs(const coeff_t *__restrict src,
+                               coeff_t *__restrict dest,
+                               size_t width)
+{
+  memcpy(dest, src, width * width * sizeof(coeff_t));
+}
+
+
+/**
+ * \brief  Convert (x, y) coordinates to z-order index.
+ *
+ * Only works for widths and coordinates divisible by four. Width must be
+ * a power of two in range 4..64.
+ *
+ * \param width   size of the containing block
+ * \param x       x-coordinate
+ * \param y       y-coordinate
+ * \return        index in z-order
+ */
+static INLINE unsigned xy_to_zorder(unsigned width, unsigned x, unsigned y)
+{
+  assert(width % 4 == 0 && width >= 4 && width <= 64);
+  assert(x % 4 == 0 && x < width);
+  assert(y % 4 == 0 && y < width);
+
+  unsigned result = 0;
+
+  switch (width) {
+    case 64:
+      result += x / 32 * (32*32);
+      result += y / 32 * (64*32);
+      x %= 32;
+      y %= 32;
+      // fallthrough
+    case 32:
+      result += x / 16 * (16*16);
+      result += y / 16 * (32*16);
+      x %= 16;
+      y %= 16;
+      // fallthrough
+    case 16:
+      result += x / 8 * ( 8*8);
+      result += y / 8 * (16*8);
+      x %= 8;
+      y %= 8;
+      // fallthrough
+    case 8:
+      result += x / 4 * (4*4);
+      result += y / 4 * (8*4);
+      // fallthrough
+    case 4:
+      break;
+  }
+
+  return result;
+}
+
+
 #define CHECKPOINT_LCU(prefix_str, lcu) do { \
   CHECKPOINT_CU(prefix_str " cu[0]", (lcu).cu[0]); \
   CHECKPOINT_CU(prefix_str " cu[1]", (lcu).cu[1]); \
@@ -373,10 +495,6 @@
 } while(0)
 
 
-void kvz_coefficients_blit(const coeff_t *orig, coeff_t *dst,
-                         unsigned width, unsigned height,
-                         unsigned orig_stride, unsigned dst_stride);
-
 #define NUM_CBF_DEPTHS 5
 static const uint16_t cbf_masks[NUM_CBF_DEPTHS] = { 0x1f, 0x0f, 0x07, 0x03, 0x1 };

kvazaar-1.1.0.tar.gz/src/encmain.c -> kvazaar-1.2.0.tar.gz/src/encmain.c Changed

@@ -83,11 +83,11 @@
   }
 }
 
-#if KVZ_BIT_DEPTH == 8
-#define PSNRMAX (255.0 * 255.0)
-#else
-  #define PSNRMAX ((double)PIXEL_MAX * (double)PIXEL_MAX)
-#endif
+/**
+ * \brief Value that is printed instead of PSNR when SSE is zero.
+ */
+static const double MAX_PSNR = 999.99;
+static const double MAX_SQUARED_ERROR = (double)PIXEL_MAX * (double)PIXEL_MAX;
 
 /**
  * \brief Calculates image PSNR value
@@ -105,28 +105,31 @@
 
   int32_t pixels = src->width * src->height;
   int colors = rec->chroma_format == KVZ_CSP_400 ? 1 : 3;
+  double sse[3] = { 0.0 };
 
   for (int32_t c = 0; c < colors; ++c) {
     int32_t num_pixels = pixels;
     if (c != COLOR_Y) {
       num_pixels >>= 2;
     }
-    psnr[c] = 0;
     for (int32_t i = 0; i < num_pixels; ++i) {
       const int32_t error = src->data[c][i] - rec->data[c][i];
-      psnr[c] += error * error;
+      sse[c] += error * error;
     }
 
     // Avoid division by zero
-    if (psnr[c] == 0) psnr[c] = 99.0;
-    psnr[c] = 10 * log10((num_pixels * PSNRMAX) / ((double)psnr[c]));;
+    if (sse[c] == 0.0) {
+      psnr[c] = MAX_PSNR;
+    } else {
+      psnr[c] = 10.0 * log10(num_pixels * MAX_SQUARED_ERROR / sse[c]);
+    }
   }
 }
 
 typedef struct {
-  // Mutexes for synchronization.
-  pthread_mutex_t* input_mutex;
-  pthread_mutex_t* main_thread_mutex;
+  // Semaphores for synchronization.
+  kvz_sem_t* available_input_slots;
+  kvz_sem_t* filled_input_slots;
 
   // Parameters passed from main thread to input thread.
   FILE* input;
@@ -141,9 +144,6 @@
   int retval;
 } input_handler_args;
 
-#define PTHREAD_LOCK(l) if (pthread_mutex_lock((l)) != 0) { fprintf(stderr, "pthread_mutex_lock(%s) failed!\n", #l); assert(0); return 0; }
-#define PTHREAD_UNLOCK(l) if (pthread_mutex_unlock((l)) != 0) { fprintf(stderr, "pthread_mutex_unlock(%s) failed!\n", #l); assert(0); return 0; }
-
 #define RETVAL_RUNNING 0
 #define RETVAL_FAILURE 1
 #define RETVAL_EOF 2
@@ -193,7 +193,7 @@
     // Set PTS to make sure we pass it on correctly.
     frame_in->pts = frames_read;
 
-    bool read_success = yuv_io_read(args->input, 
+    bool read_success = yuv_io_read(args->input,
                                     args->opts->config->width,
                                     args->opts->config->height,
                                     args->encoder->cfg.input_bitdepth,
@@ -242,30 +242,65 @@
     }
 
     // Wait until main thread is ready to receive the next frame.
-    PTHREAD_LOCK(args->input_mutex);
+    kvz_sem_wait(args->available_input_slots);
     args->img_in = frame_in;
     args->retval = retval;
     // Unlock main_thread_mutex to notify main thread that the new img_in
     // and retval have been placed to args.
-    PTHREAD_UNLOCK(args->main_thread_mutex);
+    kvz_sem_post(args->filled_input_slots);
 
     frame_in = NULL;
   }
 
 done:
   // Wait until main thread is ready to receive the next frame.
-  PTHREAD_LOCK(args->input_mutex);
+  kvz_sem_wait(args->available_input_slots);
   args->img_in = NULL;
   args->retval = retval;
   // Unlock main_thread_mutex to notify main thread that the new img_in
   // and retval have been placed to args.
-  PTHREAD_UNLOCK(args->main_thread_mutex);
+  kvz_sem_post(args->filled_input_slots);
 
   // Do some cleaning up.
   args->api->picture_free(frame_in);
 
   pthread_exit(NULL);
-  return 0;
+  return NULL;
+}
+
+
+void output_recon_pictures(const kvz_api *const api,
+                           FILE *recout,
+                           kvz_picture *buffer[KVZ_MAX_GOP_LENGTH],
+                           int *buffer_size,
+                           uint64_t *next_pts,
+                           unsigned width,
+                           unsigned height)
+{
+  bool picture_written;
+  do {
+    picture_written = false;
+    for (int i = 0; i < *buffer_size; i++) {
+
+      kvz_picture *pic = buffer[i];
+      if (pic->pts == *next_pts) {
+        // Output the picture and remove it.
+        if (!yuv_io_write(recout, pic, width, height)) {
+          fprintf(stderr, "Failed to write reconstructed picture!\n");
+        }
+        api->picture_free(pic);
+        picture_written = true;
+        (*next_pts)++;
+
+        // Move rest of the pictures one position backward.
+        for (i++; i < *buffer_size; i++) {
+          buffer[i - 1] = buffer[i];
+          buffer[i] = NULL;
+        }
+        (*buffer_size)--;
+      }
+    }
+  } while (picture_written);
 }
 
 
@@ -287,15 +322,37 @@
   clock_t start_time = clock();
   clock_t encoding_start_cpu_time;
   KVZ_CLOCK_T encoding_start_real_time;
-  
+
   clock_t encoding_end_cpu_time;
   KVZ_CLOCK_T encoding_end_real_time;
 
+  // PTS of the reconstructed picture that should be output next.
+  // Only used with --debug.
+  uint64_t next_recon_pts = 0;
+  // Buffer for storing reconstructed pictures that are not to be output
+  // yet (i.e. in wrong order because GOP is used).
+  // Only used with --debug.
+  kvz_picture *recon_buffer[KVZ_MAX_GOP_LENGTH] = { NULL };
+  int recon_buffer_size = 0;
+
+  // Semaphores for synchronizing the input reader thread and the main
+  // thread.
+  //
+  // available_input_slots tells whether the main thread is currently using
+  // input_handler_args.img_in. (0 = in use, 1 = not in use)
+  //
+  // filled_input_slots tells whether there is a new input picture (or NULL
+  // if the input has ended) in input_handler_args.img_in placed by the
+  // input reader thread. (0 = no new image, 1 = one new image)
+  //
+  kvz_sem_t *available_input_slots = NULL;
+  kvz_sem_t *filled_input_slots = NULL;
+
 #ifdef _WIN32
   // Stderr needs to be text mode to convert \n to \r\n in Windows.
   setmode( _fileno( stderr ), _O_TEXT );
 #endif
-      
+
   CHECKPOINTS_INIT();
 
   const kvz_api * const api = kvz_api_get(8);
@@ -379,17 +436,15 @@
 
     pthread_t input_thread;
 
-    pthread_mutex_t input_mutex = PTHREAD_MUTEX_INITIALIZER;
-    pthread_mutex_t main_thread_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-    // Lock both mutexes at startup
-    PTHREAD_LOCK(&main_thread_mutex);
-    PTHREAD_LOCK(&input_mutex);
+    available_input_slots = calloc(1, sizeof(kvz_sem_t));
+    filled_input_slots    = calloc(1, sizeof(kvz_sem_t));
+    kvz_sem_init(available_input_slots, 0);
+    kvz_sem_init(filled_input_slots,    0);
 
     // Give arguments via struct to the input thread
     input_handler_args in_args = {
-      .input_mutex = NULL,
-      .main_thread_mutex = NULL,
+      .available_input_slots = available_input_slots,
+      .filled_input_slots    = filled_input_slots,
 
       .input = input,
       .api = api,
@@ -401,8 +456,8 @@
       .img_in = NULL,
       .retval = RETVAL_RUNNING,
     };
-    in_args.input_mutex = &input_mutex;
-    in_args.main_thread_mutex = &main_thread_mutex;
+    in_args.available_input_slots = available_input_slots;
+    in_args.filled_input_slots    = filled_input_slots;
 
     if (pthread_create(&input_thread, NULL, input_read_thread, (void*)&in_args) != 0) {
       fprintf(stderr, "pthread_create failed!\n");
@@ -414,11 +469,12 @@
 
       // Skip mutex locking if the input thread does not exist.
       if (in_args.retval == RETVAL_RUNNING) {
-        // Unlock input_mutex so that the input thread can write the new
-        // img_in and retval to in_args.
-        PTHREAD_UNLOCK(&input_mutex);
-        // Wait until the input thread has updated in_args.
-        PTHREAD_LOCK(&main_thread_mutex);
+        // Increase available_input_slots so that the input thread can
+        // write the new img_in and retval to in_args.
+        kvz_sem_post(available_input_slots);
+        // Wait until the input thread has updated in_args and then
+        // decrease filled_input_slots.
+        kvz_sem_wait(filled_input_slots);
 
         cur_in_img = in_args.img_in;
         in_args.img_in = NULL;
@@ -484,12 +540,20 @@
         if (recout) {
           // Since chunks_out was not NULL, img_rec should have been set.
           assert(img_rec);
-          if (!yuv_io_write(recout,
-                            img_rec,
-                            opts->config->width,
-                            opts->config->height)) {
-            fprintf(stderr, "Failed to write reconstructed picture!\n");
-          }
+
+          // Move img_rec to the recon buffer.
+          assert(recon_buffer_size < KVZ_MAX_GOP_LENGTH);
+          recon_buffer[recon_buffer_size++] = img_rec;
+          img_rec = NULL;
+
+          // Try to output some reconstructed pictures.
+          output_recon_pictures(api,
+                                recout,
+                                recon_buffer,
+                                &recon_buffer_size,
+                                &next_recon_pts,
+                                opts->config->width,
+                                opts->config->height);
         }
 
         frames_done += 1;
@@ -497,7 +561,7 @@
         psnr_sum[1] += frame_psnr[1];
         psnr_sum[2] += frame_psnr[2];
 
-        print_frame_info(&info_out, frame_psnr, len_out);
+        print_frame_info(&info_out, frame_psnr, len_out, encoder->cfg.calc_psnr);
       }
 
       api->picture_free(cur_in_img);
@@ -510,12 +574,15 @@
     encoding_end_cpu_time = clock();
     // Coding finished
 
+    // All reconstructed pictures should have been output.
+    assert(recon_buffer_size == 0);
+
     // Print statistics of the coding
     fprintf(stderr, " Processed %d frames, %10llu bits",
             frames_done,
             (long long unsigned int)bitstream_length * 8);
-    if (frames_done > 0) {
-      fprintf(stderr, " AVG PSNR: %2.4f %2.4f %2.4f",
+    if (encoder->cfg.calc_psnr && frames_done > 0) {
+      fprintf(stderr, " AVG PSNR Y %2.4f U %2.4f V %2.4f",
               psnr_sum[0] / frames_done,
               psnr_sum[1] / frames_done,
               psnr_sum[2] / frames_done);
@@ -540,6 +607,12 @@
   retval = EXIT_FAILURE;
 
 done:
+  // destroy semaphores
+  if (available_input_slots) kvz_sem_destroy(available_input_slots);
+  if (filled_input_slots)    kvz_sem_destroy(filled_input_slots);
+  FREE_POINTER(available_input_slots);
+  FREE_POINTER(filled_input_slots);
+
   // deallocate structures
   if (enc) api->encoder_close(enc);
   if (opts) cmdline_opts_free(api, opts);

kvazaar-1.1.0.tar.gz/src/encode_coding_tree.c -> kvazaar-1.2.0.tar.gz/src/encode_coding_tree.c Changed

@@ -46,13 +46,11 @@
  * This method encodes the X and Y component within a block of the last
  * significant coefficient.
  */
-static void encode_last_significant_xy(encoder_state_t * const state,
+static void encode_last_significant_xy(cabac_data_t * const cabac,
                                        uint8_t lastpos_x, uint8_t lastpos_y,
                                        uint8_t width, uint8_t height,
                                        uint8_t type, uint8_t scan)
 {
-  cabac_data_t * const cabac = &state->cabac;
-
   const int index = kvz_math_floor_log2(width) - 2;
   uint8_t ctx_offset = type ? 0 : (index * 3 + (index + 1) / 4);
   uint8_t shift = type ? index : (index + 3) / 4;
@@ -103,14 +101,14 @@
 }
 
 void kvz_encode_coeff_nxn(encoder_state_t * const state,
-                          coeff_t *coeff,
+                          cabac_data_t * const cabac,
+                          const coeff_t *coeff,
                           uint8_t width,
                           uint8_t type,
                           int8_t scan_mode,
                           int8_t tr_skip)
 {
   const encoder_control_t * const encoder = state->encoder_control;
-  cabac_data_t * const cabac = &state->cabac;
   int c1 = 1;
   uint8_t last_coeff_x = 0;
   uint8_t last_coeff_y = 0;
@@ -183,8 +181,13 @@
   last_coeff_y = (uint8_t)(pos_last >> log2_block_size);
 
   // Code last_coeff_x and last_coeff_y
-  encode_last_significant_xy(state, last_coeff_x, last_coeff_y, width, width,
-                             type, scan_mode);
+  encode_last_significant_xy(cabac,
+                             last_coeff_x,
+                             last_coeff_y,
+                             width,
+                             width,
+                             type,
+                             scan_mode);
 
   scan_pos_sig  = scan_pos_last;
 
@@ -300,15 +303,15 @@
       }
       if (be_valid && sign_hidden) {
     	coeff_signs = coeff_signs >> 1;
-    	if(!state->cabac.only_count)
-    	  if (state->encoder_control->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) {
-    	    coeff_signs = coeff_signs ^ ff_get_key(&state->tile->dbs_g, num_non_zero-1);
+    	if (!cabac->only_count)
+    	  if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) {
+    	    coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero-1);
     	  }
         CABAC_BINS_EP(cabac, coeff_signs , (num_non_zero - 1), "coeff_sign_flag");
       } else {
-        if(!state->cabac.only_count)
-    	  if (state->encoder_control->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS)
-    	    coeff_signs = coeff_signs ^ ff_get_key(&state->tile->dbs_g, num_non_zero);
+        if (!cabac->only_count)
+    	  if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS)
+    	    coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero);
         CABAC_BINS_EP(cabac, coeff_signs, num_non_zero, "coeff_sign_flag");
       }
 
@@ -319,9 +322,9 @@
           int32_t base_level  = (idx < C1FLAG_NUMBER) ? (2 + first_coeff2) : 1;
 
           if (abs_coeff[idx] >= base_level) {
-        	if(!state->cabac.only_count) {
-        	  if (state->encoder_control->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFFS)
-                kvz_cabac_write_coeff_remain_encry(state, cabac, abs_coeff[idx] - base_level, go_rice_param, base_level);
+        	if (!cabac->only_count) {
+        	  if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFFS)
+                    kvz_cabac_write_coeff_remain_encry(state, cabac, abs_coeff[idx] - base_level, go_rice_param, base_level);
         	  else
         		kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param);
         	} else
@@ -342,7 +345,7 @@
 }
 
 static void encode_transform_unit(encoder_state_t * const state,
-                                  int x_pu, int y_pu, int depth)
+                                  int x, int y, int depth)
 {
   assert(depth >= 1 && depth <= MAX_PU_DEPTH);
 
@@ -350,79 +353,60 @@
   const uint8_t width = LCU_WIDTH >> depth;
   const uint8_t width_c = (depth == MAX_PU_DEPTH ? width : width / 2);
 
-  const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, x_pu << 2, y_pu << 2);
-
-  const int x_cu = x_pu / 2;
-  const int y_cu = y_pu / 2;
-  const cu_info_t *cur_cu = kvz_videoframe_get_cu_const(frame, x_cu, y_cu);
-
-  coeff_t coeff_y[LCU_WIDTH*LCU_WIDTH+1];
-  coeff_t coeff_u[LCU_WIDTH*LCU_WIDTH>>2];
-  coeff_t coeff_v[LCU_WIDTH*LCU_WIDTH>>2];
-  int32_t coeff_stride = frame->width;
+  const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, x, y);
 
   int8_t scan_idx = kvz_get_scan_order(cur_pu->type, cur_pu->intra.mode, depth);
 
   int cbf_y = cbf_is_set(cur_pu->cbf, depth, COLOR_Y);
 
   if (cbf_y) {
-    int x = x_pu * (LCU_WIDTH >> MAX_PU_DEPTH);
-    int y = y_pu * (LCU_WIDTH >> MAX_PU_DEPTH);
-    coeff_t *orig_pos = &frame->coeff_y[x + y * frame->width];
-    for (y = 0; y < width; y++) {
-      for (x = 0; x < width; x++) {
-        coeff_y[x+y*width] = orig_pos[x];
-      }
-      orig_pos += coeff_stride;
-    }
-  }
-
-  // CoeffNxN
-  // Residual Coding
-  if (cbf_y) {
-    kvz_encode_coeff_nxn(state, coeff_y, width, 0, scan_idx, cur_pu->intra.tr_skip);
+    int x_local = x % LCU_WIDTH;
+    int y_local = y % LCU_WIDTH;
+    const coeff_t *coeff_y = &state->coeff->y[xy_to_zorder(LCU_WIDTH, x_local, y_local)];
+
+    // CoeffNxN
+    // Residual Coding
+    kvz_encode_coeff_nxn(state,
+                         &state->cabac,
+                         coeff_y,
+                         width,
+                         0,
+                         scan_idx,
+                         cur_pu->intra.tr_skip);
   }
 
-  if (depth == MAX_DEPTH + 1 && !(x_pu % 2 && y_pu % 2)) {
+  if (depth == MAX_DEPTH + 1) {
     // For size 4x4 luma transform the corresponding chroma transforms are
-    // also of size 4x4 covering 8x8 luma pixels. The residual is coded
-    // in the last transform unit so for the other ones, don't do anything.
-    return;
+    // also of size 4x4 covering 8x8 luma pixels. The residual is coded in
+    // the last transform unit.
+    if (x % 8 == 0 || y % 8 == 0) {
+      // Not the last luma transform block so there is nothing more to do.
+      return;
+    } else {
+      // Time to code the chroma transform blocks. Move to the top-left
+      // corner of the block.
+      x -= 4;
+      y -= 4;
+      cur_pu = kvz_cu_array_at_const(frame->cu_array, x, y);
+    }
   }
 
-  bool chroma_cbf_set = cbf_is_set(cur_cu->cbf, depth, COLOR_U) ||
-                        cbf_is_set(cur_cu->cbf, depth, COLOR_V);
+  bool chroma_cbf_set = cbf_is_set(cur_pu->cbf, depth, COLOR_U) ||
+                        cbf_is_set(cur_pu->cbf, depth, COLOR_V);
   if (chroma_cbf_set) {
-    int x, y;
-    coeff_t *orig_pos_u, *orig_pos_v;
-
-    if (depth <= MAX_DEPTH) {
-      x = x_pu * (LCU_WIDTH >> (MAX_PU_DEPTH + 1));
-      y = y_pu * (LCU_WIDTH >> (MAX_PU_DEPTH + 1));
-    } else {
-      // for 4x4 select top left pixel of the CU.
-      x = x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1));
-      y = y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1));
-    }
-    orig_pos_u = &frame->coeff_u[x + y * (frame->width >> 1)];
-    orig_pos_v = &frame->coeff_v[x + y * (frame->width >> 1)];
-    for (y = 0; y < (width_c); y++) {
-      for (x = 0; x < (width_c); x++) {
-        coeff_u[x+y*(width_c)] = orig_pos_u[x];
-        coeff_v[x+y*(width_c)] = orig_pos_v[x];
-      }
-      orig_pos_u += coeff_stride>>1;
-      orig_pos_v += coeff_stride>>1;
-    }
+    int x_local = (x >> 1) % LCU_WIDTH_C;
+    int y_local = (y >> 1) % LCU_WIDTH_C;
+    scan_idx = kvz_get_scan_order(cur_pu->type, cur_pu->intra.mode_chroma, depth);
 
-    scan_idx = kvz_get_scan_order(cur_cu->type, cur_cu->intra.mode_chroma, depth);
+    const coeff_t *coeff_u = &state->coeff->u[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)];
+    const coeff_t *coeff_v = &state->coeff->v[xy_to_zorder(LCU_WIDTH_C, x_local, y_local)];
 
-    if (cbf_is_set(cur_cu->cbf, depth, COLOR_U)) {
-      kvz_encode_coeff_nxn(state, coeff_u, width_c, 2, scan_idx, 0);
+    if (cbf_is_set(cur_pu->cbf, depth, COLOR_U)) {
+      kvz_encode_coeff_nxn(state, &state->cabac, coeff_u, width_c, 2, scan_idx, 0);
     }
 
-    if (cbf_is_set(cur_cu->cbf, depth, COLOR_V)) {
-      kvz_encode_coeff_nxn(state, coeff_v, width_c, 2, scan_idx, 0);
+    if (cbf_is_set(cur_pu->cbf, depth, COLOR_V)) {
+      kvz_encode_coeff_nxn(state, &state->cabac, coeff_v, width_c, 2, scan_idx, 0);
     }
   }
 }
@@ -437,21 +421,21 @@
  * \param parent_coeff_v  What was signaled at previous level for cbf_cr.
  */
 static void encode_transform_coeff(encoder_state_t * const state,
-                                   int32_t x_pu,
-                                   int32_t y_pu,
+                                   int32_t x,
+                                   int32_t y,
                                    int8_t depth,
                                    int8_t tr_depth,
                                    uint8_t parent_coeff_u,
                                    uint8_t parent_coeff_v)
 {
   cabac_data_t * const cabac = &state->cabac;
+  const encoder_control_t *const ctrl = state->encoder_control;
   const videoframe_t * const frame = state->tile->frame;
 
-  const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, x_pu << 2, y_pu << 2);
-
-  const int32_t x_cu = x_pu / 2;
-  const int32_t y_cu = y_pu / 2;
-  const cu_info_t *cur_cu = kvz_videoframe_get_cu_const(frame, x_cu, y_cu);
+  const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, x, y);
+  // Round coordinates down to a multiple of 8 to get the location of the
+  // containing CU.
+  const cu_info_t *cur_cu = kvz_cu_array_at_const(frame->cu_array, x & ~7, y & ~7);
 
   // NxN signifies implicit transform split at the first transform level.
   // There is a similar implicit split for inter, but it is only used when
@@ -459,8 +443,12 @@
   int intra_split_flag = (cur_cu->type == CU_INTRA && cur_cu->part_size == SIZE_NxN);
 
   // The implicit split by intra NxN is not counted towards max_tr_depth.
-  int tr_depth_intra = state->encoder_control->cfg.tr_depth_intra;
-  int max_tr_depth = (cur_cu->type == CU_INTRA ? tr_depth_intra + intra_split_flag : TR_DEPTH_INTER);
+  int max_tr_depth;
+  if (cur_cu->type == CU_INTRA) {
+    max_tr_depth = ctrl->cfg.tr_depth_intra + intra_split_flag;
+  } else {
+    max_tr_depth = ctrl->tr_depth_inter;
+  }
 
   int8_t split = (cur_cu->tr_depth > depth);
 
@@ -498,11 +486,13 @@
   }
 
   if (split) {
-    uint8_t pu_offset = 1 << (MAX_PU_DEPTH - (depth + 1));
-    encode_transform_coeff(state, x_pu, y_pu, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
-    encode_transform_coeff(state, x_pu + pu_offset, y_pu,  depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
-    encode_transform_coeff(state, x_pu, y_pu + pu_offset,  depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
-    encode_transform_coeff(state, x_pu + pu_offset, y_pu + pu_offset,  depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
+    uint8_t offset = LCU_WIDTH >> (depth + 1);
+    int x2 = x + offset;
+    int y2 = y + offset;
+    encode_transform_coeff(state, x,  y,  depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
+    encode_transform_coeff(state, x2, y,  depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
+    encode_transform_coeff(state, x,  y2, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
+    encode_transform_coeff(state, x2, y2, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v);
     return;
   }
 
@@ -511,7 +501,7 @@
   // - transform depth > 0
   // - we have chroma coefficients at this level
   // When it is not present, it is inferred to be 1.
-  if(cur_cu->type == CU_INTRA || tr_depth > 0 || cb_flag_u || cb_flag_v) {
+  if (cur_cu->type == CU_INTRA || tr_depth > 0 || cb_flag_u || cb_flag_v) {
       cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_luma[!tr_depth]);
       CABAC_BIN(cabac, cb_flag_y, "cbf_luma");
   }
@@ -539,7 +529,7 @@
       state->ref_qp = state->qp;
     }
 
-    encode_transform_unit(state, x_pu, y_pu, depth);
+    encode_transform_unit(state, x, y, depth);
   }
 }
 
@@ -570,15 +560,6 @@
     }
   } else {
     uint32_t ref_list_idx;
-    uint32_t j;
-    int ref_list[2] = { 0, 0 };
-    for (j = 0; j < state->frame->ref->used_size; j++) {
-      if (state->frame->ref->pocs[j] < state->frame->poc) {
-        ref_list[0]++;
-      } else {
-        ref_list[1]++;
-      }
-    }
 
     // Void TEncSbac::codeInterDir( TComDataCU* pcCU, UInt uiAbsPartIdx )
     if (state->frame->slicetype == KVZ_SLICE_B)
@@ -602,16 +583,20 @@
 
     for (ref_list_idx = 0; ref_list_idx < 2; ref_list_idx++) {
       if (cur_cu->inter.mv_dir & (1 << ref_list_idx)) {
-        if (ref_list[ref_list_idx] > 1) {
+
+        // size of the current reference index list (L0/L1)
+        uint8_t ref_LX_size = state->frame->ref_LX_size[ref_list_idx];
+
+        if (ref_LX_size > 1) {
           // parseRefFrmIdx
-          int32_t ref_frame = state->frame->refmap[cur_cu->inter.mv_ref[ref_list_idx]].idx;
+          int32_t ref_frame = cur_cu->inter.mv_ref[ref_list_idx];
 
           cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]);
           CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX");
 
           if (ref_frame > 0) {
             int32_t i;
-            int32_t ref_num = ref_list[ref_list_idx] - 2;
+            int32_t ref_num = ref_LX_size - 2;
 
             cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]);
             ref_frame--;
@@ -668,7 +653,7 @@
             uint32_t mvd_hor_sign = (mvd_hor>0)?0:1;
             if(!state->cabac.only_count)
               if (state->encoder_control->cfg.crypto_features & KVZ_CRYPTO_MV_SIGNS)
-                mvd_hor_sign = mvd_hor_sign^ff_get_key(&state->tile->dbs_g, 1);
+                mvd_hor_sign = mvd_hor_sign^kvz_crypto_get_key(state->crypto_hdl, 1);
             CABAC_BIN_EP(cabac, mvd_hor_sign, "mvd_sign_flag_hor");
           }
           if (ver_abs_gr0) {
@@ -678,7 +663,7 @@
             uint32_t mvd_ver_sign = (mvd_ver>0)?0:1;
             if(!state->cabac.only_count)
               if (state->encoder_control->cfg.crypto_features & KVZ_CRYPTO_MV_SIGNS)
-                mvd_ver_sign = mvd_ver_sign^ff_get_key(&state->tile->dbs_g, 1);
+                mvd_ver_sign = mvd_ver_sign^kvz_crypto_get_key(state->crypto_hdl, 1);
             CABAC_BIN_EP(cabac, mvd_ver_sign, "mvd_sign_flag_ver");
           }
         }
@@ -694,13 +679,72 @@
   } // if !merge
 }
 
+
+static INLINE uint8_t intra_mode_encryption(encoder_state_t * const state,
+                                            uint8_t intra_pred_mode)
+{
+  const uint8_t sets[3][17] =
+  {
+    {  0,  1,  2,  3,  4,  5, 15, 16, 17, 18, 19, 20, 21, 31, 32, 33, 34},  /* 17 */
+    { 22, 23, 24, 25, 27, 28, 29, 30, -1, -1, -1, -1, -1, -1, -1, -1, -1},  /* 9  */
+    {  6,  7,  8,  9, 11, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1}   /* 9  */
+  };
+
+  const uint8_t nb_elems[3] = {17, 8, 8};
+
+  if (intra_pred_mode == 26 || intra_pred_mode == 10) {
+    // correct chroma intra prediction mode
+    return intra_pred_mode;
+
+  } else {
+    uint8_t keybits, scan_dir, elem_idx=0;
+
+    keybits = kvz_crypto_get_key(state->crypto_hdl, 5);
+
+    scan_dir = SCAN_DIAG;
+    if (intra_pred_mode > 5  && intra_pred_mode < 15) {
+      scan_dir = SCAN_VER;
+    }
+    if (intra_pred_mode > 21 && intra_pred_mode < 31) {
+      scan_dir = SCAN_HOR;
+    }
+
+    for (int i = 0; i < nb_elems[scan_dir]; i++) {
+      if (intra_pred_mode == sets[scan_dir][i]) {
+        elem_idx = i;
+        break;
+      }
+    }
+
+    keybits = keybits % nb_elems[scan_dir];
+    keybits = (elem_idx + keybits) % nb_elems[scan_dir];
+
+    return sets[scan_dir][keybits];
+  }
+}
+
+
 static void encode_intra_coding_unit(encoder_state_t * const state,
                                      cabac_data_t * const cabac,
                                      const cu_info_t * const cur_cu,
-                                     int x_ctb, int y_ctb, int depth)
+                                     int x, int y, int depth)
 {
   const videoframe_t * const frame = state->tile->frame;
-  uint8_t intra_pred_mode[4];
+  uint8_t intra_pred_mode_actual[4];
+  uint8_t *intra_pred_mode = intra_pred_mode_actual;
+
+#if KVZ_SEL_ENCRYPTION
+  const bool do_crypto =
+    !state->cabac.only_count &&
+    state->encoder_control->cfg.crypto_features & KVZ_CRYPTO_INTRA_MODE;
+#else
+  const bool do_crypto = false;
+#endif
+
+  uint8_t intra_pred_mode_encry[4] = {-1, -1, -1, -1};
+  if (do_crypto) {
+    intra_pred_mode = intra_pred_mode_encry;
+  }
 
   uint8_t intra_pred_mode_chroma = cur_cu->intra.mode_chroma;
   int8_t intra_preds[4][3] = {{-1, -1, -1},{-1, -1, -1},{-1, -1, -1},{-1, -1, -1}};
@@ -720,8 +764,8 @@
   const int cu_width = LCU_WIDTH >> depth;
 
   for (int j = 0; j < num_pred_units; ++j) {
-    const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x_ctb << 3, j);
-    const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y_ctb << 3, j);
+    const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, j);
+    const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, j);
     const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y);
 
     const cu_info_t *left_pu = NULL;
@@ -737,12 +781,26 @@
       above_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y - 1);
     }
 
-    kvz_intra_get_dir_luma_predictor(pu_x, pu_y,
-                                     intra_preds[j],
-                                     cur_pu,
-                                     left_pu, above_pu);
+    if (do_crypto) {
+#if KVZ_SEL_ENCRYPTION
+      // Need to wrap in preprocessor directives because this function is
+      // only defined when KVZ_SEL_ENCRYPTION is defined.
+      kvz_intra_get_dir_luma_predictor_encry(pu_x, pu_y,
+                                             intra_preds[j],
+                                             cur_pu,
+                                             left_pu, above_pu);
+#endif
+    } else {
+      kvz_intra_get_dir_luma_predictor(pu_x, pu_y,
+                                       intra_preds[j],
+                                       cur_pu,
+                                       left_pu, above_pu);
+    }
 
-    intra_pred_mode[j] = cur_pu->intra.mode;
+    intra_pred_mode_actual[j] = cur_pu->intra.mode;
+    if (do_crypto) {
+      intra_pred_mode_encry[j] = intra_mode_encryption(state, cur_pu->intra.mode);
+    }
 
     for (int i = 0; i < 3; i++) {
       if (intra_preds[j][i] == intra_pred_mode[j]) {
@@ -751,6 +809,26 @@
       }
     }
     flag[j] = (mpm_preds[j] == -1) ? 0 : 1;
+
+#if KVZ_SEL_ENCRYPTION
+    // Need to wrap in preprocessor directives because
+    // cu_info_t.intra.mode_encry is only defined when KVZ_SEL_ENCRYPTION
+    // is defined.
+    if (do_crypto) {
+      // Set the modified intra_pred_mode of the current pu here to make it
+      // available from its neighbours for mpm decision.
+
+      // FIXME: there might be a more efficient way to propagate mode_encry
+      // for future use from left and above PUs
+      const int pu_width = PU_GET_W(cur_cu->part_size, cu_width, j);
+      for (int y = pu_y; y < pu_y + pu_width; y += 4 ) {
+        for (int x = pu_x; x < pu_x + pu_width; x += 4) {
+          cu_info_t *cu = kvz_cu_array_at(frame->cu_array, x, y);
+          cu->intra.mode_encry = intra_pred_mode_encry[j];
+        }
+      }
+    }
+#endif
   }
 
   cabac->cur_ctx = &(cabac->ctx.intra_mode_model);
@@ -790,14 +868,14 @@
     unsigned pred_mode = 5;
     unsigned chroma_pred_modes[4] = {0, 26, 10, 1};
 
-    if (intra_pred_mode_chroma == intra_pred_mode[0]) {
+    if (intra_pred_mode_chroma == intra_pred_mode_actual[0]) {
       pred_mode = 4;
     } else if (intra_pred_mode_chroma == 34) {
       // Angular 34 mode is possible only if intra pred mode is one of the
       // possible chroma pred modes, in which case it is signaled with that
       // duplicate mode.
       for (int i = 0; i < 4; ++i) {
-        if (intra_pred_mode[0] == chroma_pred_modes[i]) pred_mode = i;
+        if (intra_pred_mode_actual[0] == chroma_pred_modes[i]) pred_mode = i;
       }
     } else {
       for (int i = 0; i < 4; ++i) {
@@ -829,7 +907,7 @@
     }
   }
 
-  encode_transform_coeff(state, x_ctb * 2, y_ctb * 2, depth, 0, 0, 0);
+  encode_transform_coeff(state, x, y, depth, 0, 0, 0);
 }
 
 static void encode_part_mode(encoder_state_t * const state,
@@ -916,37 +994,48 @@
 }
 
 void kvz_encode_coding_tree(encoder_state_t * const state,
-                            uint16_t x_ctb,
-                            uint16_t y_ctb,
+                            uint16_t x,
+                            uint16_t y,
                             uint8_t depth)
 {
   cabac_data_t * const cabac = &state->cabac;
+  const encoder_control_t * const ctrl = state->encoder_control;
   const videoframe_t * const frame = state->tile->frame;
-  const cu_info_t *cur_cu = kvz_videoframe_get_cu_const(frame, x_ctb, y_ctb);
+  const cu_info_t *cur_cu   = kvz_cu_array_at_const(frame->cu_array, x, y);
+
+  const cu_info_t *left_cu  = NULL;
+  if (x > 0) {
+    left_cu = kvz_cu_array_at_const(frame->cu_array, x - 1, y);
+  }
+  const cu_info_t *above_cu = NULL;
+  if (y > 0) {
+    above_cu = kvz_cu_array_at_const(frame->cu_array, x, y - 1);
+  }
+
   uint8_t split_flag = GET_SPLITDATA(cur_cu, depth);
   uint8_t split_model = 0;
 
-  //Absolute ctb
-  uint16_t abs_x_ctb = x_ctb + (state->tile->lcu_offset_x * LCU_WIDTH) / (LCU_WIDTH >> MAX_DEPTH);
-  uint16_t abs_y_ctb = y_ctb + (state->tile->lcu_offset_y * LCU_WIDTH) / (LCU_WIDTH >> MAX_DEPTH);
+  // Absolute coordinates
+  uint16_t abs_x = x + state->tile->offset_x;
+  uint16_t abs_y = y + state->tile->offset_y;
 
   // Check for slice border FIXME
-  uint8_t border_x = ((state->encoder_control->in.width) < (abs_x_ctb * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> depth))) ? 1 : 0;
-  uint8_t border_y = ((state->encoder_control->in.height) < (abs_y_ctb * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> depth))) ? 1 : 0;
-  uint8_t border_split_x = ((state->encoder_control->in.width)  < ((abs_x_ctb + 1) * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1)))) ? 0 : 1;
-  uint8_t border_split_y = ((state->encoder_control->in.height) < ((abs_y_ctb + 1) * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1)))) ? 0 : 1;
-  uint8_t border = border_x | border_y; /*!< are we in any border CU */
+  bool border_x = ctrl->in.width  < abs_x + (LCU_WIDTH >> depth);
+  bool border_y = ctrl->in.height < abs_y + (LCU_WIDTH >> depth);
+  bool border_split_x = ctrl->in.width  >= abs_x + (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1));
+  bool border_split_y = ctrl->in.height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1));
+  bool border = border_x || border_y; /*!< are we in any border CU */
 
   // When not in MAX_DEPTH, insert split flag and split the blocks if needed
   if (depth != MAX_DEPTH) {
     // Implisit split flag when on border
     if (!border) {
       // Get left and top block split_flags and if they are present and true, increase model number
-      if (x_ctb > 0 && GET_SPLITDATA(kvz_videoframe_get_cu_const(frame, x_ctb - 1, y_ctb), depth) == 1) {
+      if (left_cu && GET_SPLITDATA(left_cu, depth) == 1) {
         split_model++;
       }
 
-      if (y_ctb > 0 && GET_SPLITDATA(kvz_videoframe_get_cu_const(frame, x_ctb, y_ctb - 1), depth) == 1) {
+      if (above_cu && GET_SPLITDATA(above_cu, depth) == 1) {
         split_model++;
       }
 
@@ -956,18 +1045,19 @@
 
     if (split_flag || border) {
       // Split blocks and remember to change x and y block positions
-      uint8_t change = 1<<(MAX_DEPTH-1-depth);
-      kvz_encode_coding_tree(state, x_ctb, y_ctb, depth + 1); // x,y
+      int offset = LCU_WIDTH >> (depth + 1);
+
+      kvz_encode_coding_tree(state, x, y, depth + 1);
 
       // TODO: fix when other half of the block would not be completely over the border
       if (!border_x || border_split_x) {
-        kvz_encode_coding_tree(state, x_ctb + change, y_ctb, depth + 1);
+        kvz_encode_coding_tree(state, x + offset, y, depth + 1);
       }
       if (!border_y || border_split_y) {
-        kvz_encode_coding_tree(state, x_ctb, y_ctb + change, depth + 1);
+        kvz_encode_coding_tree(state, x, y + offset, depth + 1);
       }
       if (!border || (border_split_x && border_split_y)) {
-        kvz_encode_coding_tree(state, x_ctb + change, y_ctb + change, depth + 1);
+        kvz_encode_coding_tree(state, x + offset, y + offset, depth + 1);
       }
       return;
     }
@@ -978,27 +1068,25 @@
     CABAC_BIN(cabac, 1, "cu_transquant_bypass_flag");
   }
 
-    // Encode skip flag
+  // Encode skip flag
   if (state->frame->slicetype != KVZ_SLICE_I) {
-    int8_t ctx_skip = 0; // uiCtxSkip = aboveskipped + leftskipped;
-    int ui;
-    int16_t num_cand = MRG_MAX_NUM_CANDS;
-    // Get left and top skipped flags and if they are present and true, increase context number
-    if (x_ctb > 0 && (kvz_videoframe_get_cu_const(frame, x_ctb - 1, y_ctb))->skipped) {
+    // uiCtxSkip = aboveskipped + leftskipped;
+    int8_t ctx_skip = 0;
+
+    if (left_cu && left_cu->skipped) {
       ctx_skip++;
     }
-
-    if (y_ctb > 0 && (kvz_videoframe_get_cu_const(frame, x_ctb, y_ctb - 1))->skipped) {
+    if (above_cu && above_cu->skipped) {
       ctx_skip++;
     }
 
     cabac->cur_ctx = &(cabac->ctx.cu_skip_flag_model[ctx_skip]);
     CABAC_BIN(cabac, cur_cu->skipped, "SkipFlag");
 
-    // IF SKIP
     if (cur_cu->skipped) {
+      int16_t num_cand = MRG_MAX_NUM_CANDS;
       if (num_cand > 1) {
-        for (ui = 0; ui < num_cand - 1; ui++) {
+        for (int ui = 0; ui < num_cand - 1; ui++) {
           int32_t symbol = (ui != cur_cu->merge_idx);
           if (ui == 0) {
             cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model);
@@ -1015,8 +1103,6 @@
     }
   }
 
-  // ENDIF SKIP
-
   // Prediction mode
   if (state->frame->slicetype != KVZ_SLICE_I) {
     cabac->cur_ctx = &(cabac->ctx.cu_pred_mode_model);
@@ -1031,8 +1117,8 @@
     const int cu_width = LCU_WIDTH >> depth;
 
     for (int i = 0; i < num_pu; ++i) {
-      const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x_ctb << 3, i);
-      const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y_ctb << 3, i);
+      const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i);
+      const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i);
       const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i);
       const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i);
       const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y);
@@ -1051,57 +1137,52 @@
       // Code (possible) coeffs to bitstream
 
       if (cbf) {
-        encode_transform_coeff(state, x_ctb * 2, y_ctb * 2, depth, 0, 0, 0);
+        encode_transform_coeff(state, x, y, depth, 0, 0, 0);
       }
     }
   } else if (cur_cu->type == CU_INTRA) {
-    encode_intra_coding_unit(state, cabac, cur_cu, x_ctb, y_ctb, depth);
+    encode_intra_coding_unit(state, cabac, cur_cu, x, y, depth);
   }
 
-    #if ENABLE_PCM == 1
+#if ENABLE_PCM
   // Code IPCM block
-  if (cur_cu->type == CU_PCM) {
+  else if (cur_cu->type == CU_PCM) {
     kvz_cabac_encode_bin_trm(cabac, 1); // IPCMFlag == 1
-      kvz_cabac_finish(cabac);
-      kvz_bitstream_add_rbsp_trailing_bits(cabac.stream);
-    // PCM sample
-      {
-      unsigned y, x;
-
-      pixel *base_y = &cur_pic->y_data[x_ctb * (LCU_WIDTH >> (MAX_DEPTH))    + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH))) * encoder->in.width];
-      pixel *base_u = &cur_pic->u_data[(x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * encoder->in.width / 2)];
-      pixel *base_v = &cur_pic->v_data[(x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * encoder->in.width / 2)];
+    kvz_cabac_finish(cabac);
+    kvz_bitstream_add_rbsp_trailing_bits(cabac.stream);
 
-      // Luma
-      for (y = 0; y < LCU_WIDTH >> depth; y++) {
-        for (x = 0; x < LCU_WIDTH >> depth; x++) {
-          kvz_bitstream_put(cabac.stream, base_y[x + y * encoder->in.width], 8);
-          }
-        }
+    // PCM sample
+    pixel *base_y = &cur_pic->y_data[x     + y * encoder->in.width];
+    pixel *base_u = &cur_pic->u_data[x / 2 + y / 2 * encoder->in.width / 2];
+    pixel *base_v = &cur_pic->v_data[x / 2 + y / 2 * encoder->in.width / 2];
+
+    // Luma
+    for (unsigned y_px = 0; y_px < LCU_WIDTH >> depth; y_px++) {
+      for (unsigned  x_px = 0; x_px < LCU_WIDTH >> depth; x_px++) {
+        kvz_bitstream_put(cabac.stream, base_y[x_px + y_px * encoder->in.width], 8);
+      }
+    }
 
-      // Chroma
-      if (encoder->in.video_format != FORMAT_400) {
-        for (y = 0; y < LCU_WIDTH >> (depth + 1); y++) {
-          for (x = 0; x < LCU_WIDTH >> (depth + 1); x++) {
-            kvz_bitstream_put(cabac.stream, base_u[x + y * (encoder->in.width >> 1)], 8);
-          }
+    // Chroma
+    if (encoder->in.video_format != FORMAT_400) {
+      for (unsigned y_px = 0; y_px < LCU_WIDTH >> (depth + 1); y_px++) {
+        for (unsigned x_px = 0; x_px < LCU_WIDTH >> (depth + 1); x_px++) {
+          kvz_bitstream_put(cabac.stream, base_u[x_px + y_px * (encoder->in.width >> 1)], 8);
         }
-        for (y = 0; y < LCU_WIDTH >> (depth + 1); y++) {
-          for (x = 0; x < LCU_WIDTH >> (depth + 1); x++) {
-            kvz_bitstream_put(cabac.stream, base_v[x + y * (encoder->in.width >> 1)], 8);
-          }
+      }
+      for (unsigned y_px = 0; y_px < LCU_WIDTH >> (depth + 1); y_px++) {
+        for (unsigned x_px = 0; x_px < LCU_WIDTH >> (depth + 1); x_px++) {
+          kvz_bitstream_put(cabac.stream, base_v[x_px + y_px * (encoder->in.width >> 1)], 8);
         }
       }
     }
-    // end PCM sample
-      kvz_cabac_start(cabac);
-  } // end Code IPCM block
-#endif /* END ENABLE_PCM */
-  else { /* Should not happend */
+    kvz_cabac_start(cabac);
+  }
+#endif
+
+  else {
+    // CU type not set. Should not happen.
     assert(0);
     exit(1);
   }
-
-   /* end prediction unit */
-  /* end coding_unit */
 }

kvazaar-1.1.0.tar.gz/src/encode_coding_tree.h -> kvazaar-1.2.0.tar.gz/src/encode_coding_tree.h Changed

kvazaar-1.1.0.tar.gz/src/encoder.c -> kvazaar-1.2.0.tar.gz/src/encoder.c Changed

@@ -20,6 +20,9 @@
 
 #include "encoder.h"
 
+// This define is required for M_PI on Windows.
+#define _USE_MATH_DEFINES
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 
@@ -27,90 +30,170 @@
 #include "strategyselector.h"
 
 
+/**
+ * \brief Strength of QP adjustments when using adaptive QP for 360 video.
+ *
+ * Determined empirically.
+ */
+static const double ERP_AQP_STRENGTH = 3.0;
+
+
 static int encoder_control_init_gop_layer_weights(encoder_control_t * const);
 
-static int size_of_wpp_ends(int threads)
+static unsigned cfg_num_threads(void)
 {
-  // Based on the shape of the area where all threads can't yet run in parallel.
-  return 4 * threads * threads - 2 * threads;
+  if (kvz_g_hardware_flags.logical_cpu_count == 0) {
+    // Default to 4 if we don't know the number of CPUs.
+    return 4;
+  }
+
+  return kvz_g_hardware_flags.logical_cpu_count;
 }
 
-static int select_owf_auto(const kvz_config *const cfg)
+
+static int get_max_parallelism(const encoder_control_t *const encoder)
 {
-  if (cfg->intra_period == 1) {
-    if (cfg->wpp) {
-      // If wpp is on, select owf such that less than 15% of the
-      // frame is covered by the are threads can not work at the same time.
-      const int lcu_width = CEILDIV(cfg->width, LCU_WIDTH);
-      const int lcu_height = CEILDIV(cfg->height, LCU_WIDTH);
-
-      // Find the largest number of threads per frame that satifies the
-      // the condition: wpp start/stop inefficiency takes up  less than 15%
-      // of frame area.
-      int threads_per_frame = 1;
-      const int wpp_treshold = lcu_width * lcu_height * 15 / 100;
-      while ((threads_per_frame + 1) * 2 < lcu_width &&
-        threads_per_frame + 1 < lcu_height &&
-        size_of_wpp_ends(threads_per_frame + 1) < wpp_treshold) {
-        ++threads_per_frame;
-      }
+  const int width_lcu  = CEILDIV(encoder->cfg.width, LCU_WIDTH);
+  const int height_lcu = CEILDIV(encoder->cfg.height, LCU_WIDTH);
+  const int wpp_limit  = MIN(height_lcu, CEILDIV(width_lcu, 2));
+  const int par_frames = encoder->cfg.owf + 1;
 
-      const int threads = MAX(cfg->threads, 1);
-      const int frames = CEILDIV(threads, threads_per_frame);
+  int parallelism = 0;
 
-      // Convert from number of parallel frames to number of additional frames.
-      return CLIP(0, threads - 1, frames - 1);
+  if (encoder->cfg.intra_period == 1) {
+    int threads_per_frame;
+    if (encoder->cfg.wpp) {
+      // Usually limited by width because starting to code a CTU requires
+      // that the next two CTUs in the row above have been completed.
+      threads_per_frame = wpp_limit;
     } else {
-      // If wpp is not on, select owf such that there is enough
-      // tiles for twice the number of threads.
-
-      int tiles_per_frame = cfg->tiles_width_count * cfg->tiles_height_count;
-      int threads = (cfg->threads > 1 ? cfg->threads : 1);
-      int frames = CEILDIV(threads * 4, tiles_per_frame);
-
-      // Limit number of frames to 1.25x the number of threads for the case
-      // where there is only 1 tile per frame.
-      frames = CLIP(1, threads * 4 / 3, frames);
-      return frames - 1;
+      // One thread for each tile.
+      threads_per_frame = encoder->cfg.tiles_width_count *
+                          encoder->cfg.tiles_height_count;
     }
+    // Divide by two since all frames cannot achieve the maximum
+    // parallelism all the time.
+    parallelism = par_frames * threads_per_frame / 2;
+
   } else {
-    // Try and estimate a good number of parallel frames for inter.
-    const int lcu_width = CEILDIV(cfg->width, LCU_WIDTH);
-    const int lcu_height = CEILDIV(cfg->height, LCU_WIDTH);
-    int threads_per_frame = MIN(lcu_width / 2, lcu_height);
-    int threads = cfg->threads;
-
-    // If all threads fit into one frame, at least two parallel frames should
-    // be used to reduce the effect of WPP spin-up and wind-down.
-    int frames = 1;
-
-    while (threads > 0 && threads_per_frame > 0) {
-      frames += 1;
-      threads -= threads_per_frame;
-      threads_per_frame -= 2;
-    }
+    if (encoder->cfg.wpp) {
+      const int last_diagonal = (width_lcu - 1) + (height_lcu - 1) * 2;
+
+      // Index of a diagonal. The diagonal contains CTUs whose coordinates
+      // satisfy x + 2*y == diagonal. We start the sum from the longest
+      // diagonal.
+      int diagonal = CEILDIV(last_diagonal, 2);
+
+      // Difference between diagonal indices in consecutive frames.
+      const int frame_delay = 1 + encoder->max_inter_ref_lcu.right +
+                              2 * encoder->max_inter_ref_lcu.down;
+      int step = frame_delay;
+      int direction = -1;
+
+      // Compute number of threads for each parallel frame.
+      for (int num_frames = 0; num_frames < par_frames; num_frames++) {
+        if (diagonal < 0 || diagonal > last_diagonal) {
+          // No room for more threads.
+          break;
+        }
 
-    if (cfg->gop_len && cfg->gop_lowdelay && cfg->gop_lp_definition.t > 1) {
-      // Temporal skipping makes every other frame very fast to encode so
-      // more parallel frames should be used.
-      frames *= 2;
+        // Count number of CTUs on the diagonal.
+        if (diagonal < MIN(2 * height_lcu, width_lcu)) {
+          parallelism += 1 + diagonal / 2;
+        } else {
+          parallelism += MIN(
+            wpp_limit,
+            height_lcu + CEILDIV(width_lcu, 2) - 1 - CEILDIV(diagonal, 2)
+          );
+        }
+        diagonal += direction * step;
+        step += frame_delay;
+        direction = -direction;
+      }
+
+    } else {
+      parallelism = encoder->cfg.tiles_width_count *
+                    encoder->cfg.tiles_height_count;
     }
-    return CLIP(0, cfg->threads * 2 - 1, frames - 1);
   }
+
+  return parallelism;
 }
 
 
-static unsigned cfg_num_threads(void)
+/**
+ * \brief Return weight for 360 degree ERP video
+ *
+ * Returns the scaling factor of area from equirectangular projection to
+ * spherical surface.
+ *
+ * \param y   y-coordinate of the pixel
+ * \param h   height of the picture
+ */
+static double ws_weight(int y, int h)
+{
+  return cos((y - 0.5 * h + 0.5) * (M_PI / h));
+}
+
+
+
+/**
+ * \brief Update ROI QPs for 360 video with equirectangular projection.
+ *
+ * Writes updated ROI parameters to encoder->cfg.roi.
+ *
+ * \param encoder       encoder control
+ * \param orig_roi      original delta QPs or NULL
+ * \param orig_width    width of orig_roi
+ * \param orig_height   height of orig_roi
+ */
+static void init_erp_aqp_roi(encoder_control_t* encoder,
+                             int8_t *orig_roi,
+                             int32_t orig_width,
+                             int32_t orig_height)
 {
-  unsigned cpus = kvz_g_hardware_flags.physical_cpu_count;
-  unsigned fake_cpus = kvz_g_hardware_flags.logical_cpu_count - cpus;
+  // Update ROI with WS-PSNR delta QPs.
+  int height = encoder->in.height_in_lcu;
+  int width  = orig_roi ? orig_width : 1;
+
+  int frame_height = encoder->in.real_height;
 
-  // Default to 4 if we don't know the number of CPUs.
-  if (cpus == 0) return 4;
+  encoder->cfg.roi.width  = width;
+  encoder->cfg.roi.height = height;
+  encoder->cfg.roi.dqps   = calloc(width * height, sizeof(orig_roi[0]));
+
+  double total_weight = 0.0;
+  for (int y = 0; y < frame_height; y++) {
+    total_weight += ws_weight(y, frame_height);
+  }
+
+  for (int y_lcu = 0; y_lcu < height; y_lcu++) {
+    int y_orig = LCU_WIDTH * y_lcu;
+    int lcu_height = MIN(LCU_WIDTH, frame_height - y_orig);
+
+    double lcu_weight = 0.0;
+    for (int y = y_orig; y < y_orig + lcu_height; y++) {
+      lcu_weight += ws_weight(y, frame_height);
+    }
+    // Normalize.
+    lcu_weight = (lcu_weight * frame_height) / (total_weight * lcu_height);
+
+    int8_t qp_delta = round(-ERP_AQP_STRENGTH * log2(lcu_weight));
+
+    if (orig_roi) {
+      // If a ROI array already exists, we copy the existing values to the
+      // new array while adding qp_delta to each.
+      int y_roi = y_lcu * orig_height / height;
+      for (int x = 0; x < width; x++) {
+        encoder->cfg.roi.dqps[x + y_lcu * width] =
+          CLIP(-51, 51, orig_roi[x + y_roi * width] + qp_delta);
+      }
 
-  // 1.5 times the number of physical cores seems to be a good compromise
-  // when hyperthreading is available on Haswell.
-  return cpus + fake_cpus / 2;
+    } else {
+      // Otherwise, simply write qp_delta to the ROI array.
+      encoder->cfg.roi.dqps[y_lcu] = qp_delta;
+    }
+  }
 }
 
 
@@ -148,21 +231,53 @@
   encoder->cfg.tiles_height_split = NULL;
   encoder->cfg.slice_addresses_in_ts = NULL;
 
-  if (encoder->cfg.threads == -1) {
-    encoder->cfg.threads = cfg_num_threads();
-  }
-
   if (encoder->cfg.gop_len > 0) {
     if (encoder->cfg.gop_lowdelay) {
       kvz_config_process_lp_gop(&encoder->cfg);
     }
   }
 
+  encoder->max_inter_ref_lcu.right = 1;
+  encoder->max_inter_ref_lcu.down  = 1;
+
+  int max_threads = encoder->cfg.threads;
+  if (max_threads < 0) {
+    max_threads = cfg_num_threads();
+  }
+  max_threads = MAX(1, max_threads);
+
   // Need to set owf before initializing threadqueue.
   if (encoder->cfg.owf < 0) {
-    encoder->cfg.owf = select_owf_auto(&encoder->cfg);
+    int best_parallelism = 0;
+
+    for (encoder->cfg.owf = 0; true; encoder->cfg.owf++) {
+      int parallelism = get_max_parallelism(encoder);
+
+      if (parallelism <= best_parallelism) {
+        // No improvement over previous OWF.
+        encoder->cfg.owf--;
+        break;
+      }
+
+      best_parallelism = parallelism;
+      if (parallelism >= max_threads) {
+        // Cannot have more parallelism than there are threads.
+        break;
+      }
+    }
+
+    // Add two frames so that we have frames ready to be coded when one is
+    // completed.
+    encoder->cfg.owf += 2;
+
     fprintf(stderr, "--owf=auto value set to %d.\n", encoder->cfg.owf);
   }
+
+  if (encoder->cfg.threads < 0) {
+    encoder->cfg.threads = MIN(max_threads, get_max_parallelism(encoder));
+    fprintf(stderr, "--threads=auto value set to %d.\n", encoder->cfg.threads);
+  }
+
   if (encoder->cfg.source_scan_type != KVZ_INTERLACING_NONE) {
     // If using interlaced coding with OWF, the OWF has to be an even number
     // to ensure that the pair of fields will be output for the same picture.
@@ -171,11 +286,8 @@
     }
   }
 
-  encoder->threadqueue = MALLOC(threadqueue_queue_t, 1);
-  if (!encoder->threadqueue ||
-      !kvz_threadqueue_init(encoder->threadqueue,
-                        encoder->cfg.threads,
-                        encoder->cfg.owf > 0)) {
+  encoder->threadqueue = kvz_threadqueue_init(encoder->cfg.threads);
+  if (!encoder->threadqueue) {
     fprintf(stderr, "Could not initialize threadqueue.\n");
     goto init_failed;
   }
@@ -219,15 +331,30 @@
     goto init_failed;
   }
 
-  // Copy delta QP array for ROI coding.
-  if (cfg->roi.dqps) {
+  if (cfg->erp_aqp) {
+    init_erp_aqp_roi(encoder,
+                     cfg->roi.dqps,
+                     cfg->roi.width,
+                     cfg->roi.height);
+
+  } else if (cfg->roi.dqps) {
+    // Copy delta QP array for ROI coding.
     const size_t roi_size = encoder->cfg.roi.width * encoder->cfg.roi.height;
     encoder->cfg.roi.dqps = calloc(roi_size, sizeof(cfg->roi.dqps[0]));
     memcpy(encoder->cfg.roi.dqps,
            cfg->roi.dqps,
            roi_size * sizeof(*cfg->roi.dqps));
+
   }
 
+  encoder->lcu_dqp_enabled = cfg->target_bitrate > 0 || encoder->cfg.roi.dqps;
+
+  // When tr_depth_inter is equal to 0, inter transform split flag defaults
+  // to 1 for SMP and AMP partition units. We want to avoid the extra
+  // transform split so we set tr_depth_inter to 1 when SMP or AMP
+  // partition modes are enabled.
+  encoder->tr_depth_inter = (encoder->cfg.smp_enable || encoder->cfg.amp_enable) ? 1 : 0;
+
   //Tiles
   encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 ||
                           encoder->cfg.tiles_height_count > 1;
@@ -467,7 +594,7 @@
   // lossless coding.
   if (encoder->cfg.lossless) {
     encoder->cfg.deblock_enable  = false;
-    encoder->cfg.sao_enable      = false;
+    encoder->cfg.sao_type        = false;
     encoder->cfg.signhide_enable = false;
     encoder->cfg.trskip_enable   = false;
   }
@@ -490,6 +617,12 @@
     encoder->cfg.vps_period = -1;
   }
 
+  if(encoder->cfg.optional_key){
+    encoder->cfg.optional_key = MALLOC(uint8_t,16);
+    if (!encoder->cfg.optional_key) goto init_failed;
+    memcpy(encoder->cfg.optional_key, cfg->optional_key, 16);
+  }
+
   return encoder;
 
 init_failed:
@@ -520,13 +653,12 @@
   FREE_POINTER(encoder->tiles_tile_id);
 
   FREE_POINTER(encoder->cfg.roi.dqps);
+  FREE_POINTER(encoder->cfg.optional_key);
 
   kvz_scalinglist_destroy(&encoder->scaling_list);
 
-  if (encoder->threadqueue) {
-    kvz_threadqueue_finalize(encoder->threadqueue);
-  }
-  FREE_POINTER(encoder->threadqueue);
+  kvz_threadqueue_free(encoder->threadqueue);
+  encoder->threadqueue = NULL;
 
   free(encoder);
 }

kvazaar-1.1.0.tar.gz/src/encoder.h -> kvazaar-1.2.0.tar.gz/src/encoder.h Changed

kvazaar-1.1.0.tar.gz/src/encoder_state-bitstream.c -> kvazaar-1.2.0.tar.gz/src/encoder_state-bitstream.c Changed

@@ -389,7 +389,7 @@
   WRITE_UE(stream, MAX_DEPTH, "log2_diff_max_min_coding_block_size");
   WRITE_UE(stream, 0, "log2_min_transform_block_size_minus2");   // 4x4
   WRITE_UE(stream, 3, "log2_diff_max_min_transform_block_size"); // 4x4...32x32
-  WRITE_UE(stream, TR_DEPTH_INTER, "max_transform_hierarchy_depth_inter");
+  WRITE_UE(stream, encoder->tr_depth_inter, "max_transform_hierarchy_depth_inter");
   WRITE_UE(stream, encoder->cfg.tr_depth_intra, "max_transform_hierarchy_depth_intra");
 
   // scaling list
@@ -401,7 +401,7 @@
 
   WRITE_U(stream, (encoder->cfg.amp_enable ? 1 : 0), 1, "amp_enabled_flag");
 
-  WRITE_U(stream, encoder->cfg.sao_enable ? 1 : 0, 1,
+  WRITE_U(stream, encoder->cfg.sao_type ? 1 : 0, 1,
           "sample_adaptive_offset_enabled_flag");
   WRITE_U(stream, ENABLE_PCM, 1, "pcm_enabled_flag");
   #if ENABLE_PCM == 1
@@ -455,7 +455,7 @@
   WRITE_U(stream, 0, 1, "constrained_intra_pred_flag");
   WRITE_U(stream, encoder->cfg.trskip_enable, 1, "transform_skip_enabled_flag");
 
-  if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps != NULL) {
+  if (encoder->lcu_dqp_enabled) {
     // Use separate QP for each LCU when rate control is enabled.
     WRITE_U(stream, 1, 1, "cu_qp_delta_enabled_flag");
     WRITE_UE(stream, 0, "diff_cu_qp_delta_depth");
@@ -544,7 +544,7 @@
   s += sprintf(s, " %dx%d", cfg->width, cfg->height);
   s += sprintf(s, " deblock=%d:%d:%d", cfg->deblock_enable,
                cfg->deblock_beta, cfg->deblock_tc);
-  s += sprintf(s, " sao=%d", cfg->sao_enable);
+  s += sprintf(s, " sao=%d", cfg->sao_type);
   s += sprintf(s, " intra_period=%d", cfg->intra_period);
   s += sprintf(s, " qp=%d", cfg->qp);
   s += sprintf(s, " ref=%d", cfg->ref_frames);
@@ -731,7 +731,7 @@
 
       WRITE_UE(stream, encoder->cfg.gop_len?delta_poc - last_poc - 1:0, "delta_poc_s0_minus1");
       last_poc = delta_poc;
-      WRITE_U(stream,1,1, "used_by_curr_pic_s0_flag");
+      WRITE_U(stream, !state->frame->is_irap, 1, "used_by_curr_pic_s0_flag");
     }
     last_poc = 0;
     poc_shift = 0;
@@ -758,12 +758,12 @@
       
       WRITE_UE(stream, encoder->cfg.gop_len ? delta_poc - last_poc - 1 : 0, "delta_poc_s1_minus1");
       last_poc = delta_poc;
-      WRITE_U(stream, 1, 1, "used_by_curr_pic_s1_flag");
+      WRITE_U(stream, !state->frame->is_irap, 1, "used_by_curr_pic_s1_flag");
     }
     //WRITE_UE(stream, 0, "short_term_ref_pic_set_idx");
     
     if (state->encoder_control->cfg.tmvp_enable) {
-      WRITE_U(stream, ref_negative?1:0, 1, "slice_temporal_mvp_enabled_flag");
+      WRITE_U(stream, ref_negative ? 1 : 0, 1, "slice_temporal_mvp_enabled_flag");
     }
   }
 
@@ -771,7 +771,7 @@
   //end if
 
 
-  if (encoder->cfg.sao_enable) {
+  if (encoder->cfg.sao_type) {
     WRITE_U(stream, 1, 1, "slice_sao_luma_flag");
     if (encoder->chroma_format != KVZ_CSP_400) {
       WRITE_U(stream, 1, 1, "slice_sao_chroma_flag");
@@ -942,9 +942,7 @@
   encoder_state_t * state,
   bool independent)
 {
-  uint8_t nal_type = (state->frame->is_idr_frame ? KVZ_NAL_IDR_W_RADL : KVZ_NAL_TRAIL_R);
-
-  kvz_nal_write(stream, nal_type, 0, state->frame->first_nal);
+  kvz_nal_write(stream, state->frame->pictype, 0, state->frame->first_nal);
   state->frame->first_nal = false;
 
   kvz_encoder_state_write_bitstream_slice_header(stream, state, independent);
@@ -1018,19 +1016,13 @@
     kvz_bitstream_add_rbsp_trailing_bits(stream);
   }
 
-  {
-    PERFORMANCE_MEASURE_START(KVZ_PERF_FRAME);
-    encoder_state_write_bitstream_children(state);
-    PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, encoder->threadqueue, "type=write_bitstream_append,frame=%d,encoder_type=%c", state->frame->num, state->type);
-  }
-  
+  encoder_state_write_bitstream_children(state);
+
   if (state->encoder_control->cfg.hash != KVZ_HASH_NONE) {
-    PERFORMANCE_MEASURE_START(KVZ_PERF_FRAME);
     // Calculate checksum
     add_checksum(state);
-    PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, encoder->threadqueue, "type=write_bitstream_checksum,frame=%d,encoder_type=%c", state->frame->num, state->type);
   }
-  
+
   //Get bitstream length for stats
   uint64_t newpos = kvz_bitstream_tell(stream);
   state->stats_bitstream_length = (newpos >> 3) - (curpos >> 3);

kvazaar-1.1.0.tar.gz/src/encoder_state-ctors_dtors.c -> kvazaar-1.2.0.tar.gz/src/encoder_state-ctors_dtors.c Changed

@@ -29,7 +29,6 @@
 #include "encoder.h"
 #include "encoder_state-geometry.h"
 #include "encoderstate.h"
-#include "extras/crypto.h"
 #include "image.h"
 #include "imagelist.h"
 #include "kvazaar.h"
@@ -82,16 +81,12 @@
     printf("Error allocating videoframe!\r\n");
     return 0;
   }
-  
-  // Init coeff data table
-  //FIXME: move them
-  state->tile->frame->coeff_y = MALLOC(coeff_t, width * height);
-  state->tile->frame->coeff_u = MALLOC(coeff_t, (width * height) >> 2);
-  state->tile->frame->coeff_v = MALLOC(coeff_t, (width * height) >> 2);
-  
+
   state->tile->lcu_offset_x = lcu_offset_x;
   state->tile->lcu_offset_y = lcu_offset_y;
-  
+  state->tile->offset_x     = lcu_offset_x * LCU_WIDTH;
+  state->tile->offset_y     = lcu_offset_y * LCU_WIDTH;
+
   state->tile->lcu_offset_in_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_offset_x + lcu_offset_y * encoder->in.width_in_lcu];
   
   // hor_buf_search and ver_buf_search store single row/col from each LCU row/col.
@@ -105,13 +100,15 @@
 
   state->tile->hor_buf_search = kvz_yuv_t_alloc(luma_size, chroma_size_hor);
   state->tile->ver_buf_search = kvz_yuv_t_alloc(luma_size, chroma_size_ver);
-  
-  if (encoder->cfg.sao_enable) {
+
+  if (encoder->cfg.sao_type) {
     state->tile->hor_buf_before_sao = kvz_yuv_t_alloc(luma_size, chroma_size_hor);
+    state->tile->ver_buf_before_sao = kvz_yuv_t_alloc(luma_size, chroma_size_ver);
   } else {
     state->tile->hor_buf_before_sao = NULL;
+    state->tile->ver_buf_before_sao = NULL;
   }
-  
+
   if (encoder->cfg.wpp) {
     int num_jobs = state->tile->frame->width_in_lcu * state->tile->frame->height_in_lcu;
     state->tile->wf_jobs = MALLOC(threadqueue_job_t*, num_jobs);
@@ -132,21 +129,27 @@
 static void encoder_state_config_tile_finalize(encoder_state_t * const state) {
   if (state->tile == NULL) return;
 
-  if (state->tile->hor_buf_before_sao) kvz_yuv_t_free(state->tile->hor_buf_before_sao);
-  
   kvz_yuv_t_free(state->tile->hor_buf_search);
   kvz_yuv_t_free(state->tile->ver_buf_search);
-  
+  kvz_yuv_t_free(state->tile->hor_buf_before_sao);
+  kvz_yuv_t_free(state->tile->ver_buf_before_sao);
+
+  if (state->encoder_control->cfg.wpp) {
+    int num_jobs = state->tile->frame->width_in_lcu * state->tile->frame->height_in_lcu;
+    for (int i = 0; i < num_jobs; ++i) {
+      kvz_threadqueue_free_job(&state->tile->wf_jobs[i]);
+    }
+  }
+
   kvz_videoframe_free(state->tile->frame);
   state->tile->frame = NULL;
-  if (state->encoder_control->cfg.crypto_features && state->tile->dbs_g) {
-    DeleteCryptoC(state->tile->dbs_g);
-  }
   FREE_POINTER(state->tile->wf_jobs);
 }
 
-static int encoder_state_config_slice_init(encoder_state_t * const state, 
-                                          const int start_address_in_ts, const int end_address_in_ts) {
+static int encoder_state_config_slice_init(encoder_state_t * const state,
+                                           const int start_address_in_ts,
+                                           const int end_address_in_ts)
+{
   state->slice->id = -1;
   for (int i = 0; i < state->encoder_control->slice_count; ++i) {
     if (state->encoder_control->slice_addresses_in_ts[i] == start_address_in_ts) {
@@ -308,6 +311,7 @@
   child_state->parent = parent_state;
   child_state->children = MALLOC(encoder_state_t, 1);
   child_state->children[0].encoder_control = NULL;
+  child_state->crypto_hdl = NULL;
   child_state->tqj_bitstream_written = NULL;
   child_state->tqj_recon_done = NULL;
   
@@ -326,7 +330,6 @@
       return 0;
     }
 
-    child_state->tile->dbs_g = NULL;  // Not used. The used state is in the sub-tile.
     child_state->slice = MALLOC(encoder_state_config_slice_t, 1);
     if (!child_state->slice || !encoder_state_config_slice_init(child_state, 0, encoder->in.width_in_lcu * encoder->in.height_in_lcu - 1)) {
       fprintf(stderr, "Could not initialize encoder_state->slice!\n");
@@ -461,9 +464,6 @@
         new_child->type  = ENCODER_STATE_TYPE_TILE;
         new_child->frame = child_state->frame;
         new_child->tile  = MALLOC(encoder_state_config_tile_t, 1);
-        if (child_state->encoder_control->cfg.crypto_features) {
-          new_child->tile->dbs_g = CreateC();
-        }
         new_child->slice = child_state->slice;
         new_child->wfrow = child_state->wfrow;
         
@@ -706,4 +706,7 @@
   }
   
   kvz_bitstream_finalize(&state->stream);
+
+  kvz_threadqueue_free_job(&state->tqj_recon_done);
+  kvz_threadqueue_free_job(&state->tqj_bitstream_written);
 }

kvazaar-1.1.0.tar.gz/src/encoderstate.c -> kvazaar-1.2.0.tar.gz/src/encoderstate.c Changed

@@ -35,6 +35,10 @@
 #include "sao.h"
 #include "search.h"
 #include "tables.h"
+#include "threadqueue.h"
+
+#define SAO_BUF_WIDTH (LCU_WIDTH + SAO_DELAY_PX + 2)
+#define SAO_BUF_WIDTH_C (SAO_BUF_WIDTH / 2)
 
 
 int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) {
@@ -48,7 +52,127 @@
   return 1;
 }
 
-static void encoder_state_recdata_to_bufs(encoder_state_t * const state, const lcu_order_element_t * const lcu, yuv_t * const hor_buf, yuv_t * const ver_buf) {
+/**
+ * \brief Save edge pixels before SAO to buffers.
+ *
+ * Copies pixels at the edges of the area that will be filtered with SAO to
+ * the given buffers. If deblocking is enabled, the pixels must have been
+ * deblocked before this.
+ *
+ * The saved pixels will be needed later when doing SAO for the neighboring
+ * areas.
+ *
+ * \param state    encoder state; supplies the tile frame and the chroma format
+ * \param lcu      LCU whose edge pixels are saved
+ * \param hor_buf  destination for the bottom row; nothing is copied when it
+ *                 is NULL or when there is no LCU below
+ * \param ver_buf  destination for the right column; nothing is copied when
+ *                 it is NULL or when there is no LCU to the right
+ */
+static void encoder_state_recdata_before_sao_to_bufs(
+    encoder_state_t * const state,
+    const lcu_order_element_t * const lcu,
+    yuv_t * const hor_buf,
+    yuv_t * const ver_buf)
+{
+  videoframe_t* const frame = state->tile->frame;
+
+  if (hor_buf && lcu->below) {
+    // Copy the bottommost row that will be filtered with SAO to the
+    // horizontal buffer.
+    vector2d_t pos = {
+      .x = lcu->position_px.x,
+      .y = lcu->position_px.y + LCU_WIDTH - SAO_DELAY_PX - 1,
+    };
+    // Copy all pixels that have been deblocked.
+    int length = lcu->size.x - DEBLOCK_DELAY_PX;
+
+    if (!lcu->right) {
+      // If there is no LCU to the right, the last pixels will be
+      // filtered too.
+      length += DEBLOCK_DELAY_PX;
+    }
+
+    if (lcu->left) {
+      // The rightmost pixels of the CTU to the left will also be filtered.
+      pos.x -= DEBLOCK_DELAY_PX;
+      length += DEBLOCK_DELAY_PX;
+    }
+
+    const unsigned from_index = pos.x + pos.y * frame->rec->stride;
+    // NOTE: The horizontal buffer is indexed by
+    //    x_px + y_lcu * frame->width
+    // where x_px is in pixels and y_lcu in number of LCUs.
+    const unsigned to_index = pos.x + lcu->position.y * frame->width;
+
+    kvz_pixels_blit(&frame->rec->y[from_index],
+                    &hor_buf->y[to_index],
+                    length, 1,
+                    frame->rec->stride,
+                    frame->width);
+
+    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+      // Chroma planes use halved coordinates, strides and lengths.
+      const unsigned from_index_c = (pos.x / 2) + (pos.y / 2) * frame->rec->stride / 2;
+      const unsigned to_index_c = (pos.x / 2) + lcu->position.y * frame->width / 2;
+
+      kvz_pixels_blit(&frame->rec->u[from_index_c],
+                      &hor_buf->u[to_index_c],
+                      length / 2, 1,
+                      frame->rec->stride / 2,
+                      frame->width / 2);
+      kvz_pixels_blit(&frame->rec->v[from_index_c],
+                      &hor_buf->v[to_index_c],
+                      length / 2, 1,
+                      frame->rec->stride / 2,
+                      frame->width / 2);
+    }
+  }
+
+  if (ver_buf && lcu->right) {
+    // Copy the rightmost column that will be filtered with SAO to the
+    // vertical buffer.
+    vector2d_t pos = {
+      .x = lcu->position_px.x + LCU_WIDTH - SAO_DELAY_PX - 1,
+      .y = lcu->position_px.y,
+    };
+    int length = lcu->size.y - DEBLOCK_DELAY_PX;
+
+    if (!lcu->below) {
+      // If there is no LCU below, the last pixels will be filtered too.
+      length += DEBLOCK_DELAY_PX;
+    }
+
+    if (lcu->above) {
+      // The bottommost pixels of the CTU above will also be filtered.
+      pos.y -= DEBLOCK_DELAY_PX;
+      length += DEBLOCK_DELAY_PX;
+    }
+
+    const unsigned from_index = pos.x + pos.y * frame->rec->stride;
+    // NOTE: The vertical buffer is indexed by
+    //    x_lcu * frame->height + y_px
+    // where x_lcu is in number of LCUs and y_px in pixels.
+    const unsigned to_index = lcu->position.x * frame->height + pos.y;
+
+    // The destination stride of 1 packs the copied column contiguously.
+    kvz_pixels_blit(&frame->rec->y[from_index],
+                    &ver_buf->y[to_index],
+                    1, length,
+                    frame->rec->stride, 1);
+
+    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+      // Chroma planes use halved coordinates, strides and lengths.
+      const unsigned from_index_c = (pos.x / 2) + (pos.y / 2) * frame->rec->stride / 2;
+      const unsigned to_index_c = lcu->position.x * frame->height / 2 + pos.y / 2;
+
+      kvz_pixels_blit(&frame->rec->u[from_index_c],
+                      &ver_buf->u[to_index_c],
+                      1, length / 2,
+                      frame->rec->stride / 2, 1);
+      kvz_pixels_blit(&frame->rec->v[from_index_c],
+                      &ver_buf->v[to_index_c],
+                      1, length / 2,
+                      frame->rec->stride / 2, 1);
+    }
+  }
+}
+
+static void encoder_state_recdata_to_bufs(encoder_state_t * const state,
+                                          const lcu_order_element_t * const lcu,
+                                          yuv_t * const hor_buf,
+                                          yuv_t * const ver_buf)
+{
   videoframe_t* const frame = state->tile->frame;
   
   if (hor_buf) {
@@ -107,6 +231,209 @@
   
 }
 
+/**
+ * \brief Do SAO reconstruction for all available pixels.
+ *
+ * Does SAO reconstruction for all pixels that are available after the
+ * given LCU has been deblocked. This means the following pixels:
+ *  - bottom-right block of SAO_DELAY_PX times SAO_DELAY_PX in the lcu to
+ *    the left and up
+ *  - the rightmost SAO_DELAY_PX pixels of the LCU to the left (excluding
+ *    the bottommost pixel)
+ *  - the bottommost SAO_DELAY_PX pixels of the LCU above (excluding the
+ *    rightmost pixels)
+ *  - all pixels inside the LCU, excluding the rightmost SAO_DELAY_PX and
+ *    bottommost SAO_DELAY_PX
+ *
+ * \param state  encoder state; supplies the tile frame, the pre-SAO edge
+ *               buffers (hor_buf_before_sao / ver_buf_before_sao) and the
+ *               chroma format
+ * \param lcu    LCU that has just been deblocked
+ */
+static void encoder_sao_reconstruct(const encoder_state_t *const state,
+                                    const lcu_order_element_t *const lcu)
+{
+  videoframe_t *const frame = state->tile->frame;
+
+  // Temporary buffers for SAO input pixels.
+  kvz_pixel sao_buf_y_array[SAO_BUF_WIDTH * SAO_BUF_WIDTH];
+  kvz_pixel sao_buf_u_array[SAO_BUF_WIDTH_C * SAO_BUF_WIDTH_C];
+  kvz_pixel sao_buf_v_array[SAO_BUF_WIDTH_C * SAO_BUF_WIDTH_C];
+
+  // Pointers to the top-left pixel of the LCU in the buffers.
+  kvz_pixel *const sao_buf_y = &sao_buf_y_array[(SAO_DELAY_PX + 1) * (SAO_BUF_WIDTH + 1)];
+  kvz_pixel *const sao_buf_u = &sao_buf_u_array[(SAO_DELAY_PX/2 + 1) * (SAO_BUF_WIDTH_C + 1)];
+  kvz_pixel *const sao_buf_v = &sao_buf_v_array[(SAO_DELAY_PX/2 + 1) * (SAO_BUF_WIDTH_C + 1)];
+
+  // Column boundaries of the filtered area, relative to the left edge of
+  // this LCU: [0]..[1] lies in the LCU to the left, [1]..[2] in this LCU.
+  const int x_offsets[3] = {
+    // If there is an lcu to the left, we need to filter its rightmost
+    // pixels.
+    lcu->left ? -SAO_DELAY_PX : 0,
+    0,
+    // If there is an lcu to the right, the rightmost pixels of this LCU
+    // are filtered when filtering that LCU. Otherwise we filter them now.
+    lcu->size.x - (lcu->right ? SAO_DELAY_PX : 0),
+  };
+
+  // Row boundaries of the filtered area, relative to the top edge of this
+  // LCU: [0]..[1] lies in the LCU above, [1]..[2] in this LCU.
+  const int y_offsets[3] = {
+    // If there is an lcu above, we need to filter its bottommost pixels.
+    lcu->above ? -SAO_DELAY_PX : 0,
+    0,
+    // If there is an lcu below, the bottommost pixels of this LCU are
+    // filtered when filtering that LCU. Otherwise we filter them now.
+    lcu->size.y - (lcu->below ? SAO_DELAY_PX : 0),
+  };
+
+  // Number of pixels around the block that need to be copied to the
+  // buffers.
+  const int border_left  = lcu->left  ? 1 : 0;
+  const int border_right = lcu->right ? 1 : 0;
+  const int border_above = lcu->above ? 1 : 0;
+  const int border_below = lcu->below ? 1 : 0;
+
+  // Index of the pixel at the intersection of the top and left borders.
+  const int border_index = (x_offsets[0] - border_left) +
+                           (y_offsets[0] - border_above) * SAO_BUF_WIDTH;
+  const int border_index_c = (x_offsets[0]/2 - border_left) +
+                             (y_offsets[0]/2 - border_above) * SAO_BUF_WIDTH_C;
+  // Width and height of the whole area to filter.
+  const int width  = x_offsets[2] - x_offsets[0];
+  const int height = y_offsets[2] - y_offsets[0];
+
+  // Copy bordering pixels from above and left to buffers.
+  if (lcu->above) {
+    // The horizontal buffer holds one row per LCU row, hence the index
+    // x_px + y_lcu * frame->width.
+    const int from_index = (lcu->position_px.x + x_offsets[0] - border_left) +
+                           (lcu->position.y - 1) * frame->width;
+    kvz_pixels_blit(&state->tile->hor_buf_before_sao->y[from_index],
+                    &sao_buf_y[border_index],
+                    width + border_left + border_right,
+                    1,
+                    frame->width,
+                    SAO_BUF_WIDTH);
+    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+      const int from_index_c = (lcu->position_px.x + x_offsets[0])/2 - border_left +
+                               (lcu->position.y - 1) * frame->width/2;
+      kvz_pixels_blit(&state->tile->hor_buf_before_sao->u[from_index_c],
+                      &sao_buf_u[border_index_c],
+                      width/2 + border_left + border_right,
+                      1,
+                      frame->width/2,
+                      SAO_BUF_WIDTH_C);
+      kvz_pixels_blit(&state->tile->hor_buf_before_sao->v[from_index_c],
+                      &sao_buf_v[border_index_c],
+                      width/2 + border_left + border_right,
+                      1,
+                      frame->width/2,
+                      SAO_BUF_WIDTH_C);
+    }
+  }
+  if (lcu->left) {
+    // The vertical buffer holds one column per LCU column, hence the index
+    // x_lcu * frame->height + y_px and a source stride of 1.
+    const int from_index = (lcu->position.x - 1) * frame->height +
+                           (lcu->position_px.y + y_offsets[0] - border_above);
+    kvz_pixels_blit(&state->tile->ver_buf_before_sao->y[from_index],
+                    &sao_buf_y[border_index],
+                    1,
+                    height + border_above + border_below,
+                    1,
+                    SAO_BUF_WIDTH);
+    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+      const int from_index_c = (lcu->position.x - 1) * frame->height/2 +
+                               (lcu->position_px.y + y_offsets[0])/2 - border_above;
+      kvz_pixels_blit(&state->tile->ver_buf_before_sao->u[from_index_c],
+                      &sao_buf_u[border_index_c],
+                      1,
+                      height/2 + border_above + border_below,
+                      1,
+                      SAO_BUF_WIDTH_C);
+      kvz_pixels_blit(&state->tile->ver_buf_before_sao->v[from_index_c],
+                      &sao_buf_v[border_index_c],
+                      1,
+                      height/2 + border_above + border_below,
+                      1,
+                      SAO_BUF_WIDTH_C);
+    }
+  }
+  // Copy pixels that will be filtered and bordering pixels from right and
+  // below.
+  const int from_index = (lcu->position_px.x + x_offsets[0]) +
+                         (lcu->position_px.y + y_offsets[0]) * frame->rec->stride;
+  const int to_index = x_offsets[0] + y_offsets[0] * SAO_BUF_WIDTH;
+  kvz_pixels_blit(&frame->rec->y[from_index],
+                  &sao_buf_y[to_index],
+                  width + border_right,
+                  height + border_below,
+                  frame->rec->stride,
+                  SAO_BUF_WIDTH);
+  if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+    const int from_index_c = (lcu->position_px.x + x_offsets[0])/2 +
+                             (lcu->position_px.y + y_offsets[0])/2 * frame->rec->stride/2;
+    const int to_index_c = x_offsets[0]/2 + y_offsets[0]/2 * SAO_BUF_WIDTH_C;
+    kvz_pixels_blit(&frame->rec->u[from_index_c],
+                    &sao_buf_u[to_index_c],
+                    width/2 + border_right,
+                    height/2 + border_below,
+                    frame->rec->stride/2,
+                    SAO_BUF_WIDTH_C);
+    kvz_pixels_blit(&frame->rec->v[from_index_c],
+                    &sao_buf_v[to_index_c],
+                    width/2 + border_right,
+                    height/2 + border_below,
+                    frame->rec->stride/2,
+                    SAO_BUF_WIDTH_C);
+  }
+
+  // We filter the pixels in four parts:
+  //  1. Pixels that belong to the LCU above and to the left
+  //  2. Pixels that belong to the LCU above
+  //  3. Pixels that belong to the LCU to the left
+  //  4. Pixels that belong to the current LCU
+  for (int y_offset_index = 0; y_offset_index < 2; y_offset_index++) {
+    for (int x_offset_index = 0; x_offset_index < 2; x_offset_index++) {
+      const int x = x_offsets[x_offset_index];
+      const int y = y_offsets[y_offset_index];
+      // Size of this part only; these shadow the whole-area width/height
+      // declared above.
+      const int width = x_offsets[x_offset_index + 1] - x;
+      const int height = y_offsets[y_offset_index + 1] - y;
+
+      // The part is empty when the corresponding neighbor does not exist.
+      if (width == 0 || height == 0) continue;
+
+      // Each part uses the SAO parameters of the LCU that contains it.
+      const int lcu_x = (lcu->position_px.x + x) >> LOG2_LCU_WIDTH;
+      const int lcu_y = (lcu->position_px.y + y) >> LOG2_LCU_WIDTH;
+      const int lcu_index = lcu_x + lcu_y * frame->width_in_lcu;
+      const sao_info_t *sao_luma   = &frame->sao_luma[lcu_index];
+      const sao_info_t *sao_chroma = &frame->sao_chroma[lcu_index];
+
+      kvz_sao_reconstruct(state,
+                          &sao_buf_y[x + y * SAO_BUF_WIDTH],
+                          SAO_BUF_WIDTH,
+                          lcu->position_px.x + x,
+                          lcu->position_px.y + y,
+                          width,
+                          height,
+                          sao_luma,
+                          COLOR_Y);
+
+      if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+        // Coordinates in chroma pixels.
+        int x_c = x >> 1;
+        int y_c = y >> 1;
+
+        kvz_sao_reconstruct(state,
+                            &sao_buf_u[x_c + y_c * SAO_BUF_WIDTH_C],
+                            SAO_BUF_WIDTH_C,
+                            lcu->position_px.x / 2 + x_c,
+                            lcu->position_px.y / 2 + y_c,
+                            width / 2,
+                            height / 2,
+                            sao_chroma,
+                            COLOR_U);
+        kvz_sao_reconstruct(state,
+                            &sao_buf_v[x_c + y_c * SAO_BUF_WIDTH_C],
+                            SAO_BUF_WIDTH_C,
+                            lcu->position_px.x / 2 + x_c,
+                            lcu->position_px.y / 2 + y_c,
+                            width / 2,
+                            height / 2,
+                            sao_chroma,
+                            COLOR_V);
+      }
+    }
+  }
+}
 
 static void encode_sao_color(encoder_state_t * const state, sao_info_t *sao,
                              color_t color_i)
@@ -273,62 +600,49 @@
 
   kvz_set_lcu_lambda_and_qp(state, lcu->position);
 
+  lcu_coeff_t coeff;
+  state->coeff = &coeff;
+
   //This part doesn't write to bitstream, it's only search, deblock and sao
-  
   kvz_search_lcu(state, lcu->position_px.x, lcu->position_px.y, state->tile->hor_buf_search, state->tile->ver_buf_search);
-    
+
   encoder_state_recdata_to_bufs(state, lcu, state->tile->hor_buf_search, state->tile->ver_buf_search);
 
   if (encoder->cfg.deblock_enable) {
-    if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps != NULL) {
+    if (encoder->lcu_dqp_enabled) {
       set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, false);
     }
 
     kvz_filter_deblock_lcu(state, lcu->position_px.x, lcu->position_px.y);
   }
 
-  if (encoder->cfg.sao_enable) {
+  if (encoder->cfg.sao_type) {
+    // Save the post-deblocking but pre-SAO pixels of the LCU to a buffer
+    // so that they can be used in SAO reconstruction later.
+    encoder_state_recdata_before_sao_to_bufs(state,
+                                             lcu,
+                                             state->tile->hor_buf_before_sao,
+                                             state->tile->ver_buf_before_sao);
     kvz_sao_search_lcu(state, lcu->position.x, lcu->position.y);
+    encoder_sao_reconstruct(state, lcu);
   }
 
-  // Copy LCU cu_array to main states cu_array, because that is the only one
-  // which is given to the next frame through image_list_t.
-  {
-    PERFORMANCE_MEASURE_START(KVZ_PERF_FRAME);
-
-    encoder_state_t *main_state = state;
-    while (main_state->parent) main_state = main_state->parent;
-    assert(main_state != state);
-
-    const unsigned tile_x_px = state->tile->lcu_offset_x << LOG2_LCU_WIDTH;
-    const unsigned tile_y_px = state->tile->lcu_offset_y << LOG2_LCU_WIDTH;
-    const unsigned x_px = lcu->position_px.x;
-    const unsigned y_px = lcu->position_px.y;
-    kvz_cu_array_copy(main_state->tile->frame->cu_array,
-                      x_px + tile_x_px, y_px + tile_y_px,
-                      state->tile->frame->cu_array,
-                      x_px, y_px,
-                      LCU_WIDTH, LCU_WIDTH);
-
-    PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, state->encoder_control->threadqueue, "type=copy_cuinfo,frame=%d,tile=%d", state->frame->num, state->tile->id);
-  }
-  
   //Now write data to bitstream (required to have a correct CABAC state)
   const uint64_t existing_bits = kvz_bitstream_tell(&state->stream);
-  
+
   //Encode SAO
-  if (encoder->cfg.sao_enable) {
+  if (encoder->cfg.sao_type) {
     encode_sao(state, lcu->position.x, lcu->position.y, &frame->sao_lumalcu->position.y * frame->width_in_lcu + lcu->position.x, &frame->sao_chromalcu->position.y * frame->width_in_lcu + lcu->position.x);
   }
-  
 
   // QP delta is not used when rate control is turned off.
-  state->must_code_qp_delta = (
-      state->encoder_control->cfg.target_bitrate > 0
-      || state->encoder_control->cfg.roi.dqps != NULL);
+  state->must_code_qp_delta = encoder->lcu_dqp_enabled;
 
   //Encode coding tree
-  kvz_encode_coding_tree(state, lcu->position.x << MAX_DEPTH, lcu->position.y << MAX_DEPTH, 0);
+  kvz_encode_coding_tree(state, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, 0);
+
+  // Coeffs are not needed anymore.
+  state->coeff = NULL;
 
   bool end_of_slice_segment_flag;
   if (state->encoder_control->cfg.slices & KVZ_SLICES_WPP) {
@@ -366,6 +680,8 @@
       kvz_bitstream_align_zero(state->cabac.stream);
 
       kvz_cabac_start(&state->cabac);
+
+      kvz_crypto_delete(&state->crypto_hdl);
     }
   }
 
@@ -383,38 +699,23 @@
       }
     }
   }
-  
-  if (encoder->cfg.sao_enable && lcu->above) {
-    // Add the post-deblocking but pre-SAO pixels of the LCU row above this
-    // row to a buffer so this row can use them on it's own SAO
-    // reconstruction.
-
-    // The pixels need to be taken to from the LCU to the top-left, because
-    // not all of the pixels could be deblocked before prediction of this
-    // LCU was reconstructed.
-    if (lcu->above->left) {
-      encoder_state_recdata_to_bufs(state, lcu->above->left, state->tile->hor_buf_before_sao, NULL);
-    }
-    // If this is the last LCU in the row, we can save the pixels from the top
-    // also, as they have been fully deblocked.
-    if (!lcu->right) {
-      encoder_state_recdata_to_bufs(state, lcu->above, state->tile->hor_buf_before_sao, NULL);
-    }
-  }
 }
 
-static void encoder_state_encode_leaf(encoder_state_t * const state) {
+static void encoder_state_encode_leaf(encoder_state_t * const state)
+{
   assert(state->is_leaf);
   assert(state->lcu_order_count > 0);
 
-  const kvz_config *cfg = &state->encoder_control->cfg;
-  if (cfg->crypto_features) {
-    InitC(state->tile->dbs_g);
-    state->tile->m_prev_pos = 0;
-  }
+  const encoder_control_t *ctrl = state->encoder_control;
+  const kvz_config *cfg = &ctrl->cfg;
 
   state->ref_qp = state->frame->QP;
 
+  if (cfg->crypto_features) {
+    state->crypto_hdl = kvz_crypto_create(cfg);
+    state->crypto_prev_pos = 0;
+  }
+
   // Select whether to encode the frame/tile in current thread or to define
   // wavefront jobs for other threads to handle.
   bool wavefront = state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW;
@@ -424,38 +725,24 @@
     // frame is encoded. Deblocking and SAO search is done during LCU encoding.
 
     for (int i = 0; i < state->lcu_order_count; ++i) {
-      PERFORMANCE_MEASURE_START(KVZ_PERF_LCU);
-
       encoder_state_worker_encode_lcu(&state->lcu_orderi);
-
-#ifdef KVZ_DEBUG
-      {
-        const lcu_order_element_t * const lcu = &state->lcu_orderi;
-        PERFORMANCE_MEASURE_END(KVZ_PERF_LCU, state->encoder_control->threadqueue, "type=encode_lcu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", state->frame->num, state->tile->id, state->slice->id, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH + lcu->size.x - 1, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH + lcu->size.y - 1);
-      }
-#endif //KVZ_DEBUG
-    }
-    
-    if (state->encoder_control->cfg.sao_enable) {
-      PERFORMANCE_MEASURE_START(KVZ_PERF_SAOREC);
-      kvz_sao_reconstruct_frame(state);
-      PERFORMANCE_MEASURE_END(KVZ_PERF_SAOREC, state->encoder_control->threadqueue, "type=kvz_sao_reconstruct_frame,frame=%d,tile=%d,slice=%d,row=%d-%d,px_x=%d-%d,px_y=%d-%d", state->frame->num, state->tile->id, state->slice->id, state->lcu_order0.position.y + state->tile->lcu_offset_y, state->lcu_orderstate->lcu_order_count - 1.position.y + state->tile->lcu_offset_y,
-        state->tile->lcu_offset_x * LCU_WIDTH, state->tile->frame->width + state->tile->lcu_offset_x * LCU_WIDTH - 1,
-        state->tile->lcu_offset_y * LCU_WIDTH, state->tile->frame->height + state->tile->lcu_offset_y * LCU_WIDTH - 1
-      );
     }
   } else {
     // Add each LCU in the wavefront row as its own job to the queue.
 
     // Select which frame dependencies should be set to.
     const encoder_state_t * ref_state = NULL;
-    if (cfg->gop_lowdelay &&
-        cfg->gop_len > 0 &&
-        state->previous_encoder_state != state)
+
+    if (state->frame->slicetype == KVZ_SLICE_I) {
+      // I-frames have no references.
+      ref_state = NULL;
+    } else if (cfg->gop_lowdelay &&
+               cfg->gop_len > 0 &&
+               state->previous_encoder_state != state)
     {
       // For LP-gop, depend on the state of the first reference.
-      int ref_neg = cfg->gop(state->frame->poc - 1) % cfg->gop_len.ref_neg0;
-      if (ref_neg > state->encoder_control->cfg.owf) {
+      int ref_neg = cfg->gopstate->frame->gop_offset.ref_neg0;
+      if (ref_neg > cfg->owf) {
         // If frame is not within OWF range, it's already done.
         ref_state = NULL;
       } else {
@@ -473,16 +760,12 @@
     for (int i = 0; i < state->lcu_order_count; ++i) {
       const lcu_order_element_t * const lcu = &state->lcu_orderi;
 
-#ifdef KVZ_DEBUG
-      char job_description256;
-      sprintf(job_description, "type=encode_lcu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", state->frame->num, state->tile->id, state->slice->id, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH + lcu->size.x - 1, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH + lcu->size.y - 1);
-#else
-      char* job_description = NULL;
-#endif
-      state->tile->wf_jobslcu->id = kvz_threadqueue_submit(state->encoder_control->threadqueue, encoder_state_worker_encode_lcu, (void*)lcu, 1, job_description);
-      
+      kvz_threadqueue_free_job(&state->tile->wf_jobslcu->id);
+      state->tile->wf_jobslcu->id = kvz_threadqueue_job_create(encoder_state_worker_encode_lcu, (void*)lcu);
+      threadqueue_job_t **job = &state->tile->wf_jobslcu->id;
+
       // If job object was returned, add dependencies and allow it to run.
-      if (state->tile->wf_jobslcu->id) {
+      if (job0) {
         // Add inter frame dependencies when encoding more than one frame at
         // once. The added dependency is for the first LCU of each wavefront
         // row to depend on the reconstruction status of the row below in the
@@ -491,37 +774,39 @@
             state->previous_encoder_state->tqj_recon_done &&
             state->frame->slicetype != KVZ_SLICE_I)
         {
-          if (!lcu->left) {
-            const lcu_order_element_t * const ref_lcu = &ref_state->lcu_orderi;
-            if (lcu->below) {
-              kvz_threadqueue_job_dep_add(state->tile->wf_jobslcu->id, ref_lcu->below->encoder_state->tqj_recon_done);
-            } else {
-              kvz_threadqueue_job_dep_add(state->tile->wf_jobslcu->id, ref_lcu->encoder_state->tqj_recon_done);
-            }
+          // We need to wait until the CTUs whose pixels we refer to are
+          // done before we can start this CTU.
+          const lcu_order_element_t *dep_lcu = lcu;
+          for (int i = 0; dep_lcu->below && i < ctrl->max_inter_ref_lcu.down; i++) {
+            dep_lcu = dep_lcu->below;
+          }
+          for (int i = 0; dep_lcu->right && i < ctrl->max_inter_ref_lcu.right; i++) {
+            dep_lcu = dep_lcu->right;
           }
+          kvz_threadqueue_job_dep_add(job0, ref_state->tile->wf_jobsdep_lcu->id);
         }
 
         // Add local WPP dependency to the LCU on the left.
         if (lcu->left) {
-          kvz_threadqueue_job_dep_add(state->tile->wf_jobslcu->id, state->tile->wf_jobslcu->id - 1);
+          kvz_threadqueue_job_dep_add(job0, job-1);
         }
         // Add local WPP dependency to the LCU on the top right.
         if (lcu->above) {
           if (lcu->above->right) {
-            kvz_threadqueue_job_dep_add(state->tile->wf_jobslcu->id, state->tile->wf_jobslcu->id - state->tile->frame->width_in_lcu + 1);
+            kvz_threadqueue_job_dep_add(job0, job-state->tile->frame->width_in_lcu + 1);
           } else {
-            kvz_threadqueue_job_dep_add(state->tile->wf_jobslcu->id, state->tile->wf_jobslcu->id - state->tile->frame->width_in_lcu);
+            kvz_threadqueue_job_dep_add(job0, job-state->tile->frame->width_in_lcu);
           }
         }
 
-        kvz_threadqueue_job_unwait_job(state->encoder_control->threadqueue, state->tile->wf_jobslcu->id);
-      }
+        kvz_threadqueue_submit(state->encoder_control->threadqueue, state->tile->wf_jobslcu->id);
 
-      // In the case where SAO is not enabled, the wavefront row is
-      // done when the last LCU in the row is done.
-      if (!state->encoder_control->cfg.sao_enable && i + 1 == state->lcu_order_count) {
-        assert(!state->tqj_recon_done);
-        state->tqj_recon_done = state->tile->wf_jobslcu->id;
+        // The wavefront row is done when the last LCU in the row is done.
+        if (i + 1 == state->lcu_order_count) {
+          assert(!state->tqj_recon_done);
+          state->tqj_recon_done =
+            kvz_threadqueue_copy_ref(state->tile->wf_jobslcu->id);
+        }
       }
     }
   }
@@ -541,76 +826,14 @@
     int wpp_row = sub_state->wfrow->lcu_offset_y;
     int tile_width = sub_state->tile->frame->width_in_lcu;
     int end_of_row = (wpp_row + 1) * tile_width - 1;
-    threadqueue_job_t *job = sub_state->tile->wf_jobsend_of_row;
-
     assert(!sub_state->tqj_bitstream_written);
-    sub_state->tqj_bitstream_written = job;
-    return;
-  }
-}
-
-typedef struct {
-  int y;
-  const encoder_state_t * encoder_state;
-} worker_sao_reconstruct_lcu_data;
-
-static void encoder_state_worker_sao_reconstruct_lcu(void *opaque) {
-  worker_sao_reconstruct_lcu_data *data = opaque;
-  videoframe_t * const frame = data->encoder_state->tile->frame;
-  unsigned stride = frame->width_in_lcu;
-  int x;
-  
-  //TODO: copy only needed data
-  kvz_pixel *new_y_data = MALLOC(kvz_pixel, frame->width * frame->height);
-  kvz_pixel *new_u_data = NULL;
-  kvz_pixel *new_v_data = NULL;
-  if (frame->rec->chroma_format != KVZ_CSP_400) {
-    new_u_data = MALLOC(kvz_pixel, (frame->width * frame->height) >> 2);
-    new_v_data = MALLOC(kvz_pixel, (frame->width * frame->height) >> 2);
-  }
-  
-  const int offset = frame->width * (data->y*LCU_WIDTH);
-  const int offset_c = frame->width/2 * (data->y*LCU_WIDTH_C);
-  int num_pixels = frame->width * (LCU_WIDTH + 2);
-  
-  if (num_pixels + offset > frame->width * frame->height) {
-    num_pixels = frame->width * frame->height - offset;
-  }
-  
-  memcpy(&new_y_dataoffset, &frame->rec->yoffset, sizeof(kvz_pixel) * num_pixels);
-  if (frame->rec->chroma_format != KVZ_CSP_400) {
-    memcpy(&new_u_dataoffset_c, &frame->rec->uoffset_c, sizeof(kvz_pixel) * num_pixels >> 2);
-    memcpy(&new_v_dataoffset_c, &frame->rec->voffset_c, sizeof(kvz_pixel) * num_pixels >> 2);
-  }
-  
-  if (data->y>0) {
-    //copy first row from buffer
-    memcpy(&new_y_dataframe->width * (data->y*LCU_WIDTH-1), &data->encoder_state->tile->hor_buf_before_sao->yframe->width * (data->y-1), frame->width * sizeof(kvz_pixel));
-    if (frame->rec->chroma_format != KVZ_CSP_400) {
-      memcpy(&new_u_dataframe->width / 2 * (data->y*LCU_WIDTH_C - 1), &data->encoder_state->tile->hor_buf_before_sao->uframe->width / 2 * (data->y - 1), frame->width / 2 * sizeof(kvz_pixel));
-      memcpy(&new_v_dataframe->width / 2 * (data->y*LCU_WIDTH_C - 1), &data->encoder_state->tile->hor_buf_before_sao->vframe->width / 2 * (data->y - 1), frame->width / 2 * sizeof(kvz_pixel));
+    if (sub_state->tile->wf_jobsend_of_row) {
+      sub_state->tqj_bitstream_written =
+        kvz_threadqueue_copy_ref(sub_state->tile->wf_jobsend_of_row);
     }
   }
-
-  for (x = 0; x < frame->width_in_lcu; x++) {
-  // sao_do_rdo(encoder, lcu.x, lcu.y, sao_luma, sao_chroma);
-    sao_info_t *sao_luma = &frame->sao_lumadata->y * stride + x;
-    sao_info_t *sao_chroma = &frame->sao_chromadata->y * stride + x;
-    kvz_sao_reconstruct(data->encoder_state->encoder_control, frame, new_y_data, x, data->y, sao_luma, COLOR_Y);
-    if (frame->rec->chroma_format != KVZ_CSP_400) {
-      kvz_sao_reconstruct(data->encoder_state->encoder_control, frame, new_u_data, x, data->y, sao_chroma, COLOR_U);
-      kvz_sao_reconstruct(data->encoder_state->encoder_control, frame, new_v_data, x, data->y, sao_chroma, COLOR_V);
-    }
-  }
-  
-  free(new_y_data);
-  free(new_u_data);
-  free(new_v_data);
-
-  free(opaque);
 }
 
-
 static int encoder_state_tree_is_a_chain(const encoder_state_t * const state) {
   if (!state->children0.encoder_control) return 1;
   if (state->children1.encoder_control) return 0;
@@ -620,62 +843,66 @@
 static void encoder_state_encode(encoder_state_t * const main_state) {
   //If we have children, encode at child level
   if (main_state->children0.encoder_control) {
-    int i=0;
     //If we have only one child, than it cannot be the last split in tree
     int node_is_the_last_split_in_tree = (main_state->children1.encoder_control != 0);
-    
-    for (i=0; main_state->childreni.encoder_control; ++i) {
+
+    for (int i = 0; main_state->childreni.encoder_control; ++i) {
       encoder_state_t *sub_state = &(main_state->childreni);
-      
+
       if (sub_state->tile != main_state->tile) {
-        const int offset_x = sub_state->tile->lcu_offset_x * LCU_WIDTH;
-        const int offset_y = sub_state->tile->lcu_offset_y * LCU_WIDTH;
+        const int offset_x = sub_state->tile->offset_x;
+        const int offset_y = sub_state->tile->offset_y;
         const int width = MIN(sub_state->tile->frame->width_in_lcu * LCU_WIDTH, main_state->tile->frame->width - offset_x);
         const int height = MIN(sub_state->tile->frame->height_in_lcu * LCU_WIDTH, main_state->tile->frame->height - offset_y);
-        
-        if (sub_state->tile->frame->source) {
-          kvz_image_free(sub_state->tile->frame->source);
-          sub_state->tile->frame->source = NULL;
-        }
-        if (sub_state->tile->frame->rec) {
-          kvz_image_free(sub_state->tile->frame->rec);
-          sub_state->tile->frame->rec = NULL;
-        }
-        
-        assert(!sub_state->tile->frame->source);
-        assert(!sub_state->tile->frame->rec);
-        sub_state->tile->frame->source = kvz_image_make_subimage(main_state->tile->frame->source, offset_x, offset_y, width, height);
-        sub_state->tile->frame->rec = kvz_image_make_subimage(main_state->tile->frame->rec, offset_x, offset_y, width, height);
+
+        kvz_image_free(sub_state->tile->frame->source);
+        sub_state->tile->frame->source = NULL;
+
+        kvz_image_free(sub_state->tile->frame->rec);
+        sub_state->tile->frame->rec = NULL;
+
+        kvz_cu_array_free(&sub_state->tile->frame->cu_array);
+
+        sub_state->tile->frame->source = kvz_image_make_subimage(
+            main_state->tile->frame->source,
+            offset_x,
+            offset_y,
+            width,
+            height
+        );
+        sub_state->tile->frame->rec = kvz_image_make_subimage(
+            main_state->tile->frame->rec,
+            offset_x,
+            offset_y,
+            width,
+            height
+        );
+        sub_state->tile->frame->cu_array = kvz_cu_subarray(
+            main_state->tile->frame->cu_array,
+            offset_x,
+            offset_y,
+            sub_state->tile->frame->width_in_lcu * LCU_WIDTH,
+            sub_state->tile->frame->height_in_lcu * LCU_WIDTH
+        );
       }
-      
+
       //To be the last split, we require that every child is a chain
-      node_is_the_last_split_in_tree = node_is_the_last_split_in_tree && encoder_state_tree_is_a_chain(&main_state->childreni);
+      node_is_the_last_split_in_tree =
+        node_is_the_last_split_in_tree &&
+        encoder_state_tree_is_a_chain(&main_state->childreni);
     }
     //If it's the latest split point
     if (node_is_the_last_split_in_tree) {
-      for (i=0; main_state->childreni.encoder_control; ++i) {
+      for (int i = 0; main_state->childreni.encoder_control; ++i) {
         //If we don't have wavefronts, parallelize encoding of children.
         if (main_state->childreni.type != ENCODER_STATE_TYPE_WAVEFRONT_ROW) {
-#ifdef KVZ_DEBUG
-          char job_description256;
-          switch (main_state->childreni.type) {
-            case ENCODER_STATE_TYPE_TILE: 
-              sprintf(job_description, "type=encode_child,frame=%d,tile=%d,row=%d-%d,px_x=%d-%d,px_y=%d-%d", main_state->childreni.frame->num, main_state->childreni.tile->id, main_state->childreni.lcu_order0.position.y + main_state->childreni.tile->lcu_offset_y, main_state->childreni.lcu_order0.position.y + main_state->childreni.tile->lcu_offset_y, 
-                      main_state->childreni.lcu_order0.position_px.x + main_state->childreni.tile->lcu_offset_x * LCU_WIDTH, main_state->childreni.lcu_ordermain_state->childreni.lcu_order_count-1.position_px.x + main_state->childreni.lcu_ordermain_state->childreni.lcu_order_count-1.size.x + main_state->childreni.tile->lcu_offset_x * LCU_WIDTH - 1,
-                      main_state->childreni.lcu_order0.position_px.y + main_state->childreni.tile->lcu_offset_y * LCU_WIDTH, main_state->childreni.lcu_ordermain_state->childreni.lcu_order_count-1.position_px.y + main_state->childreni.lcu_ordermain_state->childreni.lcu_order_count-1.size.y + main_state->childreni.tile->lcu_offset_y * LCU_WIDTH - 1);
-              break;
-            case ENCODER_STATE_TYPE_SLICE:
-              sprintf(job_description, "type=encode_child,frame=%d,slice=%d,start_in_ts=%d", main_state->childreni.frame->num, main_state->childreni.slice->id, main_state->childreni.slice->start_in_ts);
-              break;
-            default:
-              sprintf(job_description, "type=encode_child,frame=%d,invalid", main_state->childreni.frame->num);
-              break;
-          }
-#else
-          char* job_description = NULL;
-#endif
-          main_state->childreni.tqj_recon_done = kvz_threadqueue_submit(main_state->encoder_control->threadqueue, encoder_state_worker_encode_children, &(main_state->childreni), 1, job_description);
-          if (main_state->childreni.previous_encoder_state != &main_state->childreni && main_state->childreni.previous_encoder_state->tqj_recon_done && !main_state->childreni.frame->is_idr_frame) {
+          kvz_threadqueue_free_job(&main_state->childreni.tqj_recon_done);
+          main_state->childreni.tqj_recon_done =
+            kvz_threadqueue_job_create(encoder_state_worker_encode_children, &main_state->childreni);
+          if (main_state->childreni.previous_encoder_state != &main_state->childreni &&
+              main_state->childreni.previous_encoder_state->tqj_recon_done &&
+              !main_state->childreni.frame->is_irap)
+          {
 #if 0
             // Disabled due to non-determinism.
             if (main_state->encoder_control->cfg->mv_constraint == KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN)
@@ -691,70 +918,15 @@
               }
             }
           }
-          kvz_threadqueue_job_unwait_job(main_state->encoder_control->threadqueue, main_state->childreni.tqj_recon_done);
+          kvz_threadqueue_submit(main_state->encoder_control->threadqueue, main_state->childreni.tqj_recon_done);
         } else {
           //Wavefront rows have parallelism at LCU level, so we should not launch multiple threads here!
           //FIXME: add an assert: we can only have wavefront children
           encoder_state_worker_encode_children(&(main_state->childreni));
         }
       }
-      
-      // Add SAO reconstruction jobs and their dependancies when using WPP coding.
-      if (main_state->encoder_control->cfg.sao_enable && 
-          main_state->children0.type == ENCODER_STATE_TYPE_WAVEFRONT_ROW)
-      {
-        int y;
-        videoframe_t * const frame = main_state->tile->frame;
-        threadqueue_job_t *previous_job = NULL;
-        
-        for (y = 0; y < frame->height_in_lcu; ++y) {
-          // Queue a single job performing SAO reconstruction for the whole wavefront row.
-
-          worker_sao_reconstruct_lcu_data *data = MALLOC(worker_sao_reconstruct_lcu_data, 1);
-          threadqueue_job_t *job;
-#ifdef KVZ_DEBUG
-          char job_description256;
-          sprintf(job_description, "type=sao,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d", main_state->frame->num, main_state->tile->id, main_state->tile->lcu_offset_x * LCU_WIDTH, main_state->tile->lcu_offset_x * LCU_WIDTH + main_state->tile->frame->width - 1, (main_state->tile->lcu_offset_y + y) * LCU_WIDTH, MIN(main_state->tile->lcu_offset_y * LCU_WIDTH + main_state->tile->frame->height, (main_state->tile->lcu_offset_y + y + 1) * LCU_WIDTH)-1);
-#else
-          char* job_description = NULL;
-#endif
-          data->y = y;
-          data->encoder_state = main_state;
-          
-          job = kvz_threadqueue_submit(main_state->encoder_control->threadqueue, encoder_state_worker_sao_reconstruct_lcu, data, 1, job_description);
-          
-          // This dependancy is needed, because the pre-SAO pixels from the LCU row
-          // below this one are read straigh from the frame.
-          if (previous_job) {
-            kvz_threadqueue_job_dep_add(job, previous_job);
-          }
-          previous_job = job;
-          
-          // This depepndancy ensures that the bottom edge of this LCU row
-          // has been fully deblocked.
-          if (y < frame->height_in_lcu - 1) {
-            // Not last row: depend on the last LCU of the row below.
-            kvz_threadqueue_job_dep_add(job, main_state->tile->wf_jobs(y + 1) * frame->width_in_lcu + frame->width_in_lcu - 1);
-          } else {
-            // Last row: depend on the last LCU of the row
-            kvz_threadqueue_job_dep_add(job, main_state->tile->wf_jobs(y + 0) * frame->width_in_lcu + frame->width_in_lcu - 1);
-          }
-          kvz_threadqueue_job_unwait_job(main_state->encoder_control->threadqueue, job);
-          
-          // The wavefront row is finished, when the SAO-reconstruction is
-          // finished.
-          main_state->childreny.tqj_recon_done = job;
-          
-          if (y == frame->height_in_lcu - 1) {
-            // This tile is finished, when the reconstruction of the last
-            // WPP-row is finished.
-            assert(!main_state->tqj_recon_done);
-            main_state->tqj_recon_done = job;
-          }
-        }
-      }
     } else {
-      for (i=0; main_state->childreni.encoder_control; ++i) {
+      for (int i = 0; main_state->childreni.encoder_control; ++i) {
         encoder_state_worker_encode_children(&(main_state->childreni));
       }
     }
@@ -773,81 +945,50 @@
 }
 
 
-static void encoder_ref_insertion_sort(int reflist16, int length) {
+static void encoder_ref_insertion_sort(const encoder_state_t *const state, uint8_t reflist16, uint8_t length) {
 
   for (uint8_t i = 1; i < length; ++i) {
-    const int16_t cur_poc = reflisti;
-    int16_t j = i;
-    while (j > 0 && cur_poc < reflistj - 1) {
+    const uint8_t cur_idx = reflisti;
+    const int32_t cur_poc = state->frame->ref->pocscur_idx;
+    int8_t j = i;
+    while (j > 0 && cur_poc > state->frame->ref->pocsreflistj - 1) {
       reflistj = reflistj - 1;
       --j;
     }
-    reflistj = cur_poc;
+    reflistj = cur_idx;
   }
 }
 
 /**
- * \brief Return reference picture lists.
+ * \brief Generate reference picture lists.
  *
  * \param state             main encoder state
- * \param ref_list_len_out  Returns the lengths of the reference lists.
- * \param ref_list_poc_out  Returns two lists of POCs of the reference pictures.
  */
-void kvz_encoder_get_ref_lists(const encoder_state_t *const state,
-                               int ref_list_len_out2,
-                               int ref_list_poc_out216)
+void kvz_encoder_create_ref_lists(const encoder_state_t *const state)
 {
-  FILL_ARRAY(ref_list_len_out, 0, 2);
+  // TODO check possibility to add L0 references to L1 list also
+  
+  FILL_ARRAY(state->frame->ref_LX_size, 0, 2);
 
   // List all pocs of lists
   int j = 0;
   for (j = 0; j < state->frame->ref->used_size; j++) {
     if (state->frame->ref->pocsj < state->frame->poc) {
-      ref_list_poc_out0ref_list_len_out0 = state->frame->ref->pocsj;
-      ref_list_len_out0++;
+      state->frame->ref_LX0state->frame->ref_LX_size0 = j;
+      state->frame->ref_LX_size0 += 1;
     } else {
-      ref_list_poc_out1ref_list_len_out1 = state->frame->ref->pocsj;
-      ref_list_len_out1++;
+      state->frame->ref_LX1state->frame->ref_LX_size1 = j;
+      state->frame->ref_LX_size1 += 1;
     }
   }
 
-  // Fill the rest of ref_list_poc_out array with -1s.
+  // Fill the rest with -1s.
   for (; j < 16; j++) {
-    ref_list_poc_out0j = -1;
-    ref_list_poc_out1j = -1;
+    state->frame->ref_LX0j = (uint8_t) -1;
+    state->frame->ref_LX1j = (uint8_t) -1;
   }
 
-  encoder_ref_insertion_sort(ref_list_poc_out0, ref_list_len_out0);
-  encoder_ref_insertion_sort(ref_list_poc_out1, ref_list_len_out1);
-}
-
-static void encoder_state_ref_sort(encoder_state_t *state) {
-  int ref_list_len2;
-  int ref_list_poc216;
-
-  kvz_encoder_get_ref_lists(state, ref_list_len, ref_list_poc);
-
-  for (int j = 0; j < state->frame->ref->used_size; j++) {
-    if (state->frame->ref->pocsj < state->frame->poc) {
-      for (int ref_idx = 0; ref_idx < ref_list_len0; ref_idx++) {
-        if (ref_list_poc0ref_idx == state->frame->ref->pocsj) {
-          state->frame->refmapj.idx = ref_list_len0 - ref_idx - 1;
-          break;
-        }
-      }
-      state->frame->refmapj.list = 1;
-
-    } else {
-      for (int ref_idx = 0; ref_idx < ref_list_len1; ref_idx++) {
-        if (ref_list_poc1ref_idx == state->frame->ref->pocsj) {
-          state->frame->refmapj.idx = ref_idx;
-          break;
-        }
-      }
-      state->frame->refmapj.list = 2;
-    }
-    state->frame->refmapj.poc = state->frame->ref->pocsj;
-  }
+  encoder_ref_insertion_sort(state, state->frame->ref_LX0, state->frame->ref_LX_size0);
 }
 
 /**
@@ -855,7 +996,7 @@
  */
 static void encoder_state_remove_refs(encoder_state_t *state) {
   const encoder_control_t * const encoder = state->encoder_control;
-  
+
   int neg_refs = encoder->cfg.gopstate->frame->gop_offset.ref_neg_count;
   int pos_refs = encoder->cfg.gopstate->frame->gop_offset.ref_pos_count;
 
@@ -865,7 +1006,10 @@
   } else {
     target_ref_num = encoder->cfg.ref_frames;
   }
-  if (state->frame->slicetype == KVZ_SLICE_I) {
+
+  if (state->frame->pictype == KVZ_NAL_IDR_W_RADL ||
+      state->frame->pictype == KVZ_NAL_IDR_N_LP)
+  {
     target_ref_num = 0;
   }
 
@@ -877,7 +1021,7 @@
       bool is_referenced = false;
 
       int ref_poc = state->frame->ref->pocsref;
-      
+
       for (int i = 0; i < neg_refs; i++) {
         int ref_relative_poc = -encoder->cfg.gopstate->frame->gop_offset.ref_negi;
         if (ref_poc == state->frame->poc + ref_relative_poc) {
@@ -886,7 +1030,6 @@
         }
       }
 
-      
       for (int i = 0; i < pos_refs; i++) {
         int ref_relative_poc = encoder->cfg.gopstate->frame->gop_offset.ref_posi;
         if (ref_poc == state->frame->poc + ref_relative_poc) {
@@ -895,6 +1038,20 @@
         }
       }
 
+      if (ref_poc < state->frame->irap_poc &&
+          state->frame->irap_poc < state->frame->poc)
+      {
+        // Trailing frames cannot refer to leading frames.
+        is_referenced = false;
+      }
+
+      if (encoder->cfg.intra_period > 0 &&
+          ref_poc < state->frame->irap_poc - encoder->cfg.intra_period)
+      {
+        // No frame can refer past the two preceding IRAP frames.
+        is_referenced = false;
+      }
+
       if (!is_referenced) {
         // This reference is not referred to by this frame, it must be removed.
         kvz_image_list_rem(state->frame->ref, ref);
@@ -911,16 +1068,6 @@
   assert(state->frame->ref->used_size <= target_ref_num);
 }
 
-static void encoder_state_reset_poc(encoder_state_t *state) {
-  state->frame->poc = 0;
-  kvz_videoframe_set_poc(state->tile->frame, 0);
-
-  for (int i = 0; state->childreni.encoder_control; ++i) {
-    encoder_state_t *sub_state = &(state->childreni);
-    encoder_state_reset_poc(sub_state);
-  }
-}
-
 static void encoder_set_source_picture(encoder_state_t * const state, kvz_picture* frame)
 {
   assert(!state->tile->frame->source);
@@ -949,8 +1096,8 @@
   }
 
   //Clear the jobs
-  state->tqj_bitstream_written = NULL;
-  state->tqj_recon_done = NULL;
+  kvz_threadqueue_free_job(&state->tqj_bitstream_written);
+  kvz_threadqueue_free_job(&state->tqj_recon_done);
 
   for (int i = 0; state->childreni.encoder_control; ++i) {
     encoder_state_init_children(&state->childreni);
@@ -980,56 +1127,71 @@
 
   encoder_set_source_picture(state, frame);
 
+  assert(!state->tile->frame->cu_array);
+  state->tile->frame->cu_array = kvz_cu_array_alloc(
+      state->tile->frame->width,
+      state->tile->frame->height
+  );
+
+  // Set POC.
   if (state->frame->num == 0) {
-    state->frame->is_idr_frame = true;
-  }  else if (cfg->gop_len) {
-    // Closed GOP / CRA is not yet supported.
-    state->frame->is_idr_frame = false;
-  
+    state->frame->poc = 0;
+  } else if (cfg->gop_len && !cfg->gop_lowdelay) {
     // Calculate POC according to the global frame counter and GOP structure
     int32_t poc = state->frame->num - 1;
     int32_t poc_offset = cfg->gopstate->frame->gop_offset.poc_offset;
     state->frame->poc = poc - poc % cfg->gop_len + poc_offset;
     kvz_videoframe_set_poc(state->tile->frame, state->frame->poc);
+  } else if (cfg->intra_period > 0) {
+    state->frame->poc = state->frame->num % cfg->intra_period;
   } else {
-    bool is_i_idr = (cfg->intra_period == 1 && state->frame->num % 2 == 0);
-    bool is_p_idr = (cfg->intra_period > 1 && (state->frame->num % cfg->intra_period) == 0);
-    state->frame->is_idr_frame = is_i_idr || is_p_idr;
+    state->frame->poc = state->frame->num;
   }
- 
-  if (state->frame->is_idr_frame) {
-    encoder_state_reset_poc(state);
-    state->frame->slicetype = KVZ_SLICE_I;
-    state->frame->pictype = KVZ_NAL_IDR_W_RADL;
+
+  // Check whether the frame is a keyframe or not.
+  if (state->frame->num == 0) {
+    state->frame->is_irap = true;
   } else {
-    if (cfg->intra_period == 1) {
-      state->frame->slicetype = KVZ_SLICE_I;
-    } else if (cfg->gop_len != 0) {
-      state->frame->slicetype = KVZ_SLICE_B;
-    } else {
-      state->frame->slicetype = KVZ_SLICE_P;
-    }
+    state->frame->is_irap =
+      cfg->intra_period > 0 &&
+      (state->frame->poc % cfg->intra_period) == 0;
+  }
+  if (state->frame->is_irap) {
+    state->frame->irap_poc = state->frame->poc;
+  }
 
-    // Use P-slice for lowdelay.
-    if (state->frame->slicetype == KVZ_SLICE_B &&
-        cfg->gop_len > 0 &&
-        cfg->gop_lowdelay) {
-      state->frame->slicetype = KVZ_SLICE_P;
+  // Set pictype.
+  if (state->frame->is_irap) {
+    if (state->frame->num == 0 ||
+        cfg->intra_period == 1 ||
+        cfg->gop_len == 0 ||
+        cfg->gop_lowdelay)
+    {
+      state->frame->pictype = KVZ_NAL_IDR_W_RADL;
+    } else {
+      state->frame->pictype = KVZ_NAL_CRA_NUT;
     }
-
+  } else if (state->frame->poc < state->frame->irap_poc) {
+    state->frame->pictype = KVZ_NAL_RASL_R;
+  } else {
     state->frame->pictype = KVZ_NAL_TRAIL_R;
-    if (state->encoder_control->cfg.gop_len) {
-      if (cfg->intra_period > 1 && (state->frame->poc % cfg->intra_period) == 0) {
-        state->frame->slicetype = KVZ_SLICE_I;
-      }
-    }
-
   }
 
   encoder_state_remove_refs(state);
-  encoder_state_ref_sort(state);
+  kvz_encoder_create_ref_lists(state);
+
+  // Set slicetype.
+  if (state->frame->is_irap) {
+    state->frame->slicetype = KVZ_SLICE_I;
+  } else if (state->frame->ref_LX_size1 > 0) {
+    state->frame->slicetype = KVZ_SLICE_B;
+  } else {
+    state->frame->slicetype = KVZ_SLICE_P;
+  }
 
-  normalize_lcu_weights(state);
+  if (cfg->target_bitrate > 0 && state->frame->num > cfg->owf) {
+    normalize_lcu_weights(state);
+  }
   kvz_set_picture_lambda_and_qp(state);
 
   encoder_state_init_children(state);
@@ -1051,39 +1213,22 @@
 
 void kvz_encode_one_frame(encoder_state_t * const state, kvz_picture* frame)
 {
-  {
-    PERFORMANCE_MEASURE_START(KVZ_PERF_FRAME);
-    encoder_state_init_new_frame(state, frame);
-    PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, state->encoder_control->threadqueue, "type=init_new_frame,frame=%d,poc=%d", state->frame->num, state->frame->poc);
-  }
-  {
-    PERFORMANCE_MEASURE_START(KVZ_PERF_FRAME);
-    encoder_state_encode(state);
-    PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, state->encoder_control->threadqueue, "type=encode,frame=%d", state->frame->num);
-  }
-  //kvz_threadqueue_flush(main_state->encoder_control->threadqueue);
-  {
-    threadqueue_job_t *job;
-#ifdef KVZ_DEBUG
-    char job_description256;
-    sprintf(job_description, "type=write_bitstream,frame=%d", state->frame->num);
-#else
-    char* job_description = NULL;
-#endif
-
-    job = kvz_threadqueue_submit(state->encoder_control->threadqueue, kvz_encoder_state_worker_write_bitstream, (void*) state, 1, job_description);
-    
-    _encode_one_frame_add_bitstream_deps(state, job);
-    if (state->previous_encoder_state != state && state->previous_encoder_state->tqj_bitstream_written) {
-      //We need to depend on previous bitstream generation
-      kvz_threadqueue_job_dep_add(job, state->previous_encoder_state->tqj_bitstream_written);
-    }
-    kvz_threadqueue_job_unwait_job(state->encoder_control->threadqueue, job);
-    assert(!state->tqj_bitstream_written);
-    state->tqj_bitstream_written = job;
+  encoder_state_init_new_frame(state, frame);
+  encoder_state_encode(state);
+
+  threadqueue_job_t *job =
+    kvz_threadqueue_job_create(kvz_encoder_state_worker_write_bitstream, state);
+
+  _encode_one_frame_add_bitstream_deps(state, job);
+  if (state->previous_encoder_state != state && state->previous_encoder_state->tqj_bitstream_written) {
+    //We need to depend on previous bitstream generation
+    kvz_threadqueue_job_dep_add(job, state->previous_encoder_state->tqj_bitstream_written);
   }
+  kvz_threadqueue_submit(state->encoder_control->threadqueue, job);
+  assert(!state->tqj_bitstream_written);
+  state->tqj_bitstream_written = job;
+
   state->frame->done = 0;
-  //kvz_threadqueue_flush(main_state->encoder_control->threadqueue);
 }
 
 
@@ -1105,9 +1250,11 @@
   if (state->frame->num == -1) {
     // We're at the first frame, so don't care about all this stuff.
     state->frame->num = 0;
-    state->frame->poc   = 0;
+    state->frame->poc = 0;
+    state->frame->irap_poc = 0;
     assert(!state->tile->frame->source);
     assert(!state->tile->frame->rec);
+    assert(!state->tile->frame->cu_array);
     state->frame->prepared = 1;
     return;
   }
@@ -1116,13 +1263,13 @@
   encoder_state_t *prev_state = state->previous_encoder_state;
 
   if (state->previous_encoder_state != state) {
-    kvz_cu_array_free(state->tile->frame->cu_array);
-    state->tile->frame->cu_array = NULL;
+    kvz_cu_array_free(&state->tile->frame->cu_array);
     unsigned width  = state->tile->frame->width_in_lcu  * LCU_WIDTH;
     unsigned height = state->tile->frame->height_in_lcu * LCU_WIDTH;
     state->tile->frame->cu_array = kvz_cu_array_alloc(width, height);
 
     kvz_image_list_copy_contents(state->frame->ref, prev_state->frame->ref);
+    kvz_encoder_create_ref_lists(state);
   }
 
   if (!encoder->cfg.gop_len ||
@@ -1136,8 +1283,9 @@
     kvz_image_list_add(state->frame->ref,
                    prev_state->tile->frame->rec,
                    prev_state->tile->frame->cu_array,
-                   prev_state->frame->poc);
-    kvz_cu_array_free(state->tile->frame->cu_array);
+                   prev_state->frame->poc,
+                   prev_state->frame->ref_LX);
+    kvz_cu_array_free(&state->tile->frame->cu_array);
     unsigned height = state->tile->frame->height_in_lcu * LCU_WIDTH;
     unsigned width  = state->tile->frame->width_in_lcu  * LCU_WIDTH;
     state->tile->frame->cu_array = kvz_cu_array_alloc(width, height);
@@ -1146,12 +1294,16 @@
   // Remove source and reconstructed picture.
   kvz_image_free(state->tile->frame->source);
   state->tile->frame->source = NULL;
+
   kvz_image_free(state->tile->frame->rec);
   state->tile->frame->rec = NULL;
 
+  kvz_cu_array_free(&state->tile->frame->cu_array);
+
   // Update POC and frame count.
   state->frame->num = prev_state->frame->num + 1;
-  state->frame->poc   = prev_state->frame->poc   + 1;
+  state->frame->poc = prev_state->frame->poc + 1;
+  state->frame->irap_poc = prev_state->frame->irap_poc;
 
   state->frame->prepared = 1;
 }

kvazaar-1.1.0.tar.gz/src/encoderstate.h -> kvazaar-1.2.0.tar.gz/src/encoderstate.h Changed

@@ -81,6 +81,7 @@
   int32_t num;       /*!< \brief Frame number */
   int32_t poc;       /*!< \brief Picture order count */
   int8_t gop_offset; /*!< \brief Offset in the gop structure */
+  int32_t irap_poc;  /*!< \brief POC of the associated IRAP picture */
 
   /**
    * \brief Frame-level quantization parameter
@@ -91,17 +92,16 @@
   //! \brief quantization factor
   double QP_factor;
 
-  //Current picture available references
+  //! Current pictures available for references
   image_list_t *ref;
   int8_t ref_list;
 
-  struct {
-    int32_t poc;
-    int8_t list;
-    int8_t idx;
-  } refmap16;
-  
-  bool is_idr_frame;
+  //! L0 and L1 reference index list
+  uint8_t ref_LX216;
+  //! L0 reference index list size
+  uint8_t ref_LX_size2;
+
+  bool is_irap;
   uint8_t pictype;
   enum kvz_slice_type slicetype;
 
@@ -153,11 +153,15 @@
   videoframe_t *frame;
   
   int32_t id;
-  
+
   //Tile: offset in LCU for current encoder_state in global coordinates
   int32_t lcu_offset_x;
   int32_t lcu_offset_y;
-  
+
+  //Tile: offset in pixels
+  int32_t offset_x;
+  int32_t offset_y;
+
   //Position of the first element in tile scan in global coordinates
   int32_t lcu_offset_in_ts;
   
@@ -169,18 +173,20 @@
   // LCU-column. They are packed such that each LCU-column index maps to the
   // x-coordinate.
   yuv_t *ver_buf_search;
-  
-  // This is a buffer for the deblocked bottom pixels of every LCU-row in the
-  // tile. They are packed such that each LCU-row index maps to the y-coordinate.
+
+  // This is a buffer for the deblocked bottom pixels of every LCU in the
+  // tile. They are packed such that each LCU-row index maps to the
+  // y-coordinate.
   yuv_t *hor_buf_before_sao;
-  
+
+  // This is a buffer for the deblocked right pixels of every LCU in the
+  // tile. They are packed such that each LCU-column index maps to the
+  // x-coordinate.
+  yuv_t *ver_buf_before_sao;
+
   //Jobs for each individual LCU of a wavefront row.
   threadqueue_job_t **wf_jobs;
 
-  // Instance of encryption generator by tile
-  Crypto_Handle dbs_g;
-  uint32_t m_prev_pos;
-
 } encoder_state_config_tile_t;
 
 typedef struct encoder_state_config_slice_t {
@@ -243,6 +249,10 @@
   bitstream_t stream;
   cabac_data_t cabac;
 
+  // Crypto stuff
+  crypto_handle_t *crypto_hdl;
+  uint32_t crypto_prev_pos;
+
   uint32_t stats_bitstream_length; //Bitstream length written in bytes
 
   //! \brief Lambda for SSE
@@ -263,6 +273,11 @@
    */
   int8_t ref_qp;
 
+  /**
+   * \brief Coeffs for the LCU.
+   */
+  lcu_coeff_t *coeff;
+
   //Jobs to wait for
   threadqueue_job_t * tqj_recon_done; //Reconstruction is done
   threadqueue_job_t * tqj_bitstream_written; //Bitstream is written
@@ -277,9 +292,7 @@
 
 coeff_scan_order_t kvz_get_scan_order(int8_t cu_type, int intra_mode, int depth);
 
-void kvz_encoder_get_ref_lists(const encoder_state_t *const state,
-                               int ref_list_len_out2,
-                               int ref_list_poc_out216);
+void kvz_encoder_create_ref_lists(const encoder_state_t *const state);
 
 lcu_stats_t* kvz_get_lcu_stats(encoder_state_t *state, int lcu_x, int lcu_y);

kvazaar-1.1.0.tar.gz/src/extras/crypto.cpp -> kvazaar-1.2.0.tar.gz/src/extras/crypto.cpp Changed

@@ -1,132 +1,140 @@
 #include <extras/crypto.h>
 
 #ifndef KVZ_SEL_ENCRYPTION
-extern int kvz_make_vs_ignore_crypto_not_having_symbols;
 int kvz_make_vs_ignore_crypto_not_having_symbols = 0;
 #else
+
 #include <cryptopp/aes.h>
 #include <cryptopp/modes.h>
 #include <cryptopp/osrng.h>
-typedef struct AESDecoder {
+
 #if AESEncryptionStreamMode
-        CryptoPP::CFB_Mode<CryptoPP::AES>::Encryption *CFBdec;
+  typedef CryptoPP::CFB_Mode<CryptoPP::AES>::Encryption cipher_t;
 #else
-    CryptoPP::CFB_Mode<CryptoPP::AES>::Decryption *CFBdec;
+  typedef CryptoPP::CFB_Mode<CryptoPP::AES>::Decryption cipher_t;
 #endif
 
-    byte keyCryptoPP::AES::DEFAULT_KEYLENGTH, ivCryptoPP::AES::BLOCKSIZE, out_stream_counterCryptoPP::AES::BLOCKSIZE, counterCryptoPP::AES::BLOCKSIZE;
-    int couter_avail, counter_index, counter_index_pos;
-} AESDecoder;
+struct crypto_handle_t {
+  cipher_t *cipher;
+  byte keyCryptoPP::AES::DEFAULT_KEYLENGTH;
+  byte ivCryptoPP::AES::BLOCKSIZE;
+  byte out_stream_counterCryptoPP::AES::BLOCKSIZE;
+  byte counterCryptoPP::AES::BLOCKSIZE;
+  int couter_avail;
+  int counter_index;
+  int counter_index_pos;
+};
 
 
-AESDecoder* Create() {
-	AESDecoder * AESdecoder = (AESDecoder *)malloc(sizeof(AESDecoder));
-	return AESdecoder;
-}
-void  Init(AESDecoder* AESdecoder) {
-    int init_val32 = {201, 75, 219, 152, 6, 245, 237, 107, 179, 194, 81, 29, 66, 98, 198, 0, 16, 213, 27, 56, 255, 127, 242, 112, 97, 126, 197, 204, 25, 59, 38, 30};
-    for(int i=0;i<16; i++) {
-        AESdecoder->iv i     = init_vali;
-        AESdecoder->counteri = init_val5+i;
-        AESdecoder->keyi     = init_vali+16;
-    }
-#if AESEncryptionStreamMode
-    AESdecoder->CFBdec = new CryptoPP::CFB_Mode<CryptoPP::AES >::Encryption(AESdecoder->key, CryptoPP::AES::DEFAULT_KEYLENGTH, AESdecoder->iv);
-#else
-    AESdecoder->CFBdec = new CryptoPP::CFB_Mode<CryptoPP::AES >::Decryption(AESdecoder->key, CryptoPP::AES::DEFAULT_KEYLENGTH, AESdecoder->iv);
-#endif
-    AESdecoder->couter_avail      = 0;
-    AESdecoder->counter_index     = 0;
-    AESdecoder->counter_index_pos = 0;
-}
+static uint8_t default_IV16 = {201, 75, 219, 152, 6, 245, 237, 107, 179, 194, 81, 29, 66, 98, 198, 0};
+static uint8_t default_key16 = {16, 213, 27, 56, 255, 127, 242, 112, 97, 126, 197, 204, 25, 59, 38, 30};
 
-void DeleteCrypto(AESDecoder * AESdecoder) {
-    if(AESdecoder)
-        free(AESdecoder);
-}
 
-void Decrypt(AESDecoder *AESdecoder, const unsigned char *in_stream, int size_bits, unsigned char  *out_stream) {
-    int nb_bytes = ceil((double)size_bits/8);
-    AESdecoder->CFBdec->ProcessData(out_stream, in_stream, nb_bytes);
-    if(size_bits&7)
-        AESdecoder->CFBdec->SetKeyWithIV(AESdecoder->key, CryptoPP::AES::DEFAULT_KEYLENGTH, AESdecoder->iv);
-    
-}
-void Incr_counter (unsigned char *counter) {
-    counter0++;
-}
+crypto_handle_t* kvz_crypto_create(const kvz_config *cfg)
+{
+  crypto_handle_t* hdl = (crypto_handle_t*)calloc(1, sizeof(crypto_handle_t));
 
-#if AESEncryptionStreamMode
-void Decrypt_counter(AESDecoder * AESdecoder) {
-    AESdecoder->CFBdec->ProcessData(AESdecoder->out_stream_counter, AESdecoder->counter, 16);
-    AESdecoder->couter_avail      = 128;
-    AESdecoder->counter_index     = 15;
-    AESdecoder->counter_index_pos = 8;
-    Incr_counter(AESdecoder->counter);
-}
-#endif
+  uint8_t *key;
+  if(cfg->optional_key!=NULL)
+    key = cfg->optional_key;
+  else
+    key = default_key;
 
-#if AESEncryptionStreamMode
-unsigned int get_key (AESDecoder * AESdecoder, int nb_bits) {
-    unsigned int key_ = 0;
-    if(nb_bits > 32) {
-        printf("The Generator can not generate more than 32 bit %d \n", nb_bits);
-        return 0;
-    }
-    if( !nb_bits )
-        return 0;
-    if(!AESdecoder->couter_avail)
-        Decrypt_counter(AESdecoder);
-
-    if(AESdecoder->couter_avail >= nb_bits)
-        AESdecoder->couter_avail -= nb_bits;
-    else
-        AESdecoder->couter_avail = 0;
-    int nb = 0;
-    while( nb_bits ) {
-        if( nb_bits >= AESdecoder->counter_index_pos )
-            nb = AESdecoder->counter_index_pos;
-        else
-            nb = nb_bits;
-        key_ <<= nb;
-        key_ += (AESdecoder->out_stream_counterAESdecoder->counter_index & ((1<<nb)-1));
-        AESdecoder->out_stream_counterAESdecoder->counter_index >>= nb;
-        nb_bits -= nb;
-
-        if(AESdecoder->counter_index && nb == AESdecoder->counter_index_pos ) {
-            AESdecoder->counter_index--;
-            AESdecoder->counter_index_pos = 8;
-        } else {
-            AESdecoder->counter_index_pos -= nb;
-            if(nb_bits) {
-                Decrypt_counter(AESdecoder);
-                AESdecoder->couter_avail -=  nb_bits;
-            }
-        }
-    }
-    return key_;
-}
-#endif
-Crypto_Handle CreateC() {
-	AESDecoder* AESdecoder = Create();
-	    return AESdecoder;
+  for (int i = 0; i < 16; i++) {
+    hdl->iv i     = default_IVi;
+    hdl->counteri = (i<11)? default_IV5+i : keyi-11;
+    hdl->keyi     = keyi;
+  }
+
+  hdl->cipher = new cipher_t(hdl->key, CryptoPP::AES::DEFAULT_KEYLENGTH, hdl->iv);
+
+  hdl->couter_avail      = 0;
+  hdl->counter_index     = 0;
+  hdl->counter_index_pos = 0;
+
+  return hdl;
 }
 
-void InitC(Crypto_Handle hdl) {
-    Init((AESDecoder*)hdl);
+void kvz_crypto_delete(crypto_handle_t **hdl)
+{
+  if (*hdl) {
+    delete (*hdl)->cipher;
+    (*hdl)->cipher = NULL;
+  }
+  FREE_POINTER(*hdl);
 }
 
+void kvz_crypto_decrypt(crypto_handle_t* hdl,
+                        const uint8_t *in_stream,
+                        int size_bits,
+                        uint8_t *out_stream)
+{
+  int num_bytes = ceil((double)size_bits/8);
+  hdl->cipher->ProcessData(out_stream, in_stream, num_bytes);
+  if (size_bits & 7) {
+    hdl->cipher->SetKeyWithIV(hdl->key, CryptoPP::AES::DEFAULT_KEYLENGTH, hdl->iv);
+  }
+}
 #if AESEncryptionStreamMode
-unsigned int ff_get_key (Crypto_Handle *hdl, int nb_bits) {
-    return get_key ((AESDecoder*)*hdl, nb_bits);
+static void increment_counter(unsigned char *counter)
+{
+  counter0++;
 }
-#endif
-void DecryptC(Crypto_Handle hdl, const unsigned char *in_stream, int size_bits, unsigned char  *out_stream) {
-    Decrypt((AESDecoder*)hdl, in_stream, size_bits, out_stream);
+
+static void decrypt_counter(crypto_handle_t *hdl)
+{
+  hdl->cipher->ProcessData(hdl->out_stream_counter, hdl->counter, 16);
+  hdl->couter_avail      = 128;
+  hdl->counter_index     = 15;
+  hdl->counter_index_pos = 8;
+  increment_counter(hdl->counter);
 }
 
-void DeleteCryptoC(Crypto_Handle hdl) {
-	  DeleteCrypto((AESDecoder *)hdl);
+unsigned kvz_crypto_get_key(crypto_handle_t *hdl, int nb_bits)
+{
+  unsigned key = 0;
+  if (nb_bits > 32) {
+      fprintf(stderr, "The generator cannot generate %d bits (max 32 bits)\n", nb_bits);
+      return 0;
+  }
+  if (nb_bits == 0) return 0;
+
+  if (!hdl->couter_avail) {
+    decrypt_counter(hdl);
+  }
+
+  if(hdl->couter_avail >= nb_bits) {
+      hdl->couter_avail -= nb_bits;
+  } else {
+      hdl->couter_avail = 0;
+  }
+
+  int nb = 0;
+  while (nb_bits) {
+    if (nb_bits >= hdl->counter_index_pos) {
+      nb = hdl->counter_index_pos;
+    } else {
+      nb = nb_bits;
+    }
+
+    key <<= nb;
+    key += hdl->out_stream_counterhdl->counter_index & ((1 << nb) - 1);
+    hdl->out_stream_counterhdl->counter_index >>= nb;
+    nb_bits -= nb;
+
+    if (hdl->counter_index && nb == hdl->counter_index_pos) {
+      hdl->counter_index--;
+      hdl->counter_index_pos = 8;
+    } else {
+      hdl->counter_index_pos -= nb;
+      if (nb_bits) {
+        decrypt_counter(hdl);
+        hdl->couter_avail -=  nb_bits;
+      }
+    }
+  }
+  return key;
 }
+#endif // AESEncryptionStreamMode
 
 #endif // KVZ_SEL_ENCRYPTION

kvazaar-1.1.0.tar.gz/src/extras/crypto.h -> kvazaar-1.2.0.tar.gz/src/extras/crypto.h Changed

@@ -2,6 +2,10 @@
 #define CRYPTO_H_
 
 #include "global.h"
+#include "../cfg.h"
+
+#include <stdio.h>
+#include <math.h>
 
 #ifdef KVZ_SEL_ENCRYPTION
 #define STUBBED extern
@@ -9,73 +13,60 @@
 #define STUBBED static
 #endif
 
-#include <stdio.h>
-#include <math.h>
-#define AESEncryptionStreamMode      1
+#define AESEncryptionStreamMode 1
+
 #ifdef __cplusplus
 extern "C" {
 #endif
-    typedef void* Crypto_Handle;
-    STUBBED Crypto_Handle CreateC();
-    STUBBED void InitC(Crypto_Handle hdl);
-    STUBBED void DecryptC(Crypto_Handle hdl, const unsigned char *in_stream, int size_bits, unsigned char  *out_stream);
+
+typedef struct crypto_handle_t crypto_handle_t;
+
+STUBBED crypto_handle_t* kvz_crypto_create(const kvz_config *cfg);
+STUBBED void kvz_crypto_decrypt(crypto_handle_t* hdl,
+                                const uint8_t *in_stream,
+                                int size_bits,
+                                uint8_t *out_stream);
+STUBBED void kvz_crypto_delete(crypto_handle_t **hdl);
+
 #if AESEncryptionStreamMode
-    STUBBED unsigned int ff_get_key(Crypto_Handle *hdl, int nb_bits);
+STUBBED unsigned kvz_crypto_get_key(crypto_handle_t *hdl, int num_bits);
 #endif
-    STUBBED void DeleteCryptoC(Crypto_Handle hdl);
 
 #ifdef __cplusplus
 }
 #endif
 
+#undef STUBBED
+
 
 #ifndef KVZ_SEL_ENCRYPTION
-// Provide static stubs to allow linking without libcryptopp and allows us to
-// avoid sprinkling ifdefs everywhere and having a bunch of code that's not
-// compiled during normal development.
+// Provide static stubs to allow linking without libcryptopp and allows us
+// to avoid sprinkling ifdefs everywhere and having a bunch of code that's
+// not compiled during normal development.
 // Provide them in the header so we can avoid compiling the cpp file, which
 // means we don't need a C++ compiler when crypto is not enabled.
 
-#include <stdio.h>
-#include <stdint.h>
-#include <inttypes.h>
-
-static uintptr_t handle_id = 1;
-
-static INLINE Crypto_Handle CreateC() {
-  printf("Crypto CreateC %" PRIuPTR "\n", handle_id);
-  return (void*)(handle_id++);
-}
-static INLINE void InitC(Crypto_Handle hdl) {
-  printf("Crypto InitC %" PRIuPTR "\n", (uintptr_t)hdl);
-}
-
-static INLINE void DecryptC(Crypto_Handle hdl, const unsigned char *in_stream,
-              int size_bits, unsigned char  *out_stream)
+static INLINE crypto_handle_t* kvz_crypto_create(const kvz_config *cfg)
 {
-  // Stub.
-  printf("Crypto DecryptC %" PRIuPTR "\n", (uintptr_t)hdl);
+  return NULL;
 }
 
+static INLINE void kvz_crypto_decrypt(crypto_handle_t* hdl,
+                                const uint8_t *in_stream,
+                                int size_bits,
+                                uint8_t *out_stream)
+{}
+
+static INLINE void kvz_crypto_delete(crypto_handle_t **hdl)
+{}
+
 #if AESEncryptionStreamMode
-static INLINE unsigned int ff_get_key(Crypto_Handle *hdl, int nb_bits)
+static INLINE unsigned kvz_crypto_get_key(crypto_handle_t *hdl, int num_bits)
 {
-  // Stub.
-  static Crypto_Handle ff_get_key_last_hdl = 0;
-  if (*hdl != ff_get_key_last_hdl) {
-    printf("Crypto ff_get_key %" PRIuPTR "\n", (uintptr_t)*hdl);
-  }
-  ff_get_key_last_hdl = *hdl;
   return 0;
 }
 #endif
 
-static INLINE void DeleteCryptoC(Crypto_Handle hdl)
-{
-  // Stub.
-  printf("Crypto DeleteCryptoC %" PRIuPTR "\n", (uintptr_t)hdl);
-}
-
 #endif // KVZ_SEL_ENCRYPTION
 
 #endif // CRYPTO_H_

kvazaar-1.1.0.tar.gz/src/filter.c -> kvazaar-1.2.0.tar.gz/src/filter.c Changed

@@ -168,7 +168,7 @@
   int16_t m4 = src[0];
   int16_t m5 = src[offset];
 
-  delta = CLIP(-tc,tc, (((m4 - m3) << 2) + m2 - m5 + 4 ) >> 3);
+  delta = CLIP(-tc,tc, (((m4 - m3) * 4) + m2 - m5 + 4 ) >> 3);
   if(!part_P_nofilter) {
     src[-offset] = CLIP(0, (1 << encoder->bitdepth) - 1, m3 + delta);
   }
@@ -262,9 +262,7 @@
 
 static int8_t get_qp_y_pred(const encoder_state_t* state, int x, int y, edge_dir dir)
 {
-  if (state->encoder_control->cfg.target_bitrate <= 0
-      && state->encoder_control->cfg.roi.dqps == NULL)
-  {
+  if (!state->encoder_control->lcu_dqp_enabled) {
     return state->qp;
   }
 
@@ -403,10 +401,13 @@
           // Non-zero residual/coeffs and transform boundary
           // Neither CU is intra so tr_depth <= MAX_DEPTH.
           strength = 1;       
-        } else if (cu_p->inter.mv_dir != 3 && cu_q->inter.mv_dir != 3 && ((abs(cu_q->inter.mv[cu_q->inter.mv_dir - 1][0] - cu_p->inter.mv[cu_p->inter.mv_dir - 1][0]) >= 4) || (abs(cu_q->inter.mv[cu_q->inter.mv_dir - 1][1] - cu_p->inter.mv[cu_p->inter.mv_dir - 1][1]) >= 4))) {
+        } else if (cu_p->inter.mv_dir != 3 && cu_q->inter.mv_dir != 3 &&
+                 ((abs(cu_q->inter.mv[cu_q->inter.mv_dir - 1][0] - cu_p->inter.mv[cu_p->inter.mv_dir - 1][0]) >= 4) ||
+                  (abs(cu_q->inter.mv[cu_q->inter.mv_dir - 1][1] - cu_p->inter.mv[cu_p->inter.mv_dir - 1][1]) >= 4))) {
           // Absolute motion vector diff between blocks >= 1 (Integer pixel)
           strength = 1;
-        } else if (cu_p->inter.mv_dir != 3 && cu_q->inter.mv_dir != 3 && cu_q->inter.mv_ref[cu_q->inter.mv_dir - 1] != cu_p->inter.mv_ref[cu_p->inter.mv_dir - 1]) {
+        } else if (cu_p->inter.mv_dir != 3 && cu_q->inter.mv_dir != 3 &&
+                   cu_q->inter.mv_ref[cu_q->inter.mv_dir - 1] != cu_p->inter.mv_ref[cu_p->inter.mv_dir - 1]) {
           strength = 1;
         }
         
@@ -431,10 +432,10 @@
             cu_p->inter.mv[1][0] = 0;
             cu_p->inter.mv[1][1] = 0;
           }
-          const int refP0 = (cu_p->inter.mv_dir & 1) ? cu_p->inter.mv_ref[0] : -1;
-          const int refP1 = (cu_p->inter.mv_dir & 2) ? cu_p->inter.mv_ref[1] : -1;
-          const int refQ0 = (cu_q->inter.mv_dir & 1) ? cu_q->inter.mv_ref[0] : -1;
-          const int refQ1 = (cu_q->inter.mv_dir & 2) ? cu_q->inter.mv_ref[1] : -1;
+          const int refP0 = (cu_p->inter.mv_dir & 1) ? state->frame->ref_LX[0][cu_p->inter.mv_ref[0]] : -1;
+          const int refP1 = (cu_p->inter.mv_dir & 2) ? state->frame->ref_LX[1][cu_p->inter.mv_ref[1]] : -1;
+          const int refQ0 = (cu_q->inter.mv_dir & 1) ? state->frame->ref_LX[0][cu_q->inter.mv_ref[0]] : -1;
+          const int refQ1 = (cu_q->inter.mv_dir & 2) ? state->frame->ref_LX[1][cu_q->inter.mv_ref[1]] : -1;
           const int16_t* mvQ0 = cu_q->inter.mv[0];
           const int16_t* mvQ1 = cu_q->inter.mv[1];

kvazaar-1.1.0.tar.gz/src/global.h -> kvazaar-1.2.0.tar.gz/src/global.h Changed

@@ -117,10 +117,6 @@
 //! Search is started at depth 0 and goes in Z-order to MAX_PU_DEPTH, see search_cu()
 #define MAX_PU_DEPTH 4
 
-//! Minimum log2 transform sizes.
-//! spec: max_transform_hierarchy_depth_inter
-#define TR_DEPTH_INTER 2
-
 //! spec: pcm_enabled_flag, Setting to 1 will enable using PCM blocks (current intra-search does not consider PCM)
 #define ENABLE_PCM 0
 
@@ -150,6 +146,28 @@
 #define LCU_LUMA_SIZE (LCU_WIDTH * LCU_WIDTH)
 #define LCU_CHROMA_SIZE (LCU_WIDTH * LCU_WIDTH >> 2)
 
+/**
+ * \brief Number of pixels to delay deblocking.
+ *
+ * Number of pixels at the bottom and right side of the LCU that are not
+ * deblocked until when filtering the neighboring LCU. The last four chroma
+ * pixels of the horizontal edges within the LCU are deblocked with the LCU
+ * to the right. Therefore, DEBLOCK_DELAY_PX is set to 8 pixels.
+ */
+#define DEBLOCK_DELAY_PX 8
+
+/**
+ * \brief Number of pixels to delay SAO in horizontal and vertical
+ * directions.
+ *
+ * Number of pixels at the bottom and right side of the LCU that are not
+ * filtered with SAO until when filtering the neighboring LCU. SAO
+ * reconstruction requires that a one pixels border has been deblocked for
+ * both luma and chroma.  Therefore, SAO_DELAY_PX is set to
+ * DEBLOCK_DELAY_PX + 2.
+ */
+#define SAO_DELAY_PX (DEBLOCK_DELAY_PX + 2)
+
 #define MAX_REF_PIC_COUNT 16
 
 #define AMVP_MAX_NUM_CANDS 2
@@ -162,6 +180,7 @@
 #define MIN(a,b) (((a)<(b))?(a):(b))
 #define CLIP(low,high,value) MAX((low),MIN((high),(value)))
 #define CLIP_TO_PIXEL(value) CLIP(0, PIXEL_MAX, (value))
+#define CLIP_TO_QP(value) CLIP(0, 51, (value))
 #define SWAP(a,b,swaptype) { swaptype tempval; tempval = a; a = b; b = tempval; }
 #define CU_WIDTH_FROM_DEPTH(depth) (LCU_WIDTH >> depth)
 #define WITHIN(val, min_val, max_val) ((min_val) <= (val) && (val) <= (max_val))
@@ -181,7 +200,7 @@
 // NOTE: When making a release, check to see if incrementing libversion in 
 // configure.ac is necessary.
 #ifndef KVZ_VERSION
-#define KVZ_VERSION 1.1.0
+#define KVZ_VERSION 1.2.0
 #endif
 #define VERSION_STRING QUOTE_EXPAND(KVZ_VERSION)
 
@@ -206,6 +225,12 @@
 #define SIMD_ALIGNMENT 32
 
 #ifdef _MSC_VER
+  #define ALIGNED(alignment) __declspec(align(alignment))
+#else
+  #define ALIGNED(alignment) __attribute__((aligned (alignment)))
+#endif
+
+#ifdef _MSC_VER
 // Buggy VS2010 throws intellisense warnings if void* is not casted.
   #define MALLOC(type, num) (type *)malloc(sizeof(type) * (num))
 #else
@@ -219,7 +244,11 @@
 // Fill a structure or a static array with val bytes.
 #define FILL(var, val) memset(&(var), (val), sizeof(var))
 // Fill a number of elements in an array with val bytes.
-#define FILL_ARRAY(ar, val, size) memset((ar), (val), (size) * sizeof(*(ar)))
+#define FILL_ARRAY(ar, val, size) \
+{\
+  void *temp_ptr = (void*)(ar);\
+  memset((temp_ptr), (val), (size) * sizeof(*(ar)));\
+}
 
 #define FREE_POINTER(pointer) { free((void*)pointer); pointer = NULL; }
 #define MOVE_POINTER(dst_pointer,src_pointer) { dst_pointer = src_pointer; src_pointer = NULL; }

kvazaar-1.1.0.tar.gz/src/image.c -> kvazaar-1.2.0.tar.gz/src/image.c Changed

@@ -23,6 +23,7 @@
 #include <limits.h>
 #include <stdlib.h>
 
+#include "strategies/strategies-ipol.h"
 #include "strategies/strategies-picture.h"
 #include "threads.h"
 
@@ -191,12 +192,14 @@
   return yuv;
 }
 
-void kvz_yuv_t_free(yuv_t * yuv)
+void kvz_yuv_t_free(yuv_t *yuv)
 {
-  free(yuv->y);
-  free(yuv->u);
-  free(yuv->v);
-  free(yuv);
+  if (yuv) {
+    FREE_POINTER(yuv->y);
+    FREE_POINTER(yuv->u);
+    FREE_POINTER(yuv->v);
+  }
+  FREE_POINTER(yuv);
 }
 
 hi_prec_buf_t * kvz_hi_prec_buf_t_alloc(int luma_size)
@@ -447,21 +450,19 @@
 * \param pic        Image for the block we are trying to find.
 * \param ref        Image where we are trying to find the block.
 *
-* \returns  
+* \returns          Sum of absolute differences
 */
-unsigned kvz_image_calc_sad(const kvz_picture *pic, const kvz_picture *ref, int pic_x, int pic_y, int ref_x, int ref_y,
-                        int block_width, int block_height, int max_px_below_lcu) {
+unsigned kvz_image_calc_sad(const kvz_picture *pic,
+                            const kvz_picture *ref,
+                            int pic_x,
+                            int pic_y,
+                            int ref_x,
+                            int ref_y,
+                            int block_width,
+                            int block_height)
+{
   assert(pic_x >= 0 && pic_x <= pic->width - block_width);
   assert(pic_y >= 0 && pic_y <= pic->height - block_height);
-  
-  // Check that we are not referencing pixels that are not final.
-  if (max_px_below_lcu >= 0) {
-    int next_lcu_row_px = ((pic_y >> LOG2_LCU_WIDTH) + 1) << LOG2_LCU_WIDTH;
-    int px_below_lcu = ref_y + block_height - next_lcu_row_px;
-    if (px_below_lcu > max_px_below_lcu) {
-      return INT_MAX;
-    }
-  }
 
   if (ref_x >= 0 && ref_x <= ref->width  - block_width &&
       ref_y >= 0 && ref_y <= ref->height - block_height)
@@ -479,6 +480,74 @@
 
 
 /**
+* \brief Calculate interpolated SATD between two blocks.
+*
+* \param pic        Image for the block we are trying to find.
+* \param ref        Image where we are trying to find the block.
+*/
+unsigned kvz_image_calc_satd(const kvz_picture *pic,
+                             const kvz_picture *ref,
+                             int pic_x,
+                             int pic_y,
+                             int ref_x,
+                             int ref_y,
+                             int block_width,
+                             int block_height)
+{
+  assert(pic_x >= 0 && pic_x <= pic->width - block_width);
+  assert(pic_y >= 0 && pic_y <= pic->height - block_height);
+
+  if (ref_x >= 0 && ref_x <= ref->width  - block_width &&
+      ref_y >= 0 && ref_y <= ref->height - block_height)
+  {
+    // Reference block is completely inside the frame, so just calculate the
+    // SAD directly. This is the most common case, which is why it's first.
+    const kvz_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x];
+    const kvz_pixel *ref_data = &ref->y[ref_y * ref->stride + ref_x];
+    return kvz_satd_any_size(block_width,
+                             block_height,
+                             pic_data,
+                             pic->stride,
+                             ref_data,
+                             ref->stride) >> (KVZ_BIT_DEPTH - 8);
+  } else {
+    // Extrapolate pixels from outside the frame.
+    kvz_extended_block block;
+    kvz_get_extended_block(pic_x,
+                           pic_y,
+                           ref_x - pic_x,
+                           ref_y - pic_y,
+                           0,
+                           0,
+                           ref->y,
+                           ref->width,
+                           ref->height,
+                           0,
+                           block_width,
+                           block_height,
+                           &block);
+
+    const kvz_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x];
+
+    unsigned satd = kvz_satd_any_size(block_width,
+                                      block_height,
+                                      pic_data,
+                                      pic->stride,
+                                      block.buffer,
+                                      block.stride) >> (KVZ_BIT_DEPTH - 8);
+
+    if (block.malloc_used) {
+      FREE_POINTER(block.buffer);
+    }
+
+    return satd;
+  }
+}
+
+
+
+
+/**
  * \brief BLock Image Transfer from one buffer to another.
  *
  * It's a stupidly simple loop that copies pixels.

kvazaar-1.1.0.tar.gz/src/image.h -> kvazaar-1.2.0.tar.gz/src/image.h Changed

@@ -74,8 +74,24 @@
 
 
 //Algorithms
-unsigned kvz_image_calc_sad(const kvz_picture *pic, const kvz_picture *ref, int pic_x, int pic_y, int ref_x, int ref_y,
-                        int block_width, int block_height, int max_lcu_below);
+unsigned kvz_image_calc_sad(const kvz_picture *pic,
+                            const kvz_picture *ref,
+                            int pic_x,
+                            int pic_y,
+                            int ref_x,
+                            int ref_y,
+                            int block_width,
+                            int block_height);
+
+
+unsigned kvz_image_calc_satd(const kvz_picture *pic,
+                             const kvz_picture *ref,
+                             int pic_x,
+                             int pic_y,
+                             int ref_x,
+                             int ref_y,
+                             int block_width,
+                             int block_height);
 
 
 void kvz_pixels_blit(const kvz_pixel* orig, kvz_pixel *dst,

kvazaar-1.1.0.tar.gz/src/imagelist.c -> kvazaar-1.2.0.tar.gz/src/imagelist.c Changed

@@ -36,9 +36,10 @@
 {
   image_list_t *list = (image_list_t *)malloc(sizeof(image_list_t));
   list->size      = size;
-  list->images    = malloc(sizeof(kvz_picture*) * size);
-  list->cu_arrays = malloc(sizeof(cu_array_t*)  * size);
-  list->pocs      = malloc(sizeof(int32_t)      * size);
+  list->images    = malloc(sizeof(kvz_picture*)  * size);
+  list->cu_arrays = malloc(sizeof(cu_array_t*)   * size);
+  list->pocs      = malloc(sizeof(int32_t)       * size);
+  list->ref_LXs   = malloc(sizeof(*list->ref_LXs) * size);
   list->used_size = 0;
 
   return list;
@@ -55,6 +56,7 @@
   list->images = (kvz_picture**)realloc(list->images, sizeof(kvz_picture*) * size);
   list->cu_arrays = (cu_array_t**)realloc(list->cu_arrays, sizeof(cu_array_t*) * size);
   list->pocs = realloc(list->pocs, sizeof(int32_t) * size);
+  list->ref_LXs = realloc(list->ref_LXs, sizeof(*list->ref_LXs) * size);
   list->size = size;
   return size == 0 || (list->images && list->cu_arrays && list->pocs);
 }
@@ -71,9 +73,13 @@
     for (i = 0; i < list->used_size; ++i) {
       kvz_image_free(list->images[i]);
       list->images[i] = NULL;
-      kvz_cu_array_free(list->cu_arrays[i]);
+      kvz_cu_array_free(&list->cu_arrays[i]);
       list->cu_arrays[i] = NULL;
       list->pocs[i] = 0;
+      for (int j = 0; j < 16; j++) {
+        list->ref_LXs[i][0][j] = 0;
+        list->ref_LXs[i][1][j] = 0;
+      }
     }
   }
 
@@ -81,10 +87,12 @@
     free(list->images);
     free(list->cu_arrays);
     free(list->pocs);
+    free(list->ref_LXs);
   }
   list->images = NULL;
   list->cu_arrays = NULL;
   list->pocs = NULL;
+  list->ref_LXs = NULL;
   free(list);
   return 1;
 }
@@ -95,7 +103,7 @@
  * \param picture_list list to use
  * \return 1 on success
  */
-int kvz_image_list_add(image_list_t *list, kvz_picture *im, cu_array_t *cua, int32_t poc)
+int kvz_image_list_add(image_list_t *list, kvz_picture *im, cu_array_t *cua, int32_t poc, uint8_t ref_LX[2][16])
 {
   int i = 0;
   if (KVZ_ATOMIC_INC(&(im->refcount)) == 1) {
@@ -119,11 +127,19 @@
     list->images[i] = list->images[i - 1];
     list->cu_arrays[i] = list->cu_arrays[i - 1];
     list->pocs[i] = list->pocs[i - 1];
+    for (int j = 0; j < 16; j++) {
+      list->ref_LXs[i][0][j] = list->ref_LXs[i - 1][0][j];
+      list->ref_LXs[i][1][j] = list->ref_LXs[i - 1][1][j];
+    }
   }
 
   list->images[0] = im;
   list->cu_arrays[0] = cua;
   list->pocs[0] = poc;
+  for (int j = 0; j < 16; j++) {
+    list->ref_LXs[0][0][j] = ref_LX[0][j];
+    list->ref_LXs[0][1][j] = ref_LX[1][j];
+  }
   
   list->used_size++;
   return 1;
@@ -145,17 +161,17 @@
 
   kvz_image_free(list->images[n]);
 
-  if (!kvz_cu_array_free(list->cu_arrays[n])) {
-    fprintf(stderr, "Could not free cu_array!\n");
-    assert(0); //Stop here
-    return 0;
-  }
+  kvz_cu_array_free(&list->cu_arrays[n]);
 
   // The last item is easy to remove
   if (n == list->used_size - 1) {
     list->images[n] = NULL;
     list->cu_arrays[n] = NULL;
     list->pocs[n] = 0;
+    for (int j = 0; j < 16; j++) {
+      list->ref_LXs[n][0][j] = 0;
+      list->ref_LXs[n][1][j] = 0;
+    }
     list->used_size--;
   } else {
     int i = n;
@@ -164,10 +180,18 @@
       list->images[i] = list->images[i + 1];
       list->cu_arrays[i] = list->cu_arrays[i + 1];
       list->pocs[i] = list->pocs[i + 1];
+      for (int j = 0; j < 16; j++) {
+        list->ref_LXs[i][0][j] = list->ref_LXs[i + 1][0][j];
+        list->ref_LXs[i][1][j] = list->ref_LXs[i + 1][1][j];
+      }
     }
     list->images[list->used_size - 1] = NULL;
     list->cu_arrays[list->used_size - 1] = NULL;
     list->pocs[list->used_size - 1] = 0;
+    for (int j = 0; j < 16; j++) {
+      list->ref_LXs[list->used_size - 1][0][j] = 0;
+      list->ref_LXs[list->used_size - 1][1][j] = 0;
+    }
     list->used_size--;
   }
 
@@ -181,7 +205,7 @@
   }
   
   for (i = source->used_size - 1; i >= 0; --i) {
-    kvz_image_list_add(target, source->images[i], source->cu_arrays[i], source->pocs[i]);
+    kvz_image_list_add(target, source->images[i], source->cu_arrays[i], source->pocs[i], source->ref_LXs[i]);
   }
   return 1;
 }

kvazaar-1.1.0.tar.gz/src/imagelist.h -> kvazaar-1.2.0.tar.gz/src/imagelist.h Changed

kvazaar-1.1.0.tar.gz/src/input_frame_buffer.c -> kvazaar-1.2.0.tar.gz/src/input_frame_buffer.c Changed

kvazaar-1.1.0.tar.gz/src/inter.c -> kvazaar-1.2.0.tar.gz/src/inter.c Changed

@@ -31,6 +31,14 @@
 #include "videoframe.h"
 
 
+typedef struct {
+  const cu_info_t *a[2];
+  const cu_info_t *b[3];
+  const cu_info_t *c3;
+  const cu_info_t *h;
+} merge_candidates_t;
+
+
 static void inter_recon_frac_luma(const encoder_state_t * const state,
                                   const kvz_picture * const ref,
                                   int32_t xpos,
@@ -53,8 +61,8 @@
                          ypos,
                          mv_param[0] >> 2,
                          mv_param[1] >> 2,
-                         state->tile->lcu_offset_x * LCU_WIDTH,
-                         state->tile->lcu_offset_y * LCU_WIDTH,
+                         state->tile->offset_x,
+                         state->tile->offset_y,
                          ref->y,
                          ref->width,
                          ref->height,
@@ -98,8 +106,8 @@
                          ypos,
                          mv_param[0] >> 2,
                          mv_param[1] >> 2,
-                         state->tile->lcu_offset_x * LCU_WIDTH,
-                         state->tile->lcu_offset_y * LCU_WIDTH,
+                         state->tile->offset_x,
+                         state->tile->offset_y,
                          ref->y,
                          ref->width,
                          ref->height,
@@ -146,14 +154,34 @@
   kvz_extended_block src_v = { 0, 0, 0, 0 };
 
   //Fractional chroma U
-  kvz_get_extended_block(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, state->tile->lcu_offset_y * LCU_WIDTH_C,
-    ref->u, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_height, &src_u);
+  kvz_get_extended_block(xpos, ypos,
+                         (mv_param[0] >> 2) >> 1,
+                         (mv_param[1] >> 2) >> 1,
+                         state->tile->offset_x >> 1,
+                         state->tile->offset_y >> 1,
+                         ref->u,
+                         ref->width >> 1,
+                         ref->height >> 1,
+                         FILTER_SIZE_C,
+                         block_width,
+                         block_height,
+                         &src_u);
   kvz_sample_octpel_chroma(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width,
     block_height, lcu->rec.u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
 
   //Fractional chroma V
-  kvz_get_extended_block(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, state->tile->lcu_offset_y * LCU_WIDTH_C,
-    ref->v, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_height, &src_v);
+  kvz_get_extended_block(xpos, ypos,
+                         (mv_param[0] >> 2) >> 1,
+                         (mv_param[1] >> 2) >> 1,
+                         state->tile->offset_x >> 1,
+                         state->tile->offset_y >> 1,
+                         ref->v,
+                         ref->width >> 1,
+                         ref->height >> 1,
+                         FILTER_SIZE_C,
+                         block_width,
+                         block_height,
+                         &src_v);
   kvz_sample_octpel_chroma(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width,
     block_height, lcu->rec.v + (ypos  % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param);
 
@@ -190,8 +218,8 @@
                          ypos,
                          (mv_param[0] >> 2) >> 1,
                          (mv_param[1] >> 2) >> 1,
-                         state->tile->lcu_offset_x * LCU_WIDTH_C,
-                         state->tile->lcu_offset_y * LCU_WIDTH_C,
+                         state->tile->offset_x >> 1,
+                         state->tile->offset_y >> 1,
                          ref->u,
                          ref->width >> 1,
                          ref->height >> 1,
@@ -215,8 +243,8 @@
                          ypos,
                          (mv_param[0] >> 2) >> 1,
                          (mv_param[1] >> 2) >> 1,
-                         state->tile->lcu_offset_x * LCU_WIDTH_C,
-                         state->tile->lcu_offset_y * LCU_WIDTH_C,
+                         state->tile->offset_x >> 1,
+                         state->tile->offset_y >> 1,
                          ref->v,
                          ref->width >> 1,
                          ref->height >> 1,
@@ -300,17 +328,13 @@
                          lcu_t *lcu,
                          hi_prec_buf_t *hi_prec_out)
 {
-  const vector2d_t tile_in_frame = {
-    state->tile->lcu_offset_x * LCU_WIDTH,
-    state->tile->lcu_offset_y * LCU_WIDTH
-  };
   const vector2d_t pu_in_tile = { xpos, ypos };
   const vector2d_t pu_in_lcu = { xpos % LCU_WIDTH, ypos % LCU_WIDTH };
 
   const vector2d_t mv_in_pu = { mv_param[0] >> 2, mv_param[1] >> 2 };
   const vector2d_t mv_in_frame = {
-    mv_in_pu.x + pu_in_tile.x + tile_in_frame.x,
-    mv_in_pu.y + pu_in_tile.y + tile_in_frame.y
+    mv_in_pu.x + pu_in_tile.x + state->tile->offset_x,
+    mv_in_pu.y + pu_in_tile.y + state->tile->offset_y
   };
 
   const bool mv_is_outside_frame = mv_in_frame.x < 0 ||
@@ -642,24 +666,26 @@
 
 
 /**
-* \brief Get merge candidates for current block
-* \param encoder encoder control struct to use
-* \param x block x position in SCU
-* \param y block y position in SCU
-* \param width current block width
-* \param height current block height
-* \param H candidate H
-* \param C1 candidate C1
-*/
-static void kvz_inter_get_temporal_merge_candidates(const encoder_state_t * const state,
-                                             int32_t x,
-                                             int32_t y,
-                                             int32_t width,
-                                             int32_t height,
-                                             cu_info_t **C3,
-                                             cu_info_t **H,
-                                             uint8_t ref_list,
-                                             uint8_t ref_idx) {
+ * \brief Get merge candidates for current block
+ *
+ * \param state     encoder control state to use
+ * \param x         block x position in SCU
+ * \param y         block y position in SCU
+ * \param width     current block width
+ * \param height    current block height
+ * \param ref_list  which reference list, L0 is 1 and L1 is 2
+ * \param ref_idx   index in the reference list
+ * \param cand_out  will be filled with C3 and H candidates
+ */
+static void get_temporal_merge_candidates(const encoder_state_t * const state,
+                                          int32_t x,
+                                          int32_t y,
+                                          int32_t width,
+                                          int32_t height,
+                                          uint8_t ref_list,
+                                          uint8_t ref_idx,
+                                          merge_candidates_t *cand_out)
+{
   /*
   Predictor block locations
   _________
@@ -670,22 +696,19 @@
             |H|
   */
 
-  *C3 = NULL;
-  *H  = NULL;
+  cand_out->c3 = cand_out->h = NULL;
 
   // Find temporal reference
   if (state->frame->ref->used_size) {
-    uint32_t colocated_ref = UINT_MAX;
+    uint32_t colocated_ref;
 
     // Select L0/L1 ref_idx reference
-    for (int temporal_cand = 0; temporal_cand < state->frame->ref->used_size; temporal_cand++) {
-      if (state->frame->refmap[temporal_cand].list == ref_list && state->frame->refmap[temporal_cand].idx == ref_idx) {
-        colocated_ref = temporal_cand;
-        break;
-      }
+    if (state->frame->ref_LX_size[ref_list-1] > ref_idx) {
+      colocated_ref = state->frame->ref_LX[ref_list - 1][ref_idx];
+    } else {
+      // not found
+      return;
     }
-    
-    if (colocated_ref == UINT_MAX) return;
 
     cu_array_t *ref_cu_array = state->frame->ref->cu_arrays[colocated_ref];
     int cu_per_width = ref_cu_array->width / SCU_WIDTH;
@@ -707,7 +730,7 @@
       if (H_offset >= 0) {
         // Only use when it's inter block
         if (ref_cu_array->data[H_offset].type == CU_INTER) {
-          *H = &ref_cu_array->data[H_offset];
+          cand_out->h = &ref_cu_array->data[H_offset];
         }
       }
     }
@@ -718,7 +741,7 @@
     if (xColCtr < state->encoder_control->in.width && yColCtr < state->encoder_control->in.height) {
       uint32_t C3_offset = ((xColCtr >> 4) << 4) / SCU_WIDTH + ((((yColCtr >> 4) << 4) / SCU_WIDTH) * cu_per_width);
       if (ref_cu_array->data[C3_offset].type == CU_INTER) {
-        *C3 = &ref_cu_array->data[C3_offset];
+        cand_out->c3 = &ref_cu_array->data[C3_offset];
       }
     }
   }
@@ -737,12 +760,8 @@
  * \param height          block height in pixels
  * \param picture_width   tile width in pixels
  * \param picture_height  tile height in pixels
- * \param b0              Returns the b0 candidate.
- * \param b1              Returns the b1 candidate.
- * \param b2              Returns the b2 candidate.
- * \param a0              Returns the a0 candidate.
- * \param a1              Returns the a1 candidate.
  * \param lcu             current LCU
+ * \param cand_out        will be filled with A and B candidates
  */
 static void get_spatial_merge_candidates(int32_t x,
                                          int32_t y,
@@ -750,12 +769,8 @@
                                          int32_t height,
                                          int32_t picture_width,
                                          int32_t picture_height,
-                                         cu_info_t **b0,
-                                         cu_info_t **b1,
-                                         cu_info_t **b2,
-                                         cu_info_t **a0,
-                                         cu_info_t **a1,
-                                         lcu_t *lcu)
+                                         lcu_t *lcu,
+                                         merge_candidates_t *cand_out)
 {
   /*
   Predictor block locations
@@ -771,59 +786,55 @@
   int32_t y_local = SUB_SCU(y);
   // A0 and A1 availability testing
   if (x != 0) {
-    *a1 = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local + height - 1);
-    // Do not check (*a1)->coded because the block above is always coded before
+    cu_info_t *a1 = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local + height - 1);
+    // Do not check a1->coded because the block above is always coded before
     // the current one and the flag is not set when searching an SMP block.
-    if ((*a1)->type == CU_INTER) {
-      inter_clear_cu_unused(*a1);
-    } else {
-      *a1 = NULL;
+    if (a1->type == CU_INTER) {
+      inter_clear_cu_unused(a1);
+      cand_out->a[1] = a1;
     }
 
     if (y_local + height < LCU_WIDTH && y + height < picture_height) {
-      *a0 = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local + height);
-      if ((*a0)->type == CU_INTER && is_a0_cand_coded(x, y, width, height)) {
-        inter_clear_cu_unused(*a0);
-      } else {
-        *a0 = NULL;
+      cu_info_t *a0 = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local + height);
+      if (a0->type == CU_INTER && is_a0_cand_coded(x, y, width, height)) {
+        inter_clear_cu_unused(a0);
+        cand_out->a[0] = a0;
       }
     }
   }
 
   // B0, B1 and B2 availability testing
   if (y != 0) {
+    cu_info_t *b0 = NULL;
     if (x + width < picture_width) {
       if (x_local + width < LCU_WIDTH) {
-        *b0 = LCU_GET_CU_AT_PX(lcu, x_local + width, y_local - 1);
+        b0 = LCU_GET_CU_AT_PX(lcu, x_local + width, y_local - 1);
       } else if (y_local == 0) {
         // Special case, top-right CU
-        *b0 = LCU_GET_TOP_RIGHT_CU(lcu);
+        b0 = LCU_GET_TOP_RIGHT_CU(lcu);
       }
     }
-    if ((*b0) && (*b0)->type == CU_INTER && is_b0_cand_coded(x, y, width, height)) {
-      inter_clear_cu_unused(*b0);
-    } else {
-      *b0 = NULL;
+    if (b0 && b0->type == CU_INTER && is_b0_cand_coded(x, y, width, height)) {
+      inter_clear_cu_unused(b0);
+      cand_out->b[0] = b0;
     }
 
-    *b1 = LCU_GET_CU_AT_PX(lcu, x_local + width - 1, y_local - 1);
-    // Do not check (*b1)->coded because the block to the left is always coded
+    cu_info_t *b1 = LCU_GET_CU_AT_PX(lcu, x_local + width - 1, y_local - 1);
+    // Do not check b1->coded because the block to the left is always coded
     // before the current one and the flag is not set when searching an SMP
     // block.
-    if ((*b1)->type == CU_INTER) {
-      inter_clear_cu_unused(*b1);
-    } else {
-      *b1 = NULL;
+    if (b1->type == CU_INTER) {
+      inter_clear_cu_unused(b1);
+      cand_out->b[1] = b1;
     }
 
     if (x != 0) {
-      *b2 = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local - 1);
-      // Do not check (*b2)->coded because the block above and to the left is
+      cu_info_t *b2 = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local - 1);
+      // Do not check b2->coded because the block above and to the left is
       // always coded before the current one.
-      if ((*b2)->type == CU_INTER) {
-        inter_clear_cu_unused(*b2);
-      } else {
-        *b2 = NULL;
+      if (b2->type == CU_INTER) {
+        inter_clear_cu_unused(b2);
+        cand_out->b[2] = b2;
       }
     }
   }
@@ -843,11 +854,7 @@
  * \param height          block height in pixels
  * \param picture_width   tile width in pixels
  * \param picture_height  tile height in pixels
- * \param b0              Returns the b0 candidate.
- * \param b1              Returns the b1 candidate.
- * \param b2              Returns the b2 candidate.
- * \param a0              Returns the a0 candidate.
- * \param a1              Returns the a1 candidate.
+ * \param cand_out        will be filled with A and B candidates
  */
 static void get_spatial_merge_candidates_cua(const cu_array_t *cua,
                                              int32_t x,
@@ -856,11 +863,7 @@
                                              int32_t height,
                                              int32_t picture_width,
                                              int32_t picture_height,
-                                             const cu_info_t **b0,
-                                             const cu_info_t **b1,
-                                             const cu_info_t **b2,
-                                             const cu_info_t **a0,
-                                             const cu_info_t **a1)
+                                             merge_candidates_t *cand_out)
 {
   /*
   Predictor block locations
@@ -876,16 +879,16 @@
   int32_t y_local = SUB_SCU(y);
   // A0 and A1 availability testing
   if (x != 0) {
-    *a1 = kvz_cu_array_at_const(cua, x - 1, y + height - 1);
+    const cu_info_t *a1 = kvz_cu_array_at_const(cua, x - 1, y + height - 1);
     // The block above is always coded before the current one.
-    if ((*a1)->type != CU_INTER) {
-      *a1 = NULL;
+    if (a1->type == CU_INTER) {
+      cand_out->a1 = a1;
     }
 
     if (y_local + height < LCU_WIDTH && y + height < picture_height) {
-      *a0 = kvz_cu_array_at_const(cua, x - 1, y + height);
-      if ((*a0)->type != CU_INTER || !is_a0_cand_coded(x, y, width, height)) {
-        *a0 = NULL;
+      const cu_info_t *a0 = kvz_cu_array_at_const(cua, x - 1, y + height);
+      if (a0->type == CU_INTER && is_a0_cand_coded(x, y, width, height)) {
+        cand_out->a0 = a0;
       }
     }
   }
@@ -893,191 +896,227 @@
   // B0, B1 and B2 availability testing
   if (y != 0) {
     if (x + width < picture_width && (x_local + width < LCU_WIDTH || y_local == 0)) {
-      *b0 = kvz_cu_array_at_const(cua, x + width, y - 1);
-      if ((*b0)->type != CU_INTER || !is_b0_cand_coded(x, y, width, height)) {
-        *b0 = NULL;
+      const cu_info_t *b0 = kvz_cu_array_at_const(cua, x + width, y - 1);
+      if (b0->type == CU_INTER && is_b0_cand_coded(x, y, width, height)) {
+        cand_out->b0 = b0;
       }
     }
 
-    *b1 = kvz_cu_array_at_const(cua, x + width - 1, y - 1);
+    const cu_info_t *b1 = kvz_cu_array_at_const(cua, x + width - 1, y - 1);
     // The block to the left is always coded before the current one.
-    if ((*b1)->type != CU_INTER) {
-      *b1 = NULL;
+    if (b1->type == CU_INTER) {
+      cand_out->b1 = b1;
     }
 
     if (x != 0) {
-      *b2 = kvz_cu_array_at_const(cua, x - 1, y - 1);
+      const cu_info_t *b2 = kvz_cu_array_at_const(cua, x - 1, y - 1);
       // The block above and to the left is always coded before the current
       // one.
-      if ((*b2)->type != CU_INTER) {
-        *b2 = NULL;
+      if (b2->type == CU_INTER) {
+        cand_out->b2 = b2;
       }
     }
   }
 }
 
+static INLINE int16_t get_scaled_mv(int16_t mv, int scale)
+{
+  int32_t scaled = scale * mv;
+  return CLIP(-32768, 32767, (scaled + 127 + (scaled < 0)) >> 8);
+}
+
+static void apply_mv_scaling_pocs(int32_t current_poc,
+                                  int32_t current_ref_poc,
+                                  int32_t neighbor_poc,
+                                  int32_t neighbor_ref_poc,
+                                  int16_t mv_cand[2])
+{
+  int32_t diff_current  = current_poc  - current_ref_poc;
+  int32_t diff_neighbor = neighbor_poc - neighbor_ref_poc;
+
+  if (diff_current == diff_neighbor) return;
+
+  diff_current  = CLIP(-128, 127, diff_current);
+  diff_neighbor = CLIP(-128, 127, diff_neighbor);
+
+  int scale = CLIP(-4096, 4095,
+    (diff_current * ((0x4000 + (abs(diff_neighbor) >> 1)) / diff_neighbor) + 32) >> 6);
+
+  mv_cand[0] = get_scaled_mv(mv_cand[0], scale);
+  mv_cand[1] = get_scaled_mv(mv_cand[1], scale);
+}
+
+static INLINE void apply_mv_scaling(const encoder_state_t *state,
+                                    const cu_info_t *current_cu,
+                                    const cu_info_t *neighbor_cu,
+                                    int8_t current_reflist,
+                                    int8_t neighbor_reflist,
+                                    int16_t mv_cand[2])
+{
+  apply_mv_scaling_pocs(state->frame->poc,
+                        state->frame->ref->pocs[
+                          state->frame->ref_LX[current_reflist][
+                            current_cu->inter.mv_ref[current_reflist]]],
+                        state->frame->poc,
+                        state->frame->ref->pocs[
+                          state->frame->ref_LX[neighbor_reflist][
+                            neighbor_cu->inter.mv_ref[neighbor_reflist]]],
+                        mv_cand);
+}
+
 /**
- * \brief Pick two mv candidates from the spatial and temporal candidates.
+ * \brief Try to add a temporal MVP or merge candidate.
+ *
+ * \param state         encoder state
+ * \param current_ref   index of the picture referenced by the current CU
+ * \param colocated     colocated CU
+ * \param reflist       either 0 (for L0) or 1 (for L1)
+ * \param[out] mv_out   Returns the motion vector
+ *
+ * \return Whether a temporal candidate was added or not.
  */
-static void get_mv_cand_from_candidates(const encoder_state_t * const state,
-                                     int32_t x,
-                                     int32_t y,
-                                     int32_t width,
-                                     int32_t height,
-                                     const cu_info_t *b0,
-                                     const cu_info_t *b1,
-                                     const cu_info_t *b2,
-                                     const cu_info_t *a0,
-                                     const cu_info_t *a1,
-                                     const cu_info_t *c3,
-                                     const cu_info_t *h,
+static bool add_temporal_candidate(const encoder_state_t *state,
+                                   uint8_t current_ref,
+                                   const cu_info_t *colocated,
+                                   int32_t reflist,
+                                   int16_t mv_out[2])
+{
+  if (!colocated) return false;
+
+  int colocated_ref;
+  if (state->frame->ref_LX_size[0] > 0) {
+    // get the first reference from L0 if it exists
+    colocated_ref = state->frame->ref_LX[0][0];
+  } else {
+    // otherwise no candidate added
+    return false;
+  }
+
+  // When there are reference pictures from the future (POC > current POC)
+  // in L0 or L1, the primary list for the colocated PU is the inverse of
+  // collocated_from_l0_flag. Otherwise it is equal to reflist.
+  //
+  // In Kvazaar, the L1 list is only used for future pictures and the slice
+  // type is set to KVZ_SLICE_B if and only if L1 is used. Therefore we can
+  // simply check the slice type here. Kvazaar always sets
+  // collocated_from_l0_flag so the list is L1 for B-slices.
+  int col_list = state->frame->slicetype == KVZ_SLICE_P ? reflist : 1;
+
+  if ((colocated->inter.mv_dir & (col_list + 1)) == 0) {
+    // Use the other list if the colocated PU does not have a MV for the
+    // primary list.
+    col_list = 1 - col_list;
+  }
+
+  mv_out[0] = colocated->inter.mv[col_list][0];
+  mv_out[1] = colocated->inter.mv[col_list][1];
+  apply_mv_scaling_pocs(
+    state->frame->poc,
+    state->frame->ref->pocs[current_ref],
+    state->frame->ref->pocs[colocated_ref],
+    state->frame->ref->images[colocated_ref]->ref_pocs[
+      state->frame->ref->ref_LXs[colocated_ref]
+        [col_list][colocated->inter.mv_ref[col_list]]],
+    mv_out
+  );
+
+  return true;
+}
+
+static INLINE bool add_mvp_candidate(const encoder_state_t *state,
                                      const cu_info_t *cur_cu,
+                                     const cu_info_t *cand,
                                      int8_t reflist,
-                                     int16_t mv_cand[2][2])
+                                     bool scaling,
+                                     int16_t mv_cand_out[2])
 {
+  if (!cand) return false;
+
+  assert(cand->inter.mv_dir != 0);
+  const int cand_list = cand->inter.mv_dir & (1 << reflist) ? reflist : !reflist;
+
+  if (scaling) {
+    mv_cand_out[0] = cand->inter.mv[cand_list][0];
+    mv_cand_out[1] = cand->inter.mv[cand_list][1];
+    apply_mv_scaling(state, cur_cu, cand, reflist, cand_list, mv_cand_out);
+    return true;
+  }
+
+  if (cand->inter.mv_dir & (1 << cand_list) &&
+      state->frame->ref_LX[cand_list][cand->inter.mv_ref[cand_list]] == 
+      state->frame->ref_LX[reflist][cur_cu->inter.mv_ref[reflist]])
+  {
+    mv_cand_out[0] = cand->inter.mv[cand_list][0];
+    mv_cand_out[1] = cand->inter.mv[cand_list][1];
+    return true;
+  }
+
+  return false;
+}
+
+/**
+ * \brief Pick two mv candidates from the spatial and temporal candidates.
+ */
+static void get_mv_cand_from_candidates(const encoder_state_t * const state,
+                                        int32_t x,
+                                        int32_t y,
+                                        int32_t width,
+                                        int32_t height,
+                                        const merge_candidates_t *merge_cand,
+                                        const cu_info_t *cur_cu,
+                                        int8_t reflist,
+                                        int16_t mv_cand[2][2])
+{
+  const cu_info_t *const *a = merge_cand->a;
+  const cu_info_t *const *b = merge_cand->b;
+  const cu_info_t *c3 = merge_cand->c3;
+  const cu_info_t *h  = merge_cand->h;
+
   uint8_t candidates = 0;
   uint8_t b_candidates = 0;
-  int8_t reflist2nd = !reflist;
-
- #define CALCULATE_SCALE(cu,tb,td) ((tb * ((0x4000 + (abs(td)>>1))/td) + 32) >> 6)
-#define APPLY_MV_SCALING(cu, cand, list) {int td = state->frame->poc - state->frame->ref->pocs(cu)->inter.mv_reflist;\
-                                   int tb = state->frame->poc - state->frame->ref->pocscur_cu->inter.mv_refreflist;\
-                                   if (td != tb) { \
-                                      int scale = CALCULATE_SCALE(cu,tb,td); \
-                                       mv_candcand0 = ((scale * (cu)->inter.mvlist0 + 127 + (scale * (cu)->inter.mvlist0 < 0)) >> 8 ); \
-                                       mv_candcand1 = ((scale * (cu)->inter.mvlist1 + 127 + (scale * (cu)->inter.mvlist1 < 0)) >> 8 ); }}
-
-  // Left predictors
-  if (a0 && (
-    ((a0->inter.mv_dir & 1) && a0->inter.mv_ref0 == cur_cu->inter.mv_refreflist) ||
-    ((a0->inter.mv_dir & 2) && a0->inter.mv_ref1 == cur_cu->inter.mv_refreflist))) {
-    if (a0->inter.mv_dir & (1 << reflist) && a0->inter.mv_refreflist == cur_cu->inter.mv_refreflist) {
-      mv_candcandidates0 = a0->inter.mvreflist0;
-      mv_candcandidates1 = a0->inter.mvreflist1;
-    } else {
-      mv_candcandidates0 = a0->inter.mvreflist2nd0;
-      mv_candcandidates1 = a0->inter.mvreflist2nd1;
-    }
-    candidates++;
-  } else if (a1 && (
-    ((a1->inter.mv_dir & 1) && a1->inter.mv_ref0 == cur_cu->inter.mv_refreflist) ||
-    ((a1->inter.mv_dir & 2) && a1->inter.mv_ref1 == cur_cu->inter.mv_refreflist))) {
-    if (a1->inter.mv_dir & (1 << reflist) && a1->inter.mv_refreflist == cur_cu->inter.mv_refreflist) {
-      mv_candcandidates0 = a1->inter.mvreflist0;
-      mv_candcandidates1 = a1->inter.mvreflist1;
-    } else {
-      mv_candcandidates0 = a1->inter.mvreflist2nd0;
-      mv_candcandidates1 = a1->inter.mvreflist2nd1;
+
+  // Left predictors without scaling
+  for (int i = 0; i < 2; i++) {
+    if (add_mvp_candidate(state, cur_cu, a[i], reflist, false, mv_cand[candidates])) {
+      candidates++;
+      break;
     }
-    candidates++;
   }
 
-  if(!candidates) {
-      // Left predictors
-    if (a0) {
-      if (a0->inter.mv_dir & (1 << reflist)) {
-        mv_candcandidates0 = a0->inter.mvreflist0;
-        mv_candcandidates1 = a0->inter.mvreflist1;
-        APPLY_MV_SCALING(a0, candidates, reflist);
-      } else {
-        mv_candcandidates0 = a0->inter.mvreflist2nd0;
-        mv_candcandidates1 = a0->inter.mvreflist2nd1;
-        APPLY_MV_SCALING(a0, candidates, reflist2nd);
-      }
-      candidates++;
-    } else if (a1) {
-      if (a1->inter.mv_dir & (1 << reflist)) {
-        mv_candcandidates0 = a1->inter.mvreflist0;
-        mv_candcandidates1 = a1->inter.mvreflist1;
-        APPLY_MV_SCALING(a1, candidates, reflist);
-      } else {
-        mv_candcandidates0 = a1->inter.mvreflist2nd0;
-        mv_candcandidates1 = a1->inter.mvreflist2nd1;
-        APPLY_MV_SCALING(a1, candidates, reflist2nd);
+  // Left predictors with scaling
+  if (candidates == 0) {
+    for (int i = 0; i < 2; i++) {
+      if (add_mvp_candidate(state, cur_cu, a[i], reflist, true, mv_cand[candidates])) {
+        candidates++;
+        break;
       }
-      candidates++;
     }
   }
 
-  // Top predictors
-  if (b0 && (
-    ((b0->inter.mv_dir & 1) && b0->inter.mv_ref0 == cur_cu->inter.mv_refreflist) ||
-    ((b0->inter.mv_dir & 2) && b0->inter.mv_ref1 == cur_cu->inter.mv_refreflist))) {
-    if (b0->inter.mv_dir & (1 << reflist) && b0->inter.mv_refreflist == cur_cu->inter.mv_refreflist) {
-      mv_candcandidates0 = b0->inter.mvreflist0;
-      mv_candcandidates1 = b0->inter.mvreflist1;
-    } else {
-      mv_candcandidates0 = b0->inter.mvreflist2nd0;
-      mv_candcandidates1 = b0->inter.mvreflist2nd1;
-    }
-    b_candidates++;
-  } else if (b1 && (
-    ((b1->inter.mv_dir & 1) && b1->inter.mv_ref0 == cur_cu->inter.mv_refreflist) ||
-    ((b1->inter.mv_dir & 2) && b1->inter.mv_ref1 == cur_cu->inter.mv_refreflist))) {
-    if (b1->inter.mv_dir & (1 << reflist) && b1->inter.mv_refreflist == cur_cu->inter.mv_refreflist) {
-      mv_candcandidates0 = b1->inter.mvreflist0;
-      mv_candcandidates1 = b1->inter.mvreflist1;
-    } else {
-      mv_candcandidates0 = b1->inter.mvreflist2nd0;
-      mv_candcandidates1 = b1->inter.mvreflist2nd1;
-    }
-    b_candidates++;
-  } else if (b2 && (
-    ((b2->inter.mv_dir & 1) && b2->inter.mv_ref0 == cur_cu->inter.mv_refreflist) ||
-    ((b2->inter.mv_dir & 2) && b2->inter.mv_ref1 == cur_cu->inter.mv_refreflist))) {
-    if (b2->inter.mv_dir & (1 << reflist) && b2->inter.mv_refreflist == cur_cu->inter.mv_refreflist) {
-      mv_candcandidates0 = b2->inter.mvreflist0;
-      mv_candcandidates1 = b2->inter.mvreflist1;
-    } else {
-      mv_candcandidates0 = b2->inter.mvreflist2nd0;
-      mv_candcandidates1 = b2->inter.mvreflist2nd1;
+  // Top predictors without scaling
+  for (int i = 0; i < 3; i++) {
+    if (add_mvp_candidate(state, cur_cu, b[i], reflist, false, mv_cand[candidates])) {
+      b_candidates++;
+      break;
     }
-    b_candidates++;
   }
+
   candidates += b_candidates;
 
-  // When a1 or a0 is available, we dont check for secondary B candidates
-  if (a1 || a0) {
+  // When a1 or a0 is available, we dont check for secondary B candidates.
+  if (a[0] || a[1]) {
     b_candidates = 1;
-  } else if(candidates != 2) {
+  } else if (candidates != 2) {
     b_candidates = 0;
   }
 
-  if(!b_candidates) {
-    // Top predictors
-    if (b0) {
-      if (b0->inter.mv_dir & (1 << reflist)) {
-        mv_candcandidates0 = b0->inter.mvreflist0;
-        mv_candcandidates1 = b0->inter.mvreflist1;
-        APPLY_MV_SCALING(b0, candidates, reflist);
-      } else {
-        mv_candcandidates0 = b0->inter.mvreflist2nd0;
-        mv_candcandidates1 = b0->inter.mvreflist2nd1;
-        APPLY_MV_SCALING(b0, candidates, reflist2nd);
-      }
-      candidates++;
-    } else if (b1) {
-      if (b1->inter.mv_dir & (1 << reflist)) {
-        mv_candcandidates0 = b1->inter.mvreflist0;
-        mv_candcandidates1 = b1->inter.mvreflist1;
-        APPLY_MV_SCALING(b1, candidates, reflist);
-      } else {
-        mv_candcandidates0 = b1->inter.mvreflist2nd0;
-        mv_candcandidates1 = b1->inter.mvreflist2nd1;
-        APPLY_MV_SCALING(b1, candidates, reflist2nd);
-      }
-      candidates++;
-    } else if (b2) {
-      if (b2->inter.mv_dir & (1 << reflist)) {
-        mv_candcandidates0 = b2->inter.mvreflist0;
-        mv_candcandidates1 = b2->inter.mvreflist1;
-        APPLY_MV_SCALING(b2, candidates, reflist);
-      } else {
-        mv_candcandidates0 = b2->inter.mvreflist2nd0;
-        mv_candcandidates1 = b2->inter.mvreflist2nd1;
-        APPLY_MV_SCALING(b2, candidates, reflist2nd);
+  if (!b_candidates) {
+    // Top predictors with scaling
+    for (int i = 0; i < 3; i++) {
+      if (add_mvp_candidate(state, cur_cu, b[i], reflist, true, mv_cand[candidates])) {
+        candidates++;
+        break;
       }
-      candidates++;
     }
   }
 
@@ -1086,70 +1125,22 @@
     candidates = 1;
   }
 
-  // Use Temporal Motion Vector Prediction when enabled
-  if (state->encoder_control->cfg.tmvp_enable) {
-    /*
-    Predictor block locations
-    __________
-    |CurrentPU|
-    | |C0|__  |
-    |    |C3| |
-    |_________|_
-              |H|
-    */
-
-    // TMVP required at least two sequential P/B-frames
-    if (state->frame->poc > 1 && state->frame->ref->used_size && candidates < AMVP_MAX_NUM_CANDS) {
-
-      // Use "H" as the primary predictor and "C3" as secondary
-      const cu_info_t *selected_CU = (h != NULL) ? h : (c3 != NULL) ? c3 : NULL;
-
-      if (selected_CU) {
-        uint32_t colocated_ref = UINT_MAX;
-        uint32_t colocated_ref_poc = 0;
-        int td, tb;
-
-        //ToDo: allow other than L00 for prediction
-
-        //Fetch ref idx of the selected CU in L00 ref list                    
-        for (int temporal_cand = 0; temporal_cand < state->frame->ref->used_size; temporal_cand++) {
-          if (state->frame->refmaptemporal_cand.list == 1 && state->frame->refmaptemporal_cand.idx == 0) {
-            colocated_ref = temporal_cand;
-            break;
-          }
-        }
-
-        if (colocated_ref != UINT_MAX) {
-
-          uint8_t used_reflist = reflist;
-
-          colocated_ref_poc = state->frame->ref->pocscolocated_ref;
-
-          if (!(selected_CU->inter.mv_dir & (1 << used_reflist))) {
-            used_reflist = !reflist;
-          }
-
-          // The reference id the colocated block is using
-          uint32_t colocated_ref_mv_ref = selected_CU->inter.mv_refused_reflist;
-
-          td = colocated_ref_poc - state->frame->ref->imagescolocated_ref->ref_pocscolocated_ref_mv_ref;
-          tb = state->frame->poc - state->frame->ref->pocscur_cu->inter.mv_refreflist;
-
-          if (td == tb) {
-            mv_candcandidates0 = selected_CU->inter.mvused_reflist0;
-            mv_candcandidates1 = selected_CU->inter.mvused_reflist1;
-          } else {
-            int scale = CALCULATE_SCALE(NULL, tb, td);
-            mv_candcandidates0 = ((scale * selected_CU->inter.mvused_reflist0 + 127 + ((scale * selected_CU->inter.mvused_reflist0) < 0)) >> 8);
-            mv_candcandidates1 = ((scale * selected_CU->inter.mvused_reflist1 + 127 + ((scale * selected_CU->inter.mvused_reflist1) < 0)) >> 8);
-          }
-           
-          candidates++;
-
-        }
-      }
-#undef CALCULATE_SCALE
-    }
+  // Use Temporal Motion Vector Prediction when enabled.
+  // TMVP required at least two sequential P/B-frames.
+  bool can_use_tmvp =
+    state->encoder_control->cfg.tmvp_enable &&
+    state->frame->poc > 1 &&
+    state->frame->ref->used_size &&
+    candidates < AMVP_MAX_NUM_CANDS &&
+    (h != NULL || c3 != NULL);
+
+  if (can_use_tmvp && add_temporal_candidate(state,
+                                             state->frame->ref_LX[reflist][cur_cu->inter.mv_ref[reflist]],
+                                             (h != NULL) ? h : c3,
+                                             reflist,
+                                             mv_cand[candidates]))
+  {
+    candidates++;
   }
 
   // Fill with (0,0)
@@ -1158,8 +1149,6 @@
     mv_cand[candidates][1] = 0;
     candidates++;
   }
-#undef CALCULATE_SCALE
-#undef APPLY_MV_SCALING
 }
 
 /**
@@ -1185,13 +1174,15 @@
                            lcu_t *lcu,
                            int8_t reflist)
 {
-  cu_info_t *b0, *b1, *b2, *a0, *a1, *c3, *h;
-  b0 = b1 = b2 = a0 = a1 = c3 = h = NULL;
+  merge_candidates_t merge_cand = { {0, 0}, {0, 0, 0}, 0, 0 };
+
   get_spatial_merge_candidates(x, y, width, height,
-                               state->tile->frame->width, state->tile->frame->height,
-                               &b0, &b1, &b2, &a0, &a1, lcu);
-  kvz_inter_get_temporal_merge_candidates(state, x, y, width, height, &c3, &h, 1, 0);
-  get_mv_cand_from_candidates(state, x, y, width, height, b0, b1, b2, a0, a1, c3, h, cur_cu, reflist, mv_cand);
+                               state->tile->frame->width,
+                               state->tile->frame->height,
+                               lcu,
+                               &merge_cand);
+  get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand);
+  get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand);
 }
 
 /**
@@ -1215,17 +1206,54 @@
                                const cu_info_t* cur_cu,
                                int8_t reflist)
 {
-  const cu_info_t *b0, *b1, *b2, *a0, *a1;
-  cu_info_t *c3, *h;
-  b0 = b1 = b2 = a0 = a1 = c3 = h = NULL;
-  
+  merge_candidates_t merge_cand = { {0, 0}, {0, 0, 0}, 0, 0 };
+
   const cu_array_t *cua = state->tile->frame->cu_array;
   get_spatial_merge_candidates_cua(cua,
                                    x, y, width, height,
                                    state->tile->frame->width, state->tile->frame->height,
-                                   &b0, &b1, &b2, &a0, &a1);
-  kvz_inter_get_temporal_merge_candidates(state, x, y, width, height, &c3, &h, 1, 0);
-  get_mv_cand_from_candidates(state, x, y, width, height, b0, b1, b2, a0, a1, c3, h, cur_cu, reflist, mv_cand);
+                                   &merge_cand);
+  get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand);
+  get_mv_cand_from_candidates(state, x, y, width, height, &merge_cand, cur_cu, reflist, mv_cand);
+}
+
+static bool is_duplicate_candidate(const cu_info_t* cu1, const cu_info_t* cu2)
+{
+  if (!cu2) return false;
+  if (cu1->inter.mv_dir != cu2->inter.mv_dir) return false;
+
+  for (int reflist = 0; reflist < 2; reflist++) {
+    if (cu1->inter.mv_dir & (1 << reflist)) {
+      if (cu1->inter.mv[reflist][0]  != cu2->inter.mv[reflist][0]  ||
+          cu1->inter.mv[reflist][1]  != cu2->inter.mv[reflist][1]  ||
+          cu1->inter.mv_ref[reflist] != cu2->inter.mv_ref[reflist]) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+static bool add_merge_candidate(const cu_info_t *cand,
+                                const cu_info_t *possible_duplicate1,
+                                const cu_info_t *possible_duplicate2,
+                                inter_merge_cand_t *merge_cand_out)
+{
+  if (!cand ||
+      is_duplicate_candidate(cand, possible_duplicate1) ||
+      is_duplicate_candidate(cand, possible_duplicate2)) {
+    return false;
+  }
+
+  merge_cand_out->mv[0][0] = cand->inter.mv[0][0];
+  merge_cand_out->mv[0][1] = cand->inter.mv[0][1];
+  merge_cand_out->mv[1][0] = cand->inter.mv[1][0];
+  merge_cand_out->mv[1][1] = cand->inter.mv[1][1];
+  merge_cand_out->ref[0]   = cand->inter.mv_ref[0]; // L0/L1 references
+  merge_cand_out->ref[1]   = cand->inter.mv_ref[1];
+  merge_cand_out->dir      = cand->inter.mv_dir;
+  return true;
 }
 
 /**
@@ -1239,7 +1267,6 @@
  * \param use_b1    true, if candidate b1 can be used
  * \param mv_cand   Returns the merge candidates.
  * \param lcu       lcu containing the block
- * \param ref_idx   current reference index (used only by TMVP)
  * \return          number of merge candidates
  */
 uint8_t kvz_inter_get_merge_cand(const encoder_state_t * const state,
@@ -1247,228 +1274,63 @@
                                  int32_t width, int32_t height,
                                  bool use_a1, bool use_b1,
                                  inter_merge_cand_t mv_cand[MRG_MAX_NUM_CANDS],
-                                 lcu_t *lcu,
-                                 uint8_t ref_idx)
+                                 lcu_t *lcu)
 {
   uint8_t candidates = 0;
-  int8_t duplicate = 0;
-
-  cu_info_t *b0, *b1, *b2, *a0, *a1;
   int8_t zero_idx = 0;
-  b0 = b1 = b2 = a0 = a1 = NULL;
-  get_spatial_merge_candidates(x, y, width, height,
-                               state->tile->frame->width, state->tile->frame->height,
-                               &b0, &b1, &b2, &a0, &a1, lcu);
-
-  if (!use_a1) a1 = NULL;
-  if (!use_b1) b1 = NULL;
-
-#define CHECK_DUPLICATE(CU1,CU2) {duplicate = 0; if ((CU2) && \
-                                                     (CU1)->inter.mv_dir == (CU2)->inter.mv_dir && \
-                                                    (!(((CU1)->inter.mv_dir & 1) && ((CU2)->inter.mv_dir & 1)) || \
-                                                      ((CU1)->inter.mv00 == (CU2)->inter.mv00 && \
-                                                       (CU1)->inter.mv01 ==  (CU2)->inter.mv01 && \
-                                                       (CU1)->inter.mv_ref0 == (CU2)->inter.mv_ref0) ) && \
-                                                    (!(((CU1)->inter.mv_dir & 2) && ((CU2)->inter.mv_dir & 2) )  || \
-                                                      ((CU1)->inter.mv10 == (CU2)->inter.mv10 && \
-                                                       (CU1)->inter.mv11 == (CU2)->inter.mv11 && \
-                                                       (CU1)->inter.mv_ref1 == (CU2)->inter.mv_ref1) ) \
-                                                      ) duplicate = 1; }
-
-  if (a1) {
-    mv_candcandidates.mv00 = a1->inter.mv00;
-    mv_candcandidates.mv01 = a1->inter.mv01;
-    mv_candcandidates.mv10 = a1->inter.mv10;
-    mv_candcandidates.mv11 = a1->inter.mv11;
-    mv_candcandidates.ref0 = a1->inter.mv_ref0;
-    mv_candcandidates.ref1 = a1->inter.mv_ref1;
-    mv_candcandidates.dir = a1->inter.mv_dir;
-    candidates++;
-  }
-
-  if (b1) {
-    if(candidates) CHECK_DUPLICATE(b1, a1);
-    if(!duplicate) {
-      mv_candcandidates.mv00 = b1->inter.mv00;
-      mv_candcandidates.mv01 = b1->inter.mv01;
-      mv_candcandidates.mv10 = b1->inter.mv10;
-      mv_candcandidates.mv11 = b1->inter.mv11;
-      mv_candcandidates.ref0 = b1->inter.mv_ref0;
-      mv_candcandidates.ref1 = b1->inter.mv_ref1;
-      mv_candcandidates.dir = b1->inter.mv_dir;
-      candidates++;
-    }
-  }
-
-  if (b0) {
-    if(candidates) CHECK_DUPLICATE(b0,b1);
-    if(!duplicate) {
-      mv_candcandidates.mv00 = b0->inter.mv00;
-      mv_candcandidates.mv01 = b0->inter.mv01;
-      mv_candcandidates.mv10 = b0->inter.mv10;
-      mv_candcandidates.mv11 = b0->inter.mv11;
-      mv_candcandidates.ref0 = b0->inter.mv_ref0;
-      mv_candcandidates.ref1 = b0->inter.mv_ref1;
-      mv_candcandidates.dir = b0->inter.mv_dir;
-      candidates++;
-    }
-  }
 
-  if (a0) {
-    if(candidates) CHECK_DUPLICATE(a0,a1);
-    if(!duplicate) {
-      mv_candcandidates.mv00 = a0->inter.mv00;
-      mv_candcandidates.mv01 = a0->inter.mv01;
-      mv_candcandidates.mv10 = a0->inter.mv10;
-      mv_candcandidates.mv11 = a0->inter.mv11;
-      mv_candcandidates.ref0 = a0->inter.mv_ref0;
-      mv_candcandidates.ref1 = a0->inter.mv_ref1;
-      mv_candcandidates.dir = a0->inter.mv_dir;
-      candidates++;
-    }
-  }
+  merge_candidates_t merge_cand = { {0, 0}, {0, 0, 0}, 0, 0 };
 
-  if (candidates != 4) {
-    if (b2) {
-      CHECK_DUPLICATE(b2,a1);
-      if(!duplicate) {
-        CHECK_DUPLICATE(b2,b1);
-        if(!duplicate) {
-          mv_candcandidates.mv00 = b2->inter.mv00;
-          mv_candcandidates.mv01 = b2->inter.mv01;
-          mv_candcandidates.mv10 = b2->inter.mv10;
-          mv_candcandidates.mv11 = b2->inter.mv11;
-          mv_candcandidates.ref0 = b2->inter.mv_ref0;
-          mv_candcandidates.ref1 = b2->inter.mv_ref1;
-          mv_candcandidates.dir = b2->inter.mv_dir;
-          candidates++;
-        }
+  get_spatial_merge_candidates(x, y, width, height,
+                               state->tile->frame->width,
+                               state->tile->frame->height,
+                               lcu,
+                               &merge_cand);
+
+  const cu_info_t **a = merge_cand.a;
+  const cu_info_t **b = merge_cand.b;
+
+  if (!use_a1) a[1] = NULL;
+  if (!use_b1) b[1] = NULL;
+
+  if (add_merge_candidate(a[1], NULL, NULL, &mv_cand[candidates])) candidates++;
+  if (add_merge_candidate(b[1], a[1], NULL, &mv_cand[candidates])) candidates++;
+  if (add_merge_candidate(b[0], b[1], NULL, &mv_cand[candidates])) candidates++;
+  if (add_merge_candidate(a[0], a[1], NULL, &mv_cand[candidates])) candidates++;
+  if (candidates < 4 &&
+      add_merge_candidate(b[2], a[1], b[1], &mv_cand[candidates])) candidates++;
+
+  bool can_use_tmvp =
+    state->encoder_control->cfg.tmvp_enable &&
+    candidates < MRG_MAX_NUM_CANDS &&
+    state->frame->ref->used_size;
+
+  if (can_use_tmvp) {
+    mv_cand[candidates].dir = 0;
+
+    const int max_reflist = (state->frame->slicetype == KVZ_SLICE_B ? 1 : 0);
+    for (int reflist = 0; reflist <= max_reflist; reflist++) {
+      // Fetch temporal candidates for the current CU
+      get_temporal_merge_candidates(state, x, y, width, height, 1, 0, &merge_cand);
+      // TODO: enable L1 TMVP candidate
+      // get_temporal_merge_candidates(state, x, y, width, height, 2, 0, &merge_cand);
+
+      const cu_info_t *temporal_cand =
+        (merge_cand.h != NULL) ? merge_cand.h : merge_cand.c3;
+
+      if (add_temporal_candidate(state,
+                                 // Reference index 0 is always used for
+                                 // the temporal merge candidate.
+                                 state->frame->ref_LX[reflist][0],
+                                 temporal_cand,
+                                 reflist,
+                                 mv_cand[candidates].mv[reflist])) {
+        mv_cand[candidates].ref[reflist] = 0;
+        mv_cand[candidates].dir |= (1 << reflist);
       }
     }
-  }
-
-  if (state->encoder_control->cfg.tmvp_enable) {
-
-    #define CALCULATE_SCALE(tb,td) ((tb * ((0x4000 + (abs(td) >> 1)) / td) + 32) >> 6)
-
-    if (candidates < MRG_MAX_NUM_CANDS && state->frame->ref->used_size) {
-
-      uint32_t colocated_ref = UINT_MAX;
-      uint32_t colocated_ref_poc = 0;
-      int32_t td, tb;
-      uint8_t selected_reflist = 0;
-
-      cu_info_t *c3_L0 = NULL;
-      cu_info_t *h_L0 = NULL;
-      
-      // Fetch temporal candidates for the current CU, , L00
-      kvz_inter_get_temporal_merge_candidates(state, x, y, width, height, &c3_L0, &h_L0, 1, 0);
 
-      cu_info_t *selected_CU = NULL;
-
-      selected_CU = (h_L0 != NULL) ? h_L0 : (c3_L0 != NULL) ? c3_L0 : NULL;
-
-
-      mv_candcandidates.dir = 0;
-
-      // Find LIST_0 reference
-      if (selected_CU) {
-
-        if (!(selected_CU->inter.mv_dir & (selected_reflist + 1))) {
-          selected_reflist = !selected_reflist;
-        }
-
-        uint8_t colocated_ref_found = 0;
-
-        //Fetch ref idx of the selected CU in L00 ref list                    
-        for (int32_t temporal_cand = 0; temporal_cand < state->frame->ref->used_size; temporal_cand++) {
-          if (state->frame->refmaptemporal_cand.list == 1 && state->frame->refmaptemporal_cand.idx == 0) {
-            colocated_ref = temporal_cand;
-            colocated_ref_found = 1;
-            break;
-          }
-        }
-
-        if (colocated_ref_found) {
-          colocated_ref_poc = state->frame->ref->pocscolocated_ref;
-
-          // The reference id the colocated block is using
-          uint32_t colocated_ref_mv_ref = selected_CU->inter.mv_refselected_reflist;
-
-          td = colocated_ref_poc - state->frame->ref->imagescolocated_ref->ref_pocscolocated_ref_mv_ref;
-          tb = state->frame->poc - state->frame->ref->pocsref_idx;
-
-          mv_candcandidates.dir |= 1;
-
-          if (td == tb) {
-            mv_candcandidates.mv00 = selected_CU->inter.mvselected_reflist0;
-            mv_candcandidates.mv01 = selected_CU->inter.mvselected_reflist1;
-          } else {
-            int32_t scale = CALCULATE_SCALE(tb, td);
-            mv_candcandidates.mv00 = ((scale * selected_CU->inter.mvselected_reflist0 + 127 + ((scale * selected_CU->inter.mvselected_reflist0) < 0)) >> 8);
-            mv_candcandidates.mv01 = ((scale * selected_CU->inter.mvselected_reflist1 + 127 + ((scale * selected_CU->inter.mvselected_reflist1) < 0)) >> 8);
-          }
-          mv_candcandidates.ref0 = colocated_ref;
-        }
-      }
-
-
-      if (state->frame->slicetype == KVZ_SLICE_B) {
-
-        selected_reflist = 1;
-
-        // ToDo: enable L1 TMVP candidate
-        // Fetch temporal candidates for the current CU, L00
-        kvz_inter_get_temporal_merge_candidates(state, x, y, width, height, &c3_L0, &h_L0, 1, 0);
-        //kvz_inter_get_temporal_merge_candidates(state, x, y, width, height, &c3_L1, &h_L1, 2, 0);
-
-        selected_CU = (h_L0 != NULL) ? h_L0 : (c3_L0 != NULL) ? c3_L0 : NULL;
-
-        // Find LIST_1 reference
-        if (selected_CU) {
-          if (!(selected_CU->inter.mv_dir & (selected_reflist + 1))) {
-            selected_reflist = !selected_reflist;
-          }
-          uint8_t colocated_ref_found = 0;
-          
-          //Fetch ref idx of the selected CU in L00 ref list                    
-          for (int32_t temporal_cand = 0; temporal_cand < state->frame->ref->used_size; temporal_cand++) {
-            if (state->frame->refmaptemporal_cand.list == 1 && state->frame->refmaptemporal_cand.idx == 0) {
-              colocated_ref = temporal_cand;
-              colocated_ref_found = 1;
-              break;
-            }
-          }
-
-          colocated_ref_poc = state->frame->ref->pocscolocated_ref;
-
-          if (colocated_ref_found) {
-            // The reference id the colocated block is using
-            uint32_t colocated_ref_mv_ref = selected_CU->inter.mv_refselected_reflist;
-
-            // POC differences in current and in candidate
-            td = colocated_ref_poc - state->frame->ref->imagescolocated_ref->ref_pocscolocated_ref_mv_ref;
-            tb = state->frame->poc - state->frame->ref->pocsref_idx;
-            mv_candcandidates.dir |= 2;
-
-            // No need for scaling when POC difference is the same
-            if (td == tb) {
-              mv_candcandidates.mv10 = selected_CU->inter.mvselected_reflist0;
-              mv_candcandidates.mv11 = selected_CU->inter.mvselected_reflist1;
-            } else {
-              int32_t scale = CALCULATE_SCALE(tb, td);
-              mv_candcandidates.mv10 = ((scale * selected_CU->inter.mvselected_reflist0 + 127 + ((scale * selected_CU->inter.mvselected_reflist0) < 0)) >> 8);
-              mv_candcandidates.mv11 = ((scale * selected_CU->inter.mvselected_reflist1 + 127 + ((scale * selected_CU->inter.mvselected_reflist1) < 0)) >> 8);
-            }
-            mv_candcandidates.ref1 = colocated_ref;
-          }
-        }
-      }
-        
-      if (mv_candcandidates.dir != 0) candidates++;
-
-    }
-    #undef CALCULATE_SCALE
+    if (mv_cand[candidates].dir != 0) candidates++;
   }
 
   if (candidates < MRG_MAX_NUM_CANDS && state->frame->slicetype == KVZ_SLICE_B) {
@@ -1493,9 +1355,11 @@
         mv_cand[candidates].ref[0]    = mv_cand[i].ref[0];
         mv_cand[candidates].ref[1]    = mv_cand[j].ref[1];
 
-        if (mv_cand[i].ref[0] == mv_cand[j].ref[1] &&
-          mv_cand[i].mv[0][0] == mv_cand[j].mv[1][0] && 
-          mv_cand[i].mv[0][1] == mv_cand[j].mv[1][1]) {
+        if (state->frame->ref_LX[0][mv_cand[i].ref[0]] ==
+            state->frame->ref_LX[1][mv_cand[j].ref[1]]
+            &&
+            mv_cand[i].mv[0][0] == mv_cand[j].mv[1][0] && 
+            mv_cand[i].mv[0][1] == mv_cand[j].mv[1][1]) {
           // Not a candidate
         } else {
           candidates++;
@@ -1519,12 +1383,12 @@
     }
     num_ref = MIN(ref_negative, ref_positive);
   }
-  
+
   // Add (0,0) prediction
   while (candidates != MRG_MAX_NUM_CANDS) {
     mv_cand[candidates].mv[0][0] = 0;
     mv_cand[candidates].mv[0][1] = 0;
-    mv_cand[candidates].ref[0] = (zero_idx>=num_ref-1)?0:zero_idx;
+    mv_cand[candidates].ref[0] = (zero_idx >= num_ref - 1) ? 0 : zero_idx;
     mv_cand[candidates].ref[1] = mv_cand[candidates].ref[0];
     mv_cand[candidates].dir = 1;
     if (state->frame->slicetype == KVZ_SLICE_B) {

kvazaar-1.1.0.tar.gz/src/inter.h -> kvazaar-1.2.0.tar.gz/src/inter.h Changed

kvazaar-1.1.0.tar.gz/src/intra.c -> kvazaar-1.2.0.tar.gz/src/intra.c Changed

@@ -114,6 +114,52 @@
   return 1;
 }
 
+#if KVZ_SEL_ENCRYPTION
+int8_t kvz_intra_get_dir_luma_predictor_encry(
+  const uint32_t x,
+  const uint32_t y,
+  int8_t *preds,
+  const cu_info_t *const cur_pu,
+  const cu_info_t *const left_pu,
+  const cu_info_t *const above_pu)
+{
+  // The default mode if block is not coded yet is INTRA_DC.
+  int8_t left_intra_dir  = 1;
+  if (left_pu && left_pu->type == CU_INTRA) {
+    left_intra_dir = left_pu->intra.mode_encry ;
+  }
+
+  int8_t above_intra_dir = 1;
+  if (above_pu && above_pu->type == CU_INTRA && y % LCU_WIDTH != 0) {
+    above_intra_dir = above_pu->intra.mode_encry;
+  }
+
+  // If the predictions are the same, add new predictions
+  if (left_intra_dir == above_intra_dir) {
+    if (left_intra_dir > 1) { // angular modes
+      preds[0] = left_intra_dir;
+      preds[1] = ((left_intra_dir + 29) % 32) + 2;
+      preds[2] = ((left_intra_dir - 1 ) % 32) + 2;
+    } else { //non-angular
+      preds[0] = 0;//PLANAR_IDX;
+      preds[1] = 1;//DC_IDX;
+      preds[2] = 26;//VER_IDX;
+    }
+  } else { // If we have two distinct predictions
+    preds[0] = left_intra_dir;
+    preds[1] = above_intra_dir;
+
+    // add planar mode if it's not yet present
+    if (left_intra_dir && above_intra_dir ) {
+      preds[2] = 0; // PLANAR_IDX;
+    } else {  // Add DC mode if it's not present, otherwise 26.
+      preds[2] =  (left_intra_dir+above_intra_dir)<2? 26 : 1;
+    }
+  }
+
+  return 1;
+}
+#endif
 
 static void intra_filter_reference(
   int_fast8_t log2_width,
@@ -541,126 +587,120 @@
   }
 }
 
-void kvz_intra_recon_lcu_luma(
+static void intra_recon_tb_leaf(
   encoder_state_t *const state,
   int x,
   int y,
   int depth,
   int8_t intra_mode,
-  cu_info_t *cur_cu,
-  lcu_t *lcu)
+  lcu_t *lcu,
+  color_t color)
 {
-  const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };
-  if (cur_cu == NULL) {
-    cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
-  }
-  const int8_t width = LCU_WIDTH >> depth;
-
-  if (depth == 0 || cur_cu->tr_depth > depth) {
-    int offset = width / 2;
-
-    kvz_intra_recon_lcu_luma(state, x,          y,          depth+1, intra_mode, NULL, lcu);
-    kvz_intra_recon_lcu_luma(state, x + offset, y,          depth+1, intra_mode, NULL, lcu);
-    kvz_intra_recon_lcu_luma(state, x,          y + offset, depth+1, intra_mode, NULL, lcu);
-    kvz_intra_recon_lcu_luma(state, x + offset, y + offset, depth+1, intra_mode, NULL, lcu);
-
-    if (depth < MAX_DEPTH) {
-      uint16_t child_cbfs[3] = {
-        LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y         )->cbf,
-        LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset)->cbf,
-        LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf,
-      };
-      cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_Y);
-    }
+  const kvz_config *cfg = &state->encoder_control->cfg;
+  const int shift = color == COLOR_Y ? 0 : 1;
 
-    return;
+  int log2width = LOG2_LCU_WIDTH - depth;
+  if (color != COLOR_Y && depth < MAX_PU_DEPTH) {
+    // Chroma width is half of luma width, when not at maximum depth.
+    log2width -= 1;
   }
+  const int width = 1 << log2width;
+  const int lcu_width = LCU_WIDTH >> shift;
+
+  const vector2d_t luma_px = { x, y };
+  const vector2d_t pic_px = {
+    state->tile->frame->width,
+    state->tile->frame->height,
+  };
+  const vector2d_t lcu_px = { SUB_SCU(x) >> shift, SUB_SCU(y) >> shift};
 
-  // Perform intra prediction and put the result in correct place lcu.
-  vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height };
-  vector2d_t luma_px = { x, y };
   kvz_intra_references refs;
-  const int_fast8_t log2_width = kvz_g_convert_to_bit[width] + 2;
-  kvz_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, &refs);
+  kvz_intra_build_reference(log2width, color, &luma_px, &pic_px, lcu, &refs);
 
   kvz_pixel pred[32 * 32];
-  const kvz_config *cfg = &state->encoder_control->cfg;
-  bool filter_boundary = !(cfg->lossless && cfg->implicit_rdpcm);
-  kvz_intra_predict(&refs, log2_width, intra_mode, COLOR_Y, pred, filter_boundary);
-  
-  kvz_pixel *block_in_lcu = &lcu->rec.y[lcu_px.x + lcu_px.y * LCU_WIDTH];
-  kvz_pixels_blit(pred, block_in_lcu, width, width, width, LCU_WIDTH);
-
-  kvz_quantize_lcu_luma_residual(state, x, y, depth, cur_cu, lcu);
+  const bool filter_boundary = color == COLOR_Y && !(cfg->lossless && cfg->implicit_rdpcm);
+  kvz_intra_predict(&refs, log2width, intra_mode, color, pred, filter_boundary);
+
+  const int index = lcu_px.x + lcu_px.y * lcu_width;
+  kvz_pixel *block = NULL;
+  switch (color) {
+    case COLOR_Y:
+      block = &lcu->rec.y[index];
+      break;
+    case COLOR_U:
+      block = &lcu->rec.u[index];
+      break;
+    case COLOR_V:
+      block = &lcu->rec.v[index];
+      break;
+  }
+  kvz_pixels_blit(pred, block , width, width, width, lcu_width);
 }
 
-
-void kvz_intra_recon_lcu_chroma(
+/**
+ * \brief Reconstruct an intra CU
+ *
+ * \param state         encoder state
+ * \param x             x-coordinate of the CU in luma pixels
+ * \param y             y-coordinate of the CU in luma pixels
+ * \param depth         depth in the CU tree
+ * \param mode_luma     intra mode for luma, or -1 to skip luma recon
+ * \param mode_chroma   intra mode for chroma, or -1 to skip chroma recon
+ * \param cur_cu        pointer to the CU, or NULL to fetch CU from LCU
+ * \param lcu           containing LCU
+ */
+void kvz_intra_recon_cu(
   encoder_state_t *const state,
   int x,
   int y,
   int depth,
-  int8_t intra_mode,
+  int8_t mode_luma,
+  int8_t mode_chroma,
   cu_info_t *cur_cu,
   lcu_t *lcu)
 {
   const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };
   const int8_t width = LCU_WIDTH >> depth;
-  const int8_t width_c = (depth == MAX_PU_DEPTH ? width : width / 2);
-
   if (cur_cu == NULL) {
     cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
   }
 
   if (depth == 0 || cur_cu->tr_depth > depth) {
-    int offset = width / 2;
-
-    kvz_intra_recon_lcu_chroma(state, x,          y,          depth+1, intra_mode, NULL, lcu);
-    kvz_intra_recon_lcu_chroma(state, x + offset, y,          depth+1, intra_mode, NULL, lcu);
-    kvz_intra_recon_lcu_chroma(state, x,          y + offset, depth+1, intra_mode, NULL, lcu);
-    kvz_intra_recon_lcu_chroma(state, x + offset, y + offset, depth+1, intra_mode, NULL, lcu);
-
-    if (depth <= MAX_DEPTH) {
-      uint16_t child_cbfs[3] = {
-        LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y         )->cbf,
-        LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset)->cbf,
-        LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf,
-      };
+    const int offset = width / 2;
+    const int32_t x2 = x + offset;
+    const int32_t y2 = y + offset;
+
+    kvz_intra_recon_cu(state, x,  y,  depth + 1, mode_luma, mode_chroma, NULL, lcu);
+    kvz_intra_recon_cu(state, x2, y,  depth + 1, mode_luma, mode_chroma, NULL, lcu);
+    kvz_intra_recon_cu(state, x,  y2, depth + 1, mode_luma, mode_chroma, NULL, lcu);
+    kvz_intra_recon_cu(state, x2, y2, depth + 1, mode_luma, mode_chroma, NULL, lcu);
+
+    // Propagate coded block flags from child CUs to parent CU.
+    uint16_t child_cbfs[3] = {
+      LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y         )->cbf,
+      LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset)->cbf,
+      LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf,
+    };
+
+    if (mode_luma != -1 && depth < MAX_DEPTH) {
+      cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_Y);
+    }
+    if (mode_chroma != -1 && depth <= MAX_DEPTH) {
       cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_U);
       cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_V);
     }
-    return;
-  }
-
-  if (!(x & 4 || y & 4)) {
-    const int_fast8_t log2_width_c = kvz_g_convert_to_bit[width_c] + 2;
-    const vector2d_t luma_px = { x, y };
-    const vector2d_t pic_px = { state->tile->frame->width, state->tile->frame->height };
-
-    // Intra predict U-plane and put the result in lcu buffer.
-    {
-      kvz_intra_references refs;
-      kvz_intra_build_reference(log2_width_c, COLOR_U, &luma_px, &pic_px, lcu, &refs);
-
-      kvz_pixel pred[32 * 32];
-      kvz_intra_predict(&refs, log2_width_c, intra_mode, COLOR_U, pred, false);
-
-      kvz_pixel *pu_in_lcu = &lcu->rec.u[lcu_px.x / 2 + (lcu_px.y * LCU_WIDTH) / 4];
-      kvz_pixels_blit(pred, pu_in_lcu, width_c, width_c, width_c, LCU_WIDTH_C);
+  } else {
+    const bool has_luma = mode_luma != -1;
+    const bool has_chroma = mode_chroma != -1 && x % 8 == 0 && y % 8 == 0;
+    // Process a leaf TU.
+    if (has_luma) {
+      intra_recon_tb_leaf(state, x, y, depth, mode_luma, lcu, COLOR_Y);
     }
-
-    // Intra predict V-plane and put the result in lcu buffer.
-    {
-      kvz_intra_references refs;
-      kvz_intra_build_reference(log2_width_c, COLOR_V, &luma_px, &pic_px, lcu, &refs);
-      
-      kvz_pixel pred[32 * 32];
-      kvz_intra_predict(&refs, log2_width_c, intra_mode, COLOR_V, pred, false);
-
-      kvz_pixel *pu_in_lcu = &lcu->rec.v[lcu_px.x / 2 + (lcu_px.y * LCU_WIDTH) / 4];
-      kvz_pixels_blit(pred, pu_in_lcu, width_c, width_c, width_c, LCU_WIDTH_C);
+    if (has_chroma) {
+      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, lcu, COLOR_U);
+      intra_recon_tb_leaf(state, x, y, depth, mode_chroma, lcu, COLOR_V);
     }
 
-    kvz_quantize_lcu_chroma_residual(state, x, y, depth, cur_cu, lcu);
+    kvz_quantize_lcu_residual(state, has_luma, has_chroma, x, y, depth, cur_cu, lcu);
   }
 }

kvazaar-1.1.0.tar.gz/src/intra.h -> kvazaar-1.2.0.tar.gz/src/intra.h Changed

@@ -62,6 +62,26 @@
   const cu_info_t *const left_pu,
   const cu_info_t *const above_pu);
 
+#if KVZ_SEL_ENCRYPTION
+/**
+* \brief Function for deriving intra luma predictions with encryption
+* \param x          x-coordinate of the PU in pixels
+* \param y          y-coordinate of the PU in pixels
+* \param preds      output buffer for 3 predictions
+* \param cur_pu     PU to check
+* \param left_pu    PU to the left of cur_pu
+* \param above_pu   PU above cur_pu
+* \returns          1 if predictions are found, otherwise 0
+*/
+int8_t kvz_intra_get_dir_luma_predictor_encry(
+const uint32_t x,
+const uint32_t y,
+int8_t *preds,
+const cu_info_t *const cur_pu,
+const cu_info_t *const left_pu,
+const cu_info_t *const above_pu);
+#endif
+
 /**
 * \brief Generage angular predictions.
 * \param width    Width in pixels, range 4..32.
@@ -97,27 +117,13 @@
   kvz_pixel *dst,
   bool filter_boundary);
 
-/**
- * \brief Do a full intra prediction cycle on a CU in lcu for luma.
- */
-void kvz_intra_recon_lcu_luma(
-  encoder_state_t *const state,
-  int x,
-  int y,
-  int depth,
-  int8_t intra_mode,
-  cu_info_t *cur_cu,
-  lcu_t *lcu);
-
-/**
-* \brief Do a full intra prediction cycle on a CU in lcu for chroma.
-*/
-void kvz_intra_recon_lcu_chroma(
+void kvz_intra_recon_cu(
   encoder_state_t *const state,
   int x,
   int y,
   int depth,
-  int8_t intra_mode,
+  int8_t mode_luma,
+  int8_t mode_chroma,
   cu_info_t *cur_cu,
   lcu_t *lcu);

kvazaar-1.1.0.tar.gz/src/kvazaar.c -> kvazaar-1.2.0.tar.gz/src/kvazaar.c Changed

@@ -43,7 +43,21 @@
 static void kvazaar_close(kvz_encoder *encoder)
 {
   if (encoder) {
+    // The threadqueue must be stopped before freeing states.
+    if (encoder->control) {
+      kvz_threadqueue_stop(encoder->control->threadqueue);
+    }
+
     if (encoder->states) {
+      // Flush input frame buffer.
+      kvz_picture *pic = NULL;
+      while ((pic = kvz_encoder_feed_frame(&encoder->input_buffer,
+                                           &encoder->states[0],
+                                           NULL)) != NULL) {
+        kvz_image_free(pic);
+        pic = NULL;
+      }
+
       for (unsigned i = 0; i < encoder->num_encoder_states; ++i) {
         kvz_encoder_state_finalize(&encoder->states[i]);
       }
@@ -127,7 +141,20 @@
   info->qp = state->frame->QP;
   info->nal_unit_type = state->frame->pictype;
   info->slice_type = state->frame->slicetype;
-  kvz_encoder_get_ref_lists(state, info->ref_list_len, info->ref_list);
+
+  memset(info->ref_list[0], 0, 16);
+  memset(info->ref_list[1], 0, 16);
+
+  for (size_t i = 0; i < state->frame->ref_LX_size[0]; i++) {
+    info->ref_list[0][i] = state->frame->ref->pocs[state->frame->ref_LX[0][i]];
+  }
+
+  for (size_t i = 0; i < state->frame->ref_LX_size[1]; i++) {
+    info->ref_list[1][i] = state->frame->ref->pocs[state->frame->ref_LX[1][i]];
+  }
+
+  info->ref_list_len[0] = state->frame->ref_LX_size[0];
+  info->ref_list_len[1] = state->frame->ref_LX_size[1];
 }
 
 
@@ -244,7 +271,7 @@
     kvz_threadqueue_waitfor(enc->control->threadqueue, output_state->tqj_bitstream_written);
     // The job pointer must be set to NULL here since it won't be usable after
     // the next frame is done.
-    output_state->tqj_bitstream_written = NULL;
+    kvz_threadqueue_free_job(&output_state->tqj_bitstream_written);
 
     // Get stream length before taking chunks since that clears the stream.
     if (len_out) *len_out = kvz_bitstream_tell(&output_state->stream) / 8;

kvazaar-1.1.0.tar.gz/src/kvazaar.h -> kvazaar-1.2.0.tar.gz/src/kvazaar.h Changed

@@ -149,7 +149,8 @@
   KVZ_CRYPTO_MV_SIGNS = (1 << 1),
   KVZ_CRYPTO_TRANSF_COEFFS = (1 << 2),
   KVZ_CRYPTO_TRANSF_COEFF_SIGNS = (1 << 3),
-  KVZ_CRYPTO_ON = (1 << 4) - 1,
+  KVZ_CRYPTO_INTRA_MODE = (1 << 4),
+  KVZ_CRYPTO_ON = (1 << 5) - 1,
 };
 
 /**
@@ -198,6 +199,13 @@
   KVZ_SLICES_WPP   = (1 << 1), /*!< \brief Put each row in a slice. */
 };
 
+enum kvz_sao {
+  KVZ_SAO_OFF = 0,
+  KVZ_SAO_EDGE = 1,
+  KVZ_SAO_BAND = 2,
+  KVZ_SAO_FULL = 3
+};
+
 // Map from input format to chroma format.
 #define KVZ_FORMAT2CSP(format) ((enum kvz_chroma_format)"\0\1\2\3"[format])
 
@@ -245,7 +253,7 @@
   int32_t framerate_num; /*!< \brief Framerate numerator */
   int32_t framerate_denom; /*!< \brief Framerate denominator */
   int32_t deblock_enable; /*!< \brief Flag to enable deblocking filter */
-  int32_t sao_enable;     /*!< \brief Flag to enable sample adaptive offset filter */
+  enum kvz_sao sao_type;     /*!< \brief Flag to enable sample adaptive offset filter */
   int32_t rdoq_enable;    /*!< \brief Flag to enable RD optimized quantization. */
   int32_t signhide_enable;   /*!< \brief Flag to enable sign hiding. */
   int32_t smp_enable;   /*!< \brief Flag to enable SMP blocks. */
@@ -311,6 +319,7 @@
   enum kvz_cu_split_termination cu_split_termination; /*!< \since 3.8.0 \brief Mode of cu split termination. */
 
   enum kvz_crypto_features crypto_features; /*!< \since 3.7.0 */
+  uint8_t *optional_key;
 
   enum kvz_me_early_termination me_early_termination; /*!< \since 3.8.0 \brief Mode of me early termination. */
 
@@ -333,10 +342,15 @@
   struct {
     int32_t width;
     int32_t height;
-    uint8_t *dqps;
+    int8_t *dqps;
   } roi; /*!< \since 3.14.0 \brief Map of delta QPs for region of interest coding. */
 
   unsigned slices; /*!< \since 3.15.0 \brief How to map slices to frame. */
+
+  /**
+   * \brief Use adaptive QP for 360 video with equirectangular projection.
+   */
+  int32_t erp_aqp;
 } kvz_config;
 
 /**

kvazaar-1.1.0.tar.gz/src/rate_control.c -> kvazaar-1.2.0.tar.gz/src/rate_control.c Changed

@@ -170,7 +170,7 @@
 static int8_t lambda_to_qp(const double lambda)
 {
   const int8_t qp = 4.2005 * log(lambda) + 13.7223 + 0.5;
-  return CLIP(0, 51, qp);
+  return CLIP_TO_QP(qp);
 }
 
 static double qp_to_lamba(encoder_state_t * const state, int qp)
@@ -240,10 +240,10 @@
     kvz_gop_config const * const gop = &ctrl->cfg.gop[state->frame->gop_offset];
     const int gop_len = ctrl->cfg.gop_len;
 
-    state->frame->QP = ctrl->cfg.qp;
-
     if (gop_len > 0 && state->frame->slicetype != KVZ_SLICE_I) {
-      state->frame->QP += gop->qp_offset;
+      state->frame->QP = CLIP_TO_QP(ctrl->cfg.qp + gop->qp_offset);
+    } else {
+      state->frame->QP = ctrl->cfg.qp;
     }
 
     state->frame->lambda = qp_to_lamba(state, state->frame->QP);
@@ -291,7 +291,7 @@
     };
     int roi_index = roi.x + roi.y * ctrl->cfg.roi.width;
     int dqp = ctrl->cfg.roi.dqps[roi_index];
-    state->qp = state->frame->QP + dqp;
+    state->qp = CLIP_TO_QP(state->frame->QP + dqp);
     state->lambda = qp_to_lamba(state, state->qp);
     state->lambda_sqrt = sqrt(state->frame->lambda);

kvazaar-1.1.0.tar.gz/src/rdo.c -> kvazaar-1.2.0.tar.gz/src/rdo.c Changed

@@ -33,12 +33,16 @@
 #include "tables.h"
 #include "transform.h"
 
+#include "strategies/strategies-quant.h"
+
 
 #define QUANT_SHIFT          14
 #define SCAN_SET_SIZE        16
 #define LOG2_SCAN_SET_SIZE    4
 #define SBH_THRESHOLD         4
 
+static const double COEFF_SUM_MULTIPLIER = 1.9;
+
 const uint32_t kvz_g_go_rice_range[5] = { 7, 14, 26, 46, 78 };
 const uint32_t kvz_g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 };
 
@@ -140,48 +144,82 @@
 };
 
 
-/** Calculate actual (or really close to actual) bitcost for coding coefficients
+/**
+ * \brief Calculate actual (or really close to actual) bitcost for coding
+ * coefficients.
+ *
  * \param coeff coefficient array
  * \param width coeff block width
  * \param type data type (0 == luma)
+ *
  * \returns bits needed to code input coefficients
  */
-int32_t kvz_get_coeff_cost(const encoder_state_t * const state, coeff_t *coeff, int32_t width, int32_t type, int8_t scan_mode)
+static INLINE uint32_t get_coeff_cabac_cost(
+    const encoder_state_t * const state,
+    const coeff_t *coeff,
+    int32_t width,
+    int32_t type,
+    int8_t scan_mode)
 {
-  int32_t cost = 0;
-  int i;
-  int found = 0;
-  encoder_state_t state_copy;
-
   // Make sure there are coeffs present
-  for(i = 0; i < width*width; i++) {
+  bool found = false;
+  for (int i = 0; i < width*width; i++) {
     if (coeff[i] != 0) {
       found = 1;
       break;
     }
   }
+  if (!found) return 0;
 
-  if(!found) return 0;
-
-  // Store cabac state and contexts
-  memcpy(&state_copy,state,sizeof(encoder_state_t));
+  // Take a copy of the CABAC so that we don't overwrite the contexts when
+  // counting the bits.
+  cabac_data_t cabac_copy;
+  memcpy(&cabac_copy, &state->cabac, sizeof(cabac_copy));
 
   // Clear bytes and bits and set mode to "count"
-  state_copy.cabac.only_count = 1;
-  state_copy.cabac.num_buffered_bytes = 0;
-  state_copy.cabac.bits_left = 23;
+  cabac_copy.only_count = 1;
+  cabac_copy.num_buffered_bytes = 0;
+  cabac_copy.bits_left = 23;
+
+  // Execute the coding function.
+  // It is safe to drop the const modifier since state won't be modified
+  // when cabac.only_count is set.
+  kvz_encode_coeff_nxn((encoder_state_t*) state,
+                       &cabac_copy,
+                       coeff,
+                       width,
+                       type,
+                       scan_mode,
+                       0);
+
+  return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3);
+}
 
-  // Execute the coding function
-  kvz_encode_coeff_nxn(&state_copy, coeff, width, type, scan_mode, 0);
 
-  // Store bitcost before restoring cabac
-  cost = (23-state_copy.cabac.bits_left) + (state_copy.cabac.num_buffered_bytes << 3);
+/**
+ * \brief Estimate bitcost for coding coefficients.
+ *
+ * \param coeff   coefficient array
+ * \param width   coeff block width
+ * \param type    data type (0 == luma)
+ *
+ * \returns       number of bits needed to code coefficients
+ */
+uint32_t kvz_get_coeff_cost(const encoder_state_t * const state,
+                            const coeff_t *coeff,
+                            int32_t width,
+                            int32_t type,
+                            int8_t scan_mode)
+{
+  if (state->encoder_control->cfg.rdo > 0) {
+    return get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
 
-  return cost;
+  } else {
+    return COEFF_SUM_MULTIPLIER * kvz_coeff_abs_sum(coeff, width * width) + 0.5;
+  }
 }
 
 
-
 #define COEF_REMAIN_BIN_REDUCTION 3
 /** Calculates the cost for specific absolute transform level
  * \param abs_level scaled quantized level
@@ -191,7 +229,7 @@
  * \returns cost of given absolute transform level
  * From HM 12.0
  */
-int32_t kvz_get_ic_rate(encoder_state_t * const state,
+INLINE int32_t kvz_get_ic_rate(encoder_state_t * const state,
                     uint32_t abs_level,
                     uint16_t ctx_num_one,
                     uint16_t ctx_num_abs,
@@ -211,14 +249,14 @@
     int32_t length;
     if (symbol < (COEF_REMAIN_BIN_REDUCTION << abs_go_rice)) {
       length = symbol>>abs_go_rice;
-      rate += (length+1+abs_go_rice) << CTX_FRAC_BITS;
+      rate += (length+1+abs_go_rice) * (1 << CTX_FRAC_BITS);
     } else {
       length = abs_go_rice;
       symbol  = symbol - ( COEF_REMAIN_BIN_REDUCTION << abs_go_rice);
       while (symbol >= (1<<length)) {
         symbol -=  (1<<(length++));
       }
-      rate += (COEF_REMAIN_BIN_REDUCTION+length+1-abs_go_rice+length) << CTX_FRAC_BITS;
+      rate += (COEF_REMAIN_BIN_REDUCTION+length+1-abs_go_rice+length) * (1 << CTX_FRAC_BITS);
     }
     if (c1_idx < C1FLAG_NUMBER) {
       rate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],1);
@@ -255,7 +293,7 @@
  * This method calculates the best quantized transform level for a given scan position.
  * From HM 12.0
  */
-uint32_t kvz_get_coded_level ( encoder_state_t * const state, double *coded_cost, double *coded_cost0, double *coded_cost_sig,
+INLINE uint32_t kvz_get_coded_level ( encoder_state_t * const state, double *coded_cost, double *coded_cost0, double *coded_cost_sig,
                            int32_t level_double, uint32_t max_abs_level,
                            uint16_t ctx_num_sig, uint16_t ctx_num_one, uint16_t ctx_num_abs,
                            uint16_t abs_go_rice,
@@ -283,7 +321,7 @@
 
   min_abs_level    = ( max_abs_level > 1 ? max_abs_level - 1 : 1 );
   for (abs_level = max_abs_level; abs_level >= min_abs_level ; abs_level-- ) {
-    double err       = (double)(level_double - ( abs_level << q_bits ) );
+    double err       = (double)(level_double - ( abs_level * (1 << q_bits) ) );
     double cur_cost  = err * err * temp + state->lambda *
                        kvz_get_ic_rate( state, abs_level, ctx_num_one, ctx_num_abs,
                                     abs_go_rice, c1_idx, c2_idx, type);
@@ -450,8 +488,8 @@
           dec_bits -= 4 * CTX_FRAC_ONE_BIT;
         }
 
-        inc_bits = -quant_cost_in_bits + (inc_bits << PRECISION_INC);
-        dec_bits = quant_cost_in_bits + (dec_bits << PRECISION_INC);
+        inc_bits = -quant_cost_in_bits + inc_bits * (1 << PRECISION_INC);
+        dec_bits = quant_cost_in_bits + dec_bits * (1 << PRECISION_INC);
 
         if (inc_bits < dec_bits) {
           current.change = 1;
@@ -472,7 +510,7 @@
 
         // Add sign bit, other bits and sig_coeff goes to one.
         int bits = CTX_FRAC_ONE_BIT + sh_rates->inc[current.pos] + sh_rates->sig_coeff_inc[current.pos];
-        current.cost = -llabs(quant_cost_in_bits) + (bits << PRECISION_INC);
+        current.cost = -llabs(quant_cost_in_bits) + bits * (1 << PRECISION_INC);
         current.change = 1;
 
         if (coeff_scan < first_nz_scan) {
@@ -558,10 +596,10 @@
   // Explicitly tell the only possible numbers of elements to be zeroed.
   // Hope the compiler is able to utilize this information.
   switch (cg_num) {
-    case  1: memset(sig_coeffgroup_flag, 0,  1 * sizeof(sig_coeffgroup_flag[0])); break;
-    case  4: memset(sig_coeffgroup_flag, 0,  4 * sizeof(sig_coeffgroup_flag[0])); break;
-    case 16: memset(sig_coeffgroup_flag, 0, 16 * sizeof(sig_coeffgroup_flag[0])); break;
-    case 64: memset(sig_coeffgroup_flag, 0, 64 * sizeof(sig_coeffgroup_flag[0])); break;
+    case  1: FILL_ARRAY(sig_coeffgroup_flag, 0,  1); break;
+    case  4: FILL_ARRAY(sig_coeffgroup_flag, 0,  4); break;
+    case 16: FILL_ARRAY(sig_coeffgroup_flag, 0, 16); break;
+    case 64: FILL_ARRAY(sig_coeffgroup_flag, 0, 64); break;
     default: assert(0 && "There should be 1, 4, 16 or 64 coefficient groups");
   }
 
@@ -658,7 +696,7 @@
       }
 
       if (encoder->cfg.signhide_enable) {
-        sh_rates.quant_delta[blkpos] = (level_double - (level << q_bits)) >> (q_bits - 8);
+        sh_rates.quant_delta[blkpos] = (level_double - level * (1 << q_bits)) >> (q_bits - 8);
         if (level > 0) {
           int32_t rate_now  = kvz_get_ic_rate(state, level, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type);
           int32_t rate_up   = kvz_get_ic_rate(state, level + 1, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type);
@@ -845,7 +883,9 @@
 * \returns int
 * Calculates cost of actual motion vectors using CABAC coding
 */
-uint32_t kvz_get_mvd_coding_cost_cabac(encoder_state_t * const state, vector2d_t *mvd, const cabac_data_t* real_cabac) 
+uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state,
+                                       vector2d_t *mvd,
+                                       const cabac_data_t* real_cabac)
 {
   uint32_t bitcost = 0;
   const int32_t mvd_hor = mvd->x;
@@ -872,13 +912,15 @@
   }
   if (hor_abs_gr0) {
     if (mvd_hor_abs > 1) {
-      kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1);
+      // It is safe to drop const here because cabac->only_count is set.
+      kvz_cabac_write_ep_ex_golomb((encoder_state_t*)state, cabac, mvd_hor_abs - 2, 1);
     }
     CABAC_BIN_EP(cabac, (mvd_hor > 0) ? 0 : 1, "mvd_sign_flag_hor");
   }
   if (ver_abs_gr0) {
     if (mvd_ver_abs > 1) {
-      kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1);
+      // It is safe to drop const here because cabac->only_count is set.
+      kvz_cabac_write_ep_ex_golomb((encoder_state_t*)state, cabac, mvd_ver_abs - 2, 1);
     }
     CABAC_BIN_EP(cabac, (mvd_ver > 0) ? 0 : 1, "mvd_sign_flag_ver");
   }
@@ -891,10 +933,16 @@
 * \returns int
 * Calculates Motion Vector cost and related costs using CABAC coding
 */
-int kvz_calc_mvd_cost_cabac(encoder_state_t * const state, int x, int y, int mv_shift,
-  int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
-  int16_t num_cand, int32_t ref_idx, uint32_t *bitcost) {
-
+uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
+                                 int x,
+                                 int y,
+                                 int mv_shift,
+                                 int16_t mv_cand[2][2],
+                                 inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
+                                 int16_t num_cand,
+                                 int32_t ref_idx,
+                                 uint32_t *bitcost)
+{
   cabac_data_t state_cabac_copy;
   cabac_data_t* cabac;
   uint32_t merge_idx;
@@ -903,15 +951,18 @@
   int8_t merged = 0;
   int8_t cur_mv_cand = 0;
 
-  x <<= mv_shift;
-  y <<= mv_shift;
+  x *= 1 << mv_shift;
+  y *= 1 << mv_shift;
 
   // Check every candidate to find a match
   for (merge_idx = 0; merge_idx < (uint32_t)num_cand; merge_idx++) {
     if (merge_cand[merge_idx].dir == 3) continue;
     if (merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][0] == x &&
       merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][1] == y &&
-      merge_cand[merge_idx].ref[merge_cand[merge_idx].dir - 1] == ref_idx) {
+      state->frame->ref_LX[merge_cand[merge_idx].dir - 1][
+        merge_cand[merge_idx].ref[merge_cand[merge_idx].dir - 1]
+      ] == ref_idx)
+    {
       merged = 1;
       break;
     }
@@ -1030,7 +1081,8 @@
 
           if (hor_abs_gr0) {
             if (mvd_hor_abs > 1) {
-              kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1);
+              // It is safe to drop const because cabac->only_count is set.
+              kvz_cabac_write_ep_ex_golomb((encoder_state_t*)state, cabac, mvd_hor_abs - 2, 1);
             }
 
             CABAC_BIN_EP(cabac, (mvd_hor > 0) ? 0 : 1, "mvd_sign_flag_hor");
@@ -1038,7 +1090,8 @@
 
           if (ver_abs_gr0) {
             if (mvd_ver_abs > 1) {
-              kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1);
+              // It is safe to drop const because cabac->only_count is set.
+              kvz_cabac_write_ep_ex_golomb((encoder_state_t*)state, cabac, mvd_ver_abs - 2, 1);
             }
 
             CABAC_BIN_EP(cabac, (mvd_ver > 0) ? 0 : 1, "mvd_sign_flag_ver");
@@ -1056,5 +1109,5 @@
   *bitcost = (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3);
 
   // Store bitcost before restoring cabac
-  return *bitcost * (int32_t)(state->lambda_sqrt + 0.5);
+  return *bitcost * (uint32_t)(state->lambda_sqrt + 0.5);
 }

kvazaar-1.1.0.tar.gz/src/rdo.h -> kvazaar-1.2.0.tar.gz/src/rdo.h Changed

@@ -39,7 +39,11 @@
 void  kvz_rdoq(encoder_state_t *state, coeff_t *coef, coeff_t *dest_coeff, int32_t width,
            int32_t height, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth);
 
-int32_t kvz_get_coeff_cost(const encoder_state_t *state, coeff_t *coeff, int32_t width, int32_t type, int8_t scan_mode);
+uint32_t kvz_get_coeff_cost(const encoder_state_t *state,
+                            const coeff_t *coeff,
+                            int32_t width,
+                            int32_t type,
+                            int8_t scan_mode);
 
 int32_t kvz_get_ic_rate(encoder_state_t *state, uint32_t abs_level, uint16_t ctx_num_one, uint16_t ctx_num_abs,
                     uint16_t abs_go_rice, uint32_t c1_idx, uint32_t c2_idx, int8_t type);
@@ -52,7 +56,9 @@
 
 kvz_mvd_cost_func kvz_calc_mvd_cost_cabac;
 
-uint32_t kvz_get_mvd_coding_cost_cabac(encoder_state_t * const state, vector2d_t *mvd, const cabac_data_t* cabac);
+uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state,
+                                       vector2d_t *mvd,
+                                       const cabac_data_t* cabac);
 
 // Number of fixed point fractional bits used in the fractional bit table.
 #define CTX_FRAC_BITS 15

kvazaar-1.1.0.tar.gz/src/sao.c -> kvazaar-1.2.0.tar.gz/src/sao.c Changed

@@ -262,182 +262,81 @@
 
 
 /**
- * \brief Calculate dimensions of the buffer used by sao reconstruction.
-
- * \param pic  Picture.
- * \param sao  Sao parameters.
- * \param rec  Top-left corner of the LCU
+ * \brief Reconstruct SAO.
+ *
+ * \param encoder         encoder state
+ * \param buffer          Buffer containing the deblocked input pixels. The
+ *                        area to filter starts at index 0.
+ * \param stride          stride of buffer
+ * \param frame_x         x-coordinate of the top-left corner in pixels
+ * \param frame_y         y-coordinate of the top-left corner in pixels
+ * \param width           width of the area to filter
+ * \param height          height of the area to filter
+ * \param sao             SAO information
+ * \param color           color plane index
  */
-static void sao_calc_band_block_dims(const videoframe_t *frame, color_t color_i,
-                                     vector2d_t *rec, vector2d_t *block)
+void kvz_sao_reconstruct(const encoder_state_t *state,
+                         const kvz_pixel *buffer,
+                         int stride,
+                         int frame_x,
+                         int frame_y,
+                         int width,
+                         int height,
+                         const sao_info_t *sao,
+                         color_t color)
 {
-  const int is_chroma = (color_i != COLOR_Y ? 1 : 0);
-  int width = frame->width >> is_chroma;
-  int height = frame->height >> is_chroma;
-  int block_width = LCU_WIDTH >> is_chroma;
-
+  const encoder_control_t *const ctrl = state->encoder_control;
+  videoframe_t *const frame = state->tile->frame;
+  const int shift = color == COLOR_Y ? 0 : 1;
 
-  // Handle right and bottom, taking care of non-LCU sized CUs.
-  if (rec->y + block_width >= height) {
-    if (rec->y + block_width >= height) {
-      block->y = height - rec->y;
-    }
-  }
-  if (rec->x + block_width >= width) {
-    if (rec->x + block_width > width) {
-      block->x = width - rec->x;
-    }
-  }
+  const int frame_width = frame->width >> shift;
+  const int frame_height = frame->height >> shift;
+  const int frame_stride = frame->rec->stride >> shift;
+  kvz_pixel *output = &frame->rec->data[color][frame_x + frame_y * frame_stride];
 
-  rec->x = 0; rec->y = 0;
-}
+  if (sao->type == SAO_TYPE_EDGE) {
+    const vector2d_t *offset = g_sao_edge_offsets[sao->eo_class];
 
-/**
- * \brief Calculate dimensions of the buffer used by sao reconstruction.
- *
- * This function calculates 4 vectors that can be used to make the temporary
- * buffers required by sao_reconstruct_color.
- *
- * Vector block is the area affected by sao. Vectors tr and br are top-left
- * margin and bottom-right margin, which contain pixels that are not modified
- * by the reconstruction of this LCU but are needed by the reconstruction.
- * Vector rec is the offset from the CU to the required pixel area.
- *
- * The margins are always either 0 or 1, depending on the direction of the
- * edge offset class.
- *
- * This also takes into account borders of the picture and non-LCU sized
- * CU's at the bottom and right of the picture.
- *
- * \ CU + rec
- *  +------+
- *  |\ tl  |
- *  | +--+ |
- *  | |\ block
- *  | | \| |
- *  | +--+ |
- *  |     \ br
- *  +------+
- *
- * \param pic  Picture.
- * \param sao  Sao parameters.
- * \param rec  Top-left corner of the LCU, modified to be top-left corner of
- */
-static void sao_calc_edge_block_dims(const videoframe_t * const frame, color_t color_i,
-                                     const sao_info_t *sao, vector2d_t *rec,
-                                     vector2d_t *tl, vector2d_t *br,
-                                     vector2d_t *block)
-{
-  vector2d_t a_ofs = g_sao_edge_offsets[sao->eo_class][0];
-  vector2d_t b_ofs = g_sao_edge_offsets[sao->eo_class][1];
-  const int is_chroma = (color_i != COLOR_Y ? 1 : 0);
-  int width = frame->width >> is_chroma;
-  int height = frame->height >> is_chroma;
-  int block_width = LCU_WIDTH >> is_chroma;
-
-  // Handle top and left.
-  if (rec->y == 0) {
-    tl->y = 0;
-    if (a_ofs.y == -1 || b_ofs.y == -1) {
-      block->y -= 1;
-      tl->y += 1;
+    if (frame_x + width + offset[0].x > frame_width ||
+        frame_x + width + offset[1].x > frame_width)
+    {
+      // Nothing to do for the rightmost column.
+      width -= 1;
     }
-  }
-  if (rec->x == 0) {
-    tl->x = 0;
-    if (a_ofs.x == -1 || b_ofs.x == -1) {
-      block->x -= 1;
-      tl->x += 1;
+    if (frame_x + offset[0].x < 0 || frame_x + offset[1].x < 0) {
+      // Nothing to do for the leftmost column.
+      buffer += 1;
+      output += 1;
+      width -= 1;
     }
-  }
-
-  // Handle right and bottom, taking care of non-LCU sized CUs.
-  if (rec->y + block_width >= height) {
-    br->y = 0;
-    block->y -= block_width + rec->y - height;
-    if (a_ofs.y == 1 || b_ofs.y == 1) {
-      block->y -= 1;
-      br->y += 1;
+    if (frame_y + height + offset[0].y > frame_height ||
+        frame_y + height + offset[1].y > frame_height)
+    {
+      // Nothing to do for the bottommost row.
+      height -= 1;
     }
-  }
-  if (rec->x + block_width >= width) {
-    br->x = 0;
-    block->x -= block_width + rec->x - width;
-    if (a_ofs.x == 1 || b_ofs.x == 1) {
-      block->x -= 1;
-      br->x += 1;
+    if (frame_y + offset[0].y < 0 || frame_y + offset[1].y < 0) {
+      // Nothing to do for the topmost row.
+      buffer += stride;
+      output += frame_stride;
+      height -= 1;
     }
   }
 
-  rec->y = (rec->y == 0 ? 0 : -1);
-  rec->x = (rec->x == 0 ? 0 : -1);
-}
-
-void kvz_sao_reconstruct(const encoder_control_t * const encoder, videoframe_t * frame, const kvz_pixel *old_rec,
-                     unsigned x_ctb, unsigned y_ctb,
-                     const sao_info_t *sao, color_t color_i)
-{
-  const int is_chroma = (color_i != COLOR_Y ? 1 : 0);
-  const int pic_stride = frame->width >> is_chroma;
-  const int lcu_stride = LCU_WIDTH >> is_chroma;
-  const int buf_stride = lcu_stride + 2;
-
-  kvz_pixel *recdata = frame->rec->data[color_i];
-  kvz_pixel buf_rec[(LCU_WIDTH + 2) * (LCU_WIDTH + 2)];
-  kvz_pixel new_rec[LCU_WIDTH * LCU_WIDTH];
-  // Calling CU_TO_PIXEL with depth 1 is the same as using block size of 32.
-  kvz_pixel *lcu_rec = &recdata[CU_TO_PIXEL(x_ctb, y_ctb, is_chroma, frame->rec->stride>>is_chroma)];
-  const kvz_pixel *old_lcu_rec = &old_rec[CU_TO_PIXEL(x_ctb, y_ctb, is_chroma, pic_stride)];
-
-  vector2d_t ofs;
-  vector2d_t tl = { 1, 1 };
-  vector2d_t br = { 1, 1 };
-  vector2d_t block;
-
-  if (sao->type == SAO_TYPE_NONE) {
-    return;
-  }
-
-  ofs.x = x_ctb * lcu_stride;
-  ofs.y = y_ctb * lcu_stride;
-  block.x = lcu_stride;
-  block.y = lcu_stride;
-  if (sao->type == SAO_TYPE_BAND) {
-    tl.x = 0; tl.y = 0;
-    br.x = 0; br.y = 0;
-    sao_calc_band_block_dims(frame, color_i, &ofs, &block);
-  }
-  else {
-    sao_calc_edge_block_dims(frame, color_i, sao, &ofs, &tl, &br, &block);
+  if (sao->type != SAO_TYPE_NONE) {
+    kvz_sao_reconstruct_color(ctrl,
+                              buffer,
+                              output,
+                              sao,
+                              stride,
+                              frame_stride,
+                              width,
+                              height,
+                              color);
   }
-  
-  assert(ofs.x + tl.x + block.x + br.x <= frame->width);
-  assert(ofs.y + tl.y + block.y + br.y <= frame->height);
-  
-  CHECKPOINT("ofs.x=%d ofs.y=%d tl.x=%d tl.y=%d block.x=%d block.y=%d br.x=%d br.y=%d", 
-             ofs.x, ofs.y, tl.x, tl.y, block.x, block.y, br.x, br.y);
-  
-  // Data to tmp buffer.
-  kvz_pixels_blit(&old_lcu_recofs.y * pic_stride + ofs.x,
-                      buf_rec,
-                      tl.x + block.x + br.x,
-                      tl.y + block.y + br.y,
-                      pic_stride, buf_stride);
-
-  kvz_sao_reconstruct_color(encoder, &buf_rectl.y * buf_stride + tl.x,
-                        &new_rec(ofs.y + tl.y) * lcu_stride + ofs.x + tl.x,
-                        sao,
-                        buf_stride, lcu_stride,
-                        block.x, block.y, color_i);
-
-  // Copy reconstructed block from tmp buffer to rec image.
-  kvz_pixels_blit(&new_rec(tl.y + ofs.y) * lcu_stride + (tl.x + ofs.x),
-                      &lcu_rec(tl.y + ofs.y) * (frame->rec->stride >> is_chroma) + (tl.x + ofs.x),
-                      block.x, block.y, lcu_stride, frame->rec->stride >> is_chroma);
 }
 
 
-
 static void sao_search_edge_sao(const encoder_state_t * const state, 
                                 const kvz_pixel * data, const kvz_pixel * recdata,
                                 int block_width, int block_height,
@@ -584,10 +483,8 @@
   band_sao.offsets[5] = 0;
   band_sao.eo_class = SAO_EO0;
 
-  sao_search_edge_sao(state, data, recdata, block_width, block_height, buf_cnt, &edge_sao, sao_top, sao_left);
-  sao_search_band_sao(state, data, recdata, block_width, block_height, buf_cnt, &band_sao, sao_top, sao_left);
-
-  {
+  if (state->encoder_control->cfg.sao_type & 1){
+    sao_search_edge_sao(state, data, recdata, block_width, block_height, buf_cnt, &edge_sao, sao_top, sao_left);
     float mode_bits = sao_mode_bits_edge(state, edge_sao.eo_class, edge_sao.offsets, sao_top, sao_left, buf_cnt);
     int ddistortion = (int)(mode_bits * state->lambda + 0.5);
     unsigned buf_i;
@@ -600,8 +497,12 @@
     
     edge_sao.ddistortion = ddistortion;
   }
+  else{
+    edge_sao.ddistortion = INT_MAX;
+  }
 
-  {
+  if (state->encoder_control->cfg.sao_type & 2){
+    sao_search_band_sao(state, data, recdata, block_width, block_height, buf_cnt, &band_sao, sao_top, sao_left);
     float mode_bits = sao_mode_bits_band(state, band_sao.band_position, band_sao.offsets, sao_top, sao_left, buf_cnt);
     int ddistortion = (int)(mode_bits * state->lambda + 0.5);
     unsigned buf_i;
@@ -614,6 +515,9 @@
     
     band_sao.ddistortion = ddistortion;
   }
+  else{
+    band_sao.ddistortion = INT_MAX;
+  }
 
   if (edge_sao.ddistortion <= band_sao.ddistortion) {
     *sao_out = edge_sao;
@@ -749,7 +653,8 @@
   int32_t merge_cost_chroma[3] = { INT32_MAX };
   sao_info_t *sao_luma = &frame->sao_luma[lcu_y * stride + lcu_x];
   sao_info_t *sao_chroma = NULL;
-  if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+  int enable_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
+  if (enable_chroma) {
     sao_chroma = &frame->sao_chroma[lcu_y * stride + lcu_x];
   }
 
@@ -758,13 +663,13 @@
   sao_info_t *sao_left_luma   = lcu_x != 0 ? &frame->sao_luma  lcu_y       * stride + lcu_x - 1 : NULL;
   sao_info_t *sao_top_chroma  = NULL;
   sao_info_t *sao_left_chroma = NULL;
-  if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+  if (enable_chroma) {
     if (lcu_y != 0) sao_top_chroma =  &frame->sao_chroma[(lcu_y - 1) * stride + lcu_x];
     if (lcu_x != 0) sao_left_chroma = &frame->sao_chroma[lcu_y       * stride + lcu_x - 1];
   }
 
   sao_search_luma(state, frame, lcu_x, lcu_y, sao_luma, sao_top_luma, sao_left_luma, merge_cost_luma);
-  if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+  if (enable_chroma) {
     sao_search_chroma(state, frame, lcu_x, lcu_y, sao_chroma, sao_top_chroma, sao_left_chroma, merge_cost_chroma);
   } else {
     merge_cost_chroma[0] = 0;
@@ -803,46 +708,3 @@
     CHECKPOINT_SAO_INFO("sao_chroma", *sao_chroma);
   }
 }
-
-void kvz_sao_reconstruct_frame(encoder_state_t * const state)
-{
-  vector2d_t lcu;
-  videoframe_t * const frame = state->tile->frame;
-
-  // These are needed because SAO needs the pre-SAO pixels form left and
-  // top LCUs. Single pixel wide buffers, like what kvz_search_lcu takes, would
-  // be enough though.
-  kvz_pixel *new_y_data = MALLOC(kvz_pixel, frame->rec->width * frame->rec->height);
-  kvz_pixels_blit(frame->rec->y, new_y_data, frame->rec->width, frame->rec->height, frame->rec->stride, frame->rec->width);
-  for (lcu.y = 0; lcu.y < frame->height_in_lcu; lcu.y++) {
-    for (lcu.x = 0; lcu.x < frame->width_in_lcu; lcu.x++) {
-      unsigned stride = frame->width_in_lcu;
-      sao_info_t *sao_luma = &frame->sao_lumalcu.y * stride + lcu.x;
-      
-      // sao_do_rdo(encoder, lcu.x, lcu.y, sao_luma, sao_chroma);
-      kvz_sao_reconstruct(state->encoder_control, frame, new_y_data, lcu.x, lcu.y, sao_luma, COLOR_Y);
-    }
-  }
-  free(new_y_data);
-
-  if (state->encoder_control->chroma_format != KVZ_CSP_400) {
-    kvz_pixel *new_u_data = MALLOC(kvz_pixel, (frame->rec->width * frame->rec->height) >> 2);
-    kvz_pixel *new_v_data = MALLOC(kvz_pixel, (frame->rec->width * frame->rec->height) >> 2);
-
-    kvz_pixels_blit(frame->rec->u, new_u_data, frame->rec->width / 2, frame->rec->height / 2, frame->rec->stride / 2, frame->rec->width / 2);
-    kvz_pixels_blit(frame->rec->v, new_v_data, frame->rec->width / 2, frame->rec->height / 2, frame->rec->stride / 2, frame->rec->width / 2);
-
-    for (lcu.y = 0; lcu.y < frame->height_in_lcu; lcu.y++) {
-      for (lcu.x = 0; lcu.x < frame->width_in_lcu; lcu.x++) {
-        unsigned stride = frame->width_in_lcu;
-        sao_info_t *sao_chroma = &frame->sao_chromalcu.y * stride + lcu.x;
-
-        kvz_sao_reconstruct(state->encoder_control, frame, new_u_data, lcu.x, lcu.y, sao_chroma, COLOR_U);
-        kvz_sao_reconstruct(state->encoder_control, frame, new_v_data, lcu.x, lcu.y, sao_chroma, COLOR_V);
-      }
-    }
-
-    free(new_u_data);
-    free(new_v_data);
-  }
-}

kvazaar-1.1.0.tar.gz/src/sao.h -> kvazaar-1.2.0.tar.gz/src/sao.h Changed

kvazaar-1.1.0.tar.gz/src/search.c -> kvazaar-1.2.0.tar.gz/src/search.c Changed

@@ -36,6 +36,7 @@
 #include "transform.h"
 #include "videoframe.h"
 #include "strategies/strategies-picture.h"
+#include "strategies/strategies-quant.h"
 
 
 #define IN_FRAME(x, y, width, height, block_width, block_height) \
@@ -43,11 +44,8 @@
   && (x) + (block_width) <= (width) \
   && (y) + (block_height) <= (height))
 
-// Cost treshold for doing intra search in inter frames with --rd=0.
-#ifndef INTRA_TRESHOLD
-# define INTRA_TRESHOLD 20
-#endif
-
+// Cost threshold for doing intra search in inter frames with --rd=0.
+static const int INTRA_THRESHOLD = 8;
 
 // Modify weight of luma SSD.
 #ifndef LUMA_MULT
@@ -58,216 +56,133 @@
 # define CHROMA_MULT 1.5
 #endif
 
-
-/**
- * Copy all non-reference CU data from depth+1 to depth.
- */
-static void work_tree_copy_up(int x_px, int y_px, int depth, lcu_t work_treeMAX_PU_DEPTH + 1)
+static INLINE void copy_cu_info(int x_local, int y_local, int width, lcu_t *from, lcu_t *to)
 {
-  assert(depth >= 0 && depth < MAX_PU_DEPTH);
-
-  // Copy non-reference CUs.
-  {
-    const int x_orig = SUB_SCU(x_px);
-    const int y_orig = SUB_SCU(y_px);
-    const int width_cu = LCU_WIDTH >> depth;
-    for (int y = y_orig; y < y_orig + width_cu; y += SCU_WIDTH) {
-      for (int x = x_orig; x < x_orig + width_cu; x += SCU_WIDTH) {
-        const cu_info_t *from_cu = LCU_GET_CU_AT_PX(&work_treedepth + 1, x, y);
-        cu_info_t *to_cu = LCU_GET_CU_AT_PX(&work_treedepth, x, y);
-        memcpy(to_cu, from_cu, sizeof(*to_cu));
-      }
-    }
-  }
-
-  // Copy reconstructed pixels.
-  {
-    const int x = SUB_SCU(x_px);
-    const int y = SUB_SCU(y_px);
-    const int width_px = LCU_WIDTH >> depth;
-    const int luma_index = x + y * LCU_WIDTH;
-    const int chroma_index = (x / 2) + (y / 2) * (LCU_WIDTH / 2);
-
-    const lcu_yuv_t *from = &work_treedepth + 1.rec;
-    lcu_yuv_t *to = &work_treedepth.rec;
-
-    const lcu_coeff_t *from_coeff = &work_treedepth + 1.coeff;
-    lcu_coeff_t *to_coeff = &work_treedepth.coeff;
-
-    kvz_pixels_blit(&from->yluma_index, &to->yluma_index,
-                    width_px, width_px, LCU_WIDTH, LCU_WIDTH);
-    if (from->chroma_format != KVZ_CSP_400) {
-      kvz_pixels_blit(&from->uchroma_index, &to->uchroma_index,
-                      width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
-      kvz_pixels_blit(&from->vchroma_index, &to->vchroma_index,
-                      width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
-    }
-
-    // Copy coefficients up. They do not have to be copied down because they
-    // are not used for the search.
-    kvz_coefficients_blit(&from_coeff->yluma_index, &to_coeff->yluma_index,
-                          width_px, width_px, LCU_WIDTH, LCU_WIDTH);
-    if (from->chroma_format != KVZ_CSP_400) {
-      kvz_coefficients_blit(&from_coeff->uchroma_index, &to_coeff->uchroma_index,
-                            width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
-      kvz_coefficients_blit(&from_coeff->vchroma_index, &to_coeff->vchroma_index,
-                            width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
+  for   (int y = y_local; y < y_local + width; y += SCU_WIDTH) {
+    for (int x = x_local; x < x_local + width; x += SCU_WIDTH) {
+      *LCU_GET_CU_AT_PX(to, x, y) = *LCU_GET_CU_AT_PX(from, x, y);
     }
   }
 }
 
-
-/**
- * Copy all non-reference CU data from depth to depth+1..MAX_PU_DEPTH.
- */
-static void work_tree_copy_down(int x_px, int y_px, int depth, lcu_t work_treeMAX_PU_DEPTH + 1)
+static INLINE void copy_cu_pixels(int x_local, int y_local, int width, lcu_t *from, lcu_t *to)
 {
-  assert(depth >= 0 && depth < MAX_PU_DEPTH);
-
-  // TODO: clean up to remove the copy pasta
-  const int width_px = LCU_WIDTH >> depth;
+  const int luma_index = x_local + y_local * LCU_WIDTH;
+  const int chroma_index = (x_local / 2) + (y_local / 2) * (LCU_WIDTH / 2);
 
-  int d;
-
-  for (d = depth + 1; d < MAX_PU_DEPTH + 1; ++d) {
-    const int x_orig = SUB_SCU(x_px);
-    const int y_orig = SUB_SCU(y_px);
-
-    for (int y = y_orig; y < y_orig + width_px; y += SCU_WIDTH) {
-      for (int x = x_orig; x < x_orig + width_px; x += SCU_WIDTH) {
-        const cu_info_t *from_cu = LCU_GET_CU_AT_PX(&work_treedepth, x, y);
-        cu_info_t *to_cu = LCU_GET_CU_AT_PX(&work_treed, x, y);
-        memcpy(to_cu, from_cu, sizeof(*to_cu));
-      }
-    }
+  kvz_pixels_blit(&from->rec.y[luma_index], &to->rec.y[luma_index],
+                  width, width, LCU_WIDTH, LCU_WIDTH);
+  if (from->rec.chroma_format != KVZ_CSP_400) {
+    kvz_pixels_blit(&from->rec.u[chroma_index], &to->rec.u[chroma_index],
+                    width / 2, width / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
+    kvz_pixels_blit(&from->rec.v[chroma_index], &to->rec.v[chroma_index],
+                    width / 2, width / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
   }
+}
 
-  // Copy reconstructed pixels.
-  for (d = depth + 1; d < MAX_PU_DEPTH + 1; ++d) {
-    const int x = SUB_SCU(x_px);
-    const int y = SUB_SCU(y_px);
-
-    const int luma_index = x + y * LCU_WIDTH;
-    const int chroma_index = (x / 2) + (y / 2) * (LCU_WIDTH / 2);
-
-    lcu_yuv_t *from = &work_treedepth.rec;
-    lcu_yuv_t *to = &work_treed.rec;
+static INLINE void copy_cu_coeffs(int x_local, int y_local, int width, lcu_t *from, lcu_t *to)
+{
+  const int luma_z = xy_to_zorder(LCU_WIDTH, x_local, y_local);
+  copy_coeffs(&from->coeff.y[luma_z], &to->coeff.y[luma_z], width);
 
-    kvz_pixels_blit(&from->yluma_index, &to->yluma_index,
-                    width_px, width_px, LCU_WIDTH, LCU_WIDTH);
-    if (from->chroma_format != KVZ_CSP_400) {
-      kvz_pixels_blit(&from->uchroma_index, &to->uchroma_index,
-                      width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
-      kvz_pixels_blit(&from->vchroma_index, &to->vchroma_index,
-                      width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2);
-    }
+  if (from->rec.chroma_format != KVZ_CSP_400) {
+    const int chroma_z = xy_to_zorder(LCU_WIDTH_C, x_local >> 1, y_local >> 1);
+    copy_coeffs(&from->coeff.u[chroma_z], &to->coeff.u[chroma_z], width >> 1);
+    copy_coeffs(&from->coeff.v[chroma_z], &to->coeff.v[chroma_z], width >> 1);
   }
 }
 
-
-void kvz_lcu_set_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth)
+/**
+ * Copy all non-reference CU data from next level to current level.
+ */
+static void work_tree_copy_up(int x_local, int y_local, int depth, lcu_t *work_tree)
 {
   const int width = LCU_WIDTH >> depth;
-  const vector2d_t lcu_cu = { SUB_SCU(x_px), SUB_SCU(y_px) };
-
-  // Depth 4 doesn't go inside the loop. Set the top-left CU.
-  LCU_GET_CU_AT_PX(lcu, lcu_cu.x, lcu_cu.y)->tr_depth = tr_depth;
-
-  for (unsigned y = 0; y < width; y += SCU_WIDTH) {
-    for (unsigned x = 0; x < width; x += SCU_WIDTH) {
-      cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, lcu_cu.x + x, lcu_cu.y + y);
-      cu->tr_depth = tr_depth;
-    }
-  }
+  copy_cu_info  (x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth]);
+  copy_cu_pixels(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth]);
+  copy_cu_coeffs(x_local, y_local, width, &work_tree[depth + 1], &work_tree[depth]);
 }
 
 
-static void lcu_set_intra_mode(lcu_t *lcu, int x_px, int y_px, int depth, int pred_mode, int chroma_mode, int part_mode)
+/**
+ * Copy all non-reference CU data from current level to all lower levels.
+ */
+static void work_tree_copy_down(int x_local, int y_local, int depth, lcu_t *work_tree)
 {
   const int width = LCU_WIDTH >> depth;
-  const int x_cu  = SUB_SCU(x_px);
-  const int y_cu  = SUB_SCU(y_px);
-
-  if (part_mode == SIZE_NxN) {
-    assert(depth == MAX_DEPTH + 1);
-    assert(width == SCU_WIDTH);
+  for (int i = depth + 1; i <= MAX_PU_DEPTH; i++) {
+    copy_cu_info  (x_local, y_local, width, &work_tree[depth], &work_tree[i]);
+    copy_cu_pixels(x_local, y_local, width, &work_tree[depth], &work_tree[i]);
   }
+}
 
-  if (depth > MAX_DEPTH) {
-    depth = MAX_DEPTH;
-    assert(part_mode == SIZE_NxN);
-  }
+void kvz_lcu_set_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth)
+{
+  const int x_local = SUB_SCU(x_px);
+  const int y_local = SUB_SCU(y_px);
+  const int width = LCU_WIDTH >> depth;
 
-  // Set mode in every CU covered by part_mode in this depth.
-  for (int y = y_cu; y < y_cu + width; y += SCU_WIDTH) {
-    for (int x = x_cu; x < x_cu + width; x += SCU_WIDTH) {
-      cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x, y);
-      cu->depth = depth;
-      cu->type = CU_INTRA;
-      cu->intra.mode = pred_mode;
-      cu->intra.mode_chroma = chroma_mode;
-      cu->part_size = part_mode;
+  for (unsigned y = 0; y < width; y += SCU_WIDTH) {
+    for (unsigned x = 0; x < width; x += SCU_WIDTH) {
+      LCU_GET_CU_AT_PX(lcu, x_local + x, y_local + y)->tr_depth = tr_depth;
     }
   }
 }
 
-
-static void lcu_set_inter_pu(lcu_t *lcu, int x_px, int y_px, int width, int height, cu_info_t *cur_pu)
+static void lcu_fill_cu_info(lcu_t *lcu, int x_local, int y_local, int width, int height, cu_info_t *cu)
 {
   // Set mode in every CU covered by part_mode in this depth.
-  for (int y = y_px; y < y_px + height; y += SCU_WIDTH) {
-    for (int x = x_px; x < x_px + width; x += SCU_WIDTH) {
-      cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x, y);
-      //Check if this could be moved inside the if
-      if (cu != cur_pu) {
-        cu->depth     = cur_pu->depth;
-        cu->part_size = cur_pu->part_size;
-        cu->type      = CU_INTER;
-        cu->tr_depth  = cur_pu->tr_depth;
-        cu->merged    = cur_pu->merged;
-        cu->skipped   = cur_pu->skipped;
-        memcpy(&cu->inter, &cur_pu->inter, sizeof(cur_pu->inter));
+  for (int y = y_local; y < y_local + height; y += SCU_WIDTH) {
+    for (int x = x_local; x < x_local + width; x += SCU_WIDTH) {
+      cu_info_t *to = LCU_GET_CU_AT_PX(lcu, x, y);
+      to->type      = cu->type;
+      to->depth     = cu->depth;
+      to->part_size = cu->part_size;
+
+      if (cu->type == CU_INTRA) {
+        to->intra.mode        = cu->intra.mode;
+        to->intra.mode_chroma = cu->intra.mode_chroma;
+      } else {
+        to->skipped   = cu->skipped;
+        to->merged    = cu->merged;
+        to->merge_idx = cu->merge_idx;
+        to->inter     = cu->inter;
       }
     }
   }
 }
 
-
-static void lcu_set_inter(lcu_t *lcu, int x_px, int y_px, int depth, cu_info_t *cur_cu)
+static void lcu_set_inter(lcu_t *lcu, int x_local, int y_local, int cu_width)
 {
-  const int width = LCU_WIDTH >> depth;
-  const int x_local = SUB_SCU(x_px);
-  const int y_local = SUB_SCU(y_px);
-  const int num_pu = kvz_part_mode_num_partscur_cu->part_size;
+  const part_mode_t part_mode = LCU_GET_CU_AT_PX(lcu, x_local, y_local)->part_size;
+  const int num_pu = kvz_part_mode_num_parts[part_mode];
 
   for (int i = 0; i < num_pu; ++i) {
-    const int x_pu      = PU_GET_X(cur_cu->part_size, width, x_local, i);
-    const int y_pu      = PU_GET_Y(cur_cu->part_size, width, y_local, i);
-    const int width_pu  = PU_GET_W(cur_cu->part_size, width, i);
-    const int height_pu = PU_GET_H(cur_cu->part_size, width, i);
-    cu_info_t *cur_pu   = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu);
-    lcu_set_inter_pu(lcu, x_pu, y_pu, width_pu, height_pu, cur_pu);
+    const int x_pu      = PU_GET_X(part_mode, cu_width, x_local, i);
+    const int y_pu      = PU_GET_Y(part_mode, cu_width, y_local, i);
+    const int width_pu  = PU_GET_W(part_mode, cu_width, i);
+    const int height_pu = PU_GET_H(part_mode, cu_width, i);
+
+    cu_info_t *pu  = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu);
+    pu->type = CU_INTER;
+    lcu_fill_cu_info(lcu, x_pu, y_pu, width_pu, height_pu, pu);
   }
 }
 
-
-static void lcu_set_coeff(lcu_t *lcu, int x_px, int y_px, int depth, cu_info_t *cur_cu)
+static void lcu_set_coeff(lcu_t *lcu, int x_local, int y_local, int width, cu_info_t *cur_cu)
 {
-  const uint32_t width = LCU_WIDTH >> depth;
-  const uint32_t x_local = SUB_SCU(x_px);
-  const uint32_t y_local = SUB_SCU(y_px);
-  const uint32_t tr_split = cur_cu->tr_depth-cur_cu->depth;
+  const uint32_t tr_split = cur_cu->tr_depth - cur_cu->depth;
   const uint32_t mask = ~((width >> tr_split)-1);
 
   // Set coeff flags in every CU covered by part_mode in this depth.
   for (uint32_t y = y_local; y < y_local + width; y += SCU_WIDTH) {
     for (uint32_t x = x_local; x < x_local + width; x += SCU_WIDTH) {
-      cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x, y);
       // Use TU top-left CU to propagate coeff flags
       cu_info_t *cu_from = LCU_GET_CU_AT_PX(lcu, x & mask, y & mask);
-      if (cu != cu_from) {
+      cu_info_t *cu_to   = LCU_GET_CU_AT_PX(lcu, x, y);
+      if (cu_from != cu_to) {
         // Chroma coeff data is not used, luma is needed for deblocking
-        cbf_copy(&cu->cbf, cu_from->cbf, COLOR_Y);
+        cbf_copy(&cu_to->cbf, cu_from->cbf, COLOR_Y);
       }
     }
   }
@@ -344,12 +259,10 @@
   }
 
   {
-    coeff_t coeff_temp32 * 32;
     int8_t luma_scan_mode = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth);
+    const coeff_t *coeffs = &lcu->coeff.y[xy_to_zorder(LCU_WIDTH, x_px, y_px)];
 
-    // Code coeffs using cabac to get a better estimate of real coding costs.
-    kvz_coefficients_blit(&lcu->coeff.y(y_px*LCU_WIDTH) + x_px, coeff_temp, width, width, LCU_WIDTH, width);
-    coeff_bits += kvz_get_coeff_cost(state, coeff_temp, width, 0, luma_scan_mode);
+    coeff_bits += kvz_get_coeff_cost(state, coeffs, width, 0, luma_scan_mode);
   }
 
   double bits = tr_tree_bits + coeff_bits;
@@ -415,16 +328,11 @@
   }
 
   {
-    coeff_t coeff_temp16 * 16;
     int8_t scan_order = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth);
-    
-    kvz_coefficients_blit(&lcu->coeff.u(lcu_px.y*(LCU_WIDTH_C)) + lcu_px.x,
-                      coeff_temp, width, width, LCU_WIDTH_C, width);
-    coeff_bits += kvz_get_coeff_cost(state, coeff_temp, width, 2, scan_order);
+    const int index = xy_to_zorder(LCU_WIDTH_C, lcu_px.x, lcu_px.y);
 
-    kvz_coefficients_blit(&lcu->coeff.v(lcu_px.y*(LCU_WIDTH_C)) + lcu_px.x,
-                      coeff_temp, width, width, LCU_WIDTH_C, width);
-    coeff_bits += kvz_get_coeff_cost(state, coeff_temp, width, 2, scan_order);
+    coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.u[index], width, 2, scan_order);
+    coeff_bits += kvz_get_coeff_cost(state, &lcu->coeff.v[index], width, 2, scan_order);
   }
 
   double bits = tr_tree_bits + coeff_bits;
@@ -478,7 +386,7 @@
  * - All the final data for the LCU gets eventually copied to depth 0, which
  *   will be the final output of the recursion.
  */
-static double search_cu(encoder_state_t * const state, int x, int y, int depth, lcu_t work_tree[MAX_PU_DEPTH + 1])
+static double search_cu(encoder_state_t * const state, int x, int y, int depth, lcu_t *work_tree)
 {
   const encoder_control_t* ctrl = state->encoder_control;
   const videoframe_t * const frame = state->tile->frame;
@@ -491,10 +399,6 @@
 
   int x_local = SUB_SCU(x);
   int y_local = SUB_SCU(y);
-#ifdef KVZ_DEBUG
-  int debug_split = 0;
-#endif
-  PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHCU);
 
   // Stop recursion if the CU is completely outside the frame.
   if (x >= frame->width || y >= frame->height) {
@@ -502,21 +406,27 @@
     return 0;
   }
 
-  cur_cu = LCU_GET_CU_AT_PX(&work_treedepth, x_local, y_local);
+  cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local);
   // Assign correct depth
   cur_cu->depth = depth > MAX_DEPTH ? MAX_DEPTH : depth;
   cur_cu->tr_depth = depth > 0 ? depth : 1;
   cur_cu->type = CU_NOTSET;
   cur_cu->part_size = SIZE_2Nx2N;
+
   // If the CU is completely inside the frame at this depth, search for
   // prediction modes at this depth.
   if (x + cu_width <= frame->width &&
       y + cu_width <= frame->height)
   {
-
-    bool can_use_inter =
-        state->frame->slicetype != KVZ_SLICE_I
-        && WITHIN(depth, ctrl->cfg.pu_depth_inter.min, ctrl->cfg.pu_depth_inter.max);
+    int cu_width_inter_min = LCU_WIDTH >> ctrl->cfg.pu_depth_inter.max;
+    bool can_use_inter = state->frame->slicetype != KVZ_SLICE_I && (
+      WITHIN(depth, ctrl->cfg.pu_depth_inter.min, ctrl->cfg.pu_depth_inter.max) ||
+      // When the split was forced because the CTU is partially outside the
+      // frame, we permit inter coding even if pu_depth_inter would
+      // otherwise forbid it.
+      (x & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame->width ||
+      (y & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame->height
+    );
 
     if (can_use_inter) {
       double mode_cost;
@@ -524,7 +434,7 @@
       kvz_search_cu_inter(state,
                           x, y,
                           depth,
-                          &work_treedepth,
+                          lcu,
                           &mode_cost, &mode_bitcost);
       if (mode_cost < cost) {
         cost = mode_cost;
@@ -555,7 +465,7 @@
           cost = mode_cost;
           inter_bitcost = mode_bitcost;
           // TODO: only copy inter prediction info, not pixels
-          work_tree_copy_up(x, y, depth, work_tree);
+          work_tree_copy_up(x_local, y_local, depth, work_tree);
         }
       }
     }
@@ -565,13 +475,21 @@
     // decision after reconstructing the inter frame.
     bool skip_intra = state->encoder_control->cfg.rdo == 0
                       && cur_cu->type != CU_NOTSET
-                      && cost / (cu_width * cu_width) < INTRA_TRESHOLD;
-    if (!skip_intra
-        && WITHIN(depth, ctrl->cfg.pu_depth_intra.min, ctrl->cfg.pu_depth_intra.max))
-    {
+                      && cost / (cu_width * cu_width) < INTRA_THRESHOLD;
+
+    int32_t cu_width_intra_min = LCU_WIDTH >> ctrl->cfg.pu_depth_intra.max;
+    bool can_use_intra =
+        WITHIN(depth, ctrl->cfg.pu_depth_intra.min, ctrl->cfg.pu_depth_intra.max) ||
+        // When the split was forced because the CTU is partially outside
+        // the frame, we permit intra coding even if pu_depth_intra would
+        // otherwise forbid it.
+        (x & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->width ||
+        (y & ~(cu_width_intra_min - 1)) + cu_width_intra_min > frame->height;
+
+    if (can_use_intra && !skip_intra) {
       int8_t intra_mode;
       double intra_cost;
-      kvz_search_cu_intra(state, x, y, depth, &work_treedepth,
+      kvz_search_cu_intra(state, x, y, depth, lcu,
                           &intra_mode, &intra_cost);
       if (intra_cost < cost) {
         cost = intra_cost;
@@ -585,38 +503,37 @@
     // mode search of adjacent CUs.
     if (cur_cu->type == CU_INTRA) {
       assert(cur_cu->part_size == SIZE_2Nx2N || cur_cu->part_size == SIZE_NxN);
-      int8_t intra_mode = cur_cu->intra.mode;
-      lcu_set_intra_mode(&work_treedepth, x, y, depth,
-                         intra_mode,
-                         intra_mode,
-                         cur_cu->part_size);
-      kvz_intra_recon_lcu_luma(state, x, y, depth, intra_mode, NULL, &work_treedepth);
+      cur_cu->intra.mode_chroma = cur_cu->intra.mode;
+      lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
+      kvz_intra_recon_cu(state,
+                         x, y,
+                         depth,
+                         cur_cu->intra.mode, -1, // skip chroma
+                         NULL, lcu);
 
       if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != KVZ_CSP_400) {
-        int8_t intra_mode_chroma = intra_mode;
-
         // There is almost no benefit to doing the chroma mode search for
         // rd2. Possibly because the luma mode search already takes chroma
         // into account, so there is less of a chance of luma mode being
         // really bad for chroma.
         if (state->encoder_control->cfg.rdo == 3) {
-          intra_mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, &work_treedepth);
-          lcu_set_intra_mode(&work_treedepth, x, y, depth,
-                             intra_mode, intra_mode_chroma,
-                             cur_cu->part_size);
+          cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu);
+          lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
         }
 
-        kvz_intra_recon_lcu_chroma(state, x, y, depth, intra_mode_chroma, NULL, &work_treedepth);
+        kvz_intra_recon_cu(state,
+                           x, y,
+                           depth,
+                           -1, cur_cu->intra.mode_chroma, // skip luma
+                           NULL, lcu);
       }
     } else if (cur_cu->type == CU_INTER) {
       // Reset transform depth because intra messes with them.
       // This will no longer be necessary if the transform depths are not shared.
       int tr_depth = depth > 0 ? depth : 1;
-      kvz_lcu_set_trdepth(&work_treedepth, x, y, depth, tr_depth);
+      kvz_lcu_set_trdepth(lcu, x, y, depth, tr_depth);
 
-      const int cu_width = LCU_WIDTH >> depth;
       const int num_pu = kvz_part_mode_num_partscur_cu->part_size;
-
       for (int i = 0; i < num_pu; ++i) {
         const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i);
         const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i);
@@ -627,33 +544,43 @@
 
         if (cur_pu->inter.mv_dir == 3) {
           const kvz_picture *const refs2 = {
-            state->frame->ref->imagescur_pu->inter.mv_ref0,
-            state->frame->ref->imagescur_pu->inter.mv_ref1,
+            state->frame->ref->images
+              state->frame->ref_LX0
+                cur_pu->inter.mv_ref0,
+            state->frame->ref->images
+              state->frame->ref_LX1
+                cur_pu->inter.mv_ref1,
           };
           kvz_inter_recon_lcu_bipred(state,
                                      refs0, refs1,
                                      pu_x, pu_y,
                                      pu_w, pu_h,
                                      cur_pu->inter.mv,
-                                     &work_treedepth);
+                                     lcu);
         } else {
           const int mv_idx = cur_pu->inter.mv_dir - 1;
+          
           const kvz_picture *const ref =
-              state->frame->ref->imagescur_pu->inter.mv_refmv_idx;
+              state->frame->ref->images
+                state->frame->ref_LXmv_idx
+                  cur_pu->inter.mv_refmv_idx;
+
           kvz_inter_recon_lcu(state,
                               ref,
                               pu_x, pu_y,
                               pu_w, pu_h,
                               cur_pu->inter.mvmv_idx,
-                              &work_treedepth,
+                              lcu,
                               0);
         }
       }
 
-      kvz_quantize_lcu_luma_residual(state, x, y, depth, NULL, &work_treedepth);
-      if (state->encoder_control->chroma_format != KVZ_CSP_400) {
-        kvz_quantize_lcu_chroma_residual(state, x, y, depth, NULL, &work_treedepth);
-      }
+      const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
+      kvz_quantize_lcu_residual(state,
+                                true, has_chroma,
+                                x, y, depth,
+                                NULL,
+                                lcu);
 
       int cbf = cbf_is_set_any(cur_cu->cbf, depth);
 
@@ -665,30 +592,36 @@
           inter_bitcost -= 1;
         }
       }
-      lcu_set_inter(&work_treedepth, x, y, depth, cur_cu);
-      lcu_set_coeff(&work_treedepth, x, y, depth, cur_cu);
+      lcu_set_inter(lcu, x_local, y_local, cu_width);
+      lcu_set_coeff(lcu, x_local, y_local, cu_width, cur_cu);
     }
   }
   if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) {
-    cost = kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, &work_treedepth);
+    cost = kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu);
     if (state->encoder_control->chroma_format != KVZ_CSP_400) {
-      cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, &work_treedepth);
+      cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu);
     }
 
     double mode_bits;
     if (cur_cu->type == CU_INTRA) {
-      mode_bits = calc_mode_bits(state, &work_treedepth, cur_cu, x, y);
+      mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y);
     } else {
       mode_bits = inter_bitcost;
     }
 
     cost += mode_bits * state->lambda;
   }
-  
+
+  bool can_split_cu =
+    // If the CU is partially outside the frame, we need to split it even
+    // if pu_depth_intra and pu_depth_inter would not permit it.
+    cur_cu->type == CU_NOTSET ||
+    depth < ctrl->cfg.pu_depth_intra.max ||
+    (state->frame->slicetype != KVZ_SLICE_I &&
+      depth < ctrl->cfg.pu_depth_inter.max);
+
   // Recursively split all the way to max search depth.
-  if (depth < ctrl->cfg.pu_depth_intra.max ||
-      (depth < ctrl->cfg.pu_depth_inter.max && state->frame->slicetype != KVZ_SLICE_I))
-  {
+  if (can_split_cu) {
     int half_cu = cu_width / 2;
     double split_cost = 0.0;
     int cbf = cbf_is_set_any(cur_cu->cbf, depth);
@@ -739,16 +672,20 @@
         cur_cu->type = CU_INTRA;
         cur_cu->part_size = SIZE_2Nx2N;
 
-        kvz_lcu_set_trdepth(&work_treedepth, x, y, depth, cur_cu->tr_depth);
-        lcu_set_intra_mode(&work_treedepth, x, y, depth,
-                           cur_cu->intra.mode, cur_cu->intra.mode_chroma,
-                           cur_cu->part_size);
-        kvz_intra_recon_lcu_luma(state, x, y, depth, cur_cu->intra.mode, NULL, &work_treedepth);
-        cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, &work_treedepth);
+        kvz_lcu_set_trdepth(lcu, x, y, depth, cur_cu->tr_depth);
+        lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu);
 
-        if (state->encoder_control->chroma_format != KVZ_CSP_400) {
-          kvz_intra_recon_lcu_chroma(state, x, y, depth, cur_cu->intra.mode_chroma, NULL, &work_treedepth);
-          cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, &work_treedepth);
+        const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
+        const int8_t mode_chroma = has_chroma ? cur_cu->intra.mode_chroma : -1;
+        kvz_intra_recon_cu(state,
+                           x, y,
+                           depth,
+                           cur_cu->intra.mode, mode_chroma,
+                           NULL, lcu);
+
+        cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu);
+        if (has_chroma) {
+          cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, lcu);
         }
 
         // Add the cost of coding no-split.
@@ -757,7 +694,7 @@
         cost += CTX_ENTROPY_FBITS(ctx, 0) * state->lambda;
 
         // Add the cost of coding intra mode only once.
-        double mode_bits = calc_mode_bits(state, &work_treedepth, cur_cu, x, y);
+        double mode_bits = calc_mode_bits(state, lcu, cur_cu, x, y);
         cost += mode_bits * state->lambda;
       }
     }
@@ -765,27 +702,22 @@
     if (split_cost < cost) {
       // Copy split modes to this depth.
       cost = split_cost;
-      work_tree_copy_up(x, y, depth, work_tree);
+      work_tree_copy_up(x_local, y_local, depth, work_tree);
 #if KVZ_DEBUG
       debug_split = 1;
 #endif
     } else if (depth > 0) {
       // Copy this CU's mode all the way down for use in adjacent CUs mode
       // search.
-      work_tree_copy_down(x, y, depth, work_tree);
+      work_tree_copy_down(x_local, y_local, depth, work_tree);
     }
   } else if (depth >= 0 && depth < MAX_PU_DEPTH) {
     // Need to copy modes down since the lower level of the work tree is used
     // when searching SMP and AMP blocks.
-    work_tree_copy_down(x, y, depth, work_tree);
+    work_tree_copy_down(x_local, y_local, depth, work_tree);
   }
 
-  PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHCU, state->encoder_control->threadqueue, "type=search_cu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d,depth=%d,split=%d,cur_cu_is_intra=%d", state->frame->num, state->tile->id, state->slice->id,
-                          (state->tile->lcu_offset_x * LCU_WIDTH) + x,
-                          (state->tile->lcu_offset_x * LCU_WIDTH) + x + (LCU_WIDTH >> depth), 
-                          (state->tile->lcu_offset_y * LCU_WIDTH) + y,
-                          (state->tile->lcu_offset_y * LCU_WIDTH) + y + (LCU_WIDTH >> depth), 
-                          depth, debug_split, (cur_cu->type==CU_INTRA)?1:0);
+  assert(cur_cu->type != CU_NOTSET);
 
   return cost;
 }
@@ -911,23 +843,15 @@
     const int pic_width = pic->width;
     const int x_max = MIN(x_px + LCU_WIDTH, pic_width) - x_px;
     const int y_max = MIN(y_px + LCU_WIDTH, pic->height) - y_px;
-    const int luma_index = x_px + y_px * pic_width;
-    const int chroma_index = (x_px / 2) + (y_px / 2) * (pic_width / 2);
 
     kvz_pixels_blit(lcu->rec.y, &pic->rec->yx_px + y_px * pic->rec->stride,
                         x_max, y_max, LCU_WIDTH, pic->rec->stride);
-    kvz_coefficients_blit(lcu->coeff.y, &pic->coeff_yluma_index,
-                        x_max, y_max, LCU_WIDTH, pic_width);
 
     if (state->encoder_control->chroma_format != KVZ_CSP_400) {
       kvz_pixels_blit(lcu->rec.u, &pic->rec->u(x_px / 2) + (y_px / 2) * (pic->rec->stride / 2),
                       x_max / 2, y_max / 2, LCU_WIDTH / 2, pic->rec->stride / 2);
       kvz_pixels_blit(lcu->rec.v, &pic->rec->v(x_px / 2) + (y_px / 2) * (pic->rec->stride / 2),
                       x_max / 2, y_max / 2, LCU_WIDTH / 2, pic->rec->stride / 2);
-      kvz_coefficients_blit(lcu->coeff.u, &pic->coeff_uchroma_index,
-                            x_max / 2, y_max / 2, LCU_WIDTH / 2, pic_width / 2);
-      kvz_coefficients_blit(lcu->coeff.v, &pic->coeff_vchroma_index,
-                            x_max / 2, y_max / 2, LCU_WIDTH / 2, pic_width / 2);
     }
   }
 }
@@ -961,4 +885,9 @@
   // The best decisions throughout the LCU got propagated back to depth 0,
   // so copy those back to the frame.
   copy_lcu_to_cu_data(state, x, y, &work_tree[0]);
+
+  // Copy coeffs to encoder state.
+  copy_coeffs(work_tree[0].coeff.y, state->coeff->y, LCU_WIDTH);
+  copy_coeffs(work_tree[0].coeff.u, state->coeff->u, LCU_WIDTH_C);
+  copy_coeffs(work_tree[0].coeff.v, state->coeff->v, LCU_WIDTH_C);
 }

kvazaar-1.1.0.tar.gz/src/search_inter.c -> kvazaar-1.2.0.tar.gz/src/search_inter.c Changed

@@ -35,68 +35,199 @@
 #include "videoframe.h"
 
 
+typedef struct {
+  encoder_state_t *state;
+
+  /**
+   * \brief Current frame
+   */
+  const kvz_picture *pic;
+  /**
+   * \brief Reference frame
+   */
+  const kvz_picture *ref;
+
+  /**
+   * \brief Index of the reference frame
+   */
+  int32_t ref_idx;
+
+  /**
+   * \brief Top-left corner of the PU
+   */
+  const vector2d_t origin;
+  int32_t width;
+  int32_t height;
+
+  int16_t mv_cand[2][2];
+  inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS];
+  int32_t num_merge_cand;
+
+  kvz_mvd_cost_func *mvd_cost_func;
+
+  /**
+   * \brief Best motion vector among the ones tested so far
+   */
+  vector2d_t best_mv;
+  /**
+   * \brief Cost of best_mv
+   */
+  uint32_t best_cost;
+  /**
+   * \brief Bit cost of best_mv
+   */
+  uint32_t best_bitcost;
+} inter_search_info_t;
+
+
 /**
  * \return  True if referred block is within current tile.
  */
-static INLINE bool fracmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width, int height, int wpp_limit)
+static INLINE bool fracmv_within_tile(const inter_search_info_t *info, int x, int y)
 {
-  if (state->encoder_control->cfg.mv_constraint == KVZ_MV_CONSTRAIN_NONE) {
-    return (wpp_limit == -1 || y + (height << 2) <= (wpp_limit << 2));
-  };
+  const encoder_control_t *ctrl = info->state->encoder_control;
+
+  const bool is_frac_luma   = x % 4 != 0 || y % 4 != 0;
+  const bool is_frac_chroma = x % 8 != 0 || y % 8 != 0;
+
+  if (ctrl->cfg.owf && ctrl->cfg.wpp) {
+    // Check that the block does not reference pixels that are not final.
+
+    // Margin as luma pixels.
+    int margin = 0;
+    if (is_frac_luma) {
+      // Fractional motion estimation needs up to 4 pixels outside the
+      // block.
+      margin = 4;
+    } else if (is_frac_chroma) {
+      // Odd chroma interpolation needs up to 2 luma pixels outside the
+      // block.
+      margin = 2;
+    }
+
+    if (ctrl->cfg.sao_type) {
+      // Make sure we don't refer to pixels for which SAO reconstruction
+      // has not been done.
+      margin += SAO_DELAY_PX;
+    } else if (ctrl->cfg.deblock_enable) {
+      // Make sure we don't refer to pixels that have not been deblocked.
+      margin += DEBLOCK_DELAY_PX;
+    }
+
+    // Coordinates of the top-left corner of the containing LCU.
+    const vector2d_t orig_lcu = {
+      .x = info->origin.x / LCU_WIDTH,
+      .y = info->origin.y / LCU_WIDTH,
+    };
+    // Difference between the coordinates of the LCU containing the
+    // bottom-right corner of the referenced block and the LCU containing
+    // this block.
+    const vector2d_t mv_lcu = {
+      ((info->origin.x + info->width  + margin) * 4 + x) / (LCU_WIDTH << 2) - orig_lcu.x,
+      ((info->origin.y + info->height + margin) * 4 + y) / (LCU_WIDTH << 2) - orig_lcu.y,
+    };
 
+    if (mv_lcu.y > ctrl->max_inter_ref_lcu.down) {
+      return false;
+    }
+
+    if (mv_lcu.x + mv_lcu.y >
+        ctrl->max_inter_ref_lcu.down + ctrl->max_inter_ref_lcu.right)
+    {
+      return false;
+    }
+  }
+
+  if (ctrl->cfg.mv_constraint == KVZ_MV_CONSTRAIN_NONE) {
+    return true;
+  }
+
+  // Margin as luma quarter pixels.
   int margin = 0;
-  if (state->encoder_control->cfg.mv_constraint == KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN) {
-    // Enforce a distance of 8 from any tile boundary.
-    margin = 4 * 4;
+  if (ctrl->cfg.mv_constraint == KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN) {
+    if (is_frac_luma) {
+      margin = 4 << 2;
+    } else if (is_frac_chroma) {
+      margin = 2 << 2;
+    }
   }
 
   // TODO implement KVZ_MV_CONSTRAIN_FRAME and KVZ_MV_CONSTRAIN_TILE.
-  const vector2d_t abs_mv = { (orig->x << 2) + x, (orig->y << 2) + y };
+  const vector2d_t abs_mv = {
+    info->origin.x * 4 + x,
+    info->origin.y * 4 + y,
+  };
 
-  // Check that both margin and wpp_limit constraints are satisfied.
-  if (abs_mv.x >= margin && abs_mv.x + (width << 2) <= (state->tile->frame->width << 2) - margin &&
-      abs_mv.y >= margin && abs_mv.y + (height << 2) <= (state->tile->frame->height << 2) - margin &&
-      (wpp_limit == -1 || y + (height << 2) <= (wpp_limit << 2)))
-  {
-    return true;
-  } else {
-    return false;
-  }
+  // Check that both margin constraints are satisfied.
+  const int from_right  =
+    (info->state->tile->frame->width  << 2) - (abs_mv.x + (info->width  << 2));
+  const int from_bottom =
+    (info->state->tile->frame->height << 2) - (abs_mv.y + (info->height << 2));
+
+  return abs_mv.x >= margin &&
+         abs_mv.y >= margin &&
+         from_right >= margin &&
+         from_bottom >= margin;
 }
 
 
-static INLINE int get_wpp_limit(const encoder_state_t *state, const vector2d_t* orig)
+/**
+ * \return  True if referred block is within current tile.
+ */
+static INLINE bool intmv_within_tile(const inter_search_info_t *info, int x, int y)
 {
-  const encoder_control_t *ctrl = state->encoder_control;
-  if (ctrl->cfg.owf && ctrl->cfg.wpp) {
-    // Limit motion vectors to the LCU-row below this row.
-    // To avoid fractional pixel interpolation depending on things outside
-    // this range, add a margin of 4 pixels.
-    // - fme needs 4 pixels
-    // - odd chroma interpolation needs 4 pixels
-    int wpp_limit = 2 * LCU_WIDTH - 4 - orig->y % LCU_WIDTH;
-    if (ctrl->cfg.deblock_enable && !ctrl->cfg.sao_enable) {
-      // As a special case, when deblocking is enabled but SAO is not, we have
-      // to avoid the possibility of interpolation filters reaching the
-      // non-deblocked pixels. The deblocking for the horizontal edge on the
-      // LCU boundary can reach 4 pixels. If SAO is enabled, this WPP-row
-      // depends on the SAO job, which depends on the deblocking having
-      // already been done.
-      wpp_limit -= 4;
-    }
-    return wpp_limit;
-  } else {
-    return -1;
-  }
+  return fracmv_within_tile(info, x * 4, y * 4);
 }
 
 
 /**
- * \return  True if referred block is within current tile.
+ * \brief Calculate cost for an integer motion vector.
+ *
+ * Updates info->best_mv, info->best_cost and info->best_bitcost to the new
+ * motion vector if it yields a lower cost than the current one.
+ *
+ * If the motion vector violates the MV constraints for tiles or WPP, the
+ * cost is not set.
+ *
+ * \return true if info->best_mv was changed, false otherwise
  */
-static INLINE bool intmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width, int height, int wpp_limit)
+static bool check_mv_cost(inter_search_info_t *info, int x, int y)
 {
-  return fracmv_within_tile(state, orig, x << 2, y << 2, width, height, wpp_limit);
+  if (!intmv_within_tile(info, x, y)) return false;
+
+  uint32_t bitcost = 0;
+  uint32_t cost = kvz_image_calc_sad(
+      info->pic,
+      info->ref,
+      info->origin.x,
+      info->origin.y,
+      info->state->tile->offset_x + info->origin.x + x,
+      info->state->tile->offset_y + info->origin.y + y,
+      info->width,
+      info->height
+  );
+
+  if (cost >= info->best_cost) return false;
+
+  cost += info->mvd_cost_func(
+      info->state,
+      x, y, 2,
+      info->mv_cand,
+      info->merge_cand,
+      info->num_merge_cand,
+      info->ref_idx,
+      &bitcost
+  );
+
+  if (cost >= info->best_cost) return false;
+
+  // Set to motion vector in quarter pixel precision.
+  info->best_mv.x = x * 4;
+  info->best_mv.y = y * 4;
+  info->best_cost = cost;
+  info->best_bitcost = bitcost;
+
+  return true;
 }
 
 
@@ -121,18 +252,19 @@
 }
 
 
-/**Checks if mv is one of the merge candidates
-* \return true if found else return false
-*/
-static bool mv_in_merge(const inter_merge_cand_t* merge_cand, int16_t num_cand, const vector2d_t* mv)
+/**
+ * \brief Checks if mv is one of the merge candidates.
+ * \return true if found else return false
+ */
+static bool mv_in_merge(const inter_search_info_t *info, vector2d_t mv)
 {
-  for (int i = 0; i < num_cand; ++i) {
-    if (merge_candi.dir == 3) continue;
+  for (int i = 0; i < info->num_merge_cand; ++i) {
+    if (info->merge_candi.dir == 3) continue;
     const vector2d_t merge_mv = {
-      merge_candi.mvmerge_candi.dir - 10 >> 2,
-      merge_candi.mvmerge_candi.dir - 11 >> 2
+      info->merge_candi.mvinfo->merge_candi.dir - 10 >> 2,
+      info->merge_candi.mvinfo->merge_candi.dir - 11 >> 2
     };
-    if (merge_mv.x == mv->x && merge_mv.y == mv->y) {
+    if (merge_mv.x == mv.x && merge_mv.y == mv.y) {
       return true;
     }
   }
@@ -140,49 +272,43 @@
 }
 
 
-static unsigned select_starting_point(int16_t num_cand, inter_merge_cand_t *merge_cand, vector2d_t *mv_in_out, vector2d_t *mv, encoder_state_t *const state,
-                                      const vector2d_t *orig, unsigned width, unsigned height, int wpp_limit, const kvz_picture *pic, const kvz_picture *ref,
-                                      int16_t mv_cand22, int32_t ref_idx, unsigned best_cost, unsigned *best_index, uint32_t *best_bitcost,
-                                      kvz_mvd_cost_func *calc_mvd){
+/**
+ * \brief Select starting point for integer motion estimation search.
+ *
+ * Checks the zero vector, extra_mv and merge candidates and updates
+ * info->best_mv to the best one.
+ */
+static void select_starting_point(inter_search_info_t *info, vector2d_t extra_mv)
+{
+  // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list.
+  check_mv_cost(info, 0, 0);
+
+  // Change to integer precision.
+  extra_mv.x >>= 2;
+  extra_mv.y >>= 2;
+
+  // Check extra_mv if it's not one of the merge candidates.
+  if ((extra_mv.x != 0 || extra_mv.y != 0) && !mv_in_merge(info, extra_mv)) {
+    check_mv_cost(info, extra_mv.x, extra_mv.y);
+  }
+
   // Go through candidates
-  for (unsigned i = 0; i < num_cand; ++i) {
-    if (merge_candi.dir == 3) continue;
-    mv->x = merge_candi.mvmerge_candi.dir - 10 >> 2;
-    mv->y = merge_candi.mvmerge_candi.dir - 11 >> 2;
-
-    if (mv->x == 0 && mv->y == 0) continue;
-    if (!intmv_within_tile(state, orig, mv->x, mv->y, width, height, wpp_limit)) {
-      continue;
-    }
+  for (unsigned i = 0; i < info->num_merge_cand; ++i) {
+    if (info->merge_candi.dir == 3) continue;
 
-    uint32_t bitcost = 0;
-    unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
-      (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x,
-      (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y,
-      width, height, -1);
-    cost += calc_mvd(state, mv->x, mv->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-
-    if (cost < best_cost) {
-      best_cost = cost;
-      *best_index = i;
-      *best_bitcost = bitcost;
-    }
-  }  
-  if (*best_index < num_cand) {
-    mv->x = merge_cand*best_index.mvmerge_cand*best_index.dir - 10 >> 2;
-    mv->y = merge_cand*best_index.mvmerge_cand*best_index.dir - 11 >> 2;
-  } else if (*best_index == num_cand) {
-    mv->x = mv_in_out->x >> 2;
-    mv->y = mv_in_out->y >> 2;
-  } else {
-    mv->x = 0;
-    mv->y = 0;
+    int x = info->merge_candi.mvinfo->merge_candi.dir - 10 >> 2;
+    int y = info->merge_candi.mvinfo->merge_candi.dir - 11 >> 2;
+
+    if (x == 0 && y == 0) continue;
+
+    check_mv_cost(info, x, y);
   }
-  return best_cost;
 }
 
 
-static uint32_t get_mvd_coding_cost(encoder_state_t * const state, vector2d_t *mvd, const cabac_data_t* cabac)
+static uint32_t get_mvd_coding_cost(const encoder_state_t *state,
+                                    vector2d_t *mvd,
+                                    const cabac_data_t* cabac)
 {
   unsigned bitcost = 0;
   const vector2d_t abs_mvd = { abs(mvd->x), abs(mvd->y) };
@@ -210,9 +336,15 @@
 }
 
 
-static int calc_mvd_cost(encoder_state_t * const state, int x, int y, int mv_shift,
-                         int16_t mv_cand22, inter_merge_cand_t merge_candMRG_MAX_NUM_CANDS,
-                         int16_t num_cand,int32_t ref_idx, uint32_t *bitcost)
+static uint32_t calc_mvd_cost(const encoder_state_t *state,
+                              int x,
+                              int y,
+                              int mv_shift,
+                              int16_t mv_cand22,
+                              inter_merge_cand_t merge_candMRG_MAX_NUM_CANDS,
+                              int16_t num_cand,
+                              int32_t ref_idx,
+                              uint32_t *bitcost)
 {
   uint32_t temp_bitcost = 0;
   uint32_t merge_idx;
@@ -221,15 +353,17 @@
   int8_t merged      = 0;
   int8_t cur_mv_cand = 0;
 
-  x <<= mv_shift;
-  y <<= mv_shift;
+  x *= 1 << mv_shift;
+  y *= 1 << mv_shift;
 
   // Check every candidate to find a match
   for(merge_idx = 0; merge_idx < (uint32_t)num_cand; merge_idx++) {
     if (merge_candmerge_idx.dir == 3) continue;
     if (merge_candmerge_idx.mvmerge_candmerge_idx.dir - 10 == x &&
         merge_candmerge_idx.mvmerge_candmerge_idx.dir - 11 == y &&
-        merge_candmerge_idx.refmerge_candmerge_idx.dir - 1 == ref_idx) {
+        state->frame->ref_LXmerge_candmerge_idx.dir - 1
+          merge_candmerge_idx.refmerge_candmerge_idx.dir - 1
+         == ref_idx) {
       temp_bitcost += merge_idx;
       merged = 1;
       break;
@@ -257,81 +391,63 @@
 }
 
 
-static bool early_terminate(int16_t num_cand, inter_merge_cand_t *merge_cand, vector2d_t *mv_in_out, vector2d_t *mv, encoder_state_t *const state,
-  const vector2d_t *orig, unsigned width, unsigned height, int wpp_limit, const kvz_picture *pic, const kvz_picture *ref,
-  int16_t mv_cand22, int32_t ref_idx, unsigned *best_cost, uint32_t *bitcost_out, uint32_t *best_bitcost,
-  kvz_mvd_cost_func *calc_mvd)
+static bool early_terminate(inter_search_info_t *info)
 {
-  static const vector2d_t small_hexbs5 = {
-      { 0, 0 },
-      { 0, -1 }, { -1, 0 }, { 1, 0 }, { 0, 1 },
+  static const vector2d_t small_hexbs7 = {
+      { 0, -1 }, { -1, 0 }, { 0, 1 }, { 1, 0 },
+      { 0, -1 }, { -1, 0 }, { 0, 0 },
   };
-  double multiplier = 1;
-  // If early termination is set to fast set multiplier to 0.9
-  if (state->encoder_control->cfg.me_early_termination == KVZ_ME_EARLY_TERMINATION_SENSITIVE){
-    multiplier = 0.95;
-  }
-  const vector2d_t *offset;
-  for (int k = 0; k < 2; ++k){
-    unsigned best_index = 0;
-    for (int i = 1; i < 5; ++i) {
-      offset = &small_hexbsi;
-      if (!intmv_within_tile(state, orig, mv->x + offset->x, mv->y + offset->y, width, height, wpp_limit)) {
-        continue;
-      }
 
-      unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
-        (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + offset->x,
-        (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + offset->y,
-        width, height, -1);
-      unsigned bitcost;
-      cost += calc_mvd(state, mv->x + offset->x, mv->y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
+  vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 };
 
-      if (cost < multiplier * *best_cost ) {
-        *best_cost = cost;
+  int first_index = 0;
+  int last_index = 3;
+
+  for (int k = 0; k < 2; ++k) {
+    double threshold;
+    if (info->state->encoder_control->cfg.me_early_termination ==
+        KVZ_ME_EARLY_TERMINATION_SENSITIVE)
+    {
+      threshold = info->best_cost * 0.95;
+    } else {
+      threshold = info->best_cost;
+    }
+
+    int best_index = 6;
+    for (int i = first_index; i <= last_index; i++) {
+      int x = mv.x + small_hexbsi.x;
+      int y = mv.y + small_hexbsi.y;
+
+      if (check_mv_cost(info, x, y)) {
         best_index = i;
-        *best_bitcost = bitcost;
       }
     }
-    // Adjust the movement vector
-    mv->x += small_hexbsbest_index.x;
-    mv->y += small_hexbsbest_index.y;
 
-    // if best match is at center we stop the search
-    if (best_index == 0){
-      // Return final movement vector in quarter-pixel precision.
-      mv_in_out->x = mv->x << 2;
-      mv_in_out->y = mv->y << 2;
+    // Adjust the movement vector
+    mv.x += small_hexbsbest_index.x;
+    mv.y += small_hexbsbest_index.y;
 
-      *bitcost_out = *best_bitcost;
+    // If best match is not better than threshold, we stop the search.
+    if (info->best_cost >= threshold) {
       return true;
     }
+
+    first_index = (best_index + 3) % 4;
+    last_index = first_index + 2;
   }
   return false;
 }
 
 
-unsigned kvz_tz_pattern_search(encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref, unsigned pattern_type,
-                           const vector2d_t *orig, const int iDist, vector2d_t *mv, unsigned best_cost, int *best_dist,
-                           int16_t mv_cand22, inter_merge_cand_t merge_candMRG_MAX_NUM_CANDS, int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
-                           int width, int height, int wpp_limit)
+void kvz_tz_pattern_search(inter_search_info_t *info,
+                           unsigned pattern_type,
+                           const int iDist,
+                           int *best_dist)
 {
-  int n_points;
-  int best_index = -1;
-  int i;
-  
-  vector2d_t mv_best = { 0, 0 };
-
-
-  kvz_mvd_cost_func *calc_mvd = calc_mvd_cost;
-  if (state->encoder_control->cfg.mv_rdo) {
-    calc_mvd = kvz_calc_mvd_cost_cabac;
-  }
-
   assert(pattern_type < 4);
 
   //implemented search patterns
-  vector2d_t pattern48 = {
+  const vector2d_t pattern48 = {
       //diamond (8 points)
       //    1    
       //         
@@ -391,14 +507,12 @@
         { iDist / 2, iDist }, { iDist, 0 }, { iDist / 2, -iDist }, { -iDist, 0 },
         { iDist / 2, iDist }, { -iDist / 2, -iDist }, { 0, 0 }, { 0, 0 }
       }
-
   };
 
-  //set the number of points to be checked
-  if (iDist == 1)
-  {
-    switch (pattern_type)
-    {
+  // Set the number of points to be checked.
+  int n_points;
+  if (iDist == 1) {
+    switch (pattern_type) {
       case 0:
         n_points = 4;
         break;
@@ -412,11 +526,8 @@
         n_points = 8;
         break;
     };
-  }
-  else
-  {
-    switch (pattern_type)
-    {
+  } else {
+    switch (pattern_type) {
       case 3:
         n_points = 6;
         break;
@@ -426,248 +537,110 @@
     };
   }
 
-  //compute SAD values for all chosen points
-  for (i = 0; i < n_points; i++)
-  {
-    vector2d_t *current = &patternpattern_typei;
-    if (!intmv_within_tile(state, orig, mv->x + current->x, mv->y + current->y, width, height, wpp_limit)) {
-      continue;
-    }
-
-    unsigned cost;
-    uint32_t bitcost;
+  const vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 };
 
-    {
-      cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
-                            (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + current->x,
-                            (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + current->y,
-                            width, height, -1);
-      cost += calc_mvd(state, mv->x + current->x, mv->y + current->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-    }
+  // Compute SAD values for all chosen points.
+  int best_index = -1;
+  for (int i = 0; i < n_points; i++) {
+    vector2d_t offset = pattern[pattern_type][i];
+    int x = mv.x + offset.x;
+    int y = mv.y + offset.y;
 
-    if (cost < best_cost)
-    {
-      best_cost = cost;
-      *best_bitcost = bitcost;
+    if (check_mv_cost(info, x, y)) {
       best_index = i;
     }
-
   }
 
-  if (best_index >= 0)
-  {
-    mv_best = patternpattern_typebest_index;
+  if (best_index >= 0) {
     *best_dist = iDist;
   }
-  
-  mv->x += mv_best.x;
-  mv->y += mv_best.y;
-
-  return best_cost;
-
 }
 
 
-unsigned kvz_tz_raster_search(encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref,
-                          const vector2d_t *orig, vector2d_t *mv, unsigned best_cost,
-                          int16_t mv_cand22, inter_merge_cand_t merge_candMRG_MAX_NUM_CANDS, int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost,
-                          int width, int height, int iSearchRange, int iRaster, int wpp_limit)
+void kvz_tz_raster_search(inter_search_info_t *info,
+                          int iSearchRange,
+                          int iRaster)
 {
-  int i;
-  int k;
-
-  vector2d_t mv_best = { 0, 0 };
+  const vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 };
 
-  kvz_mvd_cost_func *calc_mvd = calc_mvd_cost;
-  if (state->encoder_control->cfg.mv_rdo) {
-    calc_mvd = kvz_calc_mvd_cost_cabac;
-  }
-  
   //compute SAD values for every point in the iRaster downsampled version of the current search area
-  for (i = iSearchRange; i >= -iSearchRange; i -= iRaster)
-  {
-    for (k = -iSearchRange; k <= iSearchRange; k += iRaster)
-    {
-      vector2d_t current = { k, i };
-      if (!intmv_within_tile(state, orig, mv->x + current.x, mv->y + current.y, width, height, wpp_limit)) {
-        continue;
-      }
-
-      unsigned cost;
-      uint32_t bitcost;
-
-      {
-        cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
-          (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + k,
-          (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + i,
-          width, height, -1);
-        cost += calc_mvd(state, mv->x + k, mv->y + i, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-      }
-
-      if (cost < best_cost)
-      {
-        best_cost = cost;
-        *best_bitcost = bitcost;
-        mv_best = current;
-      }
-
+  for (int y = iSearchRange; y >= -iSearchRange; y -= iRaster) {
+    for (int x = -iSearchRange; x <= iSearchRange; x += iRaster) {
+      check_mv_cost(info, mv.x + x, mv.y + y);
     }
   }
-  
-  mv->x += mv_best.x;
-  mv->y += mv_best.y;
-
-  return best_cost;
-
 }
 
 
-static unsigned tz_search(encoder_state_t * const state,
-                          unsigned width, unsigned height,
-                          const kvz_picture *pic, const kvz_picture *ref,
-                          const vector2d_t *orig, vector2d_t *mv_in_out,
-                          int16_t mv_cand22, inter_merge_cand_t merge_candMRG_MAX_NUM_CANDS,
-                          int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out)
+static void tz_search(inter_search_info_t *info, vector2d_t extra_mv)
 {
-
   //TZ parameters
   const int iSearchRange = 96;  // search range for each stage
-  const int iRaster = 5;  // search distance limit and downsampling factor for step 3                   
+  const int iRaster = 5;  // search distance limit and downsampling factor for step 3
   const unsigned step2_type = 0;  // search patterns for steps 2 and 4
   const unsigned step4_type = 0;
   const bool bRasterRefinementEnable = true;  // enable step 4 mode 1
   const bool bStarRefinementEnable = false;   // enable step 4 mode 2 (only one mode will be executed)
 
-  vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };
-
-  unsigned best_cost = UINT32_MAX;
-  uint32_t best_bitcost = 0;
-  int iDist;
   int best_dist = 0;
-  unsigned best_index = num_cand + 1;
-  int wpp_limit = get_wpp_limit(state, orig);
-
-  kvz_mvd_cost_func *calc_mvd = calc_mvd_cost;
-  if (state->encoder_control->cfg.mv_rdo) {
-    calc_mvd = kvz_calc_mvd_cost_cabac;
-  }
-
-  // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list.
-  if (intmv_within_tile(state, orig, 0, 0, width, height, wpp_limit)) {
-    best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
-                                   (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x,
-                                   (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y,
-                                   width, height, -1);
-    best_cost += calc_mvd(state, 0, 0, 2, mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost);
-    best_index = num_cand + 1;
-  }
+  info->best_cost = UINT32_MAX;
 
-  // Check mv_in if it's not one of the merge candidates.
-  if (!mv_in_merge(merge_cand, num_cand, &mv) &&
-      intmv_within_tile(state, orig, mv.x, mv.y, width, height, wpp_limit))
-  {
-    unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
-                                      (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
-                                      (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                                      width, height, -1);
-    unsigned bitcost;
-    cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-    if (cost < best_cost) {
-      best_cost = cost;
-      best_index = num_cand;
-      best_bitcost = bitcost;
-    }
-  }
-
-  // Select starting point from among merge candidates. These should include
-  // both mv_cand vectors and (0, 0).
-  best_cost = select_starting_point(num_cand, merge_cand, mv_in_out, &mv, state, orig, width, height, wpp_limit,
-                                pic, ref, mv_cand, ref_idx, best_cost, &best_index, &best_bitcost, calc_mvd);
+  // Select starting point from among merge candidates. These should
+  // include both mv_cand vectors and (0, 0).
+  select_starting_point(info, extra_mv);
 
   // Check if we should stop search
-  if (state->encoder_control->cfg.me_early_termination){
-    if (early_terminate(num_cand, merge_cand, mv_in_out, &mv, state, orig, width, height, wpp_limit,
-      pic, ref, mv_cand, ref_idx, &best_cost, bitcost_out, &best_bitcost, calc_mvd)) return best_cost;
+  if (info->state->encoder_control->cfg.me_early_termination &&
+      early_terminate(info))
+  {
+    return;
   }
 
   //step 2, grid search
-  for (iDist = 1; iDist <= iSearchRange; iDist *= 2)
-  {
-    best_cost = kvz_tz_pattern_search(state, pic, ref, step2_type, orig, iDist, &mv, best_cost, &best_dist,
-                                  mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit);
+  for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) {
+    kvz_tz_pattern_search(info, step2_type, iDist, &best_dist);
   }
 
   //step 3, raster scan
-  if (best_dist > iRaster)
-  {
+  if (best_dist > iRaster) {
     best_dist = iRaster;
-
-    best_cost = kvz_tz_raster_search(state, pic, ref, orig, &mv, best_cost, mv_cand, merge_cand,
-                                 num_cand, ref_idx, &best_bitcost, width, height, iSearchRange, iRaster, wpp_limit);
+    kvz_tz_raster_search(info, iSearchRange, iRaster);
   }
 
   //step 4
 
   //raster refinement
-  if (bRasterRefinementEnable && best_dist > 0)
-  {
-    iDist = best_dist >> 1;
-    while (iDist > 0)
-    {
-      best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist,
-                                   mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit);
-
-      iDist = iDist >> 1;
+  if (bRasterRefinementEnable && best_dist > 0) {
+    for (int iDist = best_dist >> 1; iDist > 0; iDist >>= 1) {
+      kvz_tz_pattern_search(info, step4_type, iDist, &best_dist);
     }
   }
 
   //star refinement (repeat step 2 for the current starting point)
-  if (bStarRefinementEnable && best_dist > 0)
-  {
-    for (iDist = 1; iDist <= iSearchRange; iDist *= 2)
-    {
-      best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist,
-                                   mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit);
+  if (bStarRefinementEnable && best_dist > 0) {
+    for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) {
+      kvz_tz_pattern_search(info, step4_type, iDist, &best_dist);
     }
   }
-
-  mv.x = mv.x << 2;
-  mv.y = mv.y << 2;
-
-  *mv_in_out = mv;
-  *bitcost_out = best_bitcost;
-
-  return best_cost;
 }
 
 
 /**
  * \brief Do motion search using the HEXBS algorithm.
  *
- * \param width      width of the block to search
- * \param height     height of the block to search
- * \param pic        Picture motion vector is searched for.
- * \param ref        Picture motion vector is searched from.
- * \param orig       Top left corner of the searched for block.
- * \param mv_in_out  Predicted mv in and best out. Quarter pixel precision.
- *
- * \returns  Cost of the motion vector.
+ * \param info      search info
+ * \param extra_mv  extra motion vector to check
  *
  * Motion vector is searched by first searching iteratively with the large
  * hexagon pattern until the best match is at the center of the hexagon.
  * As a final step a smaller hexagon is used to check the adjacent pixels.
  *
- * If a non 0,0 predicted motion vector predictor is given as mv_in_out,
+ * If a non 0,0 predicted motion vector predictor is given as extra_mv,
  * the 0,0 vector is also tried. This is hoped to help in the case where
  * the predicted motion vector is way off. In the future even more additional
  * points like 0,0 might be used, such as vectors from top or left.
  */
-static unsigned hexagon_search(encoder_state_t * const state,
-                               unsigned width, unsigned height,
-                               const kvz_picture *pic, const kvz_picture *ref,
-                               const vector2d_t *orig, vector2d_t *mv_in_out,
-                               int16_t mv_cand22, inter_merge_cand_t merge_candMRG_MAX_NUM_CANDS,
-                               int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out)
+static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv)
 {
   // The start of the hexagonal pattern has been repeated at the end so that
   // the indices between 1-6 can be used as the start of a 3-point list of new
@@ -691,83 +664,36 @@
       { 0, -1 }, { -1, 0 }, { 1, 0 }, { 0, 1 }
   };
 
-  vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };
-  unsigned best_cost = UINT32_MAX;
-  uint32_t best_bitcost = 0, bitcost;
-  unsigned i;
-  // Current best index, either to merge_cands, large_hebx or small_hexbs.
-  unsigned best_index = num_cand + 1;
-  int wpp_limit = get_wpp_limit(state, orig);
+  info->best_cost = UINT32_MAX;
 
-  kvz_mvd_cost_func *calc_mvd = calc_mvd_cost;
-  if (state->encoder_control->cfg.mv_rdo) {
-    calc_mvd = kvz_calc_mvd_cost_cabac;
-  }
+  // Select starting point from among merge candidates. These should
+  // include both mv_cand vectors and (0, 0).
+  select_starting_point(info, extra_mv);
 
-  // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list.
-  if (intmv_within_tile(state, orig, 0, 0, width, height, wpp_limit)) {
-    best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
-                                   (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x,
-                                   (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y,
-                                   width, height, -1);
-    best_cost += calc_mvd(state, 0, 0, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-    best_bitcost = bitcost;
-    best_index = num_cand + 1;
-  }
-
-  // Check mv_in if it's not one of the merge candidates.
-  if (!mv_in_merge(merge_cand, num_cand, &mv) &&
-      intmv_within_tile(state, orig, mv.x, mv.y, width, height, wpp_limit)) 
+  // Check if we should stop search
+  if (info->state->encoder_control->cfg.me_early_termination &&
+      early_terminate(info))
   {
-    unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
-                                   (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x,
-                                   (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y,
-                                   width, height, -1);
-    cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-
-    if (cost < best_cost) {
-      best_cost    = cost;
-      best_index   = num_cand;
-      best_bitcost = bitcost;
-    }
+    return;
   }
 
-  // Select starting point from among merge candidates. These should include
-  // both mv_cand vectors and (0, 0).
-  best_cost = select_starting_point(num_cand, merge_cand, mv_in_out, &mv, state, orig, width, height, wpp_limit,
-                                pic, ref, mv_cand, ref_idx, best_cost, &best_index, &best_bitcost, calc_mvd);
+  vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 };
 
-  // Check if we should stop search
-  if (state->encoder_control->cfg.me_early_termination){
-    if (early_terminate(num_cand, merge_cand, mv_in_out, &mv, state, orig, width, height, wpp_limit,
-      pic, ref, mv_cand, ref_idx, &best_cost, bitcost_out, &best_bitcost, calc_mvd)) return best_cost;
-  }
+  // Current best index, either to merge_cands, large_hebx or small_hexbs.
+  int best_index = 0;
 
   // Search the initial 7 points of the hexagon.
-  best_index = 0;
-  for (i = 0; i < 7; ++i) {
-    const vector2d_t *pattern = &large_hexbsi;
-    if (!intmv_within_tile(state, orig, mv.x + pattern->x, mv.y + pattern->y, width, height, wpp_limit)) {
-      continue;
-    }
-
-    unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
-                                   (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x,
-                                   (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y,
-                                   width, height, -1);
-    cost += calc_mvd(state, mv.x + pattern->x, mv.y + pattern->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-
-    if (cost < best_cost) {
-      best_cost    = cost;
-      best_index   = i;
-      best_bitcost = bitcost;
+  for (int i = 1; i < 7; ++i) {
+    if (check_mv_cost(info, mv.x + large_hexbs[i].x, mv.y + large_hexbs[i].y)) {
+      best_index = i;
     }
   }
 
   // Iteratively search the 3 new points around the best match, until the best
   // match is in the center.
   while (best_index != 0) {
-    unsigned start; // Starting point of the 3 offsets to be searched.
+    // Starting point of the 3 offsets to be searched.
+    unsigned start;
     if (best_index == 1) {
       start = 6;
     } else if (best_index == 8) {
@@ -782,22 +708,10 @@
     best_index = 0;
 
     // Iterate through the next 3 points.
-    for (i = 0; i < 3; ++i) {
-      const vector2d_t *offset = &large_hexbsstart + i;
-      if (!intmv_within_tile(state, orig, mv.x + offset->x, mv.y + offset->y, width, height, wpp_limit)) {
-        continue;
-      }
-
-      unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
-                                     (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
-                                     (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
-                                     width, height, -1);
-      cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-
-      if (cost < best_cost) {
-        best_cost    = cost;
-        best_index   = start + i;
-        best_bitcost = bitcost;
+    for (int i = 0; i < 3; ++i) {
+      vector2d_t offset = large_hexbs[start + i];
+      if (check_mv_cost(info, mv.x + offset.x, mv.y + offset.y)) {
+        best_index = start + i;
       }
     }
   }
@@ -808,115 +722,45 @@
   best_index = 0;
 
   // Do the final step of the search with a small pattern.
-  for (i = 1; i < 5; ++i) {
-    const vector2d_t *offset = &small_hexbsi;
-    if (!intmv_within_tile(state, orig, mv.x + offset->x, mv.y + offset->y, width, height, wpp_limit)) {
-      continue;
-    }
-
-    unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
-                                   (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x,
-                                   (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y,
-                                   width, height, -1);
-    cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-
-    if (cost > 0 && cost < best_cost) {
-      best_cost    = cost;
-      best_index   = i;
-      best_bitcost = bitcost;
-    }
+  for (int i = 1; i < 5; ++i) {
+    check_mv_cost(info, mv.x + small_hexbs[i].x, mv.y + small_hexbs[i].y);
   }
-
-  // Adjust the movement vector according to the final best match.
-  mv.x += small_hexbsbest_index.x;
-  mv.y += small_hexbsbest_index.y;
-
-  // Return final movement vector in quarter-pixel precision.
-  mv_in_out->x = mv.x << 2;
-  mv_in_out->y = mv.y << 2;
-
-  *bitcost_out = best_bitcost;
-
-  return best_cost;
 }
 
 
-static unsigned search_mv_full(encoder_state_t * const state,
-                               unsigned width, unsigned height,
-                               const kvz_picture *pic, const kvz_picture *ref,
-                               const vector2d_t *orig, vector2d_t *mv_in_out,
-                               int16_t mv_cand22, inter_merge_cand_t merge_candMRG_MAX_NUM_CANDS,
-                               int16_t num_cand, int32_t ref_idx, const int32_t search_range, uint32_t *bitcost_out)
+static void search_mv_full(inter_search_info_t *info,
+                           int32_t search_range,
+                           vector2d_t extra_mv)
 {
-  vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };
-  vector2d_t best_mv = { 0, 0 };
-  unsigned best_cost = UINT32_MAX;
-  uint32_t best_bitcost = 0, bitcost;
-  int wpp_limit = get_wpp_limit(state, orig);
-
-  kvz_mvd_cost_func *calc_mvd = calc_mvd_cost;
-  if (state->encoder_control->cfg.mv_rdo) {
-    calc_mvd = kvz_calc_mvd_cost_cabac;
-  }
-
-  // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list.
-  if (intmv_within_tile(state, orig, 0, 0, width, height, wpp_limit)) {
-    vector2d_t min_mv = { 0 - search_range, 0 - search_range };
-    vector2d_t max_mv = { 0 + search_range, 0 + search_range };
-
-    for (int y = min_mv.y; y <= max_mv.y; ++y) {
-      for (int x = min_mv.x; x <= max_mv.x; ++x) {
-        if (!intmv_within_tile(state, orig, x, y, width, height, wpp_limit)) {
-          continue;
-        }
-        unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
-                                           orig->x + x,
-                                           orig->y + y,
-                                           width, height, -1);
-        cost += calc_mvd(state, x, y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-        if (cost < best_cost) {
-          best_cost = cost;
-          best_bitcost = bitcost;
-          best_mv.x = x;
-          best_mv.y = y;
-        }
-      }
+  // Search around the 0-vector.
+  for (int y = -search_range; y <= search_range; y++) {
+    for (int x = -search_range; x <= search_range; x++) {
+      check_mv_cost(info, x, y);
     }
   }
 
-  // Check mv_in if it's not one of the merge candidates.
-  if (!mv_in_merge(merge_cand, num_cand, &mv) &&
-      intmv_within_tile(state, orig, mv.x, mv.y, width, height, wpp_limit))
-  {
-    vector2d_t min_mv = { mv.x - search_range, mv.y - search_range };
-    vector2d_t max_mv = { mv.x + search_range, mv.y + search_range };
+  // Change to integer precision.
+  extra_mv.x >>= 2;
+  extra_mv.y >>= 2;
 
-    for (int y = min_mv.y; y <= max_mv.y; ++y) {
-      for (int x = min_mv.x; x <= max_mv.x; ++x) {
-        if (!intmv_within_tile(state, orig, x, y, width, height, wpp_limit)) {
-          continue;
-        }
-        unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
-                                           orig->x + x,
-                                           orig->y + y,
-                                           width, height, -1);
-        cost += calc_mvd(state, x, y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-        if (cost < best_cost) {
-          best_cost = cost;
-          best_bitcost = bitcost;
-          best_mv.x = x;
-          best_mv.y = y;
-        }
+  // Check around extra_mv if it's not one of the merge candidates.
+  if (!mv_in_merge(info, extra_mv)) {
+    for (int y = -search_range; y <= search_range; y++) {
+      for (int x = -search_range; x <= search_range; x++) {
+        check_mv_cost(info, extra_mv.x + x, extra_mv.y + y);
       }
     }
   }
 
   // Select starting point from among merge candidates. These should include
   // both mv_cand vectors and (0, 0).
-  for (int i = 0; i < num_cand; ++i) {
-    if (merge_candi.dir == 3) continue;
-    mv.x = merge_candi.mvmerge_candi.dir - 10 >> 2;
-    mv.y = merge_candi.mvmerge_candi.dir - 11 >> 2;
+  for (int i = 0; i < info->num_merge_cand; ++i) {
+    if (info->merge_cand[i].dir == 3) continue;
+
+    vector2d_t mv = {
+      .x = info->merge_cand[i].mv[info->merge_cand[i].dir - 1][0] >> 2,
+      .y = info->merge_cand[i].mv[info->merge_cand[i].dir - 1][1] >> 2,
+    };
 
     // Ignore 0-vector because it has already been checked.
     if (mv.x == 0 && mv.y == 0) continue;
@@ -926,7 +770,7 @@
 
     for (int y = min_mv.y; y <= max_mv.y; ++y) {
       for (int x = min_mv.x; x <= max_mv.x; ++x) {
-        if (!intmv_within_tile(state, orig, x, y, width, height, wpp_limit)) {
+        if (!intmv_within_tile(info, x, y)) {
           continue;
         }
 
@@ -936,9 +780,9 @@
           int xx = 0;
           int yy = 0;
           if (j >= 0) {
-            if (merge_candj.dir == 3) continue;
-            xx = merge_candj.mvmerge_candj.dir - 10 >> 2;
-            yy = merge_candj.mvmerge_candj.dir - 11 >> 2;
+            if (info->merge_cand[j].dir == 3) continue;
+            xx = info->merge_cand[j].mv[info->merge_cand[j].dir - 1][0] >> 2;
+            yy = info->merge_cand[j].mv[info->merge_cand[j].dir - 1][1] >> 2;
           }
           if (x >= xx - search_range && x <= xx + search_range &&
               y >= yy - search_range && y <= yy + search_range)
@@ -950,51 +794,20 @@
         }
         if (already_tested) continue;
 
-        unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
-                                           orig->x + x,
-                                           orig->y + y,
-                                           width, height, -1);
-        cost += calc_mvd(state, x, y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
-        if (cost < best_cost) {
-          best_cost = cost;
-          best_bitcost = bitcost;
-          best_mv.x = x;
-          best_mv.y = y;
-        }
+        check_mv_cost(info, x, y);
       }
     }
   }
-
-  mv_in_out->x = best_mv.x << 2;
-  mv_in_out->y = best_mv.y << 2;
-
-  *bitcost_out = best_bitcost;
-
-  return best_cost;
 }
 
 
 /**
  * \brief Do fractional motion estimation
  *
- * \param width      width of the block
- * \param height     height of the block
- * \param pic        Picture motion vector is searched for.
- * \param ref        Picture motion vector is searched from.
- * \param orig       Top left corner of the searched for block.
- * \param mv_in_out  Predicted mv in and best out. Quarter pixel precision.
- *
- * \returns  Cost of the motion vector.
- *
  * Algoritm first searches 1/2-pel positions around integer mv and after best match is found,
  * refines the search by searching best 1/4-pel postion around best 1/2-pel position.
  */
-static unsigned search_frac(encoder_state_t * const state,
-                            unsigned width, unsigned height,
-                            const kvz_picture *pic, const kvz_picture *ref,
-                            const vector2d_t *orig, vector2d_t *mv_in_out,
-                            int16_t mv_cand22, inter_merge_cand_t merge_candMRG_MAX_NUM_CANDS,
-                            int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out)
+static void search_frac(inter_search_info_t *info)
 {
   // Map indexes to relative coordinates in the following way:
   // 5 3 6
@@ -1006,14 +819,12 @@
       {  1, -1 },  { -1,  1 },  {  1,  1 }
   };
 
-  int wpp_limit = get_wpp_limit(state, orig);
+  // Set mv to pixel precision
+  vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 };
 
-  //Set mv to halfpel precision
-  vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };
   unsigned best_cost = UINT32_MAX;
   uint32_t best_bitcost = 0;
   uint32_t bitcosts[4] = { 0 };
-  unsigned i;
   unsigned best_index = 0;
 
   unsigned costs[4] = { 0 };
@@ -1043,69 +854,99 @@
   hpel_pos[6] = fracpel_blocks[HPEL_POS_DIA] + (LCU_WIDTH + 1);
   hpel_pos[7] = fracpel_blocks[HPEL_POS_DIA] + (LCU_WIDTH + 1) + 1;
 
-  int fme_level = state->encoder_control->cfg.fme_level;
+  const kvz_picture *ref = info->ref;
+  const kvz_picture *pic = info->pic;
+  vector2d_t orig = info->origin;
+  const int width = info->width;
+  const int height = info->height;
 
-  kvz_mvd_cost_func *calc_mvd = calc_mvd_cost;
-  if (state->encoder_control->cfg.mv_rdo) {
-    calc_mvd = kvz_calc_mvd_cost_cabac;
-  }
-
-  kvz_get_extended_block(orig->x, orig->y, mv.x-1, mv.y-1,
-                state->tile->lcu_offset_x * LCU_WIDTH,
-                state->tile->lcu_offset_y * LCU_WIDTH,
-                ref->y, ref->width, ref->height, FILTER_SIZE, width+1, height+1, &src);
+  const encoder_state_t *state = info->state;
+  int fme_level = state->encoder_control->cfg.fme_level;
 
-  kvz_filter_frac_blocks_luma(state->encoder_control, src.orig_topleft, src.stride, width,
-    height, fracpel_blocks, fme_level);
+  kvz_get_extended_block(orig.x, orig.y, mv.x - 1, mv.y - 1,
+                state->tile->offset_x,
+                state->tile->offset_y,
+                ref->y, ref->width, ref->height, FILTER_SIZE,
+                width+1, height+1,
+                &src);
+
+  kvz_filter_frac_blocks_luma(state->encoder_control,
+                              src.orig_topleft,
+                              src.stride,
+                              width,
+                              height,
+                              fracpel_blocks,
+                              fme_level);
 
   kvz_pixel tmp_pic[LCU_WIDTH*LCU_WIDTH];
-  kvz_pixels_blit(pic->y + orig->y * pic->stride + orig->x, tmp_pic, width, height, pic->stride, width);
+  kvz_pixels_blit(pic->y + orig.y * pic->stride + orig.x,
+                  tmp_pic,
+                  width,
+                  height,
+                  pic->stride,
+                  width);
 
   // Search integer position
   costs[0] = kvz_satd_any_size(width, height,
                             tmp_pic, width,
                             src.orig_topleft + src.stride + 1, src.stride);
 
-  costs0 += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts0);
+  costs[0] += info->mvd_cost_func(state,
+                                  mv.x, mv.y, 2,
+                                  info->mv_cand,
+                                  info->merge_cand,
+                                  info->num_merge_cand,
+                                  info->ref_idx,
+                                  &bitcosts[0]);
   best_cost = costs[0];
   best_bitcost = bitcosts[0];
 
   int last_hpel_index = (fme_level == 1) ? 4 : 8;
 
   //Set mv to half-pixel precision
-  mv.x <<= 1;
-  mv.y <<= 1;
+  mv.x *= 2;
+  mv.y *= 2;
 
   // Search halfpel positions around best integer mv
-  for (i = 1; i <= last_hpel_index; i+=4) {
+  for (int i = 1; i <= last_hpel_index; i += 4) {
     const vector2d_t *pattern[4] = { &square[i], &square[i + 1], &square[i + 2], &square[i + 3] };
-    
-    int8_t within_tile4 = {
-      fracmv_within_tile(state, orig, (mv.x + pattern0->x) << 1, (mv.y + pattern0->y) << 1, width, height, wpp_limit),
-      fracmv_within_tile(state, orig, (mv.x + pattern1->x) << 1, (mv.y + pattern1->y) << 1, width, height, wpp_limit),
-      fracmv_within_tile(state, orig, (mv.x + pattern2->x) << 1, (mv.y + pattern2->y) << 1, width, height, wpp_limit),
-      fracmv_within_tile(state, orig, (mv.x + pattern3->x) << 1, (mv.y + pattern3->y) << 1, width, height, wpp_limit),
+
+    int8_t within_tile[4];
+    for (int j = 0; j < 4; j++) {
+      within_tile[j] =
+        fracmv_within_tile(info, (mv.x + pattern[j]->x) * 2, (mv.y + pattern[j]->y) * 2);
     };
 
     int hpel_strides[4] = {
-      (LCU_WIDTH + 1), 
-      (LCU_WIDTH + 1), 
-      (LCU_WIDTH + 1), 
+      (LCU_WIDTH + 1),
+      (LCU_WIDTH + 1),
+      (LCU_WIDTH + 1),
       (LCU_WIDTH + 1)
     };
 
     kvz_satd_any_size_quad(width, height, (const kvz_pixel**)(hpel_pos + i - 1), hpel_strides, tmp_pic, width, 4, costs, within_tile);
 
-    costs0 += calc_mvd(state, mv.x + pattern0->x, mv.y + pattern0->y, 1, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts0);
-    costs1 += calc_mvd(state, mv.x + pattern1->x, mv.y + pattern1->y, 1, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts1);
-    costs2 += calc_mvd(state, mv.x + pattern2->x, mv.y + pattern2->y, 1, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts2);
-    costs3 += calc_mvd(state, mv.x + pattern3->x, mv.y + pattern3->y, 1, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts3);
+    for (int j = 0; j < 4; j++) {
+      if (within_tile[j]) {
+        costs[j] += info->mvd_cost_func(
+            state,
+            mv.x + pattern[j]->x,
+            mv.y + pattern[j]->y,
+            1,
+            info->mv_cand,
+            info->merge_cand,
+            info->num_merge_cand,
+            info->ref_idx,
+            &bitcosts[j]
+        );
+      }
+    }
 
     for (int j = 0; j < 4; ++j) {
       if (within_tile[j] && costs[j] < best_cost) {
         best_cost = costs[j];
-        best_index = i + j;
         best_bitcost = bitcosts[j];
+        best_index = i + j;
       }
     }
   }
@@ -1117,8 +958,8 @@
   mv.y += square[best_index].y;
 
   //Set mv to quarterpel precision
-  mv.x <<= 1;
-  mv.y <<= 1;
+  mv.x *= 2;
+  mv.y *= 2;
 
   if (fme_level >= 3) {
 
@@ -1127,15 +968,14 @@
     int last_qpel_index = (fme_level == 3) ? 4 : 8;
 
     //Search quarterpel points around best halfpel mv
-    for (i = 1; i <= last_qpel_index; i += 4) {
+    for (int i = 1; i <= last_qpel_index; i += 4) {
       const vector2d_t *pattern[4] = { &square[i], &square[i + 1], &square[i + 2], &square[i + 3] };
 
-      int8_t within_tile4 = {
-        fracmv_within_tile(state, orig, (mv.x + pattern0->x), (mv.y + pattern0->y), width, height, wpp_limit),
-        fracmv_within_tile(state, orig, (mv.x + pattern1->x), (mv.y + pattern1->y), width, height, wpp_limit),
-        fracmv_within_tile(state, orig, (mv.x + pattern2->x), (mv.y + pattern2->y), width, height, wpp_limit),
-        fracmv_within_tile(state, orig, (mv.x + pattern3->x), (mv.y + pattern3->y), width, height, wpp_limit),
-      };
+      int8_t within_tile[4];
+      for (int j = 0; j < 4; j++) {
+        within_tile[j] =
+          fracmv_within_tile(info, mv.x + pattern[j]->x, mv.y + pattern[j]->y);
+      }
 
       int qpel_indices[4] = { 0 };
       int int_offset_x[4] = { 0 };
@@ -1183,16 +1023,27 @@
 
       kvz_satd_any_size_quad(width, height, (const kvz_pixel**)qpel_pos, qpel_strides, tmp_pic, width, 4, costs, within_tile);
 
-      costs[0] += calc_mvd(state, mv.x + pattern[0]->x, mv.y + pattern[0]->y, 0, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[0]);
-      costs[1] += calc_mvd(state, mv.x + pattern[1]->x, mv.y + pattern[1]->y, 0, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[1]);
-      costs[2] += calc_mvd(state, mv.x + pattern[2]->x, mv.y + pattern[2]->y, 0, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[2]);
-      costs[3] += calc_mvd(state, mv.x + pattern[3]->x, mv.y + pattern[3]->y, 0, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[3]);
+      for (int j = 0; j < 4; j++) {
+        if (within_tile[j]) {
+          costs[j] += info->mvd_cost_func(
+              state,
+              mv.x + pattern[j]->x,
+              mv.y + pattern[j]->y,
+              0,
+              info->mv_cand,
+              info->merge_cand,
+              info->num_merge_cand,
+              info->ref_idx,
+              &bitcosts[j]
+          );
+        }
+      }
 
       for (int j = 0; j < 4; ++j) {
         if (within_tile[j] && costs[j] < best_cost) {
           best_cost = costs[j];
-          best_index = i + j;
           best_bitcost = bitcosts[j];
+          best_index = i + j;
         }
       }
     }
@@ -1202,61 +1053,79 @@
     mv.y += square[best_index].y;
   }
 
-  mv_in_out->x = mv.x;
-  mv_in_out->y = mv.y;
-
-  *bitcost_out = best_bitcost;
+  info->best_mv = mv;
+  info->best_cost = best_cost;
+  info->best_bitcost = best_bitcost;
 
   if (src.malloc_used) free(src.buffer);
-
-  return best_cost;
 }
 
 
 /**
  * \brief Perform inter search for a single reference frame.
  */
-static void search_pu_inter_ref(encoder_state_t * const state,
-                                int x, int y,
-                                int width, int height,
+static void search_pu_inter_ref(inter_search_info_t *info,
                                 int depth,
                                 lcu_t *lcu, cu_info_t *cur_cu,
-                                int16_t mv_cand[2][2],
-                                inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
-                                int16_t num_cand,
-                                unsigned ref_idx,
-                                uint32_t(*get_mvd_cost)(encoder_state_t * const, vector2d_t *, const cabac_data_t*),
                                 double *inter_cost,
                                 uint32_t *inter_bitcost)
 {
-  const videoframe_t * const frame = state->tile->frame;
-  kvz_picture *ref_image = state->frame->ref->images[ref_idx];
-  const vector2d_t orig = { x, y };
-  uint32_t temp_bitcost = 0;
-  uint32_t temp_cost = 0;
-  int32_t merged = 0;
-  uint8_t cu_mv_cand = 0;
-  int8_t merge_idx = 0;
-  int8_t ref_list = state->frame->refmap[ref_idx].list-1;
+  const kvz_config *cfg = &info->state->encoder_control->cfg;
+
+  // which list, L0 or L1, ref_idx is in and in what index
+  int8_t ref_list = -1;
+  // the index of the ref_idx in L0 or L1 list
+  int8_t LX_idx;
+  // max value of LX_idx plus one
+  const int8_t LX_IDX_MAX_PLUS_1 = MAX(info->state->frame->ref_LX_size[0],
+                                       info->state->frame->ref_LX_size[1]);
+
+  for (LX_idx = 0; LX_idx < LX_IDX_MAX_PLUS_1; LX_idx++)
+  {
+    // check if ref_idx is in L0
+    if (LX_idx < info->state->frame->ref_LX_size[0] &&
+        info->state->frame->ref_LX[0][LX_idx] == info->ref_idx) {
+      ref_list = 0;
+      break;
+    }
+
+    // check if ref_idx is in L1
+    if (LX_idx < info->state->frame->ref_LX_size[1] &&
+        info->state->frame->ref_LX[1][LX_idx] == info->ref_idx) {
+      ref_list = 1;
+      break;
+    }
+  }
+  // ref_idx has to be found in either L0 or L1
+  assert(LX_idx < LX_IDX_MAX_PLUS_1);
+
+  // store temp values to be stored back later
   int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list];
+
   // Get MV candidates
-  cur_cu->inter.mv_ref[ref_list] = ref_idx;
-  kvz_inter_get_mv_cand(state, x, y, width, height, mv_cand, cur_cu, lcu, ref_list);
+  cur_cu->inter.mv_ref[ref_list] = LX_idx;
+
+  kvz_inter_get_mv_cand(info->state,
+    info->origin.x,
+    info->origin.y,
+    info->width,
+    info->height,
+    info->mv_cand,
+    cur_cu,
+    lcu,
+    ref_list);
+
+  // store old values back
   cur_cu->inter.mv_ref[ref_list] = temp_ref_idx;
 
-
   vector2d_t mv = { 0, 0 };
   {
     // Take starting point for MV search from previous frame.
     // When temporal motion vector candidates are added, there is probably
     // no point to this anymore, but for now it helps.
-    const vector2d_t tile_top_left_corner = {
-        (state->tile->lcu_offset_x << LOG2_LCU_WIDTH),
-        (state->tile->lcu_offset_y << LOG2_LCU_WIDTH)
-    };
-    const int mid_x = tile_top_left_corner.x + x + (width >> 1);
-    const int mid_y = tile_top_left_corner.y + y + (height >> 1);
-    const cu_array_t* ref_array = state->frame->ref->cu_arrays[ref_idx];
+    const int mid_x = info->state->tile->offset_x + info->origin.x + (info->width >> 1);
+    const int mid_y = info->state->tile->offset_y + info->origin.y + (info->height >> 1);
+    const cu_array_t* ref_array = info->state->frame->ref->cu_arrays[info->ref_idx];
     const cu_info_t* ref_cu = kvz_cu_array_at_const(ref_array, mid_x, mid_y);
     if (ref_cu->type == CU_INTER) {
       if (ref_cu->inter.mv_dir & 1) {
@@ -1270,7 +1139,7 @@
   }
 
   int search_range = 32;
-  switch (state->encoder_control->cfg.ime_algorithm) {
+  switch (cfg->ime_algorithm) {
     case KVZ_IME_FULL64: search_range = 64; break;
     case KVZ_IME_FULL32: search_range = 32; break;
     case KVZ_IME_FULL16: search_range = 16; break;
@@ -1278,94 +1147,81 @@
     default: break;
   }
 
-  switch (state->encoder_control->cfg.ime_algorithm) {
+  info->best_cost = UINT32_MAX;
+
+  switch (cfg->ime_algorithm) {
     case KVZ_IME_TZ:
-      temp_cost += tz_search(state,
-                             width, height,
-                             frame->source,
-                             ref_image,
-                             &orig,
-                             &mv,
-                             mv_cand,
-                             merge_cand,
-                             num_cand,
-                             ref_idx,
-                             &temp_bitcost);
+      tz_search(info, mv);
       break;
 
-
     case KVZ_IME_FULL64:
     case KVZ_IME_FULL32:
     case KVZ_IME_FULL16:
     case KVZ_IME_FULL8:
     case KVZ_IME_FULL:
-      temp_cost += search_mv_full(state,
-                                  width, height,
-                                  frame->source,
-                                  ref_image,
-                                  &orig,
-                                  &mv,
-                                  mv_cand,
-                                  merge_cand,
-                                  num_cand,
-                                  ref_idx,
-                                  search_range,
-                                  &temp_bitcost);
+      search_mv_full(info, search_range, mv);
       break;
 
     default:
-      temp_cost += hexagon_search(state,
-                                  width, height,
-                                  frame->source,
-                                  ref_image,
-                                  &orig,
-                                  &mv,
-                                  mv_cand,
-                                  merge_cand,
-                                  num_cand,
-                                  ref_idx,
-                                  &temp_bitcost);
+      hexagon_search(info, mv);
       break;
   }
 
-  if (state->encoder_control->cfg.fme_level > 0 && temp_cost < *inter_cost) {
-    temp_cost = search_frac(state,
-                            width, height,
-                            frame->source,
-                            ref_image,
-                            &orig,
-                            &mv,
-                            mv_cand,
-                            merge_cand,
-                            num_cand,
-                            ref_idx,
-                            &temp_bitcost);
+  if (cfg->fme_level > 0 && info->best_cost < *inter_cost) {
+    search_frac(info);
+
+  } else if (info->best_cost < UINT32_MAX) {
+    // Recalculate inter cost with SATD.
+    info->best_cost = kvz_image_calc_satd(
+        info->state->tile->frame->source,
+        info->ref,
+        info->origin.x,
+        info->origin.y,
+        info->state->tile->offset_x + info->origin.x + (info->best_mv.x >> 2),
+        info->state->tile->offset_y + info->origin.y + (info->best_mv.y >> 2),
+        info->width,
+        info->height);
+    info->best_cost += info->best_bitcost * (int)(info->state->lambda_sqrt + 0.5);
   }
-  
-  merged = 0;
+
+  mv = info->best_mv;
+
+  int merged = 0;
+  int merge_idx = 0;
   // Check every candidate to find a match
-  for(merge_idx = 0; merge_idx < num_cand; merge_idx++) {
-    if (merge_cand[merge_idx].dir != 3 &&
-        merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][0] == mv.x &&
-        merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][1] == mv.y &&
-        (uint32_t)merge_cand[merge_idx].ref[merge_cand[merge_idx].dir - 1] == ref_idx) {
+  for (merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) {
+    if (info->merge_cand[merge_idx].dir != 3 &&
+        info->merge_cand[merge_idx].mv[info->merge_cand[merge_idx].dir - 1][0] == mv.x &&
+        info->merge_cand[merge_idx].mv[info->merge_cand[merge_idx].dir - 1][1] == mv.y &&
+        (uint32_t)info->state->frame->ref_LX[info->merge_cand[merge_idx].dir - 1][
+        info->merge_cand[merge_idx].ref[info->merge_cand[merge_idx].dir - 1]] == info->ref_idx)
+    {
       merged = 1;
       break;
     }
   }
 
   // Only check when candidates are different
-  if (!merged && (mv_cand[0][0] != mv_cand[1][0] || mv_cand[0][1] != mv_cand[1][1])) {
+  int cu_mv_cand = 0;
+  if (!merged && (
+        info->mv_cand[0][0] != info->mv_cand[1][0] ||
+        info->mv_cand[0][1] != info->mv_cand[1][1]))
+  {
+    uint32_t (*mvd_coding_cost)(const encoder_state_t * const state,
+                                vector2d_t *,
+                                const cabac_data_t*) =
+      cfg->mv_rdo ? kvz_get_mvd_coding_cost_cabac : get_mvd_coding_cost;
+
     vector2d_t mvd_temp1, mvd_temp2;
     int cand1_cost,cand2_cost;
 
-    mvd_temp1.x = mv.x - mv_cand[0][0];
-    mvd_temp1.y = mv.y - mv_cand[0][1];
-    cand1_cost = get_mvd_cost(state, &mvd_temp1, &state->cabac);
+    mvd_temp1.x = mv.x - info->mv_cand[0][0];
+    mvd_temp1.y = mv.y - info->mv_cand[0][1];
+    cand1_cost = mvd_coding_cost(info->state, &mvd_temp1, &info->state->cabac);
 
-    mvd_temp2.x = mv.x - mv_cand[1][0];
-    mvd_temp2.y = mv.y - mv_cand[1][1];
-    cand2_cost = get_mvd_cost(state, &mvd_temp2, &state->cabac);
+    mvd_temp2.x = mv.x - info->mv_cand[1][0];
+    mvd_temp2.y = mv.y - info->mv_cand[1][1];
+    cand2_cost = mvd_coding_cost(info->state, &mvd_temp2, &info->state->cabac);
 
     // Select candidate 1 if it has lower cost
     if (cand2_cost < cand1_cost) {
@@ -1373,20 +1229,21 @@
     }
   }
 
-  if (temp_cost < *inter_cost) {
+  if (info->best_cost < *inter_cost) {
     // Map reference index to L0/L1 pictures
     cur_cu->inter.mv_dir = ref_list+1;
-    uint8_t mv_ref_coded = state->frame->refmap[ref_idx].idx;
+    uint8_t mv_ref_coded = LX_idx;
+
+    cur_cu->merged                  = merged;
+    cur_cu->merge_idx               = merge_idx;
+    cur_cu->inter.mv_ref[ref_list]  = LX_idx;
+    cur_cu->inter.mv[ref_list][0]   = (int16_t)mv.x;
+    cur_cu->inter.mv[ref_list][1]   = (int16_t)mv.y;
 
-    cur_cu->merged        = merged;
-    cur_cu->merge_idx     = merge_idx;
-    cur_cu->inter.mv_ref[ref_list] = ref_idx;
-    cur_cu->inter.mv[ref_list][0] = (int16_t)mv.x;
-    cur_cu->inter.mv[ref_list][1] = (int16_t)mv.y;
     CU_SET_MV_CAND(cur_cu, ref_list, cu_mv_cand);
 
-    *inter_cost = temp_cost;
-    *inter_bitcost = temp_bitcost + cur_cu->inter.mv_dir - 1 + mv_ref_coded;
+    *inter_cost = info->best_cost;
+    *inter_bitcost = info->best_bitcost + cur_cu->inter.mv_dir - 1 + mv_ref_coded;
   }
 }
 
@@ -1417,6 +1274,7 @@
   *inter_cost = MAX_INT;
   *inter_bitcost = MAX_INT;
 
+  const kvz_config *cfg = &state->encoder_control->cfg;
   const videoframe_t * const frame = state->tile->frame;
   const int width_cu  = LCU_WIDTH >> depth;
   const int x         = PU_GET_X(part_mode, width_cu, x_cu, i_pu);
@@ -1435,58 +1293,39 @@
   const int y_local   = SUB_SCU(y);
   cu_info_t *cur_cu   = LCU_GET_CU_AT_PX(lcu, x_local, y_local);
 
-  int16_t mv_cand[2][2];
-  // Search for merge mode candidate
-  inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS];
-  // Get list of candidates
-  int16_t num_cand = 0;
-  if (!state->encoder_control->cfg.tmvp_enable) {
-    num_cand = kvz_inter_get_merge_cand(state,
-                                        x, y,
-                                        width, height,
-                                        merge_a1, merge_b1,
-                                        merge_cand,
-                                        lcu,
-                                        0);
-  }
+  inter_search_info_t info = {
+    .state          = state,
+    .pic            = frame->source,
+    .origin         = { x, y },
+    .width          = width,
+    .height         = height,
+    .mvd_cost_func  = cfg->mv_rdo ? kvz_calc_mvd_cost_cabac : calc_mvd_cost,
+  };
 
-  uint32_t(*get_mvd_cost)(encoder_state_t * const state, vector2d_t *, const cabac_data_t*) = get_mvd_coding_cost;
-  if (state->encoder_control->cfg.mv_rdo) {
-    get_mvd_cost = kvz_get_mvd_coding_cost_cabac;
-  }
+  // Search for merge mode candidates
+  info.num_merge_cand = kvz_inter_get_merge_cand(
+      state,
+      x, y,
+      width, height,
+      merge_a1, merge_b1,
+      info.merge_cand,
+      lcu
+  );
 
   // Default to candidate 0
   CU_SET_MV_CAND(cur_cu, 0, 0);
   CU_SET_MV_CAND(cur_cu, 1, 0);
 
-  uint32_t ref_idx;
-  for (ref_idx = 0; ref_idx < state->frame->ref->used_size; ref_idx++) {
-    if (state->encoder_control->cfg.tmvp_enable) {
-      // Get list of candidates, TMVP required MV scaling for each reference
-      num_cand = kvz_inter_get_merge_cand(state,
-                                          x, y,
-                                          width, height,
-                                          merge_a1, merge_b1,
-                                          merge_cand,
-                                          lcu,
-                                          ref_idx);
-    }
-    search_pu_inter_ref(state,
-                        x, y,
-                        width, height,
-                        depth,
-                        lcu, cur_cu,
-                        mv_cand, merge_cand,
-                        num_cand,
-                        ref_idx,
-                        get_mvd_cost,
-                        inter_cost,
-                        inter_bitcost);
+  for (int ref_idx = 0; ref_idx < state->frame->ref->used_size; ref_idx++) {
+    info.ref_idx = ref_idx;
+    info.ref = state->frame->ref->images[ref_idx];
+
+    search_pu_inter_ref(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost);
   }
 
   // Search bi-pred positions
   bool can_use_bipred = state->frame->slicetype == KVZ_SLICE_B
-    && state->encoder_control->cfg.bipred
+    && cfg->bipred
     && width + height >= 16; // 4x8 and 8x4 PBs are restricted to unipred
 
   if (can_use_bipred) {
@@ -1494,50 +1333,50 @@
     unsigned cu_width = LCU_WIDTH >> depth;
     static const uint8_t priorityList0[] = { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 };
     static const uint8_t priorityList1[] = { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 };
-    const unsigned num_cand_pairs = MIN(num_cand * (num_cand - 1), 12);
+    const unsigned num_cand_pairs =
+      MIN(info.num_merge_cand * (info.num_merge_cand - 1), 12);
 
-    kvz_mvd_cost_func *calc_mvd = calc_mvd_cost;
-    if (state->encoder_control->cfg.mv_rdo) {
-      calc_mvd = kvz_calc_mvd_cost_cabac;
-    }
+    inter_merge_cand_t *merge_cand = info.merge_cand;
 
     for (int32_t idx = 0; idx < num_cand_pairs; idx++) {
       uint8_t i = priorityList0[idx];
       uint8_t j = priorityList1[idx];
-      if (i >= num_cand || j >= num_cand) break;
+      if (i >= info.num_merge_cand || j >= info.num_merge_cand) break;
 
       // Find one L0 and L1 candidate according to the priority list
       if ((merge_cand[i].dir & 0x1) && (merge_cand[j].dir & 0x2)) {
-        if (merge_cand[i].ref[0] != merge_cand[j].ref[1] ||
-          merge_cand[i].mv[0][0] != merge_cand[j].mv[1][0] ||
-          merge_cand[i].mv[0][1] != merge_cand[j].mv[1][1]) {
+        if (state->frame->ref_LX[0][merge_cand[i].ref[0]] !=
+            state->frame->ref_LX[1][merge_cand[j].ref[1]] ||
+
+            merge_cand[i].mv[0][0] != merge_cand[j].mv[1][0] ||
+            merge_cand[i].mv[0][1] != merge_cand[j].mv[1][1])
+        {
           uint32_t bitcost[2];
           uint32_t cost = 0;
           int8_t cu_mv_cand = 0;
           int16_t mv[2][2];
           kvz_pixel tmp_block[64 * 64];
           kvz_pixel tmp_pic[64 * 64];
-          // Force L0 and L1 references
-          if (state->frame->refmap[merge_cand[i].ref[0]].list == 2 || state->frame->refmap[merge_cand[j].ref[1]].list == 1) continue;
 
           mv[0][0] = merge_cand[i].mv[0][0];
           mv[0][1] = merge_cand[i].mv[0][1];
           mv[1][0] = merge_cand[j].mv[1][0];
           mv[1][1] = merge_cand[j].mv[1][1];
 
+          // Don't try merge candidates that don't satisfy mv constraints.
+          if (!fracmv_within_tile(&info, mv[0][0], mv[0][1]) ||
+              !fracmv_within_tile(&info, mv[1][0], mv[1][1]))
           {
-            // Don't try merge candidates that don't satisfy mv constraints.
-            vector2d_t orig = { x, y };
-            if (!fracmv_within_tile(state, &orig, mv[0][0], mv[0][1], width, height, -1) ||
-                !fracmv_within_tile(state, &orig, mv[1][0], mv[1][1], width, height, -1))
-            {
-              continue;
-            }
+            continue;
           }
 
           kvz_inter_recon_lcu_bipred(state,
-                                     state->frame->ref->images[merge_cand[i].ref[0]],
-                                     state->frame->ref->images[merge_cand[j].ref[1]],
+                                     state->frame->ref->images[
+                                       state->frame->ref_LX[0][merge_cand[i].ref[0]]
+                                     ],
+                                     state->frame->ref->images[
+                                       state->frame->ref_LX[1][merge_cand[j].ref[1]]
+                                     ],
                                      x, y,
                                      width,
                                      height,
@@ -1555,16 +1394,31 @@
 
           cost = kvz_satd_any_size(cu_width, cu_width, tmp_pic, cu_width, tmp_block, cu_width);
 
-          cost += calc_mvd(state, merge_cand[i].mv[0][0], merge_cand[i].mv[0][1], 0, mv_cand, merge_cand, 0, ref_idx, &bitcost[0]);
-          cost += calc_mvd(state, merge_cand[i].mv[1][0], merge_cand[i].mv[1][1], 0, mv_cand, merge_cand, 0, ref_idx, &bitcost[1]);
+          cost += info.mvd_cost_func(state,
+                                     merge_cand[i].mv[0][0],
+                                     merge_cand[i].mv[0][1],
+                                     0,
+                                     info.mv_cand,
+                                     NULL, 0, 0,
+                                     &bitcost[0]);
+          cost += info.mvd_cost_func(state,
+                                     merge_cand[i].mv[1][0],
+                                     merge_cand[i].mv[1][1],
+                                     0,
+                                     info.mv_cand,
+                                     NULL, 0, 0,
+                                     &bitcost[1]);
+
+          const uint8_t mv_ref_coded[2] = {
+            merge_cand[i].ref[0],
+            merge_cand[j].ref[1]
+          };
+          const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */;
+          cost += state->lambda_sqrt * extra_bits + 0.5;
 
-          if (cost < *inter_cost) {
 
+          if (cost < *inter_cost) {
             cur_cu->inter.mv_dir = 3;
-            uint8_t mv_ref_coded[2] = {
-              state->frame->refmap[merge_cand[i].ref[0]].idx,
-              state->frame->refmap[merge_cand[j].ref[1]].idx
-            };
 
             cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0];
             cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1];
@@ -1574,16 +1428,16 @@
             cur_cu->inter.mv[1][0] = merge_cand[j].mv[1][0];
             cur_cu->inter.mv[1][1] = merge_cand[j].mv[1][1];
             cur_cu->merged = 0;
-                        
+
             // Check every candidate to find a match
-            for(int merge_idx = 0; merge_idx < num_cand; merge_idx++) {
-              if (
-                  merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] &&
-                  merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] &&     
+            for (int merge_idx = 0; merge_idx < info.num_merge_cand; merge_idx++) {
+              if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] &&
+                  merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] &&
                   merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] &&
-                  merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] &&    
-                  merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] && 
-                  merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) {
+                  merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] &&
+                  merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] &&
+                  merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1])
+              {
                 cur_cu->merged = 1;
                 cur_cu->merge_idx = merge_idx;
                 break;
@@ -1593,28 +1447,36 @@
             // Each motion vector has its own candidate
             for (int reflist = 0; reflist < 2; reflist++) {
               cu_mv_cand = 0;
-              kvz_inter_get_mv_cand(state, x, y, width, height, mv_cand, cur_cu, lcu, reflist);
-              if ((mv_cand[0][0] != mv_cand[1][0] || mv_cand[0][1] != mv_cand[1][1])) {
+              kvz_inter_get_mv_cand(state, x, y, width, height, info.mv_cand, cur_cu, lcu, reflist);
+              if (info.mv_cand[0][0] != info.mv_cand[1][0] ||
+                  info.mv_cand[0][1] != info.mv_cand[1][1])
+              {
+                uint32_t (*mvd_coding_cost)(const encoder_state_t * const state,
+                                            vector2d_t *,
+                                            const cabac_data_t*) =
+                  cfg->mv_rdo ? kvz_get_mvd_coding_cost_cabac : get_mvd_coding_cost;
+
                 vector2d_t mvd_temp1, mvd_temp2;
                 int cand1_cost, cand2_cost;
 
-                mvd_temp1.x = cur_cu->inter.mv[reflist][0] - mv_cand[0][0];
-                mvd_temp1.y = cur_cu->inter.mv[reflist][1] - mv_cand[0][1];
-                cand1_cost = get_mvd_cost(state, &mvd_temp1, (cabac_data_t*)&state->cabac);
+                mvd_temp1.x = cur_cu->inter.mv[reflist][0] - info.mv_cand[0][0];
+                mvd_temp1.y = cur_cu->inter.mv[reflist][1] - info.mv_cand[0][1];
+                cand1_cost = mvd_coding_cost(state, &mvd_temp1, (cabac_data_t*)&state->cabac);
 
-                mvd_temp2.x = cur_cu->inter.mv[reflist][0] - mv_cand[1][0];
-                mvd_temp2.y = cur_cu->inter.mv[reflist][1] - mv_cand[1][1];
-                cand2_cost = get_mvd_cost(state, &mvd_temp2, (cabac_data_t*)&state->cabac);
+                mvd_temp2.x = cur_cu->inter.mv[reflist][0] - info.mv_cand[1][0];
+                mvd_temp2.y = cur_cu->inter.mv[reflist][1] - info.mv_cand[1][1];
+                cand2_cost = mvd_coding_cost(state, &mvd_temp2, (cabac_data_t*)&state->cabac);
 
                 // Select candidate 1 if it has lower cost
                 if (cand2_cost < cand1_cost) {
-                  cu_mv_cand = 1;                  
+                  cu_mv_cand = 1;
                 }
               }
               CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand);
             }
+
             *inter_cost = cost;
-            *inter_bitcost = bitcost[0] + bitcost[1] + cur_cu->inter.mv_dir - 1 + mv_ref_coded[0] + mv_ref_coded[1];
+            *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits;
           }
         }
       }
@@ -1622,11 +1484,8 @@
     FREE_POINTER(templcu);
   }
 
-  if (*inter_cost < INT_MAX) {
-    const vector2d_t orig = { x, y };
-    if (cur_cu->inter.mv_dir == 1) {
-      assert(fracmv_within_tile(state, &orig, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1], width, height, -1));
-    }
+  if (*inter_cost < INT_MAX && cur_cu->inter.mv_dir == 1) {
+    assert(fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]));
   }
 }
 
@@ -1707,6 +1566,13 @@
 
     search_pu_inter(state, x, y, depth, part_mode, i, lcu, &cost, &bitcost);
 
+    if (cost >= MAX_INT) {
+      // Could not find any motion vector.
+      *inter_cost    = MAX_INT;
+      *inter_bitcost = MAX_INT;
+      return;
+    }
+
     *inter_cost    += cost;
     *inter_bitcost += bitcost;

kvazaar-1.1.0.tar.gz/src/search_inter.h -> kvazaar-1.2.0.tar.gz/src/search_inter.h Changed

@@ -50,14 +50,14 @@
   HPEL_POS_DIA = 2
 };
 
-typedef int kvz_mvd_cost_func(encoder_state_t * const state,
-                              int x, int y,
-                              int mv_shift,
-                              int16_t mv_cand[2][2],
-                              inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
-                              int16_t num_cand,
-                              int32_t ref_idx,
-                              uint32_t *bitcost);
+typedef uint32_t kvz_mvd_cost_func(const encoder_state_t *state,
+                                  int x, int y,
+                                  int mv_shift,
+                                  int16_t mv_cand[2][2],
+                                  inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
+                                  int16_t num_cand,
+                                  int32_t ref_idx,
+                                  uint32_t *bitcost);
 
 void kvz_search_cu_inter(encoder_state_t * const state,
                          int x, int y, int depth,
@@ -73,4 +73,10 @@
                        double *inter_cost,
                        uint32_t *inter_bitcost);
 
+
+unsigned kvz_inter_satd_cost(const encoder_state_t* state,
+                             const lcu_t *lcu,
+                             int x,
+                             int y);
+
 #endif // SEARCH_INTER_H_

kvazaar-1.1.0.tar.gz/src/search_intra.c -> kvazaar-1.2.0.tar.gz/src/search_intra.c Changed

@@ -220,15 +220,20 @@
     nosplit_cost = 0.0;
 
     cbf_clear(&pred_cu->cbf, depth, COLOR_Y);
-
-    kvz_intra_recon_lcu_luma(state, x_px, y_px, depth, intra_mode, pred_cu, lcu);
-    nosplit_cost += kvz_cu_rd_cost_luma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
-
     if (reconstruct_chroma) {
       cbf_clear(&pred_cu->cbf, depth, COLOR_U);
       cbf_clear(&pred_cu->cbf, depth, COLOR_V);
+    }
 
-      kvz_intra_recon_lcu_chroma(state, x_px, y_px, depth, intra_mode, pred_cu, lcu);
+    const int8_t chroma_mode = reconstruct_chroma ? intra_mode : -1;
+    kvz_intra_recon_cu(state,
+                       x_px, y_px,
+                       depth,
+                       intra_mode, chroma_mode,
+                       pred_cu, lcu);
+
+    nosplit_cost += kvz_cu_rd_cost_luma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
+    if (reconstruct_chroma) {
       nosplit_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
     }
 
@@ -697,7 +702,11 @@
     for (int8_t chroma_mode_i = 0; chroma_mode_i < num_modes; ++chroma_mode_i) {
       chroma.mode = modes[chroma_mode_i];
 
-      kvz_intra_recon_lcu_chroma(state, x_px, y_px, depth, chroma.mode, NULL, lcu);
+      kvz_intra_recon_cu(state,
+                         x_px, y_px,
+                         depth,
+                         -1, chroma.mode, // skip luma
+                         NULL, lcu);
       chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu);
 
       double mode_bits = kvz_chroma_mode_bits(state, chroma.mode, intra_mode);
@@ -836,7 +845,7 @@
 
   // Set transform depth to current depth, meaning no transform splits.
   kvz_lcu_set_trdepth(lcu, x_px, y_px, depth, depth);
-
+  double best_rough_cost = costs[select_best_mode_index(modes, costs, number_of_modes)];
   // Refine results with slower search or get some results if rough search was skipped.
   const int32_t rdo_level = state->encoder_control->cfg.rdo;
   if (rdo_level >= 2 || skip_rough_search) {
@@ -844,7 +853,7 @@
     if (rdo_level == 3) {
       number_of_modes_to_search = 35;
     } else if (rdo_level == 2) {
-      number_of_modes_to_search = (cu_width <= 8) ? 8 : 3;
+      number_of_modes_to_search = (cu_width == 4) ? 3 : 2;
     } else {
       // Check only the predicted modes.
       number_of_modes_to_search = 0;
@@ -863,5 +872,5 @@
   uint8_t best_mode_i = select_best_mode_index(modes, costs, number_of_modes);
 
   *mode_out = modes[best_mode_i];
-  *cost_out = costs[best_mode_i];
+  *cost_out = skip_rough_search ? costs[best_mode_i]:best_rough_cost;
 }

kvazaar-1.1.0.tar.gz/src/strategies/avx2/ipol-avx2.c -> kvazaar-1.2.0.tar.gz/src/strategies/avx2/ipol-avx2.c Changed

kvazaar-1.1.0.tar.gz/src/strategies/avx2/quant-avx2.c -> kvazaar-1.2.0.tar.gz/src/strategies/avx2/quant-avx2.c Changed

@@ -343,7 +343,7 @@
 * \param color  Color.
 * \param scan_order  Coefficient scan order.
 * \param use_trskip  Whether transform skip is used.
-* \param stride  Stride for ref_in, pred_in rec_out and coeff_out.
+* \param stride  Stride for ref_in, pred_in and rec_out.
 * \param ref_in  Reference pixels.
 * \param pred_in  Predicted pixels.
 * \param rec_out  Reconstructed pixels.
@@ -360,7 +360,6 @@
 {
   // Temporary arrays to pass data to and from kvz_quant and transform functions.
   int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
-  coeff_t quant_coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
   coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
 
   int has_coeffs = 0;
@@ -379,35 +378,32 @@
     kvz_transform2d(state->encoder_control, residual, coeff, width, (color == COLOR_Y ? 0 : 65535));
   }
 
-  // Quantize coeffs. (coeff -> quant_coeff)
+  // Quantize coeffs. (coeff -> coeff_out)
   if (state->encoder_control->cfg.rdoq_enable &&
       (width > 4 || !state->encoder_control->cfg.rdoq_skip))
   {
     int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
     tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0);
-    kvz_rdoq(state, coeff, quant_coeff, width, width, (color == COLOR_Y ? 0 : 2),
+    kvz_rdoq(state, coeff, coeff_out, width, width, (color == COLOR_Y ? 0 : 2),
       scan_order, cur_cu->type, tr_depth);
   } else {
-    kvz_quant(state, coeff, quant_coeff, width, width, (color == COLOR_Y ? 0 : 2),
+    kvz_quant(state, coeff, coeff_out, width, width, (color == COLOR_Y ? 0 : 2),
       scan_order, cur_cu->type);
   }
 
   // Check if there are any non-zero coefficients.
   for (int i = 0; i < width * width; i += 8) {
-    __m128i v_quant_coeff = _mm_loadu_si128((__m128i*)&(quant_coeff[i]));
+    __m128i v_quant_coeff = _mm_loadu_si128((__m128i*)&(coeff_out[i]));
     has_coeffs = !_mm_testz_si128(_mm_set1_epi8(0xFF), v_quant_coeff);
     if(has_coeffs) break;
   }
 
-  // Copy coefficients to coeff_out.
-  kvz_coefficients_blit(quant_coeff, coeff_out, width, width, width, out_stride);
-
   // Do the inverse quantization and transformation and the reconstruction to
   // rec_out.
   if (has_coeffs) {
 
-    // Get quantized residual. (quant_coeff -> coeff -> residual)
-    kvz_dequant(state, quant_coeff, coeff, width, width, (color == COLOR_Y ? 0 : (color == COLOR_U ? 2 : 3)), cur_cu->type);
+    // Get quantized residual. (coeff_out -> coeff -> residual)
+    kvz_dequant(state, coeff_out, coeff, width, width, (color == COLOR_Y ? 0 : (color == COLOR_U ? 2 : 3)), cur_cu->type);
     if (use_trskip) {
       kvz_itransformskip(state->encoder_control, residual, coeff, width);
     }
@@ -506,8 +502,29 @@
   }
 }
 
-#endif //COMPILE_INTEL_AVX2 && defined X86_64
+static uint32_t coeff_abs_sum_avx2(const coeff_t *coeffs, const size_t length)
+{
+  assert(length % 8 == 0);
+
+  __m256i total = _mm256_abs_epi32(_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) coeffs)));
+
+  for (int i = 8; i < length; i += 8) {
+    __m256i temp = _mm256_abs_epi32(_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &coeffs[i])));
+    total = _mm256_add_epi32(total, temp);
+  }
 
+  __m128i result128 = _mm_add_epi32(
+    _mm256_castsi256_si128(total),
+    _mm256_extractf128_si256(total, 1)
+  );
+
+  uint32_t parts[4];
+  _mm_storeu_si128((__m128i*) parts, result128);
+
+  return parts[0] + parts[1] + parts[2] + parts[3];
+}
+
+#endif //COMPILE_INTEL_AVX2 && defined X86_64
 
 int kvz_strategy_register_quant_avx2(void* opaque, uint8_t bitdepth)
 {
@@ -519,6 +536,7 @@
     success &= kvz_strategyselector_register(opaque, "quantize_residual", "avx2", 40, &kvz_quantize_residual_avx2);
     success &= kvz_strategyselector_register(opaque, "dequant", "avx2", 40, &kvz_dequant_avx2);
   }
+  success &= kvz_strategyselector_register(opaque, "coeff_abs_sum", "avx2", 0, &coeff_abs_sum_avx2);
 #endif //COMPILE_INTEL_AVX2 && defined X86_64
 
   return success;

kvazaar-1.1.0.tar.gz/src/strategies/avx2/sao-avx2.c -> kvazaar-1.2.0.tar.gz/src/strategies/avx2/sao-avx2.c Changed

@@ -36,18 +36,13 @@
 // is difficult to understand.
 
 
-static INLINE __m256i load_6_offsets(const int* offsets){
-
-  return _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*) offsets)), _mm_loadl_epi64((__m128i*)&(offsets4)), 1);
-}
-
-static INLINE __m128i load_6_pixels(const kvz_pixel* data){
-
+static INLINE __m128i load_6_pixels(const kvz_pixel* data)
+{
   return _mm_insert_epi16(_mm_cvtsi32_si128(*(int32_t*)&(data[0])), *(int16_t*)&(data[4]), 2);
 }
 
-static INLINE __m256i load_5_offsets(const int* offsets){
-
+static INLINE __m256i load_5_offsets(const int* offsets)
+{
   return _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*) offsets)), _mm_insert_epi32(_mm_setzero_si128(), offsets[4], 0), 1);
 }
 
@@ -73,9 +68,12 @@
 }
 
 
-int kvz_sao_edge_ddistortion_avx2(const kvz_pixel *orig_data, const kvz_pixel *rec_data,
-                         int block_width, int block_height,
-                         int eo_class, int offsetsNUM_SAO_EDGE_CATEGORIES)
+static int sao_edge_ddistortion_avx2(const kvz_pixel *orig_data,
+                                     const kvz_pixel *rec_data,
+                                     int block_width,
+                                     int block_height,
+                                     int eo_class,
+                                     int offsetsNUM_SAO_EDGE_CATEGORIES)
 {
   int y, x;
   int sum = 0;
@@ -96,7 +94,7 @@
 
       __m256i v_cat = _mm256_cvtepu8_epi32(sao_calc_eo_cat_avx2(&v_a, &v_b, &v_c));
 
-      __m256i v_offset = _mm256_loadu_si256((__m256i*) offsets);
+      __m256i v_offset = load_5_offsets(offsets);
       v_offset = _mm256_permutevar8x32_epi32(v_offset, v_cat);
    
       __m256i v_diff = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)&(orig_datay * block_width + x)));
@@ -117,7 +115,7 @@
 
     __m256i v_cat = _mm256_cvtepu8_epi32(sao_calc_eo_cat_avx2(&v_a, &v_b, &v_c));
 
-    __m256i v_offset = load_6_offsets(offsets);
+    __m256i v_offset = load_5_offsets(offsets);
     v_offset = _mm256_permutevar8x32_epi32(v_offset, v_cat);
    
     const kvz_pixel* orig_ptr = &(orig_datay * block_width + x);
@@ -139,7 +137,12 @@
 }
 
 
-static INLINE void accum_count_eo_cat_avx2(__m256i*  __restrict v_diff_accum, __m256i* __restrict v_count, __m256i* __restrict v_cat, __m256i* __restrict v_diff, int eo_cat){
+static INLINE void accum_count_eo_cat_avx2(__m256i*  __restrict v_diff_accum,
+                                           __m256i* __restrict v_count,
+                                           __m256i* __restrict v_cat,
+                                           __m256i* __restrict v_diff,
+                                           int eo_cat)
+{
         __m256i v_mask = _mm256_cmpeq_epi32(*v_cat, _mm256_set1_epi32(eo_cat));
         *v_diff_accum = _mm256_add_epi32(*v_diff_accum, _mm256_and_si256(*v_diff, v_mask));
         *v_count = _mm256_sub_epi32(*v_count, v_mask);
@@ -151,9 +154,12 @@
   accum_count_eo_cat_avx2(&(v_diff_accum EO_CAT ), &(v_count EO_CAT ), &V_CAT , &v_diff, EO_CAT);
 
 
-void kvz_calc_sao_edge_dir_avx2(const kvz_pixel *orig_data, const kvz_pixel *rec_data,
-                              int eo_class, int block_width, int block_height,
-                              int cat_sum_cnt2NUM_SAO_EDGE_CATEGORIES)
+static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data,
+                                   const kvz_pixel *rec_data,
+                                   int eo_class,
+                                   int block_width,
+                                   int block_height,
+                                   int cat_sum_cnt2NUM_SAO_EDGE_CATEGORIES)
 {
   int y, x;
   vector2d_t a_ofs = g_sao_edge_offsetseo_class0;
@@ -240,30 +246,29 @@
 }
 
 
-void kvz_sao_reconstruct_color_avx2(const encoder_control_t * const encoder, 
-                                  const kvz_pixel *rec_data, kvz_pixel *new_rec_data,
-                                  const sao_info_t *sao,
-                                  int stride, int new_stride,
-                                  int block_width, int block_height,
-                                  color_t color_i)
+static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder,
+                                       const kvz_pixel *rec_data, kvz_pixel *new_rec_data,
+                                       const sao_info_t *sao,
+                                       int stride, int new_stride,
+                                       int block_width, int block_height,
+                                       color_t color_i)
 {
-  int y, x;
   // Arrays orig_data and rec_data are quarter size for chroma.
   int offset_v = color_i == COLOR_V ? 5 : 0;
 
-  if(sao->type == SAO_TYPE_BAND) {
-    int offsets[1<<KVZ_BIT_DEPTH];
+  if (sao->type == SAO_TYPE_BAND) {
+    int offsets[1 << KVZ_BIT_DEPTH];
     kvz_calc_sao_offset_array(encoder, sao, offsets, color_i);
-    for (y = 0; y < block_height; ++y) {
-      for (x = 0; x < block_width; ++x) {
+    for (int y = 0; y < block_height; ++y) {
+      for (int x = 0; x < block_width; ++x) {
         new_rec_data[y * new_stride + x] = offsets[rec_data[y * stride + x]];
       }
     }
   } else {
     // Don't sample the edge pixels because this function doesn't have access to
     // their neighbours.
-    for (y = 0; y < block_height; ++y) {
-      for (x = 0; x < block_width; x+=8) {
+    for (int y = 0; y < block_height; ++y) {
+      for (int x = 0; x < block_width; x+=8) {
         vector2d_t a_ofs = g_sao_edge_offsets[sao->eo_class][0];
         vector2d_t b_ofs = g_sao_edge_offsets[sao->eo_class][1];
         const kvz_pixel *c_data = &rec_data[y * stride + x];
@@ -299,9 +304,13 @@
 }
 
 
-int kvz_sao_band_ddistortion_avx2(const encoder_state_t * const state, const kvz_pixel *orig_data, const kvz_pixel *rec_data,
-                         int block_width, int block_height,
-                         int band_pos, int sao_bands4)
+static int sao_band_ddistortion_avx2(const encoder_state_t * const state,
+                                     const kvz_pixel *orig_data,
+                                     const kvz_pixel *rec_data,
+                                     int block_width,
+                                     int block_height,
+                                     int band_pos,
+                                     int sao_bands4)
 {
   int y, x;
   int shift = state->encoder_control->bitdepth-5;
@@ -348,10 +357,10 @@
   bool success = true;
 #if COMPILE_INTEL_AVX2
   if (bitdepth == 8) {
-    success &= kvz_strategyselector_register(opaque, "sao_edge_ddistortion", "avx2", 40, &kvz_sao_edge_ddistortion_avx2);
-    success &= kvz_strategyselector_register(opaque, "calc_sao_edge_dir", "avx2", 40, &kvz_calc_sao_edge_dir_avx2);
-    success &= kvz_strategyselector_register(opaque, "sao_reconstruct_color", "avx2", 40, &kvz_sao_reconstruct_color_avx2);
-    success &= kvz_strategyselector_register(opaque, "sao_band_ddistortion", "avx2", 40, &kvz_sao_band_ddistortion_avx2);
+    success &= kvz_strategyselector_register(opaque, "sao_edge_ddistortion", "avx2", 40, &sao_edge_ddistortion_avx2);
+    success &= kvz_strategyselector_register(opaque, "calc_sao_edge_dir", "avx2", 40, &calc_sao_edge_dir_avx2);
+    success &= kvz_strategyselector_register(opaque, "sao_reconstruct_color", "avx2", 40, &sao_reconstruct_color_avx2);
+    success &= kvz_strategyselector_register(opaque, "sao_band_ddistortion", "avx2", 40, &sao_band_ddistortion_avx2);
   }
 #endif //COMPILE_INTEL_AVX2
   return success;

kvazaar-1.1.0.tar.gz/src/strategies/generic/quant-generic.c -> kvazaar-1.2.0.tar.gz/src/strategies/generic/quant-generic.c Changed

@@ -169,7 +169,7 @@
 * \param color  Color.
 * \param scan_order  Coefficient scan order.
 * \param use_trskip  Whether transform skip is used.
-* \param stride  Stride for ref_in, pred_in rec_out and coeff_out.
+* \param stride  Stride for ref_in, pred_in and rec_out.
 * \param ref_in  Reference pixels.
 * \param pred_in  Predicted pixels.
 * \param rec_out  Reconstructed pixels.
@@ -186,7 +186,6 @@
 {
   // Temporary arrays to pass data to and from kvz_quant and transform functions.
   int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH];
-  coeff_t quant_coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
   coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH];
 
   int has_coeffs = 0;
@@ -212,16 +211,16 @@
     kvz_transform2d(state->encoder_control, residual, coeff, width, (color == COLOR_Y ? 0 : 65535));
   }
 
-  // Quantize coeffs. (coeff -> quant_coeff)
+  // Quantize coeffs. (coeff -> coeff_out)
   if (state->encoder_control->cfg.rdoq_enable &&
       (width > 4 || !state->encoder_control->cfg.rdoq_skip))
   {
     int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth;
     tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0);
-    kvz_rdoq(state, coeff, quant_coeff, width, width, (color == COLOR_Y ? 0 : 2),
+    kvz_rdoq(state, coeff, coeff_out, width, width, (color == COLOR_Y ? 0 : 2),
       scan_order, cur_cu->type, tr_depth);
   } else {
-    kvz_quant(state, coeff, quant_coeff, width, width, (color == COLOR_Y ? 0 : 2),
+    kvz_quant(state, coeff, coeff_out, width, width, (color == COLOR_Y ? 0 : 2),
       scan_order, cur_cu->type);
   }
 
@@ -229,23 +228,20 @@
   {
     int i;
     for (i = 0; i < width * width; ++i) {
-      if (quant_coeff[i] != 0) {
+      if (coeff_out[i] != 0) {
         has_coeffs = 1;
         break;
       }
     }
   }
 
-  // Copy coefficients to coeff_out.
-  kvz_coefficients_blit(quant_coeff, coeff_out, width, width, width, out_stride);
-
   // Do the inverse quantization and transformation and the reconstruction to
   // rec_out.
   if (has_coeffs) {
     int y, x;
 
-    // Get quantized residual. (quant_coeff -> coeff -> residual)
-    kvz_dequant(state, quant_coeff, coeff, width, width, (color == COLOR_Y ? 0 : (color == COLOR_U ? 2 : 3)), cur_cu->type);
+    // Get quantized residual. (coeff_out -> coeff -> residual)
+    kvz_dequant(state, coeff_out, coeff, width, width, (color == COLOR_Y ? 0 : (color == COLOR_U ? 2 : 3)), cur_cu->type);
     if (use_trskip) {
       kvz_itransformskip(state->encoder_control, residual, coeff, width);
     }
@@ -324,6 +320,15 @@
   }
 }
 
+static uint32_t coeff_abs_sum_generic(const coeff_t *coeffs, size_t length)
+{
+  uint32_t sum = 0;
+  for (int i = 0; i < length; i++) {
+    sum += abs(coeffs[i]);
+  }
+  return sum;
+}
+
 int kvz_strategy_register_quant_generic(void* opaque, uint8_t bitdepth)
 {
   bool success = true;
@@ -331,6 +336,7 @@
   success &= kvz_strategyselector_register(opaque, "quant", "generic", 0, &kvz_quant_generic);
   success &= kvz_strategyselector_register(opaque, "quantize_residual", "generic", 0, &kvz_quantize_residual_generic);
   success &= kvz_strategyselector_register(opaque, "dequant", "generic", 0, &kvz_dequant_generic);
+  success &= kvz_strategyselector_register(opaque, "coeff_abs_sum", "generic", 0, &coeff_abs_sum_generic);
 
   return success;
 }

kvazaar-1.1.0.tar.gz/src/strategies/generic/sao-generic.c -> kvazaar-1.2.0.tar.gz/src/strategies/generic/sao-generic.c Changed

@@ -40,9 +40,12 @@
 }
 
 
-int kvz_sao_edge_ddistortion_generic(const kvz_pixel *orig_data, const kvz_pixel *rec_data,
-                         int block_width, int block_height,
-                         int eo_class, int offsetsNUM_SAO_EDGE_CATEGORIES)
+static int sao_edge_ddistortion_generic(const kvz_pixel *orig_data,
+                                        const kvz_pixel *rec_data,
+                                        int block_width,
+                                        int block_height,
+                                        int eo_class,
+                                        int offsetsNUM_SAO_EDGE_CATEGORIES)
 {
   int y, x;
   int sum = 0;
@@ -76,9 +79,12 @@
  * \param dir_offsets
  * \param is_chroma  0 for luma, 1 for chroma. Indicates
  */
-void kvz_calc_sao_edge_dir_generic(const kvz_pixel *orig_data, const kvz_pixel *rec_data,
-                              int eo_class, int block_width, int block_height,
-                              int cat_sum_cnt2NUM_SAO_EDGE_CATEGORIES)
+static void calc_sao_edge_dir_generic(const kvz_pixel *orig_data,
+                                      const kvz_pixel *rec_data,
+                                      int eo_class,
+                                      int block_width,
+                                      int block_height,
+                                      int cat_sum_cnt2NUM_SAO_EDGE_CATEGORIES)
 {
   int y, x;
   vector2d_t a_ofs = g_sao_edge_offsetseo_class0;
@@ -103,30 +109,32 @@
 }
 
 
-void kvz_sao_reconstruct_color_generic(const encoder_control_t * const encoder, 
-                                  const kvz_pixel *rec_data, kvz_pixel *new_rec_data,
-                                  const sao_info_t *sao,
-                                  int stride, int new_stride,
-                                  int block_width, int block_height,
-                                  color_t color_i)
+static void sao_reconstruct_color_generic(const encoder_control_t * const encoder,
+                                          const kvz_pixel *rec_data,
+                                          kvz_pixel *new_rec_data,
+                                          const sao_info_t *sao,
+                                          int stride,
+                                          int new_stride,
+                                          int block_width,
+                                          int block_height,
+                                          color_t color_i)
 {
-  int y, x;
   // Arrays orig_data and rec_data are quarter size for chroma.
   int offset_v = color_i == COLOR_V ? 5 : 0;
 
-  if(sao->type == SAO_TYPE_BAND) {
+  if (sao->type == SAO_TYPE_BAND) {
     int offsets[1<<KVZ_BIT_DEPTH];
     kvz_calc_sao_offset_array(encoder, sao, offsets, color_i);
-    for (y = 0; y < block_height; ++y) {
-      for (x = 0; x < block_width; ++x) {
+    for (int y = 0; y < block_height; ++y) {
+      for (int x = 0; x < block_width; ++x) {
         new_rec_data[y * new_stride + x] = offsets[rec_data[y * stride + x]];
       }
     }
   } else {
     // Don't sample the edge pixels because this function doesn't have access to
     // their neighbours.
-    for (y = 0; y < block_height; ++y) {
-      for (x = 0; x < block_width; ++x) {
+    for (int y = 0; y < block_height; ++y) {
+      for (int x = 0; x < block_width; ++x) {
         vector2d_t a_ofs = g_sao_edge_offsets[sao->eo_class][0];
         vector2d_t b_ofs = g_sao_edge_offsets[sao->eo_class][1];
         const kvz_pixel *c_data = &rec_data[y * stride + x];
@@ -144,9 +152,13 @@
 }
 
 
-int kvz_sao_band_ddistortion_generic(const encoder_state_t * const state, const kvz_pixel *orig_data, const kvz_pixel *rec_data,
-                         int block_width, int block_height,
-                         int band_pos, int sao_bands4)
+static int sao_band_ddistortion_generic(const encoder_state_t * const state,
+                                        const kvz_pixel *orig_data,
+                                        const kvz_pixel *rec_data,
+                                        int block_width,
+                                        int block_height,
+                                        int band_pos,
+                                        int sao_bands4)
 {
   int y, x;
   int shift = state->encoder_control->bitdepth-5;
@@ -174,11 +186,11 @@
 int kvz_strategy_register_sao_generic(void* opaque, uint8_t bitdepth)
 {
   bool success = true;
-  
-  success &= kvz_strategyselector_register(opaque, "sao_edge_ddistortion", "generic", 0, &kvz_sao_edge_ddistortion_generic);
-  success &= kvz_strategyselector_register(opaque, "calc_sao_edge_dir", "generic", 0, &kvz_calc_sao_edge_dir_generic);
-  success &= kvz_strategyselector_register(opaque, "sao_reconstruct_color", "generic", 0, &kvz_sao_reconstruct_color_generic);
-  success &= kvz_strategyselector_register(opaque, "sao_band_ddistortion", "generic", 0, &kvz_sao_band_ddistortion_generic);
+
+  success &= kvz_strategyselector_register(opaque, "sao_edge_ddistortion", "generic", 0, &sao_edge_ddistortion_generic);
+  success &= kvz_strategyselector_register(opaque, "calc_sao_edge_dir", "generic", 0, &calc_sao_edge_dir_generic);
+  success &= kvz_strategyselector_register(opaque, "sao_reconstruct_color", "generic", 0, &sao_reconstruct_color_generic);
+  success &= kvz_strategyselector_register(opaque, "sao_band_ddistortion", "generic", 0, &sao_band_ddistortion_generic);
 
   return success;
 }

kvazaar-1.1.0.tar.gz/src/strategies/strategies-quant.c -> kvazaar-1.2.0.tar.gz/src/strategies/strategies-quant.c Changed

kvazaar-1.1.0.tar.gz/src/strategies/strategies-quant.h -> kvazaar-1.2.0.tar.gz/src/strategies/strategies-quant.h Changed

kvazaar-1.1.0.tar.gz/src/threadqueue.c -> kvazaar-1.2.0.tar.gz/src/threadqueue.c Changed

@@ -30,668 +30,601 @@
 #include "threads.h"
 
 
-typedef struct {
-  threadqueue_queue_t * threadqueue;
-  int worker_id;
-} threadqueue_worker_spec;
+/**
+ * \file
+ *
+ * Lock acquisition order:
+ *
+ * 1. When locking a job and its dependency, the dependency must be locked
+ * first and then the job depending on it.
+ *
+ * 2. When locking a job and the thread queue, the thread queue must be
+ * locked first and then the job.
+ *
+ * 3. When accessing threadqueue_job_t.next, the thread queue must be
+ * locked.
+ */
 
 #define THREADQUEUE_LIST_REALLOC_SIZE 32
 
-//#define PTHREAD_COND_SIGNAL(c) fprintf(stderr, "%s:%d pthread_cond_signal(%s=%p)\n", __FUNCTION__, __LINE__, #c, c); if (pthread_cond_signal((c)) != 0) { fprintf(stderr, "pthread_cond_signal(%s=%p) failed!\n", #c, c); assert(0); return 0; }
-//#define PTHREAD_COND_BROADCAST(c) fprintf(stderr, "%s:%d pthread_cond_broadcast(%s=%p)\n", __FUNCTION__, __LINE__, #c, c); if (pthread_cond_broadcast((c)) != 0) { fprintf(stderr, "pthread_cond_broadcast(%s=%p) failed!\n", #c, c); assert(0); return 0; }
-//#define PTHREAD_COND_WAIT(c,l) fprintf(stderr, "%s:%d pthread_cond_wait(%s=%p, %s=%p)\n", __FUNCTION__, __LINE__, #c, c, #l, l); if (pthread_cond_wait((c),(l)) != 0) { fprintf(stderr, "pthread_cond_wait(%s=%p, %s=%p) failed!\n", #c, c, #l, l); assert(0); return 0; } else {fprintf(stderr, "%s:%d pthread_cond_wait(%s=%p, %s=%p) (done)\n", __FUNCTION__, __LINE__, #c, c, #l, l);}
-//#define PTHREAD_LOCK(l) fprintf(stderr, "%s:%d pthread_mutex_lock(%s=%p) (try)\n", __FUNCTION__, __LINE__, #l, l); if (pthread_mutex_lock((l)) != 0) { fprintf(stderr, "pthread_mutex_lock(%s=%p) failed!\n", #l, l); assert(0); return 0; } else {fprintf(stderr, "%s:%d pthread_mutex_lock(%s=%p)\n", __FUNCTION__, __LINE__, #l, l);}
-//#define PTHREAD_UNLOCK(l) if (pthread_mutex_unlock((l)) != 0) { fprintf(stderr, "pthread_mutex_unlock(%s=%p) failed!\n", #l, l); assert(0); return 0; }  else {fprintf(stderr, "%s:%d pthread_mutex_unlock(%s=%p)\n", __FUNCTION__, __LINE__, #l, l);}
-
-
-#define PTHREAD_COND_SIGNAL(c) if (pthread_cond_signal((c)) != 0) { fprintf(stderr, "pthread_cond_signal(%s=%p) failed!\n", #c, c); assert(0); return 0; }
-#define PTHREAD_COND_BROADCAST(c) if (pthread_cond_broadcast((c)) != 0) { fprintf(stderr, "pthread_cond_broadcast(%s=%p) failed!\n", #c, c); assert(0); return 0; }
-
-#ifndef _PTHREAD_DUMP
-#define PTHREAD_COND_WAIT(c,l) if (pthread_cond_wait((c),(l)) != 0) { fprintf(stderr, "pthread_cond_wait(%s=%p, %s=%p) failed!\n", #c, c, #l, l); assert(0); return 0; }
-#define PTHREAD_LOCK(l) if (pthread_mutex_lock((l)) != 0) { fprintf(stderr, "pthread_mutex_lock(%s) failed!\n", #l); assert(0); return 0; }
-#define PTHREAD_UNLOCK(l) if (pthread_mutex_unlock((l)) != 0) { fprintf(stderr, "pthread_mutex_unlock(%s) failed!\n", #l); assert(0); return 0; }
-
-#else  //PTHREAD_DUMP
-#define PTHREAD_LOCK(l) do { \
-  PERFORMANCE_MEASURE_START(); \
-  if (pthread_mutex_lock((l)) != 0) { fprintf(stderr, "pthread_mutex_lock(%s) failed!\n", #l); assert(0); return 0; } \
-  PERFORMANCE_MEASURE_END(NULL, "pthread_mutex_lock(%s=%p)@%s:%d",#l,l,__FUNCTION__, __LINE__); \
-} while (0);
-
-#define PTHREAD_UNLOCK(l) do { \
-  PERFORMANCE_MEASURE_START(); \
-  if (pthread_mutex_unlock((l)) != 0) { fprintf(stderr, "pthread_mutex_unlock(%s) failed!\n", #l); assert(0); return 0; } \
-  PERFORMANCE_MEASURE_END(NULL, "pthread_mutex_unlock(%s=%p)@%s:%d",#l,l,__FUNCTION__, __LINE__); \
-} while (0);
-
-#define PTHREAD_COND_WAIT(c,l) do { \
-  PERFORMANCE_MEASURE_START(); \
-  if (pthread_cond_wait((c),(l)) != 0) { fprintf(stderr, "pthread_cond_wait(%s=%p, %s=%p) failed!\n", #c, c, #l, l); assert(0); return 0;} \
-  PERFORMANCE_MEASURE_END(NULL, "pthread_cond_wait(%s=%p, %s=%p)@%s:%d",#c, c, #l, l,__FUNCTION__, __LINE__); \
-} while (0);
-#endif //PTHREAD_DUMP
-
-static void* threadqueue_worker(void* threadqueue_worker_spec_opaque)
+#define PTHREAD_COND_SIGNAL(c) \
+  if (pthread_cond_signal((c)) != 0) { \
+    fprintf(stderr, "pthread_cond_signal(%s=%p) failed!\n", #c, c); \
+    assert(0); \
+    return 0; \
+  }
+
+#define PTHREAD_COND_BROADCAST(c) \
+  if (pthread_cond_broadcast((c)) != 0) { \
+    fprintf(stderr, "pthread_cond_broadcast(%s=%p) failed!\n", #c, c); \
+    assert(0); \
+    return 0; \
+  }
+
+#define PTHREAD_COND_WAIT(c,l) \
+  if (pthread_cond_wait((c),(l)) != 0) { \
+    fprintf(stderr, "pthread_cond_wait(%s=%p, %s=%p) failed!\n", #c, c, #l, l); \
+    assert(0); \
+    return 0; \
+  }
+
+#define PTHREAD_LOCK(l) \
+  if (pthread_mutex_lock((l)) != 0) { \
+    fprintf(stderr, "pthread_mutex_lock(%s) failed!\n", #l); \
+    assert(0); \
+    return 0; \
+  }
+
+#define PTHREAD_UNLOCK(l) \
+  if (pthread_mutex_unlock((l)) != 0) { \
+    fprintf(stderr, "pthread_mutex_unlock(%s) failed!\n", #l); \
+    assert(0); \
+    return 0; \
+  }
+
+
+typedef enum {
+  /**
+   * \brief Job has been submitted, but is not allowed to run yet.
+   */
+  THREADQUEUE_JOB_STATE_PAUSED,
+
+  /**
+   * \brief Job is waiting for dependencies.
+   */
+  THREADQUEUE_JOB_STATE_WAITING,
+
+  /**
+   * \brief Job is ready to run.
+   */
+  THREADQUEUE_JOB_STATE_READY,
+
+  /**
+   * \brief Job is running.
+   */
+  THREADQUEUE_JOB_STATE_RUNNING,
+
+  /**
+   * \brief Job is completed.
+   */
+  THREADQUEUE_JOB_STATE_DONE,
+
+} threadqueue_job_state;
+
+
+struct threadqueue_job_t {
+  pthread_mutex_t lock;
+
+  threadqueue_job_state state;
+
+  /**
+   * \brief Number of dependencies that have not been completed yet.
+   */
+  int ndepends;
+
+  /**
+   * \brief Reverse dependencies.
+   *
+   * Array of pointers to jobs that depend on this one. They have to exist
+   * when the thread finishes, because they cannot be run before.
+   */
+  struct threadqueue_job_t **rdepends;
+
+  /**
+   * \brief Number of elements in rdepends.
+   */
+  int rdepends_count;
+
+  /**
+   * \brief Allocated size of rdepends.
+   */
+  int rdepends_size;
+
+  /**
+   * \brief Reference count
+   */
+  int refcount;
+
+  /**
+   * \brief Pointer to the function to execute.
+   */
+  void (*fptr)(void *arg);
+
+  /**
+   * \brief Argument for fptr.
+   */
+  void *arg;
+
+  /**
+   * \brief Pointer to the next job in the queue.
+   */
+  struct threadqueue_job_t *next;
+
+};
+
+
+struct threadqueue_queue_t {
+  pthread_mutex_t lock;
+
+  /**
+   * \brief Job available condition variable
+   *
+   * Signalled when there is a new job to do.
+   */
+  pthread_cond_t job_available;
+
+  /**
+   * \brief Job done condition variable
+   *
+   * Signalled when a job has been completed.
+   */
+  pthread_cond_t job_done;
+
+  /**
+   * Array containing spawned threads
+   */
+  pthread_t *threads;
+
+  /**
+   * \brief Number of threads spawned
+   */
+  int thread_count;
+
+  /**
+   * \brief Number of threads running
+   */
+  int thread_running_count;
+
+  /**
+   * \brief If true, threads should stop ASAP.
+   */
+  bool stop;
+
+  /**
+   * \brief Pointer to the first ready job
+   */
+  threadqueue_job_t *first;
+
+  /**
+   * \brief Pointer to the last ready job
+   */
+  threadqueue_job_t *last;
+};
+
+
+/**
+ * \brief Add a job to the queue of jobs ready to run.
+ *
+ * The caller must have locked the thread queue and the job. This function
+ * takes the ownership of the job.
+ */
+static void threadqueue_push_job(threadqueue_queue_t * threadqueue,
+                                 threadqueue_job_t *job)
 {
-  threadqueue_worker_spec * const threadqueue_worker_spec = threadqueue_worker_spec_opaque;
-  threadqueue_queue_t * const threadqueue = threadqueue_worker_spec->threadqueue;
-  threadqueue_job_t * next_job = NULL;
+  assert(job->ndepends == 0);
+  job->state = THREADQUEUE_JOB_STATE_READY;
 
-#ifdef KVZ_DEBUG
-  KVZ_GET_TIME(&threadqueue->debug_clock_thread_startthreadqueue_worker_spec->worker_id);
-#endif //KVZ_DEBUG
+  if (threadqueue->first == NULL) {
+    threadqueue->first = job;
+  } else {
+    threadqueue->last->next = job;
+  }
 
-  for(;;) {
-    threadqueue_job_t * job = NULL;
+  threadqueue->last = job;
+  job->next = NULL;
+}
 
-    PTHREAD_LOCK(&threadqueue->lock);
 
-    while(!threadqueue->stop && threadqueue->queue_waiting_execution == 0 && !next_job) {
+/**
+ * \brief Retrieve a job from the queue of jobs ready to run.
+ *
+ * The caller must have locked the thread queue. The calling function
+ * receives the ownership of the job.
+ */
+static threadqueue_job_t * threadqueue_pop_job(threadqueue_queue_t * threadqueue)
+{
+  assert(threadqueue->first != NULL);
+
+  threadqueue_job_t *job = threadqueue->first;
+  threadqueue->first = job->next;
+  job->next = NULL;
+
+  if (threadqueue->first == NULL) {
+    threadqueue->last = NULL;
+  }
+
+  return job;
+}
+
+
+/**
+ * \brief Function executed by worker threads.
+ */
+static void* threadqueue_worker(void* threadqueue_opaque)
+{
+  threadqueue_queue_t * const threadqueue = (threadqueue_queue_t *) threadqueue_opaque;
+
+  PTHREAD_LOCK(&threadqueue->lock);
+
+  for (;;) {
+    while (!threadqueue->stop && threadqueue->first == NULL) {
       // Wait until there is something to do in the queue.
-      PTHREAD_COND_WAIT(&threadqueue->cond, &threadqueue->lock);
+      PTHREAD_COND_WAIT(&threadqueue->job_available, &threadqueue->lock);
     }
 
-    if(threadqueue->stop) {
-      if (next_job) {
-        // Put a job we had already reserved back into the queue.
-        // FIXME: This lock should be unnecessary, as nobody else is allowed
-        // to touch this job when it's running.
-        PTHREAD_LOCK(&next_job->lock);
-        next_job->state = THREADQUEUE_JOB_STATE_QUEUED;
-        PTHREAD_UNLOCK(&next_job->lock);
-      }
+    if (threadqueue->stop) {
       break;
     }
 
-    //Find a task (should be fast enough)
-    job = NULL;
-    if (next_job) {
-      assert(next_job->ndepends == 0);
-      job = next_job;
-    } else {
-      //FIXME: if not using OWF, the first is better than the second, otherwise we should use the second order
-      //for (i = threadqueue->queue_count - 1; i >= threadqueue->queue_start; --i) {
-      //for (i = threadqueue->queue_start; i < threadqueue->queue_count; ++i) {
-
-      for (int i = (threadqueue->fifo ? threadqueue->queue_start : threadqueue->queue_count - 1);
-           (threadqueue->fifo ? i < threadqueue->queue_count : i >= threadqueue->queue_start); 
-           (threadqueue->fifo ? ++i : --i)) {
-        threadqueue_job_t * const i_job = threadqueue->queuei;
-
-        if (i_job->state == THREADQUEUE_JOB_STATE_QUEUED && i_job->ndepends == 0) {
-          // Once we found the job with no dependancies, lock it and change
-          // its state to running, so nobody else can claim it.
-          PTHREAD_LOCK(&i_job->lock);
-          if (i_job->state == THREADQUEUE_JOB_STATE_QUEUED && i_job->ndepends == 0) {
-            job = i_job;
-            job->state = THREADQUEUE_JOB_STATE_RUNNING;
-          }
-          PTHREAD_UNLOCK(&i_job->lock);
-          if (job) break;
-        }
-      }
-    }
-
-    if (!job) {
-      // We have no job. Probably because more threads were woken up than
-      // there were jobs to do.
-      PTHREAD_UNLOCK(&threadqueue->lock);
-    } else {
-      // We have a job with ndepends==0 and its state is running.
-      assert(job->state == THREADQUEUE_JOB_STATE_RUNNING);
-
-      // Advance queue_start to skip all the running jobs.
-      while (threadqueue->queue_start < threadqueue->queue_count &&
-             threadqueue->queuethreadqueue->queue_start->state != THREADQUEUE_JOB_STATE_QUEUED)
-      {
-        threadqueue->queue_start++;
-      }
-      
-      if (!next_job) {
-        --threadqueue->queue_waiting_execution;
-        ++threadqueue->queue_running;
-      }
-
-      PTHREAD_UNLOCK(&threadqueue->lock);
-
-#ifdef KVZ_DEBUG
-      job->debug_worker_id = threadqueue_worker_spec->worker_id;
-      KVZ_GET_TIME(&job->debug_clock_start);
-#endif //KVZ_DEBUG
-
-      job->fptr(job->arg);
-
-#ifdef KVZ_DEBUG
-      job->debug_worker_id = threadqueue_worker_spec->worker_id;
-      KVZ_GET_TIME(&job->debug_clock_stop);
-#endif //KVZ_DEBUG
-
-      // FIXME: This lock should be unnecessary, as nobody else is allowed
-      // to touch this job when it's running.
-      PTHREAD_LOCK(&job->lock);
-      assert(job->state == THREADQUEUE_JOB_STATE_RUNNING);
-
-      job->state = THREADQUEUE_JOB_STATE_DONE;
-
-      next_job = NULL;
-
-      int queue_waiting_dependency_decr = 0;
-      int queue_waiting_execution_incr = 0;
-
-      // Go throught all the jobs that depend on this one, decresing their ndepends.
-      for (int i = 0; i < job->rdepends_count; ++i) {
-        threadqueue_job_t * const depjob = job->rdependsi;
-        // Note that we lock the dependency AFTER locking the source. This avoids a deadlock in dep_add.
-        PTHREAD_LOCK(&depjob->lock);
-
-        assert(depjob->state == THREADQUEUE_JOB_STATE_QUEUED);
-        assert(depjob->ndepends > 0);
-        --depjob->ndepends;
-
-        // Count how many jobs can now start executing so we know how many
-        // threads to wake up.
-        if (depjob->ndepends == 0) {
-          if (!next_job) {
-            // Avoid having to find a new job for this worker through the
-            // queue by taking one of the jobs that depended on current job.
-            next_job = depjob;
-            depjob->state = THREADQUEUE_JOB_STATE_RUNNING;
-          } else {
-            ++queue_waiting_execution_incr;
-          }
-          ++queue_waiting_dependency_decr;
-        }
-
-        PTHREAD_UNLOCK(&depjob->lock);
-      }
-      
-      PTHREAD_UNLOCK(&job->lock);
+    // Get a job and remove it from the queue.
+    threadqueue_job_t *job = threadqueue_pop_job(threadqueue);
 
-      PTHREAD_LOCK(&threadqueue->lock);
+    PTHREAD_LOCK(&job->lock);
+    assert(job->state == THREADQUEUE_JOB_STATE_READY);
+    job->state = THREADQUEUE_JOB_STATE_RUNNING;
+    PTHREAD_UNLOCK(&job->lock);
+    PTHREAD_UNLOCK(&threadqueue->lock);
 
-      assert(threadqueue->queue_waiting_dependency >= queue_waiting_dependency_decr);
+    job->fptr(job->arg);
 
-      // This thread will 
-      if (!next_job) {
-        // We didn't find a new job, so this thread will have to go wait.
-        threadqueue->queue_running--;
+    PTHREAD_LOCK(&threadqueue->lock);
+    PTHREAD_LOCK(&job->lock);
+    assert(job->state == THREADQUEUE_JOB_STATE_RUNNING);
+    job->state = THREADQUEUE_JOB_STATE_DONE;
+
+    PTHREAD_COND_SIGNAL(&threadqueue->job_done);
+
+    // Go through all the jobs that depend on this one, decreasing their
+    // ndepends. Count how many jobs can now start executing so we know how
+    // many threads to wake up.
+    int num_new_jobs = 0;
+    for (int i = 0; i < job->rdepends_count; ++i) {
+      threadqueue_job_t * const depjob = job->rdepends[i];
+      // The dependency (job) is locked before the job depending on it.
+      // This must be the same order as in kvz_threadqueue_job_dep_add.
+      PTHREAD_LOCK(&depjob->lock);
+
+      assert(depjob->state == THREADQUEUE_JOB_STATE_WAITING ||
+             depjob->state == THREADQUEUE_JOB_STATE_PAUSED);
+      assert(depjob->ndepends > 0);
+      depjob->ndepends--;
+
+      if (depjob->ndepends == 0 && depjob->state == THREADQUEUE_JOB_STATE_WAITING) {
+        // Move the job to ready jobs.
+        threadqueue_push_job(threadqueue, kvz_threadqueue_copy_ref(depjob));
+        num_new_jobs++;
       }
-      threadqueue->queue_waiting_dependency -= queue_waiting_dependency_decr;
-      threadqueue->queue_waiting_execution += queue_waiting_execution_incr;
 
-      // Wake up enough threads to take care of the tasks now lacking dependancies.
-      for (int i = 0; i < queue_waiting_execution_incr; ++i) {
-        PTHREAD_COND_SIGNAL(&threadqueue->cond);
-      }
+      // Clear this reference to the job.
+      PTHREAD_UNLOCK(&depjob->lock);
+      kvz_threadqueue_free_job(&job->rdepends[i]);
+    }
+    job->rdepends_count = 0;
 
-      // Signal main thread that a job has been completed.
-      pthread_cond_signal(&threadqueue->cb_cond);
+    PTHREAD_UNLOCK(&job->lock);
+    kvz_threadqueue_free_job(&job);
 
-      PTHREAD_UNLOCK(&threadqueue->lock);
+    // The current thread will process one of the new jobs so we wake up
+    // one threads less than the the number of new jobs.
+    for (int i = 0; i < num_new_jobs - 1; i++) {
+      pthread_cond_signal(&threadqueue->job_available);
     }
   }
 
-  // We got out of the loop because threadqueue->stop == 1. The queue is locked.
-  assert(threadqueue->stop);
-  --threadqueue->threads_running;
-  
-#ifdef KVZ_DEBUG
-  KVZ_GET_TIME(&threadqueue->debug_clock_thread_end[threadqueue_worker_spec->worker_id]);
-  
-  fprintf(threadqueue->debug_log, "\t%d\t-\t%lf\t+%lf\t-\tthread\n", threadqueue_worker_spec->worker_id, KVZ_CLOCK_T_AS_DOUBLE(threadqueue->debug_clock_thread_start[threadqueue_worker_spec->worker_id]), KVZ_CLOCK_T_DIFF(threadqueue->debug_clock_thread_start[threadqueue_worker_spec->worker_id], threadqueue->debug_clock_thread_end[threadqueue_worker_spec->worker_id]));
-#endif //KVZ_DEBUG
-  
+  threadqueue->thread_running_count--;
   PTHREAD_UNLOCK(&threadqueue->lock);
-  
-  free(threadqueue_worker_spec_opaque);
-  
-  pthread_exit(NULL);
-  
   return NULL;
 }
 
 
-int kvz_threadqueue_init(threadqueue_queue_t * const threadqueue, int thread_count, int fifo) {
-  int i;
+/**
+ * \brief Initialize the queue.
+ *
+ * \return 1 on success, 0 on failure
+ */
+threadqueue_queue_t * kvz_threadqueue_init(int thread_count)
+{
+  threadqueue_queue_t *threadqueue = MALLOC(threadqueue_queue_t, 1);
+  if (!threadqueue) {
+    goto failed;
+  }
+
   if (pthread_mutex_init(&threadqueue->lock, NULL) != 0) {
     fprintf(stderr, "pthread_mutex_init failed!\n");
-    assert(0);
-    return 0;
+    goto failed;
   }
-  if (pthread_cond_init(&threadqueue->cond, NULL) != 0) {
+
+  if (pthread_cond_init(&threadqueue->job_available, NULL) != 0) {
     fprintf(stderr, "pthread_cond_init failed!\n");
-    assert(0);
-    return 0;
+    goto failed;
   }
-  
-  if (pthread_cond_init(&threadqueue->cb_cond, NULL) != 0) {
+
+  if (pthread_cond_init(&threadqueue->job_done, NULL) != 0) {
     fprintf(stderr, "pthread_cond_init failed!\n");
-    assert(0);
-    return 0;
+    goto failed;
   }
-  
-  threadqueue->stop = 0;
-  threadqueue->fifo = !!fifo;
-  threadqueue->threads_running = 0;
-  threadqueue->threads_count = thread_count;
-  
+
   threadqueue->threads = MALLOC(pthread_t, thread_count);
   if (!threadqueue->threads) {
     fprintf(stderr, "Could not malloc threadqueue->threads!\n");
-    return 0;
+    goto failed;
   }
-#ifdef KVZ_DEBUG
-  threadqueue->debug_clock_thread_start = MALLOC(KVZ_CLOCK_T, thread_count);
-  assert(threadqueue->debug_clock_thread_start);
-  threadqueue->debug_clock_thread_end = MALLOC(KVZ_CLOCK_T, thread_count);
-  assert(threadqueue->debug_clock_thread_end);
-  threadqueue->debug_log = fopen("threadqueue.log", "w");
-#endif //KVZ_DEBUG
-  
-  threadqueue->queue = NULL;
-  threadqueue->queue_size = 0;
-  threadqueue->queue_count = 0;
-  threadqueue->queue_start = 0;
-  threadqueue->queue_waiting_execution = 0;
-  threadqueue->queue_waiting_dependency = 0;
-  threadqueue->queue_running = 0;
-  
-  //Lock the queue before creating threads, to ensure they all have correct information
+  threadqueue->thread_count = 0;
+  threadqueue->thread_running_count = 0;
+
+  threadqueue->stop = false;
+
+  threadqueue->first              = NULL;
+  threadqueue->last               = NULL;
+
+  // Lock the queue before creating threads, to ensure they all have correct information.
   PTHREAD_LOCK(&threadqueue->lock);
-  
-  for(i = 0; i < thread_count; i++) {
-    threadqueue_worker_spec *tqws = MALLOC(threadqueue_worker_spec, 1);
-    if (tqws) {
-      tqws->threadqueue = threadqueue;
-      tqws->worker_id = i;
-      if(pthread_create(&(threadqueue->threads[i]), NULL, threadqueue_worker, (void*)tqws) != 0) {
-          fprintf(stderr, "pthread_create failed!\n");
-          assert(0);
-          return 0;
-      }
-      threadqueue->threads_running++;
-    } else {
-      fprintf(stderr, "Could not allocate threadqueue_worker_spec structure!\n");
-      PTHREAD_UNLOCK(&threadqueue->lock);
-      return 0;
+  for (int i = 0; i < thread_count; i++) {
+    if (pthread_create(&threadqueue->threads[i], NULL, threadqueue_worker, threadqueue) != 0) {
+        fprintf(stderr, "pthread_create failed!\n");
+        goto failed;
     }
+    threadqueue->thread_count++;
+    threadqueue->thread_running_count++;
   }
-  
   PTHREAD_UNLOCK(&threadqueue->lock);
 
-  return 1;
+  return threadqueue;
+
+failed:
+  kvz_threadqueue_free(threadqueue);
+  return NULL;
 }
 
+
 /**
- * \brief Free a single job from the threadqueue index i, destroying it.
+ * \brief Create a job and return a pointer to it.
+ *
+ * The job is created in a paused state. Function kvz_threadqueue_submit
+ * must be called on the job in order to have it run.
+ *
+ * \return pointer to the job, or NULL on failure
  */
-static void threadqueue_free_job(threadqueue_queue_t * const threadqueue, int i)
+threadqueue_job_t * kvz_threadqueue_job_create(void (*fptr)(void *arg), void *arg)
 {
-#ifdef KVZ_DEBUG
-#if KVZ_DEBUG & KVZ_PERF_JOB
-  int j;
-  KVZ_GET_TIME(&threadqueue->queue[i]->debug_clock_dequeue);
-  fprintf(threadqueue->debug_log, "%p\t%d\t%lf\t+%lf\t+%lf\t+%lf\t%s\n", threadqueue->queue[i], threadqueue->queue[i]->debug_worker_id, KVZ_CLOCK_T_AS_DOUBLE(threadqueue->queue[i]->debug_clock_enqueue), KVZ_CLOCK_T_DIFF(threadqueue->queue[i]->debug_clock_enqueue, threadqueue->queue[i]->debug_clock_start), KVZ_CLOCK_T_DIFF(threadqueue->queue[i]->debug_clock_start, threadqueue->queue[i]->debug_clock_stop), KVZ_CLOCK_T_DIFF(threadqueue->queue[i]->debug_clock_stop, threadqueue->queue[i]->debug_clock_dequeue), threadqueue->queue[i]->debug_description);
-
-  for (j = 0; j < threadqueue->queue[i]->rdepends_count; ++j) {
-    fprintf(threadqueue->debug_log, "%p->%p\n", threadqueue->queue[i], threadqueue->queue[i]->rdepends[j]);
+  threadqueue_job_t *job = MALLOC(threadqueue_job_t, 1);
+  if (!job) {
+    fprintf(stderr, "Could not alloc job!\n");
+    return NULL;
+  }
+
+  if (pthread_mutex_init(&job->lock, NULL) != 0) {
+    fprintf(stderr, "pthread_mutex_init(job) failed!\n");
+    return NULL;
   }
 
-  FREE_POINTER(threadqueue->queue[i]->debug_description);
-#endif
-#endif
-  FREE_POINTER(threadqueue->queue[i]->rdepends);
-  
-  pthread_mutex_destroy(&threadqueue->queue[i]->lock);
+  job->state = THREADQUEUE_JOB_STATE_PAUSED;
+  job->ndepends       = 0;
+  job->rdepends       = NULL;
+  job->rdepends_count = 0;
+  job->rdepends_size  = 0;
+  job->refcount       = 1;
+  job->fptr           = fptr;
+  job->arg            = arg;
 
-  FREE_POINTER(threadqueue->queue[i]);
+  return job;
 }
 
-static void threadqueue_free_jobs(threadqueue_queue_t * const threadqueue) {
-  int i;
-  for (i=0; i < threadqueue->queue_count; ++i) {
-    threadqueue_free_job(threadqueue, i);
-  }
-  threadqueue->queue_count = 0;
-  threadqueue->queue_start = 0;
-#ifdef KVZ_DEBUG
-#if KVZ_DEBUG & KVZ_PERF_JOB
-  {
-    KVZ_CLOCK_T time;
-    KVZ_GET_TIME(&time);
-   
-    fprintf(threadqueue->debug_log, "\t\t-\t-\t%lf\t-\tFLUSH\n", KVZ_CLOCK_T_AS_DOUBLE(time));
-  }
-#endif
-#endif
-}
 
-int kvz_threadqueue_finalize(threadqueue_queue_t * const threadqueue) {
-  int i;
-  
-  //Flush the queue
-  if (!kvz_threadqueue_flush(threadqueue)) {
-    fprintf(stderr, "Unable to flush threadqueue!\n");
-    return 0;
-  }
-  
-  //Lock threadqueue
+int kvz_threadqueue_submit(threadqueue_queue_t * const threadqueue, threadqueue_job_t *job)
+{
   PTHREAD_LOCK(&threadqueue->lock);
-  
-  //Free job memory
-  threadqueue_free_jobs(threadqueue);
-  
-  if (threadqueue->stop) {
-    fprintf(stderr, "threadqueue already stopping\n");
-    
-    if (pthread_mutex_unlock(&threadqueue->lock) != 0) {
-      fprintf(stderr, "pthread_mutex_unlock failed!\n");
-      assert(0);
-      return 0;
-    }
-    assert(0); //We should get here...
-    return 0;
-  }
-  
-  threadqueue->stop = 1;
-  
-  if (pthread_cond_broadcast(&(threadqueue->cond)) != 0) {
-    fprintf(stderr, "pthread_cond_broadcast failed!\n");
-    PTHREAD_UNLOCK(&threadqueue->lock);
-    assert(0);
-    return 0;
+  PTHREAD_LOCK(&job->lock);
+  assert(job->state == THREADQUEUE_JOB_STATE_PAUSED);
+
+  if (threadqueue->thread_count == 0) {
+    // When not using threads, run the job immediately.
+    job->fptr(job->arg);
+    job->state = THREADQUEUE_JOB_STATE_DONE;
+  } else if (job->ndepends == 0) {
+    threadqueue_push_job(threadqueue, kvz_threadqueue_copy_ref(job));
+    pthread_cond_signal(&threadqueue->job_available);
+  } else {
+    job->state = THREADQUEUE_JOB_STATE_WAITING;
   }
-  //Unlock it now, since all jobs have to stpo
+  PTHREAD_UNLOCK(&job->lock);
   PTHREAD_UNLOCK(&threadqueue->lock);
-  
-  //Join threads
-  for(i = 0; i < threadqueue->threads_count; i++) {
-    if(pthread_join(threadqueue->threads[i], NULL) != 0) {
-      fprintf(stderr, "pthread_join failed!\n");
-      return 0;
-    }
-  }
-  
-#ifdef KVZ_DEBUG
-  FREE_POINTER(threadqueue->debug_clock_thread_start);
-  FREE_POINTER(threadqueue->debug_clock_thread_end);
-  fclose(threadqueue->debug_log);
-#endif
-  
-  //Free allocated stuff
-  FREE_POINTER(threadqueue->queue);
-  threadqueue->queue_count = 0;
-  threadqueue->queue_size = 0;
-  threadqueue->queue_start = 0;
-  
-  FREE_POINTER(threadqueue->threads);
-  threadqueue->threads_count = 0;
-  
-  if (pthread_mutex_destroy(&threadqueue->lock) != 0) {
-    fprintf(stderr, "pthread_mutex_destroy failed!\n");
-    assert(0);
-    return 0;
-  }
-  if (pthread_cond_destroy(&threadqueue->cond) != 0) {
-    fprintf(stderr, "pthread_cond_destroy failed!\n");
-    assert(0);
-    return 0;
-  }
-  
-  if (pthread_cond_destroy(&threadqueue->cb_cond) != 0) {
-    fprintf(stderr, "pthread_cond_destroy failed!\n");
-    assert(0);
-    return 0;
-  }
-  
+
   return 1;
 }
 
-int kvz_threadqueue_flush(threadqueue_queue_t * const threadqueue) {
-  int notdone = 1;
-  
-  //Lock the queue
-  PTHREAD_LOCK(&threadqueue->lock);
-  
-  do {
-    notdone = threadqueue->queue_waiting_execution + threadqueue->queue_waiting_dependency + threadqueue->queue_running;
 
-    if (notdone > 0) {
-      PTHREAD_COND_BROADCAST(&(threadqueue->cond));
-      PTHREAD_COND_WAIT(&threadqueue->cb_cond, &threadqueue->lock);
-    }
-  } while (notdone > 0);
-  
-  threadqueue_free_jobs(threadqueue);
+/**
+ * \brief Add a dependency between two jobs.
+ *
+ * \param job           job that should be executed after dependency
+ * \param dependency    job that should be executed before job
+ *
+ * \return 1 on success, 0 on failure
+ *
+ */
+int kvz_threadqueue_job_dep_add(threadqueue_job_t *job, threadqueue_job_t *dependency)
+{
+  // Lock the dependency first and then the job depending on it.
+  // This must be the same order as in threadqueue_worker.
+  PTHREAD_LOCK(&dependency->lock);
+
+  if (dependency->state == THREADQUEUE_JOB_STATE_DONE) {
+    // The dependency has been completed already so there is nothing to do.
+    PTHREAD_UNLOCK(&dependency->lock);
+    return 1;
+  }
 
-  assert(threadqueue->queue_waiting_dependency == 0 && threadqueue->queue_waiting_execution == 0 && threadqueue->queue_running == 0);
+  PTHREAD_LOCK(&job->lock);
+  job->ndepends++;
+  PTHREAD_UNLOCK(&job->lock);
 
-  PTHREAD_UNLOCK(&threadqueue->lock);
+  // Add the reverse dependency
+  if (dependency->rdepends_count >= dependency->rdepends_size) {
+    dependency->rdepends_size += THREADQUEUE_LIST_REALLOC_SIZE;
+    size_t bytes = dependency->rdepends_size * sizeof(threadqueue_job_t*);
+    dependency->rdepends = realloc(dependency->rdepends, bytes);
+  }
+  dependency->rdepends[dependency->rdepends_count++] = kvz_threadqueue_copy_ref(job);
+
+  PTHREAD_UNLOCK(&dependency->lock);
 
   return 1;
 }
 
-int kvz_threadqueue_waitfor(threadqueue_queue_t * const threadqueue, threadqueue_job_t * const job) {
-  int job_done = 0;
-  
-  //NULL job is clearly OK :-)
-  if (!job) return 1;
-  
-  //Lock the queue
-  PTHREAD_LOCK(&threadqueue->lock);
-  do {
-    
-    PTHREAD_LOCK(&job->lock);
-    job_done = (job->state == THREADQUEUE_JOB_STATE_DONE);
-    PTHREAD_UNLOCK(&job->lock);
-    
-    if (!job_done) {
-      PTHREAD_COND_BROADCAST(&(threadqueue->cond));
-      PTHREAD_COND_WAIT(&threadqueue->cb_cond, &threadqueue->lock);
-    }
-  } while (!job_done);
-
-  // Free jobs submitted before this job.
-  int i;
-  for (i = 0; i < threadqueue->queue_count; ++i) {
-    if (threadqueue->queue[i] == job) break;
-    threadqueue_free_job(threadqueue, i);
-  }
-  // Move remaining jobs to the beginning of the array.
-  if (i > 0) {
-    threadqueue->queue_count -= i;
-    threadqueue->queue_start = 0;
-    memmove(threadqueue->queue, &threadqueue->queue[i], threadqueue->queue_count * sizeof(*threadqueue->queue));
-    FILL_ARRAY(&threadqueue->queue[threadqueue->queue_count], 0, i);
-  }
 
-  PTHREAD_UNLOCK(&threadqueue->lock);
-  
-  return 1;
+/**
+ * \brief Get a new pointer to a job.
+ *
+ * Increment reference count and return the job.
+ */
+threadqueue_job_t *kvz_threadqueue_copy_ref(threadqueue_job_t *job)
+{
+  // The caller should have had another reference.
+  assert(job->refcount > 0);
+  KVZ_ATOMIC_INC(&job->refcount);
+  return job;
 }
 
-threadqueue_job_t * kvz_threadqueue_submit(threadqueue_queue_t * const threadqueue, void (*fptr)(void *arg), void *arg, int wait, const char* const debug_description) {
-  threadqueue_job_t *job;
-  //No lock here... this should be constant
-  if (threadqueue->threads_count == 0) {
-    //FIXME: This should be improved in order to handle dependencies
-    PERFORMANCE_MEASURE_START(KVZ_PERF_JOB);
-    fptr(arg);
-    PERFORMANCE_MEASURE_END(KVZ_PERF_JOB, threadqueue, "%s", debug_description);
-    return NULL;
-  }
-  
-  assert(wait == 0 || wait == 1);
-  
-  job = MALLOC(threadqueue_job_t, 1);
-  
-#ifdef KVZ_DEBUG
-  if (debug_description) {
-    size_t desc_len = MIN(255, strlen(debug_description));
-    char* desc;
-    
-    //Copy description
-    desc = MALLOC(char, desc_len + 1);
-    assert(desc);
-    memcpy(desc, debug_description, desc_len);
-    desc[desc_len] = 0;
-    
-    job->debug_description = desc;
-  } else {
-    char* desc;
-    desc = MALLOC(char, 255);
-    sprintf(desc, "(*%p)(%p)", fptr, arg);
-    
-    job->debug_description = desc;
-  }
-  KVZ_GET_TIME(&job->debug_clock_enqueue);
-#endif //KVZ_DEBUG
-  
-  if (!job) {
-    fprintf(stderr, "Could not alloc job!\n");
-    assert(0);
-    return NULL;
+
+/**
+ * \brief Free a job.
+ *
+ * Decrement reference count of the job. If no references exist any more,
+ * deallocate associated memory and destroy mutexes.
+ *
+ * Sets the job pointer to NULL.
+ */
+void kvz_threadqueue_free_job(threadqueue_job_t **job_ptr)
+{
+  threadqueue_job_t *job = *job_ptr;
+  if (job == NULL) return;
+  *job_ptr = NULL;
+
+  int new_refcount = KVZ_ATOMIC_DEC(&job->refcount);
+  if (new_refcount > 0) {
+    // There are still references so we don't free the data yet.
+    return;
   }
-  
-  job->fptr = fptr;
-  job->arg = arg;
-  if (pthread_mutex_init(&job->lock, NULL) != 0) {
-    fprintf(stderr, "pthread_mutex_init(job) failed!\n");
-    assert(0);
-    return NULL;
+
+  assert(new_refcount == 0);
+
+  for (int i = 0; i < job->rdepends_count; i++) {
+    kvz_threadqueue_free_job(&job->rdepends[i]);
   }
-  job->ndepends = wait;
-  job->rdepends = NULL;
   job->rdepends_count = 0;
-  job->rdepends_size = 0;
-  job->state = THREADQUEUE_JOB_STATE_QUEUED;
-  
-  PTHREAD_LOCK(&threadqueue->lock);
-  
-  //Add the reverse dependency
-  if (threadqueue->queue_count >= threadqueue->queue_size) {
-    threadqueue->queue = realloc(threadqueue->queue, sizeof(threadqueue_job_t *) * (threadqueue->queue_size + THREADQUEUE_LIST_REALLOC_SIZE));
-    if (!threadqueue->queue) {
-      fprintf(stderr, "Could not realloc queue!\n");
-      assert(0);
-      return NULL;
-    }
-    threadqueue->queue_size += THREADQUEUE_LIST_REALLOC_SIZE;
-  }
-  threadqueue->queue[threadqueue->queue_count++] = job;
-  
-  if (job->ndepends == 0) {
-    ++threadqueue->queue_waiting_execution;
-    //Hope a thread can do it...
-    PTHREAD_COND_SIGNAL(&(threadqueue->cond));
-  } else {
-    ++threadqueue->queue_waiting_dependency;
-  }
-  
-  PTHREAD_UNLOCK(&threadqueue->lock);
-  
-  return job;
+
+  FREE_POINTER(job->rdepends);
+  pthread_mutex_destroy(&job->lock);
+  FREE_POINTER(job);
 }
 
-int kvz_threadqueue_job_dep_add(threadqueue_job_t *job, threadqueue_job_t *depends_on) {
-  //If we are not using threads, job are NULL pointers, so we can skip that
-  if (!job && !depends_on) return 1;
-  
-  assert(job && depends_on);
-  
-  //Lock first the job, and then the dependency
+
+/**
+ * \brief Wait for a job to be completed.
+ *
+ * \return 1 on success, 0 on failure
+ */
+int kvz_threadqueue_waitfor(threadqueue_queue_t * threadqueue, threadqueue_job_t * job)
+{
   PTHREAD_LOCK(&job->lock);
-  PTHREAD_LOCK(&depends_on->lock);
-  
-  if (depends_on->state != THREADQUEUE_JOB_STATE_DONE) {
-    job->ndepends++;
-  }
-  
-  //Add the reverse dependency (FIXME: this may be moved in the if above... but we would lose ability to track)
-  if (depends_on->rdepends_count >= depends_on->rdepends_size) {
-    depends_on->rdepends = realloc(depends_on->rdepends, sizeof(threadqueue_job_t *) * (depends_on->rdepends_size + THREADQUEUE_LIST_REALLOC_SIZE));
-    if (!depends_on->rdepends) {
-      fprintf(stderr, "Could not realloc rdepends!\n");
-      assert(0);
-      return 0;
-    }
-    depends_on->rdepends_size += THREADQUEUE_LIST_REALLOC_SIZE;
+  while (job->state != THREADQUEUE_JOB_STATE_DONE) {
+    PTHREAD_COND_WAIT(&threadqueue->job_done, &job->lock);
   }
-  depends_on->rdepends[depends_on->rdepends_count++] = job;
-  
-  PTHREAD_UNLOCK(&depends_on->lock);
   PTHREAD_UNLOCK(&job->lock);
-  
+
   return 1;
 }
 
-int kvz_threadqueue_job_unwait_job(threadqueue_queue_t * const threadqueue, threadqueue_job_t *job) {
-  int ndepends = 0;
-  
-  //NULL job =>  no threads, nothing to do
-  if (!job) return 1;
-  PTHREAD_LOCK(&job->lock);
-  job->ndepends--;
-  ndepends = job->ndepends;
-  PTHREAD_UNLOCK(&job->lock);
-  
-  if (ndepends == 0) {
-    PTHREAD_LOCK(&threadqueue->lock);
-    assert(threadqueue->queue_waiting_dependency > 0);
-    --threadqueue->queue_waiting_dependency;
-    ++threadqueue->queue_waiting_execution;
-    //Hope a thread can do it...
-    PTHREAD_COND_SIGNAL(&(threadqueue->cond));
-    
+
+/**
+ * \brief Stop all threads after they finish the current jobs.
+ *
+ * Block until all threads have stopped.
+ *
+ * \return 1 on success, 0 on failure
+ */
+int kvz_threadqueue_stop(threadqueue_queue_t * const threadqueue)
+{
+  PTHREAD_LOCK(&threadqueue->lock);
+
+  if (threadqueue->stop) {
+    // The threadqueue should have stopped already.
+    assert(threadqueue->thread_running_count == 0);
     PTHREAD_UNLOCK(&threadqueue->lock);
+    return 1;
+  }
+
+  // Tell all threads to stop.
+  threadqueue->stop = true;
+  PTHREAD_COND_BROADCAST(&threadqueue->job_available);
+  PTHREAD_UNLOCK(&threadqueue->lock);
+
+  // Wait for them to stop.
+  for (int i = 0; i < threadqueue->thread_count; i++) {
+    if (pthread_join(threadqueue->threads[i], NULL) != 0) {
+      fprintf(stderr, "pthread_join failed!\n");
+      return 0;
+    }
   }
-  
+
   return 1;
 }
 
-#ifdef KVZ_DEBUG
-int threadqueue_log(threadqueue_queue_t * threadqueue, const KVZ_CLOCK_T *start, const KVZ_CLOCK_T *stop, const char* debug_description) {
-  int i, thread_id = -1;
-  FILE* output;
-  
-  assert(start);
-  
-  if (threadqueue) {
-    //We need to lock to output safely
-    PTHREAD_LOCK(&threadqueue->lock);
-    
-    output = threadqueue->debug_log;
-    
-    //Find the thread
-    for(i = 0; i < threadqueue->threads_count; i++) {
-      if(pthread_equal(threadqueue->threads[i], pthread_self()) != 0) {
-        thread_id = i;
-        break;
-      }
-    }
-  } else {
-    thread_id = -1;
-    output = stderr;
+
+/**
+ * \brief Stop all threads and free allocated resources.
+ *
+ * \return 1 on success, 0 on failure
+ */
+void kvz_threadqueue_free(threadqueue_queue_t *threadqueue)
+{
+  if (threadqueue == NULL) return;
+
+  kvz_threadqueue_stop(threadqueue);
+
+  // Free all jobs.
+  while (threadqueue->first) {
+    threadqueue_job_t *next = threadqueue->first->next;
+    kvz_threadqueue_free_job(&threadqueue->first);
+    threadqueue->first = next;
   }
-  
-  if (thread_id >= 0) {
-    if (stop) {
-      fprintf(output, "\t%d\t-\t%lf\t+%lf\t-\t%s\n", thread_id, KVZ_CLOCK_T_AS_DOUBLE(*start), KVZ_CLOCK_T_DIFF(*start, *stop), debug_description);
-    } else {
-      fprintf(output, "\t%d\t-\t%lf\t-\t-\t%s\n", thread_id, KVZ_CLOCK_T_AS_DOUBLE(*start), debug_description);
-    }
-  } else {
-    if (stop) {
-      fprintf(output, "\t\t-\t%lf\t+%lf\t-\t%s\n", KVZ_CLOCK_T_AS_DOUBLE(*start), KVZ_CLOCK_T_DIFF(*start, *stop), debug_description);
-    } else {
-      fprintf(output, "\t\t-\t%lf\t-\t-\t%s\n", KVZ_CLOCK_T_AS_DOUBLE(*start), debug_description);
-    }
+  threadqueue->last = NULL;
+
+  FREE_POINTER(threadqueue->threads);
+  threadqueue->thread_count = 0;
+
+  if (pthread_mutex_destroy(&threadqueue->lock) != 0) {
+    fprintf(stderr, "pthread_mutex_destroy failed!\n");
   }
-  
-  if (threadqueue) {
-    PTHREAD_UNLOCK(&threadqueue->lock);
+
+  if (pthread_cond_destroy(&threadqueue->job_available) != 0) {
+    fprintf(stderr, "pthread_cond_destroy failed!\n");
   }
-  return 1;
+
+  if (pthread_cond_destroy(&threadqueue->job_done) != 0) {
+    fprintf(stderr, "pthread_cond_destroy failed!\n");
+  }
+
+  FREE_POINTER(threadqueue);
 }
-#endif //KVZ_DEBUG

kvazaar-1.1.0.tar.gz/src/threadqueue.h -> kvazaar-1.2.0.tar.gz/src/threadqueue.h Changed

@@ -30,140 +30,22 @@
 
 #include "global.h" // IWYU pragma: keep
 
-typedef enum {
-  THREADQUEUE_JOB_STATE_QUEUED = 0,
-  THREADQUEUE_JOB_STATE_RUNNING = 1,
-  THREADQUEUE_JOB_STATE_DONE = 2
-} threadqueue_job_state;
+typedef struct threadqueue_job_t threadqueue_job_t;
+typedef struct threadqueue_queue_t threadqueue_queue_t;
 
-typedef struct threadqueue_job_t {
-  pthread_mutex_t lock;
-  
-  threadqueue_job_state state;
-  
-  unsigned int ndepends; //Number of active dependencies that this job wait for
-  
-  struct threadqueue_job_t **rdepends; //array of pointer to jobs that depend on this one. They have to exist when the thread finishes, because they cannot be run before.
-  unsigned int rdepends_count; //number of rdepends
-  unsigned int rdepends_size; //allocated size of rdepends
-  
-  //Job function and state to use
-  void (*fptr)(void *arg);
-  void *arg;
-  
-#ifdef KVZ_DEBUG
-  const char* debug_description;
-  
-  int debug_worker_id;
-  
-  KVZ_CLOCK_T debug_clock_enqueue;
-  KVZ_CLOCK_T debug_clock_start;
-  KVZ_CLOCK_T debug_clock_stop;
-  KVZ_CLOCK_T debug_clock_dequeue;
-#endif
-} threadqueue_job_t;
+threadqueue_queue_t * kvz_threadqueue_init(int thread_count);
 
+threadqueue_job_t * kvz_threadqueue_job_create(void (*fptr)(void *arg), void *arg);
+int kvz_threadqueue_submit(threadqueue_queue_t * threadqueue, threadqueue_job_t *job);
 
-  
+int kvz_threadqueue_job_dep_add(threadqueue_job_t *job, threadqueue_job_t *dependency);
 
-typedef struct {
-  pthread_mutex_t lock;
-  pthread_cond_t cond;
-  pthread_cond_t cb_cond;
-  
-  pthread_t *threads;
-  int threads_count;
-  int threads_running;
+threadqueue_job_t *kvz_threadqueue_copy_ref(threadqueue_job_t *job);
 
-  int stop; //=>1: threads should stop asap
-  
-  int fifo;
-  
-  threadqueue_job_t **queue;
-  unsigned int queue_start;
-  unsigned int queue_count;
-  unsigned int queue_size;
-  unsigned int queue_waiting_execution; //Number of jobs without any dependency which could be run
-  unsigned int queue_waiting_dependency; //Number of jobs waiting for a dependency to complete
-  unsigned int queue_running; //Number of jobs running
-  
-#ifdef KVZ_DEBUG
-  //Format: pointer <tab> worker id <tab> time enqueued <tab> time started <tab> time stopped <tab> time dequeued <tab> job description
-  //For threads, pointer = "" and job description == "thread", time enqueued and time dequeued are equal to "-"
-  //For flush, pointer = "" and job description == "FLUSH", time enqueued, time dequeued and time started are equal to "-" 
-  //Each time field, except the first one in the line be expressed in a relative way, by prepending the number of seconds by +.
-  //Dependencies: pointer -> pointer
+void kvz_threadqueue_free_job(threadqueue_job_t **job_ptr);
 
-  FILE *debug_log;
-  
-  KVZ_CLOCK_T *debug_clock_thread_start;
-  KVZ_CLOCK_T *debug_clock_thread_end;
-#endif
-} threadqueue_queue_t;
-
-//Init a threadqueue (if fifo, then behave as a FIFO with dependencies, otherwise as a LIFO with dependencies)
-int kvz_threadqueue_init(threadqueue_queue_t * threadqueue, int thread_count, int fifo);
-
-//Add a job to the queue, and returs a threadqueue_job handle. If wait == 1, one has to run kvz_threadqueue_job_unwait_job in order to have it run
-threadqueue_job_t * kvz_threadqueue_submit(threadqueue_queue_t * threadqueue, void (*fptr)(void *arg), void *arg, int wait, const char* debug_description);
-
-int kvz_threadqueue_job_unwait_job(threadqueue_queue_t * threadqueue, threadqueue_job_t *job);
-
-//Add a dependency between two jobs.
-int kvz_threadqueue_job_dep_add(threadqueue_job_t *job, threadqueue_job_t *depends_on);
-
-//Blocking call until the queue is empty. Previously set threadqueue_job handles should not be used anymore
-int kvz_threadqueue_flush(threadqueue_queue_t * threadqueue);
-
-//Blocking call until job is executed. Job handles submitted before job should not be used any more as they are removed from the queue.
 int kvz_threadqueue_waitfor(threadqueue_queue_t * threadqueue, threadqueue_job_t * job);
+int kvz_threadqueue_stop(threadqueue_queue_t * threadqueue);
+void kvz_threadqueue_free(threadqueue_queue_t * threadqueue);
 
-//Free ressources in a threadqueue
-int kvz_threadqueue_finalize(threadqueue_queue_t * threadqueue);
-
-#ifdef KVZ_DEBUG
-int threadqueue_log(threadqueue_queue_t * threadqueue, const KVZ_CLOCK_T *start, const KVZ_CLOCK_T *stop, const char* debug_description);
-
-// Bitmasks for PERFORMANCE_MEASURE_START and PERFORMANCE_MEASURE_END.
-#define KVZ_PERF_FRAME    (1 << 0)
-#define KVZ_PERF_JOB      (1 << 1)
-#define KVZ_PERF_LCU      (1 << 2)
-#define KVZ_PERF_SAOREC   (1 << 3)
-#define KVZ_PERF_BSLEAF   (1 << 4)
-#define KVZ_PERF_SEARCHCU (1 << 5)
-
-#define IMPL_PERFORMANCE_MEASURE_START(mask) KVZ_CLOCK_T start, stop; if ((KVZ_DEBUG) & mask) { KVZ_GET_TIME(&start); }
-#define IMPL_PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) { if ((KVZ_DEBUG) & mask) { KVZ_GET_TIME(&stop); {char job_description[256]; sprintf(job_description, (str), __VA_ARGS__); threadqueue_log((threadqueue), &start, &stop, job_description);}} } \
-
-#ifdef _MSC_VER
-// Disable VS conditional expression warning from debug code.
-# define WITHOUT_CONSTANT_EXP_WARNING(macro) \
-  __pragma(warning(push)) \
-  __pragma(warning(disable:4127)) \
-  macro \
-  __pragma(warning(pop))
-# define PERFORMANCE_MEASURE_START(mask) \
-    WITHOUT_CONSTANT_EXP_WARNING(IMPL_PERFORMANCE_MEASURE_START(mask))
-# define PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) \
-    WITHOUT_CONSTANT_EXP_WARNING(IMPL_PERFORMANCE_MEASURE_END(mask, threadqueue, str, ##__VA_ARGS__))
-#else
-# define PERFORMANCE_MEASURE_START(mask) \
-    IMPL_PERFORMANCE_MEASURE_START(mask)
-# define PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) \
-    IMPL_PERFORMANCE_MEASURE_END(mask, threadqueue, str, ##__VA_ARGS__)
-#endif
-
-#else
-#define PERFORMANCE_MEASURE_START(mask) 
-#define PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) 
-#endif
-
-/* Constraints: 
- * 
- * - Always first lock threadqueue, than a job inside it
- * - When job A depends on job B, always lock first job B and then job A
- * - Jobs should be submitted in an order which is compatible with serial execution.
- * 
- * */
-
-#endif //THREADQUEUE_H_
+#endif // THREADQUEUE_H_

kvazaar-1.1.0.tar.gz/src/threads.h -> kvazaar-1.2.0.tar.gz/src/threads.h Changed

@@ -30,10 +30,6 @@
 
 #include <pthread.h>
 
-#define E3 1000
-#define E9 1000000000
-#define FILETIME_TO_EPOCH 0x19DB1DED53E8000LL
-
 #if defined(__GNUC__) && !defined(__MINGW32__) 
 #include <unistd.h> // IWYU pragma: export
 #include <time.h> // IWYU pragma: export
@@ -76,7 +72,64 @@
 
 #endif //__GNUC__
 
-#undef E9
-#undef E3
+#ifdef __APPLE__
+// POSIX semaphores are deprecated on Mac so we use Grand Central Dispatch
+// semaphores instead.
+#include <dispatch/dispatch.h>
+typedef dispatch_semaphore_t kvz_sem_t;
+
+static INLINE void kvz_sem_init(kvz_sem_t *sem, int value)
+{
+    assert(value >= 0);
+    *sem = dispatch_semaphore_create(value);
+}
+
+static INLINE void kvz_sem_wait(kvz_sem_t *sem)
+{
+    dispatch_semaphore_wait(*sem, DISPATCH_TIME_FOREVER);
+}
+
+static INLINE void kvz_sem_post(kvz_sem_t *sem)
+{
+    dispatch_semaphore_signal(*sem);
+}
+
+
+static INLINE void kvz_sem_destroy(kvz_sem_t *sem)
+{
+    // Do nothing for GCD semaphores.
+}
+
+#else
+// Use POSIX semaphores.
+#include <semaphore.h>
+
+typedef sem_t kvz_sem_t;
+
+static INLINE void kvz_sem_init(kvz_sem_t *sem, int value)
+{
+    assert(value >= 0);
+    // Pthreads-w32 does not support process-shared semaphores, so pshared
+    // must always be zero.
+    int pshared = 0;
+    sem_init(sem, pshared, value);
+}
+
+static INLINE void kvz_sem_wait(kvz_sem_t *sem)
+{
+    sem_wait(sem);
+}
+
+static INLINE void kvz_sem_post(kvz_sem_t *sem)
+{
+    sem_post(sem);
+}
+
+static INLINE void kvz_sem_destroy(kvz_sem_t *sem)
+{
+    sem_destroy(sem);
+}
+
+#endif
 
 #endif //THREADS_H_

kvazaar-1.1.0.tar.gz/src/transform.c -> kvazaar-1.2.0.tar.gz/src/transform.c Changed

@@ -62,7 +62,7 @@
  *
  * \param width       Transform width.
  * \param in_stride   Stride for ref_in and pred_in
- * \param out_stride  Stride for rec_out and coeff_out.
+ * \param out_stride  Stride for rec_out.
  * \param ref_in      Reference pixels.
  * \param pred_in     Predicted pixels.
  * \param rec_out     Returns the reconstructed pixels.
@@ -82,14 +82,15 @@
 
   for (int y = 0; y < width; ++y) {
     for (int x = 0; x < width; ++x) {
-      int32_t in_idx  = x + y * in_stride;
-      int32_t out_idx = x + y * out_stride;
+      int32_t in_idx    = x + y * in_stride;
+      int32_t out_idx   = x + y * out_stride;
+      int32_t coeff_idx = x + y * width;
 
       // The residual must be computed before writing to rec_out because
       // pred_in and rec_out may point to the same array.
-      coeff_t coeff      = (coeff_t)(ref_in[in_idx] - pred_in[in_idx]);
-      coeff_out[out_idx] = coeff;
-      rec_out[out_idx]   = ref_in[in_idx];
+      coeff_t coeff        = (coeff_t)(ref_in[in_idx] - pred_in[in_idx]);
+      coeff_out[coeff_idx] = coeff;
+      rec_out[out_idx]     = ref_in[in_idx];
 
       nonzero_coeffs |= (coeff != 0);
     }
@@ -102,22 +103,20 @@
  * Apply DPCM to residual.
  *
  * \param width   width of the block
- * \param stride  stride of coeff array
  * \param dir     RDPCM direction
  * \param coeff   coefficients (residual) to filter
  */
 static void rdpcm(const int width,
-                  const int stride,
                   const rdpcm_dir dir,
                   coeff_t *coeff)
 {
-  const int offset = (dir == RDPCM_HOR) ? 1 : stride;
+  const int offset = (dir == RDPCM_HOR) ? 1 : width;
   const int min_x  = (dir == RDPCM_HOR) ? 1 : 0;
   const int min_y  = (dir == RDPCM_HOR) ? 0 : 1;
 
   for (int y = width - 1; y >= min_y; y--) {
     for (int x = width - 1; x >= min_x; x--) {
-      const int index = x + y * stride;
+      const int index = x + y * width;
       coeff[index] -= coeff[index - offset];
     }
   }
@@ -209,7 +208,7 @@
  * \param color  Color.
  * \param scan_order  Coefficient scan order.
  * \param trskip_out  Whether transform skip is used.
- * \param stride  Stride for ref_in, pred_in rec_out and coeff_out.
+ * \param stride  Stride for ref_in, pred_in and rec_out.
  * \param ref_in  Reference pixels.
  * \param pred_in  Predicted pixels.
  * \param rec_out  Reconstructed pixels.
@@ -261,19 +260,142 @@
     // we can skip this.
     kvz_pixels_blit(best->rec, rec_out, width, width, 4, out_stride);
   }
-  kvz_coefficients_blit(best->coeff, coeff_out, width, width, 4, out_stride);
+  copy_coeffs(best->coeff, coeff_out, width);
 
   return best->has_coeffs;
 }
 
+/**
+ * Calculate the residual coefficients for a single TU.
+ */
+static void quantize_tr_residual(encoder_state_t * const state,
+                                 const color_t color,
+                                 const int32_t x,
+                                 const int32_t y,
+                                 const uint8_t depth,
+                                 cu_info_t *cur_pu,
+                                 lcu_t* lcu)
+{
+  const kvz_config *cfg    = &state->encoder_control->cfg;
+  const int32_t shift      = color == COLOR_Y ? 0 : 1;
+  const vector2d_t lcu_px  = { SUB_SCU(x) >> shift, SUB_SCU(y) >> shift };
+
+  // If luma is 4x4, do chroma for the 8x8 luma area when handling the top
+  // left PU because the coordinates are correct.
+  bool handled_elsewhere = color != COLOR_Y &&
+                           depth > MAX_DEPTH &&
+                           (lcu_px.x % 4 != 0 || lcu_px.y % 4 != 0);
+  if (handled_elsewhere) {
+    return;
+  }
+
+  // Clear coded block flag structures for depths lower than current depth.
+  // This should ensure that the CBF data doesn't get corrupted if this function
+  // is called more than once.
+  cbf_clear(&cur_pu->cbf, depth, color);
+
+  int32_t tr_width;
+  if (color == COLOR_Y) {
+    tr_width = LCU_WIDTH >> depth;
+  } else {
+    const int chroma_depth = (depth == MAX_PU_DEPTH ? depth - 1 : depth);
+    tr_width = LCU_WIDTH_C >> chroma_depth;
+  }
+  const int32_t lcu_width = LCU_WIDTH >> shift;
+  const int8_t mode =
+    (color == COLOR_Y) ? cur_pu->intra.mode : cur_pu->intra.mode_chroma;
+  const coeff_scan_order_t scan_idx =
+    kvz_get_scan_order(cur_pu->type, mode, depth);
+  const int offset = lcu_px.x + lcu_px.y * lcu_width;
+  const int z_index = xy_to_zorder(lcu_width, lcu_px.x, lcu_px.y);
+
+  // Pointers to current location in arrays with prediction. The
+  // reconstruction will be written to this array.
+  kvz_pixel *pred = NULL;
+  // Pointers to current location in arrays with reference.
+  const kvz_pixel *ref = NULL;
+  // Pointers to current location in arrays with quantized coefficients.
+  coeff_t *coeff = NULL;
+
+  switch (color) {
+    case COLOR_Y:
+      pred  = &lcu->rec.y[offset];
+      ref   = &lcu->ref.y[offset];
+      coeff = &lcu->coeff.y[z_index];
+      break;
+    case COLOR_U:
+      pred = &lcu->rec.u[offset];
+      ref  = &lcu->ref.u[offset];
+      coeff = &lcu->coeff.u[z_index];
+      break;
+    case COLOR_V:
+      pred = &lcu->rec.v[offset];
+      ref  = &lcu->ref.v[offset];
+      coeff = &lcu->coeff.v[z_index];
+      break;
+  }
+
+  const bool can_use_trskip = tr_width == 4 &&
+                              color == COLOR_Y &&
+                              cfg->trskip_enable;
+
+  bool has_coeffs;
+
+  if (cfg->lossless) {
+    has_coeffs = bypass_transquant(tr_width,
+                                   lcu_width, // in stride
+                                   lcu_width, // out stride
+                                   ref,
+                                   pred,
+                                   pred,
+                                   coeff);
+    if (cfg->implicit_rdpcm && cur_pu->type == CU_INTRA) {
+      // implicit rdpcm for horizontal and vertical intra modes
+      if (mode == 10) {
+        rdpcm(tr_width, RDPCM_HOR, coeff);
+      } else if (mode == 26) {
+        rdpcm(tr_width, RDPCM_VER, coeff);
+      }
+    }
+
+  } else if (can_use_trskip) {
+    // Try quantization with trskip and use it if it's better.
+    has_coeffs = kvz_quantize_residual_trskip(state,
+                                              cur_pu,
+                                              tr_width,
+                                              color,
+                                              scan_idx,
+                                              &cur_pu->intra.tr_skip,
+                                              lcu_width,
+                                              lcu_width,
+                                              ref,
+                                              pred,
+                                              pred,
+                                              coeff);
+  } else {
+    has_coeffs = kvz_quantize_residual(state,
+                                       cur_pu,
+                                       tr_width,
+                                       color,
+                                       scan_idx,
+                                       false, // tr skip
+                                       lcu_width,
+                                       lcu_width,
+                                       ref,
+                                       pred,
+                                       pred,
+                                       coeff);
+  }
+
+  if (has_coeffs) {
+    cbf_set(&cur_pu->cbf, depth, color);
+  }
+}
 
 /**
  * This function calculates the residual coefficients for a region of the LCU
  * (defined by x, y and depth) and updates the reconstruction with the
- * kvantized residual.
- *
- * It handles recursion for transform split, but that is currently only work
- * for 64x64 inter to 32x32 transform blocks.
+ * kvantized residual. Processes the TU tree recursively.
  *
  * Inputs are:
  * - lcu->rec  pixels after prediction for the area
@@ -281,196 +403,69 @@
  * - lcu->cu   for the area
  *
  * Outputs are:
- * - lcu->rec  reconstruction after quantized residual
- * - lcu->coeff  quantized coefficients for the area
- * - lcu->cbf  coded block flags for the area
- * - lcu->cu.intra.tr_skip  for the area
+ * - lcu->rec               reconstruction after quantized residual
+ * - lcu->coeff             quantized coefficients for the area
+ * - lcu->cbf               coded block flags for the area
+ * - lcu->cu.intra.tr_skip  tr skip flags for the area (in case of luma)
  */
-void kvz_quantize_lcu_luma_residual(encoder_state_t * const state, int32_t x, int32_t y, const uint8_t depth, cu_info_t *cur_pu, lcu_t* lcu)
+void kvz_quantize_lcu_residual(encoder_state_t * const state,
+                               const bool luma,
+                               const bool chroma,
+                               const int32_t x,
+                               const int32_t y,
+                               const uint8_t depth,
+                               cu_info_t *cur_pu,
+                               lcu_t* lcu)
 {
-  // we have 64>>depth transform size
-  const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };
+  const int32_t width = LCU_WIDTH >> depth;
+  const vector2d_t lcu_px  = { SUB_SCU(x), SUB_SCU(y) };
+
   if (cur_pu == NULL) {
     cur_pu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
   }
-  const int8_t width = LCU_WIDTH>>depth;
-  
+
   // Tell clang-analyzer what is up. For some reason it can't figure out from
   // asserting just depth.
-  assert(width == 4 || width == 8 || width == 16 || width == 32 || width == 64);
+  assert(width ==  4 ||
+         width ==  8 ||
+         width == 16 ||
+         width == 32 ||
+         width == 64);
 
-  // Split transform and increase depth
   if (depth == 0 || cur_pu->tr_depth > depth) {
-    int offset = width / 2;
-    kvz_quantize_lcu_luma_residual(state, x,          y,          depth+1, NULL, lcu);
-    kvz_quantize_lcu_luma_residual(state, x + offset, y,          depth+1, NULL, lcu);
-    kvz_quantize_lcu_luma_residual(state, x,          y + offset, depth+1, NULL, lcu);
-    kvz_quantize_lcu_luma_residual(state, x + offset, y + offset, depth+1, NULL, lcu);
+    // Split transform and increase depth
+    const int offset = width / 2;
+    const int32_t x2 = x + offset;
+    const int32_t y2 = y + offset;
+
+    kvz_quantize_lcu_residual(state, luma, chroma, x,  y,  depth + 1, NULL, lcu);
+    kvz_quantize_lcu_residual(state, luma, chroma, x2, y,  depth + 1, NULL, lcu);
+    kvz_quantize_lcu_residual(state, luma, chroma, x,  y2, depth + 1, NULL, lcu);
+    kvz_quantize_lcu_residual(state, luma, chroma, x2, y2, depth + 1, NULL, lcu);
 
     // Propagate coded block flags from child CUs to parent CU.
-    if (depth <= MAX_DEPTH) {
-      uint16_t child_cbfs[3] = {
-        LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y         )->cbf,
-        LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset)->cbf,
-        LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf,
-      };
+    uint16_t child_cbfs[3] = {
+      LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y         )->cbf,
+      LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset)->cbf,
+      LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf,
+    };
+
+    if (luma && depth < MAX_DEPTH) {
       cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_Y);
     }
-
-    return;
-  }
-
-  {
-    const int luma_offset = lcu_px.x + lcu_px.y * LCU_WIDTH;
-
-    // Pointers to current location in arrays with prediction.
-    kvz_pixel *recbase_y = &lcu->rec.y[luma_offset];
-    // Pointers to current location in arrays with reference.
-    const kvz_pixel *base_y = &lcu->ref.y[luma_offset];
-    // Pointers to current location in arrays with kvantized coefficients.
-    coeff_t *orig_coeff_y = &lcu->coeff.y[luma_offset];
-
-    coeff_scan_order_t scan_idx_luma = kvz_get_scan_order(cur_pu->type, cur_pu->intra.mode, depth);
-
-    #if OPTIMIZATION_SKIP_RESIDUAL_ON_THRESHOLD
-    uint32_t residual_sum = 0;
-    #endif
-
-    // Clear coded block flag structures for depths lower than current depth.
-    // This should ensure that the CBF data doesn't get corrupted if this function
-    // is called more than once.
-    cbf_clear(&cur_pu->cbf, depth, COLOR_Y);
-
-
-    if (state->encoder_control->cfg.lossless) {
-      if (bypass_transquant(width,
-                            LCU_WIDTH, LCU_WIDTH,
-                            base_y, recbase_y,
-                            recbase_y, orig_coeff_y)) {
-        cbf_set(&cur_pu->cbf, depth, COLOR_Y);
-      }
-      if (state->encoder_control->cfg.implicit_rdpcm && cur_pu->type == CU_INTRA) {
-        // implicit rdpcm for horizontal and vertical intra modes
-        if (cur_pu->intra.mode == 10) {
-          rdpcm(width, LCU_WIDTH, RDPCM_HOR, orig_coeff_y);
-
-        } else if (cur_pu->intra.mode == 26) {
-          rdpcm(width, LCU_WIDTH, RDPCM_VER, orig_coeff_y);
-        }
-      }
-    } else if (width == 4 && state->encoder_control->cfg.trskip_enable) {
-      // Try quantization with trskip and use it if it's better.
-      int has_coeffs = kvz_quantize_residual_trskip(
-          state, cur_pu, width, COLOR_Y, scan_idx_luma,
-          &cur_pu->intra.tr_skip,
-          LCU_WIDTH, LCU_WIDTH,
-          base_y, recbase_y, recbase_y, orig_coeff_y
-      );
-      if (has_coeffs) {
-        cbf_set(&cur_pu->cbf, depth, COLOR_Y);
-      }
-    } else {
-      int has_coeffs = kvz_quantize_residual(
-          state, cur_pu, width, COLOR_Y, scan_idx_luma,
-          0,
-          LCU_WIDTH, LCU_WIDTH,
-          base_y, recbase_y, recbase_y, orig_coeff_y
-      );
-      if (has_coeffs) {
-        cbf_set(&cur_pu->cbf, depth, COLOR_Y);
-      }
+    if (chroma && depth <= MAX_DEPTH) {
+      cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_U);
+      cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_V);
     }
-  }
-}
-
-
-void kvz_quantize_lcu_chroma_residual(encoder_state_t * const state, int32_t x, int32_t y, const uint8_t depth, cu_info_t *cur_cu, lcu_t* lcu)
-{
-  // we have 64>>depth transform size
-  const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) };
-  const int8_t width = LCU_WIDTH>>depth;
-  if (cur_cu == NULL) {
-    cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
-  }
-  
-  // Tell clang-analyzer what is up. For some reason it can't figure out from
-  // asserting just depth.
-  assert(width == 4 || width == 8 || width == 16 || width == 32 || width == 64);
-
-  // Split transform and increase depth
-  if (depth == 0 || cur_cu->tr_depth > depth) {
-    int offset = width / 2;
-    kvz_quantize_lcu_chroma_residual(state, x,          y,          depth+1, NULL, lcu);
-    kvz_quantize_lcu_chroma_residual(state, x + offset, y,          depth+1, NULL, lcu);
-    kvz_quantize_lcu_chroma_residual(state, x,          y + offset, depth+1, NULL, lcu);
-    kvz_quantize_lcu_chroma_residual(state, x + offset, y + offset, depth+1, NULL, lcu);
 
-    // Propagate coded block flags from child CUs to parent CU.
-    if (depth < MAX_DEPTH) {
-      uint16_t child_cbfs[3] = {
-        LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y         )->cbf,
-        LCU_GET_CU_AT_PX(lcu, lcu_px.x,          lcu_px.y + offset)->cbf,
-        LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf,
-      };
-      cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_U);
-      cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_V);
+  } else {
+    // Process a leaf TU.
+    if (luma) {
+      quantize_tr_residual(state, COLOR_Y, x, y, depth, cur_pu, lcu);
     }
-
-    return;
-  }
-
-  // If luma is 4x4, do chroma for the 8x8 luma area when handling the top
-  // left PU because the coordinates are correct.
-  if (depth <= MAX_DEPTH || (lcu_px.x % 8 == 0 && lcu_px.y % 8 == 0)) {
-    cbf_clear(&cur_cu->cbf, depth, COLOR_U);
-    cbf_clear(&cur_cu->cbf, depth, COLOR_V);
-
-    const int chroma_offset = lcu_px.x / 2 + lcu_px.y / 2 * LCU_WIDTH_C;
-    kvz_pixel *recbase_u = &lcu->rec.u[chroma_offset];
-    kvz_pixel *recbase_v = &lcu->rec.v[chroma_offset];
-    const kvz_pixel *base_u = &lcu->ref.u[chroma_offset];
-    const kvz_pixel *base_v = &lcu->ref.v[chroma_offset];
-    coeff_t *orig_coeff_u = &lcu->coeff.u[chroma_offset];
-    coeff_t *orig_coeff_v = &lcu->coeff.v[chroma_offset];
-    coeff_scan_order_t scan_idx_chroma;
-    int tr_skip = 0;
-    int chroma_depth = (depth == MAX_PU_DEPTH ? depth - 1 : depth);
-    int chroma_width = LCU_WIDTH_C >> chroma_depth;
-
-    scan_idx_chroma = kvz_get_scan_order(cur_cu->type, cur_cu->intra.mode_chroma, depth);
-
-    if (state->encoder_control->cfg.lossless) {
-      if (bypass_transquant(chroma_width,
-                            LCU_WIDTH_C, LCU_WIDTH_C,
-                            base_u, recbase_u,
-                            recbase_u, orig_coeff_u)) {
-        cbf_set(&cur_cu->cbf, depth, COLOR_U);
-      }
-      if (bypass_transquant(chroma_width,
-                            LCU_WIDTH_C, LCU_WIDTH_C,
-                            base_v, recbase_v,
-                            recbase_v, orig_coeff_v)) {
-        cbf_set(&cur_cu->cbf, depth, COLOR_V);
-      }
-      if (state->encoder_control->cfg.implicit_rdpcm && cur_cu->type == CU_INTRA) {
-        // implicit rdpcm for horizontal and vertical intra modes
-        if (cur_cu->intra.mode_chroma == 10) {
-          rdpcm(chroma_width, LCU_WIDTH_C, RDPCM_HOR, orig_coeff_u);
-          rdpcm(chroma_width, LCU_WIDTH_C, RDPCM_HOR, orig_coeff_v);
-
-        } else if (cur_cu->intra.mode_chroma == 26) {
-          rdpcm(chroma_width, LCU_WIDTH_C, RDPCM_VER, orig_coeff_u);
-          rdpcm(chroma_width, LCU_WIDTH_C, RDPCM_VER, orig_coeff_v);
-        }
-      }
-    } else {
-      if (kvz_quantize_residual(state, cur_cu, chroma_width, COLOR_U, scan_idx_chroma, tr_skip, LCU_WIDTH_C, LCU_WIDTH_C, base_u, recbase_u, recbase_u, orig_coeff_u)) {
-        cbf_set(&cur_cu->cbf, depth, COLOR_U);
-      }
-      if (kvz_quantize_residual(state, cur_cu, chroma_width, COLOR_V, scan_idx_chroma, tr_skip, LCU_WIDTH_C, LCU_WIDTH_C, base_v, recbase_v, recbase_v, orig_coeff_v)) {
-        cbf_set(&cur_cu->cbf, depth, COLOR_V);
-      }
+    if (chroma) {
+      quantize_tr_residual(state, COLOR_U, x, y, depth, cur_pu, lcu);
+      quantize_tr_residual(state, COLOR_V, x, y, depth, cur_pu, lcu);
     }
   }
 }
-

kvazaar-1.1.0.tar.gz/src/transform.h -> kvazaar-1.2.0.tar.gz/src/transform.h Changed

kvazaar-1.1.0.tar.gz/src/videoframe.c -> kvazaar-1.2.0.tar.gz/src/videoframe.c Changed

@@ -35,26 +35,13 @@
                                     int32_t height,
                                     enum kvz_chroma_format chroma_format)
 {
-  videoframe_t *frame = MALLOC(videoframe_t, 1);
-
+  videoframe_t *frame = calloc(1, sizeof(videoframe_t));
   if (!frame) return 0;
 
-  FILL(*frame, 0);
-
   frame->width  = width;
   frame->height = height;
-  frame->width_in_lcu  = frame->width / LCU_WIDTH;
-  if (frame->width_in_lcu * LCU_WIDTH < frame->width) frame->width_in_lcu++;
-  frame->height_in_lcu = frame->height / LCU_WIDTH;
-  if (frame->height_in_lcu * LCU_WIDTH < frame->height) frame->height_in_lcu++;
-
-  {
-    unsigned cu_array_width  = frame->width_in_lcu  * LCU_WIDTH;
-    unsigned cu_array_height = frame->height_in_lcu * LCU_WIDTH;
-    frame->cu_array = kvz_cu_array_alloc(cu_array_width, cu_array_height);
-  }
-
-  frame->coeff_y = NULL; frame->coeff_u = NULL; frame->coeff_v = NULL;
+  frame->width_in_lcu  = CEILDIV(frame->width,  LCU_WIDTH);
+  frame->height_in_lcu = CEILDIV(frame->height, LCU_WIDTH);
 
   frame->sao_luma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu);
   if (chroma_format != KVZ_CSP_400) {
@@ -76,11 +63,7 @@
   kvz_image_free(frame->rec);
   frame->rec = NULL;
 
-  kvz_cu_array_free(frame->cu_array);
-
-  FREE_POINTER(frame->coeff_y);
-  FREE_POINTER(frame->coeff_u);
-  FREE_POINTER(frame->coeff_v);
+  kvz_cu_array_free(&frame->cu_array);
 
   FREE_POINTER(frame->sao_luma);
   FREE_POINTER(frame->sao_chroma);
@@ -93,17 +76,3 @@
 void kvz_videoframe_set_poc(videoframe_t * const frame, const int32_t poc) {
   frame->poc = poc;
 }
-
-const cu_info_t* kvz_videoframe_get_cu_const(const videoframe_t * const frame,
-                                             unsigned int x_in_scu,
-                                             unsigned int y_in_scu)
-{
-  return kvz_cu_array_at_const(frame->cu_array, x_in_scu << 3, y_in_scu << 3);
-}
-
-cu_info_t* kvz_videoframe_get_cu(videoframe_t * const frame,
-                                 const unsigned int x_in_scu,
-                                 const unsigned int y_in_scu)
-{
-  return kvz_cu_array_at(frame->cu_array, x_in_scu << 3, y_in_scu << 3);
-}

kvazaar-1.1.0.tar.gz/src/videoframe.h -> kvazaar-1.2.0.tar.gz/src/videoframe.h Changed

kvazaar-1.1.0.tar.gz/tests/Makefile.am -> kvazaar-1.2.0.tar.gz/tests/Makefile.am Changed

@@ -1,9 +1,22 @@
 
-TESTS = $(check_PROGRAMS)
+TESTS = $(check_PROGRAMS) \
+    test_external_symbols.sh \
+    test_gop.sh \
+    test_interlace.sh \
+    test_intra.sh \
+    test_invalid_input.sh \
+    test_mv_constraint.sh \
+    test_owf_wpp_tiles.sh \
+    test_rate_control.sh \
+    test_slices.sh \
+    test_smp.sh \
+    test_tools.sh \
+    test_weird_shapes.sh
 
 check_PROGRAMS = kvazaar_tests
 
 kvazaar_tests_SOURCES = \
+	coeff_sum_tests.c \
 	dct_tests.c \
 	intra_sad_tests.c \
 	mv_cand_tests.c \
@@ -18,3 +31,15 @@
 kvazaar_tests_CFLAGS = -I$(srcdir) -I$(top_srcdir) -I$(top_srcdir)/src
 kvazaar_tests_LDFLAGS = -static $(top_builddir)/src/libkvazaar.la $(LIBS)
 
+# This makes sure that CXXLD gets defined.
+nodist_EXTRA_kvazaar_tests_SOURCES = cpp.cpp
+
+if USE_CRYPTOPP
+kvazaar_tests_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(kvazaar_tests_CFLAGS) $(CXXFLAGS) \
+	$(kvazaar_tests_LDFLAGS) $(LDFLAGS) -o $@
+else
+kvazaar_tests_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(kvazaar_tests_CFLAGS) $(CFLAGS) \
+	$(kvazaar_tests_LDFLAGS) $(LDFLAGS) -o $@
+endif

kvazaar-1.2.0.tar.gz/tests/coeff_sum_tests.c Added

@@ -0,0 +1,63 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2017 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 2.1 as
+ * published by the Free Software Foundation.
+ *
+ * Kvazaar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#include "greatest/greatest.h"
+
+#include "test_strategies.h"
+
+#include <string.h>
+
+static coeff_t coeff_test_data[64 * 64];
+static uint32_t expected_test_result;
+
+static void setup()
+{
+  // Fill test data.
+  coeff_t value = INT16_MIN;
+  for (int i = 0; i < 64 * 64; i++) {
+    coeff_test_data[i] = value;
+    value += 16;
+  }
+
+  // Calculate expected result using the formula for an arithmetic sum.
+  expected_test_result =
+    2048 * (16 - INT16_MIN) / 2 +
+    2048 * 2047 * 16 / 2;
+}
+
+TEST test_coeff_abs_sum()
+{
+  uint32_t sum = kvz_coeff_abs_sum(coeff_test_data, 64 * 64);
+  ASSERT_EQ(sum, expected_test_result);
+  PASS();
+}
+
+SUITE(coeff_sum_tests)
+{
+  setup();
+
+  for (volatile int i = 0; i < strategies.count; ++i) {
+    if (strcmp(strategies.strategies[i].type, "coeff_abs_sum") != 0) {
+      continue;
+    }
+
+    kvz_coeff_abs_sum = strategies.strategies[i].fptr;
+    RUN_TEST(test_coeff_abs_sum);
+  }
+}

kvazaar-1.1.0.tar.gz/tests/intra_sad_tests.c -> kvazaar-1.2.0.tar.gz/tests/intra_sad_tests.c Changed

kvazaar-1.1.0.tar.gz/tests/mv_cand_tests.c -> kvazaar-1.2.0.tar.gz/tests/mv_cand_tests.c Changed

@@ -31,22 +31,19 @@
     lcu.cu[i].type = CU_INTER;
   }
 
-  cu_info_t *mv_cand[5] = { NULL };
+  merge_candidates_t cand = { {0, 0}, {0, 0, 0}, 0, 0 };
+
   get_spatial_merge_candidates(64 + 32, 64, // x, y
                                32, 24,      // width, height
                                1920, 1080,  // picture size
-                               &mv_cand[0], // b0
-                               &mv_cand[1], // b1
-                               &mv_cand[2], // b2
-                               &mv_cand[3], // a0
-                               &mv_cand[4], // a1
-                               &lcu);
-
-  ASSERT_EQ(mv_cand[0], &lcu.cu[289]); // b0
-  ASSERT_EQ(mv_cand[1], &lcu.cu[ 16]); // b1
-  ASSERT_EQ(mv_cand[2], &lcu.cu[  8]); // b2
-  ASSERT_EQ(mv_cand[3], &lcu.cu[127]); // a0
-  ASSERT_EQ(mv_cand[4], &lcu.cu[110]); // a1
+                               &lcu,
+                               &cand);
+
+  ASSERT_EQ(cand.b[0], &lcu.cu[289]);
+  ASSERT_EQ(cand.b[1], &lcu.cu[ 16]);
+  ASSERT_EQ(cand.b[2], &lcu.cu[  8]);
+  ASSERT_EQ(cand.a[0], &lcu.cu[127]);
+  ASSERT_EQ(cand.a[1], &lcu.cu[110]);
 
   PASS();
 }

kvazaar-1.1.0.tar.gz/tests/sad_tests.c -> kvazaar-1.2.0.tar.gz/tests/sad_tests.c Changed

kvazaar-1.1.0.tar.gz/tests/satd_tests.c -> kvazaar-1.2.0.tar.gz/tests/satd_tests.c Changed

kvazaar-1.1.0.tar.gz/tests/speed_tests.c -> kvazaar-1.2.0.tar.gz/tests/speed_tests.c Changed

kvazaar-1.2.0.tar.gz/tests/test_external_symbols.sh Added

kvazaar-1.2.0.tar.gz/tests/test_gop.sh Added

kvazaar-1.2.0.tar.gz/tests/test_interlace.sh Added

kvazaar-1.2.0.tar.gz/tests/test_intra.sh Added

kvazaar-1.2.0.tar.gz/tests/test_invalid_input.sh Added

kvazaar-1.2.0.tar.gz/tests/test_mv_constraint.sh Added

kvazaar-1.2.0.tar.gz/tests/test_owf_wpp_tiles.sh Added

kvazaar-1.2.0.tar.gz/tests/test_rate_control.sh Added

kvazaar-1.2.0.tar.gz/tests/test_slices.sh Added

kvazaar-1.2.0.tar.gz/tests/test_smp.sh Added

kvazaar-1.1.0.tar.gz/tests/test_strategies.c -> kvazaar-1.2.0.tar.gz/tests/test_strategies.c Changed

kvazaar-1.2.0.tar.gz/tests/test_tools.sh Added

kvazaar-1.2.0.tar.gz/tests/test_weird_shapes.sh Added

kvazaar-1.1.0.tar.gz/tests/tests_main.c -> kvazaar-1.2.0.tar.gz/tests/tests_main.c Changed

kvazaar-1.2.0.tar.gz/tests/util.sh Added

@@ -0,0 +1,65 @@
+#!/bin/sh
+
+# Helper functions for test scripts.
+
+set -eu${BASH+o pipefail}
+
+# Temporary files for encoder input and output.
+yuvfile="$(mktemp)"
+hevcfile="$(mktemp)"
+
+cleanup() {
+    rm -rf "${yuvfile}" "${hevcfile}"
+}
+trap cleanup EXIT
+
+print_and_run() {
+    printf '\n\n$ %s\n' "$*"
+    "$@"
+}
+
+prepare() {
+    cleanup
+    print_and_run \
+        ffmpeg -f lavfi -i "mandelbrot=size=${1}" \
+            -vframes "${2}" -pix_fmt yuv420p -f yuv4mpegpipe \
+            "${yuvfile}"
+}
+
+valgrind_test() {
+    dimensions="$1"
+    shift
+    frames="$1"
+    shift
+
+    prepare "${dimensions}" "${frames}"
+
+    print_and_run \
+        libtool execute \
+            valgrind --leak-check=full --error-exitcode=1 -- \
+            ../src/kvazaar -i "${yuvfile}" "--input-res=${dimensions}" -o "${hevcfile}" "$@"
+
+    print_and_run \
+        TAppDecoderStatic -b "${hevcfile}"
+
+    cleanup
+}
+
+encode_test() {
+    dimensions="$1"
+    shift
+    frames="$1"
+    shift
+    expected_status="$1"
+    shift
+
+    prepare "${dimensions}" "${frames}"
+
+    set +e
+    print_and_run \
+        libtool execute \
+            ../src/kvazaar -i "${yuvfile}" "--input-res=${dimensions}" -o "${hevcfile}" "$@"
+    actual_status="$?"
+    set -e
+     ${actual_status} -eq ${expected_status} 
+}