kvazaar
Changes of Revision 15
kvazaar.changes
Changed
@@ -1,4 +1,34 @@ ------------------------------------------------------------------- +Wed Apr 22 16:16:28 UTC 2020 - Luigi Baldoni <aloisio@gmx.com> + +- Update to version 2.0.0 + * Several unmentioned improvements and fixes + Highlights: + * Updated presets + * Updated GOP definitions using QP offset model. + + There is now even longer hierarchical GOP --gop=16 + * Much faster and improved bipred + * Alternative and better rate control algorithm, optimal bit + allocation (--rc-algorithm oba) + * Variance adaptive quantization (--vaq) + Features: + * Option to set QP offset for intra frames (--intra-qp-offset, + automatical by default) + * Zero-coeff-rdo is now configurable (--zero-coeff-rdo) + * Optional intra frame analysis for rate control (--intra-bits) + * Optional machine learning based depth constraints for intra + search (--ml-pu-depth-intra) + * PU depths are now separately configurable for each GOP layer + User Interface: + * Report bitrate and some kind of (cumulative) average QP + Optimizations: + * More AVX2 opimizations for SAO + * More AVX2 opimizations for transforms + * More AVX2 opimizations for intra prediction + * AVX2 strategy for variance calculation +- Bump sover to 6 + +------------------------------------------------------------------- Tue Jul 9 20:15:25 UTC 2019 - Luigi Baldoni <aloisio@gmx.com> - Update to version 1.3.0
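The changelog above only names the new 2.0.0 options. The minimal sketch below (an illustration, not part of this revision) shows how the highlighted features could be enabled through kvazaar's public API; the option names and values come from the README and cfg.c changes further down, while the kvz_api/config_parse usage and the file name are assumptions based on kvazaar.h.

#include <kvazaar.h>
#include <stdio.h>

int main(void)
{
  const kvz_api *api = kvz_api_get(8);          /* 8-bit encoder build */
  kvz_config *cfg = api->config_alloc();
  if (!cfg || !api->config_init(cfg)) return 1;

  int ok = 1;
  ok &= api->config_parse(cfg, "gop", "16");               /* new B-frame pyramid of length 16 */
  ok &= api->config_parse(cfg, "vaq", "5");                /* variance adaptive quantization, recommended strength */
  ok &= api->config_parse(cfg, "intra-qp-offset", "auto"); /* QP offset derived from the GOP length */
  if (!ok) fprintf(stderr, "config_parse rejected an option\n");

  /* ... open the encoder with api->encoder_open(cfg) and feed frames ... */
  api->config_destroy(cfg);
  return ok ? 0 : 1;
}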
kvazaar.spec
Changed
@@ -1,7 +1,7 @@ # # spec file for package kvazaar # -# Copyright (c) 2019 Packman Team <packman@links2linux.de> +# Copyright (c) 2020 Packman Team <packman@links2linux.de> # Copyright (c) 2017 SUSE LINUX GmbH, Nuernberg, Germany. # # All modifications and additions to the file contributed by third parties @@ -18,9 +18,9 @@ %define libname libkvazaar -%define libmver 4 +%define libmver 6 Name: kvazaar -Version: 1.3.0 +Version: 2.0.0 Release: 0 Summary: HEVC encoder License: LGPL-2.1-or-later
kvazaar-1.3.0.tar.gz/.gitignore -> kvazaar-2.0.0.tar.gz/.gitignore
Changed
@@ -43,6 +43,9 @@ *.o *.trs .*.swp +.*.swo +.swp +.swo *.log .kdev4
kvazaar-1.3.0.tar.gz/.gitlab-ci.yml -> kvazaar-2.0.0.tar.gz/.gitlab-ci.yml
Changed
@@ -31,6 +31,8 @@ <<: *test-template variables: CFLAGS: '-fsanitize=thread' + # Temporarily suppress known errors or false positives. + TSAN_OPTIONS: 'suppressions=/builds/TIE/ultravideo/kvazaar/tests/tsan_suppressions.txt' test-ubsan: <<: *test-template @@ -45,3 +47,14 @@ KVAZAAR_OVERRIDE_sao_edge_ddistortion: generic KVAZAAR_OVERRIDE_calc_sao_edge_dir: generic KVZ_TEST_VALGRIND: 1 + +# TODO: If someone knows YAML better, there has to be some better way to +# concatenate stuff into the test-template script than just manually copy +# pasting +test-distcheck: + <<: *test-template + script: + - export PATH="${HOME}/bin:${PATH}" + - ./autogen.sh + - ./configure --enable-werror || (cat config.log && false) + - make --jobs=8 VERBOSE=1 distcheck
kvazaar-1.3.0.tar.gz/.travis-install.bash -> kvazaar-2.0.0.tar.gz/.travis-install.bash
Changed
@@ -6,12 +6,12 @@ mkdir -p "${HOME}/bin" -wget http://ultravideo.cs.tut.fi/ffmpeg-release-32bit-static.tar.xz +wget http://ultravideo.cs.tut.fi/ffmpeg-release-4.2.1-32bit-static.tar.xz sha256sum -c - << EOF -4d3302ba0415e08ca10ca578dcd1f0acc48fadc9b803718283c8c670350c903e ffmpeg-release-32bit-static.tar.xz +226f55f8a94d71f3d231a20fe59fcbb7f6100cabf663f9bcb887d17b332a91c5 ffmpeg-release-4.2.1-32bit-static.tar.xz EOF -tar xf ffmpeg-release-32bit-static.tar.xz -cp ffmpeg-2.6.3-32bit-static/ffmpeg "${HOME}/bin/ffmpeg" +tar xf ffmpeg-release-4.2.1-32bit-static.tar.xz +cp ffmpeg-4.2.1-i686-static/ffmpeg "${HOME}/bin/ffmpeg" chmod +x "${HOME}/bin/ffmpeg" wget http://ultravideo.cs.tut.fi/ubuntu-12.04-hmdec-16.10.tgz
kvazaar-1.3.0.tar.gz/.travis.yml -> kvazaar-2.0.0.tar.gz/.travis.yml
Changed
@@ -19,7 +19,7 @@ include: - compiler: clang - env: KVZ_TEST_VALGRIND=1 + env: KVZ_TEST_VALGRIND=1 KVAZAAR_OVERRIDE_angular_pred=generic - compiler: clang env: CFLAGS='-fsanitize=thread'
kvazaar-1.3.0.tar.gz/README.md -> kvazaar-2.0.0.tar.gz/README.md
Changed
@@ -22,6 +22,7 @@ - [Compiling Kvazaar](#compiling-kvazaar) - [Required libraries](#required-libraries) - [Autotools](#autotools) + - [Autotools on MinGW](#autotools-on-mingw) - [OS X](#os-x) - [Visual Studio](#visual-studio) - [Docker](#docker) @@ -113,11 +114,16 @@ - 0: Only send VPS with the first frame. - N: Send VPS with every Nth intra frame. -r, --ref <integer> : Number of reference frames, in range 1..15 [4] - --gop <string> : GOP structure [8] - - 0: Disabled - - 8: B-frame pyramid of length 8 - - lp-<string>: Low-delay P-frame GOP + --gop <string> : GOP structure [lp-g4d3t1] + - 0: Disabled + - 8: B-frame pyramid of length 8 + - 16: B-frame pyramid of length 16 + - lp-<string>: Low-delay P/B-frame GOP (e.g. lp-g8d4t2, see README) + --intra-qp-offset <int>: QP offset for intra frames [-51..51] [auto] + - N: Set QP offset to N. + - auto: Select offset automatically based + on GOP length. --(no-)open-gop : Use open GOP configuration. [enabled] --cqmfile <filename> : Read custom quantization matrices from a file. --scaling-list <string>: Set scaling list mode. [off] @@ -127,6 +133,15 @@ --bitrate <integer> : Target bitrate [0] - 0: Disable rate control. - N: Target N bits per second. + --rc-algorithm <string>: Select used rc-algorithm. [lambda] + - lambda: rate control from: + DOI: 10.1109/TIP.2014.2336550 + - oba: DOI: 10.1109/TCSVT.2016.2589878 + --(no-)intra-bits : Use Hadamard cost based allocation for intra + frames. Default on for gop 8 and off for lp-gop + --(no-)clip-neighbour : On oba based rate control whether to clip + lambda values to same frame's ctus or previous'. + Default on for RA GOPS and disabled for LP. --(no-)lossless : Use lossless coding. [disabled] --mv-constraint <string> : Constrain movement vectors. [none] - none: No constraint @@ -150,6 +165,9 @@ --high-tier : Used with --level. Use high tier bitrate limits instead of the main tier limits during encoding. High tier requires level 4 or higher. + --(no-)vaq <integer> : Enable variance adaptive quantization with given + strength, in range 1..20. Recommended: 5. + [disabled] Compression tools: --(no-)deblock <beta:tc> : Deblocking filter. [0:0] @@ -173,6 +191,8 @@ chroma mode search. --(no-)mv-rdo : Rate-distortion optimized motion vector costs [disabled] + --(no-)zero-coeff-rdo : If a CU is set inter, check if forcing zero + residual improves the RD cost. [enabled] --(no-)full-intra-search : Try all intra modes during rough search. [disabled] --(no-)transform-skip : Try transform skip [disabled] @@ -192,8 +212,19 @@ - 4: + 1/4-pixel diagonal --pu-depth-inter <int>-<int> : Inter prediction units sizes [0-3] - 0, 1, 2, 3: from 64x64 to 8x8 + - Accepts a list of values separated by ',' + for setting separate depths per GOP layer + (values can be omitted to use the first + value for the respective layer). --pu-depth-intra <int>-<int> : Intra prediction units sizes [1-4] - 0, 1, 2, 3, 4: from 64x64 to 4x4 + - Accepts a list of values separated by ',' + for setting separate depths per GOP layer + (values can be omitted to use the first + value for the respective layer). + --ml-pu-depth-intra : Predict the pu-depth-intra using machine + learning trees, overrides the + --pu-depth-intra parameter. [disabled] --tr-depth-intra <int> : Transform split depth for intra blocks [0] --(no-)bipred : Bi-prediction [disabled] --cu-split-termination <string> : CU split search termination [zero] @@ -246,6 +277,13 @@ - tiles: Put tiles in independent slices. - wpp: Put rows in dependent slices. - tiles+wpp: Do both. 
+ --partial-coding <x-offset>!<y-offset>!<slice-width>!<slice-height> + : Encode partial frame. + Parts must be merged to form a valid bitstream. + X and Y are CTU offsets. + Slice width and height must be divisible by CTU + in pixels unless it is the last CTU row/column. + This parameter is used by kvaShare. Video Usability Information: --sar <width:height> : Specify sample aspect ratio @@ -299,20 +337,20 @@ | | 0-uf | 1-sf | 2-vf | 3-fr | 4-f | 5-m | 6-s | 7-sr | 8-vs | 9-p | | -------------------- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | -| rd | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | +| rd | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 2 | 2 | | pu-depth-intra | 2-3 | 2-3 | 2-3 | 2-3 | 1-3 | 1-4 | 1-4 | 1-4 | 1-4 | 1-4 | -| pu-depth-inter | 2-3 | 2-3 | 1-3 | 1-3 | 1-3 | 0-3 | 0-3 | 0-3 | 0-3 | 0-3 | -| me | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | tz | -| gop | g4d4t1| g4d4t1| g4d4t1| g4d4t1| g4d4t1| 8 | 8 | 8 | 8 | 8 | +| pu-depth-inter | 1-2 | 1-2 | 1-3 | 1-3 | 1-3 | 0-3 | 0-3 | 0-3 | 0-3 | 0-3 | +| me | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | tz | tz | +| gop | 8 | 8 | 8 | 8 | 8 | 16 | 16 | 16 | 16 | 16 | | ref | 1 | 1 | 1 | 1 | 2 | 4 | 4 | 4 | 4 | 4 | -| bipred | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | +| bipred | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | | deblock | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | | signhide | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | -| subme | 2 | 2 | 2 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | +| subme | 0 | 2 | 2 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | | sao | off | full | full | full | full | full | full | full | full | full | | rdoq | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | | rdoq-skip | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| transform-skip | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| transform-skip | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | | mv-rdo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | full-intra-search | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | | smp | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | @@ -320,7 +358,7 @@ | cu-split-termination | zero | zero | zero | zero | zero | zero | zero | zero | zero | off | | me-early-termination | sens. | sens. | sens. | sens. | sens. | on | on | off | off | off | | intra-rdo-et | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| early-skip | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | +| early-skip | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | | fast-residual-cost | 28 | 28 | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | | max-merge | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | @@ -357,6 +395,12 @@ See `./configure --help` for more options. +### Autotools on MinGW +It is recommended to use Clang instead of GCC in MinGW environments. GCC also works, but AVX2 optimizations will be disabled because of a known GCC issue from 2012, so performance will suffer badly. Instead of `./configure`, run + + CC=clang ./configure + +to build Kvazaar using Clang. ### OS X - Install Homebrew @@ -365,7 +409,7 @@ ### Visual Studio -- At least VisualStudio 2015 is required. +- At least VisualStudio 2015.2 is required. - Project files can be found under build/. - Requires external [vsyasm.exe](http://yasm.tortall.net/Download.html) in %PATH%
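The per-GOP-layer PU depth lists documented in the README above are plain comma-separated strings. The following sketch shows how they could be passed through the config interface; the five-layer count matches the new GOP 16 structure, the helper name is illustrative, and the API usage is an assumption, only the option syntax comes from the README.

#include <kvazaar.h>

static int set_per_layer_depths(const kvz_api *api, kvz_config *cfg)
{
  int ok = 1;
  ok &= api->config_parse(cfg, "gop", "16");
  /* One min-max range per GOP layer; later layers get a narrower search. */
  ok &= api->config_parse(cfg, "pu-depth-inter", "0-3,1-3,1-3,2-3,2-3");
  /* Omitted entries (",,") fall back to the first range for that layer. */
  ok &= api->config_parse(cfg, "pu-depth-intra", "1-4,,2-3");
  return ok;
}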
kvazaar-1.3.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj -> kvazaar-2.0.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj
Changed
@@ -138,6 +138,7 @@ </ClCompile> </ItemDefinitionGroup> <ItemGroup> + <ClCompile Include="..\..\src\constraint.c" /> <ClCompile Include="..\..\src\extras\crypto.cpp" /> <ClCompile Include="..\..\src\extras\libmd5.c" /> <ClCompile Include="..\..\src\input_frame_buffer.c" /> @@ -159,6 +160,7 @@ <ClCompile Include="..\..\src\imagelist.c" /> <ClCompile Include="..\..\src\inter.c" /> <ClCompile Include="..\..\src\intra.c" /> + <ClCompile Include="..\..\src\ml_intra_cu_depth_pred.c" /> <ClCompile Include="..\..\src\nal.c" /> <ClCompile Include="..\..\src\rate_control.c" /> <ClCompile Include="..\..\src\rdo.c" /> @@ -199,9 +201,11 @@ <ClCompile Include="..\..\src\strategies\strategies-intra.c" /> <ClCompile Include="..\..\src\strategies\strategies-quant.c" /> <ClInclude Include="..\..\src\checkpoint.h" /> + <ClInclude Include="..\..\src\constraint.h" /> <ClInclude Include="..\..\src\cu.h" /> <ClInclude Include="..\..\src\extras\crypto.h" /> <ClInclude Include="..\..\src\extras\libmd5.h" /> + <ClInclude Include="..\..\src\gop.h" /> <ClInclude Include="..\..\src\image.h" /> <ClInclude Include="..\..\src\imagelist.h" /> <ClCompile Include="..\..\src\strategies\altivec\picture-altivec.c" /> @@ -259,6 +263,7 @@ <ClInclude Include="..\..\src\input_frame_buffer.h" /> <ClInclude Include="..\..\src\kvazaar_internal.h" /> <ClInclude Include="..\..\src\kvz_math.h" /> + <ClInclude Include="..\..\src\ml_intra_cu_depth_pred.h" /> <ClInclude Include="..\..\src\search_inter.h" /> <ClInclude Include="..\..\src\search_intra.h" /> <ClInclude Include="..\..\src\strategies\avx2\avx2_common_functions.h" />
kvazaar-1.3.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters -> kvazaar-2.0.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
Changed
@@ -52,6 +52,9 @@ <Filter Include="Threadwrapper"> <UniqueIdentifier>{f4abece9-e209-4817-a57e-c64ca7c5e05c}</UniqueIdentifier> </Filter> + <Filter Include="Constraint"> + <UniqueIdentifier>{895fc8cc-6f08-49a7-b377-b5c38a44d1b1}</UniqueIdentifier> + </Filter> </ItemGroup> <ItemGroup> <ClCompile Include="..\..\src\strategies\strategies-nal.c"> @@ -239,6 +242,12 @@ <ClCompile Include="..\..\src\threadwrapper\src\semaphore.cpp"> <Filter>Threadwrapper</Filter> </ClCompile> + <ClCompile Include="..\..\src\constraint.c"> + <Filter>Constraint</Filter> + </ClCompile> + <ClCompile Include="..\..\src\ml_intra_cu_depth_pred.c"> + <Filter>Constraint</Filter> + </ClCompile> </ItemGroup> <ItemGroup> <ClInclude Include="..\..\src\bitstream.h"> @@ -453,6 +462,15 @@ <ClInclude Include="..\..\src\threadwrapper\include\semaphore.h"> <Filter>Threadwrapper</Filter> </ClInclude> + <ClInclude Include="..\..\src\constraint.h"> + <Filter>Constraint</Filter> + </ClInclude> + <ClInclude Include="..\..\src\ml_intra_cu_depth_pred.h"> + <Filter>Constraint</Filter> + </ClInclude> + <ClInclude Include="..\..\src\gop.h"> + <Filter>Control</Filter> + </ClInclude> </ItemGroup> <ItemGroup> <YASM Include="..\..\src\extras\x86inc.asm">
kvazaar-1.3.0.tar.gz/configure.ac -> kvazaar-2.0.0.tar.gz/configure.ac
Changed
@@ -22,7 +22,7 @@ # - Increment when making new releases and major or minor was not changed since last release. # # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html -ver_major=4 +ver_major=6 ver_minor=2 ver_release=0 @@ -45,16 +45,31 @@ LT_INIT([win32-dll]) +AC_CANONICAL_HOST + +flag_gcc_on_mingw="false" +case x"${host_os}" in + x"cygwin"*|x"mingw"*) + if test x"${CC}" = x"gcc" ; then + flag_gcc_on_mingw="true" + fi +esac + AX_CHECK_COMPILE_FLAG([-maltivec],[flag_altivec="true"]) AX_CHECK_COMPILE_FLAG([-mavx2], [flag_avx2="true"]) AX_CHECK_COMPILE_FLAG([-msse4.1], [flag_sse4_1="true"]) AX_CHECK_COMPILE_FLAG([-msse2], [flag_sse2="true"]) AX_CHECK_COMPILE_FLAG([-mbmi], [flag_bmi="true"]) AX_CHECK_COMPILE_FLAG([-mabm], [flag_abm="true"]) +AX_CHECK_COMPILE_FLAG([-mpopcnt], [flag_popcnt="true"]) +AX_CHECK_COMPILE_FLAG([-mlzcnt], [flag_lzcnt="true"]) AX_CHECK_COMPILE_FLAG([-mbmi2], [flag_bmi2="true"]) +# Do we need -mpopcnt and -mlzcnt, or -mabm to use POPCNT and LZCNT +# instructions? Ask GCC and Clang, and they have different answers. AM_CONDITIONAL([HAVE_ALTIVEC], [test x"$flag_altivec" = x"true"]) -AM_CONDITIONAL([HAVE_AVX2], [test x"$flag_avx2" = x"true" -a x"$flag_bmi" = x"true" -a x"$flag_abm" = x"true" -a x"$flag_bmi2" = x"true"]) +AM_CONDITIONAL([HAVE_AVX2_GCC], [test x"$flag_avx2" = x"true" -a x"$flag_bmi" = x"true" -a x"$flag_abm" = x"true" -a x"$flag_bmi2" = x"true" -a x"$flag_gcc_on_mingw" = x"false"]) +AM_CONDITIONAL([HAVE_AVX2_CLANG], [test x"$flag_avx2" = x"true" -a x"$flag_bmi" = x"true" -a x"$flag_popcnt" = x"true" -a x"$flag_lzcnt" = x"true" -a x"$flag_bmi2" = x"true" -a x"$flag_gcc_on_mingw" = x"false"]) AM_CONDITIONAL([HAVE_SSE4_1], [test x"$flag_sse4_1" = x"true"]) AM_CONDITIONAL([HAVE_SSE2], [test x"$flag_sse2" = x"true"]) @@ -130,6 +145,15 @@ ] ) ], + [midipix*], [ + AS_IF( + [test "x$BITS" = "x32"], [ + ASFLAGS="$ASFLAGS -fwin32 -DPREFIX -DHAVE_ALIGNED_STACK=0" + ], [ + ASFLAGS="$ASFLAGS -fwin64 -DHAVE_ALIGNED_STACK=1" + ] + ) + ], [linux*|*kfreebsd*], [ ASFLAGS="$ASFLAGS -f elf$BITS" LDFLAGS="$LDFLAGS -Wl,-z,noexecstack"
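A small compile-only example (assumed, not from the patch; file and function names are illustrative) of what is behind the new HAVE_AVX2_GCC / HAVE_AVX2_CLANG split: the POPCNT and LZCNT intrinsics are covered by -mabm on GCC but need the explicit -mpopcnt and -mlzcnt flags on Clang, which is exactly what the two flag sets express.

/* gcc   -mavx2 -mbmi -mabm            -mbmi2 -c bitcount.c
 * clang -mavx2 -mbmi -mpopcnt -mlzcnt -mbmi2 -c bitcount.c
 */
#include <immintrin.h>
#include <stdint.h>

uint32_t count_set_bits(uint32_t v)      { return _mm_popcnt_u32(v); } /* POPCNT */
uint32_t count_leading_zeros(uint32_t v) { return _lzcnt_u32(v);     } /* LZCNT  */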
kvazaar-1.3.0.tar.gz/doc/kvazaar.1 -> kvazaar-2.0.0.tar.gz/doc/kvazaar.1
Changed
@@ -1,4 +1,4 @@ -.TH KVAZAAR "1" "July 2019" "kvazaar v1.3.0" "User Commands" +.TH KVAZAAR "1" "April 2020" "kvazaar v2.0.0" "User Commands" .SH NAME kvazaar \- open source HEVC encoder .SH SYNOPSIS @@ -115,20 +115,25 @@ Number of reference frames, in range 1..15 [4] .TP \fB\-\-gop <string> -GOP structure [8] - \- 0: Disabled - \- 8: B\-frame pyramid of length 8 - \- lp\-<string>: Low\-delay P\-frame GOP +GOP structure [lp\-g4d3t1] + \- 0: Disabled + \- 8: B\-frame pyramid of length 8 + \- 16: B\-frame pyramid of length 16 + \- lp\-<string>: Low\-delay P/B\-frame GOP (e.g. lp\-g8d4t2, see README) .TP -\fB\-\-(no\-)open\-gop +\fB\-\-intra\-qp\-offset <int>: QP offset for intra frames [\-51..51] [auto] + \- N: Set QP offset to N. + \- auto: Select offset automatically based + on GOP length. +.TP +\fB\-\-(no\-)open\-gop Use open GOP configuration. [enabled] .TP \fB\-\-cqmfile <filename> Read custom quantization matrices from a file. .TP -\fB\-\-scaling-list <string> -Set scaling list mode. [off] +\fB\-\-scaling\-list <string>: Set scaling list mode. [off] \- off: Disable scaling lists. \- custom: use custom list (with \-\-cqmfile). \- default: Use default lists. @@ -138,6 +143,20 @@ \- 0: Disable rate control. \- N: Target N bits per second. .TP +\fB\-\-rc\-algorithm <string>: Select used rc\-algorithm. [lambda] + \- lambda: rate control from: + DOI: 10.1109/TIP.2014.2336550 + \- oba: DOI: 10.1109/TCSVT.2016.2589878 +.TP +\fB\-\-(no\-)intra\-bits +Use Hadamard cost based allocation for intra +frames. Default on for gop 8 and off for lp\-gop +.TP +\fB\-\-(no\-)clip\-neighbour +On oba based rate control whether to clip +lambda values to same frame's ctus or previous'. +Default on for RA GOPS and disabled for LP. +.TP \fB\-\-(no\-)lossless Use lossless coding. [disabled] .TP @@ -176,6 +195,11 @@ Used with \-\-level. Use high tier bitrate limits instead of the main tier limits during encoding. High tier requires level 4 or higher. +.TP +\fB\-\-(no\-)vaq <integer> +Enable variance adaptive quantization with given +strength, in range 1..20. Recommended: 5. +[disabled] .SS "Compression tools:" .TP @@ -218,6 +242,10 @@ Rate\-distortion optimized motion vector costs [disabled] .TP +\fB\-\-(no\-)zero\-coeff\-rdo +If a CU is set inter, check if forcing zero +residual improves the RD cost. [enabled] +.TP \fB\-\-(no\-)full\-intra\-search Try all intra modes during rough search. [disabled] @@ -248,10 +276,23 @@ \fB\-\-pu\-depth\-inter <int>\-<int> Inter prediction units sizes [0\-3] \- 0, 1, 2, 3: from 64x64 to 8x8 + \- Accepts a list of values separated by ',' + for setting separate depths per GOP layer + (values can be omitted to use the first + value for the respective layer). .TP \fB\-\-pu\-depth\-intra <int>\-<int> Intra prediction units sizes [1\-4] \- 0, 1, 2, 3, 4: from 64x64 to 4x4 + \- Accepts a list of values separated by ',' + for setting separate depths per GOP layer + (values can be omitted to use the first + value for the respective layer). +.TP +\fB\-\-ml\-pu\-depth\-intra +Predict the pu\-depth\-intra using machine + learning trees, overrides the + \-\-pu\-depth\-intra parameter. [disabled] .TP \fB\-\-tr\-depth\-intra <int> Transform split depth for intra blocks [0] @@ -282,7 +323,8 @@ Try to find skip cu from merge candidates. Perform no further search if skip is found. For rd=0..1: Try the first candidate. -For rd=2.. : Try the best candidate based +For rd=2.. +Try the best candidate based on luma satd cost. 
[enabled] .TP \fB\-\-max\-merge <integer> @@ -336,6 +378,15 @@ \- tiles: Put tiles in independent slices. \- wpp: Put rows in dependent slices. \- tiles+wpp: Do both. +.TP +\fB\-\-partial\-coding <x\-offset>!<y\-offset>!<slice\-width>!<slice\-height> + +Encode partial frame. +Parts must be merged to form a valid bitstream. +X and Y are CTU offsets. +Slice width and height must be divisible by CTU +in pixels unless it is the last CTU row/column. +This parameter is used by kvaShare. .SS "Video Usability Information:" .TP
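The new --partial-coding argument packs four '!'-separated fields. Below is a hypothetical configuration sketch; the interpretation of the fields is an assumption pieced together from this help text and the cfg.c and SPS changes further down, where the first two fields are CTU offsets and the last two are written as the full picture dimensions. The helper name is illustrative.

#include <kvazaar.h>

static int configure_bottom_part(const kvz_api *api, kvz_config *cfg)
{
  /* Encode the part of a 1920x1088 picture that starts at CTU column 0,
   * CTU row 8 (luma row 512). The resulting bitstream part must later be
   * merged with the other parts to form a valid stream. */
  return api->config_parse(cfg, "partial-coding", "0!8!1920!1088");
}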
kvazaar-1.3.0.tar.gz/src/Makefile.am -> kvazaar-2.0.0.tar.gz/src/Makefile.am
Changed
@@ -53,6 +53,8 @@ checkpoint.h \ cfg.c \ cfg.h \ + constraint.c \ + constraint.h \ context.c \ context.h \ cu.c \ @@ -72,6 +74,7 @@ filter.c \ filter.h \ global.h \ + gop.h \ image.c \ image.h \ imagelist.c \ @@ -85,6 +88,8 @@ kvazaar.c \ kvazaar_internal.h \ kvz_math.h \ + ml_intra_cu_depth_pred.c \ + ml_intra_cu_depth_pred.h \ nal.c \ nal.h \ rate_control.c \ @@ -126,6 +131,9 @@ strategies/generic/sao-generic.h \ strategies/generic/encode_coding_tree-generic.c \ strategies/generic/encode_coding_tree-generic.h \ + strategies/missing-intel-intrinsics.h \ + strategies/optimized_sad_func_ptr_t.h \ + strategies/generic/sao_shared_generics.h \ strategies/strategies-common.h \ strategies/strategies-dct.c \ strategies/strategies-dct.h \ @@ -179,6 +187,7 @@ strategies/altivec/picture-altivec.h libavx2_la_SOURCES = \ + strategies/avx2/avx2_common_functions.h \ strategies/avx2/dct-avx2.c \ strategies/avx2/dct-avx2.h \ strategies/avx2/intra-avx2.c \ @@ -189,6 +198,7 @@ strategies/avx2/picture-avx2.h \ strategies/avx2/quant-avx2.c \ strategies/avx2/quant-avx2.h \ + strategies/avx2/reg_sad_pow2_widths-avx2.h \ strategies/avx2/sao-avx2.c \ strategies/avx2/sao-avx2.h \ strategies/avx2/encode_coding_tree-avx2.c \ @@ -200,7 +210,8 @@ libsse41_la_SOURCES = \ strategies/sse41/picture-sse41.c \ - strategies/sse41/picture-sse41.h + strategies/sse41/picture-sse41.h \ + strategies/sse41/reg_sad_pow2_widths-sse41.h if HAVE_PPC @@ -212,9 +223,12 @@ if HAVE_X86 -if HAVE_AVX2 +if HAVE_AVX2_GCC libavx2_la_CFLAGS = -mavx2 -mbmi -mabm -mbmi2 endif +if HAVE_AVX2_CLANG +libavx2_la_CFLAGS = -mavx2 -mbmi -mpopcnt -mlzcnt -mbmi2 +endif if HAVE_SSE4_1 libsse41_la_CFLAGS = -msse4.1 endif
kvazaar-1.3.0.tar.gz/src/cabac.c -> kvazaar-2.0.0.tar.gz/src/cabac.c
Changed
@@ -309,14 +309,14 @@ else if(r_param==2) { if( base_level ==1) { - uint32_t key = kvz_crypto_get_key(state->crypto_hdl, 2); + uint32_t key = kvz_crypto_get_key(state->crypto_hdl, 2); state->crypto_prev_pos = ( Suffix + ( state->crypto_prev_pos^key ) ) & 3; CABAC_BINS_EP(cabac, state->crypto_prev_pos, 2, "coeff_abs_level_remaining"); //m_pcBinIf->encodeBinsEP(m_prev_pos, 2); } else if( base_level ==2) { if(codeNumber<=7 || codeNumber>=12) { - uint32_t key = kvz_crypto_get_key(state->crypto_hdl, 2); + uint32_t key = kvz_crypto_get_key(state->crypto_hdl, 2); state->crypto_prev_pos = ( Suffix + ( state->crypto_prev_pos^key ) ) & 3; CABAC_BINS_EP(cabac, state->crypto_prev_pos, 2, "coeff_abs_level_remaining"); //m_pcBinIf->encodeBinsEP(m_prev_pos, 2); @@ -365,7 +365,7 @@ //m_pcBinIf->encodeBinsEP(m_prev_pos, 3); } else if(codeNumber<=21){ - uint32_t key = kvz_crypto_get_key(state->crypto_hdl, 1); + uint32_t key = kvz_crypto_get_key(state->crypto_hdl, 1); state->crypto_prev_pos = 4+(( (Suffix&1) + ( state->crypto_prev_pos^key )) & 1); CABAC_BINS_EP(cabac, state->crypto_prev_pos, 3, "coeff_abs_level_remaining"); //m_pcBinIf->encodeBinsEP(m_prev_pos, 3);
kvazaar-1.3.0.tar.gz/src/cabac.h -> kvazaar-2.0.0.tar.gz/src/cabac.h
Changed
@@ -106,9 +106,9 @@ void kvz_cabac_write_coeff_remain(cabac_data_t *cabac, uint32_t symbol, uint32_t r_param); void kvz_cabac_write_coeff_remain_encry(struct encoder_state_t * const state, cabac_data_t * const cabac, const uint32_t symbol, - const uint32_t r_param, int32_t base_level); + const uint32_t r_param, int32_t base_level); void kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data, - uint32_t symbol, uint32_t count); + uint32_t symbol, uint32_t count); void kvz_cabac_write_unary_max_symbol(cabac_data_t *data, cabac_ctx_t *ctx, uint32_t symbol, int32_t offset, uint32_t max_symbol);
kvazaar-1.3.0.tar.gz/src/cfg.c -> kvazaar-2.0.0.tar.gz/src/cfg.c
Changed
@@ -19,6 +19,7 @@ ****************************************************************************/ #include "cfg.h" +#include "gop.h" #include <limits.h> #include <stdio.h> @@ -40,6 +41,8 @@ cfg->framerate_num = 25; cfg->framerate_denom = 1; cfg->qp = 22; + cfg->intra_qp_offset = 0; + cfg->intra_qp_offset_auto = true; cfg->intra_period = 64; cfg->vps_period = 0; cfg->deblock_enable = 1; @@ -98,10 +101,14 @@ cfg->cpuid = 1; // Defaults for what sizes of PUs are tried. - cfg->pu_depth_inter.min = 2; // 0-3 - cfg->pu_depth_inter.max = 3; // 0-3 - cfg->pu_depth_intra.min = 2; // 0-4 - cfg->pu_depth_intra.max = 3; // 0-4 + memset( cfg->pu_depth_inter.min, -1, sizeof( cfg->pu_depth_inter.min ) ); + memset( cfg->pu_depth_inter.max, -1, sizeof( cfg->pu_depth_inter.max ) ); + memset( cfg->pu_depth_intra.min, -1, sizeof( cfg->pu_depth_intra.min ) ); + memset( cfg->pu_depth_intra.max, -1, sizeof( cfg->pu_depth_intra.max ) ); + *cfg->pu_depth_inter.min = 2; // 0-3 + *cfg->pu_depth_inter.max = 3; // 0-3 + *cfg->pu_depth_intra.min = 2; // 0-4 + *cfg->pu_depth_intra.max = 3; // 0-4 cfg->add_encoder_info = true; cfg->calc_psnr = true; @@ -136,11 +143,26 @@ cfg->me_max_steps = (uint32_t)-1; + cfg->vaq = 0; + cfg->scaling_list = KVZ_SCALING_LIST_OFF; cfg->max_merge = 5; cfg->early_skip = true; + cfg->ml_pu_depth_intra = false; + + cfg->partial_coding.startCTU_x = 0; + cfg->partial_coding.startCTU_y = 0; + cfg->partial_coding.fullWidth = 0; + cfg->partial_coding.fullHeight = 0; + + cfg->zero_coeff_rdo = true; + + cfg->rc_algorithm = KVZ_NO_RC; + cfg->intra_bit_allocation = false; + cfg->clip_neighbour = true; + return 1; } @@ -297,6 +319,45 @@ return 1; } +static int parse_pu_depth_list( const char *array, int32_t *depths_min, int32_t *depths_max, int size ) +{ + char *list = strdup( array ); + char *token; + int i = 0; + int ptr = -1; + int len = strlen( list ); + int retval = 1; + + //Reset depths in case multiple pu depth parameters are given + if(size > 1) memset( depths_max + 1, -1, (size - 1) * sizeof( *depths_max ) ); + if(size > 1) memset( depths_min + 1, -1, (size - 1) * sizeof( *depths_min ) ); + + token = strtok( list, "," ); + while( ptr < len && list[ptr + 1] == ',' ) + { + i++; + ptr++; + } + while( retval && token != NULL && i < size ) { + retval &= (sscanf( token, "%d-%d", &depths_min[i], &depths_max[i] ) == 2); + ptr += (retval ? 
4 : 0); + i++; + token = strtok( NULL, "," ); + while(ptr < len && list[ptr + 1] == ',' ){ + i++; + ptr++; + } + } + + if( i >= size && ( token != NULL ) ) { + fprintf( stderr, "parsing failed : too many values.\n" ); + retval = 0; + } + + free( list ); + return retval; +} + static int parse_slice_specification(const char* const arg, int32_t * const nslices, int32_t** const array) { const char* current_arg = NULL; int32_t current_value; @@ -386,19 +447,21 @@ static const char * const scaling_list_names[] = { "off", "custom", "default", NULL }; + static const char * const rc_algorithm_names[] = { "no-rc", "lambda", "oba", NULL }; static const char * const preset_values[11][25*2] = { + { "ultrafast", "rd", "0", "pu-depth-intra", "2-3", - "pu-depth-inter", "2-3", + "pu-depth-inter", "1-2", "me", "hexbs", - "gop", "lp-g4d4t1", + "gop", "8", "ref", "1", - "bipred", "0", + "bipred", "1", "deblock", "0:0", "signhide", "0", - "subme", "2", + "subme", "0", "sao", "off", "rdoq", "0", "rdoq-skip", "0", @@ -419,11 +482,11 @@ "superfast", "rd", "0", "pu-depth-intra", "2-3", - "pu-depth-inter", "2-3", + "pu-depth-inter", "1-2", "me", "hexbs", - "gop", "lp-g4d4t1", + "gop", "8", "ref", "1", - "bipred", "0", + "bipred", "1", "deblock", "0:0", "signhide", "0", "subme", "2", @@ -449,9 +512,9 @@ "pu-depth-intra", "2-3", "pu-depth-inter", "1-3", "me", "hexbs", - "gop", "lp-g4d4t1", + "gop", "8", "ref", "1", - "bipred", "0", + "bipred", "1", "deblock", "0:0", "signhide", "0", "subme", "2", @@ -477,9 +540,9 @@ "pu-depth-intra", "2-3", "pu-depth-inter", "1-3", "me", "hexbs", - "gop", "lp-g4d4t1", + "gop", "8", "ref", "1", - "bipred", "0", + "bipred", "1", "deblock", "0:0", "signhide", "0", "subme", "4", @@ -505,9 +568,9 @@ "pu-depth-intra", "1-3", "pu-depth-inter", "1-3", "me", "hexbs", - "gop", "lp-g4d4t1", + "gop", "8", "ref", "2", - "bipred", "0", + "bipred", "1", "deblock", "0:0", "signhide", "0", "subme", "4", @@ -533,9 +596,9 @@ "pu-depth-intra", "1-4", "pu-depth-inter", "0-3", "me", "hexbs", - "gop", "8", + "gop", "16", "ref", "4", - "bipred", "0", + "bipred", "1", "deblock", "0:0", "signhide", "0", "subme", "4", @@ -557,11 +620,11 @@ }, { "slow", - "rd", "0", + "rd", "1", "pu-depth-intra", "1-4", "pu-depth-inter", "0-3", "me", "hexbs", - "gop", "8", + "gop", "16", "ref", "4", "bipred", "1", "deblock", "0:0", @@ -589,7 +652,7 @@ "pu-depth-intra", "1-4", "pu-depth-inter", "0-3", "me", "hexbs", - "gop", "8", + "gop", "16", "ref", "4", "bipred", "1", "deblock", "0:0", @@ -616,8 +679,8 @@ "rd", "2", "pu-depth-intra", "1-4", "pu-depth-inter", "0-3", - "me", "hexbs", - "gop", "8", + "me", "tz", + "gop", "16", "ref", "4", "bipred", "1", "deblock", "0:0", @@ -626,7 +689,7 @@ "sao", "full", "rdoq", "1", "rdoq-skip", "0", - "transform-skip", "0", + "transform-skip", "1", "mv-rdo", "0", "full-intra-search", "0", "smp", "1", @@ -645,7 +708,7 @@ "pu-depth-intra", "1-4", "pu-depth-inter", "0-3", "me", "tz", - "gop", "8", + "gop", "16", "ref", "4", "bipred", "1", "deblock", "0:0", @@ -662,7 +725,7 @@ "cu-split-termination", "off", "me-early-termination", "off", "intra-rdo-et", "0", - "early-skip", "1", + "early-skip", "0", "fast-residual-cost", "0", "max-merge", "5", NULL @@ -892,9 +955,9 @@ else if OPT("cpuid") cfg->cpuid = atobool(value); else if OPT("pu-depth-inter") - return sscanf(value, "%d-%d", &cfg->pu_depth_inter.min, &cfg->pu_depth_inter.max) == 2; + return parse_pu_depth_list(value, cfg->pu_depth_inter.min, cfg->pu_depth_inter.max, KVZ_MAX_GOP_LAYERS); else if OPT("pu-depth-intra") - return sscanf(value, 
"%d-%d", &cfg->pu_depth_intra.min, &cfg->pu_depth_intra.max) == 2; + return parse_pu_depth_list(value, cfg->pu_depth_intra.min, cfg->pu_depth_intra.max, KVZ_MAX_GOP_LAYERS); else if OPT("info") cfg->add_encoder_info = atobool(value); else if OPT("gop") { @@ -928,41 +991,23 @@ cfg->gop_len = gop.g; cfg->gop_lp_definition.d = gop.d; cfg->gop_lp_definition.t = gop.t; + + cfg->intra_bit_allocation = true; + cfg->clip_neighbour = false; } else if (atoi(value) == 8) { cfg->gop_lowdelay = 0; - // GOP - cfg->gop_len = 8; - cfg->gop[0].poc_offset = 8; cfg->gop[0].qp_offset = 1; cfg->gop[0].layer = 1; cfg->gop[0].qp_factor = 0.442; cfg->gop[0].is_ref = 1; - cfg->gop[0].ref_pos_count = 0; - cfg->gop[0].ref_neg_count = 3; cfg->gop[0].ref_neg[0] = 8; cfg->gop[0].ref_neg[1] = 12; cfg->gop[0].ref_neg[2] = 16; - - cfg->gop[1].poc_offset = 4; cfg->gop[1].qp_offset = 2; cfg->gop[1].layer = 2; cfg->gop[1].qp_factor = 0.3536; cfg->gop[1].is_ref = 1; - cfg->gop[1].ref_neg_count = 2; cfg->gop[1].ref_neg[0] = 4; cfg->gop[1].ref_neg[1] = 8; - cfg->gop[1].ref_pos_count = 1; cfg->gop[1].ref_pos[0] = 4; - - cfg->gop[2].poc_offset = 2; cfg->gop[2].qp_offset = 3; cfg->gop[2].layer = 3; cfg->gop[2].qp_factor = 0.3536; cfg->gop[2].is_ref = 1; - cfg->gop[2].ref_neg_count = 2; cfg->gop[2].ref_neg[0] = 2; cfg->gop[2].ref_neg[1] = 6; - cfg->gop[2].ref_pos_count = 2; cfg->gop[2].ref_pos[0] = 2; cfg->gop[2].ref_pos[1] = 6; - - cfg->gop[3].poc_offset = 1; cfg->gop[3].qp_offset = 4; cfg->gop[3].layer = 4; cfg->gop[3].qp_factor = 0.68; cfg->gop[3].is_ref = 0; - cfg->gop[3].ref_neg_count = 1; cfg->gop[3].ref_neg[0] = 1; - cfg->gop[3].ref_pos_count = 3; cfg->gop[3].ref_pos[0] = 1; cfg->gop[3].ref_pos[1] = 3; cfg->gop[3].ref_pos[2] = 7; - - cfg->gop[4].poc_offset = 3; cfg->gop[4].qp_offset = 4; cfg->gop[4].layer = 4; cfg->gop[4].qp_factor = 0.68; cfg->gop[4].is_ref = 0; - cfg->gop[4].ref_neg_count = 2; cfg->gop[4].ref_neg[0] = 1; cfg->gop[4].ref_neg[1] = 3; - cfg->gop[4].ref_pos_count = 2; cfg->gop[4].ref_pos[0] = 1; cfg->gop[4].ref_pos[1] = 5; - - cfg->gop[5].poc_offset = 6; cfg->gop[5].qp_offset = 3; cfg->gop[5].layer = 3; cfg->gop[5].qp_factor = 0.3536; cfg->gop[5].is_ref = 1; - cfg->gop[5].ref_neg_count = 2; cfg->gop[5].ref_neg[0] = 2; cfg->gop[5].ref_neg[1] = 6; - cfg->gop[5].ref_pos_count = 1; cfg->gop[5].ref_pos[0] = 2; - - cfg->gop[6].poc_offset = 5; cfg->gop[6].qp_offset = 4; cfg->gop[6].layer = 4; cfg->gop[6].qp_factor = 0.68; cfg->gop[6].is_ref = 0; - cfg->gop[6].ref_neg_count = 2; cfg->gop[6].ref_neg[0] = 1; cfg->gop[6].ref_neg[1] = 5; - cfg->gop[6].ref_pos_count = 2; cfg->gop[6].ref_pos[0] = 1; cfg->gop[6].ref_pos[1] = 3; - - cfg->gop[7].poc_offset = 7; cfg->gop[7].qp_offset = 4; cfg->gop[7].layer = 4; cfg->gop[7].qp_factor = 0.68; cfg->gop[7].is_ref = 0; - cfg->gop[7].ref_neg_count = 3; cfg->gop[7].ref_neg[0] = 1; cfg->gop[7].ref_neg[1] = 3; cfg->gop[7].ref_neg[2] = 7; - cfg->gop[7].ref_pos_count = 1; cfg->gop[7].ref_pos[0] = 1; + cfg->gop_len = sizeof(kvz_gop_ra8) / sizeof(kvz_gop_ra8[0]); + memcpy(cfg->gop, kvz_gop_ra8, sizeof(kvz_gop_ra8)); + cfg->intra_bit_allocation = false; + cfg->clip_neighbour = true; + + } else if (atoi(value) == 16) { + cfg->gop_lowdelay = 0; + cfg->gop_len = sizeof(kvz_gop_ra16) / sizeof(kvz_gop_ra16[0]); + memcpy(cfg->gop, kvz_gop_ra16, sizeof(kvz_gop_ra16)); + cfg->intra_bit_allocation = false; + cfg->clip_neighbour = true; + } else if (atoi(value) == 0) { //Disable gop cfg->gop_len = 0; @@ -974,13 +1019,26 @@ return 0; } } + else if OPT("intra-qp-offset") { + cfg->intra_qp_offset = 
atoi(value); + if( cfg->intra_qp_offset == 0 && !strcmp( value, "auto" ) ) + { + cfg->intra_qp_offset_auto = true; + } else { + cfg->intra_qp_offset_auto = false; + } + } else if OPT("open-gop") { cfg->open_gop = (bool)atobool(value); } else if OPT("bipred") cfg->bipred = atobool(value); - else if OPT("bitrate") + else if OPT("bitrate") { cfg->target_bitrate = atoi(value); + if (!cfg->rc_algorithm) { + cfg->rc_algorithm = KVZ_LAMBDA; + } + } else if OPT("preset") { int preset_line = 0; @@ -1249,6 +1307,9 @@ } else if (OPT("fast-residual-cost")) cfg->fast_residual_cost_limit = atoi(value); + else if (OPT("vaq")) { + cfg->vaq = (int)atoi(value); + } else if (OPT("max-merge")) { int max_merge = atoi(value); if (max_merge < 1 || max_merge > 5) { @@ -1258,7 +1319,45 @@ cfg->max_merge = (uint8_t)max_merge; } else if OPT("early-skip") { - cfg->early_skip = (bool)atobool(value); + cfg->early_skip = (bool)atobool(value); + } + else if OPT("ml-pu-depth-intra") { + cfg->ml_pu_depth_intra = (bool)atobool(value); + } + else if OPT("partial-coding") { + uint32_t firstCTU_x; + uint32_t firstCTU_y; + uint32_t fullWidth; + uint32_t fullHeight; + if (4 != sscanf(value, "%u!%u!%u!%u", &firstCTU_x, + &firstCTU_y, &fullWidth, &fullHeight)) { + fprintf(stderr, "invalid partial-coding options. Expected \"%%u!%%u!%%u!%%u\", but got \"%s\"\n", value); + return 0; + } + cfg->partial_coding.startCTU_x = firstCTU_x; + cfg->partial_coding.startCTU_y = firstCTU_y; + cfg->partial_coding.fullWidth = fullWidth; + cfg->partial_coding.fullHeight = fullHeight; + } + else if OPT("zero-coeff-rdo") { + cfg->zero_coeff_rdo = (bool)atobool(value); + } + else if OPT("rc-algorithm") { + int8_t rc_algorithm = 0; + if (!parse_enum(value, rc_algorithm_names, &rc_algorithm)) { + fprintf(stderr, "Invalid rate control algorithm %s. 
Valid values include %s, %s, and %s\n", value, + rc_algorithm_names[0], + rc_algorithm_names[1], + rc_algorithm_names[2]); + return 0; + } + cfg->rc_algorithm = rc_algorithm; + } + else if OPT("intra-bits") { + cfg->intra_bit_allocation = atobool(value); + } + else if OPT("clip-neighbour") { + cfg->clip_neighbour = atobool(value); } else { return 0; @@ -1372,6 +1471,11 @@ { int error = 0; + if (cfg->vaq < 0) { + fprintf(stderr, "vaq strength must be positive\n"); + error = 1; + } + if (cfg->width <= 0) { fprintf(stderr, "Input error: width must be positive\n"); error = 1; @@ -1477,33 +1581,49 @@ error = 1; } + if (abs(cfg->intra_qp_offset) > 51) { + fprintf(stderr, "Input error: --intra-qp-offset out of range [-51..51]\n"); + error = 1; + } + if (cfg->target_bitrate < 0) { fprintf(stderr, "Input error: --bitrate must be nonnegative\n"); error = 1; } - if (!WITHIN(cfg->pu_depth_inter.min, PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX) || - !WITHIN(cfg->pu_depth_inter.max, PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX)) + for( size_t i = 0; i < KVZ_MAX_GOP_LAYERS; i++ ) { - fprintf(stderr, "Input error: illegal value for --pu-depth-inter (%d-%d)\n", - cfg->pu_depth_inter.min, cfg->pu_depth_inter.max); - error = 1; - } else if (cfg->pu_depth_inter.min > cfg->pu_depth_inter.max) { - fprintf(stderr, "Input error: Inter PU depth min (%d) > max (%d)\n", - cfg->pu_depth_inter.min, cfg->pu_depth_inter.max); - error = 1; - } + if( cfg->pu_depth_inter.min[i] < 0 || cfg->pu_depth_inter.max[i] < 0 ) continue; - if (!WITHIN(cfg->pu_depth_intra.min, PU_DEPTH_INTRA_MIN, PU_DEPTH_INTRA_MAX) || - !WITHIN(cfg->pu_depth_intra.max, PU_DEPTH_INTRA_MIN, PU_DEPTH_INTRA_MAX)) - { - fprintf(stderr, "Input error: illegal value for --pu-depth-intra (%d-%d)\n", - cfg->pu_depth_intra.min, cfg->pu_depth_intra.max); - error = 1; - } else if (cfg->pu_depth_intra.min > cfg->pu_depth_intra.max) { - fprintf(stderr, "Input error: Intra PU depth min (%d) > max (%d)\n", - cfg->pu_depth_intra.min, cfg->pu_depth_intra.max); - error = 1; + if( !WITHIN( cfg->pu_depth_inter.min[i], PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX ) || + !WITHIN( cfg->pu_depth_inter.max[i], PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX ) ) + { + fprintf( stderr, "Input error: illegal value for --pu-depth-inter (%d-%d)\n", + cfg->pu_depth_inter.min[i], cfg->pu_depth_inter.max[i] ); + error = 1; + } + else if( cfg->pu_depth_inter.min[i] > cfg->pu_depth_inter.max[i] ) + { + fprintf( stderr, "Input error: Inter PU depth min (%d) > max (%d)\n", + cfg->pu_depth_inter.min[i], cfg->pu_depth_inter.max[i] ); + error = 1; + } + + if( cfg->pu_depth_intra.min[i] < 0 || cfg->pu_depth_intra.max[i] < 0 ) continue; + + if( !WITHIN( cfg->pu_depth_intra.min[i], PU_DEPTH_INTRA_MIN, PU_DEPTH_INTRA_MAX ) || + !WITHIN( cfg->pu_depth_intra.max[i], PU_DEPTH_INTRA_MIN, PU_DEPTH_INTRA_MAX ) ) + { + fprintf( stderr, "Input error: illegal value for --pu-depth-intra (%d-%d)\n", + cfg->pu_depth_intra.min[i], cfg->pu_depth_intra.max[i] ); + error = 1; + } + else if( cfg->pu_depth_intra.min[i] > cfg->pu_depth_intra.max[i] ) + { + fprintf( stderr, "Input error: Intra PU depth min (%d) > max (%d)\n", + cfg->pu_depth_intra.min[i], cfg->pu_depth_intra.max[i] ); + error = 1; + } } // Tile separation should be at round position in terms of LCU, should be monotonic, and should not start by 0 @@ -1572,6 +1692,16 @@ error = 1; } + if(cfg->target_bitrate > 0 && cfg->rc_algorithm == KVZ_NO_RC) { + fprintf(stderr, "Bitrate set but rc-algorithm is turned off.\n"); + error = 1; + } + + if(cfg->target_bitrate == 0 && 
cfg->rc_algorithm != KVZ_NO_RC) { + fprintf(stderr, "Rate control algorithm set but bitrate not set.\n"); + error = 1; + } + return !error; }
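The cfg.c changes above couple --bitrate and --rc-algorithm: setting only a bitrate now selects the existing lambda-domain rate control, while naming an algorithm without a bitrate is rejected during validation ("Rate control algorithm set but bitrate not set."). A minimal sketch of both paths follows, assuming the public config interface; the function name is illustrative.

#include <kvazaar.h>

static void demo_rc_coupling(const kvz_api *api)
{
  kvz_config *cfg = api->config_alloc();
  api->config_init(cfg);

  /* Bitrate alone: rc_algorithm is still KVZ_NO_RC at this point, so the
   * option handler promotes it to KVZ_LAMBDA. */
  api->config_parse(cfg, "bitrate", "2000000");

  /* Explicitly choosing the new OBA algorithm is also valid here because a
   * nonzero bitrate has already been set; without one, validation fails. */
  api->config_parse(cfg, "rc-algorithm", "oba");

  api->config_destroy(cfg);
}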
kvazaar-1.3.0.tar.gz/src/cli.c -> kvazaar-2.0.0.tar.gz/src/cli.c
Changed
@@ -133,10 +133,22 @@ { "set-qp-in-cu", no_argument, NULL, 0 }, { "open-gop", no_argument, NULL, 0 }, { "no-open-gop", no_argument, NULL, 0 }, + { "vaq", required_argument, NULL, 0 }, + { "no-vaq", no_argument, NULL, 0 }, { "scaling-list", required_argument, NULL, 0 }, { "max-merge", required_argument, NULL, 0 }, { "early-skip", no_argument, NULL, 0 }, { "no-early-skip", no_argument, NULL, 0 }, + { "ml-pu-depth-intra", no_argument, NULL, 0 }, + { "partial-coding", required_argument, NULL, 0 }, + { "zero-coeff-rdo", no_argument, NULL, 0 }, + { "no-zero-coeff-rdo", no_argument, NULL, 0 }, + { "intra-qp-offset", required_argument, NULL, 0 }, + { "rc-algorithm", required_argument, NULL, 0 }, + { "intra-bits", no_argument, NULL, 0 }, + { "no-intra-bits", no_argument, NULL, 0 }, + { "clip-neighbour", no_argument, NULL, 0 }, + { "no-clip-neighbour", no_argument, NULL, 0 }, {0, 0, 0, 0} }; @@ -396,11 +408,16 @@ " - 0: Only send VPS with the first frame.\n" " - N: Send VPS with every Nth intra frame.\n" " -r, --ref <integer> : Number of reference frames, in range 1..15 [4]\n" - " --gop <string> : GOP structure [8]\n" - " - 0: Disabled\n" - " - 8: B-frame pyramid of length 8\n" - " - lp-<string>: Low-delay P-frame GOP\n" + " --gop <string> : GOP structure [lp-g4d3t1]\n" + " - 0: Disabled\n" + " - 8: B-frame pyramid of length 8\n" + " - 16: B-frame pyramid of length 16\n" + " - lp-<string>: Low-delay P/B-frame GOP\n" " (e.g. lp-g8d4t2, see README)\n" + " --intra-qp-offset <int>: QP offset for intra frames [-51..51] [auto]\n" + " - N: Set QP offset to N.\n" + " - auto: Select offset automatically based\n" + " on GOP length.\n" " --(no-)open-gop : Use open GOP configuration. [enabled]\n" " --cqmfile <filename> : Read custom quantization matrices from a file.\n" " --scaling-list <string>: Set scaling list mode. [off]\n" @@ -410,6 +427,15 @@ " --bitrate <integer> : Target bitrate [0]\n" " - 0: Disable rate control.\n" " - N: Target N bits per second.\n" + " --rc-algorithm <string>: Select used rc-algorithm. [lambda]\n" + " - lambda: rate control from:\n" + " DOI: 10.1109/TIP.2014.2336550 \n" + " - oba: DOI: 10.1109/TCSVT.2016.2589878\n" + " --(no-)intra-bits : Use Hadamard cost based allocation for intra\n" + " frames. Default on for gop 8 and off for lp-gop\n" + " --(no-)clip-neighbour : On oba based rate control whether to clip \n" + " lambda values to same frame's ctus or previous'.\n" + " Default on for RA GOPS and disabled for LP.\n" " --(no-)lossless : Use lossless coding. [disabled]\n" " --mv-constraint <string> : Constrain movement vectors. [none]\n" " - none: No constraint\n" @@ -433,6 +459,9 @@ " --high-tier : Used with --level. Use high tier bitrate limits\n" " instead of the main tier limits during encoding.\n" " High tier requires level 4 or higher.\n" + " --(no-)vaq <integer> : Enable variance adaptive quantization with given\n" + " strength, in range 1..20. Recommended: 5.\n" + " [disabled]\n" "\n" /* Word wrap to this width to stay under 80 characters (including ") *************/ "Compression tools:\n" @@ -457,6 +486,8 @@ " chroma mode search.\n" " --(no-)mv-rdo : Rate-distortion optimized motion vector costs\n" " [disabled]\n" + " --(no-)zero-coeff-rdo : If a CU is set inter, check if forcing zero\n" + " residual improves the RD cost. 
[enabled]\n" " --(no-)full-intra-search : Try all intra modes during rough search.\n" " [disabled]\n" " --(no-)transform-skip : Try transform skip [disabled]\n" @@ -476,8 +507,19 @@ " - 4: + 1/4-pixel diagonal\n" " --pu-depth-inter <int>-<int> : Inter prediction units sizes [0-3]\n" " - 0, 1, 2, 3: from 64x64 to 8x8\n" + " - Accepts a list of values separated by ','\n" + " for setting separate depths per GOP layer\n" + " (values can be omitted to use the first\n" + " value for the respective layer).\n" " --pu-depth-intra <int>-<int> : Intra prediction units sizes [1-4]\n" " - 0, 1, 2, 3, 4: from 64x64 to 4x4\n" + " - Accepts a list of values separated by ','\n" + " for setting separate depths per GOP layer\n" + " (values can be omitted to use the first\n" + " value for the respective layer).\n" + " --ml-pu-depth-intra : Predict the pu-depth-intra using machine\n" + " learning trees, overrides the\n" + " --pu-depth-intra parameter. [disabled]\n" " --tr-depth-intra <int> : Transform split depth for intra blocks [0]\n" " --(no-)bipred : Bi-prediction [disabled]\n" " --cu-split-termination <string> : CU split search termination [zero]\n" @@ -531,6 +573,13 @@ " - tiles: Put tiles in independent slices.\n" " - wpp: Put rows in dependent slices.\n" " - tiles+wpp: Do both.\n" + " --partial-coding <x-offset>!<y-offset>!<slice-width>!<slice-height>\n" + " : Encode partial frame.\n" + " Parts must be merged to form a valid bitstream.\n" + " X and Y are CTU offsets.\n" + " Slice width and height must be divisible by CTU\n" + " in pixels unless it is the last CTU row/column.\n" + " This parameter is used by kvaShare.\n" "\n" /* Word wrap to this width to stay under 80 characters (including ") *************/ "Video Usability Information:\n" @@ -564,13 +613,16 @@ void print_frame_info(const kvz_frame_info *const info, const double frame_psnr[3], const uint32_t bytes, - const bool print_psnr) + const bool print_psnr, + const double avg_qp) { - fprintf(stderr, "POC %4d QP %2d (%c-frame) %10d bits", + fprintf(stderr, "POC %4d QP %2d AVG QP %.1f (%c-frame) %10d bits", info->poc, info->qp, + avg_qp, "BPI"[info->slice_type % 3], bytes << 3); + if (print_psnr) { fprintf(stderr, " PSNR Y %2.4f U %2.4f V %2.4f", frame_psnr[0], frame_psnr[1], frame_psnr[2]);
kvazaar-1.3.0.tar.gz/src/cli.h -> kvazaar-2.0.0.tar.gz/src/cli.h
Changed
@@ -58,6 +58,7 @@ void print_frame_info(const kvz_frame_info *const info, const double frame_psnr[3], const uint32_t bytes, - const bool print_psnr); + const bool print_psnr, + const double avg_qp); #endif
kvazaar-2.0.0.tar.gz/src/constraint.c
Added
@@ -0,0 +1,59 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "constraint.h" + + /** + * \brief Allocate the constraint_t structure. + * + * \param state encoder state + * \return the pointer of constraint_t structure + */ +void * kvz_init_constraint(encoder_state_t* state, const encoder_control_t * const encoder) { + constraint_t* constr = NULL; + // Allocate the constraint_t strucutre + constr = MALLOC(constraint_t, 1); + if (!constr) { + fprintf(stderr, "Memory allocation failed!\n"); + assert(0); + } + + // Allocate the ml_intra_ctu_pred_t structure + constr->ml_intra_depth_ctu = NULL; + if (encoder->cfg.ml_pu_depth_intra) // TODO: Change this by a new param !! + { + constr->ml_intra_depth_ctu = kvz_init_ml_intra_depth_const(); + } + return constr; +} + +/** + * \brief Deallocate the constraint_t structure. + * + * \param state encoder state + */ +void kvz_constraint_free(encoder_state_t* state) { + constraint_t* constr = state->constraint; + if (constr->ml_intra_depth_ctu) + { + kvz_end_ml_intra_depth_const(constr->ml_intra_depth_ctu); + } + FREE_POINTER(constr); +}
kvazaar-2.0.0.tar.gz/src/constraint.h
Added
@@ -0,0 +1,40 @@ +#ifndef CONSTRAINT_H_ +#define CONSTRAINT_H_ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "ml_intra_cu_depth_pred.h" +#include "encoderstate.h" + + + /* Constraint structure: + * Each field corresponds to a constraint technique. The encoder tests if the constraint + * pointer is allocated to apply the technique. + */ +typedef struct { + // Structure used for the CTU depth prediction using Machine Learning in All Intra + ml_intra_ctu_pred_t * ml_intra_depth_ctu; +} constraint_t; + + +void * kvz_init_constraint(encoder_state_t* state, const encoder_control_t * const); +void kvz_constraint_free(encoder_state_t* state); + +#endif \ No newline at end of file
kvazaar-1.3.0.tar.gz/src/encmain.c -> kvazaar-2.0.0.tar.gz/src/encmain.c
Changed
@@ -305,6 +305,10 @@ } while (picture_written); } +static double calc_avg_qp(uint64_t qp_sum, uint32_t frames_done) +{ + return (double)qp_sum / (double)frames_done; +} /** * \brief Program main function. @@ -432,6 +436,7 @@ uint64_t bitstream_length = 0; uint32_t frames_done = 0; double psnr_sum[3] = { 0.0, 0.0, 0.0 }; + uint64_t qp_sum = 0; // how many bits have been written this second? used for checking if framerate exceeds level's limits uint64_t bits_this_second = 0; @@ -597,12 +602,15 @@ opts->config->height); } + qp_sum += info_out.qp; frames_done += 1; + psnr_sum[0] += frame_psnr[0]; psnr_sum[1] += frame_psnr[1]; psnr_sum[2] += frame_psnr[2]; - print_frame_info(&info_out, frame_psnr, len_out, encoder->cfg.calc_psnr); + print_frame_info(&info_out, frame_psnr, len_out, encoder->cfg.calc_psnr, + calc_avg_qp(qp_sum, frames_done)); } api->picture_free(cur_in_img); @@ -632,12 +640,38 @@ fprintf(stderr, " Total CPU time: %.3f s.\n", ((float)(clock() - start_time)) / CLOCKS_PER_SEC); { + const double mega = (double)(1 << 20); + double encoding_time = ( (double)(encoding_end_cpu_time - encoding_start_cpu_time) ) / (double) CLOCKS_PER_SEC; double wall_time = KVZ_CLOCK_T_AS_DOUBLE(encoding_end_real_time) - KVZ_CLOCK_T_AS_DOUBLE(encoding_start_real_time); - fprintf(stderr, " Encoding time: %.3f s.\n", encoding_time); + + double encoding_cpu = 100.0 * encoding_time / wall_time; + double encoding_fps = (double)frames_done / wall_time; + + double n_bits = (double)(bitstream_length * 8); + double sf_num = (double)encoder->cfg.framerate_num; + double sf_den = (double)encoder->cfg.framerate_denom; + double sequence_fps = sf_num / sf_den; + + double sequence_t = (double)frames_done / sequence_fps; + double bitrate_bps = (double)n_bits / sequence_t; + double bitrate_mbps = bitrate_bps / mega; + + double avg_qp = calc_avg_qp(qp_sum, frames_done); + +#ifdef _WIN32 + if (encoding_cpu > 100.0) { + encoding_cpu = 100.0; + } +#endif + fprintf(stderr, " Encoding time: %.3f s.\n", encoding_time); fprintf(stderr, " Encoding wall time: %.3f s.\n", wall_time); - fprintf(stderr, " Encoding CPU usage: %.2f%%\n", encoding_time/wall_time*100.f); - fprintf(stderr, " FPS: %.2f\n", ((double)frames_done)/wall_time); + + fprintf(stderr, " Encoding CPU usage: %.2f%%\n", encoding_cpu); + fprintf(stderr, " FPS: %.2f\n", encoding_fps); + + fprintf(stderr, " Bitrate: %.3f Mbps\n", bitrate_mbps); + fprintf(stderr, " AVG QP: %.1f\n", avg_qp); } pthread_join(input_thread, NULL); }
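The summary statistics added to encmain.c above are simple averages. The standalone arithmetic below reproduces them for a hypothetical run (600 frames at 29.97 fps, 1.5 MB of bitstream; the numbers are made up for illustration) using the same 2^20 divisor as the patch.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
  const uint32_t frames_done = 600;            /* frames encoded           */
  const uint64_t qp_sum = 600 * 27;            /* running sum of frame QPs */
  const uint64_t bitstream_length = 1500000;   /* bytes written            */
  const double fps = 30000.0 / 1001.0;         /* framerate_num / denom    */

  double avg_qp = (double)qp_sum / (double)frames_done;            /* 27.0  */
  double seconds = (double)frames_done / fps;                      /* ~20.0 */
  double bitrate_mbps = (double)(bitstream_length * 8) / seconds / (double)(1 << 20);

  printf("AVG QP %.1f, bitrate %.3f Mbps\n", avg_qp, bitrate_mbps);
  return 0;
}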
kvazaar-1.3.0.tar.gz/src/encode_coding_tree.c -> kvazaar-2.0.0.tar.gz/src/encode_coding_tree.c
Changed
@@ -269,6 +269,8 @@ if (state->must_code_qp_delta) { const int qp_pred = kvz_get_cu_ref_qp(state, x_cu, y_cu, state->last_qp); const int qp_delta = cur_cu->qp - qp_pred; + assert(KVZ_BIT_DEPTH == 8 && "This range applies only to 8-bit encoding."); + assert(qp_delta >= -26 && qp_delta <= 25 && "QP delta not in valid range [-26, 25]."); // This range applies only to 8-bit encoding const int qp_delta_abs = ABS(qp_delta); cabac_data_t* cabac = &state->cabac;
kvazaar-1.3.0.tar.gz/src/encoder.c -> kvazaar-2.0.0.tar.gz/src/encoder.c
Changed
@@ -27,7 +27,9 @@ #include <stdlib.h> #include "cfg.h" +#include "gop.h" #include "strategyselector.h" +#include "kvz_math.h" /** @@ -233,10 +235,26 @@ if (encoder->cfg.gop_len > 0) { if (encoder->cfg.gop_lowdelay) { - kvz_config_process_lp_gop(&encoder->cfg); + if (encoder->cfg.gop_len == 4 && encoder->cfg.ref_frames == 4) { + memcpy(encoder->cfg.gop, kvz_gop_lowdelay4, sizeof(kvz_gop_lowdelay4)); + } else { + kvz_config_process_lp_gop(&encoder->cfg); + } } + } + + if( encoder->cfg.intra_qp_offset_auto ) { + encoder->cfg.intra_qp_offset = encoder->cfg.gop_len > 1 ? -kvz_math_ceil_log2( encoder->cfg.gop_len ) + 1 : 0; + } + + // Disable GOP and QP offset for all-intra coding + if (encoder->cfg.intra_period == 1) { + encoder->cfg.gop_len = 0; + encoder->cfg.intra_qp_offset = 0; } + encoder->poc_lsb_bits = MAX(4, kvz_math_ceil_log2(encoder->cfg.gop_len * 2 + 1)); + encoder->max_inter_ref_lcu.right = 1; encoder->max_inter_ref_lcu.down = 1; @@ -332,7 +350,9 @@ } encoder->target_avg_bpp = encoder->target_avg_bppic / encoder->in.pixels_per_pic; - if (!encoder_control_init_gop_layer_weights(encoder)) { + if (encoder->cfg.target_bitrate > 0 && + !encoder_control_init_gop_layer_weights(encoder)) + { goto init_failed; } @@ -356,7 +376,7 @@ // for SMP and AMP partition units. encoder->tr_depth_inter = 0; - if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu) { + if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu || encoder->cfg.vaq) { encoder->max_qp_delta_depth = 0; } else { encoder->max_qp_delta_depth = -1; @@ -592,11 +612,16 @@ #endif //KVZ_DEBUG } - assert(WITHIN(encoder->cfg.pu_depth_inter.min, PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX)); - assert(WITHIN(encoder->cfg.pu_depth_inter.max, PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX)); - assert(WITHIN(encoder->cfg.pu_depth_intra.min, PU_DEPTH_INTRA_MIN, PU_DEPTH_INTRA_MAX)); - assert(WITHIN(encoder->cfg.pu_depth_intra.max, PU_DEPTH_INTRA_MIN, PU_DEPTH_INTRA_MAX)); + for( size_t i = 0; i < KVZ_MAX_GOP_LAYERS; i++ ) + { + if( encoder->cfg.pu_depth_inter.min[i] < 0 || cfg->pu_depth_inter.max[i] < 0 ) continue; + assert( WITHIN( encoder->cfg.pu_depth_inter.min[i], PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX ) ); + assert( WITHIN( encoder->cfg.pu_depth_inter.max[i], PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX ) ); + if( encoder->cfg.pu_depth_intra.min[i] < 0 || cfg->pu_depth_intra.max[i] < 0 ) continue; + assert( WITHIN( encoder->cfg.pu_depth_intra.min[i], PU_DEPTH_INTRA_MIN, PU_DEPTH_INTRA_MAX ) ); + assert( WITHIN( encoder->cfg.pu_depth_intra.max[i], PU_DEPTH_INTRA_MIN, PU_DEPTH_INTRA_MAX ) ); + } // Disable in-loop filters, sign hiding and transform skip when using // lossless coding. if (encoder->cfg.lossless) { @@ -722,7 +747,8 @@ * \return 1 on success, 0 on failure. * * Selects appropriate weights for layers according to the target bpp. - * Only GOP structures with exactly four layers are supported. + * Only GOP structures with exactly four layers are supported with the. + * exception of experimental GOP 16. 
*/ static int encoder_control_init_gop_layer_weights(encoder_control_t * const encoder) { @@ -795,10 +821,33 @@ } } break; - + case 5: + if(!encoder->cfg.gop_lowdelay) { + // These are obtained by running HM with RA GOP 16 collecting the ratio of bits spent for each + // layer from the CTC sequences and then fitting power curve + encoder->gop_layer_weights[0] = 13.0060187535 * pow(encoder->target_avg_bpp, -0.3727651453); + encoder->gop_layer_weights[1] = 7.3654107392 * pow(encoder->target_avg_bpp, -0.0854329266); + encoder->gop_layer_weights[2] = 3.6563990701 * pow(encoder->target_avg_bpp, -0.0576990493); + encoder->gop_layer_weights[3] = 2.1486937288 * pow(encoder->target_avg_bpp, -0.0155389471); + encoder->gop_layer_weights[4] = 1; + } + else { + fprintf(stderr, "Unsupported amount of layers (%d) for lowdelay GOP\n", num_layers); + return 0; + } + break; default: - fprintf(stderr, "Unsupported number of GOP layers (%d)\n", num_layers); - return 0; + if (!encoder->cfg.gop_lowdelay && encoder->cfg.gop_len == 16) { + fprintf(stdout, + "Rate control: Using experimental weights for GOP layers (%d)\n", + num_layers); + for (int i = 0; i < MAX_GOP_LAYERS; ++i) { + encoder->gop_layer_weights[i] = (i == 0) ? 10 : 2; + } + } else { + fprintf(stderr, "Unsupported number of GOP layers (%d)\n", num_layers); + return 0; + } } // Normalize weights so that the sum of weights in a GOP is one.
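Two small derived values appear in the encoder.c diff above: the automatic intra QP offset and the number of POC LSB bits signalled in the SPS. The sketch below evaluates both formulas for the new GOP 16; the local ceil_log2 is a stand-in for kvz_math_ceil_log2, assumed to compute the ceiling of log2(x).

#include <stdio.h>

static int ceil_log2(unsigned x)       /* stand-in for kvz_math_ceil_log2 */
{
  int bits = 0;
  while ((1u << bits) < x) bits++;
  return bits;
}

int main(void)
{
  const int gop_len = 16;
  int intra_qp_offset = gop_len > 1 ? -ceil_log2(gop_len) + 1 : 0;  /* -3 */
  int poc_lsb_bits = ceil_log2(gop_len * 2 + 1);                    /*  6 */
  if (poc_lsb_bits < 4) poc_lsb_bits = 4;                           /* MAX(4, ...) */

  printf("intra_qp_offset=%d poc_lsb_bits=%d\n", intra_qp_offset, poc_lsb_bits);
  return 0;
}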
kvazaar-1.3.0.tar.gz/src/encoder.h -> kvazaar-2.0.0.tar.gz/src/encoder.h
Changed
@@ -55,7 +55,7 @@ int32_t width_in_lcu; int32_t height_in_lcu; int32_t real_width; /*!< \brief real input picture width */ - int32_t real_height; /*!< \brief real input picture width */ + int32_t real_height; /*!< \brief real input picture height */ int64_t pixels_per_pic; int8_t source_scan_type; } in; @@ -133,6 +133,8 @@ int down; } max_inter_ref_lcu; + int32_t poc_lsb_bits; + } encoder_control_t; encoder_control_t* kvz_encoder_control_init(const kvz_config *cfg);
View file
kvazaar-1.3.0.tar.gz/src/encoder_state-bitstream.c -> kvazaar-2.0.0.tar.gz/src/encoder_state-bitstream.c
Changed
@@ -39,6 +39,7 @@ #include "tables.h" #include "threadqueue.h" #include "videoframe.h" +#include "rate_control.h" static void encoder_state_write_bitstream_aud(encoder_state_t * const state) @@ -346,8 +347,14 @@ WRITE_U(stream, 0, 1, "separate_colour_plane_flag"); } - WRITE_UE(stream, encoder->in.width, "pic_width_in_luma_samples"); - WRITE_UE(stream, encoder->in.height, "pic_height_in_luma_samples"); + if (encoder->cfg.partial_coding.fullWidth != 0) { + WRITE_UE(stream, encoder->cfg.partial_coding.fullWidth, "pic_width_in_luma_samples"); + WRITE_UE(stream, encoder->cfg.partial_coding.fullHeight, "pic_height_in_luma_samples"); + } + else { + WRITE_UE(stream, encoder->in.width, "pic_width_in_luma_samples"); + WRITE_UE(stream, encoder->in.height, "pic_height_in_luma_samples"); + } if (encoder->in.width != encoder->in.real_width || encoder->in.height != encoder->in.real_height) { // The standard does not seem to allow setting conf_win values such that @@ -371,18 +378,22 @@ WRITE_UE(stream, encoder->bitdepth-8, "bit_depth_luma_minus8"); WRITE_UE(stream, encoder->bitdepth-8, "bit_depth_chroma_minus8"); - WRITE_UE(stream, 1, "log2_max_pic_order_cnt_lsb_minus4"); + WRITE_UE(stream, encoder->poc_lsb_bits - 4, "log2_max_pic_order_cnt_lsb_minus4"); + WRITE_U(stream, 0, 1, "sps_sub_layer_ordering_info_present_flag"); //for each layer if (encoder->cfg.gop_lowdelay) { - WRITE_UE(stream, encoder->cfg.ref_frames, "sps_max_dec_pic_buffering"); - WRITE_UE(stream, 0, "sps_num_reorder_pics"); + const int dpb = encoder->cfg.ref_frames; + WRITE_UE(stream, dpb - 1, "sps_max_dec_pic_buffering_minus1"); + WRITE_UE(stream, 0, "sps_max_num_reorder_pics"); } else { - WRITE_UE(stream, encoder->cfg.ref_frames + encoder->cfg.gop_len, "sps_max_dec_pic_buffering"); - WRITE_UE(stream, encoder->cfg.gop_len, "sps_num_reorder_pics"); + // Clip to non-negative values to prevent problems with GOP=0 + const int dpb = MIN(16, encoder->cfg.gop_len); + WRITE_UE(stream, MAX(dpb - 1, 0), "sps_max_dec_pic_buffering_minus1"); + WRITE_UE(stream, MAX(encoder->cfg.gop_len - 1, 0), "sps_max_num_reorder_pics"); } - WRITE_UE(stream, 0, "sps_max_latency_increase"); + WRITE_UE(stream, 0, "sps_max_latency_increase_plus1"); //end for WRITE_UE(stream, MIN_SIZE-3, "log2_min_coding_block_size_minus3"); @@ -709,16 +720,18 @@ if (state->frame->pictype != KVZ_NAL_IDR_W_RADL && state->frame->pictype != KVZ_NAL_IDR_N_LP) { + const int poc_lsb = state->frame->poc & ((1 << encoder->poc_lsb_bits) - 1); + WRITE_U(stream, poc_lsb, encoder->poc_lsb_bits, "pic_order_cnt_lsb"); + int last_poc = 0; int poc_shift = 0; - WRITE_U(stream, state->frame->poc&0x1f, 5, "pic_order_cnt_lsb"); - WRITE_U(stream, 0, 1, "short_term_ref_pic_set_sps_flag"); - WRITE_UE(stream, ref_negative, "num_negative_pics"); - WRITE_UE(stream, ref_positive, "num_positive_pics"); - for (j = 0; j < ref_negative; j++) { + WRITE_U(stream, 0, 1, "short_term_ref_pic_set_sps_flag"); + WRITE_UE(stream, ref_negative, "num_negative_pics"); + WRITE_UE(stream, ref_positive, "num_positive_pics"); + for (j = 0; j < ref_negative; j++) { int8_t delta_poc = 0; - + if (encoder->cfg.gop_len) { int8_t found = 0; do { @@ -832,6 +845,11 @@ printf("=========== Slice ===========\n"); #endif + if (encoder->cfg.partial_coding.fullWidth != 0) { + state->slice->start_in_rs = encoder->cfg.partial_coding.startCTU_x + + CEILDIV(encoder->cfg.partial_coding.fullWidth, 64) * encoder->cfg.partial_coding.startCTU_y; + } + bool first_slice_segment_in_pic = (state->slice->start_in_rs == 0); if 
((state->encoder_control->cfg.slices & KVZ_SLICES_WPP) && state->wfrow->lcu_offset_y > 0) @@ -854,6 +872,9 @@ } int lcu_cnt = encoder->in.width_in_lcu * encoder->in.height_in_lcu; + if (encoder->cfg.partial_coding.fullWidth != 0) { + lcu_cnt = CEILDIV(encoder->cfg.partial_coding.fullWidth, 64) * CEILDIV(encoder->cfg.partial_coding.fullHeight, 64); + } int num_bits = kvz_math_ceil_log2(lcu_cnt); int slice_start_rs = state->slice->start_in_rs; if (state->encoder_control->cfg.slices & KVZ_SLICES_WPP) { @@ -1043,8 +1064,11 @@ state->frame->total_bits_coded = state->previous_encoder_state->frame->total_bits_coded; } state->frame->total_bits_coded += newpos - curpos; + if(state->encoder_control->cfg.rc_algorithm == KVZ_OBA) { + kvz_update_after_picture(state); + } - state->frame->cur_gop_bits_coded = state->previous_encoder_state->frame->cur_gop_bits_coded; + state->frame->cur_gop_bits_coded = state->previous_encoder_state->frame->cur_gop_bits_coded; state->frame->cur_gop_bits_coded += newpos - curpos; }
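The slice header now writes pic_order_cnt_lsb with encoder->poc_lsb_bits bits instead of a fixed 5, matching the log2_max_pic_order_cnt_lsb_minus4 value written in the SPS. A quick standalone sketch of the wrap-around behaviour, assuming poc_lsb_bits = MAX(4, ceil_log2(gop_len * 2 + 1)) as set in encoder.c; ceil_log2 below is a local stand-in for kvz_math_ceil_log2 and the GOP length is an example value.

#include <stdio.h>

/* Stand-in for kvz_math_ceil_log2: smallest b with 2^b >= v. */
static int ceil_log2(unsigned v)
{
  int bits = 0;
  while ((1u << bits) < v) bits++;
  return bits;
}

int main(void)
{
  const int gop_len = 16;
  int poc_lsb_bits = ceil_log2(gop_len * 2 + 1);      /* ceil(log2(33)) = 6 */
  if (poc_lsb_bits < 4) poc_lsb_bits = 4;

  printf("log2_max_pic_order_cnt_lsb_minus4 = %d\n", poc_lsb_bits - 4);
  for (int poc = 60; poc < 68; poc++) {
    int poc_lsb = poc & ((1 << poc_lsb_bits) - 1);    /* wraps at 2^poc_lsb_bits */
    printf("poc %d -> pic_order_cnt_lsb %d\n", poc, poc_lsb);
  }
  return 0;
}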
View file
kvazaar-1.3.0.tar.gz/src/encoder_state-ctors_dtors.c -> kvazaar-2.0.0.tar.gz/src/encoder_state-ctors_dtors.c
Changed
@@ -34,6 +34,7 @@ #include "kvazaar.h" #include "threadqueue.h" #include "videoframe.h" +#include "rate_control.h" static int encoder_state_config_frame_init(encoder_state_t * const state) { @@ -46,15 +47,39 @@ state->frame->num = 0; state->frame->poc = 0; state->frame->total_bits_coded = 0; + state->frame->cur_frame_bits_coded = 0; state->frame->cur_gop_bits_coded = 0; state->frame->prepared = 0; state->frame->done = 1; + state->frame->rc_alpha = 3.2003; state->frame->rc_beta = -1.367; + state->frame->icost = 0; const encoder_control_t * const encoder = state->encoder_control; const int num_lcus = encoder->in.width_in_lcu * encoder->in.height_in_lcu; - state->frame->lcu_stats = MALLOC(lcu_stats_t, num_lcus); + state->frame->lcu_stats = calloc(num_lcus, sizeof(lcu_stats_t)); + state->frame->aq_offsets = MALLOC(double, num_lcus); + + for (int y = 0; y < encoder->in.height_in_lcu; y++) { + for (int x = 0; x < encoder->in.width_in_lcu; x++) { + int temp = MIN(encoder->cfg.width - x * 64, 64) * MIN(encoder->cfg.height - y * 64, 64); + state->frame->lcu_stats[x + y * encoder->in.width_in_lcu].pixels = temp; + } + } + + state->frame->c_para = malloc(sizeof(double) * num_lcus); + if(state->frame->c_para == NULL) { + return 0; + } + state->frame->k_para = malloc(sizeof(double) * num_lcus); + if (state->frame->k_para == NULL) { + return 0; + } + + pthread_mutex_init(&state->frame->rc_lock, NULL); + + state->frame->new_ratecontrol = kvz_get_rc_data(NULL); return 1; } @@ -62,8 +87,13 @@ static void encoder_state_config_frame_finalize(encoder_state_t * const state) { if (state->frame == NULL) return; + pthread_mutex_destroy(&state->frame->rc_lock); + if (state->frame->c_para) FREE_POINTER(state->frame->c_para); + if (state->frame->k_para) FREE_POINTER(state->frame->k_para); + kvz_image_list_destroy(state->frame->ref); FREE_POINTER(state->frame->lcu_stats); + FREE_POINTER(state->frame->aq_offsets); } static int encoder_state_config_tile_init(encoder_state_t * const state, @@ -348,7 +378,9 @@ if (!child_state->slice) child_state->slice = parent_state->slice; if (!child_state->wfrow) child_state->wfrow = parent_state->wfrow; } - + // Intialization of the constraint structure + child_state->constraint = kvz_init_constraint(child_state->constraint, child_state->encoder_control); + kvz_bitstream_init(&child_state->stream); // Set CABAC output bitstream @@ -681,7 +713,7 @@ for (i = 0; state->children[i].encoder_control; ++i) { kvz_encoder_state_finalize(&state->children[i]); } - + FREE_POINTER(state->children); } @@ -706,6 +738,11 @@ FREE_POINTER(state->frame); } + if (state->constraint) { + // End of the constraint structure + kvz_constraint_free(state); + } + kvz_bitstream_finalize(&state->stream); kvz_threadqueue_free_job(&state->tqj_recon_done);
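The per-LCU pixel count filled in above is simply the LCU area clipped to the frame, which lets the rate control weight partial LCUs at the right and bottom edges correctly. A small standalone sketch with a made-up frame size:

#include <stdio.h>

#define LCU_WIDTH 64
#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
  const int width = 1280, height = 720;   /* hypothetical frame size */
  const int w_lcu = (width  + LCU_WIDTH - 1) / LCU_WIDTH;
  const int h_lcu = (height + LCU_WIDTH - 1) / LCU_WIDTH;

  /* Same expression as in encoder_state_config_frame_init() above. */
  int full   = MIN(width - 0 * LCU_WIDTH, LCU_WIDTH) *
               MIN(height - 0 * LCU_WIDTH, LCU_WIDTH);
  int bottom = MIN(width - 0 * LCU_WIDTH, LCU_WIDTH) *
               MIN(height - (h_lcu - 1) * LCU_WIDTH, LCU_WIDTH);

  printf("%dx%d -> %dx%d LCUs, interior LCU %d px, bottom-row LCU %d px\n",
         width, height, w_lcu, h_lcu, full, bottom);
  return 0;
}

For 1280x720 the bottom LCU row is only 16 luma rows tall, so those LCUs cover 1024 pixels instead of 4096.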
View file
kvazaar-1.3.0.tar.gz/src/encoder_state-ctors_dtors.h -> kvazaar-2.0.0.tar.gz/src/encoder_state-ctors_dtors.h
Changed
@@ -27,7 +27,8 @@ */ #include "global.h" // IWYU pragma: keep - +#include "ml_intra_cu_depth_pred.h" +#include "constraint.h" // Forward declare because including the header would lead to a cyclic // dependency.
View file
kvazaar-1.3.0.tar.gz/src/encoderstate.c -> kvazaar-2.0.0.tar.gz/src/encoderstate.c
Changed
@@ -37,6 +37,8 @@ #include "tables.h" #include "threadqueue.h" +#include "strategies/strategies-picture.h" + int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) { int i; @@ -616,7 +618,17 @@ const encoder_control_t * const encoder = state->encoder_control; videoframe_t* const frame = state->tile->frame; - kvz_set_lcu_lambda_and_qp(state, lcu->position); + switch (encoder->cfg.rc_algorithm) { + case KVZ_NO_RC: + case KVZ_LAMBDA: + kvz_set_lcu_lambda_and_qp(state, lcu->position); + break; + case KVZ_OBA: + kvz_set_ctu_qp_lambda(state, lcu->position); + break; + default: + assert(0); + } lcu_coeff_t coeff; state->coeff = &coeff; @@ -702,9 +714,27 @@ } } + pthread_mutex_lock(&state->frame->rc_lock); const uint32_t bits = kvz_bitstream_tell(&state->stream) - existing_bits; + state->frame->cur_frame_bits_coded += bits; + // This variable is used differently by intra and inter frames and shouldn't + // be touched in intra frames here + state->frame->remaining_weight -= !state->frame->is_irap ? + kvz_get_lcu_stats(state, lcu->position.x, lcu->position.y)->original_weight : + 0; + pthread_mutex_unlock(&state->frame->rc_lock); kvz_get_lcu_stats(state, lcu->position.x, lcu->position.y)->bits = bits; + uint8_t not_skip = false; + for(int y = 0; y < 64 && !not_skip; y+=8) { + for(int x = 0; x < 64 && !not_skip; x+=8) { + not_skip |= !kvz_cu_array_at_const(state->tile->frame->cu_array, + lcu->position_px.x + x, + lcu->position_px.y + y)->skipped; + } + } + kvz_get_lcu_stats(state, lcu->position.x, lcu->position.y)->skipped = !not_skip; + //Wavefronts need the context to be copied to the next row if (state->type == ENCODER_STATE_TYPE_WAVEFRONT_ROW && lcu->index == 1) { int j; @@ -803,6 +833,11 @@ } kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_jobs[dep_lcu->id]); + //TODO: Preparation for the lock free implementation of the new rc + if (ref_state->frame->slicetype == KVZ_SLICE_I && ref_state->frame->num != 0 && state->encoder_control->cfg.owf > 1 && true) { + kvz_threadqueue_job_dep_add(job[0], ref_state->previous_encoder_state->tile->wf_jobs[dep_lcu->id]); + } + // Very spesific bug that happens when owf length is longer than the // gop length. Takes care of that. if(!state->encoder_control->cfg.gop_lowdelay && @@ -1163,6 +1198,12 @@ kvz_threadqueue_free_job(&state->tqj_bitstream_written); kvz_threadqueue_free_job(&state->tqj_recon_done); + //Copy the constraint pointer + // TODO: Try to do it in the if (state->is_leaf) + //if (state->parent != NULL) { + // state->constraint = state->parent->constraint; + //} + for (int i = 0; state->children[i].encoder_control; ++i) { encoder_state_init_children(&state->children[i]); } @@ -1184,6 +1225,21 @@ } } +// Check if lcu is edge lcu. Return false if frame dimensions are 64 divisible +static bool edge_lcu(int id, int lcus_x, int lcus_y, bool xdiv64, bool ydiv64) +{ + if (xdiv64 && ydiv64) { + return false; + } + int last_row_first_id = (lcus_y - 1) * lcus_x; + if ((id % lcus_x == lcus_x - 1 && !xdiv64) || (id >= last_row_first_id && !ydiv64)) { + return true; + } + else { + return false; + } +} + static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_picture* frame) { assert(state->type == ENCODER_STATE_TYPE_MAIN); @@ -1197,11 +1253,108 @@ state->tile->frame->height ); + // Variance adaptive quantization + if (cfg->vaq) { + const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; + double d = cfg->vaq * 0.1; // Empirically decided constant. 
Affects delta-QP strength + + // Calculate frame pixel variance + uint32_t len = state->tile->frame->width * state->tile->frame->height; + uint32_t c_len = len / 4; + double frame_var = kvz_pixel_var(state->tile->frame->source->y, len); + if (has_chroma) { + frame_var += kvz_pixel_var(state->tile->frame->source->u, c_len); + frame_var += kvz_pixel_var(state->tile->frame->source->v, c_len); + } + + // Loop through LCUs + // For each LCU calculate: D * (log(LCU pixel variance) - log(frame pixel variance)) + unsigned x_lim = state->tile->frame->width_in_lcu; + unsigned y_lim = state->tile->frame->height_in_lcu; + + unsigned id = 0; + for (int y = 0; y < y_lim; ++y) { + for (int x = 0; x < x_lim; ++x) { + kvz_pixel tmp[LCU_LUMA_SIZE]; + int pxl_x = x * LCU_WIDTH; + int pxl_y = y * LCU_WIDTH; + int x_max = MIN(pxl_x + LCU_WIDTH, frame->width) - pxl_x; + int y_max = MIN(pxl_y + LCU_WIDTH, frame->height) - pxl_y; + + bool xdiv64 = false; + bool ydiv64 = false; + if (frame->width % 64 == 0) xdiv64 = true; + if (frame->height % 64 == 0) ydiv64 = true; + + // Luma variance + if (!edge_lcu(id, x_lim, y_lim, xdiv64, ydiv64)) { + kvz_pixels_blit(&state->tile->frame->source->y[pxl_x + pxl_y * state->tile->frame->source->stride], tmp, + x_max, y_max, state->tile->frame->source->stride, LCU_WIDTH); + } else { + // Extend edge pixels for edge lcus + for (int y = 0; y < LCU_WIDTH; y++) { + for (int x = 0; x < LCU_WIDTH; x++) { + int src_y = CLIP(0, frame->height - 1, pxl_y + y); + int src_x = CLIP(0, frame->width - 1, pxl_x + x); + tmp[y * LCU_WIDTH + x] = state->tile->frame->source->y[src_y * state->tile->frame->source->stride + src_x]; + } + } + } + + double lcu_var = kvz_pixel_var(tmp, LCU_LUMA_SIZE); + + if (has_chroma) { + // Add chroma variance if not monochrome + int32_t c_stride = state->tile->frame->source->stride >> 1; + kvz_pixel chromau_tmp[LCU_CHROMA_SIZE]; + kvz_pixel chromav_tmp[LCU_CHROMA_SIZE]; + int lcu_chroma_width = LCU_WIDTH >> 1; + int c_pxl_x = x * lcu_chroma_width; + int c_pxl_y = y * lcu_chroma_width; + int c_x_max = MIN(c_pxl_x + lcu_chroma_width, frame->width >> 1) - c_pxl_x; + int c_y_max = MIN(c_pxl_y + lcu_chroma_width, frame->height >> 1) - c_pxl_y; + + if (!edge_lcu(id, x_lim, y_lim, xdiv64, ydiv64)) { + kvz_pixels_blit(&state->tile->frame->source->u[c_pxl_x + c_pxl_y * c_stride], chromau_tmp, c_x_max, c_y_max, c_stride, lcu_chroma_width); + kvz_pixels_blit(&state->tile->frame->source->v[c_pxl_x + c_pxl_y * c_stride], chromav_tmp, c_x_max, c_y_max, c_stride, lcu_chroma_width); + } + else { + for (int y = 0; y < lcu_chroma_width; y++) { + for (int x = 0; x < lcu_chroma_width; x++) { + int src_y = CLIP(0, (frame->height >> 1) - 1, c_pxl_y + y); + int src_x = CLIP(0, (frame->width >> 1) - 1, c_pxl_x + x); + chromau_tmp[y * lcu_chroma_width + x] = state->tile->frame->source->u[src_y * c_stride + src_x]; + chromav_tmp[y * lcu_chroma_width + x] = state->tile->frame->source->v[src_y * c_stride + src_x]; + } + } + } + lcu_var += kvz_pixel_var(chromau_tmp, LCU_CHROMA_SIZE); + lcu_var += kvz_pixel_var(chromav_tmp, LCU_CHROMA_SIZE); + } + + state->frame->aq_offsets[id] = d * (log(lcu_var) - log(frame_var)); + id++; + } + } + } + // Variance adaptive quantization - END + // Use this flag to handle closed gop irap picture selection. 
// If set to true, irap is already set and we avoid // setting it based on the intra period bool is_closed_normal_gop = false; + encoder_state_t *previous = state->previous_encoder_state; + int owf = MIN(state->encoder_control->cfg.owf, state->frame->num); + + const int layer = state->encoder_control->cfg.gop[state->frame->gop_offset].layer; + + while (--owf > 0 && layer != state->encoder_control->cfg.gop[previous->frame->gop_offset].layer) { + previous = previous->previous_encoder_state; + } + + if (owf == 0) previous = state; + state->frame->previous_layer_state = previous; // Set POC. if (state->frame->num == 0) { state->frame->poc = 0; @@ -1281,8 +1434,20 @@ if (cfg->target_bitrate > 0 && state->frame->num > cfg->owf) { normalize_lcu_weights(state); } - kvz_set_picture_lambda_and_qp(state); - + state->frame->cur_frame_bits_coded = 0; + + switch (state->encoder_control->cfg.rc_algorithm) { + case KVZ_NO_RC: + case KVZ_LAMBDA: + kvz_set_picture_lambda_and_qp(state); + break; + case KVZ_OBA: + kvz_estimate_pic_lambda(state); + break; + default: + assert(0); + } + encoder_state_init_children(state); } @@ -1345,6 +1510,7 @@ assert(!state->tile->frame->rec); assert(!state->tile->frame->cu_array); state->frame->prepared = 1; + return; } @@ -1395,6 +1561,8 @@ state->frame->irap_poc = prev_state->frame->irap_poc; state->frame->prepared = 1; + + } coeff_scan_order_t kvz_get_scan_order(int8_t cu_type, int intra_mode, int depth)
View file
kvazaar-1.3.0.tar.gz/src/encoderstate.h -> kvazaar-2.0.0.tar.gz/src/encoderstate.h
Changed
@@ -39,6 +39,7 @@ #include "videoframe.h" #include "extras/crypto.h" +struct kvz_rc_data; typedef enum { ENCODER_STATE_TYPE_INVALID = 'i', @@ -53,9 +54,13 @@ //! \brief Number of bits that were spent uint32_t bits; + uint32_t pixels; + //! \brief Weight of the LCU for rate control double weight; + double original_weight; + //! \brief Lambda value which was used for this LCU double lambda; @@ -64,6 +69,11 @@ //! \brief Rate control beta parameter double rc_beta; + double distortion; + int i_cost; + + int8_t qp; + uint8_t skipped; } lcu_stats_t; @@ -111,6 +121,9 @@ //! Number of bits written in the current GOP. uint64_t cur_gop_bits_coded; + //! Number of bits written in the current frame. + uint64_t cur_frame_bits_coded; + //! Number of bits targeted for the current GOP. double cur_gop_target_bits; @@ -141,11 +154,27 @@ */ lcu_stats_t *lcu_stats; + pthread_mutex_t rc_lock; + + struct kvz_rc_data *new_ratecontrol; + + struct encoder_state_t const *previous_layer_state; + + /** + * \brief Calculated adaptive QP offset for each LCU. + */ + double *aq_offsets; + /** * \brief Whether next NAL is the first NAL in the access unit. */ bool first_nal; + double icost; + double remaining_weight; + double i_bits_left; + double *c_para; + double *k_para; } encoder_state_config_frame_t; typedef struct encoder_state_config_tile_t { @@ -236,7 +265,7 @@ //Pointer to the encoder_state of the previous frame struct encoder_state_t *previous_encoder_state; - + encoder_state_config_frame_t *frame; encoder_state_config_tile_t *tile; encoder_state_config_slice_t *slice; @@ -288,6 +317,11 @@ //Jobs to wait for threadqueue_job_t * tqj_recon_done; //Reconstruction is done threadqueue_job_t * tqj_bitstream_written; //Bitstream is written + + //Constraint structure + void * constraint; + + } encoder_state_t; void kvz_encode_one_frame(encoder_state_t * const state, kvz_picture* frame);
View file
kvazaar-1.3.0.tar.gz/src/global.h -> kvazaar-2.0.0.tar.gz/src/global.h
Changed
@@ -206,7 +206,7 @@ // NOTE: When making a release, check to see if incrementing libversion in // configure.ac is necessary. #ifndef KVZ_VERSION -#define KVZ_VERSION 1.3.0 +#define KVZ_VERSION 2.0.0 #endif #define VERSION_STRING QUOTE_EXPAND(KVZ_VERSION)
View file
kvazaar-2.0.0.tar.gz/src/gop.h
Added
@@ -0,0 +1,400 @@ +#ifndef GOP_H_ +#define GOP_H_ +/***************************************************************************** +* This file is part of Kvazaar HEVC encoder. +* +* Copyright (C) 2018 Tampere University of Technology and others (see +* COPYING file). +* +* Kvazaar is free software: you can redistribute it and/or modify it under +* the terms of the GNU Lesser General Public License as published by the +* Free Software Foundation; either version 2.1 of the License, or (at your +* option) any later version. +* +* Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY +* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for +* more details. +* +* You should have received a copy of the GNU General Public License along +* with Kvazaar. If not, see <http://www.gnu.org/licenses/>. +****************************************************************************/ + +#include <kvazaar.h> + + +static const kvz_gop_config kvz_gop_lowdelay4[4] = { + { + .poc_offset = 1, + .layer = 1, + .qp_offset = 5, + .qp_factor = 1.0, + .qp_model_offset = -6.5, + .qp_model_scale = 0.2590, + .is_ref = 1, + .ref_neg_count = 4, + .ref_neg = { 1, 5, 9, 13 }, + .ref_pos_count = 0, + .ref_pos = { 0 }, + }, + { + .poc_offset = 2, + .layer = 1, + .qp_offset = 4, + .qp_factor = 1.0, + .qp_model_offset = -6.5, + .qp_model_scale = 0.2590, + .is_ref = 1, + .ref_neg_count = 3, + .ref_neg = { 1, 2, 6, 10 }, + .ref_pos_count = 0, + .ref_pos = { 0 }, + }, + { + .poc_offset = 3, + .layer = 1, + .qp_offset = 5, + .qp_factor = 1.0, + .qp_model_offset = -6.5, + .qp_model_scale = 0.2590, + .is_ref = 1, + .ref_neg_count = 3, + .ref_neg = { 1, 3, 7, 11 }, + .ref_pos_count = 0, + .ref_pos = { 0 }, + }, + { + .poc_offset = 4, + .layer = 1, + .qp_offset = 1, + .qp_factor = 1.0, + .qp_model_offset = 0.0, + .qp_model_scale = 0.0, + .is_ref = 1, + .ref_neg_count = 3, + .ref_neg = { 1, 4, 8, 12 }, + .ref_pos_count = 0, + .ref_pos = { 0 }, + }, +}; + + +static const kvz_gop_config kvz_gop_ra8[8] = { + { + .poc_offset = 8, + .layer = 1, + .qp_offset = 0, + .qp_factor = 1.0, + .qp_model_offset = 0.0, + .qp_model_scale = 0.0, + .is_ref = 1, + .ref_neg_count = 3, + .ref_neg = { 8, 12, 16 }, + .ref_pos_count = 0, + .ref_pos = { 0 }, + }, + { + .poc_offset = 4, + .layer = 2, + .qp_offset = 3, + .qp_factor = 1.0, + .qp_model_offset = -6.25, + .qp_model_scale = 0.25, + .is_ref = 1, + .ref_neg_count = 2, + .ref_neg = { 4, 8 }, + .ref_pos_count = 1, + .ref_pos = { 4 }, + }, + { + .poc_offset = 2, + .layer = 3, + .qp_offset = 4, + .qp_factor = 1.0, + .qp_model_offset = -6.25, + .qp_model_scale = 0.25, + .is_ref = 1, + .ref_neg_count = 2, + .ref_neg = { 2, 6 }, + .ref_pos_count = 2, + .ref_pos = { 2, 6 }, + }, + { + .poc_offset = 1, + .layer = 4, + .qp_offset = 8, + .qp_factor = 1.0, + .qp_model_offset = -7.0, + .qp_model_scale = 0.245, + .is_ref = 0, + .ref_neg_count = 1, + .ref_neg = { 1 }, + .ref_pos_count = 3, + .ref_pos = { 1, 3, 7 }, + }, + { + .poc_offset = 3, + .layer = 4, + .qp_offset = 8, + .qp_factor = 1.0, + .qp_model_offset = -7.0, + .qp_model_scale = 0.245, + .is_ref = 0, + .ref_neg_count = 2, + .ref_neg = { 1, 3 }, + .ref_pos_count = 2, + .ref_pos = { 1, 5 }, + }, + { + .poc_offset = 6, + .layer = 3, + .qp_offset = 4, + .qp_factor = 1.0, + .qp_model_offset = -6.25, + .qp_model_scale = 0.25, + .is_ref = 1, + .ref_neg_count = 2, + .ref_neg = { 2, 6 }, + .ref_pos_count = 1, + .ref_pos = { 2 }, + }, + { + .poc_offset = 5, 
+ .layer = 4, + .qp_offset = 8, + .qp_factor = 1.0, + .qp_model_offset = -7.0, + .qp_model_scale = 0.245, + .is_ref = 0, + .ref_neg_count = 2, + .ref_neg = { 1, 5 }, + .ref_pos_count = 2, + .ref_pos = { 1, 3 }, + }, + { + .poc_offset = 7, + .layer = 4, + .qp_offset = 8, + .qp_factor = 1.0, + .qp_model_offset = -7.0, + .qp_model_scale = 0.245, + .is_ref = 0, + .ref_neg_count = 3, + .ref_neg = { 1, 3, 7 }, + .ref_pos_count = 1, + .ref_pos = { 1 }, + }, +}; + +static const kvz_gop_config kvz_gop_ra16[16] = { + { + .poc_offset = 16, + .layer = 1, + .qp_offset = 1, + .qp_factor = 1.0, + .qp_model_offset = 0.0, + .qp_model_scale = 0.0, + .is_ref = 1, + .ref_neg_count = 3, + .ref_neg = { 16, 24, 32 }, + .ref_pos_count = 0, + .ref_pos = { 0 }, + }, + { + .poc_offset = 8, + .layer = 2, + .qp_offset = 1, + .qp_factor = 1.0, + .qp_model_offset = -4.8848, + .qp_model_scale = 0.2061, + .is_ref = 1, + .ref_neg_count = 2, + .ref_neg = { 8, 16 }, + .ref_pos_count = 1, + .ref_pos = { 8 }, + }, + { + .poc_offset = 4, + .layer = 3, + .qp_offset = 4, + .qp_factor = 1.0, + .qp_model_offset = -5.7476, + .qp_model_scale = 0.2286, + .is_ref = 1, + .ref_neg_count = 2, + .ref_neg = { 4, 12 }, + .ref_pos_count = 2, + .ref_pos = { 4, 12 }, + }, + { + .poc_offset = 2, + .layer = 4, + .qp_offset = 5, + .qp_factor = 1.0, + .qp_model_offset = -5.90, + .qp_model_scale = 0.2333, + .is_ref = 1, + .ref_neg_count = 2, + .ref_neg = { 2, 10 }, + .ref_pos_count = 3, + .ref_pos = { 2, 6, 14 }, + }, + { + .poc_offset = 1, + .layer = 5, + .qp_offset = 6, + .qp_factor = 1.0, + .qp_model_offset = -7.1444, + .qp_model_scale = 0.3, + .is_ref = 0, + .ref_neg_count = 1, + .ref_neg = { 1 }, + .ref_pos_count = 4, + .ref_pos = { 1, 3, 7, 15 }, + }, + { + .poc_offset = 3, + .layer = 5, + .qp_offset = 6, + .qp_factor = 1.0, + .qp_model_offset = -7.1444, + .qp_model_scale = 0.3, + .is_ref = 0, + .ref_neg_count = 2, + .ref_neg = { 1, 3 }, + .ref_pos_count = 3, + .ref_pos = { 1, 5, 13 }, + }, + { + .poc_offset = 6, + .layer = 4, + .qp_offset = 5, + .qp_factor = 1.0, + .qp_model_offset = -5.90, + .qp_model_scale = 0.2333, + .is_ref = 1, + .ref_neg_count = 2, + .ref_neg = { 2, 6 }, + .ref_pos_count = 2, + .ref_pos = { 2, 10 }, + }, + { + .poc_offset = 5, + .layer = 5, + .qp_offset = 6, + .qp_factor = 1.0, + .qp_model_offset = -7.1444, + .qp_model_scale = 0.3, + .is_ref = 0, + .ref_neg_count = 2, + .ref_neg = { 1, 5 }, + .ref_pos_count = 3, + .ref_pos = { 1, 3, 11 }, + }, + { + .poc_offset = 7, + .layer = 5, + .qp_offset = 6, + .qp_factor = 1.0, + .qp_model_offset = -7.1444, + .qp_model_scale = 0.3, + .is_ref = 0, + .ref_neg_count = 3, + .ref_neg = { 1, 3, 7 }, + .ref_pos_count = 2, + .ref_pos = { 1, 9 }, + }, + { + .poc_offset = 12, + .layer = 3, + .qp_offset = 4, + .qp_factor = 1.0, + .qp_model_offset = -5.7476, + .qp_model_scale = 0.2286, + .is_ref = 1, + .ref_neg_count = 2, + .ref_neg = { 4, 12 }, + .ref_pos_count = 1, + .ref_pos = { 4 }, + }, + { + .poc_offset = 10, + .layer = 4, + .qp_offset = 5, + .qp_factor = 1.0, + .qp_model_offset = -5.90, + .qp_model_scale = 0.2333, + .is_ref = 1, + .ref_neg_count = 2, + .ref_neg = { 2, 10 }, + .ref_pos_count = 2, + .ref_pos = { 2, 6 }, + }, + { + .poc_offset = 9, + .layer = 5, + .qp_offset = 6, + .qp_factor = 1.0, + .qp_model_offset = -7.1444, + .qp_model_scale = 0.3, + .is_ref = 0, + .ref_neg_count = 2, + .ref_neg = { 1, 9 }, + .ref_pos_count = 3, + .ref_pos = { 1, 3, 7 }, + }, + { + .poc_offset = 11, + .layer = 5, + .qp_offset = 6, + .qp_factor = 1.0, + .qp_model_offset = -7.1444, + .qp_model_scale = 
0.3, + .is_ref = 0, + .ref_neg_count = 3, + .ref_neg = { 1, 3, 11 }, + .ref_pos_count = 2, + .ref_pos = { 1, 5 }, + }, + { + .poc_offset = 14, + .layer = 4, + .qp_offset = 5, + .qp_factor = 1.0, + .qp_model_offset = -5.90, + .qp_model_scale = 0.2333, + .is_ref = 1, + .ref_neg_count = 3, + .ref_neg = { 2, 6, 14 }, + .ref_pos_count = 1, + .ref_pos = { 2 }, + }, + { + .poc_offset = 13, + .layer = 5, + .qp_offset = 6, + .qp_factor = 1.0, + .qp_model_offset = -7.1444, + .qp_model_scale = 0.3, + .is_ref = 0, + .ref_neg_count = 3, + .ref_neg = { 1, 5, 13 }, + .ref_pos_count = 2, + .ref_pos = { 1, 3 }, + }, + { + .poc_offset = 15, + .layer = 5, + .qp_offset = 6, + .qp_factor = 1.0, + .qp_model_offset = -7.1444, + .qp_model_scale = 0.3, + .is_ref = 0, + .ref_neg_count = 4, + .ref_neg = { 1, 3, 7, 15 }, + .ref_pos_count = 1, + .ref_pos = { 1 }, + }, +}; + +#endif // GOP_H_
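The tables above appear to list pictures in coding order, with poc_offset giving the display position inside the GOP. As a quick illustration (assuming this internal header and the kvazaar headers it includes are reachable on the include path; gop.h is not part of the installed API), the following prints the layout of the new --gop=16 structure:

#include <stdio.h>
#include "gop.h"

int main(void)
{
  for (int i = 0; i < 16; i++) {
    const kvz_gop_config *g = &kvz_gop_ra16[i];
    printf("coding #%2d -> POC offset %2d, layer %d, QP offset %d, %s\n",
           i, g->poc_offset, g->layer, g->qp_offset,
           g->is_ref ? "reference" : "non-reference");
  }
  return 0;
}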
View file
kvazaar-1.3.0.tar.gz/src/input_frame_buffer.c -> kvazaar-2.0.0.tar.gz/src/input_frame_buffer.c
Changed
@@ -43,15 +43,18 @@ * * The caller must not modify img_in after calling this function. * - * \param buf an input frame buffer - * \param state a main encoder state - * \param img_in input frame or NULL + * \param buf an input frame buffer + * \param state a main encoder state + * \param img_in input frame or NULL + * \param first_done whether the first frame has been done, + * needed for the OBA rc * \return pointer to the next picture, or NULL if no picture is * available */ kvz_picture* kvz_encoder_feed_frame(input_frame_buffer_t *buf, encoder_state_t *const state, - kvz_picture *const img_in) + kvz_picture *const img_in, + int first_done) { const encoder_control_t* const encoder = state->encoder_control; const kvz_config* const cfg = &encoder->cfg; @@ -82,7 +85,7 @@ buf->num_out++; return kvz_image_copy_ref(img_in); } - + if (img_in != NULL) { // Index of the next input picture, in range [-1, +inf). Values // i and j refer to the same indices in buf->pic_buffer iff @@ -140,7 +143,7 @@ dts_out = buf->pts_buffer[gop_buf_size - 1] + buf->delay; gop_offset = 0; // highest quality picture - } else { + } else if(first_done) { gop_offset = (buf->num_out - 1) % cfg->gop_len; // For closed gop, calculate the gop_offset again @@ -183,6 +186,9 @@ dts_out = buf->pts_buffer[dts_idx % gop_buf_size]; } } + else { + return NULL; + } // Index in buf->pic_buffer and buf->pts_buffer. int buf_idx = (idx_out + gop_buf_size) % gop_buf_size;
View file
kvazaar-1.3.0.tar.gz/src/input_frame_buffer.h -> kvazaar-2.0.0.tar.gz/src/input_frame_buffer.h
Changed
@@ -66,6 +66,7 @@ kvz_picture* kvz_encoder_feed_frame(input_frame_buffer_t *buf, struct encoder_state_t *const state, - struct kvz_picture *const img_in); + struct kvz_picture *const img_in, + int first_done); #endif // INPUT_FRAME_BUFFER_H_
View file
kvazaar-1.3.0.tar.gz/src/inter.c -> kvazaar-2.0.0.tar.gz/src/inter.c
Changed
@@ -301,15 +301,17 @@ /** * \brief Reconstruct an inter PU using uniprediction. * - * \param state encoder state - * \param ref picture to copy the data from - * \param xpos PU x position - * \param ypos PU y position - * \param width PU width - * \param height PU height - * \param mv_param motion vector - * \param lcu destination lcu - * \param hi_prec_out destination of high precision output, or NULL if not needed + * \param state encoder state + * \param ref picture to copy the data from + * \param xpos PU x position + * \param ypos PU y position + * \param width PU width + * \param height PU height + * \param mv_param motion vector + * \param lcu destination lcu + * \param hi_prec_out destination of high precision output, or NULL if not needed + * \param predict_luma Enable or disable luma prediction for this call. + * \param predict_chroma Enable or disable chroma prediction for this call. */ static void inter_recon_unipred(const encoder_state_t * const state, const kvz_picture * const ref, @@ -319,7 +321,9 @@ int32_t height, const int16_t mv_param[2], lcu_t *lcu, - hi_prec_buf_t *hi_prec_out) + hi_prec_buf_t *hi_prec_out, + bool predict_luma, + bool predict_chroma) { const vector2d_t pu_in_tile = { xpos, ypos }; const vector2d_t pu_in_lcu = { xpos % LCU_WIDTH, ypos % LCU_WIDTH }; @@ -340,38 +344,43 @@ const int8_t fractional_luma = ((mv_param[0] & 3) || (mv_param[1] & 3)); // Generate prediction for luma. - if (fractional_luma) { - // With a fractional MV, do interpolation. - if (state->encoder_control->cfg.bipred && hi_prec_out) { - inter_recon_14bit_frac_luma(state, ref, - pu_in_tile.x, pu_in_tile.y, - width, height, - mv_param, hi_prec_out); - } else { - inter_recon_frac_luma(state, ref, - pu_in_tile.x, pu_in_tile.y, - width, height, - mv_param, lcu); + if (predict_luma) { + if (fractional_luma) { + // With a fractional MV, do interpolation. + if (state->encoder_control->cfg.bipred && hi_prec_out) { + inter_recon_14bit_frac_luma(state, ref, + pu_in_tile.x, pu_in_tile.y, + width, height, + mv_param, hi_prec_out); + } + else { + inter_recon_frac_luma(state, ref, + pu_in_tile.x, pu_in_tile.y, + width, height, + mv_param, lcu); + } } - } else { - // With an integer MV, copy pixels directly from the reference. - const int lcu_pu_index = pu_in_lcu.y * LCU_WIDTH + pu_in_lcu.x; - if (mv_is_outside_frame) { - inter_cp_with_ext_border(ref->y, ref->width, - ref->width, ref->height, - &lcu->rec.y[lcu_pu_index], LCU_WIDTH, - width, height, - &mv_in_frame); - } else { - const int frame_mv_index = mv_in_frame.y * ref->width + mv_in_frame.x; - kvz_pixels_blit(&ref->y[frame_mv_index], - &lcu->rec.y[lcu_pu_index], - width, height, - ref->width, LCU_WIDTH); + else { + // With an integer MV, copy pixels directly from the reference. 
+ const int lcu_pu_index = pu_in_lcu.y * LCU_WIDTH + pu_in_lcu.x; + if (mv_is_outside_frame) { + inter_cp_with_ext_border(ref->y, ref->width, + ref->width, ref->height, + &lcu->rec.y[lcu_pu_index], LCU_WIDTH, + width, height, + &mv_in_frame); + } + else { + const int frame_mv_index = mv_in_frame.y * ref->width + mv_in_frame.x; + kvz_pixels_blit(&ref->y[frame_mv_index], + &lcu->rec.y[lcu_pu_index], + width, height, + ref->width, LCU_WIDTH); + } } } - if (state->encoder_control->chroma_format == KVZ_CSP_400) { + if (!predict_chroma) { return; } @@ -422,15 +431,17 @@ /** * \brief Reconstruct bi-pred inter PU * - * \param state encoder state - * \param ref1 reference picture to copy the data from - * \param ref2 other reference picture to copy the data from - * \param xpos PU x position - * \param ypos PU y position - * \param width PU width - * \param height PU height - * \param mv_param motion vectors - * \param lcu destination lcu + * \param state encoder state + * \param ref1 reference picture to copy the data from + * \param ref2 other reference picture to copy the data from + * \param xpos PU x position + * \param ypos PU y position + * \param width PU width + * \param height PU height + * \param mv_param motion vectors + * \param lcu destination lcu + * \param predict_luma Enable or disable luma prediction for this call. + * \param predict_chroma Enable or disable chroma prediction for this call. */ void kvz_inter_recon_bipred(const encoder_state_t * const state, const kvz_picture * ref1, @@ -440,7 +451,9 @@ int32_t width, int32_t height, int16_t mv_param[2][2], - lcu_t* lcu) + lcu_t* lcu, + bool predict_luma, + bool predict_chroma) { kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH]; kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C]; @@ -459,7 +472,8 @@ //Reconstruct both predictors - inter_recon_unipred(state, ref1, xpos, ypos, width, height, mv_param[0], lcu, high_precision_rec0); + inter_recon_unipred(state, ref1, xpos, ypos, width, height, mv_param[0], lcu, high_precision_rec0, + predict_luma, predict_chroma); if (!hi_prec_luma_rec0){ memcpy(temp_lcu_y, lcu->rec.y, sizeof(kvz_pixel) * 64 * 64); // copy to temp_lcu_y } @@ -467,10 +481,15 @@ memcpy(temp_lcu_u, lcu->rec.u, sizeof(kvz_pixel) * 32 * 32); // copy to temp_lcu_u memcpy(temp_lcu_v, lcu->rec.v, sizeof(kvz_pixel) * 32 * 32); // copy to temp_lcu_v } - inter_recon_unipred(state, ref2, xpos, ypos, width, height, mv_param[1], lcu, high_precision_rec1); + inter_recon_unipred(state, ref2, xpos, ypos, width, height, mv_param[1], lcu, high_precision_rec1, + predict_luma, predict_chroma); // After reconstruction, merge the predictors by taking an average of each pixel - kvz_inter_recon_bipred_blend(hi_prec_luma_rec0, hi_prec_luma_rec1, hi_prec_chroma_rec0, hi_prec_chroma_rec1, height, width, ypos, xpos, high_precision_rec0, high_precision_rec1, lcu, temp_lcu_y, temp_lcu_u, temp_lcu_v); + kvz_inter_recon_bipred_blend(hi_prec_luma_rec0, hi_prec_luma_rec1, + hi_prec_chroma_rec0, hi_prec_chroma_rec1, + height, width, ypos, xpos, + high_precision_rec0, high_precision_rec1, + lcu, temp_lcu_y, temp_lcu_u, temp_lcu_v, predict_luma, predict_chroma); if (high_precision_rec0 != 0) kvz_hi_prec_buf_t_free(high_precision_rec0); if (high_precision_rec1 != 0) kvz_hi_prec_buf_t_free(high_precision_rec1); @@ -488,54 +507,87 @@ * \param x x-coordinate of the CU in pixels * \param y y-coordinate of the CU in pixels * \param width CU width + * \param predict_luma Enable or disable luma prediction for this call. 
+ * \param predict_chroma Enable or disable chroma prediction for this call. */ void kvz_inter_recon_cu(const encoder_state_t * const state, lcu_t *lcu, int32_t x, int32_t y, - int32_t width) + int32_t width, + bool predict_luma, + bool predict_chroma) { cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); - const int num_pu = kvz_part_mode_num_parts[cu->part_size]; for (int i = 0; i < num_pu; ++i) { - const int pu_x = PU_GET_X(cu->part_size, width, x, i); - const int pu_y = PU_GET_Y(cu->part_size, width, y, i); - const int pu_w = PU_GET_W(cu->part_size, width, i); - const int pu_h = PU_GET_H(cu->part_size, width, i); - - cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); - - if (pu->inter.mv_dir == 3) { - const kvz_picture *const refs[2] = { - state->frame->ref->images[ - state->frame->ref_LX[0][ - pu->inter.mv_ref[0]]], - state->frame->ref->images[ - state->frame->ref_LX[1][ - pu->inter.mv_ref[1]]], - }; - kvz_inter_recon_bipred(state, - refs[0], refs[1], - pu_x, pu_y, - pu_w, pu_h, - pu->inter.mv, - lcu); - } else { - const int mv_idx = pu->inter.mv_dir - 1; - const kvz_picture *const ref = - state->frame->ref->images[ - state->frame->ref_LX[mv_idx][ - pu->inter.mv_ref[mv_idx]]]; - - inter_recon_unipred(state, - ref, - pu_x, pu_y, - pu_w, pu_h, - pu->inter.mv[mv_idx], - lcu, - NULL); - } + kvz_inter_pred_pu(state, lcu, x, y, width, predict_luma, predict_chroma, i); + } +} + +/** + * Predict a single PU. + * + * The PU may use either uniprediction or biprediction. + * + * \param state encoder state + * \param lcu containing LCU + * \param x x-coordinate of the CU in pixels + * \param y y-coordinate of the CU in pixels + * \param width CU width + * \param predict_luma Enable or disable luma prediction for this call. + * \param predict_chroma Enable or disable chroma prediction for this call. + * \param i_pu Index of the PU. Always zero for 2Nx2N. Used for SMP+AMP. + */ +void kvz_inter_pred_pu(const encoder_state_t * const state, + lcu_t *lcu, + int32_t x, + int32_t y, + int32_t width, + bool predict_luma, + bool predict_chroma, + int i_pu) + +{ + cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + const int pu_x = PU_GET_X(cu->part_size, width, x, i_pu); + const int pu_y = PU_GET_Y(cu->part_size, width, y, i_pu); + const int pu_w = PU_GET_W(cu->part_size, width, i_pu); + const int pu_h = PU_GET_H(cu->part_size, width, i_pu); + cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); + + if (pu->inter.mv_dir == 3) { + const kvz_picture *const refs[2] = { + state->frame->ref->images[ + state->frame->ref_LX[0][ + pu->inter.mv_ref[0]]], + state->frame->ref->images[ + state->frame->ref_LX[1][ + pu->inter.mv_ref[1]]], + }; + kvz_inter_recon_bipred(state, + refs[0], refs[1], + pu_x, pu_y, + pu_w, pu_h, + pu->inter.mv, + lcu, + predict_luma, predict_chroma); + } + else { + const int mv_idx = pu->inter.mv_dir - 1; + const kvz_picture *const ref = + state->frame->ref->images[ + state->frame->ref_LX[mv_idx][ + pu->inter.mv_ref[mv_idx]]]; + + inter_recon_unipred(state, + ref, + pu_x, pu_y, + pu_w, pu_h, + pu->inter.mv[mv_idx], + lcu, + NULL, + predict_luma, predict_chroma); } }
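For context on what the bipred path ultimately produces in the common low-precision case, here is a simplified standalone averaging sketch. The (a + b + 1) >> 1 rounding is an assumption for illustration; the real kvz_inter_recon_bipred_blend also handles the 14-bit high-precision buffers and the chroma planes, which are omitted here.

#include <stdint.h>
#include <stdio.h>

typedef uint8_t pixel_sketch;   /* stands in for kvz_pixel in an 8-bit build */

/* Blend two uniprediction blocks into one bipred block by per-pixel averaging. */
static void bipred_blend_sketch(const pixel_sketch *p0, const pixel_sketch *p1,
                                pixel_sketch *out, int width, int height)
{
  for (int i = 0; i < width * height; i++) {
    out[i] = (pixel_sketch)((p0[i] + p1[i] + 1) >> 1);  /* assumed rounding */
  }
}

int main(void)
{
  pixel_sketch a[4] = { 10, 20, 30, 40 };
  pixel_sketch b[4] = { 20, 21, 35, 45 };
  pixel_sketch o[4];
  bipred_blend_sketch(a, b, o, 2, 2);
  printf("%d %d %d %d\n", o[0], o[1], o[2], o[3]);
  return 0;
}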
View file
kvazaar-1.3.0.tar.gz/src/inter.h -> kvazaar-2.0.0.tar.gz/src/inter.h
Changed
@@ -44,7 +44,18 @@ lcu_t *lcu, int32_t x, int32_t y, - int32_t width); + int32_t width, + bool predict_luma, + bool predict_chroma); + +void kvz_inter_pred_pu(const encoder_state_t * const state, + lcu_t *lcu, + int32_t x, + int32_t y, + int32_t width, + bool predict_luma, + bool predict_chroma, + int i_pu); void kvz_inter_recon_bipred(const encoder_state_t * const state, const kvz_picture * ref1, @@ -54,7 +65,9 @@ int32_t width, int32_t height, int16_t mv_param[2][2], - lcu_t* lcu); + lcu_t* lcu, + bool predict_luma, + bool predict_chroma); void kvz_inter_get_mv_cand(const encoder_state_t * const state,
View file
kvazaar-1.3.0.tar.gz/src/intra.c -> kvazaar-2.0.0.tar.gz/src/intra.c
Changed
@@ -237,47 +237,6 @@ } -/** -* \brief Generage intra DC prediction with post filtering applied. -* \param log2_width Log2 of width, range 2..5. -* \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. -* \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. -* \param dst Buffer of size width*width. -*/ -static void intra_pred_filtered_dc( - const int_fast8_t log2_width, - const kvz_pixel *const ref_top, - const kvz_pixel *const ref_left, - kvz_pixel *const out_block) -{ - assert(log2_width >= 2 && log2_width <= 5); - - const int_fast8_t width = 1 << log2_width; - - int_fast16_t sum = 0; - for (int_fast8_t i = 0; i < width; ++i) { - sum += ref_top[i + 1]; - sum += ref_left[i + 1]; - } - - const kvz_pixel dc_val = (sum + width) >> (log2_width + 1); - - // Filter top-left with ([1 2 1] / 4) - out_block[0] = (ref_left[1] + 2 * dc_val + ref_top[1] + 2) / 4; - - // Filter rest of the boundary with ([1 3] / 4) - for (int_fast8_t x = 1; x < width; ++x) { - out_block[x] = (ref_top[x + 1] + 3 * dc_val + 2) / 4; - } - for (int_fast8_t y = 1; y < width; ++y) { - out_block[y * width] = (ref_left[y + 1] + 3 * dc_val + 2) / 4; - for (int_fast8_t x = 1; x < width; ++x) { - out_block[y * width + x] = dc_val; - } - } -} - - void kvz_intra_predict( kvz_intra_references *refs, int_fast8_t log2_width, @@ -314,7 +273,7 @@ } else if (mode == 1) { // Do extra post filtering for edge pixels of luma DC mode. if (color == COLOR_Y && width < 32) { - intra_pred_filtered_dc(log2_width, used_ref->top, used_ref->left, dst); + kvz_intra_pred_filtered_dc(log2_width, used_ref->top, used_ref->left, dst); } else { intra_pred_dc(log2_width, used_ref->top, used_ref->left, dst); } @@ -665,7 +624,18 @@ cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); } + // Reset CBFs because CBFs might have been set + // for depth earlier + if (mode_luma >= 0) { + cbf_clear(&cur_cu->cbf, depth, COLOR_Y); + } + if (mode_chroma >= 0) { + cbf_clear(&cur_cu->cbf, depth, COLOR_U); + cbf_clear(&cur_cu->cbf, depth, COLOR_V); + } + if (depth == 0 || cur_cu->tr_depth > depth) { + const int offset = width / 2; const int32_t x2 = x + offset; const int32_t y2 = y + offset; @@ -682,7 +652,7 @@ LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf, }; - if (mode_luma != -1 && depth < MAX_DEPTH) { + if (mode_luma != -1 && depth <= MAX_DEPTH) { cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_Y); } if (mode_chroma != -1 && depth <= MAX_DEPTH) { @@ -701,6 +671,6 @@ intra_recon_tb_leaf(state, x, y, depth, mode_chroma, lcu, COLOR_V); } - kvz_quantize_lcu_residual(state, has_luma, has_chroma, x, y, depth, cur_cu, lcu); + kvz_quantize_lcu_residual(state, has_luma, has_chroma, x, y, depth, cur_cu, lcu, false); } }
View file
kvazaar-1.3.0.tar.gz/src/kvazaar.c -> kvazaar-2.0.0.tar.gz/src/kvazaar.c
Changed
@@ -38,6 +38,7 @@ #include "strategyselector.h" #include "threadqueue.h" #include "videoframe.h" +#include "rate_control.h" static void kvazaar_close(kvz_encoder *encoder) @@ -53,7 +54,8 @@ kvz_picture *pic = NULL; while ((pic = kvz_encoder_feed_frame(&encoder->input_buffer, &encoder->states[0], - NULL)) != NULL) { + NULL, + 1)) != NULL) { kvz_image_free(pic); pic = NULL; } @@ -64,6 +66,7 @@ } FREE_POINTER(encoder->states); + kvz_free_rc_data(); // Discard const from the pointer. kvz_encoder_control_free((void*) encoder->control); encoder->control = NULL; @@ -99,6 +102,11 @@ encoder->frames_started = 0; encoder->frames_done = 0; + // Assure that the rc data allocation was successful + if(!kvz_get_rc_data(encoder->control)) { + goto kvazaar_open_failure; + } + kvz_init_input_frame_buffer(&encoder->input_buffer); encoder->states = calloc(encoder->num_encoder_states, sizeof(encoder_state_t)); @@ -108,7 +116,6 @@ for (unsigned i = 0; i < encoder->num_encoder_states; ++i) { encoder->states[i].encoder_control = encoder->control; - if (!kvz_encoder_state_init(&encoder->states[i], NULL)) { goto kvazaar_open_failure; } @@ -246,7 +253,10 @@ CHECKPOINT_MARK("read source frame: %d", state->frame->num + enc->control->cfg.seek); } - kvz_picture* frame = kvz_encoder_feed_frame(&enc->input_buffer, state, pic_in); + kvz_picture* frame = kvz_encoder_feed_frame( + &enc->input_buffer, state, pic_in, + enc->frames_done || state->encoder_control->cfg.rc_algorithm != KVZ_OBA + ); if (frame) { assert(state->frame->num == enc->frames_started); // Start encoding. @@ -265,8 +275,9 @@ } encoder_state_t *output_state = &enc->states[enc->out_state_num]; - if (!output_state->frame->done && - (pic_in == NULL || enc->cur_state_num == enc->out_state_num)) { + if ((!output_state->frame->done && + (pic_in == NULL || enc->cur_state_num == enc->out_state_num)) || + (state->frame->num == 0 && state->encoder_control->cfg.rc_algorithm == KVZ_OBA)) { kvz_threadqueue_waitfor(enc->control->threadqueue, output_state->tqj_bitstream_written); // The job pointer must be set to NULL here since it won't be usable after
View file
kvazaar-1.3.0.tar.gz/src/kvazaar.h -> kvazaar-2.0.0.tar.gz/src/kvazaar.h
Changed
@@ -64,6 +64,11 @@ */ #define KVZ_MAX_GOP_LENGTH 32 + /** + * Maximum amount of GoP layers. + */ +#define KVZ_MAX_GOP_LAYERS 6 + /** * Size of data chunks. */ @@ -213,6 +218,12 @@ KVZ_SCALING_LIST_DEFAULT = 2, }; +enum kvz_rc_algorithm +{ + KVZ_NO_RC = 0, + KVZ_LAMBDA = 1, + KVZ_OBA = 2, +}; // Map from input format to chroma format. #define KVZ_FORMAT2CSP(format) ((enum kvz_chroma_format)"\0\1\2\3"[format]) @@ -229,6 +240,8 @@ int8_t ref_pos[16]; /*!< \brief reference picture offset list */ int8_t ref_neg_count;/*!< \brief Reference picture count */ int8_t ref_neg[16]; /*!< \brief reference picture offset list */ + double qp_model_offset; + double qp_model_scale; } kvz_gop_config; /** @@ -306,8 +319,8 @@ int32_t cpuid; struct { - int32_t min; - int32_t max; + int32_t min[KVZ_MAX_GOP_LAYERS]; + int32_t max[KVZ_MAX_GOP_LAYERS]; } pu_depth_inter, pu_depth_intra; int32_t add_encoder_info; @@ -372,6 +385,11 @@ /** \brief Maximum steps that hexagonal and diagonal motion estimation can use. -1 to disable */ uint32_t me_max_steps; + /** \brief Offset to add to QP for intra frames */ + int8_t intra_qp_offset; + /** \brief Select intra QP Offset based on GOP length */ + uint8_t intra_qp_offset_auto; + /** \brief Minimum QP that uses CABAC for residual cost instead of a fast estimate. */ int8_t fast_residual_cost_limit; @@ -381,6 +399,8 @@ /** \brief Flag to enable/disable open GOP configuration */ int8_t open_gop; + int32_t vaq; /** \brief Enable variance adaptive quantization*/ + /** \brief Type of scaling lists to use */ int8_t scaling_list; @@ -390,6 +410,30 @@ /** \brief Enable Early Skip Mode Decision */ uint8_t early_skip; + /** \brief Enable Machine learning CU depth prediction for Intra encoding. */ + uint8_t ml_pu_depth_intra; + + /** \brief Used for partial frame encoding*/ + struct { + uint8_t startCTU_x; + uint8_t startCTU_y; + uint16_t fullWidth; + uint16_t fullHeight; + } partial_coding; + + /** \brief Always consider CU without any quantized residual */ + uint8_t zero_coeff_rdo; + + /** \brief Currently unused parameter for OBA rc */ + int8_t frame_allocation; + + /** \brief used rc scheme, 0 for QP */ + int8_t rc_algorithm; + + /** \brief whether to use hadamard based bit allocation for intra frames or not */ + uint8_t intra_bit_allocation; + + uint8_t clip_neighbour; } kvz_config; /**
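A hedged sketch of driving the new options through the public API. The struct fields used below come straight from the hunk above; setting them directly (rather than through api->config_parse) is assumed to be acceptable here and skips whatever extra validation the string parser performs, so treat this as illustrative only.

#include <kvazaar.h>
#include <stdio.h>

int main(void)
{
  const kvz_api *api = kvz_api_get(8);          /* 8-bit build */
  kvz_config *cfg = api->config_alloc();
  if (!cfg || !api->config_init(cfg)) return 1;

  cfg->width  = 1280;                           /* hypothetical input size */
  cfg->height = 720;
  cfg->rc_algorithm   = KVZ_OBA;                /* optimal bit allocation rate control */
  cfg->target_bitrate = 2000000;                /* OBA needs a bitrate target */
  cfg->vaq = 5;                                 /* variance adaptive quantization strength */

  kvz_encoder *enc = api->encoder_open(cfg);
  if (!enc) { fprintf(stderr, "encoder_open failed\n"); return 1; }

  /* ... feed pictures with api->encoder_encode() and write out the chunks ... */

  api->encoder_close(enc);
  api->config_destroy(cfg);
  return 0;
}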
View file
kvazaar-2.0.0.tar.gz/src/ml_classifier_intra_depth_pred.c
Added
@@ -0,0 +1,808 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "ml_classifier_intra_depth_pred.h" + + +int tree_predict_merge_depth_1(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->merge_variance <= 140.3129) + { + if (p_features->var_of_sub_var <= 569.6553) + { + if (p_features->merge_variance <= 20.8854) + { + *p_nb_iter = 19428.0; + *p_nb_bad = 1740.0; + return -1.0000; + } + else if (p_features->sub_variance_0 <= 9.1015) + { + if (p_features->merge_variance <= 39.132) + { + *p_nb_iter = 1166.0; + *p_nb_bad = 358.0; + return -1.0000; + } + else { + *p_nb_iter = 1049.0; + *p_nb_bad = 392.0; + return 1.0000; + } + } + else { + *p_nb_iter = 9371.0; + *p_nb_bad = 1805.0; + return -1.0000; + } + } + else if (p_features->sub_variance_2 <= 23.3193) + { + *p_nb_iter = 1059.0; + *p_nb_bad = 329.0; + return 1.0000; + } + else if (p_features->sub_variance_1 <= 30.7348) + { + *p_nb_iter = 1042.0; + *p_nb_bad = 395.0; + return 1.0000; + } + else { + *p_nb_iter = 1756.0; + *p_nb_bad = 588.0; + return -1.0000; + } + } + else if (p_features->merge_variance <= 857.8047) + { + if (p_features->var_of_sub_var <= 66593.5553) + { + if (p_features->sub_variance_0 <= 12.1697) + { + *p_nb_iter = 2006.0; + *p_nb_bad = 374.0; + return 1.0000; + } + else if (p_features->neigh_variance_C <= 646.8204) + { + if (p_features->neigh_variance_A <= 664.7609) + { + if (p_features->neigh_variance_B <= 571.2004) + { + if (p_features->var_of_sub_mean <= 4.1069) + { + *p_nb_iter = 1208.0; + *p_nb_bad = 399.0; + return 1.0000; + } + else if (p_features->var_of_sub_var <= 11832.6635) + { + *p_nb_iter = 8701.0; + *p_nb_bad = 3037.0; + return -1.0000; + } + else if (p_features->neigh_variance_A <= 142.298) + { + *p_nb_iter = 1025.0; + *p_nb_bad = 290.0; + return 1.0000; + } + else if (p_features->variance <= 394.4839) + { + *p_nb_iter = 1156.0; + *p_nb_bad = 489.0; + return 1.0000; + } + else { + *p_nb_iter = 1150.0; + *p_nb_bad = 503.0; + return -1.0000; + } + } + else { + *p_nb_iter = 1777.0; + *p_nb_bad = 558.0; + return 1.0000; + } + } + else { + *p_nb_iter = 1587.0; + *p_nb_bad = 411.0; + return 1.0000; + } + } + else { + *p_nb_iter = 1980.0; + *p_nb_bad = 474.0; + return 1.0000; + } + } + else { + *p_nb_iter = 3613.0; + *p_nb_bad = 475.0; + return 1.0000; + } + } + else { + *p_nb_iter = 20926.0; + *p_nb_bad = 1873.0; + return 1.0000; + } +} + + + +int tree_predict_merge_depth_2(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->merge_variance <= 119.4611) + { + if (p_features->var_of_sub_var <= 1078.0638) + { + if (p_features->neigh_variance_B 
<= 70.2189) + { + *p_nb_iter = 29253.0; + *p_nb_bad = 3837.0; + return -1.0000; + } + else if (p_features->variance <= 20.8711) + { + *p_nb_iter = 1292.0; + *p_nb_bad = 458.0; + return 2.0000; + } + else { + *p_nb_iter = 1707.0; + *p_nb_bad = 399.0; + return -1.0000; + } + } + else if (p_features->var_of_sub_var <= 3300.4034) + { + *p_nb_iter = 1554.0; + *p_nb_bad = 675.0; + return -1.0000; + } + else { + *p_nb_iter = 1540.0; + *p_nb_bad = 429.0; + return 2.0000; + } + } + else if (p_features->merge_variance <= 696.1989) + { + if (p_features->var_of_sub_var <= 31803.3242) + { + if (p_features->sub_variance_2 <= 10.3845) + { + *p_nb_iter = 3473.0; + *p_nb_bad = 768.0; + return 2.0000; + } + else if (p_features->neigh_variance_C <= 571.5329) + { + if (p_features->neigh_variance_B <= 492.8159) + { + if (p_features->neigh_variance_B <= 38.9672) + { + *p_nb_iter = 1887.0; + *p_nb_bad = 588.0; + return 2.0000; + } + else if (p_features->neigh_variance_A <= 380.5927) + { + if (p_features->sub_variance_1 <= 19.9678) + { + *p_nb_iter = 1686.0; + *p_nb_bad = 721.0; + return 2.0000; + } + else if (p_features->neigh_variance_A <= 66.6749) + { + *p_nb_iter = 1440.0; + *p_nb_bad = 631.0; + return 2.0000; + } + else { + *p_nb_iter = 5772.0; + *p_nb_bad = 2031.0; + return -1.0000; + } + } + else { + *p_nb_iter = 1791.0; + *p_nb_bad = 619.0; + return 2.0000; + } + } + else { + *p_nb_iter = 1624.0; + *p_nb_bad = 494.0; + return 2.0000; + } + } + else { + *p_nb_iter = 1298.0; + *p_nb_bad = 312.0; + return 2.0000; + } + } + else { + *p_nb_iter = 4577.0; + *p_nb_bad = 892.0; + return 2.0000; + } + } + else { + *p_nb_iter = 21106.0; + *p_nb_bad = 2744.0; + return 2.0000; + } +} + + + +int tree_predict_merge_depth_3(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->merge_variance <= 80.1487) + { + if (p_features->neigh_variance_C <= 83.7148) + { + *p_nb_iter = 29806.0; + *p_nb_bad = 3603.0; + return -1.0000; + } + else { + *p_nb_iter = 1003.0; + *p_nb_bad = 421.0; + return 3.0000; + } + } + else if (p_features->merge_variance <= 351.8138) + { + if (p_features->neigh_variance_C <= 255.4236) + { + if (p_features->neigh_variance_B <= 260.5349) + { + if (p_features->var_of_sub_var <= 6381.513) + { + if (p_features->neigh_variance_A <= 244.2556) + { + if (p_features->sub_variance_0 <= 4.75) + { + *p_nb_iter = 1290.0; + *p_nb_bad = 525.0; + return 3.0000; + } + else if (p_features->neigh_variance_B <= 16.9287) + { + *p_nb_iter = 1045.0; + *p_nb_bad = 499.0; + return 3.0000; + } + else { + *p_nb_iter = 6901.0; + *p_nb_bad = 2494.0; + return -1.0000; + } + } + else { + *p_nb_iter = 1332.0; + *p_nb_bad = 408.0; + return 3.0000; + } + } + else { + *p_nb_iter = 2929.0; + *p_nb_bad = 842.0; + return 3.0000; + } + } + else { + *p_nb_iter = 2239.0; + *p_nb_bad = 572.0; + return 3.0000; + } + } + else { + *p_nb_iter = 2777.0; + *p_nb_bad = 714.0; + return 3.0000; + } + } + else { + *p_nb_iter = 30678.0; + *p_nb_bad = 5409.0; + return 3.0000; + } +} + + + +int tree_predict_merge_depth_4(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->neigh_variance_C <= 240.2773) + { + if (p_features->neigh_variance_B <= 227.5898) + { + if (p_features->neigh_variance_A <= 195.4844) + { + if (p_features->variance <= 203.3086) + { + if (p_features->qp <= 32) + { + if (p_features->neigh_variance_C <= 102.2344) + { + if (p_features->neigh_variance_B <= 116.4961) + { + if (p_features->variance <= 89.4023) + { + *p_nb_iter = 27398.0; + *p_nb_bad = 4665.0; + return -1.0000; + } + else { 
+ *p_nb_iter = 1676.0; + *p_nb_bad = 795.0; + return 4.0000; + } + } + else { + *p_nb_iter = 1405.0; + *p_nb_bad = 566.0; + return 4.0000; + } + } + else { + *p_nb_iter = 2827.0; + *p_nb_bad = 1173.0; + return 4.0000; + } + } + else { + *p_nb_iter = 8871.0; + *p_nb_bad = 822.0; + return -1.0000; + } + } + else { + *p_nb_iter = 3162.0; + *p_nb_bad = 718.0; + return 4.0000; + } + } + else { + *p_nb_iter = 6154.0; + *p_nb_bad = 1397.0; + return 4.0000; + } + } + else { + *p_nb_iter = 9385.0; + *p_nb_bad = 1609.0; + return 4.0000; + } + } + else { + *p_nb_iter = 19122.0; + *p_nb_bad = 2960.0; + return 4.0000; + } +} + + + +int tree_predict_split_depth_0(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->var_of_sub_var <= 12754.7856) + { + if (p_features->var_of_sub_var <= 137.9034) + { + *p_nb_iter = 25155.0; + *p_nb_bad = 2959.0; + return 0.0000; + } + else if (p_features->sub_variance_2 <= 13.2892) + { + *p_nb_iter = 1080.0; + *p_nb_bad = 383.0; + return -1.0000; + } + else if (p_features->variance <= 564.1738) + { + if (p_features->var_of_sub_var <= 1185.4728) + { + *p_nb_iter = 6067.0; + *p_nb_bad = 1699.0; + return 0.0000; + } + else if (p_features->var_of_sub_mean <= 46.2388) + { + if (p_features->sub_variance_0 <= 46.8708) + { + *p_nb_iter = 1088.0; + *p_nb_bad = 377.0; + return -1.0000; + } + else if (p_features->sub_variance_3 <= 61.4213) + { + *p_nb_iter = 1183.0; + *p_nb_bad = 498.0; + return -1.0000; + } + else { + *p_nb_iter = 3416.0; + *p_nb_bad = 1373.0; + return 0.0000; + } + } + else { + *p_nb_iter = 3769.0; + *p_nb_bad = 1093.0; + return 0.0000; + } + } + else { + *p_nb_iter = 1036.0; + *p_nb_bad = 434.0; + return -1.0000; + } + } + else if (p_features->var_of_sub_var <= 98333.8279) + { + if (p_features->variance <= 987.2333) + { + if (p_features->var_of_sub_var <= 37261.2896) + { + if (p_features->variance <= 238.2248) + { + *p_nb_iter = 1323.0; + *p_nb_bad = 301.0; + return -1.0000; + } + else if (p_features->var_of_sub_var <= 17347.3971) + { + *p_nb_iter = 1215.0; + *p_nb_bad = 550.0; + return 0.0000; + } + else if (p_features->qp <= 22) + { + *p_nb_iter = 1000.0; + *p_nb_bad = 493.0; + return 0.0000; + } + else { + *p_nb_iter = 2640.0; + *p_nb_bad = 1121.0; + return -1.0000; + } + } + else { + *p_nb_iter = 5188.0; + *p_nb_bad = 1248.0; + return -1.0000; + } + } + else { + *p_nb_iter = 2323.0; + *p_nb_bad = 274.0; + return -1.0000; + } + } + else { + *p_nb_iter = 21357.0; + *p_nb_bad = 1829.0; + return -1.0000; + } +} + + +int tree_predict_split_depth_1(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->var_of_sub_var <= 1138.9473) + { + *p_nb_iter = 32445.0; + *p_nb_bad = 4580.0; + return 1.0000; + } + else if (p_features->var_of_sub_var <= 27289.2117) + { + if (p_features->sub_variance_1 <= 12.0603) + { + *p_nb_iter = 1900.0; + *p_nb_bad = 401.0; + return -1.0000; + } + else if (p_features->var_of_sub_var <= 5841.4773) + { + if (p_features->variance <= 72.4175) + { + *p_nb_iter = 1000.0; + *p_nb_bad = 356.0; + return -1.0000; + } + else if (p_features->neigh_variance_A <= 633.8163) + { + *p_nb_iter = 5279.0; + *p_nb_bad = 1961.0; + return 1.0000; + } + else { + *p_nb_iter = 1176.0; + *p_nb_bad = 527.0; + return -1.0000; + } + } + else if (p_features->sub_variance_0 <= 38.3035) + { + *p_nb_iter = 1251.0; + *p_nb_bad = 293.0; + return -1.0000; + } + else if (p_features->neigh_variance_B <= 664.9494) + { + if (p_features->sub_variance_3 <= 45.8181) + { + *p_nb_iter = 1276.0; + *p_nb_bad = 471.0; + return -1.0000; 
+ } + else if (p_features->sub_variance_3 <= 404.3086) + { + if (p_features->sub_variance_1 <= 99.8715) + { + *p_nb_iter = 1005.0; + *p_nb_bad = 435.0; + return -1.0000; + } + else if (p_features->sub_variance_0 <= 282.3064) + { + *p_nb_iter = 1370.0; + *p_nb_bad = 539.0; + return 1.0000; + } + else { + *p_nb_iter = 1013.0; + *p_nb_bad = 495.0; + return -1.0000; + } + } + else { + *p_nb_iter = 1000.0; + *p_nb_bad = 379.0; + return -1.0000; + } + } + else { + *p_nb_iter = 2270.0; + *p_nb_bad = 679.0; + return -1.0000; + } + } + else { + *p_nb_iter = 29015.0; + *p_nb_bad = 3950.0; + return -1.0000; + } +} + + +int tree_predict_split_depth_2(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->var_of_sub_var <= 2597.4529) + { + if (p_features->var_of_sub_var <= 146.7734) + { + *p_nb_iter = 23216.0; + *p_nb_bad = 1560.0; + return 2.0000; + } + else if (p_features->merge_variance <= 259.6952) + { + *p_nb_iter = 7470.0; + *p_nb_bad = 1902.0; + return 2.0000; + } + else if (p_features->qp <= 27) + { + if (p_features->variance <= 73.9929) + { + *p_nb_iter = 1138.0; + *p_nb_bad = 486.0; + return -1.0000; + } + else { + *p_nb_iter = 1619.0; + *p_nb_bad = 716.0; + return 2.0000; + } + } + else { + *p_nb_iter = 2425.0; + *p_nb_bad = 861.0; + return 2.0000; + } + } + else if (p_features->var_of_sub_var <= 60850.5208) + { + if (p_features->var_of_sub_var <= 10144.602) + { + if (p_features->neigh_variance_C <= 926.8972) + { + if (p_features->sub_variance_0 <= 26.6006) + { + *p_nb_iter = 1796.0; + *p_nb_bad = 586.0; + return -1.0000; + } + else if (p_features->neigh_variance_A <= 493.5849) + { + if (p_features->neigh_variance_A <= 72.9516) + { + *p_nb_iter = 1326.0; + *p_nb_bad = 557.0; + return -1.0000; + } + else if (p_features->variance <= 156.4014) + { + *p_nb_iter = 1210.0; + *p_nb_bad = 563.0; + return -1.0000; + } + else { + *p_nb_iter = 1920.0; + *p_nb_bad = 817.0; + return 2.0000; + } + } + else { + *p_nb_iter = 1106.0; + *p_nb_bad = 437.0; + return -1.0000; + } + } + else { + *p_nb_iter = 1001.0; + *p_nb_bad = 278.0; + return -1.0000; + } + } + else { + *p_nb_iter = 13068.0; + *p_nb_bad = 3612.0; + return -1.0000; + } + } + else { + *p_nb_iter = 22705.0; + *p_nb_bad = 2687.0; + return -1.0000; + } +} + + + +int tree_predict_split_depth_3(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->var_of_sub_var <= 818.5173) + { + if (p_features->merge_variance <= 62.7641) + { + *p_nb_iter = 20568.0; + *p_nb_bad = 767.0; + return 3.0000; + } + else if (p_features->qp <= 27) + { + if (p_features->variance <= 9.4219) + { + *p_nb_iter = 1255.0; + *p_nb_bad = 206.0; + return 3.0000; + } + else if (p_features->merge_variance <= 375.2185) + { + *p_nb_iter = 3999.0; + *p_nb_bad = 1321.0; + return 3.0000; + } + else { + *p_nb_iter = 1786.0; + *p_nb_bad = 817.0; + return -1.0000; + } + } + else { + *p_nb_iter = 5286.0; + *p_nb_bad = 737.0; + return 3.0000; + } + } + else if (p_features->var_of_sub_var <= 37332.3018) + { + if (p_features->var_of_sub_var <= 7585.0282) + { + if (p_features->qp <= 32) + { + if (p_features->neigh_variance_C <= 330.2178) + { + if (p_features->sub_variance_0 <= 8.5273) + { + *p_nb_iter = 1114.0; + *p_nb_bad = 346.0; + return -1.0000; + } + else if (p_features->neigh_variance_B <= 221.5469) + { + if (p_features->var_of_sub_var <= 1989.7928) + { + *p_nb_iter = 1539.0; + *p_nb_bad = 606.0; + return 3.0000; + } + else if (p_features->variance <= 155.5974) + { + *p_nb_iter = 1298.0; + *p_nb_bad = 634.0; + return 3.0000; + } + else { 
+ *p_nb_iter = 1076.0; + *p_nb_bad = 456.0; + return -1.0000; + } + } + else { + *p_nb_iter = 1644.0; + *p_nb_bad = 639.0; + return -1.0000; + } + } + else { + *p_nb_iter = 2401.0; + *p_nb_bad = 713.0; + return -1.0000; + } + } + else if (p_features->merge_variance <= 281.9509) + { + *p_nb_iter = 1020.0; + *p_nb_bad = 262.0; + return 3.0000; + } + else { + *p_nb_iter = 1278.0; + *p_nb_bad = 594.0; + return -1.0000; + } + } + else { + *p_nb_iter = 10507.0; + *p_nb_bad = 2943.0; + return -1.0000; + } + } + else { + *p_nb_iter = 25229.0; + *p_nb_bad = 3060.0; + return -1.0000; + } +}
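At every leaf of these hand-unrolled decision trees the two out-parameters appear to carry training statistics: *p_nb_iter the number of training samples that reached the leaf and *p_nb_bad the number that were misclassified there, while the return value is the predicted class (-1 versus the depth the tree is specialised for). A caller that wants a rough confidence for a prediction could derive it from those counts; leaf_confidence below is a hypothetical helper written only for illustration, not part of kvazaar.

/* Hypothetical helper (not in kvazaar): turn the leaf statistics reported
 * through p_nb_iter / p_nb_bad into a rough confidence in [0, 1]. */
static double leaf_confidence(double nb_iter, double nb_bad)
{
  if (nb_iter <= 0.0) return 0.0;   /* no training samples reached this leaf */
  return 1.0 - nb_bad / nb_iter;    /* fraction of correctly classified samples */
}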
View file
kvazaar-2.0.0.tar.gz/src/ml_classifier_intra_depth_pred.h
Added
@@ -0,0 +1,38 @@ +#ifndef ML_CLASSIFIER_INTRA_DEPTH_PRED +#define ML_CLASSIFIER_INTRA_DEPTH_PRED + +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "ml_intra_cu_depth_pred.h" + + +int tree_predict_merge_depth_1(features_s* p_features, double* p_nb_iter, double* p_nb_bad); +int tree_predict_merge_depth_2(features_s* p_features, double* p_nb_iter, double* p_nb_bad); +int tree_predict_merge_depth_3(features_s* p_features, double* p_nb_iter, double* p_nb_bad); +int tree_predict_merge_depth_4(features_s* p_features, double* p_nb_iter, double* p_nb_bad); + + +int tree_predict_split_depth_0(features_s* p_features, double* p_nb_iter, double* p_nb_bad); +int tree_predict_split_depth_1(features_s* p_features, double* p_nb_iter, double* p_nb_bad); +int tree_predict_split_depth_2(features_s* p_features, double* p_nb_iter, double* p_nb_bad); +int tree_predict_split_depth_3(features_s* p_features, double* p_nb_iter, double* p_nb_bad); + +#endif \ No newline at end of file
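The header above exposes the eight per-depth classifiers. Below is a minimal sketch of driving one of them, assuming a features_s has been filled with the variance-based features (the values are made up purely for illustration). Judging from how ml_intra_cu_depth_pred.c consumes the results, a negative return favours changing the block size (splitting for the split trees, merging for the merge trees), while a non-negative return keeps the block at the tree's depth.

#include "ml_classifier_intra_depth_pred.h"

/* Sketch only: query the depth-0 split classifier with illustrative feature values. */
static void example_split_query(void)
{
  features_s f = {0};           /* start from all-zero features */
  f.variance       = 300.0;     /* illustrative values, not taken from a real CTU */
  f.var_of_sub_var = 15000.0;
  f.sub_variance_0 = 50.0;
  f.qp             = 27;

  double nb_iter = 0.0, nb_bad = 0.0;
  int pred = tree_predict_split_depth_0(&f, &nb_iter, &nb_bad);
  if (pred < 0) {
    /* the classifier suggests splitting the 64x64 block further */
  }
}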
View file
kvazaar-2.0.0.tar.gz/src/ml_intra_cu_depth_pred.c
Added
@@ -0,0 +1,1744 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "ml_intra_cu_depth_pred.h" + + +static int tree_predict_merge_depth_1(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->merge_variance <= 140.3129) + { + if (p_features->var_of_sub_var <= 569.6553) + { + if (p_features->merge_variance <= 20.8854) + { + *p_nb_iter = 19428.0; + *p_nb_bad = 1740.0; + return -1.0000; + } + else if (p_features->sub_variance_0 <= 9.1015) + { + if (p_features->merge_variance <= 39.132) + { + *p_nb_iter = 1166.0; + *p_nb_bad = 358.0; + return -1.0000; + } + else { + *p_nb_iter = 1049.0; + *p_nb_bad = 392.0; + return 1.0000; + } + } + else { + *p_nb_iter = 9371.0; + *p_nb_bad = 1805.0; + return -1.0000; + } + } + else if (p_features->sub_variance_2 <= 23.3193) + { + *p_nb_iter = 1059.0; + *p_nb_bad = 329.0; + return 1.0000; + } + else if (p_features->sub_variance_1 <= 30.7348) + { + *p_nb_iter = 1042.0; + *p_nb_bad = 395.0; + return 1.0000; + } + else { + *p_nb_iter = 1756.0; + *p_nb_bad = 588.0; + return -1.0000; + } + } + else if (p_features->merge_variance <= 857.8047) + { + if (p_features->var_of_sub_var <= 66593.5553) + { + if (p_features->sub_variance_0 <= 12.1697) + { + *p_nb_iter = 2006.0; + *p_nb_bad = 374.0; + return 1.0000; + } + else if (p_features->neigh_variance_C <= 646.8204) + { + if (p_features->neigh_variance_A <= 664.7609) + { + if (p_features->neigh_variance_B <= 571.2004) + { + if (p_features->var_of_sub_mean <= 4.1069) + { + *p_nb_iter = 1208.0; + *p_nb_bad = 399.0; + return 1.0000; + } + else if (p_features->var_of_sub_var <= 11832.6635) + { + *p_nb_iter = 8701.0; + *p_nb_bad = 3037.0; + return -1.0000; + } + else if (p_features->neigh_variance_A <= 142.298) + { + *p_nb_iter = 1025.0; + *p_nb_bad = 290.0; + return 1.0000; + } + else if (p_features->variance <= 394.4839) + { + *p_nb_iter = 1156.0; + *p_nb_bad = 489.0; + return 1.0000; + } + else { + *p_nb_iter = 1150.0; + *p_nb_bad = 503.0; + return -1.0000; + } + } + else { + *p_nb_iter = 1777.0; + *p_nb_bad = 558.0; + return 1.0000; + } + } + else { + *p_nb_iter = 1587.0; + *p_nb_bad = 411.0; + return 1.0000; + } + } + else { + *p_nb_iter = 1980.0; + *p_nb_bad = 474.0; + return 1.0000; + } + } + else { + *p_nb_iter = 3613.0; + *p_nb_bad = 475.0; + return 1.0000; + } + } + else { + *p_nb_iter = 20926.0; + *p_nb_bad = 1873.0; + return 1.0000; + } +} + + + +static int tree_predict_merge_depth_2(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->merge_variance <= 119.4611) + { + if (p_features->var_of_sub_var <= 1078.0638) + { + if 
(p_features->neigh_variance_B <= 70.2189) + { + *p_nb_iter = 29253.0; + *p_nb_bad = 3837.0; + return -1.0000; + } + else if (p_features->variance <= 20.8711) + { + *p_nb_iter = 1292.0; + *p_nb_bad = 458.0; + return 2.0000; + } + else { + *p_nb_iter = 1707.0; + *p_nb_bad = 399.0; + return -1.0000; + } + } + else if (p_features->var_of_sub_var <= 3300.4034) + { + *p_nb_iter = 1554.0; + *p_nb_bad = 675.0; + return -1.0000; + } + else { + *p_nb_iter = 1540.0; + *p_nb_bad = 429.0; + return 2.0000; + } + } + else if (p_features->merge_variance <= 696.1989) + { + if (p_features->var_of_sub_var <= 31803.3242) + { + if (p_features->sub_variance_2 <= 10.3845) + { + *p_nb_iter = 3473.0; + *p_nb_bad = 768.0; + return 2.0000; + } + else if (p_features->neigh_variance_C <= 571.5329) + { + if (p_features->neigh_variance_B <= 492.8159) + { + if (p_features->neigh_variance_B <= 38.9672) + { + *p_nb_iter = 1887.0; + *p_nb_bad = 588.0; + return 2.0000; + } + else if (p_features->neigh_variance_A <= 380.5927) + { + if (p_features->sub_variance_1 <= 19.9678) + { + *p_nb_iter = 1686.0; + *p_nb_bad = 721.0; + return 2.0000; + } + else if (p_features->neigh_variance_A <= 66.6749) + { + *p_nb_iter = 1440.0; + *p_nb_bad = 631.0; + return 2.0000; + } + else { + *p_nb_iter = 5772.0; + *p_nb_bad = 2031.0; + return -1.0000; + } + } + else { + *p_nb_iter = 1791.0; + *p_nb_bad = 619.0; + return 2.0000; + } + } + else { + *p_nb_iter = 1624.0; + *p_nb_bad = 494.0; + return 2.0000; + } + } + else { + *p_nb_iter = 1298.0; + *p_nb_bad = 312.0; + return 2.0000; + } + } + else { + *p_nb_iter = 4577.0; + *p_nb_bad = 892.0; + return 2.0000; + } + } + else { + *p_nb_iter = 21106.0; + *p_nb_bad = 2744.0; + return 2.0000; + } +} + + + +static int tree_predict_merge_depth_3(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->merge_variance <= 80.1487) + { + if (p_features->neigh_variance_C <= 83.7148) + { + *p_nb_iter = 29806.0; + *p_nb_bad = 3603.0; + return -1.0000; + } + else { + *p_nb_iter = 1003.0; + *p_nb_bad = 421.0; + return 3.0000; + } + } + else if (p_features->merge_variance <= 351.8138) + { + if (p_features->neigh_variance_C <= 255.4236) + { + if (p_features->neigh_variance_B <= 260.5349) + { + if (p_features->var_of_sub_var <= 6381.513) + { + if (p_features->neigh_variance_A <= 244.2556) + { + if (p_features->sub_variance_0 <= 4.75) + { + *p_nb_iter = 1290.0; + *p_nb_bad = 525.0; + return 3.0000; + } + else if (p_features->neigh_variance_B <= 16.9287) + { + *p_nb_iter = 1045.0; + *p_nb_bad = 499.0; + return 3.0000; + } + else { + *p_nb_iter = 6901.0; + *p_nb_bad = 2494.0; + return -1.0000; + } + } + else { + *p_nb_iter = 1332.0; + *p_nb_bad = 408.0; + return 3.0000; + } + } + else { + *p_nb_iter = 2929.0; + *p_nb_bad = 842.0; + return 3.0000; + } + } + else { + *p_nb_iter = 2239.0; + *p_nb_bad = 572.0; + return 3.0000; + } + } + else { + *p_nb_iter = 2777.0; + *p_nb_bad = 714.0; + return 3.0000; + } + } + else { + *p_nb_iter = 30678.0; + *p_nb_bad = 5409.0; + return 3.0000; + } +} + + + +static int tree_predict_merge_depth_4(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->neigh_variance_C <= 240.2773) + { + if (p_features->neigh_variance_B <= 227.5898) + { + if (p_features->neigh_variance_A <= 195.4844) + { + if (p_features->variance <= 203.3086) + { + if (p_features->qp <= 32) + { + if (p_features->neigh_variance_C <= 102.2344) + { + if (p_features->neigh_variance_B <= 116.4961) + { + if (p_features->variance <= 89.4023) + { + *p_nb_iter = 27398.0; + 
*p_nb_bad = 4665.0; + return -1.0000; + } + else { + *p_nb_iter = 1676.0; + *p_nb_bad = 795.0; + return 4.0000; + } + } + else { + *p_nb_iter = 1405.0; + *p_nb_bad = 566.0; + return 4.0000; + } + } + else { + *p_nb_iter = 2827.0; + *p_nb_bad = 1173.0; + return 4.0000; + } + } + else { + *p_nb_iter = 8871.0; + *p_nb_bad = 822.0; + return -1.0000; + } + } + else { + *p_nb_iter = 3162.0; + *p_nb_bad = 718.0; + return 4.0000; + } + } + else { + *p_nb_iter = 6154.0; + *p_nb_bad = 1397.0; + return 4.0000; + } + } + else { + *p_nb_iter = 9385.0; + *p_nb_bad = 1609.0; + return 4.0000; + } + } + else { + *p_nb_iter = 19122.0; + *p_nb_bad = 2960.0; + return 4.0000; + } +} + + + +static int tree_predict_split_depth_0(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->var_of_sub_var <= 12754.7856) + { + if (p_features->var_of_sub_var <= 137.9034) + { + *p_nb_iter = 25155.0; + *p_nb_bad = 2959.0; + return 0.0000; + } + else if (p_features->sub_variance_2 <= 13.2892) + { + *p_nb_iter = 1080.0; + *p_nb_bad = 383.0; + return -1.0000; + } + else if (p_features->variance <= 564.1738) + { + if (p_features->var_of_sub_var <= 1185.4728) + { + *p_nb_iter = 6067.0; + *p_nb_bad = 1699.0; + return 0.0000; + } + else if (p_features->var_of_sub_mean <= 46.2388) + { + if (p_features->sub_variance_0 <= 46.8708) + { + *p_nb_iter = 1088.0; + *p_nb_bad = 377.0; + return -1.0000; + } + else if (p_features->sub_variance_3 <= 61.4213) + { + *p_nb_iter = 1183.0; + *p_nb_bad = 498.0; + return -1.0000; + } + else { + *p_nb_iter = 3416.0; + *p_nb_bad = 1373.0; + return 0.0000; + } + } + else { + *p_nb_iter = 3769.0; + *p_nb_bad = 1093.0; + return 0.0000; + } + } + else { + *p_nb_iter = 1036.0; + *p_nb_bad = 434.0; + return -1.0000; + } + } + else if (p_features->var_of_sub_var <= 98333.8279) + { + if (p_features->variance <= 987.2333) + { + if (p_features->var_of_sub_var <= 37261.2896) + { + if (p_features->variance <= 238.2248) + { + *p_nb_iter = 1323.0; + *p_nb_bad = 301.0; + return -1.0000; + } + else if (p_features->var_of_sub_var <= 17347.3971) + { + *p_nb_iter = 1215.0; + *p_nb_bad = 550.0; + return 0.0000; + } + else if (p_features->qp <= 22) + { + *p_nb_iter = 1000.0; + *p_nb_bad = 493.0; + return 0.0000; + } + else { + *p_nb_iter = 2640.0; + *p_nb_bad = 1121.0; + return -1.0000; + } + } + else { + *p_nb_iter = 5188.0; + *p_nb_bad = 1248.0; + return -1.0000; + } + } + else { + *p_nb_iter = 2323.0; + *p_nb_bad = 274.0; + return -1.0000; + } + } + else { + *p_nb_iter = 21357.0; + *p_nb_bad = 1829.0; + return -1.0000; + } +} + + +static int tree_predict_split_depth_1(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->var_of_sub_var <= 1138.9473) + { + *p_nb_iter = 32445.0; + *p_nb_bad = 4580.0; + return 1.0000; + } + else if (p_features->var_of_sub_var <= 27289.2117) + { + if (p_features->sub_variance_1 <= 12.0603) + { + *p_nb_iter = 1900.0; + *p_nb_bad = 401.0; + return -1.0000; + } + else if (p_features->var_of_sub_var <= 5841.4773) + { + if (p_features->variance <= 72.4175) + { + *p_nb_iter = 1000.0; + *p_nb_bad = 356.0; + return -1.0000; + } + else if (p_features->neigh_variance_A <= 633.8163) + { + *p_nb_iter = 5279.0; + *p_nb_bad = 1961.0; + return 1.0000; + } + else { + *p_nb_iter = 1176.0; + *p_nb_bad = 527.0; + return -1.0000; + } + } + else if (p_features->sub_variance_0 <= 38.3035) + { + *p_nb_iter = 1251.0; + *p_nb_bad = 293.0; + return -1.0000; + } + else if (p_features->neigh_variance_B <= 664.9494) + { + if (p_features->sub_variance_3 <= 45.8181) 
+ { + *p_nb_iter = 1276.0; + *p_nb_bad = 471.0; + return -1.0000; + } + else if (p_features->sub_variance_3 <= 404.3086) + { + if (p_features->sub_variance_1 <= 99.8715) + { + *p_nb_iter = 1005.0; + *p_nb_bad = 435.0; + return -1.0000; + } + else if (p_features->sub_variance_0 <= 282.3064) + { + *p_nb_iter = 1370.0; + *p_nb_bad = 539.0; + return 1.0000; + } + else { + *p_nb_iter = 1013.0; + *p_nb_bad = 495.0; + return -1.0000; + } + } + else { + *p_nb_iter = 1000.0; + *p_nb_bad = 379.0; + return -1.0000; + } + } + else { + *p_nb_iter = 2270.0; + *p_nb_bad = 679.0; + return -1.0000; + } + } + else { + *p_nb_iter = 29015.0; + *p_nb_bad = 3950.0; + return -1.0000; + } +} + + +static int tree_predict_split_depth_2(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->var_of_sub_var <= 2597.4529) + { + if (p_features->var_of_sub_var <= 146.7734) + { + *p_nb_iter = 23216.0; + *p_nb_bad = 1560.0; + return 2.0000; + } + else if (p_features->merge_variance <= 259.6952) + { + *p_nb_iter = 7470.0; + *p_nb_bad = 1902.0; + return 2.0000; + } + else if (p_features->qp <= 27) + { + if (p_features->variance <= 73.9929) + { + *p_nb_iter = 1138.0; + *p_nb_bad = 486.0; + return -1.0000; + } + else { + *p_nb_iter = 1619.0; + *p_nb_bad = 716.0; + return 2.0000; + } + } + else { + *p_nb_iter = 2425.0; + *p_nb_bad = 861.0; + return 2.0000; + } + } + else if (p_features->var_of_sub_var <= 60850.5208) + { + if (p_features->var_of_sub_var <= 10144.602) + { + if (p_features->neigh_variance_C <= 926.8972) + { + if (p_features->sub_variance_0 <= 26.6006) + { + *p_nb_iter = 1796.0; + *p_nb_bad = 586.0; + return -1.0000; + } + else if (p_features->neigh_variance_A <= 493.5849) + { + if (p_features->neigh_variance_A <= 72.9516) + { + *p_nb_iter = 1326.0; + *p_nb_bad = 557.0; + return -1.0000; + } + else if (p_features->variance <= 156.4014) + { + *p_nb_iter = 1210.0; + *p_nb_bad = 563.0; + return -1.0000; + } + else { + *p_nb_iter = 1920.0; + *p_nb_bad = 817.0; + return 2.0000; + } + } + else { + *p_nb_iter = 1106.0; + *p_nb_bad = 437.0; + return -1.0000; + } + } + else { + *p_nb_iter = 1001.0; + *p_nb_bad = 278.0; + return -1.0000; + } + } + else { + *p_nb_iter = 13068.0; + *p_nb_bad = 3612.0; + return -1.0000; + } + } + else { + *p_nb_iter = 22705.0; + *p_nb_bad = 2687.0; + return -1.0000; + } +} + + + +static int tree_predict_split_depth_3(features_s* p_features, double* p_nb_iter, double* p_nb_bad) +{ + if (p_features->var_of_sub_var <= 818.5173) + { + if (p_features->merge_variance <= 62.7641) + { + *p_nb_iter = 20568.0; + *p_nb_bad = 767.0; + return 3.0000; + } + else if (p_features->qp <= 27) + { + if (p_features->variance <= 9.4219) + { + *p_nb_iter = 1255.0; + *p_nb_bad = 206.0; + return 3.0000; + } + else if (p_features->merge_variance <= 375.2185) + { + *p_nb_iter = 3999.0; + *p_nb_bad = 1321.0; + return 3.0000; + } + else { + *p_nb_iter = 1786.0; + *p_nb_bad = 817.0; + return -1.0000; + } + } + else { + *p_nb_iter = 5286.0; + *p_nb_bad = 737.0; + return 3.0000; + } + } + else if (p_features->var_of_sub_var <= 37332.3018) + { + if (p_features->var_of_sub_var <= 7585.0282) + { + if (p_features->qp <= 32) + { + if (p_features->neigh_variance_C <= 330.2178) + { + if (p_features->sub_variance_0 <= 8.5273) + { + *p_nb_iter = 1114.0; + *p_nb_bad = 346.0; + return -1.0000; + } + else if (p_features->neigh_variance_B <= 221.5469) + { + if (p_features->var_of_sub_var <= 1989.7928) + { + *p_nb_iter = 1539.0; + *p_nb_bad = 606.0; + return 3.0000; + } + else if (p_features->variance <= 
155.5974) + { + *p_nb_iter = 1298.0; + *p_nb_bad = 634.0; + return 3.0000; + } + else { + *p_nb_iter = 1076.0; + *p_nb_bad = 456.0; + return -1.0000; + } + } + else { + *p_nb_iter = 1644.0; + *p_nb_bad = 639.0; + return -1.0000; + } + } + else { + *p_nb_iter = 2401.0; + *p_nb_bad = 713.0; + return -1.0000; + } + } + else if (p_features->merge_variance <= 281.9509) + { + *p_nb_iter = 1020.0; + *p_nb_bad = 262.0; + return 3.0000; + } + else { + *p_nb_iter = 1278.0; + *p_nb_bad = 594.0; + return -1.0000; + } + } + else { + *p_nb_iter = 10507.0; + *p_nb_bad = 2943.0; + return -1.0000; + } + } + else { + *p_nb_iter = 25229.0; + *p_nb_bad = 3060.0; + return -1.0000; + } +} + + + + /** + * Allocate the structure and buffer + */ +ml_intra_ctu_pred_t* kvz_init_ml_intra_depth_const() { + ml_intra_ctu_pred_t* ml_intra_depth_ctu = NULL; + // Allocate the ml_intra_ctu_pred_t strucutre + ml_intra_depth_ctu = MALLOC(ml_intra_ctu_pred_t, 1); + if (!ml_intra_depth_ctu) { + fprintf(stderr, "Memory allocation failed!\n"); + assert(0); + } + // Set the number of number of deth add to 1 by default + ml_intra_depth_ctu->i_nb_addDepth = 1; + // Set the extra Upper Expansion in the upper_depth enabled by default + ml_intra_depth_ctu->b_extra_up_exp = true; + + // Allocate the depth matrices + ml_intra_depth_ctu->_mat_lower_depth = MALLOC(uint8_t, LCU_DEPTH_MAT_SIZE); + if (!ml_intra_depth_ctu->_mat_lower_depth) { + fprintf(stderr, "Memory allocation failed!\n"); + assert(0); + } + ml_intra_depth_ctu->_mat_upper_depth = MALLOC(uint8_t, LCU_DEPTH_MAT_SIZE); + if (!ml_intra_depth_ctu->_mat_upper_depth) { + fprintf(stderr, "Memory allocation failed!\n"); + assert(0); + } + + return ml_intra_depth_ctu; +}; + +/** +* Fee the bufer and structure +*/ +void kvz_end_ml_intra_depth_const(ml_intra_ctu_pred_t* ml_intra_depth_ctu) { + FREE_POINTER(ml_intra_depth_ctu->_mat_lower_depth); + FREE_POINTER(ml_intra_depth_ctu->_mat_upper_depth); + + FREE_POINTER(ml_intra_depth_ctu); +} + +// Initialize to 0 all the features +static void features_init_array(features_s* arr_features, int16_t _size, int _qp)//, int _NB_pixels) +{ + int16_t i = 0; + for (i = 0; i < _size; ++i) + { + arr_features[i].variance = 0.0; + arr_features[i].sub_variance_0 = 0.0; + arr_features[i].sub_variance_1 = 0.0; + arr_features[i].sub_variance_2 = 0.0; + arr_features[i].sub_variance_3 = 0.0; + arr_features[i].merge_variance = 0.0; + arr_features[i].neigh_variance_A = 0.0; + arr_features[i].neigh_variance_B = 0.0; + arr_features[i].neigh_variance_C = 0.0; + arr_features[i].var_of_sub_mean = 0.0; + arr_features[i].qp = _qp; + //arr_features[i].NB_pixels = _NB_pixels; + } +} + +/*! +* \brief Compute the average of a block inside an 8 bits 2D vector. +* +* \param _mat_src First depth map. +* \param _x X coordinate of the start of the block inside the matrix. +* \param _x_end X coordinate of the end of the block inside the matrix. +* \param _y Y coordinate of the start of the block inside the matrix. +* \param _y_end Y coordinate of the end of the block inside the matrix. +* \param _width Width of the matrix. +* \return average value of the block, -1 if error. 
+*/ +static INLINE double vect_average_blck_int8(const uint8_t* _mat_src, size_t _x, size_t _x_end, size_t _y, size_t _y_end, size_t _width) +{ + if (_mat_src == NULL) + { + fprintf(stderr, "null pointer as parameter."); + assert(0); + return -1.0; + } + double block_size = (double)(_x_end - _x) * (double)(_y_end - _y); + double avg_vect = 0.0; + //STD_print_matrix(_mat_src,64, 64); + for (size_t i_y = _y; i_y < _y_end; ++i_y) + { + size_t i_y_line = i_y * _width; + for (size_t i_x = _x; i_x < _x_end; ++i_x) + { + avg_vect = avg_vect + (double)_mat_src[i_x + i_y_line]; + } + } + return avg_vect / (double)(block_size); +} + +/*! +* \brief Compute the variance of a block inside an 8 bits 2D vector. +* +* \param _mat_src First depth map. +* \param _x X coordinate of the start of the block inside the matrix. +* \param _x_end X coordinate of the end of the block inside the matrix. +* \param _y Y coordinate of the start of the block inside the matrix. +* \param _y_end Y coordinate of the end of the block inside the matrix. +* \param _avg_blck Average value of the block. +* \param _width Width of the matrix. +* \return average value of the block, -1 if error. +*/ +static INLINE double vect_variance_blck_int8(const uint8_t* _mat_src, size_t _x, size_t _x_end, size_t _y, size_t _y_end, double _avg_blck, size_t _width) +{ + if (_mat_src == NULL) + { + fprintf(stderr, "null pointer as parameter."); + assert(0); + return -1.0; + } + double block_size = (double)(_x_end - _x) * (double)(_y_end - _y); + double variance = 0.0; + for (size_t i_y = _y; i_y < _y_end; ++i_y) + { + size_t i_y_line = i_y * _width; + for (size_t i_x = _x; i_x < _x_end; ++i_x) + { + variance = variance + pow2((double)(_mat_src[i_x + i_y_line]) - _avg_blck); + } + } + return variance / (double)(block_size); +} + +/*! +* \brief Function to compute the average and the variance of a pixel block inside of a LCU. +* +* \param arr_luma_px Array of the pixels of the block. +* \param i_xLcu X coordinate of the lcu. +* \param i_yLcu Y coordinate of the lcu. +* \param i_xBlck X coordinate of the pixel block inside the LCU. +* \param i_yBlck Y coordinate of the pixel block inside the LCU. +* \param i_blockSize Size of the block in pixels (4, 8, 16, 32 or 64). +* \param i_width Width of the frame in pixels. +* \param i_height Height of the frame in pixels. +* \param p_average Pointer to be filled with the average. +* \param p_variance Pointer to be filled with the variance; +* \return None. +*/ +static INLINE void features_var_avg_blck(uint8_t* arr_luma_px, uint32_t i_xLcu, uint32_t i_yLcu, + uint32_t i_xBlck, uint32_t i_yBlck, uint8_t i_blockSize, + int32_t i_width, int32_t i_height, + double* p_average, double* p_variance) +{ + uint32_t iXMax = CR_XMAX(i_xLcu, i_blockSize + i_xBlck, i_width); + uint32_t iYMax = CR_YMAX(i_yLcu, i_blockSize + i_yBlck, i_height); + *p_average = vect_average_blck_int8(arr_luma_px, i_xBlck, iXMax, i_yBlck, iYMax, 64); + *p_variance = vect_variance_blck_int8(arr_luma_px, i_xBlck, iXMax, i_yBlck, iYMax, (*p_average), 64); +} + +/*! +* \brief Function to combine the variance and mean values of four blocks. +* +* \param arr_var Array of 4*4 variances of the LCU. +* \param arr_avgLuma Array of 4*4 mean values of the LCU. +* \param i_x X coordinate of the top left block. +* \param i_y Y coordinate of the top left block. +* \param i_depth Depth of the blocks (0,1,2,3 or 4). +* \param p_varianceC Pointer to be filled with the combined variance. +* \param p_avgLumaC Pointer to be filled with the combined average. 
+* \return None. +*/ +static INLINE void features_combine_var(double* arr_var, double* arr_avgLuma, uint32_t i_x, uint32_t i_y, uint32_t i_depth, + double* p_varianceC, double* p_avgLumaC) +{ + double d_var_temp_1 = 0.0; + double d_var_temp_2 = 0.0; + double d_avg_temp_1 = 0.0; + double d_avg_temp_2 = 0.0; + + int16_t i_subCU = (i_x + (i_y << 4)) << (4 - i_depth); + int16_t i_rows = (16 << (3 - i_depth)); + + int16_t i_sb0 = i_subCU; /*!< Top left sub block index */ + int16_t i_sb1 = i_subCU + (1 << (3 - i_depth)); /*!< Top right sub block index */ + int16_t i_sb2 = i_subCU + i_rows; /*!< Bottom left sub block index */ + int16_t i_sb3 = i_subCU + i_rows + (1 << (3 - i_depth)); /*!< Bottom right sub block index */ + + d_avg_temp_1 = (arr_avgLuma[i_sb0] + arr_avgLuma[i_sb1]) / 2.0; + d_avg_temp_2 = (arr_avgLuma[i_sb2] + arr_avgLuma[i_sb3]) / 2.0; + + d_var_temp_1 = (2.0 * (arr_var[i_sb0] + arr_var[i_sb1]) + pow2((arr_avgLuma[i_sb0] - arr_avgLuma[i_sb1]))) / 4.0; + d_var_temp_2 = (2.0 * (arr_var[i_sb2] + arr_var[i_sb3]) + pow2((arr_avgLuma[i_sb2] - arr_avgLuma[i_sb3]))) / 4.0; + + if (p_avgLumaC) + { + *p_avgLumaC = (d_avg_temp_1 + d_avg_temp_2) / 2.0; + } + *p_varianceC = (2.0 * (d_var_temp_1 + d_var_temp_2) + pow2(d_avg_temp_1 - d_avg_temp_2)) / 4.0; +} + + +/*! +* \brief Function to combine the variance of the mean values of the sub block. +* +* \param arr_avgLuma Array of 4*4 mean values of the LCU. +* \param i_sb0 Index of the sub_blocks 0 in the array of avg values . +* \param i_sb1 Index of the sub_blocks 1 in the array of avg values. +* \param i_sb2 Index of the sub_blocks 2 in the array of avg values +* \param i_sb3 Index of the sub_blocks 3 in the array of avg values +* \return variance of the average of the sub blocks. +*/ +static INLINE double features_get_var_of_sub_mean(double* arr_avgLuma, int16_t i_sb0, int16_t i_sb1, int16_t i_sb2, int16_t i_sb3) +{ + double d_var = 0.0; + double d_avg = (arr_avgLuma[i_sb0] + arr_avgLuma[i_sb1] + arr_avgLuma[i_sb2] + arr_avgLuma[i_sb3]) / 4.0; + d_var = pow2(arr_avgLuma[i_sb0] - d_avg); + d_var = pow2(arr_avgLuma[i_sb1] - d_avg) + d_var; + d_var = pow2(arr_avgLuma[i_sb2] - d_avg) + d_var; + d_var = pow2(arr_avgLuma[i_sb3] - d_avg) + d_var; + return d_var / 4.0; +} + +/*! +* \brief Build the neighboring variances of four cu's. +* +* \param arr_features Array of features for current depth. +* \param _x X position of the first cu in the array. +* \param _y Y position of the first cu in the array. +* \param _depth Evaluated depth. +* \return None. 
+*/ +static void features_var_neighbor(features_s* arr_features, int16_t _x, int16_t _y, int16_t _depth) +{ + int16_t i_cu0 = (_x - 1) + ((_y - 1) << _depth); + int16_t i_cu1 = (_x)+((_y - 1) << _depth); + int16_t i_cu2 = (_x - 1) + (_y << _depth); + int16_t i_cu3 = _x + (_y << _depth); + + arr_features[i_cu0].neigh_variance_A = arr_features[i_cu1].variance; + arr_features[i_cu0].neigh_variance_B = arr_features[i_cu2].variance; + arr_features[i_cu0].neigh_variance_C = arr_features[i_cu3].variance; + + + arr_features[i_cu1].neigh_variance_A = arr_features[i_cu0].variance; + arr_features[i_cu1].neigh_variance_B = arr_features[i_cu2].variance; + arr_features[i_cu1].neigh_variance_C = arr_features[i_cu3].variance; + + + arr_features[i_cu2].neigh_variance_A = arr_features[i_cu0].variance; + arr_features[i_cu2].neigh_variance_B = arr_features[i_cu1].variance; + arr_features[i_cu2].neigh_variance_C = arr_features[i_cu3].variance; + + + arr_features[i_cu3].neigh_variance_A = arr_features[i_cu0].variance; + arr_features[i_cu3].neigh_variance_B = arr_features[i_cu1].variance; + arr_features[i_cu3].neigh_variance_C = arr_features[i_cu2].variance; +} + + +/*! +* \brief Extract the features from the pixels for a given different depth. +* +* \param arr_features Array of features to be retrieved for the current depth. +* \param i_depth Depth to be evaluated. +* \param arr_var Array of 16*16 variances. +* \param arr_avg Array of 16*16 average lumas. +* \return None. +*/ +static void features_compute(features_s* arr_features, uint8_t i_depth, double* arr_var, double* arr_avg) +{ + double d_avgLumaC; + + int8_t i_nbBlock = (1 << i_depth); + + for (int8_t y = 0; y < i_nbBlock; ++y) + { + for (int8_t x = 0; x < i_nbBlock; ++x) + { + int16_t i_cu = x + (y << i_depth); + if (i_depth == 4) + { + arr_features[i_cu].variance = arr_var[i_cu]; + } + else + { + features_combine_var(arr_var, arr_avg, x, y, i_depth, &arr_features[i_cu].variance, &d_avgLumaC); + int16_t i_CU_4 = (x << (4 - i_depth)) + (y << (8 - i_depth)); + int16_t i_rows = (16 << (3 - i_depth)); + arr_features[i_cu].var_of_sub_mean = features_get_var_of_sub_mean(arr_avg, + i_CU_4, + i_CU_4 + (1 << (3 - i_depth)), + i_CU_4 + i_rows, + i_CU_4 + i_rows + (1 << (3 - i_depth))); + arr_avg[i_CU_4] = d_avgLumaC; + arr_var[i_CU_4] = arr_features[i_cu].variance; + } + if (x % 2 == 1 && + y % 2 == 1) + { + features_var_neighbor(arr_features, x, y, i_depth); + } + + } + } +} + + + +/*! +* \brief Set the features Sub_var from the sub level for a given different depth. +* +* \param arr_features Array of features to be retrieved for the current depth. +* \param arr_sub_features Array of features to be retrieved for the sub depth (depth - 1). +* \param i_rdepth Depth to be evaluated. + +* \return None. 
+*/ +static void features_sub_var(features_s* arr_features, features_s* arr_sub_features, uint8_t i_depth) +{ + int8_t i_nbBlock = (1 << i_depth); + + for (int8_t y = 0; y < i_nbBlock; ++y) + { + for (int8_t x = 0; x < i_nbBlock; ++x) + { + int16_t i_cu = x + (y << i_depth); + int16_t i_sb0 = (x << 1) + (y << (2 + i_depth)); /*!< Top left sub block index */ + int16_t i_sb1 = (x << 1) + 1 + (y << (2 + i_depth)); /*!< Top right sub block index */ + int16_t i_sb2 = (x << 1) + (((y << 1) + 1) << (1 + i_depth)); /*!< Bottom left sub block index */ + int16_t i_sb3 = (x << 1) + 1 + (((y << 1) + 1) << (1 + i_depth)); /*!< Bottom right sub block index */ + + + arr_features[i_cu].sub_variance_0 = arr_sub_features[i_sb0].variance; + arr_features[i_cu].sub_variance_1 = arr_sub_features[i_sb1].variance; + arr_features[i_cu].sub_variance_2 = arr_sub_features[i_sb2].variance; + arr_features[i_cu].sub_variance_3 = arr_sub_features[i_sb3].variance; + + } + } +} + + +/*! +* \brief Set the features Merge_var from the up level for a given different depth. +* +* \param arr_features Array of features to be retrieved for the current depth. +* \param arr_up_features Array of features to be retrieved for the upper depth (depth - 1). +* \param i_rdepth Depth to be evaluated. + +* \return None. +*/ +static void features_merge_var(features_s* arr_features, features_s* arr_up_features, uint8_t i_rdepth) +{ + uint8_t i_depth = i_rdepth - 1; + int8_t i_nbBlock = (1 << i_depth); + + for (int8_t y = 0; y < i_nbBlock; ++y) + { + for (int8_t x = 0; x < i_nbBlock; ++x) + { + int16_t i_cu = x + (y << i_depth); + int16_t i_sb0 = (x << 1) + (y << (2 + i_depth)); /*!< Top left sub block index */ + int16_t i_sb1 = (x << 1) + 1 + (y << (2 + i_depth)); /*!< Top right sub block index */ + int16_t i_sb2 = (x << 1) + (((y << 1) + 1) << (1 + i_depth)); /*!< Bottom left sub block index */ + int16_t i_sb3 = (x << 1) + 1 + (((y << 1) + 1) << (1 + i_depth)); /*!< Bottom right sub block index */ + + arr_features[i_sb0].merge_variance = arr_up_features[i_cu].variance; + arr_features[i_sb1].merge_variance = arr_up_features[i_cu].variance; + arr_features[i_sb2].merge_variance = arr_up_features[i_cu].variance; + arr_features[i_sb3].merge_variance = arr_up_features[i_cu].variance; + + } + } +} + + +/*! +* \brief Set the features Var_of_sub_var from the sub level for a given different depth. +* +* \param arr_features Array of features to be retrieved for the current depth. +* \param i_rdepth Depth to be evaluated. + +* \return None. +*/ +static void features_var_of_sub_var(features_s* arr_features, uint8_t i_depth) +{ + int8_t i_nbBlock = (1 << i_depth); + + for (int8_t y = 0; y < i_nbBlock; ++y) + { + for (int8_t x = 0; x < i_nbBlock; ++x) + { + int16_t i_cu = x + (y << i_depth); + double d_var = 0.0; + double d_avg = (arr_features[i_cu].sub_variance_0 + arr_features[i_cu].sub_variance_1 + arr_features[i_cu].sub_variance_2 + arr_features[i_cu].sub_variance_3) / 4.0; + + d_var = pow2(arr_features[i_cu].sub_variance_0 - d_avg); + d_var = pow2(arr_features[i_cu].sub_variance_1 - d_avg) + d_var; + d_var = pow2(arr_features[i_cu].sub_variance_2 - d_avg) + d_var; + d_var = pow2(arr_features[i_cu].sub_variance_3 - d_avg) + d_var; + arr_features[i_cu].var_of_sub_var = d_var / 4.0; + } + } +} + + +/*! +* \brief Extract the features from the pixels for all the depth. +* +* \param main_handler Pointer to the main high level reduction handler. +* \param p_state Pointer to the state of the current LCU. 
+* \param arr_features_4 Array of features for level of depth 4. +* \param arr_features_8 Array of features for level of depth 3. +* \param arr_features_16 Array of features for level of depth 2. +* \param arr_features_32 Array of features for level of depth 1. +* \param p_features64 Pointer to the features of depth 0. +* \return None. +*/ +static void features_compute_all(features_s* arr_features[5], uint8_t* luma_px) +{ + + uint32_t x_px = 0; /*!< Top left X of the lcu */ + uint32_t y_px = 0; /*!< Top left Y of the lcu */ + double variance[256] = { 0.0 }; + double avg_luma[256] = { 0.0 }; + + + features_s* arr_features_4 = arr_features[4]; + features_s* arr_features_8 = arr_features[3]; + features_s* arr_features_16 = arr_features[2]; + features_s* arr_features_32 = arr_features[1]; + features_s* p_features64 = arr_features[0]; + + /*!< Compute the variance for all 4*4 blocs */ + for (int8_t y = 0; y < 8; ++y) + { + for (int8_t x = 0; x < 8; ++x) + { + int16_t x_blck = (x << 1); + int16_t y_blck = (y << 1); + features_var_avg_blck(luma_px, x_px, y_px, x_blck << 2, y_blck << 2, 4, LCU_WIDTH, LCU_WIDTH, + &avg_luma[CR_GET_CU_D4(x_blck, y_blck, 4)], + &variance[CR_GET_CU_D4(x_blck, y_blck, 4)]); + + features_var_avg_blck(luma_px, x_px, y_px, (x_blck + 1) << 2, y_blck << 2, 4, LCU_WIDTH, LCU_WIDTH, + &avg_luma[CR_GET_CU_D4(x_blck + 1, y_blck, 4)], + &variance[CR_GET_CU_D4(x_blck + 1, y_blck, 4)]); + features_var_avg_blck(luma_px, x_px, y_px, x_blck << 2, (y_blck + 1) << 2, 4, LCU_WIDTH, LCU_WIDTH, + &avg_luma[CR_GET_CU_D4(x_blck, y_blck + 1, 4)], + &variance[CR_GET_CU_D4(x_blck, y_blck + 1, 4)]); + features_var_avg_blck(luma_px, x_px, y_px, (x_blck + 1) << 2, (y_blck + 1) << 2, 4, LCU_WIDTH, LCU_WIDTH, + &avg_luma[CR_GET_CU_D4(x_blck + 1, y_blck + 1, 4)], + &variance[CR_GET_CU_D4(x_blck + 1, y_blck + 1, 4)]); + + } + } + + /* Compute the generic features of the all depth */ + features_compute(arr_features_4, 4, variance, avg_luma); + features_compute(arr_features_8, 3, variance, avg_luma); + features_compute(arr_features_16, 2, variance, avg_luma); + features_compute(arr_features_32, 1, variance, avg_luma); + features_compute(p_features64, 0, variance, avg_luma); + + /* Set the Sub_var features for the depth 3, 2, 1, 0*/ + features_sub_var(arr_features_8, arr_features_4, 3); + features_sub_var(arr_features_16, arr_features_8, 2); + features_sub_var(arr_features_32, arr_features_16, 1); + features_sub_var(p_features64, arr_features_32, 0); + + /* Set the Merge_var features for the depth 4, 3, 2, 1*/ + features_merge_var(arr_features_4, arr_features_8, 4); + features_merge_var(arr_features_8, arr_features_16, 3); + features_merge_var(arr_features_16, arr_features_32, 2); + features_merge_var(arr_features_32, p_features64, 1); + + /* Compute the Var_of_sub_var for the depth 3, 2, 1, 0*/ + features_var_of_sub_var(arr_features_8, 3); + features_var_of_sub_var(arr_features_16, 2); + features_var_of_sub_var(arr_features_32, 1); + features_var_of_sub_var(p_features64, 0); + +} + +/*! +* \brief Compute the constrain on the neighboring depth of a cu for +* a given depth for a BU approach +* +* \param arr_depthMap 8*8 depth map. +* \param _x X coordinate of the cu in the 8*8 depth map; +* \param _y Y coordinate of the cu in the 8*8 depth map; +* \param _depth Current depth tested. +* \param _level number of depth gap that we want +* \return 1 if the predictions should be tested for this cu, 0 else. 
+*/ +static int neighbor_constrain_bu(uint8_t* arr_depthMap, int _x, int _y, int _depth, int _level) +{ + int nb_block = (8 >> (_depth)) << 1; + for (int y = _y; y < _y + nb_block; ++y) + { + for (int x = _x; x < _x + nb_block; ++x) + { + if (arr_depthMap[x + (y << 3)] - _level >= _depth) + return 0; + } + } + return 1; +} + + + +static int8_t combined_tree_function(int8_t merge_prediction[4], int8_t split_prediction, uint8_t test_id, uint8_t depth) +{ + int8_t prediction; + int8_t pred_merge_tmp = 0; // NUmber of sub-blocks non merge (=d) + for (int8_t i = 0; i < 4; i++) { + pred_merge_tmp += (merge_prediction[i] > 0) ? 1 : 0; + } + switch (test_id) {// We don't merge (-1) if : + case 0: // At least one sub block non merge + prediction = (pred_merge_tmp >= 1) ? depth : -1; + break; + case 1: // At least two sub blocks non merge + prediction = (pred_merge_tmp >= 2) ? depth : -1; + break; + case 2: // At least three sub blocks non merge + prediction = (pred_merge_tmp >= 3) ? depth : -1; + break; + case 3: // All sub blocks non merge + prediction = (pred_merge_tmp >= 4) ? depth : -1; + break; + case 4: // Up bock non merge ( = split) + prediction = (split_prediction == -1) ? depth : -1; + break; + case 5: // (At least one sub block non merge) & Up block non merge + prediction = ((pred_merge_tmp >= 1) && (split_prediction == -1)) ? depth : -1; + break; + case 6: // (At least two sub blocks non merge) & Up block non merge + prediction = ((pred_merge_tmp >= 2) && (split_prediction == -1)) ? depth : -1; + break; + case 7: // (At least three sub blocks non merge) & Up block non merge + prediction = ((pred_merge_tmp >= 3) && (split_prediction == -1)) ? depth : -1; + break; + case 8: // (All sub blocks non merge) & Up block non merge + prediction = ((pred_merge_tmp >= 4) && (split_prediction == -1)) ? depth : -1; + break; + case 9: // (At least one sub block non merge) | Up block non merge + prediction = ((pred_merge_tmp >= 1) || (split_prediction == -1)) ? depth : -1; + break; + case 10: // (At least two sub blocks non merge) | Up block non merge + prediction = ((pred_merge_tmp >= 2) || (split_prediction == -1)) ? depth : -1; + break; + case 11: // (At least three sub blocks non merge) | Up block non merge + prediction = ((pred_merge_tmp >= 3) || (split_prediction == -1)) ? depth : -1; + break; + case 12: // (All sub blocks non merge) | Up block non merge + prediction = ((pred_merge_tmp >= 4) || (split_prediction == -1)) ? depth : -1; + break; + default: + prediction = 0; + } + + return prediction; +} + + +static void fill_depth_matrix_8(uint8_t* matrix, vect_2D* cu, int8_t curr_depth, int8_t val) +{ + //convert cu coordinate + int32_t x = cu->x; + int32_t y = cu->y; + int i = 0; + int32_t block = (8 >> curr_depth); //nb blocks in 8*8 block + for (i = y; i < y + block; ++i) + { + memset(matrix + x + (i << 3), val, block); + } +} + + +/*! +* \brief Generate the PUM depth map in a 8*8 array for a given depth with a Buttom-Up approach. +* +* \param arr_depthMap Array of the depth map. +* \param arr_features_cur Array of features for current depth (i_depth). +* \param arr_features_up Array of features for up depth (i_depth-1). +* \param i_depth Current depth tested. +* \param _level Number of level tested when the algo is Restrained (limited) +* \param limited_flag 0 to not test that the 4 blocks are at the same depth +* 1 to only merge a bloc if the 4 sub blocks are at the same depth +* \param depth_flag 0 to not use depth features +* 1 to use use depth features +* \return None. 
+*/ +static void ml_os_qt_gen(uint8_t* arr_depthMap, features_s* arr_features_cur, features_s* arr_features_up, uint8_t i_depth, int _level, uint8_t limited_flag) +{ + + + tree_predict predict_func_merge[4] = { + tree_predict_merge_depth_1, + tree_predict_merge_depth_2, + tree_predict_merge_depth_3, + tree_predict_merge_depth_4 + }; + + tree_predict predict_func_split[4] = { + tree_predict_split_depth_0, + tree_predict_split_depth_1, + tree_predict_split_depth_2, + tree_predict_split_depth_3 + }; + + tree_predict prediction_function_merge = predict_func_merge[i_depth - 1]; + tree_predict prediction_function_split = predict_func_split[i_depth - 1]; + + double d_nb_iter; + double d_nb_bad; + + uint8_t i_rdepth = i_depth < 4 ? i_depth : 3; + + int16_t i_nbBlocks = 2 << (i_depth - 1); + + int inc = 2; + for (int16_t y = 0; y < i_nbBlocks; y += inc) + { + for (int16_t x = 0; x < i_nbBlocks; x += inc) + { + uint8_t check_flag = 1; + /*!< Check if neighboring blocks are of the same size */ + if ((limited_flag == 1) && (i_depth != 4)) + { + check_flag = neighbor_constrain_bu(arr_depthMap, x << (3 - i_depth), y << (3 - i_depth), i_depth, _level); + } + + if (check_flag) + { + int16_t i_cu_0 = x + (y << i_depth); + int16_t i_cu_1 = x + 1 + (y << i_depth); + int16_t i_cu_2 = x + ((y + 1) << i_depth); + int16_t i_cu_3 = x + 1 + ((y + 1) << i_depth); + int16_t i_cu_up = x / 2 + (y / 2 << (i_depth - 1)); + + + int8_t merge_prediction[4]; + int8_t split_prediction; + + + merge_prediction[0] = prediction_function_merge(&arr_features_cur[i_cu_0], &d_nb_iter, &d_nb_bad); + merge_prediction[1] = prediction_function_merge(&arr_features_cur[i_cu_1], &d_nb_iter, &d_nb_bad); + merge_prediction[2] = prediction_function_merge(&arr_features_cur[i_cu_2], &d_nb_iter, &d_nb_bad); + merge_prediction[3] = prediction_function_merge(&arr_features_cur[i_cu_3], &d_nb_iter, &d_nb_bad); + split_prediction = prediction_function_split(&arr_features_up[i_cu_up], &d_nb_iter, &d_nb_bad); + + int8_t pred = combined_tree_function(merge_prediction, split_prediction, (i_depth >= 4) ? 8 : 9, i_depth); + int condition = (pred < 0) ? 1 : 0; + + if (condition) + { + int16_t i_subCU = CR_GET_CU_D3((i_depth < 4 ? x : x / 2), (i_depth < 4 ? 
y : y / 2), i_rdepth); + vect_2D tmp; + tmp.x = i_subCU % 8; + tmp.y = i_subCU / 8; + fill_depth_matrix_8(arr_depthMap, &tmp, i_depth - 1, i_depth - 1); + } + } + } + } +} + + + +static void os_luma_qt_pred(ml_intra_ctu_pred_t* ml_intra_depth_ctu, uint8_t* luma_px, int8_t qp, uint8_t* arr_CDM) +{ + // Features array per depth + features_s arr_features_4[256]; + features_s arr_features_8[64]; + features_s arr_features_16[16]; + features_s arr_features_32[4]; + features_s features64; + + // Initialize to 0 all the features + features_init_array(arr_features_4, 256, qp); + features_init_array(arr_features_8, 64, qp); + features_init_array(arr_features_16, 16, qp); + features_init_array(arr_features_32, 4, qp); + features_init_array(&features64, 1, qp); + + // Commpute the features for the current CTU for all depth + features_s* arr_features[5]; + arr_features[0] = &features64; + arr_features[1] = arr_features_32; + arr_features[2] = arr_features_16; + arr_features[3] = arr_features_8; + arr_features[4] = arr_features_4; + + + features_compute_all(arr_features, luma_px); + + // Generate the CDM for the current CTU + + /*!< Set the depth map to 4 by default */ + memset(arr_CDM, 4, 64); + ml_os_qt_gen(arr_CDM, arr_features_4, arr_features_8, 4, 1, RESTRAINED_FLAG); + + + ml_os_qt_gen(arr_CDM, arr_features_8, arr_features_16, 3, 1, RESTRAINED_FLAG); + ml_os_qt_gen(arr_CDM, arr_features_16, arr_features_32, 2, 1, RESTRAINED_FLAG); + ml_os_qt_gen(arr_CDM, arr_features_32, &features64, 1, 1, RESTRAINED_FLAG); + + + +} + +static void fill_matrix_with_depth(uint8_t* matrix, int32_t x, int32_t y, int8_t depth) +{ + int i = 0; + int32_t block = depth < 4 ? (8 >> depth) : 1; //nb blocks in 8*8 block + for (i = y; i < y + block; ++i) + { + memset(matrix + x + (i << 3), depth, block); + } +} + +/*! +* \brief Merge the depth of the blocks of a depth map if +* four blocks of the same depths are found. +* +* \param _mat_seed Array of the depth used as seed for the merge (WARNING: must be the same as arrDepthMerge (tmp)). +* \param _mat_dst Array of the depth merged. +* \return 1 if blocks have been merged, 0 else. 
+*/ +static uint8_t merge_matrix_64(uint8_t* _mat_seed, uint8_t* _mat_dst) +{ + uint8_t i_depth = 0; + uint32_t nb_block = 0; + uint8_t retval = 0; + uint8_t mat_tmp[64]; + memcpy(mat_tmp, _mat_seed, 64); + for (uint_fast8_t i_y = 0; i_y < 8; ++i_y) + { + for (uint_fast8_t i_x = 0; i_x < 8; ++i_x) + { + i_depth = mat_tmp[i_x + (i_y << 3)]; + + if (i_depth == 4) + { + _mat_dst[i_x + (i_y << 3)] = 3;/*!< All depth 4 blocks are merged by default to depth 3 */ + retval = 1; + continue; /*!< Skip the modulo operations and conditional tests */ + } + + if (i_depth == 0) /*!< Skip all the loop process, since 0 depth means there will be no other depths tested */ + { + _mat_dst[i_x + (i_y << 3)] = i_depth; + memset(_mat_dst, 0, 64); + goto exit_64; + } + + nb_block = (16 >> i_depth); /*!< Offset to go check the three other blocks */ + /*!< Check if we are on the fourth block of a depth*/ + if ((i_x % nb_block == (8 >> i_depth)) && + (i_y % nb_block == (8 >> i_depth))) + { + retval = 1; + nb_block = (8 >> i_depth); /*!< Generate the real offset for the array */ + /* + * x 0 1 2 3 4 5 6 7 + * y + * 0 3 3 2 2 1 1 1 1 + * 1 3 3 2 2 1 1 1 1 + * 2 2 2 2 2 1 1 1 1 + * 3 2 2 2 2 1 1 1 1 + * 4 1 1 1 1 2 2 2 2 + * 5 1 1 1 1 2 2 2 2 + * 6 1 1 1 1 2 2 2 2 + * 7 1 1 1 1 2 2 2 2 + * + * exemple for the first fourth block of depth 2 : + * 8 >> 2 = 2 + * nb_block = 4 -> x % 4 == 2 -> x = 2 + * -> y % 4 == 2 -> y = 2 + * nb_block = 2 -> check blocs[(0,2),(2,0),(0,0)] + * all informations are available + */ + if (mat_tmp[i_x - nb_block + (i_y << 3)] == i_depth && + mat_tmp[i_x + ((i_y - nb_block) << 3)] == i_depth && + mat_tmp[i_x - nb_block + ((i_y - nb_block) << 3)] == i_depth) + { + fill_matrix_with_depth(_mat_dst, i_x - nb_block, i_y - nb_block, i_depth - 1); + } + } + } + } +exit_64: + return retval; +} + + + +/*! +* \brief Perform an in place element wise mask between the two matrix. +* +* \param _mat_mask Matrix containing result of the mask (input/output). +* \param _mat_src Matrix used for the mask (input). +* \param _size_w Width of the matrix. +* \param _size_h Height of the matrix. +* \return None. +*/ +static void matrix_mask(uint8_t* _mat_mask, const uint8_t* _mat_src, size_t _size_w, size_t _size_h) +{ + if (_mat_mask == NULL || _mat_src == NULL) + { + fprintf(stderr, "null pointer as parameter."); + assert(0); + return; + } + size_t i_size = _size_h * _size_w; + for (size_t i = 0; i < i_size; ++i) + { + _mat_mask[i] = (_mat_mask[i] ^ _mat_src[i]) != 0 ? 1 : 0; + } +} + + +/*! +* \brief Add 1 depth level to the depth map. If d + 1 > 4 then d - 1 is done. +* This function use a mask to add level only on selected roi. +* +* \param _mat_sup Original upper depth map . +* \param _mat_inf Lower depth map. +* \param _mat_sup_dst Final upper depth map (WARNING: must be a different array as _mat_sup as it can be modified). +* \param _nb_level The number of level there should be between inf and sup_dst. +* \param _mat_roi Mask used to determine which area should be modified on the _mat_inf (convention is 0 for changed area and 1 else). +* \return None. 
+*/ +static void matrix_add_level_roi(const uint8_t* _mat_sup, uint8_t* _mat_inf, uint8_t* _mat_sup_dst, int8_t _nb_level, const uint8_t* _mat_roi) +{ + int8_t x = 0, y = 0; + int8_t i_depth = 0; + for (y = 0; y < 8; ++y) + { + for (x = 0; x < 8; ++x) + { + if ((!_mat_roi[x + (y << 3)]) == 1) + { + i_depth = _mat_sup[x + (y << 3)]; + if (i_depth == 4) + { + int8_t i_depth_sup = _mat_sup_dst[x + (y << 3)]; + _mat_inf[x + (y << 3)] = 4; + if (i_depth_sup == 4) + { + _mat_sup_dst[x + (y << 3)] = 3; + } + else if (i_depth_sup > 0 && abs(i_depth_sup - 4) < _nb_level) + { + fill_matrix_with_depth(_mat_sup_dst, (x & (~(8 >> (i_depth_sup)))), (y & (~(8 >> (i_depth_sup)))), i_depth_sup - 1); + } + continue; + } + else if (i_depth == 3) + { + _mat_inf[x + (y << 3)] = 4; + continue; + } + else if (abs(_mat_inf[x + (y << 3)] - _mat_sup[x + (y << 3)]) != _nb_level) + { + fill_matrix_with_depth(_mat_inf, x, y, i_depth + 1); + } + x += (8 >> (i_depth + 1)) - 1; + } + } + } +} + +/*! +* \brief Generate a search interval of controlled level around a MEP seed. +* +* \param _mat_depth_min Upper depth map (considered as the MEP on call). +* \param _mat_depth_max Lower depth map (considered initialized with the MEP values). +* \param _nb_level Fixed distance between the two generated depth map. +* \return None. +*/ +static void generate_interval_from_os_pred(ml_intra_ctu_pred_t* ml_intra_depth_ctu, uint8_t* _mat_depth_MEP) +{ + uint8_t* _mat_depth_min = ml_intra_depth_ctu->_mat_upper_depth; + uint8_t* _mat_depth_max = ml_intra_depth_ctu->_mat_lower_depth; + int8_t _nb_level = ml_intra_depth_ctu->i_nb_addDepth; + + memcpy(_mat_depth_min, _mat_depth_MEP, 64 * sizeof(uint8_t)); + memcpy(_mat_depth_max, _mat_depth_MEP, 64 * sizeof(uint8_t)); + if (_nb_level <= 0) + { + return; + } + else if (_nb_level >= 4) + { + memset(_mat_depth_min, 0, 64 * sizeof(uint8_t)); + memset(_mat_depth_max, 4, 64 * sizeof(uint8_t)); + return; + } + uint8_t mat_ref[64]; /*!< Matrix used to store the ref map */ + uint8_t mat_mask[64]; /*!< Matrix used as mask */ + uint8_t mat_max[64]; /*!< Matrix used to store current depth map max */ + + for (int j = 0; j < _nb_level; ++j) + { + /*!< Copy the original map seed */ + memcpy(mat_ref, _mat_depth_min, 64 * sizeof(uint8_t)); + memcpy(mat_mask, _mat_depth_min, 64 * sizeof(uint8_t)); + memcpy(mat_max, _mat_depth_max, 64 * sizeof(uint8_t)); + + /*!< Apply the RCDM on the upper map */ + merge_matrix_64(_mat_depth_min, _mat_depth_min); + + /*!< Extract the mask */ + matrix_mask(mat_mask, _mat_depth_min, 8, 8); + + /*!< Add a level only on the masked area */ + matrix_add_level_roi(mat_max, _mat_depth_max, _mat_depth_min, 1, mat_mask); + + } +} + +/** +* Generate the interval of depth predictions based on the luma samples +*/ +void kvz_lcu_luma_depth_pred(ml_intra_ctu_pred_t* ml_intra_depth_ctu, uint8_t* luma_px, int8_t qp) { + + // Compute the one-shot (OS) Quad-tree prediction (_mat_OS_pred) + os_luma_qt_pred(ml_intra_depth_ctu, luma_px, qp, ml_intra_depth_ctu->_mat_upper_depth); + + // Generate the interval of QT predictions around the first one + generate_interval_from_os_pred(ml_intra_depth_ctu, ml_intra_depth_ctu->_mat_upper_depth); + + // Apply the extra Upper Expansion pass + merge_matrix_64(ml_intra_depth_ctu->_mat_upper_depth, ml_intra_depth_ctu->_mat_upper_depth); +}
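The public entry points added by this file form a simple lifecycle: allocate the predictor once with kvz_init_ml_intra_depth_const(), run kvz_lcu_luma_depth_pred() on the 64x64 luma samples of each CTU, read the resulting 8x8 upper/lower depth maps, and release everything with kvz_end_ml_intra_depth_const(). The sketch below only illustrates that flow; ctu_luma is assumed to point to the CTU's luma samples and is not a name taken from kvazaar.

#include "ml_intra_cu_depth_pred.h"

/* Sketch only: predict the intra depth interval of one CTU and read back the
 * bounds of its top-left 8x8 unit. */
static void predict_ctu_depths(uint8_t *ctu_luma, int8_t qp)
{
  ml_intra_ctu_pred_t *pred = kvz_init_ml_intra_depth_const();

  kvz_lcu_luma_depth_pred(pred, ctu_luma, qp);

  uint8_t min_depth = pred->_mat_upper_depth[0];  /* shallowest depth to search */
  uint8_t max_depth = pred->_mat_lower_depth[0];  /* deepest depth to search */
  (void)min_depth;
  (void)max_depth;

  kvz_end_ml_intra_depth_const(pred);
}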
View file
kvazaar-2.0.0.tar.gz/src/ml_intra_cu_depth_pred.h
Added
@@ -0,0 +1,90 @@ +#ifndef ML_INTRA_CU_DEPTH_PRED_H_ +#define ML_INTRA_CU_DEPTH_PRED_H_ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include <stdio.h> +#include "global.h" // IWYU pragma: keep + + + + +#define LCU_DEPTH_MAT_SIZE 64 +#define RESTRAINED_FLAG 1 + +#define pow2(x) ((x)*(x)) +#define CR_XMAX(x_px, block_size, width) (MIN((x_px) + (block_size), (width)) - (x_px)) +#define CR_YMAX(y_px, block_size, height) (MIN((y_px) + (block_size), (height)) - (y_px)) +#define CR_GET_X_LCU(lcu_id, nb_lcu_width) (((lcu_id) % (nb_lcu_width)) << 6) +#define CR_GET_Y_LCU(lcu_id, nb_lcu_width) (((lcu_id) / (nb_lcu_width)) << 6) +#define CR_GET_CU_D3(x, y, depth) ((x)*(1 << (3-depth)) + ((y) << (6 - depth))) +#define CR_GET_CU_D4(x, y, depth) ((x)*(1 << (4-depth)) + ((y) << (8 - depth))) +#define CR_GET_DEPTH_MIN(x, y, depth_min_mat) *(depth_min_mat + (x >> 3) + ((y >> 3) << 3)) +#define CR_GET_DEPTH_MAX(x, y, depth_max_mat) *(depth_max_mat + (x >> 3) + ((y >> 3) << 3)) + +typedef struct { + int32_t x; + int32_t y; +}vect_2D; + + + // Structure used for the CTU depth prediction using Machine Learning + // in All Intra +typedef struct { + /*!< Number of depth to add to the QT prediction in ''one-shot'' */ + int8_t i_nb_addDepth; + /*!< Apply an extra Upper Expansion in the upper_depth */ + bool b_extra_up_exp; + /*!< Matrix used to store the upper and lower QT prediction*/ + uint8_t* _mat_upper_depth; + uint8_t* _mat_lower_depth; +} ml_intra_ctu_pred_t; + + + +/* + * brief generic structure used for the features + * + */ +typedef struct { + double variance; + double merge_variance; + double sub_variance_0; + double sub_variance_1; + double sub_variance_2; + double sub_variance_3; + double neigh_variance_A; + double neigh_variance_B; + double neigh_variance_C; + double var_of_sub_mean; + int qp; + //int NB_pixels; + double var_of_sub_var; +}features_s; + + +typedef int (*tree_predict)(features_s*, double*, double*); + +ml_intra_ctu_pred_t* kvz_init_ml_intra_depth_const(void); +void kvz_end_ml_intra_depth_const(ml_intra_ctu_pred_t * ml_intra_depth_ctu); + +void kvz_lcu_luma_depth_pred(ml_intra_ctu_pred_t* ml_intra_depth_ctu, uint8_t* luma_px, int8_t qp); + +#endif \ No newline at end of file
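The CR_GET_DEPTH_MIN / CR_GET_DEPTH_MAX macros declared above index the two 8x8 depth maps with pixel coordinates inside the CTU (each map entry covers an 8x8 luma area). A hedged sketch of how a depth search could consult them follows; depth_ok is a hypothetical helper, not a kvazaar function.

/* Sketch only: check whether a candidate depth lies inside the predicted
 * interval for the CU whose top-left corner is (x_local, y_local) within the CTU. */
static int depth_ok(const ml_intra_ctu_pred_t *pred, int x_local, int y_local, int depth)
{
  uint8_t d_min = CR_GET_DEPTH_MIN(x_local, y_local, pred->_mat_upper_depth);
  uint8_t d_max = CR_GET_DEPTH_MAX(x_local, y_local, pred->_mat_lower_depth);
  return depth >= d_min && depth <= d_max;
}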
View file
kvazaar-1.3.0.tar.gz/src/rate_control.c -> kvazaar-2.0.0.tar.gz/src/rate_control.c
Changed
@@ -24,11 +24,15 @@ #include "encoder.h" #include "kvazaar.h" +#include "pthread.h" static const int SMOOTHING_WINDOW = 40; static const double MIN_LAMBDA = 0.1; static const double MAX_LAMBDA = 10000; +#define BETA1 1.2517 + +static kvz_rc_data *data; /** * \brief Clip lambda value to a valid range. @@ -38,6 +42,73 @@ return CLIP(MIN_LAMBDA, MAX_LAMBDA, lambda); } +kvz_rc_data * kvz_get_rc_data(const encoder_control_t * const encoder) { + if (data != NULL || encoder == NULL) return data; + + data = calloc(1, sizeof(kvz_rc_data)); + + if (data == NULL) return NULL; + if (pthread_mutex_init(&data->ck_frame_lock, NULL) != 0) return NULL; + if (pthread_mutex_init(&data->lambda_lock, NULL) != 0) return NULL; + if (pthread_mutex_init(&data->intra_lock, NULL) != 0) return NULL; + for (int (i) = 0; (i) < KVZ_MAX_GOP_LAYERS; ++(i)) { + if (pthread_rwlock_init(&data->ck_ctu_lock[i], NULL) != 0) return NULL; + } + + const int num_lcus = encoder->in.width_in_lcu * encoder->in.height_in_lcu; + + for (int i = 0; i < KVZ_MAX_GOP_LAYERS; i++) { + data->c_para[i] = malloc(sizeof(double) * num_lcus); + if (data->c_para[i] == NULL) return NULL; + + data->k_para[i] = malloc(sizeof(double) * num_lcus); + if (data->k_para[i] == NULL) return NULL; + + data->pic_c_para[i] = 5.0; + data->pic_k_para[i] = -0.1; + + for (int j = 0; j < num_lcus; j++) { + data->c_para[i][j] = 5.0; + data->k_para[i][j] = -0.1; + } + } + data->intra_bpp = calloc(num_lcus, sizeof(double)); + if (data->intra_bpp == NULL) return NULL; + data->intra_dis = calloc(num_lcus, sizeof(double)); + if (data->intra_dis == NULL) return NULL; + + memset(data->previous_lambdas, 0, sizeof(data->previous_lambdas)); + + data->previous_frame_lambda = 0.0; + + data->intra_pic_bpp = 0.0; + data->intra_pic_distortion = 0.0; + + data->intra_alpha = 6.7542000000000000; + data->intra_beta = 1.7860000000000000; + return data; +} + +void kvz_free_rc_data() { + if (data == NULL) return; + + pthread_mutex_destroy(&data->ck_frame_lock); + pthread_mutex_destroy(&data->lambda_lock); + pthread_mutex_destroy(&data->intra_lock); + for (int i = 0; i < KVZ_MAX_GOP_LAYERS; ++i) { + pthread_rwlock_destroy(&data->ck_ctu_lock[i]); + } + + if (data->intra_bpp) FREE_POINTER(data->intra_bpp); + if (data->intra_dis) FREE_POINTER(data->intra_dis); + for (int i = 0; i < KVZ_MAX_GOP_LAYERS; i++) { + if (data->c_para[i]) FREE_POINTER(data->c_para[i]); + if (data->k_para[i]) FREE_POINTER(data->k_para[i]); + } + FREE_POINTER(data); +} + + /** * \brief Update alpha and beta parameters. 
* @@ -95,6 +166,96 @@ return MAX(200, gop_target_bits); } +static int xCalcHADs8x8_ISlice(kvz_pixel * piOrg, int y, int iStrideOrg) +{ + piOrg += y * iStrideOrg; + int i, j; + int diff[64], m1[8][8], m2[8][8], m3[8][8], iSumHad = 0; + + for (int k = 0; k < 64; k += 8) { + diff[k + 0] = piOrg[0]; + diff[k + 1] = piOrg[1]; + diff[k + 2] = piOrg[2]; + diff[k + 3] = piOrg[3]; + diff[k + 4] = piOrg[4]; + diff[k + 5] = piOrg[5]; + diff[k + 6] = piOrg[6]; + diff[k + 7] = piOrg[7]; + + piOrg += iStrideOrg; + } + + //horizontal + for (j = 0; j < 8; j++) { + int jj = j << 3; + m2[j][0] = diff[jj] + diff[jj + 4]; + m2[j][1] = diff[jj + 1] + diff[jj + 5]; + m2[j][2] = diff[jj + 2] + diff[jj + 6]; + m2[j][3] = diff[jj + 3] + diff[jj + 7]; + m2[j][4] = diff[jj] - diff[jj + 4]; + m2[j][5] = diff[jj + 1] - diff[jj + 5]; + m2[j][6] = diff[jj + 2] - diff[jj + 6]; + m2[j][7] = diff[jj + 3] - diff[jj + 7]; + + m1[j][0] = m2[j][0] + m2[j][2]; + m1[j][1] = m2[j][1] + m2[j][3]; + m1[j][2] = m2[j][0] - m2[j][2]; + m1[j][3] = m2[j][1] - m2[j][3]; + m1[j][4] = m2[j][4] + m2[j][6]; + m1[j][5] = m2[j][5] + m2[j][7]; + m1[j][6] = m2[j][4] - m2[j][6]; + m1[j][7] = m2[j][5] - m2[j][7]; + + m2[j][0] = m1[j][0] + m1[j][1]; + m2[j][1] = m1[j][0] - m1[j][1]; + m2[j][2] = m1[j][2] + m1[j][3]; + m2[j][3] = m1[j][2] - m1[j][3]; + m2[j][4] = m1[j][4] + m1[j][5]; + m2[j][5] = m1[j][4] - m1[j][5]; + m2[j][6] = m1[j][6] + m1[j][7]; + m2[j][7] = m1[j][6] - m1[j][7]; + } + + //vertical + for (i = 0; i < 8; i++) { + m3[0][i] = m2[0][i] + m2[4][i]; + m3[1][i] = m2[1][i] + m2[5][i]; + m3[2][i] = m2[2][i] + m2[6][i]; + m3[3][i] = m2[3][i] + m2[7][i]; + m3[4][i] = m2[0][i] - m2[4][i]; + m3[5][i] = m2[1][i] - m2[5][i]; + m3[6][i] = m2[2][i] - m2[6][i]; + m3[7][i] = m2[3][i] - m2[7][i]; + + m1[0][i] = m3[0][i] + m3[2][i]; + m1[1][i] = m3[1][i] + m3[3][i]; + m1[2][i] = m3[0][i] - m3[2][i]; + m1[3][i] = m3[1][i] - m3[3][i]; + m1[4][i] = m3[4][i] + m3[6][i]; + m1[5][i] = m3[5][i] + m3[7][i]; + m1[6][i] = m3[4][i] - m3[6][i]; + m1[7][i] = m3[5][i] - m3[7][i]; + + m2[0][i] = m1[0][i] + m1[1][i]; + m2[1][i] = m1[0][i] - m1[1][i]; + m2[2][i] = m1[2][i] + m1[3][i]; + m2[3][i] = m1[2][i] - m1[3][i]; + m2[4][i] = m1[4][i] + m1[5][i]; + m2[5][i] = m1[4][i] - m1[5][i]; + m2[6][i] = m1[6][i] + m1[7][i]; + m2[7][i] = m1[6][i] - m1[7][i]; + } + + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + iSumHad += abs(m2[i][j]); + } + } + iSumHad -= abs(m2[0][0]); + iSumHad = (iSumHad + 2) >> 2; + return(iSumHad); +} + /** * Estimate number of bits used for headers of the current picture. 
* \param state the main encoder state @@ -155,6 +316,29 @@ state->previous_encoder_state->frame->cur_gop_target_bits; } + if (state->frame->is_irap && encoder->cfg.intra_bit_allocation) { + int total_cost = 0; + for (int y = 0; y < encoder->cfg.height; y += 8) { + for (int x = 0; x < encoder->cfg.width; x += 8) { + int cost = xCalcHADs8x8_ISlice(state->tile->frame->source->y + x, y, state->tile->frame->source->stride); + total_cost += cost; + kvz_get_lcu_stats(state, x / 64, y / 64)->i_cost += cost; + } + } + state->frame->icost = total_cost; + state->frame->remaining_weight = total_cost; + + double bits = state->frame->cur_gop_target_bits / MAX(encoder->cfg.gop_len, 1); + double alpha, beta = 0.5582; + if (bits * 40 < encoder->cfg.width * encoder->cfg.height) { + alpha = 0.25; + } + else { + alpha = 0.3; + } + return MAX(100, alpha*pow(state->frame->icost * 4 / bits, beta)*bits); + } + if (encoder->cfg.gop_len <= 0) { return state->frame->cur_gop_target_bits; } @@ -173,39 +357,574 @@ return CLIP_TO_QP(qp); } -static double qp_to_lamba(encoder_state_t * const state, int qp) +static double solve_cubic_equation(const encoder_state_config_frame_t * const state, + int ctu_index, + int last_ctu, + double est_lambda, + double target_bits) { - const encoder_control_t * const ctrl = state->encoder_control; - const int gop_len = ctrl->cfg.gop_len; - const int period = gop_len > 0 ? gop_len : ctrl->cfg.intra_period; + double best_lambda = 0.0; + double para_a = 0.0; + double para_b = 0.0; + double para_c = 0.0; + double para_d = 0.0; + double delta = 0.0; + double para_aa = 0.0; + double para_bb = 0.0; + double para_cc = 0.0; + for (int i = ctu_index; i < last_ctu; i++) + { + double a = 0.0; + double b = 0.0; + double c = 0.0; + double d = 0.0; + assert(!((state->c_para[i] <= 0) || (state->k_para[i] >= 0))); //Check C and K during each solution - kvz_gop_config const * const gop = &ctrl->cfg.gop[state->frame->gop_offset]; + double CLCU = state->c_para[i]; + double KLCU = state->k_para[i]; + a = -CLCU * KLCU / pow(state->lcu_stats[i].pixels, KLCU - 1.0); + b = -1.0 / (KLCU - 1.0); + d = est_lambda; + c = pow(a / d, b); + para_a = para_a - c * pow(b, 3.0) / 6.0; + para_b = para_b + (pow(b, 2.0) / 2.0 + pow(b, 3.0)*log(d) / 2.0)*c; + para_c = para_c - (pow(b, 3.0) / 2.0*pow(log(d), 2.0) + pow(b, 2.0)*log(d) + b)*c; + para_d = para_d + c * (1 + b * log(d) + pow(b, 2.0) / 2 * pow(log(d), 2.0) + pow(b, 3.0) / 6 * pow(log(d), 3.0)); + } - double lambda = pow(2.0, (qp - 12) / 3.0); + para_d = para_d - target_bits; + para_aa = para_b * para_b - 3 * para_a*para_c; + para_bb = para_b * para_c - 9 * para_a*para_d; + para_cc = para_c * para_c - 3 * para_b*para_d; - if (state->frame->slicetype == KVZ_SLICE_I) { - lambda *= 0.57; + delta = para_bb * para_bb - 4 * para_aa*para_cc; - // Reduce lambda for I-frames according to the number of references. 
- if (period == 0) { - lambda *= 0.5; - } else { - lambda *= 1.0 - CLIP(0.0, 0.5, 0.05 * (period - 1)); + if (delta > 0.0) //Check whether delta is right + { + double temp_x = 0.0; + double part1 = 0.0; + double part2 = 0.0; + double flag1 = 0.0; + double flag2 = 0.0; + part1 = para_aa * para_b + 3 * para_a*(-para_bb - pow(delta, 0.5)) / 2.0; + part2 = para_aa * para_b + 3 * para_a*(-para_bb + pow(delta, 0.5)) / 2.0; + if (part1 < 0.0) { + part1 = -part1; + flag1 = -1.0; } - } else if (gop_len > 0) { - lambda *= gop->qp_factor; - } else { - lambda *= 0.4624; + else { + flag1 = 1.0; + } + if (part2 < 0.0) { + part2 = -part2; + flag2 = -1.0; + } + else { + flag2 = 1.0; + } + temp_x = (-para_b - flag1 * pow(part1, 1.0 / 3.0) - flag2 * pow(part2, 1.0 / 3.0)) / 3 / para_a; + best_lambda = exp(temp_x); + } + else { + best_lambda = est_lambda; //Use the original picture estimated lambda for the current CTU + } + best_lambda = CLIP(0.001, 100000000.0, best_lambda); + + return best_lambda; +} + +static INLINE double calculate_weights(encoder_state_t* const state, const int ctu_count, double est_lambda) { + double total_weight = 0; + for(int i = 0; i < ctu_count; i++) { + double c_lcu = state->frame->c_para[i]; + double k_lcu = state->frame->k_para[i]; + double a = -c_lcu * k_lcu / pow(state->frame->lcu_stats[i].pixels, k_lcu - 1.0); + double b = -1.0 / (k_lcu - 1.0); + state->frame->lcu_stats[i].original_weight = state->frame->lcu_stats[i].weight = pow(a / est_lambda, b); + if (state->frame->lcu_stats[i].weight < 0.01) { + state->frame->lcu_stats[i].weight = 0.01; + } + total_weight += state->frame->lcu_stats[i].weight; + } + return total_weight; +} + + +void kvz_estimate_pic_lambda(encoder_state_t * const state) { + const encoder_control_t * const encoder = state->encoder_control; + + const int layer = encoder->cfg.gop[state->frame->gop_offset].layer - (state->frame->is_irap ? 1 : 0); + const int ctu_count = state->tile->frame->height_in_lcu * state->tile->frame->width_in_lcu; + + double alpha; + double beta; + if(state->frame->is_irap && encoder->cfg.intra_bit_allocation) { + pthread_mutex_lock(&state->frame->new_ratecontrol->intra_lock); + alpha = state->frame->new_ratecontrol->intra_alpha; + beta = state->frame->new_ratecontrol->intra_beta; + pthread_mutex_unlock(&state->frame->new_ratecontrol->intra_lock); + } + else if(state->frame->poc == 0) { + alpha = state->frame->rc_alpha; + beta = state->frame->rc_beta; + } + else { + pthread_mutex_lock(&state->frame->new_ratecontrol->ck_frame_lock); + alpha = -state->frame->new_ratecontrol->pic_c_para[layer] * + state->frame->new_ratecontrol->pic_k_para[layer]; + beta = state->frame->new_ratecontrol->pic_k_para[layer] - 1; + pthread_mutex_unlock(&state->frame->new_ratecontrol->ck_frame_lock); } + double bits = pic_allocate_bits(state); + state->frame->cur_pic_target_bits = bits; - // Increase lambda if not key-frame. 
- if (period > 0 && state->frame->poc % period != 0) { - lambda *= CLIP(2.0, 4.0, (state->frame->QP - 12) / 6.0); + double est_lambda; + int32_t num_pixels = state->encoder_control->cfg.width * state->encoder_control->cfg.height; + double bpp = bits / num_pixels; + if (state->frame->is_irap) { + if(encoder->cfg.intra_bit_allocation) { + state->frame->i_bits_left = bits; + double temp = pow(state->frame->icost / num_pixels, BETA1); + est_lambda = alpha / 256 * pow(temp/bpp, beta); + } + else { + // arbitrary reduction to the lambda for intra frames + est_lambda = alpha * pow(bpp, beta) * 0.5; + } + } + else { + est_lambda = alpha * pow(bpp, beta); } + double temp_lambda; + pthread_mutex_lock(&state->frame->new_ratecontrol->lambda_lock); + if ((temp_lambda = state->frame->new_ratecontrol->previous_lambdas[layer]) > 0.0) { + temp_lambda = CLIP(0.1, 10000.0, temp_lambda); + est_lambda = CLIP(temp_lambda * pow(2.0, -1), temp_lambda * 2, est_lambda); + } + + if((temp_lambda = state->frame->new_ratecontrol->previous_frame_lambda) > 0.0) { + temp_lambda = CLIP(0.1, 2000.0, temp_lambda); + est_lambda = CLIP(temp_lambda * pow(2.0, -10.0 / 3.0), temp_lambda * pow(2.0, 10.0 / 3.0), est_lambda); + } + pthread_mutex_unlock(&state->frame->new_ratecontrol->lambda_lock); + + est_lambda = CLIP(0.1, 10000.0, est_lambda); + + double total_weight = 0; + + if(!state->frame->is_irap) { + double best_lambda = est_lambda; + if(!state->encoder_control->cfg.frame_allocation) { + pthread_rwlock_rdlock(&state->frame->new_ratecontrol->ck_ctu_lock[layer]); + memcpy(state->frame->c_para, state->frame->new_ratecontrol->c_para[layer], ctu_count * sizeof(double)); + memcpy(state->frame->k_para, state->frame->new_ratecontrol->k_para[layer], ctu_count * sizeof(double)); + pthread_rwlock_unlock(&state->frame->new_ratecontrol->ck_ctu_lock[layer]); + temp_lambda = est_lambda; + double taylor_e3; + int iteration_number = 0; + do { + taylor_e3 = 0.0; + best_lambda = temp_lambda = solve_cubic_equation(state->frame, 0, ctu_count, temp_lambda, bits); + for (int i = 0; i < ctu_count; ++i) { + double CLCU = state->frame->c_para[i]; + double KLCU = state->frame->k_para[i]; + double a = -CLCU * KLCU / pow(state->frame->lcu_stats[i].pixels, KLCU - 1.0); + double b = -1.0 / (KLCU - 1.0); + taylor_e3 += pow(a / best_lambda, b); + } + iteration_number++; + } + while (fabs(taylor_e3 - bits) > 0.01 && iteration_number <= 11); + } + total_weight = calculate_weights(state, ctu_count, best_lambda); + state->frame->remaining_weight = bits; + } + else { + for (int i = 0; i < ctu_count; ++i) { + state->frame->lcu_stats[i].weight = MAX(0.01, + state->frame->lcu_stats[i].pixels * pow(est_lambda / alpha, + 1.0 / beta)); + total_weight += state->frame->lcu_stats[i].weight; + } + } + + for(int i = 0; i < ctu_count; ++i) { + state->frame->lcu_stats[i].weight = bits * state->frame->lcu_stats[i].weight / total_weight; + } + + state->frame->lambda = est_lambda; + state->frame->QP = lambda_to_qp(est_lambda); +} + + +static double get_ctu_bits(encoder_state_t * const state, vector2d_t pos) { + int avg_bits; + const encoder_control_t * const encoder = state->encoder_control; + + int num_ctu = state->encoder_control->in.width_in_lcu * state->encoder_control->in.height_in_lcu; + const int index = pos.x + pos.y * state->tile->frame->width_in_lcu; + + if (state->frame->is_irap) { + if(encoder->cfg.intra_bit_allocation) { + int cus_left = num_ctu - index + 1; + int window = MIN(4, cus_left); + double mad = kvz_get_lcu_stats(state, pos.x, pos.y)->i_cost; + + 
pthread_mutex_lock(&state->frame->rc_lock); + double bits_left = state->frame->cur_pic_target_bits - state->frame->cur_frame_bits_coded; + double weighted_bits_left = (bits_left * window + (bits_left - state->frame->i_bits_left)*cus_left) / window; + avg_bits = mad * weighted_bits_left / state->frame->remaining_weight; + state->frame->remaining_weight -= mad; + state->frame->i_bits_left -= state->frame->cur_pic_target_bits * mad / state->frame->icost; + pthread_mutex_unlock(&state->frame->rc_lock); + } + else { + avg_bits = state->frame->cur_pic_target_bits * ((double)state->frame->lcu_stats[index].pixels / + (state->encoder_control->in.height * state->encoder_control->in.width)); + } + } + else { + double total_weight = 0; + // In case wpp is used only the ctus of the current frame are safe to use + const int used_ctu_count = MIN(4, (encoder->cfg.wpp ? (pos.y + 1) * encoder->in.width_in_lcu : num_ctu) - index); + int target_bits = 0; + double best_lambda = 0.0; + double temp_lambda = state->frame->lambda; + double taylor_e3 = 0.0; + int iter = 0; + + int last_ctu = index + used_ctu_count; + for (int i = index; i < last_ctu; i++) { + target_bits += state->frame->lcu_stats[i].weight; + } + + pthread_mutex_lock(&state->frame->rc_lock); + total_weight = state->frame->remaining_weight; + target_bits = MAX(target_bits + state->frame->cur_pic_target_bits - state->frame->cur_frame_bits_coded - (int)total_weight, 10); + pthread_mutex_unlock(&state->frame->rc_lock); + + //just similar with the process at frame level, details can refer to the function kvz_estimate_pic_lambda + do { + taylor_e3 = 0.0; + best_lambda = solve_cubic_equation(state->frame, index, last_ctu, temp_lambda, target_bits); + temp_lambda = best_lambda; + for (int i = index; i < last_ctu; i++) { + double CLCU = state->frame->c_para[i]; + double KLCU = state->frame->k_para[i]; + double a = -CLCU * KLCU / pow((double)state->frame->lcu_stats[i].pixels, KLCU - 1.0); + double b = -1.0 / (KLCU - 1.0); + taylor_e3 += pow(a / best_lambda, b); + } + iter++; + } while (fabs(taylor_e3 - target_bits) > 0.01 && iter < 5); + + double c_ctu = state->frame->c_para[index]; + double k_ctu = state->frame->k_para[index]; + double a = -c_ctu * k_ctu / pow(((double)state->frame->lcu_stats[index].pixels), k_ctu - 1.0); + double b = -1.0 / (k_ctu - 1.0); + + state->frame->lcu_stats[index].weight = MAX(pow(a / best_lambda, b), 0.01); + + avg_bits = (int)(state->frame->lcu_stats[index].weight + 0.5); + } + + if (avg_bits < 1) { + avg_bits = 1; + } + + return avg_bits; +} + +static double qp_to_lambda(encoder_state_t* const state, int qp) +{ + const int shift_qp = 12; + double lambda = 0.57 * pow(2.0, (qp - shift_qp) / 3.0); + + // NOTE: HM adjusts lambda for inter according to Hadamard usage in ME. + // SATD is currently always enabled for ME, so this has no effect. 
+ // bool hadamard_me = true; + // if (!hadamard_me && state->frame->slicetype != KVZ_SLICE_I) { + // lambda *= 0.95; + // } + return lambda; } + void kvz_set_ctu_qp_lambda(encoder_state_t * const state, vector2d_t pos) { + double bits = get_ctu_bits(state, pos); + + const encoder_control_t * const encoder = state->encoder_control; + const int frame_allocation = state->encoder_control->cfg.frame_allocation; + + int index = pos.x + pos.y * state->encoder_control->in.width_in_lcu; + lcu_stats_t* ctu = &state->frame->lcu_stats[index]; + double bpp = bits / ctu->pixels; + + double alpha; + double beta; + if (state->frame->is_irap && encoder->cfg.intra_bit_allocation) { + pthread_mutex_lock(&state->frame->new_ratecontrol->intra_lock); + alpha = state->frame->new_ratecontrol->intra_alpha; + beta = state->frame->new_ratecontrol->intra_beta; + pthread_mutex_unlock(&state->frame->new_ratecontrol->intra_lock); + } + else if(state->frame->num == 0) { + alpha = state->frame->rc_alpha; + beta = state->frame->rc_beta; + } + else { + alpha = -state->frame->c_para[index] * state->frame->k_para[index]; + beta = state->frame->k_para[index] - 1; + } + + double est_lambda; + int est_qp; + if (state->frame->is_irap && encoder->cfg.intra_bit_allocation) { + double cost_per_pixel = (double)ctu->i_cost / ctu->pixels; + cost_per_pixel = pow(cost_per_pixel, BETA1); + est_lambda = alpha / 256.0 * pow(cost_per_pixel / bpp, beta); + est_qp = state->frame->QP; + double max_lambda = exp(((double)est_qp + 2.49 - 13.7122) / 4.2005); + double min_lambda = exp(((double)est_qp - 2.49 - 13.7122) / 4.2005); + est_lambda = CLIP(min_lambda, max_lambda, est_lambda); + + est_qp = lambda_to_qp(est_lambda); + } + else { + // In case wpp is used the previous ctus may not be ready from above rows + const int ctu_limit = encoder->cfg.wpp ? pos.y * encoder->in.width_in_lcu : 0; + + est_lambda = alpha * pow(bpp, beta) * (state->frame->is_irap ? 
0.5 : 1); + const double clip_lambda = state->frame->lambda; + + double clip_neighbor_lambda = -1; + int clip_qp = -1; + if (encoder->cfg.clip_neighbour || state->frame->num == 0) { + for (int temp_index = index - 1; temp_index >= ctu_limit; --temp_index) { + if (state->frame->lcu_stats[temp_index].lambda > 0) { + clip_neighbor_lambda = state->frame->lcu_stats[temp_index].lambda; + break; + } + } + for (int temp_index = index - 1; temp_index >= ctu_limit; --temp_index) { + if (state->frame->lcu_stats[temp_index].qp > -1) { + clip_qp = state->frame->lcu_stats[temp_index].qp; + break; + } + } + } + else { + + if (state->frame->lcu_stats[index].lambda > 0) { + clip_neighbor_lambda = state->frame->previous_layer_state->frame->lcu_stats[index].lambda; + } + if (state->frame->lcu_stats[index].qp > 0) { + clip_qp = state->frame->previous_layer_state->frame->lcu_stats[index].qp; + } + } + + + if (clip_neighbor_lambda > 0) { + est_lambda = CLIP(clip_neighbor_lambda * pow(2, -(1.0 + frame_allocation) / 3.0), + clip_neighbor_lambda * pow(2.0, (1.0 + frame_allocation) / 3.0), + est_lambda); + } + + if (clip_lambda > 0) { + est_lambda = CLIP(clip_lambda * pow(2, -(2.0 + frame_allocation) / 3.0), + clip_lambda * pow(2.0, (1.0 + frame_allocation) / 3.0), + est_lambda); + } + else { + est_lambda = CLIP(10.0, 1000.0, est_lambda); + } + + if (est_lambda < 0.1) { + est_lambda = 0.1; + } + + est_qp = lambda_to_qp(est_lambda); + + if( clip_qp > -1) { + est_qp = CLIP(clip_qp - 1 - frame_allocation, + clip_qp + 1 + frame_allocation, + est_qp); + } + + est_qp = CLIP(state->frame->QP - 2 - frame_allocation, + state->frame->QP + 2 + frame_allocation, + est_qp); + } + + state->lambda = est_lambda; + state->lambda_sqrt = sqrt(est_lambda); + state->qp = est_qp; + ctu->qp = est_qp; + ctu->lambda = est_lambda; + ctu->i_cost = 0; + + // Apply variance adaptive quantization + if (encoder->cfg.vaq) { + vector2d_t lcu = { + pos.x + state->tile->lcu_offset_x, + pos.y + state->tile->lcu_offset_y + }; + int id = lcu.x + lcu.y * state->tile->frame->width_in_lcu; + int aq_offset = round(state->frame->aq_offsets[id]); + state->qp += aq_offset; + // Maximum delta QP is clipped between [-26, 25] according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics + // Since this value will be later combined with qp_pred, clip to half of that instead to be safe + state->qp = CLIP(state->frame->QP - 13, state->frame->QP + 12, state->qp); + state->qp = CLIP_TO_QP(state->qp); + state->lambda = qp_to_lambda(state, state->qp); + state->lambda_sqrt = sqrt(state->lambda); + + //ctu->qp = state->qp; + //ctu->lambda = state->lambda; + } +} + + +static void update_pic_ck(encoder_state_t * const state, double bpp, double distortion, double lambda, int layer) { + double new_k = 0, new_c; + if(state->frame->num == 1) { + new_k = log(distortion / state->frame->new_ratecontrol->intra_pic_distortion) / + log(bpp / state->frame->new_ratecontrol->intra_pic_bpp); + new_c = distortion / pow(bpp, new_k); + } + new_k = -bpp * lambda / distortion; + new_c = distortion / pow(bpp, new_k); + + new_c = CLIP(+.1, 100.0, new_c); + new_k = CLIP(-3.0, -0.001, new_k); + + if(state->frame->is_irap || state->frame->num <= (4 - state->encoder_control->cfg.frame_allocation)) { + for(int i = 1; i < 5; i++) { + state->frame->new_ratecontrol->pic_c_para[i] = new_c; + state->frame->new_ratecontrol->pic_k_para[i] = new_k; + } + } + else { + state->frame->new_ratecontrol->pic_c_para[layer] = new_c; + state->frame->new_ratecontrol->pic_k_para[layer] = new_k; 
+ } +} + + +static void update_ck(encoder_state_t * const state, int ctu_index, int layer) +{ + double bpp = (double)state->frame->lcu_stats[ctu_index].bits / state->frame->lcu_stats[ctu_index].pixels; + double distortion = state->frame->lcu_stats[ctu_index].distortion; + double lambda = state->frame->lcu_stats[ctu_index].lambda; + + double new_k = 0, new_c = -1; + if (!state->frame->lcu_stats[ctu_index].skipped) { + distortion = MAX(distortion, 0.0001); + + bpp = CLIP(0.0001, 10.0, bpp); + new_k = -bpp * lambda / distortion; + new_k = CLIP(-3.0, -0.001, new_k); + new_c = distortion / pow(bpp, new_k); + + new_c = CLIP(+.1, 100.0, new_c); + + if (state->frame->is_irap || state->frame->num <= (4 - state->encoder_control->cfg.frame_allocation)) { + for (int i = 1; i < 5; i++) { + state->frame->new_ratecontrol->c_para[i][ctu_index] = new_c; + state->frame->new_ratecontrol->k_para[i][ctu_index] = new_k; + } + } + else { + state->frame->new_ratecontrol->c_para[layer][ctu_index] = new_c; + state->frame->new_ratecontrol->k_para[layer][ctu_index] = new_k; + } + } +} + + +void kvz_update_after_picture(encoder_state_t * const state) { + double total_distortion = 0; + double lambda = 0; + int32_t pixels = (state->encoder_control->in.width * state->encoder_control->in.height); + double pic_bpp = (double)state->frame->cur_frame_bits_coded / pixels; + + const encoder_control_t * const encoder = state->encoder_control; + const int layer = encoder->cfg.gop[state->frame->gop_offset].layer - (state->frame->is_irap ? 1 : 0); + + if (state->frame->is_irap && encoder->cfg.intra_bit_allocation) { + double lnbpp = log(pow(state->frame->icost / pixels, BETA1)); + pthread_mutex_lock(&state->frame->new_ratecontrol->intra_lock); + double diff_lambda = state->frame->new_ratecontrol->intra_beta * log(state->frame->cur_frame_bits_coded) - log(state->frame->cur_pic_target_bits); + + diff_lambda = CLIP(-0.125, 0.125, 0.25*diff_lambda); + + state->frame->new_ratecontrol->intra_alpha *= exp(diff_lambda); + state->frame->new_ratecontrol->intra_beta += diff_lambda / lnbpp; + pthread_mutex_unlock(&state->frame->new_ratecontrol->intra_lock); + } + + for(int y_ctu = 0; y_ctu < state->encoder_control->in.height_in_lcu; y_ctu++) { + for (int x_ctu = 0; x_ctu < state->encoder_control->in.width_in_lcu; x_ctu++) { + int ctu_distortion = 0; + lcu_stats_t *ctu = kvz_get_lcu_stats(state, x_ctu, y_ctu); + for (int y = y_ctu * 64; y < MIN((y_ctu + 1) * 64, state->tile->frame->height); y++) { + for (int x = x_ctu * 64; x < MIN((x_ctu + 1) * 64, state->tile->frame->width); x++) { + int temp = (int)state->tile->frame->source->y[x + y * state->encoder_control->in.width] - + state->tile->frame->rec->y[x + y * state->encoder_control->in.width]; + ctu_distortion += temp * temp; + } + } + ctu->distortion = (double)ctu_distortion / ctu->pixels; + total_distortion += (double)ctu_distortion / ctu->pixels; + lambda += ctu->lambda / (state->encoder_control->in.width_in_lcu * state->encoder_control->in.height_in_lcu); + } + } + + total_distortion /= (state->encoder_control->in.height_in_lcu * state->encoder_control->in.width_in_lcu); + if (state->frame->is_irap) { + pthread_mutex_lock(&state->frame->new_ratecontrol->intra_lock); + for (int y_ctu = 0; y_ctu < state->encoder_control->in.height_in_lcu; y_ctu++) { + for (int x_ctu = 0; x_ctu < state->encoder_control->in.width_in_lcu; x_ctu++) { + lcu_stats_t *ctu = kvz_get_lcu_stats(state, x_ctu, y_ctu); + state->frame->new_ratecontrol->intra_dis[x_ctu + y_ctu * state->encoder_control->in.width_in_lcu] = + 
ctu->distortion; + state->frame->new_ratecontrol->intra_bpp[x_ctu + y_ctu * state->encoder_control->in.width_in_lcu] = + ctu->bits / ctu->pixels; + } + } + state->frame->new_ratecontrol->intra_pic_distortion = total_distortion; + state->frame->new_ratecontrol->intra_pic_bpp = pic_bpp; + pthread_mutex_unlock(&state->frame->new_ratecontrol->intra_lock); + } + + pthread_mutex_lock(&state->frame->new_ratecontrol->lambda_lock); + state->frame->new_ratecontrol->previous_frame_lambda = lambda; + state->frame->new_ratecontrol->previous_lambdas[layer] = lambda; + pthread_mutex_unlock(&state->frame->new_ratecontrol->lambda_lock); + + update_pic_ck(state, pic_bpp, total_distortion, lambda, layer); + if (state->frame->num <= 4 || state->frame->is_irap){ + for (int i = 1; i < 5; ++i) { + pthread_rwlock_wrlock(&state->frame->new_ratecontrol->ck_ctu_lock[i]); + } + } + else{ + pthread_rwlock_wrlock(&state->frame->new_ratecontrol->ck_ctu_lock[layer]); + } + for(int i = 0; i < state->encoder_control->in.width_in_lcu * state->encoder_control->in.height_in_lcu; i++) { + update_ck(state, i, layer); + } + if (state->frame->num <= 4 || state->frame->is_irap){ + for (int i = 1; i < 5; ++i) { + pthread_rwlock_unlock(&state->frame->new_ratecontrol->ck_ctu_lock[i]); + } + } + else{ + pthread_rwlock_unlock(&state->frame->new_ratecontrol->ck_ctu_lock[layer]); + } +} + /** * \brief Allocate bits and set lambda and QP for the current picture. * \param state the main encoder state @@ -241,12 +960,17 @@ const int gop_len = ctrl->cfg.gop_len; if (gop_len > 0 && state->frame->slicetype != KVZ_SLICE_I) { - state->frame->QP = CLIP_TO_QP(ctrl->cfg.qp + gop->qp_offset); - } else { - state->frame->QP = ctrl->cfg.qp; + double qp = ctrl->cfg.qp; + qp += gop->qp_offset; + qp += CLIP(0.0, 3.0, qp * gop->qp_model_scale + gop->qp_model_offset); + state->frame->QP = CLIP_TO_QP((int)(qp + 0.5)); + + } + else { + state->frame->QP = CLIP_TO_QP(ctrl->cfg.qp + ctrl->cfg.intra_qp_offset); } - state->frame->lambda = qp_to_lamba(state, state->frame->QP); + state->frame->lambda = qp_to_lambda(state, state->frame->QP); } } @@ -292,10 +1016,11 @@ int roi_index = roi.x + roi.y * ctrl->cfg.roi.width; int dqp = ctrl->cfg.roi.dqps[roi_index]; state->qp = CLIP_TO_QP(state->frame->QP + dqp); - state->lambda = qp_to_lamba(state, state->qp); + state->lambda = qp_to_lambda(state, state->qp); state->lambda_sqrt = sqrt(state->lambda); - } else if (ctrl->cfg.target_bitrate > 0) { + } + else if (ctrl->cfg.target_bitrate > 0) { lcu_stats_t *lcu = kvz_get_lcu_stats(state, pos.x, pos.y); const uint32_t pixels = MIN(LCU_WIDTH, state->tile->frame->width - LCU_WIDTH * pos.x) * MIN(LCU_WIDTH, state->tile->frame->height - LCU_WIDTH * pos.y); @@ -339,4 +1064,21 @@ state->lambda = state->frame->lambda; state->lambda_sqrt = sqrt(state->frame->lambda); } + + // Apply variance adaptive quantization + if (ctrl->cfg.vaq) { + vector2d_t lcu = { + pos.x + state->tile->lcu_offset_x, + pos.y + state->tile->lcu_offset_y + }; + int id = lcu.x + lcu.y * state->tile->frame->width_in_lcu; + int aq_offset = round(state->frame->aq_offsets[id]); + state->qp += aq_offset; + // Maximum delta QP is clipped between [-26, 25] according to ITU T-REC-H.265 specification chapter 7.4.9.10 Transform unit semantics + // Since this value will be later combined with qp_pred, clip to half of that instead to be safe + state->qp = CLIP(state->frame->QP - 13, state->frame->QP + 12, state->qp); + state->qp = CLIP_TO_QP(state->qp); + state->lambda = qp_to_lambda(state, state->qp); + state->lambda_sqrt = 
sqrt(state->lambda); + } }
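For reference, this file mixes two lambda models: the fixed QP mapping in qp_to_lambda() (lambda = 0.57 * 2^((QP - 12) / 3)) and the R-lambda model lambda = alpha * bpp^beta, where kvz_estimate_pic_lambda() derives alpha = -C*K and beta = K - 1 from the per-layer C/K parameters kept in kvz_rc_data (initialised to C = 5.0, K = -0.1 in kvz_get_rc_data()). Below is a small standalone sketch of both, using those initial values and an assumed bits-per-pixel budget; the numbers are purely illustrative, not encoder output.

#include <math.h>
#include <stdio.h>

int main(void)
{
  /* Fixed QP -> lambda mapping (qp_to_lambda above). */
  const int qp = 27;
  double lambda_fixed = 0.57 * pow(2.0, (qp - 12) / 3.0);  /* 0.57 * 32 = 18.24 */

  /* R-lambda model used by the OBA rate control. C and K are the initial
   * per-layer values from kvz_get_rc_data(); bpp is an assumed budget. */
  const double C = 5.0, K = -0.1;
  const double alpha = -C * K;      /* 0.5  */
  const double beta  = K - 1.0;     /* -1.1 */
  const double bpp   = 0.08;
  double lambda_rc = alpha * pow(bpp, beta);

  printf("fixed-QP lambda(QP=%d)    = %.2f\n", qp, lambda_fixed);
  printf("R-lambda lambda(bpp=%.2f) = %.2f\n", bpp, lambda_rc);
  return 0;
}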
View file
kvazaar-1.3.0.tar.gz/src/rate_control.h -> kvazaar-2.0.0.tar.gz/src/rate_control.h
Changed
@@ -29,10 +29,39 @@ #include "global.h" // IWYU pragma: keep #include "encoderstate.h" +#include "pthread.h" + +typedef struct kvz_rc_data { + double *c_para[KVZ_MAX_GOP_LAYERS]; + double *k_para[KVZ_MAX_GOP_LAYERS]; + double pic_c_para[KVZ_MAX_GOP_LAYERS]; + double pic_k_para[KVZ_MAX_GOP_LAYERS]; + double previous_lambdas[KVZ_MAX_GOP_LAYERS + 1]; + double previous_frame_lambda; + double *intra_bpp; + double *intra_dis; + double intra_pic_distortion; + double intra_pic_bpp; + + double intra_alpha; + double intra_beta; + + pthread_rwlock_t ck_ctu_lock[KVZ_MAX_GOP_LAYERS]; + pthread_mutex_t ck_frame_lock; + pthread_mutex_t lambda_lock; + pthread_mutex_t intra_lock; +} kvz_rc_data; + +kvz_rc_data * kvz_get_rc_data(const encoder_control_t * const encoder); +void kvz_free_rc_data(); void kvz_set_picture_lambda_and_qp(encoder_state_t * const state); void kvz_set_lcu_lambda_and_qp(encoder_state_t * const state, vector2d_t pos); +void kvz_set_ctu_qp_lambda(encoder_state_t * const state, vector2d_t pos); +void kvz_update_after_picture(encoder_state_t * const state); +void kvz_estimate_pic_lambda(encoder_state_t * const state); + #endif // RATE_CONTROL_H_
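The struct above is shared by all encoder states, so the per-layer rwlocks matter: the per-CTU C/K tables are read-locked while a frame copies them in kvz_estimate_pic_lambda() and write-locked when kvz_update_after_picture() stores the refitted model. A minimal sketch of that read path, assuming rate_control.h (above) and pthread are available; the helper name is illustrative and the snippet mirrors, rather than replaces, the code in rate_control.c.

#include <pthread.h>
#include <string.h>

/* Copy the per-CTU C/K model for one GOP layer under the layer's read lock,
 * so a concurrently finishing frame can still take the write lock to update
 * the model afterwards. */
static void copy_ck_for_layer(kvz_rc_data *rc, int layer, int num_ctus,
                              double *c_out, double *k_out)
{
  pthread_rwlock_rdlock(&rc->ck_ctu_lock[layer]);
  memcpy(c_out, rc->c_para[layer], num_ctus * sizeof(double));
  memcpy(k_out, rc->k_para[layer], num_ctus * sizeof(double));
  pthread_rwlock_unlock(&rc->ck_ctu_lock[layer]);
}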
View file
kvazaar-1.3.0.tar.gz/src/sao.c -> kvazaar-2.0.0.tar.gz/src/sao.c
Changed
@@ -157,29 +157,36 @@ return mode_bits; } - /** * \brief calculate an array of intensity correlations for each intensity value */ +// NOTE: There's also an AVX2 variant of this in strategies/avx2/sao-avx2.c. +// It has to be separate, because it returns the offset array in different +// format (an array of YMM vectors). void kvz_calc_sao_offset_array(const encoder_control_t * const encoder, const sao_info_t *sao, int *offset, color_t color_i) { - int val; - int values = (1<<encoder->bitdepth); - int shift = encoder->bitdepth-5; - int band_pos = (color_i == COLOR_V) ? 1 : 0; + int32_t val; + const int32_t values = (1<<encoder->bitdepth); + const int32_t shift = encoder->bitdepth-5; + const int32_t band_pos = (color_i == COLOR_V) ? 1 : 0; + const int32_t cur_bp = sao->band_position[band_pos]; // Loop through all intensity values and construct an offset array for (val = 0; val < values; val++) { - int cur_band = val>>shift; - if (cur_band >= sao->band_position[band_pos] && cur_band < sao->band_position[band_pos] + 4) { - offset[val] = CLIP(0, values - 1, val + sao->offsets[cur_band - sao->band_position[band_pos] + 1 + 5 * band_pos]); + int32_t cur_band = val >> shift; + int32_t cb_minus_cbp = cur_band - cur_bp; + + if (cb_minus_cbp >= 0 && cb_minus_cbp <= 3) { + uint32_t offset_id = cb_minus_cbp + 1 + 5 * band_pos; + int32_t val_unclipped = val + sao->offsets[offset_id]; + offset[val] = CLIP(0, values - 1, val_unclipped); + } else { offset[val] = val; } } } - /** * \param orig_data Original pixel data. 64x64 for luma, 32x32 for chroma. * \param rec_data Reconstructed pixel data. 64x64 for luma, 32x32 for chroma. @@ -254,8 +261,11 @@ //Loop pixels and take top 5 bits to classify different bands for (y = 0; y < block_height; ++y) { for (x = 0; x < block_width; ++x) { - sao_bands[0][rec_data[y * block_width + x]>>shift] += orig_data[y * block_width + x] - rec_data[y * block_width + x]; - sao_bands[1][rec_data[y * block_width + x]>>shift]++; + int32_t curr_pos = y * block_width + x; + + kvz_pixel sb_index = rec_data[curr_pos] >> shift; + sao_bands[0][sb_index] += orig_data[curr_pos] - rec_data[curr_pos]; + sao_bands[1][sb_index]++; } } }
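A standalone illustration of the band-offset table that kvz_calc_sao_offset_array() builds for an 8-bit stream: pixel values are grouped into 32 bands of 8 (val >> 3) and only the four bands starting at band_position receive an offset, everything else passes through. The band position and offsets below are made up, and the offsets array is flattened to four entries instead of the encoder's 5-per-plane layout.

#include <stdio.h>

#define CLIP(low, high, value) \
  ((value) < (low) ? (low) : ((value) > (high) ? (high) : (value)))

int main(void)
{
  const int bitdepth = 8;
  const int values = 1 << bitdepth;         /* 256 possible pixel values      */
  const int shift = bitdepth - 5;           /* 3 -> 32 bands of 8 values each */
  const int band_position = 12;             /* assumed, signalled per CTU     */
  const int offsets[4] = { 2, 1, -1, -3 };  /* assumed band offsets           */
  int table[256];

  for (int val = 0; val < values; ++val) {
    int band = val >> shift;
    int d = band - band_position;
    table[val] = (d >= 0 && d <= 3) ? CLIP(0, values - 1, val + offsets[d])
                                    : val;
  }

  /* Pixel 100 falls in band 12, the first corrected band: 100 + 2 = 102. */
  printf("pixel 100 -> %d, pixel 200 -> %d (outside the corrected bands)\n",
         table[100], table[200]);
  return 0;
}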
View file
kvazaar-1.3.0.tar.gz/src/search.c -> kvazaar-2.0.0.tar.gz/src/search.c
Changed
@@ -455,6 +455,11 @@ uint32_t inter_bitcost = MAX_INT; cu_info_t *cur_cu; + struct { + int32_t min; + int32_t max; + } pu_depth_inter, pu_depth_intra; + lcu_t *const lcu = &work_tree[depth]; int x_local = SUB_SCU(x); @@ -466,6 +471,21 @@ return 0; } + int gop_layer = ctrl->cfg.gop_len != 0 ? ctrl->cfg.gop[state->frame->gop_offset].layer - 1 : 0; + + // Assign correct depth limit + constraint_t* constr = state->constraint; + if(constr->ml_intra_depth_ctu) { + pu_depth_intra.min = constr->ml_intra_depth_ctu->_mat_upper_depth[(x_local >> 3) + (y_local >> 3) * 8]; + pu_depth_intra.max = constr->ml_intra_depth_ctu->_mat_lower_depth[(x_local >> 3) + (y_local >> 3) * 8]; + } + else { + pu_depth_intra.min = ctrl->cfg.pu_depth_intra.min[gop_layer] >= 0 ? ctrl->cfg.pu_depth_intra.min[gop_layer] : ctrl->cfg.pu_depth_intra.min[0]; + pu_depth_intra.max = ctrl->cfg.pu_depth_intra.max[gop_layer] >= 0 ? ctrl->cfg.pu_depth_intra.max[gop_layer] : ctrl->cfg.pu_depth_intra.max[0]; + } + pu_depth_inter.min = ctrl->cfg.pu_depth_inter.min[gop_layer] >= 0 ? ctrl->cfg.pu_depth_inter.min[gop_layer] : ctrl->cfg.pu_depth_inter.min[0]; + pu_depth_inter.max = ctrl->cfg.pu_depth_inter.max[gop_layer] >= 0 ? ctrl->cfg.pu_depth_inter.max[gop_layer] : ctrl->cfg.pu_depth_inter.max[0]; + cur_cu = LCU_GET_CU_AT_PX(lcu, x_local, y_local); // Assign correct depth cur_cu->depth = depth > MAX_DEPTH ? MAX_DEPTH : depth; @@ -479,12 +499,12 @@ if (x + cu_width <= frame->width && y + cu_width <= frame->height) { - int cu_width_inter_min = LCU_WIDTH >> ctrl->cfg.pu_depth_inter.max; + int cu_width_inter_min = LCU_WIDTH >> pu_depth_inter.max; bool can_use_inter = state->frame->slicetype != KVZ_SLICE_I && depth <= MAX_DEPTH && ( - WITHIN(depth, ctrl->cfg.pu_depth_inter.min, ctrl->cfg.pu_depth_inter.max) || + WITHIN(depth, pu_depth_inter.min, pu_depth_inter.max) || // When the split was forced because the CTU is partially outside the // frame, we permit inter coding even if pu_depth_inter would // otherwise forbid it. @@ -520,11 +540,11 @@ const int last_mode = (ctrl->cfg.amp_enable && cu_width >= 16) ? 5 : 1; for (int i = first_mode; i <= last_mode; ++i) { kvz_search_cu_smp(state, - x, y, - depth, - mp_modes[i], - &work_tree[depth + 1], - &mode_cost, &mode_bitcost); + x, y, + depth, + mp_modes[i], + &work_tree[depth + 1], + &mode_cost, &mode_bitcost); if (mode_cost < cost) { cost = mode_cost; inter_bitcost = mode_bitcost; @@ -543,9 +563,9 @@ && cost / (cu_width * cu_width) < INTRA_THRESHOLD) || (ctrl->cfg.early_skip && cur_cu->skipped); - int32_t cu_width_intra_min = LCU_WIDTH >> ctrl->cfg.pu_depth_intra.max; + int32_t cu_width_intra_min = LCU_WIDTH >> pu_depth_intra.max; bool can_use_intra = - WITHIN(depth, ctrl->cfg.pu_depth_intra.min, ctrl->cfg.pu_depth_intra.max) || + WITHIN(depth, pu_depth_intra.min, pu_depth_intra.max) || // When the split was forced because the CTU is partially outside // the frame, we permit intra coding even if pu_depth_intra would // otherwise forbid it. 
@@ -604,20 +624,21 @@ } kvz_lcu_fill_trdepth(lcu, x, y, depth, tr_depth); - kvz_inter_recon_cu(state, lcu, x, y, cu_width); + const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; + kvz_inter_recon_cu(state, lcu, x, y, cu_width, true, has_chroma); - if (!ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) { + if (ctrl->cfg.zero_coeff_rdo && !ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) { //Calculate cost for zero coeffs inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, depth) + inter_bitcost * state->lambda; } - const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; kvz_quantize_lcu_residual(state, true, has_chroma, x, y, depth, NULL, - lcu); + lcu, + false); int cbf = cbf_is_set_any(cur_cu->cbf, depth); @@ -650,7 +671,7 @@ cost += mode_bits * state->lambda; - if (inter_zero_coeff_cost <= cost) { + if (ctrl->cfg.zero_coeff_rdo && inter_zero_coeff_cost <= cost) { cost = inter_zero_coeff_cost; // Restore saved pixels from lower level of the working tree. @@ -677,9 +698,9 @@ // If the CU is partially outside the frame, we need to split it even // if pu_depth_intra and pu_depth_inter would not permit it. cur_cu->type == CU_NOTSET || - depth < ctrl->cfg.pu_depth_intra.max || + depth < pu_depth_intra.max || (state->frame->slicetype != KVZ_SLICE_I && - depth < ctrl->cfg.pu_depth_inter.max); + depth < pu_depth_inter.max); // Recursively split all the way to max search depth. if (can_split_cu) { @@ -937,11 +958,21 @@ work_tree[depth] = work_tree[0]; } + // If the ML depth prediction is enabled, + // generate the depth prediction interval + // for the current lcu + constraint_t* constr = state->constraint; + if (constr->ml_intra_depth_ctu) { + kvz_lcu_luma_depth_pred(constr->ml_intra_depth_ctu, work_tree[0].ref.y, state->qp); + } + // Start search from depth 0. double cost = search_cu(state, x, y, 0, work_tree); // Save squared cost for rate control. - kvz_get_lcu_stats(state, x / LCU_WIDTH, y / LCU_WIDTH)->weight = cost * cost; + if(state->encoder_control->cfg.rc_algorithm == KVZ_LAMBDA) { + kvz_get_lcu_stats(state, x / LCU_WIDTH, y / LCU_WIDTH)->weight = cost * cost; + } // The best decisions through out the LCU got propagated back to depth 0, // so copy those back to the frame.
View file
kvazaar-1.3.0.tar.gz/src/search.h -> kvazaar-2.0.0.tar.gz/src/search.h
Changed
@@ -30,6 +30,7 @@ #include "encoderstate.h" #include "global.h" // IWYU pragma: keep #include "image.h" +#include "constraint.h" void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length);
View file
kvazaar-1.3.0.tar.gz/src/search_inter.c -> kvazaar-2.0.0.tar.gz/src/search_inter.c
Changed
@@ -1135,15 +1135,56 @@ if (src.malloc_used) free(src.buffer); } +/** +* \brief Calculate the scaled MV +*/ +static INLINE int16_t get_scaled_mv(int16_t mv, int scale) +{ + int32_t scaled = scale * mv; + return CLIP(-32768, 32767, (scaled + 127 + (scaled < 0)) >> 8); +} +/** +* \brief Scale the MV according to the POC difference +* +* \param current_poc POC of current frame +* \param current_ref_poc POC of reference frame +* \param neighbor_poc POC of neighbor frame +* \param neighbor_ref_poc POC of neighbors reference frame +* \param mv_cand MV candidates to scale +*/ +static void apply_mv_scaling(int32_t current_poc, + int32_t current_ref_poc, + int32_t neighbor_poc, + int32_t neighbor_ref_poc, + vector2d_t* mv_cand) +{ + int32_t diff_current = current_poc - current_ref_poc; + int32_t diff_neighbor = neighbor_poc - neighbor_ref_poc; + + if (diff_current == diff_neighbor) return; + if (diff_neighbor == 0) return; + + diff_current = CLIP(-128, 127, diff_current); + diff_neighbor = CLIP(-128, 127, diff_neighbor); + + int scale = CLIP(-4096, 4095, + (diff_current * ((0x4000 + (abs(diff_neighbor) >> 1)) / diff_neighbor) + 32) >> 6); + + mv_cand->x = get_scaled_mv(mv_cand->x, scale); + mv_cand->y = get_scaled_mv(mv_cand->y, scale); +} + /** * \brief Perform inter search for a single reference frame. */ static void search_pu_inter_ref(inter_search_info_t *info, - int depth, - lcu_t *lcu, cu_info_t *cur_cu, - double *inter_cost, - uint32_t *inter_bitcost) + int depth, + lcu_t *lcu, cu_info_t *cur_cu, + double *inter_cost, + uint32_t *inter_bitcost, + double *best_LX_cost, + cu_info_t *unipred_LX) { const kvz_config *cfg = &info->state->encoder_control->cfg; @@ -1153,20 +1194,20 @@ int8_t LX_idx; // max value of LX_idx plus one const int8_t LX_IDX_MAX_PLUS_1 = MAX(info->state->frame->ref_LX_size[0], - info->state->frame->ref_LX_size[1]); + info->state->frame->ref_LX_size[1]); for (LX_idx = 0; LX_idx < LX_IDX_MAX_PLUS_1; LX_idx++) { // check if ref_idx is in L0 if (LX_idx < info->state->frame->ref_LX_size[0] && - info->state->frame->ref_LX[0][LX_idx] == info->ref_idx) { + info->state->frame->ref_LX[0][LX_idx] == info->ref_idx) { ref_list = 0; break; } // check if ref_idx is in L1 if (LX_idx < info->state->frame->ref_LX_size[1] && - info->state->frame->ref_LX[1][LX_idx] == info->ref_idx) { + info->state->frame->ref_LX[1][LX_idx] == info->ref_idx) { ref_list = 1; break; } @@ -1194,22 +1235,57 @@ cur_cu->inter.mv_ref[ref_list] = temp_ref_idx; vector2d_t mv = { 0, 0 }; - { - // Take starting point for MV search from previous frame. - // When temporal motion vector candidates are added, there is probably - // no point to this anymore, but for now it helps. - const int mid_x = info->state->tile->offset_x + info->origin.x + (info->width >> 1); - const int mid_y = info->state->tile->offset_y + info->origin.y + (info->height >> 1); - const cu_array_t* ref_array = info->state->frame->ref->cu_arrays[info->ref_idx]; - const cu_info_t* ref_cu = kvz_cu_array_at_const(ref_array, mid_x, mid_y); - if (ref_cu->type == CU_INTER) { - if (ref_cu->inter.mv_dir & 1) { - mv.x = ref_cu->inter.mv[0][0]; - mv.y = ref_cu->inter.mv[0][1]; - } else { - mv.x = ref_cu->inter.mv[1][0]; - mv.y = ref_cu->inter.mv[1][1]; + + // Take starting point for MV search from previous frame. + // When temporal motion vector candidates are added, there is probably + // no point to this anymore, but for now it helps. 
+ const int mid_x = info->state->tile->offset_x + info->origin.x + (info->width >> 1); + const int mid_y = info->state->tile->offset_y + info->origin.y + (info->height >> 1); + const cu_array_t* ref_array = info->state->frame->ref->cu_arrays[info->ref_idx]; + const cu_info_t* ref_cu = kvz_cu_array_at_const(ref_array, mid_x, mid_y); + if (ref_cu->type == CU_INTER) { + if (ref_cu->inter.mv_dir & 1) { + mv.x = ref_cu->inter.mv[0][0]; + mv.y = ref_cu->inter.mv[0][1]; + } + else { + mv.x = ref_cu->inter.mv[1][0]; + mv.y = ref_cu->inter.mv[1][1]; + } + // Apply mv scaling if neighbor poc is available + if (info->state->frame->ref_LX_size[ref_list] > 0) { + // When there are reference pictures from the future (POC > current POC) + // in L0 or L1, the primary list for the colocated PU is the inverse of + // collocated_from_l0_flag. Otherwise it is equal to reflist. + // + // Kvazaar always sets collocated_from_l0_flag so the list is L1 when + // there are future references. + int col_list = ref_list; + for (int i = 0; i < info->state->frame->ref->used_size; i++) { + if (info->state->frame->ref->pocs[i] > info->state->frame->poc) { + col_list = 1; + break; + } } + if ((ref_cu->inter.mv_dir & (col_list + 1)) == 0) { + // Use the other list if the colocated PU does not have a MV for the + // primary list. + col_list = 1 - col_list; + } + + uint8_t neighbor_poc_index = info->state->frame->ref_LX[ref_list][LX_idx]; + // Scaling takes current POC, reference POC, neighbor POC and neighbor reference POC as argument + apply_mv_scaling( + info->state->frame->poc, + info->state->frame->ref->pocs[info->state->frame->ref_LX[ref_list][LX_idx]], + info->state->frame->ref->pocs[neighbor_poc_index], + info->state->frame->ref->images[neighbor_poc_index]->ref_pocs[ + info->state->frame->ref->ref_LXs[neighbor_poc_index] + [col_list] + [ref_cu->inter.mv_ref[col_list]] + ], + &mv + ); } } @@ -1303,6 +1379,23 @@ *inter_cost = info->best_cost; *inter_bitcost = info->best_bitcost + cur_cu->inter.mv_dir - 1 + mv_ref_coded; } + + + // Update best unipreds for biprediction + if (info->best_cost < best_LX_cost[ref_list]) { + bool valid_mv = fracmv_within_tile(info, mv.x, mv.y); + if (valid_mv) { + // Map reference index to L0/L1 pictures + unipred_LX[ref_list].inter.mv_dir = ref_list + 1; + unipred_LX[ref_list].inter.mv_ref[ref_list] = LX_idx; + unipred_LX[ref_list].inter.mv[ref_list][0] = (int16_t)mv.x; + unipred_LX[ref_list].inter.mv[ref_list][1] = (int16_t)mv.y; + + CU_SET_MV_CAND(&unipred_LX[ref_list], ref_list, cu_mv_cand); + + best_LX_cost[ref_list] = info->best_cost; + } + } } @@ -1365,7 +1458,9 @@ width, height, mv, - lcu); + lcu, + true, + false); const kvz_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; const kvz_pixel *src = &frame->source->y[x + y * frame->source->width]; @@ -1442,6 +1537,37 @@ } /** + * \brief Check if an identical merge candidate exists in a list + * + * \param all_cand Full list of available merge candidates + * \param cand_to_add Merge candidate to be checked for duplicates + * \param added_idx_list List of indices of unique merge candidates + * \param list_size Size of the list + * + * \return Does an identical candidate exist in list + */ +static bool merge_candidate_in_list(inter_merge_cand_t * all_cands, + inter_merge_cand_t * cand_to_add, + int8_t * added_idx_list, + int list_size) +{ + bool found = false; + for (int i = 0; i < list_size && !found; ++i) { + inter_merge_cand_t * list_cand = &all_cands[added_idx_list[i]]; + + found = cand_to_add->dir == list_cand->dir && + 
cand_to_add->ref[0] == list_cand->ref[0] && + cand_to_add->mv[0][0] == list_cand->mv[0][0] && + cand_to_add->mv[0][1] == list_cand->mv[0][1] && + cand_to_add->ref[1] == list_cand->ref[1] && + cand_to_add->mv[1][0] == list_cand->mv[1][0] && + cand_to_add->mv[1][1] == list_cand->mv[1][1]; + } + + return found; +} + +/** * \brief Update PU to have best modes at this depth. * * \param state encoder state @@ -1510,54 +1636,68 @@ CU_SET_MV_CAND(cur_cu, 0, 0); CU_SET_MV_CAND(cur_cu, 1, 0); - // Early Skip Mode Decision - if (cfg->early_skip && cur_cu->part_size == SIZE_2Nx2N) { + // Merge Analysis starts here + int8_t mrg_cands[MRG_MAX_NUM_CANDS]; + double mrg_costs[MRG_MAX_NUM_CANDS]; + for (int i = 0; i < MRG_MAX_NUM_CANDS; ++i) { + mrg_cands[i] = -1; + mrg_costs[i] = MAX_DOUBLE; + } - int num_rdo_cands = 0; - int8_t mrg_cands[MRG_MAX_NUM_CANDS] = { 0, 1, 2, 3, 4 }; - double mrg_costs[MRG_MAX_NUM_CANDS] = { MAX_DOUBLE }; + int num_rdo_cands = 0; - // Check motion vector constraints and perform rough search - for (int merge_idx = 0; merge_idx < info.num_merge_cand; ++merge_idx) { + // Check motion vector constraints and perform rough search + for (int merge_idx = 0; merge_idx < info.num_merge_cand; ++merge_idx) { - cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir; - cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; - cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; - cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; - cur_cu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1]; - cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; - cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; + inter_merge_cand_t *cur_cand = &info.merge_cand[merge_idx]; + cur_cu->inter.mv_dir = cur_cand->dir; + cur_cu->inter.mv_ref[0] = cur_cand->ref[0]; + cur_cu->inter.mv_ref[1] = cur_cand->ref[1]; + cur_cu->inter.mv[0][0] = cur_cand->mv[0][0]; + cur_cu->inter.mv[0][1] = cur_cand->mv[0][1]; + cur_cu->inter.mv[1][0] = cur_cand->mv[1][0]; + cur_cu->inter.mv[1][1] = cur_cand->mv[1][1]; - // Don't try merge candidates that don't satisfy mv constraints. - if (!fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]) || - !fracmv_within_tile(&info, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1])) - { - continue; - } - - if (cfg->rdo >= 2) { + // If bipred is not enabled, do not try candidates with mv_dir == 3. + // Bipred is also forbidden for 4x8 and 8x4 blocks by the standard. + if (cur_cu->inter.mv_dir == 3 && !state->encoder_control->cfg.bipred) continue; + if (cur_cu->inter.mv_dir == 3 && !(width + height > 12)) continue; - kvz_lcu_fill_trdepth(lcu, x, y, depth, depth); - kvz_inter_recon_cu(state, lcu, x, y, width); - mrg_costs[merge_idx] = kvz_satd_any_size(width, height, - lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, - lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); - } + bool is_duplicate = merge_candidate_in_list(info.merge_cand, cur_cand, + mrg_cands, + num_rdo_cands); - num_rdo_cands++; + // Don't try merge candidates that don't satisfy mv constraints. 
+ // Don't add duplicates to list + if (!fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]) || + !fracmv_within_tile(&info, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1]) || + is_duplicate) + { + continue; } + kvz_inter_pred_pu(state, lcu, x_cu, y_cu, width_cu, true, false, i_pu); + mrg_costs[num_rdo_cands] = kvz_satd_any_size(width, height, + lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, + lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); + + // Add cost of coding the merge index + mrg_costs[num_rdo_cands] += merge_idx * info.state->lambda_sqrt; - if (cfg->rdo >= 2) { - // Sort candidates by cost - kvz_sort_modes(mrg_cands, mrg_costs, num_rdo_cands); - } + mrg_cands[num_rdo_cands] = merge_idx; + num_rdo_cands++; + } - // Limit by availability - // TODO: Do not limit to just 1 - num_rdo_cands = MIN(1, num_rdo_cands); + // Sort candidates by cost + kvz_sort_modes(mrg_cands, mrg_costs, num_rdo_cands); - // RDO search + // Limit by availability + // TODO: Do not limit to just 1 + num_rdo_cands = MIN(1, num_rdo_cands); + + // Early Skip Mode Decision + bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; + if (cfg->early_skip && cur_cu->part_size == SIZE_2Nx2N) { for (int merge_rdo_idx = 0; merge_rdo_idx < num_rdo_cands; ++merge_rdo_idx) { // Reconstruct blocks with merge candidate. @@ -1573,32 +1713,39 @@ cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; kvz_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); - kvz_inter_recon_cu(state, lcu, x, y, width); - kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_cu, lcu); + kvz_inter_recon_cu(state, lcu, x, y, width, true, false); + kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_cu, lcu, true); if (cbf_is_set(cur_cu->cbf, depth, COLOR_Y)) { continue; } - else if(state->encoder_control->chroma_format != KVZ_CSP_400) { - - kvz_quantize_lcu_residual(state, false, true, x, y, depth, cur_cu, lcu); + else if (has_chroma) { + kvz_inter_recon_cu(state, lcu, x, y, width, false, has_chroma); + kvz_quantize_lcu_residual(state, false, has_chroma, x, y, depth, cur_cu, lcu, true); if (!cbf_is_set_any(cur_cu->cbf, depth)) { cur_cu->type = CU_INTER; cur_cu->merge_idx = merge_idx; cur_cu->skipped = true; *inter_cost = 0.0; // TODO: Check this - *inter_bitcost = 0; // TODO: Check this + *inter_bitcost = merge_idx; // TODO: Check this return; } } } } + // AMVP search starts here + + // Store unipred information of L0 and L1 for biprediction + // Best cost will be left at MAX_DOUBLE if no valid CU is found + double best_cost_LX[2] = { MAX_DOUBLE, MAX_DOUBLE }; + cu_info_t unipreds[2]; + for (int ref_idx = 0; ref_idx < state->frame->ref->used_size; ref_idx++) { info.ref_idx = ref_idx; info.ref = state->frame->ref->images[ref_idx]; - search_pu_inter_ref(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost); + search_pu_inter_ref(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost, best_cost_LX, unipreds); } // Search bi-pred positions @@ -1607,7 +1754,129 @@ && width + height >= 16; // 4x8 and 8x4 PBs are restricted to unipred if (can_use_bipred) { - search_pu_inter_bipred(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost); + + // Try biprediction from valid acquired unipreds. + if (best_cost_LX[0] != MAX_DOUBLE && best_cost_LX[1] != MAX_DOUBLE) { + + // TODO: logic is copy paste from search_pu_inter_bipred. + // Get rid of duplicate code asap. 
+ const image_list_t *const ref = info.state->frame->ref; + uint8_t(*ref_LX)[16] = info.state->frame->ref_LX; + + inter_merge_cand_t *merge_cand = info.merge_cand; + + int16_t mv[2][2]; + mv[0][0] = unipreds[0].inter.mv[0][0]; + mv[0][1] = unipreds[0].inter.mv[0][1]; + mv[1][0] = unipreds[1].inter.mv[1][0]; + mv[1][1] = unipreds[1].inter.mv[1][1]; + + kvz_inter_recon_bipred(info.state, + ref->images[ref_LX[0][unipreds[0].inter.mv_ref[0]]], + ref->images[ref_LX[1][unipreds[1].inter.mv_ref[1]]], + x, y, + width, + height, + mv, + lcu, + true, + false); + + const kvz_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; + const kvz_pixel *src = &lcu->ref.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; + uint32_t cost = + kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, LCU_WIDTH); + + uint32_t bitcost[2] = { 0, 0 }; + + cost += info.mvd_cost_func(info.state, + unipreds[0].inter.mv[0][0], + unipreds[0].inter.mv[0][1], + 0, + info.mv_cand, + NULL, 0, 0, + &bitcost[0]); + cost += info.mvd_cost_func(info.state, + unipreds[1].inter.mv[1][0], + unipreds[1].inter.mv[1][1], + 0, + info.mv_cand, + NULL, 0, 0, + &bitcost[1]); + + const uint8_t mv_ref_coded[2] = { + unipreds[0].inter.mv_ref[0], + unipreds[1].inter.mv_ref[1] + }; + const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; + cost += info.state->lambda_sqrt * extra_bits + 0.5; + + if (cost < *inter_cost) { + cur_cu->inter.mv_dir = 3; + + cur_cu->inter.mv_ref[0] = unipreds[0].inter.mv_ref[0]; + cur_cu->inter.mv_ref[1] = unipreds[1].inter.mv_ref[1]; + + cur_cu->inter.mv[0][0] = unipreds[0].inter.mv[0][0]; + cur_cu->inter.mv[0][1] = unipreds[0].inter.mv[0][1]; + cur_cu->inter.mv[1][0] = unipreds[1].inter.mv[1][0]; + cur_cu->inter.mv[1][1] = unipreds[1].inter.mv[1][1]; + cur_cu->merged = 0; + + // Check every candidate to find a match + for (int merge_idx = 0; merge_idx < info.num_merge_cand; merge_idx++) { + if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] && + merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] && + merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] && + merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] && + merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] && + merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) + { + cur_cu->merged = 1; + cur_cu->merge_idx = merge_idx; + break; + } + } + + // Each motion vector has its own candidate + for (int reflist = 0; reflist < 2; reflist++) { + kvz_inter_get_mv_cand(info.state, x, y, width, height, info.mv_cand, cur_cu, lcu, reflist); + int cu_mv_cand = select_mv_cand( + info.state, + info.mv_cand, + cur_cu->inter.mv[reflist][0], + cur_cu->inter.mv[reflist][1], + NULL); + CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand); + } + + *inter_cost = cost; + *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits; + } + } + + // TODO: this probably should have a separate command line option + if (cfg->rdo == 3) { + search_pu_inter_bipred(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost); + } + } + + // Compare best merge cost to amvp cost + if (mrg_costs[0] < *inter_cost) { + *inter_cost = mrg_costs[0]; + *inter_bitcost = 0; // TODO: Check this + int merge_idx = mrg_cands[0]; + cur_cu->type = CU_INTER; + cur_cu->merge_idx = merge_idx; + cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir; + cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; + cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; + cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; + cur_cu->inter.mv[0][1] = 
info.merge_cand[merge_idx].mv[0][1]; + cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; + cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; + cur_cu->merged = true; + cur_cu->skipped = false; } if (*inter_cost < INT_MAX && cur_cu->inter.mv_dir == 1) { @@ -1646,16 +1915,17 @@ tr_depth = depth + 1; } kvz_lcu_fill_trdepth(lcu, x, y, depth, tr_depth); - kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth)); const bool reconstruct_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; + kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth), true, reconstruct_chroma); kvz_quantize_lcu_residual(state, true, reconstruct_chroma, x, y, depth, NULL, - lcu); + lcu, + false); *inter_cost = kvz_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu); - if (state->encoder_control->chroma_format != KVZ_CSP_400) { + if (reconstruct_chroma) { *inter_cost += kvz_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu); }
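The POC-based scaling added above is easiest to sanity-check with concrete numbers. The sketch below copies the arithmetic of apply_mv_scaling()/get_scaled_mv() into a standalone program and feeds it made-up POCs where the current reference is twice as far away as the colocated block's reference, so the MV should roughly double; it skips the early-return cases (equal distances or a zero neighbour distance) that the encoder handles first.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define CLIP(low, high, value) \
  ((value) < (low) ? (low) : ((value) > (high) ? (high) : (value)))

/* Same Q8 fixed-point rescale as get_scaled_mv() above. */
static int16_t scaled_mv(int16_t mv, int scale)
{
  int32_t scaled = scale * mv;
  return CLIP(-32768, 32767, (scaled + 127 + (scaled < 0)) >> 8);
}

int main(void)
{
  /* Made-up POCs: current frame 8 references POC 4 (distance 4); the
   * colocated block in frame 4 referenced POC 2 (distance 2). */
  int32_t diff_current  = CLIP(-128, 127, 8 - 4);
  int32_t diff_neighbor = CLIP(-128, 127, 4 - 2);
  int16_t mv_x = 10, mv_y = -6;  /* colocated MV in quarter-pel units */

  int scale = CLIP(-4096, 4095,
      (diff_current * ((0x4000 + (abs(diff_neighbor) >> 1)) / diff_neighbor) + 32) >> 6);

  /* scale ends up 512 (2.0 in Q8), so (10,-6) becomes (20,-12). */
  printf("scale=%d  (%d,%d) -> (%d,%d)\n", scale, mv_x, mv_y,
         scaled_mv(mv_x, scale), scaled_mv(mv_y, scale));
  return 0;
}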
kvazaar-1.3.0.tar.gz/src/search_intra.c -> kvazaar-2.0.0.tar.gz/src/search_intra.c
Changed
@@ -406,8 +406,9 @@ kvz_pixels_blit(orig, orig_block, width, width, origstride, width); int8_t modes_selected = 0; - unsigned min_cost = UINT_MAX; - unsigned max_cost = 0; + // Note: get_cost and get_cost_dual may return negative costs. + int32_t min_cost = INT_MAX; + int32_t max_cost = INT_MIN; // Initial offset decides how many modes are tried before moving on to the // recursive search.
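The switch from unsigned to int32_t in this hunk matters because a negative cost wrapped into an unsigned variable turns into a huge positive value and silently loses the minimum comparison. A small self-contained demonstration of the failure mode (not kvazaar code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int32_t costs[3] = { 120, -4, 35 };   /* -4: e.g. a bias-corrected cost */

    uint32_t min_u = UINT32_MAX;          /* old approach */
    int32_t  min_s = INT32_MAX;           /* new approach */

    for (int i = 0; i < 3; i++) {
        if ((uint32_t)costs[i] < min_u) min_u = (uint32_t)costs[i];
        if (costs[i] < min_s)           min_s = costs[i];
    }

    /* min_u ends up 35 because (uint32_t)-4 == 4294967292, min_s is -4. */
    printf("unsigned minimum: %u, signed minimum: %d\n",
           (unsigned)min_u, (int)min_s);
    return 0;
}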
kvazaar-1.3.0.tar.gz/src/strategies/altivec/picture-altivec.c -> kvazaar-2.0.0.tar.gz/src/strategies/altivec/picture-altivec.c
Changed
@@ -21,7 +21,9 @@ #include "strategies/altivec/picture-altivec.h" #if COMPILE_POWERPC_ALTIVEC +#undef bool #include <altivec.h> +#define bool _Bool #include <stdlib.h> #include "kvazaar.h"
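A plausible reading of this workaround (the exact failure mode is an assumption, it is not stated in the sources): <altivec.h> supplies its own vector-context meaning for bool, which collides with the C99 bool macro from <stdbool.h>, so the macro is removed for the duration of the include and restored afterwards. C99 explicitly permits undefining and then redefining bool, so the pattern below is portable:

#include <stdbool.h>

#undef bool              /* let <altivec.h> define its own notion of bool        */
/* #include <altivec.h>     only available when targeting PowerPC with AltiVec   */
#define bool _Bool       /* restore C99 semantics for the rest of the file       */

static bool example_flag = true;   /* still behaves as expected after the restore */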
kvazaar-1.3.0.tar.gz/src/strategies/avx2/avx2_common_functions.h -> kvazaar-2.0.0.tar.gz/src/strategies/avx2/avx2_common_functions.h
Changed
@@ -3,6 +3,30 @@ #include <immintrin.h> +// The calling convention used by MSVC on 32-bit builds will essentially +// disallow functions to have more than 3 XMM/YMM parameters, because it +// will not provide more than 8-byte param alignment, and only the first +// three vector params will be carried in SIMD registers. Now the +// vectorcall convention could probably be problematic in globally visible +// funcitons, but likely not in static ones. +#if defined _MSC_VER && defined _WIN32 && !defined _WIN64 + #define FIX_W32 __vectorcall +#else + #define FIX_W32 +#endif + +// Non-inline functions defined in this header are likely to trigger a +// warning for each module including this header that does NOT use them, +// at least on unix-ish platforms (GCC/Clang both on native Unix and MinGW). +// Tell 'em we actually want to do that, it's not an accident. +#if defined __GNUC__ || defined __clang__ || defined __MINGW32__ || defined __MINGW64__ + #define FIX_UNUSED __attribute__((unused)) +#else + #define FIX_UNUSED +#endif + +#define FIX_NOINLINE FIX_W32 FIX_UNUSED + /* * Reorder coefficients from raster to scan order * Fun fact: Once upon a time, doing this in a loop looked like this: @@ -111,4 +135,19 @@ *last = (31 - (int32_t)_lzcnt_u32(nonzero_bytes)) >> 1; } +static int32_t FIX_NOINLINE hsum_8x32b(const __m256i v) +{ + __m256i sum1 = v; + __m256i sum2 = _mm256_permute4x64_epi64(sum1, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i sum3 = _mm256_add_epi32 (sum1, sum2); + __m256i sum4 = _mm256_shuffle_epi32 (sum3, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i sum5 = _mm256_add_epi32 (sum3, sum4); + __m256i sum6 = _mm256_shuffle_epi32 (sum5, _MM_SHUFFLE(2, 3, 0, 1)); + __m256i sum7 = _mm256_add_epi32 (sum5, sum6); + + __m128i sum8 = _mm256_castsi256_si128 (sum7); + int32_t sum9 = _mm_cvtsi128_si32 (sum8); + return sum9; +} + #endif
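hsum_8x32b() above reduces the eight 32-bit lanes of a YMM register to a single scalar by folding the vector in half three times (128-bit lanes, then 64-bit halves, then 32-bit halves), which is enough because each round halves the number of distinct partial sums. A scalar reference model of what it computes (illustrative only, not part of the header):

#include <stdint.h>

/* Same result as hsum_8x32b() for the eight 32-bit elements of the vector.
 * The accumulator is unsigned so that wraparound is well defined, matching
 * the modular behaviour of the packed 32-bit adds. */
static int32_t hsum_8x32b_scalar(const int32_t v[8])
{
    uint32_t sum = 0;
    for (int i = 0; i < 8; i++) {
        sum += (uint32_t)v[i];
    }
    return (int32_t)sum;
}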
kvazaar-1.3.0.tar.gz/src/strategies/avx2/dct-avx2.c -> kvazaar-2.0.0.tar.gz/src/strategies/avx2/dct-avx2.c
Changed
@@ -47,262 +47,834 @@ * \brief AVX2 transformations. */ +static INLINE __m256i swap_lanes(__m256i v) +{ + return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(1, 0, 3, 2)); +} + +static INLINE __m256i truncate(__m256i v, __m256i debias, int32_t shift) +{ + __m256i truncable = _mm256_add_epi32 (v, debias); + return _mm256_srai_epi32(truncable, shift); +} + // 4x4 matrix multiplication with value clipping. // Parameters: Two 4x4 matrices containing 16-bit values in consecutive addresses, // destination for the result and the shift value for clipping. -static void mul_clip_matrix_4x4_avx2(const int16_t *left, const int16_t *right, int16_t *dst, int32_t shift) +static __m256i mul_clip_matrix_4x4_avx2(const __m256i left, const __m256i right, int shift) { - __m256i b[2], a, result, even[2], odd[2]; + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); - const int32_t add = 1 << (shift - 1); + __m256i right_los = _mm256_permute4x64_epi64(right, _MM_SHUFFLE(2, 0, 2, 0)); + __m256i right_his = _mm256_permute4x64_epi64(right, _MM_SHUFFLE(3, 1, 3, 1)); - a = _mm256_loadu_si256((__m256i*) left); - b[0] = _mm256_loadu_si256((__m256i*) right); + __m256i right_cols_up = _mm256_unpacklo_epi16(right_los, right_his); + __m256i right_cols_dn = _mm256_unpackhi_epi16(right_los, right_his); - // Interleave values in both 128-bit lanes - b[0] = _mm256_unpacklo_epi16(b[0], _mm256_srli_si256(b[0], 8)); - b[1] = _mm256_permute2x128_si256(b[0], b[0], 1 + 16); - b[0] = _mm256_permute2x128_si256(b[0], b[0], 0); + __m256i left_slice1 = _mm256_shuffle_epi32(left, _MM_SHUFFLE(0, 0, 0, 0)); + __m256i left_slice2 = _mm256_shuffle_epi32(left, _MM_SHUFFLE(1, 1, 1, 1)); + __m256i left_slice3 = _mm256_shuffle_epi32(left, _MM_SHUFFLE(2, 2, 2, 2)); + __m256i left_slice4 = _mm256_shuffle_epi32(left, _MM_SHUFFLE(3, 3, 3, 3)); - // Fill both 128-lanes with the first pair of 16-bit factors in the lane. - even[0] = _mm256_shuffle_epi32(a, 0); - odd[0] = _mm256_shuffle_epi32(a, 1 + 4 + 16 + 64); + __m256i prod1 = _mm256_madd_epi16(left_slice1, right_cols_up); + __m256i prod2 = _mm256_madd_epi16(left_slice2, right_cols_dn); + __m256i prod3 = _mm256_madd_epi16(left_slice3, right_cols_up); + __m256i prod4 = _mm256_madd_epi16(left_slice4, right_cols_dn); - // Multiply packed elements and sum pairs. Input 16-bit output 32-bit. - even[0] = _mm256_madd_epi16(even[0], b[0]); - odd[0] = _mm256_madd_epi16(odd[0], b[1]); + __m256i rows_up = _mm256_add_epi32(prod1, prod2); + __m256i rows_dn = _mm256_add_epi32(prod3, prod4); - // Add the halves of the dot product and - // round. 
- result = _mm256_add_epi32(even[0], odd[0]); - result = _mm256_add_epi32(result, _mm256_set1_epi32(add)); - result = _mm256_srai_epi32(result, shift); + __m256i rows_up_tr = truncate(rows_up, debias, shift); + __m256i rows_dn_tr = truncate(rows_dn, debias, shift); - //Repeat for the remaining parts - even[1] = _mm256_shuffle_epi32(a, 2 + 8 + 32 + 128); - odd[1] = _mm256_shuffle_epi32(a, 3 + 12 + 48 + 192); + __m256i result = _mm256_packs_epi32(rows_up_tr, rows_dn_tr); + return result; +} - even[1] = _mm256_madd_epi16(even[1], b[0]); - odd[1] = _mm256_madd_epi16(odd[1], b[1]); +static void matrix_dst_4x4_avx2(int8_t bitdepth, const int16_t *input, int16_t *output) +{ + int32_t shift_1st = kvz_g_convert_to_bit[4] + 1 + (bitdepth - 8); + int32_t shift_2nd = kvz_g_convert_to_bit[4] + 8; + const int16_t *tdst = &kvz_g_dst_4_t[0][0]; + const int16_t *dst = &kvz_g_dst_4 [0][0]; - odd[1] = _mm256_add_epi32(even[1], odd[1]); - odd[1] = _mm256_add_epi32(odd[1], _mm256_set1_epi32(add)); - odd[1] = _mm256_srai_epi32(odd[1], shift); + __m256i tdst_v = _mm256_load_si256((const __m256i *) tdst); + __m256i dst_v = _mm256_load_si256((const __m256i *) dst); + __m256i in_v = _mm256_load_si256((const __m256i *)input); - // Truncate to 16-bit values - result = _mm256_packs_epi32(result, odd[1]); + __m256i tmp = mul_clip_matrix_4x4_avx2(in_v, tdst_v, shift_1st); + __m256i result = mul_clip_matrix_4x4_avx2(dst_v, tmp, shift_2nd); - _mm256_storeu_si256((__m256i*)dst, result); + _mm256_store_si256((__m256i *)output, result); +} + +static void matrix_idst_4x4_avx2(int8_t bitdepth, const int16_t *input, int16_t *output) +{ + int32_t shift_1st = 7; + int32_t shift_2nd = 12 - (bitdepth - 8); + + const int16_t *tdst = &kvz_g_dst_4_t[0][0]; + const int16_t *dst = &kvz_g_dst_4 [0][0]; + + __m256i tdst_v = _mm256_load_si256((const __m256i *)tdst); + __m256i dst_v = _mm256_load_si256((const __m256i *) dst); + __m256i in_v = _mm256_load_si256((const __m256i *)input); + + __m256i tmp = mul_clip_matrix_4x4_avx2(tdst_v, in_v, shift_1st); + __m256i result = mul_clip_matrix_4x4_avx2(tmp, dst_v, shift_2nd); + + _mm256_store_si256((__m256i *)output, result); +} + +static void matrix_dct_4x4_avx2(int8_t bitdepth, const int16_t *input, int16_t *output) +{ + int32_t shift_1st = kvz_g_convert_to_bit[4] + 1 + (bitdepth - 8); + int32_t shift_2nd = kvz_g_convert_to_bit[4] + 8; + const int16_t *tdct = &kvz_g_dct_4_t[0][0]; + const int16_t *dct = &kvz_g_dct_4 [0][0]; + + __m256i tdct_v = _mm256_load_si256((const __m256i *) tdct); + __m256i dct_v = _mm256_load_si256((const __m256i *) dct); + __m256i in_v = _mm256_load_si256((const __m256i *)input); + + __m256i tmp = mul_clip_matrix_4x4_avx2(in_v, tdct_v, shift_1st); + __m256i result = mul_clip_matrix_4x4_avx2(dct_v, tmp, shift_2nd); + + _mm256_store_si256((__m256i *)output, result); +} + +static void matrix_idct_4x4_avx2(int8_t bitdepth, const int16_t *input, int16_t *output) +{ + int32_t shift_1st = 7; + int32_t shift_2nd = 12 - (bitdepth - 8); + + const int16_t *tdct = &kvz_g_dct_4_t[0][0]; + const int16_t *dct = &kvz_g_dct_4 [0][0]; + + __m256i tdct_v = _mm256_load_si256((const __m256i *)tdct); + __m256i dct_v = _mm256_load_si256((const __m256i *) dct); + __m256i in_v = _mm256_load_si256((const __m256i *)input); + + __m256i tmp = mul_clip_matrix_4x4_avx2(tdct_v, in_v, shift_1st); + __m256i result = mul_clip_matrix_4x4_avx2(tmp, dct_v, shift_2nd); + + _mm256_store_si256((__m256i *)output, result); } -// 8x8 matrix multiplication with value clipping. 
-// Parameters: Two 8x8 matrices containing 16-bit values in consecutive addresses, -// destination for the result and the shift value for clipping. -// static void mul_clip_matrix_8x8_avx2(const int16_t *left, const int16_t *right, int16_t *dst, const int32_t shift) { - int i, j; - __m256i b[2], accu[8], even[2], odd[2]; + const __m256i transp_mask = _mm256_broadcastsi128_si256(_mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15)); + + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + __m256i left_dr[4] = { + _mm256_load_si256((const __m256i *)left + 0), + _mm256_load_si256((const __m256i *)left + 1), + _mm256_load_si256((const __m256i *)left + 2), + _mm256_load_si256((const __m256i *)left + 3), + }; + __m256i right_dr[4] = { + _mm256_load_si256((const __m256i *)right + 0), + _mm256_load_si256((const __m256i *)right + 1), + _mm256_load_si256((const __m256i *)right + 2), + _mm256_load_si256((const __m256i *)right + 3), + }; + + __m256i rdrs_rearr[8]; + + // Rearrange right matrix + for (int32_t dry = 0; dry < 4; dry++) { + __m256i rdr = right_dr[dry]; + __m256i rdr_los = _mm256_permute4x64_epi64(rdr, _MM_SHUFFLE(2, 0, 2, 0)); + __m256i rdr_his = _mm256_permute4x64_epi64(rdr, _MM_SHUFFLE(3, 1, 3, 1)); + + __m256i rdr_lo_rearr = _mm256_shuffle_epi8(rdr_los, transp_mask); + __m256i rdr_hi_rearr = _mm256_shuffle_epi8(rdr_his, transp_mask); + + rdrs_rearr[dry * 2 + 0] = rdr_lo_rearr; + rdrs_rearr[dry * 2 + 1] = rdr_hi_rearr; + } + + // Double-Row Y for destination matrix + for (int32_t dry = 0; dry < 4; dry++) { + __m256i ldr = left_dr[dry]; - const int32_t add = 1 << (shift - 1); + __m256i ldr_slice12 = _mm256_shuffle_epi32(ldr, _MM_SHUFFLE(0, 0, 0, 0)); + __m256i ldr_slice34 = _mm256_shuffle_epi32(ldr, _MM_SHUFFLE(1, 1, 1, 1)); + __m256i ldr_slice56 = _mm256_shuffle_epi32(ldr, _MM_SHUFFLE(2, 2, 2, 2)); + __m256i ldr_slice78 = _mm256_shuffle_epi32(ldr, _MM_SHUFFLE(3, 3, 3, 3)); - b[0] = _mm256_loadu_si256((__m256i*) right); + __m256i prod1 = _mm256_madd_epi16(ldr_slice12, rdrs_rearr[0]); + __m256i prod2 = _mm256_madd_epi16(ldr_slice12, rdrs_rearr[1]); + __m256i prod3 = _mm256_madd_epi16(ldr_slice34, rdrs_rearr[2]); + __m256i prod4 = _mm256_madd_epi16(ldr_slice34, rdrs_rearr[3]); + __m256i prod5 = _mm256_madd_epi16(ldr_slice56, rdrs_rearr[4]); + __m256i prod6 = _mm256_madd_epi16(ldr_slice56, rdrs_rearr[5]); + __m256i prod7 = _mm256_madd_epi16(ldr_slice78, rdrs_rearr[6]); + __m256i prod8 = _mm256_madd_epi16(ldr_slice78, rdrs_rearr[7]); - b[1] = _mm256_unpackhi_epi16(b[0], _mm256_castsi128_si256(_mm256_extracti128_si256(b[0], 1))); - b[0] = _mm256_unpacklo_epi16(b[0], _mm256_castsi128_si256(_mm256_extracti128_si256(b[0], 1))); - b[0] = _mm256_inserti128_si256(b[0], _mm256_castsi256_si128(b[1]), 1); + __m256i lo_1 = _mm256_add_epi32(prod1, prod3); + __m256i hi_1 = _mm256_add_epi32(prod2, prod4); + __m256i lo_2 = _mm256_add_epi32(prod5, prod7); + __m256i hi_2 = _mm256_add_epi32(prod6, prod8); - for (i = 0; i < 8; i += 2) { + __m256i lo = _mm256_add_epi32(lo_1, lo_2); + __m256i hi = _mm256_add_epi32(hi_1, hi_2); - even[0] = _mm256_set1_epi32(((int32_t*)left)[4 * i]); - even[0] = _mm256_madd_epi16(even[0], b[0]); - accu[i] = even[0]; + __m256i lo_tr = truncate(lo, debias, shift); + __m256i hi_tr = truncate(hi, debias, shift); - odd[0] = _mm256_set1_epi32(((int32_t*)left)[4 * (i + 1)]); - odd[0] = _mm256_madd_epi16(odd[0], b[0]); - accu[i + 1] = odd[0]; + __m256i final_dr = _mm256_packs_epi32(lo_tr, hi_tr); + + _mm256_store_si256((__m256i *)dst + 
dry, final_dr); } +} - for (j = 1; j < 4; ++j) { +// Multiplies A by B_T's transpose and stores result's transpose in output, +// which should be an array of 4 __m256i's +static void matmul_8x8_a_bt_t(const int16_t *a, const int16_t *b_t, + __m256i *output, const int8_t shift) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); - b[0] = _mm256_loadu_si256((__m256i*)right + j); + // Keep upper row intact and swap neighboring 16-bit words in lower row + const __m256i shuf_lorow_mask = + _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 18, 19, 16, 17, 22, 23, 20, 21, + 26, 27, 24, 25, 30, 31, 28, 29); - b[1] = _mm256_unpackhi_epi16(b[0], _mm256_castsi128_si256(_mm256_extracti128_si256(b[0], 1))); - b[0] = _mm256_unpacklo_epi16(b[0], _mm256_castsi128_si256(_mm256_extracti128_si256(b[0], 1))); - b[0] = _mm256_inserti128_si256(b[0], _mm256_castsi256_si128(b[1]), 1); + const __m256i *b_t_256 = (const __m256i *)b_t; - for (i = 0; i < 8; i += 2) { + // Dual Rows, because two 8x16b words fit in one YMM + __m256i a_dr_0 = _mm256_load_si256((__m256i *)a + 0); + __m256i a_dr_1 = _mm256_load_si256((__m256i *)a + 1); + __m256i a_dr_2 = _mm256_load_si256((__m256i *)a + 2); + __m256i a_dr_3 = _mm256_load_si256((__m256i *)a + 3); - even[0] = _mm256_set1_epi32(((int32_t*)left)[4 * i + j]); - even[0] = _mm256_madd_epi16(even[0], b[0]); - accu[i] = _mm256_add_epi32(accu[i], even[0]); + __m256i a_dr_0_swp = swap_lanes(a_dr_0); + __m256i a_dr_1_swp = swap_lanes(a_dr_1); + __m256i a_dr_2_swp = swap_lanes(a_dr_2); + __m256i a_dr_3_swp = swap_lanes(a_dr_3); - odd[0] = _mm256_set1_epi32(((int32_t*)left)[4 * (i + 1) + j]); - odd[0] = _mm256_madd_epi16(odd[0], b[0]); - accu[i + 1] = _mm256_add_epi32(accu[i + 1], odd[0]); - } + for (int dry = 0; dry < 4; dry++) { + + // Read dual columns of B matrix by reading rows of its transpose + __m256i b_dc = _mm256_load_si256(b_t_256 + dry); + + __m256i prod0 = _mm256_madd_epi16(b_dc, a_dr_0); + __m256i prod0_swp = _mm256_madd_epi16(b_dc, a_dr_0_swp); + __m256i prod1 = _mm256_madd_epi16(b_dc, a_dr_1); + __m256i prod1_swp = _mm256_madd_epi16(b_dc, a_dr_1_swp); + __m256i prod2 = _mm256_madd_epi16(b_dc, a_dr_2); + __m256i prod2_swp = _mm256_madd_epi16(b_dc, a_dr_2_swp); + __m256i prod3 = _mm256_madd_epi16(b_dc, a_dr_3); + __m256i prod3_swp = _mm256_madd_epi16(b_dc, a_dr_3_swp); + + __m256i hsum0 = _mm256_hadd_epi32(prod0, prod0_swp); + __m256i hsum1 = _mm256_hadd_epi32(prod1, prod1_swp); + __m256i hsum2 = _mm256_hadd_epi32(prod2, prod2_swp); + __m256i hsum3 = _mm256_hadd_epi32(prod3, prod3_swp); + + __m256i hsum2c_0 = _mm256_hadd_epi32(hsum0, hsum1); + __m256i hsum2c_1 = _mm256_hadd_epi32(hsum2, hsum3); + + __m256i hsum2c_0_tr = truncate(hsum2c_0, debias, shift); + __m256i hsum2c_1_tr = truncate(hsum2c_1, debias, shift); + + __m256i tmp_dc = _mm256_packs_epi32(hsum2c_0_tr, hsum2c_1_tr); + + output[dry] = _mm256_shuffle_epi8(tmp_dc, shuf_lorow_mask); } +} + +// Multiplies A by B_T's transpose and stores result in output +// which should be an array of 4 __m256i's +static void matmul_8x8_a_bt(const int16_t *a, const __m256i *b_t, + int16_t *output, const int8_t shift) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const __m256i shuf_lorow_mask = + _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 18, 19, 16, 17, 22, 23, 20, 21, + 26, 27, 24, 25, 30, 31, 28, 29); + + const __m256i *a_256 = (const __m256i *)a; + + __m256i b_dc_0 = b_t[0]; + __m256i 
b_dc_1 = b_t[1]; + __m256i b_dc_2 = b_t[2]; + __m256i b_dc_3 = b_t[3]; + + __m256i b_dc_0_swp = swap_lanes(b_dc_0); + __m256i b_dc_1_swp = swap_lanes(b_dc_1); + __m256i b_dc_2_swp = swap_lanes(b_dc_2); + __m256i b_dc_3_swp = swap_lanes(b_dc_3); + + for (int dry = 0; dry < 4; dry++) { + __m256i a_dr = _mm256_load_si256(a_256 + dry); - for (i = 0; i < 8; i += 2) { - __m256i result, first_half, second_half; + __m256i prod0 = _mm256_madd_epi16(a_dr, b_dc_0); + __m256i prod0_swp = _mm256_madd_epi16(a_dr, b_dc_0_swp); + __m256i prod1 = _mm256_madd_epi16(a_dr, b_dc_1); + __m256i prod1_swp = _mm256_madd_epi16(a_dr, b_dc_1_swp); + __m256i prod2 = _mm256_madd_epi16(a_dr, b_dc_2); + __m256i prod2_swp = _mm256_madd_epi16(a_dr, b_dc_2_swp); + __m256i prod3 = _mm256_madd_epi16(a_dr, b_dc_3); + __m256i prod3_swp = _mm256_madd_epi16(a_dr, b_dc_3_swp); - first_half = _mm256_srai_epi32(_mm256_add_epi32(accu[i], _mm256_set1_epi32(add)), shift); - second_half = _mm256_srai_epi32(_mm256_add_epi32(accu[i + 1], _mm256_set1_epi32(add)), shift); - result = _mm256_permute4x64_epi64(_mm256_packs_epi32(first_half, second_half), 0 + 8 + 16 + 192); - _mm256_storeu_si256((__m256i*)dst + i / 2, result); + __m256i hsum0 = _mm256_hadd_epi32(prod0, prod0_swp); + __m256i hsum1 = _mm256_hadd_epi32(prod1, prod1_swp); + __m256i hsum2 = _mm256_hadd_epi32(prod2, prod2_swp); + __m256i hsum3 = _mm256_hadd_epi32(prod3, prod3_swp); + __m256i hsum2c_0 = _mm256_hadd_epi32(hsum0, hsum1); + __m256i hsum2c_1 = _mm256_hadd_epi32(hsum2, hsum3); + + __m256i hsum2c_0_tr = truncate(hsum2c_0, debias, shift); + __m256i hsum2c_1_tr = truncate(hsum2c_1, debias, shift); + + __m256i tmp_dr = _mm256_packs_epi32(hsum2c_0_tr, hsum2c_1_tr); + + __m256i final_dr = _mm256_shuffle_epi8(tmp_dr, shuf_lorow_mask); + + _mm256_store_si256((__m256i *)output + dry, final_dr); } } -// 16x16 matrix multiplication with value clipping. -// Parameters: Two 16x16 matrices containing 16-bit values in consecutive addresses, -// destination for the result and the shift value for clipping. -static void mul_clip_matrix_16x16_avx2(const int16_t *left, const int16_t *right, int16_t *dst, const int32_t shift) +static void matrix_dct_8x8_avx2(int8_t bitdepth, const int16_t *input, int16_t *output) +{ + int32_t shift_1st = kvz_g_convert_to_bit[8] + 1 + (bitdepth - 8); + int32_t shift_2nd = kvz_g_convert_to_bit[8] + 8; + + const int16_t *dct = &kvz_g_dct_8[0][0]; + + /* + * Multiply input by the tranpose of DCT matrix into tmpres, and DCT matrix + * by tmpres - this is then our output matrix + * + * It's easier to implement an AVX2 matrix multiplication if you can multiply + * the left term with the transpose of the right term. Here things are stored + * row-wise, not column-wise, so we can effectively read DCT_T column-wise + * into YMM registers by reading DCT row-wise. Also because of this, the + * first multiplication is hacked to produce the transpose of the result + * instead, since it will be used in similar fashion as the right operand + * in the second multiplication. 
+ */ + + __m256i tmpres[4]; + + matmul_8x8_a_bt_t(input, dct, tmpres, shift_1st); + matmul_8x8_a_bt (dct, tmpres, output, shift_2nd); +} + +static void matrix_idct_8x8_avx2(int8_t bitdepth, const int16_t *input, int16_t *output) { - int i, j; - __m256i row[4], accu[16][2], even, odd; + int32_t shift_1st = 7; + int32_t shift_2nd = 12 - (bitdepth - 8); + ALIGNED(64) int16_t tmp[8 * 8]; + + const int16_t *tdct = &kvz_g_dct_8_t[0][0]; + const int16_t *dct = &kvz_g_dct_8 [0][0]; + + mul_clip_matrix_8x8_avx2(tdct, input, tmp, shift_1st); + mul_clip_matrix_8x8_avx2(tmp, dct, output, shift_2nd); + + /* + * Because: + * out = tdct * input * dct = tdct * (input * dct) = tdct * (input * transpose(tdct)) + * This could almost be done this way: + * + * matmul_8x8_a_bt_t(input, tdct, debias1, shift_1st, tmp); + * matmul_8x8_a_bt (tdct, tmp, debias2, shift_2nd, output); + * + * But not really, since it will fall victim to some very occasional + * rounding errors. Sadly. + */ +} - const int32_t stride = 8; +static void matmul_16x16_a_bt(const __m256i *a, + const __m256i *b_t, + __m256i *output, + const int32_t shift) +{ + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + for (int32_t y = 0; y < 16; y++) { + __m256i a_r = a[y]; + __m256i results_32[2]; + + for (int32_t fco = 0; fco < 2; fco++) { + // Read first cols 0, 1, 2, 3, 8, 9, 10, 11, and then next 4 + __m256i bt_c0 = b_t[fco * 4 + 0]; + __m256i bt_c1 = b_t[fco * 4 + 1]; + __m256i bt_c2 = b_t[fco * 4 + 2]; + __m256i bt_c3 = b_t[fco * 4 + 3]; + __m256i bt_c8 = b_t[fco * 4 + 8]; + __m256i bt_c9 = b_t[fco * 4 + 9]; + __m256i bt_c10 = b_t[fco * 4 + 10]; + __m256i bt_c11 = b_t[fco * 4 + 11]; + + __m256i p0 = _mm256_madd_epi16(a_r, bt_c0); + __m256i p1 = _mm256_madd_epi16(a_r, bt_c1); + __m256i p2 = _mm256_madd_epi16(a_r, bt_c2); + __m256i p3 = _mm256_madd_epi16(a_r, bt_c3); + __m256i p8 = _mm256_madd_epi16(a_r, bt_c8); + __m256i p9 = _mm256_madd_epi16(a_r, bt_c9); + __m256i p10 = _mm256_madd_epi16(a_r, bt_c10); + __m256i p11 = _mm256_madd_epi16(a_r, bt_c11); + + // Combine low lanes from P0 and P8, high lanes from them, and the same + // with P1:P9 and so on + __m256i p0l = _mm256_permute2x128_si256(p0, p8, 0x20); + __m256i p0h = _mm256_permute2x128_si256(p0, p8, 0x31); + __m256i p1l = _mm256_permute2x128_si256(p1, p9, 0x20); + __m256i p1h = _mm256_permute2x128_si256(p1, p9, 0x31); + __m256i p2l = _mm256_permute2x128_si256(p2, p10, 0x20); + __m256i p2h = _mm256_permute2x128_si256(p2, p10, 0x31); + __m256i p3l = _mm256_permute2x128_si256(p3, p11, 0x20); + __m256i p3h = _mm256_permute2x128_si256(p3, p11, 0x31); + + __m256i s0 = _mm256_add_epi32(p0l, p0h); + __m256i s1 = _mm256_add_epi32(p1l, p1h); + __m256i s2 = _mm256_add_epi32(p2l, p2h); + __m256i s3 = _mm256_add_epi32(p3l, p3h); + + __m256i s4 = _mm256_unpacklo_epi64(s0, s1); + __m256i s5 = _mm256_unpackhi_epi64(s0, s1); + __m256i s6 = _mm256_unpacklo_epi64(s2, s3); + __m256i s7 = _mm256_unpackhi_epi64(s2, s3); + + __m256i s8 = _mm256_add_epi32(s4, s5); + __m256i s9 = _mm256_add_epi32(s6, s7); + + __m256i res = _mm256_hadd_epi32(s8, s9); + results_32[fco] = truncate(res, debias, shift); + } + output[y] = _mm256_packs_epi32(results_32[0], results_32[1]); + } +} - const int32_t add = 1 << (shift - 1); +// NOTE: The strides measured by s_stride_log2 and d_stride_log2 are in units +// of 16 coeffs, not 1! 
+static void transpose_16x16_stride(const int16_t *src, + int16_t *dst, + uint8_t s_stride_log2, + uint8_t d_stride_log2) +{ + __m256i tmp_128[16]; + for (uint32_t i = 0; i < 16; i += 8) { + + // After every n-bit unpack, 2n-bit units in the vectors will be in + // correct order. Pair words first, then dwords, then qwords. After that, + // whole lanes will be correct. + __m256i tmp_32[8]; + __m256i tmp_64[8]; + + __m256i m[8] = { + _mm256_load_si256((const __m256i *)src + ((i + 0) << s_stride_log2)), + _mm256_load_si256((const __m256i *)src + ((i + 1) << s_stride_log2)), + _mm256_load_si256((const __m256i *)src + ((i + 2) << s_stride_log2)), + _mm256_load_si256((const __m256i *)src + ((i + 3) << s_stride_log2)), + _mm256_load_si256((const __m256i *)src + ((i + 4) << s_stride_log2)), + _mm256_load_si256((const __m256i *)src + ((i + 5) << s_stride_log2)), + _mm256_load_si256((const __m256i *)src + ((i + 6) << s_stride_log2)), + _mm256_load_si256((const __m256i *)src + ((i + 7) << s_stride_log2)), + }; + + tmp_32[0] = _mm256_unpacklo_epi16( m[0], m[1]); + tmp_32[1] = _mm256_unpacklo_epi16( m[2], m[3]); + tmp_32[2] = _mm256_unpackhi_epi16( m[0], m[1]); + tmp_32[3] = _mm256_unpackhi_epi16( m[2], m[3]); + + tmp_32[4] = _mm256_unpacklo_epi16( m[4], m[5]); + tmp_32[5] = _mm256_unpacklo_epi16( m[6], m[7]); + tmp_32[6] = _mm256_unpackhi_epi16( m[4], m[5]); + tmp_32[7] = _mm256_unpackhi_epi16( m[6], m[7]); + + + tmp_64[0] = _mm256_unpacklo_epi32(tmp_32[0], tmp_32[1]); + tmp_64[1] = _mm256_unpacklo_epi32(tmp_32[2], tmp_32[3]); + tmp_64[2] = _mm256_unpackhi_epi32(tmp_32[0], tmp_32[1]); + tmp_64[3] = _mm256_unpackhi_epi32(tmp_32[2], tmp_32[3]); + + tmp_64[4] = _mm256_unpacklo_epi32(tmp_32[4], tmp_32[5]); + tmp_64[5] = _mm256_unpacklo_epi32(tmp_32[6], tmp_32[7]); + tmp_64[6] = _mm256_unpackhi_epi32(tmp_32[4], tmp_32[5]); + tmp_64[7] = _mm256_unpackhi_epi32(tmp_32[6], tmp_32[7]); + + + tmp_128[i + 0] = _mm256_unpacklo_epi64(tmp_64[0], tmp_64[4]); + tmp_128[i + 1] = _mm256_unpackhi_epi64(tmp_64[0], tmp_64[4]); + tmp_128[i + 2] = _mm256_unpacklo_epi64(tmp_64[2], tmp_64[6]); + tmp_128[i + 3] = _mm256_unpackhi_epi64(tmp_64[2], tmp_64[6]); + + tmp_128[i + 4] = _mm256_unpacklo_epi64(tmp_64[1], tmp_64[5]); + tmp_128[i + 5] = _mm256_unpackhi_epi64(tmp_64[1], tmp_64[5]); + tmp_128[i + 6] = _mm256_unpacklo_epi64(tmp_64[3], tmp_64[7]); + tmp_128[i + 7] = _mm256_unpackhi_epi64(tmp_64[3], tmp_64[7]); + } - row[0] = _mm256_loadu_si256((__m256i*) right); - row[1] = _mm256_loadu_si256((__m256i*) right + 1); - row[2] = _mm256_unpacklo_epi16(row[0], row[1]); - row[3] = _mm256_unpackhi_epi16(row[0], row[1]); - row[0] = _mm256_permute2x128_si256(row[2], row[3], 0 + 32); - row[1] = _mm256_permute2x128_si256(row[2], row[3], 1 + 48); + for (uint32_t i = 0; i < 8; i++) { + uint32_t loid = i + 0; + uint32_t hiid = i + 8; - for (i = 0; i < 16; i += 2) { + uint32_t dst_loid = loid << d_stride_log2; + uint32_t dst_hiid = hiid << d_stride_log2; - even = _mm256_set1_epi32(((int32_t*)left)[stride * i]); - accu[i][0] = _mm256_madd_epi16(even, row[0]); - accu[i][1] = _mm256_madd_epi16(even, row[1]); + __m256i lo = tmp_128[loid]; + __m256i hi = tmp_128[hiid]; + __m256i final_lo = _mm256_permute2x128_si256(lo, hi, 0x20); + __m256i final_hi = _mm256_permute2x128_si256(lo, hi, 0x31); - odd = _mm256_set1_epi32(((int32_t*)left)[stride * (i + 1)]); - accu[i + 1][0] = _mm256_madd_epi16(odd, row[0]); - accu[i + 1][1] = _mm256_madd_epi16(odd, row[1]); + _mm256_store_si256((__m256i *)dst + dst_loid, final_lo); + _mm256_store_si256((__m256i *)dst + 
dst_hiid, final_hi); } +} - for (j = 2; j < 16; j += 2) { +static void transpose_16x16(const int16_t *src, int16_t *dst) +{ + transpose_16x16_stride(src, dst, 0, 0); +} - row[0] = _mm256_loadu_si256((__m256i*)right + j); - row[1] = _mm256_loadu_si256((__m256i*)right + j + 1); - row[2] = _mm256_unpacklo_epi16(row[0], row[1]); - row[3] = _mm256_unpackhi_epi16(row[0], row[1]); - row[0] = _mm256_permute2x128_si256(row[2], row[3], 0 + 32); - row[1] = _mm256_permute2x128_si256(row[2], row[3], 1 + 48); +static __m256i truncate_inv(__m256i v, int32_t shift) +{ + int32_t add = 1 << (shift - 1); - for (i = 0; i < 16; i += 2) { + __m256i debias = _mm256_set1_epi32(add); + __m256i v2 = _mm256_add_epi32 (v, debias); + __m256i trunced = _mm256_srai_epi32(v2, shift); + return trunced; +} - even = _mm256_set1_epi32(((int32_t*)left)[stride * i + j / 2]); - accu[i][0] = _mm256_add_epi32(accu[i][0], _mm256_madd_epi16(even, row[0])); - accu[i][1] = _mm256_add_epi32(accu[i][1], _mm256_madd_epi16(even, row[1])); +static __m256i extract_odds(__m256i v) +{ + // 0 1 2 3 4 5 6 7 | 8 9 a b c d e f => 1 3 5 7 1 3 5 7 | 9 b d f 9 b d f + const __m256i oddmask = _mm256_setr_epi8( 2, 3, 6, 7, 10, 11, 14, 15, + 2, 3, 6, 7, 10, 11, 14, 15, + 2, 3, 6, 7, 10, 11, 14, 15, + 2, 3, 6, 7, 10, 11, 14, 15); + + __m256i tmp = _mm256_shuffle_epi8 (v, oddmask); + return _mm256_permute4x64_epi64 (tmp, _MM_SHUFFLE(3, 1, 2, 0)); +} + +static __m256i extract_combine_odds(__m256i v0, __m256i v1) +{ + // 0 1 2 3 4 5 6 7 | 8 9 a b c d e f => 1 3 5 7 1 3 5 7 | 9 b d f 9 b d f + const __m256i oddmask = _mm256_setr_epi8( 2, 3, 6, 7, 10, 11, 14, 15, + 2, 3, 6, 7, 10, 11, 14, 15, + 2, 3, 6, 7, 10, 11, 14, 15, + 2, 3, 6, 7, 10, 11, 14, 15); - odd = _mm256_set1_epi32(((int32_t*)left)[stride * (i + 1) + j / 2]); - accu[i + 1][0] = _mm256_add_epi32(accu[i + 1][0], _mm256_madd_epi16(odd, row[0])); - accu[i + 1][1] = _mm256_add_epi32(accu[i + 1][1], _mm256_madd_epi16(odd, row[1])); + __m256i tmp0 = _mm256_shuffle_epi8(v0, oddmask); + __m256i tmp1 = _mm256_shuffle_epi8(v1, oddmask); - } + __m256i tmp2 = _mm256_blend_epi32 (tmp0, tmp1, 0xcc); // 1100 1100 + + return _mm256_permute4x64_epi64 (tmp2, _MM_SHUFFLE(3, 1, 2, 0)); +} + +// Extract items 2, 6, A and E from first four columns of DCT, order them as +// follows: +// D0,2 D0,6 D1,2 D1,6 D1,a D1,e D0,a D0,e | D2,2 D2,6 D3,2 D3,6 D3,a D3,e D2,a D2,e +static __m256i extract_26ae(const __m256i *tdct) +{ + // 02 03 22 23 06 07 26 27 | 0a 0b 2a 2b 02 0f 2e 2f + // => + // 02 06 22 26 02 06 22 26 | 2a 2e 0a 0e 2a 2e 0a 0e + const __m256i evens_mask = _mm256_setr_epi8( 0, 1, 8, 9, 4, 5, 12, 13, + 0, 1, 8, 9, 4, 5, 12, 13, + 4, 5, 12, 13, 0, 1, 8, 9, + 4, 5, 12, 13, 0, 1, 8, 9); + + __m256i shufd_0 = _mm256_shuffle_epi32(tdct[0], _MM_SHUFFLE(2, 3, 0, 1)); + __m256i shufd_2 = _mm256_shuffle_epi32(tdct[2], _MM_SHUFFLE(2, 3, 0, 1)); + + __m256i cmbd_01 = _mm256_blend_epi32(shufd_0, tdct[1], 0xaa); // 1010 1010 + __m256i cmbd_23 = _mm256_blend_epi32(shufd_2, tdct[3], 0xaa); // 1010 1010 + + __m256i evens_01 = _mm256_shuffle_epi8(cmbd_01, evens_mask); + __m256i evens_23 = _mm256_shuffle_epi8(cmbd_23, evens_mask); + + __m256i evens_0123 = _mm256_unpacklo_epi64(evens_01, evens_23); + + return _mm256_permute4x64_epi64(evens_0123, _MM_SHUFFLE(3, 1, 2, 0)); +} + +// 2 6 2 6 a e a e | 2 6 2 6 a e a e +static __m256i extract_26ae_vec(__m256i col) +{ + const __m256i mask_26ae = _mm256_set1_epi32(0x0d0c0504); + + // 2 6 2 6 2 6 2 6 | a e a e a e a e + __m256i reord = _mm256_shuffle_epi8 (col, mask_26ae); + __m256i final 
= _mm256_permute4x64_epi64(reord, _MM_SHUFFLE(3, 1, 2, 0)); + return final; +} + +// D00 D80 D01 D81 D41 Dc1 D40 Dc0 | D40 Dc0 D41 Dc1 D01 D81 D00 D80 +static __m256i extract_d048c(const __m256i *tdct) +{ + const __m256i final_shuf = _mm256_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, + 6, 7, 14, 15, 4, 5, 12, 13, + 4, 5, 12, 13, 6, 7, 14, 15, + 2, 3, 10, 11, 0, 1, 8, 9); + __m256i c0 = tdct[0]; + __m256i c1 = tdct[1]; + + __m256i c1_2 = _mm256_slli_epi32 (c1, 16); + __m256i cmbd = _mm256_blend_epi16 (c0, c1_2, 0x22); // 0010 0010 + __m256i cmbd2 = _mm256_shuffle_epi32 (cmbd, _MM_SHUFFLE(2, 0, 2, 0)); + __m256i cmbd3 = _mm256_permute4x64_epi64(cmbd2, _MM_SHUFFLE(3, 1, 2, 0)); + __m256i final = _mm256_shuffle_epi8 (cmbd3, final_shuf); + + return final; +} + +// 0 8 0 8 4 c 4 c | 4 c 4 c 0 8 0 8 +static __m256i extract_d048c_vec(__m256i col) +{ + const __m256i shufmask = _mm256_setr_epi8( 0, 1, 0, 1, 8, 9, 8, 9, + 8, 9, 8, 9, 0, 1, 0, 1, + 0, 1, 0, 1, 8, 9, 8, 9, + 8, 9, 8, 9, 0, 1, 0, 1); + + __m256i col_db4s = _mm256_shuffle_epi8 (col, shufmask); + __m256i col_los = _mm256_permute4x64_epi64(col_db4s, _MM_SHUFFLE(1, 1, 0, 0)); + __m256i col_his = _mm256_permute4x64_epi64(col_db4s, _MM_SHUFFLE(3, 3, 2, 2)); + + __m256i final = _mm256_unpacklo_epi16 (col_los, col_his); + return final; +} + +static void partial_butterfly_inverse_16_avx2(const int16_t *src, int16_t *dst, int32_t shift) +{ + __m256i tsrc[16]; + + const uint32_t width = 16; + + const int16_t *tdct = &kvz_g_dct_16_t[0][0]; + + const __m256i eo_signmask = _mm256_setr_epi32( 1, 1, 1, 1, -1, -1, -1, -1); + const __m256i eeo_signmask = _mm256_setr_epi32( 1, 1, -1, -1, -1, -1, 1, 1); + const __m256i o_signmask = _mm256_set1_epi32(-1); + + const __m256i final_shufmask = _mm256_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 6, 7, 4, 5, 2, 3, 0, 1, + 14, 15, 12, 13, 10, 11, 8, 9); + transpose_16x16(src, (int16_t *)tsrc); + + const __m256i dct_cols[8] = { + _mm256_load_si256((const __m256i *)tdct + 0), + _mm256_load_si256((const __m256i *)tdct + 1), + _mm256_load_si256((const __m256i *)tdct + 2), + _mm256_load_si256((const __m256i *)tdct + 3), + _mm256_load_si256((const __m256i *)tdct + 4), + _mm256_load_si256((const __m256i *)tdct + 5), + _mm256_load_si256((const __m256i *)tdct + 6), + _mm256_load_si256((const __m256i *)tdct + 7), + }; + + // These contain: D1,0 D3,0 D5,0 D7,0 D9,0 Db,0 Dd,0 Df,0 | D1,4 D3,4 D5,4 D7,4 D9,4 Db,4 Dd,4 Df,4 + // D1,1 D3,1 D5,1 D7,1 D9,1 Db,1 Dd,1 Df,1 | D1,5 D3,5 D5,5 D7,5 D9,5 Db,5 Dd,5 Df,5 + // D1,2 D3,2 D5,2 D7,2 D9,2 Db,2 Dd,2 Df,2 | D1,6 D3,6 D5,6 D7,6 D9,6 Db,6 Dd,6 Df,6 + // D1,3 D3,3 D5,3 D7,3 D9,3 Db,3 Dd,3 Df,3 | D1,7 D3,7 D5,7 D7,7 D9,7 Db,7 Dd,7 Df,7 + __m256i dct_col_odds[4]; + for (uint32_t j = 0; j < 4; j++) { + dct_col_odds[j] = extract_combine_odds(dct_cols[j + 0], dct_cols[j + 4]); } + for (uint32_t j = 0; j < width; j++) { + __m256i col = tsrc[j]; + __m256i odds = extract_odds(col); - for (i = 0; i < 16; ++i) { - __m256i result, first_half, second_half; + __m256i o04 = _mm256_madd_epi16 (odds, dct_col_odds[0]); + __m256i o15 = _mm256_madd_epi16 (odds, dct_col_odds[1]); + __m256i o26 = _mm256_madd_epi16 (odds, dct_col_odds[2]); + __m256i o37 = _mm256_madd_epi16 (odds, dct_col_odds[3]); - first_half = _mm256_srai_epi32(_mm256_add_epi32(accu[i][0], _mm256_set1_epi32(add)), shift); - second_half = _mm256_srai_epi32(_mm256_add_epi32(accu[i][1], _mm256_set1_epi32(add)), shift); - result = _mm256_permute4x64_epi64(_mm256_packs_epi32(first_half, second_half), 0 + 8 + 16 + 192); - 
_mm256_storeu_si256((__m256i*)dst + i, result); + __m256i o0145 = _mm256_hadd_epi32 (o04, o15); + __m256i o2367 = _mm256_hadd_epi32 (o26, o37); + __m256i o = _mm256_hadd_epi32 (o0145, o2367); + + // D0,2 D0,6 D1,2 D1,6 D1,a D1,e D0,a D0,e | D2,2 D2,6 D3,2 D3,6 D3,a D3,e D2,a D2,e + __m256i d_db2 = extract_26ae(dct_cols); + + // 2 6 2 6 a e a e | 2 6 2 6 a e a e + __m256i t_db2 = extract_26ae_vec (col); + + __m256i eo_parts = _mm256_madd_epi16 (d_db2, t_db2); + __m256i eo_parts2 = _mm256_shuffle_epi32 (eo_parts, _MM_SHUFFLE(0, 1, 2, 3)); + + // EO0 EO1 EO1 EO0 | EO2 EO3 EO3 EO2 + __m256i eo = _mm256_add_epi32 (eo_parts, eo_parts2); + __m256i eo2 = _mm256_permute4x64_epi64(eo, _MM_SHUFFLE(1, 3, 2, 0)); + __m256i eo3 = _mm256_sign_epi32 (eo2, eo_signmask); + + __m256i d_db4 = extract_d048c (dct_cols); + __m256i t_db4 = extract_d048c_vec (col); + __m256i eee_eeo = _mm256_madd_epi16 (d_db4, t_db4); + + __m256i eee_eee = _mm256_permute4x64_epi64(eee_eeo, _MM_SHUFFLE(3, 0, 3, 0)); + __m256i eeo_eeo1 = _mm256_permute4x64_epi64(eee_eeo, _MM_SHUFFLE(1, 2, 1, 2)); + + __m256i eeo_eeo2 = _mm256_sign_epi32 (eeo_eeo1, eeo_signmask); + + // EE0 EE1 EE2 EE3 | EE3 EE2 EE1 EE0 + __m256i ee = _mm256_add_epi32 (eee_eee, eeo_eeo2); + __m256i e = _mm256_add_epi32 (ee, eo3); + + __m256i o_neg = _mm256_sign_epi32 (o, o_signmask); + __m256i o_lo = _mm256_blend_epi32 (o, o_neg, 0xf0); // 1111 0000 + __m256i o_hi = _mm256_blend_epi32 (o, o_neg, 0x0f); // 0000 1111 + + __m256i res_lo = _mm256_add_epi32 (e, o_lo); + __m256i res_hi = _mm256_add_epi32 (e, o_hi); + __m256i res_hi2 = _mm256_permute4x64_epi64(res_hi, _MM_SHUFFLE(1, 0, 3, 2)); + + __m256i res_lo_t = truncate_inv(res_lo, shift); + __m256i res_hi_t = truncate_inv(res_hi2, shift); + + __m256i res_16_1 = _mm256_packs_epi32 (res_lo_t, res_hi_t); + __m256i final = _mm256_shuffle_epi8 (res_16_1, final_shufmask); + + _mm256_store_si256((__m256i *)dst + j, final); } } +static void matrix_idct_16x16_avx2(int8_t bitdepth, const int16_t *input, int16_t *output) +{ + int32_t shift_1st = 7; + int32_t shift_2nd = 12 - (bitdepth - 8); + ALIGNED(64) int16_t tmp[16 * 16]; + + partial_butterfly_inverse_16_avx2(input, tmp, shift_1st); + partial_butterfly_inverse_16_avx2(tmp, output, shift_2nd); +} + +static void matrix_dct_16x16_avx2(int8_t bitdepth, const int16_t *input, int16_t *output) +{ + int32_t shift_1st = kvz_g_convert_to_bit[16] + 1 + (bitdepth - 8); + int32_t shift_2nd = kvz_g_convert_to_bit[16] + 8; + + const int16_t *dct = &kvz_g_dct_16[0][0]; + + /* + * Multiply input by the tranpose of DCT matrix into tmpres, and DCT matrix + * by tmpres - this is then our output matrix + * + * It's easier to implement an AVX2 matrix multiplication if you can multiply + * the left term with the transpose of the right term. Here things are stored + * row-wise, not column-wise, so we can effectively read DCT_T column-wise + * into YMM registers by reading DCT row-wise. Also because of this, the + * first multiplication is hacked to produce the transpose of the result + * instead, since it will be used in similar fashion as the right operand + * in the second multiplication. + */ + + const __m256i *d_v = (const __m256i *)dct; + const __m256i *i_v = (const __m256i *)input; + __m256i *o_v = ( __m256i *)output; + __m256i tmp[16]; + + // Hack! (A * B^T)^T = B * A^T, so we can dispatch the transpose-produciong + // multiply completely + matmul_16x16_a_bt(d_v, i_v, tmp, shift_1st); + matmul_16x16_a_bt(d_v, tmp, o_v, shift_2nd); +} + // 32x32 matrix multiplication with value clipping. 
// Parameters: Two 32x32 matrices containing 16-bit values in consecutive addresses, // destination for the result and the shift value for clipping. -static void mul_clip_matrix_32x32_avx2(const int16_t *left, const int16_t *right, int16_t *dst, const int32_t shift) +static void mul_clip_matrix_32x32_avx2(const int16_t *left, + const int16_t *right, + int16_t *dst, + const int32_t shift) { - int i, j; - __m256i row[4], tmp[2], accu[32][4], even, odd; - - const int32_t stride = 16; - - const int32_t add = 1 << (shift - 1); - - row[0] = _mm256_loadu_si256((__m256i*) right); - row[1] = _mm256_loadu_si256((__m256i*) right + 2); - tmp[0] = _mm256_unpacklo_epi16(row[0], row[1]); - tmp[1] = _mm256_unpackhi_epi16(row[0], row[1]); - row[0] = _mm256_permute2x128_si256(tmp[0], tmp[1], 0 + 32); - row[1] = _mm256_permute2x128_si256(tmp[0], tmp[1], 1 + 48); - - row[2] = _mm256_loadu_si256((__m256i*) right + 1); - row[3] = _mm256_loadu_si256((__m256i*) right + 3); - tmp[0] = _mm256_unpacklo_epi16(row[2], row[3]); - tmp[1] = _mm256_unpackhi_epi16(row[2], row[3]); - row[2] = _mm256_permute2x128_si256(tmp[0], tmp[1], 0 + 32); - row[3] = _mm256_permute2x128_si256(tmp[0], tmp[1], 1 + 48); - - for (i = 0; i < 32; i += 2) { - - even = _mm256_set1_epi32(((int32_t*)left)[stride * i]); - accu[i][0] = _mm256_madd_epi16(even, row[0]); - accu[i][1] = _mm256_madd_epi16(even, row[1]); - accu[i][2] = _mm256_madd_epi16(even, row[2]); - accu[i][3] = _mm256_madd_epi16(even, row[3]); - - odd = _mm256_set1_epi32(((int32_t*)left)[stride * (i + 1)]); - accu[i + 1][0] = _mm256_madd_epi16(odd, row[0]); - accu[i + 1][1] = _mm256_madd_epi16(odd, row[1]); - accu[i + 1][2] = _mm256_madd_epi16(odd, row[2]); - accu[i + 1][3] = _mm256_madd_epi16(odd, row[3]); - } + const int32_t add = 1 << (shift - 1); + const __m256i debias = _mm256_set1_epi32(add); + + const uint32_t *l_32 = (const uint32_t *)left; + const __m256i *r_v = (const __m256i *)right; + __m256i *dst_v = ( __m256i *)dst; + + __m256i accu[128] = {_mm256_setzero_si256()}; + size_t i, j; + + for (j = 0; j < 64; j += 4) { + const __m256i r0 = r_v[j + 0]; + const __m256i r1 = r_v[j + 1]; + const __m256i r2 = r_v[j + 2]; + const __m256i r3 = r_v[j + 3]; - for (j = 4; j < 64; j += 4) { + __m256i r02l = _mm256_unpacklo_epi16(r0, r2); + __m256i r02h = _mm256_unpackhi_epi16(r0, r2); + __m256i r13l = _mm256_unpacklo_epi16(r1, r3); + __m256i r13h = _mm256_unpackhi_epi16(r1, r3); - row[0] = _mm256_loadu_si256((__m256i*)right + j); - row[1] = _mm256_loadu_si256((__m256i*)right + j + 2); - tmp[0] = _mm256_unpacklo_epi16(row[0], row[1]); - tmp[1] = _mm256_unpackhi_epi16(row[0], row[1]); - row[0] = _mm256_permute2x128_si256(tmp[0], tmp[1], 0 + 32); - row[1] = _mm256_permute2x128_si256(tmp[0], tmp[1], 1 + 48); + __m256i r02_07 = _mm256_permute2x128_si256(r02l, r02h, 0x20); + __m256i r02_8f = _mm256_permute2x128_si256(r02l, r02h, 0x31); - row[2] = _mm256_loadu_si256((__m256i*) right + j + 1); - row[3] = _mm256_loadu_si256((__m256i*) right + j + 3); - tmp[0] = _mm256_unpacklo_epi16(row[2], row[3]); - tmp[1] = _mm256_unpackhi_epi16(row[2], row[3]); - row[2] = _mm256_permute2x128_si256(tmp[0], tmp[1], 0 + 32); - row[3] = _mm256_permute2x128_si256(tmp[0], tmp[1], 1 + 48); + __m256i r13_07 = _mm256_permute2x128_si256(r13l, r13h, 0x20); + __m256i r13_8f = _mm256_permute2x128_si256(r13l, r13h, 0x31); for (i = 0; i < 32; i += 2) { + size_t acc_base = i << 2; - even = _mm256_set1_epi32(((int32_t*)left)[stride * i + j / 4]); - accu[i][0] = _mm256_add_epi32(accu[i][0], _mm256_madd_epi16(even, row[0])); - 
accu[i][1] = _mm256_add_epi32(accu[i][1], _mm256_madd_epi16(even, row[1])); - accu[i][2] = _mm256_add_epi32(accu[i][2], _mm256_madd_epi16(even, row[2])); - accu[i][3] = _mm256_add_epi32(accu[i][3], _mm256_madd_epi16(even, row[3])); + uint32_t curr_e = l_32[(i + 0) * (32 / 2) + (j >> 2)]; + uint32_t curr_o = l_32[(i + 1) * (32 / 2) + (j >> 2)]; - odd = _mm256_set1_epi32(((int32_t*)left)[stride * (i + 1) + j / 4]); - accu[i + 1][0] = _mm256_add_epi32(accu[i + 1][0], _mm256_madd_epi16(odd, row[0])); - accu[i + 1][1] = _mm256_add_epi32(accu[i + 1][1], _mm256_madd_epi16(odd, row[1])); - accu[i + 1][2] = _mm256_add_epi32(accu[i + 1][2], _mm256_madd_epi16(odd, row[2])); - accu[i + 1][3] = _mm256_add_epi32(accu[i + 1][3], _mm256_madd_epi16(odd, row[3])); + __m256i even = _mm256_set1_epi32(curr_e); + __m256i odd = _mm256_set1_epi32(curr_o); + __m256i p_e0 = _mm256_madd_epi16(even, r02_07); + __m256i p_e1 = _mm256_madd_epi16(even, r02_8f); + __m256i p_e2 = _mm256_madd_epi16(even, r13_07); + __m256i p_e3 = _mm256_madd_epi16(even, r13_8f); + + __m256i p_o0 = _mm256_madd_epi16(odd, r02_07); + __m256i p_o1 = _mm256_madd_epi16(odd, r02_8f); + __m256i p_o2 = _mm256_madd_epi16(odd, r13_07); + __m256i p_o3 = _mm256_madd_epi16(odd, r13_8f); + + accu[acc_base + 0] = _mm256_add_epi32 (p_e0, accu[acc_base + 0]); + accu[acc_base + 1] = _mm256_add_epi32 (p_e1, accu[acc_base + 1]); + accu[acc_base + 2] = _mm256_add_epi32 (p_e2, accu[acc_base + 2]); + accu[acc_base + 3] = _mm256_add_epi32 (p_e3, accu[acc_base + 3]); + + accu[acc_base + 4] = _mm256_add_epi32 (p_o0, accu[acc_base + 4]); + accu[acc_base + 5] = _mm256_add_epi32 (p_o1, accu[acc_base + 5]); + accu[acc_base + 6] = _mm256_add_epi32 (p_o2, accu[acc_base + 6]); + accu[acc_base + 7] = _mm256_add_epi32 (p_o3, accu[acc_base + 7]); } } - for (i = 0; i < 32; ++i) { - __m256i result, first_quarter, second_quarter, third_quarter, fourth_quarter; + for (i = 0; i < 32; i++) { + size_t acc_base = i << 2; + size_t dst_base = i << 1; + + __m256i q0 = truncate(accu[acc_base + 0], debias, shift); + __m256i q1 = truncate(accu[acc_base + 1], debias, shift); + __m256i q2 = truncate(accu[acc_base + 2], debias, shift); + __m256i q3 = truncate(accu[acc_base + 3], debias, shift); - first_quarter = _mm256_srai_epi32(_mm256_add_epi32(accu[i][0], _mm256_set1_epi32(add)), shift); - second_quarter = _mm256_srai_epi32(_mm256_add_epi32(accu[i][1], _mm256_set1_epi32(add)), shift); - third_quarter = _mm256_srai_epi32(_mm256_add_epi32(accu[i][2], _mm256_set1_epi32(add)), shift); - fourth_quarter = _mm256_srai_epi32(_mm256_add_epi32(accu[i][3], _mm256_set1_epi32(add)), shift); - result = _mm256_permute4x64_epi64(_mm256_packs_epi32(first_quarter, second_quarter), 0 + 8 + 16 + 192); - _mm256_storeu_si256((__m256i*)dst + 2 * i, result); - result = _mm256_permute4x64_epi64(_mm256_packs_epi32(third_quarter, fourth_quarter), 0 + 8 + 16 + 192); - _mm256_storeu_si256((__m256i*)dst + 2 * i + 1, result); + __m256i h01 = _mm256_packs_epi32(q0, q1); + __m256i h23 = _mm256_packs_epi32(q2, q3); + h01 = _mm256_permute4x64_epi64(h01, _MM_SHUFFLE(3, 1, 2, 0)); + h23 = _mm256_permute4x64_epi64(h23, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256(dst_v + dst_base + 0, h01); + _mm256_store_si256(dst_v + dst_base + 1, h23); } } @@ -313,7 +885,7 @@ {\ int32_t shift_1st = kvz_g_convert_to_bit[n] + 1 + (bitdepth - 8); \ int32_t shift_2nd = kvz_g_convert_to_bit[n] + 8; \ - int16_t tmp[n * n];\ + ALIGNED(64) int16_t tmp[n * n];\ const int16_t *tdct = &kvz_g_ ## type ## _ ## n ## _t[0][0];\ const int16_t *dct = 
&kvz_g_ ## type ## _ ## n [0][0];\ \ @@ -329,7 +901,7 @@ {\ int32_t shift_1st = 7; \ int32_t shift_2nd = 12 - (bitdepth - 8); \ - int16_t tmp[n * n];\ + ALIGNED(64) int16_t tmp[n * n];\ const int16_t *tdct = &kvz_g_ ## type ## _ ## n ## _t[0][0];\ const int16_t *dct = &kvz_g_ ## type ## _ ## n [0][0];\ \ @@ -337,17 +909,19 @@ mul_clip_matrix_ ## n ## x ## n ## _avx2(tmp, dct, output, shift_2nd);\ }\ +// Ha, we've got a tailored implementation for these +// TRANSFORM(dst, 4); +// ITRANSFORM(dst, 4); +// TRANSFORM(dct, 4); +// ITRANSFORM(dct, 4); +// TRANSFORM(dct, 8); +// ITRANSFORM(dct, 8); +// TRANSFORM(dct, 16); +// ITRANSFORM(dct, 16); + // Generate all the transform functions -TRANSFORM(dst, 4); -TRANSFORM(dct, 4); -TRANSFORM(dct, 8); -TRANSFORM(dct, 16); -TRANSFORM(dct, 32); -ITRANSFORM(dst, 4); -ITRANSFORM(dct, 4); -ITRANSFORM(dct, 8); -ITRANSFORM(dct, 16); +TRANSFORM(dct, 32); ITRANSFORM(dct, 32); #endif //COMPILE_INTEL_AVX2
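The new 4x4, 8x8 and 16x16 paths above all follow the two-stage pattern spelled out in their comments: tmp = input * DCT^T with a rounding right shift, then output = DCT * tmp with a second shift, and the "(A * B^T)^T = B * A^T" identity is what lets the first multiply hand its result over already transposed. A scalar reference of that pattern (illustrative only; the real code also saturates intermediate results to 16 bits when packing, which is omitted here, and the shifts correspond to shift_1st/shift_2nd in the hunks):

#include <stdint.h>

#define N 8   /* any supported size; 8 keeps the sketch short */

/* out = DCT * (in * DCT^T), with a rounding right shift after each stage.
 * shift1 and shift2 are assumed to be greater than zero. */
static void transform_2d_ref(const int16_t in[N][N],
                             const int16_t dct[N][N],
                             int16_t out[N][N],
                             int shift1, int shift2)
{
    int32_t tmp[N][N];

    /* Stage 1: tmp = in * dct^T, i.e. input rows dotted with DCT rows. */
    for (int y = 0; y < N; y++) {
        for (int x = 0; x < N; x++) {
            int32_t acc = 0;
            for (int k = 0; k < N; k++) {
                acc += (int32_t)in[y][k] * dct[x][k];
            }
            tmp[y][x] = (acc + (1 << (shift1 - 1))) >> shift1;
        }
    }

    /* Stage 2: out = dct * tmp. */
    for (int y = 0; y < N; y++) {
        for (int x = 0; x < N; x++) {
            int32_t acc = 0;
            for (int k = 0; k < N; k++) {
                acc += (int32_t)dct[y][k] * tmp[k][x];
            }
            out[y][x] = (int16_t)((acc + (1 << (shift2 - 1))) >> shift2);
        }
    }
}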
kvazaar-1.3.0.tar.gz/src/strategies/avx2/intra-avx2.c -> kvazaar-2.0.0.tar.gz/src/strategies/avx2/intra-avx2.c
Changed
@@ -26,6 +26,7 @@ #include "kvazaar.h" #include "strategyselector.h" +#include "strategies/missing-intel-intrinsics.h" /** @@ -416,7 +417,7 @@ tmp_ref[x + width] = ref_main[x]; } // Get a pointer to block index 0 in tmp_ref. - ref_main = &tmp_ref[width]; + ref_main = tmp_ref + width; // Extend the side reference to the negative indices of main reference. int_fast32_t col_sample_disp = 128; // rounding for the ">> 8" @@ -453,7 +454,6 @@ } } - /** * \brief Generate planar prediction. * \param log2_width Log2 of width, range 2..5. @@ -500,19 +500,411 @@ _mm_storel_epi64((__m128i*)&(dst[y * width + x]), chunk); } } - } else { - // Unoptimized version for reference. - for (int y = 0; y < width; ++y) { - for (int x = 0; x < width; ++x) { - int_fast16_t hor = (width - 1 - x) * ref_left[y + 1] + (x + 1) * top_right; - int_fast16_t ver = (width - 1 - y) * ref_top[x + 1] + (y + 1) * bottom_left; - dst[y * width + x] = (ver + hor + width) >> (log2_width + 1); - } + // Only if log2_width == 2 <=> width == 4 + assert(width == 4); + const __m128i rl_shufmask = _mm_setr_epi32(0x04040404, 0x05050505, + 0x06060606, 0x07070707); + + const __m128i xp1 = _mm_set1_epi32 (0x04030201); + const __m128i yp1 = _mm_shuffle_epi8(xp1, rl_shufmask); + + const __m128i rdist = _mm_set1_epi32 (0x00010203); + const __m128i bdist = _mm_shuffle_epi8(rdist, rl_shufmask); + + const __m128i wid16 = _mm_set1_epi16 (width); + const __m128i tr = _mm_set1_epi8 (top_right); + const __m128i bl = _mm_set1_epi8 (bottom_left); + + uint32_t rt14 = *(const uint32_t *)(ref_top + 1); + uint32_t rl14 = *(const uint32_t *)(ref_left + 1); + uint64_t rt14_64 = (uint64_t)rt14; + uint64_t rl14_64 = (uint64_t)rl14; + uint64_t rtl14 = rt14_64 | (rl14_64 << 32); + + __m128i rtl_v = _mm_cvtsi64_si128 (rtl14); + __m128i rt = _mm_broadcastd_epi32(rtl_v); + __m128i rl = _mm_shuffle_epi8 (rtl_v, rl_shufmask); + + __m128i rtrl_l = _mm_unpacklo_epi8 (rt, rl); + __m128i rtrl_h = _mm_unpackhi_epi8 (rt, rl); + + __m128i bdrd_l = _mm_unpacklo_epi8 (bdist, rdist); + __m128i bdrd_h = _mm_unpackhi_epi8 (bdist, rdist); + + __m128i hvs_lo = _mm_maddubs_epi16 (rtrl_l, bdrd_l); + __m128i hvs_hi = _mm_maddubs_epi16 (rtrl_h, bdrd_h); + + __m128i xp1yp1_l = _mm_unpacklo_epi8 (xp1, yp1); + __m128i xp1yp1_h = _mm_unpackhi_epi8 (xp1, yp1); + __m128i trbl_lh = _mm_unpacklo_epi8 (tr, bl); + + __m128i addend_l = _mm_maddubs_epi16 (trbl_lh, xp1yp1_l); + __m128i addend_h = _mm_maddubs_epi16 (trbl_lh, xp1yp1_h); + + addend_l = _mm_add_epi16 (addend_l, wid16); + addend_h = _mm_add_epi16 (addend_h, wid16); + + __m128i sum_l = _mm_add_epi16 (hvs_lo, addend_l); + __m128i sum_h = _mm_add_epi16 (hvs_hi, addend_h); + + // Shift right by log2_width + 1 + __m128i sum_l_t = _mm_srli_epi16 (sum_l, 3); + __m128i sum_h_t = _mm_srli_epi16 (sum_h, 3); + __m128i result = _mm_packus_epi16 (sum_l_t, sum_h_t); + _mm_storeu_si128((__m128i *)dst, result); + } +} + +// Calculate the DC value for a 4x4 block. The algorithm uses slightly +// different addends, multipliers etc for different pixels in the block, +// but for a fixed-size implementation one vector wide, all the weights, +// addends etc can be preinitialized for each position. +static void pred_filtered_dc_4x4(const uint8_t *ref_top, + const uint8_t *ref_left, + uint8_t *out_block) +{ + const uint32_t rt_u32 = *(const uint32_t *)(ref_top + 1); + const uint32_t rl_u32 = *(const uint32_t *)(ref_left + 1); + + const __m128i zero = _mm_setzero_si128(); + const __m128i twos = _mm_set1_epi8(2); + + // Hack. 
Move 4 u8's to bit positions 0, 64, 128 and 192 in two regs, to + // expand them to 16 bits sort of "for free". Set highest bits on all the + // other bytes in vectors to zero those bits in the result vector. + const __m128i rl_shuf_lo = _mm_setr_epi32(0x80808000, 0x80808080, + 0x80808001, 0x80808080); + const __m128i rl_shuf_hi = _mm_add_epi8 (rl_shuf_lo, twos); + + // Every second multiplier is 1, because we want maddubs to calculate + // a + bc = 1 * a + bc (actually 2 + bc). We need to fill a vector with + // ((u8)2)'s for other stuff anyway, so that can also be used here. + const __m128i mult_lo = _mm_setr_epi32(0x01030102, 0x01030103, + 0x01040103, 0x01040104); + const __m128i mult_hi = _mm_setr_epi32(0x01040103, 0x01040104, + 0x01040103, 0x01040104); + __m128i four = _mm_cvtsi32_si128 (4); + __m128i rt = _mm_cvtsi32_si128 (rt_u32); + __m128i rl = _mm_cvtsi32_si128 (rl_u32); + __m128i rtrl = _mm_unpacklo_epi32 (rt, rl); + + __m128i sad0 = _mm_sad_epu8 (rtrl, zero); + __m128i sad1 = _mm_shuffle_epi32 (sad0, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad2 = _mm_add_epi64 (sad0, sad1); + __m128i sad3 = _mm_add_epi64 (sad2, four); + + __m128i dc_64 = _mm_srli_epi64 (sad3, 3); + __m128i dc_8 = _mm_broadcastb_epi8(dc_64); + + __m128i rl_lo = _mm_shuffle_epi8 (rl, rl_shuf_lo); + __m128i rl_hi = _mm_shuffle_epi8 (rl, rl_shuf_hi); + + __m128i rt_lo = _mm_unpacklo_epi8 (rt, zero); + __m128i rt_hi = zero; + + __m128i dc_addend = _mm_unpacklo_epi8(dc_8, twos); + + __m128i dc_multd_lo = _mm_maddubs_epi16(dc_addend, mult_lo); + __m128i dc_multd_hi = _mm_maddubs_epi16(dc_addend, mult_hi); + + __m128i rl_rt_lo = _mm_add_epi16 (rl_lo, rt_lo); + __m128i rl_rt_hi = _mm_add_epi16 (rl_hi, rt_hi); + + __m128i res_lo = _mm_add_epi16 (dc_multd_lo, rl_rt_lo); + __m128i res_hi = _mm_add_epi16 (dc_multd_hi, rl_rt_hi); + + res_lo = _mm_srli_epi16 (res_lo, 2); + res_hi = _mm_srli_epi16 (res_hi, 2); + + __m128i final = _mm_packus_epi16 (res_lo, res_hi); + _mm_storeu_si128((__m128i *)out_block, final); +} + +static void pred_filtered_dc_8x8(const uint8_t *ref_top, + const uint8_t *ref_left, + uint8_t *out_block) +{ + const uint64_t rt_u64 = *(const uint64_t *)(ref_top + 1); + const uint64_t rl_u64 = *(const uint64_t *)(ref_left + 1); + + const __m128i zero128 = _mm_setzero_si128(); + const __m256i twos = _mm256_set1_epi8(2); + + // DC multiplier is 2 at (0, 0), 3 at (*, 0) and (0, *), and 4 at (*, *). + // There is a constant addend of 2 on each pixel, use values from the twos + // register and multipliers of 1 for that, to use maddubs for an (a*b)+c + // operation. + const __m256i mult_up_lo = _mm256_setr_epi32(0x01030102, 0x01030103, + 0x01030103, 0x01030103, + 0x01040103, 0x01040104, + 0x01040104, 0x01040104); + + // The 6 lowest rows have same multipliers, also the DC values and addends + // are the same so this works for all of those + const __m256i mult_rest = _mm256_permute4x64_epi64(mult_up_lo, _MM_SHUFFLE(3, 2, 3, 2)); + + // Every 8-pixel row starts with the next pixel of ref_left. Along with + // doing the shuffling, also expand u8->u16, ie. move bytes 0 and 1 from + // ref_left to bit positions 0 and 128 in rl_up_lo, 2 and 3 to rl_up_hi, + // etc. The places to be zeroed out are 0x80 instead of the usual 0xff, + // because this allows us to form new masks on the fly by adding 0x02-bytes + // to this mask and still retain the highest bits as 1 where things should + // be zeroed out. 
+ const __m256i rl_shuf_up_lo = _mm256_setr_epi32(0x80808000, 0x80808080, + 0x80808080, 0x80808080, + 0x80808001, 0x80808080, + 0x80808080, 0x80808080); + // And don't waste memory or architectural regs, hope these instructions + // will be placed in between the shuffles by the compiler to only use one + // register for the shufmasks, and executed way ahead of time because their + // regs can be renamed. + const __m256i rl_shuf_up_hi = _mm256_add_epi8 (rl_shuf_up_lo, twos); + const __m256i rl_shuf_dn_lo = _mm256_add_epi8 (rl_shuf_up_hi, twos); + const __m256i rl_shuf_dn_hi = _mm256_add_epi8 (rl_shuf_dn_lo, twos); + + __m128i eight = _mm_cvtsi32_si128 (8); + __m128i rt = _mm_cvtsi64_si128 (rt_u64); + __m128i rl = _mm_cvtsi64_si128 (rl_u64); + __m128i rtrl = _mm_unpacklo_epi64 (rt, rl); + + __m128i sad0 = _mm_sad_epu8 (rtrl, zero128); + __m128i sad1 = _mm_shuffle_epi32 (sad0, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad2 = _mm_add_epi64 (sad0, sad1); + __m128i sad3 = _mm_add_epi64 (sad2, eight); + + __m128i dc_64 = _mm_srli_epi64 (sad3, 4); + __m256i dc_8 = _mm256_broadcastb_epi8(dc_64); + + __m256i dc_addend = _mm256_unpacklo_epi8 (dc_8, twos); + + __m256i dc_up_lo = _mm256_maddubs_epi16 (dc_addend, mult_up_lo); + __m256i dc_rest = _mm256_maddubs_epi16 (dc_addend, mult_rest); + + // rt_dn is all zeros, as is rt_up_hi. This'll get us the rl and rt parts + // in A|B, C|D order instead of A|C, B|D that could be packed into abcd + // order, so these need to be permuted before adding to the weighed DC + // values. + __m256i rt_up_lo = _mm256_cvtepu8_epi16 (rt); + + __m256i rlrlrlrl = _mm256_broadcastq_epi64(rl); + __m256i rl_up_lo = _mm256_shuffle_epi8 (rlrlrlrl, rl_shuf_up_lo); + + // Everything ref_top is zero except on the very first row + __m256i rt_rl_up_hi = _mm256_shuffle_epi8 (rlrlrlrl, rl_shuf_up_hi); + __m256i rt_rl_dn_lo = _mm256_shuffle_epi8 (rlrlrlrl, rl_shuf_dn_lo); + __m256i rt_rl_dn_hi = _mm256_shuffle_epi8 (rlrlrlrl, rl_shuf_dn_hi); + + __m256i rt_rl_up_lo = _mm256_add_epi16 (rt_up_lo, rl_up_lo); + + __m256i rt_rl_up_lo_2 = _mm256_permute2x128_si256(rt_rl_up_lo, rt_rl_up_hi, 0x20); + __m256i rt_rl_up_hi_2 = _mm256_permute2x128_si256(rt_rl_up_lo, rt_rl_up_hi, 0x31); + __m256i rt_rl_dn_lo_2 = _mm256_permute2x128_si256(rt_rl_dn_lo, rt_rl_dn_hi, 0x20); + __m256i rt_rl_dn_hi_2 = _mm256_permute2x128_si256(rt_rl_dn_lo, rt_rl_dn_hi, 0x31); + + __m256i up_lo = _mm256_add_epi16(rt_rl_up_lo_2, dc_up_lo); + __m256i up_hi = _mm256_add_epi16(rt_rl_up_hi_2, dc_rest); + __m256i dn_lo = _mm256_add_epi16(rt_rl_dn_lo_2, dc_rest); + __m256i dn_hi = _mm256_add_epi16(rt_rl_dn_hi_2, dc_rest); + + up_lo = _mm256_srli_epi16(up_lo, 2); + up_hi = _mm256_srli_epi16(up_hi, 2); + dn_lo = _mm256_srli_epi16(dn_lo, 2); + dn_hi = _mm256_srli_epi16(dn_hi, 2); + + __m256i res_up = _mm256_packus_epi16(up_lo, up_hi); + __m256i res_dn = _mm256_packus_epi16(dn_lo, dn_hi); + + _mm256_storeu_si256(((__m256i *)out_block) + 0, res_up); + _mm256_storeu_si256(((__m256i *)out_block) + 1, res_dn); +} + +static INLINE __m256i cvt_u32_si256(const uint32_t u) +{ + const __m256i zero = _mm256_setzero_si256(); + return _mm256_insert_epi32(zero, u, 0); +} + +static void pred_filtered_dc_16x16(const uint8_t *ref_top, + const uint8_t *ref_left, + uint8_t *out_block) +{ + const __m128i rt_128 = _mm_loadu_si128((const __m128i *)(ref_top + 1)); + const __m128i rl_128 = _mm_loadu_si128((const __m128i *)(ref_left + 1)); + + const __m128i zero_128 = _mm_setzero_si128(); + const __m256i zero = _mm256_setzero_si256(); + const __m256i twos = 
_mm256_set1_epi8(2); + + const __m256i mult_r0 = _mm256_setr_epi32(0x01030102, 0x01030103, + 0x01030103, 0x01030103, + 0x01030103, 0x01030103, + 0x01030103, 0x01030103); + + const __m256i mult_left = _mm256_set1_epi16(0x0103); + + // Leftmost bytes' blend mask, to move bytes (pixels) from the leftmost + // column vector to the result row + const __m256i lm8_bmask = _mm256_setr_epi32(0xff, 0, 0, 0, 0xff, 0, 0, 0); + + __m128i sixteen = _mm_cvtsi32_si128(16); + __m128i sad0_t = _mm_sad_epu8 (rt_128, zero_128); + __m128i sad0_l = _mm_sad_epu8 (rl_128, zero_128); + __m128i sad0 = _mm_add_epi64(sad0_t, sad0_l); + + __m128i sad1 = _mm_shuffle_epi32 (sad0, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad2 = _mm_add_epi64 (sad0, sad1); + __m128i sad3 = _mm_add_epi64 (sad2, sixteen); + + __m128i dc_64 = _mm_srli_epi64 (sad3, 5); + __m256i dc_8 = _mm256_broadcastb_epi8 (dc_64); + + __m256i rt = _mm256_cvtepu8_epi16 (rt_128); + __m256i rl = _mm256_cvtepu8_epi16 (rl_128); + + uint8_t rl0 = *(uint8_t *)(ref_left + 1); + __m256i rl_r0 = cvt_u32_si256((uint32_t)rl0); + + __m256i rlrt_r0 = _mm256_add_epi16(rl_r0, rt); + + __m256i dc_addend = _mm256_unpacklo_epi8(dc_8, twos); + __m256i r0 = _mm256_maddubs_epi16(dc_addend, mult_r0); + __m256i left_dcs = _mm256_maddubs_epi16(dc_addend, mult_left); + + r0 = _mm256_add_epi16 (r0, rlrt_r0); + r0 = _mm256_srli_epi16 (r0, 2); + __m256i r0r0 = _mm256_packus_epi16 (r0, r0); + r0r0 = _mm256_permute4x64_epi64(r0r0, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i leftmosts = _mm256_add_epi16 (left_dcs, rl); + leftmosts = _mm256_srli_epi16 (leftmosts, 2); + + // Contain the leftmost column's bytes in both lanes of lm_8 + __m256i lm_8 = _mm256_packus_epi16 (leftmosts, zero); + lm_8 = _mm256_permute4x64_epi64(lm_8, _MM_SHUFFLE(2, 0, 2, 0)); + + __m256i lm8_r1 = _mm256_srli_epi32 (lm_8, 8); + __m256i r1r1 = _mm256_blendv_epi8 (dc_8, lm8_r1, lm8_bmask); + __m256i r0r1 = _mm256_blend_epi32 (r0r0, r1r1, 0xf0); + + _mm256_storeu_si256((__m256i *)out_block, r0r1); + + // Starts from 2 because row 0 (and row 1) is handled separately + __m256i lm8_l = _mm256_bsrli_epi128 (lm_8, 2); + __m256i lm8_h = _mm256_bsrli_epi128 (lm_8, 3); + lm_8 = _mm256_blend_epi32 (lm8_l, lm8_h, 0xf0); + + for (uint32_t y = 2; y < 16; y += 2) { + __m256i curr_row = _mm256_blendv_epi8 (dc_8, lm_8, lm8_bmask); + _mm256_storeu_si256((__m256i *)(out_block + (y << 4)), curr_row); + lm_8 = _mm256_bsrli_epi128(lm_8, 2); + } +} + +static void pred_filtered_dc_32x32(const uint8_t *ref_top, + const uint8_t *ref_left, + uint8_t *out_block) +{ + const __m256i rt = _mm256_loadu_si256((const __m256i *)(ref_top + 1)); + const __m256i rl = _mm256_loadu_si256((const __m256i *)(ref_left + 1)); + + const __m256i zero = _mm256_setzero_si256(); + const __m256i twos = _mm256_set1_epi8(2); + + const __m256i mult_r0lo = _mm256_setr_epi32(0x01030102, 0x01030103, + 0x01030103, 0x01030103, + 0x01030103, 0x01030103, + 0x01030103, 0x01030103); + + const __m256i mult_left = _mm256_set1_epi16(0x0103); + const __m256i lm8_bmask = cvt_u32_si256 (0xff); + + const __m256i bshif_msk = _mm256_setr_epi32(0x04030201, 0x08070605, + 0x0c0b0a09, 0x800f0e0d, + 0x03020100, 0x07060504, + 0x0b0a0908, 0x0f0e0d0c); + __m256i debias = cvt_u32_si256(32); + __m256i sad0_t = _mm256_sad_epu8 (rt, zero); + __m256i sad0_l = _mm256_sad_epu8 (rl, zero); + __m256i sad0 = _mm256_add_epi64 (sad0_t, sad0_l); + + __m256i sad1 = _mm256_permute4x64_epi64(sad0, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i sad2 = _mm256_add_epi64 (sad0, sad1); + __m256i sad3 = _mm256_shuffle_epi32 (sad2, 
_MM_SHUFFLE(1, 0, 3, 2)); + __m256i sad4 = _mm256_add_epi64 (sad2, sad3); + __m256i sad5 = _mm256_add_epi64 (sad4, debias); + __m256i dc_64 = _mm256_srli_epi64 (sad5, 6); + + __m128i dc_64_ = _mm256_castsi256_si128 (dc_64); + __m256i dc_8 = _mm256_broadcastb_epi8 (dc_64_); + + __m256i rtlo = _mm256_unpacklo_epi8 (rt, zero); + __m256i rllo = _mm256_unpacklo_epi8 (rl, zero); + __m256i rthi = _mm256_unpackhi_epi8 (rt, zero); + __m256i rlhi = _mm256_unpackhi_epi8 (rl, zero); + + __m256i dc_addend = _mm256_unpacklo_epi8 (dc_8, twos); + __m256i r0lo = _mm256_maddubs_epi16 (dc_addend, mult_r0lo); + __m256i r0hi = _mm256_maddubs_epi16 (dc_addend, mult_left); + __m256i c0dc = r0hi; + + r0lo = _mm256_add_epi16 (r0lo, rtlo); + r0hi = _mm256_add_epi16 (r0hi, rthi); + + __m256i rlr0 = _mm256_blendv_epi8 (zero, rl, lm8_bmask); + r0lo = _mm256_add_epi16 (r0lo, rlr0); + + r0lo = _mm256_srli_epi16 (r0lo, 2); + r0hi = _mm256_srli_epi16 (r0hi, 2); + __m256i r0 = _mm256_packus_epi16 (r0lo, r0hi); + + _mm256_storeu_si256((__m256i *)out_block, r0); + + __m256i c0lo = _mm256_add_epi16 (c0dc, rllo); + __m256i c0hi = _mm256_add_epi16 (c0dc, rlhi); + c0lo = _mm256_srli_epi16 (c0lo, 2); + c0hi = _mm256_srli_epi16 (c0hi, 2); + + __m256i c0 = _mm256_packus_epi16 (c0lo, c0hi); + + // r0 already handled! + for (uint32_t y = 1; y < 32; y++) { + if (y == 16) { + c0 = _mm256_permute4x64_epi64(c0, _MM_SHUFFLE(1, 0, 3, 2)); + } else { + c0 = _mm256_shuffle_epi8 (c0, bshif_msk); } + __m256i curr_row = _mm256_blendv_epi8 (dc_8, c0, lm8_bmask); + _mm256_storeu_si256(((__m256i *)out_block) + y, curr_row); } } +/** +* \brief Generage intra DC prediction with post filtering applied. +* \param log2_width Log2 of width, range 2..5. +* \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. +* \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. +* \param dst Buffer of size width*width. +*/ +static void kvz_intra_pred_filtered_dc_avx2( + const int_fast8_t log2_width, + const kvz_pixel *ref_top, + const kvz_pixel *ref_left, + kvz_pixel *out_block) +{ + assert(log2_width >= 2 && log2_width <= 5); + assert(sizeof(kvz_pixel) == sizeof(uint8_t)); + + if (log2_width == 2) { + pred_filtered_dc_4x4(ref_top, ref_left, out_block); + } else if (log2_width == 3) { + pred_filtered_dc_8x8(ref_top, ref_left, out_block); + } else if (log2_width == 4) { + pred_filtered_dc_16x16(ref_top, ref_left, out_block); + } else if (log2_width == 5) { + pred_filtered_dc_32x32(ref_top, ref_left, out_block); + } +} #endif //COMPILE_INTEL_AVX2 && defined X86_64 @@ -523,7 +915,9 @@ if (bitdepth == 8) { success &= kvz_strategyselector_register(opaque, "angular_pred", "avx2", 40, &kvz_angular_pred_avx2); success &= kvz_strategyselector_register(opaque, "intra_pred_planar", "avx2", 40, &kvz_intra_pred_planar_avx2); + success &= kvz_strategyselector_register(opaque, "intra_pred_filtered_dc", "avx2", 40, &kvz_intra_pred_filtered_dc_avx2); } #endif //COMPILE_INTEL_AVX2 && defined X86_64 return success; } +
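Editor's note: the intra-avx2.c hunk above adds vectorized filtered DC prediction for 4x4 through 32x32 blocks and registers it as the "intra_pred_filtered_dc" strategy for 8-bit content. As a reading aid, here is a minimal scalar sketch of the output those kernels produce, assuming the reference layout described in the doxygen comment (index 0 of each reference array is the top-left corner sample, the block's actual neighbours start at index 1). The function name is illustrative, not kvazaar's generic implementation verbatim.

#include <assert.h>
#include <stdint.h>

typedef uint8_t kvz_pixel;  // 8-bit pixel; the AVX2 path asserts this size

// Scalar sketch of filtered DC prediction: average the 2*width reference
// samples, then smooth the first row and column towards their references.
static void filtered_dc_scalar(int log2_width,
                               const kvz_pixel *ref_top,   // [0]=corner, [1..width]=row above
                               const kvz_pixel *ref_left,  // [0]=corner, [1..width]=left column
                               kvz_pixel *out)             // width*width output block
{
  assert(log2_width >= 2 && log2_width <= 5);
  const int width = 1 << log2_width;

  int sum = 0;
  for (int i = 0; i < width; ++i) {
    sum += ref_top[i + 1] + ref_left[i + 1];
  }
  const int dc = (sum + width) >> (log2_width + 1);

  // Top-left sample blends both neighbours; the rest of the first row and
  // column blend one neighbour with 3*dc; everything else is plain dc.
  out[0] = (kvz_pixel)((ref_top[1] + ref_left[1] + 2 * dc + 2) >> 2);
  for (int x = 1; x < width; ++x) {
    out[x] = (kvz_pixel)((ref_top[x + 1] + 3 * dc + 2) >> 2);
  }
  for (int y = 1; y < width; ++y) {
    out[y * width] = (kvz_pixel)((ref_left[y + 1] + 3 * dc + 2) >> 2);
    for (int x = 1; x < width; ++x) {
      out[y * width + x] = (kvz_pixel)dc;
    }
  }
}

The 2*dc+2 and 3*dc+2 terms appear to be exactly what the mult_r0/mult_left/mult_rest byte constants feed into _mm256_maddubs_epi16 above; the vector kernels simply produce whole rows of this at once and pack the results back to bytes.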
View file
kvazaar-1.3.0.tar.gz/src/strategies/avx2/picture-avx2.c -> kvazaar-2.0.0.tar.gz/src/strategies/avx2/picture-avx2.c
Changed
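Editor's note: the picture-avx2.c hunk that follows removes the separate "no movement" bipred routine and threads predict_luma/predict_chroma switches through inter_recon_bipred_avx2. The per-pixel operation being vectorized is a rounded average of the two prediction sources, each brought to 14-bit precision first; below is a scalar sketch under that assumption (names are illustrative, and clip_to_pixel stands in for kvz_fast_clip_32bit_to_pixel).

#include <stdbool.h>
#include <stdint.h>

#define KVZ_BIT_DEPTH 8                    // the AVX2 path is only registered for 8-bit content

static uint8_t clip_to_pixel(int32_t v)    // stand-in for kvz_fast_clip_32bit_to_pixel
{
  if (v < 0)   return 0;
  if (v > 255) return 255;
  return (uint8_t)v;
}

// Rounded average of two bi-prediction sources. A source is either already a
// 14-bit high-precision sample or an 8-bit pixel shifted up to 14 bits.
static uint8_t bipred_avg(int16_t sample0, bool hi_prec0,
                          int16_t sample1, bool hi_prec1)
{
  const int shift  = 15 - KVZ_BIT_DEPTH;   // 7 for 8-bit content
  const int offset = 1 << (shift - 1);     // rounding term

  int32_t s0 = hi_prec0 ? sample0 : (int32_t)sample0 << (14 - KVZ_BIT_DEPTH);
  int32_t s1 = hi_prec1 ? sample1 : (int32_t)sample1 << (14 - KVZ_BIT_DEPTH);
  return clip_to_pixel((s0 + s1 + offset) >> shift);
}

The deleted no-motion special case was the degenerate form of this where neither source is high precision, i.e. a plain (a + b + 1) >> 1 byte average, which is why it could be written with _mm256_avg_epu8.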
@@ -756,251 +756,6 @@ } } -static void inter_recon_bipred_no_mov_avx2( - const int height, - const int width, - const int ypos, - const int xpos, - const hi_prec_buf_t*high_precision_rec0, - const hi_prec_buf_t*high_precision_rec1, - lcu_t* lcu, - kvz_pixel* temp_lcu_y, - kvz_pixel* temp_lcu_u, - kvz_pixel* temp_lcu_v) { - - // This function is used only when kvazaar can't find any movement from the current block - int y_in_lcu, x_in_lcu; - __m256i sample0_epi8, sample1_epi8, temp_y_epi8; - int32_t * pointer = 0; - - for (int temp_y = 0; temp_y < height; temp_y += 1) { - y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); - - for (int temp_x = 0; temp_x < width; temp_x += 32) { - - x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); - - switch (width) - { - - case 4: - - sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - - temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]); - *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_y_epi8)); - - break; - - case 8: - - sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - - temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - // Store 64-bits from vector to memory - _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_y_epi8)); - - break; - - case 12: - sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - - temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - // Store 64-bits from vector to memory - _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_y_epi8)); - - x_in_lcu = ((xpos + temp_x + 8) & ((LCU_WIDTH)-1)); - - sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - - temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]); - *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_y_epi8)); - break; - - - case 16: - - sample0_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - - temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - // Store 128-bit to memory - _mm_storeu_si128((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_y_epi8)); - - break; - - case 32: - - sample0_epi8 = _mm256_loadu_si256((__m256i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - sample1_epi8 = _mm256_loadu_si256((__m256i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); - - temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - - // Store 256-bit integers to memory - _mm256_storeu_si256((__m256i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), temp_y_epi8); - break; - - 
default: - // If width is something strange size, use this - for (int temp_i = 0; temp_i < width; ++temp_i) { - x_in_lcu = ((xpos + temp_i) & ((LCU_WIDTH)-1)); - - int sample0_y = (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)); - int sample1_y = (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)); - - lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y) >> 1); - } - - - } - - if (temp_x < width >> 1 && temp_y < height >> 1) { - y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); - x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); - - __m256i temp_u_epi8; - __m256i temp_v_epi8; - - - switch (width) - { - - case 8: - - - sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); - *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_u_epi8)); - - pointer = (int32_t*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); - *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_v_epi8)); - - break; - - case 12: - - sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); - *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_u_epi8)); - - pointer = (int32_t*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); - *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_v_epi8)); - - // This is used only with odd shaped objects - for (int temp_i = 4; temp_i < width >> 1; ++temp_i) { - int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1)); - int16_t sample0_u = (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); - int16_t sample1_u = (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); - lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u) >> 1); - - int16_t sample0_v = (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); - int16_t sample1_v = (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); - lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v) >> 1); - } - - break; - - case 16: - - sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = 
_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - // Store 64-bit integer into memory - _mm_storel_epi64((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_u_epi8)); - - // Store 64-bit integer into memory - _mm_storel_epi64((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_v_epi8)); - - break; - - case 32: - - sample0_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - sample0_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - // Fill 128 bit vector with packed data and store it to memory - _mm_storeu_si128((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_u_epi8)); - - // Fill 128 bit vector with packed data and store it to memory - _mm_storeu_si128((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_v_epi8)); - - - break; - - case 64: - - sample0_epi8 = _mm256_loadu_si256((__m256i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = _mm256_loadu_si256((__m256i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - sample0_epi8 = _mm256_loadu_si256((__m256i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - sample1_epi8 = _mm256_loadu_si256((__m256i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); - temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); - - _mm256_storeu_si256((__m256i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), temp_u_epi8); - _mm256_storeu_si256((__m256i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), temp_v_epi8); - break; - - default: - // This is used only with odd shaped objects - for (int temp_i = 0; temp_i < width >> 1; ++temp_i) { - int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1)); - int16_t sample0_u = (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); - int16_t sample1_u = (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); - lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u) >> 1); - - int16_t sample0_v = (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); - int16_t sample1_v = (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); - lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v) >> 1); - } - - break; - - } - y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); - } - } - } - - -} - static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0, const int hi_prec_luma_rec1, const int hi_prec_chroma_rec0, 
@@ -1014,16 +769,10 @@ lcu_t* lcu, kvz_pixel* temp_lcu_y, kvz_pixel* temp_lcu_u, - kvz_pixel* temp_lcu_v) + kvz_pixel* temp_lcu_v, +bool predict_luma, +bool predict_chroma) { - if(hi_prec_luma_rec0 == 0 && hi_prec_luma_rec1 == 0 && hi_prec_chroma_rec0 == 0 && hi_prec_chroma_rec1 == 0) - { - inter_recon_bipred_no_mov_avx2(height, width, ypos, xpos, high_precision_rec0, high_precision_rec1, lcu, temp_lcu_y, temp_lcu_u, temp_lcu_v); - } - - else - { - int y_in_lcu, x_in_lcu; int shift = 15 - KVZ_BIT_DEPTH; int offset = 1 << (shift - 1); @@ -1038,87 +787,79 @@ for (int temp_x = 0; temp_x < width; temp_x += 8) { x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); - bool use_8_elements = ((temp_x + 8) <= width); - - switch (use_8_elements) - { - - case false: - - if (width < 4) { - // If width is smaller than 4 there's no need to use SIMD - for (int temp_i = 0; temp_i < width; ++temp_i) { - x_in_lcu = ((xpos + temp_i) & ((LCU_WIDTH)-1)); - - int sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - - lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); - } - } - - else{ - // Load total of 4 elements from memory to vector - sample0_epi32 = hi_prec_luma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) : - _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)&(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); - - - sample1_epi32 = hi_prec_luma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) : - _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + if (predict_luma) { + bool use_8_elements = ((temp_x + 8) <= width); + if (!use_8_elements) { + if (width < 4) { + // If width is smaller than 4 there's no need to use SIMD + for (int temp_i = 0; temp_i < width; ++temp_i) { + x_in_lcu = ((xpos + temp_i) & ((LCU_WIDTH)-1)); - // (sample1 + sample2 + offset)>>shift - temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); - temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32); - temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift); + int sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - // Pack the bits from 32-bit to 8-bit - temp_epi16 = _mm256_packs_epi32(temp_y_epi32, temp_y_epi32); - temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); - temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); + } + } - pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]); - *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8)); + else { + // Load total of 4 elements from memory to vector + sample0_epi32 = hi_prec_luma_rec0 ? 
_mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)&(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + sample1_epi32 = hi_prec_luma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); - for (int temp_i = temp_x + 4; temp_i < width; ++temp_i) { - x_in_lcu = ((xpos + temp_i) & ((LCU_WIDTH)-1)); - int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + // (sample1 + sample2 + offset)>>shift + temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32); + temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift); - lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); - } + // Pack the bits from 32-bit to 8-bit + temp_epi16 = _mm256_packs_epi32(temp_y_epi32, temp_y_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); - } - break; + pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8)); - default: - // Load total of 8 elements from memory to vector - sample0_epi32 = hi_prec_luma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) : - _mm256_slli_epi32(_mm256_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), 14 - KVZ_BIT_DEPTH); - sample1_epi32 = hi_prec_luma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) : - _mm256_slli_epi32(_mm256_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), 14 - KVZ_BIT_DEPTH); - // (sample1 + sample2 + offset)>>shift - temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); - temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32); - temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift); + for (int temp_i = temp_x + 4; temp_i < width; ++temp_i) { + x_in_lcu = ((xpos + temp_i) & ((LCU_WIDTH)-1)); - // Pack the bits from 32-bit to 8-bit - temp_epi16 = _mm256_packs_epi32(temp_y_epi32, temp_y_epi32); - temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); - temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_y = (hi_prec_luma_rec1 ? 
high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - // Store 64-bits from vector to memory - _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8)); + lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); + } - break; + } + } else { + // Load total of 8 elements from memory to vector + sample0_epi32 = hi_prec_luma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), 14 - KVZ_BIT_DEPTH); + + sample1_epi32 = hi_prec_luma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), 14 - KVZ_BIT_DEPTH); + + // (sample1 + sample2 + offset)>>shift + temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32); + temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift); + + // Pack the bits from 32-bit to 8-bit + temp_epi16 = _mm256_packs_epi32(temp_y_epi32, temp_y_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + + // Store 64-bits from vector to memory + _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8)); + } } - - } } for (int temp_y = 0; temp_y < height >> 1; ++temp_y) { @@ -1128,135 +869,126 @@ int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); - if ((width >> 1) < 4) { - // If width>>1 is smaller than 4 there's no need to use SIMD - - for (int temp_i = 0; temp_i < width >> 1; ++temp_i) { - int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1)); - int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); - - int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); - } - } - - else{ - - bool use_8_elements = ((temp_x + 8) <= (width>>1)); + if (predict_chroma) { + if ((width >> 1) < 4) { + // If width>>1 is smaller than 4 there's no need to use SIMD - __m256i temp_u_epi32, temp_v_epi32; + for (int temp_i = 0; temp_i < width >> 1; ++temp_i) { + int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1)); + int16_t sample0_u = (hi_prec_chroma_rec0 ? 
high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); - switch (use_8_elements) - { - - case false: - // Load 4 pixels to vector - sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : - _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); - - sample1_epi32 = hi_prec_chroma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : - _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); + } + } - // (sample1 + sample2 + offset)>>shift - temp_u_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); - temp_u_epi32 = _mm256_add_epi32(temp_u_epi32, offset_epi32); - temp_u_epi32 = _mm256_srai_epi32(temp_u_epi32, shift); + else { + bool use_8_elements = ((temp_x + 8) <= (width >> 1)); + __m256i temp_u_epi32, temp_v_epi32; - sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : - _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + if (!use_8_elements) { + // Load 4 pixels to vector + sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); - sample1_epi32 = hi_prec_chroma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : - _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + sample1_epi32 = hi_prec_chroma_rec1 ? 
_mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + // (sample1 + sample2 + offset)>>shift + temp_u_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_u_epi32 = _mm256_add_epi32(temp_u_epi32, offset_epi32); + temp_u_epi32 = _mm256_srai_epi32(temp_u_epi32, shift); - // (sample1 + sample2 + offset)>>shift - temp_v_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); - temp_v_epi32 = _mm256_add_epi32(temp_v_epi32, offset_epi32); - temp_v_epi32 = _mm256_srai_epi32(temp_v_epi32, shift); - temp_epi16 = _mm256_packs_epi32(temp_u_epi32, temp_u_epi32); - temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); - temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); - pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); - *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8)); + sample1_epi32 = hi_prec_chroma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); - temp_epi16 = _mm256_packs_epi32(temp_v_epi32, temp_v_epi32); - temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); - temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + // (sample1 + sample2 + offset)>>shift + temp_v_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_v_epi32 = _mm256_add_epi32(temp_v_epi32, offset_epi32); + temp_v_epi32 = _mm256_srai_epi32(temp_v_epi32, shift); - pointer = (int32_t*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); - *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8)); - for (int temp_i = 4; temp_i < width >> 1; ++temp_i) { + temp_epi16 = _mm256_packs_epi32(temp_u_epi32, temp_u_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); - // Use only if width>>1 is not divideble by 4 - int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1)); - int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); + pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8)); - int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int16_t sample1_v = (hi_prec_chroma_rec1 ? 
high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); - } + temp_epi16 = _mm256_packs_epi32(temp_v_epi32, temp_v_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); - break; + pointer = (int32_t*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8)); - default: - // Load 8 pixels to vector - sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : - _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + for (int temp_i = 4; temp_i < width >> 1; ++temp_i) { - sample1_epi32 = hi_prec_chroma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : - _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + // Use only if width>>1 is not divideble by 4 + int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1)); + int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); - // (sample1 + sample2 + offset)>>shift - temp_u_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); - temp_u_epi32 = _mm256_add_epi32(temp_u_epi32, offset_epi32); - temp_u_epi32 = _mm256_srai_epi32(temp_u_epi32, shift); + int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); + } + } else { + // Load 8 pixels to vector + sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); - sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : - _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + sample1_epi32 = hi_prec_chroma_rec1 ? 
_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); - sample1_epi32 = hi_prec_chroma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : - _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + // (sample1 + sample2 + offset)>>shift + temp_u_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_u_epi32 = _mm256_add_epi32(temp_u_epi32, offset_epi32); + temp_u_epi32 = _mm256_srai_epi32(temp_u_epi32, shift); + sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); - // (sample1 + sample2 + offset)>>shift - temp_v_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); - temp_v_epi32 = _mm256_add_epi32(temp_v_epi32, offset_epi32); - temp_v_epi32 = _mm256_srai_epi32(temp_v_epi32, shift); + sample1_epi32 = hi_prec_chroma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); - temp_epi16 = _mm256_packs_epi32(temp_u_epi32, temp_u_epi32); - temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); - temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); - // Store 64-bit integer into memory - _mm_storel_epi64((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8)); + // (sample1 + sample2 + offset)>>shift + temp_v_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_v_epi32 = _mm256_add_epi32(temp_v_epi32, offset_epi32); + temp_v_epi32 = _mm256_srai_epi32(temp_v_epi32, shift); - temp_epi16 = _mm256_packs_epi32(temp_v_epi32, temp_v_epi32); - temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); - temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + temp_epi16 = _mm256_packs_epi32(temp_u_epi32, temp_u_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); - // Store 64-bit integer into memory - _mm_storel_epi64((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8)); + // Store 64-bit integer into memory + _mm_storel_epi64((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8)); + temp_epi16 = _mm256_packs_epi32(temp_v_epi32, temp_v_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); - break; - } + // Store 64-bit integer into memory + _mm_storel_epi64((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8)); + } + } } } } - } } static optimized_sad_func_ptr_t get_optimized_sad_avx2(int32_t width) @@ -1319,6 +1051,185 @@ pic_stride, ref_stride, left, right); } +static double pixel_var_avx2_largebuf(const kvz_pixel *buf, const uint32_t len) +{ + const float len_f = (float)len; + const __m256i zero = 
_mm256_setzero_si256(); + + int64_t sum; + size_t i; + __m256i sums = zero; + for (i = 0; i + 31 < len; i += 32) { + __m256i curr = _mm256_loadu_si256((const __m256i *)(buf + i)); + __m256i curr_sum = _mm256_sad_epu8(curr, zero); + sums = _mm256_add_epi64(sums, curr_sum); + } + __m128i sum_lo = _mm256_castsi256_si128 (sums); + __m128i sum_hi = _mm256_extracti128_si256(sums, 1); + __m128i sum_3 = _mm_add_epi64 (sum_lo, sum_hi); + __m128i sum_4 = _mm_shuffle_epi32 (sum_3, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sum_5 = _mm_add_epi64 (sum_3, sum_4); + + _mm_storel_epi64((__m128i *)&sum, sum_5); + + // Remaining len mod 32 pixels + for (; i < len; ++i) { + sum += buf[i]; + } + + float mean_f = (float)sum / len_f; + __m256 mean = _mm256_set1_ps(mean_f); + __m256 accum = _mm256_setzero_ps(); + + for (i = 0; i + 31 < len; i += 32) { + __m128i curr0 = _mm_loadl_epi64((const __m128i *)(buf + i + 0)); + __m128i curr1 = _mm_loadl_epi64((const __m128i *)(buf + i + 8)); + __m128i curr2 = _mm_loadl_epi64((const __m128i *)(buf + i + 16)); + __m128i curr3 = _mm_loadl_epi64((const __m128i *)(buf + i + 24)); + + __m256i curr0_32 = _mm256_cvtepu8_epi32(curr0); + __m256i curr1_32 = _mm256_cvtepu8_epi32(curr1); + __m256i curr2_32 = _mm256_cvtepu8_epi32(curr2); + __m256i curr3_32 = _mm256_cvtepu8_epi32(curr3); + + __m256 curr0_f = _mm256_cvtepi32_ps (curr0_32); + __m256 curr1_f = _mm256_cvtepi32_ps (curr1_32); + __m256 curr2_f = _mm256_cvtepi32_ps (curr2_32); + __m256 curr3_f = _mm256_cvtepi32_ps (curr3_32); + + __m256 curr0_sd = _mm256_sub_ps (curr0_f, mean); + __m256 curr1_sd = _mm256_sub_ps (curr1_f, mean); + __m256 curr2_sd = _mm256_sub_ps (curr2_f, mean); + __m256 curr3_sd = _mm256_sub_ps (curr3_f, mean); + + __m256 curr0_v = _mm256_mul_ps (curr0_sd, curr0_sd); + __m256 curr1_v = _mm256_mul_ps (curr1_sd, curr1_sd); + __m256 curr2_v = _mm256_mul_ps (curr2_sd, curr2_sd); + __m256 curr3_v = _mm256_mul_ps (curr3_sd, curr3_sd); + + __m256 curr01 = _mm256_add_ps (curr0_v, curr1_v); + __m256 curr23 = _mm256_add_ps (curr2_v, curr3_v); + __m256 curr = _mm256_add_ps (curr01, curr23); + accum = _mm256_add_ps (accum, curr); + } + __m256d accum_d = _mm256_castps_pd (accum); + __m256d accum2_d = _mm256_permute4x64_pd(accum_d, _MM_SHUFFLE(1, 0, 3, 2)); + __m256 accum2 = _mm256_castpd_ps (accum2_d); + + __m256 accum3 = _mm256_add_ps (accum, accum2); + __m256 accum4 = _mm256_permute_ps (accum3, _MM_SHUFFLE(1, 0, 3, 2)); + __m256 accum5 = _mm256_add_ps (accum3, accum4); + __m256 accum6 = _mm256_permute_ps (accum5, _MM_SHUFFLE(2, 3, 0, 1)); + __m256 accum7 = _mm256_add_ps (accum5, accum6); + + __m128 accum8 = _mm256_castps256_ps128(accum7); + float var_sum = _mm_cvtss_f32 (accum8); + + // Remaining len mod 32 pixels + for (; i < len; ++i) { + float diff = buf[i] - mean_f; + var_sum += diff * diff; + } + + return var_sum / len_f; +} + +#ifdef INACCURATE_VARIANCE_CALCULATION + +// Assumes that u is a power of two +static INLINE uint32_t ilog2(uint32_t u) +{ + return _tzcnt_u32(u); +} + +// A B C D | E F G H (8x32b) +// ==> +// A+B C+D | E+F G+H (4x64b) +static __m256i hsum_epi32_to_epi64(const __m256i v) +{ + const __m256i zero = _mm256_setzero_si256(); + __m256i v_shufd = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); + __m256i sums_32 = _mm256_add_epi32 (v, v_shufd); + __m256i sums_64 = _mm256_blend_epi32 (sums_32, zero, 0xaa); + return sums_64; +} + +static double pixel_var_avx2(const kvz_pixel *buf, const uint32_t len) +{ + assert(sizeof(*buf) == 1); + assert((len & 31) == 0); + + // Uses Q8.7 numbers to measure mean and 
deviation, so variances are Q16.14 + const uint64_t sum_maxwid = ilog2(len) + (8 * sizeof(*buf)); + const __m128i normalize_sum = _mm_cvtsi32_si128(sum_maxwid - 15); // Normalize mean to [0, 32767], so signed 16-bit subtraction never overflows + const __m128i debias_sum = _mm_cvtsi32_si128(1 << (sum_maxwid - 16)); + const float varsum_to_f = 1.0f / (float)(1 << (14 + ilog2(len))); + + const bool power_of_two = (len & (len - 1)) == 0; + if (sum_maxwid > 32 || sum_maxwid < 15 || !power_of_two) { + return pixel_var_avx2_largebuf(buf, len); + } + + const __m256i zero = _mm256_setzero_si256(); + const __m256i himask_15 = _mm256_set1_epi16(0x7f00); + + uint64_t vars; + size_t i; + __m256i sums = zero; + for (i = 0; i < len; i += 32) { + __m256i curr = _mm256_loadu_si256((const __m256i *)(buf + i)); + __m256i curr_sum = _mm256_sad_epu8(curr, zero); + sums = _mm256_add_epi64(sums, curr_sum); + } + __m128i sum_lo = _mm256_castsi256_si128 (sums); + __m128i sum_hi = _mm256_extracti128_si256(sums, 1); + __m128i sum_3 = _mm_add_epi64 (sum_lo, sum_hi); + __m128i sum_4 = _mm_shuffle_epi32 (sum_3, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sum_5 = _mm_add_epi64 (sum_3, sum_4); + __m128i sum_5n = _mm_srl_epi32 (sum_5, normalize_sum); + sum_5n = _mm_add_epi32 (sum_5n, debias_sum); + + __m256i sum_n = _mm256_broadcastw_epi16 (sum_5n); + + __m256i accum = zero; + for (i = 0; i < len; i += 32) { + __m256i curr = _mm256_loadu_si256((const __m256i *)(buf + i)); + + __m256i curr0 = _mm256_slli_epi16 (curr, 7); + __m256i curr1 = _mm256_srli_epi16 (curr, 1); + curr0 = _mm256_and_si256 (curr0, himask_15); + curr1 = _mm256_and_si256 (curr1, himask_15); + + __m256i dev0 = _mm256_sub_epi16 (curr0, sum_n); + __m256i dev1 = _mm256_sub_epi16 (curr1, sum_n); + + __m256i vars0 = _mm256_madd_epi16 (dev0, dev0); + __m256i vars1 = _mm256_madd_epi16 (dev1, dev1); + + __m256i varsum = _mm256_add_epi32 (vars0, vars1); + varsum = hsum_epi32_to_epi64(varsum); + accum = _mm256_add_epi64 (accum, varsum); + } + __m256i accum2 = _mm256_permute4x64_epi64(accum, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i accum3 = _mm256_add_epi64 (accum, accum2); + __m256i accum4 = _mm256_permute4x64_epi64(accum3, _MM_SHUFFLE(2, 3, 1, 0)); + __m256i v_tot = _mm256_add_epi64 (accum3, accum4); + __m128i vt128 = _mm256_castsi256_si128 (v_tot); + + _mm_storel_epi64((__m128i *)&vars, vt128); + + return (float)vars * varsum_to_f; +} + +#else // INACCURATE_VARIANCE_CALCULATION + +static double pixel_var_avx2(const kvz_pixel *buf, const uint32_t len) +{ + return pixel_var_avx2_largebuf(buf, len); +} + +#endif // !INACCURATE_VARIANCE_CALCULATION + #endif //COMPILE_INTEL_AVX2 int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth) @@ -1352,11 +1263,13 @@ success &= kvz_strategyselector_register(opaque, "satd_any_size_quad", "avx2", 40, &satd_any_size_quad_avx2); success &= kvz_strategyselector_register(opaque, "pixels_calc_ssd", "avx2", 40, &pixels_calc_ssd_avx2); - success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "avx2", 40, &inter_recon_bipred_avx2); + success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "avx2", 40, &inter_recon_bipred_avx2); success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "avx2", 40, &get_optimized_sad_avx2); success &= kvz_strategyselector_register(opaque, "ver_sad", "avx2", 40, &ver_sad_avx2); success &= kvz_strategyselector_register(opaque, "hor_sad", "avx2", 40, &hor_sad_avx2); + success &= kvz_strategyselector_register(opaque, "pixel_var", "avx2", 40, &pixel_var_avx2); + } 
#endif return success;
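Editor's note: among the picture-avx2.c additions above, pixel_var_avx2_largebuf backs the newly registered "pixel_var" strategy. Stripped of the vector bookkeeping, it computes an ordinary population variance with a float mean, as in this scalar sketch (function name illustrative):

#include <stdint.h>

// Two-pass population variance over 8-bit pixels, accumulated in floats,
// mirroring what pixel_var_avx2_largebuf computes 32 pixels at a time.
static double pixel_var_scalar(const uint8_t *buf, uint32_t len)
{
  uint64_t sum = 0;
  for (uint32_t i = 0; i < len; ++i) {
    sum += buf[i];
  }
  const float mean = (float)sum / (float)len;

  float var_sum = 0.0f;
  for (uint32_t i = 0; i < len; ++i) {
    const float diff = (float)buf[i] - mean;
    var_sum += diff * diff;
  }
  return var_sum / (float)len;
}

The pixel_var_avx2 variant guarded by INACCURATE_VARIANCE_CALCULATION trades the float accumulation for Q8.7 fixed point (variances in Q16.14) when len is a power of two in the supported range, and falls back to the large-buffer version otherwise; unless that macro is defined at build time, the float path above is what gets registered.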
View file
kvazaar-1.3.0.tar.gz/src/strategies/avx2/quant-avx2.c -> kvazaar-2.0.0.tar.gz/src/strategies/avx2/quant-avx2.c
Changed
@@ -621,6 +621,7 @@ * \param pred_in Predicted pixels. * \param rec_out Reconstructed pixels. * \param coeff_out Coefficients used for reconstruction of rec_out. +* \param early_skip if this is used for early skip, bypass IT and IQ * * \returns Whether coeff_out contains any non-zero coefficients. */ @@ -629,11 +630,12 @@ const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const kvz_pixel *const ref_in, const kvz_pixel *const pred_in, - kvz_pixel *rec_out, coeff_t *coeff_out) + kvz_pixel *rec_out, coeff_t *coeff_out, + bool early_skip) { // Temporary arrays to pass data to and from kvz_quant and transform functions. - int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; - coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; + ALIGNED(64) int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; + ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; int has_coeffs = 0; @@ -673,7 +675,7 @@ // Do the inverse quantization and transformation and the reconstruction to // rec_out. - if (has_coeffs) { + if (has_coeffs && !early_skip) { // Get quantized residual. (coeff_out -> coeff -> residual) kvz_dequant(state, coeff_out, coeff, width, width, (color == COLOR_Y ? 0 : (color == COLOR_U ? 2 : 3)), cur_cu->type);
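Editor's note: the quant-avx2.c hunk above threads a new early_skip flag through the quantize-and-reconstruct helper so that inverse quantization and the inverse transform are bypassed when the caller only needs the has_coeffs answer, and it aligns the scratch buffers to 64 bytes for the vector loads. A toy sketch of that control flow, using a stand-in uniform quantizer rather than kvz_quant/kvz_dequant (all names here are hypothetical):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

// Toy model of the early-skip change: quantize, report whether any non-zero
// coefficients exist, and only reconstruct when the result is actually needed.
static bool toy_quantize_residual(const int16_t *residual, int len, int step,
                                  int16_t *coeff_out, int16_t *rec_residual,
                                  bool early_skip)
{
  bool has_coeffs = false;
  for (int i = 0; i < len; ++i) {
    coeff_out[i] = (int16_t)(residual[i] / step);      // stand-in for forward transform + quant
    if (coeff_out[i] != 0) has_coeffs = true;
  }

  // Reconstruction (stand-in for dequant + inverse transform) is skipped for
  // an early-skip decision, where only the coefficient flag matters.
  if (has_coeffs && !early_skip) {
    for (int i = 0; i < len; ++i) {
      rec_residual[i] = (int16_t)(coeff_out[i] * step);
    }
  } else {
    memset(rec_residual, 0, sizeof(*rec_residual) * (size_t)len);
  }
  return has_coeffs;
}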
View file
kvazaar-1.3.0.tar.gz/src/strategies/avx2/sao-avx2.c -> kvazaar-2.0.0.tar.gz/src/strategies/avx2/sao-avx2.c
Changed
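Editor's note: the sao-avx2.c hunk below replaces the 8-pixel edge-offset helpers with 32-pixel-wide ones (sign3_diff_epu8, calc_eo_cat, calc_diff_off_delta), adds masked handling of narrow tails, and falls back to the generic routine when an offset does not fit in 8 bits. As a reading aid, here is a scalar sketch of the two per-pixel quantities those vectors compute (helper names are illustrative):

#include <stdint.h>

// Edge-offset category of pixel c given its two neighbours a and b along the
// SAO edge direction: sign(c-a) + sign(c-b) + 2 indexes the lookup
// {1, 2, 0, 3, 4}; category 0 ("flat") receives no offset.
static int sign3(int v) { return (v > 0) - (v < 0); }

static int sao_eo_cat(uint8_t a, uint8_t b, uint8_t c)
{
  static const int idx_to_cat[5] = { 1, 2, 0, 3, 4 };
  return idx_to_cat[2 + sign3((int)c - (int)a) + sign3((int)c - (int)b)];
}

// Change in squared error if the category's offset were applied to the
// reconstructed pixel: (diff - offset)^2 - diff^2, with diff = orig - rec.
// The distortion routines sum this over the block.
static int sao_ddist_term(int orig, int rec, int offset)
{
  const int diff  = orig - rec;
  const int delta = diff - offset;
  return delta * delta - diff * diff;
}

calc_eo_cat encodes the same {1, 2, 0, 3, 4} table in its idx_to_cat constant and applies it with _mm256_shuffle_epi8, which is also how the older sao_calc_eo_cat_avx2 being removed did it, just 8 pixels at a time.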
@@ -22,7 +22,12 @@ #if COMPILE_INTEL_AVX2 #include <immintrin.h> +#include <nmmintrin.h> +// Use a couple generic functions from here as a worst-case fallback +#include "strategies/generic/sao_shared_generics.h" +#include "strategies/avx2/avx2_common_functions.h" +#include "strategies/missing-intel-intrinsics.h" #include "cu.h" #include "encoder.h" #include "encoderstate.h" @@ -30,324 +35,853 @@ #include "sao.h" #include "strategyselector.h" - // These optimizations are based heavily on sao-generic.c. // Might be useful to check that if (when) this file // is difficult to understand. - -static INLINE __m128i load_6_pixels(const kvz_pixel* data) +// Do the SIGN3 operation for the difference a-b +static INLINE __m256i sign3_diff_epu8(const __m256i a, const __m256i b) { - return _mm_insert_epi16(_mm_cvtsi32_si128(*(int32_t*)&(data[0])), *(int16_t*)&(data[4]), 2); + // Subtract 0x80 from unsigneds to compare them as signed + const __m256i epu2epi = _mm256_set1_epi8 (0x80); + const __m256i ones = _mm256_set1_epi8 (0x01); + + __m256i a_signed = _mm256_sub_epi8 (a, epu2epi); + __m256i b_signed = _mm256_sub_epi8 (b, epu2epi); + + __m256i diff = _mm256_subs_epi8 (a_signed, b_signed); + return _mm256_sign_epi8 (ones, diff); } -static INLINE __m256i load_5_offsets(const int* offsets) +// Mapping of edge_idx values to eo-classes, 32x8b at once +static __m256i FIX_W32 calc_eo_cat(const __m256i a, + const __m256i b, + const __m256i c) { - return _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*) offsets)), _mm_insert_epi32(_mm_setzero_si128(), offsets[4], 0), 1); + const __m256i twos = _mm256_set1_epi8 (0x02); + const __m256i idx_to_cat = _mm256_setr_epi64x(0x0403000201, 0, + 0x0403000201, 0); + + __m256i c_a_sign = sign3_diff_epu8 (c, a); + __m256i c_b_sign = sign3_diff_epu8 (c, b); + + __m256i signsum = _mm256_add_epi8 (c_a_sign, c_b_sign); + __m256i eo_idx = _mm256_add_epi8 (signsum, twos); + + return _mm256_shuffle_epi8(idx_to_cat, eo_idx); } +static INLINE __m256i srli_epi8(const __m256i v, + const uint32_t shift) +{ + const uint8_t hibit_mask = 0xff >> shift; + const __m256i hibit_mask_256 = _mm256_set1_epi8(hibit_mask); + + __m256i v_shifted = _mm256_srli_epi32(v, shift); + __m256i v_masked = _mm256_and_si256 (v_shifted, hibit_mask_256); -static __m128i sao_calc_eo_cat_avx2(__m128i* a, __m128i* b, __m128i* c) + return v_masked; +} + +static INLINE void cvt_epu8_epi16(const __m256i v, + __m256i *res_lo, + __m256i *res_hi) { - __m128i v_eo_idx = _mm_set1_epi16(2); - __m128i v_a = _mm_cvtepu8_epi16(*a); - __m128i v_c = _mm_cvtepu8_epi16(*c); - __m128i v_b = _mm_cvtepu8_epi16(*b); - - __m128i temp_a = _mm_sign_epi16(_mm_set1_epi16(1), _mm_sub_epi16(v_c, v_a)); - __m128i temp_b = _mm_sign_epi16(_mm_set1_epi16(1), _mm_sub_epi16(v_c, v_b)); - v_eo_idx = _mm_add_epi16(v_eo_idx, temp_a); - v_eo_idx = _mm_add_epi16(v_eo_idx, temp_b); - - v_eo_idx = _mm_packus_epi16(v_eo_idx, v_eo_idx); - __m128i v_cat_lookup = _mm_setr_epi8(1,2,0,3,4,0,0,0,0,0,0,0,0,0,0,0); - __m128i v_cat = _mm_shuffle_epi8(v_cat_lookup, v_eo_idx); - - - return v_cat; + const __m256i zero = _mm256_setzero_si256(); + *res_lo = _mm256_unpacklo_epi8(v, zero); + *res_hi = _mm256_unpackhi_epi8(v, zero); } +static INLINE void cvt_epi8_epi16(const __m256i v, + __m256i *res_lo, + __m256i *res_hi) +{ + const __m256i zero = _mm256_setzero_si256(); + __m256i signs = _mm256_cmpgt_epi8 (zero, v); + *res_lo = _mm256_unpacklo_epi8(v, signs); + *res_hi = _mm256_unpackhi_epi8(v, signs); +} -static int sao_edge_ddistortion_avx2(const 
kvz_pixel *orig_data, - const kvz_pixel *rec_data, - int block_width, - int block_height, - int eo_class, - int offsets[NUM_SAO_EDGE_CATEGORIES]) +static INLINE void diff_epi8_epi16(const __m256i a, + const __m256i b, + __m256i *res_lo, + __m256i *res_hi) { - int y, x; - int sum = 0; - vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0]; - vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1]; + const __m256i invmask = _mm256_set1_epi16(0xff01); - __m256i v_accum = { 0 }; + __m256i composite_lo = _mm256_unpacklo_epi8(a, b); + __m256i composite_hi = _mm256_unpackhi_epi8(a, b); - for (y = 1; y < block_height - 1; ++y) { + *res_lo = _mm256_maddubs_epi16(composite_lo, invmask); + *res_hi = _mm256_maddubs_epi16(composite_hi, invmask); +} - for (x = 1; x < block_width - 8; x+=8) { - const kvz_pixel *c_data = &rec_data[y * block_width + x]; +// Convert a byte-addressed mask for VPSHUFB into two word-addressed ones, for +// example: +// 7 3 6 2 5 1 4 0 => e f 6 7 c d 4 5 a b 2 3 8 9 0 1 +static INLINE void cvt_shufmask_epi8_epi16(const __m256i v, + __m256i *res_lo, + __m256i *res_hi) +{ + const __m256i zero = _mm256_setzero_si256(); + const __m256i ones = _mm256_set1_epi8(1); + + // There's no 8-bit shift, so highest bit could bleed into neighboring byte + // if set. To avoid it, reset all sign bits with max. The only valid input + // values for v are [0, 7] anyway and invalid places should be masked out by + // caller, so it doesn't matter that we turn negative bytes into garbage. + __m256i v_nonnegs = _mm256_max_epi8 (zero, v); + __m256i v_lobytes = _mm256_slli_epi32(v_nonnegs, 1); + __m256i v_hibytes = _mm256_add_epi8 (v_lobytes, ones); + + *res_lo = _mm256_unpacklo_epi8(v_lobytes, v_hibytes); + *res_hi = _mm256_unpackhi_epi8(v_lobytes, v_hibytes); +} - __m128i v_c_data = _mm_loadl_epi64((__m128i*)c_data); - __m128i v_a = _mm_loadl_epi64((__m128i*)(&c_data[a_ofs.y * block_width + a_ofs.x])); - __m128i v_c = v_c_data; - __m128i v_b = _mm_loadl_epi64((__m128i*)(&c_data[b_ofs.y * block_width + b_ofs.x])); +// Check if all 4 dwords of v are in [-128, 127] and can be truncated to +// 8 bits each. 
Returns -1 if everything is fine +static INLINE uint16_t epi32v_fits_in_epi8s(const __m128i v) +{ + // Compare most significant 25 bits of SAO bands to the sign bit to assert + // that the i32's are between -128 and 127 (only comparing 24 would fail to + // detect values of 128...255) + __m128i v_ms25b = _mm_srai_epi32 (v, 7); + __m128i v_signs = _mm_srai_epi32 (v, 31); + __m128i ok_i32s = _mm_cmpeq_epi32 (v_ms25b, v_signs); + return _mm_movemask_epi8(ok_i32s); +} - __m256i v_cat = _mm256_cvtepu8_epi32(sao_calc_eo_cat_avx2(&v_a, &v_b, &v_c)); +static INLINE __m128i truncate_epi32_epi8(const __m128i v) +{ + // LSBs of each dword, the values values must fit in 8 bits anyway for + // what this intended for (use epi32v_fits_in_epi8s to check if needed) + const __m128i trunc_shufmask = _mm_set1_epi32 (0x0c080400); + __m128i sbs_8 = _mm_shuffle_epi8(v, trunc_shufmask); + return sbs_8; +} - __m256i v_offset = load_5_offsets(offsets); - v_offset = _mm256_permutevar8x32_epi32(v_offset, v_cat); - - __m256i v_diff = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)&(orig_data[y * block_width + x]))); - v_diff = _mm256_sub_epi32(v_diff, _mm256_cvtepu8_epi32(v_c)); - __m256i v_diff_minus_offset = _mm256_sub_epi32(v_diff, v_offset); - __m256i v_temp_sum = _mm256_sub_epi32(_mm256_mullo_epi32(v_diff_minus_offset, v_diff_minus_offset), _mm256_mullo_epi32(v_diff, v_diff)); - v_accum = _mm256_add_epi32(v_accum, v_temp_sum); - } +// Read 0-3 bytes (pixels) into uint32_t +static INLINE uint32_t load_border_bytes(const uint8_t *buf, + const int32_t start_pos, + const int32_t width_rest) +{ + uint32_t last_dword = 0; + for (int32_t i = 0; i < width_rest; i++) { + uint8_t currb = buf[start_pos + i]; + uint32_t currd = ((uint32_t)currb) << (i * 8); + last_dword |= currd; + } + return last_dword; +} - //Handle last 6 pixels separately to prevent reading over boundary - const kvz_pixel *c_data = &rec_data[y * block_width + x]; - __m128i v_c_data = load_6_pixels(c_data); - const kvz_pixel* a_ptr = &c_data[a_ofs.y * block_width + a_ofs.x]; - const kvz_pixel* b_ptr = &c_data[b_ofs.y * block_width + b_ofs.x]; - __m128i v_a = load_6_pixels(a_ptr); - __m128i v_c = v_c_data; - __m128i v_b = load_6_pixels(b_ptr); - - __m256i v_cat = _mm256_cvtepu8_epi32(sao_calc_eo_cat_avx2(&v_a, &v_b, &v_c)); - - __m256i v_offset = load_5_offsets(offsets); - v_offset = _mm256_permutevar8x32_epi32(v_offset, v_cat); - - const kvz_pixel* orig_ptr = &(orig_data[y * block_width + x]); - __m256i v_diff = _mm256_cvtepu8_epi32(load_6_pixels(orig_ptr)); - v_diff = _mm256_sub_epi32(v_diff, _mm256_cvtepu8_epi32(v_c)); - - __m256i v_diff_minus_offset = _mm256_sub_epi32(v_diff, v_offset); - __m256i v_temp_sum = _mm256_sub_epi32(_mm256_mullo_epi32(v_diff_minus_offset, v_diff_minus_offset), _mm256_mullo_epi32(v_diff, v_diff)); - v_accum = _mm256_add_epi32(v_accum, v_temp_sum); +static INLINE void store_border_bytes( uint8_t *buf, + const uint32_t start_pos, + const int32_t width_rest, + uint32_t data) +{ + for (uint32_t i = 0; i < width_rest; i++) { + uint8_t currb = data & 0xff; + buf[start_pos + i] = currb; + data >>= 8; } +} + +// Mask all inexistent bytes to 0xFF for functions that count particular byte +// values, so they won't count anywhere +static INLINE __m256i gen_badbyte_mask(const __m256i db4_mask, + const int32_t width_rest) +{ + const __m256i zero = _mm256_setzero_si256(); + + uint32_t last_badbytes = 0xffffffff << (width_rest << 3); + __m256i badbyte_mask = _mm256_cmpeq_epi8 (db4_mask, zero); + return _mm256_insert_epi32(badbyte_mask, 
last_badbytes, 7); +} + +// Ok, so the broadcast si128->si256 instruction only works with a memory +// source operand.. +static INLINE __m256i broadcast_xmm2ymm(const __m128i v) +{ + __m256i res = _mm256_castsi128_si256 (v); + return _mm256_inserti128_si256(res, v, 1); +} + +// Used for edge_ddistortion and band_ddistortion +static __m256i FIX_W32 calc_diff_off_delta(const __m256i diff_lo, + const __m256i diff_hi, + const __m256i offsets, + const __m256i orig) +{ + const __m256i zero = _mm256_setzero_si256(); + const __m256i negate_hiword = _mm256_set1_epi32(0xffff0001); + + __m256i orig_lo, orig_hi, offsets_lo, offsets_hi; + + cvt_epu8_epi16(orig, &orig_lo, &orig_hi); + cvt_epi8_epi16(offsets, &offsets_lo, &offsets_hi); + + __m256i offsets_0_lo = _mm256_cmpeq_epi16 (offsets_lo, zero); + __m256i offsets_0_hi = _mm256_cmpeq_epi16 (offsets_hi, zero); + + __m256i delta_lo = _mm256_sub_epi16 (diff_lo, offsets_lo); + __m256i delta_hi = _mm256_sub_epi16 (diff_hi, offsets_hi); - //Full horizontal sum - v_accum = _mm256_add_epi32(v_accum, _mm256_castsi128_si256(_mm256_extracti128_si256(v_accum, 1))); - v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(1, 0, 3, 2))); - v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(0, 1, 0, 1))); - sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_accum)); + __m256i diff_lo_m = _mm256_andnot_si256 (offsets_0_lo, diff_lo); + __m256i diff_hi_m = _mm256_andnot_si256 (offsets_0_hi, diff_hi); + __m256i delta_lo_m = _mm256_andnot_si256 (offsets_0_lo, delta_lo); + __m256i delta_hi_m = _mm256_andnot_si256 (offsets_0_hi, delta_hi); - return sum; + __m256i dd0_lo = _mm256_unpacklo_epi16(delta_lo_m, diff_lo_m); + __m256i dd0_hi = _mm256_unpackhi_epi16(delta_lo_m, diff_lo_m); + __m256i dd1_lo = _mm256_unpacklo_epi16(delta_hi_m, diff_hi_m); + __m256i dd1_hi = _mm256_unpackhi_epi16(delta_hi_m, diff_hi_m); + + __m256i dd0_lo_n = _mm256_sign_epi16 (dd0_lo, negate_hiword); + __m256i dd0_hi_n = _mm256_sign_epi16 (dd0_hi, negate_hiword); + __m256i dd1_lo_n = _mm256_sign_epi16 (dd1_lo, negate_hiword); + __m256i dd1_hi_n = _mm256_sign_epi16 (dd1_hi, negate_hiword); + + __m256i sum0_lo = _mm256_madd_epi16 (dd0_lo, dd0_lo_n); + __m256i sum0_hi = _mm256_madd_epi16 (dd0_hi, dd0_hi_n); + __m256i sum1_lo = _mm256_madd_epi16 (dd1_lo, dd1_lo_n); + __m256i sum1_hi = _mm256_madd_epi16 (dd1_hi, dd1_hi_n); + + __m256i sum0 = _mm256_add_epi32 (sum0_lo, sum0_hi); + __m256i sum1 = _mm256_add_epi32 (sum1_lo, sum1_hi); + return _mm256_add_epi32 (sum0, sum1); } +static INLINE __m256i FIX_W32 do_one_edge_ymm(const __m256i a, + const __m256i b, + const __m256i c, + const __m256i orig, + const __m256i badbyte_mask, + const __m256i offsets_256) +{ + __m256i eo_cat = calc_eo_cat(a, b, c); + eo_cat = _mm256_or_si256 (eo_cat, badbyte_mask); + __m256i offset = _mm256_shuffle_epi8(offsets_256, eo_cat); + + __m256i offset_lo, offset_hi; + cvt_epi8_epi16(offset, &offset_lo, &offset_hi); + + __m256i diff_lo, diff_hi; + diff_epi8_epi16(orig, c, &diff_lo, &diff_hi); -static INLINE void accum_count_eo_cat_avx2(__m256i* __restrict v_diff_accum, - __m256i* __restrict v_count, - __m256i* __restrict v_cat, - __m256i* __restrict v_diff, - int eo_cat) + return calc_diff_off_delta(diff_lo, diff_hi, offset, orig); +} + +static int32_t sao_edge_ddistortion_avx2(const kvz_pixel *orig_data, + const kvz_pixel *rec_data, + int32_t block_width, + int32_t block_height, + int32_t eo_class, + const int32_t offsets[NUM_SAO_EDGE_CATEGORIES]) { - __m256i v_mask = 
_mm256_cmpeq_epi32(*v_cat, _mm256_set1_epi32(eo_cat)); - *v_diff_accum = _mm256_add_epi32(*v_diff_accum, _mm256_and_si256(*v_diff, v_mask)); - *v_count = _mm256_sub_epi32(*v_count, v_mask); + int32_t y, x; + vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0]; + vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1]; + + int32_t scan_width = block_width - 2; + uint32_t width_db32 = scan_width & ~31; + uint32_t width_db4 = scan_width & ~3; + uint32_t width_rest = scan_width & 3; + + // Form the load&store mask + const __m256i wdb4_256 = _mm256_set1_epi32 (width_db4 & 31); + const __m256i indexes = _mm256_setr_epi32 (3, 7, 11, 15, 19, 23, 27, 31); + const __m256i db4_mask = _mm256_cmpgt_epi32(wdb4_256, indexes); + + const __m256i zero = _mm256_setzero_si256(); + + __m128i offsets03 = _mm_loadu_si128((const __m128i *)offsets); + __m128i offsets4 = _mm_cvtsi32_si128(offsets[4]); + + uint16_t offsets_ok = epi32v_fits_in_epi8s(offsets03) & + epi32v_fits_in_epi8s(offsets4); + + assert(NUM_SAO_EDGE_CATEGORIES == 5); + + if (offsets_ok != 0xffff) { + return sao_edge_ddistortion_generic(orig_data, + rec_data, + block_width, + block_height, + eo_class, + offsets); + } + + __m128i offsets03_8b = truncate_epi32_epi8(offsets03); + __m128i offsets4_8b = truncate_epi32_epi8(offsets4); + __m128i offsets_8b = _mm_unpacklo_epi32 (offsets03_8b, offsets4_8b); + __m256i offsets_256 = broadcast_xmm2ymm (offsets_8b); + + __m256i sum = _mm256_setzero_si256(); + for (y = 1; y < block_height - 1; y++) { + for (x = 1; x < width_db32 + 1; x += 32) { + uint32_t c_pos = y * block_width + x; + uint32_t a_pos = (y + a_ofs.y) * block_width + x + a_ofs.x; + uint32_t b_pos = (y + b_ofs.y) * block_width + x + b_ofs.x; + + __m256i a = _mm256_loadu_si256((const __m256i *)(rec_data + a_pos)); + __m256i b = _mm256_loadu_si256((const __m256i *)(rec_data + b_pos)); + __m256i c = _mm256_loadu_si256((const __m256i *)(rec_data + c_pos)); + __m256i orig = _mm256_loadu_si256((const __m256i *)(orig_data + c_pos)); + + __m256i curr = do_one_edge_ymm(a, b, c, orig, zero, offsets_256); + sum = _mm256_add_epi32(sum, curr); + } + if (scan_width > width_db32) { + const uint32_t curr_cpos = y * block_width + x; + const uint32_t rest_cpos = y * block_width + width_db4 + 1; + + const int32_t curr_apos = (y + a_ofs.y) * block_width + x + a_ofs.x; + const int32_t rest_apos = (y + a_ofs.y) * block_width + width_db4 + a_ofs.x + 1; + + const int32_t curr_bpos = (y + b_ofs.y) * block_width + x + b_ofs.x; + const int32_t rest_bpos = (y + b_ofs.y) * block_width + width_db4 + b_ofs.x + 1; + + // Same trick to read a narrow line as there is in the band SAO routine + uint32_t a_last = load_border_bytes(rec_data, rest_apos, width_rest); + uint32_t b_last = load_border_bytes(rec_data, rest_bpos, width_rest); + uint32_t c_last = load_border_bytes(rec_data, rest_cpos, width_rest); + uint32_t orig_last = load_border_bytes(orig_data, rest_cpos, width_rest); + + const int32_t *a_ptr = (const int32_t *)(rec_data + curr_apos); + const int32_t *b_ptr = (const int32_t *)(rec_data + curr_bpos); + const int32_t *c_ptr = (const int32_t *)(rec_data + curr_cpos); + const int32_t *orig_ptr = (const int32_t *)(orig_data + curr_cpos); + + __m256i a = _mm256_maskload_epi32(a_ptr, db4_mask); + __m256i b = _mm256_maskload_epi32(b_ptr, db4_mask); + __m256i c = _mm256_maskload_epi32(c_ptr, db4_mask); + __m256i orig = _mm256_maskload_epi32(orig_ptr, db4_mask); + + a = _mm256_insert_epi32 (a, a_last, 7); + b = _mm256_insert_epi32 (b, b_last, 7); + c = _mm256_insert_epi32 (c, c_last, 7); 
+ orig = _mm256_insert_epi32 (orig, orig_last, 7); + + // Mask all unused bytes to 0xFF, so they won't count anywhere + __m256i badbyte_mask = gen_badbyte_mask(db4_mask, width_rest); + + __m256i curr = do_one_edge_ymm(a, b, c, orig, badbyte_mask, offsets_256); + sum = _mm256_add_epi32(sum, curr); + } + } + return hsum_8x32b(sum); } +static void FIX_W32 calc_edge_dir_one_ymm(const __m256i a, + const __m256i b, + const __m256i c, + const __m256i orig, + const __m256i badbyte_mask, + __m256i *diff_accum, + int32_t *hit_cnt) +{ + const __m256i ones_16 = _mm256_set1_epi16(1); + __m256i eo_cat = calc_eo_cat (a, b, c); + eo_cat = _mm256_or_si256 (eo_cat, badbyte_mask); + + __m256i diffs_lo, diffs_hi; + diff_epi8_epi16(orig, c, &diffs_lo, &diffs_hi); + + for (uint32_t i = 0; i < 5; i++) { + __m256i curr_id = _mm256_set1_epi8 (i); + __m256i eoc_mask = _mm256_cmpeq_epi8 (eo_cat, curr_id); + uint32_t eoc_bits = _mm256_movemask_epi8(eoc_mask); + uint32_t eoc_hits = _mm_popcnt_u32 (eoc_bits); -#define ACCUM_COUNT_EO_CAT_AVX2(EO_CAT, V_CAT) \ - \ - accum_count_eo_cat_avx2(&(v_diff_accum[ EO_CAT ]), &(v_count[ EO_CAT ]), &V_CAT , &v_diff, EO_CAT); + __m256i eoc_mask_lo = _mm256_unpacklo_epi8(eoc_mask, eoc_mask); + __m256i eoc_mask_hi = _mm256_unpackhi_epi8(eoc_mask, eoc_mask); + __m256i eoc_diffs_lo = _mm256_and_si256 (diffs_lo, eoc_mask_lo); + __m256i eoc_diffs_hi = _mm256_and_si256 (diffs_hi, eoc_mask_hi); + + __m256i eoc_diffs_16 = _mm256_add_epi16 (eoc_diffs_lo, eoc_diffs_hi); + __m256i eoc_diffs_32 = _mm256_madd_epi16 (eoc_diffs_16, ones_16); + + diff_accum[i] = _mm256_add_epi32 (diff_accum[i], eoc_diffs_32); + hit_cnt[i] += eoc_hits; + } +} static void calc_sao_edge_dir_avx2(const kvz_pixel *orig_data, const kvz_pixel *rec_data, - int eo_class, - int block_width, - int block_height, - int cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES]) + int32_t eo_class, + int32_t block_width, + int32_t block_height, + int32_t cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES]) { - int y, x; vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0]; vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1]; - // Don't sample the edge pixels because this function doesn't have access to - // their neighbours. 
+ int32_t *diff_sum = cat_sum_cnt[0]; + int32_t *hit_cnt = cat_sum_cnt[1]; - __m256i v_diff_accum[NUM_SAO_EDGE_CATEGORIES] = { { 0 } }; - __m256i v_count[NUM_SAO_EDGE_CATEGORIES] = { { 0 } }; + int32_t scan_width = block_width - 2; + int32_t width_db32 = scan_width & ~31; + int32_t width_db4 = scan_width & ~3; + int32_t width_rest = scan_width & 3; - for (y = 1; y < block_height - 1; ++y) { + const __m256i zero = _mm256_setzero_si256(); - //Calculation for 8 pixels per round - for (x = 1; x < block_width - 8; x += 8) { - const kvz_pixel *c_data = &rec_data[y * block_width + x]; + // Form the load&store mask + const __m256i wdb4_256 = _mm256_set1_epi32 (width_db4 & 31); + const __m256i indexes = _mm256_setr_epi32 (3, 7, 11, 15, 19, 23, 27, 31); + const __m256i db4_mask = _mm256_cmpgt_epi32(wdb4_256, indexes); - __m128i v_c_data = _mm_loadl_epi64((__m128i* __restrict)c_data); - __m128i v_a = _mm_loadl_epi64((__m128i* __restrict)(&c_data[a_ofs.y * block_width + a_ofs.x])); - __m128i v_c = v_c_data; - __m128i v_b = _mm_loadl_epi64((__m128i* __restrict)(&c_data[b_ofs.y * block_width + b_ofs.x])); + __m256i diff_accum[5] = { _mm256_setzero_si256() }; - __m256i v_cat = _mm256_cvtepu8_epi32(sao_calc_eo_cat_avx2(&v_a, &v_b, &v_c)); + int32_t y, x; + for (y = 1; y < block_height - 1; y++) { + for (x = 1; x < width_db32 + 1; x += 32) { + const uint32_t a_off = (y + a_ofs.y) * block_width + x + a_ofs.x; + const uint32_t b_off = (y + b_ofs.y) * block_width + x + b_ofs.x; + const uint32_t c_off = y * block_width + x; - __m256i v_diff = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i* __restrict)&(orig_data[y * block_width + x]))); - v_diff = _mm256_sub_epi32(v_diff, _mm256_cvtepu8_epi32(v_c)); + __m256i a = _mm256_loadu_si256((const __m256i *)(rec_data + a_off)); + __m256i b = _mm256_loadu_si256((const __m256i *)(rec_data + b_off)); + __m256i c = _mm256_loadu_si256((const __m256i *)(rec_data + c_off)); + __m256i orig = _mm256_loadu_si256((const __m256i *)(orig_data + c_off)); - //Accumulate differences and occurrences for each category - ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT0, v_cat); - ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT1, v_cat); - ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT2, v_cat); - ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT3, v_cat); - ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT4, v_cat); + calc_edge_dir_one_ymm(a, b, c, orig, zero, diff_accum, hit_cnt); } + if (scan_width > width_db32) { + const uint32_t curr_cpos = y * block_width + x; + const uint32_t rest_cpos = y * block_width + width_db4 + 1; - //Handle last 6 pixels separately to prevent reading over boundary - const kvz_pixel *c_data = &rec_data[y * block_width + x]; - __m128i v_c_data = load_6_pixels(c_data); - const kvz_pixel* a_ptr = &c_data[a_ofs.y * block_width + a_ofs.x]; - const kvz_pixel* b_ptr = &c_data[b_ofs.y * block_width + b_ofs.x]; - __m128i v_a = load_6_pixels(a_ptr); - __m128i v_c = v_c_data; - __m128i v_b = load_6_pixels(b_ptr); - - __m256i v_cat = _mm256_cvtepu8_epi32(sao_calc_eo_cat_avx2(&v_a, &v_b, &v_c)); - - //Set the last two elements to a non-existing category to cause - //the accumulate-count macro to discard those values. 
- __m256i v_mask = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, -1, -1); - v_cat = _mm256_or_si256(v_cat, v_mask); - - const kvz_pixel* orig_ptr = &(orig_data[y * block_width + x]); - __m256i v_diff = _mm256_cvtepu8_epi32(load_6_pixels(orig_ptr)); - v_diff = _mm256_sub_epi32(v_diff, _mm256_cvtepu8_epi32(v_c)); - - //Accumulate differences and occurrences for each category - ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT0, v_cat); - ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT1, v_cat); - ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT2, v_cat); - ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT3, v_cat); - ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT4, v_cat); - } + const int32_t curr_apos = (y + a_ofs.y) * block_width + x + a_ofs.x; + const int32_t rest_apos = (y + a_ofs.y) * block_width + width_db4 + a_ofs.x + 1; + + const int32_t curr_bpos = (y + b_ofs.y) * block_width + x + b_ofs.x; + const int32_t rest_bpos = (y + b_ofs.y) * block_width + width_db4 + b_ofs.x + 1; - for (int eo_cat = 0; eo_cat < NUM_SAO_EDGE_CATEGORIES; ++eo_cat) { - int accum = 0; - int count = 0; + uint32_t a_last = load_border_bytes(rec_data, rest_apos, width_rest); + uint32_t b_last = load_border_bytes(rec_data, rest_bpos, width_rest); + uint32_t c_last = load_border_bytes(rec_data, rest_cpos, width_rest); + uint32_t orig_last = load_border_bytes(orig_data, rest_cpos, width_rest); - //Full horizontal sum of accumulated values - v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_castsi128_si256(_mm256_extracti128_si256(v_diff_accum[eo_cat], 1))); - v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_shuffle_epi32(v_diff_accum[eo_cat], _MM_SHUFFLE(1, 0, 3, 2))); - v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_shuffle_epi32(v_diff_accum[eo_cat], _MM_SHUFFLE(0, 1, 0, 1))); - accum += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_diff_accum[eo_cat])); + const int32_t *a_ptr = (const int32_t *)(rec_data + curr_apos); + const int32_t *b_ptr = (const int32_t *)(rec_data + curr_bpos); + const int32_t *c_ptr = (const int32_t *)(rec_data + curr_cpos); + const int32_t *orig_ptr = (const int32_t *)(orig_data + curr_cpos); - //Full horizontal sum of accumulated values - v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_castsi128_si256(_mm256_extracti128_si256(v_count[eo_cat], 1))); - v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_shuffle_epi32(v_count[eo_cat], _MM_SHUFFLE(1, 0, 3, 2))); - v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_shuffle_epi32(v_count[eo_cat], _MM_SHUFFLE(0, 1, 0, 1))); - count += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_count[eo_cat])); + __m256i a = _mm256_maskload_epi32(a_ptr, db4_mask); + __m256i b = _mm256_maskload_epi32(b_ptr, db4_mask); + __m256i c = _mm256_maskload_epi32(c_ptr, db4_mask); + __m256i orig = _mm256_maskload_epi32(orig_ptr, db4_mask); - cat_sum_cnt[0][eo_cat] += accum; - cat_sum_cnt[1][eo_cat] += count; + a = _mm256_insert_epi32 (a, a_last, 7); + b = _mm256_insert_epi32 (b, b_last, 7); + c = _mm256_insert_epi32 (c, c_last, 7); + orig = _mm256_insert_epi32 (orig, orig_last, 7); + __m256i badbyte_mask = gen_badbyte_mask(db4_mask, width_rest); + + calc_edge_dir_one_ymm(a, b, c, orig, badbyte_mask, diff_accum, hit_cnt); + } + } + for (uint32_t i = 0; i < 5; i++) { + int32_t sum = hsum_8x32b(diff_accum[i]); + diff_sum[i] += sum; } } +/* + * Calculate an array of intensity correlations for each intensity value. 
+ * Return array as 16 YMM vectors, each containing 2x16 unsigned bytes + * (to ease array lookup from YMMs using the shuffle trick, the low and + * high lanes of each vector are duplicates). Have fun scaling this to + * 16-bit picture data! + */ +static void calc_sao_offset_array_avx2(const encoder_control_t *encoder, + const sao_info_t *sao, + __m256i *offsets, + color_t color_i) +{ + const uint32_t band_pos = (color_i == COLOR_V) ? 1 : 0; + const int32_t cur_bp = sao->band_position[band_pos]; + + const __m256i zero = _mm256_setzero_si256(); + const __m256i threes = _mm256_set1_epi8 ( 3); + + const __m256i band_pos_v = _mm256_set1_epi8 (band_pos << 2); + const __m256i cur_bp_v = _mm256_set1_epi8 (cur_bp); + const __m256i val_incr = _mm256_set1_epi8 (16); + const __m256i band_incr = _mm256_set1_epi8 ( 2); + __m256i vals = _mm256_setr_epi8 ( 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + + __m256i bands = _mm256_setr_epi32 (0, 0, 0x01010101, 0x01010101, + 0, 0, 0x01010101, 0x01010101); + + // We'll only ever address SAO offsets 1, 2, 3, 4, 6, 7, 8, 9, so only load + // them and truncate into signed 16 bits (anything out of that range will + // anyway saturate anything they're used to do) + __m128i sao_offs_lo = _mm_loadu_si128((const __m128i *)(sao->offsets + 1)); + __m128i sao_offs_hi = _mm_loadu_si128((const __m128i *)(sao->offsets + 6)); + + __m128i sao_offs_xmm = _mm_packs_epi32 (sao_offs_lo, sao_offs_hi); + __m256i sao_offs = broadcast_xmm2ymm(sao_offs_xmm); + + for (uint32_t i = 0; i < 16; i++) { + // bands will always be in [0, 31], and cur_bp in [0, 27], so no overflow + // can occur + __m256i band_m_bp = _mm256_sub_epi8 (bands, cur_bp_v); + + // If (x & ~3) != 0 for any signed x, then x < 0 or x > 3 + __m256i bmbp_bads = _mm256_andnot_si256(threes, band_m_bp); + __m256i in_band = _mm256_cmpeq_epi8 (zero, bmbp_bads); + + __m256i offset_id = _mm256_add_epi8 (band_m_bp, band_pos_v); + + __m256i val_lo, val_hi; + cvt_epu8_epi16(vals, &val_lo, &val_hi); + + __m256i offid_lo, offid_hi; + cvt_shufmask_epi8_epi16(offset_id, &offid_lo, &offid_hi); + + __m256i offs_lo = _mm256_shuffle_epi8(sao_offs, offid_lo); + __m256i offs_hi = _mm256_shuffle_epi8(sao_offs, offid_hi); + + __m256i sums_lo = _mm256_adds_epi16 (val_lo, offs_lo); + __m256i sums_hi = _mm256_adds_epi16 (val_hi, offs_hi); + + sums_lo = _mm256_max_epi16 (sums_lo, zero); + sums_hi = _mm256_max_epi16 (sums_hi, zero); + + __m256i offs = _mm256_packus_epi16(sums_lo, sums_hi); -static void sao_reconstruct_color_avx2(const encoder_control_t * const encoder, - const kvz_pixel *rec_data, kvz_pixel *new_rec_data, - const sao_info_t *sao, - int stride, int new_stride, - int block_width, int block_height, - color_t color_i) + offsets[i] = _mm256_blendv_epi8 (vals, offs, in_band); + + vals = _mm256_add_epi8 (vals, val_incr); + bands = _mm256_add_epi8 (bands, band_incr); + } +} + +static __m256i lookup_color_band_ymm(const __m256i curr_row, + const __m256i *offsets) { - // Arrays orig_data and rec_data are quarter size for chroma. - int offset_v = color_i == COLOR_V ? 5 : 0; + const __m256i select_nibble = _mm256_set1_epi8 (0x0f); + const __m256i lo_nibbles = _mm256_and_si256 (select_nibble, curr_row); + const __m256i hi_nibbles = _mm256_andnot_si256(select_nibble, curr_row); + + // Loop through the offset vectors, the 0xi'th one always holding + // offsets 0xi0...0xif. 
Use shuffle to do a lookup on the current + // offset vector, then check which pixels actually should be looked + // up from this vector (ie. whether their values are 0xi0...0xif) and + // mask out any but correct ones. + __m256i result_row = _mm256_setzero_si256(); + for (uint8_t i = 0; i < 16; i += 4) { + + __m256i curr_hinib0 = _mm256_set1_epi8 ((i + 0) << 4); + __m256i curr_hinib1 = _mm256_set1_epi8 ((i + 1) << 4); + __m256i curr_hinib2 = _mm256_set1_epi8 ((i + 2) << 4); + __m256i curr_hinib3 = _mm256_set1_epi8 ((i + 3) << 4); + + __m256i hinib_select0 = _mm256_cmpeq_epi8 (curr_hinib0, hi_nibbles); + __m256i hinib_select1 = _mm256_cmpeq_epi8 (curr_hinib1, hi_nibbles); + __m256i hinib_select2 = _mm256_cmpeq_epi8 (curr_hinib2, hi_nibbles); + __m256i hinib_select3 = _mm256_cmpeq_epi8 (curr_hinib3, hi_nibbles); + + __m256i lonib_lookup0 = _mm256_shuffle_epi8(offsets[i + 0], lo_nibbles); + __m256i lonib_lookup1 = _mm256_shuffle_epi8(offsets[i + 1], lo_nibbles); + __m256i lonib_lookup2 = _mm256_shuffle_epi8(offsets[i + 2], lo_nibbles); + __m256i lonib_lookup3 = _mm256_shuffle_epi8(offsets[i + 3], lo_nibbles); + + __m256i lookup_mskd0 = _mm256_and_si256 (hinib_select0, lonib_lookup0); + __m256i lookup_mskd1 = _mm256_and_si256 (hinib_select1, lonib_lookup1); + __m256i lookup_mskd2 = _mm256_and_si256 (hinib_select2, lonib_lookup2); + __m256i lookup_mskd3 = _mm256_and_si256 (hinib_select3, lonib_lookup3); + + __m256i lookup_mskd01 = _mm256_or_si256 (lookup_mskd0, lookup_mskd1); + __m256i lookup_mskd23 = _mm256_or_si256 (lookup_mskd2, lookup_mskd3); + __m256i lookup_res = _mm256_or_si256 (lookup_mskd01, lookup_mskd23); + + result_row = _mm256_or_si256 (result_row, lookup_res); + } + return result_row; +} - if (sao->type == SAO_TYPE_BAND) { - int offsets[1 << KVZ_BIT_DEPTH]; - kvz_calc_sao_offset_array(encoder, sao, offsets, color_i); - for (int y = 0; y < block_height; ++y) { - for (int x = 0; x < block_width; ++x) { - new_rec_data[y * new_stride + x] = offsets[rec_data[y * stride + x]]; - } +static INLINE void reconstruct_color_band(const encoder_control_t *encoder, + const kvz_pixel *rec_data, + kvz_pixel *new_rec_data, + const sao_info_t *sao, + int32_t stride, + int32_t new_stride, + int32_t block_width, + int32_t block_height, + color_t color_i) +{ + const uint32_t width_db32 = block_width & ~31; + const uint32_t width_db4 = block_width & ~3; + const uint32_t width_rest = block_width & 3; + + // Form the load&store mask + const __m256i wdb4_256 = _mm256_set1_epi32 (width_db4 & 31); + const __m256i indexes = _mm256_setr_epi32 (3, 7, 11, 15, 19, 23, 27, 31); + const __m256i db4_mask = _mm256_cmpgt_epi32(wdb4_256, indexes); + + // Each of the 256 offsets is a byte, but only 16 are held in one YMM since + // lanes must be duplicated to use shuffle. + __m256i offsets[16]; + calc_sao_offset_array_avx2(encoder, sao, offsets, color_i); + + for (uint32_t y = 0; y < block_height; y++) { + uint32_t x = 0; + for (; x < width_db32; x += 32) { + const uint32_t curr_srcpos = y * stride + x; + const uint32_t curr_dstpos = y * new_stride + x; + + __m256i curr_row = _mm256_loadu_si256((const __m256i *)(rec_data + curr_srcpos)); + __m256i result = lookup_color_band_ymm(curr_row, offsets); + _mm256_storeu_si256((__m256i *)(new_rec_data + curr_dstpos), result); } - } else { - // Don't sample the edge pixels because this function doesn't have access to - // their neighbours. 
- for (int y = 0; y < block_height; ++y) { - for (int x = 0; x < block_width; x+=8) { - vector2d_t a_ofs = g_sao_edge_offsets[sao->eo_class][0]; - vector2d_t b_ofs = g_sao_edge_offsets[sao->eo_class][1]; - const kvz_pixel *c_data = &rec_data[y * stride + x]; - kvz_pixel *new_data = &new_rec_data[y * new_stride + x]; - const kvz_pixel* a_ptr = &c_data[a_ofs.y * stride + a_ofs.x]; - const kvz_pixel* c_ptr = &c_data[0]; - const kvz_pixel* b_ptr = &c_data[b_ofs.y * stride + b_ofs.x]; - - __m128i v_a = _mm_loadl_epi64((__m128i*)a_ptr); - __m128i v_b = _mm_loadl_epi64((__m128i*)b_ptr); - __m128i v_c = _mm_loadl_epi64((__m128i*)c_ptr); - - __m256i v_cat = _mm256_cvtepu8_epi32(sao_calc_eo_cat_avx2(&v_a, &v_b, &v_c) ); - - __m256i v_offset_v = load_5_offsets(sao->offsets + offset_v); - __m256i v_new_data = _mm256_permutevar8x32_epi32(v_offset_v, v_cat); - v_new_data = _mm256_add_epi32(v_new_data, _mm256_cvtepu8_epi32(v_c)); - __m128i v_new_data_128 = _mm_packus_epi32(_mm256_castsi256_si128(v_new_data), _mm256_extracti128_si256(v_new_data, 1)); - v_new_data_128 = _mm_packus_epi16(v_new_data_128, v_new_data_128); - - if ((block_width - x) >= 8) { - _mm_storel_epi64((__m128i*)new_data, v_new_data_128); - } else { - - kvz_pixel arr[8]; - _mm_storel_epi64((__m128i*)arr, v_new_data_128); - for (int i = 0; i < block_width - x; ++i) new_data[i] = arr[i]; - } - - } + if (block_width > width_db32) { + const uint32_t curr_srcpos = y * stride + x; + const uint32_t curr_dstpos = y * new_stride + x; + const uint32_t rest_srcpos = y * stride + width_db4; + const uint32_t rest_dstpos = y * new_stride + width_db4; + + // Read the very last pixels byte by byte and pack them into one dword. + // Piggyback said dword as the highest dword of the row vector variable, + // that particular place can never be loaded into by the maskmove + // (otherwise that vector would go through the divisible-by-32 code + // path). 
+ uint32_t last_dword = load_border_bytes(rec_data, rest_srcpos, width_rest); + + const int32_t *src_ptr = (const int32_t *)( rec_data + curr_srcpos); + int32_t *dst_ptr = ( int32_t *)(new_rec_data + curr_dstpos); + + __m256i curr_row = _mm256_maskload_epi32(src_ptr, db4_mask); + curr_row = _mm256_insert_epi32 (curr_row, last_dword, 7); + __m256i result = lookup_color_band_ymm(curr_row, offsets); + + _mm256_maskstore_epi32(dst_ptr, db4_mask, result); + uint32_t last_dword_dst = _mm256_extract_epi32(result, 7); + + store_border_bytes(new_rec_data, rest_dstpos, width_rest, last_dword_dst); } } } +static __m256i FIX_W32 do_one_nonband_ymm(const __m256i a, + const __m256i b, + const __m256i c, + const __m256i sao_offs) +{ + const __m256i zero = _mm256_setzero_si256(); + + __m256i eo_cat = calc_eo_cat(a, b, c); + __m256i eo_cat_lo, eo_cat_hi, c_lo, c_hi; + cvt_shufmask_epi8_epi16(eo_cat, &eo_cat_lo, &eo_cat_hi); + cvt_epu8_epi16 (c, &c_lo, &c_hi); + + __m256i offs_lo = _mm256_shuffle_epi8(sao_offs, eo_cat_lo); + __m256i offs_hi = _mm256_shuffle_epi8(sao_offs, eo_cat_hi); + + __m256i res_lo = _mm256_adds_epi16 (offs_lo, c_lo); + __m256i res_hi = _mm256_adds_epi16 (offs_hi, c_hi); + + res_lo = _mm256_max_epi16 (res_lo, zero); + res_hi = _mm256_max_epi16 (res_hi, zero); -static int sao_band_ddistortion_avx2(const encoder_state_t * const state, - const kvz_pixel *orig_data, - const kvz_pixel *rec_data, - int block_width, - int block_height, - int band_pos, - int sao_bands[4]) + __m256i res = _mm256_packus_epi16(res_lo, res_hi); + return res; +} + +static INLINE void reconstruct_color_other(const encoder_control_t *encoder, + const kvz_pixel *rec_data, + kvz_pixel *new_rec_data, + const sao_info_t *sao, + int32_t stride, + int32_t new_stride, + int32_t block_width, + int32_t block_height, + color_t color_i) { - int y, x; - int shift = state->encoder_control->bitdepth-5; - int sum = 0; - - __m256i v_accum = { 0 }; - - for (y = 0; y < block_height; ++y) { - for (x = 0; x < block_width; x+=8) { - - __m256i v_band = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)&(rec_data[y * block_width + x]))); - v_band = _mm256_srli_epi32(v_band, shift); - v_band = _mm256_sub_epi32(v_band, _mm256_set1_epi32(band_pos)); - - __m256i v_offset = { 0 }; - __m256i v_mask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_set1_epi32(~3), v_band), _mm256_setzero_si256()); - v_offset = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)sao_bands)), v_band); - - v_offset = _mm256_and_si256(v_offset, v_mask); - - - __m256i v_diff = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)&(orig_data[y * block_width + x]))); - __m256i v_rec = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)&(rec_data[y * block_width + x]))); - v_diff = _mm256_sub_epi32(v_diff, v_rec); - __m256i v_diff_minus_offset = _mm256_sub_epi32(v_diff, v_offset); - __m256i v_temp_sum = _mm256_sub_epi32(_mm256_mullo_epi32(v_diff_minus_offset, v_diff_minus_offset), _mm256_mullo_epi32(v_diff, v_diff)); - v_accum = _mm256_add_epi32(v_accum, v_temp_sum); + const uint32_t offset_v = color_i == COLOR_V ? 
5 : 0; + const vector2d_t a_ofs = g_sao_edge_offsets[sao->eo_class][0]; + const vector2d_t b_ofs = g_sao_edge_offsets[sao->eo_class][1]; + + const uint32_t width_db32 = block_width & ~31; + const uint32_t width_db4 = block_width & ~3; + const uint32_t width_rest = block_width & 3; + + // Form the load&store mask + const __m256i wdb4_256 = _mm256_set1_epi32 (width_db4 & 31); + const __m256i indexes = _mm256_setr_epi32 (3, 7, 11, 15, 19, 23, 27, 31); + const __m256i db4_mask = _mm256_cmpgt_epi32(wdb4_256, indexes); + + // Again, saturate offsets to signed 16 bits, because anything outside of + // [-255, 255] will saturate anything these are used with + const __m128i sao_offs_lo = _mm_loadu_si128 ((const __m128i *)(sao->offsets + offset_v + 0)); + const __m128i sao_offs_hi = _mm_cvtsi32_si128(sao->offsets[offset_v + 4]); + const __m128i sao_offs_16 = _mm_packs_epi32 (sao_offs_lo, sao_offs_hi); + + const __m256i sao_offs = broadcast_xmm2ymm(sao_offs_16); + + for (uint32_t y = 0; y < block_height; y++) { + uint32_t x; + for (x = 0; x < width_db32; x += 32) { + const uint32_t src_pos = y * stride + x; + const uint32_t dst_pos = y * new_stride + x; + + // TODO: these will go negative, but that's a defect of the original + // code already since 2013 (98f2a1aedc5f4933c2729ae15412549dea9e5549) + const int32_t a_pos = (y + a_ofs.y) * stride + x + a_ofs.x; + const int32_t b_pos = (y + b_ofs.y) * stride + x + b_ofs.x; + + __m256i a = _mm256_loadu_si256((const __m256i *)(rec_data + a_pos)); + __m256i b = _mm256_loadu_si256((const __m256i *)(rec_data + b_pos)); + __m256i c = _mm256_loadu_si256((const __m256i *)(rec_data + src_pos)); + + __m256i res = do_one_nonband_ymm(a, b, c, sao_offs); + _mm256_storeu_si256((__m256i *)(new_rec_data + dst_pos), res); + } + if (block_width > width_db32) { + const uint32_t curr_srcpos = y * stride + x; + const uint32_t rest_srcpos = y * stride + width_db4; + + const int32_t curr_apos = (y + a_ofs.y) * stride + a_ofs.x + x; + const int32_t rest_apos = (y + a_ofs.y) * stride + a_ofs.x + width_db4; + + const int32_t curr_bpos = (y + b_ofs.y) * stride + b_ofs.x + x; + const int32_t rest_bpos = (y + b_ofs.y) * stride + b_ofs.x + width_db4; + + const uint32_t curr_dstpos = y * new_stride + x; + const uint32_t rest_dstpos = y * new_stride + width_db4; + + uint32_t a_last = load_border_bytes(rec_data, rest_apos, width_rest); + uint32_t b_last = load_border_bytes(rec_data, rest_bpos, width_rest); + uint32_t c_last = load_border_bytes(rec_data, rest_srcpos, width_rest); + + const int32_t *a_ptr = (const int32_t *)( rec_data + curr_apos); + const int32_t *b_ptr = (const int32_t *)( rec_data + curr_bpos); + const int32_t *c_ptr = (const int32_t *)( rec_data + curr_srcpos); + int32_t *dst_ptr = ( int32_t *)(new_rec_data + curr_dstpos); + + __m256i a = _mm256_maskload_epi32(a_ptr, db4_mask); + __m256i b = _mm256_maskload_epi32(b_ptr, db4_mask); + __m256i c = _mm256_maskload_epi32(c_ptr, db4_mask); + + a = _mm256_insert_epi32 (a, a_last, 7); + b = _mm256_insert_epi32 (b, b_last, 7); + c = _mm256_insert_epi32 (c, c_last, 7); + + __m256i res = do_one_nonband_ymm(a, b, c, sao_offs); + _mm256_maskstore_epi32(dst_ptr, db4_mask, res); + + uint32_t last_dword = _mm256_extract_epi32(res, 7); + + store_border_bytes(new_rec_data, rest_dstpos, width_rest, last_dword); } } +} + +static void sao_reconstruct_color_avx2(const encoder_control_t *encoder, + const kvz_pixel *rec_data, + kvz_pixel *new_rec_data, + const sao_info_t *sao, + int32_t stride, + int32_t new_stride, + int32_t block_width, + 
int32_t block_height, + color_t color_i) +{ + if (sao->type == SAO_TYPE_BAND) { + reconstruct_color_band (encoder, rec_data, new_rec_data, sao, stride, new_stride, block_width, block_height, color_i); + } else { + reconstruct_color_other(encoder, rec_data, new_rec_data, sao, stride, new_stride, block_width, block_height, color_i); + } +} + +static int32_t sao_band_ddistortion_avx2(const encoder_state_t *state, + const uint8_t *orig_data, + const uint8_t *rec_data, + int32_t block_width, + int32_t block_height, + int32_t band_pos, + const int32_t sao_bands[4]) +{ + const uint32_t bitdepth = 8; + const uint32_t shift = bitdepth - 5; - //Full horizontal sum - v_accum = _mm256_add_epi32(v_accum, _mm256_castsi128_si256(_mm256_extracti128_si256(v_accum, 1))); - v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(1, 0, 3, 2))); - v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(0, 1, 0, 1))); - sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_accum)); + // Clamp band_pos to 32 from above. It'll be subtracted from the shifted + // rec_data values, which in 8-bit depth will always be clamped to [0, 31], + // so if it ever exceeds 32, all the band values will be negative and + // ignored. Ditto for less than -4. + __m128i bp_128 = _mm_cvtsi32_si128 (band_pos); + __m128i hilimit = _mm_cvtsi32_si128 (32); + __m128i lolimit = _mm_cvtsi32_si128 (-4); + + bp_128 = _mm_min_epi8 (bp_128, hilimit); + bp_128 = _mm_max_epi8 (bp_128, lolimit); + + __m256i bp_256 = _mm256_broadcastb_epi8(bp_128); + + __m128i sbs_32 = _mm_loadu_si128((const __m128i *)sao_bands); + __m128i sbs_8 = truncate_epi32_epi8(sbs_32); + __m256i sb_256 = broadcast_xmm2ymm (sbs_8); + + // These should trigger like, never, at least the later condition of block + // not being a multiple of 32 wide. Rather safe than sorry though, huge SAO + // bands are more tricky of these two because the algorithm needs a complete + // reimplementation to work on 16-bit values. + if (epi32v_fits_in_epi8s(sbs_32) != 0xffff) + goto use_generic; + + // If VVC or something will start using SAO on blocks with width a multiple + // of 16, feel free to implement a XMM variant of this algorithm + if ((block_width & 31) != 0) + goto use_generic; + + const __m256i zero = _mm256_setzero_si256(); + const __m256i threes = _mm256_set1_epi8 (3); + + __m256i sum = _mm256_setzero_si256(); + for (uint32_t y = 0; y < block_height; y++) { + for (uint32_t x = 0; x < block_width; x += 32) { + const int32_t curr_pos = y * block_width + x; + + __m256i rd = _mm256_loadu_si256((const __m256i *)( rec_data + curr_pos)); + __m256i orig = _mm256_loadu_si256((const __m256i *)(orig_data + curr_pos)); + + __m256i orig_lo, orig_hi, rd_lo, rd_hi; + cvt_epu8_epi16(orig, &orig_lo, &orig_hi); + cvt_epu8_epi16(rd, &rd_lo, &rd_hi); + + __m256i diff_lo = _mm256_sub_epi16 (orig_lo, rd_lo); + __m256i diff_hi = _mm256_sub_epi16 (orig_hi, rd_hi); + + // The shift will clamp band to 0...31; band_pos on the other + // hand is always between 0...32, so band will be -1...31. Anything + // below zero is ignored, so we can clamp band_pos to 32. 
+ __m256i rd_divd = srli_epi8 (rd, shift); + __m256i band = _mm256_sub_epi8 (rd_divd, bp_256); + + // Force all <0 or >3 bands to 0xff, which will zero the shuffle result + __m256i band_lt_0 = _mm256_cmpgt_epi8 (zero, band); + __m256i band_gt_3 = _mm256_cmpgt_epi8 (band, threes); + __m256i band_inv = _mm256_or_si256 (band_lt_0, band_gt_3); + + band = _mm256_or_si256 (band, band_inv); + + __m256i offsets = _mm256_shuffle_epi8 (sb_256, band); + + __m256i curr_sum = calc_diff_off_delta (diff_lo, diff_hi, offsets, orig); + sum = _mm256_add_epi32 (sum, curr_sum); + } + } + return hsum_8x32b(sum); - return sum; +use_generic: + return sao_band_ddistortion_generic(state, orig_data, rec_data, block_width, + block_height, band_pos, sao_bands); } #endif //COMPILE_INTEL_AVX2
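A standalone scalar sketch of the range check performed by epi32v_fits_in_epi8s above: a 32-bit value fits in a signed byte exactly when its 25 most significant bits all equal the sign bit, which is what the srai-by-7 versus srai-by-31 comparison tests. Illustrative only, not part of the kvazaar sources; it assumes arithmetic right shift for signed values.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Mirrors the _mm_srai_epi32(v, 7) vs. _mm_srai_epi32(v, 31) compare:
 * shifting by 7 keeps the top 25 bits, shifting by 31 keeps only the sign. */
static bool fits_in_int8(int32_t v)
{
  return (v >> 7) == (v >> 31);
}

int main(void)
{
  assert( fits_in_int8(127));
  assert( fits_in_int8(-128));
  assert(!fits_in_int8(128));    /* the case a 24-bit compare would miss */
  assert(!fits_in_int8(-129));
  return 0;
}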
View file
kvazaar-1.3.0.tar.gz/src/strategies/generic/dct-generic.c -> kvazaar-2.0.0.tar.gz/src/strategies/generic/dct-generic.c
Changed
@@ -23,7 +23,7 @@ #include "strategyselector.h" #include "tables.h" -const int16_t kvz_g_dst_4[4][4] = +ALIGNED(32) const int16_t kvz_g_dst_4[4][4] = { { 29, 55, 74, 84 }, { 74, 74, 0, -74 }, @@ -31,7 +31,7 @@ { 55, -84, 74, -29 } }; -const int16_t kvz_g_dct_4[4][4] = +ALIGNED(32) const int16_t kvz_g_dct_4[4][4] = { { 64, 64, 64, 64 }, { 83, 36, -36, -83 }, @@ -39,7 +39,7 @@ { 36, -83, 83, -36 } }; -const int16_t kvz_g_dct_8[8][8] = +ALIGNED(64) const int16_t kvz_g_dct_8[8][8] = { { 64, 64, 64, 64, 64, 64, 64, 64 }, { 89, 75, 50, 18, -18, -50, -75, -89 }, @@ -51,7 +51,7 @@ { 18, -50, 75, -89, 89, -75, 50, -18 } }; -const int16_t kvz_g_dct_16[16][16] = +ALIGNED(64) const int16_t kvz_g_dct_16[16][16] = { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90 }, @@ -71,7 +71,7 @@ { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9 } }; -const int16_t kvz_g_dct_32[32][32] = +ALIGNED(64) const int16_t kvz_g_dct_32[32][32] = { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 }, @@ -107,7 +107,7 @@ { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 } }; -const int16_t kvz_g_dst_4_t[4][4] = +ALIGNED(32) const int16_t kvz_g_dst_4_t[4][4] = { { 29, 74, 84, 55 }, { 55, 74, -29, -84 }, @@ -115,7 +115,7 @@ { 84, -74, 55, -29 } }; -const int16_t kvz_g_dct_4_t[4][4] = +ALIGNED(32) const int16_t kvz_g_dct_4_t[4][4] = { { 64, 83, 64, 36, }, { 64, 36, -64, -83, }, @@ -123,7 +123,7 @@ { 64, -83, 64, -36 } }; -const int16_t kvz_g_dct_8_t[8][8] = +ALIGNED(64) const int16_t kvz_g_dct_8_t[8][8] = { { 64, 89, 83, 75, 64, 50, 36, 18, }, { 64, 75, 36, -18, -64, -89, -83, -50, }, @@ -135,7 +135,7 @@ { 64, -89, 83, -75, 64, -50, 36, -18 } }; -const int16_t kvz_g_dct_16_t[16][16] = +ALIGNED(64) const int16_t kvz_g_dct_16_t[16][16] = { { 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9, }, { 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25, }, @@ -155,7 +155,7 @@ { 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9 } }; -const int16_t kvz_g_dct_32_t[32][32] = +ALIGNED(64) const int16_t kvz_g_dct_32_t[32][32] = { { 64, 90, 90, 90, 89, 88, 87, 85, 83, 82, 80, 78, 75, 73, 70, 67, 64, 61, 57, 54, 50, 46, 43, 38, 36, 31, 25, 22, 18, 13, 9, 4, }, { 64, 90, 87, 82, 75, 67, 57, 46, 36, 22, 9, -4, -18, -31, -43, -54, -64, -73, -80, -85, -89, -90, -90, -88, -83, -78, -70, -61, -50, -38, -25, -13, },
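The only change to these DCT/DST tables is the new ALIGNED(32)/ALIGNED(64) qualifier. kvazaar defines ALIGNED elsewhere in its tree; the sketch below uses a hypothetical MY_ALIGNED macro to illustrate the general idea (placing a 16-entry int16_t row on a 32-byte boundary so it can be fetched with aligned 256-bit loads) and is not kvazaar code.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for an alignment macro; real projects typically
 * map this to __attribute__((aligned(n))) or __declspec(align(n)). */
#if defined(_MSC_VER)
  #define MY_ALIGNED(n) __declspec(align(n))
#else
  #define MY_ALIGNED(n) __attribute__((aligned(n)))
#endif

/* 16 x int16_t = 32 bytes = exactly one YMM register's worth of data. */
MY_ALIGNED(32) static const int16_t row16[16] = {
  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
};

int main(void)
{
  /* Prints 0: the row starts on a 32-byte boundary. */
  printf("%u\n", (unsigned)((uintptr_t)row16 % 32));
  return 0;
}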
View file
kvazaar-1.3.0.tar.gz/src/strategies/generic/encode_coding_tree-generic.c -> kvazaar-2.0.0.tar.gz/src/strategies/generic/encode_coding_tree-generic.c
Changed
@@ -227,16 +227,16 @@ } } if (be_valid && sign_hidden) { - coeff_signs = coeff_signs >> 1; - if (!cabac->only_count) - if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) { - coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero-1); - } + coeff_signs = coeff_signs >> 1; + if (!cabac->only_count) + if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) { + coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero-1); + } CABAC_BINS_EP(cabac, coeff_signs , (num_non_zero - 1), "coeff_sign_flag"); } else { if (!cabac->only_count) - if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) - coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero); + if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) + coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero); CABAC_BINS_EP(cabac, coeff_signs, num_non_zero, "coeff_sign_flag"); } @@ -247,12 +247,12 @@ int32_t base_level = (idx < C1FLAG_NUMBER) ? (2 + first_coeff2) : 1; if (abs_coeff[idx] >= base_level) { - if (!cabac->only_count) { - if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFFS) - kvz_cabac_write_coeff_remain_encry(state, cabac, abs_coeff[idx] - base_level, go_rice_param, base_level); - else - kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param); - } else + if (!cabac->only_count) { + if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFFS) + kvz_cabac_write_coeff_remain_encry(state, cabac, abs_coeff[idx] - base_level, go_rice_param, base_level); + else + kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param); + } else kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param); if (abs_coeff[idx] > 3 * (1 << go_rice_param)) {
View file
kvazaar-1.3.0.tar.gz/src/strategies/generic/intra-generic.c -> kvazaar-2.0.0.tar.gz/src/strategies/generic/intra-generic.c
Changed
@@ -188,12 +188,54 @@ #endif } +/** +* \brief Generage intra DC prediction with post filtering applied. +* \param log2_width Log2 of width, range 2..5. +* \param in_ref_above Pointer to -1 index of above reference, length=width*2+1. +* \param in_ref_left Pointer to -1 index of left reference, length=width*2+1. +* \param dst Buffer of size width*width. +*/ +static void kvz_intra_pred_filtered_dc_generic( + const int_fast8_t log2_width, + const kvz_pixel *const ref_top, + const kvz_pixel *const ref_left, + kvz_pixel *const out_block) +{ + assert(log2_width >= 2 && log2_width <= 5); + + const int_fast8_t width = 1 << log2_width; + + int_fast16_t sum = 0; + for (int_fast8_t i = 0; i < width; ++i) { + sum += ref_top[i + 1]; + sum += ref_left[i + 1]; + } + + const kvz_pixel dc_val = (sum + width) >> (log2_width + 1); + + // Filter top-left with ([1 2 1] / 4) + out_block[0] = (ref_left[1] + 2 * dc_val + ref_top[1] + 2) / 4; + + // Filter rest of the boundary with ([1 3] / 4) + for (int_fast8_t x = 1; x < width; ++x) { + out_block[x] = (ref_top[x + 1] + 3 * dc_val + 2) / 4; + } + for (int_fast8_t y = 1; y < width; ++y) { + out_block[y * width] = (ref_left[y + 1] + 3 * dc_val + 2) / 4; + for (int_fast8_t x = 1; x < width; ++x) { + out_block[y * width + x] = dc_val; + } + } +} + + int kvz_strategy_register_intra_generic(void* opaque, uint8_t bitdepth) { bool success = true; success &= kvz_strategyselector_register(opaque, "angular_pred", "generic", 0, &kvz_angular_pred_generic); success &= kvz_strategyselector_register(opaque, "intra_pred_planar", "generic", 0, &kvz_intra_pred_planar_generic); + success &= kvz_strategyselector_register(opaque, "intra_pred_filtered_dc", "generic", 0, &kvz_intra_pred_filtered_dc_generic); return success; }
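A quick numeric check of the arithmetic in kvz_intra_pred_filtered_dc_generic above, using made-up reference samples for a 4x4 block (illustrative only, not part of the kvazaar sources):

#include <assert.h>

int main(void)
{
  /* Hypothetical reference samples: top[] stands for ref_top[1..4],
   * left[] for ref_left[1..4] of a 4x4 block (log2_width = 2). */
  const int top[4]  = { 60, 62, 64, 66 };
  const int left[4] = { 58, 60, 62, 64 };
  const int width = 4, log2_width = 2;

  int sum = 0;
  for (int i = 0; i < width; i++) sum += top[i] + left[i];   /* 496 */

  const int dc_val = (sum + width) >> (log2_width + 1);      /* 500 >> 3 = 62 */
  assert(dc_val == 62);

  /* Top-left sample: ([1 2 1] / 4) filter over left, DC, top. */
  assert((left[0] + 2 * dc_val + top[0] + 2) / 4 == 61);

  /* Rest of the first row/column: ([1 3] / 4) filter against the DC value. */
  assert((top[1] + 3 * dc_val + 2) / 4 == 62);
  return 0;
}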
View file
kvazaar-1.3.0.tar.gz/src/strategies/generic/picture-generic.c -> kvazaar-2.0.0.tar.gz/src/strategies/generic/picture-generic.c
Changed
@@ -536,54 +536,58 @@ } static void inter_recon_bipred_generic(const int hi_prec_luma_rec0, - const int hi_prec_luma_rec1, - const int hi_prec_chroma_rec0, - const int hi_prec_chroma_rec1, - int32_t height, - int32_t width, - int32_t ypos, - int32_t xpos, - const hi_prec_buf_t*high_precision_rec0, - const hi_prec_buf_t*high_precision_rec1, - lcu_t* lcu, - kvz_pixel* temp_lcu_y, - kvz_pixel* temp_lcu_u, - kvz_pixel* temp_lcu_v) { - - int shift = 15 - KVZ_BIT_DEPTH; - int offset = 1 << (shift - 1); - - int y_in_lcu; - int x_in_lcu; - - //After reconstruction, merge the predictors by taking an average of each pixel - for (int temp_y = 0; temp_y < height; ++temp_y) { - - - for (int temp_x = 0; temp_x < width; ++temp_x) { - y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); - x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); - - int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - - lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); - - if (temp_x < width >> 1 && temp_y < height >> 1) { - - y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); - x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); - - int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); - - int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); - } - } - } + const int hi_prec_luma_rec1, + const int hi_prec_chroma_rec0, + const int hi_prec_chroma_rec1, + int32_t height, + int32_t width, + int32_t ypos, + int32_t xpos, + const hi_prec_buf_t*high_precision_rec0, + const hi_prec_buf_t*high_precision_rec1, + lcu_t* lcu, + kvz_pixel* temp_lcu_y, + kvz_pixel* temp_lcu_u, + kvz_pixel* temp_lcu_v, + bool predict_luma, + bool predict_chroma) { + + int shift = 15 - KVZ_BIT_DEPTH; + int offset = 1 << (shift - 1); + + int y_in_lcu; + int x_in_lcu; + + //After reconstruction, merge the predictors by taking an average of each pixel + for (int temp_y = 0; temp_y < height; ++temp_y) { + + + for (int temp_x = 0; temp_x < width; ++temp_x) { + y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); + x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); + + if (predict_luma) { + int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_y = (hi_prec_luma_rec1 ? 
high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + + lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); + } + + if (predict_chroma && (temp_x < width >> 1 && temp_y < height >> 1)) { + + y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); + x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); + + int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); + + int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); + } + } + } } @@ -671,6 +675,32 @@ return result; } +// Calculate pixel value variance. Takes in arrays of kvz_pixel +static double pixel_var_generic(const kvz_pixel *arr, const uint32_t len) +{ + double var = 0; + double arr_mean = 0; + + // Calculate array mean + int i = 0; + double sum = 0; + + for (; i < len; ++i) { + sum += arr[i]; + } + arr_mean = sum / (double)len; + + // Calculate array variance + for (i = 0; i < len; ++i) { + double tmp = (double)arr[i] - arr_mean; + var += tmp*tmp; + } + + var /= len; + + return var; +} + int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth) { bool success = true; @@ -710,5 +740,7 @@ success &= kvz_strategyselector_register(opaque, "ver_sad", "generic", 0, &ver_sad_generic); success &= kvz_strategyselector_register(opaque, "hor_sad", "generic", 0, &hor_sad_generic); + success &= kvz_strategyselector_register(opaque, "pixel_var", "generic", 0, &pixel_var_generic); + return success; }
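pixel_var_generic above is a plain population variance (the squared deviations are divided by len, not len - 1). A minimal standalone check of the same formula with made-up values, not taken from the kvazaar sources:

#include <assert.h>
#include <stdint.h>

/* Same computation as pixel_var_generic, written for plain uint8_t input. */
static double pixel_var(const uint8_t *arr, uint32_t len)
{
  double sum = 0.0, var = 0.0;
  for (uint32_t i = 0; i < len; i++) sum += arr[i];
  const double mean = sum / len;
  for (uint32_t i = 0; i < len; i++) {
    const double d = arr[i] - mean;
    var += d * d;
  }
  return var / len;
}

int main(void)
{
  const uint8_t px[4] = { 1, 2, 3, 4 };   /* mean 2.5 */
  assert(pixel_var(px, 4) == 1.25);       /* (2.25 + 0.25 + 0.25 + 2.25) / 4 */
  return 0;
}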
View file
kvazaar-1.3.0.tar.gz/src/strategies/generic/quant-generic.c -> kvazaar-2.0.0.tar.gz/src/strategies/generic/quant-generic.c
Changed
@@ -178,6 +178,7 @@ * \param pred_in Predicted pixels. * \param rec_out Reconstructed pixels. * \param coeff_out Coefficients used for reconstruction of rec_out. +* \param early_skip if this is used for early skip, bypass IT and IQ * * \returns Whether coeff_out contains any non-zero coefficients. */ @@ -186,11 +187,12 @@ const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const kvz_pixel *const ref_in, const kvz_pixel *const pred_in, - kvz_pixel *rec_out, coeff_t *coeff_out) + kvz_pixel *rec_out, coeff_t *coeff_out, + bool early_skip) { // Temporary arrays to pass data to and from kvz_quant and transform functions. - int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; - coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; + ALIGNED(64) int16_t residual[TR_MAX_WIDTH * TR_MAX_WIDTH]; + ALIGNED(64) coeff_t coeff[TR_MAX_WIDTH * TR_MAX_WIDTH]; int has_coeffs = 0; @@ -241,7 +243,7 @@ // Do the inverse quantization and transformation and the reconstruction to // rec_out. - if (has_coeffs) { + if (has_coeffs && !early_skip) { int y, x; // Get quantized residual. (coeff_out -> coeff -> residual)
View file
kvazaar-1.3.0.tar.gz/src/strategies/generic/quant-generic.h -> kvazaar-2.0.0.tar.gz/src/strategies/generic/quant-generic.h
Changed
@@ -1,48 +1,49 @@ -#ifndef STRATEGIES_QUANT_GENERIC_H_ -#define STRATEGIES_QUANT_GENERIC_H_ -/***************************************************************************** - * This file is part of Kvazaar HEVC encoder. - * - * Copyright (C) 2013-2015 Tampere University of Technology and others (see - * COPYING file). - * - * Kvazaar is free software: you can redistribute it and/or modify it under - * the terms of the GNU Lesser General Public License as published by the - * Free Software Foundation; either version 2.1 of the License, or (at your - * option) any later version. - * - * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along - * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. - ****************************************************************************/ - -/** - * \ingroup Optimization - * \file - * Generic C implementations of optimized functions. - */ - -#include "cu.h" -#include "encoderstate.h" -#include "global.h" // IWYU pragma: keep -#include "kvazaar.h" -#include "tables.h" - -#define QUANT_SHIFT 14 - -int kvz_strategy_register_quant_generic(void* opaque, uint8_t bitdepth); -void kvz_quant_generic(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width, - int32_t height, int8_t type, int8_t scan_idx, int8_t block_type); - -int kvz_quantize_residual_generic(encoder_state_t *const state, - const cu_info_t *const cur_cu, const int width, const color_t color, - const coeff_scan_order_t scan_order, const int use_trskip, - const int in_stride, const int out_stride, - const kvz_pixel *const ref_in, const kvz_pixel *const pred_in, - kvz_pixel *rec_out, coeff_t *coeff_out); - -#endif //STRATEGIES_QUANT_GENERIC_H_ +#ifndef STRATEGIES_QUANT_GENERIC_H_ +#define STRATEGIES_QUANT_GENERIC_H_ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Generic C implementations of optimized functions. 
+ */ + +#include "cu.h" +#include "encoderstate.h" +#include "global.h" // IWYU pragma: keep +#include "kvazaar.h" +#include "tables.h" + +#define QUANT_SHIFT 14 + +int kvz_strategy_register_quant_generic(void* opaque, uint8_t bitdepth); +void kvz_quant_generic(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width, + int32_t height, int8_t type, int8_t scan_idx, int8_t block_type); + +int kvz_quantize_residual_generic(encoder_state_t *const state, + const cu_info_t *const cur_cu, const int width, const color_t color, + const coeff_scan_order_t scan_order, const int use_trskip, + const int in_stride, const int out_stride, + const kvz_pixel *const ref_in, const kvz_pixel *const pred_in, + kvz_pixel *rec_out, coeff_t *coeff_out, + bool early_skip); + +#endif //STRATEGIES_QUANT_GENERIC_H_
View file
kvazaar-1.3.0.tar.gz/src/strategies/generic/sao-generic.c -> kvazaar-2.0.0.tar.gz/src/strategies/generic/sao-generic.c
Changed
@@ -19,6 +19,7 @@ ****************************************************************************/ #include "strategies/generic/sao-generic.h" +#include "strategies/generic/sao_shared_generics.h" #include "cu.h" #include "encoder.h" @@ -28,51 +29,6 @@ #include "strategyselector.h" -// Mapping of edge_idx values to eo-classes. -static int sao_calc_eo_cat(kvz_pixel a, kvz_pixel b, kvz_pixel c) -{ - // Mapping relationships between a, b and c to eo_idx. - static const int sao_eo_idx_to_eo_category[] = { 1, 2, 0, 3, 4 }; - - int eo_idx = 2 + SIGN3((int)c - (int)a) + SIGN3((int)c - (int)b); - - return sao_eo_idx_to_eo_category[eo_idx]; -} - - -static int sao_edge_ddistortion_generic(const kvz_pixel *orig_data, - const kvz_pixel *rec_data, - int block_width, - int block_height, - int eo_class, - int offsets[NUM_SAO_EDGE_CATEGORIES]) -{ - int y, x; - int sum = 0; - vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0]; - vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1]; - - for (y = 1; y < block_height - 1; ++y) { - for (x = 1; x < block_width - 1; ++x) { - const kvz_pixel *c_data = &rec_data[y * block_width + x]; - kvz_pixel a = c_data[a_ofs.y * block_width + a_ofs.x]; - kvz_pixel c = c_data[0]; - kvz_pixel b = c_data[b_ofs.y * block_width + b_ofs.x]; - - int offset = offsets[sao_calc_eo_cat(a, b, c)]; - - if (offset != 0) { - int diff = orig_data[y * block_width + x] - c; - // Offset is applied to reconstruction, so it is subtracted from diff. - sum += (diff - offset) * (diff - offset) - diff * diff; - } - } - } - - return sum; -} - - /** * \param orig_data Original pixel data. 64x64 for luma, 32x32 for chroma. * \param rec_data Reconstructed pixel data. 64x64 for luma, 32x32 for chroma. @@ -93,6 +49,9 @@ // Don't sample the edge pixels because this function doesn't have access to // their neighbours. + + + for (y = 1; y < block_height - 1; ++y) { for (x = 1; x < block_width - 1; ++x) { const kvz_pixel *c_data = &rec_data[y * block_width + x]; @@ -152,36 +111,6 @@ } -static int sao_band_ddistortion_generic(const encoder_state_t * const state, - const kvz_pixel *orig_data, - const kvz_pixel *rec_data, - int block_width, - int block_height, - int band_pos, - int sao_bands[4]) -{ - int y, x; - int shift = state->encoder_control->bitdepth-5; - int sum = 0; - - for (y = 0; y < block_height; ++y) { - for (x = 0; x < block_width; ++x) { - int band = (rec_data[y * block_width + x] >> shift) - band_pos; - int offset = 0; - if (band >= 0 && band < 4) { - offset = sao_bands[band]; - } - if (offset != 0) { - int diff = orig_data[y * block_width + x] - rec_data[y * block_width + x]; - // Offset is applied to reconstruction, so it is subtracted from diff. - sum += (diff - offset) * (diff - offset) - diff * diff; - } - } - } - - return sum; -} - int kvz_strategy_register_sao_generic(void* opaque, uint8_t bitdepth) {
View file
kvazaar-2.0.0.tar.gz/src/strategies/generic/sao_shared_generics.h
Added
@@ -0,0 +1,97 @@ +#ifndef SAO_BAND_DDISTORTION_H_ +#define SAO_BAND_DDISTORTION_H_ + +// #include "encoder.h" +#include "encoderstate.h" +#include "kvazaar.h" +#include "sao.h" + +// Mapping of edge_idx values to eo-classes. +static int sao_calc_eo_cat(kvz_pixel a, kvz_pixel b, kvz_pixel c) +{ + // Mapping relationships between a, b and c to eo_idx. + static const int sao_eo_idx_to_eo_category[] = { 1, 2, 0, 3, 4 }; + + int eo_idx = 2 + SIGN3((int)c - (int)a) + SIGN3((int)c - (int)b); + + return sao_eo_idx_to_eo_category[eo_idx]; +} + +static int sao_edge_ddistortion_generic(const kvz_pixel *orig_data, + const kvz_pixel *rec_data, + int32_t block_width, + int32_t block_height, + int32_t eo_class, + const int32_t offsets[NUM_SAO_EDGE_CATEGORIES]) +{ + int y, x; + int32_t sum = 0; + vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0]; + vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1]; + + for (y = 1; y < block_height - 1; y++) { + for (x = 1; x < block_width - 1; x++) { + uint32_t c_pos = y * block_width + x; + uint32_t a_pos = (y + a_ofs.y) * block_width + x + a_ofs.x; + uint32_t b_pos = (y + b_ofs.y) * block_width + x + b_ofs.x; + + uint8_t a = rec_data[a_pos]; + uint8_t b = rec_data[b_pos]; + uint8_t c = rec_data[c_pos]; + uint8_t orig = orig_data[c_pos]; + + int32_t eo_cat = sao_calc_eo_cat(a, b, c); + int32_t offset = offsets[eo_cat]; + + if (offset != 0) { + int32_t diff = orig - c; + int32_t delta = diff - offset; + int32_t curr = delta * delta - diff * diff; + + sum += curr; + } + } + } + return sum; +} + +static int sao_band_ddistortion_generic(const encoder_state_t * const state, + const kvz_pixel *orig_data, + const kvz_pixel *rec_data, + int block_width, + int block_height, + int band_pos, + const int sao_bands[4]) +{ + int y, x; + int shift = state->encoder_control->bitdepth-5; + int sum = 0; + for (y = 0; y < block_height; ++y) { + for (x = 0; x < block_width; ++x) { + const int32_t curr_pos = y * block_width + x; + + kvz_pixel rec = rec_data[curr_pos]; + kvz_pixel orig = orig_data[curr_pos]; + + int32_t band = (rec >> shift) - band_pos; + int32_t offset = 0; + if (band >= 0 && band <= 3) { + offset = sao_bands[band]; + } + // Offset is applied to reconstruction, so it is subtracted from diff. + + int32_t diff = orig - rec; + int32_t delta = diff - offset; + + int32_t dmask = (offset == 0) ? -1 : 0; + diff &= ~dmask; + delta &= ~dmask; + + sum += delta * delta - diff * diff; + } + } + + return sum; +} + +#endif
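Both ddistortion routines in this new header accumulate (diff - offset)^2 - diff^2, the change in squared error caused by applying an SAO offset; algebraically that equals offset * (offset - 2 * diff), so the sum goes negative exactly when the offsets move the reconstruction toward the original. A small standalone check (illustrative only, not part of the kvazaar sources):

#include <assert.h>

/* Change in squared error from applying an SAO offset, with diff = orig - rec:
 * new_error^2 - old_error^2. Negative means the offset helps. */
static int delta_dist(int diff, int offset)
{
  return (diff - offset) * (diff - offset) - diff * diff;
}

int main(void)
{
  /* Equivalent closed form: offset * (offset - 2 * diff). */
  for (int diff = -8; diff <= 8; diff++)
    for (int offset = -8; offset <= 8; offset++)
      assert(delta_dist(diff, offset) == offset * (offset - 2 * diff));

  assert(delta_dist( 3, 2) < 0);   /* offset shrinks the error   */
  assert(delta_dist(-3, 2) > 0);   /* offset increases the error */
  return 0;
}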
View file
kvazaar-1.3.0.tar.gz/src/strategies/missing-intel-intrinsics.h -> kvazaar-2.0.0.tar.gz/src/strategies/missing-intel-intrinsics.h
Changed
@@ -18,4 +18,18 @@ #endif // __andn_u32 #endif // _andn_u32 +// Some Visual Studio headers apparently lack these pseudoinstructions +#if COMPILE_INTEL_AVX2 + #ifndef _mm256_bsrli_epi128 + #define _mm256_bsrli_epi128(a, imm8) _mm256_srli_si256((a), (imm8)) + #endif + #ifndef _mm256_insert_epi32 + #define _mm256_insert_epi32(a, i, index) (_mm256_blend_epi32((a), _mm256_set1_epi32(i), (1 << (index)))) + #endif + + #ifndef _mm256_extract_epi32 + #define _mm256_extract_epi32(a, index) (_mm_extract_epi32(_mm256_extracti128_si256((a), (index) >> 2), (index) & 3)) + #endif +#endif + #endif
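The _mm256_insert_epi32 fallback above broadcasts the scalar and blends it into the requested 32-bit lane. A tiny check of that equivalence for lane 7 (assumes an AVX2-capable compiler and CPU, e.g. gcc -mavx2; not part of the kvazaar build):

#include <assert.h>
#include <immintrin.h>
#include <stdint.h>

int main(void)
{
  __m256i v = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);

  /* Same expression the fallback macro expands to for index 7. */
  __m256i r = _mm256_blend_epi32(v, _mm256_set1_epi32(42), 1 << 7);

  int32_t lanes[8];
  _mm256_storeu_si256((__m256i *)lanes, r);

  assert(lanes[7] == 42);                              /* inserted lane    */
  for (int i = 0; i < 7; i++) assert(lanes[i] == i);   /* others untouched */
  return 0;
}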
View file
kvazaar-1.3.0.tar.gz/src/strategies/strategies-intra.c -> kvazaar-2.0.0.tar.gz/src/strategies/strategies-intra.c
Changed
@@ -28,6 +28,7 @@ // Define function pointers. angular_pred_func *kvz_angular_pred; intra_pred_planar_func *kvz_intra_pred_planar; +intra_pred_filtered_dc_func *kvz_intra_pred_filtered_dc; int kvz_strategy_register_intra(void* opaque, uint8_t bitdepth) { bool success = true;
View file
kvazaar-1.3.0.tar.gz/src/strategies/strategies-intra.h -> kvazaar-2.0.0.tar.gz/src/strategies/strategies-intra.h
Changed
@@ -43,9 +43,16 @@ const kvz_pixel *const ref_left, kvz_pixel *const dst); +typedef void (intra_pred_filtered_dc_func)( + const int_fast8_t log2_width, + const kvz_pixel *const ref_top, + const kvz_pixel *const ref_left, + kvz_pixel *const out_block); + // Declare function pointers. extern angular_pred_func * kvz_angular_pred; extern intra_pred_planar_func * kvz_intra_pred_planar; +extern intra_pred_filtered_dc_func * kvz_intra_pred_filtered_dc; int kvz_strategy_register_intra(void* opaque, uint8_t bitdepth); @@ -53,6 +60,7 @@ #define STRATEGIES_INTRA_EXPORTS \ {"angular_pred", (void**) &kvz_angular_pred}, \ {"intra_pred_planar", (void**) &kvz_intra_pred_planar}, \ + {"intra_pred_filtered_dc", (void**) &kvz_intra_pred_filtered_dc}, \
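The new typedef, extern pointer and STRATEGIES_INTRA_EXPORTS entry follow kvazaar's usual strategy pattern: implementations register under a strategy name with a priority, and the selected one ends up behind an exported function pointer. The toy below is a simplified analogue of that idea, not kvazaar's actual strategyselector; the function names and the priority value 40 are made up.

#include <stdio.h>
#include <string.h>

typedef void (pred_func)(const char *what);

static void pred_generic(const char *what) { printf("generic: %s\n", what); }
static void pred_avx2(const char *what)    { printf("avx2: %s\n", what); }

/* One strategy slot: the highest-priority registration wins the pointer. */
static struct { const char *name; int prio; pred_func *fptr; } chosen = {
  "intra_pred_filtered_dc", -1, NULL
};

static void toy_register(const char *name, int prio, pred_func *fptr)
{
  if (strcmp(name, chosen.name) == 0 && prio > chosen.prio) {
    chosen.prio = prio;
    chosen.fptr = fptr;
  }
}

int main(void)
{
  toy_register("intra_pred_filtered_dc", 0,  &pred_generic);  /* generic C   */
  toy_register("intra_pred_filtered_dc", 40, &pred_avx2);     /* SIMD flavor */

  chosen.fptr("4x4 block");   /* prints "avx2: 4x4 block" */
  return 0;
}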
View file
kvazaar-1.3.0.tar.gz/src/strategies/strategies-picture.c -> kvazaar-2.0.0.tar.gz/src/strategies/strategies-picture.c
Changed
@@ -67,6 +67,8 @@ ver_sad_func *kvz_ver_sad = 0; hor_sad_func *kvz_hor_sad = 0; +pixel_var_func *kvz_pixel_var = 0; + int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth) { bool success = true;
View file
kvazaar-1.3.0.tar.gz/src/strategies/strategies-picture.h -> kvazaar-2.0.0.tar.gz/src/strategies/strategies-picture.h
Changed
@@ -27,6 +27,7 @@ */ #include "global.h" // IWYU pragma: keep +#include "inter.h" #include "kvazaar.h" #include "encoderstate.h" #include "strategies/optimized_sad_func_ptr_t.h" @@ -121,21 +122,23 @@ uint32_t ref_stride, uint32_t left, uint32_t right); typedef void (inter_recon_bipred_func)(const int hi_prec_luma_rec0, - const int hi_prec_luma_rec1, - const int hi_prec_chroma_rec0, - const int hi_prec_chroma_rec1, - int height, - int width, - int ypos, - int xpos, - const hi_prec_buf_t*high_precision_rec0, - const hi_prec_buf_t*high_precision_rec1, - lcu_t* lcu, - kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH], - kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C], - kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C]); - - + const int hi_prec_luma_rec1, + const int hi_prec_chroma_rec0, + const int hi_prec_chroma_rec1, + int height, + int width, + int ypos, + int xpos, + const hi_prec_buf_t*high_precision_rec0, + const hi_prec_buf_t*high_precision_rec1, + lcu_t* lcu, + kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH], + kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C], + kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C], + bool predict_luma, + bool predict_chroma); + +typedef double (pixel_var_func)(const kvz_pixel *buf, const uint32_t len); // Declare function pointers. extern reg_sad_func * kvz_reg_sad; @@ -175,6 +178,8 @@ extern ver_sad_func *kvz_ver_sad; extern hor_sad_func *kvz_hor_sad; +extern pixel_var_func *kvz_pixel_var; + int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth); cost_pixel_nxn_func * kvz_pixels_get_satd_func(unsigned n); cost_pixel_nxn_func * kvz_pixels_get_sad_func(unsigned n); @@ -210,6 +215,7 @@ {"get_optimized_sad", (void**) &kvz_get_optimized_sad}, \ {"ver_sad", (void**) &kvz_ver_sad}, \ {"hor_sad", (void**) &kvz_hor_sad}, \ + {"pixel_var", (void**) &kvz_pixel_var}, \
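The new pixel_var_func strategy returns the variance of a pixel buffer, presumably in support of the new --vaq variance adaptive quantization. A straightforward generic routine matching that signature might look like the sketch below (illustrative only; the 8-bit kvz_pixel typedef and the function name are assumptions, not taken from the patch):

/* Sketch of a generic pixel-variance routine with the new signature. */
#include <stdint.h>
#include <stdio.h>

typedef uint8_t kvz_pixel;  /* assumption: 8-bit pixel build */

static double pixel_var_generic(const kvz_pixel *buf, const uint32_t len)
{
  double mean = 0.0, var = 0.0;
  for (uint32_t i = 0; i < len; i++) mean += buf[i];
  mean /= len;
  for (uint32_t i = 0; i < len; i++) {
    double d = buf[i] - mean;
    var += d * d;
  }
  return var / len;
}

int main(void)
{
  const kvz_pixel block[4] = { 10, 10, 20, 20 };
  printf("%f\n", pixel_var_generic(block, 4)); /* prints 25.000000 */
  return 0;
}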
View file
kvazaar-1.3.0.tar.gz/src/strategies/strategies-quant.h -> kvazaar-2.0.0.tar.gz/src/strategies/strategies-quant.h
Changed
@@ -41,7 +41,8 @@ const coeff_scan_order_t scan_order, const int use_trskip, const int in_stride, const int out_stride, const kvz_pixel *const ref_in, const kvz_pixel *const pred_in, - kvz_pixel *rec_out, coeff_t *coeff_out); + kvz_pixel *rec_out, coeff_t *coeff_out, + bool early_skip); typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height, int8_t type, int8_t block_type); typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, int32_t qp);
View file
kvazaar-1.3.0.tar.gz/src/strategies/strategies-sao.h -> kvazaar-2.0.0.tar.gz/src/strategies/strategies-sao.h
Changed
@@ -51,7 +51,7 @@ typedef int (sao_band_ddistortion_func)(const encoder_state_t * const state, const kvz_pixel *orig_data, const kvz_pixel *rec_data, int block_width, int block_height, - int band_pos, int sao_bands[4]); + int band_pos, const int sao_bands[4]); // Declare function pointers. extern sao_edge_ddistortion_func * kvz_sao_edge_ddistortion;
View file
kvazaar-1.3.0.tar.gz/src/strategyselector.c -> kvazaar-2.0.0.tar.gz/src/strategyselector.c
Changed
@@ -103,115 +103,115 @@ //We can free the structure now, as all strategies are statically set to pointers if (strategies.allocated) { - //Also check what optimizations are available and what are in use - //SIMD optimizations available - bool strategies_available = false; - fprintf(stderr, "Available: "); - if (kvz_g_strategies_available.intel_flags.avx != 0){ - fprintf(stderr, "avx(%d) ", kvz_g_strategies_available.intel_flags.avx); - strategies_available = true; - } - if (kvz_g_strategies_available.intel_flags.avx2 != 0){ - fprintf(stderr, "avx2(%d) ", kvz_g_strategies_available.intel_flags.avx2); - strategies_available = true; - } - if (kvz_g_strategies_available.intel_flags.mmx != 0) { - fprintf(stderr, "mmx(%d) ", kvz_g_strategies_available.intel_flags.mmx); - strategies_available = true; - } - if (kvz_g_strategies_available.intel_flags.sse != 0) { - fprintf(stderr, "sse(%d) ", kvz_g_strategies_available.intel_flags.sse); - strategies_available = true; - } - if (kvz_g_strategies_available.intel_flags.sse2 != 0) { - fprintf(stderr, "sse2(%d) ", kvz_g_strategies_available.intel_flags.sse2); - strategies_available = true; - } - if (kvz_g_strategies_available.intel_flags.sse3 != 0) { - fprintf(stderr, "sse3(%d) ", kvz_g_strategies_available.intel_flags.sse3); - strategies_available = true; - } - if (kvz_g_strategies_available.intel_flags.sse41 != 0) { - fprintf(stderr, "sse41(%d) ", kvz_g_strategies_available.intel_flags.sse41); - strategies_available = true; - } - if (kvz_g_strategies_available.intel_flags.sse42 != 0) { - fprintf(stderr, "sse42(%d) ", kvz_g_strategies_available.intel_flags.sse42); - strategies_available = true; - } - if (kvz_g_strategies_available.intel_flags.ssse3 != 0) { - fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_available.intel_flags.ssse3); - strategies_available = true; - } - if (kvz_g_strategies_available.arm_flags.neon != 0) { - fprintf(stderr, "neon(%d) ", kvz_g_strategies_available.arm_flags.neon); - strategies_available = true; - } - if (kvz_g_strategies_available.powerpc_flags.altivec != 0) { - fprintf(stderr, "altivec(%d) ", kvz_g_strategies_available.powerpc_flags.altivec); - strategies_available = true; - } - //If there is no strategies available - if (!strategies_available){ - fprintf(stderr, "no SIMD optimizations"); - } - fprintf(stderr, "\n"); - - //SIMD optimizations in use - bool strategies_in_use = false; - fprintf(stderr, "In use: "); - if (kvz_g_strategies_in_use.intel_flags.avx != 0){ - fprintf(stderr, "avx(%d) ", kvz_g_strategies_in_use.intel_flags.avx); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.intel_flags.avx2 != 0){ - fprintf(stderr, "avx2(%d) ", kvz_g_strategies_in_use.intel_flags.avx2); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.intel_flags.mmx != 0) { - fprintf(stderr, "mmx(%d) ", kvz_g_strategies_in_use.intel_flags.mmx); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.intel_flags.sse != 0) { - fprintf(stderr, "sse(%d) ", kvz_g_strategies_in_use.intel_flags.sse); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.intel_flags.sse2 != 0) { - fprintf(stderr, "sse2(%d) ", kvz_g_strategies_in_use.intel_flags.sse2); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.intel_flags.sse3 != 0) { - fprintf(stderr, "sse3(%d) ", kvz_g_strategies_in_use.intel_flags.sse3); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.intel_flags.sse41 != 0) { - fprintf(stderr, "sse41(%d) ", kvz_g_strategies_in_use.intel_flags.sse41); - strategies_in_use = true; - } - if 
(kvz_g_strategies_in_use.intel_flags.sse42 != 0) { - fprintf(stderr, "sse42(%d) ", kvz_g_strategies_in_use.intel_flags.sse42); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.intel_flags.ssse3 != 0) { - fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_in_use.intel_flags.ssse3); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.arm_flags.neon != 0) { - fprintf(stderr, "neon(%d) ", kvz_g_strategies_in_use.arm_flags.neon); - strategies_in_use = true; - } - if (kvz_g_strategies_in_use.powerpc_flags.altivec != 0) { - fprintf(stderr, "altivec(%d) ", kvz_g_strategies_in_use.powerpc_flags.altivec); - strategies_in_use = true; - } - //If there is no strategies in use - if (!strategies_in_use){ - fprintf(stderr, "no SIMD optimizations"); - } - fprintf(stderr, "\n"); - - //Free memory - free(strategies.strategies); + //Also check what optimizations are available and what are in use + //SIMD optimizations available + bool strategies_available = false; + fprintf(stderr, "Available: "); + if (kvz_g_strategies_available.intel_flags.avx != 0){ + fprintf(stderr, "avx(%d) ", kvz_g_strategies_available.intel_flags.avx); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.avx2 != 0){ + fprintf(stderr, "avx2(%d) ", kvz_g_strategies_available.intel_flags.avx2); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.mmx != 0) { + fprintf(stderr, "mmx(%d) ", kvz_g_strategies_available.intel_flags.mmx); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.sse != 0) { + fprintf(stderr, "sse(%d) ", kvz_g_strategies_available.intel_flags.sse); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.sse2 != 0) { + fprintf(stderr, "sse2(%d) ", kvz_g_strategies_available.intel_flags.sse2); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.sse3 != 0) { + fprintf(stderr, "sse3(%d) ", kvz_g_strategies_available.intel_flags.sse3); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.sse41 != 0) { + fprintf(stderr, "sse41(%d) ", kvz_g_strategies_available.intel_flags.sse41); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.sse42 != 0) { + fprintf(stderr, "sse42(%d) ", kvz_g_strategies_available.intel_flags.sse42); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.ssse3 != 0) { + fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_available.intel_flags.ssse3); + strategies_available = true; + } + if (kvz_g_strategies_available.arm_flags.neon != 0) { + fprintf(stderr, "neon(%d) ", kvz_g_strategies_available.arm_flags.neon); + strategies_available = true; + } + if (kvz_g_strategies_available.powerpc_flags.altivec != 0) { + fprintf(stderr, "altivec(%d) ", kvz_g_strategies_available.powerpc_flags.altivec); + strategies_available = true; + } + //If there is no strategies available + if (!strategies_available){ + fprintf(stderr, "no SIMD optimizations"); + } + fprintf(stderr, "\n"); + + //SIMD optimizations in use + bool strategies_in_use = false; + fprintf(stderr, "In use: "); + if (kvz_g_strategies_in_use.intel_flags.avx != 0){ + fprintf(stderr, "avx(%d) ", kvz_g_strategies_in_use.intel_flags.avx); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.avx2 != 0){ + fprintf(stderr, "avx2(%d) ", kvz_g_strategies_in_use.intel_flags.avx2); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.mmx != 0) { + fprintf(stderr, "mmx(%d) ", 
kvz_g_strategies_in_use.intel_flags.mmx); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.sse != 0) { + fprintf(stderr, "sse(%d) ", kvz_g_strategies_in_use.intel_flags.sse); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.sse2 != 0) { + fprintf(stderr, "sse2(%d) ", kvz_g_strategies_in_use.intel_flags.sse2); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.sse3 != 0) { + fprintf(stderr, "sse3(%d) ", kvz_g_strategies_in_use.intel_flags.sse3); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.sse41 != 0) { + fprintf(stderr, "sse41(%d) ", kvz_g_strategies_in_use.intel_flags.sse41); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.sse42 != 0) { + fprintf(stderr, "sse42(%d) ", kvz_g_strategies_in_use.intel_flags.sse42); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.ssse3 != 0) { + fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_in_use.intel_flags.ssse3); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.arm_flags.neon != 0) { + fprintf(stderr, "neon(%d) ", kvz_g_strategies_in_use.arm_flags.neon); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.powerpc_flags.altivec != 0) { + fprintf(stderr, "altivec(%d) ", kvz_g_strategies_in_use.powerpc_flags.altivec); + strategies_in_use = true; + } + //If there is no strategies in use + if (!strategies_in_use){ + fprintf(stderr, "no SIMD optimizations"); + } + fprintf(stderr, "\n"); + + //Free memory + free(strategies.strategies); } return 1;
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/include/pthread.h -> kvazaar-2.0.0.tar.gz/src/threadwrapper/include/pthread.h
Changed
@@ -23,12 +23,14 @@ typedef void* pthread_cond_t; typedef void* pthread_cond_t; typedef void* pthread_mutex_t; +typedef void* pthread_rwlock_t; typedef void* pthread_t; typedef void*(voidp_voidp_func)(void*); typedef void pthread_attr_t; typedef void pthread_condattr_t; typedef void pthread_mutexattr_t; +typedef void pthread_rwlockattr_t; // Parameter names that have been commented away do nothing, // as they are always null when the functions are used in Kvazaar. @@ -48,6 +50,12 @@ int pthread_mutex_lock(pthread_mutex_t* mutex); int pthread_mutex_unlock(pthread_mutex_t* mutex); +int pthread_rwlock_init(pthread_rwlock_t* lock, const pthread_rwlockattr_t * /*attr*/); +int pthread_rwlock_destroy(pthread_rwlock_t *rwlock); +int pthread_rwlock_rdlock(pthread_rwlock_t *rwlock); +int pthread_rwlock_wrlock(pthread_rwlock_t *rwlock); +int pthread_rwlock_unlock(pthread_rwlock_t *rwlock); + #ifdef __cplusplus } #endif
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/src/pthread.cpp -> kvazaar-2.0.0.tar.gz/src/threadwrapper/src/pthread.cpp
Changed
@@ -17,8 +17,13 @@ #include "pthread.h" #include <condition_variable> #include <mutex> +#include <shared_mutex> #include <thread> +typedef struct { + std::shared_mutex *lock; + bool write_lock; +} rw_lock_internal; int pthread_cond_broadcast(pthread_cond_t* cond) { static_cast<std::condition_variable*>(*cond)->notify_all(); @@ -86,3 +91,43 @@ static_cast<std::mutex*>(*mutex)->unlock(); return 0; } + +int pthread_rwlock_init(pthread_rwlock_t * lock, const pthread_rwlockattr_t *) +{ + *lock = new rw_lock_internal; + static_cast<rw_lock_internal*>(*lock)->lock = new std::shared_mutex; + static_cast<rw_lock_internal*>(*lock)->write_lock = false; + return 0; +} + +int pthread_rwlock_destroy(pthread_rwlock_t* rwlock) +{ + delete static_cast<rw_lock_internal*>(*rwlock)->lock; + delete static_cast<rw_lock_internal*>(*rwlock); + return 0; +} + +int pthread_rwlock_rdlock(pthread_rwlock_t* rwlock) +{ + static_cast<rw_lock_internal*>(*rwlock)->lock->lock_shared(); + return 0; +} + +int pthread_rwlock_wrlock(pthread_rwlock_t* rwlock) +{ + static_cast<rw_lock_internal*>(*rwlock)->lock->lock(); + static_cast<rw_lock_internal*>(*rwlock)->write_lock = true; + return 0; +} + +int pthread_rwlock_unlock(pthread_rwlock_t* rwlock) +{ + if (static_cast<rw_lock_internal*>(*rwlock)->write_lock) { + static_cast<rw_lock_internal*>(*rwlock)->write_lock = false; + static_cast<rw_lock_internal*>(*rwlock)->lock->unlock(); + } + else { + static_cast<rw_lock_internal*>(*rwlock)->lock->unlock_shared(); + } + return 0; +}
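A minimal usage sketch of the rwlock API the wrapper now covers; on POSIX builds these are the ordinary pthread calls (link with -lpthread where required), while the Windows threadwrapper above maps them onto std::shared_mutex:

/* Minimal pthread rwlock usage: one writer section, one reader section. */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t lock;
static int shared_value = 0;

int main(void)
{
  pthread_rwlock_init(&lock, NULL);

  pthread_rwlock_wrlock(&lock);   /* exclusive: blocks all readers and writers */
  shared_value = 42;
  pthread_rwlock_unlock(&lock);

  pthread_rwlock_rdlock(&lock);   /* shared: concurrent readers allowed */
  printf("%d\n", shared_value);
  pthread_rwlock_unlock(&lock);

  pthread_rwlock_destroy(&lock);
  return 0;
}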
View file
kvazaar-1.3.0.tar.gz/src/transform.c -> kvazaar-2.0.0.tar.gz/src/transform.c
Changed
@@ -155,7 +155,8 @@ int32_t j,k; for (j = 0; j < block_size; j++) { for(k = 0; k < block_size; k ++) { - coeff[j * block_size + k] = block[j * block_size + k] << shift; + // Casting back and forth to make UBSan not trigger due to left-shifting negatives + coeff[j * block_size + k] = (int16_t)((uint16_t)(block[j * block_size + k]) << shift); } } } @@ -246,14 +247,14 @@ noskip.has_coeffs = kvz_quantize_residual( state, cur_cu, width, color, scan_order, 0, in_stride, 4, - ref_in, pred_in, noskip.rec, noskip.coeff); + ref_in, pred_in, noskip.rec, noskip.coeff, false); noskip.cost = kvz_pixels_calc_ssd(ref_in, noskip.rec, in_stride, 4, 4); noskip.cost += kvz_get_coeff_cost(state, noskip.coeff, 4, 0, scan_order) * bit_cost; skip.has_coeffs = kvz_quantize_residual( state, cur_cu, width, color, scan_order, 1, in_stride, 4, - ref_in, pred_in, skip.rec, skip.coeff); + ref_in, pred_in, skip.rec, skip.coeff, false); skip.cost = kvz_pixels_calc_ssd(ref_in, skip.rec, in_stride, 4, 4); skip.cost += kvz_get_coeff_cost(state, skip.coeff, 4, 0, scan_order) * bit_cost; @@ -277,6 +278,8 @@ /** * Calculate the residual coefficients for a single TU. + * + * \param early_skip if this is used for early skip, bypass IT and IQ */ static void quantize_tr_residual(encoder_state_t * const state, const color_t color, @@ -284,7 +287,8 @@ const int32_t y, const uint8_t depth, cu_info_t *cur_pu, - lcu_t* lcu) + lcu_t* lcu, + bool early_skip) { const kvz_config *cfg = &state->encoder_control->cfg; const int32_t shift = color == COLOR_Y ? 0 : 1; @@ -397,7 +401,8 @@ ref, pred, pred, - coeff); + coeff, + early_skip); } if (has_coeffs) { @@ -411,9 +416,10 @@ * kvantized residual. Processes the TU tree recursively. * * Inputs are: - * - lcu->rec pixels after prediction for the area - * - lcu->ref reference pixels for the area - * - lcu->cu for the area + * - lcu->rec pixels after prediction for the area + * - lcu->ref reference pixels for the area + * - lcu->cu for the area + * - early_skip if this is used for early skip, bypass IT and IQ * * Outputs are: * - lcu->rec reconstruction after quantized residual @@ -428,7 +434,8 @@ const int32_t y, const uint8_t depth, cu_info_t *cur_pu, - lcu_t* lcu) + lcu_t* lcu, + bool early_skip) { const int32_t width = LCU_WIDTH >> depth; const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) }; @@ -445,16 +452,27 @@ width == 32 || width == 64); + // Reset CBFs because CBFs might have been set + // for depth earlier + if (luma) { + cbf_clear(&cur_pu->cbf, depth, COLOR_Y); + } + if (chroma) { + cbf_clear(&cur_pu->cbf, depth, COLOR_U); + cbf_clear(&cur_pu->cbf, depth, COLOR_V); + } + if (depth == 0 || cur_pu->tr_depth > depth) { + // Split transform and increase depth const int offset = width / 2; const int32_t x2 = x + offset; const int32_t y2 = y + offset; - kvz_quantize_lcu_residual(state, luma, chroma, x, y, depth + 1, NULL, lcu); - kvz_quantize_lcu_residual(state, luma, chroma, x2, y, depth + 1, NULL, lcu); - kvz_quantize_lcu_residual(state, luma, chroma, x, y2, depth + 1, NULL, lcu); - kvz_quantize_lcu_residual(state, luma, chroma, x2, y2, depth + 1, NULL, lcu); + kvz_quantize_lcu_residual(state, luma, chroma, x, y, depth + 1, NULL, lcu, early_skip); + kvz_quantize_lcu_residual(state, luma, chroma, x2, y, depth + 1, NULL, lcu, early_skip); + kvz_quantize_lcu_residual(state, luma, chroma, x, y2, depth + 1, NULL, lcu, early_skip); + kvz_quantize_lcu_residual(state, luma, chroma, x2, y2, depth + 1, NULL, lcu, early_skip); // Propagate coded block flags from child CUs to parent CU. 
uint16_t child_cbfs[3] = { @@ -472,11 +490,11 @@ } else { // Process a leaf TU. if (luma) { - quantize_tr_residual(state, COLOR_Y, x, y, depth, cur_pu, lcu); + quantize_tr_residual(state, COLOR_Y, x, y, depth, cur_pu, lcu, early_skip); } if (chroma) { - quantize_tr_residual(state, COLOR_U, x, y, depth, cur_pu, lcu); - quantize_tr_residual(state, COLOR_V, x, y, depth, cur_pu, lcu); + quantize_tr_residual(state, COLOR_U, x, y, depth, cur_pu, lcu, early_skip); + quantize_tr_residual(state, COLOR_V, x, y, depth, cur_pu, lcu, early_skip); } } }
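The transform.c cast fix above avoids undefined behaviour: left-shifting a negative signed value is UB in C, so the coefficient is shifted in an unsigned type and narrowed back to int16_t. A tiny sketch of the same pattern (illustrative; the helper name is made up):

/* Sketch of the UBSan-safe shift used in the patch: shift as unsigned,
 * then narrow back to int16_t. */
#include <stdint.h>
#include <stdio.h>

static int16_t shift_coeff(int16_t block_val, int shift)
{
  return (int16_t)((uint16_t)block_val << shift);
}

int main(void)
{
  printf("%d\n", shift_coeff(-3, 2)); /* prints -12, i.e. -3 * 4 */
  return 0;
}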
View file
kvazaar-1.3.0.tar.gz/src/transform.h -> kvazaar-2.0.0.tar.gz/src/transform.h
Changed
@@ -60,6 +60,7 @@ int32_t y, uint8_t depth, cu_info_t *cur_cu, - lcu_t* lcu); + lcu_t* lcu, + bool early_skip); #endif
View file
kvazaar-1.3.0.tar.gz/tests/Makefile.am -> kvazaar-2.0.0.tar.gz/tests/Makefile.am
Changed
@@ -11,7 +11,8 @@ test_slices.sh \ test_smp.sh \ test_tools.sh \ - test_weird_shapes.sh + test_weird_shapes.sh \ + test_pu_depth_constraints.sh EXTRA_DIST = \ test_external_symbols.sh \ @@ -26,6 +27,7 @@ test_smp.sh \ test_tools.sh \ test_weird_shapes.sh \ + test_pu_depth_constraints.sh \ util.sh check_PROGRAMS = kvazaar_tests
View file
kvazaar-1.3.0.tar.gz/tests/dct_tests.c -> kvazaar-2.0.0.tar.gz/tests/dct_tests.c
Changed
@@ -146,7 +146,7 @@ if (strcmp(test_env.strategy->type, "fast_forward_dst_4x4") == 0) index = 0; int16_t *buf = dct_bufs[index]; - int16_t test_result[LCU_WIDTH*LCU_WIDTH] = { 0 }; + ALIGNED(32) int16_t test_result[LCU_WIDTH*LCU_WIDTH] = { 0 }; test_env.tested_func(KVZ_BIT_DEPTH, buf, test_result); @@ -163,7 +163,7 @@ if (strcmp(test_env.strategy->type, "fast_inverse_dst_4x4") == 0) index = 0; int16_t *buf = dct_bufs[index]; - int16_t test_result[LCU_WIDTH*LCU_WIDTH] = { 0 }; + ALIGNED(32) int16_t test_result[LCU_WIDTH*LCU_WIDTH] = { 0 }; test_env.tested_func(KVZ_BIT_DEPTH, buf, test_result);
View file
kvazaar-1.3.0.tar.gz/tests/test_gop.sh -> kvazaar-2.0.0.tar.gz/tests/test_gop.sh
Changed
@@ -13,6 +13,10 @@ valgrind_test 264x130 10 $common_args --gop=lp-g4d3t1 -p5 --owf=4 valgrind_test 264x130 10 $common_args --gop=8 -p8 --owf=4 --no-open-gop valgrind_test 264x130 30 $common_args --gop=8 -p16 --owf=16 + +valgrind_test 264x130 10 $common_args --gop=16 -p0 --owf=1 +valgrind_test 264x130 10 $common_args --gop=16 -p0 --owf=4 +valgrind_test 264x130 40 $common_args --gop=16 -p32 --owf=0 # Do more extensive tests in a private gitlab CI runner if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 20 $common_args --gop=8 -p8 --owf=0 --no-open-gop; fi if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 40 $common_args --gop=8 -p32 --owf=4 --no-open-gop; fi
View file
kvazaar-1.3.0.tar.gz/tests/test_interlace.sh -> kvazaar-2.0.0.tar.gz/tests/test_interlace.sh
Changed
@@ -3,4 +3,4 @@ set -eu . "${0%/*}/util.sh" -valgrind_test 264x130 10 --source-scan-type=tff -p0 --preset=ultrafast --threads=2 --owf=1 --wpp +valgrind_test 264x130 10 --source-scan-type=tff -p0 --preset=ultrafast --threads=2 --owf=1 --wpp --gop=0
View file
kvazaar-2.0.0.tar.gz/tests/test_pu_depth_constraints.sh
Added
@@ -0,0 +1,22 @@ +#!/bin/sh + +# Test pu depth constraints. + +set -eu +. "${0%/*}/util.sh" + +common_args='264x130 8 --preset=ultrafast --gop=8' + +# Default +valgrind_test $common_args +valgrind_test $common_args --pu-depth-inter=1-3 +valgrind_test $common_args --pu-depth-intra=1-3 +valgrind_test $common_args --pu-depth-inter=1-3,2-3 +valgrind_test $common_args --pu-depth-intra=1-3,2-3 +valgrind_test $common_args --pu-depth-inter=,1-3,,,2-3,2-2 +valgrind_test $common_args --pu-depth-intra=,1-3,,,2-3,2-2 + +# Test invalid input +encode_test 264x130 1 1 --pu-depth-intra=1-2,,1-3,1-3,,,1-1 +encode_test 264x130 1 1 --pu-depth-inter=1-2,,1-3,1-3,,,1-1 +
View file
kvazaar-1.3.0.tar.gz/tests/test_rate_control.sh -> kvazaar-2.0.0.tar.gz/tests/test_rate_control.sh
Changed
@@ -5,3 +5,8 @@ valgrind_test 264x130 10 --bitrate=500000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3 if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 512x512 30 --bitrate=100000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=2 --pu-depth-inter=1-3 --pu-depth-intra=2-3 --bipred; fi +if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 10 --bitrate=500000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3 --bipred --gop 8 --rc-algorithm oba --no-intra-bits --no-clip-neighbour; fi +if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 10 --bitrate=500000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3 --bipred --gop 8 --rc-algorithm oba --intra-bits --clip-neighbour; fi +if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 10 --bitrate=500000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3 --bipred --gop lp-g8d4t1 --rc-algorithm oba --no-intra-bits --no-clip-neighbour; fi +if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 10 --bitrate=500000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3 --bipred --gop lp-g8d4t1 --rc-algorithm oba --intra-bits --clip-neighbour; fi +
View file
kvazaar-1.3.0.tar.gz/tests/test_tools.sh -> kvazaar-2.0.0.tar.gz/tests/test_tools.sh
Changed
@@ -10,3 +10,6 @@ valgrind_test $common_args --no-rdoq --no-deblock --no-sao --no-signhide --subme=1 --pu-depth-intra=2-3 valgrind_test $common_args --no-rdoq --no-signhide --subme=0 valgrind_test $common_args --rdoq --no-deblock --no-sao --subme=0 +valgrind_test $common_args --vaq=8 +valgrind_test $common_args --vaq=8 --bitrate 3500 +valgrind_test $common_args --vaq=8 --rc-algorithm oba --bitrate 3500
View file
kvazaar-2.0.0.tar.gz/tests/tsan_suppressions.txt
Added
@@ -0,0 +1,3 @@ +race:kvz_eight_tap_filter_hor_8x1_avx2 +race:kvz_filter_hpel_blocks_hor_ver_luma_avx2 +race:kvz_eight_tap_filter_hor_avx2 \ No newline at end of file
View file
kvazaar-1.3.0.tar.gz/tests/util.sh -> kvazaar-2.0.0.tar.gz/tests/util.sh
Changed
@@ -45,7 +45,7 @@ # No quotes for $valgrind because it expands to multiple (or zero) # arguments. print_and_run \ - libtool execute $valgrind \ + ../libtool execute $valgrind \ ../src/kvazaar -i "${yuvfile}" "--input-res=${dimensions}" -o "${hevcfile}" "$@" print_and_run \ @@ -66,7 +66,7 @@ set +e print_and_run \ - libtool execute \ + ../libtool execute \ ../src/kvazaar -i "${yuvfile}" "--input-res=${dimensions}" -o "${hevcfile}" "$@" actual_status="$?" set -e