kvazaar
Changes of Revision 9
kvazaar.changes
Changed
@@ -1,4 +1,108 @@
 -------------------------------------------------------------------
+Tue Oct 4 07:43:42 UTC 2016 - aloisio@gmx.com
+
+- Update to version 1.0.0
+  New Features
+  * --version
+  * --help
+  * --loop-input
+  * --mv-constraint to constrain motion vectors
+  * --tiles=2x2 as an alternative syntax for uniform tiles
+  * --hash=md5
+  * Print information about what SIMD optimizations are in
+    use
+  * --mv=full8 --mv=full16 --mv=full32 --mv=full64
+  * --cu-split-termination=zero/off
+  * --crypto for selective encryption of bitstream (for
+    OpenHEVC)
+  * --me-early-termination=sensitive/on/off for early
+    termination of motion vector search
+  * Added 4x8 SMP and 4x12 AMP motion partitions
+  * --subme=0/1/2/3/4 for control over complexity of
+    fractional pixel motion prediction
+  * --lossless for lossless coding
+  * Monochrome coding
+  * --input-format=420/400
+  * --input-bitdepth=8/10
+  * --tmpv for temporal motion vector predictor
+  * --rdoq-skip for not using rdoq in situations where it's
+    unlikely to improve BDRate
+  * Modified --gop=lp-g4d3r1t1 syntax to not take the
+    reference frames as a parameter; it's now --gop=lp-g4d3t1.
+  * Enable WPP and multithreading by default, with detection
+    of the number of cores
+  * Update all presets to rate-distortion-complexity
+    optimized versions. These are based on a search of all
+    (~ish) possible encoding parameters and bring a huge boost
+    to both speed and BDRate when encoding with the presets (10x
+    speed for veryslow, ~1.1x-4x for others, up to 30% improved
+    BDRate for some presets).
+  * Set default options to match medium with intra period of
+    64, QP 22 and --gop=lp-g4d3t1
+  * --implicit-rdpcm RExt feature
+  Optimizations
+  * AVX2 version for Sample Adaptive Offset (SAO)
+  * Optimized memory copying
+  * AVX2 versions of filters for fractional pixel motion
+    estimation
+  * AVX2 version for half pixel chroma sampling for SMP/AMP
+  * AVX2 versions for calculating two or four SATD values at
+    once for small blocks
+  * Rewrote AVX2 version of fractional pixel motion
+    compensation
+  * Rewrote motion vector cost calculation. It only got
+    slightly faster, but BDRate improved a bunch due to the new
+    implementation being more correct.
+  * Made AVX2 SAD use SSE4.1 for cases where there isn't an
+    AVX2 implementation, speeding up SMP/AMP.
+  Bugfixes
+  * Fixed a bug in rate control where an int overflowed
+    after coding 2^31 bits (2Gb)
+  * Fixed non-determinism in tiles
+  * Fixed chroma reconstruction bug in tiles
+  * Fixed a bug with calculating the number of bits used for
+    intra mode on 4x4 CUs
+  * Stopped checking zero motion vector multiple times in
+    motion compensation
+  * Fixed possible segfault in motion compensation
+  * Fixed a race condition with OWF and SMP/AMP
+  * Passed the time to pthread_cond_timedwait correctly, so
+    the main thread now sleeps instead of busylooping when it
+    has nothing to do
+  * Fixed rate control with lp-gop
+  * Fixed full search not taking the temporal motion vector
+    into account
+  * Allow non-gop-length intra period for lp-gop
+  Code / Building / Testing
+  * Moved SAO to its own file
+  * Removed a ton of unnecessary includes
+  * Updated autotools ax_pthread
+  * Added build test for OS X for Travis
+  * Made tests check for bitstream correctness
+  * Refactored some of the copypasta in motion vector search
+    starting point selection
+  * Refactored the cu_info_t data structures to hold
+    information at a 4x4 resolution needed for AMP and SMP
+  * Changed cu_info_t to use bitfields to negate the effect
+    of increasing the cu_info_t array by a factor of 4
+  * Moved bitstream generation from encoderstate.c to
+    encode_coding_tree.c
+  * Renamed encoder_state_t.global to frame, which makes
+    sense since it holds frame-level data, not global data
+  * Rewrote integer vector inter prediction, because it was
+    so bad
+  * Refactored init_lcu_t
+  * Added more tests for inter SAD
+  * Added speed tests for dual intra SAD functions
+  * Added more realistic speed tests for inter SAD
+  Other
+  * Added a manpage
+  * Added scripts for updating the manpage and README based on
+    --usage.
+  * Added a Dockerfile. Just because.
+  * Added commit date to --version
+
+-------------------------------------------------------------------
 Thu Jan 28 20:07:47 UTC 2016 - aloisio@gmx.com
 
 - Update to version 0.8.3
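Several of the features above are plain command-line switches that combine freely. A hypothetical 1.0.0 invocation (input and output names are placeholders, only echoed here rather than executed) combining uniform 2x2 tiles, the md5 picture hash, and the new lp-gop syntax without the reference-frame parameter might look like:

```shell
# Hypothetical kvazaar 1.0.0 command line; echoed so the sketch is runnable
# without an encoder or input file present.
echo kvazaar -i input_1280x720.yuv --input-res=1280x720 \
  --tiles=2x2 --hash=md5 --gop=lp-g4d3t1 -o out.265
```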
kvazaar.spec
Changed
@@ -18,7 +18,7 @@
 %define libname libkvazaar
 %define libmver 3
 Name:           kvazaar
-Version:        0.8.3
+Version:        1.0.0
 Release:        0
 Summary:        HEVC encoder
 License:        LGPL-2.1
@@ -28,6 +28,7 @@
 BuildRequires:  automake
 BuildRequires:  findutils
 BuildRequires:  gcc >= 4.4
+BuildRequires:  gcc-c++
 BuildRequires:  libtool
 BuildRequires:  make
 BuildRequires:  pkg-config
@@ -76,6 +77,7 @@
 %doc %{_datadir}/doc/%{name}/CREDITS
 %doc %{_datadir}/doc/%{name}/README.md
 %{_bindir}/kvazaar
+%{_mandir}/man1/kvazaar.1.gz
 %files -n %{libname}%{libmver}
 %defattr(-,root,root)
kvazaar-0.8.3.tar.gz/.travis-install.sh -> kvazaar-1.0.0.tar.gz/.travis-install.sh
Changed
@@ -7,4 +7,6 @@
   7z x ffmpeg-release-32bit-static.tar
   chmod +x ./ffmpeg-2.6.3-32bit-static/ffmpeg
   ./ffmpeg-2.6.3-32bit-static/ffmpeg -f lavfi -i "mandelbrot=size=${TEST_DIM}:end_pts=10" -vframes $TEST_FRAMES -pix_fmt yuv420p mandelbrot_${TEST_DIM}.yuv
+  wget http://ultravideo.cs.tut.fi/ubuntu-12.04-hmdec-16.10.tgz
+  tar -xzvf ubuntu-12.04-hmdec-16.10.tgz
 fi
kvazaar-0.8.3.tar.gz/.travis-script.sh -> kvazaar-1.0.0.tar.gz/.travis-script.sh
Changed
@@ -2,13 +2,14 @@
 set -ev
 
 ./autogen.sh
-./configure
+./configure $KVZ_CONFIGURE_ARGS
 make --jobs=2 V=1
 
 if [ -n "$VALGRIND_TEST" ]; then
   libtool execute valgrind --leak-check=full --error-exitcode=1 -- \
-    src/kvazaar -i mandelbrot_${TEST_DIM}.yuv --input-res=${TEST_DIM} -o /dev/null \
-    $VALGRIND_TEST
+    src/kvazaar -i mandelbrot_${TEST_DIM}.yuv --input-res=${TEST_DIM} \
+    -o test.265 $VALGRIND_TEST
+  ./hmdec-16.10 -b test.265
 elif [ -n "$EXPECTED_STATUS" ]; then
   set +e
   libtool execute src/kvazaar $PARAMS
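The `EXPECTED_STATUS` branch above temporarily disables `set -e` so that a kvazaar run which is supposed to fail does not abort the script. The comparison itself is not visible in this hunk, but the general idiom can be sketched as follows, with `false` standing in for the failing encoder invocation (a hypothetical stand-in, not the actual script):

```shell
# Expected-exit-status check: allow one command to fail, capture its status,
# and compare it against the expected value.
EXPECTED_STATUS=1
set +e              # do not abort the script when the command fails
false               # stand-in for a kvazaar run that should be rejected
ACTUAL_STATUS=$?
set -e
if [ "$ACTUAL_STATUS" -eq "$EXPECTED_STATUS" ]; then
  echo 'got expected exit status'
fi
```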
kvazaar-0.8.3.tar.gz/.travis.yml -> kvazaar-1.0.0.tar.gz/.travis.yml
Changed
@@ -2,25 +2,81 @@
 env:
   global:
-    - KVZ_DISABLE_AVX2=1
     - TEST_DIM=264x130
     - TEST_FRAMES=10
 
+# Use container based infrastructure
 sudo: false
 
+# Use this as the global requirements list for valgrind tests, because those are the most numerous.
+addons:
+  apt:
+    sources:
+      - ubuntu-toolchain-r-test
+    packages:
+      - autoconf
+      - libtool
+      - p7zip-full # to uncompress our own ffmpeg binary
+      - valgrind
+      - yasm
+
 matrix:
   fast_finish: true
+  allow_failures:
+    - os: osx # Don't know what's wrong. Something changed in the environment.
   include:
     - compiler: clang
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - autoconf
+            - libtool
+            - yasm
+
+    - compiler: gcc-4.8
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - autoconf
+            - gcc-4.8
+            - libtool
+            - yasm
+
+    # We have some Mac specific code and Mac sometimes has odd build issues.
+    - os: osx
+      compiler: clang # gcc is actually clang on Travis OS X
 
     # Check for external symbols without kvz_ prefix.
     - compiler: gcc-4.8
       script:
         - ./autogen.sh
         - ./configure && make
-        - (! nm -go --defined-only src/.libs/libkvazaar.a | grep -v ' kvz_')
+        - (! nm -go --defined-only src/.libs/libkvazaar.a | grep -v ' kvz_') || (echo 'ERROR Only symbols prefixed with kvz_ should be exported from libkvazaar.'; false)
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - autoconf
+            - gcc-4.8
+            - libtool
+            - yasm
+
+    # Tests trying to use invalid input dimensions
+    - env: EXPECTED_STATUS=1 PARAMS="-i src/kvazaar --input-res=1x65 -o /dev/null"
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - autoconf
+            - libtool
+            - yasm
 
     # These valgrind tests are slow, so they are performed with the minimum
     # number of small frames and fast settings.
@@ -35,8 +91,8 @@
     - env: VALGRIND_TEST="-p4 -r1 --owf=0 --threads=0 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
     - env: VALGRIND_TEST="-p4 -r2 --owf=1 --threads=2 --wpp --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
     - env: VALGRIND_TEST="-p4 -r2 --owf=0 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
-    - env: VALGRIND_TEST="-p4 -r2 --owf=1 --threads=2 --tiles-height-split=u2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
-    - env: VALGRIND_TEST="-p4 -r2 --owf=0 --threads=2 --tiles-height-split=u2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
+    - env: VALGRIND_TEST="-p4 -r2 --owf=1 --threads=2 --tiles-height-split=u2 --no-wpp --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
+    - env: VALGRIND_TEST="-p4 -r2 --owf=0 --threads=2 --tiles-height-split=u2 --no-wpp --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
 
     # Tests for rdoq, sao, deblock and signhide and subme.
     - env: VALGRIND_TEST="-p0 -r1 --threads=2 --wpp --owf=1 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=1 --pu-depth-intra=2-3"
@@ -60,24 +116,6 @@
     - env: TEST_FRAMES=10 VALGRIND_TEST="--gop=8 -p0 --threads=2 --wpp --owf=4 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
     - env: TEST_FRAMES=20 VALGRIND_TEST="--gop=8 -p0 --threads=2 --wpp --owf=0 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3"
 
-    # Tests trying to use invalid input dimensions
-    - env: EXPECTED_STATUS=1 PARAMS="-i src/kvazaar --input-res=1x65 -o /dev/null"
-
-addons:
-  apt:
-    sources:
-      - ubuntu-toolchain-r-test
-
-    packages:
-      - autoconf
-      - gcc
-      - gcc-4.8
-      - libtool
-      - nasm
-      - p7zip-full
-      - valgrind
-      - yasm
-
 install:
   - source .travis-install.sh
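The symbol check added in this file relies on `grep -v` exiting non-zero when every defined symbol carries the `kvz_` prefix; negating it makes the build step pass only in that case. The same idiom, sketched against a fake symbol listing standing in for `nm` output (the file name and symbols are made up for illustration):

```shell
# Fake 'nm -go --defined-only' output: both symbols carry the kvz_ prefix.
printf '0000 T kvz_encode\n0004 T kvz_decode\n' > symbols.txt
# grep -v prints lines WITHOUT ' kvz_'; with none, it exits 1, so the
# negated check succeeds only when every symbol is prefixed.
if ! grep -v ' kvz_' symbols.txt; then
  echo 'all symbols prefixed with kvz_'
fi
```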
kvazaar-1.0.0.tar.gz/Dockerfile
Added
@@ -0,0 +1,42 @@
+# A simple Dockerfile for building Kvazaar from the git repository
+# Example build command when in this directory: docker build -t kvazaar .
+#
+# Example usage
+# Run with an input YUV file and output HEVC binary file
+# docker run -i -a STDIN -a STDOUT kvazaar -i - --input-res=320x240 -o - < testfile_320x240.yuv > out.265
+#
+# Use libav or ffmpeg to input (almost) any format and convert it to YUV420 for kvazaar, audio is disabled
+#
+# RESOLUTION=`avconv -i input.avi 2>&1 | grep Stream | grep -oP ', \K[0-9]+x[0-9]+'`
+# avconv -i input.avi -an -f rawvideo -pix_fmt yuv420p - | docker run -i -a STDIN -a STDOUT kvazaar -i - --wpp --threads=8 --input-res=$RESOLUTION --preset=ultrafast -o - > output.265
+# or
+# RESOLUTION=`ffmpeg -i input.avi 2>&1 | grep Stream | grep -oP ', \K[0-9]+x[0-9]+'`
+# ffmpeg -i input.avi -an -f rawvideo -pix_fmt yuv420p - | docker run -i -a STDIN -a STDOUT kvazaar -i - --wpp --threads=8 --input-res=$RESOLUTION --preset=ultrafast -o - > output.265
+#
+
+# Use Ubuntu 15.10 as a base for now, it's around 136MB
+FROM ubuntu:15.10
+
+MAINTAINER Marko Viitanen <fador@iki.fi>
+
+# List of needed packages to be able to build kvazaar with autotools
+ENV REQUIRED_PACKAGES automake autoconf libtool m4 build-essential git yasm pkgconf
+
+# Run all the commands in one RUN so we don't have any extra history
+# data in the image.
+RUN apt-get update \
+    && apt-get install -y $REQUIRED_PACKAGES \
+    && apt-get clean \
+    && git clone --depth=1 git://github.com/ultravideo/kvazaar.git; \
+    cd kvazaar; \
+    ./autogen.sh; \
+    ./configure --disable-shared; \
+    make; \
+    make install; \
+    AUTOINSTALLED_PACKAGES=`apt-mark showauto`; \
+    apt-get remove --purge --force-yes -y $REQUIRED_PACKAGES $AUTOINSTALLED_PACKAGES; \
+    apt-get clean autoclean; \
+    apt-get autoremove -y; \
+    rm -rf /var/lib/{apt,dpkg,cache,log}/
+ENTRYPOINT ["kvazaar"]
+CMD ["--help"]
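The `RESOLUTION=` lines in the Dockerfile comments use grep's Perl mode (`-P`) with `\K` to keep only the `<width>x<height>` token from the stream description that avconv/ffmpeg print. The same extraction, demonstrated against a canned sample line (the line content is a hypothetical example of such output):

```shell
# Sample stream-description line of the kind avconv/ffmpeg emit on stderr.
LINE='Stream #0:0: Video: h264, yuv420p, 1280x720, 25 fps'
# ', \K' requires the ", " before the token but drops it from the match,
# so only the WxH part is printed. Requires GNU grep (-P).
RESOLUTION=$(printf '%s\n' "$LINE" | grep -oP ', \K[0-9]+x[0-9]+')
echo "$RESOLUTION"   # prints 1280x720
```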
kvazaar-0.8.3.tar.gz/Makefile.am -> kvazaar-1.0.0.tar.gz/Makefile.am
Changed
@@ -2,6 +2,8 @@
 
 SUBDIRS = src tests
 
+dist_man1_MANS = doc/kvazaar.1
+
 dist_doc_DATA = COPYING CREDITS README.md
 
 EXTRA_DIST = \
@@ -10,3 +12,9 @@
 	docs.doxy \
 	greatest \
 	tools
+
+# Run scripts to maintain autogenerated documentation
+# in the version control.
+docs:
+	./tools/genmanpage.sh
+	./tools/update_readme.sh
kvazaar-0.8.3.tar.gz/README.md -> kvazaar-1.0.0.tar.gz/README.md
Changed
@@ -1,5 +1,5 @@
-Kvazaar {#mainpage}
-=========
+Kvazaar
+=======
 
 An open-source HEVC encoder licensed under LGPLv2.1
 
 Join channel #kvazaar_hevc in Freenode IRC network to contact us.
@@ -11,131 +11,157 @@
 [](https://travis-ci.org/ultravideo/kvazaar)
 
-##Using Kvazaar
-
-    Usage:
-    kvazaar -i <input> --input-res <width>x<height> -o <output>
-
-    Optional parameters:
-      -n, --frames <integer>       : Number of frames to code [all]
-      --seek <integer>             : First frame to code [0]
-      --input-res <int>x<int>      : Input resolution (width x height) or
-          auto                     : try to detect from file name [auto]
-      --input-fps <num>/<denom>    : Framerate of the input video [25.0]
-      -q, --qp <integer>           : Quantization Parameter [32]
-      -p, --period <integer>       : Period of intra pictures [0]
-                                       0: only first picture is intra
-                                       1: all pictures are intra
-                                       2-N: every Nth picture is intra
-      --vps-period <integer>       : Specify how often the video parameter set is
-                                     re-sent. [0]
-                                       0: only send VPS with the first frame
-                                       1: send VPS with every intra frame
-                                       N: send VPS with every Nth intra frame
-      -r, --ref <integer>          : Reference frames, range 1..15 [3]
-      --no-deblock                 : Disable deblocking filter
-      --deblock <beta:tc>          : Deblocking filter parameters
-                                     beta and tc range is -6..6 [0:0]
-      --no-sao                     : Disable sample adaptive offset
-      --no-rdoq                    : Disable RDO quantization
-      --no-signhide                : Disable sign hiding in quantization
-      --smp                        : Enable Symmetric Motion Partition
-      --amp                        : Enable Asymmetric Motion Partition
-      --rd <integer>               : Rate-Distortion Optimization level [1]
-                                       0: no RDO
-                                       1: estimated RDO
-                                       2: full RDO
-      --mv-rdo                     : Enable Rate-Distortion Optimized motion vector costs
-      --full-intra-search          : Try all intra modes.
-      --no-transform-skip          : Disable transform skip
-      --aud                        : Use access unit delimiters
-      --cqmfile <string>           : Custom Quantization Matrices from a file
-      --debug <string>             : Output encoders reconstruction.
-      --cpuid <integer>            : Disable runtime cpu optimizations with value 0.
-      --me <string>                : Set integer motion estimation algorithm ["hexbs"]
-                                       "hexbs": Hexagon Based Search (faster)
-                                       "tz": Test Zone Search (better quality)
-                                       "full": Full Search (super slow)
-      --subme <integer>            : Set fractional pixel motion estimation level [1].
-                                       0: only integer motion estimation
-                                       1: fractional pixel motion estimation enabled
-      --source-scan-type <string>  : Set source scan type ["progressive"].
-                                       "progressive": progressive scan
-                                       "tff": top field first
-                                       "bff": bottom field first
-      --pu-depth-inter <int>-<int> : Range for sizes of inter prediction units to try.
-                                       0: 64x64, 1: 32x32, 2: 16x16, 3: 8x8
-      --pu-depth-intra <int>-<int> : Range for sizes of intra prediction units to try.
-                                       0: 64x64, 1: 32x32, 2: 16x16, 3: 8x8, 4: 4x4
-      --no-info                    : Don't add information about the encoder to settings.
-      --gop <string>               : Definition for GOP [0]
-                                       - 0 disabled
-                                       - 8 B-frame pyramid of length 8
-                                       - lp-gop syntax, defined below (example: g8d4r3t2)
-      --bipred                     : Enable bi-prediction search
-      --bitrate <integer>          : Target bitrate. [0]
-                                       0: disable rate-control
-                                       N: target N bits per second
-      --preset <string>            : Use preset. This will override previous options.
-                                     ultrafast, superfast,veryfast, faster,
-                                     fast, medium, slow, slower, veryslow, placebo
-      --no-psnr                    : Don't calculate PSNR for frames
-
-  Video Usability Information:
-      --sar <width:height>         : Specify Sample Aspect Ratio
-      --overscan <string>          : Specify crop overscan setting ["undef"]
-                                       - undef, show, crop
-      --videoformat <string>       : Specify video format ["undef"]
-                                       - component, pal, ntsc, secam, mac, undef
-      --range <string>             : Specify color range ["tv"]
-                                       - tv, pc
-      --colorprim <string>         : Specify color primaries ["undef"]
-                                       - undef, bt709, bt470m, bt470bg,
-                                         smpte170m, smpte240m, film, bt2020
-      --transfer <string>          : Specify transfer characteristics ["undef"]
-                                       - undef, bt709, bt470m, bt470bg,
-                                         smpte170m, smpte240m, linear, log100,
-                                         log316, iec61966-2-4, bt1361e,
-                                         iec61966-2-1, bt2020-10, bt2020-12
-      --colormatrix <string>       : Specify color matrix setting ["undef"]
-                                       - undef, bt709, fcc, bt470bg, smpte170m,
-                                         smpte240m, GBR, YCgCo, bt2020nc, bt2020c
-      --chromaloc <integer>        : Specify chroma sample location (0 to 5) [0]
-
-  Parallel processing:
-      --threads <integer>          : Maximum number of threads to use.
-                                     Disable threads if set to 0.
-
-  Tiles:
-      --tiles-width-split <string>|u<int> :
-                                     Specifies a comma separated list of pixel
-                                     positions of tiles columns separation coordinates.
-                                     Can also be u followed by and a single int n,
-                                     in which case it produces columns of uniform width.
-      --tiles-height-split <string>|u<int> :
-                                     Specifies a comma separated list of pixel
-                                     positions of tiles rows separation coordinates.
-                                     Can also be u followed by and a single int n,
-                                     in which case it produces rows of uniform height.
-
-  Wpp:
-      --wpp                        : Enable wavefront parallel processing
-      --owf <integer>|auto         : Number of parallel frames to process. 0 to disable.
-
-  Slices:
-      --slice-addresses <string>|u<int>:
-                                     Specifies a comma separated list of LCU
-                                     positions in tile scan order of tile separations.
-                                     Can also be u followed by and a single int n,
-                                     in which case it produces uniform slice length.
-
-  Deprecated parameters: (might be removed at some point)
-  Use --input-res:
-      -w, --width                  : Width of input in pixels
-      -h, --height                 : Height of input in pixels
-
-
-###For example:
+## Using Kvazaar
+
+[comment]: # (BEGIN KVAZAAR HELP MESSAGE)
+```
+Usage:
+kvazaar -i <input> --input-res <width>x<height> -o <output>
+
+Optional parameters:
+      --help                       : Print this help message and exit
+      --version                    : Print version information and exit
+  -n, --frames <integer>           : Number of frames to code [all]
+      --seek <integer>             : First frame to code [0]
+      --input-res <int>x<int>      : Input resolution (width x height) or
+          auto                     : try to detect from file name [auto]
+      --input-fps <num>/<denom>    : Framerate of the input video [25.0]
+  -q, --qp <integer>               : Quantization Parameter [32]
+  -p, --period <integer>           : Period of intra pictures [0]
+                                       0: only first picture is intra
+                                       1: all pictures are intra
+                                       2-N: every Nth picture is intra
+      --vps-period <integer>       : Specify how often the video parameter set is
+                                     re-sent. [0]
+                                       0: only send VPS with the first frame
+                                       1: send VPS with every intra frame
+                                       N: send VPS with every Nth intra frame
+  -r, --ref <integer>              : Reference frames, range 1..15 [3]
+      --no-deblock                 : Disable deblocking filter
+      --deblock <beta:tc>          : Deblocking filter parameters
+                                     beta and tc range is -6..6 [0:0]
+      --no-sao                     : Disable sample adaptive offset
+      --no-rdoq                    : Disable RDO quantization
+      --no-signhide                : Disable sign hiding in quantization
+      --smp                        : Enable Symmetric Motion Partition
+      --amp                        : Enable Asymmetric Motion Partition
+      --rd <integer>               : Rate-Distortion Optimization level [1]
+                                       0: no RDO
+                                       1: estimated RDO
+                                       2: full RDO
+      --mv-rdo                     : Enable Rate-Distortion Optimized motion vector costs
+      --full-intra-search          : Try all intra modes.
+      --no-transform-skip          : Disable transform skip
+      --aud                        : Use access unit delimiters
+      --cqmfile <string>           : Custom Quantization Matrices from a file
+      --debug <string>             : Output encoders reconstruction.
+      --cpuid <integer>            : Disable runtime cpu optimizations with value 0.
+      --me <string>                : Set integer motion estimation algorithm ["hexbs"]
+                                       "hexbs": Hexagon Based Search (faster)
+                                       "tz": Test Zone Search (better quality)
+                                       "full": Full Search (super slow)
+      --subme <integer>            : Set fractional pixel motion estimation level [4].
+                                       0: only integer motion estimation
+                                       1: + 1/2-pixel horizontal and vertical
+                                       2: + 1/2-pixel diagonal
+                                       3: + 1/4-pixel horizontal and vertical
+                                       4: + 1/4-pixel diagonal
+      --source-scan-type <string>  : Set source scan type ["progressive"].
+                                       "progressive": progressive scan
+                                       "tff": top field first
+                                       "bff": bottom field first
+      --pu-depth-inter <int>-<int> : Range for sizes of inter prediction units to try.
+                                       0: 64x64, 1: 32x32, 2: 16x16, 3: 8x8
+      --pu-depth-intra <int>-<int> : Range for sizes of intra prediction units to try.
+                                       0: 64x64, 1: 32x32, 2: 16x16, 3: 8x8, 4: 4x4
+      --no-info                    : Don't add information about the encoder to settings.
+      --gop <string>               : Definition of GOP structure [0]
+                                       "0": disabled
+                                       "8": B-frame pyramid of length 8
+                                       "lp-<string>": lp-gop definition (e.g. lp-g8d4r3t2)
+      --bipred                     : Enable bi-prediction search
+      --bitrate <integer>          : Target bitrate. [0]
+                                       0: disable rate-control
+                                       N: target N bits per second
+      --preset <string>            : Use preset. This will override previous options.
+                                     ultrafast, superfast, veryfast, faster,
+                                     fast, medium, slow, slower, veryslow, placebo
+      --no-psnr                    : Don't calculate PSNR for frames
+      --loop-input                 : Re-read input file forever
+      --mv-constraint              : Constrain movement vectors
+                                       "none": no constraint
+                                       "frametile": constrain within the tile
+                                       "frametilemargin": constrain even more
+      --hash                       : Specify which decoded picture hash to use [checksum]
+                                       "none": 0 bytes
+                                       "checksum": 18 bytes
+                                       "md5": 56 bytes
+      --cu-split-termination       : Specify the cu split termination behaviour
+                                       "zero": Terminate when splitting gives little
+                                               improvement.
+                                       "off": Don't terminate splitting early
+      --me-early-termination       : Specify the me early termination behaviour
+                                       "off": Early termination is off
+                                       "on": Early termination is on
+                                       "sensitive": Sensitive early termination is on
+      --lossless                   : Use lossless coding
+      --implicit-rdpcm             : Enable implicit residual DPCM. Currently only supported
+                                     with lossless coding.
+      --no-tmvp                    : Disable Temporal Motion Vector Prediction
+      --rdoq-skip                  : Skips RDOQ for 4x4 blocks
+      --input-format               : P420 or P400
+      --input-bitdepth             : 8-16
+
+  Video Usability Information:
+      --sar <width:height>         : Specify Sample Aspect Ratio
+      --overscan <string>          : Specify crop overscan setting ["undef"]
+                                       - undef, show, crop
+      --videoformat <string>       : Specify video format ["undef"]
+                                       - component, pal, ntsc, secam, mac, undef
+      --range <string>             : Specify color range ["tv"]
+                                       - tv, pc
+      --colorprim <string>         : Specify color primaries ["undef"]
+                                       - undef, bt709, bt470m, bt470bg,
+                                         smpte170m, smpte240m, film, bt2020
+      --transfer <string>          : Specify transfer characteristics ["undef"]
+                                       - undef, bt709, bt470m, bt470bg,
+                                         smpte170m, smpte240m, linear, log100,
+                                         log316, iec61966-2-4, bt1361e,
+                                         iec61966-2-1, bt2020-10, bt2020-12
+      --colormatrix <string>       : Specify color matrix setting ["undef"]
+                                       - undef, bt709, fcc, bt470bg, smpte170m,
+                                         smpte240m, GBR, YCgCo, bt2020nc, bt2020c
+      --chromaloc <integer>        : Specify chroma sample location (0 to 5) [0]
+
+  Parallel processing:
+      --threads <integer>          : Maximum number of threads to use.
+                                     Disable threads if set to 0.
+
+  Tiles:
+      --tiles <int>x<int>          : Split picture into width x height uniform tiles.
+      --tiles-width-split <string>|u<int> :
+                                     Specifies a comma separated list of pixel
+                                     positions of tiles columns separation coordinates.
+                                     Can also be u followed by and a single int n,
+                                     in which case it produces columns of uniform width.
+      --tiles-height-split <string>|u<int> :
+                                     Specifies a comma separated list of pixel
+                                     positions of tiles rows separation coordinates.
+                                     Can also be u followed by and a single int n,
+                                     in which case it produces rows of uniform height.
+
+  Wpp:
+      --wpp                        : Enable wavefront parallel processing
+      --owf <integer>|auto         : Number of parallel frames to process. 0 to disable.
+
+  Deprecated parameters: (might be removed at some point)
+  Use --input-res:
+      -w, --width                  : Width of input in pixels
+      -h, --height                 : Height of input in pixels
+```
+[comment]: # (END KVAZAAR HELP MESSAGE)
+
+### For example:
 
     kvazaar -i BQMall_832x480_60.yuv --input-res 832x480 -o out.hevc -n 600 -q 32
 
@@ -153,27 +179,35 @@
 
-##Presets
-The names of the presets are the same as with x264: ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow and placebo. The effects of the presets are listed in the following table, where the names have been abreviated to fit the layout in GitHub.
-
-                  | 0-uf  | 1-sf  | 2-vf  | 3-fr  | 4-f   | 5-m   | 6-s   | 7-sr  | 8-vs  | 9-p
------------------ | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | -----
-rd                | 0     | 1     | 1     | 1     | 1     | 1     | 2     | 2     | 2     | 3
-pu-depth-intra    | 2-3   | 1-3   | 1-3   | 1-3   | 1-3   | 1-4   | 1-4   | 1-4   | 1-4   | 0-4
-pu-depth-inter    | 1-3   | 1-3   | 0-3   | 0-3   | 0-3   | 0-3   | 0-3   | 0-3   | 0-3   | 0-3
-me                | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | tz    | tz    | tz
-ref               | 1     | 1     | 2     | 2     | 2     | 3     | 3     | 4     | 4     | 6
-deblock           | 0     | 0     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1
-signhide          | 0     | 0     | 0     | 1     | 1     | 1     | 1     | 1     | 1     | 1
-subme             | 0     | 0     | 0     | 0     | 1     | 1     | 1     | 1     | 1     | 1
-sao               | 0     | 0     | 0     | 0     | 0     | 0     | 1     | 1     | 1     | 1
-rdoq              | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1     | 1     | 1
-transform-skip    | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1     | 1
-mv-rdo            | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1     | 1
-full-intra-search | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1
-
-
-##Kvazaar library
+## Presets
+The names of the presets are the same as with x264: ultrafast,
+superfast, veryfast, faster, fast, medium, slow, slower, veryslow and
+placebo. The effects of the presets are listed in the following table,
+where the names have been abbreviated to fit the layout in GitHub.
+
+                     | 0-uf  | 1-sf  | 2-vf  | 3-fr  | 4-f   | 5-m   | 6-s   | 7-sr  | 8-vs  | 9-p
+-------------------- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | -----
+rd                   | 0     | 0     | 0     | 1     | 1     | 1     | 1     | 1     | 1     | 1
+pu-depth-intra       | 2-3   | 2-3   | 2-3   | 2-3   | 2-3   | 1-3   | 1-3   | 1-3   | 1-4   | 1-4
+pu-depth-inter       | 2-3   | 2-3   | 2-3   | 1-3   | 1-3   | 1-3   | 1-3   | 0-3   | 0-3   | 0-3
+me                   | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | tz
+ref                  | 1     | 1     | 1     | 1     | 1     | 1     | 2     | 2     | 3     | 4
+deblock              | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1
+signhide             | 0     | 0     | 0     | 0     | 0     | 0     | 1     | 1     | 1     | 1
+subme                | 0     | 0     | 2     | 2     | 4     | 4     | 4     | 4     | 4     | 4
+sao                  | 0     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1
+rdoq                 | 0     | 0     | 0     | 0     | 0     | 1     | 1     | 1     | 1     | 1
+rdoq-skip            | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 1     | 0
+transform-skip       | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1
+mv-rdo               | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1
+full-intra-search    | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0
+smp                  | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1
+amp                  | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 0     | 1
+cu-split-termination | zero  | zero  | zero  | zero  | zero  | zero  | zero  | zero  | zero  | off
+me-early-termination | sens. | sens. | sens. | sens. | on    | on    | on    | on    | on    | off
+
+
+## Kvazaar library
 
 See [kvazaar.h](src/kvazaar.h) for the library API and its
 documentation.
 
@@ -183,7 +217,7 @@
 
 The needed linker and compiler flags can be obtained with pkg-config.
 
-##Compiling Kvazaar
+## Compiling Kvazaar
 
 If you have trouble regarding compiling the source code, please make an
 [issue](https://github.com/ultravideo/kvazaar/issues) about in Github.
@@ -191,7 +225,7 @@
 improve in the build process. We want to make this as simple as
 possible.
 
-###Required libraries
+### Required libraries
 
 - For Visual Studio, the pthreads-w32 library is required. Platforms
   with native POSIX thread support don't need anything.
 - The project file expects the library to be in ../pthreads.2/
@@ -200,7 +234,11 @@
 - The executable needs pthreadVC2.dll to be present.
   Either install it somewhere or ship it with the executable.
 
-###Autotools
+### Autotools
+
+Depending on the platform, some additional tools are required for compiling Kvazaar with autotools.
+For Ubuntu, the required packages are `automake autoconf libtool m4 build-essential yasm`.
+
 Run the following commands to compile and install Kvazaar.
 
     ./autogen.sh
@@ -210,11 +248,11 @@
 
 See `./configure --help` for more options.
 
-###OS X
+### OS X
 
 - The program should compile and work on OS X but you might need a
   newer version of GCC than what comes with the platform.
 
-###Visual Studio
+### Visual Studio
 
 - VS2010 and older do not have support for some of the C99 features
   that we use. Please use VS2013 or newer or GCC (MinGW) to compile on
   Windows.
@@ -225,21 +263,41 @@
   user variables
 - Building the Kvazaar library is not yet supported.
 
+### Docker
+
+This project includes a [Dockerfile](./Dockerfile), which enables building for Docker. Kvazaar is also available in the Docker Hub [`ultravideo/kvazaar`](https://hub.docker.com/r/ultravideo/kvazaar/)
+Build using Docker: `docker build -t kvazaar .`
+Example usage: `docker run -i -a STDIN -a STDOUT kvazaar -i - --input-res=320x240 -o - < testfile_320x240.yuv > out.265`
+For other examples, see [Dockerfile](./Dockerfile)
+
+### Visualization (Windows only)
+
+Branch `visualizer` has a visual studio project, which can be compiled to enable visualization feature in Kvazaar.
+
+Additional Requirements: [`SDL2`](https://www.libsdl.org/download-2.0.php), [`SDL2-ttf`](https://www.libsdl.org/projects/SDL_ttf/).
+
+Directory `visualizer_extras` is expected to be found from the same directory level as the kvazaar project directory. Inside should be directories `include` and `lib` found from the development library zip packages.
+
+`SDL2.dll`, `SDL2_ttf.dll`, `libfreetype-6.dll`, `zlib1.dll`, and `pthreadVC2.dll` should be placed in the working directory (i.e. the folder the `kvazaar.exe` is in after compiling the `kvazaar_cli` project/solution) when running the visualizer. The required `.dll` can be found in the aforementioned `lib`-folder (`lib\x64`) and the dll folder inside the pthreads folder (see `Required libraries`).
+
+Note: The solution should be compiled on the x64 platform in visual studio.
+
+Optional font file `arial.ttf` is to be placed in the working directory, if block info tool is used.
 
-##Contributing to Kvazaar
+## Contributing to Kvazaar
 
 See http://github.com/ultravideo/kvazaar/wiki/List-of-suggested-topics
 for a list of topics you might want to examine if you would like to do
 something bigger than a bug fix but don't know what yet.
 
-###Code documentation
+### Code documentation
 
 You can generate Doxygen documentation pages by running the command
-"doxygen docs.doxy". Here is a rough schetch of the module structure:
+"doxygen docs.doxy". Here is a rough sketch of the module structure:
 
 
-###For version control we try to follow these conventions:
+### For version control we try to follow these conventions:
 
 - Master branch always produces a working bitstream (can be decoded
   with HM).
@@ -251,11 +309,11 @@
   nicely.
 - Every commit should at least compile. Producing a working bitstream
   is nice as well, but not always possible. Features may be temporarily
-  disabled to produce a working bitstream, but remember to re-enbable
+  disabled to produce a working bitstream, but remember to re-enable
   them before merging to master.
 
-###Testing
+### Testing
 
 - We do not have a proper testing framework yet. We test mainly by
   decoding the bitstream with HM and checking that the result matches
@@ -268,7 +326,7 @@
   bitstream is valid. As of yet there is no such suite.
 
-###Unit tests
+### Unit tests
 
 - There are some unit tests located in the tests directory. We would
   like to have more.
 - The Visual Studio project links the unit tests against the actual .lib
@@ -281,7 +339,7 @@
 
     git submodule update
 
-###Code style
+### Code style
 
 We try to follow the following conventions:
 - C99 without features not supported by Visual Studio 2013 (VLAs).
@@ -297,7 +355,7 @@
 necessary.
 
-###Resources for HEVC bitstream features
+### Resources for HEVC bitstream features
 
 - A good first resource for HEVC bitstream is JCTVC-N1002 High
   Efficiency Video Coding (HEVC) Test Model 12 (HM12) Encoder
kvazaar-0.8.3.tar.gz/build/C_Properties.props -> kvazaar-1.0.0.tar.gz/build/C_Properties.props
Changed
@@ -14,7 +14,7 @@ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> <PreprocessorDefinitions>KVZ_DLL_EXPORTS;KVZ_COMPILE_ASM;WIN32_LEAN_AND_MEAN;WIN32;WIN64;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions> <AdditionalIncludeDirectories>$(SolutionDir)..\..\pthreads.2\include;$(SolutionDir)..\src;$(SolutionDir)..\src\extras;$(SolutionDir)..\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> - <DisableSpecificWarnings>4244;4204;4206;4028;4152;4996;4018;4456;4389;4100;4131;4459</DisableSpecificWarnings> + <DisableSpecificWarnings>4244;4204;4206;4028;4152;4996;4018;4456;4389;4100;4131;4459;4706;4214;4127;4201</DisableSpecificWarnings> <OpenMPSupport>false</OpenMPSupport> <TreatSpecificWarningsAsErrors>4013;4029;4047;4716;4700;4020;4021;4133</TreatSpecificWarningsAsErrors> </ClCompile>
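The hunk above adds warnings 4706, 4214, 4127 and 4201 to `DisableSpecificWarnings`. C4706 ("assignment within conditional expression") in particular fires on idiomatic, correct C, which is presumably why the project silences it rather than rewriting every call site. A minimal sketch of the pattern (the `dup_string` helper is illustrative, not from the kvazaar sources):

```c
#include <assert.h>
#include <stdlib.h>
#include <string.h>

/* Returns 0 on success, 1 on allocation failure. The assignment inside the
 * `if` condition is exactly what MSVC flags as C4706 under /W4, even though
 * it is a common and correct C idiom. */
int dup_string(const char *src, char **out)
{
    char *buf;
    if ((buf = malloc(strlen(src) + 1)) == NULL)  /* C4706 would fire here */
        return 1;
    strcpy(buf, src);
    *out = buf;
    return 0;
}
```

The other newly disabled warnings are similar style nits: C4127 (constant conditional, e.g. `while (1)`), C4201 (nameless struct/union) and C4214 (non-int bitfields, used by the reworked `cu_info_t`).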
kvazaar-0.8.3.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj -> kvazaar-1.0.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj
Changed
@@ -125,6 +125,8 @@ </ClCompile> </ItemDefinitionGroup> <ItemGroup> + <ClCompile Include="..\..\src\extras\crypto.cpp" /> + <ClCompile Include="..\..\src\extras\libmd5.c" /> <ClCompile Include="..\..\src\input_frame_buffer.c" /> <ClCompile Include="..\..\src\kvazaar.c" /> <ClCompile Include="..\..\src\bitstream.c" /> @@ -137,6 +139,7 @@ <ClCompile Include="..\..\src\encoder_state-bitstream.c" /> <ClCompile Include="..\..\src\encoder_state-ctors_dtors.c" /> <ClCompile Include="..\..\src\encoder_state-geometry.c" /> + <ClCompile Include="..\..\src\encode_coding_tree.c" /> <ClCompile Include="..\..\src\extras\getopt.c" /> <ClCompile Include="..\..\src\filter.c" /> <ClCompile Include="..\..\src\image.c" /> @@ -163,12 +166,21 @@ <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> </ClCompile> + <ClCompile Include="..\..\src\strategies\avx2\sao-avx2.c"> + <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> + <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> + <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> + <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> + </ClCompile> <ClCompile Include="..\..\src\strategies\generic\intra-generic.c" /> <ClCompile Include="..\..\src\strategies\generic\quant-generic.c" /> + <ClCompile Include="..\..\src\strategies\generic\sao-generic.c" /> <ClCompile Include="..\..\src\strategies\strategies-intra.c" /> <ClCompile 
Include="..\..\src\strategies\strategies-quant.c" /> <ClInclude Include="..\..\src\checkpoint.h" /> <ClInclude Include="..\..\src\cu.h" /> + <ClInclude Include="..\..\src\extras\crypto.h" /> + <ClInclude Include="..\..\src\extras\libmd5.h" /> <ClInclude Include="..\..\src\image.h" /> <ClInclude Include="..\..\src\imagelist.h" /> <ClCompile Include="..\..\src\strategies\altivec\picture-altivec.c" /> @@ -200,21 +212,26 @@ <ClCompile Include="..\..\src\strategies\strategies-ipol.c" /> <ClCompile Include="..\..\src\strategies\strategies-nal.c" /> <ClCompile Include="..\..\src\strategies\strategies-picture.c" /> + <ClCompile Include="..\..\src\strategies\strategies-sao.c" /> <ClCompile Include="..\..\src\strategies\x86_asm\picture-x86-asm.c" /> <ClCompile Include="..\..\src\videoframe.c" /> <ClInclude Include="..\..\src\encoder_state-bitstream.h" /> <ClInclude Include="..\..\src\encoder_state-ctors_dtors.h" /> <ClInclude Include="..\..\src\encoder_state-geometry.h" /> + <ClInclude Include="..\..\src\encode_coding_tree.h" /> <ClCompile Include="..\..\src\strategyselector.c" /> <ClCompile Include="..\..\src\tables.c" /> <ClCompile Include="..\..\src\threadqueue.c" /> <ClCompile Include="..\..\src\transform.c" /> <ClInclude Include="..\..\src\input_frame_buffer.h" /> <ClInclude Include="..\..\src\kvazaar_internal.h" /> + <ClInclude Include="..\..\src\kvz_math.h" /> <ClInclude Include="..\..\src\search_inter.h" /> <ClInclude Include="..\..\src\search_intra.h" /> <ClInclude Include="..\..\src\strategies\avx2\intra-avx2.h" /> + <ClInclude Include="..\..\src\strategies\avx2\sao-avx2.h" /> <ClInclude Include="..\..\src\strategies\generic\intra-generic.h" /> + <ClInclude Include="..\..\src\strategies\generic\sao-generic.h" /> <ClInclude Include="..\..\src\strategies\strategies-common.h" /> <ClInclude Include="..\..\src\strategies\avx2\quant-avx2.h" /> <ClInclude Include="..\..\src\strategies\generic\quant-generic.h" /> @@ -254,6 +271,7 @@ <ClInclude 
Include="..\..\src\strategies\strategies-ipol.h" /> <ClInclude Include="..\..\src\strategies\strategies-nal.h" /> <ClInclude Include="..\..\src\strategies\strategies-picture.h" /> + <ClInclude Include="..\..\src\strategies\strategies-sao.h" /> <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm-sad.h" /> <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm-satd.h" /> <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm.h" />
kvazaar-0.8.3.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters -> kvazaar-1.0.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
Changed
@@ -207,6 +207,20 @@ <ClCompile Include="..\..\src\encoder_state-bitstream.c"> <Filter>Bitstream</Filter> </ClCompile> + <ClCompile Include="..\..\src\encode_coding_tree.c"> + <Filter>Bitstream</Filter> + </ClCompile> + <ClCompile Include="..\..\src\strategies\strategies-sao.c"> + <Filter>Optimization\strategies</Filter> + </ClCompile> + <ClCompile Include="..\..\src\strategies\generic\sao-generic.c"> + <Filter>Optimization\strategies\generic</Filter> + </ClCompile> + <ClCompile Include="..\..\src\strategies\avx2\sao-avx2.c"> + <Filter>Optimization\strategies\avx2</Filter> + </ClCompile> + <ClCompile Include="..\..\src\extras\libmd5.c" /> + <ClCompile Include="..\..\src\extras\crypto.cpp" /> </ItemGroup> <ItemGroup> <ClInclude Include="..\..\src\bitstream.h"> @@ -382,6 +396,21 @@ <ClInclude Include="..\..\src\encoder_state-bitstream.h"> <Filter>Bitstream</Filter> </ClInclude> + <ClInclude Include="..\..\src\encode_coding_tree.h"> + <Filter>Bitstream</Filter> + </ClInclude> + <ClInclude Include="..\..\src\kvz_math.h" /> + <ClInclude Include="..\..\src\strategies\strategies-sao.h"> + <Filter>Optimization\strategies</Filter> + </ClInclude> + <ClInclude Include="..\..\src\strategies\generic\sao-generic.h"> + <Filter>Optimization\strategies\generic</Filter> + </ClInclude> + <ClInclude Include="..\..\src\strategies\avx2\sao-avx2.h"> + <Filter>Optimization\strategies\avx2</Filter> + </ClInclude> + <ClInclude Include="..\..\src\extras\libmd5.h" /> + <ClInclude Include="..\..\src\extras\crypto.h" /> </ItemGroup> <ItemGroup> <YASM Include="..\..\src\extras\x86inc.asm">
kvazaar-0.8.3.tar.gz/configure.ac -> kvazaar-1.0.0.tar.gz/configure.ac
Changed
@@ -23,10 +23,10 @@ # # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html ver_major=3 -ver_minor=2 +ver_minor=13 ver_release=0 -# not used, but it prevents configure from adding a lot of defines to the CFLAGS +# Prevents configure from adding a lot of defines to the CFLAGS AC_CONFIG_HEADERS([config.h]) AC_CONFIG_MACRO_DIR([m4]) @@ -38,40 +38,59 @@ AC_PROG_CC AC_PROG_CC_C99 AM_PROG_AR +AC_PROG_CXX # Get fread that can read more than 2GB on 32 bit systems. AC_SYS_LARGEFILE LT_INIT([win32-dll]) -AX_CHECK_COMPILE_FLAG(-mavx2, [flag_avx2="true"]) -AX_CHECK_COMPILE_FLAG(-msse4.1, [flag_sse4_1="true"]) -AX_CHECK_COMPILE_FLAG(-msse2, [flag_sse2="true"]) +AX_CHECK_COMPILE_FLAG([-mavx2], [flag_avx2="true"]) +AX_CHECK_COMPILE_FLAG([-msse4.1], [flag_sse4_1="true"]) +AX_CHECK_COMPILE_FLAG([-msse2], [flag_sse2="true"]) AM_CONDITIONAL([HAVE_AVX2], [test x"$flag_avx2" = x"true"]) AM_CONDITIONAL([HAVE_SSE4_1], [test x"$flag_sse4_1" = x"true"]) AM_CONDITIONAL([HAVE_SSE2], [test x"$flag_sse2" = x"true"]) -AX_PTHREAD -CFLAGS="-Wall -Wtype-limits -Wvla -I$srcdir/src -ftree-vectorize -fvisibility=hidden $PTHREAD_CFLAGS $CFLAGS" +KVZ_CFLAGS="-Wall -Wtype-limits -Wvla -I$srcdir/src -I$srcdir/src/extras -ftree-vectorize -fvisibility=hidden" +CFLAGS="$KVZ_CFLAGS $CFLAGS" + +AC_ARG_WITH([cryptopp], + AS_HELP_STRING([--with-cryptopp], + [Build with cryptopp Enables selective encryption.])) +AS_IF([test "x$with_cryptopp" = "xyes"], [ + PKG_CHECK_MODULES([cryptopp], [cryptopp], + [AC_DEFINE([KVZ_SEL_ENCRYPTION], [1], [With cryptopp])], + [AC_MSG_ERROR([cryptopp not found with pkg-config])] + ) +]) + +AM_CONDITIONAL([USE_CRYPTOPP], [test "x$with_cryptopp" = "xyes"]) +CPPFLAGS="$CPPFLAGS $cryptopp_CFLAGS" +LIBS="$LIBS $cryptopp_LIBS" + + CPPFLAGS="-DKVZ_DLL_EXPORTS $CPPFLAGS" AC_SEARCH_LIBS([log], [m c], [], [exit 1]) AC_SEARCH_LIBS([pow], [m c], [], [exit 1]) AC_SEARCH_LIBS([sqrt], [m c], [], [exit 1]) -LIBS="$PTHREAD_LIBS $LIBS" + +# This does workarounds 
for pthreads on various compilers. +AX_PTHREAD +CFLAGS="$PTHREAD_CFLAGS $CFLAGS" +LIBS="$PTHREAD_LIBS $LIBS" +CC="$PTHREAD_CC" + # --disable-werror AC_ARG_ENABLE([werror], [AS_HELP_STRING([--disable-werror], [don't treat warnings as errors [no]])], [], [CFLAGS="-Werror $CFLAGS"] ) -# check for getopt -AC_CHECK_HEADER([getopt.h], [], [CFLAGS="$CFLAGS -I$srcdir/src/extras"]) - - # host and cpu specific settings AS_CASE([$host_cpu], [i?86], [BITS="32" ASFLAGS="$ASFLAGS -DARCH_X86_64=0" X86="true"], @@ -129,10 +148,8 @@ KVZ_API_VERSION="$ver_major:$ver_minor:$ver_release" AC_SUBST([KVZ_API_VERSION]) - AC_CONFIG_FILES([Makefile src/Makefile src/kvazaar.pc tests/Makefile]) AC_OUTPUT -
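The new `AC_ARG_WITH([cryptopp], ...)` block above makes selective encryption opt-in and hard-fails if pkg-config cannot locate cryptopp. A typical build-configuration sequence might therefore look like this (a sketch; it assumes the source tree provides the usual autotools bootstrap script and that a cryptopp `.pc` file is on the pkg-config search path):

```sh
./autogen.sh                 # only needed for a git checkout, if the script is shipped
./configure --with-cryptopp  # defines KVZ_SEL_ENCRYPTION; errors out if cryptopp is missing
make
```

Without `--with-cryptopp`, configure skips the pkg-config check entirely and the `--crypto` encoder option is compiled out.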
kvazaar-1.0.0.tar.gz/doc/kvazaar.1
Added
@@ -0,0 +1,262 @@ +.TH KVAZAAR "1" "October 2016" "kvazaar v0.8.3" "User Commands" +.SH NAME +kvazaar \- open source HEVC encoder +.SH SYNOPSIS +\fBkvazaar \fR\-i <input> \-\-input\-res <width>x<height> \-o <output> +.SH DESCRIPTION +.TP +\fB\-\-help +Print this help message and exit +.TP +\fB\-\-version +Print version information and exit +.TP +\fB\-n\fR, \fB\-\-frames <integer> +Number of frames to code [all] +.TP +\fB\-\-seek <integer> +First frame to code [0] +.TP +\fB\-\-input\-res <int>x<int> +Input resolution (width x height) or +auto +try to detect from file name [auto] +.TP +\fB\-\-input\-fps <num>/<denom> +Framerate of the input video [25.0] +.TP +\fB\-q\fR, \fB\-\-qp <integer> +Quantization Parameter [32] +.TP +\fB\-p\fR, \fB\-\-period <integer> +Period of intra pictures [0] + 0: only first picture is intra + 1: all pictures are intra + 2\-N: every Nth picture is intra +.TP +\fB\-\-vps\-period <integer> +Specify how often the video parameter set is +re\-sent. [0] + 0: only send VPS with the first frame + 1: send VPS with every intra frame + N: send VPS with every Nth intra frame +.TP +\fB\-r\fR, \fB\-\-ref <integer> +Reference frames, range 1..15 [3] +.TP +\fB\-\-no\-deblock +Disable deblocking filter +.TP +\fB\-\-deblock <beta:tc> +Deblocking filter parameters +beta and tc range is \-6..6 [0:0] +.TP +\fB\-\-no\-sao +Disable sample adaptive offset +.TP +\fB\-\-no\-rdoq +Disable RDO quantization +.TP +\fB\-\-no\-signhide +Disable sign hiding in quantization +.TP +\fB\-\-smp +Enable Symmetric Motion Partition +.TP +\fB\-\-amp +Enable Asymmetric Motion Partition +.TP +\fB\-\-rd <integer> +Rate\-Distortion Optimization level [1] + 0: no RDO + 1: estimated RDO + 2: full RDO +.TP +\fB\-\-mv\-rdo +Enable Rate\-Distortion Optimized motion vector costs +.TP +\fB\-\-full\-intra\-search +Try all intra modes. 
+.TP +\fB\-\-no\-transform\-skip +Disable transform skip +.TP +\fB\-\-aud +Use access unit delimiters +.TP +\fB\-\-cqmfile <string> +Custom Quantization Matrices from a file +.TP +\fB\-\-debug <string> +Output encoders reconstruction. +.TP +\fB\-\-cpuid <integer> +Disable runtime cpu optimizations with value 0. +.TP +\fB\-\-me <string> +Set integer motion estimation algorithm ["hexbs"] + "hexbs": Hexagon Based Search (faster) + "tz": Test Zone Search (better quality) + "full": Full Search (super slow) +.TP +\fB\-\-subme <integer> +Set fractional pixel motion estimation level [4]. + 0: only integer motion estimation + 1: + 1/2\-pixel horizontal and vertical + 2: + 1/2\-pixel diagonal + 3: + 1/4\-pixel horizontal and vertical + 4: + 1/4\-pixel diagonal +.TP +\fB\-\-source\-scan\-type <string> +Set source scan type ["progressive"]. + "progressive": progressive scan + "tff": top field first + "bff": bottom field first +.TP +\fB\-\-pu\-depth\-inter <int>\-<int> +Range for sizes of inter prediction units to try. + 0: 64x64, 1: 32x32, 2: 16x16, 3: 8x8 +.TP +\fB\-\-pu\-depth\-intra <int>\-<int> +Range for sizes of intra prediction units to try. + 0: 64x64, 1: 32x32, 2: 16x16, 3: 8x8, 4: 4x4 +.TP +\fB\-\-no\-info +Don't add information about the encoder to settings. +.TP +\fB\-\-gop <string> +Definition of GOP structure [0] + "0": disabled + "8": B\-frame pyramid of length 8 + "lp\-<string>": lp\-gop definition (e.g. lp\-g8d4r3t2) +.TP +\fB\-\-bipred +Enable bi\-prediction search +.TP +\fB\-\-bitrate <integer> +Target bitrate. [0] + 0: disable rate\-control + N: target N bits per second +.TP +\fB\-\-preset <string> +Use preset. This will override previous options. 
+ ultrafast, superfast, veryfast, faster, + fast, medium, slow, slower, veryslow, placebo +.TP +\fB\-\-no\-psnr +Don't calculate PSNR for frames +.TP +\fB\-\-loop\-input +Re\-read input file forever +.TP +\fB\-\-mv\-constraint +Constrain movement vectors + "none": no constraint + "frametile": constrain within the tile + "frametilemargin": constrain even more +.TP +\fB\-\-hash +Specify which decoded picture hash to use [checksum] + "none": 0 bytes + "checksum": 18 bytes + "md5": 56 bytes +.TP +\fB\-\-cu\-split\-termination +Specify the cu split termination behaviour + "zero": Terminate when splitting gives little + improvement. + "off": Don't terminate splitting early +.TP +\fB\-\-me\-early\-termination +Specify the me early termination behaviour + "off": Early termination is off + "on": Early termination is on + "sensitive": Sensitive early termination is on +.TP +\fB\-\-lossless +Use lossless coding +.TP +\fB\-\-implicit\-rdpcm +Enable implicit residual DPCM. Currently only supported +with lossless coding. 
+.TP +\fB\-\-no\-tmvp +Disable Temporal Motion Vector Prediction +.TP +\fB\-\-rdoq\-skip +Skips RDOQ for 4x4 blocks +.TP +\fB\-\-input\-format +P420 or P400 +.TP +\fB\-\-input\-bitdepth +8\-16 + +.SS "Video Usability Information:" +.TP +\fB\-\-sar <width:height> +Specify Sample Aspect Ratio +.TP +\fB\-\-overscan <string> +Specify crop overscan setting ["undef"] + \- undef, show, crop +.TP +\fB\-\-videoformat <string> +Specify video format ["undef"] + \- component, pal, ntsc, secam, mac, undef +.TP +\fB\-\-range <string> +Specify color range ["tv"] + \- tv, pc +.TP +\fB\-\-colorprim <string> +Specify color primaries ["undef"] + \- undef, bt709, bt470m, bt470bg, + smpte170m, smpte240m, film, bt2020 +.TP +\fB\-\-transfer <string> +Specify transfer characteristics ["undef"] + \- undef, bt709, bt470m, bt470bg, + smpte170m, smpte240m, linear, log100, + log316, iec61966\-2\-4, bt1361e, + iec61966\-2\-1, bt2020\-10, bt2020\-12 +.TP +\fB\-\-colormatrix <string> +Specify color matrix setting ["undef"] + \- undef, bt709, fcc, bt470bg, smpte170m, + smpte240m, GBR, YCgCo, bt2020nc, bt2020c +.TP +\fB\-\-chromaloc <integer> +Specify chroma sample location (0 to 5) [0] + +.SS "Parallel processing:" +.TP +\fB\-\-threads <integer> +Maximum number of threads to use. +Disable threads if set to 0. + +.SS "Tiles:" +.TP +\fB\-\-tiles <int>x<int> +Split picture into width x height uniform tiles. +.TP +\fB\-\-tiles\-width\-split <string>|u<int> +Specifies a comma separated list of pixel +positions of tiles columns separation coordinates. +Can also be u followed by and a single int n, +in which case it produces columns of uniform width. +.TP +\fB\-\-tiles\-height\-split <string>|u<int> +Specifies a comma separated list of pixel +positions of tiles rows separation coordinates. +Can also be u followed by and a single int n, +in which case it produces rows of uniform height. 
+ +.SS "Wpp:" +.TP +\fB\-\-wpp +Enable wavefront parallel processing +.TP +\fB\-\-owf <integer>|auto +Number of parallel frames to process. 0 to disable. +
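Putting the man page's synopsis and a few of the documented options together, a typical invocation might look like the following sketch (the file names and parameter values are illustrative):

```sh
# Encode 100 frames of a raw 4:2:0 input at 1280x720, medium preset, QP 27.
kvazaar -i input_1280x720.yuv --input-res 1280x720 \
        --preset medium -q 27 -n 100 -o out.hevc
```

Note that `--preset` overrides options given before it, so per-option tuning such as `-q` should come after the preset on the command line.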
kvazaar-0.8.3.tar.gz/m4/ax_pthread.m4 -> kvazaar-1.0.0.tar.gz/m4/ax_pthread.m4
Changed
@@ -19,10 +19,10 @@ # is necessary on AIX to use the special cc_r compiler alias.) # # NOTE: You are assumed to not only compile your program with these flags, -# but also link it with them as well. e.g. you should link with +# but also to link with them as well. For example, you might link with # $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS # -# If you are only building threads programs, you may wish to use these +# If you are only building threaded programs, you may wish to use these # variables in your default LIBS, CFLAGS, and CC: # # LIBS="$PTHREAD_LIBS $LIBS" @@ -30,8 +30,8 @@ # CC="$PTHREAD_CC" # # In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant -# has a nonstandard name, defines PTHREAD_CREATE_JOINABLE to that name -# (e.g. PTHREAD_CREATE_UNDETACHED on AIX). +# has a nonstandard name, this macro defines PTHREAD_CREATE_JOINABLE to +# that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX). # # Also HAVE_PTHREAD_PRIO_INHERIT is defined if pthread is found and the # PTHREAD_PRIO_INHERIT symbol is defined when compiling with @@ -82,35 +82,40 @@ # modified version of the Autoconf Macro, you may extend this special # exception to the GPL to apply to your modified version as well. -#serial 21 +#serial 23 AU_ALIAS([ACX_PTHREAD], [AX_PTHREAD]) AC_DEFUN([AX_PTHREAD], [ AC_REQUIRE([AC_CANONICAL_HOST]) +AC_REQUIRE([AC_PROG_CC]) +AC_REQUIRE([AC_PROG_SED]) AC_LANG_PUSH([C]) ax_pthread_ok=no # We used to check for pthread.h first, but this fails if pthread.h -# requires special compiler flags (e.g. on True64 or Sequent). +# requires special compiler flags (e.g. on Tru64 or Sequent). # It gets checked for in the link test anyway. 
# First of all, check if the user has set any of the PTHREAD_LIBS, # etcetera environment variables, and if threads linking works using # them: -if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then - save_CFLAGS="$CFLAGS" +if test "x$PTHREAD_CFLAGS$PTHREAD_LIBS" != "x"; then + ax_pthread_save_CC="$CC" + ax_pthread_save_CFLAGS="$CFLAGS" + ax_pthread_save_LIBS="$LIBS" + AS_IF([test "x$PTHREAD_CC" != "x"], [CC="$PTHREAD_CC"]) CFLAGS="$CFLAGS $PTHREAD_CFLAGS" - save_LIBS="$LIBS" LIBS="$PTHREAD_LIBS $LIBS" - AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS]) - AC_TRY_LINK_FUNC([pthread_join], [ax_pthread_ok=yes]) + AC_MSG_CHECKING([for pthread_join using $CC $PTHREAD_CFLAGS $PTHREAD_LIBS]) + AC_LINK_IFELSE([AC_LANG_CALL([], [pthread_join])], [ax_pthread_ok=yes]) AC_MSG_RESULT([$ax_pthread_ok]) - if test x"$ax_pthread_ok" = xno; then + if test "x$ax_pthread_ok" = "xno"; then PTHREAD_LIBS="" PTHREAD_CFLAGS="" fi - LIBS="$save_LIBS" - CFLAGS="$save_CFLAGS" + CC="$ax_pthread_save_CC" + CFLAGS="$ax_pthread_save_CFLAGS" + LIBS="$ax_pthread_save_LIBS" fi # We must check for the threads library under a number of different @@ -123,7 +128,7 @@ # which indicates that we try without any flags at all, and "pthread-config" # which is a program returning the flags for the Pth emulation library. -ax_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config" +ax_pthread_flags="pthreads none -Kthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config" # The ordering *is* (sometimes) important. 
Some notes on the # individual items follow: @@ -132,82 +137,225 @@ # none: in case threads are in libc; should be tried before -Kthread and # other compiler flags to prevent continual compiler warnings # -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h) -# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able) -# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread) -# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads) -# -pthreads: Solaris/gcc -# -mthreads: Mingw32/gcc, Lynx/gcc +# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads), Tru64 +# (Note: HP C rejects this with "bad form for `-t' option") +# -pthreads: Solaris/gcc (Note: HP C also rejects) # -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it -# doesn't hurt to check since this sometimes defines pthreads too; -# also defines -D_REENTRANT) -# ... -mt is also the pthreads flag for HP/aCC +# doesn't hurt to check since this sometimes defines pthreads and +# -D_REENTRANT too), HP C (must be checked before -lpthread, which +# is present but should not be used directly; and before -mthreads, +# because the compiler interprets this as "-mt" + "-hreads") +# -mthreads: Mingw32/gcc, Lynx/gcc # pthread: Linux, etcetera # --thread-safe: KAI C++ # pthread-config: use pthread-config program (for GNU Pth library) -case ${host_os} in +case $host_os in + + freebsd*) + + # -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able) + # lthread: LinuxThreads port on FreeBSD (also preferred to -pthread) + + ax_pthread_flags="-kthread lthread $ax_pthread_flags" + ;; + + hpux*) + + # From the cc(1) man page: "[-mt] Sets various -D flags to enable + # multi-threading and also sets -lpthread." 
+ + ax_pthread_flags="-mt -pthread pthread $ax_pthread_flags" + ;; + + openedition*) + + # IBM z/OS requires a feature-test macro to be defined in order to + # enable POSIX threads at all, so give the user a hint if this is + # not set. (We don't define these ourselves, as they can affect + # other portions of the system API in unpredictable ways.) + + AC_EGREP_CPP([AX_PTHREAD_ZOS_MISSING], + [ +# if !defined(_OPEN_THREADS) && !defined(_UNIX03_THREADS) + AX_PTHREAD_ZOS_MISSING +# endif + ], + [AC_MSG_WARN([IBM z/OS requires -D_OPEN_THREADS or -D_UNIX03_THREADS to enable pthreads support.])]) + ;; + solaris*) # On Solaris (at least, for some versions), libc contains stubbed # (non-functional) versions of the pthreads routines, so link-based - # tests will erroneously succeed. (We need to link with -pthreads/-mt/ - # -lpthread.) (The stubs are missing pthread_cleanup_push, or rather - # a function called by this macro, so we could check for that, but - # who knows whether they'll stub that too in a future libc.) So, - # we'll just look for -pthreads and -lpthread first: + # tests will erroneously succeed. (N.B.: The stubs are missing + # pthread_cleanup_push, or rather a function called by this macro, + # so we could check for that, but who knows whether they'll stub + # that too in a future libc.) So we'll check first for the + # standard Solaris way of linking pthreads (-mt -lpthread). + + ax_pthread_flags="-mt,pthread pthread $ax_pthread_flags" + ;; +esac + +# GCC generally uses -pthread, or -pthreads on some platforms (e.g. 
SPARC) - ax_pthread_flags="-pthreads pthread -mt -pthread $ax_pthread_flags" +AS_IF([test "x$GCC" = "xyes"], + [ax_pthread_flags="-pthread -pthreads $ax_pthread_flags"]) + +# The presence of a feature test macro requesting re-entrant function +# definitions is, on some systems, a strong hint that pthreads support is +# correctly enabled + +case $host_os in + darwin* | hpux* | linux* | osf* | solaris*) + ax_pthread_check_macro="_REENTRANT" ;; - darwin*) - ax_pthread_flags="-pthread $ax_pthread_flags" + aix*) + ax_pthread_check_macro="_THREAD_SAFE" + ;; + + *) + ax_pthread_check_macro="--" ;; esac +AS_IF([test "x$ax_pthread_check_macro" = "x--"], + [ax_pthread_check_cond=0], + [ax_pthread_check_cond="!defined($ax_pthread_check_macro)"]) + +# Are we compiling with Clang? + +AC_CACHE_CHECK([whether $CC is Clang], + [ax_cv_PTHREAD_CLANG], + [ax_cv_PTHREAD_CLANG=no + # Note that Autoconf sets GCC=yes for Clang as well as GCC + if test "x$GCC" = "xyes"; then + AC_EGREP_CPP([AX_PTHREAD_CC_IS_CLANG], + [/* Note: Clang 2.7 lacks __clang_[a-z]+__ */ +# if defined(__clang__) && defined(__llvm__) + AX_PTHREAD_CC_IS_CLANG +# endif + ], + [ax_cv_PTHREAD_CLANG=yes]) + fi + ]) +ax_pthread_clang="$ax_cv_PTHREAD_CLANG" + +ax_pthread_clang_warning=no + +# Clang needs special handling, because older versions handle the -pthread +# option in a rather... idiosyncratic way + +if test "x$ax_pthread_clang" = "xyes"; then + + # Clang takes -pthread; it has never supported any other flag + + # (Note 1: This will need to be revisited if a system that Clang + # supports has POSIX threads in a separate library. This tends not + # to be the way of modern systems, but it's conceivable.) + + # (Note 2: On some systems, notably Darwin, -pthread is not needed + # to get POSIX threads support; the API is always present and + # active. We could reasonably leave PTHREAD_CFLAGS empty. 
But + # -pthread does define _REENTRANT, and while the Darwin headers + # ignore this macro, third-party headers might not.) + + PTHREAD_CFLAGS="-pthread" + PTHREAD_LIBS= + + ax_pthread_ok=yes + + # However, older versions of Clang make a point of warning the user + # that, in an invocation where only linking and no compilation is + # taking place, the -pthread option has no effect ("argument unused + # during compilation"). They expect -pthread to be passed in only + # when source code is being compiled. + # + # Problem is, this is at odds with the way Automake and most other + # C build frameworks function, which is that the same flags used in + # compilation (CFLAGS) are also used in linking. Many systems + # supported by AX_PTHREAD require exactly this for POSIX threads + # support, and in fact it is often not straightforward to specify a + # flag that is used only in the compilation phase and not in + # linking. Such a scenario is extremely rare in practice. + # + # Even though use of the -pthread flag in linking would only print + # a warning, this can be a nuisance for well-run software projects + # that build with -Werror. So if the active version of Clang has + # this misfeature, we search for an option to squash it. 
+ + AC_CACHE_CHECK([whether Clang needs flag to prevent "argument unused" warning when linking with -pthread], + [ax_cv_PTHREAD_CLANG_NO_WARN_FLAG], + [ax_cv_PTHREAD_CLANG_NO_WARN_FLAG=unknown + # Create an alternate version of $ac_link that compiles and + # links in two steps (.c -> .o, .o -> exe) instead of one + # (.c -> exe), because the warning occurs only in the second + # step + ax_pthread_save_ac_link="$ac_link" + ax_pthread_sed='s/conftest\.\$ac_ext/conftest.$ac_objext/g' + ax_pthread_link_step=`$as_echo "$ac_link" | sed "$ax_pthread_sed"` + ax_pthread_2step_ac_link="($ac_compile) && (echo ==== >&5) && ($ax_pthread_link_step)" + ax_pthread_save_CFLAGS="$CFLAGS" + for ax_pthread_try in '' -Qunused-arguments -Wno-unused-command-line-argument unknown; do + AS_IF([test "x$ax_pthread_try" = "xunknown"], [break]) + CFLAGS="-Werror -Wunknown-warning-option $ax_pthread_try -pthread $ax_pthread_save_CFLAGS" + ac_link="$ax_pthread_save_ac_link" + AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])], + [ac_link="$ax_pthread_2step_ac_link" + AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])], + [break]) + ]) + done + ac_link="$ax_pthread_save_ac_link" + CFLAGS="$ax_pthread_save_CFLAGS" + AS_IF([test "x$ax_pthread_try" = "x"], [ax_pthread_try=no]) + ax_cv_PTHREAD_CLANG_NO_WARN_FLAG="$ax_pthread_try" + ]) -# Clang doesn't consider unrecognized options an error unless we specify -# -Werror. We throw in some extra Clang-specific options to ensure that -# this doesn't happen for GCC, which also accepts -Werror. 
+ case "$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG" in + no | unknown) ;; + *) PTHREAD_CFLAGS="$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG $PTHREAD_CFLAGS" ;; + esac -AC_MSG_CHECKING([if compiler needs -Werror to reject unknown flags]) -save_CFLAGS="$CFLAGS" -ax_pthread_extra_flags="-Werror" -CFLAGS="$CFLAGS $ax_pthread_extra_flags -Wunknown-warning-option -Wsizeof-array-argument" -AC_COMPILE_IFELSE([AC_LANG_PROGRAM([int foo(void);],[foo()])], - [AC_MSG_RESULT([yes])], - [ax_pthread_extra_flags= - AC_MSG_RESULT([no])]) -CFLAGS="$save_CFLAGS" +fi # $ax_pthread_clang = yes -if test x"$ax_pthread_ok" = xno; then -for flag in $ax_pthread_flags; do +if test "x$ax_pthread_ok" = "xno"; then +for ax_pthread_try_flag in $ax_pthread_flags; do - case $flag in + case $ax_pthread_try_flag in none) AC_MSG_CHECKING([whether pthreads work without any flags]) ;; + -mt,pthread) + AC_MSG_CHECKING([whether pthreads work with -mt -lpthread]) + PTHREAD_CFLAGS="-mt" + PTHREAD_LIBS="-lpthread" + ;; + -*) - AC_MSG_CHECKING([whether pthreads work with $flag]) - PTHREAD_CFLAGS="$flag" + AC_MSG_CHECKING([whether pthreads work with $ax_pthread_try_flag]) + PTHREAD_CFLAGS="$ax_pthread_try_flag" ;; pthread-config) AC_CHECK_PROG([ax_pthread_config], [pthread-config], [yes], [no]) - if test x"$ax_pthread_config" = xno; then continue; fi + AS_IF([test "x$ax_pthread_config" = "xno"], [continue]) PTHREAD_CFLAGS="`pthread-config --cflags`" PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`" ;; *) - AC_MSG_CHECKING([for the pthreads library -l$flag]) - PTHREAD_LIBS="-l$flag" + AC_MSG_CHECKING([for the pthreads library -l$ax_pthread_try_flag]) + PTHREAD_LIBS="-l$ax_pthread_try_flag" ;; esac - save_LIBS="$LIBS" - save_CFLAGS="$CFLAGS" + ax_pthread_save_CFLAGS="$CFLAGS" + ax_pthread_save_LIBS="$LIBS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" LIBS="$PTHREAD_LIBS $LIBS" - CFLAGS="$CFLAGS $PTHREAD_CFLAGS $ax_pthread_extra_flags" # Check for various functions. 
We must include pthread.h, # since some functions may be macros. (On the Sequent, we @@ -218,7 +366,11 @@ # pthread_cleanup_push because it is one of the few pthread # functions on Solaris that doesn't have a non-functional libc stub. # We try pthread_create on general principles. + AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h> +# if $ax_pthread_check_cond +# error "$ax_pthread_check_macro must be defined" +# endif static void routine(void *a) { a = 0; } static void *start_routine(void *a) { return a; }], [pthread_t th; pthread_attr_t attr; @@ -227,16 +379,14 @@ pthread_attr_init(&attr); pthread_cleanup_push(routine, 0); pthread_cleanup_pop(0) /* ; */])], - [ax_pthread_ok=yes], - []) + [ax_pthread_ok=yes], + []) - LIBS="$save_LIBS" - CFLAGS="$save_CFLAGS" + CFLAGS="$ax_pthread_save_CFLAGS" + LIBS="$ax_pthread_save_LIBS" AC_MSG_RESULT([$ax_pthread_ok]) - if test "x$ax_pthread_ok" = xyes; then - break; - fi + AS_IF([test "x$ax_pthread_ok" = "xyes"], [break]) PTHREAD_LIBS="" PTHREAD_CFLAGS="" @@ -244,71 +394,74 @@ fi # Various other checks: -if test "x$ax_pthread_ok" = xyes; then - save_LIBS="$LIBS" - LIBS="$PTHREAD_LIBS $LIBS" - save_CFLAGS="$CFLAGS" +if test "x$ax_pthread_ok" = "xyes"; then + ax_pthread_save_CFLAGS="$CFLAGS" + ax_pthread_save_LIBS="$LIBS" CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + LIBS="$PTHREAD_LIBS $LIBS" # Detect AIX lossage: JOINABLE attribute is called UNDETACHED. 
- AC_MSG_CHECKING([for joinable pthread attribute]) - attr_name=unknown - for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do - AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>], - [int attr = $attr; return attr /* ; */])], - [attr_name=$attr; break], - []) - done - AC_MSG_RESULT([$attr_name]) - if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then - AC_DEFINE_UNQUOTED([PTHREAD_CREATE_JOINABLE], [$attr_name], - [Define to necessary symbol if this constant - uses a non-standard name on your system.]) - fi - - AC_MSG_CHECKING([if more special flags are required for pthreads]) - flag=no - case ${host_os} in - aix* | freebsd* | darwin*) flag="-D_THREAD_SAFE";; - osf* | hpux*) flag="-D_REENTRANT";; - solaris*) - if test "$GCC" = "yes"; then - flag="-D_REENTRANT" - else - # TODO: What about Clang on Solaris? - flag="-mt -D_REENTRANT" - fi - ;; - esac - AC_MSG_RESULT([$flag]) - if test "x$flag" != xno; then - PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS" - fi + AC_CACHE_CHECK([for joinable pthread attribute], + [ax_cv_PTHREAD_JOINABLE_ATTR], + [ax_cv_PTHREAD_JOINABLE_ATTR=unknown + for ax_pthread_attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do + AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>], + [int attr = $ax_pthread_attr; return attr /* ; */])], + [ax_cv_PTHREAD_JOINABLE_ATTR=$ax_pthread_attr; break], + []) + done + ]) + AS_IF([test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xunknown" && \ + test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xPTHREAD_CREATE_JOINABLE" && \ + test "x$ax_pthread_joinable_attr_defined" != "xyes"], + [AC_DEFINE_UNQUOTED([PTHREAD_CREATE_JOINABLE], + [$ax_cv_PTHREAD_JOINABLE_ATTR], + [Define to necessary symbol if this constant + uses a non-standard name on your system.]) + ax_pthread_joinable_attr_defined=yes + ]) + + AC_CACHE_CHECK([whether more special flags are required for pthreads], + [ax_cv_PTHREAD_SPECIAL_FLAGS], + [ax_cv_PTHREAD_SPECIAL_FLAGS=no + case $host_os in + solaris*) + 
ax_cv_PTHREAD_SPECIAL_FLAGS="-D_POSIX_PTHREAD_SEMANTICS" + ;; + esac + ]) + AS_IF([test "x$ax_cv_PTHREAD_SPECIAL_FLAGS" != "xno" && \ + test "x$ax_pthread_special_flags_added" != "xyes"], + [PTHREAD_CFLAGS="$ax_cv_PTHREAD_SPECIAL_FLAGS $PTHREAD_CFLAGS" + ax_pthread_special_flags_added=yes]) AC_CACHE_CHECK([for PTHREAD_PRIO_INHERIT], - [ax_cv_PTHREAD_PRIO_INHERIT], [ - AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <pthread.h>]], - [[int i = PTHREAD_PRIO_INHERIT;]])], - [ax_cv_PTHREAD_PRIO_INHERIT=yes], - [ax_cv_PTHREAD_PRIO_INHERIT=no]) + [ax_cv_PTHREAD_PRIO_INHERIT], + [AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <pthread.h>]], + [[int i = PTHREAD_PRIO_INHERIT;]])], + [ax_cv_PTHREAD_PRIO_INHERIT=yes], + [ax_cv_PTHREAD_PRIO_INHERIT=no]) ]) - AS_IF([test "x$ax_cv_PTHREAD_PRIO_INHERIT" = "xyes"], - [AC_DEFINE([HAVE_PTHREAD_PRIO_INHERIT], [1], [Have PTHREAD_PRIO_INHERIT.])]) + AS_IF([test "x$ax_cv_PTHREAD_PRIO_INHERIT" = "xyes" && \ + test "x$ax_pthread_prio_inherit_defined" != "xyes"], + [AC_DEFINE([HAVE_PTHREAD_PRIO_INHERIT], [1], [Have PTHREAD_PRIO_INHERIT.]) + ax_pthread_prio_inherit_defined=yes + ]) - LIBS="$save_LIBS" - CFLAGS="$save_CFLAGS" + CFLAGS="$ax_pthread_save_CFLAGS" + LIBS="$ax_pthread_save_LIBS" # More AIX lossage: compile with *_r variant - if test "x$GCC" != xyes; then + if test "x$GCC" != "xyes"; then case $host_os in aix*) AS_CASE(["x/$CC"], - [x*/c89|x*/c89_128|x*/c99|x*/c99_128|x*/cc|x*/cc128|x*/xlc|x*/xlc_v6|x*/xlc128|x*/xlc128_v6], - [#handle absolute path differently from PATH based program lookup - AS_CASE(["x$CC"], - [x/*], - [AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"])], - [AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC])])]) + [x*/c89|x*/c89_128|x*/c99|x*/c99_128|x*/cc|x*/cc128|x*/xlc|x*/xlc_v6|x*/xlc128|x*/xlc128_v6], + [#handle absolute path differently from PATH based program lookup + AS_CASE(["x$CC"], + [x/*], + [AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"])], + [AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC])])]) ;; 
esac fi @@ -321,7 +474,7 @@ AC_SUBST([PTHREAD_CC]) # Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: -if test x"$ax_pthread_ok" = xyes; then +if test "x$ax_pthread_ok" = "xyes"; then ifelse([$1],,[AC_DEFINE([HAVE_PTHREAD],[1],[Define if you have POSIX threads libraries and header files.])],[$1]) : else
View file
kvazaar-0.8.3.tar.gz/src/Makefile.am -> kvazaar-1.0.0.tar.gz/src/Makefile.am
Changed
@@ -31,6 +31,7 @@ yuv_io.h kvazaar_LDADD = libkvazaar.la $(LIBS) +kvazaar_CPPFLAGS = -DKVZ_VERSION="`$(srcdir)/../tools/version.sh`" libkvazaar_la_SOURCES = \ bitstream.c \ @@ -55,6 +56,8 @@ encoder_state-ctors_dtors.h \ encoder_state-geometry.c \ encoder_state-geometry.h \ + encode_coding_tree.c \ + encode_coding_tree.h \ filter.c \ filter.h \ global.h \ @@ -70,6 +73,7 @@ intra.h \ kvazaar.c \ kvazaar_internal.h \ + kvz_math.h \ nal.c \ nal.h \ rate_control.c \ @@ -107,6 +111,8 @@ strategies/generic/picture-generic.h \ strategies/generic/quant-generic.c \ strategies/generic/quant-generic.h \ + strategies/generic/sao-generic.c \ + strategies/generic/sao-generic.h \ strategies/strategies-common.h \ strategies/strategies-dct.c \ strategies/strategies-dct.h \ @@ -120,10 +126,17 @@ strategies/strategies-picture.h \ strategies/strategies-quant.c \ strategies/strategies-quant.h \ + strategies/strategies-sao.c \ + strategies/strategies-sao.h \ strategies/x86_asm/picture-x86-asm.c \ strategies/x86_asm/picture-x86-asm.h \ strategyselector.c \ - strategyselector.h + strategyselector.h \ + extras/libmd5.c \ + extras/libmd5.h \ + extras/crypto.h + +libkvazaar_la_CFLAGS = libkvazaar_la_LIBADD = \ libaltivec.la \ @@ -131,6 +144,12 @@ libsse2.la \ libsse41.la +if USE_CRYPTOPP +libkvazaar_la_SOURCES += \ + extras/crypto.h \ + extras/crypto.cpp +endif + libkvazaar_la_LDFLAGS = $(AM_LDFLAGS) -no-undefined -version-number $(KVZ_API_VERSION) @@ -148,7 +167,10 @@ strategies/avx2/picture-avx2.c \ strategies/avx2/picture-avx2.h \ strategies/avx2/quant-avx2.c \ - strategies/avx2/quant-avx2.h + strategies/avx2/quant-avx2.h \ + strategies/avx2/sao-avx2.c \ + strategies/avx2/sao-avx2.h + libsse2_la_SOURCES = \ strategies/sse2/picture-sse2.c \ @@ -182,7 +204,7 @@ strategies/x86_asm/picture-x86-asm-sad.h \ strategies/x86_asm/picture-x86-asm-satd.asm \ strategies/x86_asm/picture-x86-asm-satd.h -libasm_la_CFLAGS = -DKVZ_COMPILE_ASM +libkvazaar_la_CFLAGS += -DKVZ_COMPILE_ASM 
strategies/x86_asm/picture-x86-asm-sad.lo: strategies/x86_asm/picture-x86-asm-sad.asm strategies/x86_asm/picture-x86-asm-satd.lo: strategies/x86_asm/picture-x86-asm-satd.asm @@ -195,5 +217,5 @@ yasm_verbose_0 = @echo " YASM " $@; .asm.lo: - $(yasm_verbose)$(LIBTOOL) --mode=compile $(YASM) -I$(srcdir)/extras $(ASFLAGS) $< -o $@ -prefer-non-pic 1>/dev/null + $(yasm_verbose)$(LIBTOOL) --mode=compile --tag=CC $(YASM) -I$(srcdir)/extras $(ASFLAGS) $< -o $@ -prefer-non-pic 1>/dev/null
View file
kvazaar-0.8.3.tar.gz/src/bitstream.c -> kvazaar-1.0.0.tar.gz/src/bitstream.c
Changed
@@ -20,13 +20,12 @@ #include "bitstream.h" -#include <stdio.h> -#include <stdlib.h> #include <math.h> -#include <string.h> -#include <stdarg.h> #include <stdlib.h> -#include <assert.h> +#include <string.h> + +#include "kvz_math.h" + const uint32_t kvz_bit_set_mask[] = { @@ -57,19 +56,6 @@ } #endif -static int floor_log2(unsigned int n) -{ - assert(n != 0); - - int pos = 0; - if (n >= 1<<16) { n >>= 16; pos += 16; } - if (n >= 1<< 8) { n >>= 8; pos += 8; } - if (n >= 1<< 4) { n >>= 4; pos += 4; } - if (n >= 1<< 2) { n >>= 2; pos += 2; } - if (n >= 1<< 1) { pos += 1; } - return pos; -} - /** * \brief Initialize the Exp Golomb code table. * @@ -84,7 +70,7 @@ uint8_t M; uint32_t info; for (code_num = 0; code_num < EXP_GOLOMB_TABLE_SIZE; code_num++) { - M = (uint8_t)floor_log2(code_num + 1); + M = kvz_math_floor_log2(code_num + 1); info = code_num + 1 - (uint32_t)pow(2, M); kvz_g_exp_table[code_num].len = M * 2 + 1; kvz_g_exp_table[code_num].value = (1<<M) | info;
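For reference, the static `floor_log2` deleted here (it now lives in `kvz_math.h` as `kvz_math_floor_log2`) and the Exp-Golomb table entry it feeds can be sketched as plain C; the one behavioral note is that an integer `(1 << M)` replaces the old floating-point `pow(2, M)`:

```c
#include <assert.h>
#include <stdint.h>

/* Branching binary search for the index of the highest set bit. */
static int floor_log2(unsigned int n)
{
    assert(n != 0);
    int pos = 0;
    if (n >= 1 << 16) { n >>= 16; pos += 16; }
    if (n >= 1 <<  8) { n >>=  8; pos +=  8; }
    if (n >= 1 <<  4) { n >>=  4; pos +=  4; }
    if (n >= 1 <<  2) { n >>=  2; pos +=  2; }
    if (n >= 1 <<  1) {           pos +=  1; }
    return pos;
}

/* One Exp-Golomb table entry: M leading zeros, a 1 bit, then M info bits. */
static void exp_golomb(uint32_t code_num, uint8_t *len, uint32_t *value)
{
    uint8_t  M    = (uint8_t)floor_log2(code_num + 1);
    uint32_t info = code_num + 1 - (1u << M);  /* (1 << M) instead of pow(2, M) */
    *len   = M * 2 + 1;
    *value = (1u << M) | info;
}
```

For example, `code_num = 4` gives M = 2, info = 1, so a 5-bin code with value `0b00101`.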
View file
kvazaar-0.8.3.tar.gz/src/bitstream.h -> kvazaar-1.0.0.tar.gz/src/bitstream.h
Changed
@@ -26,10 +26,11 @@ * Appending bits into an Annex-B coded bitstream. */ -#include "global.h" +#include "global.h" // IWYU pragma: keep #include "kvazaar.h" + /** * A stream of bits. */
View file
kvazaar-0.8.3.tar.gz/src/cabac.c -> kvazaar-1.0.0.tar.gz/src/cabac.c
Changed
@@ -20,10 +20,10 @@ #include "cabac.h" -#include <assert.h> -#include <stdlib.h> -#include <stdio.h> - +#include "encoder.h" +#include "encoderstate.h" +#include "extras/crypto.h" +#include "kvazaar.h" const uint8_t kvz_g_auc_next_state_mps[128] = { @@ -275,6 +275,7 @@ { int32_t code_number = symbol; uint32_t length; + if (code_number < (3 << r_param)) { length = code_number >> r_param; CABAC_BINS_EP(cabac, (1 << (length + 1)) - 2 , length + 1, "coeff_abs_level_remaining"); @@ -291,6 +292,198 @@ } } +void kvz_cabac_write_coeff_remain_encry(struct encoder_state_t * const state, cabac_data_t * const cabac,const uint32_t symbol, const uint32_t r_param, int32_t base_level) +{ + int32_t codeNumber = (int32_t)symbol; + uint32_t length; + + if (codeNumber < (3 << r_param)) { + length = codeNumber>>r_param; + CABAC_BINS_EP(cabac, (1 << (length + 1)) - 2 , length + 1, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP( (1<<(length+1))-2 , length+1); + uint32_t Suffix = (codeNumber%(1<<r_param)); + + if(!r_param) + CABAC_BINS_EP(cabac, Suffix, r_param, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(Suffix, r_param); + if(r_param==1) { + if(!(( base_level ==2 )&& (codeNumber==4 || codeNumber==5) ) ) { + uint32_t key = ff_get_key(&state->tile->dbs_g, 1); + state->tile->m_prev_pos = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 1; + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 1, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 1); + } else { + CABAC_BINS_EP(cabac, Suffix, 1, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(Suffix, 1); + } + } + else + if(r_param==2) { + if( base_level ==1) { + uint32_t key =ff_get_key(&state->tile->dbs_g, 2); + state->tile->m_prev_pos = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 3; + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 2, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 2); + } else + if( base_level ==2) { + if(codeNumber<=7 || codeNumber>=12) { + uint32_t key 
= ff_get_key(&state->tile->dbs_g, 2); + state->tile->m_prev_pos = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 3; + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 2, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 2); + } + else + if(codeNumber<10) { + uint32_t key = ff_get_key(&state->tile->dbs_g, 1); + state->tile->m_prev_pos = (( (Suffix&1) + ( state->tile->m_prev_pos^key )) & 1); + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 2, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 2); + } else + CABAC_BINS_EP(cabac, Suffix, 2, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(Suffix, 2); + } else { //base_level=3 + if(codeNumber<=7 || codeNumber>11) { + uint32_t key = ff_get_key(&state->tile->dbs_g, 2); + state->tile->m_prev_pos = (Suffix + ( state->tile->m_prev_pos^key ) ) & 3; + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 2, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 2); + } else { + uint32_t key = ff_get_key(&state->tile->dbs_g, 1); + state->tile->m_prev_pos = ((Suffix&2))+(( (Suffix&1) + ( state->tile->m_prev_pos^key)) & 1); + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 2, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 2); + } + } + } else + if(r_param==3) { + if( base_level ==1) { + uint32_t key = ff_get_key(&state->tile->dbs_g, 3); + state->tile->m_prev_pos = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 7; + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 3); + } + else if( base_level ==2) { + if(codeNumber<=15 || codeNumber>23) { + uint32_t key = ff_get_key(&state->tile->dbs_g, 3); + state->tile->m_prev_pos = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 7; + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 3); + } else + if(codeNumber<=19){ + uint32_t key = ff_get_key(&state->tile->dbs_g, 2); + 
state->tile->m_prev_pos = ((Suffix&4))+(( (Suffix&3) + (state->tile->m_prev_pos^key )) & 3); + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 3); + } else + if(codeNumber<=21){ + uint32_t key = ff_get_key(&state->tile->dbs_g, 1); + state->tile->m_prev_pos = 4+(( (Suffix&1) + ( state->tile->m_prev_pos^key )) & 1); + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 3); + } else + CABAC_BINS_EP(cabac, Suffix, 3, "coeff_abs_level_remaining"); + // m_pcBinIf->encodeBinsEP(Suffix, 3); + } else {//base_level=3 + CABAC_BINS_EP(cabac, Suffix, 3, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(Suffix, 3); + if(codeNumber<=15 || codeNumber>23) { + uint32_t key = ff_get_key(&state->tile->dbs_g, 3); + state->tile->m_prev_pos = (Suffix + ( state->tile->m_prev_pos^key ) ) & 7; + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 3); + } else + if(codeNumber<=19) { + uint32_t key = ff_get_key(&state->tile->dbs_g, 2); + state->tile->m_prev_pos = (( (Suffix&3) + ( state->tile->m_prev_pos^key )) &3); + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 3); + } else + if(codeNumber<=23) { + uint32_t key = ff_get_key(&state->tile->dbs_g, 1); + state->tile->m_prev_pos = (Suffix&6)+(( (Suffix&1) + (state->tile->m_prev_pos^key )) & 1); + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 3, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 3); + } + } + } else + if(r_param==4) { + if( base_level ==1) { + uint32_t key = ff_get_key(&state->tile->dbs_g, 4); + state->tile->m_prev_pos = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 15; + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 4); + } else + if( base_level ==2) { + 
if(codeNumber<=31 || codeNumber>47) { + uint32_t key = ff_get_key(&state->tile->dbs_g, 4); + state->tile->m_prev_pos = ( Suffix + ( state->tile->m_prev_pos^key ) ) & 15; + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, r_param, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, r_param); + } else + if(codeNumber<=39) { + uint32_t key = ff_get_key(&state->tile->dbs_g, 3); + state->tile->m_prev_pos = (( (Suffix&7) + ( state->tile->m_prev_pos^key )) & 7); + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 4); + } else + if(codeNumber<=43) { + uint32_t key = ff_get_key(&state->tile->dbs_g, 2); + state->tile->m_prev_pos = 8+(( (Suffix&3) + ( state->tile->m_prev_pos^key )) & 3); + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 4); + } else + if(codeNumber<=45){ + uint32_t key = ff_get_key(&state->tile->dbs_g, 1); + state->tile->m_prev_pos = 12+(( (Suffix&1) + ( state->tile->m_prev_pos^key )) & 1); + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 4); + } else + CABAC_BINS_EP(cabac, Suffix, 4, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(Suffix, 4); + } else {//base_level=3 + if(codeNumber<=31 || codeNumber>47) { + uint32_t key = ff_get_key(&state->tile->dbs_g, 4); + state->tile->m_prev_pos = (Suffix + ( state->tile->m_prev_pos^key ) ) & 15; + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, r_param, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, r_param); + } else + if(codeNumber<=39) { + uint32_t key = ff_get_key(&state->tile->dbs_g, 3); + state->tile->m_prev_pos = (( (Suffix&7) + ( state->tile->m_prev_pos^key )) & 7); + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 4); + } else + if(codeNumber<=43) { + uint32_t key = 
ff_get_key(&state->tile->dbs_g, 2); + state->tile->m_prev_pos = 8+(( (Suffix&3) + ( state->tile->m_prev_pos^key )) & 3); + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 4); + } else + if(codeNumber<=47) { + uint32_t key = ff_get_key(&state->tile->dbs_g, 1); + state->tile->m_prev_pos = (Suffix&14)+(( (Suffix&1) + (state->tile->m_prev_pos^key )) & 1); + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, 4, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos, 4); + } + } + } + } else { + length = r_param; + codeNumber = codeNumber - ( 3 << r_param); + while (codeNumber >= (1<<length)) { + codeNumber -= (1<<(length)); + ++length; + } + CABAC_BINS_EP(cabac, (1 << (3 + length + 1 - r_param)) - 2, 3 + length + 1 - r_param, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP((1<<(COEF_REMAIN_BIN_REDUCTION+length+1-r_param))-2,COEF_REMAIN_BIN_REDUCTION+length+1-r_param); + uint32_t Suffix = codeNumber; + uint32_t key = ff_get_key(&state->tile->dbs_g, length); + uint32_t mask = ( (1<<length ) -1 ); + state->tile->m_prev_pos = ( Suffix + ( state->tile->m_prev_pos^key ) ) & mask; + CABAC_BINS_EP(cabac, state->tile->m_prev_pos, length, "coeff_abs_level_remaining"); + //m_pcBinIf->encodeBinsEP(m_prev_pos,length); + } +} /** * \brief */ @@ -350,7 +543,7 @@ /** * \brief */ -void kvz_cabac_write_ep_ex_golomb(cabac_data_t * const data, uint32_t symbol, uint32_t count) +void kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t * const data, uint32_t symbol, uint32_t count) { uint32_t bins = 0; int32_t num_bins = 0; @@ -364,8 +557,15 @@ bins = 2 * bins; ++num_bins; - bins = (bins << count) | symbol; + bins = (bins << count) | symbol; num_bins += count; - + if(!state->cabac.only_count) + if (state->encoder_control->cfg->crypto_features & KVZ_CRYPTO_MVs) { + uint32_t key, mask; + key = ff_get_key(&state->tile->dbs_g, num_bins>>1); + mask = ( (1<<(num_bins >>1) ) -1 ); + 
state->tile->m_prev_pos = ( bins + ( state->tile->m_prev_pos^key ) ) & mask; + bins = ( (bins >> (num_bins >>1) ) << (num_bins >>1) ) | state->tile->m_prev_pos; + } kvz_cabac_encode_bins_ep(data, bins, num_bins); }
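Setting the encryption branches aside, `kvz_cabac_write_coeff_remain` and the new `_encry` variant share the same Golomb-Rice binarization: symbols below `3 << r_param` get a truncated-Rice code, larger ones escape to exponential Golomb. A sketch that only counts the bypass bins this produces, mirroring the branch structure above (the counting helper is an illustration, not a kvazaar function):

```c
#include <assert.h>
#include <stdint.h>

/* Bypass bins written for coeff_abs_level_remaining with Rice parameter r. */
static unsigned coeff_remain_bins(uint32_t symbol, uint32_t r)
{
    int32_t code = (int32_t)symbol;

    if (code < (3 << r)) {
        uint32_t prefix = (uint32_t)code >> r;   /* unary prefix + stop bit */
        return (prefix + 1) + r;                 /* plus an r-bit suffix */
    }

    /* Exp-Golomb escape: grow the suffix length until the residue fits. */
    uint32_t length = r;
    code -= 3 << r;
    while (code >= (1 << length)) {
        code -= 1 << length;
        ++length;
    }
    return (3 + length + 1 - r) + length;        /* prefix bins + suffix bins */
}
```

The `_encry` path writes the same bin counts; it only XOR-masks the suffix bins against a keystream from `ff_get_key` so the bitstream length (and thus rate control) is unchanged.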
View file
kvazaar-0.8.3.tar.gz/src/cabac.h -> kvazaar-1.0.0.tar.gz/src/cabac.h
Changed
@@ -26,10 +26,11 @@ * Coding bins using CABAC. */ -#include "global.h" +#include "global.h" // IWYU pragma: keep #include "bitstream.h" +struct encoder_state_t; // Types typedef struct @@ -75,6 +76,7 @@ cabac_ctx_t cu_skip_flag_model[3]; cabac_ctx_t cu_merge_idx_ext_model; cabac_ctx_t cu_merge_flag_ext_model; + cabac_ctx_t cu_transquant_bypass; cabac_ctx_t cu_mvd_model[2]; cabac_ctx_t cu_ref_pic_model[2]; cabac_ctx_t mvp_idx_model[2]; @@ -103,8 +105,10 @@ void kvz_cabac_flush(cabac_data_t *data); void kvz_cabac_write_coeff_remain(cabac_data_t *cabac, uint32_t symbol, uint32_t r_param); -void kvz_cabac_write_ep_ex_golomb(cabac_data_t *data, uint32_t symbol, - uint32_t count); +void kvz_cabac_write_coeff_remain_encry(struct encoder_state_t * const state, cabac_data_t * const cabac, const uint32_t symbol, + const uint32_t r_param, int32_t base_level); +void kvz_cabac_write_ep_ex_golomb(struct encoder_state_t * const state, cabac_data_t *data, + uint32_t symbol, uint32_t count); void kvz_cabac_write_unary_max_symbol(cabac_data_t *data, cabac_ctx_t *ctx, uint32_t symbol, int32_t offset, uint32_t max_symbol); @@ -112,36 +116,36 @@ // Macros -#define CTX_STATE(ctx) (ctx->uc_state >> 1) -#define CTX_MPS(ctx) (ctx->uc_state & 1) +#define CTX_STATE(ctx) ((ctx)->uc_state >> 1) +#define CTX_MPS(ctx) ((ctx)->uc_state & 1) #define CTX_UPDATE_LPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_lps[ (ctx)->uc_state ]; } #define CTX_UPDATE_MPS(ctx) { (ctx)->uc_state = kvz_g_auc_next_state_mps[ (ctx)->uc_state ]; } #ifdef VERBOSE #define CABAC_BIN(data, value, name) { \ uint32_t prev_state = (data)->ctx->uc_state; \ - kvz_cabac_encode_bin(data, value); \ + kvz_cabac_encode_bin((data), (value)) \ printf("%s = %u, state = %u -> %u\n", \ - name, (uint32_t)value, (uint32_t)prev_state, (data)->ctx->uc_state); } + (name), (uint32_t)(value), prev_state, (data)->ctx->uc_state); } #define CABAC_BINS_EP(data, value, bins, name) { \ uint32_t prev_state = (data)->ctx->uc_state; \ - 
kvz_cabac_encode_bins_ep(data, value, bins); \ + kvz_cabac_encode_bins_ep((data), (value), (bins)); \ printf("%s = %u(%u bins), state = %u -> %u\n", \ - name, (uint32_t)value, (uint32_t)bins, prev_state, (data)->ctx->uc_state); } + (name), (uint32_t)(value), (bins), prev_state, (data)->ctx->uc_state); } #define CABAC_BIN_EP(data, value, name) { \ uint32_t prev_state = (data)->ctx->uc_state; \ - kvz_cabac_encode_bin_ep(data, value); \ + kvz_cabac_encode_bin_ep((data), (value)); \ printf("%s = %u, state = %u -> %u\n", \ - name, (uint32_t)value, (uint32_t)prev_state, (data)->ctx->uc_state); } + (name), (uint32_t)(value), prev_state, (data)->ctx->uc_state); } #else #define CABAC_BIN(data, value, name) \ - kvz_cabac_encode_bin(data, value); + kvz_cabac_encode_bin((data), (value)); #define CABAC_BINS_EP(data, value, bins, name) \ - kvz_cabac_encode_bins_ep(data, value, bins); + kvz_cabac_encode_bins_ep((data), (value), (bins)); #define CABAC_BIN_EP(data, value, name) \ - kvz_cabac_encode_bin_ep(data, value); + kvz_cabac_encode_bin_ep((data), (value)); #endif #endif
View file
kvazaar-0.8.3.tar.gz/src/cfg.c -> kvazaar-1.0.0.tar.gz/src/cfg.c
Changed
@@ -24,6 +24,7 @@ #include <stdlib.h> #include <string.h> + kvz_config *kvz_config_alloc(void) { kvz_config *cfg = (kvz_config *)malloc(sizeof(kvz_config)); @@ -44,24 +45,25 @@ cfg->framerate = 25; // deprecated and will be removed. cfg->framerate_num = 0; cfg->framerate_denom = 1; - cfg->qp = 32; - cfg->intra_period = 0; + cfg->qp = 22; + cfg->intra_period = 64; cfg->vps_period = 0; cfg->deblock_enable = 1; cfg->deblock_beta = 0; cfg->deblock_tc = 0; cfg->sao_enable = 1; cfg->rdoq_enable = 1; + cfg->rdoq_skip = 1; cfg->signhide_enable = true; cfg->smp_enable = false; cfg->amp_enable = false; cfg->rdo = 1; cfg->mv_rdo = 0; cfg->full_intra_search = 0; - cfg->trskip_enable = 1; + cfg->trskip_enable = 0; cfg->tr_depth_intra = 0; cfg->ime_algorithm = 0; /* hexbs */ - cfg->fme_level = 1; + cfg->fme_level = 4; cfg->source_scan_type = 0; /* progressive */ cfg->vui.sar_width = 0; cfg->vui.sar_height = 0; @@ -75,33 +77,51 @@ cfg->aud_enable = 0; cfg->cqmfile = NULL; cfg->ref_frames = DEFAULT_REF_PIC_COUNT; - cfg->gop_len = 0; + cfg->gop_len = 4; + cfg->gop_lowdelay = true; cfg->bipred = 0; cfg->target_bitrate = 0; + cfg->hash = KVZ_HASH_CHECKSUM; + cfg->lossless = false; + cfg->tmvp_enable = true; + cfg->implicit_rdpcm = false; + + cfg->cu_split_termination = KVZ_CU_SPLIT_TERMINATION_ZERO; - cfg->tiles_width_count = 0; - cfg->tiles_height_count = 0; - cfg->tiles_width_split = NULL; - cfg->tiles_height_split = NULL; + cfg->tiles_width_count = 1; + cfg->tiles_height_count = 1; + cfg->tiles_width_split = NULL; + cfg->tiles_height_split = NULL; - cfg->wpp = 0; + cfg->wpp = 1; cfg->owf = -1; cfg->slice_count = 1; cfg->slice_addresses_in_ts = MALLOC(int32_t, 1); cfg->slice_addresses_in_ts[0] = 0; - cfg->threads = 0; + cfg->threads = -1; cfg->cpuid = 1; // Defaults for what sizes of PUs are tried. 
- cfg->pu_depth_inter.min = 0; // 0-3 + cfg->pu_depth_inter.min = 2; // 0-3 cfg->pu_depth_inter.max = 3; // 0-3 - cfg->pu_depth_intra.min = 1; // 0-4 - cfg->pu_depth_intra.max = 4; // 0-4 + cfg->pu_depth_intra.min = 2; // 0-4 + cfg->pu_depth_intra.max = 3; // 0-4 cfg->add_encoder_info = true; cfg->calc_psnr = true; + cfg->mv_constraint = KVZ_MV_CONSTRAIN_NONE; + cfg->crypto_features = KVZ_CRYPTO_OFF; + + cfg->me_early_termination = 1; + + cfg->input_format = KVZ_FORMAT_P420; + cfg->input_bitdepth = 8; + + cfg->gop_lp_definition.d = 3; + cfg->gop_lp_definition.t = 1; + return 1; } @@ -131,11 +151,11 @@ return 0; } -static int parse_enum(const char *arg, const char * const *names, int8_t *dst) +static int parse_enum_n(const char *arg, unsigned num_chars, const char * const *names, int8_t *dst) { int8_t i; for (i = 0; names[i]; i++) { - if (!strcmp(arg, names[i])) { + if (!strncmp(arg, names[i], num_chars)) { *dst = i; return 1; } @@ -144,6 +164,11 @@ return 0; } +static int parse_enum(const char *arg, const char * const *names, int8_t *dst) +{ + return parse_enum_n(arg, 255, names, dst); +} + static int parse_tiles_specification(const char* const arg, int32_t * const ntiles, int32_t** const array) { const char* current_arg = NULL; int32_t current_value; @@ -158,9 +183,9 @@ //If the arg starts with u, we want an uniform split if (arg[0]=='u') { - *ntiles = atoi(arg+1)-1; - if (MAX_TILES_PER_DIM <= *ntiles || 0 > *ntiles) { - fprintf(stderr, "Invalid number of tiles (0 < %d <= %d = MAX_TILES_PER_DIM)!\n", *ntiles + 1, MAX_TILES_PER_DIM); + *ntiles = atoi(arg + 1); + if (MAX_TILES_PER_DIM <= *ntiles || 1 >= *ntiles) { + fprintf(stderr, "Invalid number of tiles (1 <= %d <= %d = MAX_TILES_PER_DIM)!\n", *ntiles, MAX_TILES_PER_DIM); return 0; } //Done with parsing @@ -169,7 +194,7 @@ //We have a comma-separated list of int for the split... 
current_arg = arg; - *ntiles = 0; + *ntiles = 1; do { int ret = sscanf(current_arg, "%d", ¤t_value); if (ret != 1) { @@ -179,24 +204,24 @@ current_arg = strchr(current_arg, ','); //Skip the , if we found one if (current_arg) ++current_arg; - values[*ntiles] = current_value; + values[*ntiles - 1] = current_value; ++(*ntiles); if (MAX_TILES_PER_DIM <= *ntiles) break; } while (current_arg); - if (MAX_TILES_PER_DIM <= *ntiles || 0 >= *ntiles) { - fprintf(stderr, "Invalid number of tiles (0 < %d <= %d = MAX_TILES_PER_DIM)!\n", *ntiles + 1, MAX_TILES_PER_DIM); + if (MAX_TILES_PER_DIM <= *ntiles || 1 >= *ntiles) { + fprintf(stderr, "Invalid number of tiles (1 <= %d <= %d = MAX_TILES_PER_DIM)!\n", *ntiles, MAX_TILES_PER_DIM); return 0; } - *array = MALLOC(int32_t, *ntiles); + *array = MALLOC(int32_t, *ntiles - 1); if (!*array) { fprintf(stderr, "Could not allocate array for tiles\n"); return 0; } //TODO: memcpy? - for (i = 0; i < *ntiles; ++i) { + for (i = 0; i < *ntiles - 1; ++i) { (*array)[i] = values[i]; } @@ -266,7 +291,7 @@ int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) { - static const char * const me_names[] = { "hexbs", "tz", "full", NULL }; + static const char * const me_names[] = { "hexbs", "tz", "full", "full8", "full16", "full32", "full64", NULL }; static const char * const source_scan_type_names[] = { "progressive", "tff", "bff", NULL }; static const char * const overscan_names[] = { "undef", "show", "crop", NULL }; @@ -279,176 +304,244 @@ "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12", NULL }; static const char * const colormatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", "bt2020nc", "bt2020c", NULL }; + static const char * const mv_constraint_names[] = { "none", "frame", "tile", "frametile", "frametilemargin", NULL }; + static const char * const hash_names[] = { "none", "checksum", "md5", NULL }; + + static const char * const cu_split_termination_names[] = { "zero", "off", 
NULL }; + static const char * const crypto_toggle_names[] = { "off", "on", NULL }; + static const char * const crypto_feature_names[] = { "mvs", "mv_signs", "trans_coeffs", "trans_coeff_signs", NULL }; + + static const char * const me_early_termination_names[] = { "off", "on", "sensitive", NULL }; - static const char * const preset_values[11][28] = { + static const char * const preset_values[11][20*2] = { { "ultrafast", "pu-depth-intra", "2-3", - "pu-depth-inter", "1-3", + "pu-depth-inter", "2-3", "rd", "0", "me", "hexbs", "ref", "1", - "deblock", "0", + "deblock", "0:0", "signhide", "0", "subme", "0", "sao", "0", "rdoq", "0", + "rdoq-skip", "1", "transform-skip", "0", "full-intra-search", "0", "mv-rdo", "0", + "smp", "0", + "amp", "0", + "cu-split-termination", "zero", + "me-early-termination", "sensitive", + "gop", "lp-g4d3t1", NULL }, { "superfast", - "pu-depth-intra", "1-3", - "pu-depth-inter", "1-3", - "rd", "1", + "pu-depth-intra", "2-3", + "pu-depth-inter", "2-3", + "rd", "0", "me", "hexbs", "ref", "1", - "deblock", "0", + "deblock", "0:0", "signhide", "0", "subme", "0", - "sao", "0", + "sao", "1", "rdoq", "0", + "rdoq-skip", "1", "transform-skip", "0", "full-intra-search", "0", "mv-rdo", "0", + "smp", "0", + "amp", "0", + "cu-split-termination", "zero", + "me-early-termination", "sensitive", + "gop", "lp-g4d3t1", NULL }, { "veryfast", - "pu-depth-intra", "1-3", - "pu-depth-inter", "0-3", - "rd", "1", + "pu-depth-intra", "2-3", + "pu-depth-inter", "2-3", + "rd", "0", "me", "hexbs", - "ref", "2", - "deblock", "1", + "ref", "1", + "deblock", "0:0", "signhide", "0", - "subme", "0", - "sao", "0", + "subme", "2", + "sao", "1", "rdoq", "0", + "rdoq-skip", "1", "transform-skip", "0", "full-intra-search", "0", "mv-rdo", "0", + "smp", "0", + "amp", "0", + "cu-split-termination", "zero", + "me-early-termination", "sensitive", + "gop", "lp-g4d3t1", NULL }, { "faster", - "pu-depth-intra", "1-3", - "pu-depth-inter", "0-3", + "pu-depth-intra", "2-3", + "pu-depth-inter", 
"1-3", "rd", "1", "me", "hexbs", - "ref", "2", - "deblock", "1", - "signhide", "1", - "subme", "0", - "sao", "0", + "ref", "1", + "deblock", "0:0", + "signhide", "0", + "subme", "2", + "sao", "1", "rdoq", "0", + "rdoq-skip", "1", "transform-skip", "0", "full-intra-search", "0", "mv-rdo", "0", + "smp", "0", + "amp", "0", + "cu-split-termination", "zero", + "me-early-termination", "sensitive", + "gop", "lp-g4d3t1", NULL }, { "fast", - "pu-depth-intra", "1-3", - "pu-depth-inter", "0-3", + "pu-depth-intra", "2-3", + "pu-depth-inter", "1-3", "rd", "1", "me", "hexbs", - "ref", "2", - "deblock", "1", - "signhide", "1", - "subme", "1", - "sao", "0", + "ref", "1", + "deblock", "0:0", + "signhide", "0", + "subme", "4", + "sao", "1", "rdoq", "0", + "rdoq-skip", "1", "transform-skip", "0", "full-intra-search", "0", "mv-rdo", "0", + "smp", "0", + "amp", "0", + "cu-split-termination", "zero", + "me-early-termination", "on", + "gop", "lp-g4d3t1", NULL }, { "medium", - "pu-depth-intra", "1-4", - "pu-depth-inter", "0-3", + "pu-depth-intra", "1-3", + "pu-depth-inter", "1-3", "rd", "1", "me", "hexbs", - "ref", "3", - "deblock", "1", - "signhide", "1", - "subme", "1", - "sao", "0", - "rdoq", "0", + "ref", "1", + "deblock", "0:0", + "signhide", "0", + "subme", "4", + "sao", "1", + "rdoq", "1", + "rdoq-skip", "1", "transform-skip", "0", "full-intra-search", "0", "mv-rdo", "0", + "smp", "0", + "amp", "0", + "cu-split-termination", "zero", + "me-early-termination", "on", + "gop", "lp-g4d3t1", NULL }, { "slow", - "pu-depth-intra", "1-4", - "pu-depth-inter", "0-3", - "rd", "2", + "pu-depth-intra", "1-3", + "pu-depth-inter", "1-3", + "rd", "1", "me", "hexbs", - "ref", "3", - "deblock", "1", + "ref", "2", + "deblock", "0:0", "signhide", "1", - "subme", "1", + "subme", "4", "sao", "1", - "rdoq", "0", + "rdoq", "1", + "rdoq-skip", "1", "transform-skip", "0", "full-intra-search", "0", "mv-rdo", "0", + "smp", "0", + "amp", "0", + "cu-split-termination", "zero", + "me-early-termination", "on", + 
"gop", "lp-g4d2t1", NULL }, { "slower", - "pu-depth-intra", "1-4", + "pu-depth-intra", "1-3", "pu-depth-inter", "0-3", - "rd", "2", - "me", "tz", - "ref", "4", - "deblock", "1", + "rd", "1", + "me", "hexbs", + "ref", "2", + "deblock", "0:0", "signhide", "1", - "subme", "1", + "subme", "4", "sao", "1", "rdoq", "1", + "rdoq-skip", "1", "transform-skip", "0", "full-intra-search", "0", "mv-rdo", "0", + "smp", "0", + "amp", "0", + "cu-split-termination", "zero", + "me-early-termination", "on", + "gop", "lp-g4d2t1", NULL }, { "veryslow", "pu-depth-intra", "1-4", "pu-depth-inter", "0-3", - "rd", "2", - "me", "tz", - "ref", "4", - "deblock", "1", + "rd", "1", + "me", "hexbs", + "ref", "3", + "deblock", "0:0", "signhide", "1", - "subme", "1", + "subme", "4", "sao", "1", "rdoq", "1", - "transform-skip", "1", + "rdoq-skip", "1", + "transform-skip", "0", "full-intra-search", "0", - "mv-rdo", "1", + "mv-rdo", "0", + "smp", "0", + "amp", "0", + "cu-split-termination", "zero", + "me-early-termination", "on", + "gop", "lp-g4d2t1", NULL }, { "placebo", - "pu-depth-intra", "0-4", + "pu-depth-intra", "1-4", "pu-depth-inter", "0-3", - "rd", "3", + "rd", "1", "me", "tz", - "ref", "6", - "deblock", "1", + "ref", "4", + "deblock", "0:0", "signhide", "1", - "subme", "1", + "subme", "4", "sao", "1", "rdoq", "1", + "rdoq-skip", "0", "transform-skip", "1", - "full-intra-search", "1", + "full-intra-search", "0", "mv-rdo", "1", + "smp", "1", + "amp", "1", + "cu-split-termination", "off", + "me-early-termination", "off", + "gop", "lp-g4d2t1", NULL }, { NULL } @@ -533,6 +626,13 @@ cfg->fme_level = atoi(value); else if OPT("source-scan-type") return parse_enum(value, source_scan_type_names, &cfg->source_scan_type); + else if OPT("mv-constraint") + { + int8_t constraint = KVZ_MV_CONSTRAIN_NONE; + int result = parse_enum(value, mv_constraint_names, &constraint); + cfg->mv_constraint = constraint; + return result; + } else if OPT("sar") return sscanf(value, "%d:%d", &cfg->vui.sar_width, 
&cfg->vui.sar_height) == 2; else if OPT("overscan") @@ -553,10 +653,54 @@ cfg->aud_enable = atobool(value); else if OPT("cqmfile") cfg->cqmfile = strdup(value); - else if OPT("tiles-width-split") - return parse_tiles_specification(value, &cfg->tiles_width_count, &cfg->tiles_width_split); - else if OPT("tiles-height-split") - return parse_tiles_specification(value, &cfg->tiles_height_count, &cfg->tiles_height_split); + else if OPT("tiles-width-split") { + int retval = parse_tiles_specification(value, &cfg->tiles_width_count, &cfg->tiles_width_split); + if (cfg->tiles_width_count > 1 && cfg->tmvp_enable) { + cfg->tmvp_enable = false; + fprintf(stderr, "Disabling TMVP because tiles are used.\n"); + } + return retval; + } + else if OPT("tiles-height-split") { + int retval = parse_tiles_specification(value, &cfg->tiles_height_count, &cfg->tiles_height_split); + if (cfg->tiles_height_count > 1 && cfg->tmvp_enable) { + cfg->tmvp_enable = false; + fprintf(stderr, "Disabling TMVP because tiles are used.\n"); + } + return retval; + } + else if OPT("tiles") + { + // A simpler interface for setting tiles, accepting only uniform split. + unsigned width; + unsigned height; + if (2 != sscanf(value, "%ux%u", &width, &height)) { + fprintf(stderr, "Wrong format for tiles. Expected \"%%ux%%u\", but got \"%s\"\n", value); + return 0; + } + + if (MAX_TILES_PER_DIM <= width || 1 > width) { + fprintf(stderr, "Invalid number of tiles (0 < %d <= %d = MAX_TILES_PER_DIM)!\n", width, MAX_TILES_PER_DIM); + return 0; + } + if (MAX_TILES_PER_DIM <= height || 1 > height) { + fprintf(stderr, "Invalid number of tiles (0 < %d <= %d = MAX_TILES_PER_DIM)!\n", height, MAX_TILES_PER_DIM); + return 0; + } + + // Free split arrays incase they have already been set by another parameter. 
+ FREE_POINTER(cfg->tiles_width_split); + FREE_POINTER(cfg->tiles_height_split); + cfg->tiles_width_count = width; + cfg->tiles_height_count = height; + + if (cfg->tmvp_enable) { + cfg->tmvp_enable = false; + fprintf(stderr, "Disabling TMVP because tiles are used.\n"); + } + + return 1; + } else if OPT("wpp") cfg->wpp = atobool(value); else if OPT("owf") { @@ -565,10 +709,10 @@ // -1 means automatic selection cfg->owf = -1; } - } - else if OPT("slice-addresses") + } else if OPT("slice-addresses") { + fprintf(stderr, "--slice-addresses doesn't do anything, because slices are not implemented.\n"); return parse_slice_specification(value, &cfg->slice_count, &cfg->slice_addresses_in_ts); - else if OPT("threads") + } else if OPT("threads") cfg->threads = atoi(value); else if OPT("cpuid") cfg->cpuid = atoi(value); @@ -583,105 +727,32 @@ struct { unsigned g; // length unsigned d; // depth - unsigned r; // references unsigned t; // temporal - } gop = { 0 }; + } gop = { 0, 0, 0 }; - if (sscanf(value, "lp-g%ud%ur%ut%u", &gop.g, &gop.d, &gop.r, &gop.t) != 4) { - fprintf(stderr, "Error in GOP syntax. Example: lp-g8d4r2t2\n"); + // Parse --gop=lp-g#d#t# + if (sscanf(value, "lp-g%ud%ut%u", &gop.g, &gop.d, &gop.t) != 3) { + fprintf(stderr, "Error in GOP syntax. Example: lp-g8d4t2\n"); return 0; } if (gop.g < 1 || gop.g > 32) { fprintf(stderr, "gop.g must be between 1 and 32.\n"); + return 0; } if (gop.d < 1 || gop.d > 8) { fprintf(stderr, "gop.d must be between 1 and 8.\n"); - } - if (gop.r < 1 || gop.r > 15) { - fprintf(stderr, "gop.d must be between 1 and 15.\n"); + return 0; } if (gop.t < 1 || gop.t > 15) { - fprintf(stderr, "gop.t must be between 1 and 32.\n"); - } - - // Initialize modulos for testing depth. - // The picture belong to the lowest depth in which (poc % modulo) == 0. 
- unsigned depth_modulos[8] = { 0 }; - for (int d = 0; d < gop.d; ++d) { - depth_modulos[gop.d - 1 - d] = 1 << d; + fprintf(stderr, "gop.t must be between 1 and 15.\n"); + return 0; } - depth_modulos[0] = gop.g; - cfg->gop_lowdelay = 1; + cfg->gop_lowdelay = true; cfg->gop_len = gop.g; - for (int g = 1; g <= gop.g; ++g) { - kvz_gop_config *gop_pic = &cfg->gop[g - 1]; - - // Find gop depth for picture. - int gop_layer = 0; - while (gop_layer < gop.d && (g % depth_modulos[gop_layer])) { - ++gop_layer; - } - - gop_pic->poc_offset = g; - gop_pic->layer = gop_layer + 1; - gop_pic->qp_offset = gop_layer + 1; - gop_pic->ref_pos_count = 0; - gop_pic->ref_neg_count = gop.r; - gop_pic->is_ref = 0; - - // Set first ref to point to previous frame, and the rest to previous - // key-frames. - // If gop.t > 1, have (poc % gop.t) == 0 point gop.t frames away, - // instead of the previous frame. Set the frames in between to - // point to the nearest frame with a lower gop-depth. - if (gop.t > 1) { - if (gop_pic->poc_offset % gop.t == 0) { - gop_pic->ref_neg[0] = gop.t; - } else { - int r = gop_pic->poc_offset - 1; - while (r > 0) { - if (cfg->gop[r].layer < gop_pic->layer) break; - --r; - } - // Var r is now 0 or index of the pic with layer < depth. - if (cfg->gop[r].layer < gop_pic->layer) { - gop_pic->ref_neg[0] = gop_pic->poc_offset - cfg->gop[r].poc_offset; - cfg->gop[r].is_ref = 1; - } else { - // No ref was found, just refer to the previous key-frame. 
- gop_pic->ref_neg[0] = gop_pic->poc_offset % gop.g; - } - } - } else { - gop_pic->ref_neg[0] = 1; - if (gop_pic->poc_offset >= 2) { - cfg->gop[gop_pic->poc_offset - 2].is_ref = 1; - } - } - - int keyframe = gop_pic->poc_offset; - for (int i = 1; i < gop_pic->ref_neg_count; ++i) { - while (keyframe == gop_pic->ref_neg[i - 1]) { - keyframe += gop.g; - } - gop_pic->ref_neg[i] = keyframe; - } - - gop_pic->qp_factor = 0.4624; // from HM - } - - for (int g = 0; g < gop.g; ++g) { - kvz_gop_config *gop_pic = &cfg->gop[g]; - if (!gop_pic->is_ref) { - gop_pic->qp_factor = 0.68 * 1.31; // derived from HM - } - } - - // Key-frame is always a reference. - cfg->gop[gop.g - 1].is_ref = 1; - cfg->gop[gop.g - 1].qp_factor = 0.578; // from HM + cfg->gop_lp_definition.d = gop.d; + cfg->gop_lp_definition.t = gop.t; } else if (atoi(value) == 8) { cfg->gop_lowdelay = 0; // GOP @@ -759,6 +830,123 @@ cfg->mv_rdo = atobool(value); else if OPT("psnr") cfg->calc_psnr = (bool)atobool(value); + else if OPT("hash") + { + int8_t hash; + int result; + if ((result = parse_enum(value, hash_names, &hash))) { + cfg->hash = hash; + } + return result; + } + else if OPT("cu-split-termination") + { + int8_t mode = KVZ_CU_SPLIT_TERMINATION_ZERO; + int result = parse_enum(value, cu_split_termination_names, &mode); + cfg->cu_split_termination = mode; + return result; + } + else if OPT("crypto") + { + // on, off, feature1+feature2 + + const char *token_begin = value; + const char *cur = token_begin; + + cfg->crypto_features = KVZ_CRYPTO_OFF; + + // If value is on or off, set all features to on or off. + int8_t toggle = 0; + if (parse_enum(token_begin, crypto_toggle_names, &toggle)) { + if (toggle == 1) { + cfg->crypto_features = KVZ_CRYPTO_ON; + } + } else { + // Try and parse "feature1+feature2" type list. 
+ for (;;) { + if (*cur == '+' || *cur == '\0') { + int8_t feature = 0; + int num_chars = cur - token_begin; + if (parse_enum_n(token_begin, num_chars, crypto_feature_names, &feature)) { + cfg->crypto_features |= (1 << feature); + } else { + cfg->crypto_features = KVZ_CRYPTO_OFF; + return 0; + } + token_begin = cur + 1; + } + + if (*cur == '\0') { + break; + } else { + ++cur; + } + } + } + + // Disallow turning on the encryption when it's not compiled in. + bool encryption_compiled_in = false; +#ifdef KVZ_SEL_ENCRYPTION + encryption_compiled_in = true; +#endif + if (!encryption_compiled_in && cfg->crypto_features) { + fprintf(stderr, "--crypto cannot be enabled because it's not compiled in.\n"); + cfg->crypto_features = KVZ_CRYPTO_OFF; + return 0; + } + + return 1; + } + else if OPT("me-early-termination"){ + int8_t mode = 0; + int result = parse_enum(value, me_early_termination_names, &mode); + cfg->me_early_termination = mode; + return result; + } + else if OPT("lossless") + cfg->lossless = (bool)atobool(value); + else if OPT("tmvp") { + cfg->tmvp_enable = atobool(value); + if (cfg->gop_len && cfg->tmvp_enable) { + fprintf(stderr, "Cannot enable TMVP because GOP is used.\n"); + cfg->tmvp_enable = false; + } + if (cfg->tiles_width_count > 1 || cfg->tiles_height_count > 1) { + fprintf(stderr, "Cannot enable TMVP because tiles are used.\n"); + cfg->tmvp_enable = false; + } + } + else if OPT("rdoq-skip"){ + cfg->rdoq_skip = atobool(value); + } + else if OPT("input-format") { + static enum kvz_input_format const formats[] = { KVZ_FORMAT_P400, KVZ_FORMAT_P420 }; + static const char * const format_names[] = { "P400", "P420", NULL }; + + int8_t format = 0; + if (!parse_enum(value, format_names, &format)) { + fprintf(stderr, "input-format not recognized.\n"); + return 0; + } + + cfg->input_format = formats[format]; + } + else if OPT("input-bitdepth") { + cfg->input_bitdepth = atoi(value); + if (cfg->input_bitdepth < 8 || cfg->input_bitdepth > 16) { + fprintf(stderr, 
"input-bitdepth not between 8 and 16.\n"); + return 0; + } + if (cfg->input_bitdepth > 8 && KVZ_BIT_DEPTH == 8) { + // Because the image is read straight into the reference buffers, + // reading >8 bit samples doesn't work when sizeof(kvz_pixel)==1. + fprintf(stderr, "input-bitdepth can't be set to larger than 8 because" + " Kvazaar is compiled with KVZ_BIT_DEPTH=8.\n"); + return 0; + } + } + else if OPT("implicit-rdpcm") + cfg->implicit_rdpcm = (bool)atobool(value); else return 0; #undef OPT @@ -766,6 +954,97 @@ return 1; } +void kvz_config_process_lp_gop(kvz_config *cfg) +{ + struct { + unsigned g; + unsigned d; + unsigned t; + } gop; + + gop.g = cfg->gop_len; + gop.d = cfg->gop_lp_definition.d; + gop.t = cfg->gop_lp_definition.t; + + // Initialize modulos for testing depth. + // The picture belong to the lowest depth in which (poc % modulo) == 0. + unsigned depth_modulos[8] = { 0 }; + for (int d = 0; d < gop.d; ++d) { + depth_modulos[gop.d - 1 - d] = 1 << d; + } + depth_modulos[0] = gop.g; + + cfg->gop_lowdelay = 1; + cfg->gop_len = gop.g; + for (int g = 1; g <= gop.g; ++g) { + kvz_gop_config *gop_pic = &cfg->gop[g - 1]; + + // Find gop depth for picture. + int gop_layer = 1; + while (gop_layer < gop.d && (g % depth_modulos[gop_layer - 1])) { + ++gop_layer; + } + + gop_pic->poc_offset = g; + gop_pic->layer = gop_layer; + gop_pic->qp_offset = gop_layer; + gop_pic->ref_pos_count = 0; + gop_pic->ref_neg_count = cfg->ref_frames; + gop_pic->is_ref = 0; + + // Set first ref to point to previous frame, and the rest to previous + // key-frames. + // If gop.t > 1, have (poc % gop.t) == 0 point gop.t frames away, + // instead of the previous frame. Set the frames in between to + // point to the nearest frame with a lower gop-depth. 
+ if (gop.t > 1) { + if (gop_pic->poc_offset % gop.t == 0) { + gop_pic->ref_neg[0] = gop.t; + } else { + int r = gop_pic->poc_offset - 1; + while (r > 0) { + if (cfg->gop[r].layer < gop_pic->layer) break; + --r; + } + // Var r is now 0 or index of the pic with layer < depth. + if (cfg->gop[r].layer < gop_pic->layer) { + gop_pic->ref_neg[0] = gop_pic->poc_offset - cfg->gop[r].poc_offset; + cfg->gop[r].is_ref = 1; + } else { + // No ref was found, just refer to the previous key-frame. + gop_pic->ref_neg[0] = gop_pic->poc_offset % gop.g; + } + } + } else { + gop_pic->ref_neg[0] = 1; + if (gop_pic->poc_offset >= 2) { + cfg->gop[gop_pic->poc_offset - 2].is_ref = 1; + } + } + + int keyframe = gop_pic->poc_offset; + for (int i = 1; i < gop_pic->ref_neg_count; ++i) { + while (keyframe == gop_pic->ref_neg[i - 1]) { + keyframe += gop.g; + } + gop_pic->ref_neg[i] = keyframe; + } + + gop_pic->qp_factor = 0.4624; // from HM + } + + for (int g = 0; g < gop.g; ++g) { + kvz_gop_config *gop_pic = &cfg->gop[g]; + if (!gop_pic->is_ref) { + gop_pic->qp_factor = 0.68 * 1.31; // derived from HM + } + } + + // Key-frame is always a reference. + cfg->gop[gop.g - 1].is_ref = 1; + cfg->gop[gop.g - 1].qp_factor = 0.578; // from HM +} + /** * \brief Check that configuration is sensible. 
* @@ -809,11 +1088,11 @@ error = 1; } - if (cfg->gop_len && - cfg->intra_period && - cfg->intra_period % cfg->gop_len != 0) { + if (cfg->gop_len && cfg->intra_period && !cfg->gop_lowdelay && + cfg->intra_period % cfg->gop_len != 0) + { fprintf(stderr, - "Input error: intra period (%d) not a multiple of gop length (%d)\n", + "Input error: intra period (%d) not a multiple of B-gop length (%d)\n", cfg->intra_period, cfg->gop_len); error = 1; @@ -844,8 +1123,8 @@ error = 1; } - if (cfg->fme_level != 0 && cfg->fme_level != 1) { - fprintf(stderr, "Input error: invalid --subme parameter (must be 0 or 1)\n"); + if (cfg->fme_level != 0 && cfg->fme_level > 4) { + fprintf(stderr, "Input error: invalid --subme parameter (must be in range 0-4)\n"); error = 1; } @@ -892,7 +1171,7 @@ if (cfg->tiles_width_split) { int i; int32_t prev_tile_split = 0; - for (i=0; i < cfg->tiles_width_count; ++i) { + for (i=0; i < cfg->tiles_width_count - 1; ++i) { if (cfg->tiles_width_split[i] <= prev_tile_split) { fprintf(stderr, "Input error: tile separations in width should be strictly monotonic (%d <= %d)\n", cfg->tiles_width_split[i], prev_tile_split); error = 1; @@ -905,8 +1184,8 @@ } prev_tile_split = cfg->tiles_width_split[i]; } - if (cfg->tiles_width_split[cfg->tiles_width_count-1] >= cfg->width) { - fprintf(stderr, "Input error: last x tile separation in width (%d) should smaller than image width (%d)\n", cfg->tiles_width_split[cfg->tiles_width_count-1], cfg->width); + if (cfg->tiles_width_split[cfg->tiles_width_count - 2] >= cfg->width) { + fprintf(stderr, "Input error: last x tile separation in width (%d) should smaller than image width (%d)\n", cfg->tiles_width_split[cfg->tiles_width_count - 2], cfg->width); error = 1; } } @@ -914,7 +1193,7 @@ if (cfg->tiles_height_split) { int i; int32_t prev_tile_split = 0; - for (i=0; i < cfg->tiles_height_count; ++i) { + for (i=0; i < cfg->tiles_height_count - 1; ++i) { if (cfg->tiles_height_split[i] <= prev_tile_split) { fprintf(stderr, "Input 
error: tile separations in height should be strictly monotonic (%d <= %d)\n", cfg->tiles_height_split[i], prev_tile_split); error = 1; @@ -928,11 +1207,16 @@ prev_tile_split = cfg->tiles_height_split[i]; } - if (cfg->tiles_height_split[cfg->tiles_height_count-1] >= cfg->height) { - fprintf(stderr, "Input error: last tile separation in height (%d) should smaller than image height (%d)\n", cfg->tiles_height_split[cfg->tiles_height_count-1], cfg->height); + if (cfg->tiles_height_split[cfg->tiles_height_count - 2] >= cfg->height) { + fprintf(stderr, "Input error: last tile separation in height (%d) should smaller than image height (%d)\n", cfg->tiles_height_split[cfg->tiles_height_count - 2], cfg->height); error = 1; } } + if (cfg->implicit_rdpcm && !cfg->lossless) { + fprintf(stderr, "Input error: --implicit-rdpcm is not suppoted without --lossless\n"); + error = 1; + } + return !error; }
kvazaar-0.8.3.tar.gz/src/cfg.h -> kvazaar-1.0.0.tar.gz/src/cfg.h
Changed
@@ -26,15 +26,17 @@ * Runtime configuration through defaults and parsing of arguments. */ -#include "global.h" +#include "global.h" // IWYU pragma: keep #include "kvazaar.h" + /* Function definitions */ kvz_config *kvz_config_alloc(void); int kvz_config_init(kvz_config *cfg); int kvz_config_destroy(kvz_config *cfg); int kvz_config_parse(kvz_config *cfg, const char *name, const char *value); +void kvz_config_process_lp_gop(kvz_config *cfg); int kvz_config_validate(const kvz_config *cfg); #endif
kvazaar-0.8.3.tar.gz/src/checkpoint.h -> kvazaar-1.0.0.tar.gz/src/checkpoint.h
Changed
@@ -29,20 +29,17 @@ #ifdef NDEBUG #error "CHECKPOINTS require assertions to be enabled!" #endif -#include "global.h" - #include <string.h> #include <stdio.h> #include <stdlib.h> +#include "global.h" // IWYU pragma: keep + + extern FILE* g_ckpt_file; extern int g_ckpt_enabled; //Do we check? extern int g_ckpt_record; //Do we record? -#endif - - -#if defined(CHECKPOINTS) #define CHECKPOINTS_INIT() do { \ if (getenv("CHECKPOINTS")) {\ if (strcmp(getenv("CHECKPOINTS"),"record") == 0) { \
kvazaar-0.8.3.tar.gz/src/cli.c -> kvazaar-1.0.0.tar.gz/src/cli.c
Changed
@@ -78,6 +78,7 @@ { "no-aud", no_argument, NULL, 0 }, { "cqmfile", required_argument, NULL, 0 }, { "seek", required_argument, NULL, 0 }, + { "tiles", required_argument, NULL, 0 }, { "tiles-width-split", required_argument, NULL, 0 }, { "tiles-height-split", required_argument, NULL, 0 }, { "wpp", no_argument, NULL, 0 }, @@ -99,6 +100,24 @@ { "no-mv-rdo", no_argument, NULL, 0 }, { "psnr", no_argument, NULL, 0 }, { "no-psnr", no_argument, NULL, 0 }, + { "version", no_argument, NULL, 0 }, + { "help", no_argument, NULL, 0 }, + { "loop-input", no_argument, NULL, 0 }, + { "mv-constraint", required_argument, NULL, 0 }, + { "hash", required_argument, NULL, 0 }, + {"cu-split-termination",required_argument, NULL, 0 }, + { "crypto", required_argument, NULL, 0 }, + { "me-early-termination",required_argument, NULL, 0 }, + { "lossless", no_argument, NULL, 0 }, + { "no-lossless", no_argument, NULL, 0 }, + { "tmvp", no_argument, NULL, 0 }, + { "no-tmvp", no_argument, NULL, 0 }, + { "rdoq-skip", no_argument, NULL, 0 }, + { "no-rdoq-skip", no_argument, NULL, 0 }, + { "input-bitdepth", required_argument, NULL, 0 }, + { "input-format", required_argument, NULL, 0 }, + { "implicit-rdpcm", no_argument, NULL, 0 }, + { "no-implicit-rdpcm", no_argument, NULL, 0 }, {0, 0, 0, 0} }; @@ -203,6 +222,14 @@ opts->seek = atoi(optarg); } else if (!strcmp(name, "frames")) { opts->frames = atoi(optarg); + } else if (!strcmp(name, "version")) { + opts->version = true; + goto done; + } else if (!strcmp(name, "help")) { + opts->help = true; + goto done; + } else if (!strcmp(name, "loop-input")) { + opts->loop_input = true; } else if (!api->config_parse(opts->config, name, optarg)) { fprintf(stderr, "invalid argument: %s=%s\n", name, optarg); ok = 0; @@ -232,7 +259,7 @@ } // Set resolution automatically if necessary - if (opts->config->width == 0 && opts->config->width == 0){ + if (opts->config->width == 0 && opts->config->height == 0) { ok = select_input_res_auto(opts->input, &opts->config->width, 
&opts->config->height); goto done; } @@ -263,28 +290,36 @@ } +void print_usage(void) +{ + fprintf(stdout, + "Kvazaar usage: -i and --input-res to set input, -o to set output\n" + " --help for more information\n"); +} + + void print_version(void) { - fprintf(stderr, - "/***********************************************/\n" - " * Kvazaar HEVC Encoder v. " VERSION_STRING " *\n" - " * Tampere University of Technology 2015 *\n" - "/***********************************************/\n\n"); + fprintf(stdout, + "Kvazaar " VERSION_STRING "\n" + "Kvazaar license: LGPL version 2\n"); } void print_help(void) { - fprintf(stderr, + fprintf(stdout, "Usage:\n" "kvazaar -i <input> --input-res <width>x<height> -o <output>\n" "\n" "Optional parameters:\n" + " --help : Print this help message and exit\n" + " --version : Print version information and exit\n" " -n, --frames <integer> : Number of frames to code [all]\n" " --seek <integer> : First frame to code [0]\n" " --input-res <int>x<int> : Input resolution (width x height) or\n" " auto : try to detect from file name [auto]\n" - " --input-fps <number> : Framerate of the input video [25.0]\n" + " --input-fps <num>/<denom> : Framerate of the input video [25.0]\n" " -q, --qp <integer> : Quantization Parameter [32]\n" " -p, --period <integer> : Period of intra pictures [0]\n" " 0: only first picture is intra\n" @@ -310,18 +345,21 @@ " 2: full RDO\n" " --mv-rdo : Enable Rate-Distortion Optimized motion vector costs\n" " --full-intra-search : Try all intra modes.\n" - " --me <string> : Set integer motion estimation algorithm [\"hexbs\"]\n" - " \"hexbs\": Hexagon Based Search (faster)\n" - " \"tz\": Test Zone Search (better quality)\n" - " \"full\": Full Search (super slow)\n" " --no-transform-skip : Disable transform skip\n" " --aud : Use access unit delimiters\n" " --cqmfile <string> : Custom Quantization Matrices from a file\n" " --debug <string> : Output encoders reconstruction.\n" " --cpuid <integer> : Disable runtime cpu optimizations 
with value 0.\n" - " --subme <integer> : Set fractional pixel motion estimation level [1].\n" + " --me <string> : Set integer motion estimation algorithm [\"hexbs\"]\n" + " \"hexbs\": Hexagon Based Search (faster)\n" + " \"tz\": Test Zone Search (better quality)\n" + " \"full\": Full Search (super slow)\n" + " --subme <integer> : Set fractional pixel motion estimation level [4].\n" " 0: only integer motion estimation\n" - " 1: fractional pixel motion estimation enabled\n" + " 1: + 1/2-pixel horizontal and vertical\n" + " 2: + 1/2-pixel diagonal\n" + " 3: + 1/4-pixel horizontal and vertical\n" + " 4: + 1/4-pixel diagonal\n" " --source-scan-type <string> : Set source scan type [\"progressive\"].\n" " \"progressive\": progressive scan\n" " \"tff\": top field first\n" @@ -331,15 +369,42 @@ " --pu-depth-intra <int>-<int> : Range for sizes of intra prediction units to try.\n" " 0: 64x64, 1: 32x32, 2: 16x16, 3: 8x8, 4: 4x4\n" " --no-info : Don't add information about the encoder to settings.\n" - " --gop <int> : Length of Group of Pictures, must be 8 or 0 [0]\n" + " --gop <string> : Definition of GOP structure [0]\n" + " \"0\": disabled\n" + " \"8\": B-frame pyramid of length 8\n" + " \"lp-<string>\": lp-gop definition (e.g. lp-g8d4r3t2)\n" " --bipred : Enable bi-prediction search\n" " --bitrate <integer> : Target bitrate. [0]\n" " 0: disable rate-control\n" " N: target N bits per second\n" - " --preset <string> : Use preset\n" - " ultrafast, superfast,veryfast, faster,\n" + " --preset <string> : Use preset. 
This will override previous options.\n" + " ultrafast, superfast, veryfast, faster,\n" " fast, medium, slow, slower, veryslow, placebo\n" " --no-psnr : Don't calculate PSNR for frames\n" + " --loop-input : Re-read input file forever\n" + " --mv-constraint : Constrain movement vectors\n" + " \"none\": no constraint\n" + " \"frametile\": constrain within the tile\n" + " \"frametilemargin\": constrain even more\n" + " --hash : Specify which decoded picture hash to use [checksum]\n" + " \"none\": 0 bytes\n" + " \"checksum\": 18 bytes\n" + " \"md5\": 56 bytes\n" + " --cu-split-termination : Specify the cu split termination behaviour\n" + " \"zero\": Terminate when splitting gives little\n" + " improvement.\n" + " \"off\": Don't terminate splitting early\n" + " --me-early-termination : Specify the me early termination behaviour\n" + " \"off\": Early termination is off\n" + " \"on\": Early termination is on\n" + " \"sensitive\": Sensitive early termination is on\n" + " --lossless : Use lossless coding\n" + " --implicit-rdpcm : Enable implicit residual DPCM. 
Currently only supported\n" + " with lossless coding.\n" + " --no-tmvp : Disable Temporal Motion Vector Prediction\n" + " --rdoq-skip : Skips RDOQ for 4x4 blocks\n" + " --input-format : P420 or P400\n" + " --input-bitdepth : 8-16\n" "\n" " Video Usability Information:\n" " --sar <width:height> : Specify Sample Aspect Ratio\n" @@ -367,12 +432,13 @@ " Disable threads if set to 0.\n" "\n" " Tiles:\n" - " --tiles-width-split <string>|u<int> : \n" + " --tiles <int>x<int> : Split picture into width x height uniform tiles.\n" + " --tiles-width-split <string>|u<int> :\n" " Specifies a comma separated list of pixel\n" " positions of tiles columns separation coordinates.\n" " Can also be u followed by and a single int n,\n" " in which case it produces columns of uniform width.\n" - " --tiles-height-split <string>|u<int> : \n" + " --tiles-height-split <string>|u<int> :\n" " Specifies a comma separated list of pixel\n" " positions of tiles rows separation coordinates.\n" " Can also be u followed by and a single int n,\n" @@ -382,13 +448,13 @@ " --wpp : Enable wavefront parallel processing\n" " --owf <integer>|auto : Number of parallel frames to process. 0 to disable.\n" "\n" - " Slices:\n" - " --slice-addresses <string>|u<int>: \n" + /*" Slices:\n" + " --slice-addresses <string>|u<int> :\n" " Specifies a comma separated list of LCU\n" " positions in tile scan order of tile separations.\n" " Can also be u followed by and a single int n,\n" " in which case it produces uniform slice length.\n" - "\n" + "\n"*/ " Deprecated parameters: (might be removed at some point)\n" " Use --input-res:\n" " -w, --width : Width of input in pixels\n"
kvazaar-0.8.3.tar.gz/src/cli.h -> kvazaar-1.0.0.tar.gz/src/cli.h
Changed
@@ -25,10 +25,8 @@ * Command line interface */ -#include "global.h" - +#include "global.h" // IWYU pragma: keep #include "kvazaar.h" -#include "encoderstate.h" typedef struct cmdline_opts_t { /** \brief Input filename */ @@ -43,11 +41,18 @@ int32_t frames; /** \brief Encoder configuration */ kvz_config *config; + /** \brief Encoder configuration */ + bool help; + /** \brief Encoder configuration */ + bool version; + /** \brief Whether to loop input */ + bool loop_input; } cmdline_opts_t; cmdline_opts_t* cmdline_opts_parse(const kvz_api *api, int argc, char *argv[]); void cmdline_opts_free(const kvz_api *api, cmdline_opts_t *opts); +void print_usage(void); void print_version(void); void print_help(void); void print_frame_info(const kvz_frame_info *const info,
kvazaar-0.8.3.tar.gz/src/context.c -> kvazaar-1.0.0.tar.gz/src/context.c
Changed
@@ -20,114 +20,114 @@ #include "context.h" -#include <stdio.h> -#include <stdlib.h> -#include <string.h> +#include "tables.h" -#include "encoder.h" +static const uint8_t INIT_SAO_MERGE_FLAG[3] = { 153, 153, 153 }; +static const uint8_t INIT_SAO_TYPE_IDX[3] = { 160, 185, 200 }; -// stuff - -const uint8_t kvz_INIT_SAO_MERGE_FLAG[3] = { 153, 153, 153 }; -const uint8_t kvz_INIT_SAO_TYPE_IDX[3] = { 160, 185, 200 }; - -const uint8_t kvz_INIT_QT_ROOT_CBF[3][1] = { +static const uint8_t INIT_QT_ROOT_CBF[3][1] = { { 79, }, { 79, }, { CNU, }, }; -const uint8_t kvz_INIT_MVP_IDX[3][2] = { +static const uint8_t INIT_MVP_IDX[3][2] = { { 168, CNU, }, { 168, CNU, }, { CNU, CNU, }, }; -const uint8_t kvz_INIT_REF_PIC[3][2] = { +static const uint8_t INIT_REF_PIC[3][2] = { { 153, 153 }, { 153, 153 }, { CNU, CNU }, }; -const uint8_t kvz_INIT_MVD[3][2] = { +static const uint8_t INIT_MVD[3][2] = { { 169, 198, }, { 140, 198, }, { CNU, CNU, }, }; -const uint8_t kvz_INIT_MERGE_FLAG_EXT[3][1] = { +static const uint8_t INIT_MERGE_FLAG_EXT[3][1] = { { 154, }, { 110, }, { CNU, }, }; -const uint8_t kvz_INIT_MERGE_IDX_EXT[3][1] = { +static const uint8_t INIT_MERGE_IDX_EXT[3][1] = { { 137, }, { 122, }, { CNU, }, }; -const uint8_t kvz_INIT_SKIP_FLAG[3][3] = { +static const uint8_t INIT_CU_TRANSQUANT_BYPASS[3][1] = { + { 154, }, + { 154, }, + { 154, }, +}; + +static const uint8_t INIT_SKIP_FLAG[3][3] = { { 197, 185, 201, }, { 197, 185, 201, }, { CNU, CNU, CNU, }, }; -const uint8_t kvz_INIT_PRED_MODE[3][1] = { +static const uint8_t INIT_PRED_MODE[3][1] = { { 134, }, { 149, }, { CNU, }, }; -const uint8_t kvz_INIT_PART_SIZE[3][4] = { +static const uint8_t INIT_PART_SIZE[3][4] = { { 154, 139, CNU, CNU, }, { 154, 139, CNU, CNU, }, { 184, CNU, CNU, CNU, }, }; -const uint8_t kvz_INIT_SPLIT_FLAG[3][3] = { +static const uint8_t INIT_SPLIT_FLAG[3][3] = { { 107, 139, 126 }, { 107, 139, 126 }, { 139, 141, 157 }, }; -const uint8_t kvz_INIT_INTRA_PRED_MODE[3] = { +static const uint8_t INIT_INTRA_PRED_MODE[3] = { 
183, 154, 184 }; -const uint8_t kvz_INIT_CHROMA_PRED_MODE[3][2] = { +static const uint8_t INIT_CHROMA_PRED_MODE[3][2] = { { 152, 139 }, { 152, 139 }, { 63, 139 }, }; -const uint8_t kvz_INIT_INTER_DIR[3][5] = { +static const uint8_t INIT_INTER_DIR[3][5] = { { 95, 79, 63, 31, 31, }, { 95, 79, 63, 31, 31, }, { CNU, CNU, CNU, CNU, CNU, }, }; -const uint8_t kvz_INIT_TRANS_SUBDIV_FLAG[3][3] = { +static const uint8_t INIT_TRANS_SUBDIV_FLAG[3][3] = { { 224, 167, 122 }, { 124, 138, 94 }, { 153, 138, 138 }, }; -const uint8_t kvz_INIT_QT_CBF[3][8] = { +static const uint8_t INIT_QT_CBF[3][8] = { { 153, 111, CNU, CNU, 149, 92, 167, 154 }, { 153, 111, CNU, CNU, 149, 107, 167, 154 }, { 111, 141, CNU, CNU, 94, 138, 182, 154 }, }; -const uint8_t kvz_INIT_SIG_CG_FLAG[3][4] = { +static const uint8_t INIT_SIG_CG_FLAG[3][4] = { { 121, 140, 61, 154 }, { 121, 140, 61, 154 }, { 91, 171, 134, 141 }, }; -const uint8_t kvz_INIT_SIG_FLAG[3][42] = { +static const uint8_t INIT_SIG_FLAG[3][42] = { {170,154,139,153,139,123,123, 63,124,166, 183,140,136,153,154,166,183,140,136,153, 154,166,183,140,136,153,154,170,153,138, @@ -145,7 +145,7 @@ 139,111}, }; -const uint8_t kvz_INIT_LAST[3][30] = { +static const uint8_t INIT_LAST[3][30] = { { 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, 79, 108, 123, 93, CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU }, { 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, 94, @@ -154,14 +154,14 @@ 108, 123, 63, CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU, CNU }, }; -const uint8_t kvz_INIT_ONE_FLAG[3][24] = +static const uint8_t INIT_ONE_FLAG[3][24] = { {154,196,167,167,154,152,167,182,182,134,149,136,153,121,136,122,169,208,166,167,154,152,167,182}, {154,196,196,167,154,152,167,182,182,134,149,136,153,121,136,137,169,194,166,167,154,167,137,182}, {140, 92,137,138,140,152,138,139,153, 74,149, 92,139,107,122,152,140,179,166,182,140,227,122,197}, }; -const uint8_t kvz_INIT_ABS_FLAG[3][6] = +static const uint8_t 
INIT_ABS_FLAG[3][6] = { { 107,167, 91,107,107,167}, { 107,167, 91,122,107,167}, @@ -209,74 +209,75 @@ kvz_ctx_init(&cabac->ctx.transform_skip_model_luma, QP, INIT_TRANSFORMSKIP_FLAG[slice][0]); kvz_ctx_init(&cabac->ctx.transform_skip_model_chroma, QP, INIT_TRANSFORMSKIP_FLAG[slice][1]); - kvz_ctx_init(&cabac->ctx.sao_merge_flag_model, QP, kvz_INIT_SAO_MERGE_FLAG[slice]); - kvz_ctx_init(&cabac->ctx.sao_type_idx_model, QP, kvz_INIT_SAO_TYPE_IDX[slice]); + kvz_ctx_init(&cabac->ctx.sao_merge_flag_model, QP, INIT_SAO_MERGE_FLAG[slice]); + kvz_ctx_init(&cabac->ctx.sao_type_idx_model, QP, INIT_SAO_TYPE_IDX[slice]); - kvz_ctx_init(&cabac->ctx.cu_merge_flag_ext_model, QP, kvz_INIT_MERGE_FLAG_EXT[slice][0]); - kvz_ctx_init(&cabac->ctx.cu_merge_idx_ext_model, QP, kvz_INIT_MERGE_IDX_EXT[slice][0]); - kvz_ctx_init(&cabac->ctx.cu_pred_mode_model, QP, kvz_INIT_PRED_MODE[slice][0]); + kvz_ctx_init(&cabac->ctx.cu_merge_flag_ext_model, QP, INIT_MERGE_FLAG_EXT[slice][0]); + kvz_ctx_init(&cabac->ctx.cu_merge_idx_ext_model, QP, INIT_MERGE_IDX_EXT[slice][0]); + kvz_ctx_init(&cabac->ctx.cu_pred_mode_model, QP, INIT_PRED_MODE[slice][0]); + kvz_ctx_init(&cabac->ctx.cu_transquant_bypass, QP, INIT_CU_TRANSQUANT_BYPASS[slice][0]); - kvz_ctx_init(&cabac->ctx.cu_skip_flag_model[0], QP, kvz_INIT_SKIP_FLAG[slice][0]); - kvz_ctx_init(&cabac->ctx.cu_skip_flag_model[1], QP, kvz_INIT_SKIP_FLAG[slice][1]); - kvz_ctx_init(&cabac->ctx.cu_skip_flag_model[2], QP, kvz_INIT_SKIP_FLAG[slice][2]); + kvz_ctx_init(&cabac->ctx.cu_skip_flag_model[0], QP, INIT_SKIP_FLAG[slice][0]); + kvz_ctx_init(&cabac->ctx.cu_skip_flag_model[1], QP, INIT_SKIP_FLAG[slice][1]); + kvz_ctx_init(&cabac->ctx.cu_skip_flag_model[2], QP, INIT_SKIP_FLAG[slice][2]); - kvz_ctx_init(&cabac->ctx.split_flag_model[0], QP, kvz_INIT_SPLIT_FLAG[slice][0]); - kvz_ctx_init(&cabac->ctx.split_flag_model[1], QP, kvz_INIT_SPLIT_FLAG[slice][1]); - kvz_ctx_init(&cabac->ctx.split_flag_model[2], QP, kvz_INIT_SPLIT_FLAG[slice][2]); + 
   kvz_ctx_init(&cabac->ctx.split_flag_model[0], QP, INIT_SPLIT_FLAG[slice][0]);
+  kvz_ctx_init(&cabac->ctx.split_flag_model[1], QP, INIT_SPLIT_FLAG[slice][1]);
+  kvz_ctx_init(&cabac->ctx.split_flag_model[2], QP, INIT_SPLIT_FLAG[slice][2]);
-  kvz_ctx_init(&cabac->ctx.intra_mode_model, QP, kvz_INIT_INTRA_PRED_MODE[slice]);
+  kvz_ctx_init(&cabac->ctx.intra_mode_model, QP, INIT_INTRA_PRED_MODE[slice]);
-  kvz_ctx_init(&cabac->ctx.chroma_pred_model[0], QP, kvz_INIT_CHROMA_PRED_MODE[slice][0]);
-  kvz_ctx_init(&cabac->ctx.chroma_pred_model[1], QP, kvz_INIT_CHROMA_PRED_MODE[slice][1]);
+  kvz_ctx_init(&cabac->ctx.chroma_pred_model[0], QP, INIT_CHROMA_PRED_MODE[slice][0]);
+  kvz_ctx_init(&cabac->ctx.chroma_pred_model[1], QP, INIT_CHROMA_PRED_MODE[slice][1]);
-  kvz_ctx_init(&cabac->ctx.cu_abs_model_chroma[0], QP, kvz_INIT_ABS_FLAG[slice][4]);
-  kvz_ctx_init(&cabac->ctx.cu_abs_model_chroma[1], QP, kvz_INIT_ABS_FLAG[slice][5]);
+  kvz_ctx_init(&cabac->ctx.cu_abs_model_chroma[0], QP, INIT_ABS_FLAG[slice][4]);
+  kvz_ctx_init(&cabac->ctx.cu_abs_model_chroma[1], QP, INIT_ABS_FLAG[slice][5]);
   //TODO: ignore P/B contexts on intra frame
-  kvz_ctx_init(&cabac->ctx.cu_qt_root_cbf_model, QP, kvz_INIT_QT_ROOT_CBF[slice][0]);
+  kvz_ctx_init(&cabac->ctx.cu_qt_root_cbf_model, QP, INIT_QT_ROOT_CBF[slice][0]);
-  kvz_ctx_init(&cabac->ctx.cu_mvd_model[0], QP, kvz_INIT_MVD[slice][0]);
-  kvz_ctx_init(&cabac->ctx.cu_mvd_model[1], QP, kvz_INIT_MVD[slice][1]);
-  kvz_ctx_init(&cabac->ctx.cu_ref_pic_model[0], QP, kvz_INIT_REF_PIC[slice][0]);
-  kvz_ctx_init(&cabac->ctx.cu_ref_pic_model[1], QP, kvz_INIT_REF_PIC[slice][1]);
-  kvz_ctx_init(&cabac->ctx.mvp_idx_model[0], QP, kvz_INIT_MVP_IDX[slice][0]);
-  kvz_ctx_init(&cabac->ctx.mvp_idx_model[1], QP, kvz_INIT_MVP_IDX[slice][1]);
+  kvz_ctx_init(&cabac->ctx.cu_mvd_model[0], QP, INIT_MVD[slice][0]);
+  kvz_ctx_init(&cabac->ctx.cu_mvd_model[1], QP, INIT_MVD[slice][1]);
+  kvz_ctx_init(&cabac->ctx.cu_ref_pic_model[0], QP, INIT_REF_PIC[slice][0]);
+  kvz_ctx_init(&cabac->ctx.cu_ref_pic_model[1], QP, INIT_REF_PIC[slice][1]);
+  kvz_ctx_init(&cabac->ctx.mvp_idx_model[0], QP, INIT_MVP_IDX[slice][0]);
+  kvz_ctx_init(&cabac->ctx.mvp_idx_model[1], QP, INIT_MVP_IDX[slice][1]);

   for (i = 0; i < 4; i++) {
-    kvz_ctx_init(&cabac->ctx.cu_sig_coeff_group_model[i], QP, kvz_INIT_SIG_CG_FLAG[slice][i]);
-    kvz_ctx_init(&cabac->ctx.cu_abs_model_luma[i], QP, kvz_INIT_ABS_FLAG[slice][i]);
-    kvz_ctx_init(&cabac->ctx.part_size_model[i], QP, kvz_INIT_PART_SIZE[slice][i]);
+    kvz_ctx_init(&cabac->ctx.cu_sig_coeff_group_model[i], QP, INIT_SIG_CG_FLAG[slice][i]);
+    kvz_ctx_init(&cabac->ctx.cu_abs_model_luma[i], QP, INIT_ABS_FLAG[slice][i]);
+    kvz_ctx_init(&cabac->ctx.part_size_model[i], QP, INIT_PART_SIZE[slice][i]);
   }
   for (i = 0; i < 3; i++) {
-    kvz_ctx_init(&cabac->ctx.trans_subdiv_model[i], QP, kvz_INIT_TRANS_SUBDIV_FLAG[slice][i]);
+    kvz_ctx_init(&cabac->ctx.trans_subdiv_model[i], QP, INIT_TRANS_SUBDIV_FLAG[slice][i]);
   }
   for (i = 0; i < 4; i++) {
-    kvz_ctx_init(&cabac->ctx.qt_cbf_model_luma[i], QP, kvz_INIT_QT_CBF[slice][i]);
-    kvz_ctx_init(&cabac->ctx.qt_cbf_model_chroma[i], QP, kvz_INIT_QT_CBF[slice][i + 4]);
+    kvz_ctx_init(&cabac->ctx.qt_cbf_model_luma[i], QP, INIT_QT_CBF[slice][i]);
+    kvz_ctx_init(&cabac->ctx.qt_cbf_model_chroma[i], QP, INIT_QT_CBF[slice][i + 4]);
   }
   for (i = 0; i < 5; i++) {
-    kvz_ctx_init(&cabac->ctx.inter_dir[i], QP, kvz_INIT_INTER_DIR[slice][i]);
+    kvz_ctx_init(&cabac->ctx.inter_dir[i], QP, INIT_INTER_DIR[slice][i]);
   }
   for (i = 0; i < 8; i++) {
-    kvz_ctx_init(&cabac->ctx.cu_one_model_chroma[i], QP, kvz_INIT_ONE_FLAG[slice][i+16]);
+    kvz_ctx_init(&cabac->ctx.cu_one_model_chroma[i], QP, INIT_ONE_FLAG[slice][i+16]);
   }
   for (i = 0; i < 15; i++) {
-    kvz_ctx_init(&cabac->ctx.cu_ctx_last_y_luma[i], QP, kvz_INIT_LAST[slice][i] );
-    kvz_ctx_init(&cabac->ctx.cu_ctx_last_x_luma[i], QP, kvz_INIT_LAST[slice][i] );
+    kvz_ctx_init(&cabac->ctx.cu_ctx_last_y_luma[i], QP, INIT_LAST[slice][i] );
+    kvz_ctx_init(&cabac->ctx.cu_ctx_last_x_luma[i], QP, INIT_LAST[slice][i] );
-    kvz_ctx_init(&cabac->ctx.cu_ctx_last_y_chroma[i], QP, kvz_INIT_LAST[slice][i+15] );
-    kvz_ctx_init(&cabac->ctx.cu_ctx_last_x_chroma[i], QP, kvz_INIT_LAST[slice][i+15] );
+    kvz_ctx_init(&cabac->ctx.cu_ctx_last_y_chroma[i], QP, INIT_LAST[slice][i+15] );
+    kvz_ctx_init(&cabac->ctx.cu_ctx_last_x_chroma[i], QP, INIT_LAST[slice][i+15] );
-    kvz_ctx_init(&cabac->ctx.cu_one_model_luma[i], QP, kvz_INIT_ONE_FLAG[slice][i]);
+    kvz_ctx_init(&cabac->ctx.cu_one_model_luma[i], QP, INIT_ONE_FLAG[slice][i]);
   }
-  kvz_ctx_init(&cabac->ctx.cu_one_model_luma[15], QP, kvz_INIT_ONE_FLAG[slice][15]);
+  kvz_ctx_init(&cabac->ctx.cu_one_model_luma[15], QP, INIT_ONE_FLAG[slice][15]);

   for (i = 0; i < 27; i++) {
-    kvz_ctx_init(&cabac->ctx.cu_sig_model_luma[i], QP, kvz_INIT_SIG_FLAG[slice][i]);
-    if(i < 15) kvz_ctx_init(&cabac->ctx.cu_sig_model_chroma[i], QP, kvz_INIT_SIG_FLAG[slice][i+27]);
+    kvz_ctx_init(&cabac->ctx.cu_sig_model_luma[i], QP, INIT_SIG_FLAG[slice][i]);
+    if(i < 15) kvz_ctx_init(&cabac->ctx.cu_sig_model_chroma[i], QP, INIT_SIG_FLAG[slice][i+27]);
   }
 }
kvazaar-0.8.3.tar.gz/src/context.h -> kvazaar-1.0.0.tar.gz/src/context.h
Changed
@@ -26,11 +26,9 @@
  * Context derivation for CABAC.
  */

-#include "global.h"
-
-#include "encoder.h"
-#include "encoderstate.h"
 #include "cabac.h"
+#include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep

 // Functions
kvazaar-0.8.3.tar.gz/src/cu.c -> kvazaar-1.0.0.tar.gz/src/cu.c
Changed
@@ -24,6 +24,7 @@
 #include "cu.h"
 #include "threads.h"
+
 /**
  * \brief Number of PUs in a CU.
  *
@@ -76,6 +77,13 @@
   { {3, 4}, {1, 4} }, // nRx2N
 };
+
+#define BLIT_COEFF_CASE(n) case n:\
+  for (y = 0; y < n; ++y) {\
+    memcpy(&dst[y*dst_stride], &orig[y*orig_stride], n * sizeof(coeff_t));\
+  }\
+  break;
+
 void kvz_coefficients_blit(const coeff_t * const orig, coeff_t * const dst,
                            const unsigned width, const unsigned height,
                            const unsigned orig_stride, const unsigned dst_stride)
@@ -84,52 +92,11 @@
   int nxn_width = (width == height) ? width : 0;
   switch (nxn_width) {
-  case 4:
-    *(int64_t*)&dst[dst_stride*0] = *(int64_t*)&orig[orig_stride*0];
-    *(int64_t*)&dst[dst_stride*1] = *(int64_t*)&orig[orig_stride*1];
-    *(int64_t*)&dst[dst_stride*2] = *(int64_t*)&orig[orig_stride*2];
-    *(int64_t*)&dst[dst_stride*3] = *(int64_t*)&orig[orig_stride*3];
-    break;
-  case 8:
-#define KVZ_COPY_ROW_8(row_num) \
-*(int64_t*)&dst[dst_stride*(row_num)] = *(int64_t*)&orig[orig_stride*(row_num)]; \
-*(int64_t*)&dst[dst_stride*(row_num) + 4] = *(int64_t*)&orig[orig_stride*(row_num) + 4];
-
-    KVZ_COPY_ROW_8(0);
-    KVZ_COPY_ROW_8(1);
-    KVZ_COPY_ROW_8(2);
-    KVZ_COPY_ROW_8(3);
-    KVZ_COPY_ROW_8(4);
-    KVZ_COPY_ROW_8(5);
-    KVZ_COPY_ROW_8(6);
-    KVZ_COPY_ROW_8(7);
-    break;
-#undef KVZ_COPY_ROW_8
-  case 16:
-#define KVZ_COPY_ROW_16(row_num) \
-*(int64_t*)&dst[dst_stride*(row_num)] = *(int64_t*)&orig[orig_stride*(row_num)]; \
-*(int64_t*)&dst[dst_stride*(row_num) + 4] = *(int64_t*)&orig[orig_stride*(row_num) + 4]; \
-*(int64_t*)&dst[dst_stride*(row_num) + 8] = *(int64_t*)&orig[orig_stride*(row_num) + 8]; \
-*(int64_t*)&dst[dst_stride*(row_num) + 12] = *(int64_t*)&orig[orig_stride*(row_num) + 12];
-
-    KVZ_COPY_ROW_16(0);
-    KVZ_COPY_ROW_16(1);
-    KVZ_COPY_ROW_16(2);
-    KVZ_COPY_ROW_16(3);
-    KVZ_COPY_ROW_16(4);
-    KVZ_COPY_ROW_16(5);
-    KVZ_COPY_ROW_16(6);
-    KVZ_COPY_ROW_16(7);
-    KVZ_COPY_ROW_16(8);
-    KVZ_COPY_ROW_16(9);
-    KVZ_COPY_ROW_16(10);
-    KVZ_COPY_ROW_16(11);
-    KVZ_COPY_ROW_16(12);
-    KVZ_COPY_ROW_16(13);
-    KVZ_COPY_ROW_16(14);
-    KVZ_COPY_ROW_16(15);
-    break;
-#undef KVZ_COPY_ROW_16
+    BLIT_COEFF_CASE(4)
+    BLIT_COEFF_CASE(8)
+    BLIT_COEFF_CASE(16)
+    BLIT_COEFF_CASE(32)
+    BLIT_COEFF_CASE(64)
   default:
     for (y = 0; y < height; ++y) {
       memcpy(&dst[y*dst_stride], &orig[y*orig_stride], width * sizeof(coeff_t));
@@ -138,43 +105,122 @@
   }
 }

-unsigned kvz_coefficients_calc_abs(const coeff_t *const buf, const int buf_stride,
-                                   const int width)
+cu_info_t* kvz_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px)
 {
-  int sum = 0;
-  int y, x;
+  return (cu_info_t*) kvz_cu_array_at_const(cua, x_px, y_px);
+}

-  for (y = 0; y < width; ++y) {
-    for (x = 0; x < width; ++x) {
-      sum += abs(buf[x + y * buf_stride]);
-    }
-  }
-  return sum;
+const cu_info_t* kvz_cu_array_at_const(const cu_array_t *cua, unsigned x_px, unsigned y_px)
+{
+  assert(x_px < cua->width);
+  assert(y_px < cua->height);
+  return &(cua)->data[(x_px >> 2) + (y_px >> 2) * ((cua)->width >> 2)];
 }

-cu_array_t * kvz_cu_array_alloc(const int width_in_scu, const int height_in_scu) {
-  unsigned cu_array_size = height_in_scu * width_in_scu;
-  cu_array_t *cua;
-  cua = MALLOC(cu_array_t, 1);
-  cua->data = (cu_info_t*)malloc(sizeof(cu_info_t) * cu_array_size);
+
+/**
+ * \brief Allocate a CU array.
+ *
+ * \param width   width of the array in luma pixels
+ * \param height  height of the array in luma pixels
+ */
+cu_array_t * kvz_cu_array_alloc(const int width, const int height) {
+  cu_array_t *cua = MALLOC(cu_array_t, 1);
+
+  // Round up to a multiple of cell width and divide by cell width.
+  const int width_scu = (width + 15) >> 2;
+  const int height_scu = (height + 15) >> 2;
+  assert(width_scu * 16 >= width);
+  assert(height_scu * 16 >= height);
+  const unsigned cu_array_size = width_scu * height_scu;
+  cua->data = calloc(cu_array_size, sizeof(cu_info_t));
+  cua->width = width_scu << 2;
+  cua->height = height_scu << 2;
   cua->refcount = 1;
-  FILL_ARRAY(cua->data, 0, cu_array_size);
+
   return cua;
 }

+
 int kvz_cu_array_free(cu_array_t * const cua)
 {
   int32_t new_refcount;
   if (!cua) return 1;
-
+
   new_refcount = KVZ_ATOMIC_DEC(&(cua->refcount));
   //Still we have some references, do nothing
   if (new_refcount > 0) return 1;
-
+
   FREE_POINTER(cua->data);
   free(cua);

   return 1;
 }

+
+/**
+ * \brief Copy part of a cu array to another cu array.
+ *
+ * All values are in luma pixels.
+ *
+ * \param dst     destination array
+ * \param dst_x   x-coordinate of the left edge of the copied area in dst
+ * \param dst_y   y-coordinate of the top edge of the copied area in dst
+ * \param src     source array
+ * \param src_x   x-coordinate of the left edge of the copied area in src
+ * \param src_y   y-coordinate of the top edge of the copied area in src
+ * \param width   width of the area to copy
+ * \param height  height of the area to copy
+ */
+void kvz_cu_array_copy(cu_array_t* dst, int dst_x, int dst_y,
+                       const cu_array_t* src, int src_x, int src_y,
+                       int width, int height)
+{
+  // Convert values from pixel coordinates to array indices.
+  int src_stride = src->width >> 2;
+  int dst_stride = dst->width >> 2;
+  const cu_info_t* src_ptr = &src->data[(src_x >> 2) + (src_y >> 2) * src_stride];
+  cu_info_t* dst_ptr = &dst->data[(dst_x >> 2) + (dst_y >> 2) * dst_stride];
+
+  // Number of bytes to copy per row.
+  const size_t row_size = sizeof(cu_info_t) * (width >> 2);
+
+  width = MIN(width, MIN(src->width - src_x, dst->width - dst_x));
+  height = MIN(height, MIN(src->height - src_y, dst->height - dst_y));
+
+  assert(src_x + width <= src->width);
+  assert(src_y + height <= src->height);
+  assert(dst_x + width <= dst->width);
+  assert(dst_y + height <= dst->height);
+
+  for (int i = 0; i < (height >> 2); ++i) {
+    memcpy(dst_ptr, src_ptr, row_size);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+/**
+ * \brief Copy an lcu to a cu array.
+ *
+ * All values are in luma pixels.
+ *
+ * \param dst     destination array
+ * \param dst_x   x-coordinate of the left edge of the copied area in dst
+ * \param dst_y   y-coordinate of the top edge of the copied area in dst
+ * \param src     source lcu
+ */
+void kvz_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src)
+{
+  const int dst_stride = dst->width >> 2;
+  for (int y = 0; y < LCU_WIDTH; y += SCU_WIDTH) {
+    for (int x = 0; x < LCU_WIDTH; x += SCU_WIDTH) {
+      const cu_info_t *from_cu = LCU_GET_CU_AT_PX(src, x, y);
+      const int x_scu = (dst_x + x) >> 2;
+      const int y_scu = (dst_y + y) >> 2;
+      cu_info_t *to_cu = &dst->data[x_scu + y_scu * dst_stride];
+      memcpy(to_cu, from_cu, sizeof(*to_cu));
+    }
+  }
+}
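The cu.c hunk above replaces the hand-unrolled 64-bit row copies with a single `BLIT_COEFF_CASE` macro whose compile-time-constant width lets the compiler unroll or vectorize each `memcpy` itself. A standalone sketch of the same pattern, with simplified types and a hypothetical `coefficients_blit` stand-in rather than the kvazaar function:

```c
#include <string.h>

typedef short coeff_t;

/* One switch case per square size; the constant n lets the compiler
 * specialize the memcpy for that width. */
#define BLIT_COEFF_CASE(n) case n: \
  for (y = 0; y < (n); ++y) { \
    memcpy(&dst[y * dst_stride], &orig[y * orig_stride], (n) * sizeof(coeff_t)); \
  } \
  break;

static void coefficients_blit(const coeff_t *orig, coeff_t *dst,
                              unsigned width, unsigned height,
                              unsigned orig_stride, unsigned dst_stride)
{
  unsigned y;
  /* Square blocks hit a specialized case; everything else falls
   * through to the generic row-by-row loop. */
  unsigned nxn_width = (width == height) ? width : 0;
  switch (nxn_width) {
    BLIT_COEFF_CASE(4)
    BLIT_COEFF_CASE(8)
  default:
    for (y = 0; y < height; ++y) {
      memcpy(&dst[y * dst_stride], &orig[y * orig_stride], width * sizeof(coeff_t));
    }
    break;
  }
}
```

The design point is that the old `*(int64_t*)` casts baked the vector width into the source, while a fixed-size `memcpy` lets the compiler pick the widest legal load/store for the target.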
kvazaar-0.8.3.tar.gz/src/cu.h -> kvazaar-1.0.0.tar.gz/src/cu.h
Changed
@@ -26,16 +26,21 @@
  * Coding Unit data structure and related functions.
  */

-#include "global.h"
-
+#include "global.h" // IWYU pragma: keep
 #include "image.h"
+#include "kvazaar.h"

 //Cu stuff
 //////////////////////////////////////////////////////////////////////////
 // CONSTANTS
-typedef enum { CU_NOTSET = 0, CU_PCM, CU_SKIP, CU_SPLIT, CU_INTRA, CU_INTER } cu_type_t;
+typedef enum {
+  CU_NOTSET = 0,
+  CU_INTRA  = 1,
+  CU_INTER  = 2,
+  CU_PCM    = 3,
+} cu_type_t;

 typedef enum {
   SIZE_2Nx2N = 0,
@@ -106,46 +111,49 @@
   int y;
 } vector2d_t;

-typedef struct
-{
-  uint8_t y;
-  uint8_t u;
-  uint8_t v;
-} cu_cbf_t;
-
 /**
  * \brief Struct for CU info
  */
 typedef struct
 {
-  int8_t type;      //!< \brief block type, CU_INTER / CU_INTRA
-  int8_t depth;     //!< \brief depth / size of this block
-  int8_t part_size; //!< \brief Currently only 2Nx2N, TODO: AMP/SMP/NxN parts
-  int8_t tr_depth;  //!< \brief transform depth
-  int8_t coded;     //!< \brief flag to indicate this block is coded and reconstructed
-  int8_t skipped;   //!< \brief flag to indicate this block is skipped
-  int8_t merged;    //!< \brief flag to indicate this block is merged
-  int8_t merge_idx; //!< \brief merge index
-
-  cu_cbf_t cbf;
-
-  struct {
-    int8_t mode;
-    int8_t mode_chroma;
-    int8_t tr_skip;    //!< \brief transform skip flag
-  } intra[4];
-  struct {
-    double cost;
-    uint32_t bitcost;
-    int16_t mv[2][2];  // \brief Motion vectors for L0 and L1
-    int16_t mvd[2][2]; // \brief Motion vector differences for L0 and L1
-    uint8_t mv_cand[2]; // \brief selected MV candidate
-    uint8_t mv_ref[2]; // \brief Index of the encoder_control.ref array.
-    uint8_t mv_ref_coded[2]; // \brief Coded and corrected index of ref picture
-    uint8_t mv_dir;    // \brief Probably describes if mv_ref is L0, L1 or both (bi-pred)
-    int8_t mode;
-  } inter;
+  uint8_t type      : 2; //!< \brief block type, one of cu_type_t values
+  uint8_t depth     : 3; //!< \brief depth / size of this block
+  uint8_t part_size : 3; //!< \brief partition mode, one of part_mode_t values
+  uint8_t tr_depth  : 3; //!< \brief transform depth
+  uint8_t skipped   : 1; //!< \brief flag to indicate this block is skipped
+  uint8_t merged    : 1; //!< \brief flag to indicate this block is merged
+  uint8_t merge_idx : 3; //!< \brief merge index
+
+  uint16_t cbf;
+
+  union {
+    struct {
+      int8_t mode;
+      int8_t mode_chroma;
+      int8_t tr_skip; //!< \brief transform skip flag
+    } intra;
+    struct {
+      int16_t mv[2][2];     // \brief Motion vectors for L0 and L1
+      uint8_t mv_ref[2];    // \brief Index of the encoder_control.ref array.
+      uint8_t mv_cand0 : 3; // \brief selected MV candidate
+      uint8_t mv_cand1 : 3; // \brief selected MV candidate
+      uint8_t mv_dir   : 2; // \brief Probably describes if mv_ref is L0, L1 or both (bi-pred)
+    } inter;
+  };
 } cu_info_t;

+#define CU_GET_MV_CAND(cu_info_ptr, reflist) \
+  (((reflist) == 0) ? (cu_info_ptr)->inter.mv_cand0 : (cu_info_ptr)->inter.mv_cand1)
+
+#define CU_SET_MV_CAND(cu_info_ptr, reflist, value) \
+  do { \
+    if ((reflist) == 0) { \
+      (cu_info_ptr)->inter.mv_cand0 = (value); \
+    } else { \
+      (cu_info_ptr)->inter.mv_cand1 = (value); \
+    } \
+  } while (0)
+
 #define CHECKPOINT_CU(prefix_str, cu) CHECKPOINT(prefix_str " type=%d depth=%d part_size=%d tr_depth=%d coded=%d " \
   "skipped=%d merged=%d merge_idx=%d cbf.y=%d cbf.u=%d cbf.v=%d " \
   "intra[0].cost=%u intra[0].bitcost=%u intra[0].mode=%d intra[0].mode_chroma=%d intra[0].tr_skip=%d " \
@@ -164,12 +172,20 @@
   (cu).inter.mv_cand, (cu).inter.mv_ref, (cu).inter.mv_dir, (cu).inter.mode)

 typedef struct {
-  cu_info_t *data;  //!< \brief cu_info data
-  int32_t refcount; //!< \brief number of references in reflists to this cu_array
+  cu_info_t *data;  //!< \brief cu array
+  int32_t width;    //!< \brief width of the array in pixels
+  int32_t height;   //!< \brief height of the array in pixels
+  int32_t refcount; //!< \brief number of references to this cu_array
 } cu_array_t;

-cu_array_t * kvz_cu_array_alloc(int width_in_scu, int height_in_scu);
+cu_array_t * kvz_cu_array_alloc(int width, int height);
 int kvz_cu_array_free(cu_array_t *cua);
+cu_info_t* kvz_cu_array_at(cu_array_t *cua, unsigned x_px, unsigned y_px);
+const cu_info_t* kvz_cu_array_at_const(const cu_array_t *cua, unsigned x_px, unsigned y_px);
+void kvz_cu_array_copy(cu_array_t* dst, int dst_x, int dst_y,
+                       const cu_array_t* src, int src_x, int src_y,
+                       int width, int height);
+

 /**
  * \brief Return the 7 lowest-order bits of the pixel coordinate.
@@ -179,9 +195,10 @@
  */
 #define SUB_SCU(xy) ((xy) & (LCU_WIDTH - 1))

-#define LCU_CU_WIDTH 8
-#define LCU_T_CU_WIDTH 9
-#define LCU_CU_OFFSET 10
+#define LCU_CU_WIDTH 16
+#define LCU_T_CU_WIDTH (LCU_CU_WIDTH + 1)
+#define LCU_CU_OFFSET (LCU_T_CU_WIDTH + 1)
+#define SCU_WIDTH (LCU_WIDTH / LCU_CU_WIDTH)

 // Width from top left of the LCU, so +1 for ref buffer size.
 #define LCU_REF_PX_WIDTH (LCU_WIDTH + LCU_WIDTH / 2)
@@ -217,43 +234,34 @@
   lcu_coeff_t coeff;   //!< LCU coefficients

   /**
-   * A 9x9 CU array for the LCU, +1 CU.
-   * - Top reference CUs on row 0.
-   * - Left reference CUs on column 0.
-   * - All of LCUs CUs on 1:9, 1:9.
-   * - Top right reference CU on the last slot.
+   * A 17x17 CU array, plus the top right reference CU.
+   * - Top reference CUs at indices [0,16] (row 0).
+   * - Left reference CUs at indices 17*n where n is in [0,16] (column 0).
+   * - All CUs of this LCU at indices 17*y + x where x,y are in [1,16].
+   * - Top right reference CU at the last index.
+   *
+   * The figure below shows how the indices map to CU locations.
    * \verbatim
-      .-- left reference CUs
-      v
-      0 |  1  2  3  4  5  6  7  8 | 81 <-- top reference CUs
-   ----+-------------------------+----
-      9 | 10 11 12 13 14 15 16 17 |
-     18 | 19 20 21 22 23 24 25 26   <-- this LCU
-     27 | 28 29 30 31 32 33 34 35 |
-     36 | 37 38 39 40 41 42 43 44 |
-     45 | 46 47 48 49 50 51 52 53 |
-     54 | 55 56 57 58 59 60 61 62 |
-     63 | 64 65 66 67 68 69 70 71 |
-     72 | 73 74 75 76 77 78 79 80 |
-   ----+-------------------------+----
+       .-- left reference CUs
+       v
+       0 |   1   2   .   .   .  16 | 289 <-- top reference CUs
+    -----+--------------------+----
+      17 |  18  19   .   .   .  33 |
+      34 |  35  36   .   .   .  50   <-- this LCU
+       . |   .   .   .           . |
+       . |   .   .       .       . |
+       . |   .   .   .   .   .   . |
+     272 | 273 274   .   .   . 288 |
+    -----+--------------------+----
    \endverbatim
    */
-  cu_info_t cu[9*9+1];
+  cu_info_t cu[LCU_T_CU_WIDTH * LCU_T_CU_WIDTH + 1];
 } lcu_t;

-/**
- * \brief Return pointer to a given CU.
- *
- * \param lcu   pointer to the containing LCU
- * \param x_cu  x-index of the CU
- * \param y_cu  y-index of the CU
- * \return      pointer to the CU
- */
-#define LCU_GET_CU(lcu, x_cu, y_cu) \
-  (&(lcu)->cu[LCU_CU_OFFSET + (x_cu) + (y_cu) * LCU_T_CU_WIDTH])
+void kvz_cu_array_copy_from_lcu(cu_array_t* dst, int dst_x, int dst_y, const lcu_t *src);

 /**
  * \brief Return pointer to the top right reference CU.
@@ -269,18 +277,8 @@
  * \param y_px  y-coordinate relative to the upper left corner of the LCU
  * \return      pointer to the CU at coordinates (x_px, y_px)
  */
-#define LCU_GET_CU_AT_PX(lcu, x_px, y_px) LCU_GET_CU(lcu, (x_px) >> 3, (y_px) >> 3)
-
-/**
- * \brief Return pointer to a CU relative to the given CU.
- *
- * \param cu      pointer to a CU in the array at some location (x, y)
- * \param x_offs  x-offset
- * \param y_offs  y-offset
- * \return        pointer to the CU at (x + x_offs, y + y_offs)
- */
-#define CU_GET_CU(cu_array, x_offs, y_offs) \
-  (&cu_array[(x_offs) + (y_offs) * LCU_T_CU_WIDTH])
+#define LCU_GET_CU_AT_PX(lcu, x_px, y_px) \
+  (&(lcu)->cu[LCU_CU_OFFSET + ((x_px) >> 2) + ((y_px) >> 2) * LCU_T_CU_WIDTH])

 #define CHECKPOINT_LCU(prefix_str, lcu) do { \
   CHECKPOINT_CU(prefix_str " cu[0]", (lcu).cu[0]); \
@@ -372,40 +370,65 @@
                            unsigned width, unsigned height,
                            unsigned orig_stride, unsigned dst_stride);

-unsigned kvz_coefficients_calc_abs(const coeff_t *const buf, const int buf_stride,
-                                   const int width);
-
-
+#define NUM_CBF_DEPTHS 5
+static const uint16_t cbf_masks[NUM_CBF_DEPTHS] = { 0x1f, 0x0f, 0x07, 0x03, 0x1 };

 /**
  * Check if CBF in a given level >= depth is true.
  */
-static INLINE int cbf_is_set(uint8_t cbf_flags, int depth)
+static INLINE int cbf_is_set(uint16_t cbf, int depth, color_t plane)
 {
-  // Transform data for 4x4 blocks is stored at depths 4-8 for luma, so masks
-  // for those levels don't include the other ones.
-  static const uint8_t masks[8] = { 0xff, 0x7f, 0x3f, 0x1f, 0x8, 0x4, 0x2, 0x1 };
+  return (cbf & (cbf_masks[depth] << (NUM_CBF_DEPTHS * plane))) != 0;
+}

-  return (cbf_flags & masks[depth]) != 0;
+/**
+ * Check if CBF in a given level >= depth is true.
+ */
+static INLINE int cbf_is_set_any(uint16_t cbf, int depth)
+{
+  return cbf_is_set(cbf, depth, COLOR_Y) ||
+         cbf_is_set(cbf, depth, COLOR_U) ||
+         cbf_is_set(cbf, depth, COLOR_V);
 }

 /**
  * Set CBF in a level to true.
  */
-static INLINE void cbf_set(uint8_t *cbf_flags, int depth)
+static INLINE void cbf_set(uint16_t *cbf, int depth, color_t plane)
 {
   // Return value of the bit corresponding to the level.
-  *cbf_flags |= 1 << (7 - depth);
+  *cbf |= (0x10 >> depth) << (NUM_CBF_DEPTHS * plane);
+}
+
+/**
+ * Set CBF in a level to true if it is set at a lower level in any of
+ * the child_cbfs.
+ */
+static INLINE void cbf_set_conditionally(uint16_t *cbf, uint16_t child_cbfs[3], int depth, color_t plane)
+{
+  bool child_cbf_set = cbf_is_set(child_cbfs[0], depth + 1, plane) ||
+                       cbf_is_set(child_cbfs[1], depth + 1, plane) ||
+                       cbf_is_set(child_cbfs[2], depth + 1, plane);
+  if (child_cbf_set) {
+    cbf_set(cbf, depth, plane);
+  }
 }

 /**
  * Set CBF in a levels <= depth to false.
  */
-static INLINE void cbf_clear(uint8_t *cbf_flags, int depth)
+static INLINE void cbf_clear(uint16_t *cbf, int depth, color_t plane)
 {
-  static const uint8_t masks[8] = { 0xff, 0x7f, 0x3f, 0x1f, 0x8, 0x4, 0x2, 0x1 };
+  *cbf &= ~(cbf_masks[depth] << (NUM_CBF_DEPTHS * plane));
+}

-  *cbf_flags &= ~masks[depth];
+/**
+ * Copy cbf flags.
+ */
+static INLINE void cbf_copy(uint16_t *cbf, uint16_t src, color_t plane)
+{
+  cbf_clear(cbf, 0, plane);
+  *cbf |= src & (cbf_masks[0] << (NUM_CBF_DEPTHS * plane));
 }

 #define GET_SPLITDATA(CU,curDepth) ((CU)->depth > curDepth)
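The cu.h hunk above packs all coded-block flags into one `uint16_t`: five depth bits per color plane, where plane `p` occupies bits `[5p, 5p+4]` and depth `d` maps to bit `0x10 >> d` within the plane, so one mask test answers "is the CBF set at depth `d` or deeper". A self-contained sketch of that layout, mirroring `cbf_set`/`cbf_is_set`/`cbf_clear` in simplified form (the enum and helper names follow the diff, but this is an illustration, not the kvazaar header):

```c
#include <stdbool.h>
#include <stdint.h>

enum { NUM_CBF_DEPTHS = 5 };
typedef enum { COLOR_Y = 0, COLOR_U = 1, COLOR_V = 2 } color_t;

/* Mask for depth d covers the bits of depths d..4 within one plane. */
static const uint16_t cbf_masks[NUM_CBF_DEPTHS] = { 0x1f, 0x0f, 0x07, 0x03, 0x01 };

/* Set the CBF bit for one depth of one plane. */
static void cbf_set(uint16_t *cbf, int depth, color_t plane)
{
  *cbf |= (uint16_t)((0x10 >> depth) << (NUM_CBF_DEPTHS * plane));
}

/* True if the CBF is set at this depth or any deeper level. */
static bool cbf_is_set(uint16_t cbf, int depth, color_t plane)
{
  return (cbf & (cbf_masks[depth] << (NUM_CBF_DEPTHS * plane))) != 0;
}

/* Clear the CBF bits of depths >= depth for one plane. */
static void cbf_clear(uint16_t *cbf, int depth, color_t plane)
{
  *cbf &= (uint16_t)~(cbf_masks[depth] << (NUM_CBF_DEPTHS * plane));
}
```

The payoff is the one noted in the changelog: shrinking `cu_info_t` (three plane bytes collapse into part of one 16-bit word) keeps the now 4x4-granular CU arrays cache-friendly.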
kvazaar-0.8.3.tar.gz/src/encmain.c -> kvazaar-1.0.0.tar.gz/src/encmain.c
Changed
@@ -27,24 +27,24 @@
 /* The following two defines must be located before the inclusion of any system header files. */
 #define WINVER       0x0500
 #define _WIN32_WINNT 0x0500
-#include <io.h>    /* _setmode() */
 #include <fcntl.h> /* _O_BINARY */
+#include <io.h>    /* _setmode() */
 #endif

-#include "global.h"
-
-#include "kvazaar_internal.h"
-
 #include <math.h>
+#include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <time.h>
+#include <time.h> // IWYU pragma: keep for CLOCKS_PER_SEC

 #include "checkpoint.h"
-#include "global.h"
-#include "encoder.h"
 #include "cli.h"
+#include "encoder.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
+#include "kvazaar_internal.h"
+#include "threads.h"
 #include "yuv_io.h"

 /**
@@ -98,14 +98,15 @@
  */
 static void compute_psnr(const kvz_picture *const src,
                          const kvz_picture *const rec,
-                         double psnr[NUM_COLORS])
+                         double psnr[3])
 {
   assert(src->width  == rec->width);
   assert(src->height == rec->height);

   int32_t pixels = src->width * src->height;
+  int colors = rec->chroma_format == KVZ_CSP_400 ? 1 : 3;

-  for (int32_t c = 0; c < NUM_COLORS; ++c) {
+  for (int32_t c = 0; c < colors; ++c) {
     int32_t num_pixels = pixels;
     if (c != COLOR_Y) {
       num_pixels >>= 2;
@@ -123,16 +124,20 @@
 }

 typedef struct {
-  FILE* input;
+  // Mutexes for synchronization.
   pthread_mutex_t* input_mutex;
   pthread_mutex_t* main_thread_mutex;
-  kvz_picture **img_in;
-  cmdline_opts_t *opts;
-  encoder_control_t *encoder;
-  uint8_t padding_x;
-  uint8_t padding_y;
-  const kvz_api * api;
+
+  // Parameters passed from main thread to input thread.
+  FILE* input;
+  const kvz_api *api;
+  const cmdline_opts_t *opts;
+  const encoder_control_t *encoder;
+  const uint8_t padding_x;
+  const uint8_t padding_y;
+
+  // Picture and thread status passed from input thread to main thread.
+  kvz_picture *img_in;
   int retval;
 } input_handler_args;
@@ -160,6 +165,7 @@
   input_handler_args* args = (input_handler_args*)in_args;
   kvz_picture *frame_in = NULL;
+  int retval = RETVAL_RUNNING;
   int frames_read = 0;

   for (;;) {
@@ -169,49 +175,92 @@
     bool input_empty = !(args->opts->frames == 0 // number of frames to read is unknown
                          || frames_read < args->opts->frames); // not all frames have been read
     if (feof(args->input) || input_empty) {
-      goto exit_eof;
+      retval = RETVAL_EOF;
+      goto done;
     }

-    frame_in = args->api->picture_alloc(args->opts->config->width + args->padding_x, args->opts->config->height + args->padding_y);
-
+    enum kvz_chroma_format csp = KVZ_FORMAT2CSP(args->opts->config->input_format);
+    frame_in = args->api->picture_alloc_csp(csp,
+                                            args->opts->config->width + args->padding_x,
+                                            args->opts->config->height + args->padding_y);
+
     if (!frame_in) {
       fprintf(stderr, "Failed to allocate image.\n");
-      goto exit_failure;
+      retval = RETVAL_FAILURE;
+      goto done;
     }

-    if (!yuv_io_read(args->input, args->opts->config->width, args->opts->config->height, frame_in)) {
+    bool read_success = yuv_io_read(args->input,
+                                    args->opts->config->width,
+                                    args->opts->config->height,
+                                    args->encoder->cfg->input_bitdepth,
+                                    args->encoder->bitdepth,
+                                    frame_in);
+    if (!read_success) {
       // reading failed
       if (feof(args->input)) {
-        goto exit_eof;
+        // When looping input, re-open the file and re-read data.
+        if (args->opts->loop_input && args->input != stdin) {
+          fclose(args->input);
+          args->input = fopen(args->opts->input, "rb");
+          if (args->input == NULL)
+          {
+            fprintf(stderr, "Could not re-open input file, shutting down!\n");
+            retval = RETVAL_FAILURE;
+            goto done;
+          }
+          bool read_success = yuv_io_read(args->input,
+                                          args->opts->config->width,
+                                          args->opts->config->height,
+                                          args->encoder->cfg->input_bitdepth,
+                                          args->encoder->bitdepth,
+                                          frame_in);
+          if (!read_success) {
+            fprintf(stderr, "Could not re-open input file, shutting down!\n");
+            retval = RETVAL_FAILURE;
+            goto done;
+          }
+        } else {
+          retval = RETVAL_EOF;
+          goto done;
+        }
       } else {
         fprintf(stderr, "Failed to read a frame %d\n", frames_read);
-        goto exit_failure;
+        retval = RETVAL_FAILURE;
+        goto done;
       }
     }

+    frames_read++;
+
     if (args->encoder->cfg->source_scan_type != 0) {
       // Set source scan type for frame, so that it will be turned into fields.
       frame_in->interlacing = args->encoder->cfg->source_scan_type;
     }

-    args->img_in[frames_read & 1] = frame_in;
-    frame_in = NULL;
-
-    frames_read++;
-
-    // Wait until main thread is ready to receive input and then release main thread
+    // Wait until main thread is ready to receive the next frame.
     PTHREAD_LOCK(args->input_mutex);
+    args->img_in = frame_in;
+    args->retval = retval;
+    // Unlock main_thread_mutex to notify main thread that the new img_in
+    // and retval have been placed to args.
     PTHREAD_UNLOCK(args->main_thread_mutex);
+
+    frame_in = NULL;
   }

-exit_eof:
-  args->retval = RETVAL_EOF;
-  args->img_in[frames_read & 1] = NULL;
-exit_failure:
-  // Do some cleaning up
+done:
+  // Wait until main thread is ready to receive the next frame.
+  PTHREAD_LOCK(args->input_mutex);
+  args->img_in = NULL;
+  args->retval = retval;
+  // Unlock main_thread_mutex to notify main thread that the new img_in
+  // and retval have been placed to args.
+  PTHREAD_UNLOCK(args->main_thread_mutex);
+
+  // Do some cleaning up.
   args->api->picture_free(frame_in);
-  if (!args->retval) {
-    args->retval = RETVAL_FAILURE;
-  }
+  pthread_exit(NULL);
   return 0;
 }

@@ -239,13 +288,10 @@
   clock_t encoding_end_cpu_time;
   KVZ_CLOCK_T encoding_end_real_time;

-  // Stdin and stdout need to be binary for input and output to work.
+#ifdef _WIN32
   // Stderr needs to be text mode to convert \n to \r\n in Windows.
-  #ifdef _WIN32
-  _setmode( _fileno( stdin ),  _O_BINARY );
-  _setmode( _fileno( stdout ), _O_BINARY );
-  _setmode( _fileno( stderr ), _O_TEXT );
-  #endif
+  _setmode( _fileno( stderr ), _O_TEXT );
+#endif

   CHECKPOINTS_INIT();

@@ -254,11 +300,18 @@
   opts = cmdline_opts_parse(api, argc, argv);
   // If problem with command line options, print banner and shutdown.
   if (!opts) {
-    print_version();
-    print_help();
+    print_usage();
     goto exit_failure;
   }

+  if (opts->version) {
+    print_version();
+    goto done;
+  }
+
+  if (opts->help) {
+    print_help();
+    goto done;
+  }

   input = open_input_file(opts->input);
   if (input == NULL) {
@@ -272,6 +325,16 @@
     goto exit_failure;
   }

+#ifdef _WIN32
+  // Set stdin and stdout to binary for pipes.
+  if (input == stdin) {
+    _setmode(_fileno(stdin), _O_BINARY);
+  }
+  if (output == stdout) {
+    _setmode(_fileno(stdout), _O_BINARY);
+  }
+#endif
+
   if (opts->debug != NULL) {
     recout = open_output_file(opts->debug);
     if (recout == NULL) {
@@ -305,7 +368,6 @@
     encoding_start_cpu_time = clock();

     uint64_t bitstream_length = 0;
-    uint32_t frames_read = 0;
     uint32_t frames_done = 0;
     double psnr_sum[3] = { 0.0, 0.0, 0.0 };

@@ -322,18 +384,22 @@
     PTHREAD_LOCK(&input_mutex);

     // Give arguments via struct to the input thread
-    input_handler_args in_args;
-    kvz_picture *img_in[2] = { NULL };
-    in_args.input = input;
-    in_args.img_in = img_in;
-    in_args.main_thread_mutex = &main_thread_mutex;
+    input_handler_args in_args = {
+      .input_mutex = NULL,
+      .main_thread_mutex = NULL,
+
+      .input = input,
+      .api = api,
+      .opts = opts,
+      .encoder = encoder,
+      .padding_x = padding_x,
+      .padding_y = padding_y,
+
+      .img_in = NULL,
+      .retval = RETVAL_RUNNING,
+    };
     in_args.input_mutex = &input_mutex;
-    in_args.opts = opts;
-    in_args.encoder = encoder;
-    in_args.padding_x = padding_x;
-    in_args.padding_y = padding_y;
-    in_args.api = api;
-    in_args.retval = RETVAL_RUNNING;
+    in_args.main_thread_mutex = &main_thread_mutex;

     if (pthread_create(&input_thread, NULL, input_read_thread, (void*)&in_args) != 0) {
       fprintf(stderr, "pthread_create failed!\n");
@@ -343,16 +409,20 @@
     kvz_picture *cur_in_img;
     for (;;) {
-      // Skip mutex locking when thread is no longer in run state
+      // Skip mutex locking if the input thread does not exist.
       if (in_args.retval == RETVAL_RUNNING) {
-        // Wait for input to be read
-        // unlock the input thread to be able to continue to the next picture
+        // Unlock input_mutex so that the input thread can write the new
+        // img_in and retval to in_args.
         PTHREAD_UNLOCK(&input_mutex);
+        // Wait until the input thread has updated in_args.
         PTHREAD_LOCK(&main_thread_mutex);
-      }
-      cur_in_img = img_in[frames_read & 1];
-      img_in[frames_read & 1] = NULL;
-      frames_read++;
+
+        cur_in_img = in_args.img_in;
+        in_args.img_in = NULL;
+
+      } else {
+        cur_in_img = NULL;
+      }

       if (in_args.retval == EXIT_FAILURE) {
         goto exit_failure;
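The encmain.c rework above turns the two mutexes into a one-slot handoff: the input thread blocks on `input_mutex` until the main thread is ready, publishes `img_in`/`retval`, then unlocks `main_thread_mutex` to wake the main thread. Note that POSIX only guarantees cross-thread signaling like this for semaphores (a normal mutex must be unlocked by the thread that locked it), so this sketch expresses the same rendezvous with a semaphore pair. All names here are illustrative, not kvazaar's:

```c
#include <pthread.h>
#include <semaphore.h>

/* One-slot mailbox: "ready" means the consumer wants the next item,
 * "published" means the producer has placed one in the slot. */
typedef struct {
  sem_t ready;
  sem_t published;
  int item;
} mailbox_t;

static void *producer(void *arg)
{
  mailbox_t *mb = arg;
  for (int i = 1; i <= 3; ++i) {
    sem_wait(&mb->ready);      /* wait until the consumer is ready */
    mb->item = i;              /* publish exactly one item */
    sem_post(&mb->published);  /* wake the consumer */
  }
  return NULL;
}

static int consume_three(void)
{
  mailbox_t mb;
  sem_init(&mb.ready, 0, 0);
  sem_init(&mb.published, 0, 0);

  pthread_t t;
  pthread_create(&t, NULL, producer, &mb);

  int sum = 0;
  for (int i = 0; i < 3; ++i) {
    sem_post(&mb.ready);      /* announce readiness for the next item */
    sem_wait(&mb.published);  /* block until the producer has published */
    sum += mb.item;
  }
  pthread_join(t, NULL);
  sem_destroy(&mb.ready);
  sem_destroy(&mb.published);
  return sum;
}
```

The blocking `sem_wait` is what the changelog's "main thread now sleeps instead of busylooping" bugfix is about: the waiting side parks in the kernel until the other side posts.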
kvazaar-1.0.0.tar.gz/src/encode_coding_tree.c
Added
@@ -0,0 +1,1083 @@
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2015 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+#include "encode_coding_tree.h"
+
+#include "cabac.h"
+#include "context.h"
+#include "cu.h"
+#include "encoder.h"
+#include "extras/crypto.h"
+#include "imagelist.h"
+#include "inter.h"
+#include "intra.h"
+#include "kvazaar.h"
+#include "kvz_math.h"
+#include "tables.h"
+#include "videoframe.h"
+
+/**
+ * \brief Encode (X,Y) position of the last significant coefficient
+ *
+ * \param lastpos_x   X component of last coefficient
+ * \param lastpos_y   Y component of last coefficient
+ * \param width       Block width
+ * \param height      Block height
+ * \param type        plane type / luminance or chrominance
+ * \param scan        scan type (diag, hor, ver)
+ *
+ * This method encodes the X and Y component within a block of the last
+ * significant coefficient.
+ */
+static void encode_last_significant_xy(encoder_state_t * const state,
+                                       uint8_t lastpos_x, uint8_t lastpos_y,
+                                       uint8_t width, uint8_t height,
+                                       uint8_t type, uint8_t scan)
+{
+  cabac_data_t * const cabac = &state->cabac;
+
+  const int index = kvz_math_floor_log2(width) - 2;
+  uint8_t ctx_offset = type ? 0 : (index * 3 + (index + 1) / 4);
+  uint8_t shift = type ? index : (index + 3) / 4;
+
+  cabac_ctx_t *base_ctx_x = (type ? cabac->ctx.cu_ctx_last_x_chroma : cabac->ctx.cu_ctx_last_x_luma);
+  cabac_ctx_t *base_ctx_y = (type ? cabac->ctx.cu_ctx_last_y_chroma : cabac->ctx.cu_ctx_last_y_luma);
+
+  if (scan == SCAN_VER) {
+    SWAP(lastpos_x, lastpos_y, uint8_t);
+  }
+
+  const int group_idx_x = g_group_idx[lastpos_x];
+  const int group_idx_y = g_group_idx[lastpos_y];
+
+  // x prefix
+  for (int last_x = 0; last_x < group_idx_x; last_x++) {
+    cabac->cur_ctx = &base_ctx_x[ctx_offset + (last_x >> shift)];
+    CABAC_BIN(cabac, 1, "last_sig_coeff_x_prefix");
+  }
+  if (group_idx_x < g_group_idx[width - 1]) {
+    cabac->cur_ctx = &base_ctx_x[ctx_offset + (group_idx_x >> shift)];
+    CABAC_BIN(cabac, 0, "last_sig_coeff_x_prefix");
+  }
+
+  // y prefix
+  for (int last_y = 0; last_y < group_idx_y; last_y++) {
+    cabac->cur_ctx = &base_ctx_y[ctx_offset + (last_y >> shift)];
+    CABAC_BIN(cabac, 1, "last_sig_coeff_y_prefix");
+  }
+  if (group_idx_y < g_group_idx[height - 1]) {
+    cabac->cur_ctx = &base_ctx_y[ctx_offset + (group_idx_y >> shift)];
+    CABAC_BIN(cabac, 0, "last_sig_coeff_y_prefix");
+  }
+
+  // last_sig_coeff_x_suffix
+  if (group_idx_x > 3) {
+    const int suffix = lastpos_x - g_min_in_group[group_idx_x];
+    const int bits = (group_idx_x - 2) / 2;
+    CABAC_BINS_EP(cabac, suffix, bits, "last_sig_coeff_x_suffix");
+  }
+
+  // last_sig_coeff_y_suffix
+  if (group_idx_y > 3) {
+    const int suffix = lastpos_y - g_min_in_group[group_idx_y];
+    const int bits = (group_idx_y - 2) / 2;
+    CABAC_BINS_EP(cabac, suffix, bits, "last_sig_coeff_y_suffix");
+  }
+}
+
+void kvz_encode_coeff_nxn(encoder_state_t * const state,
+                          coeff_t *coeff,
+                          uint8_t width,
+                          uint8_t type,
+                          int8_t scan_mode,
+                          int8_t tr_skip)
+{
+  const encoder_control_t * const encoder = state->encoder_control;
+  cabac_data_t * const cabac = &state->cabac;
+  int c1 = 1;
+  uint8_t last_coeff_x = 0;
+  uint8_t last_coeff_y = 0;
+  int32_t i;
+  uint32_t sig_coeffgroup_flag[8 * 8] = { 0 };
+
+  int8_t be_valid = encoder->sign_hiding;
+  int32_t scan_pos_sig;
+  uint32_t go_rice_param = 0;
+  uint32_t blk_pos, pos_y, pos_x, sig, ctx_sig;
+
+  // CONSTANTS
+  const uint32_t num_blk_side    = width >> TR_MIN_LOG2_SIZE;
+  const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2;
+  const uint32_t *scan           =
+    kvz_g_sig_last_scan[scan_mode][log2_block_size - 1];
+  const uint32_t *scan_cg        = g_sig_last_scan_cg[log2_block_size - 2][scan_mode];
+
+  // Init base contexts according to block type
+  cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.cu_sig_coeff_group_model[type]);
+  cabac_ctx_t *baseCtx              = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0]) :
+                                                    &(cabac->ctx.cu_sig_model_chroma[0]);
+
+  // Scan all coeff groups to find out which of them have coeffs.
+  // Populate sig_coeffgroup_flag with that info.
+
+  unsigned sig_cg_cnt = 0;
+  for (int cg_y = 0; cg_y < width / 4; ++cg_y) {
+    for (int cg_x = 0; cg_x < width / 4; ++cg_x) {
+      unsigned cg_pos = cg_y * width * 4 + cg_x * 4;
+      for (int coeff_row = 0; coeff_row < 4; ++coeff_row) {
+        // Load four 16-bit coeffs and see if any of them are non-zero.
+        unsigned coeff_pos = cg_pos + coeff_row * width;
+        uint64_t four_coeffs = *(uint64_t*)(&coeff[coeff_pos]);
+        if (four_coeffs) {
+          ++sig_cg_cnt;
+          unsigned cg_pos_y = (cg_pos >> log2_block_size) >> TR_MIN_LOG2_SIZE;
+          unsigned cg_pos_x = (cg_pos & (width - 1)) >> TR_MIN_LOG2_SIZE;
+          sig_coeffgroup_flag[cg_pos_x + cg_pos_y * num_blk_side] = 1;
+          break;
+        }
+      }
+    }
+  }
+
+  // Rest of the code assumes at least one non-zero coeff.
+  assert(sig_cg_cnt > 0);
+
+  // Find the last coeff group by going backwards in scan order.
+  unsigned scan_cg_last = num_blk_side * num_blk_side - 1;
+  while (!sig_coeffgroup_flag[scan_cg[scan_cg_last]]) {
+    --scan_cg_last;
+  }
+
+  // Find the last coeff by going backwards in scan order.
+  unsigned scan_pos_last = scan_cg_last * 16 + 15;
+  while (!coeff[scan[scan_pos_last]]) {
+    --scan_pos_last;
+  }
+
+  int pos_last = scan[scan_pos_last];
+
+  // transform skip flag
+  if(width == 4 && encoder->trskip_enable) {
+    cabac->cur_ctx = (type == 0) ? &(cabac->ctx.transform_skip_model_luma) : &(cabac->ctx.transform_skip_model_chroma);
+    CABAC_BIN(cabac, tr_skip, "transform_skip_flag");
+  }
+
+  last_coeff_x = pos_last & (width - 1);
+  last_coeff_y = (uint8_t)(pos_last >> log2_block_size);
+
+  // Code last_coeff_x and last_coeff_y
+  encode_last_significant_xy(state, last_coeff_x, last_coeff_y, width, width,
+                             type, scan_mode);
+
+  scan_pos_sig = scan_pos_last;
+
+  // significant_coeff_flag
+  for (i = scan_cg_last; i >= 0; i--) {
+    int32_t sub_pos        = i << 4; // LOG2_SCAN_SET_SIZE;
+    int32_t abs_coeff[16];
+    int32_t cg_blk_pos     = scan_cg[i];
+    int32_t cg_pos_y       = cg_blk_pos / num_blk_side;
+    int32_t cg_pos_x       = cg_blk_pos - (cg_pos_y * num_blk_side);
+
+    uint32_t coeff_signs   = 0;
+    int32_t last_nz_pos_in_cg  = -1;
+    int32_t first_nz_pos_in_cg = 16;
+    int32_t num_non_zero       = 0;
+    go_rice_param = 0;
+
+    if (scan_pos_sig == scan_pos_last) {
+      abs_coeff[0] = abs(coeff[pos_last]);
+      coeff_signs  = (coeff[pos_last] < 0);
+      num_non_zero = 1;
+      last_nz_pos_in_cg  = scan_pos_sig;
+      first_nz_pos_in_cg = scan_pos_sig;
+      scan_pos_sig--;
+    }
+
+    if (i == scan_cg_last || i == 0) {
+      sig_coeffgroup_flag[cg_blk_pos] = 1;
+    } else {
+      uint32_t sig_coeff_group = (sig_coeffgroup_flag[cg_blk_pos] != 0);
+      uint32_t ctx_sig = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
+                                                         cg_pos_y, width);
+      cabac->cur_ctx = &base_coeff_group_ctx[ctx_sig];
+      CABAC_BIN(cabac, sig_coeff_group, "coded_sub_block_flag");
+    }
+
+    if (sig_coeffgroup_flag[cg_blk_pos]) {
+      int32_t pattern_sig_ctx = kvz_context_calc_pattern_sig_ctx(sig_coeffgroup_flag,
+                                                                 cg_pos_x, cg_pos_y, width);
+
+      for (; scan_pos_sig >= sub_pos; scan_pos_sig--) {
+        blk_pos = scan[scan_pos_sig];
+        pos_y   = blk_pos >> log2_block_size;
+        pos_x   = blk_pos - (pos_y << log2_block_size);
+        sig    = (coeff[blk_pos] != 0) ? 1 : 0;
+
+        if (scan_pos_sig > sub_pos || i == 0 || num_non_zero) {
+          ctx_sig = kvz_context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y,
+                                                log2_block_size, type);
+          cabac->cur_ctx = &baseCtx[ctx_sig];
+          CABAC_BIN(cabac, sig, "sig_coeff_flag");
+        }
+
+        if (sig) {
+          abs_coeff[num_non_zero] = abs(coeff[blk_pos]);
+          coeff_signs             = 2 * coeff_signs + (coeff[blk_pos] < 0);
+          num_non_zero++;
+
+          if (last_nz_pos_in_cg == -1) {
+            last_nz_pos_in_cg = scan_pos_sig;
+          }
+
+          first_nz_pos_in_cg = scan_pos_sig;
+        }
+      }
+    } else {
+      scan_pos_sig = sub_pos - 1;
+    }
+
+    if (num_non_zero > 0) {
+      bool sign_hidden = last_nz_pos_in_cg - first_nz_pos_in_cg >= 4 /* SBH_THRESHOLD */
+                         && !encoder->cfg->lossless;
+      uint32_t ctx_set  = (i > 0 && type == 0) ? 2 : 0;
+      cabac_ctx_t *base_ctx_mod;
+      int32_t num_c1_flag, first_c2_flag_idx, idx, first_coeff2;
+
+      if (c1 == 0) {
+        ctx_set++;
+      }
+
+      c1 = 1;
+
+      base_ctx_mod = (type == 0) ? &(cabac->ctx.cu_one_model_luma[4 * ctx_set]) :
+                                   &(cabac->ctx.cu_one_model_chroma[4 * ctx_set]);
+      num_c1_flag = MIN(num_non_zero, C1FLAG_NUMBER);
+      first_c2_flag_idx = -1;
+
+      for (idx = 0; idx < num_c1_flag; idx++) {
+        uint32_t symbol = (abs_coeff[idx] > 1) ? 1 : 0;
+        cabac->cur_ctx = &base_ctx_mod[c1];
+        CABAC_BIN(cabac, symbol, "coeff_abs_level_greater1_flag");
+
+        if (symbol) {
+          c1 = 0;
+
+          if (first_c2_flag_idx == -1) {
+            first_c2_flag_idx = idx;
+          }
+        } else if ((c1 < 3) && (c1 > 0)) {
+          c1++;
+        }
+      }
+
+      if (c1 == 0) {
+        base_ctx_mod = (type == 0) ?
&(cabac->ctx.cu_abs_model_luma[ctx_set]) : + &(cabac->ctx.cu_abs_model_chroma[ctx_set]); + + if (first_c2_flag_idx != -1) { + uint8_t symbol = (abs_coeff[first_c2_flag_idx] > 2) ? 1 : 0; + cabac->cur_ctx = &base_ctx_mod[0]; + CABAC_BIN(cabac, symbol, "coeff_abs_level_greater2_flag"); + } + } + if (be_valid && sign_hidden) { + coeff_signs = coeff_signs >> 1; + if(!state->cabac.only_count) + if (state->encoder_control->cfg->crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) { + coeff_signs = coeff_signs ^ ff_get_key(&state->tile->dbs_g, num_non_zero-1); + } + CABAC_BINS_EP(cabac, coeff_signs , (num_non_zero - 1), "coeff_sign_flag"); + } else { + if(!state->cabac.only_count) + if (state->encoder_control->cfg->crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) + coeff_signs = coeff_signs ^ ff_get_key(&state->tile->dbs_g, num_non_zero); + CABAC_BINS_EP(cabac, coeff_signs, num_non_zero, "coeff_sign_flag"); + } + + if (c1 == 0 || num_non_zero > C1FLAG_NUMBER) { + first_coeff2 = 1; + + for (idx = 0; idx < num_non_zero; idx++) { + int32_t base_level = (idx < C1FLAG_NUMBER) ? 
(2 + first_coeff2) : 1; + + if (abs_coeff[idx] >= base_level) { + if(!state->cabac.only_count) { + if (state->encoder_control->cfg->crypto_features & KVZ_CRYPTO_TRANSF_COEFFS) + kvz_cabac_write_coeff_remain_encry(state, cabac, abs_coeff[idx] - base_level, go_rice_param, base_level); + else + kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param); + } else + kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param); + + if (abs_coeff[idx] > 3 * (1 << go_rice_param)) { + go_rice_param = MIN(go_rice_param + 1, 4); + } + } + + if (abs_coeff[idx] >= 2) { + first_coeff2 = 0; + } + } + } + } + } +} + +static void encode_transform_unit(encoder_state_t * const state, + int x_pu, int y_pu, int depth) +{ + assert(depth >= 1 && depth <= MAX_PU_DEPTH); + + const videoframe_t * const frame = state->tile->frame; + const uint8_t width = LCU_WIDTH >> depth; + const uint8_t width_c = (depth == MAX_PU_DEPTH ? width : width / 2); + + const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, x_pu << 2, y_pu << 2); + + const int x_cu = x_pu / 2; + const int y_cu = y_pu / 2; + const cu_info_t *cur_cu = kvz_videoframe_get_cu_const(frame, x_cu, y_cu); + + coeff_t coeff_y[LCU_WIDTH*LCU_WIDTH+1]; + coeff_t coeff_u[LCU_WIDTH*LCU_WIDTH>>2]; + coeff_t coeff_v[LCU_WIDTH*LCU_WIDTH>>2]; + int32_t coeff_stride = frame->width; + + int8_t scan_idx = kvz_get_scan_order(cur_pu->type, cur_pu->intra.mode, depth); + + int cbf_y = cbf_is_set(cur_pu->cbf, depth, COLOR_Y); + + if (cbf_y) { + int x = x_pu * (LCU_WIDTH >> MAX_PU_DEPTH); + int y = y_pu * (LCU_WIDTH >> MAX_PU_DEPTH); + coeff_t *orig_pos = &frame->coeff_y[x + y * frame->width]; + for (y = 0; y < width; y++) { + for (x = 0; x < width; x++) { + coeff_y[x+y*width] = orig_pos[x]; + } + orig_pos += coeff_stride; + } + } + + // CoeffNxN + // Residual Coding + if (cbf_y) { + kvz_encode_coeff_nxn(state, coeff_y, width, 0, scan_idx, cur_pu->intra.tr_skip); + } + + if (depth == MAX_DEPTH + 1 && !(x_pu % 
2 && y_pu % 2)) {
+    // For size 4x4 luma transform the corresponding chroma transforms are
+    // also of size 4x4 covering 8x8 luma pixels. The residual is coded
+    // in the last transform unit so for the other ones, don't do anything.
+    return;
+  }
+
+  bool chroma_cbf_set = cbf_is_set(cur_cu->cbf, depth, COLOR_U) ||
+                        cbf_is_set(cur_cu->cbf, depth, COLOR_V);
+  if (chroma_cbf_set) {
+    int x, y;
+    coeff_t *orig_pos_u, *orig_pos_v;
+
+    if (depth <= MAX_DEPTH) {
+      x = x_pu * (LCU_WIDTH >> (MAX_PU_DEPTH + 1));
+      y = y_pu * (LCU_WIDTH >> (MAX_PU_DEPTH + 1));
+    } else {
+      // for 4x4 select top left pixel of the CU.
+      x = x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1));
+      y = y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1));
+    }
+    orig_pos_u = &frame->coeff_u[x + y * (frame->width >> 1)];
+    orig_pos_v = &frame->coeff_v[x + y * (frame->width >> 1)];
+    for (y = 0; y < (width_c); y++) {
+      for (x = 0; x < (width_c); x++) {
+        coeff_u[x+y*(width_c)] = orig_pos_u[x];
+        coeff_v[x+y*(width_c)] = orig_pos_v[x];
+      }
+      orig_pos_u += coeff_stride>>1;
+      orig_pos_v += coeff_stride>>1;
+    }
+
+    scan_idx = kvz_get_scan_order(cur_cu->type, cur_cu->intra.mode_chroma, depth);
+
+    if (cbf_is_set(cur_cu->cbf, depth, COLOR_U)) {
+      kvz_encode_coeff_nxn(state, coeff_u, width_c, 2, scan_idx, 0);
+    }
+
+    if (cbf_is_set(cur_cu->cbf, depth, COLOR_V)) {
+      kvz_encode_coeff_nxn(state, coeff_v, width_c, 2, scan_idx, 0);
+    }
+  }
+}
+
+/**
+ * \param encoder
+ * \param x_pu Prediction units' x coordinate.
+ * \param y_pu Prediction units' y coordinate.
+ * \param depth Depth from LCU.
+ * \param tr_depth Depth from last CU.
+ * \param parent_coeff_u What was signaled at previous level for cbf_cb.
+ * \param parent_coeff_v What was signaled at previous level for cbf_cr.
+ */ +static void encode_transform_coeff(encoder_state_t * const state, + int32_t x_pu, + int32_t y_pu, + int8_t depth, + int8_t tr_depth, + uint8_t parent_coeff_u, + uint8_t parent_coeff_v) +{ + cabac_data_t * const cabac = &state->cabac; + const videoframe_t * const frame = state->tile->frame; + + const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, x_pu << 2, y_pu << 2); + + const int32_t x_cu = x_pu / 2; + const int32_t y_cu = y_pu / 2; + const cu_info_t *cur_cu = kvz_videoframe_get_cu_const(frame, x_cu, y_cu); + + // NxN signifies implicit transform split at the first transform level. + // There is a similar implicit split for inter, but it is only used when + // transform hierarchy is not in use. + int intra_split_flag = (cur_cu->type == CU_INTRA && cur_cu->part_size == SIZE_NxN); + + // The implicit split by intra NxN is not counted towards max_tr_depth. + int tr_depth_intra = state->encoder_control->tr_depth_intra; + int max_tr_depth = (cur_cu->type == CU_INTRA ? tr_depth_intra + intra_split_flag : TR_DEPTH_INTER); + + int8_t split = (cur_cu->tr_depth > depth); + + const int cb_flag_y = cbf_is_set(cur_pu->cbf, depth, COLOR_Y); + const int cb_flag_u = cbf_is_set(cur_cu->cbf, depth, COLOR_U); + const int cb_flag_v = cbf_is_set(cur_cu->cbf, depth, COLOR_V); + + // The split_transform_flag is not signaled when: + // - transform size is greater than 32 (depth == 0) + // - transform size is 4 (depth == MAX_PU_DEPTH) + // - transform depth is max + // - cu is intra NxN and it's the first split + if (depth > 0 && + depth < MAX_PU_DEPTH && + tr_depth < max_tr_depth && + !(intra_split_flag && tr_depth == 0)) + { + cabac->cur_ctx = &(cabac->ctx.trans_subdiv_model[5 - ((kvz_g_convert_to_bit[LCU_WIDTH] + 2) - depth)]); + CABAC_BIN(cabac, split, "split_transform_flag"); + } + + // Chroma cb flags are not signaled when one of the following: + // - transform size is 4 (2x2 chroma transform doesn't exist) + // - they have already been signaled to 0 previously + 
// When they are not present they are inferred to be 0, except for size 4 + // when the flags from previous level are used. + if (depth < MAX_PU_DEPTH && state->encoder_control->chroma_format != KVZ_CSP_400) { + cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_chroma[tr_depth]); + if (tr_depth == 0 || parent_coeff_u) { + CABAC_BIN(cabac, cb_flag_u, "cbf_cb"); + } + if (tr_depth == 0 || parent_coeff_v) { + CABAC_BIN(cabac, cb_flag_v, "cbf_cr"); + } + } + + if (split) { + uint8_t pu_offset = 1 << (MAX_PU_DEPTH - (depth + 1)); + encode_transform_coeff(state, x_pu, y_pu, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v); + encode_transform_coeff(state, x_pu + pu_offset, y_pu, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v); + encode_transform_coeff(state, x_pu, y_pu + pu_offset, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v); + encode_transform_coeff(state, x_pu + pu_offset, y_pu + pu_offset, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v); + return; + } + + // Luma coded block flag is signaled when one of the following: + // - prediction mode is intra + // - transform depth > 0 + // - we have chroma coefficients at this level + // When it is not present, it is inferred to be 1. 
+ if(cur_cu->type == CU_INTRA || tr_depth > 0 || cb_flag_u || cb_flag_v) { + cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_luma[!tr_depth]); + CABAC_BIN(cabac, cb_flag_y, "cbf_luma"); + } + + if (cb_flag_y | cb_flag_u | cb_flag_v) { + encode_transform_unit(state, x_pu, y_pu, depth); + } +} + +static void encode_inter_prediction_unit(encoder_state_t * const state, + cabac_data_t * const cabac, + const cu_info_t * const cur_cu, + int x, int y, int width, int height, + int depth) +{ + // Mergeflag + int16_t num_cand = 0; + cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model); + CABAC_BIN(cabac, cur_cu->merged, "MergeFlag"); + num_cand = MRG_MAX_NUM_CANDS; + if (cur_cu->merged) { //merge + if (num_cand > 1) { + int32_t ui; + for (ui = 0; ui < num_cand - 1; ui++) { + int32_t symbol = (ui != cur_cu->merge_idx); + if (ui == 0) { + cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model); + CABAC_BIN(cabac, symbol, "MergeIndex"); + } else { + CABAC_BIN_EP(cabac,symbol,"MergeIndex"); + } + if (symbol == 0) break; + } + } + } else { + uint32_t ref_list_idx; + uint32_t j; + int ref_list[2] = { 0, 0 }; + for (j = 0; j < state->frame->ref->used_size; j++) { + if (state->frame->ref->pocs[j] < state->frame->poc) { + ref_list[0]++; + } else { + ref_list[1]++; + } + } + + // Void TEncSbac::codeInterDir( TComDataCU* pcCU, UInt uiAbsPartIdx ) + if (state->frame->slicetype == KVZ_SLICE_B) + { + // Code Inter Dir + uint8_t inter_dir = cur_cu->inter.mv_dir-1; + uint8_t ctx = depth; + + + if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 8) + { + cabac->cur_ctx = &(cabac->ctx.inter_dir[ctx]); + CABAC_BIN(cabac, (inter_dir == 2), "inter_pred_idc"); + } + if (inter_dir < 2) + { + cabac->cur_ctx = &(cabac->ctx.inter_dir[4]); + CABAC_BIN(cabac, inter_dir, "inter_pred_idc"); + } + } + + for (ref_list_idx = 0; ref_list_idx < 2; ref_list_idx++) { + if (cur_cu->inter.mv_dir & (1 << ref_list_idx)) { + if (ref_list[ref_list_idx] > 1) { + // parseRefFrmIdx + int32_t ref_frame = 
state->frame->refmap[cur_cu->inter.mv_ref[ref_list_idx]].idx; + + cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]); + CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX"); + + if (ref_frame > 0) { + int32_t i; + int32_t ref_num = ref_list[ref_list_idx] - 2; + + cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]); + ref_frame--; + + for (i = 0; i < ref_num; ++i) { + const uint32_t symbol = (i == ref_frame) ? 0 : 1; + + if (i == 0) { + CABAC_BIN(cabac, symbol, "ref_idx_lX"); + } else { + CABAC_BIN_EP(cabac, symbol, "ref_idx_lX"); + } + if (symbol == 0) break; + } + } + } + + if (!(/*pcCU->getSlice()->getMvdL1ZeroFlag() &&*/ state->frame->ref_list == REF_PIC_LIST_1 && cur_cu->inter.mv_dir == 3)) { + + int16_t mv_cand[2][2]; + kvz_inter_get_mv_cand_cua( + state, + x, y, width, height, + mv_cand, cur_cu, ref_list_idx); + + uint8_t cu_mv_cand = CU_GET_MV_CAND(cur_cu, ref_list_idx); + + const int32_t mvd_hor = cur_cu->inter.mv[ref_list_idx][0] - mv_cand[cu_mv_cand][0]; + const int32_t mvd_ver = cur_cu->inter.mv[ref_list_idx][1] - mv_cand[cu_mv_cand][1]; + const int8_t hor_abs_gr0 = mvd_hor != 0; + const int8_t ver_abs_gr0 = mvd_ver != 0; + const uint32_t mvd_hor_abs = abs(mvd_hor); + const uint32_t mvd_ver_abs = abs(mvd_ver); + + + cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[0]); + CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor"); + CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver"); + + cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[1]); + + if (hor_abs_gr0) { + CABAC_BIN(cabac, (mvd_hor_abs>1), "abs_mvd_greater1_flag_hor"); + } + + if (ver_abs_gr0) { + CABAC_BIN(cabac, (mvd_ver_abs>1), "abs_mvd_greater1_flag_ver"); + } + + if (hor_abs_gr0) { + if (mvd_hor_abs > 1) { + kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs-2, 1); + } + uint32_t mvd_hor_sign = (mvd_hor>0)?0:1; + if(!state->cabac.only_count) + if (state->encoder_control->cfg->crypto_features & KVZ_CRYPTO_MV_SIGNS) + mvd_hor_sign = mvd_hor_sign^ff_get_key(&state->tile->dbs_g, 1); + 
CABAC_BIN_EP(cabac, mvd_hor_sign, "mvd_sign_flag_hor"); + } + if (ver_abs_gr0) { + if (mvd_ver_abs > 1) { + kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs-2, 1); + } + uint32_t mvd_ver_sign = (mvd_ver>0)?0:1; + if(!state->cabac.only_count) + if (state->encoder_control->cfg->crypto_features & KVZ_CRYPTO_MV_SIGNS) + mvd_ver_sign = mvd_ver_sign^ff_get_key(&state->tile->dbs_g, 1); + CABAC_BIN_EP(cabac, mvd_ver_sign, "mvd_sign_flag_ver"); + } + } + + // Signal which candidate MV to use + kvz_cabac_write_unary_max_symbol(cabac, + cabac->ctx.mvp_idx_model, + CU_GET_MV_CAND(cur_cu, ref_list_idx), + 1, + AMVP_MAX_NUM_CANDS - 1); + } + } // for ref_list + } // if !merge +} + +static void encode_intra_coding_unit(encoder_state_t * const state, + cabac_data_t * const cabac, + const cu_info_t * const cur_cu, + int x_ctb, int y_ctb, int depth) +{ + const videoframe_t * const frame = state->tile->frame; + uint8_t intra_pred_mode[4]; + + uint8_t intra_pred_mode_chroma = cur_cu->intra.mode_chroma; + int8_t intra_preds[4][3] = {{-1, -1, -1},{-1, -1, -1},{-1, -1, -1},{-1, -1, -1}}; + int8_t mpm_preds[4] = {-1, -1, -1, -1}; + uint32_t flag[4]; + + #if ENABLE_PCM == 1 + // Code must start after variable initialization + kvz_cabac_encode_bin_trm(cabac, 0); // IPCMFlag == 0 + #endif + + // PREDINFO CODING + // If intra prediction mode is found from the predictors, + // it can be signaled with two EP's. 
Otherwise we can send + // 5 EP bins with the full predmode + const int num_pred_units = kvz_part_mode_num_parts[cur_cu->part_size]; + const int cu_width = LCU_WIDTH >> depth; + + for (int j = 0; j < num_pred_units; ++j) { + const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x_ctb << 3, j); + const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y_ctb << 3, j); + const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y); + + const cu_info_t *left_pu = NULL; + const cu_info_t *above_pu = NULL; + + if (pu_x > 0) { + assert(pu_x >> 2 > 0); + left_pu = kvz_cu_array_at_const(frame->cu_array, pu_x - 1, pu_y); + } + // Don't take the above PU across the LCU boundary. + if (pu_y % LCU_WIDTH > 0 && pu_y > 0) { + assert(pu_y >> 2 > 0); + above_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y - 1); + } + + kvz_intra_get_dir_luma_predictor(pu_x, pu_y, + intra_preds[j], + cur_pu, + left_pu, above_pu); + + intra_pred_mode[j] = cur_pu->intra.mode; + + for (int i = 0; i < 3; i++) { + if (intra_preds[j][i] == intra_pred_mode[j]) { + mpm_preds[j] = (int8_t)i; + break; + } + } + flag[j] = (mpm_preds[j] == -1) ? 0 : 1; + } + + cabac->cur_ctx = &(cabac->ctx.intra_mode_model); + for (int j = 0; j < num_pred_units; ++j) { + CABAC_BIN(cabac, flag[j], "prev_intra_luma_pred_flag"); + } + + for (int j = 0; j < num_pred_units; ++j) { + // Signal index of the prediction mode in the prediction list. + if (flag[j]) { + CABAC_BIN_EP(cabac, (mpm_preds[j] == 0 ? 0 : 1), "mpm_idx"); + if (mpm_preds[j] != 0) { + CABAC_BIN_EP(cabac, (mpm_preds[j] == 1 ? 0 : 1), "mpm_idx"); + } + } else { + // Signal the actual prediction mode. + int32_t tmp_pred = intra_pred_mode[j]; + + // Sort prediction list from lowest to highest. 
+      if (intra_preds[j][0] > intra_preds[j][1]) SWAP(intra_preds[j][0], intra_preds[j][1], int8_t);
+      if (intra_preds[j][0] > intra_preds[j][2]) SWAP(intra_preds[j][0], intra_preds[j][2], int8_t);
+      if (intra_preds[j][1] > intra_preds[j][2]) SWAP(intra_preds[j][1], intra_preds[j][2], int8_t);
+
+      // Reduce the index of the signaled prediction mode according to the
+      // prediction list, as it has been already signaled that it's not one
+      // of the prediction modes.
+      for (int i = 2; i >= 0; i--) {
+        tmp_pred = (tmp_pred > intra_preds[j][i] ? tmp_pred - 1 : tmp_pred);
+      }
+
+      CABAC_BINS_EP(cabac, tmp_pred, 5, "rem_intra_luma_pred_mode");
+    }
+  }
+
+  // Code chroma prediction mode.
+  if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+    unsigned pred_mode = 5;
+    unsigned chroma_pred_modes[4] = {0, 26, 10, 1};
+
+    if (intra_pred_mode_chroma == intra_pred_mode[0]) {
+      pred_mode = 4;
+    } else if (intra_pred_mode_chroma == 34) {
+      // Angular 34 mode is possible only if intra pred mode is one of the
+      // possible chroma pred modes, in which case it is signaled with that
+      // duplicate mode.
+      for (int i = 0; i < 4; ++i) {
+        if (intra_pred_mode[0] == chroma_pred_modes[i]) pred_mode = i;
+      }
+    } else {
+      for (int i = 0; i < 4; ++i) {
+        if (intra_pred_mode_chroma == chroma_pred_modes[i]) pred_mode = i;
+      }
+    }
+
+    // pred_mode == 5 means intra_pred_mode_chroma is something that can't
+    // be coded.
+ assert(pred_mode != 5); + + /** + * Table 9-35 - Binarization for intra_chroma_pred_mode + * intra_chroma_pred_mode bin_string + * 4 0 + * 0 100 + * 1 101 + * 2 110 + * 3 111 + * Table 9-37 - Assignment of ctxInc to syntax elements with context coded bins + * intra_chroma_pred_mode[][] = 0, bypass, bypass + */ + cabac->cur_ctx = &(cabac->ctx.chroma_pred_model[0]); + if (pred_mode == 4) { + CABAC_BIN(cabac, 0, "intra_chroma_pred_mode"); + } else { + CABAC_BIN(cabac, 1, "intra_chroma_pred_mode"); + CABAC_BINS_EP(cabac, pred_mode, 2, "intra_chroma_pred_mode"); + } + } + + encode_transform_coeff(state, x_ctb * 2, y_ctb * 2, depth, 0, 0, 0); +} + +static void encode_part_mode(encoder_state_t * const state, + cabac_data_t * const cabac, + const cu_info_t * const cur_cu, + int depth) +{ + // Binarization from Table 9-34 of the HEVC spec: + // + // | log2CbSize > | log2CbSize == + // | MinCbLog2SizeY | MinCbLog2SizeY + // -------+-------+----------+---------+-----------+---------- + // pred | part | AMP | AMP | | + // mode | mode | disabled | enabled | size == 8 | size > 8 + // -------+-------+----------+---------+-----------+---------- + // intra | 2Nx2N | - - | 1 1 + // | NxN | - - | 0 0 + // -------+-------+--------------------+---------------------- + // inter | 2Nx2N | 1 1 | 1 1 + // | 2NxN | 01 011 | 01 01 + // | Nx2N | 00 001 | 00 001 + // | NxN | - - | - 000 + // | 2NxnU | - 0100 | - - + // | 2NxnD | - 0101 | - - + // | nLx2N | - 0000 | - - + // | nRx2N | - 0001 | - - + // -------+-------+--------------------+---------------------- + // + // + // Context indices from Table 9-37 of the HEVC spec: + // + // binIdx + // | 0 1 2 3 + // ------------------------------+------------------ + // log2CbSize == MinCbLog2SizeY | 0 1 2 bypass + // log2CbSize > MinCbLog2SizeY | 0 1 3 bypass + // ------------------------------+------------------ + + if (cur_cu->type == CU_INTRA) { + if (depth == MAX_DEPTH) { + cabac->cur_ctx = &(cabac->ctx.part_size_model[0]); + if 
(cur_cu->part_size == SIZE_2Nx2N) { + CABAC_BIN(cabac, 1, "part_mode 2Nx2N"); + } else { + CABAC_BIN(cabac, 0, "part_mode NxN"); + } + } + } else { + + cabac->cur_ctx = &(cabac->ctx.part_size_model[0]); + if (cur_cu->part_size == SIZE_2Nx2N) { + CABAC_BIN(cabac, 1, "part_mode 2Nx2N"); + return; + } + CABAC_BIN(cabac, 0, "part_mode split"); + + cabac->cur_ctx = &(cabac->ctx.part_size_model[1]); + if (cur_cu->part_size == SIZE_2NxN || + cur_cu->part_size == SIZE_2NxnU || + cur_cu->part_size == SIZE_2NxnD) { + CABAC_BIN(cabac, 1, "part_mode vertical"); + } else { + CABAC_BIN(cabac, 0, "part_mode horizontal"); + } + + if (state->encoder_control->cfg->amp_enable && depth < MAX_DEPTH) { + cabac->cur_ctx = &(cabac->ctx.part_size_model[3]); + + if (cur_cu->part_size == SIZE_2NxN || + cur_cu->part_size == SIZE_Nx2N) { + CABAC_BIN(cabac, 1, "part_mode SMP"); + return; + } + CABAC_BIN(cabac, 0, "part_mode AMP"); + + if (cur_cu->part_size == SIZE_2NxnU || + cur_cu->part_size == SIZE_nLx2N) { + CABAC_BINS_EP(cabac, 0, 1, "part_mode AMP"); + } else { + CABAC_BINS_EP(cabac, 1, 1, "part_mode AMP"); + } + } + } +} + +void kvz_encode_coding_tree(encoder_state_t * const state, + uint16_t x_ctb, uint16_t y_ctb, uint8_t depth) +{ + cabac_data_t * const cabac = &state->cabac; + const videoframe_t * const frame = state->tile->frame; + const cu_info_t *cur_cu = kvz_videoframe_get_cu_const(frame, x_ctb, y_ctb); + uint8_t split_flag = GET_SPLITDATA(cur_cu, depth); + uint8_t split_model = 0; + + //Absolute ctb + uint16_t abs_x_ctb = x_ctb + (state->tile->lcu_offset_x * LCU_WIDTH) / (LCU_WIDTH >> MAX_DEPTH); + uint16_t abs_y_ctb = y_ctb + (state->tile->lcu_offset_y * LCU_WIDTH) / (LCU_WIDTH >> MAX_DEPTH); + + // Check for slice border FIXME + uint8_t border_x = ((state->encoder_control->in.width) < (abs_x_ctb * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> depth))) ? 1 : 0; + uint8_t border_y = ((state->encoder_control->in.height) < (abs_y_ctb * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> depth))) ? 
1 : 0;
+  uint8_t border_split_x = ((state->encoder_control->in.width)  < ((abs_x_ctb + 1) * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1)))) ? 0 : 1;
+  uint8_t border_split_y = ((state->encoder_control->in.height) < ((abs_y_ctb + 1) * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1)))) ? 0 : 1;
+  uint8_t border = border_x | border_y; /*!< are we in any border CU */
+
+  // When not in MAX_DEPTH, insert split flag and split the blocks if needed
+  if (depth != MAX_DEPTH) {
+    // Implicit split flag when on border
+    if (!border) {
+      // Get left and top block split_flags and if they are present and true, increase model number
+      if (x_ctb > 0 && GET_SPLITDATA(kvz_videoframe_get_cu_const(frame, x_ctb - 1, y_ctb), depth) == 1) {
+        split_model++;
+      }
+
+      if (y_ctb > 0 && GET_SPLITDATA(kvz_videoframe_get_cu_const(frame, x_ctb, y_ctb - 1), depth) == 1) {
+        split_model++;
+      }
+
+      cabac->cur_ctx = &(cabac->ctx.split_flag_model[split_model]);
+      CABAC_BIN(cabac, split_flag, "SplitFlag");
+    }
+
+    if (split_flag || border) {
+      // Split blocks and remember to change x and y block positions
+      uint8_t change = 1<<(MAX_DEPTH-1-depth);
+      kvz_encode_coding_tree(state, x_ctb, y_ctb, depth + 1); // x,y
+
+      // TODO: fix when other half of the block would not be completely over the border
+      if (!border_x || border_split_x) {
+        kvz_encode_coding_tree(state, x_ctb + change, y_ctb, depth + 1);
+      }
+      if (!border_y || border_split_y) {
+        kvz_encode_coding_tree(state, x_ctb, y_ctb + change, depth + 1);
+      }
+      if (!border || (border_split_x && border_split_y)) {
+        kvz_encode_coding_tree(state, x_ctb + change, y_ctb + change, depth + 1);
+      }
+      return;
+    }
+  }
+
+  if (state->encoder_control->cfg->lossless) {
+    cabac->cur_ctx = &cabac->ctx.cu_transquant_bypass;
+    CABAC_BIN(cabac, 1, "cu_transquant_bypass_flag");
+  }
+
+  // Encode skip flag
+  if (state->frame->slicetype != KVZ_SLICE_I) {
+    int8_t ctx_skip = 0; // uiCtxSkip = aboveskipped + leftskipped;
+    int ui;
+    int16_t num_cand =
MRG_MAX_NUM_CANDS; + // Get left and top skipped flags and if they are present and true, increase context number + if (x_ctb > 0 && (kvz_videoframe_get_cu_const(frame, x_ctb - 1, y_ctb))->skipped) { + ctx_skip++; + } + + if (y_ctb > 0 && (kvz_videoframe_get_cu_const(frame, x_ctb, y_ctb - 1))->skipped) { + ctx_skip++; + } + + cabac->cur_ctx = &(cabac->ctx.cu_skip_flag_model[ctx_skip]); + CABAC_BIN(cabac, cur_cu->skipped, "SkipFlag"); + + // IF SKIP + if (cur_cu->skipped) { + if (num_cand > 1) { + for (ui = 0; ui < num_cand - 1; ui++) { + int32_t symbol = (ui != cur_cu->merge_idx); + if (ui == 0) { + cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model); + CABAC_BIN(cabac, symbol, "MergeIndex"); + } else { + CABAC_BIN_EP(cabac,symbol,"MergeIndex"); + } + if (symbol == 0) { + break; + } + } + } + return; + } + } + + // ENDIF SKIP + + // Prediction mode + if (state->frame->slicetype != KVZ_SLICE_I) { + cabac->cur_ctx = &(cabac->ctx.cu_pred_mode_model); + CABAC_BIN(cabac, (cur_cu->type == CU_INTRA), "PredMode"); + } + + // part_mode + encode_part_mode(state, cabac, cur_cu, depth); + + if (cur_cu->type == CU_INTER) { + const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size]; + const int cu_width = LCU_WIDTH >> depth; + + for (int i = 0; i < num_pu; ++i) { + const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x_ctb << 3, i); + const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y_ctb << 3, i); + const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i); + const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); + const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, pu_x, pu_y); + + encode_inter_prediction_unit(state, cabac, cur_pu, pu_x, pu_y, pu_w, pu_h, depth); + } + + { + int cbf = cbf_is_set_any(cur_cu->cbf, depth); + // Only need to signal coded block flag if not skipped or merged + // skip = no coded residual, merge = coded residual + if (cur_cu->part_size != SIZE_2Nx2N || !cur_cu->merged) { + cabac->cur_ctx = 
&(cabac->ctx.cu_qt_root_cbf_model);
+        CABAC_BIN(cabac, cbf, "rqt_root_cbf");
+      }
+      // Code (possible) coeffs to bitstream
+
+      if (cbf) {
+        encode_transform_coeff(state, x_ctb * 2, y_ctb * 2, depth, 0, 0, 0);
+      }
+    }
+  } else if (cur_cu->type == CU_INTRA) {
+    encode_intra_coding_unit(state, cabac, cur_cu, x_ctb, y_ctb, depth);
+  }
+
+  #if ENABLE_PCM == 1
+  // Code IPCM block
+  if (cur_cu->type == CU_PCM) {
+    kvz_cabac_encode_bin_trm(cabac, 1); // IPCMFlag == 1
+    kvz_cabac_finish(cabac);
+    kvz_bitstream_add_rbsp_trailing_bits(cabac->stream);
+    // PCM sample
+    {
+      unsigned y, x;
+
+      pixel *base_y = &cur_pic->y_data[x_ctb * (LCU_WIDTH >> (MAX_DEPTH)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH))) * encoder->in.width];
+      pixel *base_u = &cur_pic->u_data[(x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * encoder->in.width / 2)];
+      pixel *base_v = &cur_pic->v_data[(x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * encoder->in.width / 2)];
+
+      // Luma
+      for (y = 0; y < LCU_WIDTH >> depth; y++) {
+        for (x = 0; x < LCU_WIDTH >> depth; x++) {
+          kvz_bitstream_put(cabac->stream, base_y[x + y * encoder->in.width], 8);
+        }
+      }
+
+      // Chroma
+      if (encoder->in.video_format != FORMAT_400) {
+        for (y = 0; y < LCU_WIDTH >> (depth + 1); y++) {
+          for (x = 0; x < LCU_WIDTH >> (depth + 1); x++) {
+            kvz_bitstream_put(cabac->stream, base_u[x + y * (encoder->in.width >> 1)], 8);
+          }
+        }
+        for (y = 0; y < LCU_WIDTH >> (depth + 1); y++) {
+          for (x = 0; x < LCU_WIDTH >> (depth + 1); x++) {
+            kvz_bitstream_put(cabac->stream, base_v[x + y * (encoder->in.width >> 1)], 8);
+          }
+        }
+      }
+    }
+    // end PCM sample
+    kvz_cabac_start(cabac);
+  } // end Code IPCM block
+#endif /* END ENABLE_PCM */
+  else { /* Should not happen */
+    assert(0);
+    exit(1);
+  }
+
+  /* end prediction unit */
+  /* end coding_unit */
+}
kvazaar-1.0.0.tar.gz/src/encode_coding_tree.h
Added
@@ -0,0 +1,44 @@
+#ifndef ENCODE_CODING_TREE_H_
+#define ENCODE_CODING_TREE_H_
+
+/*****************************************************************************
+ * This file is part of Kvazaar HEVC encoder.
+ *
+ * Copyright (C) 2013-2015 Tampere University of Technology and others (see
+ * COPYING file).
+ *
+ * Kvazaar is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+ ****************************************************************************/
+
+/**
+ * \file
+ * Functions for writing the coding quadtree and related syntax.
+ */
+
+#include "encoderstate.h"
+#include "global.h"
+
+void kvz_encode_coding_tree(encoder_state_t *state,
+                            uint16_t x_ctb,
+                            uint16_t y_ctb,
+                            uint8_t depth);
+
+void kvz_encode_coeff_nxn(encoder_state_t *state,
+                          coeff_t *coeff,
+                          uint8_t width,
+                          uint8_t type,
+                          int8_t scan_mode,
+                          int8_t tr_skip);
+
+#endif // ENCODE_CODING_TREE_H_
kvazaar-0.8.3.tar.gz/src/encoder.c -> kvazaar-1.0.0.tar.gz/src/encoder.c
Changed
@@ -22,22 +22,10 @@
 #include <stdio.h>
 #include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-#include "tables.h"
 #include "cfg.h"
-#include "cabac.h"
-#include "image.h"
-#include "nal.h"
-#include "context.h"
-#include "transform.h"
-#include "intra.h"
-#include "inter.h"
-#include "filter.h"
-#include "search.h"
-#include "sao.h"
-#include "rdo.h"
+#include "strategyselector.h"
+
 static int encoder_control_init_gop_layer_weights(encoder_control_t * const);
@@ -49,57 +37,90 @@
 static int select_owf_auto(const kvz_config *const cfg)
 {
-  if (cfg->wpp) {
-    // If wpp is on, select owf such that less than 15% of the
-    // frame is covered by the are threads can not work at the same time.
-    const int lcu_width = CEILDIV(cfg->width, LCU_WIDTH);
-    const int lcu_height = CEILDIV(cfg->height, LCU_WIDTH);
+  if (cfg->intra_period == 1) {
+    if (cfg->wpp) {
+      // If wpp is on, select owf such that less than 15% of the
+      // frame is covered by the area where threads can not work at the same time.
+      const int lcu_width = CEILDIV(cfg->width, LCU_WIDTH);
+      const int lcu_height = CEILDIV(cfg->height, LCU_WIDTH);
+
+      // Find the largest number of threads per frame that satisfies
+      // the condition: wpp start/stop inefficiency takes up less than 15%
+      // of frame area.
+      int threads_per_frame = 1;
+      const int wpp_treshold = lcu_width * lcu_height * 15 / 100;
+      while ((threads_per_frame + 1) * 2 < lcu_width &&
+             threads_per_frame + 1 < lcu_height &&
+             size_of_wpp_ends(threads_per_frame + 1) < wpp_treshold) {
+        ++threads_per_frame;
+      }
-    // Find the largest number of threads per frame that satifies the
-    // the condition: wpp start/stop inefficiency takes up less than 15%
-    // of frame area.
- int threads_per_frame = 1; - const int wpp_treshold = lcu_width * lcu_height * 15 / 100; - while ((threads_per_frame + 1) * 2 < lcu_width && - threads_per_frame + 1 < lcu_height && - size_of_wpp_ends(threads_per_frame + 1) < wpp_treshold) - { - ++threads_per_frame; - } + const int threads = MAX(cfg->threads, 1); + const int frames = CEILDIV(threads, threads_per_frame); - const int threads = MAX(cfg->threads, 1); - const int frames = CEILDIV(threads, threads_per_frame); + // Convert from number of parallel frames to number of additional frames. + return CLIP(0, threads - 1, frames - 1); + } else { + // If wpp is not on, select owf such that there is enough + // tiles for twice the number of threads. - // Convert from number of parallel frames to number of additional frames. - return CLIP(0, threads - 1, frames - 1); - } else { - // If wpp is not on, select owf such that there is enough - // tiles for twice the number of threads. + int tiles_per_frame = cfg->tiles_width_count * cfg->tiles_height_count; + int threads = (cfg->threads > 1 ? cfg->threads : 1); + int frames = CEILDIV(threads * 4, tiles_per_frame); - int tiles_per_frame = 1; - if (cfg->tiles_width_count > 0) { - tiles_per_frame *= cfg->tiles_width_count + 1; + // Limit number of frames to 1.25x the number of threads for the case + // where there is only 1 tile per frame. + frames = CLIP(1, threads * 4 / 3, frames); + return frames - 1; } - if (cfg->tiles_height_count > 0) { - tiles_per_frame *= cfg->tiles_height_count + 1; + } else { + // Try and estimate a good number of parallel frames for inter. + const int lcu_width = CEILDIV(cfg->width, LCU_WIDTH); + const int lcu_height = CEILDIV(cfg->height, LCU_WIDTH); + int threads_per_frame = MIN(lcu_width / 2, lcu_height); + int threads = cfg->threads; + + // If all threads fit into one frame, at least two parallel frames should + // be used to reduce the effect of WPP spin-up and wind-down. 
+ int frames = 1; + + while (threads > 0 && threads_per_frame > 0) { + frames += 1; + threads -= threads_per_frame; + threads_per_frame -= 2; } - int threads = (cfg->threads > 1 ? cfg->threads : 1); - int frames = CEILDIV(threads * 4, tiles_per_frame); - // Limit number of frames to 1.25x the number of threads for the case - // where there is only 1 tile per frame. - frames = CLIP(1, threads * 4 / 3, frames); - return frames - 1; + if (cfg->gop_lowdelay && cfg->gop_lp_definition.t > 1) { + // Temporal skipping makes every other frame very fast to encode so + // more parallel frames should be used. + frames *= 2; + } + return CLIP(0, cfg->threads * 2 - 1, frames - 1); } } + +static unsigned cfg_num_threads(void) +{ + unsigned cpus = kvz_g_hardware_flags.physical_cpu_count; + unsigned fake_cpus = kvz_g_hardware_flags.logical_cpu_count - cpus; + + // Default to 4 if we don't know the number of CPUs. + if (cpus == 0) return 4; + + // 1.5 times the number of physical cores seems to be a good compromise + // when hyperthreading is available on Haswell. + return cpus + fake_cpus / 2; +} + + /** * \brief Allocate and initialize an encoder control structure. * * \param cfg encoder configuration * \return initialized encoder control or NULL on failure */ -encoder_control_t* kvz_encoder_control_init(const kvz_config *const cfg) { +encoder_control_t* kvz_encoder_control_init(kvz_config *const cfg) { encoder_control_t *encoder = NULL; if (!cfg) { @@ -107,6 +128,20 @@ goto init_failed; } + if (cfg->threads == -1) { + cfg->threads = cfg_num_threads(); + } + + if (cfg->gop_len > 0) { + if (cfg->tmvp_enable) { + cfg->tmvp_enable = false; + fprintf(stderr, "Disabling TMVP because GOP is used.\n"); + } + if (cfg->gop_lowdelay) { + kvz_config_process_lp_gop(cfg); + } + } + // Make sure that the parameters make sense. 
if (!kvz_config_validate(cfg)) { goto init_failed; @@ -147,6 +182,8 @@ encoder->bitdepth = KVZ_BIT_DEPTH; + encoder->chroma_format = KVZ_FORMAT2CSP(cfg->input_format); + // deblocking filter encoder->deblock_enable = 1; encoder->beta_offset_div2 = 0; @@ -191,8 +228,8 @@ } //Tiles - encoder->tiles_enable = encoder->cfg->tiles_width_count > 0 || - encoder->cfg->tiles_height_count > 0; + encoder->tiles_enable = encoder->cfg->tiles_width_count > 1 || + encoder->cfg->tiles_height_count > 1; { int i, j; //iteration variables @@ -202,11 +239,11 @@ //Temporary pointers to allow encoder fields to be const int32_t *tiles_col_width, *tiles_row_height, *tiles_ctb_addr_rs_to_ts, *tiles_ctb_addr_ts_to_rs, *tiles_tile_id, *tiles_col_bd, *tiles_row_bd; - if (encoder->cfg->tiles_width_count >= encoder->in.width_in_lcu) { + if (encoder->cfg->tiles_width_count > encoder->in.width_in_lcu) { fprintf(stderr, "Too many tiles (width)!\n"); goto init_failed; - } else if (encoder->cfg->tiles_height_count >= encoder->in.height_in_lcu) { + } else if (encoder->cfg->tiles_height_count > encoder->in.height_in_lcu) { fprintf(stderr, "Too many tiles (height)!\n"); goto init_failed; } @@ -215,8 +252,8 @@ encoder->tiles_uniform_spacing_flag = 1; //tilesn[x,y] contains the number of _separation_ between tiles, whereas the encoder needs the number of tiles. 
- encoder->tiles_num_tile_columns = encoder->cfg->tiles_width_count + 1; - encoder->tiles_num_tile_rows = encoder->cfg->tiles_height_count + 1; + encoder->tiles_num_tile_columns = encoder->cfg->tiles_width_count; + encoder->tiles_num_tile_rows = encoder->cfg->tiles_height_count; encoder->tiles_col_width = tiles_col_width = MALLOC(int32_t, encoder->tiles_num_tile_columns); @@ -412,35 +449,36 @@ encoder->pu_depth_intra.min = cfg->pu_depth_intra.min; encoder->pu_depth_intra.max = cfg->pu_depth_intra.max; - // input init (TODO: read from commandline / config) - encoder->in.video_format = FORMAT_420; - // deblocking filter - encoder->deblock_enable = (int8_t)encoder->cfg->deblock_enable; - encoder->beta_offset_div2 = (int8_t)encoder->cfg->deblock_beta; - encoder->tc_offset_div2 = (int8_t)encoder->cfg->deblock_tc; + encoder->deblock_enable = (int8_t) (encoder->cfg->deblock_enable && + !encoder->cfg->lossless); + encoder->beta_offset_div2 = (int8_t) encoder->cfg->deblock_beta; + encoder->tc_offset_div2 = (int8_t) encoder->cfg->deblock_tc; // SAO - encoder->sao_enable = (int8_t)encoder->cfg->sao_enable; + encoder->sao_enable = (int8_t) (encoder->cfg->sao_enable && + !encoder->cfg->lossless); // RDO - encoder->rdoq_enable = (int8_t)encoder->cfg->rdoq_enable; - encoder->rdo = (int8_t)encoder->cfg->rdo; - encoder->sign_hiding = encoder->cfg->signhide_enable; - encoder->full_intra_search = (int8_t)encoder->cfg->full_intra_search; + encoder->rdoq_enable = (int8_t) encoder->cfg->rdoq_enable; + encoder->rdo = (int8_t) encoder->cfg->rdo; + encoder->sign_hiding = (encoder->cfg->signhide_enable && + !encoder->cfg->lossless); + encoder->full_intra_search = (int8_t) encoder->cfg->full_intra_search; // TR SKIP - encoder->trskip_enable = (int8_t)encoder->cfg->trskip_enable; - encoder->tr_depth_intra = (int8_t)encoder->cfg->tr_depth_intra; + encoder->trskip_enable = (int8_t) (encoder->cfg->trskip_enable && + !encoder->cfg->lossless); + encoder->tr_depth_intra = (int8_t) 
encoder->cfg->tr_depth_intra; // MOTION ESTIMATION - encoder->fme_level = (int8_t)encoder->cfg->fme_level; + encoder->fme_level = (int8_t) encoder->cfg->fme_level; // VUI - encoder->vui.sar_width = (int16_t)encoder->cfg->vui.sar_width; - encoder->vui.sar_height = (int16_t)encoder->cfg->vui.sar_height; - encoder->vui.overscan = encoder->cfg->vui.overscan; - encoder->vui.videoformat = encoder->cfg->vui.videoformat; - encoder->vui.fullrange = encoder->cfg->vui.fullrange; - encoder->vui.colorprim = encoder->cfg->vui.colorprim; - encoder->vui.transfer = encoder->cfg->vui.transfer; - encoder->vui.colormatrix = encoder->cfg->vui.colormatrix; - encoder->vui.chroma_loc = (int8_t)encoder->cfg->vui.chroma_loc; + encoder->vui.sar_width = (int16_t) encoder->cfg->vui.sar_width; + encoder->vui.sar_height = (int16_t) encoder->cfg->vui.sar_height; + encoder->vui.overscan = encoder->cfg->vui.overscan; + encoder->vui.videoformat = encoder->cfg->vui.videoformat; + encoder->vui.fullrange = encoder->cfg->vui.fullrange; + encoder->vui.colorprim = encoder->cfg->vui.colorprim; + encoder->vui.transfer = encoder->cfg->vui.transfer; + encoder->vui.colormatrix = encoder->cfg->vui.colormatrix; + encoder->vui.chroma_loc = (int8_t) encoder->cfg->vui.chroma_loc; // If fractional framerate is set, use that instead of the floating point framerate. if (cfg->framerate_num != 0) { @@ -511,7 +549,6 @@ encoder->in.height = height; encoder->in.real_width = width; encoder->in.real_height = height; - encoder->in.bitdepth = encoder->bitdepth; // If input dimensions are not divisible by the smallest block size, add // pixels to the dimensions, so that they are. These extra pixels will be @@ -569,31 +606,61 @@ switch (num_layers) { case 0: + case 1: break; + // Use the first layers of the 4-layer weights. 
+ case 2: case 3: + case 4: - // These weights were copied from http://doi.org/10.1109/TIP.2014.2336550 - if (encoder->target_avg_bpp <= 0.05) { - encoder->gop_layer_weights[0] = 30; - encoder->gop_layer_weights[1] = 8; - encoder->gop_layer_weights[2] = 4; - encoder->gop_layer_weights[3] = 1; - } else if (encoder->target_avg_bpp <= 0.1) { - encoder->gop_layer_weights[0] = 25; - encoder->gop_layer_weights[1] = 7; - encoder->gop_layer_weights[2] = 4; - encoder->gop_layer_weights[3] = 1; - } else if (encoder->target_avg_bpp <= 0.2) { - encoder->gop_layer_weights[0] = 20; - encoder->gop_layer_weights[1] = 6; - encoder->gop_layer_weights[2] = 4; - encoder->gop_layer_weights[3] = 1; + if (encoder->cfg->gop_lowdelay) { + // These weights are based on http://doi.org/10.1109/TIP.2014.2336550 + // They are meant for lp-g4d3r4t1 gop, but work ok for others. + if (encoder->target_avg_bpp <= 0.05) { + encoder->gop_layer_weights[0] = 14; + encoder->gop_layer_weights[1] = 3; + encoder->gop_layer_weights[2] = 2; + encoder->gop_layer_weights[3] = 1; + } else if (encoder->target_avg_bpp <= 0.1) { + encoder->gop_layer_weights[0] = 12; + encoder->gop_layer_weights[1] = 3; + encoder->gop_layer_weights[2] = 2; + encoder->gop_layer_weights[3] = 1; + } else if (encoder->target_avg_bpp <= 0.2) { + encoder->gop_layer_weights[0] = 10; + encoder->gop_layer_weights[1] = 3; + encoder->gop_layer_weights[2] = 2; + encoder->gop_layer_weights[3] = 1; + } else { + encoder->gop_layer_weights[0] = 6; + encoder->gop_layer_weights[1] = 3; + encoder->gop_layer_weights[2] = 2; + encoder->gop_layer_weights[3] = 1; + } } else { - encoder->gop_layer_weights[0] = 15; - encoder->gop_layer_weights[1] = 5; - encoder->gop_layer_weights[2] = 4; - encoder->gop_layer_weights[3] = 1; + // These weights are from http://doi.org/10.1109/TIP.2014.2336550 + if (encoder->target_avg_bpp <= 0.05) { + encoder->gop_layer_weights[0] = 30; + encoder->gop_layer_weights[1] = 8; + encoder->gop_layer_weights[2] = 4; + 
encoder->gop_layer_weights[3] = 1; + } else if (encoder->target_avg_bpp <= 0.1) { + encoder->gop_layer_weights[0] = 25; + encoder->gop_layer_weights[1] = 7; + encoder->gop_layer_weights[2] = 4; + encoder->gop_layer_weights[3] = 1; + } else if (encoder->target_avg_bpp <= 0.2) { + encoder->gop_layer_weights[0] = 20; + encoder->gop_layer_weights[1] = 6; + encoder->gop_layer_weights[2] = 4; + encoder->gop_layer_weights[3] = 1; + } else { + encoder->gop_layer_weights[0] = 15; + encoder->gop_layer_weights[1] = 5; + encoder->gop_layer_weights[2] = 4; + encoder->gop_layer_weights[3] = 1; + } } break;
View file
kvazaar-0.8.3.tar.gz/src/encoder.h -> kvazaar-1.0.0.tar.gz/src/encoder.h
Changed
@@ -26,18 +26,12 @@
  * Initialization of encoder_control_t.
  */
 
-#include "global.h"
-
-#include "image.h"
-#include "bitstream.h"
-#include "cabac.h"
-#include "tables.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
 #include "scalinglist.h"
 #include "threadqueue.h"
 
-enum { FORMAT_400 = 0, FORMAT_420, FORMAT_422, FORMAT_444 };
-
 /* Encoder control options, the main struct */
 typedef struct encoder_control_t
 {
@@ -52,8 +46,6 @@
     int32_t height_in_lcu;
     int32_t real_width;  /*!< \brief real input picture width */
     int32_t real_height; /*!< \brief real input picture width */
-    int8_t video_format;
-    int8_t bitdepth; /*!< \brief input bit depth (8,10) */
     int64_t pixels_per_pic;
     int8_t source_scan_type;
   } in;
@@ -66,6 +58,8 @@
   } me;
 
   int8_t bitdepth;
+  enum kvz_chroma_format chroma_format;
+
   int8_t tr_depth_intra;
   int8_t fme_level;
@@ -150,6 +144,8 @@
   bool sign_hiding;
 
+  bool implicit_rdpcm;
+
   //! Target average bits per picture.
   double target_avg_bppic;
@@ -161,7 +157,7 @@
 } encoder_control_t;
 
-encoder_control_t* kvz_encoder_control_init(const kvz_config *cfg);
+encoder_control_t* kvz_encoder_control_init(kvz_config *cfg);
 void kvz_encoder_control_free(encoder_control_t *encoder);
 
 void kvz_encoder_control_input_init(encoder_control_t *encoder, int32_t width, int32_t height);
View file
kvazaar-0.8.3.tar.gz/src/encoder_state-bitstream.c -> kvazaar-1.0.0.tar.gz/src/encoder_state-bitstream.c
Changed
@@ -20,12 +20,25 @@ #include "encoder_state-bitstream.h" -#include <string.h> +#include <stdio.h> #include <stdlib.h> +#include <string.h> +#include "bitstream.h" +#include "cabac.h" #include "checkpoint.h" +#include "cu.h" +#include "encoder.h" +#include "encoder_state-geometry.h" #include "encoderstate.h" +#include "imagelist.h" +#include "kvazaar.h" +#include "kvz_math.h" #include "nal.h" +#include "scalinglist.h" +#include "tables.h" +#include "threadqueue.h" +#include "videoframe.h" static void encoder_state_write_bitstream_aud(encoder_state_t * const state) @@ -33,8 +46,8 @@ bitstream_t * const stream = &state->stream; kvz_nal_write(stream, KVZ_NAL_AUD_NUT, 0, 1); - uint8_t pic_type = state->global->slicetype == KVZ_SLICE_I ? 0 - : state->global->slicetype == KVZ_SLICE_P ? 1 + uint8_t pic_type = state->frame->slicetype == KVZ_SLICE_I ? 0 + : state->frame->slicetype == KVZ_SLICE_P ? 1 : 2; WRITE_U(stream, pic_type, 3, "pic_type"); @@ -230,7 +243,7 @@ encoder->vui.colorprim != 2 || encoder->vui.transfer != 2 || encoder->vui.colormatrix != 2) { WRITE_U(stream, 1, 1, "video_signal_type_present_flag"); - WRITE_U(stream, encoder->vui.videoformat, 3, "video_format"); + WRITE_U(stream, encoder->vui.videoformat, 3, "chroma_format"); WRITE_U(stream, encoder->vui.fullrange, 1, "video_full_range_flag"); if (encoder->vui.colorprim != 2 || encoder->vui.transfer != 2 || @@ -280,6 +293,33 @@ //ENDIF } + +static void encoder_state_write_bitstream_SPS_extension(bitstream_t *stream, + encoder_state_t * const state) +{ + if (state->encoder_control->cfg->implicit_rdpcm && + state->encoder_control->cfg->lossless) { + WRITE_U(stream, 1, 1, "sps_extension_present_flag"); + + WRITE_U(stream, 1, 1, "sps_range_extension_flag"); + WRITE_U(stream, 0, 1, "sps_multilayer_extension_flag"); + WRITE_U(stream, 0, 1, "sps_3d_extension_flag"); + WRITE_U(stream, 0, 5, "sps_extension_5bits"); + + WRITE_U(stream, 0, 1, "transform_skip_rotation_enabled_flag"); + WRITE_U(stream, 0, 1, 
"transform_skip_context_enabled_flag"); + WRITE_U(stream, 1, 1, "implicit_rdpcm_enabled_flag"); + WRITE_U(stream, 0, 1, "explicit_rdpcm_enabled_flag"); + WRITE_U(stream, 0, 1, "extended_precision_processing_flag"); + WRITE_U(stream, 0, 1, "intra_smoothing_disabled_flag"); + WRITE_U(stream, 0, 1, "high_precision_offsets_enabled_flag"); + WRITE_U(stream, 0, 1, "persistent_rice_adaptation_enabled_flag"); + WRITE_U(stream, 0, 1, "cabac_bypass_alignment_enabled_flag"); + } else { + WRITE_U(stream, 0, 1, "sps_extension_present_flag"); + } +} + static void encoder_state_write_bitstream_seq_parameter_set(bitstream_t* stream, encoder_state_t * const state) { @@ -297,10 +337,9 @@ encoder_state_write_bitstream_PTL(stream, state); WRITE_UE(stream, 0, "sps_seq_parameter_set_id"); - WRITE_UE(stream, encoder->in.video_format, - "chroma_format_idc"); + WRITE_UE(stream, encoder->chroma_format, "chroma_format_idc"); - if (encoder->in.video_format == 3) { + if (encoder->chroma_format == KVZ_CSP_444) { WRITE_U(stream, 0, 1, "separate_colour_plane_flag"); } @@ -380,14 +419,14 @@ //IF long_term_ref_pics_present //ENDIF - WRITE_U(stream, ENABLE_TEMPORAL_MVP, 1, + WRITE_U(stream, state->encoder_control->cfg->tmvp_enable, 1, "sps_temporal_mvp_enable_flag"); WRITE_U(stream, 0, 1, "sps_strong_intra_smoothing_enable_flag"); WRITE_U(stream, 1, 1, "vui_parameters_present_flag"); encoder_state_write_bitstream_VUI(stream, state); - WRITE_U(stream, 0, 1, "sps_extension_flag"); + encoder_state_write_bitstream_SPS_extension(stream, state); kvz_bitstream_add_rbsp_trailing_bits(stream); } @@ -424,7 +463,7 @@ WRITE_U(stream, 0, 1, "weighted_bipred_idc"); //WRITE_U(stream, 0, 1, "dependent_slices_enabled_flag"); - WRITE_U(stream, 0, 1, "transquant_bypass_enable_flag"); + WRITE_U(stream, encoder->cfg->lossless, 1, "transquant_bypass_enable_flag"); WRITE_U(stream, encoder->tiles_enable, 1, "tiles_enabled_flag"); //wavefronts WRITE_U(stream, encoder->wpp, 1, "entropy_coding_sync_enabled_flag"); @@ -566,7 
+605,7 @@ if (state->encoder_control->vui.frame_field_info_present_flag){ - int8_t odd_picture = state->global->frame % 2; + int8_t odd_picture = state->frame->num % 2; int8_t pic_struct = 0; //0: progressive picture, 1: top field, 2: bottom field, 3... int8_t source_scan_type = 1; //0: interlaced, 1: progressive @@ -630,16 +669,6 @@ } } -static int num_bitcount(unsigned int n) { - int pos = 0; - if (n >= 1<<16) { n >>= 16; pos += 16; } - if (n >= 1<< 8) { n >>= 8; pos += 8; } - if (n >= 1<< 4) { n >>= 4; pos += 4; } - if (n >= 1<< 2) { n >>= 2; pos += 2; } - if (n >= 1<< 1) { pos += 1; } - return ((n == 0) ? (-1) : pos); -} - void kvz_encoder_state_write_bitstream_slice_header(encoder_state_t * const state) { const encoder_control_t * const encoder = state->encoder_control; @@ -648,22 +677,22 @@ int ref_negative = 0; int ref_positive = 0; if (encoder->cfg->gop_len) { - for (j = 0; j < state->global->ref->used_size; j++) { - if (state->global->ref->pocs[j] < state->global->poc) { + for (j = 0; j < state->frame->ref->used_size; j++) { + if (state->frame->ref->pocs[j] < state->frame->poc) { ref_negative++; } else { ref_positive++; } } - } else ref_negative = state->global->ref->used_size; + } else ref_negative = state->frame->ref->used_size; #ifdef KVZ_DEBUG printf("=========== Slice ===========\n"); #endif WRITE_U(stream, (state->slice->start_in_rs == 0), 1, "first_slice_segment_in_pic_flag"); - if (state->global->pictype >= KVZ_NAL_BLA_W_LP - && state->global->pictype <= KVZ_NAL_RSV_IRAP_VCL23) { + if (state->frame->pictype >= KVZ_NAL_BLA_W_LP + && state->frame->pictype <= KVZ_NAL_RSV_IRAP_VCL23) { WRITE_U(stream, 1, 1, "no_output_of_prior_pics_flag"); } @@ -674,7 +703,7 @@ WRITE_UE(stream, state->slice->start_in_rs, "slice_segment_address"); } - WRITE_UE(stream, state->global->slicetype, "slice_type"); + WRITE_UE(stream, state->frame->slicetype, "slice_type"); // if !entropy_slice_flag @@ -682,12 +711,12 @@ //WRITE_U(stream, 1, 1, "pic_output_flag"); //end if 
//if( IdrPicFlag ) <- nal_unit_type == 5 - if (state->global->pictype != KVZ_NAL_IDR_W_RADL - && state->global->pictype != KVZ_NAL_IDR_N_LP) { + if (state->frame->pictype != KVZ_NAL_IDR_W_RADL + && state->frame->pictype != KVZ_NAL_IDR_N_LP) { int last_poc = 0; int poc_shift = 0; - WRITE_U(stream, state->global->poc&0x1f, 5, "pic_order_cnt_lsb"); + WRITE_U(stream, state->frame->poc&0x1f, 5, "pic_order_cnt_lsb"); WRITE_U(stream, 0, 1, "short_term_ref_pic_set_sps_flag"); WRITE_UE(stream, ref_negative, "num_negative_pics"); WRITE_UE(stream, ref_positive, "num_positive_pics"); @@ -697,9 +726,9 @@ if (encoder->cfg->gop_len) { int8_t found = 0; do { - delta_poc = encoder->cfg->gop[state->global->gop_offset].ref_neg[j + poc_shift]; - for (int i = 0; i < state->global->ref->used_size; i++) { - if (state->global->ref->pocs[i] == state->global->poc - delta_poc) { + delta_poc = encoder->cfg->gop[state->frame->gop_offset].ref_neg[j + poc_shift]; + for (int i = 0; i < state->frame->ref->used_size; i++) { + if (state->frame->ref->pocs[i] == state->frame->poc - delta_poc) { found = 1; break; } @@ -724,9 +753,9 @@ if (encoder->cfg->gop_len) { int8_t found = 0; do { - delta_poc = encoder->cfg->gop[state->global->gop_offset].ref_pos[j + poc_shift]; - for (int i = 0; i < state->global->ref->used_size; i++) { - if (state->global->ref->pocs[i] == state->global->poc + delta_poc) { + delta_poc = encoder->cfg->gop[state->frame->gop_offset].ref_pos[j + poc_shift]; + for (int i = 0; i < state->frame->ref->used_size; i++) { + if (state->frame->ref->pocs[i] == state->frame->poc + delta_poc) { found = 1; break; } @@ -744,27 +773,41 @@ WRITE_U(stream, 1, 1, "used_by_curr_pic_s1_flag"); } //WRITE_UE(stream, 0, "short_term_ref_pic_set_idx"); + + if (state->encoder_control->cfg->tmvp_enable) { + WRITE_U(stream, 1, 1, "slice_temporal_mvp_enabled_flag"); + } } //end if //end if + + if (encoder->sao_enable) { WRITE_U(stream, 1, 1, "slice_sao_luma_flag"); - WRITE_U(stream, 1, 1, 
"slice_sao_chroma_flag"); + if (encoder->chroma_format != KVZ_CSP_400) { + WRITE_U(stream, 1, 1, "slice_sao_chroma_flag"); + } } - if (state->global->slicetype != KVZ_SLICE_I) { + if (state->frame->slicetype != KVZ_SLICE_I) { WRITE_U(stream, 1, 1, "num_ref_idx_active_override_flag"); WRITE_UE(stream, ref_negative != 0 ? ref_negative - 1: 0, "num_ref_idx_l0_active_minus1"); - if (state->global->slicetype == KVZ_SLICE_B) { + if (state->frame->slicetype == KVZ_SLICE_B) { WRITE_UE(stream, ref_positive != 0 ? ref_positive - 1 : 0, "num_ref_idx_l1_active_minus1"); WRITE_U(stream, 0, 1, "mvd_l1_zero_flag"); } + + // ToDo: handle B-frames with TMVP + if (state->encoder_control->cfg->tmvp_enable && ref_negative > 1) { + WRITE_UE(stream, 0, "collocated_ref_idx"); + } + WRITE_UE(stream, 5-MRG_MAX_NUM_CANDS, "five_minus_max_num_merge_cand"); } { - int slice_qp_delta = state->global->QP - encoder->cfg->qp; + int slice_qp_delta = state->frame->QP - encoder->cfg->qp; WRITE_SE(stream, slice_qp_delta, "slice_qp_delta"); } @@ -777,7 +820,7 @@ WRITE_UE(stream, num_entry_points - 1, "num_entry_point_offsets"); if (num_entry_points > 0) { int entry_points_written = 0; - int offset_len = num_bitcount(max_length_seen) + 1; + int offset_len = kvz_math_floor_log2(max_length_seen) + 1; WRITE_UE(stream, offset_len - 1, "offset_len_minus1"); encoder_state_write_bitstream_entry_points_write(stream, state, num_entry_points, offset_len, &entry_points_written); } @@ -795,24 +838,48 @@ bitstream_t * const stream = &state->stream; const videoframe_t * const frame = state->tile->frame; unsigned char checksum[3][SEI_HASH_MAX_LENGTH]; - uint32_t checksum_val; - unsigned int i; kvz_nal_write(stream, KVZ_NAL_SUFFIX_SEI_NUT, 0, 0); - kvz_image_checksum(frame->rec, checksum, state->encoder_control->bitdepth); - WRITE_U(stream, 132, 8, "sei_type"); - WRITE_U(stream, 13, 8, "size"); - WRITE_U(stream, 2, 8, "hash_type"); // 2 = checksum - for (i = 0; i < 3; ++i) { - // Pack bits into a single 32 bit uint 
instead of pushing them one byte - // at a time. - checksum_val = (checksum[i][0] << 24) + (checksum[i][1] << 16) + - (checksum[i][2] << 8) + (checksum[i][3]); - WRITE_U(stream, checksum_val, 32, "picture_checksum"); - CHECKPOINT("checksum[%d] = %u", i, checksum_val); + int num_colors = (state->encoder_control->chroma_format == KVZ_CSP_400 ? 1 : 3); + + switch (state->encoder_control->cfg->hash) + { + case KVZ_HASH_CHECKSUM: + kvz_image_checksum(frame->rec, checksum, state->encoder_control->bitdepth); + + WRITE_U(stream, 1 + num_colors * 4, 8, "size"); + WRITE_U(stream, 2, 8, "hash_type"); // 2 = checksum + + for (int i = 0; i < num_colors; ++i) { + uint32_t checksum_val = ( + (checksum[i][0] << 24) + (checksum[i][1] << 16) + + (checksum[i][2] << 8) + (checksum[i][3])); + WRITE_U(stream, checksum_val, 32, "picture_checksum"); + CHECKPOINT("checksum[%d] = %u", i, checksum_val); + } + + break; + + case KVZ_HASH_MD5: + kvz_image_md5(frame->rec, checksum, state->encoder_control->bitdepth); + + WRITE_U(stream, 1 + num_colors * 16, 8, "size"); + WRITE_U(stream, 0, 8, "hash_type"); // 0 = md5 + + for (int i = 0; i < num_colors; ++i) { + for (int b = 0; b < 16; ++b) { + WRITE_U(stream, checksum[i][b], 8, "picture_md5"); + } + } + + break; + + case KVZ_HASH_NONE: + // Means we shouldn't be writing this SEI. + assert(0); } kvz_bitstream_align(stream); @@ -847,15 +914,15 @@ encoder_state_write_bitstream_aud(state); } - if ((encoder->vps_period > 0 && state->global->frame % encoder->vps_period == 0) - || (state->global->frame == 0 && encoder->vps_period >= 0)) + if ((encoder->vps_period > 0 && state->frame->num % encoder->vps_period == 0) + || (state->frame->num == 0 && encoder->vps_period >= 0)) { first_nal_in_au = false; kvz_encoder_state_write_parameter_sets(&state->stream, state); } // Send Kvazaar version information only in the first frame. 
- if (state->global->frame == 0 && encoder->cfg->add_encoder_info) { + if (state->frame->num == 0 && encoder->cfg->add_encoder_info) { kvz_nal_write(stream, KVZ_NAL_PREFIX_SEI_NUT, 0, first_nal_in_au); encoder_state_write_bitstream_prefix_sei_version(state); @@ -879,38 +946,38 @@ } { - uint8_t nal_type = (state->global->is_idr_frame ? KVZ_NAL_IDR_W_RADL : KVZ_NAL_TRAIL_R); + uint8_t nal_type = (state->frame->is_idr_frame ? KVZ_NAL_IDR_W_RADL : KVZ_NAL_TRAIL_R); kvz_nal_write(stream, nal_type, 0, first_nal_in_au); } { PERFORMANCE_MEASURE_START(KVZ_PERF_FRAME); encoder_state_write_bitstream_children(state); - PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, encoder->threadqueue, "type=write_bitstream_append,frame=%d,encoder_type=%c", state->global->frame, state->type); + PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, encoder->threadqueue, "type=write_bitstream_append,frame=%d,encoder_type=%c", state->frame->num, state->type); } - { + if (state->encoder_control->cfg->hash != KVZ_HASH_NONE) { PERFORMANCE_MEASURE_START(KVZ_PERF_FRAME); // Calculate checksum add_checksum(state); - PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, encoder->threadqueue, "type=write_bitstream_checksum,frame=%d,encoder_type=%c", state->global->frame, state->type); + PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, encoder->threadqueue, "type=write_bitstream_checksum,frame=%d,encoder_type=%c", state->frame->num, state->type); } //Get bitstream length for stats uint64_t newpos = kvz_bitstream_tell(stream); state->stats_bitstream_length = (newpos >> 3) - (curpos >> 3); - if (state->global->frame > 0) { - state->global->total_bits_coded = state->previous_encoder_state->global->total_bits_coded; + if (state->frame->num > 0) { + state->frame->total_bits_coded = state->previous_encoder_state->frame->total_bits_coded; } - state->global->total_bits_coded += newpos - curpos; + state->frame->total_bits_coded += newpos - curpos; - if (encoder->cfg->gop_len > 0 && state->global->gop_offset > 0) { - state->global->cur_gop_bits_coded = 
state->previous_encoder_state->global->cur_gop_bits_coded; + if (encoder->cfg->gop_len > 0 && state->frame->gop_offset > 0) { + state->frame->cur_gop_bits_coded = state->previous_encoder_state->frame->cur_gop_bits_coded; } else { - state->global->cur_gop_bits_coded = 0; + state->frame->cur_gop_bits_coded = 0; } - state->global->cur_gop_bits_coded += newpos - curpos; + state->frame->cur_gop_bits_coded += newpos - curpos; } void kvz_encoder_state_write_bitstream_leaf(encoder_state_t * const state)
View file
kvazaar-0.8.3.tar.gz/src/encoder_state-bitstream.h -> kvazaar-1.0.0.tar.gz/src/encoder_state-bitstream.h
Changed
@@ -26,7 +26,8 @@
  * Coding of HEVC bitstream elements.
  */
 
-#include "global.h"
+#include "global.h" // IWYU pragma: keep
+
 // Forward declare because including the header would lead to a cyclic
 // dependency.
View file
kvazaar-0.8.3.tar.gz/src/encoder_state-ctors_dtors.c -> kvazaar-1.0.0.tar.gz/src/encoder_state-ctors_dtors.c
Changed
@@ -20,29 +20,41 @@ #include "encoder_state-ctors_dtors.h" +#include <stdio.h> #include <stdlib.h> +#include "bitstream.h" +#include "cabac.h" +#include "cu.h" +#include "encoder.h" +#include "encoder_state-geometry.h" #include "encoderstate.h" +#include "extras/crypto.h" +#include "image.h" +#include "imagelist.h" +#include "kvazaar.h" +#include "threadqueue.h" +#include "videoframe.h" -static int encoder_state_config_global_init(encoder_state_t * const state) { - state->global->ref = kvz_image_list_alloc(MAX_REF_PIC_COUNT); - if(!state->global->ref) { +static int encoder_state_config_frame_init(encoder_state_t * const state) { + state->frame->ref = kvz_image_list_alloc(MAX_REF_PIC_COUNT); + if(!state->frame->ref) { fprintf(stderr, "Failed to allocate the picture list!\n"); return 0; } - state->global->ref_list = REF_PIC_LIST_0; - state->global->frame = 0; - state->global->poc = 0; - state->global->total_bits_coded = 0; - state->global->cur_gop_bits_coded = 0; - state->global->rc_alpha = 3.2003; - state->global->rc_beta = -1.367; + state->frame->ref_list = REF_PIC_LIST_0; + state->frame->num = 0; + state->frame->poc = 0; + state->frame->total_bits_coded = 0; + state->frame->cur_gop_bits_coded = 0; + state->frame->rc_alpha = 3.2003; + state->frame->rc_beta = -1.367; return 1; } -static void encoder_state_config_global_finalize(encoder_state_t * const state) { - kvz_image_list_destroy(state->global->ref); +static void encoder_state_config_frame_finalize(encoder_state_t * const state) { + kvz_image_list_destroy(state->frame->ref); } static int encoder_state_config_tile_init(encoder_state_t * const state, @@ -50,7 +62,7 @@ const int width, const int height, const int width_in_lcu, const int height_in_lcu) { const encoder_control_t * const encoder = state->encoder_control; - state->tile->frame = kvz_videoframe_alloc(width, height, 0); + state->tile->frame = kvz_videoframe_alloc(width, height, state->encoder_control->chroma_format); state->tile->frame->rec = NULL; @@ 
-72,14 +84,20 @@ state->tile->lcu_offset_in_ts = encoder->tiles_ctb_addr_rs_to_ts[lcu_offset_x + lcu_offset_y * encoder->in.width_in_lcu]; - //Allocate buffers - //order by row of (LCU_WIDTH * frame->width_in_lcu) pixels - state->tile->hor_buf_search = kvz_yuv_t_alloc(LCU_WIDTH * state->tile->frame->width_in_lcu * state->tile->frame->height_in_lcu); - //order by column of (LCU_WIDTH * encoder_state->height_in_lcu) pixels (there is no more extra pixel, since we can use a negative index) - state->tile->ver_buf_search = kvz_yuv_t_alloc(LCU_WIDTH * state->tile->frame->height_in_lcu * state->tile->frame->width_in_lcu); + // hor_buf_search and ver_buf_search store single row/col from each LCU row/col. + // Because these lines are independent, the chroma subsampling only matters in one + // of the directions, . + unsigned luma_size = LCU_WIDTH * state->tile->frame->width_in_lcu * state->tile->frame->height_in_lcu; + unsigned chroma_sizes_hor[] = { 0, luma_size / 2, luma_size / 2, luma_size }; + unsigned chroma_sizes_ver[] = { 0, luma_size / 2, luma_size, luma_size }; + unsigned chroma_size_hor = chroma_sizes_hor[state->encoder_control->chroma_format]; + unsigned chroma_size_ver = chroma_sizes_ver[state->encoder_control->chroma_format]; + + state->tile->hor_buf_search = kvz_yuv_t_alloc(luma_size, chroma_size_hor); + state->tile->ver_buf_search = kvz_yuv_t_alloc(luma_size, chroma_size_ver); if (encoder->sao_enable) { - state->tile->hor_buf_before_sao = kvz_yuv_t_alloc(LCU_WIDTH * state->tile->frame->width_in_lcu * state->tile->frame->height_in_lcu); + state->tile->hor_buf_before_sao = kvz_yuv_t_alloc(luma_size, chroma_size_hor); } else { state->tile->hor_buf_before_sao = NULL; } @@ -94,8 +112,14 @@ } else { state->tile->wf_jobs = NULL; } - state->tile->id = encoder->tiles_tile_id[state->tile->lcu_offset_in_ts]; + + state->tile->dbs_g = NULL; + if (state->encoder_control->cfg->crypto_features) { + state->tile->dbs_g = InitC(); + } + state->tile->m_prev_pos = 0; + return 1; } 
@@ -107,7 +131,9 @@ kvz_videoframe_free(state->tile->frame); state->tile->frame = NULL; - + if (state->encoder_control->cfg->crypto_features) { + DeleteCryptoC(state->tile->dbs_g); + } FREE_POINTER(state->tile->wf_jobs); } @@ -227,7 +253,7 @@ printf(" \"%p\" [\n", state); printf(" label = \"{encoder_state|"); printf("+ type=%c\\l", state->type); - if (!state->parent || state->global != state->parent->global) { + if (!state->parent || state->frame != state->parent->global) { printf("|+ global\\l"); } if (!state->parent || state->tile != state->parent->tile) { @@ -274,7 +300,7 @@ // //If parent_state is not NULL, the following variable should either be set to NULL, //in order to inherit from parent, or should point to a valid structure: - //child_state->global + //child_state->frame //child_state->tile //child_state->slice //child_state->wfrow @@ -291,9 +317,9 @@ const encoder_control_t * const encoder = child_state->encoder_control; child_state->type = ENCODER_STATE_TYPE_MAIN; assert(child_state->encoder_control); - child_state->global = MALLOC(encoder_state_config_global_t, 1); - if (!child_state->global || !encoder_state_config_global_init(child_state)) { - fprintf(stderr, "Could not initialize encoder_state->global!\n"); + child_state->frame = MALLOC(encoder_state_config_frame_t, 1); + if (!child_state->frame || !encoder_state_config_frame_init(child_state)) { + fprintf(stderr, "Could not initialize encoder_state->frame!\n"); return 0; } child_state->tile = MALLOC(encoder_state_config_tile_t, 1); @@ -313,7 +339,7 @@ } } else { child_state->encoder_control = parent_state->encoder_control; - if (!child_state->global) child_state->global = parent_state->global; + if (!child_state->frame) child_state->frame = parent_state->frame; if (!child_state->tile) child_state->tile = parent_state->tile; if (!child_state->slice) child_state->slice = parent_state->slice; if (!child_state->wfrow) child_state->wfrow = parent_state->wfrow; @@ -401,9 +427,9 @@ //Create a slice 
new_child = &child_state->children[child_count]; new_child->encoder_control = encoder; - new_child->type = ENCODER_STATE_TYPE_SLICE; - new_child->global = child_state->global; - new_child->tile = child_state->tile; + new_child->type = ENCODER_STATE_TYPE_SLICE; + new_child->frame = child_state->frame; + new_child->tile = child_state->tile; new_child->wfrow = child_state->wfrow; new_child->slice = MALLOC(encoder_state_config_slice_t, 1); if (!new_child->slice || !encoder_state_config_slice_init(new_child, range_start, range_end_slice)) { @@ -427,9 +453,9 @@ new_child = &child_state->children[child_count]; new_child->encoder_control = encoder; - new_child->type = ENCODER_STATE_TYPE_TILE; - new_child->global = child_state->global; - new_child->tile = MALLOC(encoder_state_config_tile_t, 1); + new_child->type = ENCODER_STATE_TYPE_TILE; + new_child->frame = child_state->frame; + new_child->tile = MALLOC(encoder_state_config_tile_t, 1); new_child->slice = child_state->slice; new_child->wfrow = child_state->wfrow; @@ -511,9 +537,9 @@ encoder_state_t *new_child = &child_state->children[i]; new_child->encoder_control = encoder; - new_child->type = ENCODER_STATE_TYPE_WAVEFRONT_ROW; - new_child->global = child_state->global; - new_child->tile = child_state->tile; + new_child->type = ENCODER_STATE_TYPE_WAVEFRONT_ROW; + new_child->frame = child_state->frame; + new_child->tile = child_state->tile; new_child->slice = child_state->slice; new_child->wfrow = MALLOC(encoder_state_config_wfrow_t, 1); @@ -668,9 +694,9 @@ FREE_POINTER(state->tile); } - if (!state->parent || (state->parent->global != state->global)) { - encoder_state_config_global_finalize(state); - FREE_POINTER(state->global); + if (!state->parent || (state->parent->frame != state->frame)) { + encoder_state_config_frame_finalize(state); + FREE_POINTER(state->frame); } kvz_bitstream_finalize(&state->stream);
kvazaar-0.8.3.tar.gz/src/encoder_state-ctors_dtors.h -> kvazaar-1.0.0.tar.gz/src/encoder_state-ctors_dtors.h
Changed
@@ -26,7 +26,7 @@ * Creation and destruction of encoder_state_t. */ -#include "global.h" +#include "global.h" // IWYU pragma: keep // Forward declare because including the header would lead to a cyclic
kvazaar-0.8.3.tar.gz/src/encoder_state-geometry.c -> kvazaar-1.0.0.tar.gz/src/encoder_state-geometry.c
Changed
@@ -20,8 +20,9 @@ #include "encoder_state-geometry.h" +#include "encoder.h" #include "encoderstate.h" - +#include "videoframe.h" int kvz_lcu_at_slice_start(const encoder_control_t * const encoder, int lcu_addr_in_ts) {
kvazaar-0.8.3.tar.gz/src/encoder_state-geometry.h -> kvazaar-1.0.0.tar.gz/src/encoder_state-geometry.h
Changed
@@ -26,7 +26,7 @@ * Helper functions for tiles and slices. */ -#include "global.h" +#include "global.h" // IWYU pragma: keep // Forward declare because including the header would lead to a cyclic
kvazaar-0.8.3.tar.gz/src/encoderstate.c -> kvazaar-1.0.0.tar.gz/src/encoderstate.c
Changed
@@ -21,24 +21,21 @@ #include "encoderstate.h" #include <math.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> -#include <assert.h> -#include "tables.h" #include "cabac.h" -#include "image.h" -#include "nal.h" #include "context.h" -#include "transform.h" -#include "intra.h" -#include "inter.h" +#include "encode_coding_tree.h" +#include "encoder_state-bitstream.h" #include "filter.h" -#include "search.h" -#include "sao.h" -#include "rdo.h" +#include "image.h" #include "rate_control.h" -#include "strategies/strategies-picture.h" +#include "sao.h" +#include "search.h" +#include "tables.h" + int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) { int i; @@ -55,38 +52,57 @@ videoframe_t* const frame = state->tile->frame; if (hor_buf) { - const int rdpx = lcu->position_px.x; - const int rdpy = lcu->position_px.y + lcu->size.y - 1; - const int by = lcu->position.y; - //Copy the bottom row of this LCU to the horizontal buffer - kvz_pixels_blit(&frame->rec->y[rdpy * frame->rec->stride + rdpx], - &hor_buf->y[lcu->position_px.x + by * frame->width], - lcu->size.x, 1, frame->rec->stride, frame->width); - kvz_pixels_blit(&frame->rec->u[(rdpy/2) * frame->rec->stride/2 + (rdpx/2)], - &hor_buf->u[lcu->position_px.x / 2 + by * frame->width / 2], - lcu->size.x / 2, 1, frame->rec->stride / 2, frame->width / 2); - kvz_pixels_blit(&frame->rec->v[(rdpy/2) * frame->rec->stride/2 + (rdpx/2)], - &hor_buf->v[lcu->position_px.x / 2 + by * frame->width / 2], - lcu->size.x / 2, 1, frame->rec->stride / 2, frame->width / 2); + vector2d_t bottom = { lcu->position_px.x, lcu->position_px.y + lcu->size.y - 1 }; + const int lcu_row = lcu->position.y; + + unsigned from_index = bottom.y * frame->rec->stride + bottom.x; + unsigned to_index = lcu->position_px.x + lcu_row * frame->width; + + kvz_pixels_blit(&frame->rec->y[from_index], + &hor_buf->y[to_index], + lcu->size.x, 1, + frame->rec->stride, frame->width); + + if (state->encoder_control->chroma_format != 
KVZ_CSP_400) { + unsigned from_index_c = (bottom.y / 2) * frame->rec->stride / 2 + (bottom.x / 2); + unsigned to_index_c = lcu->position_px.x / 2 + lcu_row * frame->width / 2; + + kvz_pixels_blit(&frame->rec->u[from_index_c], + &hor_buf->u[to_index_c], + lcu->size.x / 2, 1, + frame->rec->stride / 2, frame->width / 2); + kvz_pixels_blit(&frame->rec->v[from_index_c], + &hor_buf->v[to_index_c], + lcu->size.x / 2, 1, + frame->rec->stride / 2, frame->width / 2); + } } if (ver_buf) { - const int rdpx = lcu->position_px.x + lcu->size.x - 1; - const int rdpy = lcu->position_px.y; - const int bx = lcu->position.x; + //Copy the right row of this LCU to the vertical buffer. + const int lcu_col = lcu->position.x; + vector2d_t left = { lcu->position_px.x + lcu->size.x - 1, lcu->position_px.y }; - //Copy the right row of this LCU to the vertical buffer. - kvz_pixels_blit(&frame->rec->y[rdpy * frame->rec->stride + rdpx], - &ver_buf->y[lcu->position_px.y + bx * frame->height], - 1, lcu->size.y, frame->rec->stride, 1); - kvz_pixels_blit(&frame->rec->u[(rdpy/2) * frame->rec->stride/2 + (rdpx/2)], - &ver_buf->u[lcu->position_px.y / 2 + bx * frame->height / 2], - 1, lcu->size.y / 2, frame->rec->stride / 2, 1); - kvz_pixels_blit(&frame->rec->v[(rdpy/2) * frame->rec->stride/2 + (rdpx/2)], - &ver_buf->v[lcu->position_px.y / 2 + bx * frame->height / 2], - 1, lcu->size.y / 2, frame->rec->stride / 2, 1); + kvz_pixels_blit(&frame->rec->y[left.y * frame->rec->stride + left.x], + &ver_buf->y[lcu->position_px.y + lcu_col * frame->height], + 1, lcu->size.y, + frame->rec->stride, 1); + + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + unsigned from_index = (left.y / 2) * frame->rec->stride / 2 + (left.x / 2); + unsigned to_index = lcu->position_px.y / 2 + lcu_col * frame->height / 2; + + kvz_pixels_blit(&frame->rec->u[from_index], + &ver_buf->u[to_index], + 1, lcu->size.y / 2, + frame->rec->stride / 2, 1); + kvz_pixels_blit(&frame->rec->v[from_index], + &ver_buf->v[to_index], + 1, 
lcu->size.y / 2, + frame->rec->stride / 2, 1); + } } } @@ -172,8 +188,10 @@ // If SAO is merged, nothing else needs to be coded. if (!sao_luma->merge_left_flag && !sao_luma->merge_up_flag) { encode_sao_color(state, sao_luma, COLOR_Y); - encode_sao_color(state, sao_chroma, COLOR_U); - encode_sao_color(state, sao_chroma, COLOR_V); + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + encode_sao_color(state, sao_chroma, COLOR_U); + encode_sao_color(state, sao_chroma, COLOR_V); + } } } @@ -195,51 +213,9 @@ } if (encoder->sao_enable) { - const int stride = frame->width_in_lcu; - int32_t merge_cost_luma[3] = { INT32_MAX }; - int32_t merge_cost_chroma[3] = { INT32_MAX }; - sao_info_t *sao_luma = &frame->sao_luma[lcu->position.y * stride + lcu->position.x]; - sao_info_t *sao_chroma = &frame->sao_chroma[lcu->position.y * stride + lcu->position.x]; - - // Merge candidates - sao_info_t *sao_top_luma = lcu->position.y != 0 ? &frame->sao_luma[(lcu->position.y - 1) * stride + lcu->position.x] : NULL; - sao_info_t *sao_left_luma = lcu->position.x != 0 ? &frame->sao_luma[lcu->position.y * stride + lcu->position.x - 1] : NULL; - sao_info_t *sao_top_chroma = lcu->position.y != 0 ? &frame->sao_chroma[(lcu->position.y - 1) * stride + lcu->position.x] : NULL; - sao_info_t *sao_left_chroma = lcu->position.x != 0 ? 
&frame->sao_chroma[lcu->position.y * stride + lcu->position.x - 1] : NULL; - - kvz_sao_search_luma(state, frame, lcu->position.x, lcu->position.y, sao_luma, sao_top_luma, sao_left_luma, merge_cost_luma); - kvz_sao_search_chroma(state, frame, lcu->position.x, lcu->position.y, sao_chroma, sao_top_chroma, sao_left_chroma, merge_cost_chroma); - - sao_luma->merge_up_flag = sao_luma->merge_left_flag = 0; - // Check merge costs - if (sao_top_luma) { - // Merge up if cost is equal or smaller to the searched mode cost - if (merge_cost_luma[2] + merge_cost_chroma[2] <= merge_cost_luma[0] + merge_cost_chroma[0]) { - *sao_luma = *sao_top_luma; - *sao_chroma = *sao_top_chroma; - sao_luma->merge_up_flag = 1; - sao_luma->merge_left_flag = 0; - } - } - if (sao_left_luma) { - // Merge left if cost is equal or smaller to the searched mode cost - // AND smaller than merge up cost, if merge up was already chosen - if (merge_cost_luma[1] + merge_cost_chroma[1] <= merge_cost_luma[0] + merge_cost_chroma[0]) { - if (!sao_luma->merge_up_flag || merge_cost_luma[1] + merge_cost_chroma[1] < merge_cost_luma[2] + merge_cost_chroma[2]) { - *sao_luma = *sao_left_luma; - *sao_chroma = *sao_left_chroma; - sao_luma->merge_left_flag = 1; - sao_luma->merge_up_flag = 0; - } - } - } - assert(sao_luma->eo_class < SAO_NUM_EO); - assert(sao_chroma->eo_class < SAO_NUM_EO); - - CHECKPOINT_SAO_INFO("sao_luma", *sao_luma); - CHECKPOINT_SAO_INFO("sao_chroma", *sao_chroma); + kvz_sao_search_lcu(state, lcu->position.x, lcu->position.y); } - + // Copy LCU cu_array to main states cu_array, because that is the only one // which is given to the next frame through image_list_t. 
{ @@ -249,21 +225,17 @@ while (main_state->parent) main_state = main_state->parent; assert(main_state != state); - unsigned child_width_in_scu = state->tile->frame->width_in_lcu << MAX_DEPTH; - unsigned main_width_in_scu = main_state->tile->frame->width_in_lcu << MAX_DEPTH; - unsigned tile_x = state->tile->lcu_offset_x; - unsigned tile_y = state->tile->lcu_offset_y; - - unsigned x = lcu->position.x << MAX_DEPTH; - unsigned y = lcu->position.y << MAX_DEPTH; - - for (unsigned lcu_row = 0; lcu_row < 8; ++lcu_row) { - cu_info_t *main_row = &main_state->tile->frame->cu_array->data[x + tile_x + (y + tile_y + lcu_row) * main_width_in_scu]; - cu_info_t *child_row = &state->tile->frame->cu_array->data[x + (y + lcu_row) * child_width_in_scu]; - memcpy(main_row, child_row, sizeof(cu_info_t) * 8); - } + const unsigned tile_x_px = state->tile->lcu_offset_x << LOG2_LCU_WIDTH; + const unsigned tile_y_px = state->tile->lcu_offset_y << LOG2_LCU_WIDTH; + const unsigned x_px = lcu->position_px.x; + const unsigned y_px = lcu->position_px.y; + kvz_cu_array_copy(main_state->tile->frame->cu_array, + x_px + tile_x_px, y_px + tile_y_px, + state->tile->frame->cu_array, + x_px, y_px, + LCU_WIDTH, LCU_WIDTH); - PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, state->encoder_control->threadqueue, "type=copy_cuinfo,frame=%d,tile=%d", state->global->frame, state->tile->id); + PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, state->encoder_control->threadqueue, "type=copy_cuinfo,frame=%d,tile=%d", state->frame->num, state->tile->id); } //Now write data to bitstream (required to have a correct CABAC state) @@ -302,11 +274,18 @@ } if (encoder->sao_enable && lcu->above) { - //If we're not the first in the row + // Add the post-deblocking but pre-SAO pixels of the LCU row above this + // row to a buffer so this row can use them on it's own SAO + // reconstruction. 
+ + // The pixels need to be taken to from the LCU to the top-left, because + // not all of the pixels could be deblocked before prediction of this + // LCU was reconstructed. if (lcu->above->left) { encoder_state_recdata_to_bufs(state, lcu->above->left, state->tile->hor_buf_before_sao, NULL); } - //Latest LCU in the row, copy the data from the one above also + // If this is the last LCU in the row, we can save the pixels from the top + // also, as they have been fully deblocked. if (!lcu->right) { encoder_state_recdata_to_bufs(state, lcu->above, state->tile->hor_buf_before_sao, NULL); } @@ -316,6 +295,8 @@ static void encoder_state_encode_leaf(encoder_state_t * const state) { assert(state->is_leaf); assert(state->lcu_order_count > 0); + + const kvz_config *cfg = state->encoder_control->cfg; // Select whether to encode the frame/tile in current thread or to define // wavefront jobs for other threads to handle. @@ -333,7 +314,7 @@ #ifdef KVZ_DEBUG { const lcu_order_element_t * const lcu = &state->lcu_order[i]; - PERFORMANCE_MEASURE_END(KVZ_PERF_LCU, state->encoder_control->threadqueue, "type=encode_lcu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", state->global->frame, state->tile->id, state->slice->id, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH + lcu->size.x - 1, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH + lcu->size.y - 1); + PERFORMANCE_MEASURE_END(KVZ_PERF_LCU, state->encoder_control->threadqueue, "type=encode_lcu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", state->frame->num, state->tile->id, state->slice->id, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH + lcu->size.x - 1, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH + lcu->size.y - 1); } #endif //KVZ_DEBUG 
} @@ -341,7 +322,7 @@ if (state->encoder_control->sao_enable) { PERFORMANCE_MEASURE_START(KVZ_PERF_SAOREC); kvz_sao_reconstruct_frame(state); - PERFORMANCE_MEASURE_END(KVZ_PERF_SAOREC, state->encoder_control->threadqueue, "type=kvz_sao_reconstruct_frame,frame=%d,tile=%d,slice=%d,row=%d-%d,px_x=%d-%d,px_y=%d-%d", state->global->frame, state->tile->id, state->slice->id, state->lcu_order[0].position.y + state->tile->lcu_offset_y, state->lcu_order[state->lcu_order_count - 1].position.y + state->tile->lcu_offset_y, + PERFORMANCE_MEASURE_END(KVZ_PERF_SAOREC, state->encoder_control->threadqueue, "type=kvz_sao_reconstruct_frame,frame=%d,tile=%d,slice=%d,row=%d-%d,px_x=%d-%d,px_y=%d-%d", state->frame->num, state->tile->id, state->slice->id, state->lcu_order[0].position.y + state->tile->lcu_offset_y, state->lcu_order[state->lcu_order_count - 1].position.y + state->tile->lcu_offset_y, state->tile->lcu_offset_x * LCU_WIDTH, state->tile->frame->width + state->tile->lcu_offset_x * LCU_WIDTH - 1, state->tile->lcu_offset_y * LCU_WIDTH, state->tile->frame->height + state->tile->lcu_offset_y * LCU_WIDTH - 1 ); @@ -349,11 +330,35 @@ } else { // Add each LCU in the wavefront row as it's own job to the queue. + // Select which frame dependancies should be set to. + const encoder_state_t * ref_state = NULL; + if (cfg->gop_lowdelay && + cfg->gop_len > 0 && + state->previous_encoder_state != state) + { + // For LP-gop, depend on the state of the first reference. + int ref_neg = cfg->gop[(state->frame->poc - 1) % cfg->gop_len].ref_neg[0]; + if (ref_neg > state->encoder_control->owf) { + // If frame is not within OWF range, it's already done. + ref_state = NULL; + } else { + ref_state = state->previous_encoder_state; + while (ref_neg > 1) { + ref_neg -= 1; + ref_state = ref_state->previous_encoder_state; + } + } + } else { + // Otherwise, depend on the previous frame. 
+ ref_state = state->previous_encoder_state; + } + for (int i = 0; i < state->lcu_order_count; ++i) { const lcu_order_element_t * const lcu = &state->lcu_order[i]; + #ifdef KVZ_DEBUG char job_description[256]; - sprintf(job_description, "type=encode_lcu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", state->global->frame, state->tile->id, state->slice->id, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH + lcu->size.x - 1, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH + lcu->size.y - 1); + sprintf(job_description, "type=encode_lcu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", state->frame->num, state->tile->id, state->slice->id, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH, lcu->position_px.x + state->tile->lcu_offset_x * LCU_WIDTH + lcu->size.x - 1, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH, lcu->position_px.y + state->tile->lcu_offset_y * LCU_WIDTH + lcu->size.y - 1); #else char* job_description = NULL; #endif @@ -365,12 +370,16 @@ // once. The added dependancy is for the first LCU of each wavefront // row to depend on the reconstruction status of the row below in the // previous frame. 
- if (state->previous_encoder_state != state && state->previous_encoder_state->tqj_recon_done && state->global->slicetype != KVZ_SLICE_I) { + if (ref_state != NULL && + state->previous_encoder_state->tqj_recon_done && + state->frame->slicetype != KVZ_SLICE_I) + { if (!lcu->left) { + const lcu_order_element_t * const ref_lcu = &ref_state->lcu_order[i]; if (lcu->below) { - kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], lcu->below->encoder_state->previous_encoder_state->tqj_recon_done); + kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], ref_lcu->below->encoder_state->tqj_recon_done); } else { - kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], lcu->encoder_state->previous_encoder_state->tqj_recon_done); + kvz_threadqueue_job_dep_add(state->tile->wf_jobs[lcu->id], ref_lcu->encoder_state->tqj_recon_done); } } } @@ -410,12 +419,12 @@ if (sub_state->type != ENCODER_STATE_TYPE_WAVEFRONT_ROW) { PERFORMANCE_MEASURE_START(KVZ_PERF_BSLEAF); kvz_encoder_state_write_bitstream_leaf(sub_state); - PERFORMANCE_MEASURE_END(KVZ_PERF_BSLEAF, sub_state->encoder_control->threadqueue, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->lcu_order[0].position_px.x + sub_state->tile->lcu_offset_x * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count - 1].position_px.x + sub_state->lcu_order[sub_state->lcu_order_count - 1].size.x + sub_state->tile->lcu_offset_x * LCU_WIDTH - 1, sub_state->lcu_order[0].position_px.y + sub_state->tile->lcu_offset_y * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count - 1].position_px.y + sub_state->lcu_order[sub_state->lcu_order_count - 1].size.y + sub_state->tile->lcu_offset_y * LCU_WIDTH - 1); + PERFORMANCE_MEASURE_END(KVZ_PERF_BSLEAF, sub_state->encoder_control->threadqueue, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", sub_state->frame->num, 
sub_state->tile->id, sub_state->slice->id, sub_state->lcu_order[0].position_px.x + sub_state->tile->lcu_offset_x * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count - 1].position_px.x + sub_state->lcu_order[sub_state->lcu_order_count - 1].size.x + sub_state->tile->lcu_offset_x * LCU_WIDTH - 1, sub_state->lcu_order[0].position_px.y + sub_state->tile->lcu_offset_y * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count - 1].position_px.y + sub_state->lcu_order[sub_state->lcu_order_count - 1].size.y + sub_state->tile->lcu_offset_y * LCU_WIDTH - 1); } else { threadqueue_job_t *job; #ifdef KVZ_DEBUG char job_description[256]; - sprintf(job_description, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", sub_state->global->frame, sub_state->tile->id, sub_state->slice->id, sub_state->lcu_order[0].position_px.x + sub_state->tile->lcu_offset_x * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count-1].position_px.x + sub_state->lcu_order[sub_state->lcu_order_count-1].size.x + sub_state->tile->lcu_offset_x * LCU_WIDTH - 1, sub_state->lcu_order[0].position_px.y + sub_state->tile->lcu_offset_y * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count-1].position_px.y + sub_state->lcu_order[sub_state->lcu_order_count-1].size.y + sub_state->tile->lcu_offset_y * LCU_WIDTH - 1); + sprintf(job_description, "type=encoder_state_write_bitstream_leaf,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d", sub_state->frame->num, sub_state->tile->id, sub_state->slice->id, sub_state->lcu_order[0].position_px.x + sub_state->tile->lcu_offset_x * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count-1].position_px.x + sub_state->lcu_order[sub_state->lcu_order_count-1].size.x + sub_state->tile->lcu_offset_x * LCU_WIDTH - 1, sub_state->lcu_order[0].position_px.y + sub_state->tile->lcu_offset_y * LCU_WIDTH, sub_state->lcu_order[sub_state->lcu_order_count-1].position_px.y + sub_state->lcu_order[sub_state->lcu_order_count-1].size.y + 
sub_state->tile->lcu_offset_y * LCU_WIDTH - 1); #else char* job_description = NULL; #endif @@ -444,8 +453,12 @@ //TODO: copy only needed data kvz_pixel *new_y_data = MALLOC(kvz_pixel, frame->width * frame->height); - kvz_pixel *new_u_data = MALLOC(kvz_pixel, (frame->width * frame->height) >> 2); - kvz_pixel *new_v_data = MALLOC(kvz_pixel, (frame->width * frame->height) >> 2); + kvz_pixel *new_u_data = NULL; + kvz_pixel *new_v_data = NULL; + if (frame->rec->chroma_format != KVZ_CSP_400) { + new_u_data = MALLOC(kvz_pixel, (frame->width * frame->height) >> 2); + new_v_data = MALLOC(kvz_pixel, (frame->width * frame->height) >> 2); + } const int offset = frame->width * (data->y*LCU_WIDTH); const int offset_c = frame->width/2 * (data->y*LCU_WIDTH_C); @@ -456,14 +469,18 @@ } memcpy(&new_y_data[offset], &frame->rec->y[offset], sizeof(kvz_pixel) * num_pixels); - memcpy(&new_u_data[offset_c], &frame->rec->u[offset_c], sizeof(kvz_pixel) * num_pixels >> 2); - memcpy(&new_v_data[offset_c], &frame->rec->v[offset_c], sizeof(kvz_pixel) * num_pixels >> 2); + if (frame->rec->chroma_format != KVZ_CSP_400) { + memcpy(&new_u_data[offset_c], &frame->rec->u[offset_c], sizeof(kvz_pixel) * num_pixels >> 2); + memcpy(&new_v_data[offset_c], &frame->rec->v[offset_c], sizeof(kvz_pixel) * num_pixels >> 2); + } if (data->y>0) { //copy first row from buffer memcpy(&new_y_data[frame->width * (data->y*LCU_WIDTH-1)], &data->encoder_state->tile->hor_buf_before_sao->y[frame->width * (data->y-1)], frame->width * sizeof(kvz_pixel)); - memcpy(&new_u_data[frame->width/2 * (data->y*LCU_WIDTH_C-1)], &data->encoder_state->tile->hor_buf_before_sao->u[frame->width/2 * (data->y-1)], frame->width/2 * sizeof(kvz_pixel)); - memcpy(&new_v_data[frame->width/2 * (data->y*LCU_WIDTH_C-1)], &data->encoder_state->tile->hor_buf_before_sao->v[frame->width/2 * (data->y-1)], frame->width/2 * sizeof(kvz_pixel)); + if (frame->rec->chroma_format != KVZ_CSP_400) { + memcpy(&new_u_data[frame->width / 2 * (data->y*LCU_WIDTH_C - 
1)], &data->encoder_state->tile->hor_buf_before_sao->u[frame->width / 2 * (data->y - 1)], frame->width / 2 * sizeof(kvz_pixel)); + memcpy(&new_v_data[frame->width / 2 * (data->y*LCU_WIDTH_C - 1)], &data->encoder_state->tile->hor_buf_before_sao->v[frame->width / 2 * (data->y - 1)], frame->width / 2 * sizeof(kvz_pixel)); + } } for (x = 0; x < frame->width_in_lcu; x++) { @@ -471,8 +488,10 @@ sao_info_t *sao_luma = &frame->sao_luma[data->y * stride + x]; sao_info_t *sao_chroma = &frame->sao_chroma[data->y * stride + x]; kvz_sao_reconstruct(data->encoder_state->encoder_control, frame, new_y_data, x, data->y, sao_luma, COLOR_Y); - kvz_sao_reconstruct(data->encoder_state->encoder_control, frame, new_u_data, x, data->y, sao_chroma, COLOR_U); - kvz_sao_reconstruct(data->encoder_state->encoder_control, frame, new_v_data, x, data->y, sao_chroma, COLOR_V); + if (frame->rec->chroma_format != KVZ_CSP_400) { + kvz_sao_reconstruct(data->encoder_state->encoder_control, frame, new_u_data, x, data->y, sao_chroma, COLOR_U); + kvz_sao_reconstruct(data->encoder_state->encoder_control, frame, new_v_data, x, data->y, sao_chroma, COLOR_V); + } } free(new_y_data); @@ -532,26 +551,35 @@ char job_description[256]; switch (main_state->children[i].type) { case ENCODER_STATE_TYPE_TILE: - sprintf(job_description, "type=encode_child,frame=%d,tile=%d,row=%d-%d,px_x=%d-%d,px_y=%d-%d", main_state->children[i].global->frame, main_state->children[i].tile->id, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y, + sprintf(job_description, "type=encode_child,frame=%d,tile=%d,row=%d-%d,px_x=%d-%d,px_y=%d-%d", main_state->children[i].frame->num, main_state->children[i].tile->id, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y, main_state->children[i].lcu_order[0].position.y + main_state->children[i].tile->lcu_offset_y, 
main_state->children[i].lcu_order[0].position_px.x + main_state->children[i].tile->lcu_offset_x * LCU_WIDTH, main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].position_px.x + main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].size.x + main_state->children[i].tile->lcu_offset_x * LCU_WIDTH - 1, main_state->children[i].lcu_order[0].position_px.y + main_state->children[i].tile->lcu_offset_y * LCU_WIDTH, main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].position_px.y + main_state->children[i].lcu_order[main_state->children[i].lcu_order_count-1].size.y + main_state->children[i].tile->lcu_offset_y * LCU_WIDTH - 1); break; case ENCODER_STATE_TYPE_SLICE: - sprintf(job_description, "type=encode_child,frame=%d,slice=%d,start_in_ts=%d", main_state->children[i].global->frame, main_state->children[i].slice->id, main_state->children[i].slice->start_in_ts); + sprintf(job_description, "type=encode_child,frame=%d,slice=%d,start_in_ts=%d", main_state->children[i].frame->num, main_state->children[i].slice->id, main_state->children[i].slice->start_in_ts); break; default: - sprintf(job_description, "type=encode_child,frame=%d,invalid", main_state->children[i].global->frame); + sprintf(job_description, "type=encode_child,frame=%d,invalid", main_state->children[i].frame->num); break; } #else char* job_description = NULL; #endif main_state->children[i].tqj_recon_done = kvz_threadqueue_submit(main_state->encoder_control->threadqueue, encoder_state_worker_encode_children, &(main_state->children[i]), 1, job_description); - if (main_state->children[i].previous_encoder_state != &main_state->children[i] && main_state->children[i].previous_encoder_state->tqj_recon_done && !main_state->children[i].global->is_idr_frame) { - // Add dependancy to each child in the previous frame. - // TODO: Make it so that only adjacent tiles are dependet upon and search is constrained to those? 
- for (int child_id = 0; main_state->children[child_id].encoder_control; ++child_id) { - kvz_threadqueue_job_dep_add(main_state->children[i].tqj_recon_done, main_state->children[child_id].previous_encoder_state->tqj_recon_done); + if (main_state->children[i].previous_encoder_state != &main_state->children[i] && main_state->children[i].previous_encoder_state->tqj_recon_done && !main_state->children[i].frame->is_idr_frame) { +#if 0 + // Disabled due to non-determinism. + if (main_state->encoder_control->cfg->mv_constraint == KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN) + { + // When MV's don't cross tile boundaries, add dependancy only to the same tile. + kvz_threadqueue_job_dep_add(main_state->children[i].tqj_recon_done, main_state->children[i].previous_encoder_state->tqj_recon_done); + } else +#endif + { + // Add dependancy to each child in the previous frame. + for (int child_id = 0; main_state->children[child_id].encoder_control; ++child_id) { + kvz_threadqueue_job_dep_add(main_state->children[i].tqj_recon_done, main_state->children[child_id].previous_encoder_state->tqj_recon_done); + } } } kvz_threadqueue_job_unwait_job(main_state->encoder_control->threadqueue, main_state->children[i].tqj_recon_done); @@ -562,18 +590,22 @@ } } - //If children are wavefront, we need to reconstruct SAO - if (main_state->encoder_control->sao_enable && main_state->children[0].type == ENCODER_STATE_TYPE_WAVEFRONT_ROW) { + // Add SAO reconstruction jobs and their dependancies when using WPP coding. + if (main_state->encoder_control->sao_enable && + main_state->children[0].type == ENCODER_STATE_TYPE_WAVEFRONT_ROW) + { int y; videoframe_t * const frame = main_state->tile->frame; threadqueue_job_t *previous_job = NULL; for (y = 0; y < frame->height_in_lcu; ++y) { + // Queue a single job performing SAO reconstruction for the whole wavefront row. 
+ worker_sao_reconstruct_lcu_data *data = MALLOC(worker_sao_reconstruct_lcu_data, 1); threadqueue_job_t *job; #ifdef KVZ_DEBUG char job_description[256]; - sprintf(job_description, "type=sao,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d", main_state->global->frame, main_state->tile->id, main_state->tile->lcu_offset_x * LCU_WIDTH, main_state->tile->lcu_offset_x * LCU_WIDTH + main_state->tile->frame->width - 1, (main_state->tile->lcu_offset_y + y) * LCU_WIDTH, MIN(main_state->tile->lcu_offset_y * LCU_WIDTH + main_state->tile->frame->height, (main_state->tile->lcu_offset_y + y + 1) * LCU_WIDTH)-1); + sprintf(job_description, "type=sao,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d", main_state->frame->num, main_state->tile->id, main_state->tile->lcu_offset_x * LCU_WIDTH, main_state->tile->lcu_offset_x * LCU_WIDTH + main_state->tile->frame->width - 1, (main_state->tile->lcu_offset_y + y) * LCU_WIDTH, MIN(main_state->tile->lcu_offset_y * LCU_WIDTH + main_state->tile->frame->height, (main_state->tile->lcu_offset_y + y + 1) * LCU_WIDTH)-1); #else char* job_description = NULL; #endif @@ -582,24 +614,31 @@ job = kvz_threadqueue_submit(main_state->encoder_control->threadqueue, encoder_state_worker_sao_reconstruct_lcu, data, 1, job_description); + // This dependancy is needed, because the pre-SAO pixels from the LCU row + // below this one are read straight from the frame. if (previous_job) { kvz_threadqueue_job_dep_add(job, previous_job); } previous_job = job; + // This dependancy ensures that the bottom edge of this LCU row + // has been fully deblocked. if (y < frame->height_in_lcu - 1) { - //Not last row: depend on the last LCU of the row below + // Not last row: depend on the last LCU of the row below.
kvz_threadqueue_job_dep_add(job, main_state->tile->wf_jobs[(y + 1) * frame->width_in_lcu + frame->width_in_lcu - 1]); } else { - //Last row: depend on the last LCU of the row + // Last row: depend on the last LCU of the row kvz_threadqueue_job_dep_add(job, main_state->tile->wf_jobs[(y + 0) * frame->width_in_lcu + frame->width_in_lcu - 1]); } kvz_threadqueue_job_unwait_job(main_state->encoder_control->threadqueue, job); - //Set wfrow recon job + // The wavefront row is finished, when the SAO-reconstruction is + // finished. main_state->children[y].tqj_recon_done = job; if (y == frame->height_in_lcu - 1) { + // This tile is finished, when the reconstruction of the last + // WPP-row is finished. assert(!main_state->tqj_recon_done); main_state->tqj_recon_done = job; } @@ -653,12 +692,12 @@ // List all pocs of lists int j = 0; - for (j = 0; j < state->global->ref->used_size; j++) { - if (state->global->ref->pocs[j] < state->global->poc) { - ref_list_poc_out[0][ref_list_len_out[0]] = state->global->ref->pocs[j]; + for (j = 0; j < state->frame->ref->used_size; j++) { + if (state->frame->ref->pocs[j] < state->frame->poc) { + ref_list_poc_out[0][ref_list_len_out[0]] = state->frame->ref->pocs[j]; ref_list_len_out[0]++; } else { - ref_list_poc_out[1][ref_list_len_out[1]] = state->global->ref->pocs[j]; + ref_list_poc_out[1][ref_list_len_out[1]] = state->frame->ref->pocs[j]; ref_list_len_out[1]++; } } @@ -679,164 +718,201 @@ kvz_encoder_get_ref_lists(state, ref_list_len, ref_list_poc); - for (int j = 0; j < state->global->ref->used_size; j++) { - if (state->global->ref->pocs[j] < state->global->poc) { + for (int j = 0; j < state->frame->ref->used_size; j++) { + if (state->frame->ref->pocs[j] < state->frame->poc) { for (int ref_idx = 0; ref_idx < ref_list_len[0]; ref_idx++) { - if (ref_list_poc[0][ref_idx] == state->global->ref->pocs[j]) { - state->global->refmap[j].idx = ref_list_len[0] - ref_idx - 1; + if (ref_list_poc[0][ref_idx] == state->frame->ref->pocs[j]) { + 
state->frame->refmap[j].idx = ref_list_len[0] - ref_idx - 1; break; } } - state->global->refmap[j].list = 1; + state->frame->refmap[j].list = 1; } else { for (int ref_idx = 0; ref_idx < ref_list_len[1]; ref_idx++) { - if (ref_list_poc[1][ref_idx] == state->global->ref->pocs[j]) { - state->global->refmap[j].idx = ref_idx; + if (ref_list_poc[1][ref_idx] == state->frame->ref->pocs[j]) { + state->frame->refmap[j].idx = ref_idx; break; } } - state->global->refmap[j].list = 2; + state->frame->refmap[j].list = 2; } - state->global->refmap[j].poc = state->global->ref->pocs[j]; + state->frame->refmap[j].poc = state->frame->ref->pocs[j]; } } +/** + * \brief Remove any references that should no longer be used. + */ static void encoder_state_remove_refs(encoder_state_t *state) { const encoder_control_t * const encoder = state->encoder_control; - int8_t refnumber = encoder->cfg->ref_frames; - int8_t check_refs = 0; + + int neg_refs = encoder->cfg->gop[state->frame->gop_offset].ref_neg_count; + int pos_refs = encoder->cfg->gop[state->frame->gop_offset].ref_pos_count; + + unsigned target_ref_num; if (encoder->cfg->gop_len) { - refnumber = encoder->cfg->gop[state->global->gop_offset].ref_neg_count + encoder->cfg->gop[state->global->gop_offset].ref_pos_count; - check_refs = 1; - } else if (state->global->slicetype == KVZ_SLICE_I) { - refnumber = 0; + target_ref_num = neg_refs + pos_refs; + } else { + target_ref_num = encoder->cfg->ref_frames; } - // Remove the ref pic (if present) - while (check_refs || state->global->ref->used_size > (uint32_t)refnumber) { - int8_t ref_to_remove = state->global->ref->used_size - 1; - if (encoder->cfg->gop_len) { - for (int ref = 0; ref < state->global->ref->used_size; ref++) { - uint8_t found = 0; - for (int i = 0; i < encoder->cfg->gop[state->global->gop_offset].ref_neg_count; i++) { - if (state->global->ref->pocs[ref] == state->global->poc - encoder->cfg->gop[state->global->gop_offset].ref_neg[i]) { - found = 1; - break; - } - } - if (found) 
continue; - for (int i = 0; i < encoder->cfg->gop[state->global->gop_offset].ref_pos_count; i++) { - if (state->global->ref->pocs[ref] == state->global->poc + encoder->cfg->gop[state->global->gop_offset].ref_pos[i]) { - found = 1; - break; - } + if (state->frame->slicetype == KVZ_SLICE_I) { + target_ref_num = 0; + } + + if (encoder->cfg->gop_len && target_ref_num > 0) { + // With GOP in use, go through all the existing reference pictures and + // remove any picture that is not referenced by the current picture. + + for (int ref = state->frame->ref->used_size - 1; ref >= 0; --ref) { + bool is_referenced = false; + + int ref_poc = state->frame->ref->pocs[ref]; + + for (int i = 0; i < neg_refs; i++) { + int ref_relative_poc = -encoder->cfg->gop[state->frame->gop_offset].ref_neg[i]; + if (ref_poc == state->frame->poc + ref_relative_poc) { + is_referenced = true; + break; } - if (!found) { - kvz_image_list_rem(state->global->ref, ref); - ref--; + } + + + for (int i = 0; i < pos_refs; i++) { + int ref_relative_poc = encoder->cfg->gop[state->frame->gop_offset].ref_pos[i]; + if (ref_poc == state->frame->poc + ref_relative_poc) { + is_referenced = true; + break; } } - check_refs = 0; - } else kvz_image_list_rem(state->global->ref, ref_to_remove); + + if (!is_referenced) { + // This reference is not referred to by this frame, so it must be removed. + kvz_image_list_rem(state->frame->ref, ref); + } + } + } else { + // Without GOP, remove the oldest picture.
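The GOP-based removal above keeps a held picture only if the current GOP entry lists it among its negative (past) or positive (future) reference offsets. A minimal standalone sketch of that check, with a hypothetical helper name and flat arrays in place of the kvazaar `kvz_gop_config` fields:

```c
#include <assert.h>
#include <stdbool.h>

/* Sketch of the check performed by encoder_state_remove_refs: a held
 * picture with POC ref_poc is still needed if the current GOP entry
 * refers to it via a negative (ref_neg) or positive (ref_pos) offset.
 * Helper name and parameter layout are illustrative, not the kvazaar API. */
static bool is_referenced(int ref_poc, int cur_poc,
                          const int *ref_neg, int neg_count,
                          const int *ref_pos, int pos_count)
{
  for (int i = 0; i < neg_count; i++) {
    if (ref_poc == cur_poc - ref_neg[i]) return true;  /* past reference */
  }
  for (int i = 0; i < pos_count; i++) {
    if (ref_poc == cur_poc + ref_pos[i]) return true;  /* future reference */
  }
  return false;
}
```

Pictures for which this check fails are the ones the diff drops with `kvz_image_list_rem`.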
+ while (state->frame->ref->used_size > target_ref_num) { + int8_t oldest_ref = state->frame->ref->used_size - 1; + kvz_image_list_rem(state->frame->ref, oldest_ref); + } } + assert(state->frame->ref->used_size <= target_ref_num); } static void encoder_state_reset_poc(encoder_state_t *state) { - int i; - - state->global->poc = 0; + state->frame->poc = 0; kvz_videoframe_set_poc(state->tile->frame, 0); - - for (i=0; state->children[i].encoder_control; ++i) { + + for (int i = 0; state->children[i].encoder_control; ++i) { encoder_state_t *sub_state = &(state->children[i]); encoder_state_reset_poc(sub_state); } } -static void encoder_state_new_frame(encoder_state_t * const state) { - int i; - //FIXME Move this somewhere else! - if (state->type == ENCODER_STATE_TYPE_MAIN) { - const encoder_control_t * const encoder = state->encoder_control; - - if (state->global->frame == 0) { - state->global->is_idr_frame = true; - } else if (encoder->cfg->gop_len) { - // Closed GOP / CRA is not yet supported. - state->global->is_idr_frame = false; - - // Calculate POC according to the global frame counter and GOP structure - int32_t poc = state->global->frame - 1; - int32_t poc_offset = encoder->cfg->gop[state->global->gop_offset].poc_offset; - state->global->poc = poc - poc % encoder->cfg->gop_len + poc_offset; - kvz_videoframe_set_poc(state->tile->frame, state->global->poc); - } else { - bool is_i_idr = (encoder->cfg->intra_period == 1 && state->global->frame % 2 == 0); - bool is_p_idr = (encoder->cfg->intra_period > 1 && (state->global->frame % encoder->cfg->intra_period) == 0); - state->global->is_idr_frame = is_i_idr || is_p_idr; - } - - if (state->global->is_idr_frame) { - encoder_state_reset_poc(state); - state->global->slicetype = KVZ_SLICE_I; - state->global->pictype = KVZ_NAL_IDR_W_RADL; - } else { - state->global->slicetype = encoder->cfg->intra_period==1 ? KVZ_SLICE_I : (state->encoder_control->cfg->gop_len?KVZ_SLICE_B:KVZ_SLICE_P); - - // Use P-slice for lowdelay. 
- if (state->global->slicetype == KVZ_SLICE_B && encoder->cfg->gop_lowdelay) { - state->global->slicetype = KVZ_SLICE_P; - } - - state->global->pictype = KVZ_NAL_TRAIL_R; - if (state->encoder_control->cfg->gop_len) { - if (encoder->cfg->intra_period > 1 && (state->global->poc % encoder->cfg->intra_period) == 0) { - state->global->slicetype = KVZ_SLICE_I; - } - } +static void encoder_set_source_picture(encoder_state_t * const state, kvz_picture* frame) +{ + assert(!state->tile->frame->source); + assert(!state->tile->frame->rec); - } + state->tile->frame->source = frame; + if (state->encoder_control->cfg->lossless) { + // In lossless mode, the reconstruction is equal to the source frame. + state->tile->frame->rec = kvz_image_copy_ref(frame); + } else { + state->tile->frame->rec = kvz_image_alloc(state->encoder_control->chroma_format, frame->width, frame->height); + } - encoder_state_remove_refs(state); - encoder_state_ref_sort(state); - double lambda; - if (encoder->cfg->target_bitrate > 0) { - // Rate control enabled. 
- lambda = kvz_select_picture_lambda(state); - state->global->QP = kvz_lambda_to_QP(lambda); - } else { - if (encoder->cfg->gop_len > 0 && state->global->slicetype != KVZ_SLICE_I) { - kvz_gop_config const * const gop = - encoder->cfg->gop + state->global->gop_offset; - state->global->QP = encoder->cfg->qp + gop->qp_offset; - state->global->QP_factor = gop->qp_factor; - } else { - state->global->QP = encoder->cfg->qp; - } - lambda = kvz_select_picture_lambda_from_qp(state); - } - state->global->cur_lambda_cost = lambda; - state->global->cur_lambda_cost_sqrt = sqrt(lambda); + kvz_videoframe_set_poc(state->tile->frame, state->frame->poc); +} - } +static void encoder_state_init_children(encoder_state_t * const state) { kvz_bitstream_clear(&state->stream); - + if (state->is_leaf) { //Leaf states have cabac and context kvz_cabac_start(&state->cabac); - kvz_init_contexts(state, state->global->QP, state->global->slicetype); + kvz_init_contexts(state, state->frame->QP, state->frame->slicetype); } - + //Clear the jobs state->tqj_bitstream_written = NULL; state->tqj_recon_done = NULL; - - for (i = 0; state->children[i].encoder_control; ++i) { - encoder_state_new_frame(&state->children[i]); + + for (int i = 0; state->children[i].encoder_control; ++i) { + encoder_state_init_children(&state->children[i]); } +} + +static void encoder_state_init_new_frame(encoder_state_t * const state, kvz_picture* frame) { + assert(state->type == ENCODER_STATE_TYPE_MAIN); + + const kvz_config * const cfg = state->encoder_control->cfg; + + encoder_set_source_picture(state, frame); + + if (state->frame->num == 0) { + state->frame->is_idr_frame = true; + } else if (cfg->gop_len) { + // Closed GOP / CRA is not yet supported. 
+ state->frame->is_idr_frame = false; + // Calculate POC according to the global frame counter and GOP structure + int32_t poc = state->frame->num - 1; + int32_t poc_offset = cfg->gop[state->frame->gop_offset].poc_offset; + state->frame->poc = poc - poc % cfg->gop_len + poc_offset; + kvz_videoframe_set_poc(state->tile->frame, state->frame->poc); + } else { + bool is_i_idr = (cfg->intra_period == 1 && state->frame->num % 2 == 0); + bool is_p_idr = (cfg->intra_period > 1 && (state->frame->num % cfg->intra_period) == 0); + state->frame->is_idr_frame = is_i_idr || is_p_idr; + } + + if (state->frame->is_idr_frame) { + encoder_state_reset_poc(state); + state->frame->slicetype = KVZ_SLICE_I; + state->frame->pictype = KVZ_NAL_IDR_W_RADL; + } else { + state->frame->slicetype = cfg->intra_period==1 ? KVZ_SLICE_I : (state->encoder_control->cfg->gop_len?KVZ_SLICE_B:KVZ_SLICE_P); + // Use P-slice for lowdelay. + if (state->frame->slicetype == KVZ_SLICE_B && cfg->gop_lowdelay) { + state->frame->slicetype = KVZ_SLICE_P; + } + + state->frame->pictype = KVZ_NAL_TRAIL_R; + if (state->encoder_control->cfg->gop_len) { + if (cfg->intra_period > 1 && (state->frame->poc % cfg->intra_period) == 0) { + state->frame->slicetype = KVZ_SLICE_I; + } + } + + } + + encoder_state_remove_refs(state); + encoder_state_ref_sort(state); + double lambda; + if (cfg->target_bitrate > 0) { + // Rate control enabled. 
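The POC derivation added in `encoder_state_init_new_frame` maps the running frame counter onto the GOP structure: the counter (past the IDR) is rounded down to the start of its GOP and the entry's `poc_offset` is added. A self-contained sketch of just that arithmetic, with an invented helper name:

```c
#include <assert.h>

/* Sketch of the open-GOP POC computation from the diff: frame 0 is the
 * IDR with POC 0; for later frames, (num - 1) is rounded down to the
 * start of its GOP and the per-entry poc_offset is added.
 * Helper name is hypothetical; gop_len must be > 0. */
static int compute_poc(int frame_num, int gop_len, int poc_offset)
{
  int poc = frame_num - 1;
  return poc - poc % gop_len + poc_offset;
}
```

For example, with a 4-picture GOP, frame 5 in an entry with `poc_offset` 2 gets POC 6.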
+ lambda = kvz_select_picture_lambda(state); + state->frame->QP = kvz_lambda_to_QP(lambda); + } else { + if (cfg->gop_len > 0 && state->frame->slicetype != KVZ_SLICE_I) { + kvz_gop_config const * const gop = + cfg->gop + state->frame->gop_offset; + state->frame->QP = cfg->qp + gop->qp_offset; + state->frame->QP_factor = gop->qp_factor; + } else { + state->frame->QP = cfg->qp; + } + lambda = kvz_select_picture_lambda_from_qp(state); + } + state->frame->cur_lambda_cost = lambda; + state->frame->cur_lambda_cost_sqrt = sqrt(lambda); + + encoder_state_init_children(state); } static void _encode_one_frame_add_bitstream_deps(const encoder_state_t * const state, threadqueue_job_t * const job) { @@ -853,24 +929,24 @@ } -void kvz_encode_one_frame(encoder_state_t * const state) +void kvz_encode_one_frame(encoder_state_t * const state, kvz_picture* frame) { { PERFORMANCE_MEASURE_START(KVZ_PERF_FRAME); - encoder_state_new_frame(state); - PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, state->encoder_control->threadqueue, "type=new_frame,frame=%d,poc=%d", state->global->frame, state->global->poc); + encoder_state_init_new_frame(state, frame); + PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, state->encoder_control->threadqueue, "type=init_new_frame,frame=%d,poc=%d", state->frame->num, state->frame->poc); } { PERFORMANCE_MEASURE_START(KVZ_PERF_FRAME); encoder_state_encode(state); - PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, state->encoder_control->threadqueue, "type=encode,frame=%d", state->global->frame); + PERFORMANCE_MEASURE_END(KVZ_PERF_FRAME, state->encoder_control->threadqueue, "type=encode,frame=%d", state->frame->num); } //kvz_threadqueue_flush(main_state->encoder_control->threadqueue); { threadqueue_job_t *job; #ifdef KVZ_DEBUG char job_description[256]; - sprintf(job_description, "type=write_bitstream,frame=%d", state->global->frame); + sprintf(job_description, "type=write_bitstream,frame=%d", state->frame->num); #else char* job_description = NULL; #endif @@ -891,624 +967,71 @@ } -void 
kvz_encoder_next_frame(encoder_state_t *state) +/** + * Prepare the encoder state for encoding the next frame. + * + * - Add the previous reconstructed picture as a reference, if needed. + * - Free the previous reconstructed and source pictures. + * - Create a new cu array, if needed. + * - Update frame count and POC. + */ +void kvz_encoder_prepare(encoder_state_t *state) { const encoder_control_t * const encoder = state->encoder_control; // The previous frame must be done before the next one is started. assert(state->frame_done); - if (state->global->frame == -1) { - //We're at the first frame, so don't care about all this stuff; - state->global->frame = 0; - state->global->poc = 0; + if (state->frame->num == -1) { + // We're at the first frame, so don't care about all this stuff. + state->frame->num = 0; + state->frame->poc = 0; assert(!state->tile->frame->source); assert(!state->tile->frame->rec); - state->tile->frame->rec = kvz_image_alloc(state->tile->frame->width, state->tile->frame->height); - assert(state->tile->frame->rec); state->prepared = 1; return; } - - if (state->previous_encoder_state != state) { - encoder_state_t *prev_state = state->previous_encoder_state; - //We have a "real" previous encoder - state->global->frame = prev_state->global->frame + 1; - state->global->poc = prev_state->global->poc + 1; + // NOTE: prev_state is equal to state when OWF is zero + encoder_state_t *prev_state = state->previous_encoder_state; + if (state->previous_encoder_state != state) { kvz_cu_array_free(state->tile->frame->cu_array); - kvz_image_free(state->tile->frame->source); - state->tile->frame->source = NULL; - kvz_image_free(state->tile->frame->rec); - state->tile->frame->rec = kvz_image_alloc(state->tile->frame->width, state->tile->frame->height); - assert(state->tile->frame->rec); - { - // Allocate height_in_scu x width_in_scu x sizeof(CU_info) - unsigned height_in_scu = state->tile->frame->height_in_lcu << MAX_DEPTH; - unsigned width_in_scu = 
state->tile->frame->width_in_lcu << MAX_DEPTH; - state->tile->frame->cu_array = kvz_cu_array_alloc(width_in_scu, height_in_scu); - } - kvz_videoframe_set_poc(state->tile->frame, state->global->poc); - kvz_image_list_copy_contents(state->global->ref, prev_state->global->ref); - if (!encoder->cfg->gop_len || - !prev_state->global->poc || - encoder->cfg->gop[prev_state->global->gop_offset].is_ref) { - kvz_image_list_add(state->global->ref, - prev_state->tile->frame->rec, - prev_state->tile->frame->cu_array, - prev_state->global->poc); - } + state->tile->frame->cu_array = NULL; + unsigned width = state->tile->frame->width_in_lcu * LCU_WIDTH; + unsigned height = state->tile->frame->height_in_lcu * LCU_WIDTH; + state->tile->frame->cu_array = kvz_cu_array_alloc(width, height); - state->prepared = 1; - return; + kvz_image_list_copy_contents(state->frame->ref, prev_state->frame->ref); } - if (!encoder->cfg->gop_len || - !state->global->poc || - encoder->cfg->gop[state->global->gop_offset].is_ref) { - // Add current reconstructed picture as reference - kvz_image_list_add(state->global->ref, - state->tile->frame->rec, - state->tile->frame->cu_array, - state->global->poc); + !prev_state->frame->poc || + encoder->cfg->gop[prev_state->frame->gop_offset].is_ref) { + // Add previous reconstructed picture as a reference + kvz_image_list_add(state->frame->ref, + prev_state->tile->frame->rec, + prev_state->tile->frame->cu_array, + prev_state->frame->poc); + kvz_cu_array_free(state->tile->frame->cu_array); + unsigned height = state->tile->frame->height_in_lcu * LCU_WIDTH; + unsigned width = state->tile->frame->width_in_lcu * LCU_WIDTH; + state->tile->frame->cu_array = kvz_cu_array_alloc(width, height); } - - state->global->frame++; - state->global->poc++; - - // Remove current source picture. + // Remove source and reconstructed picture. 
kvz_image_free(state->tile->frame->source); state->tile->frame->source = NULL; - - // Remove current reconstructed picture, and alloc a new one kvz_image_free(state->tile->frame->rec); + state->tile->frame->rec = NULL; - state->tile->frame->rec = kvz_image_alloc(state->tile->frame->width, state->tile->frame->height); - assert(state->tile->frame->rec); - kvz_videoframe_set_poc(state->tile->frame, state->global->poc); - state->prepared = 1; -} - -static void encode_part_mode(encoder_state_t * const state, - cabac_data_t * const cabac, - const cu_info_t * const cur_cu, - int depth) -{ - // Binarization from Table 9-34 of the HEVC spec: - // - // | log2CbSize > | log2CbSize == - // | MinCbLog2SizeY | MinCbLog2SizeY - // -------+-------+----------+---------+-----------+---------- - // pred | part | AMP | AMP | | - // mode | mode | disabled | enabled | size == 8 | size > 8 - // -------+-------+----------+---------+-----------+---------- - // intra | 2Nx2N | - - | 1 1 - // | NxN | - - | 0 0 - // -------+-------+--------------------+---------------------- - // inter | 2Nx2N | 1 1 | 1 1 - // | 2NxN | 01 011 | 01 01 - // | Nx2N | 00 001 | 00 001 - // | NxN | - - | - 000 - // | 2NxnU | - 0100 | - - - // | 2NxnD | - 0101 | - - - // | nLx2N | - 0000 | - - - // | nRx2N | - 0001 | - - - // -------+-------+--------------------+---------------------- - // - // - // Context indices from Table 9-37 of the HEVC spec: - // - // binIdx - // | 0 1 2 3 - // ------------------------------+------------------ - // log2CbSize == MinCbLog2SizeY | 0 1 2 bypass - // log2CbSize > MinCbLog2SizeY | 0 1 3 bypass - // ------------------------------+------------------ - - if (cur_cu->type == CU_INTRA) { - if (depth == MAX_DEPTH) { - cabac->cur_ctx = &(cabac->ctx.part_size_model[0]); - if (cur_cu->part_size == SIZE_2Nx2N) { - CABAC_BIN(cabac, 1, "part_mode 2Nx2N"); - } else { - CABAC_BIN(cabac, 0, "part_mode NxN"); - } - } - } else { - - cabac->cur_ctx = &(cabac->ctx.part_size_model[0]); - if 
(cur_cu->part_size == SIZE_2Nx2N) { - CABAC_BIN(cabac, 1, "part_mode 2Nx2N"); - return; - } - CABAC_BIN(cabac, 0, "part_mode split"); - - cabac->cur_ctx = &(cabac->ctx.part_size_model[1]); - if (cur_cu->part_size == SIZE_2NxN || - cur_cu->part_size == SIZE_2NxnU || - cur_cu->part_size == SIZE_2NxnD) { - CABAC_BIN(cabac, 1, "part_mode vertical"); - } else { - CABAC_BIN(cabac, 0, "part_mode horizontal"); - } - - if (state->encoder_control->cfg->amp_enable) { - if (depth == MAX_DEPTH) { - cabac->cur_ctx = &(cabac->ctx.part_size_model[2]); - } else { - cabac->cur_ctx = &(cabac->ctx.part_size_model[3]); - } - - if (cur_cu->part_size == SIZE_2NxN || - cur_cu->part_size == SIZE_Nx2N) { - CABAC_BIN(cabac, 1, "part_mode SMP"); - return; - } - CABAC_BIN(cabac, 0, "part_mode AMP"); - - if (cur_cu->part_size == SIZE_2NxnU || - cur_cu->part_size == SIZE_nLx2N) { - CABAC_BINS_EP(cabac, 0, 1, "part_mode AMP"); - } else { - CABAC_BINS_EP(cabac, 1, 1, "part_mode AMP"); - } - } - } -} - -static void encode_inter_prediction_unit(encoder_state_t * const state, - cabac_data_t * const cabac, - const cu_info_t * const cur_cu, - int x_ctb, int y_ctb, int depth) -{ - // Mergeflag - int16_t num_cand = 0; - cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model); - CABAC_BIN(cabac, cur_cu->merged, "MergeFlag"); - num_cand = MRG_MAX_NUM_CANDS; - if (cur_cu->merged) { //merge - if (num_cand > 1) { - int32_t ui; - for (ui = 0; ui < num_cand - 1; ui++) { - int32_t symbol = (ui != cur_cu->merge_idx); - if (ui == 0) { - cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model); - CABAC_BIN(cabac, symbol, "MergeIndex"); - } else { - CABAC_BIN_EP(cabac,symbol,"MergeIndex"); - } - if (symbol == 0) break; - } - } - } else { - uint32_t ref_list_idx; - uint32_t j; - int ref_list[2] = { 0, 0 }; - for (j = 0; j < state->global->ref->used_size; j++) { - if (state->global->ref->pocs[j] < state->global->poc) { - ref_list[0]++; - } else { - ref_list[1]++; - } - } - - // Void TEncSbac::codeInterDir( TComDataCU* 
pcCU, UInt uiAbsPartIdx ) - if (state->global->slicetype == KVZ_SLICE_B) - { - // Code Inter Dir - uint8_t inter_dir = cur_cu->inter.mv_dir-1; - uint8_t ctx = depth; - - - if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 8) - { - cabac->cur_ctx = &(cabac->ctx.inter_dir[ctx]); - CABAC_BIN(cabac, (inter_dir == 2), "inter_pred_idc"); - } - if (inter_dir < 2) - { - cabac->cur_ctx = &(cabac->ctx.inter_dir[4]); - CABAC_BIN(cabac, inter_dir, "inter_pred_idc"); - } - } - - for (ref_list_idx = 0; ref_list_idx < 2; ref_list_idx++) { - if (cur_cu->inter.mv_dir & (1 << ref_list_idx)) { - if (ref_list[ref_list_idx] > 1) { - // parseRefFrmIdx - int32_t ref_frame = cur_cu->inter.mv_ref_coded[ref_list_idx]; - - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]); - CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX"); - - if (ref_frame > 0) { - int32_t i; - int32_t ref_num = ref_list[ref_list_idx] - 2; - - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]); - ref_frame--; - - for (i = 0; i < ref_num; ++i) { - const uint32_t symbol = (i == ref_frame) ? 0 : 1; - - if (i == 0) { - CABAC_BIN(cabac, symbol, "ref_idx_lX"); - } else { - CABAC_BIN_EP(cabac, symbol, "ref_idx_lX"); - } - if (symbol == 0) break; - } - } - } - - if (!(/*pcCU->getSlice()->getMvdL1ZeroFlag() &&*/ state->global->ref_list == REF_PIC_LIST_1 && cur_cu->inter.mv_dir == 3)) { - const int32_t mvd_hor = cur_cu->inter.mvd[ref_list_idx][0]; - const int32_t mvd_ver = cur_cu->inter.mvd[ref_list_idx][1]; - const int8_t hor_abs_gr0 = mvd_hor != 0; - const int8_t ver_abs_gr0 = mvd_ver != 0; - const uint32_t mvd_hor_abs = abs(mvd_hor); - const uint32_t mvd_ver_abs = abs(mvd_ver); - - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[0]); - CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor"); - CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver"); + // Update POC and frame count. 
+ state->frame->num = prev_state->frame->num + 1; + state->frame->poc = prev_state->frame->poc + 1; - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[1]); - - if (hor_abs_gr0) { - CABAC_BIN(cabac, (mvd_hor_abs>1), "abs_mvd_greater1_flag_hor"); - } - - if (ver_abs_gr0) { - CABAC_BIN(cabac, (mvd_ver_abs>1), "abs_mvd_greater1_flag_ver"); - } - - if (hor_abs_gr0) { - if (mvd_hor_abs > 1) { - kvz_cabac_write_ep_ex_golomb(cabac,mvd_hor_abs-2, 1); - } - - CABAC_BIN_EP(cabac, (mvd_hor>0)?0:1, "mvd_sign_flag_hor"); - } - - if (ver_abs_gr0) { - if (mvd_ver_abs > 1) { - kvz_cabac_write_ep_ex_golomb(cabac,mvd_ver_abs-2, 1); - } - - CABAC_BIN_EP(cabac, (mvd_ver>0)?0:1, "mvd_sign_flag_ver"); - } - } - - // Signal which candidate MV to use - kvz_cabac_write_unary_max_symbol(cabac, cabac->ctx.mvp_idx_model, cur_cu->inter.mv_cand[ref_list_idx], 1, - AMVP_MAX_NUM_CANDS - 1); - } - } // for ref_list - } // if !merge -} - -static void encode_intra_coding_unit(encoder_state_t * const state, - cabac_data_t * const cabac, - const cu_info_t * const cur_cu, - int x_ctb, int y_ctb, int depth) -{ - const videoframe_t * const frame = state->tile->frame; - uint8_t intra_pred_mode[4] = { - cur_cu->intra[0].mode, cur_cu->intra[1].mode, - cur_cu->intra[2].mode, cur_cu->intra[3].mode }; - uint8_t intra_pred_mode_chroma = cur_cu->intra[0].mode_chroma; - int8_t intra_preds[4][3] = {{-1, -1, -1},{-1, -1, -1},{-1, -1, -1},{-1, -1, -1}}; - int8_t mpm_preds[4] = {-1, -1, -1, -1}; - int i, j; - uint32_t flag[4]; - int num_pred_units = (cur_cu->part_size == SIZE_2Nx2N ? 1 : 4); - - #if ENABLE_PCM == 1 - // Code must start after variable initialization - kvz_cabac_encode_bin_trm(cabac, 0); // IPCMFlag == 0 - #endif - - // PREDINFO CODING - // If intra prediction mode is found from the predictors, - // it can be signaled with two EP's. 
Otherwise we can send - // 5 EP bins with the full predmode - for (j = 0; j < num_pred_units; ++j) { - static const vector2d_t offset[4] = {{0,0},{1,0},{0,1},{1,1}}; - const cu_info_t *left_cu = NULL; - const cu_info_t *above_cu = NULL; - - if (x_ctb > 0) { - left_cu = kvz_videoframe_get_cu_const(frame, x_ctb - 1, y_ctb); - } - // Don't take the above CU across the LCU boundary. - if (y_ctb > 0 && (y_ctb & 7) != 0) { - above_cu = kvz_videoframe_get_cu_const(frame, x_ctb, y_ctb - 1); - } - - kvz_intra_get_dir_luma_predictor((x_ctb<<3) + (offset[j].x<<2), - (y_ctb<<3) + (offset[j].y<<2), - intra_preds[j], cur_cu, - left_cu, above_cu); - for (i = 0; i < 3; i++) { - if (intra_preds[j][i] == intra_pred_mode[j]) { - mpm_preds[j] = (int8_t)i; - break; - } - } - flag[j] = (mpm_preds[j] == -1) ? 0 : 1; - } - - cabac->cur_ctx = &(cabac->ctx.intra_mode_model); - for (j = 0; j < num_pred_units; ++j) { - CABAC_BIN(cabac, flag[j], "prev_intra_luma_pred_flag"); - } - - for (j = 0; j < num_pred_units; ++j) { - // Signal index of the prediction mode in the prediction list. - if (flag[j]) { - CABAC_BIN_EP(cabac, (mpm_preds[j] == 0 ? 0 : 1), "mpm_idx"); - if (mpm_preds[j] != 0) { - CABAC_BIN_EP(cabac, (mpm_preds[j] == 1 ? 0 : 1), "mpm_idx"); - } - } else { - // Signal the actual prediction mode. - int32_t tmp_pred = intra_pred_mode[j]; - - // Sort prediction list from lowest to highest. - if (intra_preds[j][0] > intra_preds[j][1]) SWAP(intra_preds[j][0], intra_preds[j][1], int8_t); - if (intra_preds[j][0] > intra_preds[j][2]) SWAP(intra_preds[j][0], intra_preds[j][2], int8_t); - if (intra_preds[j][1] > intra_preds[j][2]) SWAP(intra_preds[j][1], intra_preds[j][2], int8_t); - - // Reduce the index of the signaled prediction mode according to the - // prediction list, as it has been already signaled that it's not one - // of the prediction modes. - for (i = 2; i >= 0; i--) { - tmp_pred = (tmp_pred > intra_preds[j][i] ? 
tmp_pred - 1 : tmp_pred); - } - - CABAC_BINS_EP(cabac, tmp_pred, 5, "rem_intra_luma_pred_mode"); - } - } - - { // start intra chroma pred mode coding - unsigned pred_mode = 5; - unsigned chroma_pred_modes[4] = {0, 26, 10, 1}; - - if (intra_pred_mode_chroma == intra_pred_mode[0]) { - pred_mode = 4; - } else if (intra_pred_mode_chroma == 34) { - // Angular 34 mode is possible only if intra pred mode is one of the - // possible chroma pred modes, in which case it is signaled with that - // duplicate mode. - for (i = 0; i < 4; ++i) { - if (intra_pred_mode[0] == chroma_pred_modes[i]) pred_mode = i; - } - } else { - for (i = 0; i < 4; ++i) { - if (intra_pred_mode_chroma == chroma_pred_modes[i]) pred_mode = i; - } - } - - // pred_mode == 5 mean intra_pred_mode_chroma is something that can't - // be coded. - assert(pred_mode != 5); - - /** - * Table 9-35 - Binarization for intra_chroma_pred_mode - * intra_chroma_pred_mode bin_string - * 4 0 - * 0 100 - * 1 101 - * 2 110 - * 3 111 - * Table 9-37 - Assignment of ctxInc to syntax elements with context coded bins - * intra_chroma_pred_mode[][] = 0, bypass, bypass - */ - cabac->cur_ctx = &(cabac->ctx.chroma_pred_model[0]); - if (pred_mode == 4) { - CABAC_BIN(cabac, 0, "intra_chroma_pred_mode"); - } else { - CABAC_BIN(cabac, 1, "intra_chroma_pred_mode"); - CABAC_BINS_EP(cabac, pred_mode, 2, "intra_chroma_pred_mode"); - } - } // end intra chroma pred mode coding - - kvz_encode_transform_coeff(state, x_ctb * 2, y_ctb * 2, depth, 0, 0, 0); -} - -void kvz_encode_coding_tree(encoder_state_t * const state, - uint16_t x_ctb, uint16_t y_ctb, uint8_t depth) -{ - cabac_data_t * const cabac = &state->cabac; - const videoframe_t * const frame = state->tile->frame; - const cu_info_t *cur_cu = kvz_videoframe_get_cu_const(frame, x_ctb, y_ctb); - uint8_t split_flag = GET_SPLITDATA(cur_cu, depth); - uint8_t split_model = 0; - - //Absolute ctb - uint16_t abs_x_ctb = x_ctb + (state->tile->lcu_offset_x * LCU_WIDTH) / (LCU_WIDTH >> MAX_DEPTH); - 
uint16_t abs_y_ctb = y_ctb + (state->tile->lcu_offset_y * LCU_WIDTH) / (LCU_WIDTH >> MAX_DEPTH); - - // Check for slice border FIXME - uint8_t border_x = ((state->encoder_control->in.width) < (abs_x_ctb * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> depth))) ? 1 : 0; - uint8_t border_y = ((state->encoder_control->in.height) < (abs_y_ctb * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> depth))) ? 1 : 0; - uint8_t border_split_x = ((state->encoder_control->in.width) < ((abs_x_ctb + 1) * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1)))) ? 0 : 1; - uint8_t border_split_y = ((state->encoder_control->in.height) < ((abs_y_ctb + 1) * (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1)))) ? 0 : 1; - uint8_t border = border_x | border_y; /*!< are we in any border CU */ - - // When not in MAX_DEPTH, insert split flag and split the blocks if needed - if (depth != MAX_DEPTH) { - // Implisit split flag when on border - if (!border) { - // Get left and top block split_flags and if they are present and true, increase model number - if (x_ctb > 0 && GET_SPLITDATA(kvz_videoframe_get_cu_const(frame, x_ctb - 1, y_ctb), depth) == 1) { - split_model++; - } - - if (y_ctb > 0 && GET_SPLITDATA(kvz_videoframe_get_cu_const(frame, x_ctb, y_ctb - 1), depth) == 1) { - split_model++; - } - - cabac->cur_ctx = &(cabac->ctx.split_flag_model[split_model]); - CABAC_BIN(cabac, split_flag, "SplitFlag"); - } - - if (split_flag || border) { - // Split blocks and remember to change x and y block positions - uint8_t change = 1<<(MAX_DEPTH-1-depth); - kvz_encode_coding_tree(state, x_ctb, y_ctb, depth + 1); // x,y - - // TODO: fix when other half of the block would not be completely over the border - if (!border_x || border_split_x) { - kvz_encode_coding_tree(state, x_ctb + change, y_ctb, depth + 1); - } - if (!border_y || border_split_y) { - kvz_encode_coding_tree(state, x_ctb, y_ctb + change, depth + 1); - } - if (!border || (border_split_x && border_split_y)) { - kvz_encode_coding_tree(state, x_ctb + change, 
y_ctb + change, depth + 1); - } - return; - } - } - - - - // Encode skip flag - if (state->global->slicetype != KVZ_SLICE_I) { - int8_t ctx_skip = 0; // uiCtxSkip = aboveskipped + leftskipped; - int ui; - int16_t num_cand = MRG_MAX_NUM_CANDS; - // Get left and top skipped flags and if they are present and true, increase context number - if (x_ctb > 0 && (kvz_videoframe_get_cu_const(frame, x_ctb - 1, y_ctb))->skipped) { - ctx_skip++; - } - - if (y_ctb > 0 && (kvz_videoframe_get_cu_const(frame, x_ctb, y_ctb - 1))->skipped) { - ctx_skip++; - } - - cabac->cur_ctx = &(cabac->ctx.cu_skip_flag_model[ctx_skip]); - CABAC_BIN(cabac, cur_cu->skipped, "SkipFlag"); - - // IF SKIP - if (cur_cu->skipped) { - if (num_cand > 1) { - for (ui = 0; ui < num_cand - 1; ui++) { - int32_t symbol = (ui != cur_cu->merge_idx); - if (ui == 0) { - cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model); - CABAC_BIN(cabac, symbol, "MergeIndex"); - } else { - CABAC_BIN_EP(cabac,symbol,"MergeIndex"); - } - if (symbol == 0) { - break; - } - } - } - return; - } - } - - // ENDIF SKIP - - // Prediction mode - if (state->global->slicetype != KVZ_SLICE_I) { - cabac->cur_ctx = &(cabac->ctx.cu_pred_mode_model); - CABAC_BIN(cabac, (cur_cu->type == CU_INTRA), "PredMode"); - } - - // part_mode - encode_part_mode(state, cabac, cur_cu, depth); - - if (cur_cu->type == CU_INTER) { - const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size]; - const int cu_width_scu = LCU_CU_WIDTH >> depth; - - for (int i = 0; i < num_pu; ++i) { - const int pu_x_scu = PU_GET_X(cur_cu->part_size, cu_width_scu, x_ctb, i); - const int pu_y_scu = PU_GET_Y(cur_cu->part_size, cu_width_scu, y_ctb, i); - const cu_info_t *cur_pu = kvz_videoframe_get_cu_const(frame, pu_x_scu, pu_y_scu); - - encode_inter_prediction_unit(state, cabac, cur_pu, pu_x_scu, pu_y_scu, depth); - } - - { - int cbf = (cbf_is_set(cur_cu->cbf.y, depth) || - cbf_is_set(cur_cu->cbf.u, depth) || - cbf_is_set(cur_cu->cbf.v, depth)); - - // Only need to signal coded block 
flag if not skipped or merged - // skip = no coded residual, merge = coded residual - if (cur_cu->part_size != SIZE_2Nx2N || !cur_cu->merged) { - cabac->cur_ctx = &(cabac->ctx.cu_qt_root_cbf_model); - CABAC_BIN(cabac, cbf, "rqt_root_cbf"); - } - // Code (possible) coeffs to bitstream - - if (cbf) { - kvz_encode_transform_coeff(state, x_ctb * 2, y_ctb * 2, depth, 0, 0, 0); - } - } - } else if (cur_cu->type == CU_INTRA) { - encode_intra_coding_unit(state, cabac, cur_cu, x_ctb, y_ctb, depth); - } - - #if ENABLE_PCM == 1 - // Code IPCM block - if (cur_cu->type == CU_PCM) { - kvz_cabac_encode_bin_trm(cabac, 1); // IPCMFlag == 1 - kvz_cabac_finish(cabac); - kvz_bitstream_add_rbsp_trailing_bits(cabac.stream); - // PCM sample - { - unsigned y, x; - - pixel *base_y = &cur_pic->y_data[x_ctb * (LCU_WIDTH >> (MAX_DEPTH)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH))) * encoder->in.width]; - pixel *base_u = &cur_pic->u_data[(x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * encoder->in.width / 2)]; - pixel *base_v = &cur_pic->v_data[(x_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1)) + (y_ctb * (LCU_WIDTH >> (MAX_DEPTH + 1))) * encoder->in.width / 2)]; - - // Luma - for (y = 0; y < LCU_WIDTH >> depth; y++) { - for (x = 0; x < LCU_WIDTH >> depth; x++) { - kvz_bitstream_put(cabac.stream, base_y[x + y * encoder->in.width], 8); - } - } - - // Chroma - if (encoder->in.video_format != FORMAT_400) { - for (y = 0; y < LCU_WIDTH >> (depth + 1); y++) { - for (x = 0; x < LCU_WIDTH >> (depth + 1); x++) { - kvz_bitstream_put(cabac.stream, base_u[x + y * (encoder->in.width >> 1)], 8); - } - } - for (y = 0; y < LCU_WIDTH >> (depth + 1); y++) { - for (x = 0; x < LCU_WIDTH >> (depth + 1); x++) { - kvz_bitstream_put(cabac.stream, base_v[x + y * (encoder->in.width >> 1)], 8); - } - } - } - } - // end PCM sample - kvz_cabac_start(cabac); - } // end Code IPCM block -#endif /* END ENABLE_PCM */ - else { /* Should not happend */ - printf("UNHANDLED TYPE!\r\n"); - assert(0); - exit(1); - 
} - - /* end prediction unit */ - /* end coding_unit */ + state->prepared = 1; } - coeff_scan_order_t kvz_get_scan_order(int8_t cu_type, int intra_mode, int depth) { // Scan mode is diagonal, except for 4x4+8x8 luma and 4x4 chroma, where: @@ -1524,468 +1047,3 @@ return SCAN_DIAG; } - - -static void encode_transform_unit(encoder_state_t * const state, - int x_pu, int y_pu, int depth) -{ - assert(depth >= 1 && depth <= MAX_PU_DEPTH); - - const videoframe_t * const frame = state->tile->frame; - uint8_t width = LCU_WIDTH >> depth; - uint8_t width_c = (depth == MAX_PU_DEPTH ? width : width / 2); - - int x_cu = x_pu / 2; - int y_cu = y_pu / 2; - const cu_info_t *cur_cu = kvz_videoframe_get_cu_const(frame, x_cu, y_cu); - - coeff_t coeff_y[LCU_WIDTH*LCU_WIDTH+1]; - coeff_t coeff_u[LCU_WIDTH*LCU_WIDTH>>2]; - coeff_t coeff_v[LCU_WIDTH*LCU_WIDTH>>2]; - int32_t coeff_stride = frame->width; - - int8_t scan_idx = kvz_get_scan_order(cur_cu->type, cur_cu->intra[PU_INDEX(x_pu, y_pu)].mode, depth); - - int cbf_y = cbf_is_set(cur_cu->cbf.y, depth + PU_INDEX(x_pu, y_pu)); - - if (cbf_y) { - int x = x_pu * (LCU_WIDTH >> MAX_PU_DEPTH); - int y = y_pu * (LCU_WIDTH >> MAX_PU_DEPTH); - coeff_t *orig_pos = &frame->coeff_y[x + y * frame->width]; - for (y = 0; y < width; y++) { - for (x = 0; x < width; x++) { - coeff_y[x+y*width] = orig_pos[x]; - } - orig_pos += coeff_stride; - } - } - - // CoeffNxN - // Residual Coding - if (cbf_y) { - kvz_encode_coeff_nxn(state, coeff_y, width, 0, scan_idx, cur_cu->intra[PU_INDEX(x_pu, y_pu)].tr_skip); - } - - if (depth == MAX_DEPTH + 1 && !(x_pu % 2 && y_pu % 2)) { - // For size 4x4 luma transform the corresponding chroma transforms are - // also of size 4x4 covering 8x8 luma pixels. The residual is coded - // in the last transform unit so for the other ones, don't do anything. 
- return; - } - - if (cbf_is_set(cur_cu->cbf.u, depth) || cbf_is_set(cur_cu->cbf.v, depth)) { - int x, y; - coeff_t *orig_pos_u, *orig_pos_v; - - if (depth <= MAX_DEPTH) { - x = x_pu * (LCU_WIDTH >> (MAX_PU_DEPTH + 1)); - y = y_pu * (LCU_WIDTH >> (MAX_PU_DEPTH + 1)); - } else { - // for 4x4 select top left pixel of the CU. - x = x_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)); - y = y_cu * (LCU_WIDTH >> (MAX_DEPTH + 1)); - } - orig_pos_u = &frame->coeff_u[x + y * (frame->width >> 1)]; - orig_pos_v = &frame->coeff_v[x + y * (frame->width >> 1)]; - for (y = 0; y < (width_c); y++) { - for (x = 0; x < (width_c); x++) { - coeff_u[x+y*(width_c)] = orig_pos_u[x]; - coeff_v[x+y*(width_c)] = orig_pos_v[x]; - } - orig_pos_u += coeff_stride>>1; - orig_pos_v += coeff_stride>>1; - } - - scan_idx = kvz_get_scan_order(cur_cu->type, cur_cu->intra[0].mode_chroma, depth); - - if (cbf_is_set(cur_cu->cbf.u, depth)) { - kvz_encode_coeff_nxn(state, coeff_u, width_c, 2, scan_idx, 0); - } - - if (cbf_is_set(cur_cu->cbf.v, depth)) { - kvz_encode_coeff_nxn(state, coeff_v, width_c, 2, scan_idx, 0); - } - } -} - -/** - * \param encoder - * \param x_pu Prediction units' x coordinate. - * \param y_pu Prediction units' y coordinate. - * \param depth Depth from LCU. - * \param tr_depth Depth from last CU. - * \param parent_coeff_u What was signaled at previous level for cbf_cb. - * \param parent_coeff_v What was signlaed at previous level for cbf_cr. - */ -void kvz_encode_transform_coeff(encoder_state_t * const state, int32_t x_pu,int32_t y_pu, - int8_t depth, int8_t tr_depth, uint8_t parent_coeff_u, uint8_t parent_coeff_v) -{ - cabac_data_t * const cabac = &state->cabac; - int32_t x_cu = x_pu / 2; - int32_t y_cu = y_pu / 2; - const videoframe_t * const frame = state->tile->frame; - const cu_info_t *cur_cu = kvz_videoframe_get_cu_const(frame, x_cu, y_cu); - - // NxN signifies implicit transform split at the first transform level. 
- // There is a similar implicit split for inter, but it is only used when - // transform hierarchy is not in use. - int intra_split_flag = (cur_cu->type == CU_INTRA && cur_cu->part_size == SIZE_NxN); - - // The implicit split by intra NxN is not counted towards max_tr_depth. - int tr_depth_intra = state->encoder_control->tr_depth_intra; - int max_tr_depth = (cur_cu->type == CU_INTRA ? tr_depth_intra + intra_split_flag : TR_DEPTH_INTER); - - int8_t split = (cur_cu->tr_depth > depth); - - const int cb_flag_y = cbf_is_set(cur_cu->cbf.y, depth + PU_INDEX(x_pu, y_pu)); - const int cb_flag_u = cbf_is_set(cur_cu->cbf.u, depth); - const int cb_flag_v = cbf_is_set(cur_cu->cbf.v, depth); - - // The split_transform_flag is not signaled when: - // - transform size is greater than 32 (depth == 0) - // - transform size is 4 (depth == MAX_PU_DEPTH) - // - transform depth is max - // - cu is intra NxN and it's the first split - if (depth > 0 && - depth < MAX_PU_DEPTH && - tr_depth < max_tr_depth && - !(intra_split_flag && tr_depth == 0)) - { - cabac->cur_ctx = &(cabac->ctx.trans_subdiv_model[5 - ((kvz_g_convert_to_bit[LCU_WIDTH] + 2) - depth)]); - CABAC_BIN(cabac, split, "split_transform_flag"); - } - - // Chroma cb flags are not signaled when one of the following: - // - transform size is 4 (2x2 chroma transform doesn't exist) - // - they have already been signaled to 0 previously - // When they are not present they are inferred to be 0, except for size 4 - // when the flags from previous level are used. 
- if (depth < MAX_PU_DEPTH) { - cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_chroma[tr_depth]); - if (tr_depth == 0 || parent_coeff_u) { - CABAC_BIN(cabac, cb_flag_u, "cbf_cb"); - } - if (tr_depth == 0 || parent_coeff_v) { - CABAC_BIN(cabac, cb_flag_v, "cbf_cr"); - } - } - - if (split) { - uint8_t pu_offset = 1 << (MAX_PU_DEPTH - (depth + 1)); - kvz_encode_transform_coeff(state, x_pu, y_pu, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v); - kvz_encode_transform_coeff(state, x_pu + pu_offset, y_pu, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v); - kvz_encode_transform_coeff(state, x_pu, y_pu + pu_offset, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v); - kvz_encode_transform_coeff(state, x_pu + pu_offset, y_pu + pu_offset, depth + 1, tr_depth + 1, cb_flag_u, cb_flag_v); - return; - } - - // Luma coded block flag is signaled when one of the following: - // - prediction mode is intra - // - transform depth > 0 - // - we have chroma coefficients at this level - // When it is not present, it is inferred to be 1. 
- if(cur_cu->type == CU_INTRA || tr_depth > 0 || cb_flag_u || cb_flag_v) { - cabac->cur_ctx = &(cabac->ctx.qt_cbf_model_luma[!tr_depth]); - CABAC_BIN(cabac, cb_flag_y, "cbf_luma"); - } - - if (cb_flag_y | cb_flag_u | cb_flag_v) { - encode_transform_unit(state, x_pu, y_pu, depth); - } -} - -void kvz_encode_coeff_nxn(encoder_state_t * const state, coeff_t *coeff, uint8_t width, - uint8_t type, int8_t scan_mode, int8_t tr_skip) -{ - const encoder_control_t * const encoder = state->encoder_control; - cabac_data_t * const cabac = &state->cabac; - int c1 = 1; - uint8_t last_coeff_x = 0; - uint8_t last_coeff_y = 0; - int32_t i; - uint32_t sig_coeffgroup_flag[8 * 8] = { 0 }; - - int8_t be_valid = encoder->sign_hiding; - int32_t scan_pos_sig; - uint32_t go_rice_param = 0; - uint32_t blk_pos, pos_y, pos_x, sig, ctx_sig; - - // CONSTANTS - const uint32_t num_blk_side = width >> TR_MIN_LOG2_SIZE; - const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2; - const uint32_t *scan = - kvz_g_sig_last_scan[scan_mode][log2_block_size - 1]; - const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 2][scan_mode]; - - // Init base contexts according to block type - cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.cu_sig_coeff_group_model[type]); - cabac_ctx_t *baseCtx = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0]) : - &(cabac->ctx.cu_sig_model_chroma[0]); - - // Scan all coeff groups to find out which of them have coeffs. - // Populate sig_coeffgroup_flag with that info. - unsigned sig_cg_cnt = 0; - for (int cg_y = 0; cg_y < width / 4; ++cg_y) { - for (int cg_x = 0; cg_x < width / 4; ++cg_x) { - unsigned cg_pos = cg_y * width * 4 + cg_x * 4; - for (int coeff_row = 0; coeff_row < 4; ++coeff_row) { - // Load four 16-bit coeffs and see if any of them are non-zero. 
- unsigned coeff_pos = cg_pos + coeff_row * width; - uint64_t four_coeffs = *(uint64_t*)(&coeff[coeff_pos]); - if (four_coeffs) { - ++sig_cg_cnt; - unsigned cg_pos_y = (cg_pos >> log2_block_size) >> TR_MIN_LOG2_SIZE; - unsigned cg_pos_x = (cg_pos & (width - 1)) >> TR_MIN_LOG2_SIZE; - sig_coeffgroup_flag[cg_pos_x + cg_pos_y * num_blk_side] = 1; - break; - } - } - } - } - - // Rest of the code assumes at least one non-zero coeff. - assert(sig_cg_cnt > 0); - - // Find the last coeff group by going backwards in scan order. - unsigned scan_cg_last = num_blk_side * num_blk_side - 1; - while (!sig_coeffgroup_flag[scan_cg[scan_cg_last]]) { - --scan_cg_last; - } - - // Find the last coeff by going backwards in scan order. - unsigned scan_pos_last = scan_cg_last * 16 + 15; - while (!coeff[scan[scan_pos_last]]) { - --scan_pos_last; - } - - int pos_last = scan[scan_pos_last]; - - // transform skip flag - if(width == 4 && encoder->trskip_enable) { - cabac->cur_ctx = (type == 0) ? &(cabac->ctx.transform_skip_model_luma) : &(cabac->ctx.transform_skip_model_chroma); - CABAC_BIN(cabac, tr_skip, "transform_skip_flag"); - } - - last_coeff_x = pos_last & (width - 1); - last_coeff_y = (uint8_t)(pos_last >> log2_block_size); - - // Code last_coeff_x and last_coeff_y - kvz_encode_last_significant_xy(state, last_coeff_x, last_coeff_y, width, width, - type, scan_mode); - - scan_pos_sig = scan_pos_last; - - // significant_coeff_flag - for (i = scan_cg_last; i >= 0; i--) { - int32_t sub_pos = i << 4; // LOG2_SCAN_SET_SIZE; - int32_t abs_coeff[16]; - int32_t cg_blk_pos = scan_cg[i]; - int32_t cg_pos_y = cg_blk_pos / num_blk_side; - int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * num_blk_side); - - uint32_t coeff_signs = 0; - int32_t last_nz_pos_in_cg = -1; - int32_t first_nz_pos_in_cg = 16; - int32_t num_non_zero = 0; - go_rice_param = 0; - - if (scan_pos_sig == scan_pos_last) { - abs_coeff[0] = abs(coeff[pos_last]); - coeff_signs = (coeff[pos_last] < 0); - num_non_zero = 1; - last_nz_pos_in_cg = 
scan_pos_sig; - first_nz_pos_in_cg = scan_pos_sig; - scan_pos_sig--; - } - - if (i == scan_cg_last || i == 0) { - sig_coeffgroup_flag[cg_blk_pos] = 1; - } else { - uint32_t sig_coeff_group = (sig_coeffgroup_flag[cg_blk_pos] != 0); - uint32_t ctx_sig = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, - cg_pos_y, width); - cabac->cur_ctx = &base_coeff_group_ctx[ctx_sig]; - CABAC_BIN(cabac, sig_coeff_group, "coded_sub_block_flag"); - } - - if (sig_coeffgroup_flag[cg_blk_pos]) { - int32_t pattern_sig_ctx = kvz_context_calc_pattern_sig_ctx(sig_coeffgroup_flag, - cg_pos_x, cg_pos_y, width); - - for (; scan_pos_sig >= sub_pos; scan_pos_sig--) { - blk_pos = scan[scan_pos_sig]; - pos_y = blk_pos >> log2_block_size; - pos_x = blk_pos - (pos_y << log2_block_size); - sig = (coeff[blk_pos] != 0) ? 1 : 0; - - if (scan_pos_sig > sub_pos || i == 0 || num_non_zero) { - ctx_sig = kvz_context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y, - log2_block_size, type); - cabac->cur_ctx = &baseCtx[ctx_sig]; - CABAC_BIN(cabac, sig, "sig_coeff_flag"); - } - - if (sig) { - abs_coeff[num_non_zero] = abs(coeff[blk_pos]); - coeff_signs = 2 * coeff_signs + (coeff[blk_pos] < 0); - num_non_zero++; - - if (last_nz_pos_in_cg == -1) { - last_nz_pos_in_cg = scan_pos_sig; - } - - first_nz_pos_in_cg = scan_pos_sig; - } - } - } else { - scan_pos_sig = sub_pos - 1; - } - - if (num_non_zero > 0) { - int8_t sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= - 4 /*SBH_THRESHOLD*/) ? 1 : 0; - uint32_t ctx_set = (i > 0 && type == 0) ? 2 : 0; - cabac_ctx_t *base_ctx_mod; - int32_t num_c1_flag, first_c2_flag_idx, idx, first_coeff2; - - if (c1 == 0) { - ctx_set++; - } - - c1 = 1; - - base_ctx_mod = (type == 0) ? &(cabac->ctx.cu_one_model_luma[4 * ctx_set]) : - &(cabac->ctx.cu_one_model_chroma[4 * ctx_set]); - num_c1_flag = MIN(num_non_zero, C1FLAG_NUMBER); - first_c2_flag_idx = -1; - - for (idx = 0; idx < num_c1_flag; idx++) { - uint32_t symbol = (abs_coeff[idx] > 1) ? 
1 : 0; - cabac->cur_ctx = &base_ctx_mod[c1]; - CABAC_BIN(cabac, symbol, "coeff_abs_level_greater1_flag"); - - if (symbol) { - c1 = 0; - - if (first_c2_flag_idx == -1) { - first_c2_flag_idx = idx; - } - } else if ((c1 < 3) && (c1 > 0)) { - c1++; - } - } - - if (c1 == 0) { - base_ctx_mod = (type == 0) ? &(cabac->ctx.cu_abs_model_luma[ctx_set]) : - &(cabac->ctx.cu_abs_model_chroma[ctx_set]); - - if (first_c2_flag_idx != -1) { - uint8_t symbol = (abs_coeff[first_c2_flag_idx] > 2) ? 1 : 0; - cabac->cur_ctx = &base_ctx_mod[0]; - CABAC_BIN(cabac, symbol, "coeff_abs_level_greater2_flag"); - } - } - - if (be_valid && sign_hidden) { - CABAC_BINS_EP(cabac, (coeff_signs >> 1), (num_non_zero - 1), "coeff_sign_flag"); - } else { - CABAC_BINS_EP(cabac, coeff_signs, num_non_zero, "coeff_sign_flag"); - } - - if (c1 == 0 || num_non_zero > C1FLAG_NUMBER) { - first_coeff2 = 1; - - for (idx = 0; idx < num_non_zero; idx++) { - int32_t base_level = (idx < C1FLAG_NUMBER) ? (2 + first_coeff2) : 1; - - if (abs_coeff[idx] >= base_level) { - kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param); - - if (abs_coeff[idx] > 3 * (1 << go_rice_param)) { - go_rice_param = MIN(go_rice_param + 1, 4); - } - } - - if (abs_coeff[idx] >= 2) { - first_coeff2 = 0; - } - } - } - } - } -} - -/*! - \brief Encode (X,Y) position of the last significant coefficient - \param lastpos_x X component of last coefficient - \param lastpos_y Y component of last coefficient - \param width Block width - \param height Block height - \param type plane type / luminance or chrominance - \param scan scan type (diag, hor, ver) - - This method encodes the X and Y component within a block of the last significant coefficient. 
-*/ -void kvz_encode_last_significant_xy(encoder_state_t * const state, - uint8_t lastpos_x, uint8_t lastpos_y, - uint8_t width, uint8_t height, - uint8_t type, uint8_t scan) -{ - cabac_data_t * const cabac = &state->cabac; - uint8_t offset_x = type?0:((TOBITS(width)*3) + ((TOBITS(width)+1)>>2)),offset_y = offset_x; - uint8_t shift_x = type?(TOBITS(width)):((TOBITS(width)+3)>>2), shift_y = shift_x; - int group_idx_x; - int group_idx_y; - int last_x,last_y,i; - cabac_ctx_t *base_ctx_x = (type ? cabac->ctx.cu_ctx_last_x_chroma : cabac->ctx.cu_ctx_last_x_luma); - cabac_ctx_t *base_ctx_y = (type ? cabac->ctx.cu_ctx_last_y_chroma : cabac->ctx.cu_ctx_last_y_luma); - - if (scan == SCAN_VER) { - SWAP( lastpos_x, lastpos_y,uint8_t ); - } - - group_idx_x = g_group_idx[lastpos_x]; - group_idx_y = g_group_idx[lastpos_y]; - - // Last X binarization - for (last_x = 0; last_x < group_idx_x ; last_x++) { - cabac->cur_ctx = &base_ctx_x[offset_x + (last_x >> shift_x)]; - CABAC_BIN(cabac,1,"last_sig_coeff_x_prefix"); - } - - if (group_idx_x < g_group_idx[width - 1]) { - cabac->cur_ctx = &base_ctx_x[offset_x + (last_x >> shift_x)]; - CABAC_BIN(cabac,0,"last_sig_coeff_x_prefix"); - } - - // Last Y binarization - for (last_y = 0; last_y < group_idx_y ; last_y++) { - cabac->cur_ctx = &base_ctx_y[offset_y + (last_y >> shift_y)]; - CABAC_BIN(cabac,1,"last_sig_coeff_y_prefix"); - } - - if (group_idx_y < g_group_idx[height - 1]) { - cabac->cur_ctx = &base_ctx_y[offset_y + (last_y >> shift_y)]; - CABAC_BIN(cabac,0,"last_sig_coeff_y_prefix"); - } - - // Last X - if (group_idx_x > 3) { - lastpos_x -= g_min_in_group[group_idx_x]; - - for (i = ((group_idx_x - 2) >> 1) - 1; i >= 0; i--) { - CABAC_BIN_EP(cabac,(lastpos_x>>i) & 1,"last_sig_coeff_x_suffix"); - } - } - - // Last Y - if (group_idx_y > 3) { - lastpos_y -= g_min_in_group[group_idx_y]; - - for (i = ((group_idx_y - 2) >> 1) - 1; i >= 0; i--) { - CABAC_BIN_EP(cabac,(lastpos_y>>i) & 1,"last_sig_coeff_y_suffix"); - } - } - - // end 
LastSignificantXY -}
View file
kvazaar-0.8.3.tar.gz/src/encoderstate.h -> kvazaar-1.0.0.tar.gz/src/encoderstate.h
Changed
@@ -26,25 +26,18 @@ * Top level of the encoder implementation. */ -#include "global.h" - -#include "videoframe.h" -#include "encoder.h" -#include "image.h" #include "bitstream.h" #include "cabac.h" +#include "cu.h" +#include "encoder.h" +#include "global.h" // IWYU pragma: keep +#include "image.h" +#include "imagelist.h" +#include "kvazaar.h" #include "tables.h" -#include "scalinglist.h" #include "threadqueue.h" -#include "imagelist.h" - -// Submodules -// Functions to obtain geometry information from LCU -#include "encoder_state-geometry.h" -// Constructors/destructors -#include "encoder_state-ctors_dtors.h" -// Functions writing bitstream parts -#include "encoder_state-bitstream.h" +#include "videoframe.h" +#include "extras/crypto.h" typedef enum { @@ -57,13 +50,13 @@ -typedef struct { +typedef struct encoder_state_config_frame_t { double cur_lambda_cost; //!< \brief Lambda for SSE double cur_lambda_cost_sqrt; //!< \brief Lambda for SAD and SATD - int32_t frame; - int32_t poc; /*!< \brief picture order count */ - int8_t gop_offset; /*!< \brief offset in the gop structure */ + int32_t num; /*!< \brief Frame number */ + int32_t poc; /*!< \brief Picture order count */ + int8_t gop_offset; /*!< \brief Offset in the gop structure */ int8_t QP; //!< \brief Quantization parameter double QP_factor; //!< \brief Quantization factor @@ -95,9 +88,9 @@ double rc_alpha; double rc_beta; -} encoder_state_config_global_t; +} encoder_state_config_frame_t; -typedef struct { +typedef struct encoder_state_config_tile_t { //Current sub-frame videoframe_t *frame; @@ -110,20 +103,29 @@ //Position of the first element in tile scan in global coordinates int32_t lcu_offset_in_ts; - //Buffer for search - //order by row of (LCU_WIDTH * cur_pic->width_in_lcu) pixels + // This is a buffer for the non-loopfiltered bottom pixels of every LCU-row + // in the tile. They are packed such that each LCU-row index maps to the + // y-coordinate. 
yuv_t *hor_buf_search; - //order by column of (LCU_WIDTH * encoder_state->height_in_lcu) pixels (there is no more extra pixel, since we can use a negative index) + // This is a buffer for the non-loopfiltered rightmost pixels of every + // LCU-column. They are packed such that each LCU-column index maps to the + // x-coordinate. yuv_t *ver_buf_search; + // This is a buffer for the deblocked bottom pixels of every LCU-row in the + // tile. They are packed such that each LCU-row index maps to the y-coordinate. yuv_t *hor_buf_before_sao; - yuv_t *ver_buf_before_sao; //Jobs for each individual LCU of a wavefront row. threadqueue_job_t **wf_jobs; + + // Instance of encryption generator by tile + Crypto_Handle dbs_g; + uint32_t m_prev_pos; + } encoder_state_config_tile_t; -typedef struct { +typedef struct encoder_state_config_slice_t { int32_t id; //Global coordinates @@ -135,7 +137,7 @@ int32_t end_in_rs; } encoder_state_config_slice_t; -typedef struct { +typedef struct encoder_state_config_wfrow_t { //Row in tile coordinates of the wavefront int32_t lcu_offset_y; } encoder_state_config_wfrow_t; @@ -171,7 +173,7 @@ //Pointer to the encoder_state of the previous frame struct encoder_state_t *previous_encoder_state; - encoder_state_config_global_t *global; + encoder_state_config_frame_t *frame; encoder_state_config_tile_t *tile; encoder_state_config_slice_t *slice; encoder_state_config_wfrow_t *wfrow; @@ -185,7 +187,7 @@ /** * \brief Indicates that this encoder state is ready for encoding the - * next frame i.e. kvz_encoder_next_frame has been called. + * next frame i.e. kvz_encoder_prepare has been called. 
*/ int prepared; @@ -203,24 +205,10 @@ threadqueue_job_t * tqj_bitstream_written; //Bitstream is written } encoder_state_t; -void kvz_encode_one_frame(encoder_state_t *state); - -void kvz_encoder_next_frame(encoder_state_t *state); - +void kvz_encode_one_frame(encoder_state_t * const state, kvz_picture* frame); -void kvz_encode_coding_tree(encoder_state_t *state, uint16_t x_ctb, - uint16_t y_ctb, uint8_t depth); +void kvz_encoder_prepare(encoder_state_t *state); -void kvz_encode_last_significant_xy(encoder_state_t *state, - uint8_t lastpos_x, uint8_t lastpos_y, - uint8_t width, uint8_t height, - uint8_t type, uint8_t scan); -void kvz_encode_coeff_nxn(encoder_state_t *state, coeff_t *coeff, uint8_t width, - uint8_t type, int8_t scan_mode, int8_t tr_skip); -void kvz_encode_transform_coeff(encoder_state_t *state, int32_t x_cu, int32_t y_cu, - int8_t depth, int8_t tr_depth, uint8_t parent_coeff_u, uint8_t parent_coeff_v); -void encode_block_residual(const encoder_control_t * const encoder, - uint16_t x_ctb, uint16_t y_ctb, uint8_t depth); int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state);
View file
kvazaar-1.0.0.tar.gz/src/extras/crypto.cpp
Added
@@ -0,0 +1,126 @@ +#include <extras/crypto.h> + +#ifndef KVZ_SEL_ENCRYPTION +extern int kvz_make_vs_ignore_crypto_not_having_symbols; +int kvz_make_vs_ignore_crypto_not_having_symbols = 0; +#else +#include <cryptopp/aes.h> +#include <cryptopp/modes.h> +#include <cryptopp/osrng.h> +typedef struct AESDecoder { +#if AESEncryptionStreamMode + CryptoPP::CFB_Mode<CryptoPP::AES>::Encryption *CFBdec; +#else + CryptoPP::CFB_Mode<CryptoPP::AES>::Decryption *CFBdec; +#endif + + byte key[CryptoPP::AES::DEFAULT_KEYLENGTH], iv[CryptoPP::AES::BLOCKSIZE], out_stream_counter[CryptoPP::AES::BLOCKSIZE], counter[CryptoPP::AES::BLOCKSIZE]; + int couter_avail, counter_index, counter_index_pos; +} AESDecoder; + + +AESDecoder* Init() { + int init_val[32] = {201, 75, 219, 152, 6, 245, 237, 107, 179, 194, 81, 29, 66, 98, 198, 0, 16, 213, 27, 56, 255, 127, 242, 112, 97, 126, 197, 204, 25, 59, 38, 30}; + AESDecoder * AESdecoder = (AESDecoder *)malloc(sizeof(AESDecoder)); + for(int i=0;i<16; i++) { + AESdecoder->iv [i] = init_val[i]; + AESdecoder->counter[i] = init_val[5+i]; + AESdecoder->key[i] = init_val[i+16]; + } +#if AESEncryptionStreamMode + AESdecoder->CFBdec = new CryptoPP::CFB_Mode<CryptoPP::AES >::Encryption(AESdecoder->key, CryptoPP::AES::DEFAULT_KEYLENGTH, AESdecoder->iv); +#else + AESdecoder->CFBdec = new CryptoPP::CFB_Mode<CryptoPP::AES >::Decryption(AESdecoder->key, CryptoPP::AES::DEFAULT_KEYLENGTH, AESdecoder->iv); +#endif + AESdecoder->couter_avail = 0; + AESdecoder->counter_index = 0; + AESdecoder->counter_index_pos = 0; + return AESdecoder; +} + +void DeleteCrypto(AESDecoder * AESdecoder) { + if(AESdecoder) + free(AESdecoder); +} + +void Decrypt(AESDecoder *AESdecoder, const unsigned char *in_stream, int size_bits, unsigned char *out_stream) { + int nb_bytes = ceil((double)size_bits/8); + AESdecoder->CFBdec->ProcessData(out_stream, in_stream, nb_bytes); + if(size_bits&7) + AESdecoder->CFBdec->SetKeyWithIV(AESdecoder->key, CryptoPP::AES::DEFAULT_KEYLENGTH, AESdecoder->iv); + 
+} +void Incr_counter (unsigned char *counter) { + counter[0]++; +} + +#if AESEncryptionStreamMode +void Decrypt_counter(AESDecoder * AESdecoder) { + AESdecoder->CFBdec->ProcessData(AESdecoder->out_stream_counter, AESdecoder->counter, 16); + AESdecoder->couter_avail = 128; + AESdecoder->counter_index = 15; + AESdecoder->counter_index_pos = 8; + Incr_counter(AESdecoder->counter); +} +#endif + +#if AESEncryptionStreamMode +unsigned int get_key (AESDecoder * AESdecoder, int nb_bits) { + unsigned int key_ = 0; + if(nb_bits > 32) { + printf("The Generator can not generate more than 32 bit %d \n", nb_bits); + return 0; + } + if( !nb_bits ) + return 0; + if(!AESdecoder->couter_avail) + Decrypt_counter(AESdecoder); + + if(AESdecoder->couter_avail >= nb_bits) + AESdecoder->couter_avail -= nb_bits; + else + AESdecoder->couter_avail = 0; + int nb = 0; + while( nb_bits ) { + if( nb_bits >= AESdecoder->counter_index_pos ) + nb = AESdecoder->counter_index_pos; + else + nb = nb_bits; + key_ <<= nb; + key_ += (AESdecoder->out_stream_counter[AESdecoder->counter_index] & ((1<<nb)-1)); + AESdecoder->out_stream_counter[AESdecoder->counter_index] >>= nb; + nb_bits -= nb; + + if(AESdecoder->counter_index && nb == AESdecoder->counter_index_pos ) { + AESdecoder->counter_index--; + AESdecoder->counter_index_pos = 8; + } else { + AESdecoder->counter_index_pos -= nb; + if(nb_bits) { + Decrypt_counter(AESdecoder); + AESdecoder->couter_avail -= nb_bits; + } + } + } + return key_; +} +#endif + +Crypto_Handle InitC(){ + AESDecoder* AESdecoder = Init(); + return AESdecoder; +} +#if AESEncryptionStreamMode +unsigned int ff_get_key (Crypto_Handle *hdl, int nb_bits) { + return get_key ((AESDecoder*)*hdl, nb_bits); +} +#endif +void DecryptC(Crypto_Handle hdl, const unsigned char *in_stream, int size_bits, unsigned char *out_stream) { + Decrypt((AESDecoder*)hdl, in_stream, size_bits, out_stream); +} + +void DeleteCryptoC(Crypto_Handle hdl) { + DeleteCrypto((AESDecoder *)hdl); +} + +#endif // 
KVZ_SEL_ENCRYPTION
View file
kvazaar-1.0.0.tar.gz/src/extras/crypto.h
Added
@@ -0,0 +1,72 @@ +#ifndef CRYPTO_H_ +#define CRYPTO_H_ + +#include "global.h" + +#ifdef KVZ_SEL_ENCRYPTION +#define STUBBED extern +#else +#define STUBBED static +#endif + +#include <stdio.h> +#include <math.h> +#define AESEncryptionStreamMode 1 +#ifdef __cplusplus +extern "C" { +#endif + typedef void* Crypto_Handle; + + STUBBED Crypto_Handle InitC(); + STUBBED void DecryptC(Crypto_Handle hdl, const unsigned char *in_stream, int size_bits, unsigned char *out_stream); +#if AESEncryptionStreamMode + STUBBED unsigned int ff_get_key(Crypto_Handle *hdl, int nb_bits); +#endif + STUBBED void DeleteCryptoC(Crypto_Handle hdl); + +#ifdef __cplusplus +} +#endif + + +#ifndef KVZ_SEL_ENCRYPTION +// Provide static stubs to allow linking without libcryptopp and allows us to +// avoid sprinkling ifdefs everywhere and having a bunch of code that's not +// compiled during normal development. +// Provide them in the header so we can avoid compiling the cpp file, which +// means we don't need a C++ compiler when crypto is not enabled. + +#include <assert.h> + +static INLINE Crypto_Handle InitC() +{ + // Stub. + assert(0); + return 0; +} + +static INLINE void DecryptC(Crypto_Handle hdl, const unsigned char *in_stream, + int size_bits, unsigned char *out_stream) +{ + // Stub. + assert(0); +} + +#if AESEncryptionStreamMode +static INLINE unsigned int ff_get_key(Crypto_Handle *hdl, int nb_bits) +{ + // Stub. + assert(0); + return 0; +} +#endif + +static INLINE void DeleteCryptoC(Crypto_Handle hdl) +{ + // Stub. + assert(0); +} + +#endif // KVZ_SEL_ENCRYPTION + +#endif // CRYPTO_H_
View file
kvazaar-0.8.3.tar.gz/src/extras/getopt.c -> kvazaar-1.0.0.tar.gz/src/extras/getopt.c
Changed
@@ -85,8 +85,8 @@ /* These are used to replace unistd.h include. */ char *optarg = 0; -int optind = 0; -int opterr = 0; +int optind = 1; +int opterr = 1; int optopt = 0; int optreset = 0;
View file
kvazaar-1.0.0.tar.gz/src/extras/libmd5.c
Added
@@ -0,0 +1,258 @@ +/* + * This code implements the MD5 message-digest algorithm. The algorithm was + * written by Ron Rivest. This code was written by Colin Plumb in 1993, our + * understanding is that no copyright is claimed and that this code is in the + * public domain. + * + * Equivalent code is available from RSA Data Security, Inc. + * This code has been tested against that, and is functionally equivalent, + * + * To compute the message digest of a chunk of bytes, declare an MD5Context + * structure, pass it to kvz_md5_init, call kvz_md5_update as needed on buffers full of + * bytes, and then call kvz_md5_final, which will fill a supplied 16-byte array with + * the digest. + */ + +#include <stdint.h> +#include <string.h> + +#include "extras/libmd5.h" + + +//! \ingroup libMD5 +//! \{ + +static void MD5Transform(uint32_t buf[4], uint32_t const in[16]); + +#ifndef __BIG_ENDIAN__ +# define byteReverse(buf, len) /* Nothing */ +#else +void byteReverse(uint32_t *buf, unsigned len); +/* + * Note: this code is harmless on little-endian machines. + */ +void byteReverse(uint32_t *buf, unsigned len) +{ + uint32_t t; + do { + char* bytes = (char *) buf; + t = ((unsigned) bytes[3] << 8 | bytes[2]) << 16 | + ((unsigned) bytes[1] << 8 | bytes[0]); + *buf = t; + buf++; + } while (--len); +} +#endif + +/* + * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious + * initialization constants. + */ +void kvz_md5_init(context_md5_t *ctx) +{ + ctx->buf[0] = 0x67452301; + ctx->buf[1] = 0xefcdab89; + ctx->buf[2] = 0x98badcfe; + ctx->buf[3] = 0x10325476; + + ctx->bits[0] = 0; + ctx->bits[1] = 0; +} + +/* + * Update context to reflect the concatenation of another buffer full + * of bytes. 
+ */ +void kvz_md5_update(context_md5_t *ctx, const unsigned char *buf, unsigned len) +{ + uint32_t t; + + /* Update bitcount */ + + t = ctx->bits[0]; + if ((ctx->bits[0] = t + ((uint32_t) len << 3)) < t) + ctx->bits[1]++; /* Carry from low to high */ + ctx->bits[1] += len >> 29; + + t = (t >> 3) & 0x3f; /* Bytes already in shsInfo->data */ + + /* Handle any leading odd-sized chunks */ + + if (t) { + unsigned char *p = ctx->in.b8 + t; + + t = 64 - t; + if (len < t) { + memcpy(p, buf, len); + return; + } + memcpy(p, buf, t); + byteReverse(ctx->in.b32, 16); + MD5Transform(ctx->buf, ctx->in.b32); + buf += t; + len -= t; + } + /* Process data in 64-byte chunks */ + + while (len >= 64) { + memcpy(ctx->in.b8, buf, 64); + byteReverse(ctx->in.b32, 16); + MD5Transform(ctx->buf, ctx->in.b32); + buf += 64; + len -= 64; + } + + /* Handle any remaining bytes of data. */ + + memcpy(ctx->in.b8, buf, len); +} + +/* + * Final wrapup - pad to 64-byte boundary with the bit pattern + * 1 0* (64-bit count of bits processed, MSB-first) + */ +void kvz_md5_final(unsigned char digest[16], context_md5_t *ctx) +{ + unsigned count; + unsigned char *p; + + /* Compute number of bytes mod 64 */ + count = (ctx->bits[0] >> 3) & 0x3F; + + /* Set the first char of padding to 0x80. 
This is safe since there is + always at least one byte free */ + p = ctx->in.b8 + count; + *p++ = 0x80; + + /* Bytes of padding needed to make 64 bytes */ + count = 64 - 1 - count; + + /* Pad out to 56 mod 64 */ + if (count < 8) { + /* Two lots of padding: Pad the first block to 64 bytes */ + memset(p, 0, count); + byteReverse(ctx->in.b32, 16); + MD5Transform(ctx->buf, ctx->in.b32); + + /* Now fill the next block with 56 bytes */ + memset(ctx->in.b8, 0, 56); + } else { + /* Pad block to 56 bytes */ + memset(p, 0, count - 8); + } + byteReverse(ctx->in.b32, 14); + + /* Append length in bits and transform */ + ctx->in.b32[14] = ctx->bits[0]; + ctx->in.b32[15] = ctx->bits[1]; + + MD5Transform(ctx->buf, ctx->in.b32); + byteReverse((uint32_t *) ctx->buf, 4); + memcpy(digest, ctx->buf, 16); + + memset(ctx, 0, sizeof(* ctx)); /* In case it's sensitive */ + /* The original version of this code omitted the asterisk. In + effect, only the first part of ctx was wiped with zeros, not + the whole thing. Bug found by Derek Jones. Original line: */ + // memset(ctx, 0, sizeof(ctx)); /* In case it's sensitive */ +} + +/* The four core functions - F1 is optimized somewhat */ + +/* #define F1(x, y, z) (x & y | ~x & z) */ +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +/* This is the central step in the MD5 algorithm. */ +#define MD5STEP(f, w, x, y, z, data, s) \ + ( w += f(x, y, z) + data, w = w<<s | w>>(32-s), w += x ) + +/* + * The core of the MD5 algorithm, this alters an existing MD5 hash to + * reflect the addition of 16 longwords of new data. kvz_md5_update blocks + * the data and converts bytes into longwords for this routine. 
+ */ +static void MD5Transform(uint32_t buf[4], uint32_t const in[16]) +{ + register uint32_t a, b, c, d; + + a = buf[0]; + b = buf[1]; + c = buf[2]; + d = buf[3]; + + MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); + MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); + MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, in[14] + 
0xfde5380c, 23); + MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; +}
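The context-wiping bug called out in the comment above (the original `sizeof(ctx)` versus the fixed `sizeof(* ctx)`) is easy to reproduce in isolation. A minimal standalone sketch, not kvazaar code; the struct fields only mirror `context_md5_t`:

```c
#include <stdint.h>
#include <string.h>

/* Simplified stand-in for context_md5_t (buf + bits + 64-byte input block). */
typedef struct {
    uint32_t buf[4];
    uint32_t bits[2];
    unsigned char in[64];
} ctx_t;

/* The buggy wipe: sizeof(ctx) is the size of the POINTER (4 or 8 bytes),
 * so only the first few bytes of the struct are cleared. Modern compilers
 * warn about this (-Wsizeof-pointer-memaccess), but it still compiles. */
static size_t buggy_wipe(ctx_t *ctx)
{
    memset(ctx, 0, sizeof(ctx));
    return sizeof(ctx);
}

/* The corrected wipe: sizeof(*ctx) is the size of the whole struct. */
static size_t correct_wipe(ctx_t *ctx)
{
    memset(ctx, 0, sizeof(*ctx));
    return sizeof(*ctx);
}
```

With the buggy wipe, the 64-byte input block (which may hold sensitive plaintext) survives untouched.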
kvazaar-1.0.0.tar.gz/src/extras/libmd5.h
Added
@@ -0,0 +1,58 @@ +/* The copyright in this software is being made available under the BSD + * License, included below. This software may be subject to other third party + * and contributor rights, including patent rights, and no such rights are + * granted under this license. + * + * Copyright (c) 2010-2015, ITU/ISO/IEC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * * Neither the name of the ITU/ISO/IEC nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ +#pragma once +#include <stdint.h> + +//! \ingroup libMD5 +//! 
\{ + +typedef struct _context_md5_t { + uint32_t buf[4]; + uint32_t bits[2]; + union { + unsigned char b8[64]; + uint32_t b32[16]; + } in; +} context_md5_t; + +#ifdef __cplusplus +extern "C" { +#endif +void kvz_md5_init(context_md5_t *ctx); +void kvz_md5_update(context_md5_t *ctx, const unsigned char *buf, unsigned len); +void kvz_md5_final(unsigned char digest[16], context_md5_t *ctx); +#ifdef __cplusplus +} +#endif + +//! \}
kvazaar-0.8.3.tar.gz/src/filter.c -> kvazaar-1.0.0.tar.gz/src/filter.c
Changed
@@ -20,15 +20,14 @@ #include "filter.h" -#include <assert.h> -#include <stdio.h> #include <stdlib.h> -#include <string.h> -#include "bitstream.h" -#include "videoframe.h" -#include "cabac.h" +#include "cu.h" +#include "encoder.h" +#include "kvazaar.h" #include "transform.h" +#include "videoframe.h" + ////////////////////////////////////////////////////////////////////////// // INITIALIZATIONS @@ -180,9 +179,8 @@ int32_t y, edge_dir dir) { - const cu_info_t *const scu = kvz_videoframe_get_cu(state->tile->frame, - x >> MIN_SIZE, - y >> MIN_SIZE); + const cu_info_t *const scu = + kvz_cu_array_at_const(state->tile->frame->cu_array, x, y); const int tu_width = LCU_WIDTH >> scu->tr_depth; if (dir == EDGE_HOR) { @@ -207,16 +205,14 @@ int32_t y, edge_dir dir) { - const cu_info_t *const scu = kvz_videoframe_get_cu(state->tile->frame, - x >> MIN_SIZE, - y >> MIN_SIZE); + const cu_info_t *const scu = + kvz_cu_array_at_const(state->tile->frame->cu_array, x, y); // Get the containing CU. const int32_t cu_width = LCU_WIDTH >> scu->depth; const int32_t x_cu = x & ~(cu_width - 1); const int32_t y_cu = y & ~(cu_width - 1); - const cu_info_t *const cu = kvz_videoframe_get_cu(state->tile->frame, - x_cu >> MIN_SIZE, - y_cu >> MIN_SIZE); + const cu_info_t *const cu = + kvz_cu_array_at_const(state->tile->frame->cu_array, x_cu, y_cu); const int num_pu = kvz_part_mode_num_parts[cu->part_size]; for (int i = 0; i < num_pu; i++) { @@ -285,8 +281,6 @@ { videoframe_t * const frame = state->tile->frame; const encoder_control_t * const encoder = state->encoder_control; - - cu_info_t *cu_q = kvz_videoframe_get_cu(frame, x >> MIN_SIZE, y >> MIN_SIZE); { int32_t stride = frame->rec->stride; @@ -295,12 +289,9 @@ // TODO: support 10+bits kvz_pixel *orig_src = &frame->rec->y[x + y*stride]; kvz_pixel *src = orig_src; - cu_info_t *cu_p = NULL; - int16_t x_cu = x >> MIN_SIZE; - int16_t y_cu = y >> MIN_SIZE; int8_t strength = 0; - int32_t qp = state->global->QP; + int32_t qp = state->frame->QP; int32_t 
bitdepth_scale = 1 << (encoder->bitdepth - 8); int32_t b_index = CLIP(0, 51, qp + (beta_offset_div2 << 1)); int32_t beta = kvz_g_beta_table_8x8[b_index] * bitdepth_scale; @@ -321,11 +312,22 @@ int32_t dp0, dq0, dp3, dq3, d0, d3, dp, dq, d; { - // CU in the side we are filtering, update every 8-pixels - cu_p = kvz_videoframe_get_cu(frame, x_cu - (dir == EDGE_VER) + (dir == EDGE_HOR ? block_idx>>1 : 0), y_cu - (dir == EDGE_HOR) + (dir == EDGE_VER ? block_idx>>1 : 0)); + // CUs on both sides of the edge + cu_info_t *cu_p; + cu_info_t *cu_q; + if (dir == EDGE_VER) { + int32_t y_coord = y + 4 * block_idx; + cu_p = kvz_cu_array_at(frame->cu_array, x - 1, y_coord); + cu_q = kvz_cu_array_at(frame->cu_array, x, y_coord); + + } else { + int32_t x_coord = x + 4 * block_idx; + cu_p = kvz_cu_array_at(frame->cu_array, x_coord, y - 1); + cu_q = kvz_cu_array_at(frame->cu_array, x_coord, y ); + } - bool nonzero_coeffs = cbf_is_set(cu_q->cbf.y, cu_q->tr_depth) - || cbf_is_set(cu_p->cbf.y, cu_p->tr_depth); + bool nonzero_coeffs = cbf_is_set(cu_q->cbf, cu_q->tr_depth, COLOR_Y) + || cbf_is_set(cu_p->cbf, cu_p->tr_depth, COLOR_Y); // Filter strength strength = 0; @@ -343,7 +345,7 @@ } // B-slice related checks - if(!strength && state->global->slicetype == KVZ_SLICE_B) { + if(!strength && state->frame->slicetype == KVZ_SLICE_B) { // Zero all undefined motion vectors for easier usage if(!(cu_q->inter.mv_dir & 1)) { @@ -424,10 +426,6 @@ dq = dq0 + dq3; d = d0 + d3; - #if ENABLE_PCM - // TODO: add PCM deblocking - #endif - if (d < beta) { int8_t filter_P = (dp < side_threshold) ? 1 : 0; int8_t filter_Q = (dq < side_threshold) ? 
1 : 0; @@ -480,7 +478,6 @@ { const encoder_control_t * const encoder = state->encoder_control; const videoframe_t * const frame = state->tile->frame; - const cu_info_t *cu_q = kvz_videoframe_get_cu_const(frame, x >> (MIN_SIZE - 1), y >> (MIN_SIZE - 1)); // For each subpart { @@ -491,12 +488,9 @@ &frame->rec->u[x + y*stride], &frame->rec->v[x + y*stride], }; - const cu_info_t *cu_p = NULL; - int16_t x_cu = x >> (MIN_SIZE-1); - int16_t y_cu = y >> (MIN_SIZE-1); int8_t strength = 2; - int32_t QP = kvz_g_chroma_scale[state->global->QP]; + int32_t QP = kvz_g_chroma_scale[state->frame->QP]; int32_t bitdepth_scale = 1 << (encoder->bitdepth-8); int32_t TC_index = CLIP(0, 51+2, (int32_t)(QP + 2*(strength-1) + (tc_offset_div2 << 1))); int32_t Tc = kvz_g_tc_table_8x8[TC_index]*bitdepth_scale; @@ -508,7 +502,19 @@ for (uint32_t blk_idx = 0; blk_idx < num_4px_parts; ++blk_idx) { - cu_p = kvz_videoframe_get_cu_const(frame, x_cu - (dir == EDGE_VER) + (dir == EDGE_HOR ? blk_idx : 0), y_cu - (dir == EDGE_HOR) + (dir == EDGE_VER ? blk_idx : 0)); + // CUs on both sides of the edge + cu_info_t *cu_p; + cu_info_t *cu_q; + if (dir == EDGE_VER) { + int32_t y_coord = (y + 4 * blk_idx) << 1; + cu_p = kvz_cu_array_at(frame->cu_array, (x - 1) << 1, y_coord); + cu_q = kvz_cu_array_at(frame->cu_array, x << 1, y_coord); + + } else { + int32_t x_coord = (x + 4 * blk_idx) << 1; + cu_p = kvz_cu_array_at(frame->cu_array, x_coord, (y - 1) << 1); + cu_q = kvz_cu_array_at(frame->cu_array, x_coord, (y ) << 1); + } // Only filter when strenght == 2 (one of the blocks is intra coded) if (cu_q->type == CU_INTRA || cu_p->type == CU_INTRA) { @@ -575,7 +581,7 @@ // Chroma pixel coordinates. 
const int32_t x_c = x >> 1; const int32_t y_c = y >> 1; - if (is_on_8x8_grid(x_c, y_c, dir)) { + if (state->encoder_control->chroma_format != KVZ_CSP_400 && is_on_8x8_grid(x_c, y_c, dir)) { filter_deblock_edge_chroma(state, x_c, y_c, length_c, dir, tu_boundary); } } @@ -636,16 +642,18 @@ } // Chroma - const int x_px_c = x_px >> 1; - const int y_px_c = y_px >> 1; - const int x_c = x_px_c - 4; - const int end_c = MIN(y_px_c + LCU_WIDTH_C, state->tile->frame->height >> 1); - for (int y_c = y_px_c; y_c < end_c; y_c += 8) { - // The top edge of the whole frame is not filtered. - bool tu_boundary = is_tu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR); - bool pu_boundary = is_pu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR); - if (y_c > 0 && (tu_boundary || pu_boundary)) { - filter_deblock_edge_chroma(state, x_c, y_c, 4, EDGE_HOR, tu_boundary); + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + const int x_px_c = x_px >> 1; + const int y_px_c = y_px >> 1; + const int x_c = x_px_c - 4; + const int end_c = MIN(y_px_c + LCU_WIDTH_C, state->tile->frame->height >> 1); + for (int y_c = y_px_c; y_c < end_c; y_c += 8) { + // The top edge of the whole frame is not filtered. + bool tu_boundary = is_tu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR); + bool pu_boundary = is_pu_boundary(state, x_c << 1, y_c << 1, EDGE_HOR); + if (y_c > 0 && (tu_boundary || pu_boundary)) { + filter_deblock_edge_chroma(state, x_c, y_c, 4, EDGE_HOR, tu_boundary); + } } } } @@ -678,6 +686,8 @@ */ void kvz_filter_deblock_lcu(encoder_state_t * const state, int x_px, int y_px) { + assert(!state->encoder_control->cfg->lossless); + filter_deblock_lcu_inside(state, x_px, y_px, EDGE_VER); if (x_px > 0) { filter_deblock_lcu_rightmost(state, x_px, y_px);
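The deblocking hunks above clamp QP-derived table indices with kvazaar's `CLIP` macro, whose argument order is `(low, high, value)`. A minimal sketch of the beta-index clamp; the `CLIP` definition here is an assumed equivalent, not copied from kvazaar:

```c
/* Assumed equivalent of kvazaar's CLIP(low, high, value) macro. */
#define CLIP(low, high, value) \
    ((value) < (low) ? (low) : ((value) > (high) ? (high) : (value)))

/* Beta table index as computed in filter_deblock_edge_luma above:
 * the offset is applied in double units and clamped to [0, 51]. */
static int beta_index(int qp, int beta_offset_div2)
{
    return CLIP(0, 51, qp + (beta_offset_div2 << 1));
}
```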
kvazaar-0.8.3.tar.gz/src/filter.h -> kvazaar-1.0.0.tar.gz/src/filter.h
Changed
@@ -26,10 +26,8 @@
  * Deblocking filter.
  */
 
-#include "global.h"
-
-#include "encoder.h"
 #include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep
 
 /**
kvazaar-0.8.3.tar.gz/src/global.h -> kvazaar-1.0.0.tar.gz/src/global.h
Changed
@@ -32,19 +32,22 @@ #ifdef HAVE_CONFIG_H // Include config.h generated by automake. This needs to be before any other // includes in every file, which is why it's in global. -#include "config.h" +#include "config.h" // IWYU pragma: export #endif -#include <assert.h> -#include <stdbool.h> -#include <stdint.h> -#include <stdio.h> +// Include some basics in all files, like assert, primitives and NULL. +// If you add anything to this list with export pragma, think long and +// and hard if it's actually a good idea to incude it for every c-file. +#include <assert.h> // IWYU pragma: export +#include <stdbool.h> // IWYU pragma: export +#include <stdint.h> // IWYU pragma: export +#include <stddef.h> // IWYU pragma: export + +// The stdlib.h and string.h headers are needed because of MALLOC and FILL +// macros defined here, as IWYU will remove them from files that use only +// those macros. #include <stdlib.h> #include <string.h> -#include <limits.h> - -#include "kvazaar.h" - /** * \defgroup Bitstream @@ -121,9 +124,6 @@ //! spec: pcm_enabled_flag, Setting to 1 will enable using PCM blocks (current intra-search does not consider PCM) #define ENABLE_PCM 0 -//! Enable usage of temporal Motion Vector Prediction -#define ENABLE_TEMPORAL_MVP 0 - //! skip residual coding when it's under _some_ threshold #define OPTIMIZATION_SKIP_RESIDUAL_ON_THRESHOLD 0 @@ -165,7 +165,6 @@ #define SWAP(a,b,swaptype) { swaptype tempval; tempval = a; a = b; b = tempval; } #define CU_WIDTH_FROM_DEPTH(depth) (LCU_WIDTH >> depth) #define WITHIN(val, min_val, max_val) ((min_val) <= (val) && (val) <= (max_val)) -#define PU_INDEX(x_pu, y_pu) (((x_pu) % 2) + 2 * ((y_pu) % 2)) #define CEILDIV(x,y) (((x) + (y) - 1) / (y)) #define LOG2_LCU_WIDTH 6 @@ -181,7 +180,9 @@ // NOTE: When making a release, check to see if incrementing libversion in // configure.ac is necessary. 
+#ifndef KVZ_VERSION #define KVZ_VERSION 0.8.3 +#endif #define VERSION_STRING QUOTE_EXPAND(KVZ_VERSION) //#define VERBOSE 1 @@ -248,6 +249,69 @@ #define EXP_GOLOMB_TABLE_SIZE (4096*8) //Constants -typedef enum { COLOR_Y = 0, COLOR_U, COLOR_V, NUM_COLORS } color_t; +typedef enum { COLOR_Y = 0, COLOR_U, COLOR_V } color_t; + + +// Hardware data (abstraction of defines). Extend for other compilers +#if defined(_M_IX86) || defined(__i586__) || defined(__i686__) || defined(_M_X64) || defined(_M_AMD64) || defined(__amd64__) || defined(__x86_64__) +# define COMPILE_INTEL 1 +#else +# define COMPILE_INTEL 0 +#endif + +// Visual Studio note: +// Because these macros are only used to guard code that is guarded by CPUID +// at runtime, use /arch parameter to disable them, but enable all intrinsics +// supported by VisualStudio if SSE2 (highest) is enabled. +// AVX and AVX2 are handled by /arch directly and sse intrinsics will use VEX +// versions if they are defined. +#define MSC_X86_SIMD(level) (_M_X64 || (_M_IX86_FP >= (level))) + +#if COMPILE_INTEL +# if defined(__MMX__) || MSC_X86_SIMD(1) +# define COMPILE_INTEL_MMX 1 +# endif +# if defined(__SSE__) || MSC_X86_SIMD(1) +# define COMPILE_INTEL_SSE 1 +# endif +# if defined(__SSE2__) || MSC_X86_SIMD(2) +# define COMPILE_INTEL_SSE2 1 +# endif +# if defined(__SSE3__) +# define COMPILE_INTEL_SSE3 1 +# endif +# if defined(__SSSE3__) || MSC_X86_SIMD(2) +# define COMPILE_INTEL_SSSE3 1 +# endif +# if defined(__SSE4_1__) || MSC_X86_SIMD(2) +# define COMPILE_INTEL_SSE41 1 +# endif +# if defined(__SSE4_2__) || MSC_X86_SIMD(2) +# define COMPILE_INTEL_SSE42 1 +# endif +# if defined(__AVX__) +# define COMPILE_INTEL_AVX 1 +# endif +# if defined(__AVX2__) +# define COMPILE_INTEL_AVX2 1 +# endif +#endif + +#if defined (_M_PPC) || defined(__powerpc64__) || defined(__powerpc__) +# define COMPILE_POWERPC 1 +# ifdef __ALTIVEC__ +# define COMPILE_POWERPC_ALTIVEC 1 +# else +# define COMPILE_POWERPC_ALTIVEC 0 +# endif +#else +# define COMPILE_POWERPC 
0 +#endif + +#if defined (_M_ARM) || defined(__arm__) || defined(__thumb__) +# define COMPILE_ARM 1 +#else +# define COMPILE_ARM 0 +#endif #endif
kvazaar-0.8.3.tar.gz/src/image.c -> kvazaar-1.0.0.tar.gz/src/image.c
Changed
@@ -18,24 +18,29 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ -#include "threads.h" #include "image.h" -#include "strategyselector.h" -#include <string.h> -#include <stdio.h> +#include <limits.h> #include <stdlib.h> -#include <math.h> -#include <assert.h> -#include "checkpoint.h" -#include "sao.h" +#include "strategies/strategies-picture.h" +#include "threads.h" + +/** +* \brief Allocate a new image with 420. +* This function signature is part of the libkvz API. +* \return image pointer or NULL on failure +*/ +kvz_picture * kvz_image_alloc_420(const int32_t width, const int32_t height) +{ + return kvz_image_alloc(KVZ_CSP_420, width, height); +} /** * \brief Allocate a new image. * \return image pointer or NULL on failure */ -kvz_picture *kvz_image_alloc(const int32_t width, const int32_t height) +kvz_picture * kvz_image_alloc(enum kvz_chroma_format chroma_format, const int32_t width, const int32_t height) { //Assert that we have a well defined image assert((width % 2) == 0); @@ -45,7 +50,10 @@ if (!im) return NULL; unsigned int luma_size = width * height; - unsigned int chroma_size = luma_size / 4; + unsigned chroma_sizes[] = { 0, luma_size / 4, luma_size / 2, luma_size }; + unsigned chroma_size = chroma_sizes[chroma_format]; + + im->chroma_format = chroma_format; //Allocate memory im->fulldata = MALLOC(kvz_pixel, (luma_size + 2 * chroma_size)); @@ -59,10 +67,17 @@ im->width = width; im->height = height; im->stride = width; + im->chroma_format = chroma_format; im->y = im->data[COLOR_Y] = &im->fulldata[0]; - im->u = im->data[COLOR_U] = &im->fulldata[luma_size]; - im->v = im->data[COLOR_V] = &im->fulldata[luma_size + chroma_size]; + + if (chroma_format == KVZ_CSP_400) { + im->u = im->data[COLOR_U] = NULL; + im->v = im->data[COLOR_V] = NULL; + } else { + im->u = im->data[COLOR_U] = &im->fulldata[luma_size]; + im->v = im->data[COLOR_V] = &im->fulldata[luma_size + 
chroma_size]; + } im->pts = 0; im->dts = 0; @@ -143,10 +158,13 @@ im->width = width; im->height = height; im->stride = orig_image->stride; + im->chroma_format = orig_image->chroma_format; im->y = im->data[COLOR_Y] = &orig_image->y[x_offset + y_offset * orig_image->stride]; - im->u = im->data[COLOR_U] = &orig_image->u[x_offset/2 + y_offset/2 * orig_image->stride/2]; - im->v = im->data[COLOR_V] = &orig_image->v[x_offset/2 + y_offset/2 * orig_image->stride/2]; + if (orig_image->chroma_format != KVZ_CSP_400) { + im->u = im->data[COLOR_U] = &orig_image->u[x_offset / 2 + y_offset / 2 * orig_image->stride / 2]; + im->v = im->data[COLOR_V] = &orig_image->v[x_offset / 2 + y_offset / 2 * orig_image->stride / 2]; + } im->pts = 0; im->dts = 0; @@ -154,16 +172,22 @@ return im; } -yuv_t * kvz_yuv_t_alloc(int luma_size) +yuv_t * kvz_yuv_t_alloc(int luma_size, int chroma_size) { - // Get buffers with separate mallocs in order to take advantage of - // automatic buffer overrun checks. yuv_t *yuv = (yuv_t *)malloc(sizeof(*yuv)); - yuv->y = (kvz_pixel *)malloc(luma_size * sizeof(*yuv->y)); - yuv->u = (kvz_pixel *)malloc(luma_size / 2 * sizeof(*yuv->u)); - yuv->v = (kvz_pixel *)malloc(luma_size / 2 * sizeof(*yuv->v)); yuv->size = luma_size; + // Get buffers with separate mallocs in order to take advantage of + // automatic buffer overrun checks. + yuv->y = (kvz_pixel *)malloc(luma_size * sizeof(*yuv->y)); + if (chroma_size == 0) { + yuv->u = NULL; + yuv->v = NULL; + } else { + yuv->u = (kvz_pixel *)malloc(chroma_size * sizeof(*yuv->u)); + yuv->v = (kvz_pixel *)malloc(chroma_size * sizeof(*yuv->v)); + } + return yuv; } @@ -469,4 +493,76 @@ } return ssd; -} \ No newline at end of file +} + + +/** + * \brief BLock Image Transfer from one buffer to another. + * + * It's a stupidly simple loop that copies pixels. + * + * \param orig Start of the originating buffer. + * \param dst Start of the destination buffer. + * \param width Width of the copied region. 
+ * \param height Height of the copied region. + * \param orig_stride Width of a row in the originating buffer. + * \param dst_stride Width of a row in the destination buffer. + * + * This should be inlined, but it's defined here for now to see if Visual + * Studios LTCG will inline it. + */ +#define BLIT_PIXELS_CASE(n) case n:\ + for (y = 0; y < n; ++y) {\ + memcpy(&dst[y*dst_stride], &orig[y*orig_stride], n * sizeof(kvz_pixel));\ + }\ + break; + +void kvz_pixels_blit(const kvz_pixel * const orig, kvz_pixel * const dst, + const unsigned width, const unsigned height, + const unsigned orig_stride, const unsigned dst_stride) +{ + unsigned y; + //There is absolutely no reason to have a width greater than the source or the destination stride. + assert(width <= orig_stride); + assert(width <= dst_stride); + +#ifdef CHECKPOINTS + char *buffer = malloc((3 * width + 1) * sizeof(char)); + for (y = 0; y < height; ++y) { + int p; + for (p = 0; p < width; ++p) { + sprintf((buffer + 3*p), "%02X ", orig[y*orig_stride]); + } + buffer[3*width] = 0; + CHECKPOINT("kvz_pixels_blit_avx2: %04d: %s", y, buffer); + } + FREE_POINTER(buffer); +#endif //CHECKPOINTS + + if (width == orig_stride && width == dst_stride) { + memcpy(dst, orig, width * height * sizeof(kvz_pixel)); + return; + } + + int nxn_width = (width == height) ? width : 0; + switch (nxn_width) { + BLIT_PIXELS_CASE(4) + BLIT_PIXELS_CASE(8) + BLIT_PIXELS_CASE(16) + BLIT_PIXELS_CASE(32) + BLIT_PIXELS_CASE(64) + default: + + if (orig == dst) { + //If we have the same array, then we should have the same stride + assert(orig_stride == dst_stride); + return; + } + assert(orig != dst || orig_stride == dst_stride); + + for (y = 0; y < height; ++y) { + memcpy(&dst[y*dst_stride], &orig[y*orig_stride], width * sizeof(kvz_pixel)); + } + break; + } +}
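The `kvz_pixels_blit` added above boils down to the following pattern: one big `memcpy` when both buffers are contiguous, otherwise one `memcpy` per row. A simplified sketch; `kvz_pixel` is assumed 8-bit here, as in a non-high-bitdepth build:

```c
#include <stdint.h>
#include <string.h>

typedef uint8_t kvz_pixel;  /* assumed 8-bit pixel type */

/* Stride-aware block copy in the spirit of kvz_pixels_blit. */
static void blit(const kvz_pixel *orig, kvz_pixel *dst,
                 unsigned width, unsigned height,
                 unsigned orig_stride, unsigned dst_stride)
{
    /* Fast path: both regions are contiguous, copy everything at once. */
    if (width == orig_stride && width == dst_stride) {
        memcpy(dst, orig, (size_t)width * height * sizeof(kvz_pixel));
        return;
    }
    /* General path: copy row by row, honoring each buffer's stride. */
    for (unsigned y = 0; y < height; ++y) {
        memcpy(&dst[y * dst_stride], &orig[y * orig_stride],
               width * sizeof(kvz_pixel));
    }
}
```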
kvazaar-0.8.3.tar.gz/src/image.h -> kvazaar-1.0.0.tar.gz/src/image.h
Changed
@@ -26,7 +26,7 @@ * A reference counted YUV pixel buffer. */ -#include "global.h" +#include "global.h" // IWYU pragma: keep #include "kvazaar.h" @@ -35,6 +35,7 @@ kvz_pixel y[LCU_LUMA_SIZE]; kvz_pixel u[LCU_CHROMA_SIZE]; kvz_pixel v[LCU_CHROMA_SIZE]; + enum kvz_chroma_format chroma_format; } lcu_yuv_t; typedef struct { @@ -52,7 +53,8 @@ } yuv_t; -kvz_picture *kvz_image_alloc(const int32_t width, const int32_t height); +kvz_picture *kvz_image_alloc_420(const int32_t width, const int32_t height); +kvz_picture *kvz_image_alloc(enum kvz_chroma_format chroma_format, const int32_t width, const int32_t height); void kvz_image_free(kvz_picture *im); @@ -64,7 +66,7 @@ const unsigned width, const unsigned height); -yuv_t * kvz_yuv_t_alloc(int luma_size); +yuv_t * kvz_yuv_t_alloc(int luma_size, int chroma_size); void kvz_yuv_t_free(yuv_t * yuv); hi_prec_buf_t * kvz_hi_prec_buf_t_alloc(int luma_size); @@ -80,4 +82,10 @@ const int ref_stride, const int rec_stride, const int width); + +void kvz_pixels_blit(const kvz_pixel* orig, kvz_pixel *dst, + unsigned width, unsigned height, + unsigned orig_stride, unsigned dst_stride); + + #endif
kvazaar-0.8.3.tar.gz/src/imagelist.c -> kvazaar-1.0.0.tar.gz/src/imagelist.c
Changed
@@ -18,14 +18,13 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ -#include "threads.h" #include "imagelist.h" -#include "strategyselector.h" -#include <string.h> #include <stdio.h> #include <stdlib.h> -#include <assert.h> + +#include "image.h" +#include "threads.h" /** @@ -36,13 +35,10 @@ image_list_t * kvz_image_list_alloc(int size) { image_list_t *list = (image_list_t *)malloc(sizeof(image_list_t)); - list->size = size; - if (size > 0) { - list->images = (kvz_picture**)malloc(sizeof(kvz_picture*) * size); - list->cu_arrays = (cu_array_t**)malloc(sizeof(cu_array_t*) * size); - list->pocs = malloc(sizeof(int32_t) * size); - } - + list->size = size; + list->images = malloc(sizeof(kvz_picture*) * size); + list->cu_arrays = malloc(sizeof(cu_array_t*) * size); + list->pocs = malloc(sizeof(int32_t) * size); list->used_size = 0; return list; @@ -115,7 +111,8 @@ } if (list->size == list->used_size) { - if (!kvz_image_list_resize(list, list->size*2)) return 0; + unsigned new_size = MAX(list->size + 1, list->size * 2); + if (!kvz_image_list_resize(list, new_size)) return 0; } for (i = list->used_size; i > 0; i--) {
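The resize in `kvz_image_list_add` above no longer assumes a non-empty list: plain doubling would leave a zero-sized list stuck at zero, so the new size is at least `size + 1`. The growth rule can be sketched as:

```c
/* Assumed equivalent of kvazaar's MAX macro. */
#define MAX(a, b) (((a) > (b)) ? (a) : (b))

/* Growth rule from the hunk above: double, but always grow by at least one. */
static unsigned next_list_size(unsigned size)
{
    return MAX(size + 1, size * 2);
}
```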
kvazaar-0.8.3.tar.gz/src/imagelist.h -> kvazaar-1.0.0.tar.gz/src/imagelist.h
Changed
@@ -26,10 +26,10 @@
  * Container for a list of reference pictures.
  */
 
-#include "global.h"
-
-#include "image.h"
 #include "cu.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
+
 
 /**
  * \brief Struct which contains array of picture structs
kvazaar-0.8.3.tar.gz/src/input_frame_buffer.c -> kvazaar-1.0.0.tar.gz/src/input_frame_buffer.c
Changed
@@ -19,8 +19,11 @@ ****************************************************************************/ #include "input_frame_buffer.h" + +#include "encoder.h" #include "encoderstate.h" -#include <assert.h> +#include "image.h" + void kvz_init_input_frame_buffer(input_frame_buffer_t *input_buffer) { @@ -35,42 +38,43 @@ /** * \brief Pass an input frame to the encoder state. * - * Sets the source image of the encoder state if there is a suitable image - * available. + * Returns the image that should be encoded next if there is a suitable + * image available. * * The caller must not modify img_in after calling this function. * * \param buf an input frame buffer * \param state a main encoder state * \param img_in input frame or NULL - * \return 1 if the source image was set, 0 if not + * \return pointer to the next picture, or NULL if no picture is + * available */ -int kvz_encoder_feed_frame(input_frame_buffer_t *buf, - encoder_state_t *const state, - kvz_picture *const img_in) +kvz_picture* kvz_encoder_feed_frame(input_frame_buffer_t *buf, + encoder_state_t *const state, + kvz_picture *const img_in) { const encoder_control_t* const encoder = state->encoder_control; const kvz_config* const cfg = encoder->cfg; const int gop_buf_size = 3 * cfg->gop_len; - assert(state->global->frame >= 0); - - videoframe_t *frame = state->tile->frame; - assert(frame->source == NULL); - assert(frame->rec != NULL); + assert(state->frame->num >= 0); if (cfg->gop_len == 0 || cfg->gop_lowdelay) { - // GOP disabled, just return the input frame. + // No reordering of output pictures necessary. - if (img_in == NULL) return 0; + if (img_in == NULL) return NULL; img_in->dts = img_in->pts; - frame->source = kvz_image_copy_ref(img_in); - frame->rec->pts = img_in->pts; - frame->rec->dts = img_in->dts; - state->global->gop_offset = cfg->gop_lowdelay ? 
(state->global->frame-1) % cfg->gop_len : 0; - return 1; + state->frame->gop_offset = 0; + if (cfg->gop_lowdelay) { + state->frame->gop_offset = (state->frame->num - 1) % cfg->gop_len; + if (state->frame->gop_offset < 0) { + // Set gop_offset of IDR as the highest quality picture. + state->frame->gop_offset += cfg->gop_len; + } + } + return kvz_image_copy_ref(img_in); } if (img_in != NULL) { @@ -101,7 +105,7 @@ if (buf->num_out == buf->num_in) { // All frames returned. - return 0; + return NULL; } if (img_in == NULL && buf->num_in < cfg->gop_len) { @@ -128,7 +132,7 @@ // Output the first frame. idx_out = -1; dts_out = buf->pts_buffer[gop_buf_size - 1] + buf->delay; - gop_offset = 0; + gop_offset = 0; // highest quality picture } else { gop_offset = (buf->num_out - 1) % cfg->gop_len; @@ -164,14 +168,12 @@ // Index in buf->pic_buffer and buf->pts_buffer. int buf_idx = (idx_out + gop_buf_size) % gop_buf_size; - assert(buf->pic_buffer[buf_idx] != NULL); - frame->source = buf->pic_buffer[buf_idx]; - frame->rec->pts = frame->source->pts; - frame->source->dts = dts_out; - frame->rec->dts = dts_out; + kvz_picture* next_pic = buf->pic_buffer[buf_idx]; + assert(next_pic != NULL); + next_pic->dts = dts_out; buf->pic_buffer[buf_idx] = NULL; - state->global->gop_offset = gop_offset; + state->frame->gop_offset = gop_offset; buf->num_out++; - return 1; + return next_pic; }
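The low-delay branch of `kvz_encoder_feed_frame` above computes `gop_offset` as `(num - 1) % gop_len` and then wraps negative results. The wrap matters because in C the `%` of a negative operand is negative, so the first (IDR) frame would otherwise get an invalid offset instead of the highest-quality GOP position. A standalone sketch of just that computation:

```c
/* gop_offset selection for low-delay GOPs, mirroring the hunk above. */
static int lowdelay_gop_offset(int frame_num, int gop_len)
{
    int offset = (frame_num - 1) % gop_len;
    if (offset < 0) {
        /* C's % truncates toward zero, so (-1) % gop_len == -1 here;
         * wrapping puts the IDR frame at the highest-quality position. */
        offset += gop_len;
    }
    return offset;
}
```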
kvazaar-0.8.3.tar.gz/src/input_frame_buffer.h -> kvazaar-1.0.0.tar.gz/src/input_frame_buffer.h
Changed
@@ -26,7 +26,9 @@
  * Buffering of input for reordering.
  */
 
-#include "global.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
+
 
 // Forward declaration.
 struct encoder_state_t;
@@ -62,8 +64,8 @@
 
 void kvz_init_input_frame_buffer(input_frame_buffer_t *input_buffer);
 
-int kvz_encoder_feed_frame(input_frame_buffer_t *buf,
-                           struct encoder_state_t *const state,
-                           struct kvz_picture *const img_in);
+kvz_picture* kvz_encoder_feed_frame(input_frame_buffer_t *buf,
+                                    struct encoder_state_t *const state,
+                                    struct kvz_picture *const img_in);
 
 #endif // INPUT_FRAME_BUFFER_H_
kvazaar-0.8.3.tar.gz/src/inter.c -> kvazaar-1.0.0.tar.gz/src/inter.c
Changed
@@ -20,14 +20,16 @@ #include "inter.h" -#include <stdio.h> #include <stdlib.h> #include <string.h> +#include <limits.h> -#include "filter.h" -#include "strategies/strategies-ipol.h" -#include "strategies/generic/ipol-generic.h" +#include "encoder.h" +#include "imagelist.h" #include "strategies/generic/picture-generic.h" +#include "strategies/strategies-ipol.h" +#include "videoframe.h" + static void inter_recon_frac_luma(const encoder_state_t * const state, const kvz_picture * const ref, @@ -44,7 +46,7 @@ #define FILTER_SIZE_Y 8 //Luma filter size // Fractional luma 1/4-pel - kvz_extended_block src = {0, 0, 0}; + kvz_extended_block src = {0, 0, 0, 0}; // Fractional luma kvz_get_extended_block(xpos, @@ -60,7 +62,7 @@ block_width, block_height, &src); - kvz_sample_quarterpel_luma_generic(state->encoder_control, + kvz_sample_quarterpel_luma(state->encoder_control, src.orig_topleft, src.stride, block_width, @@ -89,7 +91,7 @@ #define FILTER_SIZE_Y 8 //Luma filter size // Fractional luma 1/4-pel - kvz_extended_block src = { 0, 0, 0 }; + kvz_extended_block src = { 0, 0, 0, 0 }; // Fractional luma kvz_get_extended_block(xpos, @@ -105,7 +107,7 @@ block_width, block_height, &src); - kvz_sample_14bit_quarterpel_luma_generic(state->encoder_control, + kvz_sample_14bit_quarterpel_luma(state->encoder_control, src.orig_topleft, src.stride, block_width, @@ -140,19 +142,19 @@ #define FILTER_SIZE_C 4 //Chroma filter size // Fractional chroma 1/8-pel - kvz_extended_block src_u = { 0, 0, 0 }; - kvz_extended_block src_v = { 0, 0, 0 }; + kvz_extended_block src_u = { 0, 0, 0, 0 }; + kvz_extended_block src_v = { 0, 0, 0, 0 }; //Fractional chroma U kvz_get_extended_block(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, state->tile->lcu_offset_y * LCU_WIDTH_C, ref->u, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_height, &src_u); - kvz_sample_octpel_chroma_generic(state->encoder_control, src_u.orig_topleft, 
src_u.stride, block_width, + kvz_sample_octpel_chroma(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width, block_height, lcu->rec.u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param); //Fractional chroma V kvz_get_extended_block(xpos, ypos, (mv_param[0] >> 2) >> 1, (mv_param[1] >> 2) >> 1, state->tile->lcu_offset_x * LCU_WIDTH_C, state->tile->lcu_offset_y * LCU_WIDTH_C, ref->v, ref->width >> 1, ref->height >> 1, FILTER_SIZE_C, block_width, block_height, &src_v); - kvz_sample_octpel_chroma_generic(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width, + kvz_sample_octpel_chroma(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width, block_height, lcu->rec.v + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param); if (src_u.malloc_used) free(src_u.buffer); @@ -180,8 +182,8 @@ #define FILTER_SIZE_C 4 //Chroma filter size // Fractional chroma 1/8-pel - kvz_extended_block src_u = { 0, 0, 0 }; - kvz_extended_block src_v = { 0, 0, 0 }; + kvz_extended_block src_u = { 0, 0, 0, 0 }; + kvz_extended_block src_v = { 0, 0, 0, 0 }; //Fractional chroma U kvz_get_extended_block(xpos, @@ -197,7 +199,7 @@ block_width, block_height, &src_u); - kvz_sample_14bit_octpel_chroma_generic(state->encoder_control, + kvz_sample_14bit_octpel_chroma(state->encoder_control, src_u.orig_topleft, src_u.stride, block_width, @@ -222,7 +224,7 @@ block_width, block_height, &src_v); - kvz_sample_14bit_octpel_chroma_generic(state->encoder_control, + kvz_sample_14bit_octpel_chroma(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width, @@ -237,17 +239,56 @@ if (src_v.malloc_used) free(src_v.buffer); } + +/** +* \brief Copy from frame with extended border. 
+* +* \param ref_buf pointer to the start of ref buffer +* \param ref_stride stride of ref buffer +* \param ref_width width of frame +* \param ref_height height of frame +* \param rec_buf pointer to the start of pu in rec buffer +* \param rec_stride stride of rec buffer +* \param width width of copied block +* \param height height of copied block +* \param mv_in_frame coordinates of copied block in frame coordinates +*/ +static void inter_cp_with_ext_border(const kvz_pixel *ref_buf, int ref_stride, + int ref_width, int ref_height, + kvz_pixel *rec_buf, int rec_stride, + int width, int height, + const vector2d_t *mv_in_frame) +{ + for (int y = mv_in_frame->y; y < mv_in_frame->y + height; ++y) { + for (int x = mv_in_frame->x; x < mv_in_frame->x + width; ++x) { + vector2d_t in_frame = { + CLIP(0, ref_width - 1, x), + CLIP(0, ref_height - 1, y), + }; + vector2d_t in_pu = { + x - mv_in_frame->x, + y - mv_in_frame->y, + }; + int pu_index = in_pu.y * rec_stride + in_pu.x; + int frame_index = in_frame.y * ref_stride + in_frame.x; + rec_buf[pu_index] = ref_buf[frame_index]; + } + } +} + + /** * \brief Reconstruct inter block - * \param ref picture to copy the data from - * \param xpos block x position - * \param ypos block y position - * \param width block width - * \param height block height - * \param mv[2] motion vector - * \param lcu destination lcu - * \param hi_prec destination of high precision output (null if not needed) - * \returns Void + * + * \param state encoder state + * \param ref picture to copy the data from + * \param xpos block x position + * \param ypos block y position + * \param width block width + * \param height block height + * \param mv_param motion vector + * \param lcu destination lcu + * \param hi_prec_out destination of high precision output (null if not needed) */ void kvz_inter_recon_lcu(const encoder_state_t * const state, const kvz_picture * const ref, @@ -259,161 +300,122 @@ lcu_t *lcu, hi_prec_buf_t *hi_prec_out) { - int 
x,y,coord_x,coord_y; - int16_t mv[2] = { mv_param[0], mv_param[1] }; - - int32_t dst_width_c = LCU_WIDTH>>1; //!< Destination picture width in chroma pixels - int32_t ref_width_c = ref->width>>1; //!< Reference picture width in chroma pixels - - // negative overflow flag - int8_t overflow_neg_x = (state->tile->lcu_offset_x * LCU_WIDTH + xpos + (mv[0]>>2) < 0)?1:0; - int8_t overflow_neg_y = (state->tile->lcu_offset_y * LCU_WIDTH + ypos + (mv[1]>>2) < 0)?1:0; - - // positive overflow flag - int8_t overflow_pos_x = (state->tile->lcu_offset_x * LCU_WIDTH + xpos + (mv[0]>>2) + width > ref->width )?1:0; - int8_t overflow_pos_y = (state->tile->lcu_offset_y * LCU_WIDTH + ypos + (mv[1]>>2) + height > ref->height)?1:0; - - int8_t chroma_halfpel = ((mv[0]>>2)&1) || ((mv[1]>>2)&1); //!< (luma integer mv) lsb is set -> chroma is half-pel - // Luma quarter-pel - int8_t fractional_mv = (mv[0]&1) || (mv[1]&1) || (mv[0]&2) || (mv[1]&2); // either of 2 lowest bits of mv set -> mv is fractional - - if(fractional_mv) { - if (state->encoder_control->cfg->bipred && hi_prec_out){ - inter_recon_14bit_frac_luma(state, ref, xpos, ypos, width, height, mv_param, hi_prec_out); - inter_recon_14bit_frac_chroma(state, ref, xpos, ypos, width, height, mv_param, hi_prec_out); + const vector2d_t tile_in_frame = { + state->tile->lcu_offset_x * LCU_WIDTH, + state->tile->lcu_offset_y * LCU_WIDTH + }; + const vector2d_t pu_in_tile = { xpos, ypos }; + const vector2d_t pu_in_lcu = { xpos % LCU_WIDTH, ypos % LCU_WIDTH }; + + const vector2d_t mv_in_pu = { mv_param[0] >> 2, mv_param[1] >> 2 }; + const vector2d_t mv_in_frame = { + mv_in_pu.x + pu_in_tile.x + tile_in_frame.x, + mv_in_pu.y + pu_in_tile.y + tile_in_frame.y + }; + + const bool mv_is_outside_frame = mv_in_frame.x < 0 || + mv_in_frame.y < 0 || + mv_in_frame.x + width > ref->width || + mv_in_frame.y + height > ref->height; + + // With 420, odd coordinates need interpolation. 
+ const int8_t fractional_chroma = (mv_in_pu.x & 1) || (mv_in_pu.y & 1); + const int8_t fractional_luma = ((mv_param[0] & 3) || (mv_param[1] & 3)); + + // Generate prediction for luma. + if (fractional_luma) { + // With a fractional MV, do interpolation. + if (state->encoder_control->cfg->bipred && hi_prec_out) { + inter_recon_14bit_frac_luma(state, ref, + pu_in_tile.x, pu_in_tile.y, + width, height, + mv_param, hi_prec_out); } else { - inter_recon_frac_luma(state, ref, xpos, ypos, width, height, mv_param, lcu); - inter_recon_frac_chroma(state, ref, xpos, ypos, width, height, mv_param, lcu); + inter_recon_frac_luma(state, ref, + pu_in_tile.x, pu_in_tile.y, + width, height, + mv_param, lcu); + } + } else { + // With an integer MV, copy pixels directly from the reference. + const int lcu_pu_index = pu_in_lcu.y * LCU_WIDTH + pu_in_lcu.x; + if (mv_is_outside_frame) { + inter_cp_with_ext_border(ref->y, ref->width, + ref->width, ref->height, + &lcu->rec.y[lcu_pu_index], LCU_WIDTH, + width, height, + &mv_in_frame); + } else { + const int frame_mv_index = mv_in_frame.y * ref->width + mv_in_frame.x; + kvz_pixels_blit(&ref->y[frame_mv_index], + &lcu->rec.y[lcu_pu_index], + width, height, + ref->width, LCU_WIDTH); } } - mv[0] >>= 2; - mv[1] >>= 2; + if (state->encoder_control->chroma_format == KVZ_CSP_400) { + return; + } - // Chroma half-pel - // get half-pel interpolated block and push it to output - if(!fractional_mv) { - if(chroma_halfpel) { - if (state->encoder_control->cfg->bipred && hi_prec_out){ - inter_recon_14bit_frac_chroma(state, ref, xpos, ypos, width, height, mv_param, hi_prec_out); - } else { - inter_recon_frac_chroma(state, ref, xpos, ypos, width, height, mv_param, lcu); - } + // Generate prediction for chroma. + if (fractional_luma || fractional_chroma) { + // With a fractional MV, do interpolation. 
+ if (state->encoder_control->cfg->bipred && hi_prec_out) { + inter_recon_14bit_frac_chroma(state, ref, + pu_in_tile.x, pu_in_tile.y, + width, height, + mv_param, hi_prec_out); + } else { + inter_recon_frac_chroma(state, ref, + pu_in_tile.x, pu_in_tile.y, + width, height, + mv_param, lcu); } - - // With overflow present, more checking - if (overflow_neg_x || overflow_neg_y || overflow_pos_x || overflow_pos_y) { - // Copy Luma with boundary checking - for (y = ypos; y < ypos + height; y++) { - for (x = xpos; x < xpos + width; x++) { - int x_in_lcu = (x & ((LCU_WIDTH)-1)); - int y_in_lcu = (y & ((LCU_WIDTH)-1)); - - coord_x = (x + state->tile->lcu_offset_x * LCU_WIDTH) + mv[0]; - coord_y = (y + state->tile->lcu_offset_y * LCU_WIDTH) + mv[1]; - overflow_neg_x = (coord_x < 0)?1:0; - overflow_neg_y = (coord_y < 0)?1:0; - - overflow_pos_x = (coord_x >= ref->width )?1:0; - overflow_pos_y = (coord_y >= ref->height)?1:0; - - // On x-overflow set coord_x accordingly - if (overflow_neg_x) { - coord_x = 0; - } else if (overflow_pos_x) { - coord_x = ref->width - 1; - } - - // On y-overflow set coord_y accordingly - if (overflow_neg_y) { - coord_y = 0; - } else if (overflow_pos_y) { - coord_y = ref->height - 1; - } - - // set destination to (corrected) pixel value from the reference - lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = ref->y[coord_y*ref->width + coord_x]; - } - } - - if(!chroma_halfpel) { - // Copy Chroma with boundary checking - for (y = ypos>>1; y < (ypos + height)>>1; y++) { - for (x = xpos>>1; x < (xpos + width)>>1; x++) { - int x_in_lcu = (x & ((LCU_WIDTH>>1)-1)); - int y_in_lcu = (y & ((LCU_WIDTH>>1)-1)); - - coord_x = (x + state->tile->lcu_offset_x * (LCU_WIDTH >> 1)) + (mv[0]>>1); - coord_y = (y + state->tile->lcu_offset_y * (LCU_WIDTH >> 1)) + (mv[1]>>1); - - overflow_neg_x = (coord_x < 0)?1:0; - overflow_neg_y = (y + (mv[1]>>1) < 0)?1:0; - - overflow_pos_x = (coord_x >= ref->width>>1 )?1:0; - overflow_pos_y = (coord_y >= ref->height>>1)?1:0; - - // On 
x-overflow set coord_x accordingly - if(overflow_neg_x) { - coord_x = 0; - } else if(overflow_pos_x) { - coord_x = (ref->width>>1) - 1; - } - - // On y-overflow set coord_y accordingly - if(overflow_neg_y) { - coord_y = 0; - } else if(overflow_pos_y) { - coord_y = (ref->height>>1) - 1; - } - - // set destinations to (corrected) pixel value from the reference - lcu->rec.u[y_in_lcu*dst_width_c + x_in_lcu] = ref->u[coord_y * ref_width_c + coord_x]; - lcu->rec.v[y_in_lcu*dst_width_c + x_in_lcu] = ref->v[coord_y * ref_width_c + coord_x]; - } - } - } - } else { //If no overflow, we can copy without checking boundaries - // Copy Luma - for (y = ypos; y < ypos + height; y++) { - int y_in_lcu = (y & ((LCU_WIDTH)-1)); - coord_y = ((y + state->tile->lcu_offset_y * LCU_WIDTH) + mv[1]) * ref->width; // pre-calculate - for (x = xpos; x < xpos + width; x++) { - int x_in_lcu = (x & ((LCU_WIDTH)-1)); - - lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = ref->y[coord_y + (x + state->tile->lcu_offset_x * LCU_WIDTH) + mv[0]]; - } - } - - if(!chroma_halfpel) { - // Copy Chroma - // TODO: chroma fractional pixel interpolation - for (y = ypos>>1; y < (ypos + height)>>1; y++) { - int y_in_lcu = (y & ((LCU_WIDTH>>1)-1)); - coord_y = ((y + state->tile->lcu_offset_y * (LCU_WIDTH>>1)) + (mv[1]>>1)) * ref_width_c; // pre-calculate - for (x = xpos>>1; x < (xpos + width)>>1; x++) { - int x_in_lcu = (x & ((LCU_WIDTH>>1)-1)); - lcu->rec.u[y_in_lcu*dst_width_c + x_in_lcu] = ref->u[coord_y + (x + state->tile->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1)]; - lcu->rec.v[y_in_lcu*dst_width_c + x_in_lcu] = ref->v[coord_y + (x + state->tile->lcu_offset_x * (LCU_WIDTH>>1)) + (mv[0]>>1)]; - } - } - } + } else { + // With an integer MV, copy pixels directly from the reference. 
+ const int lcu_pu_index_c = pu_in_lcu.y / 2 * LCU_WIDTH_C + pu_in_lcu.x / 2; + const vector2d_t mv_in_frame_c = { mv_in_frame.x / 2, mv_in_frame.y / 2 }; + + if (mv_is_outside_frame) { + inter_cp_with_ext_border(ref->u, ref->width / 2, + ref->width / 2, ref->height / 2, + &lcu->rec.u[lcu_pu_index_c], LCU_WIDTH_C, + width / 2, height / 2, + &mv_in_frame_c); + inter_cp_with_ext_border(ref->v, ref->width / 2, + ref->width / 2, ref->height / 2, + &lcu->rec.v[lcu_pu_index_c], LCU_WIDTH_C, + width / 2, height / 2, + &mv_in_frame_c); + } else { + const int frame_mv_index = mv_in_frame_c.y * ref->width / 2 + mv_in_frame_c.x; + + kvz_pixels_blit(&ref->u[frame_mv_index], + &lcu->rec.u[lcu_pu_index_c], + width / 2, height / 2, + ref->width / 2, LCU_WIDTH_C); + kvz_pixels_blit(&ref->v[frame_mv_index], + &lcu->rec.v[lcu_pu_index_c], + width / 2, height / 2, + ref->width / 2, LCU_WIDTH_C); } } } /** -* \brief Reconstruct bi-pred inter block -* \param ref1 reference picture to copy the data from -* \param ref2 other reference picture to copy the data from -* \param xpos block x position -* \param ypos block y position -* \param width block width -* \param height block height -* \param mv[2][2] motion vectors -* \param lcu destination lcu -* \returns Void -*/ - + * \brief Reconstruct bi-pred inter block + * + * \param state encoder state + * \param ref1 reference picture to copy the data from + * \param ref2 other reference picture to copy the data from + * \param xpos block x position + * \param ypos block y position + * \param width block width + * \param height block height + * \param mv_param motion vectors + * \param lcu destination lcu + */ void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state, const kvz_picture * ref1, const kvz_picture * ref2, @@ -481,10 +483,11 @@ } /** - * \brief Set unused L0/L1 motion vectors and reference + * \brief Clear unused L0/L1 motion vectors and reference * \param cu coding unit to clear */ -static void 
inter_clear_cu_unused(cu_info_t* cu) { +static void inter_clear_cu_unused(cu_info_t* cu) +{ for (unsigned i = 0; i < 2; ++i) { if (cu->inter.mv_dir & (1 << i)) continue; @@ -495,33 +498,264 @@ } /** - * \brief Get merge candidates for current block - * \param encoder encoder control struct to use - * \param x block x position in SCU - * \param y block y position in SCU - * \param width current block width - * \param height current block height - * \param b0 candidate b0 - * \param b1 candidate b1 - * \param b2 candidate b2 - * \param a0 candidate a0 - * \param a1 candidate a1 + * \brief Check whether a0 mv cand block is coded before the current block. + * \param x x-coordinate of the current block (in pixels) + * \param y y-coordinate of the current block (in pixels) + * \param width width of the current block (in pixels) + * \param height height of the current block (in pixels) + * \return True, if the a0 mv candidate block is coded before the + * current block. Otherwise false. */ -void kvz_inter_get_spatial_merge_candidates(int32_t x, - int32_t y, - int32_t width, - int32_t height, - cu_info_t **b0, - cu_info_t **b1, - cu_info_t **b2, - cu_info_t **a0, - cu_info_t **a1, - lcu_t *lcu) +static bool is_a0_cand_coded(int x, int y, int width, int height) { - // the width and height of the current block on SCU - uint8_t width_in_scu = width / CU_MIN_SIZE_PIXELS; - uint8_t height_in_scu = height / CU_MIN_SIZE_PIXELS; + int size = MIN(width & ~(width - 1), height & ~(height - 1)); + + if (height != size) { + // For SMP and AMP blocks the situation is equivalent to a square block + // at the lower left corner of the PU. + y = y + height - size; + } + while (size < LCU_WIDTH) { + const int parent_size = 2 * size; + const int cu_index = (x % parent_size != 0) + 2 * (y % parent_size != 0); + switch (cu_index) { + case 0: + // A0 is in the CU directly left of the parent CU so it has been + // coded already. 
+ // +---+---+ + // | X | | + // |---+---+ + // A0 | | | + // +---+---+ + return true; + + case 1: + // A0 is in the CU that will be coded after the current CU. + // +---+---+ + // | | X | + // |---+---+ + // |A0 | | + // +---+---+ + return false; + + case 2: + // +---+---+ + // | | | + // |---+---+ + // | X | | + // +---+---+ + // A0 + + // Move to the parent block. + y -= size; + size = parent_size; + break; + + case 3: + // A0 is in the CU directly down of the parent CU so is has not + // been coded yet. + // +---+---+ + // | | | + // |---+---+ + // | | X | + // +---+---+ + // A0 + return false; + } + } + + // For 64x64 blocks A0 candidate is located outside the LCU. + return false; +} + +/** + * \brief Check whether b0 mv cand block is coded before the current block. + * \param x x-coordinate of the current block (in pixels) + * \param y y-coordinate of the current block (in pixels) + * \param width width of the current block (in pixels) + * \param height height of the current block (in pixels) + * \return True, if the b0 mv candidate block is coded before the + * current block. Otherwise false. + */ +static bool is_b0_cand_coded(int x, int y, int width, int height) +{ + int size = MIN(width & ~(width - 1), height & ~(height - 1)); + + if (width != size) { + // For SMP and AMP blocks the situation is equivalent to a square block + // at the upper right corner of the PU. + x = x + width - size; + } + + while (size < LCU_WIDTH) { + const int parent_size = 2 * size; + const int cu_index = (x % parent_size != 0) + 2 * (y % parent_size != 0); + switch (cu_index) { + case 0: + // B0 is in the CU directly above the parent CU so it has been + // coded already. + // B0 + // +---+---+ + // | X | | + // |---+---+ + // | | | + // +---+---+ + return true; + + case 1: + // B0 + // +---+---+ + // | | X | + // |---+---+ + // | | | + // +---+---+ + + // Move to the parent block. 
+ x -= size; + size = parent_size; + break; + + case 2: + // +---+---+ + // | |B0 | + // |---+---+ + // | X | | + // +---+---+ + return true; + + case 3: + // B0 is in the CU directly right of the parent CU so is has not + // been coded yet. + // +---+---+ + // | | | B0 + // |---+---+ + // | | X | + // +---+---+ + return false; + } + } + + // The LCU to the right and up of the current LCU has been coded already. + return true; +} + + +/** +* \brief Get merge candidates for current block +* \param encoder encoder control struct to use +* \param x block x position in SCU +* \param y block y position in SCU +* \param width current block width +* \param height current block height +* \param H candidate H +* \param C1 candidate C1 +*/ +static void kvz_inter_get_temporal_merge_candidates(const encoder_state_t * const state, + int32_t x, + int32_t y, + int32_t width, + int32_t height, + cu_info_t **C3, + cu_info_t **H) { + /* + Predictor block locations + _________ + |CurrentPU| + | |C0|__ | + | |C3| | + |_________|_ + |H| + */ + + *C3 = NULL; + *H = NULL; + + // Find temporal reference, closest POC + if (state->frame->ref->used_size) { + uint32_t poc_diff = UINT_MAX; + int32_t closest_ref = 0; + + for (int temporal_cand = 0; temporal_cand < state->frame->ref->used_size; temporal_cand++) { + int td = state->frame->poc - state->frame->ref->pocs[temporal_cand]; + + td = td < 0 ? 
-td : td; + if (td < poc_diff) { + closest_ref = temporal_cand; + poc_diff = td; + } + } + + cu_array_t *ref_cu_array = state->frame->ref->cu_arrays[closest_ref]; + int cu_per_width = ref_cu_array->width / SCU_WIDTH; + + uint32_t xColBr = x + width; + uint32_t yColBr = y + height; + + // H must be available + if (xColBr < state->encoder_control->in.width && + yColBr < state->encoder_control->in.height) { + int32_t H_offset = -1; + + // Y inside the current CTU / LCU + if (yColBr % LCU_WIDTH != 0) { + H_offset = ((xColBr >> 4) << 4) / SCU_WIDTH + + (((yColBr >> 4) << 4) / SCU_WIDTH) * cu_per_width; + } + + if (H_offset >= 0) { + // Only use when it's inter block + if (ref_cu_array->data[H_offset].type == CU_INTER) { + *H = &ref_cu_array->data[H_offset]; + } + } + } + uint32_t xColCtr = x + (width / 2); + uint32_t yColCtr = y + (height / 2); + + // C3 must be inside the LCU, in the center position of current CU + if (xColCtr < state->encoder_control->in.width && yColCtr < state->encoder_control->in.height) { + uint32_t C3_offset = ((xColCtr >> 4) << 4) / SCU_WIDTH + ((((yColCtr >> 4) << 4) / SCU_WIDTH) * cu_per_width); + if (ref_cu_array->data[C3_offset].type == CU_INTER) { + *C3 = &ref_cu_array->data[C3_offset]; + } + } + } +} + +/** + * \brief Get merge candidates for current block. + * + * The output parameters b0, b1, b2, a0, a1 are pointed to the + * corresponding cu_info_t struct in lcu->cu, or set to NULL, if the + * candidate is not available. + * + * \param x block x position in pixels + * \param y block y position in pixels + * \param width block width in pixels + * \param height block height in pixels + * \param picture_width tile width in pixels + * \param picture_height tile height in pixels + * \param b0 Returns the b0 candidate. + * \param b1 Returns the b1 candidate. + * \param b2 Returns the b2 candidate. + * \param a0 Returns the a0 candidate. + * \param a1 Returns the a1 candidate. 
+ * \param lcu current LCU + */ +static void get_spatial_merge_candidates(int32_t x, + int32_t y, + int32_t width, + int32_t height, + int32_t picture_width, + int32_t picture_height, + cu_info_t **b0, + cu_info_t **b1, + cu_info_t **b2, + cu_info_t **a0, + cu_info_t **a1, + lcu_t *lcu) +{ /* Predictor block locations ____ _______ @@ -532,11 +766,11 @@ |A1|_________| |A0| */ - int32_t x_cu = SUB_SCU(x) >> MAX_DEPTH; //!< coordinates from top-left of this LCU - int32_t y_cu = SUB_SCU(y) >> MAX_DEPTH; + int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU + int32_t y_local = SUB_SCU(y); // A0 and A1 availability testing if (x != 0) { - *a1 = LCU_GET_CU(lcu, x_cu - 1, y_cu + height_in_scu - 1); + *a1 = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local + height - 1); // Do not check (*a1)->coded because the block above is always coded before // the current one and the flag is not set when searching an SMP block. if ((*a1)->type == CU_INTER) { @@ -545,9 +779,9 @@ *a1 = NULL; } - if (y_cu + height_in_scu < LCU_WIDTH>>3) { - *a0 = LCU_GET_CU(lcu, x_cu - 1, y_cu + height_in_scu); - if ((*a0)->coded && (*a0)->type == CU_INTER) { + if (y_local + height < LCU_WIDTH && y + height < picture_height) { + *a0 = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local + height); + if ((*a0)->type == CU_INTER && is_a0_cand_coded(x, y, width, height)) { inter_clear_cu_unused(*a0); } else { *a0 = NULL; @@ -557,19 +791,21 @@ // B0, B1 and B2 availability testing if (y != 0) { - if (x_cu + width_in_scu < LCU_WIDTH>>3) { - *b0 = LCU_GET_CU(lcu, x_cu + width_in_scu, y_cu - 1); - } else if (y_cu == 0) { - // Special case, top-right CU - *b0 = LCU_GET_TOP_RIGHT_CU(lcu); + if (x + width < picture_width) { + if (x_local + width < LCU_WIDTH) { + *b0 = LCU_GET_CU_AT_PX(lcu, x_local + width, y_local - 1); + } else if (y_local == 0) { + // Special case, top-right CU + *b0 = LCU_GET_TOP_RIGHT_CU(lcu); + } } - if ((*b0) && (*b0)->coded && (*b0)->type == CU_INTER) { + if ((*b0) && (*b0)->type == 
CU_INTER && is_b0_cand_coded(x, y, width, height)) { inter_clear_cu_unused(*b0); } else { *b0 = NULL; } - *b1 = LCU_GET_CU(lcu, x_cu + width_in_scu - 1, y_cu - 1); + *b1 = LCU_GET_CU_AT_PX(lcu, x_local + width - 1, y_local - 1); // Do not check (*b1)->coded because the block to the left is always coded // before the current one and the flag is not set when searching an SMP // block. @@ -580,7 +816,7 @@ } if (x != 0) { - *b2 = LCU_GET_CU(lcu, x_cu - 1, y_cu - 1); + *b2 = LCU_GET_CU_AT_PX(lcu, x_local - 1, y_local - 1); // Do not check (*b2)->coded because the block above and to the left is // always coded before the current one. if ((*b2)->type == CU_INTER) { @@ -593,35 +829,118 @@ } /** - * \brief Get MV prediction for current block - * \param encoder encoder control struct to use - * \param x_cu block x position in SCU - * \param y_cu block y position in SCU - * \param width current block width - * \param height current block height - * \param mv_cand[2][2] return the motion vector candidates + * \brief Get merge candidates for current block. + * + * The output parameters b0, b1, b2, a0, a1 are pointed to the + * corresponding cu_info_t struct in lcu->cu, or set to NULL, if the + * candidate is not available. + * + * \param cua cu information + * \param x block x position in pixels + * \param y block y position in pixels + * \param width block width in pixels + * \param height block height in pixels + * \param picture_width tile width in pixels + * \param picture_height tile height in pixels + * \param b0 Returns the b0 candidate. + * \param b1 Returns the b1 candidate. + * \param b2 Returns the b2 candidate. + * \param a0 Returns the a0 candidate. + * \param a1 Returns the a1 candidate. 
*/ -void kvz_inter_get_mv_cand(const encoder_state_t * const state, - int32_t x, - int32_t y, - int32_t width, - int32_t height, - int16_t mv_cand[2][2], - cu_info_t* cur_cu, - lcu_t *lcu, - int8_t reflist) +static void get_spatial_merge_candidates_cua(const cu_array_t *cua, + int32_t x, + int32_t y, + int32_t width, + int32_t height, + int32_t picture_width, + int32_t picture_height, + const cu_info_t **b0, + const cu_info_t **b1, + const cu_info_t **b2, + const cu_info_t **a0, + const cu_info_t **a1) +{ + /* + Predictor block locations + ____ _______ + |B2|______|B1|B0| + | | + | Cur CU | + __| | + |A1|_________| + |A0| + */ + int32_t x_local = SUB_SCU(x); //!< coordinates from top-left of this LCU + int32_t y_local = SUB_SCU(y); + // A0 and A1 availability testing + if (x != 0) { + *a1 = kvz_cu_array_at_const(cua, x - 1, y + height - 1); + // The block above is always coded before the current one. + if ((*a1)->type != CU_INTER) { + *a1 = NULL; + } + + if (y_local + height < LCU_WIDTH && y + height < picture_height) { + *a0 = kvz_cu_array_at_const(cua, x - 1, y + height); + if ((*a0)->type != CU_INTER || !is_a0_cand_coded(x, y, width, height)) { + *a0 = NULL; + } + } + } + + // B0, B1 and B2 availability testing + if (y != 0) { + if (x + width < picture_width && (x_local + width < LCU_WIDTH || y_local == 0)) { + *b0 = kvz_cu_array_at_const(cua, x + width, y - 1); + if ((*b0)->type != CU_INTER || !is_b0_cand_coded(x, y, width, height)) { + *b0 = NULL; + } + } + + *b1 = kvz_cu_array_at_const(cua, x + width - 1, y - 1); + // The block to the left is always coded before the current one. + if ((*b1)->type != CU_INTER) { + *b1 = NULL; + } + + if (x != 0) { + *b2 = kvz_cu_array_at_const(cua, x - 1, y - 1); + // The block above and to the left is always coded before the current + // one. + if ((*b2)->type != CU_INTER) { + *b2 = NULL; + } + } + } +} + +/** + * \brief Pick two mv candidates from the spatial candidates. 
+ */ +static void get_mv_cand_from_spatial(const encoder_state_t * const state, + int32_t x, + int32_t y, + int32_t width, + int32_t height, + const cu_info_t *b0, + const cu_info_t *b1, + const cu_info_t *b2, + const cu_info_t *a0, + const cu_info_t *a1, + const cu_info_t *c3, + const cu_info_t *h, + const cu_info_t *cur_cu, + int8_t reflist, + int16_t mv_cand[2][2]) { uint8_t candidates = 0; uint8_t b_candidates = 0; int8_t reflist2nd = !reflist; - cu_info_t *b0, *b1, *b2, *a0, *a1; - b0 = b1 = b2 = a0 = a1 = NULL; - kvz_inter_get_spatial_merge_candidates(x, y, width, height, &b0, &b1, &b2, &a0, &a1, lcu); - #define CALCULATE_SCALE(cu,tb,td) ((tb * ((0x4000 + (abs(td)>>1))/td) + 32) >> 6) -#define APPLY_MV_SCALING(cu, cand, list) {int td = state->global->poc - state->global->ref->pocs[(cu)->inter.mv_ref[list]];\ - int tb = state->global->poc - state->global->ref->pocs[cur_cu->inter.mv_ref[reflist]];\ +#define APPLY_MV_SCALING(cu, cand, list) {int td = state->frame->poc - state->frame->ref->pocs[(cu)->inter.mv_ref[list]];\ + int tb = state->frame->poc - state->frame->ref->pocs[cur_cu->inter.mv_ref[reflist]];\ if (td != tb) { \ int scale = CALCULATE_SCALE(cu,tb,td); \ mv_cand[cand][0] = ((scale * (cu)->inter.mv[list][0] + 127 + (scale * (cu)->inter.mv[list][0] < 0)) >> 8 ); \ @@ -766,11 +1085,44 @@ candidates = 1; } -#if ENABLE_TEMPORAL_MVP - if(candidates < AMVP_MAX_NUM_CANDS) { - //TODO: add temporal mv predictor + if (state->encoder_control->cfg->tmvp_enable) { + /* + Predictor block locations + _________ + |CurrentPU| + | |C0|__ | + | |C3| | + |_________|_ + |H| + */ + + // Find temporal reference, closest POC + if (state->frame->poc > 1 && state->frame->ref->used_size && candidates < AMVP_MAX_NUM_CANDS) { + uint32_t poc_diff = UINT_MAX; + + for (int temporal_cand = 0; temporal_cand < state->frame->ref->used_size; temporal_cand++) { + int td = state->frame->poc - state->frame->ref->pocs[temporal_cand]; + td = td < 0 ? 
-td : td; + if (td < poc_diff) { + poc_diff = td; + } + } + + const cu_info_t *selected_CU = (h != NULL) ? h : (c3 != NULL) ? c3 : NULL; + + if (selected_CU) { + int td = selected_CU->inter.mv_ref[reflist] + 1; + int tb = cur_cu->inter.mv_ref[reflist] + 1; + + int scale = CALCULATE_SCALE(NULL, tb, td); + mv_cand[candidates][0] = ((scale * selected_CU->inter.mv[0][0] + 127 + (scale * selected_CU->inter.mv[0][0] < 0)) >> 8); + mv_cand[candidates][1] = ((scale * selected_CU->inter.mv[0][1] + 127 + (scale * selected_CU->inter.mv[0][1] < 0)) >> 8); + + candidates++; + } +#undef CALCULATE_SCALE + } } -#endif // Fill with (0,0) while (candidates < AMVP_MAX_NUM_CANDS) { @@ -783,6 +1135,72 @@ } /** + * \brief Get MV prediction for current block. + * + * \param state encoder state + * \param x block x position in pixels + * \param y block y position in pixels + * \param width block width in pixels + * \param height block height in pixels + * \param mv_cand Return the motion vector candidates. + * \param cur_cu current CU + * \param lcu current LCU + * \param reflist reflist index (either 0 or 1) + */ +void kvz_inter_get_mv_cand(const encoder_state_t * const state, + int32_t x, + int32_t y, + int32_t width, + int32_t height, + int16_t mv_cand[2][2], + cu_info_t* cur_cu, + lcu_t *lcu, + int8_t reflist) +{ + cu_info_t *b0, *b1, *b2, *a0, *a1, *c3, *h; + b0 = b1 = b2 = a0 = a1 = c3 = h = NULL; + get_spatial_merge_candidates(x, y, width, height, + state->tile->frame->width, state->tile->frame->height, + &b0, &b1, &b2, &a0, &a1, lcu); + kvz_inter_get_temporal_merge_candidates(state, x, y, width, height, &c3, &h); + get_mv_cand_from_spatial(state, x, y, width, height, b0, b1, b2, a0, a1, c3, h, cur_cu, reflist, mv_cand); +} + +/** + * \brief Get MV prediction for current block using state->tile->frame->cu_array. 
+ * + * \param state encoder state + * \param x block x position in pixels + * \param y block y position in pixels + * \param width block width in pixels + * \param height block height in pixels + * \param mv_cand Return the motion vector candidates. + * \param cur_cu current CU + * \param reflist reflist index (either 0 or 1) + */ +void kvz_inter_get_mv_cand_cua(const encoder_state_t * const state, + int32_t x, + int32_t y, + int32_t width, + int32_t height, + int16_t mv_cand[2][2], + const cu_info_t* cur_cu, + int8_t reflist) +{ + const cu_info_t *b0, *b1, *b2, *a0, *a1; + cu_info_t *c3, *h; + b0 = b1 = b2 = a0 = a1 = c3 = h = NULL; + + const cu_array_t *cua = state->tile->frame->cu_array; + get_spatial_merge_candidates_cua(cua, + x, y, width, height, + state->tile->frame->width, state->tile->frame->height, + &b0, &b1, &b2, &a0, &a1); + kvz_inter_get_temporal_merge_candidates(state, x, y, width, height, &c3, &h); + get_mv_cand_from_spatial(state, x, y, width, height, b0, b1, b2, a0, a1, c3, h, cur_cu, reflist, mv_cand); +} + +/** * \brief Get merge predictions for current block * \param state the encoder state * \param x block x position in SCU @@ -808,7 +1226,9 @@ cu_info_t *b0, *b1, *b2, *a0, *a1; int8_t zero_idx = 0; b0 = b1 = b2 = a0 = a1 = NULL; - kvz_inter_get_spatial_merge_candidates(x, y, width, height, &b0, &b1, &b2, &a0, &a1, lcu); + get_spatial_merge_candidates(x, y, width, height, + state->tile->frame->width, state->tile->frame->height, + &b0, &b1, &b2, &a0, &a1, lcu); if (!use_a1) a1 = NULL; if (!use_b1) b1 = NULL; @@ -896,14 +1316,41 @@ } } } + + if (state->encoder_control->cfg->tmvp_enable) { +#define CALCULATE_SCALE(cu,tb,td) ((tb * ((0x4000 + (abs(td)>>1))/td) + 32) >> 6) + + if (candidates < MRG_MAX_NUM_CANDS && state->frame->ref->used_size) { + + cu_info_t *c3 = NULL; + cu_info_t *h = NULL; + + kvz_inter_get_temporal_merge_candidates(state, x, y, width, height, &c3, &h); + + const cu_info_t *selected_CU = (h != NULL) ? h : (c3 != NULL) ? 
c3 : NULL; -#if ENABLE_TEMPORAL_MVP - if(candidates < AMVP_MAX_NUM_CANDS) { - //TODO: add temporal mv predictor + if (selected_CU) { + int td = selected_CU->inter.mv_ref[0] + 1; + int tb = 1; + + int scale = CALCULATE_SCALE(NULL, tb, td); + mv_cand[candidates].mv[0][0] = ((scale * selected_CU->inter.mv[0][0] + 127 + (scale * selected_CU->inter.mv[0][0] < 0)) >> 8); + mv_cand[candidates].mv[0][1] = ((scale * selected_CU->inter.mv[0][1] + 127 + (scale * selected_CU->inter.mv[0][1] < 0)) >> 8); + + /* + ToDo: temporal prediction in B-pictures + mv_cand[candidates].mv[1][0] = selected_CU->inter.mv[1][0]; + mv_cand[candidates].mv[1][1] = selected_CU->inter.mv[1][1]; + */ + mv_cand[candidates].dir = selected_CU->inter.mv_dir; + mv_cand[candidates].ref[0] = 0; + candidates++; + } + } +#undef CALCULATE_SCALE } -#endif - if (candidates < MRG_MAX_NUM_CANDS && state->global->slicetype == KVZ_SLICE_B) { + if (candidates < MRG_MAX_NUM_CANDS && state->frame->slicetype == KVZ_SLICE_B) { #define NUM_PRIORITY_LIST 12; static const uint8_t priorityList0[] = { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 }; static const uint8_t priorityList1[] = { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 }; @@ -936,14 +1383,14 @@ } } - int num_ref = state->global->ref->used_size; + int num_ref = state->frame->ref->used_size; - if (candidates < MRG_MAX_NUM_CANDS && state->global->slicetype == KVZ_SLICE_B) { + if (candidates < MRG_MAX_NUM_CANDS && state->frame->slicetype == KVZ_SLICE_B) { int j; int ref_negative = 0; int ref_positive = 0; - for (j = 0; j < state->global->ref->used_size; j++) { - if (state->global->ref->pocs[j] < state->global->poc) { + for (j = 0; j < state->frame->ref->used_size; j++) { + if (state->frame->ref->pocs[j] < state->frame->poc) { ref_negative++; } else { ref_positive++; @@ -959,7 +1406,7 @@ mv_cand[candidates].ref[0] = (zero_idx>=num_ref-1)?0:zero_idx; mv_cand[candidates].ref[1] = mv_cand[candidates].ref[0]; mv_cand[candidates].dir = 1; - if (state->global->slicetype == KVZ_SLICE_B) { + 
if (state->frame->slicetype == KVZ_SLICE_B) { mv_cand[candidates].mv[1][0] = 0; mv_cand[candidates].mv[1][1] = 0; mv_cand[candidates].dir = 3;
kvazaar-0.8.3.tar.gz/src/inter.h -> kvazaar-1.0.0.tar.gz/src/inter.h
Changed
@@ -26,11 +26,12 @@
  * Inter prediction.
  */

-#include "global.h"
-
-#include "image.h"
-#include "encoder.h"
+#include "cu.h"
 #include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep
+#include "image.h"
+#include "kvazaar.h"
+

 typedef struct {
   uint8_t dir;
@@ -60,17 +61,6 @@
                      int16_t mv_param[2][2],
                      lcu_t* lcu);

-void kvz_inter_get_spatial_merge_candidates(int32_t x,
-                                            int32_t y,
-                                            int32_t width,
-                                            int32_t height,
-                                            cu_info_t **b0,
-                                            cu_info_t **b1,
-                                            cu_info_t **b2,
-                                            cu_info_t **a0,
-                                            cu_info_t **a1,
-                                            lcu_t *lcu);
-
 void kvz_inter_get_mv_cand(const encoder_state_t * const state,
                            int32_t x,
                            int32_t y,
@@ -81,6 +71,15 @@
                            lcu_t *lcu,
                            int8_t reflist);

+void kvz_inter_get_mv_cand_cua(const encoder_state_t * const state,
+                               int32_t x,
+                               int32_t y,
+                               int32_t width,
+                               int32_t height,
+                               int16_t mv_cand[2][2],
+                               const cu_info_t* cur_cu,
+                               int8_t reflist);
+
 uint8_t kvz_inter_get_merge_cand(const encoder_state_t * const state,
                                  int32_t x, int32_t y,
                                  int32_t width, int32_t height,
kvazaar-0.8.3.tar.gz/src/intra.c -> kvazaar-1.0.0.tar.gz/src/intra.c
Changed
@@ -20,48 +20,33 @@ #include "intra.h" -#include <assert.h> -#include <stdio.h> #include <stdlib.h> -#include "encoder.h" -#include "transform.h" +#include "image.h" +#include "kvz_math.h" #include "strategies/strategies-intra.h" -#include "strategies/strategies-picture.h" +#include "tables.h" +#include "transform.h" +#include "videoframe.h" int8_t kvz_intra_get_dir_luma_predictor( const uint32_t x, const uint32_t y, int8_t *preds, - const cu_info_t *const cur_cu, - const cu_info_t *const left_cu, - const cu_info_t *const above_cu) + const cu_info_t *const cur_pu, + const cu_info_t *const left_pu, + const cu_info_t *const above_pu) { - int y_cu = y>>3; - // The default mode if block is not coded yet is INTRA_DC. int8_t left_intra_dir = 1; - int8_t above_intra_dir = 1; - - if (x & 4) { - // If current CU is NxN and PU is on the right half, take mode from the - // left half of the same CU. - left_intra_dir = cur_cu->intra[PU_INDEX(0, y >> 2)].mode; - } else if (left_cu && left_cu->type == CU_INTRA) { - // Otherwise take the mode from the right side of the CU on the left. - left_intra_dir = left_cu->intra[PU_INDEX(1, y >> 2)].mode; + if (left_pu && left_pu->type == CU_INTRA) { + left_intra_dir = left_pu->intra.mode; } - if (y & 4) { - // If current CU is NxN and PU is on the bottom half, take mode from the - // top half of the same CU. - above_intra_dir = cur_cu->intra[PU_INDEX(x >> 2, 0)].mode; - } else if (above_cu && above_cu->type == CU_INTRA && - (y_cu * (LCU_WIDTH>>MAX_DEPTH)) % LCU_WIDTH != 0) - { - // Otherwise take the mode from the bottom half of the CU above. 
- above_intra_dir = above_cu->intra[PU_INDEX(x >> 2, 1)].mode; + int8_t above_intra_dir = 1; + if (above_pu && above_pu->type == CU_INTRA && y % LCU_WIDTH != 0) { + above_intra_dir = above_pu->intra.mode; } // If the predictions are the same, add new predictions @@ -213,7 +198,8 @@ int_fast8_t log2_width, int_fast8_t mode, color_t color, - kvz_pixel *dst) + kvz_pixel *dst, + bool filter_boundary) { const int_fast8_t width = 1 << log2_width; @@ -227,7 +213,7 @@ // Angular modes use smoothed reference pixels, unless the mode is close // to being either vertical or horizontal. static const int kvz_intra_hor_ver_dist_thres[5] = { 0, 7, 1, 0, 0 }; - int filter_threshold = kvz_intra_hor_ver_dist_thres[g_to_bits[width]]; + int filter_threshold = kvz_intra_hor_ver_dist_thres[kvz_math_floor_log2(width) - 2]; int dist_from_vert_or_hor = MIN(abs(mode - 26), abs(mode - 10)); if (dist_from_vert_or_hor > filter_threshold) { used_ref = &refs->filtered_ref; @@ -249,7 +235,7 @@ } } else { kvz_angular_pred(log2_width, mode, used_ref->top, used_ref->left, dst); - if (color == COLOR_Y && width < 32) { + if (color == COLOR_Y && width < 32 && filter_boundary) { if (mode == 10) { intra_post_process_angular(width, 1, used_ref->top, dst); } else if (mode == 26) { @@ -449,12 +435,12 @@ kvz_intra_recon_lcu_luma(state, x + offset, y + offset, depth+1, intra_mode, NULL, lcu); if (depth < MAX_DEPTH) { - cu_info_t *cu_a = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y); - cu_info_t *cu_b = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y + offset); - cu_info_t *cu_c = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset); - if (cbf_is_set(cu_a->cbf.y, depth+1) || cbf_is_set(cu_b->cbf.y, depth+1) || cbf_is_set(cu_c->cbf.y, depth+1)) { - cbf_set(&cur_cu->cbf.y, depth); - } + uint16_t child_cbfs[3] = { + LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y )->cbf, + LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y + offset)->cbf, + LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf, + }; + 
cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_Y); } return; @@ -468,7 +454,9 @@ kvz_intra_build_reference(log2_width, COLOR_Y, &luma_px, &pic_px, lcu, &refs); kvz_pixel pred[32 * 32]; - kvz_intra_predict(&refs, log2_width, intra_mode, COLOR_Y, pred); + const kvz_config *cfg = state->encoder_control->cfg; + bool filter_boundary = !(cfg->lossless && cfg->implicit_rdpcm); + kvz_intra_predict(&refs, log2_width, intra_mode, COLOR_Y, pred, filter_boundary); kvz_pixel *block_in_lcu = &lcu->rec.y[lcu_px.x + lcu_px.y * LCU_WIDTH]; kvz_pixels_blit(pred, block_in_lcu, width, width, width, LCU_WIDTH); @@ -502,18 +490,15 @@ kvz_intra_recon_lcu_chroma(state, x, y + offset, depth+1, intra_mode, NULL, lcu); kvz_intra_recon_lcu_chroma(state, x + offset, y + offset, depth+1, intra_mode, NULL, lcu); - if (depth < MAX_DEPTH) { - cu_info_t *cu_a = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y); - cu_info_t *cu_b = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y + offset); - cu_info_t *cu_c = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset); - if (cbf_is_set(cu_a->cbf.u, depth+1) || cbf_is_set(cu_b->cbf.u, depth+1) || cbf_is_set(cu_c->cbf.u, depth+1)) { - cbf_set(&cur_cu->cbf.u, depth); - } - if (cbf_is_set(cu_a->cbf.v, depth+1) || cbf_is_set(cu_b->cbf.v, depth+1) || cbf_is_set(cu_c->cbf.v, depth+1)) { - cbf_set(&cur_cu->cbf.v, depth); - } + if (depth <= MAX_DEPTH) { + uint16_t child_cbfs[3] = { + LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y )->cbf, + LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y + offset)->cbf, + LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf, + }; + cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_U); + cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_V); } - return; } @@ -528,7 +513,7 @@ kvz_intra_build_reference(log2_width_c, COLOR_U, &luma_px, &pic_px, lcu, &refs); kvz_pixel pred[32 * 32]; - kvz_intra_predict(&refs, log2_width_c, intra_mode, COLOR_U, pred); + kvz_intra_predict(&refs, log2_width_c, 
intra_mode, COLOR_U, pred, false); kvz_pixel *pu_in_lcu = &lcu->rec.u[lcu_px.x / 2 + (lcu_px.y * LCU_WIDTH) / 4]; kvz_pixels_blit(pred, pu_in_lcu, width_c, width_c, width_c, LCU_WIDTH_C); @@ -540,7 +525,7 @@ kvz_intra_build_reference(log2_width_c, COLOR_V, &luma_px, &pic_px, lcu, &refs); kvz_pixel pred[32 * 32]; - kvz_intra_predict(&refs, log2_width_c, intra_mode, COLOR_V, pred); + kvz_intra_predict(&refs, log2_width_c, intra_mode, COLOR_V, pred, false); kvz_pixel *pu_in_lcu = &lcu->rec.v[lcu_px.x / 2 + (lcu_px.y * LCU_WIDTH) / 4]; kvz_pixels_blit(pred, pu_in_lcu, width_c, width_c, width_c, LCU_WIDTH_C);
kvazaar-0.8.3.tar.gz/src/intra.h -> kvazaar-1.0.0.tar.gz/src/intra.h
Changed
@@ -26,9 +26,11 @@
  * Intra prediction.
  */

-#include "global.h"
-
+#include "cu.h"
 #include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
+

 typedef struct
 {
   kvz_pixel left[2 * 32 + 1];
@@ -44,19 +46,21 @@

 /**
 * \brief Function for deriving intra luma predictions
-* \param pic picture to use
-* \param x_cu x CU position (smallest CU)
-* \param y_cu y CU position (smallest CU)
-* \param preds output buffer for 3 predictions
-* \returns (predictions are found)?1:0
+* \param x        x-coordinate of the PU in pixels
+* \param y        y-coordinate of the PU in pixels
+* \param preds    output buffer for 3 predictions
+* \param cur_pu   PU to check
+* \param left_pu  PU to the left of cur_pu
+* \param above_pu PU above cur_pu
+* \returns 1 if predictions are found, otherwise 0
 */
 int8_t kvz_intra_get_dir_luma_predictor(
   const uint32_t x,
   const uint32_t y,
   int8_t *preds,
-  const cu_info_t *const cur_cu,
-  const cu_info_t *const left_cu,
-  const cu_info_t *const above_cu);
+  const cu_info_t *const cur_pu,
+  const cu_info_t *const left_pu,
+  const cu_info_t *const above_pu);

 /**
  * \brief Generage angular predictions.
@@ -78,18 +82,20 @@

 /**
  * \brief Generate intra predictions.
- * \param refs Reference pixels used for the prediction.
- * \param log2_width Width of the predicted block.
- * \param mode Intra mode used for the prediction.
- * \param color Color of the prediction.
- * \param dst Buffer for the predicted pixels.
+ * \param refs            Reference pixels used for the prediction.
+ * \param log2_width      Width of the predicted block.
+ * \param mode            Intra mode used for the prediction.
+ * \param color           Color of the prediction.
+ * \param dst             Buffer for the predicted pixels.
+ * \param filter_boundary Whether to filter the boundary on modes 10 and 26.
  */
 void kvz_intra_predict(
   kvz_intra_references *refs,
   int_fast8_t log2_width,
   int_fast8_t mode,
   color_t color,
-  kvz_pixel *dst);
+  kvz_pixel *dst,
+  bool filter_boundary);

 /**
  * \brief Do a full intra prediction cycle on a CU in lcu for luma.
kvazaar-0.8.3.tar.gz/src/kvazaar.c -> kvazaar-1.0.0.tar.gz/src/kvazaar.c
Changed
@@ -18,17 +18,26 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ -#include "kvazaar_internal.h" +#include "kvazaar.h" +#include <stdio.h> #include <stdlib.h> +#include <string.h> +#include "bitstream.h" #include "cfg.h" +#include "checkpoint.h" #include "encoder.h" -#include "strategyselector.h" +#include "encoder_state-bitstream.h" +#include "encoder_state-ctors_dtors.h" #include "encoderstate.h" -#include "checkpoint.h" -#include "bitstream.h" +#include "global.h" +#include "image.h" #include "input_frame_buffer.h" +#include "kvazaar_internal.h" +#include "strategyselector.h" +#include "threadqueue.h" +#include "videoframe.h" static void kvazaar_close(kvz_encoder *encoder) @@ -66,7 +75,9 @@ goto kvazaar_open_failure; } - encoder->control = kvz_encoder_control_init(cfg); + // FIXME: const qualifier disgarded. I don't want to change kvazaar_open + // but I really need to change cfg. + encoder->control = kvz_encoder_control_init((kvz_config*)cfg); if (!encoder->control) { goto kvazaar_open_failure; } @@ -91,7 +102,7 @@ goto kvazaar_open_failure; } - encoder->states[i].global->QP = (int8_t)cfg->qp; + encoder->states[i].frame->QP = (int8_t)cfg->qp; } for (int i = 0; i < encoder->num_encoder_states; ++i) { @@ -103,7 +114,7 @@ kvz_encoder_state_match_children_of_previous_frame(&encoder->states[i]); } - encoder->states[encoder->cur_state_num].global->frame = -1; + encoder->states[encoder->cur_state_num].frame->num = -1; return encoder; @@ -115,10 +126,10 @@ static void set_frame_info(kvz_frame_info *const info, const encoder_state_t *const state) { - info->poc = state->global->poc, - info->qp = state->global->QP; - info->nal_unit_type = state->global->pictype; - info->slice_type = state->global->slicetype; + info->poc = state->frame->poc, + info->qp = state->frame->QP; + info->nal_unit_type = state->frame->pictype; + info->slice_type = state->frame->slicetype; 
kvz_encoder_get_ref_lists(state, info->ref_list_len, info->ref_list); } @@ -203,18 +214,19 @@ encoder_state_t *state = &enc->states[enc->cur_state_num]; if (!state->prepared) { - kvz_encoder_next_frame(state); + kvz_encoder_prepare(state); } if (pic_in != NULL) { // FIXME: The frame number printed here is wrong when GOP is enabled. - CHECKPOINT_MARK("read source frame: %d", state->global->frame + enc->control->cfg->seek); + CHECKPOINT_MARK("read source frame: %d", state->frame->num + enc->control->cfg->seek); } - if (kvz_encoder_feed_frame(&enc->input_buffer, state, pic_in)) { - assert(state->global->frame == enc->frames_started); + kvz_picture* frame = kvz_encoder_feed_frame(&enc->input_buffer, state, pic_in); + if (frame) { + assert(state->frame->num == enc->frames_started); // Start encoding. - kvz_encode_one_frame(state); + kvz_encode_one_frame(state, frame); enc->frames_started += 1; } @@ -274,14 +286,14 @@ struct { kvz_data_chunk* data_out; uint32_t len_out; - } first = { 0 }, second = { 0 }; + } first = { 0, 0 }, second = { 0, 0 }; if (pic_in != NULL) { - first_field = kvz_image_alloc(state->encoder_control->in.width, state->encoder_control->in.height); + first_field = kvz_image_alloc(state->encoder_control->chroma_format, state->encoder_control->in.width, state->encoder_control->in.height); if (first_field == NULL) { goto kvazaar_field_encoding_adapter_failure; } - second_field = kvz_image_alloc(state->encoder_control->in.width, state->encoder_control->in.height); + second_field = kvz_image_alloc(state->encoder_control->chroma_format, state->encoder_control->in.width, state->encoder_control->in.height); if (second_field == NULL) { goto kvazaar_field_encoding_adapter_failure; } @@ -345,7 +357,7 @@ .config_destroy = kvz_config_destroy, .config_parse = kvz_config_parse, - .picture_alloc = kvz_image_alloc, + .picture_alloc = kvz_image_alloc_420, .picture_free = kvz_image_free, .chunk_free = kvz_bitstream_free_chunks, @@ -354,6 +366,8 @@ .encoder_close = 
kvazaar_close, .encoder_headers = kvazaar_headers, .encoder_encode = kvazaar_field_encoding_adapter, + + .picture_alloc_csp = kvz_image_alloc, };
kvazaar-0.8.3.tar.gz/src/kvazaar.h -> kvazaar-1.0.0.tar.gz/src/kvazaar.h
Changed
@@ -26,9 +26,9 @@ * This file defines the public API of Kvazaar when used as a library. */ -#include <stddef.h> #include <stdint.h> + #ifdef __cplusplus extern "C" { #endif @@ -88,6 +88,10 @@ KVZ_IME_HEXBS = 0, KVZ_IME_TZ = 1, KVZ_IME_FULL = 2, + KVZ_IME_FULL8 = 3, //! \since 3.6.0 + KVZ_IME_FULL16 = 4, //! \since 3.6.0 + KVZ_IME_FULL32 = 5, //! \since 3.6.0 + KVZ_IME_FULL64 = 6, //! \since 3.6.0 }; /** @@ -102,6 +106,92 @@ }; /** +* \brief Constrain movement vectors. +* \since 3.3.0 +*/ +enum kvz_mv_constraint +{ + KVZ_MV_CONSTRAIN_NONE = 0, + KVZ_MV_CONSTRAIN_FRAME = 1, // Don't refer outside the frame. + KVZ_MV_CONSTRAIN_TILE = 2, // Don't refer to other tiles. + KVZ_MV_CONSTRAIN_FRAME_AND_TILE = 3, // Don't refer outside the tile. + KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN = 4, // Keep enough margin for fractional pixel margins not to refer outside the tile. +}; + +/** +* \brief Constrain movement vectors. +* \since 3.5.0 +*/ +enum kvz_hash +{ + KVZ_HASH_NONE = 0, + KVZ_HASH_CHECKSUM = 1, + KVZ_HASH_MD5 = 2, +}; + +/** +* \brief cu split termination mode +* \since since 3.8.0 +*/ +enum kvz_cu_split_termination +{ + KVZ_CU_SPLIT_TERMINATION_ZERO = 0, + KVZ_CU_SPLIT_TERMINATION_OFF = 1 +}; + +/** +* \brief Enable and disable crypto features. +* \since 3.7.0 +*/ +enum kvz_crypto_features { + KVZ_CRYPTO_OFF = 0, + KVZ_CRYPTO_MVs = (1 << 0), + KVZ_CRYPTO_MV_SIGNS = (1 << 1), + KVZ_CRYPTO_TRANSF_COEFFS = (1 << 2), + KVZ_CRYPTO_TRANSF_COEFF_SIGNS = (1 << 3), + KVZ_CRYPTO_ON = (1 << 4) - 1, +}; + +/** +* \brief me early termination mode +* \since since 3.8.0 +*/ +enum kvz_me_early_termination +{ + KVZ_ME_EARLY_TERMINATION_OFF = 0, + KVZ_ME_EARLY_TERMINATION_ON = 1, + KVZ_ME_EARLY_TERMINATION_SENSITIVE = 2 +}; + + +/** + * \brief Format the pixels are read in. + * This is separate from chroma subsampling, because we might want to read + * interleaved formats in the future. 
+ * \since 3.12.0 + */ +enum kvz_input_format { + KVZ_FORMAT_P400 = 0, + KVZ_FORMAT_P420 = 1, + KVZ_FORMAT_P422 = 2, + KVZ_FORMAT_P444 = 3, +}; + +/** +* \brief Chroma subsampling format used for encoding. +* \since 3.12.0 +*/ +enum kvz_chroma_format { + KVZ_CSP_400 = 0, + KVZ_CSP_420 = 1, + KVZ_CSP_422 = 2, + KVZ_CSP_444 = 3, +}; + +// Map from input format to chroma format. +#define KVZ_FORMAT2CSP(format) ((enum kvz_chroma_format)"\0\1\2\3"[format]) + +/** * \brief GoP picture configuration. */ typedef struct kvz_gop_config { @@ -204,6 +294,31 @@ int8_t mv_rdo; /*!< \brief MV RDO calculation in search (0: estimation, 1: RDO). */ int8_t calc_psnr; /*!< \since 3.1.0 \brief Print PSNR in CLI. */ + + enum kvz_mv_constraint mv_constraint; /*!< \since 3.3.0 \brief Constrain movement vectors. */ + enum kvz_hash hash; /*!< \since 3.5.0 \brief What hash algorithm to use. */ + + enum kvz_cu_split_termination cu_split_termination; /*!< \since 3.8.0 \brief Mode of cu split termination. */ + + enum kvz_crypto_features crypto_features; /*!< \since 3.7.0 */ + + enum kvz_me_early_termination me_early_termination; /*!< \since 3.8.0 \brief Mode of me early termination. */ + + int32_t lossless; /*!< \brief Use lossless coding. */ + + int32_t tmvp_enable; /*!> \brief Use Temporal Motion Vector Predictors. */ + + int32_t rdoq_skip; /*!< \brief Mode of rdoq skip */ + + enum kvz_input_format input_format; /*!< \brief Use Temporal Motion Vector Predictors. */ + int32_t input_bitdepth; /*!< \brief Use Temporal Motion Vector Predictors. */ + + struct { + unsigned d; // depth + unsigned t; // temporal + } gop_lp_definition; + + int32_t implicit_rdpcm; /*!< \brief Enable implicit residual DPCM. */ } kvz_config; /** @@ -231,6 +346,7 @@ int64_t dts; //!< \brief Decompression timestamp. enum kvz_interlacing interlacing; //!< \since 3.2.0 \brief Field order for interlaced pictures. 
+ enum kvz_chroma_format chroma_format; } kvz_picture; /** @@ -508,6 +624,19 @@ kvz_picture **pic_out, kvz_picture **src_out, kvz_frame_info *info_out); + + /** + * \brief Allocate a kvz_picture. + * + * The returned kvz_picture should be deallocated by calling picture_free. + * + * \since 3.12.0 + * \param chroma_fomat Chroma subsampling to use. + * \param width width of luma pixel array to allocate + * \param height height of luma pixel array to allocate + * \return allocated picture, or NULL if allocation failed. + */ + kvz_picture * (*picture_alloc_csp)(enum kvz_chroma_format chroma_fomat, int32_t width, int32_t height); } kvz_api;
kvazaar-0.8.3.tar.gz/src/kvazaar_internal.h -> kvazaar-1.0.0.tar.gz/src/kvazaar_internal.h
Changed
@@ -26,11 +26,12 @@
  * \brief Definitions for opaque structs in kvazaar.h
  */

-#include "global.h"
+#include "global.h" // IWYU pragma: keep
 #include "kvazaar.h"

 #include "input_frame_buffer.h"
+

 // Forward declarations.
 struct encoder_state_t;
 struct encoder_control_t;
kvazaar-1.0.0.tar.gz/src/kvz_math.h
Added
@@ -0,0 +1,55 @@
+#ifndef MATH_H_
+#define MATH_H_
+/*****************************************************************************
+* This file is part of Kvazaar HEVC encoder.
+*
+* Copyright (C) 2013-2015 Tampere University of Technology and others (see
+* COPYING file).
+*
+* Kvazaar is free software: you can redistribute it and/or modify it under
+* the terms of the GNU Lesser General Public License as published by the
+* Free Software Foundation; either version 2.1 of the License, or (at your
+* option) any later version.
+*
+* Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY
+* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for
+* more details.
+*
+* You should have received a copy of the GNU General Public License along
+* with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+****************************************************************************/
+
+/**
+* \file
+* Generic math functions
+*/
+
+#include "global.h" // IWYU pragma: keep
+
+
+static INLINE unsigned kvz_math_floor_log2(unsigned value)
+{
+  assert(value > 0);
+
+  unsigned result = 0;
+
+  for (int i = 4; i >= 0; --i) {
+    unsigned bits = 1ull << i;
+    unsigned shift = value >= (1 << bits) ? bits : 0;
+    result += shift;
+    value >>= shift;
+  }
+
+  return result;
+}
+
+static INLINE unsigned kvz_math_ceil_log2(unsigned value)
+{
+  assert(value > 0);
+
+  // The ceil_log2 is just floor_log2 + 1, except for exact powers of 2.
+  return kvz_math_floor_log2(value) + ((value & (value - 1)) ? 1 : 0);
+}
+
+#endif //CHECKPOINT_H_
kvazaar-0.8.3.tar.gz/src/nal.c -> kvazaar-1.0.0.tar.gz/src/nal.c
Changed
@@ -19,15 +19,10 @@
 ****************************************************************************/

 #include "nal.h"
-#include "strategyselector.h"
-
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>

 #include "bitstream.h"
-#include "cabac.h"
-#include "encoder.h"
+#include "strategies/strategies-nal.h"
+

 /**
  * \brief Write a Network Abstraction Layer (NAL) packet to the output.
@@ -72,6 +67,25 @@
   kvz_array_checksum(im->y, im->height, im->width, im->width, checksum_out[0], bitdepth);

   /* The number of chroma pixels is half that of luma. */
-  kvz_array_checksum(im->u, im->height >> 1, im->width >> 1, im->width >> 1, checksum_out[1], bitdepth);
-  kvz_array_checksum(im->v, im->height >> 1, im->width >> 1, im->width >> 1, checksum_out[2], bitdepth);
+  if (im->chroma_format != KVZ_CSP_400) {
+    kvz_array_checksum(im->u, im->height >> 1, im->width >> 1, im->width >> 1, checksum_out[1], bitdepth);
+    kvz_array_checksum(im->v, im->height >> 1, im->width >> 1, im->width >> 1, checksum_out[2], bitdepth);
+  }
+}
+
+/*!
+\brief Calculate md5 for all colors of the picture.
+\param im The image that md5 is calculated for.
+\param checksum_out Result of the calculation.
+\returns Void
+*/
+void kvz_image_md5(const kvz_picture *im, unsigned char checksum_out[][SEI_HASH_MAX_LENGTH], const uint8_t bitdepth)
+{
+  kvz_array_md5(im->y, im->height, im->width, im->width, checksum_out[0], bitdepth);
+
+  /* The number of chroma pixels is half that of luma. */
+  if (im->chroma_format != KVZ_CSP_400) {
+    kvz_array_md5(im->u, im->height >> 1, im->width >> 1, im->width >> 1, checksum_out[1], bitdepth);
+    kvz_array_md5(im->v, im->height >> 1, im->width >> 1, im->width >> 1, checksum_out[2], bitdepth);
+  }
+}
kvazaar-0.8.3.tar.gz/src/nal.h -> kvazaar-1.0.0.tar.gz/src/nal.h
Changed
@@ -26,14 +26,12 @@
  * Network Abstraction Layer (NAL) messages.
  */

-#include "global.h"
-
-#include <stdio.h>
-
-#include "image.h"
 #include "bitstream.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
+

-#define SEI_HASH_MAX_LENGTH 4
+#define SEI_HASH_MAX_LENGTH 16

 //////////////////////////////////////////////////////////////////////////
 // FUNCTIONS
@@ -41,6 +39,9 @@
                    const uint8_t temporal_id, const int long_start_code);
 void kvz_image_checksum(const kvz_picture *im, unsigned char checksum_out[][SEI_HASH_MAX_LENGTH], const uint8_t bitdepth);
+void kvz_image_md5(const kvz_picture *im,
+                   unsigned char checksum_out[][SEI_HASH_MAX_LENGTH],
+                   const uint8_t bitdepth);
kvazaar-0.8.3.tar.gz/src/rate_control.c -> kvazaar-1.0.0.tar.gz/src/rate_control.c
Changed
@@ -22,6 +22,10 @@ #include <math.h> +#include "encoder.h" +#include "kvazaar.h" + + static const int SMOOTHING_WINDOW = 40; /** @@ -38,19 +42,19 @@ const double bpp = state->stats_bitstream_length * 8 / pixels_per_picture; const double log_bpp = log(bpp); - const double alpha_old = state->global->rc_alpha; - const double beta_old = state->global->rc_beta; + const double alpha_old = state->frame->rc_alpha; + const double beta_old = state->frame->rc_beta; // lambda computed from real bpp const double lambda_comp = CLIP(0.1, 10000, alpha_old * pow(bpp, beta_old)); // lambda used in encoding - const double lambda_real = state->global->cur_lambda_cost; + const double lambda_real = state->frame->cur_lambda_cost; const double lambda_log_ratio = log(lambda_real) - log(lambda_comp); const double alpha = alpha_old + 0.1 * lambda_log_ratio * alpha_old; - state->global->rc_alpha = CLIP(0.05, 20, alpha); + state->frame->rc_alpha = CLIP(0.05, 20, alpha); const double beta = beta_old + 0.05 * lambda_log_ratio * CLIP(-5, 1, log_bpp); - state->global->rc_beta = CLIP(-3, -0.1, beta); + state->frame->rc_beta = CLIP(-3, -0.1, beta); } /** @@ -67,14 +71,14 @@ // At this point, total_bits_coded of the current state contains the // number of bits written encoder->owf frames before the current frame. - int bits_coded = state->global->total_bits_coded; - int pictures_coded = MAX(0, state->global->frame - encoder->owf); + uint64_t bits_coded = state->frame->total_bits_coded; + int pictures_coded = MAX(0, state->frame->num - encoder->owf); - int gop_offset = (state->global->gop_offset - encoder->owf) % MAX(1, encoder->cfg->gop_len); + int gop_offset = (state->frame->gop_offset - encoder->owf) % MAX(1, encoder->cfg->gop_len); // Only take fully coded GOPs into account. if (encoder->cfg->gop_len > 0 && gop_offset != encoder->cfg->gop_len - 1) { // Subtract number of bits in the partially coded GOP. 
- bits_coded -= state->global->cur_gop_bits_coded; + bits_coded -= state->frame->cur_gop_bits_coded; // Subtract number of pictures in the partially coded GOP. pictures_coded -= gop_offset + 1; } @@ -82,7 +86,7 @@ double gop_target_bits = (encoder->target_avg_bppic * (pictures_coded + SMOOTHING_WINDOW) - bits_coded) * MAX(1, encoder->cfg->gop_len) / SMOOTHING_WINDOW; - state->global->cur_gop_target_bits = MAX(200, gop_target_bits); + state->frame->cur_gop_target_bits = MAX(200, gop_target_bits); } /** @@ -95,12 +99,12 @@ const encoder_control_t * const encoder = state->encoder_control; if (encoder->cfg->gop_len <= 0) { - return state->global->cur_gop_target_bits; + return state->frame->cur_gop_target_bits; } const double pic_weight = encoder->gop_layer_weights[ - encoder->cfg->gop[state->global->gop_offset].layer - 1]; - double pic_target_bits = state->global->cur_gop_target_bits * pic_weight; + encoder->cfg->gop[state->frame->gop_offset].layer - 1]; + double pic_target_bits = state->frame->cur_gop_target_bits * pic_weight; return MAX(100, pic_target_bits); } @@ -118,17 +122,20 @@ assert(encoder->cfg->target_bitrate > 0); - if (state->global->frame > encoder->owf) { + if (state->frame->num > encoder->owf) { // At least one frame has been written. update_rc_parameters(state); } - if (encoder->cfg->gop_len == 0 || state->global->gop_offset == 0) { + if (encoder->cfg->gop_len == 0 || + state->frame->gop_offset == 0 || + state->frame->num == 0) + { // A new GOP begins at this frame. 
gop_allocate_bits(state); } else { - state->global->cur_gop_target_bits = - state->previous_encoder_state->global->cur_gop_target_bits; + state->frame->cur_gop_target_bits = + state->previous_encoder_state->frame->cur_gop_target_bits; } // TODO: take the picture headers into account @@ -136,7 +143,7 @@ const double target_bits_per_pixel = target_bits_current_picture / encoder->in.pixels_per_pic; const double lambda = - state->global->rc_alpha * pow(target_bits_per_pixel, state->global->rc_beta); + state->frame->rc_alpha * pow(target_bits_per_pixel, state->frame->rc_beta); return CLIP(0.1, 10000, lambda); } @@ -160,9 +167,9 @@ const int intra_period = state->encoder_control->cfg->intra_period; const int keyframe_period = gop_len > 0 ? gop_len : intra_period; - double lambda = pow(2.0, (state->global->QP - 12) / 3.0); + double lambda = pow(2.0, (state->frame->QP - 12) / 3.0); - if (state->global->slicetype == KVZ_SLICE_I) { + if (state->frame->slicetype == KVZ_SLICE_I) { lambda *= 0.57; // Reduce lambda for I-frames according to the number of references. @@ -172,14 +179,14 @@ lambda *= 1.0 - CLIP(0.0, 0.5, 0.05 * (keyframe_period - 1)); } } else if (gop_len > 0) { - lambda *= state->global->QP_factor; + lambda *= state->frame->QP_factor; } else { lambda *= 0.4624; } // Increase lambda if not key-frame. - if (keyframe_period > 0 && state->global->poc % keyframe_period != 0) { - lambda *= CLIP(2.0, 4.0, (state->global->QP - 12) / 6.0); + if (keyframe_period > 0 && state->frame->poc % keyframe_period != 0) { + lambda *= CLIP(2.0, 4.0, (state->frame->QP - 12) / 6.0); } return lambda;
kvazaar-0.8.3.tar.gz/src/rate_control.h -> kvazaar-1.0.0.tar.gz/src/rate_control.h
Changed
@@ -26,10 +26,11 @@
  * \brief Functions related to rate control.
  */

-#include "global.h"
+#include "global.h" // IWYU pragma: keep
 #include "encoderstate.h"
+

 double kvz_select_picture_lambda(encoder_state_t * const state);

 int8_t kvz_lambda_to_QP(const double lambda);
kvazaar-0.8.3.tar.gz/src/rdo.c -> kvazaar-1.0.0.tar.gz/src/rdo.c
Changed
@@ -1,4 +1,4 @@ -/***************************************************************************** +/***************************************************************************** * This file is part of Kvazaar HEVC encoder. * * Copyright (C) 2013-2015 Tampere University of Technology and others (see @@ -18,17 +18,20 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ -#include <stdio.h> +#include "rdo.h" + #include <stdlib.h> #include <string.h> -#include "rdo.h" -#include "transform.h" -#include "context.h" #include "cabac.h" -#include "transform.h" -#include "strategies/strategies-quant.h" +#include "context.h" +#include "encode_coding_tree.h" +#include "encoder.h" +#include "imagelist.h" #include "inter.h" +#include "scalinglist.h" +#include "tables.h" +#include "transform.h" #define QUANT_SHIFT 14 @@ -39,8 +42,6 @@ const uint32_t kvz_g_go_rice_range[5] = { 7, 14, 26, 46, 78 }; const uint32_t kvz_g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 }; - -#define CTX_ENTROPY_BITS(ctx,val) kvz_entropy_bits[(ctx)->uc_state ^ val] /** * Entropy bits to estimate coded bits in RDO / RDOQ (From HM 12.0) */ @@ -126,97 +127,6 @@ }; -/** - * \brief Function to compare RDO costs - * \param rdo_costs array of current costs - * \param cost new cost to check - * \returns -1 if cost is worse than the one in the array or array position for worst cost - - This function derives the prediction samples for planar mode (intra coding). 
-*/ -int kvz_intra_rdo_cost_compare(uint32_t *rdo_costs,int8_t rdo_modes_to_check, uint32_t cost) -{ - int i; - int found = 0; - - for(i = 0; i < rdo_modes_to_check; i++) { - if(rdo_costs[i] > cost) { - found = 1; - break; - } - } - // Search for worst cost - if(found) { - uint32_t worst_cost = 0; - int worst_mode = -1; - for(i = 0; i < rdo_modes_to_check; i++) { - if(rdo_costs[i] > worst_cost) { - worst_cost = rdo_costs[i]; - worst_mode = i; - } - } - return worst_mode; - } - - return -1; -} - - -/** - * \brief RDO function to calculate cost for intra - * \returns cost to code pred block - - ** Only for luma - */ -uint32_t kvz_rdo_cost_intra(encoder_state_t * const state, kvz_pixel *pred, kvz_pixel *orig_block, int width, int8_t mode, int tr_depth) -{ - const encoder_control_t * const encoder = state->encoder_control; - coeff_t pre_quant_coeff[LCU_WIDTH*LCU_WIDTH>>2]; - int16_t block[LCU_WIDTH*LCU_WIDTH>>2]; - int16_t temp_block[LCU_WIDTH*LCU_WIDTH>>2]; - coeff_t temp_coeff[LCU_WIDTH*LCU_WIDTH>>2]; - int8_t luma_scan_mode = SCAN_DIAG; - - int i = 0,x,y; - for (y = 0; y < width; y++) { - for (x = 0; x < width; x++) { - block[i++] = orig_block[x + y*width]- pred[x + y*width]; - } - } - // Scan mode is diagonal, except for 4x4 and 8x8, where: - // - angular 6-14 = vertical - // - angular 22-30 = horizontal - if (width <= 8) { - if (mode >= 6 && mode <= 14) { - luma_scan_mode = SCAN_VER; - } else if (mode >= 22 && mode <= 30) { - luma_scan_mode = SCAN_HOR; - } - } - kvz_transform2d(encoder, block,pre_quant_coeff,width,0); - if(encoder->rdoq_enable) { - kvz_rdoq(state, pre_quant_coeff, temp_coeff, width, width, 0, luma_scan_mode, CU_INTRA, tr_depth); - } else { - kvz_quant(state, pre_quant_coeff, temp_coeff, width, width, 0, luma_scan_mode, CU_INTRA); - } - kvz_dequant(state, temp_coeff, pre_quant_coeff, width, width, 0, CU_INTRA); - kvz_itransform2d(encoder, temp_block,pre_quant_coeff,width,0); - - unsigned ssd = 0; - // SSD between original and reconstructed - for (i 
= 0; i < width*width; i++) { - //int diff = temp_block[i]-block[i]; - int diff = orig_block[i] - CLIP(0, PIXEL_MAX, pred[i] + temp_block[i]); - - ssd += diff*diff; - } - - double coeff_bits = kvz_get_coeff_cost(state, temp_coeff, width, 0, luma_scan_mode); - - return (uint32_t)(0.5 + ssd + coeff_bits * state->global->cur_lambda_cost); -} - - /** Calculate actual (or really close to actual) bitcost for coding coefficients * \param coeff coefficient array * \param width coeff block width @@ -347,7 +257,7 @@ cabac_ctx_t* base_sig_model = type?(cabac->ctx.cu_sig_model_chroma):(cabac->ctx.cu_sig_model_luma); if( !last && max_abs_level < 3 ) { - *coded_cost_sig = state->global->cur_lambda_cost * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 0); + *coded_cost_sig = state->frame->cur_lambda_cost * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 0); *coded_cost = *coded_cost0 + *coded_cost_sig; if (max_abs_level == 0) return best_abs_level; } else { @@ -355,13 +265,13 @@ } if( !last ) { - cur_cost_sig = state->global->cur_lambda_cost * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 1); + cur_cost_sig = state->frame->cur_lambda_cost * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 1); } min_abs_level = ( max_abs_level > 1 ? 
max_abs_level - 1 : 1 ); for (abs_level = max_abs_level; abs_level >= min_abs_level ; abs_level-- ) { double err = (double)(level_double - ( abs_level << q_bits ) ); - double cur_cost = err * err * temp + state->global->cur_lambda_cost * + double cur_cost = err * err * temp + state->frame->cur_lambda_cost * kvz_get_ic_rate( state, abs_level, ctx_num_one, ctx_num_abs, abs_go_rice, c1_idx, c2_idx, type); cur_cost += cur_cost_sig; @@ -398,7 +308,7 @@ if( ctx_y > 3 ) { uiCost += 32768.0 * ((ctx_y-2)>>1); } - return state->global->cur_lambda_cost*uiCost; + return state->frame->cur_lambda_cost*uiCost; } static void calc_last_bits(encoder_state_t * const state, int32_t width, int32_t height, int8_t type, @@ -432,6 +342,7 @@ last_y_bits[ctx] = bits_y; } + void kvz_rdoq_sign_hiding(const encoder_state_t *const state, const int32_t qp_scaled, const uint32_t *const scan, @@ -444,36 +355,34 @@ coeff_t *const dest_coeff) { const encoder_control_t * const encoder = state->encoder_control; - const int32_t size = width * width; - + int64_t rd_factor = (int64_t)( kvz_g_inv_quant_scales[qp_scaled % 6] * kvz_g_inv_quant_scales[qp_scaled % 6] * (1 << (2 * (qp_scaled / 6))) - / state->global->cur_lambda_cost / 16 / (1 << (2 * (encoder->bitdepth - 8))) + / state->frame->cur_lambda_cost / 16 / (1 << (2 * (encoder->bitdepth - 8))) + 0.5); int32_t lastCG = -1; int32_t absSum = 0; - int32_t n, subset; - for (subset = (size - 1) >> LOG2_SCAN_SET_SIZE; subset >= 0; subset--) { + for (int32_t subset = (width - 1) >> LOG2_SCAN_SET_SIZE; subset >= 0; subset--) { int32_t subPos = subset << LOG2_SCAN_SET_SIZE; int32_t firstNZPosInCG = SCAN_SET_SIZE, lastNZPosInCG = -1; absSum = 0; - for (n = SCAN_SET_SIZE - 1; n >= 0; --n) { + for (int32_t n = SCAN_SET_SIZE - 1; n >= 0; --n) { if (dest_coeff[scan[n + subPos]]) { lastNZPosInCG = n; break; } } - for (n = 0; n <SCAN_SET_SIZE; n++) { + for (int32_t n = 0; n <= lastNZPosInCG; n++) { if (dest_coeff[scan[n + subPos]]) { firstNZPosInCG = n; break; } } - 
for (n = firstNZPosInCG; n <= lastNZPosInCG; n++) { + for (int32_t n = firstNZPosInCG; n <= lastNZPosInCG; n++) { absSum += dest_coeff[scan[n + subPos]]; } @@ -486,7 +395,7 @@ int64_t minCostInc = MAX_INT64, curCost = MAX_INT64; int32_t minPos = -1, finalChange = 0, curChange = 0; - for (n = (lastCG == 1 ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n) { + for (int32_t n = (lastCG == 1 ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n) { uint32_t blkpos = scan[n + subPos]; if (dest_coeff[blkpos] != 0) { int64_t costUp = rd_factor * (-delta_u[blkpos]) + rate_inc_up[blkpos]; @@ -539,27 +448,26 @@ } } + /** RDOQ with CABAC * \returns void * Rate distortion optimized quantization for entropy * coding engines using probability models like CABAC * From HM 12.0 */ -void kvz_rdoq(encoder_state_t * const state, coeff_t *coef, coeff_t *dest_coeff, int32_t width, +void kvz_rdoq(encoder_state_t * const state, coeff_t *coef, coeff_t *dest_coeff, int32_t width, int32_t height, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth) { const encoder_control_t * const encoder = state->encoder_control; cabac_data_t * const cabac = &state->cabac; - uint32_t log2_tr_size = kvz_g_convert_to_bit[ width ] + 2; - int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - log2_tr_size; // Represents scaling through forward transform - uint16_t go_rice_param = 0; - uint32_t log2_block_size = kvz_g_convert_to_bit[ width ] + 2; - uint32_t max_num_coeff = width * height; + uint32_t log2_tr_size = kvz_g_convert_to_bit[ width ] + 2; + int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - log2_tr_size; // Represents scaling through forward transform + uint16_t go_rice_param = 0; + uint32_t log2_block_size = kvz_g_convert_to_bit[ width ] + 2; + uint32_t max_num_coeff = width * height; int32_t scalinglist_type= (block_type == CU_INTRA ? 
0 : 3) + (int8_t)("\0\3\1\2"[type]); - int32_t qp_scaled = kvz_get_scaled_qp(type, state->global->QP, (encoder->bitdepth-8)*6); - uint32_t abs_sum = 0; - + int32_t qp_scaled = kvz_get_scaled_qp(type, state->frame->QP, (encoder->bitdepth - 8) * 6); int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift; @@ -567,7 +475,7 @@ const double *err_scale = encoder->scaling_list.error_scale[log2_tr_size-2][scalinglist_type][qp_scaled%6]; double block_uncoded_cost = 0; - + double cost_coeff [ 32 * 32 ]; double cost_sig [ 32 * 32 ]; double cost_coeff0[ 32 * 32 ]; @@ -577,21 +485,17 @@ int32_t sig_rate_delta[ 32 * 32 ]; int32_t delta_u [ 32 * 32 ]; - const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 2][scan_mode]; - const int32_t shift = 4>>1; const uint32_t cg_size = 16; - const uint32_t num_blk_side = width >> shift; + const int32_t shift = 4 >> 1; + const uint32_t num_blk_side = width >> shift; double cost_coeffgroup_sig[ 64 ]; uint32_t sig_coeffgroup_flag[ 64 ]; - int32_t cg_last_scanpos = -1; - - uint16_t ctx_set = 0; - int16_t c1 = 1; - int16_t c2 = 0; - double base_cost = 0; - int32_t last_scanpos = -1; + uint16_t ctx_set = 0; + int16_t c1 = 1; + int16_t c2 = 0; + double base_cost = 0; uint32_t c1_idx = 0; uint32_t c2_idx = 0; @@ -599,20 +503,15 @@ const uint32_t *scan = kvz_g_sig_last_scan[ scan_mode ][ log2_block_size - 1 ]; + int32_t cg_last_scanpos = -1; + int32_t last_scanpos = -1; uint32_t cg_num = width * height >> 4; - int32_t scanpos; cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.cu_sig_coeff_group_model[type]); cabac_ctx_t *baseCtx = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0]) : &(cabac->ctx.cu_sig_model_chroma[0]); cabac_ctx_t *base_one_ctx = (type == 0) ? 
&(cabac->ctx.cu_one_model_luma[0]) : &(cabac->ctx.cu_one_model_chroma[0]); - double best_cost = 0; - int32_t ctx_cbf = 0; - int32_t best_last_idx_p1 = 0; - int8_t found_last = 0; - int32_t cg_scanpos, scanpos_in_cg; - struct { double coded_level_and_dist; double uncoded_dist; @@ -621,12 +520,35 @@ int32_t nnz_before_pos0; } rd_stats; - int32_t last_x_bits[32],last_y_bits[32]; - calc_last_bits(state, width, height, type,last_x_bits, last_y_bits); + //Find last cg and last scanpos + for (int32_t cg_scanpos = (cg_num - 1); cg_scanpos >= 0; cg_scanpos--) + { + for (int32_t scanpos_in_cg = (cg_size - 1); scanpos_in_cg >= 0; scanpos_in_cg--) + { + int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; + uint32_t blkpos = scan[scanpos]; + int32_t q = quant_coeff[blkpos]; + int32_t level_double = coef[blkpos]; + level_double = MIN(abs(level_double) * q, MAX_INT - (1 << (q_bits - 1))); + uint32_t max_abs_level = (level_double + (1 << (q_bits - 1))) >> q_bits; + + if (max_abs_level > 0) { + last_scanpos = scanpos; + ctx_set = (scanpos > 0 && type == 0) ? 
2 : 0; + cg_last_scanpos = cg_scanpos; + break; + } + dest_coeff[blkpos] = 0; + } + if (last_scanpos != -1) break; + } + + if (last_scanpos == -1) { + return; + } FILL_ARRAY(cost_coeff, 0, max_num_coeff); FILL_ARRAY(cost_sig, 0, max_num_coeff); - if (encoder->sign_hiding) { FILL_ARRAY(rate_inc_up, 0, max_num_coeff); @@ -638,253 +560,234 @@ FILL(cost_coeffgroup_sig, 0); FILL(sig_coeffgroup_flag, 0); - for (cg_scanpos = cg_num-1; cg_scanpos >= 0; cg_scanpos--) { - uint32_t cg_blkpos = scan_cg[ cg_scanpos ]; + int32_t last_x_bits[32], last_y_bits[32]; + calc_last_bits(state, width, height, type, last_x_bits, last_y_bits); + + for (int32_t cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) { + uint32_t cg_blkpos = scan_cg[cg_scanpos]; uint32_t cg_pos_y = cg_blkpos / num_blk_side; uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * num_blk_side); - int32_t scanpos_in_cg; int32_t pattern_sig_ctx = kvz_context_calc_pattern_sig_ctx(sig_coeffgroup_flag, cg_pos_x, cg_pos_y, width); FILL(rd_stats, 0); - for (scanpos_in_cg = cg_size-1; scanpos_in_cg >= 0; scanpos_in_cg--) { - uint32_t blkpos; - int32_t q; - double temp, err; - int32_t level_double; - uint32_t max_abs_level; - - scanpos = cg_scanpos*cg_size + scanpos_in_cg; - blkpos = scan[scanpos]; - q = quant_coeff[blkpos]; - temp = err_scale[blkpos]; - level_double = coef[blkpos]; - level_double = MIN(abs(level_double) * q , MAX_INT - (1 << (q_bits - 1))); - max_abs_level = (level_double + (1 << (q_bits - 1))) >> q_bits; - - err = (double)level_double; - cost_coeff0[ scanpos ] = err * err * temp; + for (int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) { + int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; + if (scanpos > last_scanpos) continue; + uint32_t blkpos = scan[scanpos]; + int32_t q = quant_coeff[blkpos]; + double temp = err_scale[blkpos]; + int32_t level_double = coef[blkpos]; + level_double = MIN(abs(level_double) * q , MAX_INT - (1 << (q_bits - 1))); + uint32_t max_abs_level = 
(level_double + (1 << (q_bits - 1))) >> q_bits; + + double err = (double)level_double; + cost_coeff0[scanpos] = err * err * temp; block_uncoded_cost += cost_coeff0[ scanpos ]; - dest_coeff[ blkpos ] = (coeff_t)max_abs_level; - - if ( max_abs_level > 0 && last_scanpos < 0 ) { - last_scanpos = scanpos; - ctx_set = (scanpos > 0 && type == 0) ? 2 : 0; - cg_last_scanpos = cg_scanpos; - } - - if ( last_scanpos >= 0 ) { - //===== coefficient level estimation ===== - int32_t level; - uint16_t one_ctx = 4 * ctx_set + c1; - uint16_t abs_ctx = ctx_set + c2; - - if( scanpos == last_scanpos ) { - level = kvz_get_coded_level(state, &cost_coeff[ scanpos ], &cost_coeff0[ scanpos ], &cost_sig[ scanpos ], - level_double, max_abs_level, 0, one_ctx, abs_ctx, go_rice_param, - c1_idx, c2_idx, q_bits, temp, 1, type ); - } else { - uint32_t pos_y = blkpos >> log2_block_size; - uint32_t pos_x = blkpos - ( pos_y << log2_block_size ); - uint16_t ctx_sig = (uint16_t)kvz_context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y, - log2_block_size, type); - level = kvz_get_coded_level(state, &cost_coeff[ scanpos ], &cost_coeff0[ scanpos ], &cost_sig[ scanpos ], - level_double, max_abs_level, ctx_sig, one_ctx, abs_ctx, go_rice_param, - c1_idx, c2_idx, q_bits, temp, 0, type ); - if (encoder->sign_hiding) { - sig_rate_delta[blkpos] = CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 1) - CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 0); - } - } - + //===== coefficient level estimation ===== + int32_t level; + uint16_t one_ctx = 4 * ctx_set + c1; + uint16_t abs_ctx = ctx_set + c2; + + if( scanpos == last_scanpos ) { + level = kvz_get_coded_level(state, &cost_coeff[ scanpos ], &cost_coeff0[ scanpos ], &cost_sig[ scanpos ], + level_double, max_abs_level, 0, one_ctx, abs_ctx, go_rice_param, + c1_idx, c2_idx, q_bits, temp, 1, type ); + } else { + uint32_t pos_y = blkpos >> log2_block_size; + uint32_t pos_x = blkpos - ( pos_y << log2_block_size ); + uint16_t ctx_sig = 
(uint16_t)kvz_context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y, + log2_block_size, type); + level = kvz_get_coded_level(state, &cost_coeff[ scanpos ], &cost_coeff0[ scanpos ], &cost_sig[ scanpos ], + level_double, max_abs_level, ctx_sig, one_ctx, abs_ctx, go_rice_param, + c1_idx, c2_idx, q_bits, temp, 0, type ); if (encoder->sign_hiding) { - delta_u[blkpos] = (level_double - ((int32_t)level << q_bits)) >> (q_bits - 8); - if (level > 0) { - int32_t rate_now = kvz_get_ic_rate(state, level, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type); - rate_inc_up[blkpos] = kvz_get_ic_rate(state, level + 1, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type) - rate_now; - rate_inc_down[blkpos] = kvz_get_ic_rate(state, level - 1, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type) - rate_now; - } else { // level == 0 - rate_inc_up[blkpos] = CTX_ENTROPY_BITS(&base_one_ctx[one_ctx], 0); - } + sig_rate_delta[blkpos] = CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 1) - CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 0); } + } - dest_coeff[blkpos] = (coeff_t)level; - base_cost += cost_coeff[scanpos]; - - base_level = (c1_idx < C1FLAG_NUMBER) ? 
(2 + (c2_idx < C2FLAG_NUMBER)) : 1; - if( level >= base_level ) { - if(level > 3*(1<<go_rice_param)) { - go_rice_param = MIN(go_rice_param + 1, 4); - } + if (encoder->sign_hiding) { + delta_u[blkpos] = (level_double - ((int32_t)level << q_bits)) >> (q_bits - 8); + if (level > 0) { + int32_t rate_now = kvz_get_ic_rate(state, level, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type); + rate_inc_up[blkpos] = kvz_get_ic_rate(state, level + 1, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type) - rate_now; + rate_inc_down[blkpos] = kvz_get_ic_rate(state, level - 1, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type) - rate_now; + } else { // level == 0 + rate_inc_up[blkpos] = CTX_ENTROPY_BITS(&base_one_ctx[one_ctx], 0); } - if (level >= 1) c1_idx ++; - - //===== update bin model ===== - if (level > 1) { - c1 = 0; - c2 += (c2 < 2); - c2_idx ++; - } else if( (c1 < 3) && (c1 > 0) && level) { - c1++; + } + dest_coeff[blkpos] = (coeff_t)level; + base_cost += cost_coeff[scanpos]; + + base_level = (c1_idx < C1FLAG_NUMBER) ? (2 + (c2_idx < C2FLAG_NUMBER)) : 1; + if (level >= base_level) { + if(level > 3*(1<<go_rice_param)) { + go_rice_param = MIN(go_rice_param + 1, 4); } + } + if (level >= 1) c1_idx ++; + + //===== update bin model ===== + if (level > 1) { + c1 = 0; + c2 += (c2 < 2); + c2_idx ++; + } else if( (c1 < 3) && (c1 > 0) && level) { + c1++; + } - //===== context set update ===== - if ((scanpos % SCAN_SET_SIZE == 0) && scanpos > 0) { - c2 = 0; - go_rice_param = 0; + //===== context set update ===== + if ((scanpos % SCAN_SET_SIZE == 0) && scanpos > 0) { + c2 = 0; + go_rice_param = 0; - c1_idx = 0; - c2_idx = 0; - ctx_set = (scanpos == SCAN_SET_SIZE || type!=0) ? 0 : 2; - if( c1 == 0 ) { - ctx_set++; - } - c1 = 1; + c1_idx = 0; + c2_idx = 0; + ctx_set = (scanpos == SCAN_SET_SIZE || type != 0) ? 
0 : 2; + if( c1 == 0 ) { + ctx_set++; } - } else { - base_cost += cost_coeff0[scanpos]; + c1 = 1; } + rd_stats.sig_cost += cost_sig[scanpos]; - if (scanpos_in_cg == 0 ) { + if ( scanpos_in_cg == 0 ) { rd_stats.sig_cost_0 = cost_sig[scanpos]; } - if (dest_coeff[ blkpos ] ) { - sig_coeffgroup_flag[ cg_blkpos ] = 1; - rd_stats.coded_level_and_dist += cost_coeff[scanpos] - cost_sig[scanpos]; - rd_stats.uncoded_dist += cost_coeff0[scanpos]; + if ( dest_coeff[blkpos] ) { + sig_coeffgroup_flag[cg_blkpos] = 1; + rd_stats.coded_level_and_dist += cost_coeff[scanpos] - cost_sig[scanpos]; + rd_stats.uncoded_dist += cost_coeff0[scanpos]; if ( scanpos_in_cg != 0 ) { rd_stats.nnz_before_pos0++; } } } //end for (scanpos_in_cg) - if (cg_last_scanpos >= 0) { - if( cg_scanpos ) { - if (sig_coeffgroup_flag[ cg_blkpos ] == 0) { - uint32_t ctx_sig = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, - cg_pos_y, width); - cost_coeffgroup_sig[ cg_scanpos ] = state->global->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0); - base_cost += cost_coeffgroup_sig[ cg_scanpos ] - rd_stats.sig_cost; - } else { - if (cg_scanpos < cg_last_scanpos) {//skip the last coefficient group, which will be handled together with last position below. 
- double cost_zero_cg; - uint32_t ctx_sig; - if (rd_stats.nnz_before_pos0 == 0) { - base_cost -= rd_stats.sig_cost_0; - rd_stats.sig_cost -= rd_stats.sig_cost_0; - } - // rd-cost if SigCoeffGroupFlag = 0, initialization - cost_zero_cg = base_cost; - - // add SigCoeffGroupFlag cost to total cost - ctx_sig = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, - cg_pos_y, width); - if (cg_scanpos < cg_last_scanpos) { - cost_coeffgroup_sig[cg_scanpos] = state->global->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],1); - base_cost += cost_coeffgroup_sig[cg_scanpos]; - cost_zero_cg += state->global->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0); - } - - // try to convert the current coeff group from non-zero to all-zero - cost_zero_cg += rd_stats.uncoded_dist; // distortion for resetting non-zero levels to zero levels - cost_zero_cg -= rd_stats.coded_level_and_dist; // distortion and level cost for keeping all non-zero levels - cost_zero_cg -= rd_stats.sig_cost; // sig cost for all coeffs, including zero levels and non-zerl levels - - // if we can save cost, change this block to all-zero block - if (cost_zero_cg < base_cost) { - int32_t scanpos_in_cg; - sig_coeffgroup_flag[ cg_blkpos ] = 0; - base_cost = cost_zero_cg; - if (cg_scanpos < cg_last_scanpos) { - cost_coeffgroup_sig[ cg_scanpos ] = state->global->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0); - } - // reset coeffs to 0 in this block - for (scanpos_in_cg = cg_size-1; scanpos_in_cg >= 0; scanpos_in_cg--) { - uint32_t blkpos; - scanpos = cg_scanpos*cg_size + scanpos_in_cg; - blkpos = scan[ scanpos ]; - - if (dest_coeff[ blkpos ]) { - dest_coeff[ blkpos ] = 0; - cost_coeff[ scanpos ] = cost_coeff0[ scanpos ]; - cost_sig [ scanpos ] = 0; - } - } - } // end if ( cost_all_zeros < base_cost ) - } - } // end if if (sig_coeffgroup_flag[ cg_blkpos ] == 0) + if( cg_scanpos ) { + if (sig_coeffgroup_flag[cg_blkpos] == 0) { + uint32_t ctx_sig = 
kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, + cg_pos_y, width); + cost_coeffgroup_sig[cg_scanpos] = state->frame->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0); + base_cost += cost_coeffgroup_sig[cg_scanpos] - rd_stats.sig_cost; } else { - sig_coeffgroup_flag[ cg_blkpos ] = 1; - } + if (cg_scanpos < cg_last_scanpos){ + double cost_zero_cg; + uint32_t ctx_sig; + if (rd_stats.nnz_before_pos0 == 0) { + base_cost -= rd_stats.sig_cost_0; + rd_stats.sig_cost -= rd_stats.sig_cost_0; + } + // rd-cost if SigCoeffGroupFlag = 0, initialization + cost_zero_cg = base_cost; + + // add SigCoeffGroupFlag cost to total cost + ctx_sig = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, + cg_pos_y, width); + + cost_coeffgroup_sig[cg_scanpos] = state->frame->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 1); + base_cost += cost_coeffgroup_sig[cg_scanpos]; + cost_zero_cg += state->frame->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 0); + + // try to convert the current coeff group from non-zero to all-zero + cost_zero_cg += rd_stats.uncoded_dist; // distortion for resetting non-zero levels to zero levels + cost_zero_cg -= rd_stats.coded_level_and_dist; // distortion and level cost for keeping all non-zero levels + cost_zero_cg -= rd_stats.sig_cost; // sig cost for all coeffs, including zero levels and non-zerl levels + + // if we can save cost, change this block to all-zero block + if (cost_zero_cg < base_cost) { + + sig_coeffgroup_flag[cg_blkpos] = 0; + base_cost = cost_zero_cg; + + cost_coeffgroup_sig[cg_scanpos] = state->frame->cur_lambda_cost*CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 0); + + // reset coeffs to 0 in this block + for (int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) { + int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; + uint32_t blkpos = scan[scanpos]; + if (dest_coeff[blkpos]){ + dest_coeff[blkpos] = 0; + cost_coeff[scanpos] = 
cost_coeff0[scanpos]; + cost_sig[scanpos] = 0; + } + } + } // end if ( cost_all_zeros < base_cost ) + } + } // end if if (sig_coeffgroup_flag[ cg_blkpos ] == 0) + } else { + sig_coeffgroup_flag[cg_blkpos] = 1; } } //end for (cg_scanpos) //===== estimate last position ===== - if (last_scanpos < 0) return; - + double best_cost = 0; + int32_t ctx_cbf = 0; + int8_t found_last = 0; + int32_t best_last_idx_p1 = 0; if( block_type != CU_INTRA && !type/* && pcCU->getTransformIdx( uiAbsPartIdx ) == 0*/ ) { - best_cost = block_uncoded_cost + state->global->cur_lambda_cost*CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),0); - base_cost += state->global->cur_lambda_cost*CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),1); + best_cost = block_uncoded_cost + state->frame->cur_lambda_cost*CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),0); + base_cost += state->frame->cur_lambda_cost*CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),1); } else { cabac_ctx_t* base_cbf_model = type?(cabac->ctx.qt_cbf_model_chroma):(cabac->ctx.qt_cbf_model_luma); - ctx_cbf = ( type ? tr_depth : !tr_depth); - best_cost = block_uncoded_cost + state->global->cur_lambda_cost*CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0); - base_cost += state->global->cur_lambda_cost*CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1); + ctx_cbf = ( type ? 
tr_depth : !tr_depth); + best_cost = block_uncoded_cost + state->frame->cur_lambda_cost*CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0); + base_cost += state->frame->cur_lambda_cost*CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1); } - for (cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) { + for ( int32_t cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) { uint32_t cg_blkpos = scan_cg[cg_scanpos]; - base_cost -= cost_coeffgroup_sig[cg_scanpos]; + if (sig_coeffgroup_flag[ cg_blkpos ]) { - for (scanpos_in_cg = cg_size-1; scanpos_in_cg >= 0; scanpos_in_cg--) { - uint32_t blkpos; - scanpos = cg_scanpos*cg_size + scanpos_in_cg; + for ( int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) { + int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; if (scanpos > last_scanpos) continue; - blkpos = scan[scanpos]; + uint32_t blkpos = scan[scanpos]; if( dest_coeff[ blkpos ] ) { - uint32_t pos_y = blkpos >> log2_block_size; - uint32_t pos_x = blkpos - ( pos_y << log2_block_size ); + uint32_t pos_y = blkpos >> log2_block_size; + uint32_t pos_x = blkpos - ( pos_y << log2_block_size ); double cost_last = (scan_mode == SCAN_VER) ? 
get_rate_last(state, pos_y, pos_x,last_x_bits,last_y_bits) : get_rate_last(state, pos_x, pos_y, last_x_bits,last_y_bits ); double totalCost = base_cost + cost_last - cost_sig[ scanpos ]; if( totalCost < best_cost ) { - best_last_idx_p1 = scanpos + 1; - best_cost = totalCost; + best_last_idx_p1 = scanpos + 1; + best_cost = totalCost; } if( dest_coeff[ blkpos ] > 1 ) { found_last = 1; break; } - base_cost -= cost_coeff[ scanpos ]; - base_cost += cost_coeff0[ scanpos ]; + base_cost -= cost_coeff[scanpos]; + base_cost += cost_coeff0[scanpos]; } else { - base_cost -= cost_sig[ scanpos ]; + base_cost -= cost_sig[scanpos]; } } //end for if (found_last) break; } // end if (sig_coeffgroup_flag[ cg_blkpos ]) } // end for - for ( scanpos = 0; scanpos < best_last_idx_p1; scanpos++ ) { - int32_t blkPos = scan[ scanpos ]; - int32_t level = dest_coeff[ blkPos ]; - abs_sum += level; - dest_coeff[ blkPos ] = (coeff_t)(( coef[ blkPos ] < 0 ) ? -level : level); + uint32_t abs_sum = 0; + for ( int32_t scanpos = 0; scanpos < best_last_idx_p1; scanpos++) { + int32_t blkPos = scan[scanpos]; + int32_t level = dest_coeff[blkPos]; + abs_sum += level; + dest_coeff[blkPos] = (coeff_t)(( coef[blkPos] < 0 ) ? 
-level : level); } - //===== clean uncoded coefficients ===== - for ( scanpos = best_last_idx_p1; scanpos <= last_scanpos; scanpos++ ) { - dest_coeff[ scan[ scanpos ] ] = 0; + for ( int32_t scanpos = best_last_idx_p1; scanpos <= last_scanpos; scanpos++) { + dest_coeff[scan[scanpos]] = 0; } if (encoder->sign_hiding && abs_sum >= 2) { kvz_rdoq_sign_hiding(state, qp_scaled, scan, delta_u, rate_inc_up, rate_inc_down, sig_rate_delta, - width, coef, dest_coeff); + best_last_idx_p1, coef, dest_coeff); } } @@ -892,7 +795,8 @@ * \returns int * Calculates cost of actual motion vectors using CABAC coding */ -uint32_t kvz_get_mvd_coding_cost_cabac(vector2d_t *mvd, cabac_data_t* cabac) { +uint32_t kvz_get_mvd_coding_cost_cabac(encoder_state_t * const state, vector2d_t *mvd, const cabac_data_t* real_cabac) +{ uint32_t bitcost = 0; const int32_t mvd_hor = mvd->x; const int32_t mvd_ver = mvd->y; @@ -902,7 +806,8 @@ const uint32_t mvd_ver_abs = abs(mvd_ver); cabac_data_t cabac_copy; - memcpy(&cabac_copy, cabac, sizeof(cabac_data_t)); + memcpy(&cabac_copy, real_cabac, sizeof(cabac_data_t)); + cabac_data_t *cabac = &cabac_copy; cabac->only_count = 1; cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[0]); @@ -917,19 +822,17 @@ } if (hor_abs_gr0) { if (mvd_hor_abs > 1) { - kvz_cabac_write_ep_ex_golomb(cabac, mvd_hor_abs - 2, 1); + kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1); } CABAC_BIN_EP(cabac, (mvd_hor > 0) ? 0 : 1, "mvd_sign_flag_hor"); } if (ver_abs_gr0) { if (mvd_ver_abs > 1) { - kvz_cabac_write_ep_ex_golomb(cabac, mvd_ver_abs - 2, 1); + kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1); } CABAC_BIN_EP(cabac, (mvd_ver > 0) ? 
0 : 1, "mvd_sign_flag_ver"); } - bitcost = ((23 - cabac->bits_left) + (cabac->num_buffered_bytes << 3)) - ((23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3)); - - memcpy(cabac, &cabac_copy, sizeof(cabac_data_t)); + bitcost = ((23 - cabac->bits_left) + (cabac->num_buffered_bytes << 3)) - ((23 - real_cabac->bits_left) + (real_cabac->num_buffered_bytes << 3)); return bitcost; } @@ -938,7 +841,7 @@ * \returns int * Calculates Motion Vector cost and related costs using CABAC coding */ -int kvz_calc_mvd_cost_cabac(const encoder_state_t * const state, int x, int y, int mv_shift, +int kvz_calc_mvd_cost_cabac(encoder_state_t * const state, int x, int y, int mv_shift, int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *bitcost) { @@ -977,11 +880,11 @@ if (!merged) { mvd_temp1.x = x - mv_cand[0][0]; mvd_temp1.y = y - mv_cand[0][1]; - cand1_cost = kvz_get_mvd_coding_cost_cabac(&mvd_temp1, cabac); + cand1_cost = kvz_get_mvd_coding_cost_cabac(state, &mvd_temp1, cabac); mvd_temp2.x = x - mv_cand[1][0]; mvd_temp2.y = y - mv_cand[1][1]; - cand2_cost = kvz_get_mvd_coding_cost_cabac(&mvd_temp2, cabac); + cand2_cost = kvz_get_mvd_coding_cost_cabac(state, &mvd_temp2, cabac); // Select candidate 1 if it has lower cost if (cand2_cost < cand1_cost) { @@ -1014,8 +917,8 @@ uint32_t ref_list_idx; uint32_t j; int ref_list[2] = { 0, 0 }; - for (j = 0; j < state->global->ref->used_size; j++) { - if (state->global->ref->pocs[j] < state->global->poc) { + for (j = 0; j < state->frame->ref->used_size; j++) { + if (state->frame->ref->pocs[j] < state->frame->poc) { ref_list[0]++; } else { ref_list[1]++; @@ -1053,7 +956,7 @@ } // ToDo: Bidir vector support - if (!(state->global->ref_list == REF_PIC_LIST_1 && /*cur_cu->inter.mv_dir == 3*/ 0)) { + if (!(state->frame->ref_list == REF_PIC_LIST_1 && /*cur_cu->inter.mv_dir == 3*/ 0)) { const int32_t mvd_hor = mvd.x; const int32_t mvd_ver = mvd.y; const int8_t hor_abs_gr0 = 
mvd_hor != 0; @@ -1077,7 +980,7 @@ if (hor_abs_gr0) { if (mvd_hor_abs > 1) { - kvz_cabac_write_ep_ex_golomb(cabac, mvd_hor_abs - 2, 1); + kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1); } CABAC_BIN_EP(cabac, (mvd_hor > 0) ? 0 : 1, "mvd_sign_flag_hor"); @@ -1085,7 +988,7 @@ if (ver_abs_gr0) { if (mvd_ver_abs > 1) { - kvz_cabac_write_ep_ex_golomb(cabac, mvd_ver_abs - 2, 1); + kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1); } CABAC_BIN_EP(cabac, (mvd_ver > 0) ? 0 : 1, "mvd_sign_flag_ver"); @@ -1103,5 +1006,5 @@ *bitcost = (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3); // Store bitcost before restoring cabac - return *bitcost * (int32_t)(state->global->cur_lambda_cost_sqrt + 0.5); + return *bitcost * (int32_t)(state->frame->cur_lambda_cost_sqrt + 0.5); }
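The rdo.c diff above deletes `kvz_rdo_cost_intra`, which contained the coefficient scan-order rule: diagonal scan by default, but for 4x4 and 8x8 luma blocks the near-horizontal angular modes 6..14 use a vertical scan and the near-vertical modes 22..30 a horizontal scan. A minimal standalone sketch of that rule, with `select_luma_scan` as an illustrative name rather than kvazaar's actual API:

```c
#include <assert.h>

enum scan_order { SCAN_DIAG, SCAN_HOR, SCAN_VER };

/* Scan-order selection as in the removed kvz_rdo_cost_intra():
 * diagonal everywhere, except 4x4/8x8 blocks with angular modes
 * 6..14 (vertical scan) or 22..30 (horizontal scan). */
static enum scan_order select_luma_scan(int width, int mode)
{
    if (width <= 8) {
        if (mode >= 6 && mode <= 14)
            return SCAN_VER;
        if (mode >= 22 && mode <= 30)
            return SCAN_HOR;
    }
    return SCAN_DIAG;
}
```

Larger transform blocks (16x16, 32x32) always scan diagonally regardless of the intra mode.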
kvazaar-0.8.3.tar.gz/src/rdo.h -> kvazaar-1.0.0.tar.gz/src/rdo.h
Changed
@@ -26,23 +26,19 @@ * Rate-Distortion Optimization related functionality. */ -#include "global.h" - -#include "encoder.h" +#include "cabac.h" +#include "cu.h" #include "encoderstate.h" -#include "inter.h" +#include "global.h" // IWYU pragma: keep +#include "search_inter.h" extern const uint32_t kvz_g_go_rice_range[5]; extern const uint32_t kvz_g_go_rice_prefix_len[5]; -int kvz_intra_rdo_cost_compare(uint32_t *rdo_costs,int8_t rdo_modes_to_check, uint32_t cost); - void kvz_rdoq(encoder_state_t *state, coeff_t *coef, coeff_t *dest_coeff, int32_t width, int32_t height, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth); -uint32_t kvz_rdo_cost_intra(encoder_state_t *state, kvz_pixel* pred, kvz_pixel* orig_block, int width, int8_t mode, int tr_depth); - int32_t kvz_get_coeff_cost(const encoder_state_t *state, coeff_t *coeff, int32_t width, int32_t type, int8_t scan_mode); int32_t kvz_get_ic_rate(encoder_state_t *state, uint32_t abs_level, uint16_t ctx_num_one, uint16_t ctx_num_abs, @@ -54,12 +50,16 @@ uint32_t c1_idx, uint32_t c2_idx, int32_t q_bits,double temp, int8_t last, int8_t type); -int kvz_calc_mvd_cost_cabac(const encoder_state_t * const state, int x, int y, int mv_shift, - int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], - int16_t num_cand, int32_t ref_idx, uint32_t *bitcost); -uint32_t kvz_get_mvd_coding_cost_cabac(vector2d_t *mvd, cabac_data_t* cabac); +kvz_mvd_cost_func kvz_calc_mvd_cost_cabac; + +uint32_t kvz_get_mvd_coding_cost_cabac(encoder_state_t * const state, vector2d_t *mvd, const cabac_data_t* cabac); + +// Fixed points fractional bits, 16b.16b +extern const uint32_t kvz_entropy_bits[128]; +#define CTX_ENTROPY_BITS(ctx, val) kvz_entropy_bits[(ctx)->uc_state ^ (val)] +// Floating point fractional bits, derived from kvz_entropy_bits extern const float kvz_f_entropy_bits[128]; -#define CTX_ENTROPY_FBITS(ctx,val) kvz_f_entropy_bits[(ctx)->uc_state ^ (val)] +#define CTX_ENTROPY_FBITS(ctx, val) 
kvz_f_entropy_bits[(ctx)->uc_state ^ (val)] #endif
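The rdo.h diff adds a 16.16 fixed-point bit-cost table, `kvz_entropy_bits`, indexed by XOR-ing a context's `uc_state` with the coded bin, plus a float table derived from it. The indexing and fixed-point convention can be sketched with a fabricated 4-entry table (the real table has 128 entries derived from the CABAC state machine; `toy_entropy_bits` and `ctx_t` are illustrative names only):

```c
#include <assert.h>
#include <stdint.h>

/* A CABAC context packs probability state and most-probable symbol
 * into uc_state; XOR with the coded bin selects the cost entry for
 * that (state, bin) pair. Entries are 16.16 fixed-point bit counts. */
typedef struct { uint8_t uc_state; } ctx_t;

static const uint32_t toy_entropy_bits[4] = {
    0x10000, /* 1.0 bits */
    0x18000, /* 1.5 bits */
    0x08000, /* 0.5 bits */
    0x20000, /* 2.0 bits */
};

#define TOY_ENTROPY_BITS(ctx, val) toy_entropy_bits[(ctx)->uc_state ^ (val)]

/* The floating-point table in rdo.h is the fixed-point one rescaled. */
static double to_float_bits(uint32_t fixed) { return fixed / 65536.0; }
```

Keeping the table in fixed point lets the integer RDOQ paths accumulate costs without floating-point rounding, while the derived float table serves the lambda-weighted cost formulas.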
kvazaar-0.8.3.tar.gz/src/sao.c -> kvazaar-1.0.0.tar.gz/src/sao.c
Changed
@@ -19,97 +19,18 @@ ****************************************************************************/ #include "sao.h" -#include "rdo.h" -#include "strategies/strategies-picture.h" -#include <string.h> +#include <limits.h> #include <stdlib.h> -#include <assert.h> - -// Offsets of a and b in relation to c. -// dir_offset[dir][a or b] -// | | a | a | a | -// | a c b | c | c | c | -// | | b | b | b | -static const vector2d_t g_sao_edge_offsets[SAO_NUM_EO][2] = { - { { -1, 0 }, { 1, 0 } }, - { { 0, -1 }, { 0, 1 } }, - { { -1, -1 }, { 1, 1 } }, - { { 1, -1 }, { -1, 1 } } -}; - -// Mapping of edge_idx values to eo-classes. - - -static int sao_calc_eo_cat(kvz_pixel a, kvz_pixel b, kvz_pixel c) -{ - // Mapping relationships between a, b and c to eo_idx. - static const int sao_eo_idx_to_eo_category[] = { 1, 2, 0, 3, 4 }; - - int eo_idx = 2 + SIGN3((int)c - (int)a) + SIGN3((int)c - (int)b); - - return sao_eo_idx_to_eo_category[eo_idx]; -} - - -int kvz_sao_band_ddistortion(const encoder_state_t * const state, const kvz_pixel *orig_data, const kvz_pixel *rec_data, - int block_width, int block_height, - int band_pos, int sao_bands[4]) -{ - int y, x; - int shift = state->encoder_control->bitdepth-5; - int sum = 0; - - for (y = 0; y < block_height; ++y) { - for (x = 0; x < block_width; ++x) { - int band = (rec_data[y * block_width + x] >> shift) - band_pos; - int offset = 0; - if (band >= 0 && band < 4) { - offset = sao_bands[band]; - } - if (offset != 0) { - int diff = orig_data[y * block_width + x] - rec_data[y * block_width + x]; - // Offset is applied to reconstruction, so it is subtracted from diff. 
- sum += (diff - offset) * (diff - offset) - diff * diff; - } - } - } - - return sum; -} - - -int kvz_sao_edge_ddistortion(const kvz_pixel *orig_data, const kvz_pixel *rec_data, - int block_width, int block_height, - int eo_class, int offsets[NUM_SAO_EDGE_CATEGORIES]) -{ - int y, x; - int sum = 0; - vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0]; - vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1]; - - for (y = 1; y < block_height - 1; ++y) { - for (x = 1; x < block_width - 1; ++x) { - const kvz_pixel *c_data = &rec_data[y * block_width + x]; - kvz_pixel a = c_data[a_ofs.y * block_width + a_ofs.x]; - kvz_pixel c = c_data[0]; - kvz_pixel b = c_data[b_ofs.y * block_width + b_ofs.x]; - - int offset = offsets[sao_calc_eo_cat(a, b, c)]; - - if (offset != 0) { - int diff = orig_data[y * block_width + x] - c; - // Offset is applied to reconstruction, so it is subtracted from diff. - sum += (diff - offset) * (diff - offset) - diff * diff; - } - } - } +#include <string.h> - return sum; -} +#include "cabac.h" +#include "image.h" +#include "rdo.h" +#include "strategies/strategies-sao.h" -void kvz_init_sao_info(sao_info_t *sao) { +static void init_sao_info(sao_info_t *sao) { sao->type = SAO_TYPE_NONE; sao->merge_left_flag = 0; sao->merge_up_flag = 0; @@ -240,7 +161,7 @@ /** * \brief calculate an array of intensity correlations for each intensity value */ -static void calc_sao_offset_array(const encoder_control_t * const encoder, const sao_info_t *sao, int *offset, color_t color_i) +void kvz_calc_sao_offset_array(const encoder_control_t * const encoder, const sao_info_t *sao, int *offset, color_t color_i) { int val; int values = (1<<encoder->bitdepth); @@ -341,78 +262,6 @@ /** - * \param orig_data Original pixel data. 64x64 for luma, 32x32 for chroma. - * \param rec_data Reconstructed pixel data. 64x64 for luma, 32x32 for chroma. - * \param dir_offsets - * \param is_chroma 0 for luma, 1 for chroma. 
Indicates - */ -static void calc_sao_edge_dir(const kvz_pixel *orig_data, const kvz_pixel *rec_data, - int eo_class, int block_width, int block_height, - int cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES]) -{ - int y, x; - vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0]; - vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1]; - // Arrays orig_data and rec_data are quarter size for chroma. - - // Don't sample the edge pixels because this function doesn't have access to - // their neighbours. - for (y = 1; y < block_height - 1; ++y) { - for (x = 1; x < block_width - 1; ++x) { - const kvz_pixel *c_data = &rec_data[y * block_width + x]; - kvz_pixel a = c_data[a_ofs.y * block_width + a_ofs.x]; - kvz_pixel c = c_data[0]; - kvz_pixel b = c_data[b_ofs.y * block_width + b_ofs.x]; - - int eo_cat = sao_calc_eo_cat(a, b, c); - - cat_sum_cnt[0][eo_cat] += orig_data[y * block_width + x] - c; - cat_sum_cnt[1][eo_cat] += 1; - } - } -} - -static void sao_reconstruct_color(const encoder_control_t * const encoder, - const kvz_pixel *rec_data, kvz_pixel *new_rec_data, - const sao_info_t *sao, - int stride, int new_stride, - int block_width, int block_height, - color_t color_i) -{ - int y, x; - // Arrays orig_data and rec_data are quarter size for chroma. - int offset_v = color_i == COLOR_V ? 5 : 0; - - if(sao->type == SAO_TYPE_BAND) { - int offsets[1<<KVZ_BIT_DEPTH]; - calc_sao_offset_array(encoder, sao, offsets, color_i); - for (y = 0; y < block_height; ++y) { - for (x = 0; x < block_width; ++x) { - new_rec_data[y * new_stride + x] = offsets[rec_data[y * stride + x]]; - } - } - } else { - // Don't sample the edge pixels because this function doesn't have access to - // their neighbours. 
- for (y = 0; y < block_height; ++y) { - for (x = 0; x < block_width; ++x) { - vector2d_t a_ofs = g_sao_edge_offsets[sao->eo_class][0]; - vector2d_t b_ofs = g_sao_edge_offsets[sao->eo_class][1]; - const kvz_pixel *c_data = &rec_data[y * stride + x]; - kvz_pixel *new_data = &new_rec_data[y * new_stride + x]; - kvz_pixel a = c_data[a_ofs.y * stride + a_ofs.x]; - kvz_pixel c = c_data[0]; - kvz_pixel b = c_data[b_ofs.y * stride + b_ofs.x]; - - int eo_cat = sao_calc_eo_cat(a, b, c); - - new_data[0] = (kvz_pixel)CLIP(0, (1 << KVZ_BIT_DEPTH) - 1, c_data[0] + sao->offsets[eo_cat + offset_v]); - } - } - } -} - -/** * \brief Calculate dimensions of the buffer used by sao reconstruction. * \param pic Picture. @@ -575,7 +424,7 @@ tl.y + block.y + br.y, pic_stride, buf_stride); - sao_reconstruct_color(encoder, &buf_rec[tl.y * buf_stride + tl.x], + kvz_sao_reconstruct_color(encoder, &buf_rec[tl.y * buf_stride + tl.x], &new_rec[(ofs.y + tl.y) * lcu_stride + ofs.x + tl.x], sao, buf_stride, lcu_stride, @@ -613,7 +462,7 @@ // Call calc_sao_edge_dir once for luma and twice for chroma. for (i = 0; i < buf_cnt; ++i) { FILL(cat_sum_cnt, 0); - calc_sao_edge_dir(data[i], recdata[i], edge_class, + kvz_calc_sao_edge_dir(data[i], recdata[i], edge_class, block_width, block_height, cat_sum_cnt); @@ -652,7 +501,7 @@ { float mode_bits = sao_mode_bits_edge(state, edge_class, edge_offset, sao_top, sao_left, buf_cnt); - sum_ddistortion += (int)((double)mode_bits*state->global->cur_lambda_cost+0.5); + sum_ddistortion += (int)((double)mode_bits*state->frame->cur_lambda_cost+0.5); } // SAO is not applied for category 0. 
edge_offset[SAO_EO_CAT0] = 0; @@ -696,7 +545,7 @@ } temp_rate = sao_mode_bits_band(state, sao_out->band_position, temp_offsets, sao_top, sao_left, buf_cnt); - ddistortion += (int)((double)temp_rate*state->global->cur_lambda_cost + 0.5); + ddistortion += (int)((double)temp_rate*state->frame->cur_lambda_cost + 0.5); // Select band sao over edge sao when distortion is lower if (ddistortion < sao_out->ddistortion) { @@ -725,8 +574,8 @@ sao_info_t edge_sao; sao_info_t band_sao; - kvz_init_sao_info(&edge_sao); - kvz_init_sao_info(&band_sao); + init_sao_info(&edge_sao); + init_sao_info(&band_sao); //Avoid "random" uninitialized value edge_sao.band_position[0] = edge_sao.band_position[1] = 0; @@ -740,7 +589,7 @@ { float mode_bits = sao_mode_bits_edge(state, edge_sao.eo_class, edge_sao.offsets, sao_top, sao_left, buf_cnt); - int ddistortion = (int)(mode_bits * state->global->cur_lambda_cost + 0.5); + int ddistortion = (int)(mode_bits * state->frame->cur_lambda_cost + 0.5); unsigned buf_i; for (buf_i = 0; buf_i < buf_cnt; ++buf_i) { @@ -754,7 +603,7 @@ { float mode_bits = sao_mode_bits_band(state, band_sao.band_position, band_sao.offsets, sao_top, sao_left, buf_cnt); - int ddistortion = (int)(mode_bits * state->global->cur_lambda_cost + 0.5); + int ddistortion = (int)(mode_bits * state->frame->cur_lambda_cost + 0.5); unsigned buf_i; for (buf_i = 0; buf_i < buf_cnt; ++buf_i) { @@ -777,7 +626,7 @@ // Choose between SAO and doing nothing, taking into account the // rate-distortion cost of coding do nothing. 
{ - int cost_of_nothing = (int)(sao_mode_bits_none(state, sao_top, sao_left) * state->global->cur_lambda_cost + 0.5); + int cost_of_nothing = (int)(sao_mode_bits_none(state, sao_top, sao_left) * state->frame->cur_lambda_cost + 0.5); if (sao_out->ddistortion >= cost_of_nothing) { sao_out->type = SAO_TYPE_NONE; merge_cost[0] = cost_of_nothing; @@ -794,7 +643,7 @@ if (merge_cand) { unsigned buf_i; float mode_bits = sao_mode_bits_merge(state, i + 1); - int ddistortion = (int)(mode_bits * state->global->cur_lambda_cost + 0.5); + int ddistortion = (int)(mode_bits * state->frame->cur_lambda_cost + 0.5); switch (merge_cand->type) { case SAO_TYPE_EDGE: @@ -824,7 +673,7 @@ return; } -void kvz_sao_search_chroma(const encoder_state_t * const state, const videoframe_t *frame, unsigned x_ctb, unsigned y_ctb, sao_info_t *sao, sao_info_t *sao_top, sao_info_t *sao_left, int32_t merge_cost[3]) +static void sao_search_chroma(const encoder_state_t * const state, const videoframe_t *frame, unsigned x_ctb, unsigned y_ctb, sao_info_t *sao, sao_info_t *sao_top, sao_info_t *sao_left, int32_t merge_cost[3]) { int block_width = (LCU_WIDTH / 2); int block_height = (LCU_WIDTH / 2); @@ -860,7 +709,7 @@ sao_search_best_mode(state, orig_list, rec_list, block_width, block_height, 2, sao, sao_top, sao_left, merge_cost); } -void kvz_sao_search_luma(const encoder_state_t * const state, const videoframe_t *frame, unsigned x_ctb, unsigned y_ctb, sao_info_t *sao, sao_info_t *sao_top, sao_info_t *sao_left, int32_t merge_cost[3]) +static void sao_search_luma(const encoder_state_t * const state, const videoframe_t *frame, unsigned x_ctb, unsigned y_ctb, sao_info_t *sao, sao_info_t *sao_top, sao_info_t *sao_left, int32_t merge_cost[3]) { kvz_pixel orig[LCU_LUMA_SIZE]; kvz_pixel rec[LCU_LUMA_SIZE]; @@ -890,6 +739,71 @@ sao_search_best_mode(state, orig_list, rec_list, block_width, block_height, 1, sao, sao_top, sao_left, merge_cost); } +void kvz_sao_search_lcu(const encoder_state_t* const state, int lcu_x, 
int lcu_y) +{ + assert(!state->encoder_control->cfg->lossless); + + videoframe_t* const frame = state->tile->frame; + const int stride = frame->width_in_lcu; + int32_t merge_cost_luma[3] = { INT32_MAX }; + int32_t merge_cost_chroma[3] = { INT32_MAX }; + sao_info_t *sao_luma = &frame->sao_luma[lcu_y * stride + lcu_x]; + sao_info_t *sao_chroma = NULL; + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + sao_chroma = &frame->sao_chroma[lcu_y * stride + lcu_x]; + } + + // Merge candidates + sao_info_t *sao_top_luma = lcu_y != 0 ? &frame->sao_luma [(lcu_y - 1) * stride + lcu_x] : NULL; + sao_info_t *sao_left_luma = lcu_x != 0 ? &frame->sao_luma [lcu_y * stride + lcu_x - 1] : NULL; + sao_info_t *sao_top_chroma = NULL; + sao_info_t *sao_left_chroma = NULL; + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + if (lcu_y != 0) sao_top_chroma = &frame->sao_chroma[(lcu_y - 1) * stride + lcu_x]; + if (lcu_x != 0) sao_left_chroma = &frame->sao_chroma[lcu_y * stride + lcu_x - 1]; + } + + sao_search_luma(state, frame, lcu_x, lcu_y, sao_luma, sao_top_luma, sao_left_luma, merge_cost_luma); + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + sao_search_chroma(state, frame, lcu_x, lcu_y, sao_chroma, sao_top_chroma, sao_left_chroma, merge_cost_chroma); + } else { + merge_cost_chroma[0] = 0; + merge_cost_chroma[1] = 0; + merge_cost_chroma[2] = 0; + } + + sao_luma->merge_up_flag = sao_luma->merge_left_flag = 0; + // Check merge costs + if (sao_top_luma) { + // Merge up if cost is equal or smaller to the searched mode cost + if (merge_cost_luma[2] + merge_cost_chroma[2] <= merge_cost_luma[0] + merge_cost_chroma[0]) { + *sao_luma = *sao_top_luma; + if (sao_top_chroma) *sao_chroma = *sao_top_chroma; + sao_luma->merge_up_flag = 1; + sao_luma->merge_left_flag = 0; + } + } + if (sao_left_luma) { + // Merge left if cost is equal or smaller to the searched mode cost + // AND smaller than merge up cost, if merge up was already chosen + if (merge_cost_luma[1] + 
merge_cost_chroma[1] <= merge_cost_luma[0] + merge_cost_chroma[0]) { + if (!sao_luma->merge_up_flag || merge_cost_luma[1] + merge_cost_chroma[1] < merge_cost_luma[2] + merge_cost_chroma[2]) { + *sao_luma = *sao_left_luma; + if (sao_left_chroma) *sao_chroma = *sao_left_chroma; + sao_luma->merge_left_flag = 1; + sao_luma->merge_up_flag = 0; + } + } + } + assert(sao_luma->eo_class < SAO_NUM_EO); + CHECKPOINT_SAO_INFO("sao_luma", *sao_luma); + + if (sao_chroma) { + assert(sao_chroma->eo_class < SAO_NUM_EO); + CHECKPOINT_SAO_INFO("sao_chroma", *sao_chroma); + } +} + void kvz_sao_reconstruct_frame(encoder_state_t * const state) { vector2d_t lcu; @@ -899,27 +813,36 @@ // top LCUs. Single pixel wide buffers, like what kvz_search_lcu takes, would // be enough though. kvz_pixel *new_y_data = MALLOC(kvz_pixel, frame->rec->width * frame->rec->height); - kvz_pixel *new_u_data = MALLOC(kvz_pixel, (frame->rec->width * frame->rec->height) >> 2); - kvz_pixel *new_v_data = MALLOC(kvz_pixel, (frame->rec->width * frame->rec->height) >> 2); - kvz_pixels_blit(frame->rec->y, new_y_data, frame->rec->width, frame->rec->height, frame->rec->stride, frame->rec->width); - kvz_pixels_blit(frame->rec->u, new_u_data, frame->rec->width/2, frame->rec->height/2, frame->rec->stride/2, frame->rec->width/2); - kvz_pixels_blit(frame->rec->v, new_v_data, frame->rec->width/2, frame->rec->height/2, frame->rec->stride/2, frame->rec->width/2); - for (lcu.y = 0; lcu.y < frame->height_in_lcu; lcu.y++) { for (lcu.x = 0; lcu.x < frame->width_in_lcu; lcu.x++) { unsigned stride = frame->width_in_lcu; sao_info_t *sao_luma = &frame->sao_luma[lcu.y * stride + lcu.x]; - sao_info_t *sao_chroma = &frame->sao_chroma[lcu.y * stride + lcu.x]; - + // sao_do_rdo(encoder, lcu.x, lcu.y, sao_luma, sao_chroma); kvz_sao_reconstruct(state->encoder_control, frame, new_y_data, lcu.x, lcu.y, sao_luma, COLOR_Y); - kvz_sao_reconstruct(state->encoder_control, frame, new_u_data, lcu.x, lcu.y, sao_chroma, COLOR_U); - 
kvz_sao_reconstruct(state->encoder_control, frame, new_v_data, lcu.x, lcu.y, sao_chroma, COLOR_V); } } - free(new_y_data); - free(new_u_data); - free(new_v_data); + + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + kvz_pixel *new_u_data = MALLOC(kvz_pixel, (frame->rec->width * frame->rec->height) >> 2); + kvz_pixel *new_v_data = MALLOC(kvz_pixel, (frame->rec->width * frame->rec->height) >> 2); + + kvz_pixels_blit(frame->rec->u, new_u_data, frame->rec->width / 2, frame->rec->height / 2, frame->rec->stride / 2, frame->rec->width / 2); + kvz_pixels_blit(frame->rec->v, new_v_data, frame->rec->width / 2, frame->rec->height / 2, frame->rec->stride / 2, frame->rec->width / 2); + + for (lcu.y = 0; lcu.y < frame->height_in_lcu; lcu.y++) { + for (lcu.x = 0; lcu.x < frame->width_in_lcu; lcu.x++) { + unsigned stride = frame->width_in_lcu; + sao_info_t *sao_chroma = &frame->sao_chroma[lcu.y * stride + lcu.x]; + + kvz_sao_reconstruct(state->encoder_control, frame, new_u_data, lcu.x, lcu.y, sao_chroma, COLOR_U); + kvz_sao_reconstruct(state->encoder_control, frame, new_v_data, lcu.x, lcu.y, sao_chroma, COLOR_V); + } + } + + free(new_u_data); + free(new_v_data); + } }
kvazaar-0.8.3.tar.gz/src/sao.h -> kvazaar-1.0.0.tar.gz/src/sao.h
Changed
@@ -26,14 +26,14 @@
  * Sample Adaptive Offset filter.
  */
 
-#include "global.h"
-
 #include "checkpoint.h"
-#include "global.h"
-#include "videoframe.h"
+#include "cu.h"
 #include "encoder.h"
 #include "encoderstate.h"
-#include "math.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
+#include "videoframe.h"
+
 
 typedef enum { SAO_TYPE_NONE = 0, SAO_TYPE_BAND, SAO_TYPE_EDGE } sao_type;
 typedef enum { SAO_EO0 = 0, SAO_EO1, SAO_EO2, SAO_EO3, SAO_NUM_EO } sao_eo_class;
@@ -50,6 +50,20 @@
   int offsets[NUM_SAO_EDGE_CATEGORIES * 2];
 } sao_info_t;
 
+
+// Offsets of a and b in relation to c.
+// dir_offset[dir][a or b]
+// |       | a | a | a |
+// | a c b | c | c | c |
+// |       | b | b | b |
+static const vector2d_t g_sao_edge_offsets[SAO_NUM_EO][2] = {
+  { { -1, 0 }, { 1, 0 } },
+  { { 0, -1 }, { 0, 1 } },
+  { { -1, -1 }, { 1, 1 } },
+  { { 1, -1 }, { -1, 1 } }
+};
+
+
 #define CHECKPOINT_SAO_INFO(prefix_str, sao) CHECKPOINT(prefix_str " type=%d eo_class=%d ddistortion=%d " \
   "merge_left_flag=%d merge_up_flag=%d band_position=%d " \
   "offsets[0]=%d offsets[1]=%d offsets[2]=%d offsets[3]=%d offsets[4]=%d", \
@@ -58,12 +72,11 @@
   (sao).offsets[0], (sao).offsets[1], (sao).offsets[2], (sao).offsets[3], (sao).offsets[4])
 
 
-void kvz_init_sao_info(sao_info_t *sao);
-void kvz_sao_search_chroma(const encoder_state_t * state, const videoframe_t *frame, unsigned x_ctb, unsigned y_ctb, sao_info_t *sao, sao_info_t *sao_top, sao_info_t *sao_left, int32_t merge_cost[3]);
-void kvz_sao_search_luma(const encoder_state_t * state, const videoframe_t *frame, unsigned x_ctb, unsigned y_ctb, sao_info_t *sao, sao_info_t *sao_top, sao_info_t *sao_left, int32_t merge_cost[3]);
 void kvz_sao_reconstruct(const encoder_control_t * encoder, videoframe_t *frame, const kvz_pixel *old_rec,
                          unsigned x_ctb, unsigned y_ctb,
                          const sao_info_t *sao, color_t color_i);
 void kvz_sao_reconstruct_frame(encoder_state_t *state);
+void kvz_sao_search_lcu(const encoder_state_t* const state, int lcu_x, int lcu_y);
+void kvz_calc_sao_offset_array(const encoder_control_t * const encoder, const sao_info_t *sao, int *offset, color_t color_i);
 
 #endif
kvazaar-0.8.3.tar.gz/src/scalinglist.c -> kvazaar-1.0.0.tar.gz/src/scalinglist.c
Changed
@@ -22,8 +22,9 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "tables.h"
 #include "scalinglist.h"
+#include "tables.h"
+
 
 const uint8_t kvz_g_scaling_list_num[4] = { 6, 6, 6, 2};
 const uint16_t kvz_g_scaling_list_size[4] = { 16, 64, 256,1024};
kvazaar-0.8.3.tar.gz/src/scalinglist.h -> kvazaar-1.0.0.tar.gz/src/scalinglist.h
Changed
@@ -26,7 +26,10 @@
  * Scaling list initialization.
  */
 
-#include "global.h"
+#include <stdio.h>
+
+#include "global.h" // IWYU pragma: keep
+
 
 typedef struct {
   int8_t enable;
@@ -53,4 +56,4 @@
 
 
 
-#endif
\ No newline at end of file
+#endif
kvazaar-0.8.3.tar.gz/src/search.c -> kvazaar-1.0.0.tar.gz/src/search.c
Changed
@@ -20,18 +20,22 @@ #include "search.h" -#include <stdio.h> -#include <stdlib.h> +#include <limits.h> #include <string.h> -#include <assert.h> -#include "intra.h" +#include "cabac.h" +#include "encoder.h" +#include "imagelist.h" #include "inter.h" +#include "intra.h" +#include "kvazaar.h" #include "rdo.h" -#include "transform.h" #include "search_inter.h" #include "search_intra.h" -#include "strategies/strategies-picture.h" +#include "threadqueue.h" +#include "transform.h" +#include "videoframe.h" + #define IN_FRAME(x, y, width, height, block_width, block_height) \ ((x) >= 0 && (y) >= 0 \ @@ -43,10 +47,7 @@ # define INTRA_TRESHOLD 20 #endif -// Disable early cu-split pruning. -#ifndef FULL_CU_SPLIT_SEARCH -# define FULL_CU_SPLIT_SEARCH false -#endif + // Modify weight of luma SSD. #ifndef LUMA_MULT # define LUMA_MULT 0.8 @@ -66,14 +67,13 @@ // Copy non-reference CUs. { - const int x_cu = SUB_SCU(x_px) >> MAX_DEPTH; - const int y_cu = SUB_SCU(y_px) >> MAX_DEPTH; - const int width_cu = LCU_WIDTH >> MAX_DEPTH >> depth; - int x, y; - for (y = y_cu; y < y_cu + width_cu; ++y) { - for (x = x_cu; x < x_cu + width_cu; ++x) { - const cu_info_t *from_cu = LCU_GET_CU(&work_tree[depth + 1], x, y); - cu_info_t *to_cu = LCU_GET_CU(&work_tree[depth], x, y); + const int x_orig = SUB_SCU(x_px); + const int y_orig = SUB_SCU(y_px); + const int width_cu = LCU_WIDTH >> depth; + for (int y = y_orig; y < y_orig + width_cu; y += SCU_WIDTH) { + for (int x = x_orig; x < x_orig + width_cu; x += SCU_WIDTH) { + const cu_info_t *from_cu = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x, y); + cu_info_t *to_cu = LCU_GET_CU_AT_PX(&work_tree[depth], x, y); memcpy(to_cu, from_cu, sizeof(*to_cu)); } } @@ -94,20 +94,24 @@ lcu_coeff_t *to_coeff = &work_tree[depth].coeff; kvz_pixels_blit(&from->y[luma_index], &to->y[luma_index], - width_px, width_px, LCU_WIDTH, LCU_WIDTH); - kvz_pixels_blit(&from->u[chroma_index], &to->u[chroma_index], - width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); - 
kvz_pixels_blit(&from->v[chroma_index], &to->v[chroma_index], - width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); + width_px, width_px, LCU_WIDTH, LCU_WIDTH); + if (from->chroma_format != KVZ_CSP_400) { + kvz_pixels_blit(&from->u[chroma_index], &to->u[chroma_index], + width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); + kvz_pixels_blit(&from->v[chroma_index], &to->v[chroma_index], + width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); + } // Copy coefficients up. They do not have to be copied down because they // are not used for the search. kvz_coefficients_blit(&from_coeff->y[luma_index], &to_coeff->y[luma_index], - width_px, width_px, LCU_WIDTH, LCU_WIDTH); - kvz_coefficients_blit(&from_coeff->u[chroma_index], &to_coeff->u[chroma_index], - width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); - kvz_coefficients_blit(&from_coeff->v[chroma_index], &to_coeff->v[chroma_index], - width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); + width_px, width_px, LCU_WIDTH, LCU_WIDTH); + if (from->chroma_format != KVZ_CSP_400) { + kvz_coefficients_blit(&from_coeff->u[chroma_index], &to_coeff->u[chroma_index], + width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); + kvz_coefficients_blit(&from_coeff->v[chroma_index], &to_coeff->v[chroma_index], + width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); + } } } @@ -125,15 +129,13 @@ int d; for (d = depth + 1; d < MAX_PU_DEPTH + 1; ++d) { - const int x_cu = SUB_SCU(x_px) >> MAX_DEPTH; - const int y_cu = SUB_SCU(y_px) >> MAX_DEPTH; - const int width_cu = width_px >> MAX_DEPTH; - - int x, y; - for (y = y_cu; y < y_cu + width_cu; ++y) { - for (x = x_cu; x < x_cu + width_cu; ++x) { - const cu_info_t *from_cu = LCU_GET_CU(&work_tree[depth], x, y); - cu_info_t *to_cu = LCU_GET_CU(&work_tree[d], x, y); + const int x_orig = SUB_SCU(x_px); + const int y_orig = SUB_SCU(y_px); + + for (int y = y_orig; y < y_orig + width_px; y += SCU_WIDTH) { + for (int x = x_orig; x < x_orig + width_px; x += 
SCU_WIDTH) { + const cu_info_t *from_cu = LCU_GET_CU_AT_PX(&work_tree[depth], x, y); + cu_info_t *to_cu = LCU_GET_CU_AT_PX(&work_tree[d], x, y); memcpy(to_cu, from_cu, sizeof(*to_cu)); } } @@ -151,27 +153,28 @@ lcu_yuv_t *to = &work_tree[d].rec; kvz_pixels_blit(&from->y[luma_index], &to->y[luma_index], - width_px, width_px, LCU_WIDTH, LCU_WIDTH); - kvz_pixels_blit(&from->u[chroma_index], &to->u[chroma_index], - width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); - kvz_pixels_blit(&from->v[chroma_index], &to->v[chroma_index], - width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); + width_px, width_px, LCU_WIDTH, LCU_WIDTH); + if (from->chroma_format != KVZ_CSP_400) { + kvz_pixels_blit(&from->u[chroma_index], &to->u[chroma_index], + width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); + kvz_pixels_blit(&from->v[chroma_index], &to->v[chroma_index], + width_px / 2, width_px / 2, LCU_WIDTH / 2, LCU_WIDTH / 2); + } } } void kvz_lcu_set_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth) { - const int width_cu = LCU_CU_WIDTH >> depth; - const vector2d_t lcu_cu = { SUB_SCU(x_px) / 8, SUB_SCU(y_px) / 8 }; - int x, y; + const int width = LCU_WIDTH >> depth; + const vector2d_t lcu_cu = { SUB_SCU(x_px), SUB_SCU(y_px) }; // Depth 4 doesn't go inside the loop. Set the top-left CU. 
- LCU_GET_CU(lcu, lcu_cu.x, lcu_cu.y)->tr_depth = tr_depth; + LCU_GET_CU_AT_PX(lcu, lcu_cu.x, lcu_cu.y)->tr_depth = tr_depth; - for (y = 0; y < width_cu; ++y) { - for (x = 0; x < width_cu; ++x) { - cu_info_t *cu = LCU_GET_CU(lcu, lcu_cu.x + x, lcu_cu.y + y); + for (unsigned y = 0; y < width; y += SCU_WIDTH) { + for (unsigned x = 0; x < width; x += SCU_WIDTH) { + cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, lcu_cu.x + x, lcu_cu.y + y); cu->tr_depth = tr_depth; } } @@ -180,48 +183,41 @@ static void lcu_set_intra_mode(lcu_t *lcu, int x_px, int y_px, int depth, int pred_mode, int chroma_mode, int part_mode) { - const int width_cu = LCU_CU_WIDTH >> depth; - const int x_cu = SUB_SCU(x_px) >> MAX_DEPTH; - const int y_cu = SUB_SCU(y_px) >> MAX_DEPTH; - int x, y; + const int width = LCU_WIDTH >> depth; + const int x_cu = SUB_SCU(x_px); + const int y_cu = SUB_SCU(y_px); - // NxN can only be applied to a single CU at a time. if (part_mode == SIZE_NxN) { - cu_info_t *cu = LCU_GET_CU(lcu, x_cu, y_cu); - cu->depth = MAX_DEPTH; - cu->type = CU_INTRA; - cu->intra[PU_INDEX(x_px / 4, y_px / 4)].mode = pred_mode; - cu->intra[PU_INDEX(x_px / 4, y_px / 4)].mode_chroma = chroma_mode; - cu->part_size = part_mode; - return; + assert(depth == MAX_DEPTH + 1); + assert(width == SCU_WIDTH); + } + + if (depth > MAX_DEPTH) { + depth = MAX_DEPTH; + assert(part_mode == SIZE_NxN); } // Set mode in every CU covered by part_mode in this depth. 
- for (y = y_cu; y < y_cu + width_cu; ++y) { - for (x = x_cu; x < x_cu + width_cu; ++x) { - cu_info_t *cu = LCU_GET_CU(lcu, x, y); + for (int y = y_cu; y < y_cu + width; y += SCU_WIDTH) { + for (int x = x_cu; x < x_cu + width; x += SCU_WIDTH) { + cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x, y); cu->depth = depth; cu->type = CU_INTRA; - cu->intra[0].mode = pred_mode; - cu->intra[1].mode = pred_mode; - cu->intra[2].mode = pred_mode; - cu->intra[3].mode = pred_mode; - cu->intra[0].mode_chroma = chroma_mode; + cu->intra.mode = pred_mode; + cu->intra.mode_chroma = chroma_mode; cu->part_size = part_mode; - cu->coded = 1; } } } -static void lcu_set_inter_pu(lcu_t *lcu, int x_pu, int y_pu, int width_pu, int height_pu, cu_info_t *cur_pu) +static void lcu_set_inter_pu(lcu_t *lcu, int x_px, int y_px, int width, int height, cu_info_t *cur_pu) { // Set mode in every CU covered by part_mode in this depth. - for (int y = y_pu; y < y_pu + height_pu; ++y) { - for (int x = x_pu; x < x_pu + width_pu; ++x) { - cu_info_t *cu = LCU_GET_CU(lcu, x, y); + for (int y = y_px; y < y_px + height; y += SCU_WIDTH) { + for (int x = x_px; x < x_px + width; x += SCU_WIDTH) { + cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x, y); //Check if this could be moved inside the if - cu->coded = 1; if (cu != cur_pu) { cu->depth = cur_pu->depth; cu->part_size = cur_pu->part_size; @@ -238,17 +234,17 @@ static void lcu_set_inter(lcu_t *lcu, int x_px, int y_px, int depth, cu_info_t *cur_cu) { - const int width_cu = LCU_CU_WIDTH >> depth; - const int x_cu = SUB_SCU(x_px) >> MAX_DEPTH; - const int y_cu = SUB_SCU(y_px) >> MAX_DEPTH; + const int width = LCU_WIDTH >> depth; + const int x_local = SUB_SCU(x_px); + const int y_local = SUB_SCU(y_px); const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size]; for (int i = 0; i < num_pu; ++i) { - const int x_pu = PU_GET_X(cur_cu->part_size, width_cu, x_cu, i); - const int y_pu = PU_GET_Y(cur_cu->part_size, width_cu, y_cu, i); - const int width_pu = PU_GET_W(cur_cu->part_size, 
width_cu, i); - const int height_pu = PU_GET_H(cur_cu->part_size, width_cu, i); - cu_info_t *cur_pu = LCU_GET_CU(lcu, x_pu, y_pu); + const int x_pu = PU_GET_X(cur_cu->part_size, width, x_local, i); + const int y_pu = PU_GET_Y(cur_cu->part_size, width, y_local, i); + const int width_pu = PU_GET_W(cur_cu->part_size, width, i); + const int height_pu = PU_GET_H(cur_cu->part_size, width, i); + cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu); lcu_set_inter_pu(lcu, x_pu, y_pu, width_pu, height_pu, cur_pu); } } @@ -256,22 +252,21 @@ static void lcu_set_coeff(lcu_t *lcu, int x_px, int y_px, int depth, cu_info_t *cur_cu) { - const int width_cu = LCU_CU_WIDTH >> depth; - const int x_cu = SUB_SCU(x_px) >> MAX_DEPTH; - const int y_cu = SUB_SCU(y_px) >> MAX_DEPTH; - int x, y; - int tr_split = cur_cu->tr_depth-cur_cu->depth; + const uint32_t width = LCU_WIDTH >> depth; + const uint32_t x_local = SUB_SCU(x_px); + const uint32_t y_local = SUB_SCU(y_px); + const uint32_t tr_split = cur_cu->tr_depth-cur_cu->depth; + const uint32_t mask = ~((width >> tr_split)-1); // Set coeff flags in every CU covered by part_mode in this depth. 
- for (y = y_cu; y < y_cu + width_cu; ++y) { - for (x = x_cu; x < x_cu + width_cu; ++x) { - cu_info_t *cu = LCU_GET_CU(lcu, x, y); + for (uint32_t y = y_local; y < y_local + width; y += SCU_WIDTH) { + for (uint32_t x = x_local; x < x_local + width; x += SCU_WIDTH) { + cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, x, y); // Use TU top-left CU to propagate coeff flags - uint32_t mask = ~((width_cu>>tr_split)-1); - cu_info_t *cu_from = LCU_GET_CU(lcu, x & mask, y & mask); + cu_info_t *cu_from = LCU_GET_CU_AT_PX(lcu, x & mask, y & mask); if (cu != cu_from) { // Chroma coeff data is not used, luma is needed for deblocking - cu->cbf.y = cu_from->cbf.y; + cbf_copy(&cu->cbf, cu_from->cbf, COLOR_Y); } } } @@ -293,7 +288,6 @@ lcu_t *const lcu) { const int width = LCU_WIDTH >> depth; - const uint8_t pu_index = PU_INDEX(x_px / 4, y_px / 4); // cur_cu is used for TU parameters. cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); @@ -326,31 +320,31 @@ sum += kvz_cu_rd_cost_luma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); sum += kvz_cu_rd_cost_luma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); - return sum + tr_tree_bits * state->global->cur_lambda_cost; + return sum + tr_tree_bits * state->frame->cur_lambda_cost; } // Add transform_tree cbf_luma bit cost. 
if (pred_cu->type == CU_INTRA || tr_depth > 0 || - cbf_is_set(tr_cu->cbf.u, depth) || - cbf_is_set(tr_cu->cbf.v, depth)) + cbf_is_set(tr_cu->cbf, depth, COLOR_U) || + cbf_is_set(tr_cu->cbf, depth, COLOR_V)) { const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_luma[!tr_depth]); - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.y, depth + pu_index)); + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_Y)); } - unsigned ssd = 0; // SSD between reconstruction and original - for (int y = y_px; y < y_px + width; ++y) { - for (int x = x_px; x < x_px + width; ++x) { - int diff = (int)lcu->rec.y[y * LCU_WIDTH + x] - (int)lcu->ref.y[y * LCU_WIDTH + x]; - ssd += diff*diff; - } + int ssd = 0; + if (!state->encoder_control->cfg->lossless) { + int index = y_px * LCU_WIDTH + x_px; + ssd = kvz_pixels_calc_ssd(&lcu->ref.y[index], &lcu->rec.y[index], + LCU_WIDTH, LCU_WIDTH, + width); } { coeff_t coeff_temp[32 * 32]; - int8_t luma_scan_mode = kvz_get_scan_order(pred_cu->type, pred_cu->intra[PU_INDEX(x_px / 4, y_px / 4)].mode, depth); + int8_t luma_scan_mode = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode, depth); // Code coeffs using cabac to get a better estimate of real coding costs. kvz_coefficients_blit(&lcu->coeff.y[(y_px*LCU_WIDTH) + x_px], coeff_temp, width, width, LCU_WIDTH, width); @@ -358,7 +352,7 @@ } double bits = tr_tree_bits + coeff_bits; - return (double)ssd * LUMA_MULT + bits * state->global->cur_lambda_cost; + return (double)ssd * LUMA_MULT + bits * state->frame->cur_lambda_cost; } @@ -369,7 +363,7 @@ { const vector2d_t lcu_px = { x_px / 2, y_px / 2 }; const int width = (depth <= MAX_DEPTH) ? 
LCU_WIDTH >> (depth + 1) : LCU_WIDTH >> depth; - cu_info_t *const tr_cu = LCU_GET_CU(lcu, lcu_px.x / 4, lcu_px.y / 4); + cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, x_px, y_px); double tr_tree_bits = 0; double coeff_bits = 0; @@ -377,7 +371,7 @@ assert(x_px >= 0 && x_px < LCU_WIDTH); assert(y_px >= 0 && y_px < LCU_WIDTH); - if (PU_INDEX(x_px / 4, y_px / 4) != 0) { + if (x_px % 8 != 0 || y_px % 8 != 0) { // For MAX_PU_DEPTH calculate chroma for previous depth for the first // block and return 0 cost for all others. return 0; @@ -386,11 +380,11 @@ if (depth < MAX_PU_DEPTH) { const int tr_depth = depth - pred_cu->depth; const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_chroma[tr_depth]); - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.u, depth - 1)) { - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.u, depth)); + if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) { + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U)); } - if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.v, depth - 1)) { - tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.v, depth)); + if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) { + tr_tree_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V)); } } @@ -403,27 +397,25 @@ sum += kvz_cu_rd_cost_chroma(state, x_px, y_px + offset, depth + 1, pred_cu, lcu); sum += kvz_cu_rd_cost_chroma(state, x_px + offset, y_px + offset, depth + 1, pred_cu, lcu); - return sum + tr_tree_bits * state->global->cur_lambda_cost; + return sum + tr_tree_bits * state->frame->cur_lambda_cost; } // Chroma SSD int ssd = 0; - for (int y = lcu_px.y; y < lcu_px.y + width; ++y) { - for (int x = lcu_px.x; x < lcu_px.x + width; ++x) { - int diff = (int)lcu->rec.u[y * LCU_WIDTH_C + x] - (int)lcu->ref.u[y * LCU_WIDTH_C + x]; - ssd += diff * diff; - } - } - for (int y = lcu_px.y; y < lcu_px.y + width; ++y) { - for (int x = lcu_px.x; x < lcu_px.x + width; ++x) { - int diff = 
(int)lcu->rec.v[y * LCU_WIDTH_C + x] - (int)lcu->ref.v[y * LCU_WIDTH_C + x]; - ssd += diff * diff; - } + if (!state->encoder_control->cfg->lossless) { + int index = lcu_px.y * LCU_WIDTH_C + lcu_px.x; + int ssd_u = kvz_pixels_calc_ssd(&lcu->ref.u[index], &lcu->rec.u[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width); + int ssd_v = kvz_pixels_calc_ssd(&lcu->ref.v[index], &lcu->rec.v[index], + LCU_WIDTH_C, LCU_WIDTH_C, + width); + ssd = ssd_u + ssd_v; } { coeff_t coeff_temp[16 * 16]; - int8_t scan_order = kvz_get_scan_order(pred_cu->type, pred_cu->intra[0].mode_chroma, depth); + int8_t scan_order = kvz_get_scan_order(pred_cu->type, pred_cu->intra.mode_chroma, depth); kvz_coefficients_blit(&lcu->coeff.u[(lcu_px.y*(LCU_WIDTH_C)) + lcu_px.x], coeff_temp, width, width, LCU_WIDTH_C, width); @@ -435,31 +427,32 @@ } double bits = tr_tree_bits + coeff_bits; - return (double)ssd * CHROMA_MULT + bits * state->global->cur_lambda_cost; + return (double)ssd * CHROMA_MULT + bits * state->frame->cur_lambda_cost; } // Return estimate of bits used to code prediction mode of cur_cu. static double calc_mode_bits(const encoder_state_t *state, + const lcu_t *lcu, const cu_info_t * cur_cu, int x, int y) { - double mode_bits; - - if (cur_cu->type == CU_INTER) { - mode_bits = cur_cu->inter.bitcost; - } else { - int8_t candidate_modes[3]; - { - const cu_info_t *left_cu = ((x > 8) ? CU_GET_CU(cur_cu, -1, 0) : NULL); - const cu_info_t *above_cu = ((y > 8) ? 
CU_GET_CU(cur_cu, 0, -1) : NULL); - kvz_intra_get_dir_luma_predictor(x, y, candidate_modes, cur_cu, left_cu, above_cu); - } + int x_local = SUB_SCU(x); + int y_local = SUB_SCU(y); - mode_bits = kvz_luma_mode_bits(state, cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].mode, candidate_modes); - if (PU_INDEX(x >> 2, y >> 2) == 0) { - mode_bits += kvz_chroma_mode_bits(state, cur_cu->intra[0].mode_chroma, cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].mode); - } + assert(cur_cu->type == CU_INTRA); + + int8_t candidate_modes[3]; + { + const cu_info_t *left_cu = ((x >= SCU_WIDTH) ? LCU_GET_CU_AT_PX(lcu, x_local - SCU_WIDTH, y_local) : NULL); + const cu_info_t *above_cu = ((y >= SCU_WIDTH) ? LCU_GET_CU_AT_PX(lcu, x_local, y_local - SCU_WIDTH) : NULL); + kvz_intra_get_dir_luma_predictor(x, y, candidate_modes, cur_cu, left_cu, above_cu); + } + + double mode_bits = kvz_luma_mode_bits(state, cur_cu->intra.mode, candidate_modes); + + if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != KVZ_CSP_400) { + mode_bits += kvz_chroma_mode_bits(state, cur_cu->intra.mode_chroma, cur_cu->intra.mode); } return mode_bits; @@ -468,9 +461,9 @@ static uint8_t get_ctx_cu_split_model(const lcu_t *lcu, int x, int y, int depth) { - vector2d_t lcu_cu = { SUB_SCU(x) / 8, SUB_SCU(y) / 8 }; - bool condA = x >= 8 && LCU_GET_CU(lcu, lcu_cu.x - 1, lcu_cu.y )->depth > depth; - bool condL = y >= 8 && LCU_GET_CU(lcu, lcu_cu.x, lcu_cu.y - 1)->depth > depth; + vector2d_t lcu_cu = { SUB_SCU(x), SUB_SCU(y) }; + bool condA = x >= 8 && LCU_GET_CU_AT_PX(lcu, lcu_cu.x - 1, lcu_cu.y )->depth > depth; + bool condL = y >= 8 && LCU_GET_CU_AT_PX(lcu, lcu_cu.x, lcu_cu.y - 1)->depth > depth; return condA + condL; } @@ -490,6 +483,7 @@ const videoframe_t * const frame = state->tile->frame; int cu_width = LCU_WIDTH >> depth; double cost = MAX_INT; + uint32_t inter_bitcost = MAX_INT; cu_info_t *cur_cu; lcu_t *const lcu = &work_tree[depth]; @@ -520,40 +514,47 @@ { bool can_use_inter = - state->global->slicetype != 
KVZ_SLICE_I + state->frame->slicetype != KVZ_SLICE_I && WITHIN(depth, ctrl->pu_depth_inter.min, ctrl->pu_depth_inter.max); if (can_use_inter) { - int mode_cost = kvz_search_cu_inter(state, x, y, depth, &work_tree[depth]); + double mode_cost; + uint32_t mode_bitcost; + kvz_search_cu_inter(state, + x, y, + depth, + &work_tree[depth], + &mode_cost, &mode_bitcost); if (mode_cost < cost) { cost = mode_cost; + inter_bitcost = mode_bitcost; cur_cu->type = CU_INTER; } - if (depth < MAX_DEPTH) { - // Try SMP and AMP partitioning. - static const part_mode_t mp_modes[] = { - // SMP - SIZE_2NxN, SIZE_Nx2N, - // AMP - SIZE_2NxnU, SIZE_2NxnD, - SIZE_nLx2N, SIZE_nRx2N, - }; - - const int first_mode = ctrl->cfg->smp_enable ? 0 : 2; - const int last_mode = (ctrl->cfg->amp_enable && cu_width >= 32) ? 5 : 1; - for (int i = first_mode; i <= last_mode; ++i) { - mode_cost = kvz_search_cu_smp(state, - x, y, - depth, - mp_modes[i], - &work_tree[depth + 1]); - // TODO: take cost of coding part mode into account - if (mode_cost < cost) { - cost = mode_cost; - // TODO: only copy inter prediction info, not pixels - work_tree_copy_up(x, y, depth, work_tree); - } + // Try SMP and AMP partitioning. + static const part_mode_t mp_modes[] = { + // SMP + SIZE_2NxN, SIZE_Nx2N, + // AMP + SIZE_2NxnU, SIZE_2NxnD, + SIZE_nLx2N, SIZE_nRx2N, + }; + + const int first_mode = ctrl->cfg->smp_enable ? 0 : 2; + const int last_mode = (ctrl->cfg->amp_enable && cu_width >= 16) ? 
5 : 1; + for (int i = first_mode; i <= last_mode; ++i) { + kvz_search_cu_smp(state, + x, y, + depth, + mp_modes[i], + &work_tree[depth + 1], + &mode_cost, &mode_bitcost); + // TODO: take cost of coding part mode into account + if (mode_cost < cost) { + cost = mode_cost; + inter_bitcost = mode_bitcost; + // TODO: only copy inter prediction info, not pixels + work_tree_copy_up(x, y, depth, work_tree); } } } @@ -564,14 +565,18 @@ bool skip_intra = state->encoder_control->rdo == 0 && cur_cu->type != CU_NOTSET && cost / (cu_width * cu_width) < INTRA_TRESHOLD; - if (!skip_intra + if (!skip_intra && WITHIN(depth, ctrl->pu_depth_intra.min, ctrl->pu_depth_intra.max)) { - double mode_cost = kvz_search_cu_intra(state, x, y, depth, &work_tree[depth]); - if (mode_cost < cost) { - cost = mode_cost; + int8_t intra_mode; + double intra_cost; + kvz_search_cu_intra(state, x, y, depth, &work_tree[depth], + &intra_mode, &intra_cost); + if (intra_cost < cost) { + cost = intra_cost; cur_cu->type = CU_INTRA; cur_cu->part_size = depth > MAX_DEPTH ? SIZE_NxN : SIZE_2Nx2N; + cur_cu->intra.mode = intra_mode; } } @@ -579,14 +584,14 @@ // mode search of adjacent CUs. 
if (cur_cu->type == CU_INTRA) { assert(cur_cu->part_size == SIZE_2Nx2N || cur_cu->part_size == SIZE_NxN); - int8_t intra_mode = cur_cu->intra[PU_INDEX(x >> 2, y >> 2)].mode; + int8_t intra_mode = cur_cu->intra.mode; lcu_set_intra_mode(&work_tree[depth], x, y, depth, intra_mode, intra_mode, cur_cu->part_size); kvz_intra_recon_lcu_luma(state, x, y, depth, intra_mode, NULL, &work_tree[depth]); - if (PU_INDEX(x >> 2, y >> 2) == 0) { + if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != KVZ_CSP_400) { int8_t intra_mode_chroma = intra_mode; // There is almost no benefit to doing the chroma mode search for @@ -621,8 +626,8 @@ if (cur_pu->inter.mv_dir == 3) { const kvz_picture *const refs[2] = { - state->global->ref->images[cur_pu->inter.mv_ref[0]], - state->global->ref->images[cur_pu->inter.mv_ref[1]], + state->frame->ref->images[cur_pu->inter.mv_ref[0]], + state->frame->ref->images[cur_pu->inter.mv_ref[1]], }; kvz_inter_recon_lcu_bipred(state, refs[0], refs[1], @@ -633,7 +638,7 @@ } else { const int mv_idx = cur_pu->inter.mv_dir - 1; const kvz_picture *const ref = - state->global->ref->images[cur_pu->inter.mv_ref[mv_idx]]; + state->frame->ref->images[cur_pu->inter.mv_ref[mv_idx]]; kvz_inter_recon_lcu(state, ref, pu_x, pu_y, @@ -645,16 +650,18 @@ } kvz_quantize_lcu_luma_residual(state, x, y, depth, NULL, &work_tree[depth]); - kvz_quantize_lcu_chroma_residual(state, x, y, depth, NULL, &work_tree[depth]); + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + kvz_quantize_lcu_chroma_residual(state, x, y, depth, NULL, &work_tree[depth]); + } - int cbf = cbf_is_set(cur_cu->cbf.y, depth) || cbf_is_set(cur_cu->cbf.u, depth) || cbf_is_set(cur_cu->cbf.v, depth); + int cbf = cbf_is_set_any(cur_cu->cbf, depth); if(cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) { cur_cu->merged = 0; cur_cu->skipped = 1; // Selecting skip reduces bits needed to code the CU - if (cur_cu->inter.bitcost > 1) { - cur_cu->inter.bitcost -= 1; + if (inter_bitcost > 
1) { + inter_bitcost -= 1; } } lcu_set_inter(&work_tree[depth], x, y, depth, cur_cu); @@ -663,36 +670,45 @@ } if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { cost = kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, &work_tree[depth]); - cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, &work_tree[depth]); - double mode_bits = calc_mode_bits(state, cur_cu, x, y); - cost += mode_bits * state->global->cur_lambda_cost; + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, &work_tree[depth]); + } + + double mode_bits; + if (cur_cu->type == CU_INTRA) { + mode_bits = calc_mode_bits(state, &work_tree[depth], cur_cu, x, y); + } else { + mode_bits = inter_bitcost; + } + + cost += mode_bits * state->frame->cur_lambda_cost; } // Recursively split all the way to max search depth. - if (depth < ctrl->pu_depth_intra.max || (depth < ctrl->pu_depth_inter.max && state->global->slicetype != KVZ_SLICE_I)) { + if (depth < ctrl->pu_depth_intra.max || (depth < ctrl->pu_depth_inter.max && state->frame->slicetype != KVZ_SLICE_I)) { int half_cu = cu_width / 2; double split_cost = 0.0; - int cbf = cbf_is_set(cur_cu->cbf.y, depth) || cbf_is_set(cur_cu->cbf.u, depth) || cbf_is_set(cur_cu->cbf.v, depth); - + int cbf = cbf_is_set_any(cur_cu->cbf, depth); + if (depth < MAX_DEPTH) { // Add cost of cu_split_flag. uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]); - cost += CTX_ENTROPY_FBITS(ctx, 0) * state->global->cur_lambda_cost; - split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->global->cur_lambda_cost; + cost += CTX_ENTROPY_FBITS(ctx, 0) * state->frame->cur_lambda_cost; + split_cost += CTX_ENTROPY_FBITS(ctx, 1) * state->frame->cur_lambda_cost; } if (cur_cu->type == CU_INTRA && depth == MAX_DEPTH) { // Add cost of intra part_size. 
const cabac_ctx_t *ctx = &(state->cabac.ctx.part_size_model[0]); - cost += CTX_ENTROPY_FBITS(ctx, 1) * state->global->cur_lambda_cost; // 2Nx2N - split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->global->cur_lambda_cost; // NxN + cost += CTX_ENTROPY_FBITS(ctx, 1) * state->frame->cur_lambda_cost; // 2Nx2N + split_cost += CTX_ENTROPY_FBITS(ctx, 0) * state->frame->cur_lambda_cost; // NxN } // If skip mode was selected for the block, skip further search. // Skip mode means there's no coefficients in the block, so splitting // might not give any better results but takes more time to do. - if (cur_cu->type == CU_NOTSET || cbf || FULL_CU_SPLIT_SEARCH) { + if (cur_cu->type == CU_NOTSET || cbf || state->encoder_control->cfg->cu_split_termination == KVZ_CU_SPLIT_TERMINATION_OFF) { split_cost += search_cu(state, x, y, depth + 1, work_tree); split_cost += search_cu(state, x + half_cu, y, depth + 1, work_tree); split_cost += search_cu(state, x, y + half_cu, depth + 1, work_tree); @@ -708,34 +724,36 @@ if (cur_cu->type == CU_NOTSET && depth < MAX_PU_DEPTH && x + cu_width <= frame->width && y + cu_width <= frame->height) { - vector2d_t lcu_cu = { x_local / 8, y_local / 8 }; - cu_info_t *cu_d1 = LCU_GET_CU(&work_tree[depth + 1], lcu_cu.x, lcu_cu.y); + cu_info_t *cu_d1 = LCU_GET_CU_AT_PX(&work_tree[depth + 1], x_local, y_local); // If the best CU in depth+1 is intra and the biggest it can be, try it. if (cu_d1->type == CU_INTRA && cu_d1->depth == depth + 1) { cost = 0; - cur_cu->intra[0] = cu_d1->intra[0]; + cur_cu->intra = cu_d1->intra; cur_cu->type = CU_INTRA; cur_cu->part_size = depth > MAX_DEPTH ? 
SIZE_NxN : SIZE_2Nx2N; kvz_lcu_set_trdepth(&work_tree[depth], x, y, depth, cur_cu->tr_depth); lcu_set_intra_mode(&work_tree[depth], x, y, depth, - cur_cu->intra[0].mode, cur_cu->intra[0].mode_chroma, + cur_cu->intra.mode, cur_cu->intra.mode_chroma, cur_cu->part_size); - kvz_intra_recon_lcu_luma(state, x, y, depth, cur_cu->intra[0].mode, NULL, &work_tree[depth]); - kvz_intra_recon_lcu_chroma(state, x, y, depth, cur_cu->intra[0].mode_chroma, NULL, &work_tree[depth]); + kvz_intra_recon_lcu_luma(state, x, y, depth, cur_cu->intra.mode, NULL, &work_tree[depth]); cost += kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, &work_tree[depth]); - cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, &work_tree[depth]); + + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + kvz_intra_recon_lcu_chroma(state, x, y, depth, cur_cu->intra.mode_chroma, NULL, &work_tree[depth]); + cost += kvz_cu_rd_cost_chroma(state, x_local, y_local, depth, cur_cu, &work_tree[depth]); + } // Add the cost of coding no-split. uint8_t split_model = get_ctx_cu_split_model(lcu, x, y, depth); const cabac_ctx_t *ctx = &(state->cabac.ctx.split_flag_model[split_model]); - cost += CTX_ENTROPY_FBITS(ctx, 0) * state->global->cur_lambda_cost; + cost += CTX_ENTROPY_FBITS(ctx, 0) * state->frame->cur_lambda_cost; // Add the cost of coding intra mode only once. 
- double mode_bits = calc_mode_bits(state, cur_cu, x, y); - cost += mode_bits * state->global->cur_lambda_cost; + double mode_bits = calc_mode_bits(state, &work_tree[depth], cur_cu, x, y); + cost += mode_bits * state->frame->cur_lambda_cost; } } @@ -757,7 +775,7 @@ work_tree_copy_down(x, y, depth, work_tree); } - PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHCU, state->encoder_control->threadqueue, "type=search_cu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d,depth=%d,split=%d,cur_cu_is_intra=%d", state->global->frame, state->tile->id, state->slice->id, + PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHCU, state->encoder_control->threadqueue, "type=search_cu,frame=%d,tile=%d,slice=%d,px_x=%d-%d,px_y=%d-%d,depth=%d,split=%d,cur_cu_is_intra=%d", state->frame->num, state->tile->id, state->slice->id, (state->tile->lcu_offset_x * LCU_WIDTH) + x, (state->tile->lcu_offset_x * LCU_WIDTH) + x + (LCU_WIDTH >> depth), (state->tile->lcu_offset_y * LCU_WIDTH) + y, @@ -777,44 +795,43 @@ static void init_lcu_t(const encoder_state_t * const state, const int x, const int y, lcu_t *lcu, const yuv_t *hor_buf, const yuv_t *ver_buf) { const videoframe_t * const frame = state->tile->frame; + + FILL(*lcu, 0); + lcu->rec.chroma_format = state->encoder_control->chroma_format; + lcu->ref.chroma_format = state->encoder_control->chroma_format; + // Copy reference cu_info structs from neighbouring LCUs. - { - const int x_cu = x >> MAX_DEPTH; - const int y_cu = y >> MAX_DEPTH; - - // Copy top CU row. - if (y_cu > 0) { - int i; - for (i = 0; i < LCU_CU_WIDTH; ++i) { - const cu_info_t *from_cu = kvz_videoframe_get_cu_const(frame, x_cu + i, y_cu - 1); - cu_info_t *to_cu = LCU_GET_CU(lcu, i, -1); - memcpy(to_cu, from_cu, sizeof(*to_cu)); - } - } - // Copy left CU column. 
- if (x_cu > 0) { - int i; - for (i = 0; i < LCU_CU_WIDTH; ++i) { - const cu_info_t *from_cu = kvz_videoframe_get_cu_const(frame, x_cu - 1, y_cu + i); - cu_info_t *to_cu = LCU_GET_CU(lcu, -1, i); - memcpy(to_cu, from_cu, sizeof(*to_cu)); - } - } - // Copy top-left CU. - if (x_cu > 0 && y_cu > 0) { - const cu_info_t *from_cu = kvz_videoframe_get_cu_const(frame, x_cu - 1, y_cu - 1); - cu_info_t *to_cu = LCU_GET_CU(lcu, -1, -1); + + // Copy top CU row. + if (y > 0) { + for (int i = 0; i < LCU_WIDTH; i += SCU_WIDTH) { + const cu_info_t *from_cu = kvz_cu_array_at_const(frame->cu_array, x + i, y - 1); + cu_info_t *to_cu = LCU_GET_CU_AT_PX(lcu, i, -1); memcpy(to_cu, from_cu, sizeof(*to_cu)); } - - // Copy top-right CU. - if (y_cu > 0 && x + LCU_WIDTH < frame->width) { - const cu_info_t *from_cu = kvz_videoframe_get_cu_const(frame, x_cu + LCU_CU_WIDTH, y_cu - 1); - cu_info_t *to_cu = LCU_GET_TOP_RIGHT_CU(lcu); + } + // Copy left CU column. + if (x > 0) { + for (int i = 0; i < LCU_WIDTH; i += SCU_WIDTH) { + const cu_info_t *from_cu = kvz_cu_array_at_const(frame->cu_array, x - 1, y + i); + cu_info_t *to_cu = LCU_GET_CU_AT_PX(lcu, -1, i); memcpy(to_cu, from_cu, sizeof(*to_cu)); } } + // Copy top-left CU. + if (x > 0 && y > 0) { + const cu_info_t *from_cu = kvz_cu_array_at_const(frame->cu_array, x - 1, y - 1); + cu_info_t *to_cu = LCU_GET_CU_AT_PX(lcu, -1, -1); + memcpy(to_cu, from_cu, sizeof(*to_cu)); + } + + // Copy top-right CU. + if (y > 0 && x + LCU_WIDTH < frame->width) { + const cu_info_t *from_cu = kvz_cu_array_at_const(frame->cu_array, x + LCU_WIDTH, y - 1); + cu_info_t *to_cu = LCU_GET_TOP_RIGHT_CU(lcu); + memcpy(to_cu, from_cu, sizeof(*to_cu)); + } // Copy reference pixels. { @@ -825,16 +842,30 @@ // number of allocated pixels left. int x_max = MIN(LCU_REF_PX_WIDTH, pic_width - x); int x_min_in_lcu = (x>0) ? 
0 : 1; - memcpy(&lcu->top_ref.y[x_min_in_lcu], &hor_buf->y[OFFSET_HOR_BUF(x, y, frame, x_min_in_lcu-1)], (x_max + (1-x_min_in_lcu))*sizeof(kvz_pixel)); - memcpy(&lcu->top_ref.u[x_min_in_lcu], &hor_buf->u[OFFSET_HOR_BUF_C(x, y, frame, x_min_in_lcu - 1)], (x_max / 2 + (1 - x_min_in_lcu))*sizeof(kvz_pixel)); - memcpy(&lcu->top_ref.v[x_min_in_lcu], &hor_buf->v[OFFSET_HOR_BUF_C(x, y, frame, x_min_in_lcu - 1)], (x_max / 2 + (1 - x_min_in_lcu))*sizeof(kvz_pixel)); + int luma_offset = OFFSET_HOR_BUF(x, y, frame, x_min_in_lcu - 1); + int chroma_offset = OFFSET_HOR_BUF_C(x, y, frame, x_min_in_lcu - 1); + int luma_bytes = (x_max + (1 - x_min_in_lcu))*sizeof(kvz_pixel); + int chroma_bytes = (x_max / 2 + (1 - x_min_in_lcu))*sizeof(kvz_pixel); + + memcpy(&lcu->top_ref.y[x_min_in_lcu], &hor_buf->y[luma_offset], luma_bytes); + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + memcpy(&lcu->top_ref.u[x_min_in_lcu], &hor_buf->u[chroma_offset], chroma_bytes); + memcpy(&lcu->top_ref.v[x_min_in_lcu], &hor_buf->v[chroma_offset], chroma_bytes); + } } // Copy left reference pixels. if (x > 0) { int y_min_in_lcu = (y>0) ? 
0 : 1; - memcpy(&lcu->left_ref.y[y_min_in_lcu], &ver_buf->y[OFFSET_VER_BUF(x, y, frame, y_min_in_lcu - 1)], (LCU_WIDTH + (1 - y_min_in_lcu))*sizeof(kvz_pixel)); - memcpy(&lcu->left_ref.u[y_min_in_lcu], &ver_buf->u[OFFSET_VER_BUF_C(x, y, frame, y_min_in_lcu - 1)], (LCU_WIDTH / 2 + (1 - y_min_in_lcu))*sizeof(kvz_pixel)); - memcpy(&lcu->left_ref.v[y_min_in_lcu], &ver_buf->v[OFFSET_VER_BUF_C(x, y, frame, y_min_in_lcu - 1)], (LCU_WIDTH / 2 + (1 - y_min_in_lcu))*sizeof(kvz_pixel)); + int luma_offset = OFFSET_VER_BUF(x, y, frame, y_min_in_lcu - 1); + int chroma_offset = OFFSET_VER_BUF_C(x, y, frame, y_min_in_lcu - 1); + int luma_bytes = (LCU_WIDTH + (1 - y_min_in_lcu)) * sizeof(kvz_pixel); + int chroma_bytes = (LCU_WIDTH / 2 + (1 - y_min_in_lcu)) * sizeof(kvz_pixel); + + memcpy(&lcu->left_ref.y[y_min_in_lcu], &ver_buf->y[luma_offset], luma_bytes); + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + memcpy(&lcu->left_ref.u[y_min_in_lcu], &ver_buf->u[chroma_offset], chroma_bytes); + memcpy(&lcu->left_ref.v[y_min_in_lcu], &ver_buf->v[chroma_offset], chroma_bytes); + } } } @@ -851,10 +882,12 @@ kvz_pixels_blit(&frame->source->y[x + y * frame->source->stride], lcu->ref.y, x_max, y_max, frame->source->stride, LCU_WIDTH); - kvz_pixels_blit(&frame->source->u[x_c + y_c * frame->source->stride/2], lcu->ref.u, - x_max_c, y_max_c, frame->source->stride/2, LCU_WIDTH / 2); - kvz_pixels_blit(&frame->source->v[x_c + y_c * frame->source->stride/2], lcu->ref.v, - x_max_c, y_max_c, frame->source->stride/2, LCU_WIDTH / 2); + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + kvz_pixels_blit(&frame->source->u[x_c + y_c * frame->source->stride / 2], lcu->ref.u, + x_max_c, y_max_c, frame->source->stride / 2, LCU_WIDTH / 2); + kvz_pixels_blit(&frame->source->v[x_c + y_c * frame->source->stride / 2], lcu->ref.v, + x_max_c, y_max_c, frame->source->stride / 2, LCU_WIDTH / 2); + } } } @@ -865,20 +898,7 @@ static void copy_lcu_to_cu_data(const encoder_state_t * const state, int 
x_px, int y_px, const lcu_t *lcu) { // Copy non-reference CUs to picture. - { - const int x_cu = x_px >> MAX_DEPTH; - const int y_cu = y_px >> MAX_DEPTH; - videoframe_t * const frame = state->tile->frame; - - int x, y; - for (y = 0; y < LCU_CU_WIDTH; ++y) { - for (x = 0; x < LCU_CU_WIDTH; ++x) { - const cu_info_t *from_cu = LCU_GET_CU(lcu, x, y); - cu_info_t *to_cu = kvz_videoframe_get_cu(frame, x_cu + x, y_cu + y); - memcpy(to_cu, from_cu, sizeof(*to_cu)); - } - } - } + kvz_cu_array_copy_from_lcu(state->tile->frame->cu_array, x_px, y_px, lcu); // Copy pixels to picture. { @@ -894,14 +914,16 @@ kvz_coefficients_blit(lcu->coeff.y, &pic->coeff_y[luma_index], x_max, y_max, LCU_WIDTH, pic_width); - kvz_pixels_blit(lcu->rec.u, &pic->rec->u[(x_px / 2) + (y_px / 2) * (pic->rec->stride / 2)], - x_max / 2, y_max / 2, LCU_WIDTH / 2, pic->rec->stride / 2); - kvz_pixels_blit(lcu->rec.v, &pic->rec->v[(x_px / 2) + (y_px / 2) * (pic->rec->stride / 2)], - x_max / 2, y_max / 2, LCU_WIDTH / 2, pic->rec->stride / 2); - kvz_coefficients_blit(lcu->coeff.u, &pic->coeff_u[chroma_index], - x_max / 2, y_max / 2, LCU_WIDTH / 2, pic_width / 2); - kvz_coefficients_blit(lcu->coeff.v, &pic->coeff_v[chroma_index], - x_max / 2, y_max / 2, LCU_WIDTH / 2, pic_width / 2); + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + kvz_pixels_blit(lcu->rec.u, &pic->rec->u[(x_px / 2) + (y_px / 2) * (pic->rec->stride / 2)], + x_max / 2, y_max / 2, LCU_WIDTH / 2, pic->rec->stride / 2); + kvz_pixels_blit(lcu->rec.v, &pic->rec->v[(x_px / 2) + (y_px / 2) * (pic->rec->stride / 2)], + x_max / 2, y_max / 2, LCU_WIDTH / 2, pic->rec->stride / 2); + kvz_coefficients_blit(lcu->coeff.u, &pic->coeff_u[chroma_index], + x_max / 2, y_max / 2, LCU_WIDTH / 2, pic_width / 2); + kvz_coefficients_blit(lcu->coeff.v, &pic->coeff_v[chroma_index], + x_max / 2, y_max / 2, LCU_WIDTH / 2, pic_width / 2); + } } } @@ -912,16 +934,23 @@ */ void kvz_search_lcu(encoder_state_t * const state, const int x, const int y, const yuv_t * 
const hor_buf, const yuv_t * const ver_buf) { + assert(x % LCU_WIDTH == 0); + assert(y % LCU_WIDTH == 0); + + // Initialize the same starting state to every depth. The search process + // will use these as temporary storage for predictions before making + // a decision on which to use, and they get updated during the search + // process. lcu_t work_tree[MAX_PU_DEPTH + 1]; - int depth; - // Initialize work tree. - for (depth = 0; depth <= MAX_PU_DEPTH; ++depth) { - FILL(work_tree[depth], 0); - init_lcu_t(state, x, y, &work_tree[depth], hor_buf, ver_buf); + init_lcu_t(state, x, y, &work_tree[0], hor_buf, ver_buf); + for (int depth = 1; depth <= MAX_PU_DEPTH; ++depth) { + work_tree[depth] = work_tree[0]; } // Start search from depth 0. search_cu(state, x, y, 0, work_tree); + // The best decisions through out the LCU got propagated back to depth 0, + // so copy those back to the frame. copy_lcu_to_cu_data(state, x, y, &work_tree[0]); }
kvazaar-0.8.3.tar.gz/src/search.h -> kvazaar-1.0.0.tar.gz/src/search.h
Changed
@@ -26,9 +26,11 @@ * \brief Compression of a single coding tree unit (CTU). */ -#include "global.h" - +#include "cu.h" #include "encoderstate.h" +#include "global.h" // IWYU pragma: keep +#include "image.h" + void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf);
kvazaar-0.8.3.tar.gz/src/search_inter.c -> kvazaar-1.0.0.tar.gz/src/search_inter.c
Changed
@@ -20,62 +20,197 @@ #include "search_inter.h" +#include <limits.h> #include <stdlib.h> +#include "cabac.h" +#include "encoder.h" +#include "image.h" +#include "imagelist.h" #include "inter.h" -#include "strategies/strategies-picture.h" -#include "strategies/strategies-ipol.h" +#include "kvazaar.h" #include "rdo.h" +#include "strategies/strategies-ipol.h" +#include "strategies/strategies-picture.h" +#include "videoframe.h" + + +/** + * \return True if referred block is within current tile. + */ +static INLINE bool fracmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width, int height, int wpp_limit) +{ + if (state->encoder_control->cfg->mv_constraint == KVZ_MV_CONSTRAIN_NONE) { + return (wpp_limit == -1 || y + (height << 2) <= (wpp_limit << 2)); + }; + + int margin = 0; + if (state->encoder_control->cfg->mv_constraint == KVZ_MV_CONSTRAIN_FRAME_AND_TILE_MARGIN) { + // Enforce a distance of 8 from any tile boundary. + margin = 4 * 4; + } + + // TODO implement KVZ_MV_CONSTRAIN_FRAM and KVZ_MV_CONSTRAIN_TILE. + const vector2d_t abs_mv = { (orig->x << 2) + x, (orig->y << 2) + y }; + + // Check that both margin and wpp_limit constraints are satisfied. + if (abs_mv.x >= margin && abs_mv.x + (width << 2) <= (state->tile->frame->width << 2) - margin && + abs_mv.y >= margin && abs_mv.y + (height << 2) <= (state->tile->frame->height << 2) - margin && + (wpp_limit == -1 || y + (height << 2) <= (wpp_limit << 2))) + { + return true; + } else { + return false; + } +} + + +static INLINE int get_wpp_limit(const encoder_state_t *state, const vector2d_t* orig) +{ + const encoder_control_t *ctrl = state->encoder_control; + if (ctrl->owf && ctrl->wpp) { + // Limit motion vectors to the LCU-row below this row. + // To avoid fractional pixel interpolation depending on things outside + // this range, add a margin of 4 pixels. 
+ // - fme needs 4 pixels + // - odd chroma interpolation needs 4 pixels + int wpp_limit = 2 * LCU_WIDTH - 4 - orig->y % LCU_WIDTH; + if (ctrl->deblock_enable && !ctrl->sao_enable) { + // As a special case, when deblocking is enabled but SAO is not, we have + // to avoid the possibility of interpolation filters reaching the + // non-deblocked pixels. The deblocking for the horizontal edge on the + // LCU boundary can reach 4 pixels. If SAO is enabled, this WPP-row + // depends on the SAO job, which depends on the deblocking having + // already been done. + wpp_limit -= 4; + } + return wpp_limit; + } else { + return -1; + } +} + + +/** + * \return True if referred block is within current tile. + */ +static INLINE bool intmv_within_tile(const encoder_state_t *state, const vector2d_t* orig, int x, int y, int width, int height, int wpp_limit) +{ + return fracmv_within_tile(state, orig, x << 2, y << 2, width, height, wpp_limit); +} + + +static unsigned get_ep_ex_golomb_bitcost(unsigned symbol) +{ + // Calculate 2 * log2(symbol + 2) + + unsigned bins = 0; + symbol += 2; + if (symbol >= 1 << 8) { bins += 16; symbol >>= 8; } + if (symbol >= 1 << 4) { bins += 8; symbol >>= 4; } + if (symbol >= 1 << 2) { bins += 4; symbol >>= 2; } + if (symbol >= 1 << 1) { bins += 2; } + + // TODO: It might be a good idea to put a small slope on this function to + // make sure any search function that follows the gradient heads towards + // a smaller MVD, but that would require fractinal costs and bits being + // used everywhere in inter search. 
+ // return num_bins + 0.001 * symbol; + + return bins; +} -static uint32_t get_ep_ex_golomb_bitcost(uint32_t symbol, uint32_t count) +/**Checks if mv is one of the merge candidates +* \return true if found else return false +*/ +static bool mv_in_merge(const inter_merge_cand_t* merge_cand, int16_t num_cand, const vector2d_t* mv) { - int32_t num_bins = 0; - while (symbol >= (uint32_t)(1 << count)) { - ++num_bins; - symbol -= 1 << count; - ++count; + for (int i = 0; i < num_cand; ++i) { + if (merge_cand[i].dir == 3) continue; + const vector2d_t merge_mv = { + merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2, + merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2 + }; + if (merge_mv.x == mv->x && merge_mv.y == mv->y) { + return true; + } } - num_bins ++; + return false; +} + + +static unsigned select_starting_point(int16_t num_cand, inter_merge_cand_t *merge_cand, vector2d_t *mv_in_out, vector2d_t *mv, encoder_state_t *const state, + const vector2d_t *orig, unsigned width, unsigned height, int wpp_limit, const kvz_picture *pic, const kvz_picture *ref, + int16_t mv_cand[2][2], int32_t ref_idx, unsigned best_cost, unsigned *best_index, uint32_t *best_bitcost, + kvz_mvd_cost_func *calc_mvd){ + // Go through candidates + for (unsigned i = 0; i < num_cand; ++i) { + if (merge_cand[i].dir == 3) continue; + mv->x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2; + mv->y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2; - return num_bins; + if (mv->x == 0 && mv->y == 0) continue; + if (!intmv_within_tile(state, orig, mv->x, mv->y, width, height, wpp_limit)) { + continue; + } + + uint32_t bitcost = 0; + unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, + (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x, + (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y, + width, height, -1); + cost += calc_mvd(state, mv->x, mv->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); + + if (cost < best_cost) { + best_cost = cost; + *best_index = i; + *best_bitcost 
= bitcost; + } + } + if (*best_index < num_cand) { + mv->x = merge_cand[*best_index].mv[merge_cand[*best_index].dir - 1][0] >> 2; + mv->y = merge_cand[*best_index].mv[merge_cand[*best_index].dir - 1][1] >> 2; + } else if (*best_index == num_cand) { + mv->x = mv_in_out->x >> 2; + mv->y = mv_in_out->y >> 2; + } else { + mv->x = 0; + mv->y = 0; + } + return best_cost; } -static uint32_t get_mvd_coding_cost(vector2d_t *mvd, cabac_data_t* cabac) +static uint32_t get_mvd_coding_cost(encoder_state_t * const state, vector2d_t *mvd, const cabac_data_t* cabac) { - uint32_t bitcost = 0; - const int32_t mvd_hor = mvd->x; - const int32_t mvd_ver = mvd->y; - const int8_t hor_abs_gr0 = mvd_hor != 0; - const int8_t ver_abs_gr0 = mvd_ver != 0; - const uint32_t mvd_hor_abs = abs(mvd_hor); - const uint32_t mvd_ver_abs = abs(mvd_ver); - - // Greater than 0 for x/y - bitcost += 2; - - if (hor_abs_gr0) { - if (mvd_hor_abs > 1) { - bitcost += get_ep_ex_golomb_bitcost(mvd_hor_abs-2, 1) - 2; // TODO: tune the costs + unsigned bitcost = 0; + const vector2d_t abs_mvd = { abs(mvd->x), abs(mvd->y) }; + + bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[0], abs_mvd.x > 0); + if (abs_mvd.x > 0) { + bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[1], abs_mvd.x > 1); + if (abs_mvd.x > 1) { + bitcost += get_ep_ex_golomb_bitcost(abs_mvd.x - 2) << 15; } - // Greater than 1 + sign - bitcost += 2; + bitcost += 1 << 15; // sign } - if (ver_abs_gr0) { - if (mvd_ver_abs > 1) { - bitcost += get_ep_ex_golomb_bitcost(mvd_ver_abs-2, 1) - 2; // TODO: tune the costs + bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[0], abs_mvd.y > 0); + if (abs_mvd.y > 0) { + bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[1], abs_mvd.y > 1); + if (abs_mvd.y > 1) { + bitcost += get_ep_ex_golomb_bitcost(abs_mvd.y - 2) << 15; } - // Greater than 1 + sign - bitcost += 2; + bitcost += 1 << 15; // sign } - return bitcost; + // Round and shift back to integer bits. 
+ return (bitcost + (1 << 14)) >> 15; } -static int calc_mvd_cost(const encoder_state_t * const state, int x, int y, int mv_shift, +static int calc_mvd_cost(encoder_state_t * const state, int x, int y, int mv_shift, int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand,int32_t ref_idx, uint32_t *bitcost) { @@ -105,11 +240,11 @@ if(!merged) { mvd_temp1.x = x - mv_cand[0][0]; mvd_temp1.y = y - mv_cand[0][1]; - cand1_cost = get_mvd_coding_cost(&mvd_temp1, NULL); + cand1_cost = get_mvd_coding_cost(state, &mvd_temp1, &state->cabac); mvd_temp2.x = x - mv_cand[1][0]; mvd_temp2.y = y - mv_cand[1][1]; - cand2_cost = get_mvd_coding_cost(&mvd_temp2, NULL); + cand2_cost = get_mvd_coding_cost(state, &mvd_temp2, &state->cabac); // Select candidate 1 if it has lower cost if (cand2_cost < cand1_cost) { @@ -118,14 +253,68 @@ temp_bitcost += cur_mv_cand ? cand2_cost : cand1_cost; } *bitcost = temp_bitcost; - return temp_bitcost*(int32_t)(state->global->cur_lambda_cost_sqrt+0.5); + return temp_bitcost*(int32_t)(state->frame->cur_lambda_cost_sqrt+0.5); +} + + +static bool early_terminate(int16_t num_cand, inter_merge_cand_t *merge_cand, vector2d_t *mv_in_out, vector2d_t *mv, encoder_state_t *const state, + const vector2d_t *orig, unsigned width, unsigned height, int wpp_limit, const kvz_picture *pic, const kvz_picture *ref, + int16_t mv_cand[2][2], int32_t ref_idx, unsigned *best_cost, uint32_t *bitcost_out, uint32_t *best_bitcost, + kvz_mvd_cost_func *calc_mvd) +{ + static const vector2d_t small_hexbs[5] = { + { 0, 0 }, + { 0, -1 }, { -1, 0 }, { 1, 0 }, { 0, 1 }, + }; + double multiplier = 1; + // If early termination is set to fast set multiplier to 0.9 + if (state->encoder_control->cfg->me_early_termination == KVZ_ME_EARLY_TERMINATION_SENSITIVE){ + multiplier = 0.95; + } + const vector2d_t *offset; + for (int k = 0; k < 2; ++k){ + unsigned best_index = 0; + for (int i = 1; i < 5; ++i) { + offset = &small_hexbs[i]; + if (!intmv_within_tile(state, 
orig, mv->x + offset->x, mv->y + offset->y, width, height, wpp_limit)) { + continue; + } + + unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, + (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + offset->x, + (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + offset->y, + width, height, -1); + unsigned bitcost; + cost += calc_mvd(state, mv->x + offset->x, mv->y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); + + if (cost < multiplier * *best_cost ) { + *best_cost = cost; + best_index = i; + *best_bitcost = bitcost; + } + } + // Adjust the movement vector + mv->x += small_hexbs[best_index].x; + mv->y += small_hexbs[best_index].y; + + // if best match is at center we stop the search + if (best_index == 0){ + // Return final movement vector in quarter-pixel precision. + mv_in_out->x = mv->x << 2; + mv_in_out->y = mv->y << 2; + + *bitcost_out = *best_bitcost; + return true; + } + } + return false; } -unsigned kvz_tz_pattern_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref, unsigned pattern_type, +unsigned kvz_tz_pattern_search(encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref, unsigned pattern_type, const vector2d_t *orig, const int iDist, vector2d_t *mv, unsigned best_cost, int *best_dist, int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost, - int width, int height, int max_px_below_lcu) + int width, int height, int wpp_limit) { int n_points; int best_index = -1; @@ -134,9 +323,7 @@ vector2d_t mv_best = { 0, 0 }; - int(*calc_mvd)(const encoder_state_t * const, int, int, int, - int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS], - int16_t, int32_t, uint32_t *) = calc_mvd_cost; + kvz_mvd_cost_func *calc_mvd = calc_mvd_cost; if (state->encoder_control->cfg->mv_rdo) { calc_mvd = kvz_calc_mvd_cost_cabac; } @@ -243,22 +430,19 @@ for (i = 0; i < n_points; i++) { vector2d_t 
*current = &pattern[pattern_type][i]; + if (!intmv_within_tile(state, orig, mv->x + current->x, mv->y + current->y, width, height, wpp_limit)) { + continue; + } + unsigned cost; uint32_t bitcost; { - PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX); cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + current->x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + current->y, - width, height, max_px_below_lcu); + width, height, -1); cost += calc_mvd(state, mv->x + current->x, mv->y + current->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); - - PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + current->x, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + current->x + width, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + current->y, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + current->y + height); } if (cost < best_cost) @@ -284,19 +468,17 @@ } -unsigned kvz_tz_raster_search(const encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref, +unsigned kvz_tz_raster_search(encoder_state_t * const state, const kvz_picture *pic, const kvz_picture *ref, const vector2d_t *orig, vector2d_t *mv, unsigned best_cost, int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], int16_t num_cand, int32_t ref_idx, uint32_t *best_bitcost, - int width, int height, int iSearchRange, int iRaster, int max_px_below_lcu) + int width, int height, int iSearchRange, int iRaster, int wpp_limit) { int i; int k; vector2d_t mv_best = { 0, 0 }; - int(*calc_mvd)(const encoder_state_t * const, int, int, int, - int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS], - int16_t, 
int32_t, uint32_t *) = calc_mvd_cost; + kvz_mvd_cost_func *calc_mvd = calc_mvd_cost; if (state->encoder_control->cfg->mv_rdo) { calc_mvd = kvz_calc_mvd_cost_cabac; } @@ -307,22 +489,19 @@ for (k = -iSearchRange; k <= iSearchRange; k += iRaster) { vector2d_t current = { k, i }; + if (!intmv_within_tile(state, orig, mv->x + current.x, mv->y + current.y, width, height, wpp_limit)) { + continue; + } + unsigned cost; uint32_t bitcost; { - PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX); cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + k, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + i, - width, height, max_px_below_lcu); + width, height, -1); cost += calc_mvd(state, mv->x + k, mv->y + i, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); - - PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + k, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv->x + k + width, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + i, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv->y + i + height); } if (cost < best_cost) @@ -343,7 +522,7 @@ } -static unsigned tz_search(const encoder_state_t * const state, +static unsigned tz_search(encoder_state_t * const state, unsigned width, unsigned height, const kvz_picture *pic, const kvz_picture *ref, const vector2d_t *orig, vector2d_t *mv_in_out, @@ -365,92 +544,57 @@ uint32_t best_bitcost = 0; int iDist; int best_dist = 0; - unsigned best_index = num_cand; - int max_px_below_lcu = -1; + unsigned best_index = num_cand + 1; + int wpp_limit = get_wpp_limit(state, orig); - int(*calc_mvd)(const encoder_state_t * const, int, int, int, - int16_t[2][2], 
inter_merge_cand_t[MRG_MAX_NUM_CANDS], - int16_t, int32_t, uint32_t *) = calc_mvd_cost; + kvz_mvd_cost_func *calc_mvd = calc_mvd_cost; if (state->encoder_control->cfg->mv_rdo) { calc_mvd = kvz_calc_mvd_cost_cabac; } - if (state->encoder_control->owf) { - max_px_below_lcu = LCU_WIDTH; - if (state->encoder_control->fme_level > 0) { - // Fractional motion estimation can change the mv by at most 1 pixel. - max_px_below_lcu -= 1; - } - if (state->encoder_control->deblock_enable) { - // Strong deblock filter modifies 3 pixels. - max_px_below_lcu -= 3; - } - } - - //step 1, compare (0,0) vector to predicted vectors - - // Check whatever input vector we got, unless its (0, 0) which will be checked later. - if (mv.x || mv.y) - { - PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX); - + // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list. + if (intmv_within_tile(state, orig, 0, 0, width, height, wpp_limit)) { best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, - width, height, max_px_below_lcu); - best_cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost); - - PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + width, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + height); + (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x, + (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y, + width, height, -1); + best_cost += calc_mvd(state, 0, 0, 2, mv_cand, merge_cand, num_cand, ref_idx, 
&best_bitcost); + best_index = num_cand + 1; } - int i; - // Select starting point from among merge candidates. These should include - // both mv_cand vectors and (0, 0). - for (i = 0; i < num_cand; ++i) + // Check mv_in if it's not one of the merge candidates. + if (!mv_in_merge(merge_cand, num_cand, &mv) && + intmv_within_tile(state, orig, mv.x, mv.y, width, height, wpp_limit)) { - if (merge_cand[i].dir == 3) continue; - mv.x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2; - mv.y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2; - - PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX); - - uint32_t bitcost; unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, - width, height, max_px_below_lcu); + (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, + (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, + width, height, -1); + unsigned bitcost; cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); - - PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + width, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + height); - if (cost < best_cost) { best_cost = cost; - best_index = i; + best_index = num_cand; best_bitcost = bitcost; } } - - if (best_index < (unsigned)num_cand) { - mv.x = merge_cand[best_index].mv[merge_cand[best_index].dir - 1][0] >> 2; - mv.y = merge_cand[best_index].mv[merge_cand[best_index].dir - 1][1] >> 2; - } else { - mv.x = mv_in_out->x >> 2; - mv.y = mv_in_out->y >> 2; + + // 
Select starting point from among merge candidates. These should include + // both mv_cand vectors and (0, 0). + best_cost = select_starting_point(num_cand, merge_cand, mv_in_out, &mv, state, orig, width, height, wpp_limit, + pic, ref, mv_cand, ref_idx, best_cost, &best_index, &best_bitcost, calc_mvd); + + // Check if we should stop search + if (state->encoder_control->cfg->me_early_termination){ + if (early_terminate(num_cand, merge_cand, mv_in_out, &mv, state, orig, width, height, wpp_limit, + pic, ref, mv_cand, ref_idx, &best_cost, bitcost_out, &best_bitcost, calc_mvd)) return best_cost; } //step 2, grid search for (iDist = 1; iDist <= iSearchRange; iDist *= 2) { best_cost = kvz_tz_pattern_search(state, pic, ref, step2_type, orig, iDist, &mv, best_cost, &best_dist, - mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, max_px_below_lcu); + mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit); } //step 3, raster scan @@ -459,7 +603,7 @@ best_dist = iRaster; best_cost = kvz_tz_raster_search(state, pic, ref, orig, &mv, best_cost, mv_cand, merge_cand, - num_cand, ref_idx, &best_bitcost, width, height, iSearchRange, iRaster, max_px_below_lcu); + num_cand, ref_idx, &best_bitcost, width, height, iSearchRange, iRaster, wpp_limit); } //step 4 @@ -471,7 +615,7 @@ while (iDist > 0) { best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist, - mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, max_px_below_lcu); + mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit); iDist = iDist >> 1; } @@ -483,7 +627,7 @@ for (iDist = 1; iDist <= iSearchRange; iDist *= 2) { best_cost = kvz_tz_pattern_search(state, pic, ref, step4_type, orig, iDist, &mv, best_cost, &best_dist, - mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, max_px_below_lcu); + mv_cand, merge_cand, num_cand, ref_idx, &best_bitcost, width, height, wpp_limit); } } 
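The `for (iDist = 1; iDist <= iSearchRange; iDist *= 2)` loops above are the expanding-grid stage of TZ search: a small pattern is evaluated at exponentially growing distances from the start point, and the winner is then refined with shrinking steps. The sketch below shows that idea in miniature, with a toy cost function standing in for `kvz_image_calc_sad` plus the MVD bit cost; the names, the plain diamond pattern, and the walk-until-local-minimum refinement are illustrative simplifications, not kvazaar's actual `kvz_tz_pattern_search` tables or step-4 logic.

```c
#include <assert.h>
#include <stdlib.h>

typedef struct { int x, y; } vec2;

static const vec2 diamond[4] = { { 0, -1 }, { -1, 0 }, { 1, 0 }, { 0, 1 } };

/* Toy cost with a single minimum at (5, -3); stands in for SAD + mvd bits. */
static unsigned toy_cost(vec2 mv) {
  return (unsigned)(abs(mv.x - 5) + abs(mv.y + 3));
}

/* TZ-style integer search: expanding diamond (dist = 1, 2, 4, ...) around the
 * start point, then unit-step refinement around the best hit. */
static vec2 tz_grid_search(vec2 start, int search_range) {
  vec2 best = start;
  unsigned best_cost = toy_cost(start);

  /* Grid stage: evaluate the pattern at exponentially growing distances. */
  for (int dist = 1; dist <= search_range; dist *= 2) {
    for (int i = 0; i < 4; ++i) {
      vec2 cand = { start.x + diamond[i].x * dist, start.y + diamond[i].y * dist };
      unsigned cost = toy_cost(cand);
      if (cost < best_cost) { best_cost = cost; best = cand; }
    }
  }

  /* Refinement: take unit diamond steps until the center is a local minimum. */
  for (;;) {
    vec2 next = best;
    unsigned before = best_cost;
    for (int i = 0; i < 4; ++i) {
      vec2 cand = { best.x + diamond[i].x, best.y + diamond[i].y };
      unsigned cost = toy_cost(cand);
      if (cost < best_cost) { best_cost = cost; next = cand; }
    }
    if (best_cost == before) break;
    best = next;
  }
  return best;
}
```

The real code additionally rejects candidates with `intmv_within_tile` (the `wpp_limit` threading constraint introduced in this diff) and optionally inserts a raster scan between the two stages.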
@@ -518,7 +662,7 @@
  * the predicted motion vector is way off. In the future even more additional
  * points like 0,0 might be used, such as vectors from top or left.
  */
-static unsigned hexagon_search(const encoder_state_t * const state,
+static unsigned hexagon_search(encoder_state_t * const state,
                                unsigned width, unsigned height,
                                const kvz_picture *pic, const kvz_picture *ref,
                                const vector2d_t *orig, vector2d_t *mv_in_out,
@@ -533,7 +677,7 @@
   //  5   0  2,8
   //   \     /
   //    4---3
-  static const vector2d_t large_hexbs[10] = {
+  static const vector2d_t large_hexbs[9] = {
     { 0, 0 },
     { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 }, { -1, -2 },
     { 1, -2 }, { 2, 0 }
@@ -551,113 +695,68 @@
   unsigned best_cost = UINT32_MAX;
   uint32_t best_bitcost = 0, bitcost;
   unsigned i;
-  unsigned best_index = 0; // Index of large_hexbs or finally small_hexbs.
-  int max_px_below_lcu = -1;
+  // Current best index, either to merge_cands, large_hexbs or small_hexbs.
+  unsigned best_index = num_cand + 1;
+  int wpp_limit = get_wpp_limit(state, orig);
 
-  int (*calc_mvd)(const encoder_state_t * const, int, int, int,
-                  int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS],
-                  int16_t, int32_t, uint32_t *) = calc_mvd_cost;
+  kvz_mvd_cost_func *calc_mvd = calc_mvd_cost;
   if (state->encoder_control->cfg->mv_rdo) {
     calc_mvd = kvz_calc_mvd_cost_cabac;
   }
-
-  if (state->encoder_control->owf) {
-    max_px_below_lcu = LCU_WIDTH;
-    if (state->encoder_control->fme_level > 0) {
-      // Fractional motion estimation can change the mv by at most 1 pixel.
-      max_px_below_lcu -= 1;
-    }
-    if (state->encoder_control->deblock_enable) {
-      // Strong deblock filter modifies 3 pixels.
-      max_px_below_lcu -= 3;
-    }
-  }
-
-  // Check mv_in, if it's not in merge candidates.
- bool mv_in_merge_cand = false; - for (int i = 0; i < num_cand; ++i) { - if (merge_cand[i].dir == 3) continue; - if (merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2 == mv.x && - merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2 == mv.y) { - mv_in_merge_cand = true; - break; - } - } - - if (!mv_in_merge_cand) { - PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX); - + // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list. + if (intmv_within_tile(state, orig, 0, 0, width, height, wpp_limit)) { best_cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, - width, height, max_px_below_lcu); - best_cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); + (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x, + (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y, + width, height, -1); + best_cost += calc_mvd(state, 0, 0, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); best_bitcost = bitcost; - best_index = num_cand; - - PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + width, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + height); + best_index = num_cand + 1; } - // Select starting point from among merge candidates. These should include - // both mv_cand vectors and (0, 0). 
- for (i = 0; i < num_cand; ++i) { - if (merge_cand[i].dir == 3) continue; - mv.x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2; - mv.y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2; - - PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX); - + // Check mv_in if it's not one of the merge candidates. + if (!mv_in_merge(merge_cand, num_cand, &mv) && + intmv_within_tile(state, orig, mv.x, mv.y, width, height, wpp_limit)) + { unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, - width, height, max_px_below_lcu); + width, height, -1); cost += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); - PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + width, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + height); - if (cost < best_cost) { - best_cost = cost; - best_index = i; + best_cost = cost; + best_index = num_cand; best_bitcost = bitcost; } } - if (best_index < num_cand) { - mv.x = merge_cand[best_index].mv[merge_cand[best_index].dir - 1][0] >> 2; - mv.y = merge_cand[best_index].mv[merge_cand[best_index].dir - 1][1] >> 2; - } else { - mv.x = mv_in_out->x >> 2; - mv.y = mv_in_out->y >> 2; + + // Select starting point from among merge candidates. These should include + // both mv_cand vectors and (0, 0). 
+ best_cost = select_starting_point(num_cand, merge_cand, mv_in_out, &mv, state, orig, width, height, wpp_limit, + pic, ref, mv_cand, ref_idx, best_cost, &best_index, &best_bitcost, calc_mvd); + + // Check if we should stop search + if (state->encoder_control->cfg->me_early_termination){ + if (early_terminate(num_cand, merge_cand, mv_in_out, &mv, state, orig, width, height, wpp_limit, + pic, ref, mv_cand, ref_idx, &best_cost, bitcost_out, &best_bitcost, calc_mvd)) return best_cost; } - + // Search the initial 7 points of the hexagon. best_index = 0; for (i = 0; i < 7; ++i) { const vector2d_t *pattern = &large_hexbs[i]; - unsigned cost; - { - PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX); - cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y, - width, height, max_px_below_lcu); - cost += calc_mvd(state, mv.x + pattern->x, mv.y + pattern->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); - - PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x + width, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y + height); + if (!intmv_within_tile(state, orig, mv.x + pattern->x, mv.y + pattern->y, width, height, wpp_limit)) { + continue; } + unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, + (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + pattern->x, + (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + pattern->y, + width, height, -1); + cost += 
calc_mvd(state, mv.x + pattern->x, mv.y + pattern->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); + if (cost < best_cost) { best_cost = cost; best_index = i; @@ -685,27 +784,21 @@ // Iterate through the next 3 points. for (i = 0; i < 3; ++i) { const vector2d_t *offset = &large_hexbs[start + i]; - unsigned cost; - { - PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX); - cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, - width, height, max_px_below_lcu); - cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); - PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=large_hexbs_iterative,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x + width, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y + height); + if (!intmv_within_tile(state, orig, mv.x + offset->x, mv.y + offset->y, width, height, wpp_limit)) { + continue; } + unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, + (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, + (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, + width, height, -1); + cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); + if (cost < best_cost) { best_cost = cost; best_index = start + i; best_bitcost = bitcost; } - ++offset; } } @@ -717,21 +810,16 @@ // Do the final step of the search with a small pattern. 
for (i = 1; i < 5; ++i) { const vector2d_t *offset = &small_hexbs[i]; - unsigned cost; - { - PERFORMANCE_MEASURE_START(KVZ_PERF_SEARCHPX); - cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, - width, height, max_px_below_lcu); - cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); - PERFORMANCE_MEASURE_END(KVZ_PERF_SEARCHPX, state->encoder_control->threadqueue, "type=sad,step=small_hexbs,frame=%d,tile=%d,px_x=%d-%d,px_y=%d-%d,ref_px_x=%d-%d,ref_px_y=%d-%d", state->global->frame, state->tile->id, orig->x, orig->x + width, orig->y, orig->y + height, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, - (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x + width, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, - (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y + height); + if (!intmv_within_tile(state, orig, mv.x + offset->x, mv.y + offset->y, width, height, wpp_limit)) { + continue; } + unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, + (state->tile->lcu_offset_x * LCU_WIDTH) + orig->x + mv.x + offset->x, + (state->tile->lcu_offset_y * LCU_WIDTH) + orig->y + mv.y + offset->y, + width, height, -1); + cost += calc_mvd(state, mv.x + offset->x, mv.y + offset->y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); + if (cost > 0 && cost < best_cost) { best_cost = cost; best_index = i; @@ -753,61 +841,132 @@ } -#define IME_FULL_SEARCH_RADIUS 32 -static unsigned search_mv_full(const encoder_state_t * const state, +static unsigned search_mv_full(encoder_state_t * const state, unsigned width, unsigned height, const kvz_picture *pic, const kvz_picture *ref, const vector2d_t *orig, vector2d_t *mv_in_out, int16_t mv_cand[2][2], inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS], - 
int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out) + int16_t num_cand, int32_t ref_idx, const int32_t search_range, uint32_t *bitcost_out) { vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 }; + vector2d_t best_mv = { 0, 0 }; unsigned best_cost = UINT32_MAX; uint32_t best_bitcost = 0, bitcost; - const int max_lcu_below = state->encoder_control->owf ? 1 : -1; - - int (*calc_mvd)(const encoder_state_t * const, - int, int, int, - int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS], - int16_t, int32_t, uint32_t *) = - state->encoder_control->cfg->mv_rdo ? kvz_calc_mvd_cost_cabac : calc_mvd_cost; - - /*if (abs(mv.x) > IME_FULL_SEARCH_RADIUS || abs(mv.y) > IME_FULL_SEARCH_RADIUS) { - best_cost = calc_sad(pic, ref, orig->x, orig->y, - orig->x, orig->y, - block_width, block_width); - mv.x = 0; - mv.y = 0; - }*/ - - vector2d_t min_mv = { - mv.x - IME_FULL_SEARCH_RADIUS, - mv.y - IME_FULL_SEARCH_RADIUS, - }; - vector2d_t max_mv = { - mv.x + IME_FULL_SEARCH_RADIUS, - mv.y + IME_FULL_SEARCH_RADIUS, - }; + int wpp_limit = get_wpp_limit(state, orig); - for (int y = min_mv.y; y < max_mv.y; ++y) { - for (int x = min_mv.x; x < max_mv.x; ++x) { - unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, - orig->x + x, - orig->y + y, - width, height, - max_lcu_below); - cost += calc_mvd(state, x, y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); - if (cost < best_cost) { - best_cost = cost; - best_bitcost = bitcost; - mv.x = x; - mv.y = y; + kvz_mvd_cost_func *calc_mvd = calc_mvd_cost; + if (state->encoder_control->cfg->mv_rdo) { + calc_mvd = kvz_calc_mvd_cost_cabac; + } + + // Check the 0-vector, so we can ignore all 0-vectors in the merge cand list. 
+ if (intmv_within_tile(state, orig, 0, 0, width, height, wpp_limit)) { + vector2d_t min_mv = { 0 - search_range, 0 - search_range }; + vector2d_t max_mv = { 0 + search_range, 0 + search_range }; + + for (int y = min_mv.y; y <= max_mv.y; ++y) { + for (int x = min_mv.x; x <= max_mv.x; ++x) { + if (!intmv_within_tile(state, orig, x, y, width, height, wpp_limit)) { + continue; + } + unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, + orig->x + x, + orig->y + y, + width, height, -1); + cost += calc_mvd(state, x, y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); + if (cost < best_cost) { + best_cost = cost; + best_bitcost = bitcost; + best_mv.x = x; + best_mv.y = y; + } } } } - mv_in_out->x = mv.x << 2; - mv_in_out->y = mv.y << 2; + // Check mv_in if it's not one of the merge candidates. + if (!mv_in_merge(merge_cand, num_cand, &mv) && + intmv_within_tile(state, orig, mv.x, mv.y, width, height, wpp_limit)) + { + vector2d_t min_mv = { mv.x - search_range, mv.y - search_range }; + vector2d_t max_mv = { mv.x + search_range, mv.y + search_range }; + + for (int y = min_mv.y; y <= max_mv.y; ++y) { + for (int x = min_mv.x; x <= max_mv.x; ++x) { + if (!intmv_within_tile(state, orig, x, y, width, height, wpp_limit)) { + continue; + } + unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y, + orig->x + x, + orig->y + y, + width, height, -1); + cost += calc_mvd(state, x, y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost); + if (cost < best_cost) { + best_cost = cost; + best_bitcost = bitcost; + best_mv.x = x; + best_mv.y = y; + } + } + } + } + + // Select starting point from among merge candidates. These should include + // both mv_cand vectors and (0, 0). + for (int i = 0; i < num_cand; ++i) { + if (merge_cand[i].dir == 3) continue; + mv.x = merge_cand[i].mv[merge_cand[i].dir - 1][0] >> 2; + mv.y = merge_cand[i].mv[merge_cand[i].dir - 1][1] >> 2; + + // Ignore 0-vector because it has already been checked. 
+    if (mv.x == 0 && mv.y == 0) continue;
+
+    vector2d_t min_mv = { mv.x - search_range, mv.y - search_range };
+    vector2d_t max_mv = { mv.x + search_range, mv.y + search_range };
+
+    for (int y = min_mv.y; y <= max_mv.y; ++y) {
+      for (int x = min_mv.x; x <= max_mv.x; ++x) {
+        if (!intmv_within_tile(state, orig, x, y, width, height, wpp_limit)) {
+          continue;
+        }
+
+        // Avoid calculating the same points over and over again.
+        bool already_tested = false;
+        for (int j = -1; j < i; ++j) {
+          int xx = 0;
+          int yy = 0;
+          if (j >= 0) {
+            if (merge_cand[j].dir == 3) continue;
+            xx = merge_cand[j].mv[merge_cand[j].dir - 1][0] >> 2;
+            yy = merge_cand[j].mv[merge_cand[j].dir - 1][1] >> 2;
+          }
+          if (x >= xx - search_range && x <= xx + search_range &&
+              y >= yy - search_range && y <= yy + search_range)
+          {
+            already_tested = true;
+            x = xx + search_range;
+            break;
+          }
+        }
+        if (already_tested) continue;
+
+        unsigned cost = kvz_image_calc_sad(pic, ref, orig->x, orig->y,
+                                           orig->x + x,
+                                           orig->y + y,
+                                           width, height, -1);
+        cost += calc_mvd(state, x, y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
+        if (cost < best_cost) {
+          best_cost = cost;
+          best_bitcost = bitcost;
+          best_mv.x = x;
+          best_mv.y = y;
+        }
+      }
+    }
+  }
+
+  mv_in_out->x = best_mv.x << 2;
+  mv_in_out->y = best_mv.y << 2;
 
   *bitcost_out = best_bitcost;
 
@@ -830,7 +989,7 @@
  * Algorithm first searches 1/2-pel positions around integer mv and after best match is found,
  * refines the search by searching best 1/4-pel position around best 1/2-pel position.
 */
-static unsigned search_frac(const encoder_state_t * const state,
+static unsigned search_frac(encoder_state_t * const state,
                             unsigned width, unsigned height,
                             const kvz_picture *pic, const kvz_picture *ref,
                             const vector2d_t *orig, vector2d_t *mv_in_out,
@@ -838,39 +997,55 @@
                             int16_t num_cand, int32_t ref_idx, uint32_t *bitcost_out)
 {
   // Map indices to relative coordinates in the following way:
-  // 6 7 8
-  // 3 4 5
-  // 0 1 2
+  // 5 3 6
+  // 1 0 2
+  // 7 4 8
   static const vector2d_t square[9] = {
-      { -1, 1 },  { 0, 1 },  { 1, 1 },
-      { -1, 0 },  { 0, 0 },  { 1, 0 },
-      { -1, -1 }, { 0, -1 }, { 1, -1 }
+      { 0, 0 },  { -1, 0 }, { 1, 0 },
+      { 0, -1 }, { 0, 1 },  { -1, -1 },
+      { 1, -1 }, { -1, 1 }, { 1, 1 }
   };
 
+  int wpp_limit = get_wpp_limit(state, orig);
+
   //Set mv to halfpel precision
   vector2d_t mv = { mv_in_out->x >> 2, mv_in_out->y >> 2 };
   unsigned best_cost = UINT32_MAX;
-  uint32_t best_bitcost = 0, bitcost;
+  uint32_t best_bitcost = 0;
+  uint32_t bitcosts[4] = { 0 };
   unsigned i;
-  unsigned best_index = 0; // Index of large_hexbs or finally small_hexbs.
-
-  unsigned cost = 0;
+  unsigned best_index = 0;
 
-  vector2d_t halfpel_offset;
+  unsigned costs[4] = { 0 };
 
-  #define FILTER_SIZE 8
-  #define HALF_FILTER (FILTER_SIZE>>1)
+  kvz_extended_block src = { 0, 0, 0, 0 };
 
-  kvz_extended_block src = { 0, 0, 0 };
+  // Buffers for interpolated fractional pixels, one
+  // for each position excluding the integer position.
+  // Has one extra column on the left and row on top because
+  // samples are also used from those integer pixels when
+  // searching positions to the left and up.
+ frac_search_block fracpel_blocks[15]; + + kvz_pixel *hpel_pos[8]; + + // Horizontal hpel positions + hpel_pos[0] = fracpel_blocks[HPEL_POS_HOR] + (LCU_WIDTH + 1); + hpel_pos[1] = fracpel_blocks[HPEL_POS_HOR] + (LCU_WIDTH + 1) + 1; + + // Vertical hpel positions + hpel_pos[2] = fracpel_blocks[HPEL_POS_VER] + 1; + hpel_pos[3] = fracpel_blocks[HPEL_POS_VER] + (LCU_WIDTH + 1) + 1; + + // Diagonal hpel positions + hpel_pos[4] = fracpel_blocks[HPEL_POS_DIA]; + hpel_pos[5] = fracpel_blocks[HPEL_POS_DIA] + 1; + hpel_pos[6] = fracpel_blocks[HPEL_POS_DIA] + (LCU_WIDTH + 1); + hpel_pos[7] = fracpel_blocks[HPEL_POS_DIA] + (LCU_WIDTH + 1) + 1; - //destination buffer for interpolation - int dst_stride = (width + 1) * 4; - kvz_pixel dst[(LCU_WIDTH+1) * (LCU_WIDTH+1) * 16]; - kvz_pixel* dst_off = &dst[dst_stride*4+4]; + int fme_level = state->encoder_control->fme_level; - int(*calc_mvd)(const encoder_state_t * const, int, int, int, - int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS], - int16_t, int32_t, uint32_t *) = calc_mvd_cost; + kvz_mvd_cost_func *calc_mvd = calc_mvd_cost; if (state->encoder_control->cfg->mv_rdo) { calc_mvd = kvz_calc_mvd_cost_cabac; } @@ -880,92 +1055,160 @@ state->tile->lcu_offset_y * LCU_WIDTH, ref->y, ref->width, ref->height, FILTER_SIZE, width+1, height+1, &src); - kvz_filter_inter_quarterpel_luma(state->encoder_control, src.orig_topleft, src.stride, width+1, - height+1, dst, dst_stride, 1, 1); + kvz_filter_frac_blocks_luma(state->encoder_control, src.orig_topleft, src.stride, width, + height, fracpel_blocks, fme_level); - if (src.malloc_used) free(src.buffer); + kvz_pixel tmp_pic[LCU_WIDTH*LCU_WIDTH]; + kvz_pixels_blit(pic->y + orig->y*pic->width + orig->x, tmp_pic, width, height, pic->stride, width); + + // Search integer position + costs[0] = kvz_satd_any_size(width, height, + tmp_pic, width, + src.orig_topleft + src.stride + 1, src.stride); + + costs[0] += calc_mvd(state, mv.x, mv.y, 2, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[0]); + 
+  best_cost = costs[0];
+  best_bitcost = bitcosts[0];
+
+  int last_hpel_index = (fme_level == 1) ? 4 : 8;

   //Set mv to half-pixel precision
   mv.x <<= 1;
   mv.y <<= 1;

-  kvz_pixel tmp_filtered[LCU_WIDTH*LCU_WIDTH];
-  kvz_pixel tmp_pic[LCU_WIDTH*LCU_WIDTH];
-  kvz_pixels_blit(pic->y + orig->y*pic->width + orig->x, tmp_pic, width, height, pic->stride, width);
-
-  // Search halfpel positions around best integer mv
-  for (i = 0; i < 9; ++i) {
-    const vector2d_t *pattern = &square[i];
-
-    int y,x;
-    for(y = 0; y < height; ++y) {
-      int dst_y = y*4+pattern->y*2;
-      for(x = 0; x < width; ++x) {
-        int dst_x = x*4+pattern->x*2;
-        tmp_filtered[y*width+x] = dst_off[dst_y*dst_stride+dst_x];
-      }
-    }
+  for (i = 1; i <= last_hpel_index; i+=4) {
+    const vector2d_t *pattern[4] = { &square[i], &square[i + 1], &square[i + 2], &square[i + 3] };
+
+    int8_t within_tile[4] = {
+      fracmv_within_tile(state, orig, (mv.x + pattern[0]->x) << 1, (mv.y + pattern[0]->y) << 1, width, height, wpp_limit),
+      fracmv_within_tile(state, orig, (mv.x + pattern[1]->x) << 1, (mv.y + pattern[1]->y) << 1, width, height, wpp_limit),
+      fracmv_within_tile(state, orig, (mv.x + pattern[2]->x) << 1, (mv.y + pattern[2]->y) << 1, width, height, wpp_limit),
+      fracmv_within_tile(state, orig, (mv.x + pattern[3]->x) << 1, (mv.y + pattern[3]->y) << 1, width, height, wpp_limit),
+    };

-    cost = kvz_satd_any_size(width, height,
-                             tmp_pic, width,
-                             tmp_filtered, width);
+    int hpel_strides[4] = {
+      (LCU_WIDTH + 1),
+      (LCU_WIDTH + 1),
+      (LCU_WIDTH + 1),
+      (LCU_WIDTH + 1)
+    };

-    cost += calc_mvd(state, mv.x + pattern->x, mv.y + pattern->y, 1, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
+    kvz_satd_any_size_quad(width, height, (const kvz_pixel**)(hpel_pos + i - 1), hpel_strides, tmp_pic, width, 4, costs, within_tile);

-    if (cost < best_cost) {
-      best_cost = cost;
-      best_index = i;
-      best_bitcost = bitcost;
+    costs[0] += calc_mvd(state, mv.x + pattern[0]->x, mv.y + pattern[0]->y, 1, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[0]);
+    costs[1] += calc_mvd(state, mv.x + pattern[1]->x, mv.y + pattern[1]->y, 1, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[1]);
+    costs[2] += calc_mvd(state, mv.x + pattern[2]->x, mv.y + pattern[2]->y, 1, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[2]);
+    costs[3] += calc_mvd(state, mv.x + pattern[3]->x, mv.y + pattern[3]->y, 1, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[3]);
+
+    for (int j = 0; j < 4; ++j) {
+      if (within_tile[j] && costs[j] < best_cost) {
+        best_cost = costs[j];
+        best_index = i + j;
+        best_bitcost = bitcosts[j];
+      }
     }
   }

-  //Set mv to best match
+  unsigned int best_hpel_index = best_index;
+
+  // Move search to best_index
   mv.x += square[best_index].x;
   mv.y += square[best_index].y;

-  halfpel_offset.x = square[best_index].x*2;
-  halfpel_offset.y = square[best_index].y*2;
-
-  //Set mv to quarterpel precision
   mv.x <<= 1;
   mv.y <<= 1;

-  //Search quarterpel points around best halfpel mv
-  for (i = 0; i < 9; ++i) {
-    const vector2d_t *pattern = &square[i];
+  if (fme_level >= 3) {

-    int y,x;
-    for(y = 0; y < height; ++y) {
-      int dst_y = y*4+halfpel_offset.y+pattern->y;
-      for(x = 0; x < width; ++x) {
-        int dst_x = x*4+halfpel_offset.x+pattern->x;
-        tmp_filtered[y*width+x] = dst_off[dst_y*dst_stride+dst_x];
-      }
-    }
+    best_index = 0;

-    cost = kvz_satd_any_size(width, height,
-                             tmp_pic, width,
-                             tmp_filtered, width);
+    int last_qpel_index = (fme_level == 3) ? 4 : 8;

-    cost += calc_mvd(state, mv.x + pattern->x, mv.y + pattern->y, 0, mv_cand, merge_cand, num_cand, ref_idx, &bitcost);
+    //Search quarterpel points around best halfpel mv
+    for (i = 1; i <= last_qpel_index; i += 4) {
+      const vector2d_t *pattern[4] = { &square[i], &square[i + 1], &square[i + 2], &square[i + 3] };

-    if (cost < best_cost) {
-      best_cost = cost;
-      best_index = i;
-      best_bitcost = bitcost;
+      int8_t within_tile[4] = {
+        fracmv_within_tile(state, orig, (mv.x + pattern[0]->x), (mv.y + pattern[0]->y), width, height, wpp_limit),
+        fracmv_within_tile(state, orig, (mv.x + pattern[1]->x), (mv.y + pattern[1]->y), width, height, wpp_limit),
+        fracmv_within_tile(state, orig, (mv.x + pattern[2]->x), (mv.y + pattern[2]->y), width, height, wpp_limit),
+        fracmv_within_tile(state, orig, (mv.x + pattern[3]->x), (mv.y + pattern[3]->y), width, height, wpp_limit),
+      };
+
+      int qpel_indices[4] = { 0 };
+      int int_offset_x[4] = { 0 };
+      int int_offset_y[4] = { 0 };
+
+      for (int j = 0; j < 4; ++j) {
+        int hpel_offset_x = square[best_hpel_index].x;
+        int hpel_offset_y = square[best_hpel_index].y;
+
+        int qpel_offset_x = 2 * hpel_offset_x + pattern[j]->x;
+        int qpel_offset_y = 2 * hpel_offset_y + pattern[j]->y;
+
+        unsigned qpel_filter_x = (qpel_offset_x + 4) % 4;
+        unsigned qpel_filter_y = (qpel_offset_y + 4) % 4;
+
+        // The first value (-1) is for the integer position and
+        // it will not be used
+        int filters_to_block_idx[4][4] = {
+          { -1,  3,  0,  4 },
+          {  7, 11,  8, 12 },
+          {  1,  5,  2,  6 },
+          {  9, 13, 10, 14 }
+        };
+
+        qpel_indices[j] = filters_to_block_idx[qpel_filter_y][qpel_filter_x];
+
+        // Select values filtered from correct integer samples
+        int_offset_x[j] = qpel_offset_x >= 0;
+        int_offset_y[j] = qpel_offset_y >= 0;
+      }
+
+      kvz_pixel *qpel_pos[4] = {
+        fracpel_blocks[qpel_indices[0]] + int_offset_y[0] * (LCU_WIDTH + 1) + int_offset_x[0],
+        fracpel_blocks[qpel_indices[1]] + int_offset_y[1] * (LCU_WIDTH + 1) + int_offset_x[1],
+        fracpel_blocks[qpel_indices[2]] + int_offset_y[2] * (LCU_WIDTH + 1) + int_offset_x[2],
+        fracpel_blocks[qpel_indices[3]] + int_offset_y[3] * (LCU_WIDTH + 1) + int_offset_x[3]
+      };
+
+      int qpel_strides[4] = {
+        (LCU_WIDTH + 1),
+        (LCU_WIDTH + 1),
+        (LCU_WIDTH + 1),
+        (LCU_WIDTH + 1)
+      };
+
+      kvz_satd_any_size_quad(width, height, (const kvz_pixel**)qpel_pos, qpel_strides, tmp_pic, width, 4, costs, within_tile);
+
+      costs[0] += calc_mvd(state, mv.x + pattern[0]->x, mv.y + pattern[0]->y, 0, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[0]);
+      costs[1] += calc_mvd(state, mv.x + pattern[1]->x, mv.y + pattern[1]->y, 0, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[1]);
+      costs[2] += calc_mvd(state, mv.x + pattern[2]->x, mv.y + pattern[2]->y, 0, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[2]);
+      costs[3] += calc_mvd(state, mv.x + pattern[3]->x, mv.y + pattern[3]->y, 0, mv_cand, merge_cand, num_cand, ref_idx, &bitcosts[3]);
+
+      for (int j = 0; j < 4; ++j) {
+        if (within_tile[j] && costs[j] < best_cost) {
+          best_cost = costs[j];
+          best_index = i + j;
+          best_bitcost = bitcosts[j];
+        }
+      }
     }
-  }

-  //Set mv to best final best match
-  mv.x += square[best_index].x;
-  mv.y += square[best_index].y;
+    //Set mv to best final best match
+    mv.x += square[best_index].x;
+    mv.y += square[best_index].y;
+  }

   mv_in_out->x = mv.x;
   mv_in_out->y = mv.y;

   *bitcost_out = best_bitcost;

+  if (src.malloc_used) free(src.buffer);
+
   return best_cost;
 }

@@ -973,7 +1216,7 @@
 /**
  * \brief Perform inter search for a single reference frame.
  */
-static void search_pu_inter_ref(const encoder_state_t * const state,
+static void search_pu_inter_ref(encoder_state_t * const state,
                                 int x, int y,
                                 int width, int height,
                                 int depth,
@@ -982,19 +1225,21 @@
                                 inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
                                 int16_t num_cand,
                                 unsigned ref_idx,
-                                uint32_t(*get_mvd_cost)(vector2d_t *, cabac_data_t*))
+                                uint32_t(*get_mvd_cost)(encoder_state_t * const, vector2d_t *, const cabac_data_t*),
+                                double *inter_cost,
+                                uint32_t *inter_bitcost)
 {
   const int x_cu = x >> 3;
   const int y_cu = y >> 3;
   const videoframe_t * const frame = state->tile->frame;
-  kvz_picture *ref_image = state->global->ref->images[ref_idx];
+  kvz_picture *ref_image = state->frame->ref->images[ref_idx];
   uint32_t temp_bitcost = 0;
   uint32_t temp_cost = 0;
-  vector2d_t orig, mvd;
+  vector2d_t orig;
   int32_t merged = 0;
   uint8_t cu_mv_cand = 0;
   int8_t merge_idx = 0;
-  int8_t ref_list = state->global->refmap[ref_idx].list-1;
+  int8_t ref_list = state->frame->refmap[ref_idx].list-1;
   int8_t temp_ref_idx = cur_cu->inter.mv_ref[ref_list];
   orig.x = x_cu * CU_MIN_SIZE_PIXELS;
   orig.y = y_cu * CU_MIN_SIZE_PIXELS;
@@ -1009,10 +1254,14 @@
     // Take starting point for MV search from previous frame.
     // When temporal motion vector candidates are added, there is probably
     // no point to this anymore, but for now it helps.
-    // TODO: Update this to work with SMP/AMP blocks.
-    int mid_x_cu = (x + (width >> 1)) / 8;
-    int mid_y_cu = (y + (height >> 1)) / 8;
-    cu_info_t *ref_cu = &state->global->ref->cu_arrays[ref_idx]->data[mid_x_cu + mid_y_cu * (frame->width_in_lcu << MAX_DEPTH)];
+    const vector2d_t tile_top_left_corner = {
+      (state->tile->lcu_offset_x << LOG2_LCU_WIDTH),
+      (state->tile->lcu_offset_y << LOG2_LCU_WIDTH)
+    };
+    const int mid_x = tile_top_left_corner.x + x + (width >> 1);
+    const int mid_y = tile_top_left_corner.y + y + (height >> 1);
+    const cu_array_t* ref_array = state->frame->ref->cu_arrays[ref_idx];
+    const cu_info_t* ref_cu = kvz_cu_array_at_const(ref_array, mid_x, mid_y);
     if (ref_cu->type == CU_INTER) {
       if (ref_cu->inter.mv_dir & 1) {
         mv.x = ref_cu->inter.mv[0][0];
@@ -1024,6 +1273,15 @@
     }
   }

+  int search_range = 32;
+  switch (state->encoder_control->cfg->ime_algorithm) {
+    case KVZ_IME_FULL64: search_range = 64; break;
+    case KVZ_IME_FULL32: search_range = 32; break;
+    case KVZ_IME_FULL16: search_range = 16; break;
+    case KVZ_IME_FULL8: search_range = 8; break;
+    default: break;
+  }
+
   switch (state->encoder_control->cfg->ime_algorithm) {
     case KVZ_IME_TZ:
       temp_cost += tz_search(state,
@@ -1039,6 +1297,11 @@
                              &temp_bitcost);
       break;
+
+    case KVZ_IME_FULL64:
+    case KVZ_IME_FULL32:
+    case KVZ_IME_FULL16:
+    case KVZ_IME_FULL8:
     case KVZ_IME_FULL:
       temp_cost += search_mv_full(state,
                                   width, height,
@@ -1050,6 +1313,7 @@
                                   merge_cand,
                                   num_cand,
                                   ref_idx,
+                                  search_range,
                                   &temp_bitcost);
       break;
@@ -1081,7 +1345,7 @@
                          ref_idx,
                          &temp_bitcost);
   }
-  
+
   merged = 0;
   // Check every candidate to find a match
   for(merge_idx = 0; merge_idx < num_cand; merge_idx++) {
@@ -1101,36 +1365,32 @@
     mvd_temp1.x = mv.x - mv_cand[0][0];
     mvd_temp1.y = mv.y - mv_cand[0][1];
-    cand1_cost = get_mvd_cost(&mvd_temp1, (cabac_data_t*)&state->cabac);
+    cand1_cost = get_mvd_cost(state, &mvd_temp1, &state->cabac);

     mvd_temp2.x = mv.x - mv_cand[1][0];
     mvd_temp2.y = mv.y - mv_cand[1][1];
-    cand2_cost = get_mvd_cost(&mvd_temp2, (cabac_data_t*)&state->cabac);
+    cand2_cost = get_mvd_cost(state, &mvd_temp2, &state->cabac);

     // Select candidate 1 if it has lower cost
     if (cand2_cost < cand1_cost) {
       cu_mv_cand = 1;
     }
   }

-  mvd.x = mv.x - mv_cand[cu_mv_cand][0];
-  mvd.y = mv.y - mv_cand[cu_mv_cand][1];
-
-  if(temp_cost < cur_cu->inter.cost) {
+  if (temp_cost < *inter_cost) {
     // Map reference index to L0/L1 pictures
     cur_cu->inter.mv_dir = ref_list+1;
-    cur_cu->inter.mv_ref_coded[ref_list] = state->global->refmap[ref_idx].idx;
+    uint8_t mv_ref_coded = state->frame->refmap[ref_idx].idx;

     cur_cu->merged = merged;
     cur_cu->merge_idx = merge_idx;
     cur_cu->inter.mv_ref[ref_list] = ref_idx;
     cur_cu->inter.mv[ref_list][0] = (int16_t)mv.x;
     cur_cu->inter.mv[ref_list][1] = (int16_t)mv.y;
-    cur_cu->inter.mvd[ref_list][0] = (int16_t)mvd.x;
-    cur_cu->inter.mvd[ref_list][1] = (int16_t)mvd.y;
-    cur_cu->inter.cost = temp_cost;
-    cur_cu->inter.bitcost = temp_bitcost + cur_cu->inter.mv_dir - 1 + cur_cu->inter.mv_ref_coded[ref_list];
-    cur_cu->inter.mv_cand[ref_list] = cu_mv_cand;
+    CU_SET_MV_CAND(cur_cu, ref_list, cu_mv_cand);
+
+    *inter_cost = temp_cost;
+    *inter_bitcost = temp_bitcost + cur_cu->inter.mv_dir - 1 + mv_ref_coded;
   }
 }

@@ -1146,15 +1406,21 @@
  * \param i_pu  index of the PU in the CU
  * \param lcu   containing LCU
  *
- * \return cost of the best mode
+ * \param inter_cost    Return inter cost of the best mode
+ * \param inter_bitcost Return inter bitcost of the best mode
  */
-static int search_pu_inter(const encoder_state_t * const state,
-                           int x_cu, int y_cu,
-                           int depth,
-                           part_mode_t part_mode,
-                           int i_pu,
-                           lcu_t *lcu)
+static void search_pu_inter(encoder_state_t * const state,
+                            int x_cu, int y_cu,
+                            int depth,
+                            part_mode_t part_mode,
+                            int i_pu,
+                            lcu_t *lcu,
+                            double *inter_cost,
+                            uint32_t *inter_bitcost)
 {
+  *inter_cost = MAX_INT;
+  *inter_bitcost = MAX_INT;
+
   const videoframe_t * const frame = state->tile->frame;
   const int width_cu = LCU_WIDTH >> depth;
   const int x = PU_GET_X(part_mode, width_cu, x_cu, i_pu);
@@ -1184,32 +1450,17 @@
                          merge_cand, lcu);

-  uint32_t(*get_mvd_cost)(vector2d_t *, cabac_data_t*) = get_mvd_coding_cost;
+  uint32_t(*get_mvd_cost)(encoder_state_t * const state, vector2d_t *, const cabac_data_t*) = get_mvd_coding_cost;
   if (state->encoder_control->cfg->mv_rdo) {
     get_mvd_cost = kvz_get_mvd_coding_cost_cabac;
   }

-  int max_px_below_lcu = -1;
-  if (state->encoder_control->owf) {
-    max_px_below_lcu = LCU_WIDTH;
-    if (state->encoder_control->fme_level > 0) {
-      // Fractional motion estimation can change the mv by at most 1 pixel.
-      max_px_below_lcu -= 1;
-    }
-    if (state->encoder_control->deblock_enable) {
-      // Strong deblock filter modifies 3 pixels.
-      max_px_below_lcu -= 3;
-    }
-  }
-
   // Default to candidate 0
-  cur_cu->inter.mv_cand[0] = 0;
-  cur_cu->inter.mv_cand[1] = 0;
-
-  cur_cu->inter.cost = UINT_MAX;
+  CU_SET_MV_CAND(cur_cu, 0, 0);
+  CU_SET_MV_CAND(cur_cu, 1, 0);

   uint32_t ref_idx;
-  for (ref_idx = 0; ref_idx < state->global->ref->used_size; ref_idx++) {
+  for (ref_idx = 0; ref_idx < state->frame->ref->used_size; ref_idx++) {
     search_pu_inter_ref(state,
                         x, y,
                         width, height,
@@ -1217,11 +1468,17 @@
                         lcu, cur_cu,
                         mv_cand, merge_cand,
                         num_cand, ref_idx,
-                        get_mvd_cost);
+                        get_mvd_cost,
+                        inter_cost,
+                        inter_bitcost);
   }

   // Search bi-pred positions
-  if (state->global->slicetype == KVZ_SLICE_B && state->encoder_control->cfg->bipred) {
+  bool can_use_bipred = state->frame->slicetype == KVZ_SLICE_B
+                        && state->encoder_control->cfg->bipred
+                        && width + height >= 16; // 4x8 and 8x4 PBs are restricted to unipred
+
+  if (can_use_bipred) {
     lcu_t *templcu = MALLOC(lcu_t, 1);
     unsigned cu_width = LCU_WIDTH >> depth;
 #define NUM_PRIORITY_LIST 12;
@@ -1230,9 +1487,7 @@
     uint8_t cutoff = num_cand;

-    int(*calc_mvd)(const encoder_state_t * const, int, int, int,
-                   int16_t[2][2], inter_merge_cand_t[MRG_MAX_NUM_CANDS],
-                   int16_t, int32_t, uint32_t *) = calc_mvd_cost;
+    kvz_mvd_cost_func *calc_mvd = calc_mvd_cost;
     if (state->encoder_control->cfg->mv_rdo) {
       calc_mvd = kvz_calc_mvd_cost_cabac;
     }
@@ -1254,30 +1509,26 @@
       kvz_pixel tmp_block[64 * 64];
       kvz_pixel tmp_pic[64 * 64];
       // Force L0 and L1 references
-      if (state->global->refmap[merge_cand[i].ref[0]].list == 2 || state->global->refmap[merge_cand[j].ref[1]].list == 1) continue;
+      if (state->frame->refmap[merge_cand[i].ref[0]].list == 2 || state->frame->refmap[merge_cand[j].ref[1]].list == 1) continue;

       mv[0][0] = merge_cand[i].mv[0][0];
       mv[0][1] = merge_cand[i].mv[0][1];
       mv[1][0] = merge_cand[j].mv[1][0];
       mv[1][1] = merge_cand[j].mv[1][1];

-      // Check boundaries when using owf to process multiple frames at the same time
-      if (max_px_below_lcu >= 0) {
-        // When SAO is off, row is considered reconstructed when the last LCU
-        // is done, although the bottom 2 pixels might still need deblocking.
-        // To work around this, add 2 luma pixels to the reach of the mv
-        // in order to avoid referencing those possibly non-deblocked pixels.
-        int mv_lcu_row_reach_1 = ((y+(mv[0][1]>>2)) + (LCU_WIDTH >> depth) - 1 + 2) / LCU_WIDTH;
-        int mv_lcu_row_reach_2 = ((y+(mv[1][1]>>2)) + (LCU_WIDTH >> depth) - 1 + 2) / LCU_WIDTH;
-        int cur_lcu_row = y / LCU_WIDTH;
-        if (mv_lcu_row_reach_1 > cur_lcu_row + max_px_below_lcu || mv_lcu_row_reach_2 > cur_lcu_row + max_px_below_lcu) {
+      {
+        // Don't try merge candidates that don't satisfy mv constraints.
+        vector2d_t orig = { x, y };
+        if (!fracmv_within_tile(state, &orig, mv[0][0], mv[0][1], width, height, -1) ||
+            !fracmv_within_tile(state, &orig, mv[1][0], mv[1][1], width, height, -1))
+        {
           continue;
         }
       }

       kvz_inter_recon_lcu_bipred(state,
-                                 state->global->ref->images[merge_cand[i].ref[0]],
-                                 state->global->ref->images[merge_cand[j].ref[1]],
+                                 state->frame->ref->images[merge_cand[i].ref[0]],
+                                 state->frame->ref->images[merge_cand[j].ref[1]],
                                  x, y,
                                  width,
                                  height,
@@ -1298,13 +1549,13 @@
       cost += calc_mvd(state, merge_cand[i].mv[0][0], merge_cand[i].mv[0][1], 0, mv_cand, merge_cand, 0, ref_idx, &bitcost[0]);
       cost += calc_mvd(state, merge_cand[i].mv[1][0], merge_cand[i].mv[1][1], 0, mv_cand, merge_cand, 0, ref_idx, &bitcost[1]);

-      if (cost < cur_cu->inter.cost) {
+      if (cost < *inter_cost) {

         cur_cu->inter.mv_dir = 3;
-        cur_cu->inter.mv_ref_coded[0] = state->global->refmap[merge_cand[i].ref[0]].idx;
-        cur_cu->inter.mv_ref_coded[1] = state->global->refmap[merge_cand[j].ref[1]].idx;
-
-
+        uint8_t mv_ref_coded[2] = {
+          state->frame->refmap[merge_cand[i].ref[0]].idx,
+          state->frame->refmap[merge_cand[j].ref[1]].idx
+        };

         cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0];
         cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1];
@@ -1340,23 +1591,21 @@
           mvd_temp1.x = cur_cu->inter.mv[reflist][0] - mv_cand[0][0];
           mvd_temp1.y = cur_cu->inter.mv[reflist][1] - mv_cand[0][1];
-          cand1_cost = get_mvd_cost(&mvd_temp1, (cabac_data_t*)&state->cabac);
+          cand1_cost = get_mvd_cost(state, &mvd_temp1, (cabac_data_t*)&state->cabac);

           mvd_temp2.x = cur_cu->inter.mv[reflist][0] - mv_cand[1][0];
           mvd_temp2.y = cur_cu->inter.mv[reflist][1] - mv_cand[1][1];
-          cand2_cost = get_mvd_cost(&mvd_temp2, (cabac_data_t*)&state->cabac);
+          cand2_cost = get_mvd_cost(state, &mvd_temp2, (cabac_data_t*)&state->cabac);

           // Select candidate 1 if it has lower cost
           if (cand2_cost < cand1_cost) {
             cu_mv_cand = 1;
           }
         }

-        cur_cu->inter.mvd[reflist][0] = cur_cu->inter.mv[reflist][0] - mv_cand[cu_mv_cand][0];
-        cur_cu->inter.mvd[reflist][1] = cur_cu->inter.mv[reflist][1] - mv_cand[cu_mv_cand][1];
-        cur_cu->inter.mv_cand[reflist] = cu_mv_cand;
+        CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand);
         }

-        cur_cu->inter.cost = cost;
-        cur_cu->inter.bitcost = bitcost[0] + bitcost[1] + cur_cu->inter.mv_dir - 1 + cur_cu->inter.mv_ref_coded[0] + cur_cu->inter.mv_ref_coded[1];
+        *inter_cost = cost;
+        *inter_bitcost = bitcost[0] + bitcost[1] + cur_cu->inter.mv_dir - 1 + mv_ref_coded[0] + mv_ref_coded[1];
       }
     }
   }
@@ -1364,7 +1613,12 @@
     FREE_POINTER(templcu);
   }

-  return cur_cu->inter.cost;
+  if (*inter_cost < INT_MAX) {
+    const vector2d_t orig = { x, y };
+    if (cur_cu->inter.mv_dir == 1) {
+      assert(fracmv_within_tile(state, &orig, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1], width, height, -1));
+    }
+  }
 }

@@ -1379,11 +1633,21 @@
  * \param depth  depth of the CU in the quadtree
  * \param lcu    containing LCU
  *
- * \return cost of the best mode
+ * \param inter_cost    Return inter cost
+ * \param inter_bitcost Return inter bitcost
  */
-int kvz_search_cu_inter(const encoder_state_t * const state, int x, int y, int depth, lcu_t *lcu)
+void kvz_search_cu_inter(encoder_state_t * const state,
+                         int x, int y, int depth,
+                         lcu_t *lcu,
+                         double *inter_cost,
+                         uint32_t *inter_bitcost)
 {
-  return search_pu_inter(state, x, y, depth, SIZE_2Nx2N, 0, lcu);
+  search_pu_inter(state,
+                  x, y, depth,
+                  SIZE_2Nx2N, 0,
+                  lcu,
+                  inter_cost,
+                  inter_bitcost);
 }

@@ -1399,41 +1663,50 @@
  * \param part_mode  partition mode to search
  * \param lcu        containing LCU
  *
- * \return cost of the best mode
+ * \param inter_cost    Return inter cost
+ * \param inter_bitcost Return inter bitcost
  */
-int kvz_search_cu_smp(const encoder_state_t * const state,
-                      int x, int y,
-                      int depth,
-                      part_mode_t part_mode,
-                      lcu_t *lcu)
+void kvz_search_cu_smp(encoder_state_t * const state,
+                       int x, int y,
+                       int depth,
+                       part_mode_t part_mode,
+                       lcu_t *lcu,
+                       double *inter_cost,
+                       uint32_t *inter_bitcost)
 {
-  const int num_pu = kvz_part_mode_num_parts[part_mode];
-  const int width_scu = (LCU_WIDTH >> depth) >> MAX_DEPTH;
-  const int y_scu = SUB_SCU(y) >> MAX_DEPTH;
-  const int x_scu = SUB_SCU(x) >> MAX_DEPTH;
+  const int num_pu = kvz_part_mode_num_parts[part_mode];
+  const int width = LCU_WIDTH >> depth;
+  const int y_local = SUB_SCU(y);
+  const int x_local = SUB_SCU(x);
+
+  *inter_cost = 0;
+  *inter_bitcost = 0;

-  int cost = 0;
   for (int i = 0; i < num_pu; ++i) {
-    const int x_pu = PU_GET_X(part_mode, width_scu, x_scu, i);
-    const int y_pu = PU_GET_Y(part_mode, width_scu, y_scu, i);
-    const int width_pu = PU_GET_W(part_mode, width_scu, i);
-    const int height_pu = PU_GET_H(part_mode, width_scu, i);
-    cu_info_t *cur_pu = LCU_GET_CU(lcu, x_pu, y_pu);
+    const int x_pu = PU_GET_X(part_mode, width, x_local, i);
+    const int y_pu = PU_GET_Y(part_mode, width, y_local, i);
+    const int width_pu = PU_GET_W(part_mode, width, i);
+    const int height_pu = PU_GET_H(part_mode, width, i);
+    cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, x_pu, y_pu);

-    cur_pu->type = CU_INTER;
+    cur_pu->type      = CU_INTER;
     cur_pu->part_size = part_mode;
-    cur_pu->depth = depth;
+    cur_pu->depth     = depth;
+
+    double cost = MAX_INT;
+    uint32_t bitcost = MAX_INT;

-    cost += search_pu_inter(state, x, y, depth, part_mode, i, lcu);
+    search_pu_inter(state, x, y, depth, part_mode, i, lcu, &cost, &bitcost);

-    for (int y = y_pu; y < y_pu + height_pu; ++y) {
-      for (int x = x_pu; x < x_pu + width_pu; ++x) {
-        cu_info_t *scu = LCU_GET_CU(lcu, x, y);
+    *inter_cost += cost;
+    *inter_bitcost += bitcost;
+
+    for (int y = y_pu; y < y_pu + height_pu; y += SCU_WIDTH) {
+      for (int x = x_pu; x < x_pu + width_pu; x += SCU_WIDTH) {
+        cu_info_t *scu = LCU_GET_CU_AT_PX(lcu, x, y);
         scu->type = CU_INTER;
-        memcpy(&scu->inter, &cur_pu->inter, sizeof(cur_pu->inter));
+        scu->inter = cur_pu->inter;
       }
     }
   }
-
-  return cost;
 }
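Stripped of the SATD-quad batching and tile checks, the refinement pattern introduced in this diff is a two-level square search: shift the motion vector to half-pel units, test the 8 square-pattern neighbours, move to the cheapest, then repeat at quarter-pel. A minimal scalar sketch of that flow, using a caller-supplied toy cost function in place of SATD plus MVD bits (the names `refine_level` and `refine_fracpel` are illustrative, not kvazaar API):

```c
typedef struct { int x, y; } vec2;

/* Same 9-point square pattern the search uses; index 0 is the center. */
static const vec2 square[9] = {
  { 0,  0}, {-1, -1}, { 0, -1}, { 1, -1}, {-1,  0},
  { 1,  0}, {-1,  1}, { 0,  1}, { 1,  1}
};

/* One refinement level: try the 8 neighbours of mv and move to the
 * cheapest one, staying put if the center is already best. */
static vec2 refine_level(vec2 mv, int (*cost)(vec2))
{
  int best_cost = cost(mv);
  int best_index = 0;
  for (int i = 1; i <= 8; ++i) {
    vec2 cand = { mv.x + square[i].x, mv.y + square[i].y };
    int c = cost(cand);
    if (c < best_cost) {
      best_cost = c;
      best_index = i;
    }
  }
  mv.x += square[best_index].x;
  mv.y += square[best_index].y;
  return mv;
}

/* Two-level refinement: half-pel pass, then quarter-pel pass.
 * Each left shift doubles the MV resolution, as in the diff. */
vec2 refine_fracpel(vec2 int_mv, int (*cost)(vec2))
{
  vec2 mv = { int_mv.x << 1, int_mv.y << 1 };  /* to half-pel units */
  mv = refine_level(mv, cost);
  mv.x <<= 1; mv.y <<= 1;                      /* to quarter-pel units */
  mv = refine_level(mv, cost);
  return mv;                                   /* in quarter-pel units */
}
```

The real code additionally restricts each pass with `fme_level` (skipping diagonal or quarter-pel points on faster settings) and rejects candidates that fall outside the tile.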
View file
kvazaar-0.8.3.tar.gz/src/search_inter.h -> kvazaar-1.0.0.tar.gz/src/search_inter.h
Changed
@@ -26,16 +26,51 @@
  * Inter prediction parameter search.
  */

-#include "global.h"
-
+#include "cu.h"
 #include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep
+#include "inter.h"
+#include "kvazaar.h"
+
+#define FILTER_SIZE 8
+#define HALF_FILTER (FILTER_SIZE>>1)
+
+// Maximum extra width a block needs to filter
+// a fractional pixel with positive fractional mv.x and mv.y
+#define KVZ_EXT_PADDING (FILTER_SIZE - 1)
+
+// Maximum block width for extended block
+#define KVZ_EXT_BLOCK_W (LCU_WIDTH + KVZ_EXT_PADDING)
+
+typedef kvz_pixel frac_search_block[(LCU_WIDTH + 1) * (LCU_WIDTH + 1)];
+
+enum hpel_position {
+  HPEL_POS_HOR = 0,
+  HPEL_POS_VER = 1,
+  HPEL_POS_DIA = 2
+};
+
+typedef int kvz_mvd_cost_func(encoder_state_t * const state,
+                              int x, int y,
+                              int mv_shift,
+                              int16_t mv_cand[2][2],
+                              inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
+                              int16_t num_cand,
+                              int32_t ref_idx,
+                              uint32_t *bitcost);

-int kvz_search_cu_inter(const encoder_state_t * const state, int x, int y, int depth, lcu_t *lcu);
+void kvz_search_cu_inter(encoder_state_t * const state,
+                         int x, int y, int depth,
+                         lcu_t *lcu,
+                         double *inter_cost,
+                         uint32_t *inter_bitcost);

-int kvz_search_cu_smp(const encoder_state_t * const state,
-                      int x, int y,
-                      int depth,
-                      part_mode_t part_mode,
-                      lcu_t *lcu);
+void kvz_search_cu_smp(encoder_state_t * const state,
+                       int x, int y,
+                       int depth,
+                       part_mode_t part_mode,
+                       lcu_t *lcu,
+                       double *inter_cost,
+                       uint32_t *inter_bitcost);

 #endif // SEARCH_INTER_H_
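The padding macros added to this header encode straightforward arithmetic: an 8-tap interpolation filter reaching a fractional sample needs `FILTER_SIZE - 1 = 7` extra pixels of margin around a block, so the extended block for a 64-pixel LCU is 71 pixels wide, while each fractional-pel candidate block keeps one extra row and column. A standalone restatement of the same arithmetic, assuming `LCU_WIDTH` is 64 as in kvazaar:

```c
#define FILTER_SIZE 8
#define LCU_WIDTH 64

/* Extra width needed to run an 8-tap filter over a block: the filter
 * window extends FILTER_SIZE - 1 samples past the block edge. */
#define KVZ_EXT_PADDING (FILTER_SIZE - 1)

/* Width of the extended block: a full LCU plus the filter padding. */
#define KVZ_EXT_BLOCK_W (LCU_WIDTH + KVZ_EXT_PADDING)

int ext_block_width(void) { return KVZ_EXT_BLOCK_W; }

/* Sample count of one fractional-pel candidate block, matching the
 * frac_search_block typedef: (LCU_WIDTH + 1) * (LCU_WIDTH + 1). */
int frac_block_samples(void) { return (LCU_WIDTH + 1) * (LCU_WIDTH + 1); }
```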
View file
kvazaar-0.8.3.tar.gz/src/search_intra.c -> kvazaar-1.0.0.tar.gz/src/search_intra.c
Changed
@@ -20,12 +20,18 @@

 #include "search_intra.h"

+#include <limits.h>
+
+#include "cabac.h"
+#include "encoder.h"
 #include "encoderstate.h"
-#include "videoframe.h"
-#include "strategies/strategies-picture.h"
+#include "image.h"
+#include "intra.h"
+#include "kvazaar.h"
 #include "rdo.h"
 #include "search.h"
-#include "intra.h"
+#include "strategies/strategies-picture.h"
+#include "videoframe.h"

 // Normalize SAD for comparison against SATD to estimate transform skip

@@ -105,10 +111,13 @@
   // versus signaling 'luma and chroma don't use trskip' to the SAD cost.
   const cabac_ctx_t *ctx = &state->cabac.ctx.transform_skip_model_luma;
   double trskip_bits = CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0);
-  ctx = &state->cabac.ctx.transform_skip_model_chroma;
-  trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0));

-  double sad_cost = TRSKIP_RATIO * sad_func(pred, orig_block) + state->global->cur_lambda_cost_sqrt * trskip_bits;
+  if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+    ctx = &state->cabac.ctx.transform_skip_model_chroma;
+    trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0));
+  }
+
+  double sad_cost = TRSKIP_RATIO * sad_func(pred, orig_block) + state->frame->cur_lambda_cost_sqrt * trskip_bits;
   if (sad_cost < satd_cost) {
     return sad_cost;
   }
@@ -145,14 +154,17 @@
   // versus signaling 'luma and chroma don't use trskip' to the SAD cost.
   const cabac_ctx_t *ctx = &state->cabac.ctx.transform_skip_model_luma;
   double trskip_bits = CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0);
-  ctx = &state->cabac.ctx.transform_skip_model_chroma;
-  trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0));
+
+  if (state->encoder_control->chroma_format != KVZ_CSP_400) {
+    ctx = &state->cabac.ctx.transform_skip_model_chroma;
+    trskip_bits += 2.0 * (CTX_ENTROPY_FBITS(ctx, 1) - CTX_ENTROPY_FBITS(ctx, 0));
+  }

   unsigned unsigned_sad_costs[PARALLEL_BLKS] = { 0 };
   double sad_costs[PARALLEL_BLKS] = { 0 };
   sad_twin_func(preds, orig_block, PARALLEL_BLKS, unsigned_sad_costs);
   for (int i = 0; i < PARALLEL_BLKS; ++i) {
-    sad_costs[i] = TRSKIP_RATIO * (double)unsigned_sad_costs[i] + state->global->cur_lambda_cost_sqrt * trskip_bits;
+    sad_costs[i] = TRSKIP_RATIO * (double)unsigned_sad_costs[i] + state->frame->cur_lambda_cost_sqrt * trskip_bits;
     if (sad_costs[i] < (double)satd_costs[i]) {
       costs_out[i] = sad_costs[i];
     }
@@ -189,14 +201,14 @@
   const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
   cu_info_t *const tr_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);

-  const bool reconstruct_chroma = !(x_px & 4 || y_px & 4);
+  const bool reconstruct_chroma = !(x_px & 4 || y_px & 4) && state->encoder_control->chroma_format != KVZ_CSP_400;

   struct {
     kvz_pixel y[TR_MAX_WIDTH*TR_MAX_WIDTH];
     kvz_pixel u[TR_MAX_WIDTH*TR_MAX_WIDTH];
     kvz_pixel v[TR_MAX_WIDTH*TR_MAX_WIDTH];
   } nosplit_pixels;
-  cu_cbf_t nosplit_cbf = { .y = 0, .u = 0, .v = 0 };
+  uint16_t nosplit_cbf = 0;

   double split_cost = INT32_MAX;
   double nosplit_cost = INT32_MAX;
@@ -207,14 +219,14 @@
     nosplit_cost = 0.0;

-    cbf_clear(&pred_cu->cbf.y, depth + PU_INDEX(x_px / 4, y_px / 4));
+    cbf_clear(&pred_cu->cbf, depth, COLOR_Y);

     kvz_intra_recon_lcu_luma(state, x_px, y_px, depth, intra_mode, pred_cu, lcu);
     nosplit_cost += kvz_cu_rd_cost_luma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);

     if (reconstruct_chroma) {
-      cbf_clear(&pred_cu->cbf.u, depth);
-      cbf_clear(&pred_cu->cbf.v, depth);
+      cbf_clear(&pred_cu->cbf, depth, COLOR_U);
+      cbf_clear(&pred_cu->cbf, depth, COLOR_V);

       kvz_intra_recon_lcu_chroma(state, x_px, y_px, depth, intra_mode, pred_cu, lcu);
       nosplit_cost += kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, pred_cu, lcu);
@@ -242,7 +254,7 @@
   //   max_depth.
   // - Min transform size hasn't been reached (MAX_PU_DEPTH).
   if (depth < max_depth && depth < MAX_PU_DEPTH) {
-    split_cost = 3 * state->global->cur_lambda_cost;
+    split_cost = 3 * state->frame->cur_lambda_cost;

     split_cost += search_intra_trdepth(state, x_px, y_px, depth + 1, max_depth, intra_mode, nosplit_cost, pred_cu, lcu);
     if (split_cost < nosplit_cost) {
@@ -271,20 +283,20 @@
     // if this and any previous transform block has no chroma coefficients.
     // When searching the first block we don't actually know the real values,
     // so this will code cbf as 0 and not code the cbf at all for descendants.
-    {
+    if (state->encoder_control->chroma_format != KVZ_CSP_400) {
       const uint8_t tr_depth = depth - pred_cu->depth;

       const cabac_ctx_t *ctx = &(state->cabac.ctx.qt_cbf_model_chroma[tr_depth]);
-      if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.u, depth - 1)) {
-        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.u, depth));
+      if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_U)) {
+        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_U));
       }
-      if (tr_depth == 0 || cbf_is_set(pred_cu->cbf.v, depth - 1)) {
-        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf.v, depth));
+      if (tr_depth == 0 || cbf_is_set(pred_cu->cbf, depth - 1, COLOR_V)) {
+        cbf_bits += CTX_ENTROPY_FBITS(ctx, cbf_is_set(pred_cu->cbf, depth, COLOR_V));
       }
     }

     double bits = tr_split_bit + cbf_bits;
-    split_cost += bits * state->global->cur_lambda_cost;
+    split_cost += bits * state->frame->cur_lambda_cost;
   } else {
     assert(width <= TR_MAX_WIDTH);
   }
@@ -337,7 +349,7 @@
   kvz_pixels_blit(orig_u, orig_block, width, width, origstride, width);
   for (int i = 0; i < 5; ++i) {
     if (modes[i] == luma_mode) continue;
-    kvz_intra_predict(refs_u, log2_width_c, modes[i], COLOR_U, pred);
+    kvz_intra_predict(refs_u, log2_width_c, modes[i], COLOR_U, pred, false);
     //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
     costs[i] += satd_func(pred, orig_block);
   }
@@ -345,7 +357,7 @@
   kvz_pixels_blit(orig_v, orig_block, width, width, origstride, width);
   for (int i = 0; i < 5; ++i) {
     if (modes[i] == luma_mode) continue;
-    kvz_intra_predict(refs_v, log2_width_c, modes[i], COLOR_V, pred);
+    kvz_intra_predict(refs_v, log2_width_c, modes[i], COLOR_V, pred, false);
     //costs[i] += get_cost(encoder_state, pred, orig_block, satd_func, sad_func, width);
     costs[i] += satd_func(pred, orig_block);
   }
@@ -398,6 +410,9 @@
   cost_pixel_nxn_multi_func *satd_dual_func = kvz_pixels_get_satd_dual_func(width);
   cost_pixel_nxn_multi_func *sad_dual_func = kvz_pixels_get_sad_dual_func(width);

+  const kvz_config *cfg = state->encoder_control->cfg;
+  const bool filter_boundary = !(cfg->lossless && cfg->implicit_rdpcm);
+
   // Temporary block arrays
   kvz_pixel _preds[PARALLEL_BLKS * 32 * 32 + SIMD_ALIGNMENT];
   pred_buffer preds = ALIGNED_POINTER(_preds, SIMD_ALIGNMENT);
@@ -428,7 +443,9 @@
     double costs_out[PARALLEL_BLKS] = { 0 };

     for (int i = 0; i < PARALLEL_BLKS; ++i) {
-      if (mode + i * offset <= 34) kvz_intra_predict(refs, log2_width, mode + i * offset, COLOR_Y, preds[i]);
+      if (mode + i * offset <= 34) {
+        kvz_intra_predict(refs, log2_width, mode + i * offset, COLOR_Y, preds[i], filter_boundary);
+      }
     }

     //TODO: add generic version of get cost multi
@@ -465,7 +482,9 @@

     if (mode_in_range) {
       for (int i = 0; i < PARALLEL_BLKS; ++i) {
-        if (test_modes[i] >= 2 && test_modes[i] <= 34) kvz_intra_predict(refs, log2_width, test_modes[i], COLOR_Y, preds[i]);
+        if (test_modes[i] >= 2 && test_modes[i] <= 34) {
+          kvz_intra_predict(refs, log2_width, test_modes[i], COLOR_Y, preds[i], filter_boundary);
+        }
       }

       //TODO: add generic version of get cost multi
@@ -501,7 +520,7 @@
     }

     if (!has_mode) {
-      kvz_intra_predict(refs, log2_width, mode, COLOR_Y, preds[0]);
+      kvz_intra_predict(refs, log2_width, mode, COLOR_Y, preds[0], filter_boundary);
       costs[modes_selected] = get_cost(state, preds[0], orig_block, satd_func, sad_func, width);
       modes[modes_selected] = mode;
       ++modes_selected;
@@ -510,7 +529,7 @@

   // Add prediction mode coding cost as the last thing. We don't want this
   // affecting the halving search.
-  int lambda_cost = (int)(state->global->cur_lambda_cost_sqrt + 0.5);
+  int lambda_cost = (int)(state->frame->cur_lambda_cost_sqrt + 0.5);
   for (int mode_i = 0; mode_i < modes_selected; ++mode_i) {
     costs[mode_i] += lambda_cost * kvz_luma_mode_bits(state, modes[mode_i], intra_preds);
   }
@@ -581,18 +600,15 @@

   for(int rdo_mode = 0; rdo_mode < modes_to_check; rdo_mode ++) {
     int rdo_bitcost = kvz_luma_mode_bits(state, modes[rdo_mode], intra_preds);
-    costs[rdo_mode] = rdo_bitcost * (int)(state->global->cur_lambda_cost + 0.5);
+    costs[rdo_mode] = rdo_bitcost * (int)(state->frame->cur_lambda_cost + 0.5);

     // Perform transform split search and save mode RD cost for the best one.
     cu_info_t pred_cu;
     pred_cu.depth = depth;
     pred_cu.type = CU_INTRA;
     pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N);
-    pred_cu.intra[0].mode = modes[rdo_mode];
-    pred_cu.intra[1].mode = modes[rdo_mode];
-    pred_cu.intra[2].mode = modes[rdo_mode];
-    pred_cu.intra[3].mode = modes[rdo_mode];
-    pred_cu.intra[0].mode_chroma = modes[rdo_mode];
+    pred_cu.intra.mode = modes[rdo_mode];
+    pred_cu.intra.mode_chroma = modes[rdo_mode];
     FILL(pred_cu.cbf, 0);

     // Reset transform split data in lcu.cu for this area.
@@ -610,11 +626,8 @@
     pred_cu.depth = depth;
     pred_cu.type = CU_INTRA;
     pred_cu.part_size = ((depth == MAX_PU_DEPTH) ? SIZE_NxN : SIZE_2Nx2N);
-    pred_cu.intra[0].mode = modes[0];
-    pred_cu.intra[1].mode = modes[0];
-    pred_cu.intra[2].mode = modes[0];
-    pred_cu.intra[3].mode = modes[0];
-    pred_cu.intra[0].mode_chroma = modes[0];
+    pred_cu.intra.mode = modes[0];
+    pred_cu.intra.mode_chroma = modes[0];
     FILL(pred_cu.cbf, 0);
     search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[0], MAX_INT, &pred_cu, lcu);
   }
@@ -688,7 +701,7 @@
       chroma.cost = kvz_cu_rd_cost_chroma(state, lcu_px.x, lcu_px.y, depth, tr_cu, lcu);

       double mode_bits = kvz_chroma_mode_bits(state, chroma.mode, intra_mode);
-      chroma.cost += mode_bits * state->global->cur_lambda_cost;
+      chroma.cost += mode_bits * state->frame->cur_lambda_cost;

       if (chroma.cost < best_chroma.cost) {
         best_chroma = chroma;
@@ -707,10 +720,9 @@
                                       const int depth, lcu_t *lcu)
 {
   const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
-  const vector2d_t lcu_cu = { lcu_px.x >> 3, lcu_px.y >> 3 };
-  cu_info_t *cur_cu = LCU_GET_CU(lcu, lcu_cu.x, lcu_cu.y);
-  int8_t intra_mode = cur_cu->intra[PU_INDEX(x_px >> 2, y_px >> 2)].mode;
+  cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);
+  int8_t intra_mode = cur_pu->intra.mode;

   double costs[5];
   int8_t modes[5] = { 0, 26, 10, 1, 34 };
@@ -766,16 +778,16 @@
  * Update lcu to have best modes at this depth.
  * \return Cost of best mode.
  */
-double kvz_search_cu_intra(encoder_state_t * const state,
-                           const int x_px, const int y_px,
-                           const int depth, lcu_t *lcu)
+void kvz_search_cu_intra(encoder_state_t * const state,
+                         const int x_px, const int y_px,
+                         const int depth, lcu_t *lcu,
+                         int8_t *mode_out, double *cost_out)
 {
   const vector2d_t lcu_px = { SUB_SCU(x_px), SUB_SCU(y_px) };
-  const vector2d_t lcu_cu = { lcu_px.x >> 3, lcu_px.y >> 3 };
-  const int8_t cu_width = (LCU_WIDTH >> (depth));
+  const int8_t cu_width = LCU_WIDTH >> depth;
   const int_fast8_t log2_width = LOG2_LCU_WIDTH - depth;

-  cu_info_t *cur_cu = LCU_GET_CU(lcu, lcu_cu.x, lcu_cu.y);
+  cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y);

   kvz_intra_references refs;

@@ -786,11 +798,11 @@

   // Select left and top CUs if they are available.
   // Top CU is not available across LCU boundary.
-  if ((x_px >> 3) > 0) {
-    left_cu = LCU_GET_CU(lcu, lcu_cu.x - 1, lcu_cu.y);
+  if (x_px >= SCU_WIDTH) {
+    left_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x - 1, lcu_px.y);
   }
-  if ((y_px >> 3) > 0 && lcu_cu.y != 0) {
-    above_cu = LCU_GET_CU(lcu, lcu_cu.x, lcu_cu.y - 1);
+  if (y_px >= SCU_WIDTH && lcu_px.y > 0) {
+    above_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y - 1);
   }
   kvz_intra_get_dir_luma_predictor(x_px, y_px, candidate_modes, cur_cu, left_cu, above_cu);
@@ -805,7 +817,6 @@

   // Find best intra mode for 2Nx2N.
   kvz_pixel *ref_pixels = &lcu->ref.y[lcu_px.x + lcu_px.y * LCU_WIDTH];
-  unsigned pu_index = PU_INDEX(x_px >> 2, y_px >> 2);

   int8_t number_of_modes;
   bool skip_rough_search = (depth == 0 || state->encoder_control->rdo >= 3);
@@ -849,7 +860,7 @@
   }

   uint8_t best_mode_i = select_best_mode_index(modes, costs, number_of_modes);

-  cur_cu->intra[pu_index].mode = modes[best_mode_i];
-  return costs[best_mode_i];
+  *mode_out = modes[best_mode_i];
+  *cost_out = costs[best_mode_i];
 }
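The transform-skip heuristic touched in this diff keeps its shape across the change: a SAD-based cost, scaled by `TRSKIP_RATIO` and charged lambda-weighted flag bits, replaces the SATD cost only when it is lower, and with the new 4:0:0 support the two chroma flag bits are skipped when there is no chroma. A hedged standalone sketch of that decision (the `TRSKIP_RATIO` value and the function name `trskip_mode_cost` are illustrative, not taken from the source):

```c
#include <stdbool.h>

/* Illustrative ratio; kvazaar defines its own TRSKIP_RATIO elsewhere. */
#define TRSKIP_RATIO 1.7

/* Pick the cheaper of the SATD cost and the transform-skip SAD cost.
 * trskip_bits covers the luma flag, plus the two chroma flags only
 * when the stream actually has chroma (i.e. not 4:0:0). */
double trskip_mode_cost(double satd_cost,
                        double sad,
                        double luma_flag_bits,
                        double chroma_flag_bits,
                        bool has_chroma,
                        double lambda_sqrt)
{
  double trskip_bits = luma_flag_bits;
  if (has_chroma) {
    trskip_bits += 2.0 * chroma_flag_bits;  /* U and V flags */
  }
  double sad_cost = TRSKIP_RATIO * sad + lambda_sqrt * trskip_bits;
  return sad_cost < satd_cost ? sad_cost : satd_cost;
}
```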
View file
kvazaar-0.8.3.tar.gz/src/search_intra.h -> kvazaar-1.0.0.tar.gz/src/search_intra.h
Changed
@@ -26,9 +26,9 @@
  * Intra prediction parameter search.
  */

-#include "global.h"
-
+#include "cu.h"
 #include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep


 double kvz_luma_mode_bits(const encoder_state_t *state,
@@ -40,10 +40,10 @@
 int8_t kvz_search_cu_intra_chroma(encoder_state_t * const state,
                                   const int x_px, const int y_px,
                                   const int depth, lcu_t *lcu);
-
-double kvz_search_cu_intra(encoder_state_t * const state,
-                           const int x_px, const int y_px,
-                           const int depth, lcu_t *lcu);
+void kvz_search_cu_intra(encoder_state_t * const state,
+                         const int x_px, const int y_px,
+                         const int depth, lcu_t *lcu,
+                         int8_t *mode_out, double *cost_out);

 #endif // SEARCH_INTRA_H_
kvazaar-0.8.3.tar.gz/src/strategies/altivec/picture-altivec.c -> kvazaar-1.0.0.tar.gz/src/strategies/altivec/picture-altivec.c
Changed
@@ -18,17 +18,17 @@
 * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
 ****************************************************************************/

-#include "picture-altivec.h"
-#include "strategyselector.h"
-#include "image.h"
-#include <assert.h>
+#include "strategies/altivec/picture-altivec.h"

 #if COMPILE_POWERPC_ALTIVEC
 #include <altivec.h>
 #include <stdlib.h>

+#include "kvazaar.h"
+#include "strategyselector.h"
+

-static unsigned reg_sad_altivec(const pixel * const data1, const pixel * const data2,
+static unsigned reg_sad_altivec(const kvz_pixel * const data1, const kvz_pixel * const data2,
                                 const int width, const int height, const unsigned stride1, const unsigned stride2)
 {
   vector unsigned int vsad = {0,0,0,0}, vzero = {0,0,0,0};
kvazaar-0.8.3.tar.gz/src/strategies/altivec/picture-altivec.h -> kvazaar-1.0.0.tar.gz/src/strategies/altivec/picture-altivec.h
Changed
@@ -26,7 +26,8 @@
  * Optimizations for Altivec.
  */

-#include "global.h"
+#include "global.h" // IWYU pragma: keep
+

 int kvz_strategy_register_picture_altivec(void* opaque, uint8_t bitdepth);
kvazaar-0.8.3.tar.gz/src/strategies/avx2/dct-avx2.c -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/dct-avx2.c
Changed
@@ -22,15 +22,14 @@
  * \file
  */

-#include <stdlib.h>
-
-#include "dct-avx2.h"
-#include "strategyselector.h"
-#include "tables.h"
+#include "strategies/avx2/dct-avx2.h"

 #if COMPILE_INTEL_AVX2
 #include <immintrin.h>

+#include "strategyselector.h"
+#include "tables.h"
+

 extern const int16_t kvz_g_dst_4[4][4];
 extern const int16_t kvz_g_dct_4[4][4];
 extern const int16_t kvz_g_dct_8[8][8];
kvazaar-0.8.3.tar.gz/src/strategies/avx2/dct-avx2.h -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/dct-avx2.h
Changed
@@ -26,7 +26,7 @@
  * Optimizations for AVX2.
  */

-#include "global.h"
+#include "global.h" // IWYU pragma: keep

 int kvz_strategy_register_dct_avx2(void* opaque, uint8_t bitdepth);
kvazaar-0.8.3.tar.gz/src/strategies/avx2/intra-avx2.c -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/intra-avx2.c
Changed
@@ -18,14 +18,15 @@
 * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
 ****************************************************************************/

+#include "strategies/avx2/intra-avx2.h"
+
+#if COMPILE_INTEL_AVX2 && defined X86_64
+#include <immintrin.h>
 #include <stdlib.h>

-#include "intra-avx2.h"
+#include "kvazaar.h"
 #include "strategyselector.h"

-#if COMPILE_INTEL_AVX2 && defined X86_64
-#include <immintrin.h>
-#include "strategies/strategies-common.h"

 /**
 * \brief Linear interpolation for 4 pixels. Returns 4 filtered pixels in lowest 32-bits of the register.
@@ -250,14 +251,14 @@
      int rx = 0;
      int ry = y;

-      row0 = _mm256_permute4x64_epi64(row0, KVZ_PERMUTE(0,2,1,3));
-      row1 = _mm256_permute4x64_epi64(row1, KVZ_PERMUTE(1,3,0,2));
-      row2 = _mm256_permute4x64_epi64(row2, KVZ_PERMUTE(0,2,1,3));
-      row3 = _mm256_permute4x64_epi64(row3, KVZ_PERMUTE(1,3,0,2));
-      row4 = _mm256_permute4x64_epi64(row4, KVZ_PERMUTE(0,2,1,3));
-      row5 = _mm256_permute4x64_epi64(row5, KVZ_PERMUTE(1,3,0,2));
-      row6 = _mm256_permute4x64_epi64(row6, KVZ_PERMUTE(0,2,1,3));
-      row7 = _mm256_permute4x64_epi64(row7, KVZ_PERMUTE(1,3,0,2));
+      row0 = _mm256_permute4x64_epi64(row0, _MM_SHUFFLE(3,1,2,0));
+      row1 = _mm256_permute4x64_epi64(row1, _MM_SHUFFLE(2,0,3,1));
+      row2 = _mm256_permute4x64_epi64(row2, _MM_SHUFFLE(3,1,2,0));
+      row3 = _mm256_permute4x64_epi64(row3, _MM_SHUFFLE(2,0,3,1));
+      row4 = _mm256_permute4x64_epi64(row4, _MM_SHUFFLE(3,1,2,0));
+      row5 = _mm256_permute4x64_epi64(row5, _MM_SHUFFLE(2,0,3,1));
+      row6 = _mm256_permute4x64_epi64(row6, _MM_SHUFFLE(3,1,2,0));
+      row7 = _mm256_permute4x64_epi64(row7, _MM_SHUFFLE(2,0,3,1));

      _mm_storeu_si128((__m128i*)(dst + (ry + 0) * 16 + rx), _mm256_castsi256_si128(row0));
      _mm_storeu_si128((__m128i*)(dst + (ry + 1) * 16 + rx), _mm256_castsi256_si128(row1));
@@ -341,14 +342,14 @@
    }
    else {
      //Move all filtered pixels to the lower lane to reduce memory accesses
-      row0 = _mm256_permute4x64_epi64(row0, KVZ_PERMUTE(0,2,1,3));
-      row1 = _mm256_permute4x64_epi64(row1, KVZ_PERMUTE(1,3,0,2));
-      row2 = _mm256_permute4x64_epi64(row2, KVZ_PERMUTE(0,2,1,3));
-      row3 = _mm256_permute4x64_epi64(row3, KVZ_PERMUTE(1,3,0,2));
-      row4 = _mm256_permute4x64_epi64(row4, KVZ_PERMUTE(0,2,1,3));
-      row5 = _mm256_permute4x64_epi64(row5, KVZ_PERMUTE(1,3,0,2));
-      row6 = _mm256_permute4x64_epi64(row6, KVZ_PERMUTE(0,2,1,3));
-      row7 = _mm256_permute4x64_epi64(row7, KVZ_PERMUTE(1,3,0,2));
+      row0 = _mm256_permute4x64_epi64(row0, _MM_SHUFFLE(3,1,2,0));
+      row1 = _mm256_permute4x64_epi64(row1, _MM_SHUFFLE(2,0,3,1));
+      row2 = _mm256_permute4x64_epi64(row2, _MM_SHUFFLE(3,1,2,0));
+      row3 = _mm256_permute4x64_epi64(row3, _MM_SHUFFLE(2,0,3,1));
+      row4 = _mm256_permute4x64_epi64(row4, _MM_SHUFFLE(3,1,2,0));
+      row5 = _mm256_permute4x64_epi64(row5, _MM_SHUFFLE(2,0,3,1));
+      row6 = _mm256_permute4x64_epi64(row6, _MM_SHUFFLE(3,1,2,0));
+      row7 = _mm256_permute4x64_epi64(row7, _MM_SHUFFLE(2,0,3,1));

      _mm_storeu_si128((__m128i*)(dst + (y + 0) * width + x), _mm256_castsi256_si128(row0));
      _mm_storeu_si128((__m128i*)(dst + (y + 1) * width + x), _mm256_castsi256_si128(row1));
kvazaar-0.8.3.tar.gz/src/strategies/avx2/intra-avx2.h -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/intra-avx2.h
Changed
@@ -26,11 +26,8 @@
  * Optimizations for AVX2.
  */

-#include "global.h"
+#include "global.h" // IWYU pragma: keep

-#include <stdint.h>
-
-#include "encoderstate.h"

 int kvz_strategy_register_intra_avx2(void* opaque, uint8_t bitdepth);
kvazaar-0.8.3.tar.gz/src/strategies/avx2/ipol-avx2.c -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/ipol-avx2.c
Changed
@@ -22,17 +22,19 @@ * \file */ -#include "ipol-avx2.h" -#include "strategyselector.h" +#include "strategies/avx2/ipol-avx2.h" #if COMPILE_INTEL_AVX2 -#include <stdlib.h> - #include <immintrin.h> - +#include <stdio.h> +#include <string.h> #include "encoder.h" +#include "kvazaar.h" #include "strategies/generic/picture-generic.h" +#include "strategies/strategies-ipol.h" +#include "strategyselector.h" +#include "strategies/generic/ipol-generic.h" #define FILTER_OFFSET 3 @@ -62,6 +64,235 @@ _mm_storeu_si128(dst, a); } +static __m128i kvz_eight_tap_filter_flip_x8_16bit_avx2(__m128i *row, int8_t *filter, int32_t offset23, int32_t shift23) +{ + __m128i temp[8]; + __m128i temp_lo; + __m128i temp_hi; + __m128i fir = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)filter)); + + temp[0] = _mm_madd_epi16(row[0], fir); + temp[1] = _mm_madd_epi16(row[1], fir); + temp_lo = _mm_unpacklo_epi32(temp[0], temp[1]); + temp_hi = _mm_unpackhi_epi32(temp[0], temp[1]); + temp[0] = _mm_add_epi32(temp_lo, temp_hi); + + temp[2] = _mm_madd_epi16(row[2], fir); + temp[3] = _mm_madd_epi16(row[3], fir); + temp_lo = _mm_unpacklo_epi32(temp[2], temp[3]); + temp_hi = _mm_unpackhi_epi32(temp[2], temp[3]); + temp[2] = _mm_add_epi32(temp_lo, temp_hi); + + temp[4] = _mm_madd_epi16(row[4], fir); + temp[5] = _mm_madd_epi16(row[5], fir); + temp_lo = _mm_unpacklo_epi32(temp[4], temp[5]); + temp_hi = _mm_unpackhi_epi32(temp[4], temp[5]); + temp[4] = _mm_add_epi32(temp_lo, temp_hi); + + temp[6] = _mm_madd_epi16(row[6], fir); + temp[7] = _mm_madd_epi16(row[7], fir); + temp_lo = _mm_unpacklo_epi32(temp[6], temp[7]); + temp_hi = _mm_unpackhi_epi32(temp[6], temp[7]); + temp[6] = _mm_add_epi32(temp_lo, temp_hi); + + temp_lo = _mm_unpacklo_epi32(temp[0], temp[2]); + temp_hi = _mm_unpackhi_epi32(temp[0], temp[2]); + temp[0] = _mm_add_epi32(temp_lo, temp_hi); + temp[0] = _mm_shuffle_epi32(temp[0], _MM_SHUFFLE(3, 1, 2, 0)); + + temp_lo = _mm_unpacklo_epi32(temp[4], temp[6]); + temp_hi = _mm_unpackhi_epi32(temp[4], temp[6]); 
+ temp[4] = _mm_add_epi32(temp_lo, temp_hi); + temp[4] = _mm_shuffle_epi32(temp[4], _MM_SHUFFLE(3, 1, 2, 0)); + + __m128i add = _mm_set1_epi32(offset23); + temp[0] = _mm_add_epi32(temp[0], add); + temp[4] = _mm_add_epi32(temp[4], add); + temp[0] = _mm_srai_epi32(temp[0], shift23); + temp[4] = _mm_srai_epi32(temp[4], shift23); + + temp[0] = _mm_packus_epi32(temp[0], temp[4]); + temp[0] = _mm_packus_epi16(temp[0], temp[0]); + + return temp[0]; +} + +static __m256i kvz_eight_tap_filter_flip_x8_16bit_dual_avx2(__m256i *row, int8_t *filter[2], int32_t offset23, int32_t shift23) +{ + __m256i temp[8]; + __m256i temp_lo; + __m256i temp_hi; + __m256i fir = _mm256_cvtepi8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)filter[0]), _mm_loadl_epi64((__m128i*)filter[1]))); + + temp[0] = _mm256_madd_epi16(row[0], fir); + temp[1] = _mm256_madd_epi16(row[1], fir); + temp_lo = _mm256_unpacklo_epi32(temp[0], temp[1]); + temp_hi = _mm256_unpackhi_epi32(temp[0], temp[1]); + temp[0] = _mm256_add_epi32(temp_lo, temp_hi); + + temp[2] = _mm256_madd_epi16(row[2], fir); + temp[3] = _mm256_madd_epi16(row[3], fir); + temp_lo = _mm256_unpacklo_epi32(temp[2], temp[3]); + temp_hi = _mm256_unpackhi_epi32(temp[2], temp[3]); + temp[2] = _mm256_add_epi32(temp_lo, temp_hi); + + temp[4] = _mm256_madd_epi16(row[4], fir); + temp[5] = _mm256_madd_epi16(row[5], fir); + temp_lo = _mm256_unpacklo_epi32(temp[4], temp[5]); + temp_hi = _mm256_unpackhi_epi32(temp[4], temp[5]); + temp[4] = _mm256_add_epi32(temp_lo, temp_hi); + + temp[6] = _mm256_madd_epi16(row[6], fir); + temp[7] = _mm256_madd_epi16(row[7], fir); + temp_lo = _mm256_unpacklo_epi32(temp[6], temp[7]); + temp_hi = _mm256_unpackhi_epi32(temp[6], temp[7]); + temp[6] = _mm256_add_epi32(temp_lo, temp_hi); + + temp_lo = _mm256_unpacklo_epi32(temp[0], temp[2]); + temp_hi = _mm256_unpackhi_epi32(temp[0], temp[2]); + temp[0] = _mm256_add_epi32(temp_lo, temp_hi); + temp[0] = _mm256_shuffle_epi32(temp[0], _MM_SHUFFLE(3, 1, 2, 0)); + + temp_lo = 
_mm256_unpacklo_epi32(temp[4], temp[6]); + temp_hi = _mm256_unpackhi_epi32(temp[4], temp[6]); + temp[4] = _mm256_add_epi32(temp_lo, temp_hi); + temp[4] = _mm256_shuffle_epi32(temp[4], _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i add = _mm256_set1_epi32(offset23); + temp[0] = _mm256_add_epi32(temp[0], add); + temp[4] = _mm256_add_epi32(temp[4], add); + temp[0] = _mm256_srai_epi32(temp[0], shift23); + temp[4] = _mm256_srai_epi32(temp[4], shift23); + + temp[0] = _mm256_packus_epi32(temp[0], temp[4]); + temp[0] = _mm256_packus_epi16(temp[0], temp[0]); + + return temp[0]; +} + +/* +static __m128i kvz_eight_tap_filter_flip_x8_avx2(__m128i *row, int8_t *filter, int32_t shift1) +{ + __m128i temp[4]; + __m128i fir = _mm_broadcastq_epi64(_mm_loadl_epi64((__m128i*)filter)); + + temp[0] = _mm_unpacklo_epi64(row[0], row[1]); + temp[0] = _mm_maddubs_epi16(temp[0], fir); + + temp[1] = _mm_unpacklo_epi64(row[2], row[3]); + temp[1] = _mm_maddubs_epi16(temp[1], fir); + + temp[0] = _mm_hadd_epi16(temp[0], temp[1]); + + temp[2] = _mm_unpacklo_epi64(row[4], row[5]); + temp[2] = _mm_maddubs_epi16(temp[2], fir); + + temp[3] = _mm_unpacklo_epi64(row[6], row[7]); + temp[3] = _mm_maddubs_epi16(temp[3], fir); + + temp[2] = _mm_hadd_epi16(temp[2], temp[3]); + + temp[0] = _mm_hadd_epi16(temp[0], temp[2]); + + temp[0] = _mm_srai_epi16(temp[0], shift1); + + return temp[0]; +} +*/ + +static __m256i kvz_eight_tap_filter_flip_x8_dual_avx2(__m256i *row, int8_t *filter[2], int32_t shift1) +{ + __m256i temp[4]; + __m256i fir = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)filter[0])), _mm_loadl_epi64((__m128i*)filter[1]), 1); + fir = _mm256_shuffle_epi32(fir, _MM_SHUFFLE(1, 0, 1, 0)); + + temp[0] = _mm256_unpacklo_epi64(row[0], row[1]); + temp[0] = _mm256_maddubs_epi16(temp[0], fir); + + temp[1] = _mm256_unpacklo_epi64(row[2], row[3]); + temp[1] = _mm256_maddubs_epi16(temp[1], fir); + + temp[0] = _mm256_hadd_epi16(temp[0], temp[1]); + + temp[2] = _mm256_unpacklo_epi64(row[4], 
row[5]); + temp[2] = _mm256_maddubs_epi16(temp[2], fir); + + temp[3] = _mm256_unpacklo_epi64(row[6], row[7]); + temp[3] = _mm256_maddubs_epi16(temp[3], fir); + + temp[2] = _mm256_hadd_epi16(temp[2], temp[3]); + + temp[0] = _mm256_hadd_epi16(temp[0], temp[2]); + + temp[0] = _mm256_srai_epi16(temp[0], shift1); + + return temp[0]; +} + +/* +static INLINE void kvz_filter_flip_shift_x8_avx2(kvz_pixel *src, int16_t src_stride, int8_t *filter, int32_t shift1, int16_t *dst){ + + __m128i rows[8]; + rows[0] = _mm_loadl_epi64((__m128i*)(src + 0 * src_stride)); + rows[1] = _mm_loadl_epi64((__m128i*)(src + 1 * src_stride)); + rows[2] = _mm_loadl_epi64((__m128i*)(src + 2 * src_stride)); + rows[3] = _mm_loadl_epi64((__m128i*)(src + 3 * src_stride)); + rows[4] = _mm_loadl_epi64((__m128i*)(src + 4 * src_stride)); + rows[5] = _mm_loadl_epi64((__m128i*)(src + 5 * src_stride)); + rows[6] = _mm_loadl_epi64((__m128i*)(src + 6 * src_stride)); + rows[7] = _mm_loadl_epi64((__m128i*)(src + 7 * src_stride)); + __m128i out = kvz_eight_tap_filter_flip_x8_avx2(rows, filter, shift1); + _mm_storeu_si128((__m128i*)dst, out); +} +*/ + +static INLINE void kvz_filter_flip_shift_x8_dual_avx2(kvz_pixel *src, int16_t src_stride, int8_t *firs[2], int32_t shift1, int16_t *dst[2]){ + + __m256i rows[8]; + rows[0] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 0 * src_stride))); + rows[1] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 1 * src_stride))); + rows[2] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 2 * src_stride))); + rows[3] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 3 * src_stride))); + rows[4] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 4 * src_stride))); + rows[5] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 5 * src_stride))); + rows[6] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 6 * src_stride))); + rows[7] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 
7 * src_stride))); + __m256i out = kvz_eight_tap_filter_flip_x8_dual_avx2(rows, firs, shift1); + _mm_storeu_si128((__m128i*)dst[0], _mm256_castsi256_si128(out)); + _mm_storeu_si128((__m128i*)dst[1], _mm256_extracti128_si256(out, 1)); +} + +static INLINE void kvz_filter_flip_round_clip_x8_16bit_avx2(int16_t *flipped_filtered, int16_t src_stride, int8_t *filter, int32_t offset23, int32_t shift23, kvz_pixel *dst){ + + __m128i rows[8]; + rows[0] = _mm_loadu_si128((__m128i*)(flipped_filtered + 0 * src_stride)); + rows[1] = _mm_loadu_si128((__m128i*)(flipped_filtered + 1 * src_stride)); + rows[2] = _mm_loadu_si128((__m128i*)(flipped_filtered + 2 * src_stride)); + rows[3] = _mm_loadu_si128((__m128i*)(flipped_filtered + 3 * src_stride)); + rows[4] = _mm_loadu_si128((__m128i*)(flipped_filtered + 4 * src_stride)); + rows[5] = _mm_loadu_si128((__m128i*)(flipped_filtered + 5 * src_stride)); + rows[6] = _mm_loadu_si128((__m128i*)(flipped_filtered + 6 * src_stride)); + rows[7] = _mm_loadu_si128((__m128i*)(flipped_filtered + 7 * src_stride)); + _mm_storel_epi64((__m128i*)dst, kvz_eight_tap_filter_flip_x8_16bit_avx2(rows, filter, offset23, shift23) ); +} + +static INLINE void kvz_filter_flip_round_clip_x8_16bit_dual_avx2(int16_t *flipped_filtered[2], int16_t src_stride, int8_t *firs[2], int32_t offset23, int32_t shift23, kvz_pixel *dst[2]){ + + __m256i rows[8]; + rows[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 0 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 0 * src_stride)), 1); + rows[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 1 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 1 * src_stride)), 1); + rows[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 2 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 2 * src_stride)), 1); + rows[3] = 
_mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 3 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 3 * src_stride)), 1); + rows[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 4 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 4 * src_stride)), 1); + rows[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 5 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 5 * src_stride)), 1); + rows[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 6 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 6 * src_stride)), 1); + rows[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 7 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 7 * src_stride)), 1); + __m256i out = kvz_eight_tap_filter_flip_x8_16bit_dual_avx2(rows, firs, offset23, shift23); + _mm_storel_epi64((__m128i*)dst[0], _mm256_castsi256_si128(out)); + _mm_storel_epi64((__m128i*)dst[1], _mm256_extracti128_si256(out, 1)); + +} + __m128i kvz_eight_tap_filter_x4_and_flip_16bit(__m128i *data0, __m128i *data1, __m128i *data2, __m128i *data3, __m128i *filter) { __m128i a, b, c, d; @@ -159,30 +390,32 @@ int16_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data) { - union { - __m128i vector; - int16_t array[8]; - } sample; - __m128i packed_data = _mm_loadu_si128((__m128i*)data); - __m128i packed_filter = _mm_loadu_si128((__m128i*)filter); + __m128i sample; + + __m128i packed_data = _mm_loadl_epi64((__m128i*)data); + __m128i packed_filter = _mm_loadl_epi64((__m128i*)filter); - sample.vector = _mm_maddubs_epi16(packed_data, packed_filter); - sample.vector = _mm_hadd_epi16(sample.vector, sample.vector); - sample.vector = _mm_hadd_epi16(sample.vector, sample.vector); + sample = 
_mm_maddubs_epi16(packed_data, packed_filter); + sample = _mm_add_epi16(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(0, 1, 0, 1))); + sample = _mm_add_epi16(sample, _mm_shufflelo_epi16(sample, _MM_SHUFFLE(0, 1, 0, 1))); - return sample.array[0]; + return (int16_t)_mm_cvtsi128_si32(sample); } + int32_t kvz_eight_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data) { - int32_t temp = 0; - for (int i = 0; i < 8; ++i) - { - temp += filter[i] * data[i]; - } + __m128i sample; - return temp; + __m128i packed_data = _mm_loadu_si128((__m128i*)data); + __m128i packed_filter = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)filter)); + + sample = _mm_madd_epi16(packed_data, packed_filter); + sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(0, 1, 3, 2))); + sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(0, 1, 0, 1))); + + return _mm_extract_epi32(sample, 0); } int16_t kvz_eight_tap_filter_ver_avx2(int8_t *filter, kvz_pixel *data, int16_t stride) @@ -209,24 +442,24 @@ int16_t kvz_four_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data) { - int16_t temp = 0; - for (int i = 0; i < 4; ++i) - { - temp += filter[i] * data[i]; - } + __m128i packed_data = _mm_cvtsi32_si128(*(int32_t*)data); + __m128i packed_filter = _mm_cvtsi32_si128(*(int32_t*)filter); - return temp; + __m128i temp = _mm_maddubs_epi16(packed_data, packed_filter); + temp = _mm_hadd_epi16(temp, temp); + + return _mm_extract_epi16(temp, 0); } int32_t kvz_four_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data) { - int32_t temp = 0; - for (int i = 0; i < 4; ++i) - { - temp += filter[i] * data[i]; - } + __m128i packed_data = _mm_loadl_epi64((__m128i*)data); + __m128i packed_filter = _mm_cvtepi8_epi16(_mm_cvtsi32_si128(*(int32_t*)filter) ); - return temp; + __m128i temp = _mm_madd_epi16(packed_data, packed_filter); + temp = _mm_hadd_epi32(temp, temp); + + return _mm_cvtsi128_si32(temp); } int16_t kvz_four_tap_filter_ver_avx2(int8_t *filter, kvz_pixel *data, int16_t stride) 
@@ -251,6 +484,185 @@ return temp; } +void kvz_eight_tap_filter_x4_hor_avx2(int8_t *filter, kvz_pixel *data, int shift, int16_t* dst) +{ + __m256i packed_data = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)data)), _mm_loadl_epi64((__m128i*)(data + 2)), 1); + __m256i packed_filter = _mm256_broadcastq_epi64(_mm_loadl_epi64((__m128i*)filter)); + __m256i idx_lookup = _mm256_broadcastsi128_si256(_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8)); + + __m256i temp = _mm256_shuffle_epi8(packed_data, idx_lookup); + + temp = _mm256_maddubs_epi16(temp, packed_filter); + __m128i temp_128 = _mm_hadd_epi16(_mm256_extracti128_si256(temp, 0), _mm256_extracti128_si256(temp, 1)); + temp_128 = _mm_hadd_epi16(temp_128, temp_128); + temp_128 = _mm_srai_epi16(temp_128, shift); + + _mm_storel_epi64((__m128i*)dst, temp_128); +} + +void kvz_four_tap_filter_x4_hor_avx2(int8_t *filter, kvz_pixel *data, int shift, int16_t* dst) +{ + __m128i packed_data = _mm_loadl_epi64((__m128i*)data); + __m128i packed_filter = _mm_set1_epi32(*(int32_t*)filter); + __m128i idx_lookup = _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); + + __m128i temp = _mm_shuffle_epi8(packed_data, idx_lookup); + + temp = _mm_maddubs_epi16(temp, packed_filter); + temp = _mm_hadd_epi16(temp, temp); + temp = _mm_srai_epi16(temp, shift); + + _mm_storel_epi64((__m128i*)dst, temp); +} + +void kvz_eight_tap_filter_x8_hor_avx2(int8_t *filter, kvz_pixel *data, int shift, int16_t* dst) +{ + __m256i packed_data = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)data)), _mm_loadu_si128((__m128i*)(data + 4)), 1); + __m256i packed_filter = _mm256_broadcastq_epi64(_mm_loadl_epi64((__m128i*)filter)); + __m256i idx_lookup0 = _mm256_broadcastsi128_si256(_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8)); + __m256i idx_lookup1 = _mm256_broadcastsi128_si256(_mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10)); + + __m256i temp0 = 
_mm256_shuffle_epi8(packed_data, idx_lookup0); + __m256i temp1 = _mm256_shuffle_epi8(packed_data, idx_lookup1); + + temp0 = _mm256_maddubs_epi16(temp0, packed_filter); + temp1 = _mm256_maddubs_epi16(temp1, packed_filter); + temp0 = _mm256_hadd_epi16(temp0, temp1); + temp0 = _mm256_hadd_epi16(temp0, temp0); + + temp0 = _mm256_srai_epi16(temp0, shift); + + temp0 = _mm256_permute4x64_epi64(temp0, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm_storeu_si128((__m128i*)dst, _mm256_castsi256_si128(temp0)); +} + +void kvz_four_tap_filter_x8_hor_avx2(int8_t *filter, kvz_pixel *data, int shift, int16_t* dst) +{ + __m256i packed_data = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)data)), _mm_loadl_epi64((__m128i*)(data + 4)), 1); + __m256i packed_filter = _mm256_set1_epi32(*(int32_t*)filter); + __m256i idx_lookup = _mm256_broadcastsi128_si256(_mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6)); + + __m256i temp = _mm256_shuffle_epi8(packed_data, idx_lookup); + + temp = _mm256_maddubs_epi16(temp, packed_filter); + temp = _mm256_hadd_epi16(temp, temp); + temp = _mm256_srai_epi16(temp, shift); + + _mm_storel_epi64((__m128i*)dst, _mm256_castsi256_si128(temp)); + _mm_storel_epi64((__m128i*)(dst + 4), _mm256_extracti128_si256(temp, 1)); +} + +int32_t kvz_eight_tap_filter_x4_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride, int offset, int shift2, int shift3) +{ + + __m128i v_filter = _mm_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[0]))); + __m128i v_data0 = _mm_loadl_epi64((__m128i*)(data + stride * 0)); + __m128i v_data1 = _mm_loadl_epi64((__m128i*)(data + stride * 1)); + __m128i v_data = _mm_unpacklo_epi16(v_data0, v_data1); + __m128i temp = _mm_madd_epi16(v_filter, v_data); + + v_filter = _mm_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[2]))); + __m128i v_data2 = _mm_loadl_epi64((__m128i*)(data + stride * 2)); + __m128i v_data3 = _mm_loadl_epi64((__m128i*)(data + stride * 3)); + v_data = _mm_unpacklo_epi16(v_data2, v_data3); + temp = 
_mm_add_epi32(temp, _mm_madd_epi16(v_filter, v_data) ); + + temp = _mm_add_epi32(temp, _mm_set1_epi32(offset)); + temp = _mm_srai_epi32(temp, shift2 + shift3); + + temp = _mm_packus_epi32(temp, temp); + temp = _mm_packus_epi16(temp, temp); + + return _mm_cvtsi128_si32(temp); +} + +int32_t kvz_four_tap_filter_x4_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride, int offset, int shift2, int shift3) +{ + + __m128i v_filter = _mm_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[0]))); + __m128i v_data0 = _mm_loadl_epi64((__m128i*)(data + stride * 0)); + __m128i v_data1 = _mm_loadl_epi64((__m128i*)(data + stride * 1)); + __m128i v_data = _mm_unpacklo_epi16(v_data0, v_data1); + __m128i temp = _mm_madd_epi16(v_filter, v_data); + + v_filter = _mm_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[2]))); + __m128i v_data2 = _mm_loadl_epi64((__m128i*)(data + stride * 2)); + __m128i v_data3 = _mm_loadl_epi64((__m128i*)(data + stride * 3)); + v_data = _mm_unpacklo_epi16(v_data2, v_data3); + temp = _mm_add_epi32(temp, _mm_madd_epi16(v_filter, v_data) ); + + temp = _mm_add_epi32(temp, _mm_set1_epi32(offset)); + temp = _mm_srai_epi32(temp, shift2 + shift3); + + temp = _mm_packus_epi32(temp, temp); + temp = _mm_packus_epi16(temp, temp); + + return _mm_cvtsi128_si32(temp); +} + +void kvz_eight_tap_filter_x8_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride, int offset, int shift2, int shift3, kvz_pixel* dst) +{ + + __m256i v_filter = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[0]))); + __m256i v_data0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 0))); + __m256i v_data1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 1))); + __m256i v_data = _mm256_or_si256(v_data0, _mm256_slli_epi32(v_data1, 16)); + __m256i temp = _mm256_madd_epi16(v_filter, v_data); + + v_filter = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[2]))); + __m256i v_data2 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 
2))); + __m256i v_data3 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 3))); + v_data = _mm256_or_si256(v_data2, _mm256_slli_epi32(v_data3, 16)); + temp = _mm256_add_epi32(temp, _mm256_madd_epi16(v_filter, v_data) ); + + v_filter = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[4]))); + __m256i v_data4 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 4))); + __m256i v_data5 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 5))); + v_data = _mm256_or_si256(v_data4, _mm256_slli_epi32(v_data5, 16)); + temp = _mm256_add_epi32(temp, _mm256_madd_epi16(v_filter, v_data) ); + + v_filter = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[6]))); + __m256i v_data6 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 6))); + __m256i v_data7 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 7))); + v_data = _mm256_or_si256(v_data6, _mm256_slli_epi32(v_data7, 16)); + temp = _mm256_add_epi32(temp, _mm256_madd_epi16(v_filter, v_data) ); + + temp = _mm256_add_epi32(temp, _mm256_set1_epi32(offset)); + temp = _mm256_srai_epi32(temp, shift2 + shift3); + + temp = _mm256_packus_epi32(temp, temp); + temp = _mm256_packus_epi16(temp, temp); + + *(int32_t*)dst = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp)); + *(int32_t*)(dst + 4) = _mm_cvtsi128_si32(_mm256_extracti128_si256(temp, 1)); +} + +void kvz_four_tap_filter_x8_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride, int offset, int shift2, int shift3, kvz_pixel* dst) +{ + + __m256i v_filter = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[0]))); + __m256i v_data0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 0))); + __m256i v_data1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 1))); + __m256i v_data = _mm256_or_si256(v_data0, _mm256_slli_epi32(v_data1, 16)); + __m256i temp = _mm256_madd_epi16(v_filter, v_data); + + v_filter = 
_mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[2]))); + __m256i v_data2 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 2))); + __m256i v_data3 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 3))); + v_data = _mm256_or_si256(v_data2, _mm256_slli_epi32(v_data3, 16)); + temp = _mm256_add_epi32(temp, _mm256_madd_epi16(v_filter, v_data) ); + + temp = _mm256_add_epi32(temp, _mm256_set1_epi32(offset)); + temp = _mm256_srai_epi32(temp, shift2 + shift3); + + temp = _mm256_packus_epi32(temp, temp); + temp = _mm256_packus_epi16(temp, temp); + + *(int32_t*)dst = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp)); + *(int32_t*)(dst + 4) = _mm_cvtsi128_si32(_mm256_extracti128_si256(temp, 1)); +} + void kvz_filter_inter_quarterpel_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag) { @@ -481,6 +893,476 @@ } } +void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) +{ + int x, y; + int16_t shift1 = KVZ_BIT_DEPTH - 8; + int32_t shift2 = 6; + int32_t shift3 = 14 - KVZ_BIT_DEPTH; + int32_t offset23 = 1 << (shift2 + shift3 - 1); + + int8_t *fir0 = kvz_g_luma_filter[0]; + int8_t *fir2 = kvz_g_luma_filter[2]; + + int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + + int16_t temp_stride = height + KVZ_EXT_PADDING + 1; + int16_t dst_stride = (LCU_WIDTH + 1); + + // Horizontal positions + for (x = 0; x < width + 1; ++x) { + for (y = 0; y + 8 < height + KVZ_EXT_PADDING + 1; y += 8) { + int ypos = y - FILTER_OFFSET; + int xpos = x - FILTER_OFFSET; + int8_t *firs[2] = { fir0, fir2 }; + int16_t *dsts[2] = { &flipped0[x * temp_stride + y], &flipped2[x * temp_stride + y] }; + kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], 
src_stride, &firs[0], shift1, &dsts[0]); + } + + for (; y < height + KVZ_EXT_PADDING + 1; ++y) { + int ypos = y - FILTER_OFFSET; + int xpos = x - FILTER_OFFSET; + flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir0, &src[src_stride*ypos + xpos]) >> shift1; + flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir2, &src[src_stride*ypos + xpos]) >> shift1; + } + } + + // Filter vertically and flip x and y + for (x = 0; x + 8 < width + 1; x += 8) { + for (y = 0; y < height + 1; ++y) { + int8_t *firs[2] = { fir0, fir2 }; + kvz_pixel *dsts[2] = { &filtered[HPEL_POS_HOR][y * dst_stride + x], &filtered[HPEL_POS_VER][y * dst_stride + x]}; + int16_t *srcs[2] = {flipped2 + x * temp_stride + y, flipped0 + x * temp_stride + y }; + kvz_filter_flip_round_clip_x8_16bit_dual_avx2(srcs, temp_stride, firs, offset23, shift2 + shift3, dsts); + } + } + + // The remaining pixels + for (; x < width + 1; ++x) { + for (y = 0; y < height + 1; ++y) { + filtered[HPEL_POS_HOR][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[HPEL_POS_VER][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + } + } +} + +void kvz_filter_hpel_blocks_full_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) +{ + int x, y; + int16_t shift1 = KVZ_BIT_DEPTH - 8; + int32_t shift2 = 6; + int32_t shift3 = 14 - KVZ_BIT_DEPTH; + int32_t offset23 = 1 << (shift2 + shift3 - 1); + + int8_t *fir0 = kvz_g_luma_filter[0]; + int8_t *fir2 = kvz_g_luma_filter[2]; + + int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + + int16_t temp_stride = height + KVZ_EXT_PADDING + 1; + int16_t dst_stride = (LCU_WIDTH + 1); + + // 
Horizontal positions + for (x = 0; x < width + 1; ++x) { + for (y = 0; y + 8 < height + KVZ_EXT_PADDING + 1; y += 8) { + int ypos = y - FILTER_OFFSET; + int xpos = x - FILTER_OFFSET; + int8_t *firs[2] = { fir0, fir2 }; + int16_t *dsts[2] = { &flipped0[x * temp_stride + y], &flipped2[x * temp_stride + y] }; + kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], src_stride, &firs[0], shift1, &dsts[0]); + + } + + for (; y < height + KVZ_EXT_PADDING + 1; ++y) { + int ypos = y - FILTER_OFFSET; + int xpos = x - FILTER_OFFSET; + flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir0, &src[src_stride*ypos + xpos]) >> shift1; + flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir2, &src[src_stride*ypos + xpos]) >> shift1; + } + } + + // Filter vertically and flip x and y + for (x = 0; x + 8 < width + 1; x += 8) { + for (y = 0; y < height + 1; ++y) { + int8_t *firs[2] = { fir0, fir2 }; + kvz_pixel *dsts[2] = { &filtered[HPEL_POS_HOR][y * dst_stride + x], &filtered[HPEL_POS_VER][y * dst_stride + x]}; + int16_t *srcs[2] = {flipped2 + x * temp_stride + y, flipped0 + x * temp_stride + y }; + kvz_filter_flip_round_clip_x8_16bit_dual_avx2(srcs, temp_stride, firs, offset23, shift2 + shift3, dsts); + kvz_filter_flip_round_clip_x8_16bit_avx2(flipped2 + x * temp_stride + y, temp_stride, fir2, offset23, shift2 + shift3, &filtered[HPEL_POS_DIA][y * dst_stride + x]); + + } + } + + // The remaining pixels + for (; x < width + 1; ++x) { + for (y = 0; y < height + 1; ++y) { + filtered[HPEL_POS_HOR][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[HPEL_POS_VER][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[HPEL_POS_DIA][y * dst_stride + x] = 
kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + } + } +} + +void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) +{ + int x, y; + int16_t shift1 = KVZ_BIT_DEPTH - 8; + int32_t shift2 = 6; + int32_t shift3 = 14 - KVZ_BIT_DEPTH; + int32_t offset23 = 1 << (shift2 + shift3 - 1); + + int8_t *fir0 = kvz_g_luma_filter[0]; + int8_t *fir2 = kvz_g_luma_filter[2]; + int8_t *fir1 = kvz_g_luma_filter[1]; + int8_t *fir3 = kvz_g_luma_filter[3]; + + int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t flipped1[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t flipped3[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + + int16_t temp_stride = height + KVZ_EXT_PADDING + 1; + int16_t dst_stride = (LCU_WIDTH + 1); + + // Horizontal positions + for (x = 0; x < width + 1; ++x) { + for (y = 0; y + 8 < height + KVZ_EXT_PADDING + 1; y += 8) { + int ypos = y - FILTER_OFFSET; + int xpos = x - FILTER_OFFSET; + int8_t *firs[4] = { fir0, fir2, fir1, fir3 }; + int16_t *dsts[4] = { &flipped0[x * temp_stride + y], &flipped2[x * temp_stride + y], &flipped1[x * temp_stride + y], &flipped3[x * temp_stride + y]}; + kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], src_stride, &firs[0], shift1, &dsts[0]); + kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], src_stride, &firs[2], shift1, &dsts[2]); + } + + for (y = 0; y < height + KVZ_EXT_PADDING + 1; ++y) { + int ypos = y - FILTER_OFFSET; + int xpos = x - FILTER_OFFSET; + flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir0, &src[src_stride*ypos + xpos]) >> shift1; + flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir2, &src[src_stride*ypos + xpos]) >> shift1; + flipped1[x * temp_stride + y] = 
kvz_eight_tap_filter_hor_avx2(fir1, &src[src_stride*ypos + xpos]) >> shift1; + flipped3[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir3, &src[src_stride*ypos + xpos]) >> shift1; + } + } + + // Filter vertically and flip x and y + for (x = 0; x + 8 < width + 1; x += 8) { + for (y = 0; y < height + 1; ++y) { + + // HPEL + int8_t *firs0[2] = { fir0, fir2 }; + kvz_pixel *dsts0[2] = { &filtered[0][y * dst_stride + x], &filtered[1][y * dst_stride + x]}; + int16_t *srcs0[4] = { flipped2 + x * temp_stride + y, flipped0 + x * temp_stride + y}; + kvz_filter_flip_round_clip_x8_16bit_dual_avx2(srcs0, temp_stride, firs0, offset23, shift2 + shift3, dsts0); + kvz_filter_flip_round_clip_x8_16bit_avx2(flipped2 + x * temp_stride + y, temp_stride, fir2, offset23, shift2 + shift3, &filtered[2][y * dst_stride + x]); + + // QPEL + // Horizontal + int8_t *firs1[4] = { fir0, fir0, fir2, fir2 }; + kvz_pixel *dsts1[4] = { &filtered[3][y * dst_stride + x], &filtered[4][y * dst_stride + x], + &filtered[5][y * dst_stride + x], &filtered[6][y * dst_stride + x] }; + int16_t *srcs1[4] = { flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, + flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, }; + kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs1[0], temp_stride, &firs1[0], offset23, shift2 + shift3, &dsts1[0]); + kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs1[2], temp_stride, &firs1[2], offset23, shift2 + shift3, &dsts1[2]); + + // Vertical + int8_t *firs2[4] = { fir1, fir1, fir3, fir3 }; + kvz_pixel *dsts2[4] = { &filtered[7][y * dst_stride + x], &filtered[8][y * dst_stride + x], + &filtered[9][y * dst_stride + x], &filtered[10][y * dst_stride + x] }; + int16_t *srcs2[4] = { flipped0 + x * temp_stride + y, flipped2 + x * temp_stride + y, + flipped0 + x * temp_stride + y, flipped2 + x * temp_stride + y, }; + kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs2[0], temp_stride, &firs2[0], offset23, shift2 + shift3, &dsts2[0]); + 
kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs2[2], temp_stride, &firs2[2], offset23, shift2 + shift3, &dsts2[2]); + } + } + + // The remaining pixels + for (; x < width + 1; ++x) { + for (y = 0; y < height + 1; ++y) { + + // HPEL + filtered[0][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[1][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[2][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + + // QPEL + // Horizontal + filtered[3][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[4][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[5][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[6][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + + // Vertical + filtered[7][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[8][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[9][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, &flipped0[x * temp_stride 
+ y]) + offset23) >> shift2) >> shift3); + filtered[10][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + } + } +} + +void kvz_filter_qpel_blocks_full_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) +{ + int x, y; + int16_t shift1 = KVZ_BIT_DEPTH - 8; + int32_t shift2 = 6; + int32_t shift3 = 14 - KVZ_BIT_DEPTH; + int32_t offset23 = 1 << (shift2 + shift3 - 1); + + int8_t *fir0 = kvz_g_luma_filter[0]; + int8_t *fir2 = kvz_g_luma_filter[2]; + int8_t *fir1 = kvz_g_luma_filter[1]; + int8_t *fir3 = kvz_g_luma_filter[3]; + + int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t flipped1[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t flipped3[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + + int16_t temp_stride = height + KVZ_EXT_PADDING + 1; + int16_t dst_stride = (LCU_WIDTH + 1); + + // Horizontal positions + for (x = 0; x < (width + 1); ++x) { + for (y = 0; y + 8 < height + KVZ_EXT_PADDING + 1; y += 8) { + int ypos = y - FILTER_OFFSET; + int xpos = x - FILTER_OFFSET; + int8_t *firs[4] = { fir0, fir2, fir1, fir3 }; + int16_t *dsts[4] = { &flipped0[x * temp_stride + y], &flipped2[x * temp_stride + y], &flipped1[x * temp_stride + y], &flipped3[x * temp_stride + y]}; + kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], src_stride, &firs[0], shift1, &dsts[0]); + kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], src_stride, &firs[2], shift1, &dsts[2]); + } + + for (; y < height + KVZ_EXT_PADDING + 1; ++y) { + int ypos = y - FILTER_OFFSET; + int xpos = x - FILTER_OFFSET; + flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir0, &src[src_stride*ypos + xpos]) >> shift1; + flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir2, &src[src_stride*ypos + 
xpos]) >> shift1; + flipped1[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir1, &src[src_stride*ypos + xpos]) >> shift1; + flipped3[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir3, &src[src_stride*ypos + xpos]) >> shift1; + } + } + + // Filter vertically and flip x and y + for (x = 0; x + 8 < width + 1; x += 8) { + for (y = 0; y < height + 1; ++y) { + + // HPEL + int8_t *firs0[2] = { fir0, fir2 }; + kvz_pixel *dsts0[2] = { &filtered[0][y * dst_stride + x], &filtered[1][y * dst_stride + x]}; + int16_t *srcs0[4] = { flipped2 + x * temp_stride + y, flipped0 + x * temp_stride + y}; + kvz_filter_flip_round_clip_x8_16bit_dual_avx2(srcs0, temp_stride, firs0, offset23, shift2 + shift3, dsts0); + kvz_filter_flip_round_clip_x8_16bit_avx2(flipped2 + x * temp_stride + y, temp_stride, fir2, offset23, shift2 + shift3, &filtered[2][y * dst_stride + x]); + + // QPEL + // Horizontal + int8_t *firs1[4] = { fir0, fir0, fir2, fir2 }; + kvz_pixel *dsts1[4] = { &filtered[3][y * dst_stride + x], &filtered[4][y * dst_stride + x], + &filtered[5][y * dst_stride + x], &filtered[6][y * dst_stride + x] }; + int16_t *srcs1[4] = { flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, + flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, }; + kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs1[0], temp_stride, &firs1[0], offset23, shift2 + shift3, &dsts1[0]); + kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs1[2], temp_stride, &firs1[2], offset23, shift2 + shift3, &dsts1[2]); + + // Vertical + int8_t *firs2[4] = { fir1, fir1, fir3, fir3 }; + kvz_pixel *dsts2[4] = { &filtered[7][y * dst_stride + x], &filtered[8][y * dst_stride + x], + &filtered[9][y * dst_stride + x], &filtered[10][y * dst_stride + x] }; + int16_t *srcs2[4] = { flipped0 + x * temp_stride + y, flipped2 + x * temp_stride + y, + flipped0 + x * temp_stride + y, flipped2 + x * temp_stride + y, }; + kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs2[0], temp_stride, &firs2[0], offset23, 
shift2 + shift3, &dsts2[0]); + kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs2[2], temp_stride, &firs2[2], offset23, shift2 + shift3, &dsts2[2]); + + // Diagonal + int8_t *firs3[4] = { fir1, fir1, fir3, fir3 }; + kvz_pixel *dsts3[4] = { &filtered[11][y * dst_stride + x], &filtered[12][y * dst_stride + x], + &filtered[13][y * dst_stride + x], &filtered[14][y * dst_stride + x] }; + int16_t *srcs3[4] = { flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, + flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, }; + kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs3[0], temp_stride, &firs3[0], offset23, shift2 + shift3, &dsts3[0]); + kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs3[2], temp_stride, &firs3[2], offset23, shift2 + shift3, &dsts3[2]); + } + } + + // The remaining pixels + for (; x < width + 1; ++x) { + for (y = 0; y < height + 1; ++y) { + + // HPEL + filtered[0][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[1][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[2][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + + // QPEL + // Horizontal + filtered[3][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[4][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[5][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + 
filtered[6][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + + // Vertical + filtered[7][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[8][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[9][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[10][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + + // Diagonal + filtered[11][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[12][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[13][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[14][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + } + } +} + +void kvz_filter_frac_blocks_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block filtered[15], int8_t fme_level) +{ + switch (fme_level) { + case 1: + kvz_filter_hpel_blocks_hor_ver_luma_avx2(encoder, src, src_stride, width, height, filtered); + break; + case 2: + 
kvz_filter_hpel_blocks_full_luma_avx2(encoder, src, src_stride, width, height, filtered); + break; + case 3: + kvz_filter_qpel_blocks_hor_ver_luma_avx2(encoder, src, src_stride, width, height, filtered); + break; + default: + kvz_filter_qpel_blocks_full_luma_avx2(encoder, src, src_stride, width, height, filtered); + break; + } +} + +void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height,kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) +{ + //Check for amp + if (width != height) { + kvz_sample_quarterpel_luma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); + return; + } + //TODO: horizontal and vertical only filtering + int32_t x, y; + int16_t shift1 = KVZ_BIT_DEPTH - 8; + int32_t shift2 = 6; + int32_t shift3 = 14 - KVZ_BIT_DEPTH; + int32_t offset23 = 1 << (shift2 + shift3 - 1); + + int8_t *hor_filter = kvz_g_luma_filter[mv[0] & 3]; + int8_t *ver_filter = kvz_g_luma_filter[mv[1] & 3]; + + int16_t hor_filtered[(LCU_WIDTH + 1) + FILTER_SIZE][(LCU_WIDTH + 1) + FILTER_SIZE]; + + if (width == 4) { + // Filter horizontally and flip x and y + for (y = 0; y < height + FILTER_SIZE - 1; ++y) { + for (x = 0; x < width; x += 4) { + int ypos = y - FILTER_OFFSET; + int xpos = x - FILTER_OFFSET; + int16_t *out = &(hor_filtered[y][x]); + kvz_eight_tap_filter_x4_hor_avx2(hor_filter, &src[src_stride*ypos + xpos], shift1, out); + } + } + + // Filter vertically and flip x and y + for (y = 0; y < height; ++y) { + for (x = 0; x < width; x+=4) { + int ypos = y; + int xpos = x; + *(int32_t*)&(dst[y*dst_stride + x]) = kvz_eight_tap_filter_x4_ver_16bit_avx2(ver_filter, &hor_filtered[ypos][xpos], sizeof(hor_filtered[0])/sizeof(int16_t), offset23, shift2, shift3); + } + } + + } else { + // Filter horizontally and flip x and y + for (y = 0; y < height + FILTER_SIZE - 1; ++y) { + for (x = 0; x < width; x+=8) { + int ypos = y - 
FILTER_OFFSET; + int xpos = x - FILTER_OFFSET; + int16_t *dst = &(hor_filtered[y][x]); + kvz_eight_tap_filter_x8_hor_avx2(hor_filter, &src[src_stride*ypos + xpos], shift1, dst); + } + } + + // Filter vertically and flip x and y + for (y = 0; y < height; ++y) { + for (x = 0; x < width; x+=8) { + int ypos = y; + int xpos = x; + kvz_pixel *out = &(dst[y*dst_stride + x]); + kvz_eight_tap_filter_x8_ver_16bit_avx2(ver_filter, &hor_filtered[ypos][xpos], sizeof(hor_filtered[0])/sizeof(int16_t), offset23, shift2, shift3, out); + } + } + } +} + +void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height,kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) +{ + //Check for amp + if (width != height) { + kvz_sample_octpel_chroma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); + return; + } + //TODO: horizontal and vertical only filtering + int32_t x, y; + int16_t shift1 = KVZ_BIT_DEPTH - 8; + int32_t shift2 = 6; + int32_t shift3 = 14 - KVZ_BIT_DEPTH; + int32_t offset23 = 1 << (shift2 + shift3 - 1); + + int8_t *hor_filter = kvz_g_chroma_filter[mv[0] & 7]; + int8_t *ver_filter = kvz_g_chroma_filter[mv[1] & 7]; + +#define FILTER_SIZE_C (FILTER_SIZE / 2) +#define FILTER_OFFSET_C (FILTER_OFFSET / 2) + int16_t hor_filtered[(LCU_WIDTH_C + 1) + FILTER_SIZE_C][(LCU_WIDTH_C + 1) + FILTER_SIZE_C]; + + if (width == 4) { + // Filter horizontally and flip x and y + for (y = 0; y < height + FILTER_SIZE_C - 1; ++y) { + for (x = 0; x < width; x += 4) { + int ypos = y - FILTER_OFFSET_C; + int xpos = x - FILTER_OFFSET_C; + int16_t *out = &(hor_filtered[y][x]); + kvz_four_tap_filter_x4_hor_avx2(hor_filter, &src[src_stride*ypos + xpos], shift1, out); + } + } + + // Filter vertically and flip x and y + for (y = 0; y < height; ++y) { + for (x = 0; x < width; x+=4) { + int ypos = y; + int xpos = x; + *(int32_t*)&(dst[y*dst_stride + x]) = 
kvz_four_tap_filter_x4_ver_16bit_avx2(ver_filter, &hor_filtered[ypos][xpos], sizeof(hor_filtered[0])/sizeof(int16_t), offset23, shift2, shift3); + } + } + + } else { + // Filter horizontally and flip x and y + for (y = 0; y < height + FILTER_SIZE_C - 1; ++y) { + for (x = 0; x < width; x += 8) { + int ypos = y - FILTER_OFFSET_C; + int xpos = x - FILTER_OFFSET_C; + int16_t *dst = &(hor_filtered[y][x]); + kvz_four_tap_filter_x8_hor_avx2(hor_filter, &src[src_stride*ypos + xpos], shift1, dst); + } + } + + // Filter vertically and flip x and y + for (y = 0; y < height; ++y) { + for (x = 0; x < width; x+=8) { + int ypos = y; + int xpos = x; + kvz_pixel *out = &(dst[y*dst_stride + x]); + kvz_four_tap_filter_x8_ver_16bit_avx2(ver_filter, &hor_filtered[ypos][xpos], sizeof(hor_filtered[0])/sizeof(int16_t), offset23, shift2, shift3, out); + } + } + } +} + + void kvz_get_extended_block_avx2(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, int filter_size, int width, int height, kvz_extended_block *out) { @@ -546,6 +1428,9 @@ success &= kvz_strategyselector_register(opaque, "filter_inter_quarterpel_luma", "avx2", 40, &kvz_filter_inter_quarterpel_luma_avx2); success &= kvz_strategyselector_register(opaque, "filter_inter_halfpel_chroma", "avx2", 40, &kvz_filter_inter_halfpel_chroma_avx2); success &= kvz_strategyselector_register(opaque, "filter_inter_octpel_chroma", "avx2", 40, &kvz_filter_inter_octpel_chroma_avx2); + success &= kvz_strategyselector_register(opaque, "filter_frac_blocks_luma", "avx2", 40, &kvz_filter_frac_blocks_luma_avx2); + success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma", "avx2", 40, &kvz_sample_quarterpel_luma_avx2); + success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma", "avx2", 40, &kvz_sample_octpel_chroma_avx2); } success &= kvz_strategyselector_register(opaque, "get_extended_block", "avx2", 40, &kvz_get_extended_block_avx2); #endif //COMPILE_INTEL_AVX2
kvazaar-0.8.3.tar.gz/src/strategies/avx2/ipol-avx2.h -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/ipol-avx2.h
Changed
@@ -26,7 +26,8 @@
  * Optimizations for AVX2.
  */
-#include "global.h"
+#include "global.h" // IWYU pragma: keep
+
 int kvz_strategy_register_ipol_avx2(void* opaque, uint8_t bitdepth);
kvazaar-0.8.3.tar.gz/src/strategies/avx2/picture-avx2.c -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/picture-avx2.c
Changed
@@ -21,13 +21,16 @@ /* * \file */ -#include "picture-avx2.h" -#include "strategyselector.h" +#include "strategies/avx2/picture-avx2.h" #if COMPILE_INTEL_AVX2 -# include "image.h" -# include "strategies/strategies-common.h" -# include <immintrin.h> +#include <immintrin.h> +#include <string.h> + +#include "kvazaar.h" +#include "strategies/strategies-picture.h" +#include "strategyselector.h" +#include "strategies/generic/picture-generic.h" /** @@ -171,9 +174,9 @@ row3 = _mm_add_epi16(row2, row3); - row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) )); - row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) )); - row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) )); + row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, _MM_SHUFFLE(1, 0, 3, 2) )); + row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, _MM_SHUFFLE(0, 1, 0, 1) )); + row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, _MM_SHUFFLE(0, 1, 0, 1) )); unsigned sum = _mm_extract_epi16(row3, 0); unsigned satd = (sum + 1) >> 1; @@ -218,9 +221,9 @@ row3 = _mm256_add_epi16(row2, row3); - row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) )); - row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) )); - row3 = _mm256_add_epi16(row3, _mm256_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) )); + row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, _MM_SHUFFLE(1, 0, 3, 2) )); + row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, _MM_SHUFFLE(0, 1, 0, 1) )); + row3 = _mm256_add_epi16(row3, _mm256_shufflelo_epi16(row3, _MM_SHUFFLE(0, 1, 0, 1) )); unsigned sum1 = _mm_extract_epi16(_mm256_castsi256_si128(row3), 0); sum1 = (sum1 + 1) >> 1; @@ -237,18 +240,18 @@ __m128i mask_pos = _mm_set1_epi16(1); __m128i mask_neg = _mm_set1_epi16(-1); __m128i sign_mask = _mm_unpacklo_epi64(mask_pos, mask_neg); - __m128i temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(2, 3, 0, 1)); + __m128i temp = 
_mm_shuffle_epi32(*row, _MM_SHUFFLE(1, 0, 3, 2)); *row = _mm_sign_epi16(*row, sign_mask); *row = _mm_add_epi16(*row, temp); sign_mask = _mm_unpacklo_epi32(mask_pos, mask_neg); - temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(1, 0, 3, 2)); + temp = _mm_shuffle_epi32(*row, _MM_SHUFFLE(2, 3, 0, 1)); *row = _mm_sign_epi16(*row, sign_mask); *row = _mm_add_epi16(*row, temp); sign_mask = _mm_unpacklo_epi16(mask_pos, mask_neg); - temp = _mm_shufflelo_epi16(*row, KVZ_PERMUTE(1,0,3,2)); - temp = _mm_shufflehi_epi16(temp, KVZ_PERMUTE(1,0,3,2)); + temp = _mm_shufflelo_epi16(*row, _MM_SHUFFLE(2,3,0,1)); + temp = _mm_shufflehi_epi16(temp, _MM_SHUFFLE(2,3,0,1)); *row = _mm_sign_epi16(*row, sign_mask); *row = _mm_add_epi16(*row, temp); } @@ -258,18 +261,18 @@ __m256i mask_pos = _mm256_set1_epi16(1); __m256i mask_neg = _mm256_set1_epi16(-1); __m256i sign_mask = _mm256_unpacklo_epi64(mask_pos, mask_neg); - __m256i temp = _mm256_shuffle_epi32(*row, KVZ_PERMUTE(2, 3, 0, 1)); + __m256i temp = _mm256_shuffle_epi32(*row, _MM_SHUFFLE(1, 0, 3, 2)); *row = _mm256_sign_epi16(*row, sign_mask); *row = _mm256_add_epi16(*row, temp); sign_mask = _mm256_unpacklo_epi32(mask_pos, mask_neg); - temp = _mm256_shuffle_epi32(*row, KVZ_PERMUTE(1, 0, 3, 2)); + temp = _mm256_shuffle_epi32(*row, _MM_SHUFFLE(2, 3, 0, 1)); *row = _mm256_sign_epi16(*row, sign_mask); *row = _mm256_add_epi16(*row, temp); sign_mask = _mm256_unpacklo_epi16(mask_pos, mask_neg); - temp = _mm256_shufflelo_epi16(*row, KVZ_PERMUTE(1,0,3,2)); - temp = _mm256_shufflehi_epi16(temp, KVZ_PERMUTE(1,0,3,2)); + temp = _mm256_shufflelo_epi16(*row, _MM_SHUFFLE(2,3,0,1)); + temp = _mm256_shufflehi_epi16(temp, _MM_SHUFFLE(2,3,0,1)); *row = _mm256_sign_epi16(*row, sign_mask); *row = _mm256_add_epi16(*row, temp); } @@ -353,8 +356,8 @@ haddwd_accumulate_avx2(&sad, ver_row + 6); haddwd_accumulate_avx2(&sad, ver_row + 7); - sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, KVZ_PERMUTE(2, 3, 0, 1))); - sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, 
KVZ_PERMUTE(1, 0, 1, 0))); + sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, _MM_SHUFFLE(1, 0, 3, 2))); + sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, _MM_SHUFFLE(0, 1, 0, 1))); return _mm_cvtsi128_si32(sad); } @@ -371,8 +374,8 @@ haddwd_accumulate_dual_avx2(&sad, ver_row + 6); haddwd_accumulate_dual_avx2(&sad, ver_row + 7); - sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, KVZ_PERMUTE(2, 3, 0, 1))); - sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, KVZ_PERMUTE(1, 0, 1, 0))); + sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, _MM_SHUFFLE(1, 0, 3, 2))); + sad = _mm256_add_epi32(sad, _mm256_shuffle_epi32(sad, _MM_SHUFFLE(0, 1, 0, 1))); *sum0 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sad, 0)); *sum1 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sad, 1)); @@ -451,6 +454,45 @@ hor_transform_row_dual_avx2((*row_diff) + 7); } +static void kvz_satd_8bit_8x8_general_dual_avx2(const kvz_pixel * buf1, unsigned stride1, + const kvz_pixel * buf2, unsigned stride2, + const kvz_pixel * orig, unsigned stride_orig, + unsigned *sum0, unsigned *sum1) +{ + __m256i temp[8]; + + diff_blocks_dual_avx2(&temp, buf1, stride1, buf2, stride2, orig, stride_orig); + hor_transform_block_dual_avx2(&temp); + ver_transform_block_dual_avx2(&temp); + + sum_block_dual_avx2(temp, sum0, sum1); + + *sum0 = (*sum0 + 2) >> 2; + *sum1 = (*sum1 + 2) >> 2; +} + +/** +* \brief Calculate SATD between two 4x4 blocks inside bigger arrays. 
+*/ +static unsigned kvz_satd_4x4_subblock_8bit_avx2(const kvz_pixel * buf1, + const int32_t stride1, + const kvz_pixel * buf2, + const int32_t stride2) +{ + // TODO: AVX2 implementation + return kvz_satd_4x4_subblock_generic(buf1, stride1, buf2, stride2); +} + +static void kvz_satd_4x4_subblock_quad_avx2(const kvz_pixel *preds[4], + const int strides[4], + const kvz_pixel *orig, + const int orig_stride, + unsigned costs[4]) +{ + // TODO: AVX2 implementation + kvz_satd_4x4_subblock_quad_generic(preds, strides, orig, orig_stride, costs); +} + static unsigned satd_8x8_subblock_8bit_avx2(const kvz_pixel * buf1, unsigned stride1, const kvz_pixel * buf2, unsigned stride2) { __m128i temp[8]; @@ -465,6 +507,15 @@ return result; } +static void satd_8x8_subblock_quad_avx2(const kvz_pixel **preds, + const int *strides, + const kvz_pixel *orig, + const int orig_stride, + unsigned *costs) +{ + kvz_satd_8bit_8x8_general_dual_avx2(preds[0], strides[0], preds[1], strides[1], orig, orig_stride, &costs[0], &costs[1]); + kvz_satd_8bit_8x8_general_dual_avx2(preds[2], strides[2], preds[3], strides[3], orig, orig_stride, &costs[2], &costs[3]); +} SATD_NxN(8bit_avx2, 8) SATD_NxN(8bit_avx2, 16) @@ -472,25 +523,6 @@ SATD_NxN(8bit_avx2, 64) SATD_ANY_SIZE(8bit_avx2) - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -static void kvz_satd_8bit_8x8_general_dual_avx2(const kvz_pixel * buf1, unsigned stride1, - const kvz_pixel * buf2, unsigned stride2, - const kvz_pixel * orig, unsigned stride_orig, - unsigned *sum0, unsigned *sum1) -{ - __m256i temp[8]; - - diff_blocks_dual_avx2(&temp, buf1, stride1, buf2, stride2, orig, stride_orig); - hor_transform_block_dual_avx2(&temp); - ver_transform_block_dual_avx2(&temp); - - sum_block_dual_avx2(temp, sum0, sum1); - - *sum0 = (*sum0 + 2) >> 2; - *sum1 = (*sum1 + 2) >> 2; -} - // Function macro for defining hadamard calculating functions // for fixed size blocks. 
They calculate hadamard for integer // multiples of 8x8 with the 8x8 hadamard function. @@ -540,86 +572,71 @@ SATD_NXN_DUAL_AVX2(32) SATD_NXN_DUAL_AVX2(64) -void kvz_pixels_blit_avx2(const kvz_pixel * const orig, kvz_pixel * const dst, - const unsigned width, const unsigned height, - const unsigned orig_stride, const unsigned dst_stride) -{ - unsigned y; - //There is absolutely no reason to have a width greater than the source or the destination stride. - assert(width <= orig_stride); - assert(width <= dst_stride); - -#ifdef CHECKPOINTS - char *buffer = malloc((3 * width + 1) * sizeof(char)); - for (y = 0; y < height; ++y) { - int p; - for (p = 0; p < width; ++p) { - sprintf((buffer + 3*p), "%02X ", orig[y*orig_stride]); - } - buffer[3*width] = 0; - CHECKPOINT("kvz_pixels_blit_avx2: %04d: %s", y, buffer); - } - FREE_POINTER(buffer); -#endif //CHECKPOINTS - - if (width == orig_stride && width == dst_stride) { - memcpy(dst, orig, width * height * sizeof(kvz_pixel)); - return; +#define SATD_ANY_SIZE_MULTI_AVX2(suffix, num_parallel_blocks) \ + static cost_pixel_any_size_multi_func satd_any_size_## suffix; \ + static void satd_any_size_ ## suffix ( \ + int width, int height, \ + const kvz_pixel **preds, \ + const int *strides, \ + const kvz_pixel *orig, \ + const int orig_stride, \ + unsigned num_modes, \ + unsigned *costs_out, \ + int8_t *valid) \ + { \ + unsigned sums[num_parallel_blocks] = { 0 }; \ + const kvz_pixel *pred_ptrs[4] = { preds[0], preds[1], preds[2], preds[3] };\ + const kvz_pixel *orig_ptr = orig; \ + costs_out[0] = 0; costs_out[1] = 0; costs_out[2] = 0; costs_out[3] = 0; \ + if (width % 8 != 0) { \ + /* Process the first column using 4x4 blocks. 
*/ \ + for (int y = 0; y < height; y += 4) { \ + kvz_satd_4x4_subblock_ ## suffix(preds, strides, orig, orig_stride, sums); \ + } \ + orig_ptr += 4; \ + for(int blk = 0; blk < num_parallel_blocks; ++blk){\ + pred_ptrs[blk] += 4; \ + }\ + width -= 4; \ + } \ + if (height % 8 != 0) { \ + /* Process the first row using 4x4 blocks. */ \ + for (int x = 0; x < width; x += 4 ) { \ + kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \ + } \ + orig_ptr += 4 * orig_stride; \ + for(int blk = 0; blk < num_parallel_blocks; ++blk){\ + pred_ptrs[blk] += 4 * strides[blk]; \ + }\ + height -= 4; \ + } \ + /* The rest can now be processed with 8x8 blocks. */ \ + for (int y = 0; y < height; y += 8) { \ + orig_ptr = &orig[y * orig_stride]; \ + pred_ptrs[0] = &preds[0][y * strides[0]]; \ + pred_ptrs[1] = &preds[1][y * strides[1]]; \ + pred_ptrs[2] = &preds[2][y * strides[2]]; \ + pred_ptrs[3] = &preds[3][y * strides[3]]; \ + for (int x = 0; x < width; x += 8) { \ + satd_8x8_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \ + orig_ptr += 8; \ + pred_ptrs[0] += 8; \ + pred_ptrs[1] += 8; \ + pred_ptrs[2] += 8; \ + pred_ptrs[3] += 8; \ + costs_out[0] += sums[0]; \ + costs_out[1] += sums[1]; \ + costs_out[2] += sums[2]; \ + costs_out[3] += sums[3]; \ + } \ + } \ + for(int i = 0; i < num_parallel_blocks; ++i){\ + costs_out[i] = costs_out[i] >> (KVZ_BIT_DEPTH - 8);\ + } \ + return; \ } - int nxn_width = (width == height) ? 
width : 0; - switch (nxn_width) { - case 4: - *(int32_t*)&dst[dst_stride*0] = *(int32_t*)&orig[orig_stride*0]; - *(int32_t*)&dst[dst_stride*1] = *(int32_t*)&orig[orig_stride*1]; - *(int32_t*)&dst[dst_stride*2] = *(int32_t*)&orig[orig_stride*2]; - *(int32_t*)&dst[dst_stride*3] = *(int32_t*)&orig[orig_stride*3]; - break; - case 8: - *(int64_t*)&dst[dst_stride*0] = *(int64_t*)&orig[orig_stride*0]; - *(int64_t*)&dst[dst_stride*1] = *(int64_t*)&orig[orig_stride*1]; - *(int64_t*)&dst[dst_stride*2] = *(int64_t*)&orig[orig_stride*2]; - *(int64_t*)&dst[dst_stride*3] = *(int64_t*)&orig[orig_stride*3]; - *(int64_t*)&dst[dst_stride*4] = *(int64_t*)&orig[orig_stride*4]; - *(int64_t*)&dst[dst_stride*5] = *(int64_t*)&orig[orig_stride*5]; - *(int64_t*)&dst[dst_stride*6] = *(int64_t*)&orig[orig_stride*6]; - *(int64_t*)&dst[dst_stride*7] = *(int64_t*)&orig[orig_stride*7]; - break; - case 16: - for (int i = 0; i < 16; ++i) { - __m128i temp = _mm_loadu_si128((__m128i*)(orig + i * orig_stride)); - _mm_storeu_si128((__m128i*)(dst + i * dst_stride), temp); - } - break; - case 32: - for (int i = 0; i < 32; ++i) { - __m256i temp = _mm256_loadu_si256((__m256i*)(orig + i * orig_stride)); - _mm256_storeu_si256((__m256i*)(dst + i * dst_stride), temp); - } - break; - case 64: - for (int i = 0; i < 64; ++i) { - __m256i temp0 = _mm256_loadu_si256((__m256i*)(orig + i * orig_stride)); - _mm256_storeu_si256((__m256i*)(dst + i * dst_stride), temp0); - __m256i temp1 = _mm256_loadu_si256((__m256i*)(orig + i * orig_stride + sizeof(__m256))); - _mm256_storeu_si256((__m256i*)(dst + i * dst_stride + sizeof(__m256)), temp1); - } - break; - default: - - if (orig == dst) { - //If we have the same array, then we should have the same stride - assert(orig_stride == dst_stride); - return; - } - assert(orig != dst || orig_stride == dst_stride); - - for (y = 0; y < height; ++y) { - memcpy(&dst[y*dst_stride], &orig[y*orig_stride], width * sizeof(kvz_pixel)); - } - break; - } -} +SATD_ANY_SIZE_MULTI_AVX2(quad_avx2, 
4) #endif //COMPILE_INTEL_AVX2 @@ -650,8 +667,7 @@ success &= kvz_strategyselector_register(opaque, "satd_32x32_dual", "avx2", 40, &satd_8bit_32x32_dual_avx2); success &= kvz_strategyselector_register(opaque, "satd_64x64_dual", "avx2", 40, &satd_8bit_64x64_dual_avx2); success &= kvz_strategyselector_register(opaque, "satd_any_size", "avx2", 40, &satd_any_size_8bit_avx2); - - success &= kvz_strategyselector_register(opaque, "pixels_blit", "avx2", 40, &kvz_pixels_blit_avx2); + success &= kvz_strategyselector_register(opaque, "satd_any_size_quad", "avx2", 40, &satd_any_size_quad_avx2); } #endif return success;
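The any-size SATD macro above tiles an arbitrary block: a 4-pixel-wide first column when `width % 8 != 0`, a 4-pixel-tall first row when `height % 8 != 0`, and 8x8 subblocks for the remainder, accumulating Hadamard-based costs for up to four prediction candidates in parallel. As a reference for what each subblock kernel computes, here is a scalar 4x4 SATD sketch: sum of absolute values of the 2-D Hadamard transform of the prediction error. The final normalization shift varies between implementations and is omitted, so treat this as illustrative rather than as kvazaar's exact kernel.

```c
#include <stdlib.h>

/* Scalar 4x4 SATD sketch: Hadamard-transform the prediction error and
 * sum the absolute coefficients. Normalization is left out. */
static int satd_4x4_ref(const unsigned char *orig, const unsigned char *pred)
{
    int diff[16], m[16], sum = 0;
    for (int i = 0; i < 16; ++i) diff[i] = orig[i] - pred[i];

    /* Horizontal butterflies on each row. */
    for (int r = 0; r < 4; ++r) {
        int a = diff[4 * r], b = diff[4 * r + 1];
        int c = diff[4 * r + 2], d = diff[4 * r + 3];
        int s0 = a + c, s1 = b + d, d0 = a - c, d1 = b - d;
        m[4 * r]     = s0 + s1;
        m[4 * r + 1] = s0 - s1;
        m[4 * r + 2] = d0 + d1;
        m[4 * r + 3] = d0 - d1;
    }
    /* Vertical butterflies on each column, accumulating |coeff|. */
    for (int c = 0; c < 4; ++c) {
        int a = m[c], b = m[4 + c], e = m[8 + c], f = m[12 + c];
        int s0 = a + e, s1 = b + f, d0 = a - e, d1 = b - f;
        sum += abs(s0 + s1) + abs(s0 - s1) + abs(d0 + d1) + abs(d0 - d1);
    }
    return sum;
}
```

For a constant error d over the whole block only the DC coefficient survives, so the result is 16·|d|; the dual/quad AVX2 kernels compute this kind of cost for two or four prediction candidates against one origin block at a time.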
kvazaar-0.8.3.tar.gz/src/strategies/avx2/picture-avx2.h -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/picture-avx2.h
Changed
@@ -26,7 +26,8 @@ * Optimizations for AVX2. */ -#include "global.h" +#include "global.h" // IWYU pragma: keep + int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth);
kvazaar-0.8.3.tar.gz/src/strategies/avx2/quant-avx2.c -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/quant-avx2.c
Changed
@@ -22,25 +22,29 @@ * \file */ +#include "strategies/avx2/quant-avx2.h" + +#if COMPILE_INTEL_AVX2 && defined X86_64 +#include <immintrin.h> #include <stdlib.h> -#include "quant-avx2.h" -#include "../generic/quant-generic.h" -#include "../strategies-common.h" -#include "strategyselector.h" +#include "cu.h" #include "encoder.h" -#include "transform.h" +#include "encoderstate.h" +#include "kvazaar.h" #include "rdo.h" +#include "scalinglist.h" +#include "strategies/generic/quant-generic.h" +#include "strategies/strategies-quant.h" +#include "strategyselector.h" +#include "tables.h" +#include "transform.h" -#if COMPILE_INTEL_AVX2 && defined X86_64 -#include <immintrin.h> -#include <smmintrin.h> /** -* \brief quantize transformed coefficents -* -*/ - + * \brief quantize transformed coefficients + * + */ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width, int32_t height, int8_t type, int8_t scan_idx, int8_t block_type) { @@ -48,13 +52,13 @@ const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2; const uint32_t * const scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1]; - int32_t qp_scaled = kvz_get_scaled_qp(type, state->global->QP, (encoder->bitdepth - 8) * 6); + int32_t qp_scaled = kvz_get_scaled_qp(type, state->frame->QP, (encoder->bitdepth - 8) * 6); const uint32_t log2_tr_size = kvz_g_convert_to_bit[width] + 2; const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]); const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_size - 2][scalinglist_type][qp_scaled % 6]; const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - log2_tr_size; //!< Represents scaling through forward transform const int32_t q_bits = QUANT_SHIFT + qp_scaled / 6 + transform_shift; - const int32_t add = ((state->global->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9); + const int32_t add = ((state->frame->slicetype == KVZ_SLICE_I) ? 
171 : 85) << (q_bits - 9); const int32_t q_bits8 = q_bits - 8; assert(quant_coeff[0] <= (1 << 15) - 1 && quant_coeff[0] >= -(1 << 15)); //Assuming flat values to fit int16_t @@ -96,8 +100,8 @@ } __m128i temp = _mm_add_epi32(_mm256_castsi256_si128(v_ac_sum), _mm256_extracti128_si256(v_ac_sum, 1)); - temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, KVZ_PERMUTE(2, 3, 0, 1))); - temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, KVZ_PERMUTE(1, 0, 1, 0))); + temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, _MM_SHUFFLE(1, 0, 3, 2))); + temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, _MM_SHUFFLE(0, 1, 0, 1))); ac_sum += _mm_cvtsi128_si32(temp); if (!(encoder->sign_hiding && ac_sum >= 2)) return; @@ -376,7 +380,7 @@ } // Quantize coeffs. (coeff -> quant_coeff) - if (state->encoder_control->rdoq_enable) { + if (state->encoder_control->rdoq_enable && (width > 4 || !state->encoder_control->cfg->rdoq_skip)) { int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0); kvz_rdoq(state, coeff, quant_coeff, width, width, (color == COLOR_Y ? 0 : 2), @@ -453,7 +457,7 @@ int32_t n; int32_t transform_shift = 15 - encoder->bitdepth - (kvz_g_convert_to_bit[ width ] + 2); - int32_t qp_scaled = kvz_get_scaled_qp(type, state->global->QP, (encoder->bitdepth-8)*6); + int32_t qp_scaled = kvz_get_scaled_qp(type, state->frame->QP, (encoder->bitdepth-8)*6); shift = 20 - QUANT_SHIFT - transform_shift;
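The hunk above is mostly include reshuffling plus the `state->global` → `state->frame` rename, but the surrounding function implements a standard HEVC-style uniform quantizer: level = (|coef|·scale + add) >> q_bits with the sign restored afterwards, where the rounding offset `add` is 171 << (q_bits − 9) for intra slices and 85 << (q_bits − 9) otherwise. A scalar sketch of one coefficient (names hypothetical, no sign hiding or RDOQ):

```c
#include <stdlib.h>

/* Scalar sketch of the flat quantizer: scale and round one transform
 * coefficient. 'scale' and 'q_bits' come from the QP-derived tables;
 * 'add' is the slice-dependent rounding offset described above. */
static int quant_one(int coef, int scale, int q_bits, int add)
{
    int sign = coef < 0 ? -1 : 1;
    int level = (int)((llabs(coef) * (long long)scale + add) >> q_bits);
    return sign * level;
}
```

Note also the new `width > 4 || !state->encoder_control->cfg->rdoq_skip` condition in the hunk: with --rdoq-skip enabled, 4x4 blocks bypass RDOQ and fall back to this flat quantization, matching the changelog entry about skipping RDOQ where it is unlikely to improve BD-rate.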
kvazaar-0.8.3.tar.gz/src/strategies/avx2/quant-avx2.h -> kvazaar-1.0.0.tar.gz/src/strategies/avx2/quant-avx2.h
Changed
@@ -26,9 +26,8 @@ * Optimizations for AVX2. */ -#include "global.h" +#include "global.h" // IWYU pragma: keep -#include <stdint.h> int kvz_strategy_register_quant_avx2(void* opaque, uint8_t bitdepth);
kvazaar-1.0.0.tar.gz/src/strategies/avx2/sao-avx2.c
Added
@@ -0,0 +1,358 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "strategies/avx2/sao-avx2.h" + +#if COMPILE_INTEL_AVX2 +#include <immintrin.h> + +#include "cu.h" +#include "encoder.h" +#include "encoderstate.h" +#include "kvazaar.h" +#include "sao.h" +#include "strategyselector.h" + + +// These optimizations are based heavily on sao-generic.c. +// Might be useful to check that if (when) this file +// is difficult to understand. 
+ + +static INLINE __m256i load_6_offsets(const int* offsets){ + + return _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*) offsets)), _mm_loadl_epi64((__m128i*)&(offsets[4])), 1); +} + +static INLINE __m128i load_6_pixels(const kvz_pixel* data){ + + return _mm_insert_epi16(_mm_cvtsi32_si128(*(int32_t*)&(data[0])), *(int16_t*)&(data[4]), 2); +} + +static INLINE __m256i load_5_offsets(const int* offsets){ + + return _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*) offsets)), _mm_insert_epi32(_mm_setzero_si128(), offsets[4], 0), 1); +} + + +static __m128i sao_calc_eo_cat_avx2(__m128i* a, __m128i* b, __m128i* c) +{ + __m128i v_eo_idx = _mm_set1_epi16(2); + __m128i v_a = _mm_cvtepu8_epi16(*a); + __m128i v_c = _mm_cvtepu8_epi16(*c); + __m128i v_b = _mm_cvtepu8_epi16(*b); + + __m128i temp_a = _mm_sign_epi16(_mm_set1_epi16(1), _mm_sub_epi16(v_c, v_a)); + __m128i temp_b = _mm_sign_epi16(_mm_set1_epi16(1), _mm_sub_epi16(v_c, v_b)); + v_eo_idx = _mm_add_epi16(v_eo_idx, temp_a); + v_eo_idx = _mm_add_epi16(v_eo_idx, temp_b); + + v_eo_idx = _mm_packus_epi16(v_eo_idx, v_eo_idx); + __m128i v_cat_lookup = _mm_setr_epi8(1,2,0,3,4,0,0,0,0,0,0,0,0,0,0,0); + __m128i v_cat = _mm_shuffle_epi8(v_cat_lookup, v_eo_idx); + + + return v_cat; +} + + +int kvz_sao_edge_ddistortion_avx2(const kvz_pixel *orig_data, const kvz_pixel *rec_data, + int block_width, int block_height, + int eo_class, int offsets[NUM_SAO_EDGE_CATEGORIES]) +{ + int y, x; + int sum = 0; + vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0]; + vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1]; + + __m256i v_accum = { 0 }; + + for (y = 1; y < block_height - 1; ++y) { + + for (x = 1; x < block_width - 8; x+=8) { + const kvz_pixel *c_data = &rec_data[y * block_width + x]; + + __m128i v_c_data = _mm_loadl_epi64((__m128i*)c_data); + __m128i v_a = _mm_loadl_epi64((__m128i*)(&c_data[a_ofs.y * block_width + a_ofs.x])); + __m128i v_c = v_c_data; + __m128i v_b = 
_mm_loadl_epi64((__m128i*)(&c_data[b_ofs.y * block_width + b_ofs.x])); + + __m256i v_cat = _mm256_cvtepu8_epi32(sao_calc_eo_cat_avx2(&v_a, &v_b, &v_c)); + + __m256i v_offset = _mm256_loadu_si256((__m256i*) offsets); + v_offset = _mm256_permutevar8x32_epi32(v_offset, v_cat); + + __m256i v_diff = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)&(orig_data[y * block_width + x]))); + v_diff = _mm256_sub_epi32(v_diff, _mm256_cvtepu8_epi32(v_c)); + __m256i v_diff_minus_offset = _mm256_sub_epi32(v_diff, v_offset); + __m256i v_temp_sum = _mm256_sub_epi32(_mm256_mullo_epi32(v_diff_minus_offset, v_diff_minus_offset), _mm256_mullo_epi32(v_diff, v_diff)); + v_accum = _mm256_add_epi32(v_accum, v_temp_sum); + } + + //Handle last 6 pixels separately to prevent reading over boundary + const kvz_pixel *c_data = &rec_data[y * block_width + x]; + __m128i v_c_data = load_6_pixels(c_data); + const kvz_pixel* a_ptr = &c_data[a_ofs.y * block_width + a_ofs.x]; + const kvz_pixel* b_ptr = &c_data[b_ofs.y * block_width + b_ofs.x]; + __m128i v_a = load_6_pixels(a_ptr); + __m128i v_c = v_c_data; + __m128i v_b = load_6_pixels(b_ptr); + + __m256i v_cat = _mm256_cvtepu8_epi32(sao_calc_eo_cat_avx2(&v_a, &v_b, &v_c)); + + __m256i v_offset = load_6_offsets(offsets); + v_offset = _mm256_permutevar8x32_epi32(v_offset, v_cat); + + const kvz_pixel* orig_ptr = &(orig_data[y * block_width + x]); + __m256i v_diff = _mm256_cvtepu8_epi32(load_6_pixels(orig_ptr)); + v_diff = _mm256_sub_epi32(v_diff, _mm256_cvtepu8_epi32(v_c)); + + __m256i v_diff_minus_offset = _mm256_sub_epi32(v_diff, v_offset); + __m256i v_temp_sum = _mm256_sub_epi32(_mm256_mullo_epi32(v_diff_minus_offset, v_diff_minus_offset), _mm256_mullo_epi32(v_diff, v_diff)); + v_accum = _mm256_add_epi32(v_accum, v_temp_sum); + } + + //Full horizontal sum + v_accum = _mm256_add_epi32(v_accum, _mm256_castsi128_si256(_mm256_extracti128_si256(v_accum, 1))); + v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(1, 0, 3, 2))); + 
v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(0, 1, 0, 1))); + sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_accum)); + + return sum; +} + + +static INLINE void accum_count_eo_cat_avx2(__m256i* __restrict v_diff_accum, __m256i* __restrict v_count, __m256i* __restrict v_cat, __m256i* __restrict v_diff, int eo_cat){ + __m256i v_mask = _mm256_cmpeq_epi32(*v_cat, _mm256_set1_epi32(eo_cat)); + *v_diff_accum = _mm256_add_epi32(*v_diff_accum, _mm256_and_si256(*v_diff, v_mask)); + *v_count = _mm256_sub_epi32(*v_count, v_mask); +} + + +#define ACCUM_COUNT_EO_CAT_AVX2(EO_CAT, V_CAT) \ + \ + accum_count_eo_cat_avx2(&(v_diff_accum[ EO_CAT ]), &(v_count[ EO_CAT ]), &V_CAT , &v_diff, EO_CAT); + + +void kvz_calc_sao_edge_dir_avx2(const kvz_pixel *orig_data, const kvz_pixel *rec_data, + int eo_class, int block_width, int block_height, + int cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES]) +{ + int y, x; + vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0]; + vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1]; + + // Don't sample the edge pixels because this function doesn't have access to + // their neighbours. 
+ + __m256i v_diff_accum[NUM_SAO_EDGE_CATEGORIES] = { { 0 } }; + __m256i v_count[NUM_SAO_EDGE_CATEGORIES] = { { 0 } }; + + for (y = 1; y < block_height - 1; ++y) { + + //Calculation for 8 pixels per round + for (x = 1; x < block_width - 8; x += 8) { + const kvz_pixel *c_data = &rec_data[y * block_width + x]; + + __m128i v_c_data = _mm_loadl_epi64((__m128i* __restrict)c_data); + __m128i v_a = _mm_loadl_epi64((__m128i* __restrict)(&c_data[a_ofs.y * block_width + a_ofs.x])); + __m128i v_c = v_c_data; + __m128i v_b = _mm_loadl_epi64((__m128i* __restrict)(&c_data[b_ofs.y * block_width + b_ofs.x])); + + __m256i v_cat = _mm256_cvtepu8_epi32(sao_calc_eo_cat_avx2(&v_a, &v_b, &v_c)); + + __m256i v_diff = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i* __restrict)&(orig_data[y * block_width + x]))); + v_diff = _mm256_sub_epi32(v_diff, _mm256_cvtepu8_epi32(v_c)); + + //Accumulate differences and occurrences for each category + ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT0, v_cat); + ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT1, v_cat); + ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT2, v_cat); + ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT3, v_cat); + ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT4, v_cat); + } + + //Handle last 6 pixels separately to prevent reading over boundary + const kvz_pixel *c_data = &rec_data[y * block_width + x]; + __m128i v_c_data = load_6_pixels(c_data); + const kvz_pixel* a_ptr = &c_data[a_ofs.y * block_width + a_ofs.x]; + const kvz_pixel* b_ptr = &c_data[b_ofs.y * block_width + b_ofs.x]; + __m128i v_a = load_6_pixels(a_ptr); + __m128i v_c = v_c_data; + __m128i v_b = load_6_pixels(b_ptr); + + __m256i v_cat = _mm256_cvtepu8_epi32(sao_calc_eo_cat_avx2(&v_a, &v_b, &v_c)); + + //Set the last two elements to a non-existing category to cause + //the accumulate-count macro to discard those values. 
+ __m256i v_mask = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, -1, -1); + v_cat = _mm256_or_si256(v_cat, v_mask); + + const kvz_pixel* orig_ptr = &(orig_data[y * block_width + x]); + __m256i v_diff = _mm256_cvtepu8_epi32(load_6_pixels(orig_ptr)); + v_diff = _mm256_sub_epi32(v_diff, _mm256_cvtepu8_epi32(v_c)); + + //Accumulate differences and occurrences for each category + ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT0, v_cat); + ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT1, v_cat); + ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT2, v_cat); + ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT3, v_cat); + ACCUM_COUNT_EO_CAT_AVX2(SAO_EO_CAT4, v_cat); + } + + for (int eo_cat = 0; eo_cat < NUM_SAO_EDGE_CATEGORIES; ++eo_cat) { + int accum = 0; + int count = 0; + + //Full horizontal sum of accumulated values + v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_castsi128_si256(_mm256_extracti128_si256(v_diff_accum[eo_cat], 1))); + v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_shuffle_epi32(v_diff_accum[eo_cat], _MM_SHUFFLE(1, 0, 3, 2))); + v_diff_accum[eo_cat] = _mm256_add_epi32(v_diff_accum[eo_cat], _mm256_shuffle_epi32(v_diff_accum[eo_cat], _MM_SHUFFLE(0, 1, 0, 1))); + accum += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_diff_accum[eo_cat])); + + //Full horizontal sum of accumulated values + v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_castsi128_si256(_mm256_extracti128_si256(v_count[eo_cat], 1))); + v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_shuffle_epi32(v_count[eo_cat], _MM_SHUFFLE(1, 0, 3, 2))); + v_count[eo_cat] = _mm256_add_epi32(v_count[eo_cat], _mm256_shuffle_epi32(v_count[eo_cat], _MM_SHUFFLE(0, 1, 0, 1))); + count += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_count[eo_cat])); + + cat_sum_cnt[0][eo_cat] += accum; + cat_sum_cnt[1][eo_cat] += count; + + } +} + + +void kvz_sao_reconstruct_color_avx2(const encoder_control_t * const encoder, + const kvz_pixel *rec_data, kvz_pixel *new_rec_data, + const sao_info_t *sao, + int stride, int new_stride, + 
int block_width, int block_height, + color_t color_i) +{ + int y, x; + // Arrays orig_data and rec_data are quarter size for chroma. + int offset_v = color_i == COLOR_V ? 5 : 0; + + if(sao->type == SAO_TYPE_BAND) { + int offsets[1<<KVZ_BIT_DEPTH]; + kvz_calc_sao_offset_array(encoder, sao, offsets, color_i); + for (y = 0; y < block_height; ++y) { + for (x = 0; x < block_width; ++x) { + new_rec_data[y * new_stride + x] = offsets[rec_data[y * stride + x]]; + } + } + } else { + // Don't sample the edge pixels because this function doesn't have access to + // their neighbours. + for (y = 0; y < block_height; ++y) { + for (x = 0; x < block_width; x+=8) { + vector2d_t a_ofs = g_sao_edge_offsets[sao->eo_class][0]; + vector2d_t b_ofs = g_sao_edge_offsets[sao->eo_class][1]; + const kvz_pixel *c_data = &rec_data[y * stride + x]; + kvz_pixel *new_data = &new_rec_data[y * new_stride + x]; + const kvz_pixel* a_ptr = &c_data[a_ofs.y * stride + a_ofs.x]; + const kvz_pixel* c_ptr = &c_data[0]; + const kvz_pixel* b_ptr = &c_data[b_ofs.y * stride + b_ofs.x]; + + __m128i v_a = _mm_loadl_epi64((__m128i*)a_ptr); + __m128i v_b = _mm_loadl_epi64((__m128i*)b_ptr); + __m128i v_c = _mm_loadl_epi64((__m128i*)c_ptr); + + __m256i v_cat = _mm256_cvtepu8_epi32(sao_calc_eo_cat_avx2(&v_a, &v_b, &v_c) ); + + __m256i v_offset_v = load_5_offsets(sao->offsets + offset_v); + __m256i v_new_data = _mm256_permutevar8x32_epi32(v_offset_v, v_cat); + v_new_data = _mm256_add_epi32(v_new_data, _mm256_cvtepu8_epi32(v_c)); + __m128i v_new_data_128 = _mm_packus_epi32(_mm256_castsi256_si128(v_new_data), _mm256_extracti128_si256(v_new_data, 1)); + v_new_data_128 = _mm_packus_epi16(v_new_data_128, v_new_data_128); + + if ((block_width - x) >= 8) { + _mm_storel_epi64((__m128i*)new_data, v_new_data_128); + } else { + + kvz_pixel arr[8]; + _mm_storel_epi64((__m128i*)arr, v_new_data_128); + for (int i = 0; i < block_width - x; ++i) new_data[i] = arr[i]; + } + + } + } + } +} + + +int kvz_sao_band_ddistortion_avx2(const 
encoder_state_t * const state, const kvz_pixel *orig_data, const kvz_pixel *rec_data, + int block_width, int block_height, + int band_pos, int sao_bands[4]) +{ + int y, x; + int shift = state->encoder_control->bitdepth-5; + int sum = 0; + + __m256i v_accum = { 0 }; + + for (y = 0; y < block_height; ++y) { + for (x = 0; x < block_width; x+=8) { + + __m256i v_band = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)&(rec_data[y * block_width + x]))); + v_band = _mm256_srli_epi32(v_band, shift); + v_band = _mm256_sub_epi32(v_band, _mm256_set1_epi32(band_pos)); + + __m256i v_offset = { 0 }; + __m256i v_mask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_set1_epi32(~3), v_band), _mm256_setzero_si256()); + v_offset = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)sao_bands)), v_band); + + v_offset = _mm256_and_si256(v_offset, v_mask); + + + __m256i v_diff = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)&(orig_data[y * block_width + x]))); + __m256i v_rec = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*)&(rec_data[y * block_width + x]))); + v_diff = _mm256_sub_epi32(v_diff, v_rec); + __m256i v_diff_minus_offset = _mm256_sub_epi32(v_diff, v_offset); + __m256i v_temp_sum = _mm256_sub_epi32(_mm256_mullo_epi32(v_diff_minus_offset, v_diff_minus_offset), _mm256_mullo_epi32(v_diff, v_diff)); + v_accum = _mm256_add_epi32(v_accum, v_temp_sum); + } + } + + //Full horizontal sum + v_accum = _mm256_add_epi32(v_accum, _mm256_castsi128_si256(_mm256_extracti128_si256(v_accum, 1))); + v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(1, 0, 3, 2))); + v_accum = _mm256_add_epi32(v_accum, _mm256_shuffle_epi32(v_accum, _MM_SHUFFLE(0, 1, 0, 1))); + sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(v_accum)); + + return sum; +} + +#endif //COMPILE_INTEL_AVX2 + +int kvz_strategy_register_sao_avx2(void* opaque, uint8_t bitdepth) +{ + bool success = true; +#if COMPILE_INTEL_AVX2 + if (bitdepth == 8) { + success &= 
kvz_strategyselector_register(opaque, "sao_edge_ddistortion", "avx2", 40, &kvz_sao_edge_ddistortion_avx2); + success &= kvz_strategyselector_register(opaque, "calc_sao_edge_dir", "avx2", 40, &kvz_calc_sao_edge_dir_avx2); + success &= kvz_strategyselector_register(opaque, "sao_reconstruct_color", "avx2", 40, &kvz_sao_reconstruct_color_avx2); + success &= kvz_strategyselector_register(opaque, "sao_band_ddistortion", "avx2", 40, &kvz_sao_band_ddistortion_avx2); + } +#endif //COMPILE_INTEL_AVX2 + return success; +}
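The 5-entry shuffle in `sao_calc_eo_cat_avx2` above encodes the SAO edge-offset classification: eo_idx = 2 + sign(c − a) + sign(c − b) against the two neighbours a and b along the chosen direction, remapped through the lookup table {1, 2, 0, 3, 4} so that the flat case (index 2) lands in category 0. A scalar equivalent:

```c
/* Scalar version of the SAO edge-offset classifier vectorized above:
 * compare pixel c against its two directional neighbours a and b,
 * then remap so the flat case becomes category 0. */
static int sign3(int v) { return (v > 0) - (v < 0); }

static int sao_eo_category(int a, int b, int c)
{
    static const int cat_lookup[5] = { 1, 2, 0, 3, 4 };
    int eo_idx = 2 + sign3(c - a) + sign3(c - b);
    return cat_lookup[eo_idx];
}
```

In the reconstruction path, `kvz_sao_reconstruct_color_avx2` then adds the offset for each pixel's category, using `_mm256_permutevar8x32_epi32` as an eight-lane table lookup into the offset array.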
kvazaar-1.0.0.tar.gz/src/strategies/avx2/sao-avx2.h
Added
@@ -0,0 +1,34 @@ +#ifndef STRATEGIES_SAO_AVX2_H_ +#define STRATEGIES_SAO_AVX2_H_ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * AVX2 implementations of optimized functions. + */ + +#include "global.h" // IWYU pragma: keep + + +int kvz_strategy_register_sao_avx2(void* opaque, uint8_t bitdepth); + +#endif //STRATEGIES_SAO_AVX2_H_
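Both delta-distortion kernels in sao-avx2.c (`kvz_sao_edge_ddistortion_avx2` and `kvz_sao_band_ddistortion_avx2`) accumulate the change in squared error from applying an offset: with d = orig − rec, the per-pixel contribution is (d − offset)² − d², which is negative exactly when the offset moves the reconstruction toward the source. A scalar form of that accumulated term:

```c
/* Per-pixel SAO delta distortion: new squared error minus old squared
 * error, with d = orig - rec. Negative values mean the offset helps. */
static int sao_delta_dist(int orig, int rec, int offset)
{
    int d = orig - rec;
    return (d - offset) * (d - offset) - d * d;
}
```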
kvazaar-0.8.3.tar.gz/src/strategies/generic/dct-generic.c -> kvazaar-1.0.0.tar.gz/src/strategies/generic/dct-generic.c
Changed
@@ -18,10 +18,10 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ -#include <stdlib.h> +#include "strategies/generic/dct-generic.h" #include "strategyselector.h" -#include "encoder.h" +#include "tables.h" const int16_t kvz_g_dst_4[4][4] = {
kvazaar-0.8.3.tar.gz/src/strategies/generic/dct-generic.h -> kvazaar-1.0.0.tar.gz/src/strategies/generic/dct-generic.h
Changed
@@ -26,7 +26,7 @@ * Generic C implementations of optimized functions. */ -#include "global.h" +#include "global.h" // IWYU pragma: keep extern const int16_t kvz_g_dst_4[4][4]; extern const int16_t kvz_g_dct_4[4][4];
kvazaar-0.8.3.tar.gz/src/strategies/generic/intra-generic.c -> kvazaar-1.0.0.tar.gz/src/strategies/generic/intra-generic.c
Changed
@@ -18,9 +18,11 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ +#include "strategies/generic/intra-generic.h" + #include <stdlib.h> -#include "intra-generic.h" +#include "kvazaar.h" #include "strategyselector.h"
kvazaar-0.8.3.tar.gz/src/strategies/generic/intra-generic.h -> kvazaar-1.0.0.tar.gz/src/strategies/generic/intra-generic.h
Changed
@@ -26,11 +26,7 @@ * Generic C implementations of optimized functions. */ -#include "global.h" - -#include <stdint.h> - -#include "encoderstate.h" +#include "global.h" // IWYU pragma: keep int kvz_strategy_register_intra_generic(void* opaque, uint8_t bitdepth);
kvazaar-0.8.3.tar.gz/src/strategies/generic/ipol-generic.c -> kvazaar-1.0.0.tar.gz/src/strategies/generic/ipol-generic.c
Changed
@@ -18,12 +18,15 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ -#include <stdlib.h> +#include "strategies/generic/ipol-generic.h" + +#include <stdio.h> +#include <string.h> -#include "ipol-generic.h" -#include "strategyselector.h" #include "encoder.h" -#include "picture-generic.h" +#include "strategies/generic/picture-generic.h" +#include "strategies/strategies-ipol.h" +#include "strategyselector.h" extern int8_t kvz_g_luma_filter[4][8]; extern int8_t kvz_g_chroma_filter[8][4]; @@ -410,6 +413,219 @@ } } +void kvz_filter_hpel_blocks_hor_ver_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) +{ + int x, y; + int16_t shift1 = KVZ_BIT_DEPTH - 8; + int32_t shift2 = 6; + int32_t shift3 = 14 - KVZ_BIT_DEPTH; + int32_t offset23 = 1 << (shift2 + shift3 - 1); + + int8_t *fir0 = kvz_g_luma_filter[0]; + int8_t *fir2 = kvz_g_luma_filter[2]; + + int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + + int16_t temp_stride = height + KVZ_EXT_PADDING + 1; + int16_t dst_stride = (LCU_WIDTH + 1); + + // Horizontal positions + for (x = 0; x < width + 1; ++x) { + for (y = 0; y < height + KVZ_EXT_PADDING + 1; ++y) { + int ypos = y - FILTER_OFFSET; + int xpos = x - FILTER_OFFSET; + flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1; + flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1; + } + } + + // Filter vertically and flip x and y + for (x = 0; x < width + 1; ++x) { + for (y = 0; y < height + 1; ++y) { + filtered[HPEL_POS_HOR][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[HPEL_POS_VER][y * 
dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + } + } +} + +void kvz_filter_hpel_blocks_full_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) +{ + int x, y; + int16_t shift1 = KVZ_BIT_DEPTH - 8; + int32_t shift2 = 6; + int32_t shift3 = 14 - KVZ_BIT_DEPTH; + int32_t offset23 = 1 << (shift2 + shift3 - 1); + + int8_t *fir0 = kvz_g_luma_filter[0]; + int8_t *fir2 = kvz_g_luma_filter[2]; + + int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + + int16_t temp_stride = height + KVZ_EXT_PADDING + 1; + int16_t dst_stride = (LCU_WIDTH + 1); + + // Horizontal positions + for (x = 0; x < width + 1; ++x) { + for (y = 0; y < height + KVZ_EXT_PADDING + 1; ++y) { + int ypos = y - FILTER_OFFSET; + int xpos = x - FILTER_OFFSET; + flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1; + flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1; + } + } + + // Filter vertically and flip x and y + for (x = 0; x < width + 1; ++x) { + for (y = 0; y < height + 1; ++y) { + filtered[HPEL_POS_HOR][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[HPEL_POS_VER][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[HPEL_POS_DIA][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + } + } +} + +void kvz_filter_qpel_blocks_hor_ver_luma_generic(const 
encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) +{ + int x, y; + int16_t shift1 = KVZ_BIT_DEPTH - 8; + int32_t shift2 = 6; + int32_t shift3 = 14 - KVZ_BIT_DEPTH; + int32_t offset23 = 1 << (shift2 + shift3 - 1); + + int8_t *fir0 = kvz_g_luma_filter[0]; + int8_t *fir2 = kvz_g_luma_filter[2]; + int8_t *fir1 = kvz_g_luma_filter[1]; + int8_t *fir3 = kvz_g_luma_filter[3]; + + int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t flipped1[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t flipped3[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + + int16_t temp_stride = height + KVZ_EXT_PADDING + 1; + int16_t dst_stride = (LCU_WIDTH + 1); + + // Horizontal positions + for (x = 0; x < width + 1; ++x) { + for (y = 0; y < height + KVZ_EXT_PADDING + 1; ++y) { + int ypos = y - FILTER_OFFSET; + int xpos = x - FILTER_OFFSET; + flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1; + flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1; + flipped1[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir1, &src[src_stride*ypos + xpos]) >> shift1; + flipped3[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir3, &src[src_stride*ypos + xpos]) >> shift1; + } + } + + // Filter vertically and flip x and y + for (x = 0; x < width + 1; ++x) { + for (y = 0; y < height + 1; ++y) { + + // HPEL + filtered[ 0][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[ 1][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[ 2][y * dst_stride + x] = 
kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + + // QPEL + // Horizontal + filtered[ 3][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[ 4][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[ 5][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[ 6][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + + // Vertical + filtered[ 7][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[ 8][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[ 9][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[10][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + } + } +} + +void kvz_filter_qpel_blocks_full_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) +{ + int x, y; + int16_t shift1 = KVZ_BIT_DEPTH - 8; + int32_t shift2 = 6; + int32_t shift3 = 14 - KVZ_BIT_DEPTH; + int32_t offset23 = 1 << (shift2 + shift3 - 1); + + int8_t *fir0 = 
kvz_g_luma_filter[0]; + int8_t *fir2 = kvz_g_luma_filter[2]; + int8_t *fir1 = kvz_g_luma_filter[1]; + int8_t *fir3 = kvz_g_luma_filter[3]; + + int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t flipped1[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t flipped3[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + + int16_t temp_stride = height + KVZ_EXT_PADDING + 1; + int16_t dst_stride = (LCU_WIDTH + 1); + + // Horizontal positions + for (x = 0; x < width + 1; ++x) { + for (y = 0; y < height + KVZ_EXT_PADDING + 1; ++y) { + int ypos = y - FILTER_OFFSET; + int xpos = x - FILTER_OFFSET; + flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1; + flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1; + flipped1[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir1, &src[src_stride*ypos + xpos]) >> shift1; + flipped3[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir3, &src[src_stride*ypos + xpos]) >> shift1; + } + } + + // Filter vertically and flip x and y + for (x = 0; x < width + 1; ++x) { + for (y = 0; y < height + 1; ++y) { + + // HPEL + filtered[ 0][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[ 1][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[ 2][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + + // QPEL + // Horizontal + filtered[ 3][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> 
shift3); + filtered[ 4][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[ 5][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[ 6][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + + // Vertical + filtered[ 7][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[ 8][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[ 9][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[10][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + + // Diagonal + filtered[11][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[12][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[13][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + filtered[14][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped3[x * temp_stride + y]) + offset23) 
>> shift2) >> shift3); + } + } +} + +void kvz_filter_frac_blocks_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block filtered[15], int8_t fme_level) +{ + switch (fme_level) { + case 1: + kvz_filter_hpel_blocks_hor_ver_luma_generic(encoder, src, src_stride, width, height, filtered); + break; + case 2: + kvz_filter_hpel_blocks_full_luma_generic(encoder, src, src_stride, width, height, filtered); + break; + case 3: + kvz_filter_qpel_blocks_hor_ver_luma_generic(encoder, src, src_stride, width, height, filtered); + break; + default: + kvz_filter_qpel_blocks_full_luma_generic(encoder, src, src_stride, width, height, filtered); + break; + } +} + void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height,kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) { //TODO: horizontal and vertical only filtering @@ -541,6 +757,11 @@ success &= kvz_strategyselector_register(opaque, "filter_inter_quarterpel_luma", "generic", 0, &kvz_filter_inter_quarterpel_luma_generic); success &= kvz_strategyselector_register(opaque, "filter_inter_halfpel_chroma", "generic", 0, &kvz_filter_inter_halfpel_chroma_generic); success &= kvz_strategyselector_register(opaque, "filter_inter_octpel_chroma", "generic", 0, &kvz_filter_inter_octpel_chroma_generic); + success &= kvz_strategyselector_register(opaque, "filter_frac_blocks_luma", "generic", 0, &kvz_filter_frac_blocks_luma_generic); + success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma", "generic", 0, &kvz_sample_quarterpel_luma_generic); + success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma", "generic", 0, &kvz_sample_octpel_chroma_generic); + success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "generic", 0, &kvz_sample_14bit_quarterpel_luma_generic); + success &= 
kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "generic", 0, &kvz_sample_14bit_octpel_chroma_generic); success &= kvz_strategyselector_register(opaque, "get_extended_block", "generic", 0, &kvz_get_extended_block_generic); return success;
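The interpolation functions above all funnel through the same arithmetic: an 8-tap FIR dot product per stage, with offset23 = 1 << (shift2 + shift3 - 1) providing round-to-nearest before the final down-shift. A minimal sketch of that arithmetic for the 8-bit case (the taps are the HEVC half-pel luma filter; the helper names are illustrative, not the kvazaar API):

```c
#include <assert.h>
#include <stdint.h>

/* Illustrative: HEVC half-pel luma filter taps (they sum to 64). */
static const int8_t halfpel_taps[8] = { -1, 4, -11, 40, 40, -11, 4, -1 };

/* Plain 8-tap dot product, the core of each horizontal/vertical stage. */
static int32_t eight_tap(const int8_t taps[8], const int16_t *src)
{
    int32_t sum = 0;
    for (int i = 0; i < 8; ++i)
        sum += taps[i] * src[i];
    return sum;
}

/* Two-stage rounded down-shift: because offset23 = 1 << (shift2+shift3-1),
 * ((x + offset23) >> shift2) >> shift3 equals x / 2^(shift2+shift3),
 * rounded to nearest. For 8-bit content shift2 = 6 and shift3 = 6. */
static int32_t round_shift(int32_t x, int shift2, int shift3)
{
    int32_t offset23 = 1 << (shift2 + shift3 - 1);
    return ((x + offset23) >> shift2) >> shift3;
}

/* Filtering a constant signal v through both stages yields 64*64*v,
 * and the rounded shift by 12 recovers v exactly. */
static int32_t interpolate_constant(int v)
{
    int16_t row[8];
    for (int i = 0; i < 8; ++i)
        row[i] = (int16_t)v;
    int32_t hor = eight_tap(halfpel_taps, row);  /* 64 * v */
    return round_shift(hor * 64, 6, 6);          /* back to v */
}
```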
kvazaar-0.8.3.tar.gz/src/strategies/generic/ipol-generic.h -> kvazaar-1.0.0.tar.gz/src/strategies/generic/ipol-generic.h
Changed
@@ -26,16 +26,13 @@ * Generic C implementations of optimized functions. */ -#include "global.h" - #include "encoder.h" +#include "global.h" // IWYU pragma: keep +#include "kvazaar.h" int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth); - -//TODO: create strategies from sample functions void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); -void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); -void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); + #endif //STRATEGIES_IPOL_GENERIC_H_
kvazaar-0.8.3.tar.gz/src/strategies/generic/nal-generic.c -> kvazaar-1.0.0.tar.gz/src/strategies/generic/nal-generic.c
Changed
@@ -18,12 +18,29 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ -#include <stdlib.h> -#include <assert.h> +#include "strategies/generic/nal-generic.h" -#include "strategyselector.h" +#include "extras/libmd5.h" +#include "kvazaar.h" #include "nal.h" +#include "strategyselector.h" + + +static void array_md5_generic(const kvz_pixel* data, + const int height, const int width, + const int stride, + unsigned char checksum_out[SEI_HASH_MAX_LENGTH], const uint8_t bitdepth) +{ + assert(SEI_HASH_MAX_LENGTH >= 16); + context_md5_t md5_ctx; + kvz_md5_init(&md5_ctx); + + unsigned bytes = width * height * sizeof(kvz_pixel); + kvz_md5_update(&md5_ctx, (const unsigned char *)data, bytes); + + kvz_md5_final(checksum_out, &md5_ctx); +} static void array_checksum_generic(const kvz_pixel* data, const int height, const int width, @@ -150,6 +167,7 @@ int kvz_strategy_register_nal_generic(void* opaque, uint8_t bitdepth) { bool success = true; + success &= kvz_strategyselector_register(opaque, "array_md5", "generic", 0, &array_md5_generic); success &= kvz_strategyselector_register(opaque, "array_checksum", "generic", 0, &array_checksum_generic); success &= kvz_strategyselector_register(opaque, "array_checksum", "generic4", 1, &array_checksum_generic4); success &= kvz_strategyselector_register(opaque, "array_checksum", "generic8", 2, &array_checksum_generic8);
kvazaar-0.8.3.tar.gz/src/strategies/generic/nal-generic.h -> kvazaar-1.0.0.tar.gz/src/strategies/generic/nal-generic.h
Changed
@@ -26,7 +26,7 @@ * Generic C implementations of optimized functions. */ -#include "global.h" +#include "global.h" // IWYU pragma: keep int kvz_strategy_register_nal_generic(void* opaque, uint8_t bitdepth);
kvazaar-0.8.3.tar.gz/src/strategies/generic/picture-generic.c -> kvazaar-1.0.0.tar.gz/src/strategies/generic/picture-generic.c
Changed
@@ -18,8 +18,11 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ +#include "strategies/generic/picture-generic.h" + #include <stdlib.h> +#include "strategies/strategies-picture.h" #include "strategyselector.h" // Function to clip int16_t to pixel. (0-255 or 0-1023) @@ -95,19 +98,13 @@ return sad; } - /** - * \brief Calculate SATD between two 4x4 blocks inside bigger arrays. + * \brief Transform differences between two 4x4 blocks. * From HM 13.0 */ -static unsigned satd_4x4_generic(const kvz_pixel *piOrg, const kvz_pixel *piCur) +static int32_t hadamard_4x4_generic(int32_t diff[4*4]) { - int32_t k, satd = 0, diff[16], m[16], d[16]; - for (k = 0; k < 16; ++k) { - diff[k] = piOrg[k] - piCur[k]; - } - - /*===== hadamard transform =====*/ + int32_t m[4 * 4]; m[0] = diff[0] + diff[12]; m[1] = diff[1] + diff[13]; m[2] = diff[2] + diff[14]; @@ -125,6 +122,7 @@ m[14] = diff[2] - diff[14]; m[15] = diff[3] - diff[15]; + int32_t d[4 * 4]; d[0] = m[0] + m[4]; d[1] = m[1] + m[5]; d[2] = m[2] + m[6]; @@ -176,8 +174,9 @@ d[14] = m[14] + m[15]; d[15] = m[15] - m[14]; - for (k = 0; k<16; ++k) { - satd += abs(d[k]); + int32_t satd = 0; + for (int i = 0; i < 16; i++) { + satd += abs(d[i]); } satd = ((satd + 1) >> 1); @@ -185,6 +184,57 @@ } /** + * \brief Calculate SATD between two 4x4 blocks. + */ +static unsigned satd_4x4_generic(const kvz_pixel *piOrg, const kvz_pixel *piCur) +{ + int32_t diff[4 * 4]; + for (int i = 0; i < 4 * 4; i++) { + diff[i] = piOrg[i] - piCur[i]; + } + return hadamard_4x4_generic(diff); +} + +/** +* \brief Calculate SATD between two 4x4 blocks inside bigger arrays. 
+*/ +unsigned kvz_satd_4x4_subblock_generic(const kvz_pixel * buf1, + const int32_t stride1, + const kvz_pixel * buf2, + const int32_t stride2) +{ + int32_t diff[4 * 4]; + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + diff[x + y * 4] = buf1[x + y * stride1] - buf2[x + y * stride2]; + } + } + return hadamard_4x4_generic(diff); +} + +void kvz_satd_4x4_subblock_quad_generic(const kvz_pixel *preds[4], + const int strides[4], + const kvz_pixel *orig, + const int orig_stride, + unsigned costs[4]) +{ + int32_t diff[4][4 * 4]; + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + diff[0][x + y * 4] = orig[x + y * orig_stride] - preds[0][x + y * strides[0]]; + diff[1][x + y * 4] = orig[x + y * orig_stride] - preds[1][x + y * strides[1]]; + diff[2][x + y * 4] = orig[x + y * orig_stride] - preds[2][x + y * strides[2]]; + diff[3][x + y * 4] = orig[x + y * orig_stride] - preds[3][x + y * strides[3]]; + } + } + + costs[0] = hadamard_4x4_generic(diff[0]); + costs[1] = hadamard_4x4_generic(diff[1]); + costs[2] = hadamard_4x4_generic(diff[2]); + costs[3] = hadamard_4x4_generic(diff[3]); +} + +/** * \brief Calculate SATD between two 8x8 blocks inside bigger arrays. 
*/ static unsigned satd_8x8_subblock_generic(const kvz_pixel * piOrg, const int32_t iStrideOrg, @@ -277,6 +327,18 @@ return sad; } +static void satd_8x8_subblock_quad_generic(const kvz_pixel **preds, + const int *strides, + const kvz_pixel *orig, + const int orig_stride, + unsigned *costs) +{ + costs[0] = satd_8x8_subblock_generic(orig, orig_stride, preds[0], strides[0]); + costs[1] = satd_8x8_subblock_generic(orig, orig_stride, preds[1], strides[1]); + costs[2] = satd_8x8_subblock_generic(orig, orig_stride, preds[2], strides[2]); + costs[3] = satd_8x8_subblock_generic(orig, orig_stride, preds[3], strides[3]); +} + // These macros define sadt_16bit_NxN for N = 8, 16, 32, 64 SATD_NxN(generic, 8) SATD_NxN(generic, 16) @@ -327,6 +389,72 @@ SATD_DUAL_NXN(32, kvz_pixel) SATD_DUAL_NXN(64, kvz_pixel) +#define SATD_ANY_SIZE_MULTI_GENERIC(suffix, num_parallel_blocks) \ + static cost_pixel_any_size_multi_func satd_any_size_## suffix; \ + static void satd_any_size_ ## suffix ( \ + int width, int height, \ + const kvz_pixel **preds, \ + const int *strides, \ + const kvz_pixel *orig, \ + const int orig_stride, \ + unsigned num_modes, \ + unsigned *costs_out, \ + int8_t *valid) \ + { \ + unsigned sums[num_parallel_blocks] = { 0 }; \ + const kvz_pixel *pred_ptrs[4] = { preds[0], preds[1], preds[2], preds[3] };\ + const kvz_pixel *orig_ptr = orig; \ + costs_out[0] = 0; costs_out[1] = 0; costs_out[2] = 0; costs_out[3] = 0; \ + if (width % 8 != 0) { \ + /* Process the first column using 4x4 blocks. */ \ + for (int y = 0; y < height; y += 4) { \ + kvz_satd_4x4_subblock_ ## suffix(preds, strides, orig, orig_stride, sums); \ + } \ + orig_ptr += 4; \ + for(int blk = 0; blk < num_parallel_blocks; ++blk){\ + pred_ptrs[blk] += 4; \ + }\ + width -= 4; \ + } \ + if (height % 8 != 0) { \ + /* Process the first row using 4x4 blocks. 
*/ \ + for (int x = 0; x < width; x += 4 ) { \ + kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \ + } \ + orig_ptr += 4 * orig_stride; \ + for(int blk = 0; blk < num_parallel_blocks; ++blk){\ + pred_ptrs[blk] += 4 * strides[blk]; \ + }\ + height -= 4; \ + } \ + /* The rest can now be processed with 8x8 blocks. */ \ + for (int y = 0; y < height; y += 8) { \ + orig_ptr = &orig[y * orig_stride]; \ + pred_ptrs[0] = &preds[0][y * strides[0]]; \ + pred_ptrs[1] = &preds[1][y * strides[1]]; \ + pred_ptrs[2] = &preds[2][y * strides[2]]; \ + pred_ptrs[3] = &preds[3][y * strides[3]]; \ + for (int x = 0; x < width; x += 8) { \ + satd_8x8_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \ + orig_ptr += 8; \ + pred_ptrs[0] += 8; \ + pred_ptrs[1] += 8; \ + pred_ptrs[2] += 8; \ + pred_ptrs[3] += 8; \ + costs_out[0] += sums[0]; \ + costs_out[1] += sums[1]; \ + costs_out[2] += sums[2]; \ + costs_out[3] += sums[3]; \ + } \ + } \ + for(int i = 0; i < num_parallel_blocks; ++i){\ + costs_out[i] = costs_out[i] >> (KVZ_BIT_DEPTH - 8);\ + } \ + return; \ + } + +SATD_ANY_SIZE_MULTI_GENERIC(quad_generic, 4) + // Function macro for defining SAD calculating functions // for fixed size blocks. #define SAD_NXN(n, pixel_type) \ @@ -390,55 +518,6 @@ SAD_DUAL_NXN(32, kvz_pixel) SAD_DUAL_NXN(64, kvz_pixel) -/** - * \brief BLock Image Transfer from one buffer to another. - * - * It's a stupidly simple loop that copies pixels. - * - * \param orig Start of the originating buffer. - * \param dst Start of the destination buffer. - * \param width Width of the copied region. - * \param height Height of the copied region. - * \param orig_stride Width of a row in the originating buffer. - * \param dst_stride Width of a row in the destination buffer. - * - * This should be inlined, but it's defined here for now to see if Visual - * Studios LTCG will inline it. 
- */ -void kvz_pixels_blit_generic(const kvz_pixel * const orig, kvz_pixel * const dst, - const unsigned width, const unsigned height, - const unsigned orig_stride, const unsigned dst_stride) -{ - unsigned y; - //There is absolutely no reason to have a width greater than the source or the destination stride. - assert(width <= orig_stride); - assert(width <= dst_stride); - -#ifdef CHECKPOINTS - char *buffer = malloc((3 * width + 1) * sizeof(char)); - for (y = 0; y < height; ++y) { - int p; - for (p = 0; p < width; ++p) { - sprintf((buffer + 3*p), "%02X ", orig[y*orig_stride]); - } - buffer[3*width] = 0; - CHECKPOINT("kvz_pixels_blit: %04d: %s", y, buffer); - } - FREE_POINTER(buffer); -#endif //CHECKPOINTS - - if (orig == dst) { - //If we have the same array, then we should have the same stride - assert(orig_stride == dst_stride); - return; - } - assert(orig != dst || orig_stride == dst_stride); - - for (y = 0; y < height; ++y) { - memcpy(&dst[y*dst_stride], &orig[y*orig_stride], width * sizeof(kvz_pixel)); - } -} - int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth) { @@ -470,8 +549,7 @@ success &= kvz_strategyselector_register(opaque, "satd_32x32_dual", "generic", 0, &satd_32x32_dual_generic); success &= kvz_strategyselector_register(opaque, "satd_64x64_dual", "generic", 0, &satd_64x64_dual_generic); success &= kvz_strategyselector_register(opaque, "satd_any_size", "generic", 0, &satd_any_size_generic); - - success &= kvz_strategyselector_register(opaque, "pixels_blit", "generic", 0, &kvz_pixels_blit_generic); + success &= kvz_strategyselector_register(opaque, "satd_any_size_quad", "generic", 0, &satd_any_size_quad_generic); return success; }
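The hadamard_4x4_generic refactor above lets the plain, subblock, and quad SATD variants share one transform. As a sketch, the same cost can be written as a straightforward 2-D Hadamard matrix product over the residual, summing absolute coefficients and halving with rounding; the hand-unrolled butterfly in the diff is an equivalent, faster form (names below are illustrative, not the kvazaar API):

```c
#include <assert.h>
#include <stdlib.h>

/* Unnormalized 4x4 Hadamard matrix (symmetric; H4 * H4 = 4 * I). */
static const int H4[4][4] = {
    { 1,  1,  1,  1 },
    { 1, -1,  1, -1 },
    { 1,  1, -1, -1 },
    { 1, -1, -1,  1 },
};

/* SATD of a 4x4 residual: transform rows, then columns, then sum
 * absolute coefficients and halve with rounding, as in HM. */
static unsigned satd_4x4_sketch(int diff[4][4])
{
    int tmp[4][4], coef[4][4];
    /* Row transform: tmp = H4 * diff. */
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j) {
            tmp[i][j] = 0;
            for (int k = 0; k < 4; ++k)
                tmp[i][j] += H4[i][k] * diff[k][j];
        }
    /* Column transform: coef = tmp * H4 (valid since H4 is symmetric). */
    unsigned satd = 0;
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j) {
            coef[i][j] = 0;
            for (int k = 0; k < 4; ++k)
                coef[i][j] += tmp[i][k] * H4[k][j];
            satd += abs(coef[i][j]);
        }
    return (satd + 1) >> 1;
}
```

An impulse residual of magnitude k spreads equally over all 16 coefficients, so its SATD is 8k; a constant residual lands entirely in the DC coefficient.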
kvazaar-0.8.3.tar.gz/src/strategies/generic/picture-generic.h -> kvazaar-1.0.0.tar.gz/src/strategies/generic/picture-generic.h
Changed
@@ -26,7 +26,8 @@ * Generic C implementations of optimized functions. */ -#include "global.h" +#include "global.h" // IWYU pragma: keep +#include "kvazaar.h" int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth); @@ -38,4 +39,15 @@ // Assumes PIXEL_MAX to be 2^n-1 kvz_pixel kvz_fast_clip_32bit_to_pixel(int32_t value); +unsigned kvz_satd_4x4_subblock_generic(const kvz_pixel * buf1, + const int32_t stride1, + const kvz_pixel * buf2, + const int32_t stride2); + +void kvz_satd_4x4_subblock_quad_generic(const kvz_pixel *preds[4], + const int strides[4], + const kvz_pixel *orig, + const int orig_stride, + unsigned costs[4]); + #endif //STRATEGIES_PICTURE_GENERIC_H_
kvazaar-0.8.3.tar.gz/src/strategies/generic/quant-generic.c -> kvazaar-1.0.0.tar.gz/src/strategies/generic/quant-generic.c
Changed
@@ -18,13 +18,16 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ +#include "strategies/generic/quant-generic.h" + #include <stdlib.h> -#include "quant-generic.h" -#include "strategyselector.h" #include "encoder.h" -#include "transform.h" #include "rdo.h" +#include "scalinglist.h" +#include "strategies/strategies-quant.h" +#include "strategyselector.h" +#include "transform.h" #define QUANT_SHIFT 14 /** @@ -38,13 +41,13 @@ const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2; const uint32_t * const scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1]; - int32_t qp_scaled = kvz_get_scaled_qp(type, state->global->QP, (encoder->bitdepth - 8) * 6); + int32_t qp_scaled = kvz_get_scaled_qp(type, state->frame->QP, (encoder->bitdepth - 8) * 6); const uint32_t log2_tr_size = kvz_g_convert_to_bit[width] + 2; const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]); const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_size - 2][scalinglist_type][qp_scaled % 6]; const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - log2_tr_size; //!< Represents scaling through forward transform const int32_t q_bits = QUANT_SHIFT + qp_scaled / 6 + transform_shift; - const int32_t add = ((state->global->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9); + const int32_t add = ((state->frame->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9); const int32_t q_bits8 = q_bits - 8; uint32_t ac_sum = 0; @@ -210,7 +213,7 @@ } // Quantize coeffs. (coeff -> quant_coeff) - if (state->encoder_control->rdoq_enable) { + if (state->encoder_control->rdoq_enable && (width > 4 || !state->encoder_control->cfg->rdoq_skip)) { int8_t tr_depth = cur_cu->tr_depth - cur_cu->depth; tr_depth += (cur_cu->part_size == SIZE_NxN ? 1 : 0); kvz_rdoq(state, coeff, quant_coeff, width, width, (color == COLOR_Y ? 
0 : 2), @@ -283,7 +286,7 @@ int32_t n; int32_t transform_shift = 15 - encoder->bitdepth - (kvz_g_convert_to_bit[ width ] + 2); - int32_t qp_scaled = kvz_get_scaled_qp(type, state->global->QP, (encoder->bitdepth-8)*6); + int32_t qp_scaled = kvz_get_scaled_qp(type, state->frame->QP, (encoder->bitdepth-8)*6); shift = 20 - QUANT_SHIFT - transform_shift;
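For context on the q_bits arithmetic in the quant diff above: with QUANT_SHIFT == 14, a coefficient quantizes to roughly |coeff| * scale / 2^q_bits, and the add term places the dead-zone boundary at about 1/3 of a step for I slices (171/512) and about 1/6 otherwise (85/512). A hedged sketch, using the illustrative scale 16384 (2^14, one entry of the HEVC quantizer scale table) rather than the full scaling-list machinery:

```c
#include <assert.h>
#include <stdlib.h>

/* Illustrative scalar quantizer mirroring the structure of quant_generic:
 * level = (|coeff| * scale + add) >> q_bits, sign restored afterwards.
 * Names and the fixed scale are for demonstration, not the kvazaar API. */
static int quantize_one(int coeff, int scale, int q_bits, int add)
{
    int sign = coeff < 0 ? -1 : 1;
    int level = (abs(coeff) * scale + add) >> q_bits;
    return sign * level;
}

/* Rounding offsets as in the diff: 171 for I slices, 85 otherwise,
 * shifted so the offset is 171/512 resp. 85/512 of one step. */
static int intra_add(int q_bits) { return 171 << (q_bits - 9); }
static int inter_add(int q_bits) { return  85 << (q_bits - 9); }
```

With scale 16384 and q_bits 18, one quantization step spans 16 coefficient units, so the intra dead zone ends near 16 * (1 - 1/3) ≈ 10.7 and the inter one near 16 * (1 - 1/6) ≈ 13.3.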
kvazaar-0.8.3.tar.gz/src/strategies/generic/quant-generic.h -> kvazaar-1.0.0.tar.gz/src/strategies/generic/quant-generic.h
Changed
@@ -26,11 +26,11 @@ * Generic C implementations of optimized functions. */ -#include "global.h" - -#include <stdint.h> - +#include "cu.h" #include "encoderstate.h" +#include "global.h" // IWYU pragma: keep +#include "kvazaar.h" +#include "tables.h" #define QUANT_SHIFT 14
kvazaar-1.0.0.tar.gz/src/strategies/generic/sao-generic.c
Added
@@ -0,0 +1,184 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "strategies/generic/sao-generic.h" + +#include "cu.h" +#include "encoder.h" +#include "encoderstate.h" +#include "kvazaar.h" +#include "sao.h" +#include "strategyselector.h" + + +// Mapping of edge_idx values to eo-classes. +static int sao_calc_eo_cat(kvz_pixel a, kvz_pixel b, kvz_pixel c) +{ + // Mapping relationships between a, b and c to eo_idx. 
+ static const int sao_eo_idx_to_eo_category[] = { 1, 2, 0, 3, 4 }; + + int eo_idx = 2 + SIGN3((int)c - (int)a) + SIGN3((int)c - (int)b); + + return sao_eo_idx_to_eo_category[eo_idx]; +} + + +int kvz_sao_edge_ddistortion_generic(const kvz_pixel *orig_data, const kvz_pixel *rec_data, + int block_width, int block_height, + int eo_class, int offsets[NUM_SAO_EDGE_CATEGORIES]) +{ + int y, x; + int sum = 0; + vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0]; + vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1]; + + for (y = 1; y < block_height - 1; ++y) { + for (x = 1; x < block_width - 1; ++x) { + const kvz_pixel *c_data = &rec_data[y * block_width + x]; + kvz_pixel a = c_data[a_ofs.y * block_width + a_ofs.x]; + kvz_pixel c = c_data[0]; + kvz_pixel b = c_data[b_ofs.y * block_width + b_ofs.x]; + + int offset = offsets[sao_calc_eo_cat(a, b, c)]; + + if (offset != 0) { + int diff = orig_data[y * block_width + x] - c; + // Offset is applied to reconstruction, so it is subtracted from diff. + sum += (diff - offset) * (diff - offset) - diff * diff; + } + } + } + + return sum; +} + + +/** + * \param orig_data Original pixel data. 64x64 for luma, 32x32 for chroma. + * \param rec_data Reconstructed pixel data. 64x64 for luma, 32x32 for chroma. + * \param dir_offsets + * \param is_chroma 0 for luma, 1 for chroma. Indicates + */ +void kvz_calc_sao_edge_dir_generic(const kvz_pixel *orig_data, const kvz_pixel *rec_data, + int eo_class, int block_width, int block_height, + int cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES]) +{ + int y, x; + vector2d_t a_ofs = g_sao_edge_offsets[eo_class][0]; + vector2d_t b_ofs = g_sao_edge_offsets[eo_class][1]; + // Arrays orig_data and rec_data are quarter size for chroma. + + // Don't sample the edge pixels because this function doesn't have access to + // their neighbours. 
+ for (y = 1; y < block_height - 1; ++y) { + for (x = 1; x < block_width - 1; ++x) { + const kvz_pixel *c_data = &rec_data[y * block_width + x]; + kvz_pixel a = c_data[a_ofs.y * block_width + a_ofs.x]; + kvz_pixel c = c_data[0]; + kvz_pixel b = c_data[b_ofs.y * block_width + b_ofs.x]; + + int eo_cat = sao_calc_eo_cat(a, b, c); + + cat_sum_cnt[0][eo_cat] += orig_data[y * block_width + x] - c; + cat_sum_cnt[1][eo_cat] += 1; + } + } +} + + +void kvz_sao_reconstruct_color_generic(const encoder_control_t * const encoder, + const kvz_pixel *rec_data, kvz_pixel *new_rec_data, + const sao_info_t *sao, + int stride, int new_stride, + int block_width, int block_height, + color_t color_i) +{ + int y, x; + // Arrays orig_data and rec_data are quarter size for chroma. + int offset_v = color_i == COLOR_V ? 5 : 0; + + if(sao->type == SAO_TYPE_BAND) { + int offsets[1<<KVZ_BIT_DEPTH]; + kvz_calc_sao_offset_array(encoder, sao, offsets, color_i); + for (y = 0; y < block_height; ++y) { + for (x = 0; x < block_width; ++x) { + new_rec_data[y * new_stride + x] = offsets[rec_data[y * stride + x]]; + } + } + } else { + // Don't sample the edge pixels because this function doesn't have access to + // their neighbours. 
+ for (y = 0; y < block_height; ++y) { + for (x = 0; x < block_width; ++x) { + vector2d_t a_ofs = g_sao_edge_offsets[sao->eo_class][0]; + vector2d_t b_ofs = g_sao_edge_offsets[sao->eo_class][1]; + const kvz_pixel *c_data = &rec_data[y * stride + x]; + kvz_pixel *new_data = &new_rec_data[y * new_stride + x]; + kvz_pixel a = c_data[a_ofs.y * stride + a_ofs.x]; + kvz_pixel c = c_data[0]; + kvz_pixel b = c_data[b_ofs.y * stride + b_ofs.x]; + + int eo_cat = sao_calc_eo_cat(a, b, c); + + new_data[0] = (kvz_pixel)CLIP(0, (1 << KVZ_BIT_DEPTH) - 1, c_data[0] + sao->offsets[eo_cat + offset_v]); + } + } + } +} + + +int kvz_sao_band_ddistortion_generic(const encoder_state_t * const state, const kvz_pixel *orig_data, const kvz_pixel *rec_data, + int block_width, int block_height, + int band_pos, int sao_bands[4]) +{ + int y, x; + int shift = state->encoder_control->bitdepth-5; + int sum = 0; + + for (y = 0; y < block_height; ++y) { + for (x = 0; x < block_width; ++x) { + int band = (rec_data[y * block_width + x] >> shift) - band_pos; + int offset = 0; + if (band >= 0 && band < 4) { + offset = sao_bands[band]; + } + if (offset != 0) { + int diff = orig_data[y * block_width + x] - rec_data[y * block_width + x]; + // Offset is applied to reconstruction, so it is subtracted from diff. + sum += (diff - offset) * (diff - offset) - diff * diff; + } + } + } + + return sum; +} + + +int kvz_strategy_register_sao_generic(void* opaque, uint8_t bitdepth) +{ + bool success = true; + + success &= kvz_strategyselector_register(opaque, "sao_edge_ddistortion", "generic", 0, &kvz_sao_edge_ddistortion_generic); + success &= kvz_strategyselector_register(opaque, "calc_sao_edge_dir", "generic", 0, &kvz_calc_sao_edge_dir_generic); + success &= kvz_strategyselector_register(opaque, "sao_reconstruct_color", "generic", 0, &kvz_sao_reconstruct_color_generic); + success &= kvz_strategyselector_register(opaque, "sao_band_ddistortion", "generic", 0, &kvz_sao_band_ddistortion_generic); + + return success; +}
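The edge-offset classification in sao-generic.c above boils down to comparing each pixel c with its two neighbours a and b along the chosen direction; 2 + sign(c-a) + sign(c-b) indexes a five-entry table of edge categories. A small self-contained sketch of exactly that mapping (function names are illustrative; the table matches sao_eo_idx_to_eo_category in the diff):

```c
#include <assert.h>

/* Three-valued sign, as kvazaar's SIGN3 macro. */
static int sign3(int x) { return (x > 0) - (x < 0); }

/* Classify pixel c against its neighbours a and b along the eo_class
 * direction: 0 = flat/monotone, 1 = local minimum, 2/3 = concave/convex
 * edge, 4 = local maximum. */
static int eo_category(int a, int b, int c)
{
    static const int idx_to_cat[] = { 1, 2, 0, 3, 4 };
    int eo_idx = 2 + sign3(c - a) + sign3(c - b);
    return idx_to_cat[eo_idx];
}
```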
kvazaar-1.0.0.tar.gz/src/strategies/generic/sao-generic.h
Added
@@ -0,0 +1,33 @@ +#ifndef STRATEGIES_SAO_GENERIC_H_ +#define STRATEGIES_SAO_GENERIC_H_ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Generic C implementations of optimized functions. + */ + +#include "global.h" // IWYU pragma: keep + +int kvz_strategy_register_sao_generic(void* opaque, uint8_t bitdepth); + +#endif //STRATEGIES_SAO_GENERIC_H_
kvazaar-0.8.3.tar.gz/src/strategies/sse2/picture-sse2.c -> kvazaar-1.0.0.tar.gz/src/strategies/sse2/picture-sse2.c
Changed
@@ -18,14 +18,14 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ -#include "picture-sse2.h" -#include "strategyselector.h" +#include "strategies/sse2/picture-sse2.h" #if COMPILE_INTEL_SSE2 -# include "image.h" -# include <immintrin.h> -# include <assert.h> -# include <stdlib.h> +#include <immintrin.h> +#include <stdlib.h> + +#include "kvazaar.h" +#include "strategyselector.h" static unsigned reg_sad_sse2(const kvz_pixel * const data1, const kvz_pixel * const data2,
kvazaar-0.8.3.tar.gz/src/strategies/sse2/picture-sse2.h -> kvazaar-1.0.0.tar.gz/src/strategies/sse2/picture-sse2.h
Changed
@@ -26,7 +26,8 @@ * Optimizations for SSE2. */ -#include "global.h" +#include "global.h" // IWYU pragma: keep + int kvz_strategy_register_picture_sse2(void* opaque, unsigned char bitdepth);
kvazaar-0.8.3.tar.gz/src/strategies/sse41/picture-sse41.c -> kvazaar-1.0.0.tar.gz/src/strategies/sse41/picture-sse41.c
Changed
@@ -18,18 +18,18 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ -#include "picture-sse41.h" -#include "strategyselector.h" +#include "strategies/sse41/picture-sse41.h" #if COMPILE_INTEL_SSE41 -# include "image.h" -# include <immintrin.h> -# include <assert.h> -# include <stdlib.h> +#include <immintrin.h> +#include <stdlib.h> + +#include "kvazaar.h" +#include "strategyselector.h" -static unsigned reg_sad_sse41(const kvz_pixel * const data1, const kvz_pixel * const data2, - const int width, const int height, const unsigned stride1, const unsigned stride2) +unsigned kvz_reg_sad_sse41(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int width, const int height, const unsigned stride1, const unsigned stride2) { int y, x; unsigned sad = 0; @@ -94,7 +94,7 @@ bool success = true; #if COMPILE_INTEL_SSE41 if (bitdepth == 8){ - success &= kvz_strategyselector_register(opaque, "reg_sad", "sse41", 20, ®_sad_sse41); + success &= kvz_strategyselector_register(opaque, "reg_sad", "sse41", 20, &kvz_reg_sad_sse41); } #endif return success;
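kvz_reg_sad_sse41 is renamed and exported above, presumably so other strategies (such as the AVX2 SAD fallback mentioned in the changelog) can reuse it. Its contract is the same as this plain reference loop: a sum of absolute differences over a width x height window, with an independent row stride per buffer (sketch only, not the SIMD code):

```c
#include <assert.h>
#include <stdlib.h>

/* Reference behaviour of the reg_sad strategies: sum of absolute
 * differences between two pixel windows with separate strides. */
static unsigned reg_sad_ref(const unsigned char *data1,
                            const unsigned char *data2,
                            int width, int height,
                            unsigned stride1, unsigned stride2)
{
    unsigned sad = 0;
    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
            sad += abs((int)data1[y * stride1 + x] -
                       (int)data2[y * stride2 + x]);
        }
    }
    return sad;
}
```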
kvazaar-0.8.3.tar.gz/src/strategies/sse41/picture-sse41.h -> kvazaar-1.0.0.tar.gz/src/strategies/sse41/picture-sse41.h
Changed
@@ -26,7 +26,11 @@
  * Optimizations for SSE4.1.
  */
-#include "global.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
+
+unsigned kvz_reg_sad_sse41(const kvz_pixel * const data1, const kvz_pixel * const data2,
+                           const int width, const int height, const unsigned stride1, const unsigned stride2);
 int kvz_strategy_register_picture_sse41(void* opaque, uint8_t bitdepth);
kvazaar-0.8.3.tar.gz/src/strategies/strategies-common.h -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-common.h
Changed
@@ -7,10 +7,7 @@
  * Common tools strategies.
  */
-#include "global.h"
+#include "global.h" // IWYU pragma: keep
-//Use with shuffle and permutation intrinsics.
-//Parameters are indices to packed elements. Each must be 0, 1, 2 or 3.
-#define KVZ_PERMUTE(a, b, c, d) ( (a << 0) | (b << 2) | (c << 4) | (d << 6) )
 #endif //STRATEGIES_COMMON_H_
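For context, the `KVZ_PERMUTE` macro deleted in the hunk above packed four 2-bit element indices into the 8-bit immediate consumed by x86 shuffle/permute intrinsics such as `_mm_shuffle_epi32`. A minimal sketch of the encoding it implemented:

```c
#include <assert.h>

/* Pack four element indices (each 0..3) into one shuffle immediate:
 * bits 1:0 pick the source element for output lane 0, bits 3:2 for
 * lane 1, and so on, matching the immediate layout of SSE/AVX
 * shuffle and permute instructions. */
#define KVZ_PERMUTE(a, b, c, d) ( ((a) << 0) | ((b) << 2) | ((c) << 4) | ((d) << 6) )
```

For example, `KVZ_PERMUTE(0, 1, 2, 3)` encodes the identity permutation and `KVZ_PERMUTE(3, 2, 1, 0)` a full lane reversal.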
kvazaar-0.8.3.tar.gz/src/strategies/strategies-dct.c -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-dct.c
Changed
@@ -19,8 +19,12 @@ ****************************************************************************/ #include "strategies/strategies-dct.h" + +#include "avx2/dct-avx2.h" +#include "generic/dct-generic.h" #include "strategyselector.h" + // Define function pointers. dct_func * kvz_fast_forward_dst_4x4 = 0; @@ -37,11 +41,6 @@ dct_func * kvz_idct_32x32 = 0; -// Headers for platform optimizations. -#include "generic/dct-generic.h" -#include "avx2/dct-avx2.h" - - int kvz_strategy_register_dct(void* opaque, uint8_t bitdepth) { bool success = true;
kvazaar-0.8.3.tar.gz/src/strategies/strategies-dct.h -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-dct.h
Changed
@@ -26,9 +26,8 @@
  * Interface for transform functions.
  */
-#include "global.h"
+#include "global.h" // IWYU pragma: keep
-#include <stdint.h>
 typedef unsigned (dct_func)(int8_t bitdepth, const int16_t *input, int16_t *output);
kvazaar-0.8.3.tar.gz/src/strategies/strategies-intra.c -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-intra.c
Changed
@@ -18,18 +18,17 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ -#include "strategies-intra.h" +#include "strategies/strategies-intra.h" + +#include "strategies/avx2/intra-avx2.h" +#include "strategies/generic/intra-generic.h" #include "strategyselector.h" + // Define function pointers. angular_pred_func *kvz_angular_pred; intra_pred_planar_func *kvz_intra_pred_planar; -// Headers for platform optimizations. -#include "generic/intra-generic.h" -#include "avx2/intra-avx2.h" - - int kvz_strategy_register_intra(void* opaque, uint8_t bitdepth) { bool success = true; @@ -38,5 +37,6 @@ if (kvz_g_hardware_flags.intel_flags.avx2) { success &= kvz_strategy_register_intra_avx2(opaque, bitdepth); } + return success; -} \ No newline at end of file +}
kvazaar-0.8.3.tar.gz/src/strategies/strategies-intra.h -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-intra.h
Changed
@@ -26,9 +26,9 @@
  * Interface for intra prediction functions.
  */
-#include "global.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
-#include "encoderstate.h"
 typedef void (angular_pred_func)(
   const int_fast8_t log2_width,
kvazaar-0.8.3.tar.gz/src/strategies/strategies-ipol.c -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-ipol.c
Changed
@@ -18,18 +18,23 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ -#include "strategies-ipol.h" +#include "strategies/strategies-ipol.h" + +#include "strategies/avx2/ipol-avx2.h" +#include "strategies/generic/ipol-generic.h" #include "strategyselector.h" + // Define function pointers. ipol_func *kvz_filter_inter_quarterpel_luma; ipol_func *kvz_filter_inter_halfpel_chroma; ipol_func *kvz_filter_inter_octpel_chroma; +ipol_frac_blocks_func *kvz_filter_frac_blocks_luma; epol_func *kvz_get_extended_block; - -// Headers for platform optimizations. -#include "generic/ipol-generic.h" -#include "avx2/ipol-avx2.h" +kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma; +kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma; +kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma; +kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma; int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth) { @@ -41,4 +46,4 @@ success &= kvz_strategy_register_ipol_avx2(opaque, bitdepth); } return success; -} \ No newline at end of file +}
kvazaar-0.8.3.tar.gz/src/strategies/strategies-ipol.h -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-ipol.h
Changed
@@ -26,26 +26,39 @@ * Interface for subpixel interpolation functions. */ -#include "global.h" - -#include <stdint.h> - #include "encoder.h" +#include "global.h" // IWYU pragma: keep +#include "kvazaar.h" +#include "search_inter.h" + typedef struct { kvz_pixel *buffer; kvz_pixel *orig_topleft; unsigned stride; unsigned malloc_used; } kvz_extended_block; typedef unsigned(ipol_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag); +typedef unsigned(ipol_frac_blocks_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height, + frac_search_block filtered_out[15], int8_t fme_level); + typedef unsigned(epol_func)(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, int filter_size, int width, int height, kvz_extended_block *out); +typedef void(kvz_sample_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); +typedef void(kvz_sample_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); + +typedef void(kvz_sample_14bit_quarterpel_luma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); +typedef void(kvz_sample_14bit_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); // Declare function pointers. 
extern ipol_func * kvz_filter_inter_quarterpel_luma; extern ipol_func * kvz_filter_inter_halfpel_chroma; extern ipol_func * kvz_filter_inter_octpel_chroma; +extern ipol_frac_blocks_func *kvz_filter_frac_blocks_luma; extern epol_func * kvz_get_extended_block; +extern kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma; +extern kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma; +extern kvz_sample_14bit_quarterpel_luma_func * kvz_sample_14bit_quarterpel_luma; +extern kvz_sample_14bit_octpel_chroma_func * kvz_sample_14bit_octpel_chroma; int kvz_strategy_register_ipol(void* opaque, uint8_t bitdepth); @@ -55,6 +68,11 @@ {"filter_inter_quarterpel_luma", (void**) &kvz_filter_inter_quarterpel_luma}, \ {"filter_inter_halfpel_chroma", (void**) &kvz_filter_inter_halfpel_chroma}, \ {"filter_inter_octpel_chroma", (void**) &kvz_filter_inter_octpel_chroma}, \ + {"filter_frac_blocks_luma", (void**) &kvz_filter_frac_blocks_luma}, \ + {"sample_quarterpel_luma", (void**) &kvz_sample_quarterpel_luma}, \ + {"sample_octpel_chroma", (void**) &kvz_sample_octpel_chroma}, \ + {"sample_14bit_quarterpel_luma", (void**) &kvz_sample_14bit_quarterpel_luma}, \ + {"sample_14bit_octpel_chroma", (void**) &kvz_sample_14bit_octpel_chroma}, \ {"get_extended_block", (void**) &kvz_get_extended_block}, \
kvazaar-0.8.3.tar.gz/src/strategies/strategies-nal.c -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-nal.c
Changed
@@ -18,14 +18,19 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ -#include "strategies-nal.h" +#include "strategies/strategies-nal.h" + +#include "strategies/generic/nal-generic.h" -#include "generic/nal-generic.h" void (*kvz_array_checksum)(const kvz_pixel* data, const int height, const int width, const int stride, unsigned char checksum_out[SEI_HASH_MAX_LENGTH], const uint8_t bitdepth); +void (*kvz_array_md5)(const kvz_pixel* data, + const int height, const int width, + const int stride, + unsigned char checksum_out[SEI_HASH_MAX_LENGTH], const uint8_t bitdepth); int kvz_strategy_register_nal(void* opaque, uint8_t bitdepth) {
kvazaar-0.8.3.tar.gz/src/strategies/strategies-nal.h -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-nal.h
Changed
@@ -26,9 +26,10 @@
  * Interface for hash functions.
  */
-#include "global.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
+#include "nal.h"
-#include "../nal.h"
 //Function pointer to kvz_array_checksum
 /**
@@ -43,12 +44,14 @@
                              const int stride,
                              unsigned char checksum_out[SEI_HASH_MAX_LENGTH], const uint8_t bitdepth);
 extern array_checksum_func kvz_array_checksum;
+extern array_checksum_func kvz_array_md5;
 
 int kvz_strategy_register_nal(void* opaque, uint8_t bitdepth);
 
 #define STRATEGIES_NAL_EXPORTS \
-  {"array_checksum", (void**) &kvz_array_checksum},
+  {"array_checksum", (void**) &kvz_array_checksum},\
+  {"array_md5", (void**) &kvz_array_md5},
 
 #endif //STRATEGIES_NAL_H_
kvazaar-0.8.3.tar.gz/src/strategies/strategies-picture.c -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-picture.c
Changed
@@ -18,9 +18,17 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ -#include "strategies-picture.h" +#include "strategies/strategies-picture.h" + +#include "strategies/altivec/picture-altivec.h" +#include "strategies/avx2/picture-avx2.h" +#include "strategies/generic/picture-generic.h" +#include "strategies/sse2/picture-sse2.h" +#include "strategies/sse41/picture-sse41.h" +#include "strategies/x86_asm/picture-x86-asm.h" #include "strategyselector.h" + // Define function pointers. reg_sad_func * kvz_reg_sad = 0; @@ -49,17 +57,7 @@ cost_pixel_nxn_multi_func * kvz_satd_64x64_dual = 0; cost_pixel_any_size_func * kvz_satd_any_size = 0; - -pixels_blit_func * kvz_pixels_blit = 0; - - -// Headers for platform optimizations. -#include "generic/picture-generic.h" -#include "sse2/picture-sse2.h" -#include "sse41/picture-sse41.h" -#include "avx2/picture-avx2.h" -#include "altivec/picture-altivec.h" -#include "x86_asm/picture-x86-asm.h" +cost_pixel_any_size_multi_func * kvz_satd_any_size_quad = 0; int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth) {
kvazaar-0.8.3.tar.gz/src/strategies/strategies-picture.h -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-picture.h
Changed
@@ -26,9 +26,9 @@ * Interface for distortion metric functions. */ -#include "global.h" +#include "global.h" // IWYU pragma: keep +#include "kvazaar.h" -#include "../image.h" typedef kvz_pixel (*pred_buffer)[32 * 32]; @@ -66,17 +66,38 @@ const kvz_pixel *block2, int stride2) \ { \ unsigned sum = 0; \ + if (width % 8 != 0) { \ + /* Process the first column using 4x4 blocks. */ \ + for (int y = 0; y < height; y += 4) { \ + sum += kvz_satd_4x4_subblock_ ## suffix(&block1[y * stride1], stride1, \ + &block2[y * stride2], stride2); \ + } \ + block1 += 4; \ + block2 += 4; \ + width -= 4; \ + } \ + if (height % 8 != 0) { \ + /* Process the first row using 4x4 blocks. */ \ + for (int x = 0; x < width; x += 4) { \ + sum += kvz_satd_4x4_subblock_ ## suffix(&block1[x], stride1, \ + &block2[x], stride2); \ + } \ + block1 += 4 * stride1; \ + block2 += 4 * stride2; \ + height -= 4; \ + } \ + /* The rest can now be processed with 8x8 blocks. */ \ for (int y = 0; y < height; y += 8) { \ const kvz_pixel *row1 = &block1[y * stride1]; \ const kvz_pixel *row2 = &block2[y * stride2]; \ for (int x = 0; x < width; x += 8) { \ - sum += satd_8x8_subblock_ ## suffix(&row1[x], stride1, &row2[x], stride2); \ + sum += satd_8x8_subblock_ ## suffix(&row1[x], stride1, \ + &row2[x], stride2); \ } \ } \ return sum >> (KVZ_BIT_DEPTH - 8); \ } - typedef unsigned(reg_sad_func)(const kvz_pixel *const data1, const kvz_pixel *const data2, const int width, const int height, const unsigned stride1, const unsigned stride2); @@ -87,10 +108,7 @@ const kvz_pixel *block2, int stride2 ); typedef void (cost_pixel_nxn_multi_func)(const pred_buffer preds, const kvz_pixel *orig, unsigned num_modes, unsigned *costs_out); - -typedef void pixels_blit_func(const kvz_pixel* orig, kvz_pixel *dst, - unsigned width, unsigned height, - unsigned orig_stride, unsigned dst_stride); +typedef void (cost_pixel_any_size_multi_func)(int width, int height, const kvz_pixel **preds, const int *strides, const kvz_pixel *orig, const int 
orig_stride, unsigned num_modes, unsigned *costs_out, int8_t *valid); // Declare function pointers. @@ -121,8 +139,7 @@ extern cost_pixel_nxn_multi_func * kvz_satd_32x32_dual; extern cost_pixel_nxn_multi_func * kvz_satd_64x64_dual; -extern pixels_blit_func * kvz_pixels_blit; - +extern cost_pixel_any_size_multi_func *kvz_satd_any_size_quad; int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth); cost_pixel_nxn_func * kvz_pixels_get_satd_func(unsigned n); @@ -153,7 +170,7 @@ {"satd_16x16_dual", (void**) &kvz_satd_16x16_dual}, \ {"satd_32x32_dual", (void**) &kvz_satd_32x32_dual}, \ {"satd_64x64_dual", (void**) &kvz_satd_64x64_dual}, \ - {"pixels_blit", (void**) &kvz_pixels_blit}, \ + {"satd_any_size_quad", (void**) &kvz_satd_any_size_quad}, \
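The `SATD_ANY_SIZE` macro in the hunk above gains support for widths and heights that are multiples of 4 but not of 8 (needed by the new SMP/AMP partitions): it peels a 4-pixel-wide first column and a 4-pixel-tall first row into 4x4 sub-blocks, then covers the remainder with 8x8 sub-blocks. The sketch below models only that partitioning logic, with the SATD kernels abstracted away; it is an illustration, not code from Kvazaar.

```c
#include <assert.h>

/* Count the sub-blocks the SATD_ANY_SIZE peeling produces and the pixel
 * area they cover. Order matches the macro: first column in 4x4 steps
 * when width % 8 != 0, then first row in 4x4 steps when height % 8 != 0,
 * then the rest in 8x8 steps. For any width and height that are
 * multiples of 4, the sub-blocks tile the block exactly. */
static int satd_partition(int width, int height, int *covered)
{
  int blocks = 0;
  *covered = 0;
  if (width % 8 != 0) {
    for (int y = 0; y < height; y += 4) { ++blocks; *covered += 4 * 4; }
    width -= 4;  /* the peeled column is done */
  }
  if (height % 8 != 0) {
    for (int x = 0; x < width; x += 4) { ++blocks; *covered += 4 * 4; }
    height -= 4; /* the peeled row is done */
  }
  for (int y = 0; y < height; y += 8) {
    for (int x = 0; x < width; x += 8) { ++blocks; *covered += 8 * 8; }
  }
  return blocks;
}
```

A 12x12 block, for instance, decomposes into three 4x4 blocks down the first column, two 4x4 blocks along the first row of the remaining 8x12 area, and one 8x8 block for the rest.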
kvazaar-0.8.3.tar.gz/src/strategies/strategies-quant.c -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-quant.c
Changed
@@ -18,18 +18,18 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ -#include "strategies-quant.h" +#include "strategies/strategies-quant.h" + +#include "strategies/avx2/quant-avx2.h" +#include "strategies/generic/quant-generic.h" #include "strategyselector.h" + // Define function pointers. quant_func *kvz_quant; quant_residual_func *kvz_quantize_residual; dequant_func *kvz_dequant; -// Headers for platform optimizations. -#include "generic/quant-generic.h" -#include "avx2/quant-avx2.h" - int kvz_strategy_register_quant(void* opaque, uint8_t bitdepth) { bool success = true;
kvazaar-0.8.3.tar.gz/src/strategies/strategies-quant.h -> kvazaar-1.0.0.tar.gz/src/strategies/strategies-quant.h
Changed
@@ -26,9 +26,12 @@
  * Interface for quantization functions.
  */
-#include "global.h"
-
+#include "cu.h"
 #include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"
+#include "tables.h"
+
 // Declare function pointers.
 typedef unsigned (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
kvazaar-1.0.0.tar.gz/src/strategies/strategies-sao.c
Added
@@ -0,0 +1,44 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "strategies/strategies-sao.h" +#include "strategies/avx2/sao-avx2.h" +#include "strategies/generic/sao-generic.h" +#include "strategyselector.h" + + +// Define function pointers. +sao_edge_ddistortion_func * kvz_sao_edge_ddistortion; +calc_sao_edge_dir_func * kvz_calc_sao_edge_dir; +sao_reconstruct_color_func * kvz_sao_reconstruct_color; +sao_band_ddistortion_func * kvz_sao_band_ddistortion; + + +int kvz_strategy_register_sao(void* opaque, uint8_t bitdepth) { + bool success = true; + + success &= kvz_strategy_register_sao_generic(opaque, bitdepth); + + if (kvz_g_hardware_flags.intel_flags.avx2) { + success &= kvz_strategy_register_sao_avx2(opaque, bitdepth); + } + + return success; +} \ No newline at end of file
kvazaar-1.0.0.tar.gz/src/strategies/strategies-sao.h
Added
@@ -0,0 +1,73 @@ +#ifndef STRATEGIES_SAO_H_ +#define STRATEGIES_SAO_H_ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Interface for sao functions. + */ + +#include "encoder.h" +#include "encoderstate.h" +#include "global.h" // IWYU pragma: keep +#include "kvazaar.h" +#include "sao.h" + + +// Declare function pointers. 
+typedef int (sao_edge_ddistortion_func)(const kvz_pixel *orig_data, const kvz_pixel *rec_data, + int block_width, int block_height, + int eo_class, int offsets[NUM_SAO_EDGE_CATEGORIES]); + +typedef void (calc_sao_edge_dir_func)(const kvz_pixel *orig_data, const kvz_pixel *rec_data, + int eo_class, int block_width, int block_height, + int cat_sum_cnt[2][NUM_SAO_EDGE_CATEGORIES]); + +typedef void (sao_reconstruct_color_func)(const encoder_control_t * const encoder, + const kvz_pixel *rec_data, kvz_pixel *new_rec_data, + const sao_info_t *sao, + int stride, int new_stride, + int block_width, int block_height, + color_t color_i); + +typedef int (sao_band_ddistortion_func)(const encoder_state_t * const state, const kvz_pixel *orig_data, const kvz_pixel *rec_data, + int block_width, int block_height, + int band_pos, int sao_bands[4]); + +// Declare function pointers. +extern sao_edge_ddistortion_func * kvz_sao_edge_ddistortion; +extern calc_sao_edge_dir_func * kvz_calc_sao_edge_dir; +extern sao_reconstruct_color_func * kvz_sao_reconstruct_color; +extern sao_band_ddistortion_func * kvz_sao_band_ddistortion; + +int kvz_strategy_register_sao(void* opaque, uint8_t bitdepth); + + +#define STRATEGIES_SAO_EXPORTS \ + {"sao_edge_ddistortion", (void**) &kvz_sao_edge_ddistortion}, \ + {"calc_sao_edge_dir", (void**) &kvz_calc_sao_edge_dir}, \ + {"sao_reconstruct_color", (void**) &kvz_sao_reconstruct_color}, \ + {"sao_band_ddistortion", (void**) &kvz_sao_band_ddistortion}, \ + + + +#endif //STRATEGIES_SAO_H_
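The `STRATEGIES_SAO_EXPORTS` table above follows the same pattern as the other strategy headers: each entry binds a strategy-type name to the address of the function pointer that the selector fills in at init time, and among all implementations registered under a name, the one with the highest priority wins (generic lowest, SIMD variants higher; the `sse41` registration earlier in this diff uses priority 20). A simplified model of that selection, with invented names:

```c
#include <assert.h>
#include <stddef.h>
#include <string.h>

/* One registered implementation: strategy type, ISA name, priority, and
 * the function pointer itself (stored as void* as in the real selector). */
struct impl {
  const char *type;
  const char *name;
  int priority;
  void *fptr;
};

/* Return the highest-priority implementation registered for `type`,
 * or NULL if none was registered (mirrors strategyselector_choose_for). */
static void *choose_for(const struct impl *impls, int n, const char *type)
{
  int best = -1;
  for (int i = 0; i < n; ++i) {
    if (strcmp(impls[i].type, type) != 0) continue;
    if (best == -1 || impls[i].priority > impls[best].priority) best = i;
  }
  return best == -1 ? NULL : impls[best].fptr;
}
```

Because the generic implementation is always registered first with the lowest priority, every strategy type has a portable fallback even when no SIMD variant is available.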
kvazaar-0.8.3.tar.gz/src/strategies/x86_asm/picture-x86-asm-sad.asm -> kvazaar-1.0.0.tar.gz/src/strategies/x86_asm/picture-x86-asm-sad.asm
Changed
@@ -291,3 +291,83 @@ vmovd eax, m4 RET + + +;KVZ_SAD_32x32_STRIDE +;Calculates SAD of a 32x32 block inside a frame with stride +;r0 address of the first value(current) +;r1 address of the first value(reference) +;r2 stride +cglobal sad_32x32_stride, 3, 3, 5 + vpxor m4, m4 + + ; Handle 2 lines per iteration + %rep 16 + vmovdqu m0, [r0] + vmovdqu m1, [r0 + 16] + vmovdqu m2, [r0 + r2] + vmovdqu m3, [r0 + r2 + 16] + lea r0, [r0 + 2 * r2] + + vpsadbw m0, [r1] + vpsadbw m1, [r1 + 16] + vpsadbw m2, [r1 + r2] + vpsadbw m3, [r1 + r2 + 16] + lea r1, [r1 + 2 * r2] + + vpaddd m4, m0 + vpaddd m4, m1 + vpaddd m4, m2 + vpaddd m4, m3 + %endrep + + vmovhlps m0, m4 + vpaddd m4, m0 + + vmovd eax, m4 + + RET + + +;KVZ_SAD_64x64_STRIDE +;Calculates SAD of a 64x64 block inside a frame with stride +;r0 address of the first value(current) +;r1 address of the first value(reference) +;r2 stride +cglobal sad_64x64_stride, 3, 4, 5 + vpxor m4, m4 ; sum accumulation register + mov r3, 4 ; number of iterations in the loop + +Process16Lines: + ; Intel optimization manual says to not unroll beyond 500 instructions. + ; Didn't seem to have much of an affect on Ivy Bridge or Haswell, but + ; smaller is better, when speed is the same, right? + %rep 16 + vmovdqu m0, [r0] + vmovdqu m1, [r0 + 1*16] + vmovdqu m2, [r0 + 2*16] + vmovdqu m3, [r0 + 3*16] + + vpsadbw m0, [r1] + vpsadbw m1, [r1 + 1*16] + vpsadbw m2, [r1 + 2*16] + vpsadbw m3, [r1 + 3*16] + + lea r0, [r0 + r2] + lea r1, [r1 + r2] + + vpaddd m4, m0 + vpaddd m4, m1 + vpaddd m4, m2 + vpaddd m4, m3 + %endrep + + dec r3 + jnz Process16Lines + + vmovhlps m0, m4 + vpaddd m4, m0 + + vmovd eax, m4 + + RET
kvazaar-0.8.3.tar.gz/src/strategies/x86_asm/picture-x86-asm-sad.h -> kvazaar-1.0.0.tar.gz/src/strategies/x86_asm/picture-x86-asm-sad.h
Changed
@@ -26,7 +26,8 @@
  * Optimizations for AVX, utilizing ASM implementations.
  */
-#include "global.h"
+#include "global.h" // IWYU pragma: keep
+
 unsigned kvz_sad_4x4_avx(const kvz_pixel*, const kvz_pixel*);
 unsigned kvz_sad_8x8_avx(const kvz_pixel*, const kvz_pixel*);
@@ -35,6 +36,8 @@
 unsigned kvz_sad_4x4_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
 unsigned kvz_sad_8x8_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
 unsigned kvz_sad_16x16_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
+unsigned kvz_sad_32x32_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
+unsigned kvz_sad_64x64_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride);
 
 #endif
kvazaar-0.8.3.tar.gz/src/strategies/x86_asm/picture-x86-asm-satd.h -> kvazaar-1.0.0.tar.gz/src/strategies/x86_asm/picture-x86-asm-satd.h
Changed
@@ -26,7 +26,8 @@
  * Optimizations for AVX, utilizing ASM implementations.
  */
-#include "global.h"
+#include "global.h" // IWYU pragma: keep
+
 unsigned kvz_satd_4x4_avx(const kvz_pixel *org, const kvz_pixel *cur);
 unsigned kvz_satd_8x8_avx(const kvz_pixel *org, const kvz_pixel *cur);
kvazaar-0.8.3.tar.gz/src/strategies/x86_asm/picture-x86-asm.c -> kvazaar-1.0.0.tar.gz/src/strategies/x86_asm/picture-x86-asm.c
Changed
@@ -18,13 +18,17 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ -#include <stdlib.h> -#include "strategyselector.h" +#include "strategies/x86_asm/picture-x86-asm.h" #if defined(KVZ_COMPILE_ASM) +#include <stdlib.h> + +#include "kvazaar.h" +#include "strategies/x86_asm/picture-x86-asm-sad.h" +#include "strategies/x86_asm/picture-x86-asm-satd.h" +#include "strategies/sse41/picture-sse41.h" +#include "strategyselector.h" -#include "picture-x86-asm-sad.h" -#include "picture-x86-asm-satd.h" static unsigned kvz_sad_32x32_avx(const kvz_pixel *data1, const kvz_pixel *data2) { @@ -36,16 +40,6 @@ return sad; } -static unsigned kvz_sad_32x32_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride) -{ - unsigned sad = 0; - sad += kvz_sad_16x16_stride_avx(data1, data2, stride); - sad += kvz_sad_16x16_stride_avx(data1 + 16, data2 + 16, stride); - sad += kvz_sad_16x16_stride_avx(data1 + 16 * stride, data2 + 16 * stride, stride); - sad += kvz_sad_16x16_stride_avx(data1 + 16 * stride + 16, data2 + 16 * stride + 16, stride); - return sad; -} - static unsigned kvz_sad_64x64_avx(const kvz_pixel *data1, const kvz_pixel *data2) { unsigned sad = 0; @@ -56,52 +50,50 @@ return sad; } -static unsigned kvz_sad_64x64_stride_avx(const kvz_pixel *data1, const kvz_pixel *data2, unsigned stride) +static unsigned kvz_sad_other_avx(const kvz_pixel *data1, const kvz_pixel *data2, + int width, int height, + unsigned stride) { unsigned sad = 0; - sad += kvz_sad_32x32_stride_avx(data1, data2, stride); - sad += kvz_sad_32x32_stride_avx(data1 + 32, data2 + 32, stride); - sad += kvz_sad_32x32_stride_avx(data1 + 32 * stride, data2 + 32 * stride, stride); - sad += kvz_sad_32x32_stride_avx(data1 + 32 * stride + 32, data2 + 32 * stride + 32, stride); - return sad; -} - -static unsigned kvz_sad_other_avx(const kvz_pixel * const data1, const kvz_pixel * const data2, - const int width, const int 
height, const unsigned stride1, const unsigned stride2) -{ - int y, x; - unsigned sad = 0; - for (y = 0; y < height; ++y) { - for (x = 0; x < width; ++x) { - sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]); + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + sad += abs(data1[y * stride + x] - data2[y * stride + x]); } } return sad; } -static unsigned reg_sad_x86_asm(const kvz_pixel * const data1, const kvz_pixel * const data2, -const int width, const int height, const unsigned stride1, const unsigned stride2) +static unsigned reg_sad_x86_asm(const kvz_pixel *data1, const kvz_pixel * data2, + const int width, const int height, + const unsigned stride1, const unsigned stride2) { - if (width == 4 && height == 4) { - return kvz_sad_4x4_stride_avx(data1, data2, stride1); - } else if (width == 8 && height == 8) { - return kvz_sad_8x8_stride_avx(data1, data2, stride1); - } else if (width == 16 && height == 16) { - return kvz_sad_16x16_stride_avx(data1, data2, stride1); - } else if (width == 32 && height == 32) { - return kvz_sad_32x32_stride_avx(data1, data2, stride1); - } else if (width == 64 && height == 64) { - return kvz_sad_64x64_stride_avx(data1, data2, stride1); + if (width == height) { + if (width == 8) { + return kvz_sad_8x8_stride_avx(data1, data2, stride1); + } else if (width == 16) { + return kvz_sad_16x16_stride_avx(data1, data2, stride1); + } else if (width == 32) { + return kvz_sad_32x32_stride_avx(data1, data2, stride1); + } else if (width == 64) { + return kvz_sad_64x64_stride_avx(data1, data2, stride1); + } + } + + if (width * height >= 16) { + // Call the vectorized general SAD SSE41 function when the block + // is big enough to make it worth it. 
+ return kvz_reg_sad_sse41(data1, data2, width, height, stride1, stride2); } else { - return kvz_sad_other_avx(data1, data2, width, height, stride1, stride2); + return kvz_sad_other_avx(data1, data2, width, height, stride1); } } #endif //defined(KVZ_COMPILE_ASM) -int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth) { +int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth) +{ bool success = true; #if defined(KVZ_COMPILE_ASM) if (bitdepth == 8){
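The rewritten `reg_sad_x86_asm` above dispatches in three tiers: a dedicated asm kernel for the exact square sizes that have one, the general vectorized SSE4.1 SAD when the block holds at least 16 pixels (enough to amortize the SIMD setup), and a scalar loop otherwise. A hedged sketch of just that dispatch decision (the enum names are mine, not Kvazaar's):

```c
#include <assert.h>

/* Which back end reg_sad_x86_asm would route a width x height block to. */
enum sad_backend { SAD_ASM_SQUARE, SAD_SSE41_GENERAL, SAD_SCALAR };

static enum sad_backend pick_sad_backend(int width, int height)
{
  /* Square sizes with a dedicated asm kernel. */
  if (width == height &&
      (width == 8 || width == 16 || width == 32 || width == 64)) {
    return SAD_ASM_SQUARE;
  }
  /* Big enough for the general vectorized path, e.g. SMP/AMP blocks. */
  if (width * height >= 16) {
    return SAD_SSE41_GENERAL;
  }
  /* Tiny blocks: plain scalar loop. */
  return SAD_SCALAR;
}
```

Note that non-square SMP/AMP shapes such as 16x8 now hit the vectorized general path instead of the scalar fallback, which is the speedup the hunk's comment refers to.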
kvazaar-0.8.3.tar.gz/src/strategies/x86_asm/picture-x86-asm.h -> kvazaar-1.0.0.tar.gz/src/strategies/x86_asm/picture-x86-asm.h
Changed
@@ -26,7 +26,8 @@
  * Optimizations for AVX, utilizing ASM implementations.
  */
-#include "global.h"
+#include "global.h" // IWYU pragma: keep
+
 int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth);
kvazaar-0.8.3.tar.gz/src/strategyselector.c -> kvazaar-1.0.0.tar.gz/src/strategyselector.c
Changed
@@ -20,14 +20,22 @@ #include "strategyselector.h" -#include <assert.h> -#include <string.h> +#include <stdio.h> #include <stdlib.h> -#if COMPILE_INTEL -#include <immintrin.h> +#include <string.h> + +#ifdef _WIN32 +#include <windows.h> +#elif MACOS +#include <sys/param.h> +#include <sys/sysctl.h> +#else +#include <unistd.h> #endif hardware_flags_t kvz_g_hardware_flags; +hardware_flags_t kvz_g_strategies_in_use; +hardware_flags_t kvz_g_strategies_available; static void set_hardware_flags(int32_t cpuid); static void* strategyselector_choose_for(const strategy_list_t * const strategies, const char * const strategy_type); @@ -75,6 +83,11 @@ fprintf(stderr, "kvz_strategy_register_intra failed!\n"); return 0; } + + if (!kvz_strategy_register_sao(&strategies, bitdepth)) { + fprintf(stderr, "kvz_strategy_register_sao failed!\n"); + return 0; + } while(cur_strategy_to_select->fptr) { *(cur_strategy_to_select->fptr) = strategyselector_choose_for(&strategies, cur_strategy_to_select->strategy_type); @@ -85,10 +98,118 @@ } ++cur_strategy_to_select; } - + //We can free the structure now, as all strategies are statically set to pointers if (strategies.allocated) { - free(strategies.strategies); + //Also check what optimizations are available and what are in use + //SIMD optimizations available + bool strategies_available = false; + fprintf(stderr, "Available: "); + if (kvz_g_strategies_available.intel_flags.avx != 0){ + fprintf(stderr, "avx(%d) ", kvz_g_strategies_available.intel_flags.avx); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.avx2 != 0){ + fprintf(stderr, "avx2(%d) ", kvz_g_strategies_available.intel_flags.avx2); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.mmx != 0) { + fprintf(stderr, "mmx(%d) ", kvz_g_strategies_available.intel_flags.mmx); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.sse != 0) { + fprintf(stderr, "sse(%d) ", 
kvz_g_strategies_available.intel_flags.sse); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.sse2 != 0) { + fprintf(stderr, "sse2(%d) ", kvz_g_strategies_available.intel_flags.sse2); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.sse3 != 0) { + fprintf(stderr, "sse3(%d) ", kvz_g_strategies_available.intel_flags.sse3); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.sse41 != 0) { + fprintf(stderr, "sse41(%d) ", kvz_g_strategies_available.intel_flags.sse41); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.sse42 != 0) { + fprintf(stderr, "sse42(%d) ", kvz_g_strategies_available.intel_flags.sse42); + strategies_available = true; + } + if (kvz_g_strategies_available.intel_flags.ssse3 != 0) { + fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_available.intel_flags.ssse3); + strategies_available = true; + } + if (kvz_g_strategies_available.arm_flags.neon != 0) { + fprintf(stderr, "neon(%d) ", kvz_g_strategies_available.arm_flags.neon); + strategies_available = true; + } + if (kvz_g_strategies_available.powerpc_flags.altivec != 0) { + fprintf(stderr, "altivec(%d) ", kvz_g_strategies_available.powerpc_flags.altivec); + strategies_available = true; + } + //If there is no strategies available + if (!strategies_available){ + fprintf(stderr, "no SIMD optimizations"); + } + fprintf(stderr, "\n"); + + //SIMD optimizations in use + bool strategies_in_use = false; + fprintf(stderr, "In use: "); + if (kvz_g_strategies_in_use.intel_flags.avx != 0){ + fprintf(stderr, "avx(%d) ", kvz_g_strategies_in_use.intel_flags.avx); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.avx2 != 0){ + fprintf(stderr, "avx2(%d) ", kvz_g_strategies_in_use.intel_flags.avx2); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.mmx != 0) { + fprintf(stderr, "mmx(%d) ", kvz_g_strategies_in_use.intel_flags.mmx); + strategies_in_use = true; 
+ } + if (kvz_g_strategies_in_use.intel_flags.sse != 0) { + fprintf(stderr, "sse(%d) ", kvz_g_strategies_in_use.intel_flags.sse); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.sse2 != 0) { + fprintf(stderr, "sse2(%d) ", kvz_g_strategies_in_use.intel_flags.sse2); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.sse3 != 0) { + fprintf(stderr, "sse3(%d) ", kvz_g_strategies_in_use.intel_flags.sse3); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.sse41 != 0) { + fprintf(stderr, "sse41(%d) ", kvz_g_strategies_in_use.intel_flags.sse41); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.sse42 != 0) { + fprintf(stderr, "sse42(%d) ", kvz_g_strategies_in_use.intel_flags.sse42); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.intel_flags.ssse3 != 0) { + fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_in_use.intel_flags.ssse3); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.arm_flags.neon != 0) { + fprintf(stderr, "neon(%d) ", kvz_g_strategies_in_use.arm_flags.neon); + strategies_in_use = true; + } + if (kvz_g_strategies_in_use.powerpc_flags.altivec != 0) { + fprintf(stderr, "altivec(%d) ", kvz_g_strategies_in_use.powerpc_flags.altivec); + strategies_in_use = true; + } + //If there is no strategies in use + if (!strategies_in_use){ + fprintf(stderr, "no SIMD optimizations"); + } + fprintf(stderr, "\n"); + + //Free memory + free(strategies.strategies); } return 1; @@ -115,6 +236,21 @@ new_strategy->priority = priority; new_strategy->fptr = fptr; } + + //Check what strategies are available when they are registered + if (strcmp(strategy_name, "avx") == 0) kvz_g_strategies_available.intel_flags.avx++; + if (strcmp(strategy_name, "x86_asm_avx") == 0) kvz_g_strategies_available.intel_flags.avx++; + if (strcmp(strategy_name, "avx2") == 0) kvz_g_strategies_available.intel_flags.avx2++; + if (strcmp(strategy_name, "mmx") == 0) 
kvz_g_strategies_available.intel_flags.mmx++; + if (strcmp(strategy_name, "sse") == 0) kvz_g_strategies_available.intel_flags.sse++; + if (strcmp(strategy_name, "sse2") == 0) kvz_g_strategies_available.intel_flags.sse2++; + if (strcmp(strategy_name, "sse3") == 0) kvz_g_strategies_available.intel_flags.sse3++; + if (strcmp(strategy_name, "sse41") == 0) kvz_g_strategies_available.intel_flags.sse41++; + if (strcmp(strategy_name, "sse42") == 0) kvz_g_strategies_available.intel_flags.sse42++; + if (strcmp(strategy_name, "ssse3") == 0) kvz_g_strategies_available.intel_flags.ssse3++; + if (strcmp(strategy_name, "altivec") == 0) kvz_g_strategies_available.powerpc_flags.altivec++; + if (strcmp(strategy_name, "neon") == 0) kvz_g_strategies_available.arm_flags.neon++; + #ifdef DEBUG_STRATEGYSELECTOR fprintf(stderr, "Registered strategy %s:%s with priority %d (%p)\n", type, strategy_name, priority, fptr); #endif //DEBUG_STRATEGYSELECTOR @@ -172,6 +308,20 @@ if (max_priority_i == -1) { return NULL; } + + //Check what strategy we are going to use + if (strcmp(strategies->strategies[max_priority_i].strategy_name, "avx") == 0) kvz_g_strategies_in_use.intel_flags.avx++; + if (strcmp(strategies->strategies[max_priority_i].strategy_name, "x86_asm_avx") == 0) kvz_g_strategies_in_use.intel_flags.avx++; + if (strcmp(strategies->strategies[max_priority_i].strategy_name, "avx2") == 0) kvz_g_strategies_in_use.intel_flags.avx2++; + if (strcmp(strategies->strategies[max_priority_i].strategy_name, "mmx") == 0) kvz_g_strategies_in_use.intel_flags.mmx++; + if (strcmp(strategies->strategies[max_priority_i].strategy_name, "sse") == 0) kvz_g_strategies_in_use.intel_flags.sse++; + if (strcmp(strategies->strategies[max_priority_i].strategy_name, "sse2") == 0) kvz_g_strategies_in_use.intel_flags.sse2++; + if (strcmp(strategies->strategies[max_priority_i].strategy_name, "sse3") == 0) kvz_g_strategies_in_use.intel_flags.sse3++; + if (strcmp(strategies->strategies[max_priority_i].strategy_name, "sse41") 
== 0) kvz_g_strategies_in_use.intel_flags.sse41++; + if (strcmp(strategies->strategies[max_priority_i].strategy_name, "sse42") == 0) kvz_g_strategies_in_use.intel_flags.sse42++; + if (strcmp(strategies->strategies[max_priority_i].strategy_name, "ssse3") == 0) kvz_g_strategies_in_use.intel_flags.ssse3++; + if (strcmp(strategies->strategies[max_priority_i].strategy_name, "altivec") == 0) kvz_g_strategies_in_use.powerpc_flags.altivec++; + if (strcmp(strategies->strategies[max_priority_i].strategy_name, "neon") == 0) kvz_g_strategies_in_use.arm_flags.neon++; return strategies->strategies[max_priority_i].fptr; } @@ -188,6 +338,7 @@ // CPUID adapters for different compilers. # if defined(__GNUC__) #include <cpuid.h> + static INLINE int get_cpuid(unsigned level, unsigned sublevel, cpuid_t *cpu_info) { if (__get_cpuid_max(level & 0x80000000, NULL) < level) return 0; __cpuid_count(level, sublevel, cpu_info->eax, cpu_info->ebx, cpu_info->ecx, cpu_info->edx); @@ -195,6 +346,7 @@ } # elif defined(_MSC_VER) #include <intrin.h> + static INLINE int get_cpuid(unsigned level, unsigned sublevel, cpuid_t *cpu_info) { int vendor_info[4] = { 0, 0, 0, 0 }; __cpuidex(vendor_info, 0, 0); @@ -220,8 +372,8 @@ #endif // COMPILE_INTEL #if COMPILE_POWERPC -#include <unistd.h> #include <fcntl.h> +#include <unistd.h> #include <linux/auxvec.h> #include <asm/cputable.h> @@ -267,6 +419,7 @@ CPUID1_EDX_MMX = 1 << 23, CPUID1_EDX_SSE = 1 << 25, CPUID1_EDX_SSE2 = 1 << 26, + CPUID1_EDX_HYPER_THREADING = 1 << 28, }; enum { CPUID1_ECX_SSE3 = 1 << 0, @@ -287,6 +440,21 @@ // Dig CPU features with cpuid get_cpuid(1, 0, &cpuid1); + +#ifdef _WIN32 + SYSTEM_INFO systeminfo; + GetSystemInfo(&systeminfo); + + kvz_g_hardware_flags.logical_cpu_count = systeminfo.dwNumberOfProcessors; +#else + kvz_g_hardware_flags.logical_cpu_count = sysconf(_SC_NPROCESSORS_ONLN); +#endif + + kvz_g_hardware_flags.physical_cpu_count = kvz_g_hardware_flags.logical_cpu_count; + kvz_g_hardware_flags.intel_flags.hyper_threading = 
cpuid1.edx & CPUID1_EDX_HYPER_THREADING; + if (kvz_g_hardware_flags.intel_flags.hyper_threading) { + kvz_g_hardware_flags.physical_cpu_count /= 2; + } // EDX if (cpuid1.edx & CPUID1_EDX_MMX) kvz_g_hardware_flags.intel_flags.mmx = 1; @@ -365,7 +533,6 @@ if (kvz_g_hardware_flags.intel_flags.avx) fprintf(stderr, " AVX"); if (kvz_g_hardware_flags.intel_flags.avx2) fprintf(stderr, " AVX2"); fprintf(stderr, "\n"); - #endif //COMPILE_INTEL #if COMPILE_POWERPC
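The strategy registry changed above boils down to a priority-ordered table of function pointers: every SIMD variant registers itself under a type name, and the selector picks the highest-priority entry whose instruction set the running CPU supports. A minimal standalone sketch of that pattern (hypothetical names and table layout, not kvazaar's actual API):

```c
#include <assert.h>
#include <stddef.h>
#include <string.h>

typedef struct {
  const char *type;          // function family, e.g. "sad_8x8"
  const char *strategy_name; // implementation flavor, e.g. "generic", "sse2"
  int priority;              // higher wins if the CPU supports it
  void *fptr;                // implementation to dispatch to
} strategy_t;

// Stand-ins for real function pointers, used only to tell entries apart.
static int impl_generic, impl_sse2, impl_avx2;

static const strategy_t example_table[] = {
  { "sad_8x8", "generic", 0,  &impl_generic },
  { "sad_8x8", "sse2",    20, &impl_sse2 },
  { "sad_8x8", "avx2",    40, &impl_avx2 },
};

// Return the highest-priority implementation of `type` whose strategy
// name appears in the NULL-terminated `supported` list.
static void *select_strategy(const strategy_t *table, size_t count,
                             const char *type,
                             const char *const *supported)
{
  void *best = NULL;
  int best_priority = -1;
  for (size_t i = 0; i < count; ++i) {
    if (strcmp(table[i].type, type) != 0) continue;
    for (const char *const *s = supported; *s; ++s) {
      if (strcmp(table[i].strategy_name, *s) == 0 &&
          table[i].priority > best_priority) {
        best_priority = table[i].priority;
        best = table[i].fptr;
      }
    }
  }
  return best;
}
```

Counting which names win this selection is exactly what the new `kvz_g_strategies_in_use` bookkeeping does, so `--version` can report the optimizations actually chosen at runtime.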
View file
kvazaar-0.8.3.tar.gz/src/strategyselector.h -> kvazaar-1.0.0.tar.gz/src/strategyselector.h
Changed
@@ -26,77 +26,12 @@ * Dynamic dispatch based on cpuid. */ -#include "global.h" +#include "global.h" // IWYU pragma: keep #if defined(KVZ_DEBUG) && !defined(DEBUG_STRATEGYSELECTOR) # define DEBUG_STRATEGYSELECTOR #endif -//Hardware data (abstraction of defines). Extend for other compilers - -#if defined(_M_IX86) || defined(__i586__) || defined(__i686__) || defined(_M_X64) || defined(_M_AMD64) || defined(__amd64__) || defined(__x86_64__) -# define COMPILE_INTEL 1 -#else -# define COMPILE_INTEL 0 -#endif - -// Visual Studio note: -// Because these macros are only used to guard code that is guarded by CPUID -// at runtime, use /arch parameter to disable them, but enable all intrinsics -// supported by VisualStudio if SSE2 (highest) is enabled. -// AVX and AVX2 are handled by /arch directly and sse intrinsics will use VEX -// versions if they are defined. -#define MSC_X86_SIMD(level) (_M_X64 || (_M_IX86_FP >= (level))) - -#if COMPILE_INTEL -# if defined(__MMX__) || MSC_X86_SIMD(1) -# define COMPILE_INTEL_MMX 1 -# endif -# if defined(__SSE__) || MSC_X86_SIMD(1) -# define COMPILE_INTEL_SSE 1 -# endif -# if defined(__SSE2__) || MSC_X86_SIMD(2) -# define COMPILE_INTEL_SSE2 1 -# endif -# if defined(__SSE3__) -# define COMPILE_INTEL_SSE3 1 -# endif -# if defined(__SSSE3__) || MSC_X86_SIMD(2) -# define COMPILE_INTEL_SSSE3 1 -# endif -# if defined(__SSE4_1__) || MSC_X86_SIMD(2) -# define COMPILE_INTEL_SSE41 1 -# endif -# if defined(__SSE4_2__) || MSC_X86_SIMD(2) -# define COMPILE_INTEL_SSE42 1 -# endif -# if defined(__AVX__) -# define COMPILE_INTEL_AVX 1 -# endif -# if defined(__AVX2__) -# define COMPILE_INTEL_AVX2 1 -# endif -#endif - -#if defined (_M_PPC) || defined(__powerpc64__) || defined(__powerpc__) -# define COMPILE_POWERPC 1 -# ifdef __ALTIVEC__ -# define COMPILE_POWERPC_ALTIVEC 1 -# else -# define COMPILE_POWERPC_ALTIVEC 0 -# endif -#else -# define COMPILE_POWERPC 0 -#endif - -#if defined (_M_ARM) || defined(__arm__) || defined(__thumb__) -# define COMPILE_ARM 1 
-#else -# define COMPILE_ARM 0 -#endif - - - typedef struct { const char *type; //Type of the function, usually its name const char *strategy_name; //Name of the strategy (e.g. sse2) @@ -106,7 +41,7 @@ typedef struct { unsigned int count; - unsigned int allocated; + unsigned int allocated;//How much memory is allocated strategy_t* strategies; } strategy_list_t; @@ -128,6 +63,8 @@ int sse42; int avx; int avx2; + + bool hyper_threading; } intel_flags; struct { @@ -137,10 +74,14 @@ struct { int neon; } arm_flags; + + int logical_cpu_count; + int physical_cpu_count; } hardware_flags_t; extern hardware_flags_t kvz_g_hardware_flags; - +extern hardware_flags_t kvz_g_strategies_in_use; +extern hardware_flags_t kvz_g_strategies_available; int kvz_strategyselector_init(int32_t cpuid, uint8_t bitdepth); int kvz_strategyselector_register(void *opaque, const char *type, const char *strategy_name, int priority, void *fptr); @@ -153,6 +94,7 @@ #include "strategies/strategies-ipol.h" #include "strategies/strategies-quant.h" #include "strategies/strategies-intra.h" +#include "strategies/strategies-sao.h" static const strategy_to_select_t strategies_to_select[] = { STRATEGIES_NAL_EXPORTS @@ -161,6 +103,7 @@ STRATEGIES_IPOL_EXPORTS STRATEGIES_QUANT_EXPORTS STRATEGIES_INTRA_EXPORTS + STRATEGIES_SAO_EXPORTS { NULL, NULL }, };
View file
kvazaar-0.8.3.tar.gz/src/tables.h -> kvazaar-1.0.0.tar.gz/src/tables.h
Changed
@@ -26,22 +26,8 @@ * Various tables. */ -#include "global.h" +#include "global.h" // IWYU pragma: keep -//4 8 16 32 64 128 -//0 1 2 3 4 5 -static const uint8_t g_to_bits[129] = -{ - 0, - 0,0,0,0, - 0,0,0,1, - 0,0,0,0,0,0,0,2, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5 -}; -#define TOBITS(len) g_to_bits[len] /*** * List of indices for 4x4 coefficient groups within 8x8 transform block.
View file
kvazaar-0.8.3.tar.gz/src/threadqueue.c -> kvazaar-1.0.0.tar.gz/src/threadqueue.c
Changed
@@ -17,21 +17,19 @@ * You should have received a copy of the GNU General Public License along * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ - -#include <assert.h> + +#include "threadqueue.h" + +#include <errno.h> // ETIMEDOUT #include <pthread.h> -#include <errno.h> //ETIMEDOUT +#include <stdio.h> #include <stdlib.h> #include <string.h> -#ifdef KVZ_DEBUG -#include <string.h> -#endif //KVZ_DEBUG - #include "global.h" -#include "threadqueue.h" #include "threads.h" + typedef struct { threadqueue_queue_t * threadqueue; int worker_id; @@ -74,36 +72,38 @@ } while (0); #endif //PTHREAD_DUMP -const struct timespec kvz_time_to_wait = {1, 0}; - -static void* threadqueue_worker(void* threadqueue_worker_spec_opaque) { +static void* threadqueue_worker(void* threadqueue_worker_spec_opaque) +{ threadqueue_worker_spec * const threadqueue_worker_spec = threadqueue_worker_spec_opaque; threadqueue_queue_t * const threadqueue = threadqueue_worker_spec->threadqueue; threadqueue_job_t * next_job = NULL; - + #ifdef KVZ_DEBUG KVZ_GET_TIME(&threadqueue->debug_clock_thread_start[threadqueue_worker_spec->worker_id]); #endif //KVZ_DEBUG for(;;) { - int i = 0; threadqueue_job_t * job = NULL; - + PTHREAD_LOCK(&threadqueue->lock); while(!threadqueue->stop && threadqueue->queue_waiting_execution == 0 && !next_job) { + // Wait until there is something to do in the queue. PTHREAD_COND_WAIT(&threadqueue->cond, &threadqueue->lock); } - + if(threadqueue->stop) { if (next_job) { + // Put a job we had already reserved back into the queue. + // FIXME: This lock should be unnecessary, as nobody else is allowed + // to touch this job when it's running. 
PTHREAD_LOCK(&next_job->lock);
+ next_job->state = THREADQUEUE_JOB_STATE_QUEUED;
+ PTHREAD_UNLOCK(&next_job->lock);
+ }
 break;
 }
-
+
 //Find a task (should be fast enough)
 job = NULL;
 if (next_job) {
@@ -113,13 +113,15 @@
 //FIXME: if not using OWF, the first is better than the second, otherwise we should use the second order
 //for (i = threadqueue->queue_count - 1; i >= threadqueue->queue_start; --i) {
 //for (i = threadqueue->queue_start; i < threadqueue->queue_count; ++i) {
-
- for (i = (threadqueue->fifo ? threadqueue->queue_start : threadqueue->queue_count - 1);
+
+ for (int i = (threadqueue->fifo ? threadqueue->queue_start : threadqueue->queue_count - 1);
 (threadqueue->fifo ? i < threadqueue->queue_count : i >= threadqueue->queue_start);
 (threadqueue->fifo ? ++i : --i)) {
 threadqueue_job_t * const i_job = threadqueue->queue[i];
-
+
 if (i_job->state == THREADQUEUE_JOB_STATE_QUEUED && i_job->ndepends == 0) {
+ // Once we find a job with no dependencies, lock it and change
+ // its state to running, so nobody else can claim it.
 PTHREAD_LOCK(&i_job->lock);
 if (i_job->state == THREADQUEUE_JOB_STATE_QUEUED && i_job->ndepends == 0) {
 job = i_job;
@@ -130,58 +132,69 @@
 }
 }
 }
-
- //Ok we got a job (and we have a lock on it)
- if (job) {
- int queue_waiting_dependency_decr, queue_waiting_execution_incr;
+ if (!job) {
+ // We have no job. Probably because more threads were woken up than
+ // there were jobs to do.
+ PTHREAD_UNLOCK(&threadqueue->lock);
+ } else {
+ // We have a job with ndepends == 0 and its state is running.
 assert(job->state == THREADQUEUE_JOB_STATE_RUNNING);
-
- //Move the queue_start "pointer" if needed
- while (threadqueue->queue_start < threadqueue->queue_count && threadqueue->queue[threadqueue->queue_start]->state != THREADQUEUE_JOB_STATE_QUEUED) threadqueue->queue_start++;
+
+ // Advance queue_start to skip all the running jobs.
+ while (threadqueue->queue_start < threadqueue->queue_count &&
+        threadqueue->queue[threadqueue->queue_start]->state != THREADQUEUE_JOB_STATE_QUEUED)
+ {
+   threadqueue->queue_start++;
+ }
 if (!next_job) {
 --threadqueue->queue_waiting_execution;
 ++threadqueue->queue_running;
 }
-
- //Unlock the queue
+
 PTHREAD_UNLOCK(&threadqueue->lock);
-
+
 #ifdef KVZ_DEBUG
 job->debug_worker_id = threadqueue_worker_spec->worker_id;
 KVZ_GET_TIME(&job->debug_clock_start);
 #endif //KVZ_DEBUG
-
+
 job->fptr(job->arg);
-
+
 #ifdef KVZ_DEBUG
 job->debug_worker_id = threadqueue_worker_spec->worker_id;
 KVZ_GET_TIME(&job->debug_clock_stop);
 #endif //KVZ_DEBUG
-
- //Re-lock the job to update its status and treat its dependencies
+
+ // FIXME: This lock should be unnecessary, as nobody else is allowed
+ // to touch this job when it's running.
 PTHREAD_LOCK(&job->lock);
 assert(job->state == THREADQUEUE_JOB_STATE_RUNNING);
-
+
 job->state = THREADQUEUE_JOB_STATE_DONE;
-
+
 next_job = NULL;
-
- queue_waiting_dependency_decr = 0;
- queue_waiting_execution_incr = 0;
- //Decrease counter of dependencies
- for (i = 0; i < job->rdepends_count; ++i) {
+
+ int queue_waiting_dependency_decr = 0;
+ int queue_waiting_execution_incr = 0;
+
+ // Go through all the jobs that depend on this one, decreasing their ndepends.
+ for (int i = 0; i < job->rdepends_count; ++i) {
 threadqueue_job_t * const depjob = job->rdepends[i];
- //Note that we lock the dependency AFTER locking the source. This avoids a deadlock in dep_add
+ // Note that we lock the dependency AFTER locking the source. This avoids a deadlock in dep_add.
 PTHREAD_LOCK(&depjob->lock);
-
+
 assert(depjob->state == THREADQUEUE_JOB_STATE_QUEUED);
 assert(depjob->ndepends > 0);
 --depjob->ndepends;
-
+
+ // Count how many jobs can now start executing so we know how many
+ // threads to wake up.
 if (depjob->ndepends == 0) {
 if (!next_job) {
+ // Avoid having to find a new job for this worker through the
+ // queue by taking one of the jobs that depended on the current job.
 next_job = depjob;
 depjob->state = THREADQUEUE_JOB_STATE_RUNNING;
 } else {
@@ -189,30 +202,37 @@
 }
 ++queue_waiting_dependency_decr;
 }
-
+
 PTHREAD_UNLOCK(&depjob->lock);
 }
- //Unlock the job
- PTHREAD_UNLOCK(&job->lock);
- //Signal the queue that we've done a job
+
+ PTHREAD_UNLOCK(&job->lock);
+
 PTHREAD_LOCK(&threadqueue->lock);
- if (!next_job) threadqueue->queue_running--;
+ assert(threadqueue->queue_waiting_dependency >= queue_waiting_dependency_decr);
+
+ if (!next_job) {
+ // We didn't find a new job, so this thread will have to go wait.
+ threadqueue->queue_running--;
+ }
 threadqueue->queue_waiting_dependency -= queue_waiting_dependency_decr;
 threadqueue->queue_waiting_execution += queue_waiting_execution_incr;
-
- for (i = 0; i < queue_waiting_execution_incr; ++i) {
+
+ // Wake up enough threads to take care of the tasks now lacking dependencies.
+ for (int i = 0; i < queue_waiting_execution_incr; ++i) {
 PTHREAD_COND_SIGNAL(&threadqueue->cond);
 }
- //We only signal cb_cond since we finished a job
+
+ // Signal main thread that a job has been completed.
 pthread_cond_signal(&threadqueue->cb_cond);
- PTHREAD_UNLOCK(&threadqueue->lock);
- } else {
+ PTHREAD_UNLOCK(&threadqueue->lock);
 }
 }
- //We got out of the loop because threadqueue->stop == 1. The queue is locked.
+ // We got out of the loop because threadqueue->stop == 1. The queue is locked.
assert(threadqueue->stop); --threadqueue->threads_running; @@ -230,6 +250,8 @@ return NULL; } + + int kvz_threadqueue_init(threadqueue_queue_t * const threadqueue, int thread_count, int fifo) { int i; if (pthread_mutex_init(&threadqueue->lock, NULL) != 0) { @@ -438,10 +460,10 @@ if (notdone > 0) { int ret; PTHREAD_COND_BROADCAST(&(threadqueue->cond)); - PTHREAD_UNLOCK(&threadqueue->lock); - KVZ_SLEEP(); - PTHREAD_LOCK(&threadqueue->lock); - ret = pthread_cond_timedwait(&threadqueue->cb_cond, &threadqueue->lock, &kvz_time_to_wait); + + struct timespec wait_moment; + ms_from_now_timespec(&wait_moment, 100); + ret = pthread_cond_timedwait(&threadqueue->cb_cond, &threadqueue->lock, &wait_moment); if (ret != 0 && ret != ETIMEDOUT) { fprintf(stderr, "pthread_cond_timedwait failed!\n"); assert(0); @@ -476,10 +498,9 @@ if (!job_done) { int ret; PTHREAD_COND_BROADCAST(&(threadqueue->cond)); - PTHREAD_UNLOCK(&threadqueue->lock); - KVZ_SLEEP(); - PTHREAD_LOCK(&threadqueue->lock); - ret = pthread_cond_timedwait(&threadqueue->cb_cond, &threadqueue->lock, &kvz_time_to_wait); + struct timespec wait_moment; + ms_from_now_timespec(&wait_moment, 100); + ret = pthread_cond_timedwait(&threadqueue->cb_cond, &threadqueue->lock, &wait_moment); if (ret != 0 && ret != ETIMEDOUT) { fprintf(stderr, "pthread_cond_timedwait failed!\n"); assert(0);
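The reworked worker loop wakes exactly as many threads as there are jobs whose dependency count dropped to zero, instead of sleeping and polling. The accounting itself is simple; here is a standalone sketch of the `ndepends` bookkeeping with a hypothetical job struct (locking and the queue omitted for clarity):

```c
#include <assert.h>

typedef struct job job_t;
struct job {
  int ndepends;        // number of unfinished jobs this one waits on
  int rdepends_count;  // number of jobs that wait on this one
  job_t *rdepends[8];  // reverse dependencies (fixed-size for the sketch)
};

// Mark `job` done and return how many reverse dependencies became
// runnable, i.e. how many worker threads should be signaled.
static int complete_job(job_t *job)
{
  int newly_ready = 0;
  for (int i = 0; i < job->rdepends_count; ++i) {
    job_t *dep = job->rdepends[i];
    assert(dep->ndepends > 0);
    if (--dep->ndepends == 0) {
      ++newly_ready;
    }
  }
  return newly_ready;
}
```

In the real code the returned count feeds the `PTHREAD_COND_SIGNAL` loop, and the worker keeps one newly ready job for itself (`next_job`) to avoid re-scanning the queue.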
View file
kvazaar-0.8.3.tar.gz/src/threadqueue.h -> kvazaar-1.0.0.tar.gz/src/threadqueue.h
Changed
@@ -26,10 +26,9 @@ * Container for worker tasks. */ -#include "global.h" - #include <pthread.h> -#include "threads.h" + +#include "global.h" // IWYU pragma: keep typedef enum { THREADQUEUE_JOB_STATE_QUEUED = 0, @@ -132,7 +131,6 @@ #define KVZ_PERF_SAOREC (1 << 3) #define KVZ_PERF_BSLEAF (1 << 4) #define KVZ_PERF_SEARCHCU (1 << 5) -#define KVZ_PERF_SEARCHPX (1 << 6) #define IMPL_PERFORMANCE_MEASURE_START(mask) KVZ_CLOCK_T start, stop; if ((KVZ_DEBUG) & mask) { KVZ_GET_TIME(&start); } #define IMPL_PERFORMANCE_MEASURE_END(mask, threadqueue, str, ...) { if ((KVZ_DEBUG) & mask) { KVZ_GET_TIME(&stop); {char job_description[256]; sprintf(job_description, (str), __VA_ARGS__); threadqueue_log((threadqueue), &start, &stop, job_description);}} } \
View file
kvazaar-0.8.3.tar.gz/src/threads.h -> kvazaar-1.0.0.tar.gz/src/threads.h
Changed
@@ -26,20 +26,24 @@ * Abstractions for operating system specific stuff. */ -#include "global.h" +#include "global.h" // IWYU pragma: keep #include <pthread.h> +#define E3 1000 +#define E9 1000000000 +#define FILETIME_TO_EPOCH 0x19DB1DED53E8000LL + #if defined(__GNUC__) && !defined(__MINGW32__) -#include <unistd.h> -#include <time.h> +#include <unistd.h> // IWYU pragma: export +#include <time.h> // IWYU pragma: export #define KVZ_CLOCK_T struct timespec #ifdef __MACH__ // Workaround Mac OS not having clock_gettime. -#include <mach/clock.h> -#include <mach/mach.h> +#include <mach/clock.h> // IWYU pragma: export +#include <mach/mach.h> // IWYU pragma: export #define KVZ_GET_TIME(clock_t) { \ clock_serv_t cclock; \ mach_timespec_t mts; \ @@ -53,31 +57,68 @@ #define KVZ_GET_TIME(clock_t) { clock_gettime(CLOCK_MONOTONIC, (clock_t)); } #endif -#define KVZ_CLOCK_T_AS_DOUBLE(ts) ((double)((ts).tv_sec) + (double)((ts).tv_nsec) / (double)1000000000L) -#define KVZ_CLOCK_T_DIFF(start, stop) ((double)((stop).tv_sec - (start).tv_sec) + (double)((stop).tv_nsec - (start).tv_nsec) / (double)1000000000L) +#define KVZ_CLOCK_T_AS_DOUBLE(ts) ((double)((ts).tv_sec) + (double)((ts).tv_nsec) / 1e9) +#define KVZ_CLOCK_T_DIFF(start, stop) ((double)((stop).tv_sec - (start).tv_sec) + (double)((stop).tv_nsec - (start).tv_nsec) / 1e9) + +static INLINE struct timespec * ms_from_now_timespec(struct timespec * result, int wait_ms) +{ + KVZ_GET_TIME(result); + int64_t secs = result->tv_sec + wait_ms / E3; + int64_t nsecs = result->tv_nsec + (wait_ms % E3) * (E9 / E3); + + if (nsecs >= E9) { + secs += 1; + nsecs -= E9; + } + + result->tv_sec = secs; + result->tv_nsec = nsecs; + + return result; +} #define KVZ_ATOMIC_INC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, 1) #define KVZ_ATOMIC_DEC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, -1) -#define KVZ_SLEEP() usleep(0) #else //__GNUC__ //TODO: we assume !GCC => Windows... 
this may be bad -#include <windows.h> +#include <windows.h> // IWYU pragma: export #define KVZ_CLOCK_T struct _FILETIME #define KVZ_GET_TIME(clock_t) { GetSystemTimeAsFileTime(clock_t); } // _FILETIME has 32bit low and high part of 64bit 100ns resolution timestamp (since 12:00 AM January 1, 1601) -#define KVZ_CLOCK_T_AS_DOUBLE(ts) ((double)(((uint64_t)(ts).dwHighDateTime)<<32 | (uint64_t)(ts).dwLowDateTime) / (double)10000000L) +#define KVZ_CLOCK_T_AS_DOUBLE(ts) ((double)(((uint64_t)(ts).dwHighDateTime)<<32 | (uint64_t)(ts).dwLowDateTime) / 1e7) #define KVZ_CLOCK_T_DIFF(start, stop) ((double)((((uint64_t)(stop).dwHighDateTime)<<32 | (uint64_t)(stop).dwLowDateTime) - \ - (((uint64_t)(start).dwHighDateTime)<<32 | (uint64_t)(start).dwLowDateTime)) / (double)10000000L) + (((uint64_t)(start).dwHighDateTime)<<32 | (uint64_t)(start).dwLowDateTime)) / 1e7) + +static INLINE struct timespec * ms_from_now_timespec(struct timespec * result, int wait_ms) +{ + KVZ_CLOCK_T now; + KVZ_GET_TIME(&now); + int64_t moment_100ns = (int64_t)now.dwHighDateTime << 32 | (int64_t)now.dwLowDateTime; + moment_100ns -= (int64_t)FILETIME_TO_EPOCH; + + int64_t secs = moment_100ns / (E9 / 100) + (wait_ms / E3); + int64_t nsecs = (moment_100ns % (E9 / 100))*100 + ((wait_ms % E3) * (E9 / E3)); + + if (nsecs >= E9) { + secs += 1; + nsecs -= E9; + } + + result->tv_sec = secs; + result->tv_nsec = nsecs; + + return result; +} #define KVZ_ATOMIC_INC(ptr) InterlockedIncrement((volatile LONG*)ptr) #define KVZ_ATOMIC_DEC(ptr) InterlockedDecrement((volatile LONG*)ptr) -// Sleep(0) results in bad performance on Windows for some reason, -// As a work around sleep for 10ms. -#define KVZ_SLEEP() Sleep(10) #endif //__GNUC__ +#undef E9 +#undef E3 + #endif //THREADS_H_
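Replacing `KVZ_SLEEP()` with `pthread_cond_timedwait` requires an absolute deadline, so `ms_from_now_timespec` must carry overflow from the nanosecond field into seconds. The same arithmetic in isolation (an illustrative re-implementation, not the header's exact code):

```c
#include <assert.h>
#include <stdint.h>
#include <time.h>

// Add wait_ms milliseconds to *t, normalizing tv_nsec into [0, 1e9).
static void timespec_add_ms(struct timespec *t, int wait_ms)
{
  int64_t secs  = t->tv_sec + wait_ms / 1000;
  int64_t nsecs = t->tv_nsec + (int64_t)(wait_ms % 1000) * 1000000;

  // At most one carry is possible because wait_ms % 1000 < 1000.
  if (nsecs >= 1000000000) {
    secs += 1;
    nsecs -= 1000000000;
  }
  t->tv_sec = (time_t)secs;
  t->tv_nsec = (long)nsecs;
}
```

With the deadline computed this way, the worker sleeps on the condition variable and either wakes when signaled or returns `ETIMEDOUT`, which is why the old `KVZ_SLEEP()` macro could be deleted.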
View file
kvazaar-0.8.3.tar.gz/src/transform.c -> kvazaar-1.0.0.tar.gz/src/transform.c
Changed
@@ -20,17 +20,20 @@ #include "transform.h" -#include <string.h> -#include <stdio.h> -#include <stdlib.h> -#include <assert.h> - -#include "nal.h" +#include "image.h" +#include "kvazaar.h" #include "rdo.h" #include "strategies/strategies-dct.h" #include "strategies/strategies-quant.h" -#include "strategies/generic/quant-generic.h" -#include "strategies/strategies-picture.h" +#include "tables.h" + +/** + * \brief RDPCM direction. + */ +typedef enum rdpcm_dir { + RDPCM_VER = 0, // vertical + RDPCM_HOR = 1, // horizontal +} rdpcm_dir; ////////////////////////////////////////////////////////////////////////// // INITIALIZATIONS @@ -50,6 +53,76 @@ // /** + * \brief Bypass transform and quantization. + * + * Copies the reference pixels directly to reconstruction and the residual + * directly to coefficients. Used when cu_transquant_bypass_flag is set. + * Parameters pred_in and rec_out may be aliased. + * + * \param width Transform width. + * \param in_stride Stride for ref_in and pred_in + * \param out_stride Stride for rec_out and coeff_out. + * \param ref_in Reference pixels. + * \param pred_in Predicted pixels. + * \param rec_out Returns the reconstructed pixels. + * \param coeff_out Returns the coefficients used for reconstruction of rec_out. + * + * \returns Whether coeff_out contains any non-zero coefficients. + */ +static bool bypass_transquant(const int width, + const int in_stride, + const int out_stride, + const kvz_pixel *const ref_in, + const kvz_pixel *const pred_in, + kvz_pixel *rec_out, + coeff_t *coeff_out) +{ + bool nonzero_coeffs = false; + + for (int y = 0; y < width; ++y) { + for (int x = 0; x < width; ++x) { + int32_t in_idx = x + y * in_stride; + int32_t out_idx = x + y * out_stride; + + // The residual must be computed before writing to rec_out because + // pred_in and rec_out may point to the same array. 
+ coeff_t coeff = (coeff_t)(ref_in[in_idx] - pred_in[in_idx]); + coeff_out[out_idx] = coeff; + rec_out[out_idx] = ref_in[in_idx]; + + nonzero_coeffs |= (coeff != 0); + } + } + + return nonzero_coeffs; +} + +/** + * Apply DPCM to residual. + * + * \param width width of the block + * \param stride stride of coeff array + * \param dir RDPCM direction + * \param coeff coefficients (residual) to filter + */ +static void rdpcm(const int width, + const int stride, + const rdpcm_dir dir, + coeff_t *coeff) +{ + const int offset = (dir == RDPCM_HOR) ? 1 : stride; + const int min_x = (dir == RDPCM_HOR) ? 1 : 0; + const int min_y = (dir == RDPCM_HOR) ? 0 : 1; + + for (int y = width - 1; y >= min_y; y--) { + for (int x = width - 1; x >= min_x; x--) { + const int index = x + y * stride; + coeff[index] -= coeff[index - offset]; + } + } +} + +/** * \brief Get scaled QP used in quantization * */ @@ -158,7 +231,7 @@ int has_coeffs; } skip, noskip, *best; - const int bit_cost = (int)(state->global->cur_lambda_cost+0.5); + const int bit_cost = (int)(state->frame->cur_lambda_cost+0.5); noskip.has_coeffs = kvz_quantize_residual( state, cur_cu, width, color, scan_order, @@ -212,13 +285,12 @@ * - lcu->cbf coded block flags for the area * - lcu->cu.intra[].tr_skip for the area */ -void kvz_quantize_lcu_luma_residual(encoder_state_t * const state, int32_t x, int32_t y, const uint8_t depth, cu_info_t *cur_cu, lcu_t* lcu) +void kvz_quantize_lcu_luma_residual(encoder_state_t * const state, int32_t x, int32_t y, const uint8_t depth, cu_info_t *cur_pu, lcu_t* lcu) { // we have 64>>depth transform size const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) }; - const int pu_index = PU_INDEX(lcu_px.x / 4, lcu_px.y / 4); - if (cur_cu == NULL) { - cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); + if (cur_pu == NULL) { + cur_pu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); } const int8_t width = LCU_WIDTH>>depth; @@ -227,7 +299,7 @@ assert(width == 4 || width == 8 || width == 16 || width == 32 || 
width == 64); // Split transform and increase depth - if (depth == 0 || cur_cu->tr_depth > depth) { + if (depth == 0 || cur_pu->tr_depth > depth) { int offset = width / 2; kvz_quantize_lcu_luma_residual(state, x, y, depth+1, NULL, lcu); kvz_quantize_lcu_luma_residual(state, x + offset, y, depth+1, NULL, lcu); @@ -235,13 +307,13 @@ kvz_quantize_lcu_luma_residual(state, x + offset, y + offset, depth+1, NULL, lcu); // Propagate coded block flags from child CUs to parent CU. - if (depth < MAX_DEPTH) { - cu_info_t *cu_a = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y); - cu_info_t *cu_b = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y + offset); - cu_info_t *cu_c = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset); - if (cbf_is_set(cu_a->cbf.y, depth+1) || cbf_is_set(cu_b->cbf.y, depth+1) || cbf_is_set(cu_c->cbf.y, depth+1)) { - cbf_set(&cur_cu->cbf.y, depth); - } + if (depth <= MAX_DEPTH) { + uint16_t child_cbfs[3] = { + LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y )->cbf, + LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y + offset)->cbf, + LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf, + }; + cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_Y); } return; @@ -257,7 +329,7 @@ // Pointers to current location in arrays with kvantized coefficients. coeff_t *orig_coeff_y = &lcu->coeff.y[luma_offset]; - coeff_scan_order_t scan_idx_luma = kvz_get_scan_order(cur_cu->type, cur_cu->intra[pu_index].mode, depth); + coeff_scan_order_t scan_idx_luma = kvz_get_scan_order(cur_pu->type, cur_pu->intra.mode, depth); #if OPTIMIZATION_SKIP_RESIDUAL_ON_THRESHOLD uint32_t residual_sum = 0; @@ -266,30 +338,45 @@ // Clear coded block flag structures for depths lower than current depth. // This should ensure that the CBF data doesn't get corrupted if this function // is called more than once. 
- cbf_clear(&cur_cu->cbf.y, depth + pu_index); + cbf_clear(&cur_pu->cbf, depth, COLOR_Y); - if (width == 4 && - state->encoder_control->trskip_enable) - { + + if (state->encoder_control->cfg->lossless) { + if (bypass_transquant(width, + LCU_WIDTH, LCU_WIDTH, + base_y, recbase_y, + recbase_y, orig_coeff_y)) { + cbf_set(&cur_pu->cbf, depth, COLOR_Y); + } + if (state->encoder_control->cfg->implicit_rdpcm && cur_pu->type == CU_INTRA) { + // implicit rdpcm for horizontal and vertical intra modes + if (cur_pu->intra.mode == 10) { + rdpcm(width, LCU_WIDTH, RDPCM_HOR, orig_coeff_y); + + } else if (cur_pu->intra.mode == 26) { + rdpcm(width, LCU_WIDTH, RDPCM_VER, orig_coeff_y); + } + } + } else if (width == 4 && state->encoder_control->trskip_enable) { // Try quantization with trskip and use it if it's better. int has_coeffs = kvz_quantize_residual_trskip( - state, cur_cu, width, COLOR_Y, scan_idx_luma, - &cur_cu->intra[pu_index].tr_skip, + state, cur_pu, width, COLOR_Y, scan_idx_luma, + &cur_pu->intra.tr_skip, LCU_WIDTH, LCU_WIDTH, base_y, recbase_y, recbase_y, orig_coeff_y ); if (has_coeffs) { - cbf_set(&cur_cu->cbf.y, depth + pu_index); + cbf_set(&cur_pu->cbf, depth, COLOR_Y); } } else { int has_coeffs = kvz_quantize_residual( - state, cur_cu, width, COLOR_Y, scan_idx_luma, + state, cur_pu, width, COLOR_Y, scan_idx_luma, 0, LCU_WIDTH, LCU_WIDTH, base_y, recbase_y, recbase_y, orig_coeff_y ); if (has_coeffs) { - cbf_set(&cur_cu->cbf.y, depth + pu_index); + cbf_set(&cur_pu->cbf, depth, COLOR_Y); } } } @@ -300,7 +387,6 @@ { // we have 64>>depth transform size const vector2d_t lcu_px = { SUB_SCU(x), SUB_SCU(y) }; - const int pu_index = PU_INDEX(lcu_px.x / 4, lcu_px.y / 4); const int8_t width = LCU_WIDTH>>depth; if (cur_cu == NULL) { cur_cu = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y); @@ -320,15 +406,13 @@ // Propagate coded block flags from child CUs to parent CU. 
if (depth < MAX_DEPTH) { - cu_info_t *cu_a = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y); - cu_info_t *cu_b = LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y + offset); - cu_info_t *cu_c = LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset); - if (cbf_is_set(cu_a->cbf.u, depth+1) || cbf_is_set(cu_b->cbf.u, depth+1) || cbf_is_set(cu_c->cbf.u, depth+1)) { - cbf_set(&cur_cu->cbf.u, depth); - } - if (cbf_is_set(cu_a->cbf.v, depth+1) || cbf_is_set(cu_b->cbf.v, depth+1) || cbf_is_set(cu_c->cbf.v, depth+1)) { - cbf_set(&cur_cu->cbf.v, depth); - } + uint16_t child_cbfs[3] = { + LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y )->cbf, + LCU_GET_CU_AT_PX(lcu, lcu_px.x, lcu_px.y + offset)->cbf, + LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf, + }; + cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_U); + cbf_set_conditionally(&cur_cu->cbf, child_cbfs, depth, COLOR_V); } return; @@ -336,9 +420,9 @@ // If luma is 4x4, do chroma for the 8x8 luma area when handling the top // left PU because the coordinates are correct. - if (depth <= MAX_DEPTH || pu_index == 0) { - cbf_clear(&cur_cu->cbf.u, depth); - cbf_clear(&cur_cu->cbf.v, depth); + if (depth <= MAX_DEPTH || (lcu_px.x % 8 == 0 && lcu_px.y % 8 == 0)) { + cbf_clear(&cur_cu->cbf, depth, COLOR_U); + cbf_clear(&cur_cu->cbf, depth, COLOR_V); const int chroma_offset = lcu_px.x / 2 + lcu_px.y / 2 * LCU_WIDTH_C; kvz_pixel *recbase_u = &lcu->rec.u[chroma_offset]; @@ -352,12 +436,39 @@ int chroma_depth = (depth == MAX_PU_DEPTH ? 
depth - 1 : depth); int chroma_width = LCU_WIDTH_C >> chroma_depth; - scan_idx_chroma = kvz_get_scan_order(cur_cu->type, cur_cu->intra[0].mode_chroma, depth); - if (kvz_quantize_residual(state, cur_cu, chroma_width, COLOR_U, scan_idx_chroma, tr_skip, LCU_WIDTH_C, LCU_WIDTH_C, base_u, recbase_u, recbase_u, orig_coeff_u)) { - cbf_set(&cur_cu->cbf.u, depth); - } - if (kvz_quantize_residual(state, cur_cu, chroma_width, COLOR_V, scan_idx_chroma, tr_skip, LCU_WIDTH_C, LCU_WIDTH_C, base_v, recbase_v, recbase_v, orig_coeff_v)) { - cbf_set(&cur_cu->cbf.v, depth); + scan_idx_chroma = kvz_get_scan_order(cur_cu->type, cur_cu->intra.mode_chroma, depth); + + if (state->encoder_control->cfg->lossless) { + if (bypass_transquant(chroma_width, + LCU_WIDTH_C, LCU_WIDTH_C, + base_u, recbase_u, + recbase_u, orig_coeff_u)) { + cbf_set(&cur_cu->cbf, depth, COLOR_U); + } + if (bypass_transquant(chroma_width, + LCU_WIDTH_C, LCU_WIDTH_C, + base_v, recbase_v, + recbase_v, orig_coeff_v)) { + cbf_set(&cur_cu->cbf, depth, COLOR_V); + } + if (state->encoder_control->cfg->implicit_rdpcm && cur_cu->type == CU_INTRA) { + // implicit rdpcm for horizontal and vertical intra modes + if (cur_cu->intra.mode_chroma == 10) { + rdpcm(chroma_width, LCU_WIDTH_C, RDPCM_HOR, orig_coeff_u); + rdpcm(chroma_width, LCU_WIDTH_C, RDPCM_HOR, orig_coeff_v); + + } else if (cur_cu->intra.mode_chroma == 26) { + rdpcm(chroma_width, LCU_WIDTH_C, RDPCM_VER, orig_coeff_u); + rdpcm(chroma_width, LCU_WIDTH_C, RDPCM_VER, orig_coeff_v); + } + } + } else { + if (kvz_quantize_residual(state, cur_cu, chroma_width, COLOR_U, scan_idx_chroma, tr_skip, LCU_WIDTH_C, LCU_WIDTH_C, base_u, recbase_u, recbase_u, orig_coeff_u)) { + cbf_set(&cur_cu->cbf, depth, COLOR_U); + } + if (kvz_quantize_residual(state, cur_cu, chroma_width, COLOR_V, scan_idx_chroma, tr_skip, LCU_WIDTH_C, LCU_WIDTH_C, base_v, recbase_v, recbase_v, orig_coeff_v)) { + cbf_set(&cur_cu->cbf, depth, COLOR_V); + } } } }
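The new `rdpcm` filter replaces each residual sample with its difference from the previous sample along the chosen direction; a decoder undoes it with a running sum, so the bypass path stays lossless. A small sketch of the horizontal case and its inverse (the inverse is illustrative and not part of the patch above):

```c
#include <assert.h>
#include <stdint.h>

typedef int16_t coeff_t;

// Forward horizontal RDPCM: iterate from the end of each row so every
// sample still reads its unmodified left neighbour.
static void rdpcm_hor(int width, int stride, coeff_t *coeff)
{
  for (int y = width - 1; y >= 0; y--) {
    for (int x = width - 1; x >= 1; x--) {
      coeff[x + y * stride] -= coeff[(x - 1) + y * stride];
    }
  }
}

// Inverse: accumulate left-to-right to recover the original residual.
static void rdpcm_hor_inverse(int width, int stride, coeff_t *coeff)
{
  for (int y = 0; y < width; y++) {
    for (int x = 1; x < width; x++) {
      coeff[x + y * stride] += coeff[(x - 1) + y * stride];
    }
  }
}
```

This matches why the patch only applies RDPCM for the horizontal (mode 10) and vertical (mode 26) intra modes: the prediction direction and the DPCM direction line up, so the differences tend to be small and cheap to code.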
kvazaar-0.8.3.tar.gz/src/transform.h -> kvazaar-1.0.0.tar.gz/src/transform.h
Changed
@@ -26,10 +26,11 @@
  * Quantization and transform functions.
  */

-#include "global.h"
-
+#include "cu.h"
 #include "encoder.h"
 #include "encoderstate.h"
+#include "global.h" // IWYU pragma: keep
+

 extern const uint8_t kvz_g_chroma_scale[58];
 extern const int16_t kvz_g_inv_quant_scales[6];
kvazaar-0.8.3.tar.gz/src/videoframe.c -> kvazaar-1.0.0.tar.gz/src/videoframe.c
Changed
@@ -18,19 +18,23 @@
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/

+#include "videoframe.h"
+
 #include <stdlib.h>
-#include <string.h>

+#include "image.h"
 #include "sao.h"
-#include "threads.h"
-#include "videoframe.h"
+

 /**
  * \brief Allocate new frame
  * \param pic picture pointer
  * \return picture pointer
  */
-videoframe_t *kvz_videoframe_alloc(const int32_t width, const int32_t height, const int32_t poc) {
+videoframe_t * kvz_videoframe_alloc(int32_t width,
+                                    int32_t height,
+                                    enum kvz_chroma_format chroma_format)
+{
   videoframe_t *frame = MALLOC(videoframe_t, 1);
   if (!frame) return 0;
@@ -45,16 +49,17 @@
   if (frame->height_in_lcu * LCU_WIDTH < frame->height) frame->height_in_lcu++;

   {
-    // Allocate height_in_scu x width_in_scu x sizeof(CU_info)
-    unsigned height_in_scu = frame->height_in_lcu << MAX_DEPTH;
-    unsigned width_in_scu = frame->width_in_lcu << MAX_DEPTH;
-    frame->cu_array = kvz_cu_array_alloc(width_in_scu, height_in_scu);
+    unsigned cu_array_width = frame->width_in_lcu * LCU_WIDTH;
+    unsigned cu_array_height = frame->height_in_lcu * LCU_WIDTH;
+    frame->cu_array = kvz_cu_array_alloc(cu_array_width, cu_array_height);
   }

   frame->coeff_y = NULL; frame->coeff_u = NULL; frame->coeff_v = NULL;

   frame->sao_luma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu);
-  frame->sao_chroma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu);
+  if (chroma_format != KVZ_CSP_400) {
+    frame->sao_chroma = MALLOC(sao_info_t, frame->width_in_lcu * frame->height_in_lcu);
+  }

   return frame;
 }
@@ -89,18 +94,16 @@
   frame->poc = poc;
 }

-const cu_info_t* kvz_videoframe_get_cu_const(const videoframe_t * const frame, unsigned int x_in_scu, unsigned int y_in_scu)
+const cu_info_t* kvz_videoframe_get_cu_const(const videoframe_t * const frame,
+                                             unsigned int x_in_scu,
+                                             unsigned int y_in_scu)
 {
-  assert(x_in_scu < (frame->width_in_lcu << MAX_DEPTH));
-  assert(y_in_scu < (frame->height_in_lcu << MAX_DEPTH));
-
-  return &frame->cu_array->data[x_in_scu + y_in_scu * (frame->width_in_lcu << MAX_DEPTH)];
+  return kvz_cu_array_at_const(frame->cu_array, x_in_scu << 3, y_in_scu << 3);
 }

-cu_info_t* kvz_videoframe_get_cu(videoframe_t * const frame, const unsigned int x_in_scu, const unsigned int y_in_scu)
+cu_info_t* kvz_videoframe_get_cu(videoframe_t * const frame,
+                                 const unsigned int x_in_scu,
+                                 const unsigned int y_in_scu)
 {
-  assert(x_in_scu < (frame->width_in_lcu << MAX_DEPTH));
-  assert(y_in_scu < (frame->height_in_lcu << MAX_DEPTH));
-
-  return &frame->cu_array->data[x_in_scu + y_in_scu * (frame->width_in_lcu << MAX_DEPTH)];
+  return kvz_cu_array_at(frame->cu_array, x_in_scu << 3, y_in_scu << 3);
 }
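The rewritten accessors above pass pixel coordinates (`x_in_scu << 3`) to the new `kvz_cu_array_at` helpers, because the CU array still stores one entry per 8x8 SCU but is now addressed in pixels. A hedged sketch of the conversion behind such an accessor (struct and names are hypothetical, not kvazaar's implementation):

```c
#include <stdint.h>

#define SCU_WIDTH 8  // one SCU covers an 8x8 pixel area

typedef struct {
  int data[64];       // one entry per SCU; an 8x8 grid of SCUs for illustration
  int stride_in_scu;  // SCUs per row
} cu_array_sketch;

// Look up the SCU entry covering the given pixel coordinate.
static int *cu_array_at(cu_array_sketch *a, int x_px, int y_px)
{
  int x_scu = x_px / SCU_WIDTH;
  int y_scu = y_px / SCU_WIDTH;
  return &a->data[x_scu + y_scu * a->stride_in_scu];
}
```

Any two pixel coordinates inside the same 8x8 area resolve to the same entry, which is why the old per-SCU getters can be reduced to one shift plus the shared pixel-indexed lookup.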
kvazaar-0.8.3.tar.gz/src/videoframe.h -> kvazaar-1.0.0.tar.gz/src/videoframe.h
Changed
@@ -26,12 +26,10 @@
  * \brief Container for the frame currently being encoded.
  */

-#include "global.h"
-
 #include "cu.h"
-#include "image.h"
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"

-struct sao_info_t;

 /**
  * \brief Struct which contains all picture data
@@ -57,7 +55,7 @@
 } videoframe_t;

-videoframe_t *kvz_videoframe_alloc(int32_t width, int32_t height, int32_t poc);
+videoframe_t *kvz_videoframe_alloc(int32_t width, int32_t height, enum kvz_chroma_format chroma_format);
 int kvz_videoframe_free(videoframe_t * const frame);

 void kvz_videoframe_set_poc(videoframe_t * frame, int32_t poc);
kvazaar-0.8.3.tar.gz/src/yuv_io.c -> kvazaar-1.0.0.tar.gz/src/yuv_io.c
Changed
@@ -42,7 +42,7 @@

 static int read_and_fill_frame_data(FILE *file,
-                                    unsigned width, unsigned height,
+                                    unsigned width, unsigned height, unsigned bytes_per_sample,
                                     unsigned array_width, kvz_pixel *data)
 {
   kvz_pixel* p = data;
@@ -52,7 +52,7 @@
   while (p < end) {
     // Read the beginning of the line from input.
-    if (width != fread(p, sizeof(unsigned char), width, file))
+    if (width != fread(p, bytes_per_sample, width, file))
       return 0;

     // Fill the rest with the last pixel value.
@@ -68,21 +68,104 @@
 }

-/**
- * \brief Convert 8 bit (single byte per pixel) to 10bit (two bytes per pixel) array
- *
- * \param input input/output buffer
- * \return 1
- */
-int frame_8bit_to_10bit(kvz_pixel* input, int width, int height) {
-  uint8_t* temp_buffer = (uint8_t*)input;
-  const uint32_t pixels = width*height;
-  for (int i = pixels - 1; i >= 0; i--) {
-    input[i] = temp_buffer[i] << 2;
+static void swap_16b_buffer_bytes(kvz_pixel* input, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    input[i] = ((input[i] & 0xff) << 8) + ((input[i] & 0xff00) >> 8);
+  }
+}
+
+
+static void shift_to_bitdepth(kvz_pixel* input, int size, int from_bitdepth, int to_bitdepth)
+{
+  int shift = to_bitdepth - from_bitdepth;
+  for (int i = 0; i < size; ++i) {
+    // Shifting by a negative number is undefined.
+    if (shift > 0) {
+      input[i] <<= shift;
+    } else {
+      input[i] >>= shift;
+    }
+  }
+}
+
+
+// Shift and copy 1-byte aligned samples to 2-byte aligned array
+static void shift_to_bitdepth_and_spread(kvz_pixel *input,
+                                         int size,
+                                         int from_bitdepth,
+                                         int to_bitdepth)
+{
+  assert(sizeof(kvz_pixel) > 1);
+  int shift = to_bitdepth - from_bitdepth;
+  unsigned char *byte_buf = (unsigned char *)input;
+
+  // Starting from the back of the 1-byte samples, copy each sample to it's
+  // place in the 2-byte per sample array, overwriting the bytes that have
+  // already been copied in the process.
+  // Even though the two pointers are aliased, this should work because the
+  // future values read through byte_buf poiner never change as a result of
+  // writing through input pointer.
+  for (int i = size - 1; i >= 0; --i) {
+    // Shifting by a negative number is undefined.
+    if (shift > 0) {
+      input[i] = byte_buf[i] << shift;
+    } else {
+      input[i] = byte_buf[i] >> shift;
+    }
   }
+}
+
+
+bool machine_is_big_endian()
+{
+  uint16_t number = 1;
+  char first_byte = *(char*)&number;
+
+  return (first_byte != 0);
+}
+
+
+static int yuv_io_read_plane(
+    FILE* file,
+    unsigned in_width, unsigned in_height, unsigned in_bitdepth,
+    unsigned out_width, unsigned out_height, unsigned out_bitdepth,
+    kvz_pixel *out_buf)
+{
+  unsigned bytes_per_sample = in_bitdepth > 8 ? 2 : 1;
+  unsigned buf_length = in_width * in_height;
+  unsigned buf_bytes = buf_length * bytes_per_sample;
+
+  if (in_width == out_width) {
+    // No need to extend pixels.
+    const size_t pixel_size = sizeof(unsigned char);
+    if (fread(out_buf, pixel_size, buf_bytes, file) != buf_bytes) return 0;
+  } else {
+    // Need to copy pixels to fill the image in horizontal direction.
+    if (!read_and_fill_frame_data(file, in_width, in_height, bytes_per_sample, out_width, out_buf)) return 0;
+  }
+
+  if (in_height != out_height) {
+    // Need to copy pixels to fill the image in vertical direction.
+    fill_after_frame(in_height, out_width, out_height, out_buf);
+  }
+
+  if (in_bitdepth > 8) {
+    if (machine_is_big_endian()) {
+      swap_16b_buffer_bytes(out_buf, buf_length);
+    }
+  }
+
+  if (in_bitdepth <= 8 && out_bitdepth > 8) {
+    shift_to_bitdepth_and_spread(out_buf, buf_length, in_bitdepth, out_bitdepth);
+  } else if (in_bitdepth != out_bitdepth) {
+    shift_to_bitdepth(out_buf, buf_length, in_bitdepth, out_bitdepth);
+  }

   return 1;
 }

+
 /**
  * \brief Read a single frame from a file.
  *
@@ -97,46 +180,43 @@
  * \return 1 on success, 0 on failure
  */
 int yuv_io_read(FILE* file,
-                unsigned input_width, unsigned input_height,
+                unsigned in_width, unsigned out_width,
+                unsigned in_bitdepth, unsigned out_bitdepth,
                 kvz_picture *img_out)
 {
-  assert(input_width % 2 == 0);
-  assert(input_height % 2 == 0);
+  assert(in_width % 2 == 0);
+  assert(out_width % 2 == 0);

-  const unsigned y_size = input_width * input_height;
-  const unsigned uv_input_width = input_width / 2;
-  const unsigned uv_input_height = input_height / 2;
-  const unsigned uv_size = uv_input_width * uv_input_height;
+  int ok;

-  const unsigned uv_array_width = img_out->width / 2;
-  const unsigned uv_array_height = img_out->height / 2;
+  ok = yuv_io_read_plane(
+      file,
+      in_width, out_width, in_bitdepth,
+      img_out->width, img_out->height, out_bitdepth,
+      img_out->y);
+  if (!ok) return 0;

-  if (input_width == img_out->width) {
-    // No need to extend pixels.
-    const size_t pixel_size = sizeof(unsigned char);
-    if (fread(img_out->y, pixel_size, y_size, file) != y_size) return 0;
-    if (fread(img_out->u, pixel_size, uv_size, file) != uv_size) return 0;
-    if (fread(img_out->v, pixel_size, uv_size, file) != uv_size) return 0;
-  } else {
-    // Need to copy pixels to fill the image in horizontal direction.
-    if (!read_and_fill_frame_data(file, input_width, input_height, img_out->width, img_out->y)) return 0;
-    if (!read_and_fill_frame_data(file, uv_input_width, uv_input_height, uv_array_width, img_out->u)) return 0;
-    if (!read_and_fill_frame_data(file, uv_input_width, uv_input_height, uv_array_width, img_out->v)) return 0;
-  }
-
-  if (input_height != img_out->height) {
-    // Need to copy pixels to fill the image in vertical direction.
-    fill_after_frame(input_height, img_out->width, img_out->height, img_out->y);
-    fill_after_frame(uv_input_height, uv_array_width, uv_array_height, img_out->u);
-    fill_after_frame(uv_input_height, uv_array_width, uv_array_height, img_out->v);
-  }
+  if (img_out->chroma_format != KVZ_CSP_400) {
+    unsigned uv_width_in = in_width / 2;
+    unsigned uv_height_in = out_width / 2;
+    unsigned uv_width_out = img_out->width / 2;
+    unsigned uv_height_out = img_out->height / 2;

-#if KVZ_BIT_DEPTH == 10
-  frame_8bit_to_10bit(img_out->y, img_out->width, img_out->height);
-  frame_8bit_to_10bit(img_out->u, img_out->width >> 1, img_out->height >> 1);
-  frame_8bit_to_10bit(img_out->v, img_out->width >> 1, img_out->height >> 1);
-#endif
+    ok = yuv_io_read_plane(
+        file,
+        uv_width_in, uv_height_in, in_bitdepth,
+        uv_width_out, uv_height_out, out_bitdepth,
+        img_out->u);
+    if (!ok) return 0;

+    ok = yuv_io_read_plane(
+        file,
+        uv_width_in, uv_height_in, in_bitdepth,
+        uv_width_out, uv_height_out, out_bitdepth,
+        img_out->v);
+    if (!ok) return 0;
+  }
+
   return 1;
 }

@@ -194,11 +274,14 @@
     fwrite(&img->y[y * width], sizeof(*img->y), output_width, file);
     // TODO: Check that fwrite succeeded.
   }
-  for (int y = 0; y < output_height / 2; ++y) {
-    fwrite(&img->u[y * width / 2], sizeof(*img->u), output_width / 2, file);
-  }
-  for (int y = 0; y < output_height / 2; ++y) {
-    fwrite(&img->v[y * width / 2], sizeof(*img->v), output_width / 2, file);
+
+  if (img->chroma_format != KVZ_CSP_400) {
+    for (int y = 0; y < output_height / 2; ++y) {
+      fwrite(&img->u[y * width / 2], sizeof(*img->u), output_width / 2, file);
+    }
+    for (int y = 0; y < output_height / 2; ++y) {
+      fwrite(&img->v[y * width / 2], sizeof(*img->v), output_width / 2, file);
+    }
   }

   return 1;
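The new plane reader above byte-swaps >8-bit samples on big-endian hosts, since the YUV file stores each 16-bit sample in a fixed byte order. A minimal standalone sketch of the two building blocks (mirroring `machine_is_big_endian` and `swap_16b_buffer_bytes` from the diff, but as a self-contained illustration):

```c
#include <stdint.h>

// Detect host byte order at runtime by inspecting the first byte of a
// known 16-bit value: on a little-endian machine it holds the low byte.
static int is_big_endian(void)
{
  uint16_t number = 1;
  return *(unsigned char *)&number == 0;
}

// Swap the two bytes of one 16-bit sample.
static uint16_t swap16(uint16_t v)
{
  return (uint16_t)(((v & 0x00ffu) << 8) | ((v & 0xff00u) >> 8));
}
```

The same mask-and-shift expression works regardless of host endianness, which is why the reader only needs the runtime check to decide whether to apply it to the whole buffer.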
kvazaar-0.8.3.tar.gz/src/yuv_io.h -> kvazaar-1.0.0.tar.gz/src/yuv_io.h
Changed
@@ -26,10 +26,14 @@
  * \brief Functions related to reading YUV input and output.
  */

-#include "global.h"
+#include <stdio.h>
+
+#include "global.h" // IWYU pragma: keep
+#include "kvazaar.h"

 int yuv_io_read(FILE* file,
                 unsigned input_width, unsigned input_height,
+                unsigned from_bitdepth, unsigned to_bitdepth,
                 kvz_picture *img_out);

 int yuv_io_seek(FILE* file, unsigned frames,
kvazaar-0.8.3.tar.gz/tests/mv_cand_tests.c -> kvazaar-1.0.0.tar.gz/tests/mv_cand_tests.c
Changed
@@ -17,46 +17,206 @@
  * along with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/

+#include "src/inter.c"
+
 #include <string.h>

 #include "greatest/greatest.h"

-#include "src/cu.h"
-#include "src/inter.h"
-

 TEST test_get_spatial_merge_cand(void)
 {
   lcu_t lcu;
   memset(&lcu, 0, sizeof(lcu));
   for (int i = 0; i < sizeof(lcu.cu) / sizeof(cu_info_t); i++) {
-    lcu.cu[i].coded = 1;
     lcu.cu[i].type = CU_INTER;
   }
-  lcu.cu[20].coded = 1;
-  lcu.cu[22].coded = 1;
-  lcu.cu[23].coded = 1;
-  lcu.cu[56].coded = 1;
-  lcu.cu[65].coded = 1;

   cu_info_t *mv_cand[5] = { NULL };

-  kvz_inter_get_spatial_merge_candidates(16, 16, // x, y
-                                         16, 32, // width, height
-                                         &mv_cand[0], // b0
-                                         &mv_cand[1], // b1
-                                         &mv_cand[2], // b2
-                                         &mv_cand[3], // a0
-                                         &mv_cand[4], // a1
-                                         &lcu);
-
-  ASSERT_EQ(mv_cand[0], &lcu.cu[23]); // b0
-  ASSERT_EQ(mv_cand[1], &lcu.cu[22]); // b1
-  ASSERT_EQ(mv_cand[2], &lcu.cu[20]); // b2
-  ASSERT_EQ(mv_cand[3], &lcu.cu[65]); // a0
-  ASSERT_EQ(mv_cand[4], &lcu.cu[56]); // a1
+  get_spatial_merge_candidates(64 + 32, 64, // x, y
+                               32, 24,      // width, height
+                               1920, 1080,  // picture size
+                               &mv_cand[0], // b0
+                               &mv_cand[1], // b1
+                               &mv_cand[2], // b2
+                               &mv_cand[3], // a0
+                               &mv_cand[4], // a1
+                               &lcu);
+
+  ASSERT_EQ(mv_cand[0], &lcu.cu[289]); // b0
+  ASSERT_EQ(mv_cand[1], &lcu.cu[ 16]); // b1
+  ASSERT_EQ(mv_cand[2], &lcu.cu[ 8]);  // b2
+  ASSERT_EQ(mv_cand[3], &lcu.cu[127]); // a0
+  ASSERT_EQ(mv_cand[4], &lcu.cu[110]); // a1
+
+  PASS();
+}
+
+TEST test_is_a0_cand_coded()
+{
+  //  +--+--+
+  //  |##|  |
+  //  +--+--+
+  //  |  |  |
+  //  +--+--+
+  ASSERT_EQ(is_a0_cand_coded(32, 64, 16, 16), true);
+  // Same as above with a 2NxN block
+  ASSERT_EQ(is_a0_cand_coded(32, 64, 32, 16), true);
+  // Same as above with a 2NxnU block
+  ASSERT_EQ(is_a0_cand_coded(32, 64, 32, 8), true);
+  // Same as above with a 2NxnD block
+  ASSERT_EQ(is_a0_cand_coded(32, 64, 32, 24), true);
+
+  //  +--+--+
+  //  |  |##|
+  //  +--+--+
+  //  |  |  |
+  //  +--+--+
+  ASSERT_EQ(is_a0_cand_coded(16, 0, 16, 16), false);
+
+  //  +--+--+
+  //  |  |  |
+  //  +--+--+
+  //  |  |##|
+  //  +--+--+
+  ASSERT_EQ(is_a0_cand_coded(48, 16, 16, 16), false);
+  // Same as above with a Nx2N block
+  ASSERT_EQ(is_a0_cand_coded(48, 0, 16, 32), false);
+  // Same as above with a nLx2N block
+  ASSERT_EQ(is_a0_cand_coded(40, 0, 24, 32), false);
+  // Same as above with a nRx2N block
+  ASSERT_EQ(is_a0_cand_coded(56, 0, 8, 32), false);
+
+  //  +-----+--+--+
+  //  |     |  |  |
+  //  |     +--+--+
+  //  |     |##|  |
+  //  +-----+--+--+
+  //  |     |     |
+  //  |     |     |
+  //  |     |     |
+  //  +-----+-----+
+  ASSERT_EQ(is_a0_cand_coded(32, 16, 16, 16), false);
+
+  // Same as above with a 2NxnU block
+  ASSERT_EQ(is_a0_cand_coded(32, 8, 32, 24), false);
+  // Same as above with a 2NxnD block
+  ASSERT_EQ(is_a0_cand_coded(32, 24, 32, 8), false);
+
+  // Same as above with a Nx2N block
+  ASSERT_EQ(is_a0_cand_coded(32, 0, 16, 32), false);
+  // Same as above with a nLx2N block
+  ASSERT_EQ(is_a0_cand_coded(32, 0, 8, 32), false);
+  // Same as above with a nRx2N block
+  ASSERT_EQ(is_a0_cand_coded(32, 0, 24, 32), false);
+
+  //  +--+--+-----+
+  //  |  |  |     |
+  //  +--+--+     |
+  //  |##|  |     |
+  //  +--+--+-----+
+  //  |     |     |
+  //  |     |     |
+  //  |     |     |
+  //  +-----+-----+
+  ASSERT_EQ(is_a0_cand_coded(32, 8, 8, 8), true);
+
+  // Same as above with a 2NxnU block
+  ASSERT_EQ(is_a0_cand_coded(32, 4, 16, 12), true);
+  // Same as above with a 2NxnD block
+  ASSERT_EQ(is_a0_cand_coded(32, 12, 16, 4), true);
+
+  // Same as above with a Nx2N block
+  ASSERT_EQ(is_a0_cand_coded(32, 0, 8, 16), true);
+  // Same as above with a nLx2N block
+  ASSERT_EQ(is_a0_cand_coded(32, 0, 4, 16), true);
+  // Same as above with a nRx2N block
+  ASSERT_EQ(is_a0_cand_coded(32, 0, 12, 16), true);
+
+  PASS();
+}
+
+TEST test_is_b0_cand_coded()
+{
+  //  +--+--+
+  //  |##|  |
+  //  +--+--+
+  //  |  |  |
+  //  +--+--+
+  ASSERT_EQ(is_b0_cand_coded(32, 64, 16, 16), true);
+  // Same as above with a Nx2N block
+  ASSERT_EQ(is_b0_cand_coded(32, 64, 16, 32), true);
+  // Same as above with a nLx2N block
+  ASSERT_EQ(is_b0_cand_coded(32, 64, 24, 32), true);
+  // Same as above with a nRx2N block
+  ASSERT_EQ(is_b0_cand_coded(32, 64, 8, 32), true);
+
+  //  +--+--+
+  //  |  |  |
+  //  +--+--+
+  //  |##|  |
+  //  +--+--+
+  ASSERT_EQ(is_b0_cand_coded(32, 16, 16, 16), true);
+
+  //  +--+--+
+  //  |  |  |
+  //  +--+--+
+  //  |  |##|
+  //  +--+--+
+  ASSERT_EQ(is_b0_cand_coded(48, 16, 16, 16), false);
+  // Same as above with a 2NxN block
+  ASSERT_EQ(is_b0_cand_coded(32, 16, 32, 16), false);
+  // Same as above with a 2NxnU block
+  ASSERT_EQ(is_b0_cand_coded(32, 8, 32, 24), false);
+  // Same as above with a 2NxnD block
+  ASSERT_EQ(is_b0_cand_coded(32, 24, 32, 8), false);
+
+  //  +-----+-----+
+  //  |     |     |
+  //  |     |     |
+  //  |     |     |
+  //  +-----+--+--+
+  //  |     |  |##|
+  //  |     +--+--+
+  //  |     |  |  |
+  //  +-----+--+--+
+  ASSERT_EQ(is_b0_cand_coded(48, 32, 16, 16), false);
+
+  // Same as above with a 2NxnU block
+  ASSERT_EQ(is_b0_cand_coded(32, 32, 32, 8), false);
+  // Same as above with a 2NxnD block
+  ASSERT_EQ(is_b0_cand_coded(32, 32, 32, 24), false);
+
+  // Same as above with a nLx2N block
+  ASSERT_EQ(is_b0_cand_coded(56, 32, 8, 32), false);
+  // Same as above with a nRx2N block
+  ASSERT_EQ(is_b0_cand_coded(40, 32, 24, 32), false);
+
+  //  +--+--+-----+
+  //  |  |##|     |
+  //  +--+--+     |
+  //  |  |  |     |
+  //  +--+--+-----+
+  //  |     |     |
+  //  |     |     |
+  //  |     |     |
+  //  +-----+-----+
+  ASSERT_EQ(is_b0_cand_coded(16, 0, 16, 16), true);
+
+  // Same as above with a 2NxnU block
+  ASSERT_EQ(is_b0_cand_coded(0, 0, 32, 8), true);
+  // Same as above with a 2NxnD block
+  ASSERT_EQ(is_b0_cand_coded(0, 0, 32, 24), true);
+
+  // Same as above with a nLx2N block
+  ASSERT_EQ(is_b0_cand_coded(8, 0, 24, 32), true);
+  // Same as above with a nRx2N block
+  ASSERT_EQ(is_b0_cand_coded(24, 0, 8, 32), true);

   PASS();
 }

 SUITE(mv_cand_tests) {
   RUN_TEST(test_get_spatial_merge_cand);
+  RUN_TEST(test_is_a0_cand_coded);
+  RUN_TEST(test_is_b0_cand_coded);
 }
kvazaar-0.8.3.tar.gz/tests/sad_tests.c -> kvazaar-1.0.0.tar.gz/tests/sad_tests.c
Changed
@@ -59,26 +59,60 @@
 static kvz_picture *g_pic = 0;
 static kvz_picture *g_ref = 0;
+static kvz_picture *g_big_pic = 0;
+static kvz_picture *g_big_ref = 0;
+static kvz_picture *g_64x64_zero = 0;
+static kvz_picture *g_64x64_max = 0;
+
+static struct sad_test_env_t {
+  int width;
+  int height;
+  void * tested_func;
+  const strategy_t * strategy;
+  char msg[255];
+} sad_test_env;

 //////////////////////////////////////////////////////////////////////////
 // SETUP, TEARDOWN AND HELPER FUNCTIONS
 static void setup_tests()
 {
-  g_pic = kvz_image_alloc(8, 8);
+  g_pic = kvz_image_alloc(KVZ_CSP_420, 8, 8);
   for (int i = 0; i < 64; ++i) {
     g_pic->y[i] = pic_data[i] + 48;
   }

-  g_ref = kvz_image_alloc(8, 8);
+  g_ref = kvz_image_alloc(KVZ_CSP_420, 8, 8);
   for (int i = 0; i < 64; ++i) {
     g_ref->y[i] = ref_data[i] + 48;
   }
+
+  g_big_pic = kvz_image_alloc(KVZ_CSP_420, 64, 64);
+  for (int i = 0; i < 64*64; ++i) {
+    g_big_pic->y[i] = (i*i / 32 + i) % 255;
+    //g_big_pic->y[i] = i % 255;
+  }
+
+  g_big_ref = kvz_image_alloc(KVZ_CSP_420, 64, 64);
+  for (int i = 0; i < 64 * 64; ++i) {
+    g_big_ref->y[i] = (i*i / 16 + i) % 255;
+    //g_big_ref->y[i] = (i / 2) % 255;
+  }
+
+  g_64x64_zero = kvz_image_alloc(KVZ_CSP_420, 64, 64);
+  memset(g_64x64_zero->y, 0, 64 * 64 * sizeof(kvz_pixel));
+
+  g_64x64_max = kvz_image_alloc(KVZ_CSP_420, 64, 64);
+  memset(g_64x64_max->y, PIXEL_MAX, 64 * 64 * sizeof(kvz_pixel));
 }

 static void tear_down_tests()
 {
   kvz_image_free(g_pic);
   kvz_image_free(g_ref);
+  kvz_image_free(g_big_pic);
+  kvz_image_free(g_big_ref);
+  kvz_image_free(g_64x64_zero);
+  kvz_image_free(g_64x64_max);
 }

@@ -224,11 +258,66 @@
   PASS();
 }

+static unsigned simple_sad(const kvz_pixel* buf1, const kvz_pixel* buf2, unsigned stride,
+                           unsigned width, unsigned height)
+{
+  unsigned sum = 0;
+  for (unsigned y = 0; y < height; ++y) {
+    for (unsigned x = 0; x < width; ++x) {
+      sum += abs((int)buf1[y * stride + x] - (int)buf2[y * stride + x]);
+    }
+  }
+  return sum;
+}

-struct sad_test_env_t {
-  kvz_picture *g_pic;
-  kvz_picture *g_ref;
-};
+TEST test_reg_sad(void)
+{
+  unsigned width = sad_test_env.width;
+  unsigned height = sad_test_env.height;
+  unsigned stride = 64;
+
+  unsigned correct_result = simple_sad(g_big_pic->y, g_big_ref->y, stride, width, height);
+
+  unsigned(*tested_func)(const kvz_pixel *, const kvz_pixel *, int, int, unsigned, unsigned) = sad_test_env.tested_func;
+  unsigned result = tested_func(g_big_pic->y, g_big_ref->y, width, height, stride, stride);
+
+  sprintf(sad_test_env.msg, "%s(%ux%u):%s",
+          sad_test_env.strategy->type,
+          width,
+          height,
+          sad_test_env.strategy->strategy_name);
+
+  if (result != correct_result) {
+    FAILm(sad_test_env.msg);
+  }
+
+  PASSm(sad_test_env.msg);
+}
+
+
+TEST test_reg_sad_overflow(void)
+{
+  unsigned width = sad_test_env.width;
+  unsigned height = sad_test_env.height;
+  unsigned stride = 64;
+
+  unsigned correct_result = simple_sad(g_64x64_zero->y, g_64x64_max->y, stride, width, height);
+
+  unsigned(*tested_func)(const kvz_pixel *, const kvz_pixel *, int, int, unsigned, unsigned) = sad_test_env.tested_func;
+  unsigned result = tested_func(g_64x64_zero->y, g_64x64_max->y, width, height, stride, stride);
+
+  sprintf(sad_test_env.msg, "overflow %s(%ux%u):%s",
+          sad_test_env.strategy->type,
+          width,
+          height,
+          sad_test_env.strategy->strategy_name);
+
+  if (result != correct_result) {
+    FAILm(sad_test_env.msg);
+  }
+
+  PASSm(sad_test_env.msg);
+}

 //////////////////////////////////////////////////////////////////////////
@@ -272,6 +361,29 @@
     RUN_TEST(test_bottomleft_out);
     RUN_TEST(test_bottom_out);
     RUN_TEST(test_bottomright_out);
+
+    struct dimension {
+      int width;
+      int height;
+    };
+    static const struct dimension tested_dims[] = {
+      // Square motion partitions
+      {64, 64}, {32, 32}, {16, 16}, {8, 8},
+      // Symmetric motion partitions
+      {64, 32}, {32, 64}, {32, 16}, {16, 32}, {16, 8}, {8, 16}, {8, 4}, {4, 8},
+      // Asymmetric motion partitions
+      {48, 16}, {16, 48}, {24, 16}, {16, 24}, {12, 4}, {4, 12}
+    };
+
+    sad_test_env.tested_func = strategies.strategies[i].fptr;
+    sad_test_env.strategy = &strategies.strategies[i];
+    int num_dim_tests = sizeof(tested_dims) / sizeof(tested_dims[0]);
+    for (int dim_test = 0; dim_test < num_dim_tests; ++dim_test) {
+      sad_test_env.width = tested_dims[dim_test].width;
+      sad_test_env.height = tested_dims[dim_test].height;
+      RUN_TEST(test_reg_sad);
+      RUN_TEST(test_reg_sad_overflow);
+    }
   }

   tear_down_tests();
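The tests above validate every optimized `reg_sad` strategy against the scalar `simple_sad` oracle for all SMP/AMP block sizes. A self-contained version of that oracle (same idea as the diff's `simple_sad`, written here with plain `unsigned char` pixels so it stands alone):

```c
#include <stdlib.h>

// Scalar reference SAD: sum of absolute differences over a width x height
// block inside two buffers that share the same row stride. Any SIMD SAD
// implementation must return exactly this value for every block size.
static unsigned ref_sad(const unsigned char *buf1, const unsigned char *buf2,
                        unsigned stride, unsigned width, unsigned height)
{
  unsigned sum = 0;
  for (unsigned y = 0; y < height; ++y) {
    for (unsigned x = 0; x < width; ++x) {
      sum += (unsigned)abs((int)buf1[y * stride + x] - (int)buf2[y * stride + x]);
    }
  }
  return sum;
}
```

Pairing this oracle with an all-zero block against an all-max block, as `test_reg_sad_overflow` does, exercises the worst-case accumulator values that an over-narrow SIMD accumulator would overflow.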
kvazaar-0.8.3.tar.gz/tests/speed_tests.c -> kvazaar-1.0.0.tar.gz/tests/speed_tests.c
Changed
@@ -43,11 +43,18 @@
 static kvz_pixel * bufs[NUM_TESTS]; // SIMD aligned pointers.
 static kvz_pixel * actual_bufs[NUM_TESTS]; // pointers returned by malloc.

+#define WIDTH_4K 3840
+#define HEIGHT_4K 2160
+
 static struct test_env_t {
-  int log_width; // for selecting dim from bufs
+  int width;
+  int height;
   void * tested_func;
   const strategy_t * strategy;
   char msg[1024];
+
+  kvz_picture *inter_a;
+  kvz_picture *inter_b;
 } test_env;

@@ -83,6 +90,16 @@
       init_gradient(width - x, y, width, 255 / width, &bufs[test][chunk * 64*64]);
     }
   }
+
+  test_env.inter_a = kvz_image_alloc(KVZ_CSP_420, WIDTH_4K, HEIGHT_4K);
+  test_env.inter_b = kvz_image_alloc(KVZ_CSP_420, WIDTH_4K, HEIGHT_4K);
+  for (unsigned i = 0; i < WIDTH_4K * HEIGHT_4K; ++i) {
+    kvz_pixel pattern1 = ((i*i >> 10) % 255) >> 2;
+    kvz_pixel pattern2 = ((i*i >> 15) % 255) >> 2;
+    kvz_pixel gradient = (i >> 12) + i;
+    test_env.inter_a->y[i] = (pattern1 + gradient) % PIXEL_MAX;
+    test_env.inter_b->y[i] = (pattern2 + gradient) % PIXEL_MAX;
+  }
 }

 static void tear_down_tests()
@@ -90,6 +107,8 @@
   for (int test = 0; test < NUM_TESTS; ++test) {
     free(actual_bufs[test]);
   }
+  kvz_image_free(test_env.inter_a);
+  kvz_image_free(test_env.inter_b);
 }

 //////////////////////////////////////////////////////////////////////////
@@ -126,55 +145,107 @@
     KVZ_GET_TIME(&clock_now)
   }

+  double test_time = TIME_PER_TEST + KVZ_CLOCK_T_AS_DOUBLE(clock_now) - test_end;
   sprintf(test_env.msg, "%.3fM x %s:%s",
-          (double)call_cnt / 1000000.0,
+          (double)call_cnt / 1000000.0 / test_time,
           test_env.strategy->type,
           test_env.strategy->strategy_name);
   PASSm(test_env.msg);
 }

-TEST test_inter_speed(const int width)
+TEST test_intra_dual_speed(const int width)
 {
   const int size = width * width;
-  unsigned call_cnt = 0;
+  uint64_t call_cnt = 0;
   KVZ_CLOCK_T clock_now;
   KVZ_GET_TIME(&clock_now);
   double test_end = KVZ_CLOCK_T_AS_DOUBLE(clock_now) + TIME_PER_TEST;

   // Loop until time allocated for test has passed.
   for (unsigned i = 0;
-       test_end > KVZ_CLOCK_T_AS_DOUBLE(clock_now);
-       ++i)
+      test_end > KVZ_CLOCK_T_AS_DOUBLE(clock_now);
+      ++i)
   {
     int test = i % NUM_TESTS;
     uint64_t sum = 0;
     for (int offset = 0; offset < NUM_CHUNKS * 64 * 64; offset += NUM_CHUNKS * size) {
-      // Treat 4 consecutive chunks as one chunk with double width and height,
-      // and do a 8x8 grid search against the first chunk to simulate real usage.
+      // Compare the first chunk against the 35 other chunks to simulate real usage.
       kvz_pixel * buf1 = &bufs[test][offset];
-      for (int chunk = 0; chunk < NUM_CHUNKS; chunk += 4) {
-        kvz_pixel * buf2 = &bufs[test][chunk * size + offset];
-        for (int y = 0; y < 8; ++y) {
-          for (int x = 0; x < 8; ++x) {
-            const int stride1 = 2 * 64;
-            const int stride2 = 2 * 64;
-            reg_sad_func *tested_func = test_env.tested_func;
-            sum += tested_func(buf1, &buf2[y * stride2 + x], width, width, stride1, stride2);
-            ++call_cnt;
-          }
-        }
+      for (int chunk = 0; chunk < NUM_CHUNKS; chunk += 2) {
+        cost_pixel_nxn_multi_func *tested_func = test_env.tested_func;
+        const kvz_pixel *buf_pair[2] = { &bufs[test][chunk * size + offset], &bufs[test][(chunk + 1) * size + offset] };
+        unsigned costs[2] = { 0, 0 };
+        tested_func((pred_buffer)buf_pair, buf1, 2, costs);
+        sum += costs[0] + costs[1];
+        ++call_cnt;
       }
     }
+
     ASSERT(sum > 0);
     KVZ_GET_TIME(&clock_now)
   }

+  double test_time = TIME_PER_TEST + KVZ_CLOCK_T_AS_DOUBLE(clock_now) - test_end;
+  sprintf(test_env.msg, "%.3fM x %s:%s",
+          (double)call_cnt / 1000000.0 / test_time,
+          test_env.strategy->type,
+          test_env.strategy->strategy_name);
+  PASSm(test_env.msg);
+}
+
+
+TEST test_inter_speed(const int width, const int height)
+{
+  unsigned call_cnt = 0;
+  KVZ_CLOCK_T clock_now;
+  KVZ_GET_TIME(&clock_now);
+  double test_end = KVZ_CLOCK_T_AS_DOUBLE(clock_now) + TIME_PER_TEST;
+
+  const vector2d_t dims_lcu = { WIDTH_4K / 64 - 2, HEIGHT_4K / 64 - 2 };
+  const int step = 3;
+  const int range = 2 * step;
+
+  // Loop until time allocated for test has passed.
+  for (uint64_t i = 0;
+      test_end > KVZ_CLOCK_T_AS_DOUBLE(clock_now);
+      ++i)
+  {
+    // Do a sparse full search on the first CU of every LCU.
+
+    uint64_t sum = 0;
+
+    // Go through the non-edge LCU's in raster scan order.
+    const vector2d_t lcu = {
+      1 + i % dims_lcu.x,
+      1 + (i / dims_lcu.y) % dims_lcu.y,
+    };
+
+    vector2d_t mv;
+    for (mv.y = -range; mv.y <= range; mv.y += step) {
+      for (mv.x = -range; mv.x <= range; mv.x += step) {
+        reg_sad_func *tested_func = test_env.tested_func;
+
+        int lcu_index = lcu.y * 64 * WIDTH_4K + lcu.x * 64;
+        int mv_index = mv.y * WIDTH_4K + mv.x;
+        kvz_pixel *buf1 = &test_env.inter_a->y[lcu_index];
+        kvz_pixel *buf2 = &test_env.inter_a->y[lcu_index + mv_index];
+
+        sum += tested_func(buf1, buf2, width, height, WIDTH_4K, WIDTH_4K);
+        ++call_cnt;
+      }
+    }
+
+    ASSERT(sum > 0);
+    KVZ_GET_TIME(&clock_now)
+  }
+
+  double test_time = TIME_PER_TEST + KVZ_CLOCK_T_AS_DOUBLE(clock_now) - test_end;
   sprintf(test_env.msg, "%.3fM x %s(%ix%i):%s",
-          (double)call_cnt / 1000000.0,
+          (double)call_cnt / 1000000.0 / test_time,
           test_env.strategy->type,
           width,
-          width,
+          height,
           test_env.strategy->strategy_name);
   PASSm(test_env.msg);
 }
@@ -221,8 +292,9 @@
     KVZ_GET_TIME(&clock_now)
   }

+  double test_time = TIME_PER_TEST + KVZ_CLOCK_T_AS_DOUBLE(clock_now) - test_end;
   sprintf(test_env.msg, "%.3fM x %s:%s",
-          (double)call_cnt / 1000000.0,
+          (double)call_cnt / 1000000.0 / test_time,
           test_env.strategy->type,
           test_env.strategy->strategy_name);
   PASSm(test_env.msg);
@@ -231,36 +303,43 @@

 TEST intra_sad(void)
 {
-  const int width = 1 << test_env.log_width;
-  return test_intra_speed(width);
+  return test_intra_speed(test_env.width);
+}
+
+
+TEST intra_sad_dual(void)
+{
+  return test_intra_dual_speed(test_env.width);
 }

 TEST intra_satd(void)
 {
-  const int width = 1 << test_env.log_width;
-  return test_intra_speed(width);
+  return test_intra_speed(test_env.width);
+}
+
+
+TEST intra_satd_dual(void)
+{
+  return test_intra_dual_speed(test_env.width);
 }

 TEST inter_sad(void)
 {
-  const int width = 1 << test_env.log_width;
-  return test_inter_speed(width);
+  return test_inter_speed(test_env.width, test_env.height);
 }

 TEST fdct(void)
 {
-  const int width = 1 << test_env.log_width;
-  return dct_speed(width);
+  return dct_speed(test_env.width);
 }

 TEST idct(void)
 {
-  const int width = 1 << test_env.log_width;
-  return dct_speed(width);
+  return dct_speed(test_env.width);
 }

@@ -279,49 +358,25 @@
   for (unsigned i = 0; i < strategies.count; ++i) {
     const strategy_t * strategy = &strategies.strategies[i];

-    // Select buffer width according to function name for intra cost functions.
-    if (strcmp(strategy->type, "sad_4x4") == 0) {
-      test_env.log_width = 2;
-    } else if (strcmp(strategy->type, "sad_8x8") == 0) {
-      test_env.log_width = 3;
-    } else if (strcmp(strategy->type, "sad_16x16") == 0) {
-      test_env.log_width = 4;
-    } else if (strcmp(strategy->type, "sad_32x32") == 0) {
-      test_env.log_width = 5;
-    } else if (strcmp(strategy->type, "sad_64x64") == 0) {
-      test_env.log_width = 6;
-    } else if (strcmp(strategy->type, "satd_4x4") == 0) {
-      test_env.log_width = 2;
-    } else if (strcmp(strategy->type, "satd_8x8") == 0) {
-      test_env.log_width = 3;
-    } else if (strcmp(strategy->type, "satd_16x16") == 0) {
-      test_env.log_width = 4;
-    } else if (strcmp(strategy->type, "satd_32x32") == 0) {
-      test_env.log_width = 5;
-    } else if (strcmp(strategy->type, "satd_64x64") == 0) {
-      test_env.log_width = 6;
-    } else if (strcmp(strategy->type, "dct_4x4") == 0) {
-      test_env.log_width = 2;
-    } else if (strcmp(strategy->type, "dct_8x8") == 0) {
-      test_env.log_width = 3;
-    } else if (strcmp(strategy->type, "dct_16x16") == 0) {
-      test_env.log_width = 4;
-    } else if (strcmp(strategy->type, "dct_32x32") == 0) {
-      test_env.log_width = 5;
-    } else if (strcmp(strategy->type, "idct_4x4") == 0) {
-      test_env.log_width = 2;
-    } else if (strcmp(strategy->type, "idct_8x8") == 0) {
-      test_env.log_width = 3;
-    } else if (strcmp(strategy->type, "idct_16x16") == 0) {
-      test_env.log_width = 4;
-    } else if (strcmp(strategy->type, "idct_32x32") == 0) {
-      test_env.log_width = 5;
-    } else if (strcmp(strategy->type, "fast_forward_dst_4x4") == 0) {
-      test_env.log_width = 2;
-    } else if (strcmp(strategy->type, "fast_inverse_dst_4x4") == 0) {
-      test_env.log_width = 2;
+    // Select buffer width according to function name.
+    if (strstr(strategy->type, "_4x4")) {
+      test_env.width = 4;
+      test_env.height = 4;
+    } else if (strstr(strategy->type, "_8x8")) {
+      test_env.width = 8;
+      test_env.height = 8;
+    } else if (strstr(strategy->type, "_16x16")) {
+      test_env.width = 16;
+      test_env.height = 16;
+    } else if (strstr(strategy->type, "_32x32")) {
+      test_env.width = 32;
+      test_env.height = 32;
+    } else if (strstr(strategy->type, "_64x64")) {
+      test_env.width = 64;
+      test_env.height = 64;
     } else {
-      test_env.log_width = 0;
+      test_env.width = 0;
+      test_env.height = 0;
    }

     test_env.tested_func = strategies.strategies[i].fptr;
@@ -329,16 +384,33 @@

     // Call different tests depending on type of function.
     // This allows for selecting a subset of tests with -t parameter.
-    if (strncmp(strategy->type, "satd_", 5) == 0) {
-      RUN_TEST(intra_satd);
+    if (strncmp(strategy->type, "satd_", 5) == 0 && strcmp(strategy->type, "satd_any_size") != 0) {
+      if (strlen(strategy->type) <= 10) {
+        RUN_TEST(intra_satd);
+      } else if (strstr(strategy->type, "_dual")) {
+        RUN_TEST(intra_satd_dual);
+      }
     } else if (strncmp(strategy->type, "sad_", 4) == 0) {
-      RUN_TEST(intra_sad);
+      if (strlen(strategy->type) <= 9) {
+        RUN_TEST(intra_sad);
+      } else if (strstr(strategy->type, "_dual")) {
+        RUN_TEST(intra_sad_dual);
+      }
     } else if (strcmp(strategy->type, "reg_sad") == 0) {
+      static const vector2d_t tested_dims[] = {
+        { 8, 8 }, { 16, 16 }, { 32, 32 }, { 64, 64 },
+        { 64, 63 }, { 1, 1 }
+      };
+
+      int num_tested_dims = sizeof(tested_dims) / sizeof(*tested_dims);
       // Call reg_sad with all the sizes it is actually called with.
-      for (int width = 3; width <= 6; ++width) {
-        test_env.log_width = width;
+      for (int dim_i = 0; dim_i < num_tested_dims; ++dim_i) {
+        test_env.width = tested_dims[dim_i].x;
+        test_env.height = tested_dims[dim_i].y;
         RUN_TEST(inter_sad);
       }
+    } else if (strncmp(strategy->type, "dct_", 4) == 0 || strcmp(strategy->type, "fast_forward_dst_4x4") == 0) {
kvazaar-1.0.0.tar.gz/tools/genmanpage.sh
Added
@@ -0,0 +1,35 @@
+#!/bin/sh
+
+LANG=C
+set -e
+
+cd "$(dirname "$0")"
+
+date="$(date +"%B %Y")"
+version="$(awk '/#define KVZ_VERSION/ { print $3 }' ../src/global.h)"
+manpage_file=../doc/kvazaar.1
+
+cat <<EOF> $manpage_file
+.TH KVAZAAR "1" "$date" "kvazaar v$version" "User Commands"
+.SH NAME
+kvazaar \- open source HEVC encoder
+.SH SYNOPSIS
+\fBkvazaar \fR\-i <input> \-\-input\-res <width>x<height> \-o <output>
+.SH DESCRIPTION
+EOF
+
+../src/kvazaar --help 2>&1 | tail -n+5 | head -n-4 | \
+    sed 's| : |\n|g;
+         s| :$||g;
+         s|^    --|.TP\n\\fB--|g;
+         s|^  --|.TP\n\\fB--|g;
+         s|^  -|.TP\n\\fB-|g;
+         s|^    ||g;
+         s|^  ||g;
+         s|-|\\-|g;
+         s|, \\-\\-|\\fR, \\fB\\-\\-|g;' \
+    >> $manpage_file
+
+for s in Slices Wpp Tiles "Parallel processing" "Video Usability Information"; do
+  sed -i "s|^  ${s}:|.SS \"${s}:\"|g" $manpage_file
+done
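The sed pipeline in genmanpage.sh rewrites kvazaar's plain-text `--help` output into groff man-page markup. A stripped-down sketch of the core transformation, using a hypothetical option line (`--foo` is made up; like the script itself, this relies on GNU sed honoring `\n` in replacements):

```shell
#!/bin/sh
# Feed one made-up help line through the same kind of sed rules the
# script uses: split "option : description" onto separate lines, turn
# the option into a bold .TP entry, and escape hyphens for groff.
printf '      --foo : enable foo\n' | \
  sed 's| : |\n|g;
       s|^      --|.TP\n\\fB--|g;
       s|-|\\-|g;'
```

With GNU sed this prints the groff fragment `.TP`, `\fB\-\-foo`, `enable foo` on three lines, which is the shape man(7) expects for a tagged paragraph.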
kvazaar-1.0.0.tar.gz/tools/update_readme.sh
Added
@@ -0,0 +1,37 @@
+#!/bin/sh
+# This file is part of Kvazaar HEVC encoder.
+#
+# Copyright (C) 2013-2016 Tampere University of Technology and others (see
+# COPYING file).
+#
+# Kvazaar is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License version 2.1 as
+# published by the Free Software Foundation.
+#
+# Kvazaar is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Kvazaar. If not, see <http://www.gnu.org/licenses/>.
+
+# This script updates parameter documentation in ../README.md file.
+
+LANG=C
+set -e
+
+cd "$(dirname "$0")"
+
+tmpfile="$(mktemp)"
+readme_file="../README.md"
+
+{
+  sed '/BEGIN KVAZAAR HELP MESSAGE/q' -- "$readme_file";
+  printf '```\n';
+  ../src/kvazaar --help;
+  printf '```\n';
+  sed -n '/END KVAZAAR HELP MESSAGE/{:a;p;n;ba}' -- "$readme_file";
+} >> "$tmpfile"
+
+mv -- "$tmpfile" "../README.md"
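update_readme.sh splices regenerated help text between two marker lines: the first `sed` prints everything up to and including the BEGIN marker, the new content is emitted in the middle, and `sed -n '/END .../{:a;p;n;ba}'` prints the END marker and everything after it. A self-contained sketch of that splice, with made-up file and marker names (requires GNU sed, as the script does):

```shell
#!/bin/sh
set -e

# Build a throwaway file with marker lines (file name and markers are
# made up for this demo).
cat > demo.txt <<'EOF'
intro
BEGIN MARKER
old generated text
END MARKER
outro
EOF

{
  sed '/BEGIN MARKER/q' demo.txt             # head, incl. BEGIN marker
  echo 'new generated text'                  # regenerated middle part
  sed -n '/END MARKER/{:a;p;n;ba}' demo.txt  # END marker and the tail
} > demo.new

cat demo.new  # middle line is now "new generated text"
rm -f demo.txt demo.new
```

The surrounding text outside the markers survives untouched, which is why the README can mix hand-written prose with a machine-refreshed help section.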
kvazaar-1.0.0.tar.gz/tools/version.sh
Added
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+cd "$(dirname "$0")"
+cd ..
+
+if type git >/dev/null 2>/dev/null && [ -d .git ]; then
+  version="$(git describe --dirty --tags --match 'v*')"
+  version="${version} $(git log -1 --pretty=format:%cd --date=short)"
+else
+  version="v$(awk '/#define KVZ_VERSION/ { print $3 }' src/global.h)"
+fi
+
+printf '%s\n' "$version"
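When git metadata is unavailable (e.g. building from a release tarball), version.sh falls back to scraping `KVZ_VERSION` out of src/global.h with awk. The fallback branch can be exercised in isolation; the sample header below is made up for the demo:

```shell
#!/bin/sh
set -e

# Minimal stand-in for src/global.h (contents invented for this demo).
cat > sample_global.h <<'EOF'
#define KVZ_VERSION 1.0.0
EOF

# Same awk extraction as the script's non-git branch: match the define
# line and print its third whitespace-separated field.
version="v$(awk '/#define KVZ_VERSION/ { print $3 }' sample_global.h)"
printf '%s\n' "$version"  # prints v1.0.0
rm -f sample_global.h
```

In a git checkout the script instead reports `git describe --dirty --tags` plus the last commit date, so developers see exactly which revision a binary was built from.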