kvazaar
Changes of Revision 14
kvazaar.changes
Changed
@@ -1,4 +1,47 @@ ------------------------------------------------------------------- +Tue Jul 9 20:15:25 UTC 2019 - Luigi Baldoni <aloisio@gmx.com> + +- Update to version 1.3.0 + Features: + * Add release notes like this (#159, cf85d52) + * Changed --rd=2 to use SSD metric for CU mode decision + (662430d) + * Changed inter search to check the cost of flushing residual + to zero (75a8700) + * Changed rectangular and asymmetric blocks to use a transform + split (774c666) + * Added diamond search ME algorithm (4e13608) + * Enabled low delay B GOP structure with --bipred + --gop=lp-g4d3t1 (7155dd0) + * Added termination of intra search at zero residual with + --intra-rdo-et (4fb1c16) + Optimization: + * Made TZ search faster and slightly better (c136044) + * Optimized bi-prediction (69756e2) + Fixes: + * Fixed transform skip with rectangular inter blocks (fb462b2) + * Fixed accidental inter search for 4x4 blocks (649113a) + User Interface: + * Changed options for all preset levels (f033ad0) + * Added an option for limiting the number of steps in motion + estimation with --me-steps (39ed368) + * Added --me=dia (4e13608) + * Added --level, --force-level and --high-tier for setting + bitstream level and tier (bac0745) + Building: + * Fixed issue with struct timespec redefinition with Visual + Studio 2015 and later (713e694) + * Fixed building .asm files in Visual Studio 2017 (6be8195) + * Fixed compatibility with crypto++ 6.0 (4b24cd0) + * Added support for crypto++ with the name libcryptopp + (411276d) + * Dockerfile base image was updated to Ubuntu 18.04 (8380b6c) + * Enabled -Wextra by default (ff17e0b) + Refactoring: + * Inter motion vector cost functions (c73cce3) + * Dockerfile (0164291) + +------------------------------------------------------------------- Fri Nov 17 14:01:40 UTC 2017 - aloisio@gmx.com - Update to version 1.2.0
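As a quick, hedged illustration of the new 1.3.0 command-line options listed in the changelog above (--me=dia, --me-steps, --level with --high-tier, --intra-rdo-et, and the low-delay GOP together with --bipred), the sketch below shows how they might be invoked. The input file name and resolution are placeholders, not part of the changelog.

```
# Hypothetical invocation of the new 1.3.0 options; input_1080p.yuv and the
# resolution are made-up placeholders.
kvazaar -i input_1080p.yuv --input-res 1920x1080 \
        --me dia --me-steps 32 \
        --level 4.1 --high-tier \
        --intra-rdo-et \
        -o out_dia.hevc

# Low-delay B GOP structure enabled together with --bipred, as noted above.
kvazaar -i input_1080p.yuv --input-res 1920x1080 \
        --gop lp-g4d3t1 --bipred \
        -o out_lowdelay.hevc
```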
kvazaar.spec
Changed
@@ -1,8 +1,8 @@ # # spec file for package kvazaar # +# Copyright (c) 2019 Packman Team <packman@links2linux.de> # Copyright (c) 2017 SUSE LINUX GmbH, Nuernberg, Germany. -# Copyright (c) 2017 Packman Team <packman@links2linux.de> # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -13,19 +13,19 @@ # license that conforms to the Open Source Definition (Version 1.9) # published by the Open Source Initiative. -# Please submit bugfixes or comments via http://bugs.opensuse.org/ +# Please submit bugfixes or comments via https://bugs.links2linux.org/ # %define libname libkvazaar %define libmver 4 Name: kvazaar -Version: 1.2.0 +Version: 1.3.0 Release: 0 Summary: HEVC encoder -License: LGPL-2.1 +License: LGPL-2.1-or-later Group: Productivity/Multimedia/Video/Editors and Convertors -Url: http://ultravideo.cs.tut.fi/#encoder +URL: http://ultravideo.cs.tut.fi/#encoder Source0: https://github.com/ultravideo/%{name}/archive/v%{version}.tar.gz#/%{name}-%{version}.tar.gz Patch0: kvazaar.memset.patch BuildRequires: automake @@ -33,7 +33,6 @@ BuildRequires: gcc >= 4.4 BuildRequires: gcc-c++ BuildRequires: libtool -BuildRequires: make BuildRequires: pkgconfig Requires: %{libname}%{libmver} = %{version} %ifnarch %{arm} @@ -66,34 +65,32 @@ autoreconf -fvi %configure \ --disable-static \ - --disable-silent-rules + --disable-silent-rules \ + --docdir=%{_defaultdocdir}/%{name} make %{?_smp_mflags} %install %make_install find %{buildroot} -type f -name "*.la" -delete -print +rm %{buildroot}%{_defaultdocdir}/%{name}/COPYING %post -n %{libname}%{libmver} -p /sbin/ldconfig %postun -n %{libname}%{libmver} -p /sbin/ldconfig %files -%defattr(-,root,root) -%dir %{_datadir}/doc/%{name} -%doc %{_datadir}/doc/%{name}/COPYING -%doc %{_datadir}/doc/%{name}/CREDITS -%doc %{_datadir}/doc/%{name}/README.md -%{_bindir}/kvazaar -%{_mandir}/man1/kvazaar.1%{ext_man} +%license COPYING +%doc CREDITS README.md +%{_bindir}/%{name} +%{_mandir}/man1/%{name}.1%{ext_man} %files -n %{libname}%{libmver} -%defattr(-,root,root) -%doc COPYING CREDITS README.md +%license COPYING +%doc CREDITS README.md %{_libdir}/%{libname}.so.%{libmver}* %files -n %{libname}-devel -%defattr(-,root,root) -%{_includedir}/kvazaar.h +%{_includedir}/%{name}.h %{_libdir}/%{libname}.so -%{_libdir}/pkgconfig/kvazaar.pc +%{_libdir}/pkgconfig/%{name}.pc %changelog
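For anyone who wants to rebuild the updated package outside the build service, a minimal local sketch, assuming a standard ~/rpmbuild tree with the 1.3.0 tarball and kvazaar.memset.patch available, might look like this:

```
# Assumed layout: spec in ~/rpmbuild/SPECS, kvazaar-1.3.0.tar.gz and
# kvazaar.memset.patch in ~/rpmbuild/SOURCES.
cp kvazaar.spec ~/rpmbuild/SPECS/
cp kvazaar-1.3.0.tar.gz kvazaar.memset.patch ~/rpmbuild/SOURCES/
rpmbuild -ba ~/rpmbuild/SPECS/kvazaar.spec
```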
kvazaar-1.2.0.tar.gz/build/kvazaar_VS2013.sln
Deleted
@@ -1,55 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 2013 -VisualStudioVersion = 12.0.30723.0 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_lib", "kvazaar_lib\kvazaar_lib.vcxproj", "{EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}" -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{50AB7A17-4885-4D20-BF01-376DE4417FCD}" - ProjectSection(SolutionItems) = preProject - kvazaar_VS2010.vsd = kvazaar_VS2010.vsd - kvazaar_VS2010.vsmdi = kvazaar_VS2010.vsmdi - Local.testsettings = Local.testsettings - TraceAndTestImpact.testsettings = TraceAndTestImpact.testsettings - EndProjectSection -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_tests", "kvazaar_tests\kvazaar_tests.vcxproj", "{3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_cli", "kvazaar_cli\kvazaar_cli.vcxproj", "{C755308D-9B3E-4712-99AB-7F6F4E2DA567}" - ProjectSection(ProjectDependencies) = postProject - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF} = {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF} - EndProjectSection -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Win32 = Debug|Win32 - Debug|x64 = Debug|x64 - Release|Win32 = Release|Win32 - Release|x64 = Release|x64 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|Win32.ActiveCfg = Debug|Win32 - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|Win32.Build.0 = Debug|Win32 - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|x64.ActiveCfg = Debug|x64 - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|x64.Build.0 = Debug|x64 - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|Win32.ActiveCfg = Release|Win32 - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|Win32.Build.0 = Release|Win32 - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|x64.ActiveCfg = Release|x64 - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|x64.Build.0 = Release|x64 - {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Debug|Win32.ActiveCfg = Debug|Win32 - {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Debug|x64.ActiveCfg = Debug|x64 - {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Release|Win32.ActiveCfg = Release|Win32 - {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Release|x64.ActiveCfg = Release|x64 - {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|Win32.ActiveCfg = Debug|Win32 - {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|Win32.Build.0 = Debug|Win32 - {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|x64.ActiveCfg = Debug|x64 - {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|x64.Build.0 = Debug|x64 - {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|Win32.ActiveCfg = Release|Win32 - {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|Win32.Build.0 = Release|Win32 - {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|x64.ActiveCfg = Release|x64 - {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|x64.Build.0 = Release|x64 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection -EndGlobal
kvazaar-1.2.0.tar.gz/.gitignore -> kvazaar-1.3.0.tar.gz/.gitignore
Changed
@@ -42,6 +42,7 @@ *.lo *.o *.trs +.*.swp *.log .kdev4
kvazaar-1.3.0.tar.gz/.gitlab-ci.yml
Added
@@ -0,0 +1,47 @@ +# Use Kvazaar CI base image which includes the build tools and ffmpeg + hmdec in ${HOME}/bin +image: ultravideo/kvazaar_ci_base:latest + +# Build and test kvazaar +test-kvazaar: &test-template + stage: test + script: + - export PATH="${HOME}/bin:${PATH}" + - ./autogen.sh + - ./configure --enable-werror || (cat config.log && false) + - make --jobs=8 + - make check --jobs=8 VERBOSE=1 + artifacts: + paths: + - src/kvazaar + - src/.libs + expire_in: 1 week + +test-asan: + <<: *test-template + variables: + CFLAGS: '-fsanitize=address' + # LeakSanitizer doesn't work inside the container because it requires + # ptrace so we disable it. + ASAN_OPTIONS: 'detect_leaks=0' + # AddressSanitizer adds some extra symbols so we expect a failure from + # the external symbols test. + XFAIL_TESTS: test_external_symbols.sh + +test-tsan: + <<: *test-template + variables: + CFLAGS: '-fsanitize=thread' + +test-ubsan: + <<: *test-template + variables: + CFLAGS: '-fsanitize=undefined -fno-sanitize-recover=all -fno-sanitize=alignment' + +test-valgrind: + <<: *test-template + variables: + KVAZAAR_OVERRIDE_angular_pred: generic + KVAZAAR_OVERRIDE_sao_band_ddistortion: generic + KVAZAAR_OVERRIDE_sao_edge_ddistortion: generic + KVAZAAR_OVERRIDE_calc_sao_edge_dir: generic + KVZ_TEST_VALGRIND: 1
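The new CI file above can also be reproduced locally without GitLab. A rough sketch of the test-asan job, assuming the build tools from the ultravideo/kvazaar_ci_base image (autotools, yasm, ffmpeg, hmdec) are already installed:

```
# Rough local equivalent of the test-asan job; all commands and variables
# are taken from the .gitlab-ci.yml added above.
export CFLAGS='-fsanitize=address'
export ASAN_OPTIONS='detect_leaks=0'        # LeakSanitizer needs ptrace, disabled
export XFAIL_TESTS=test_external_symbols.sh # ASan adds extra external symbols
./autogen.sh
./configure --enable-werror || (cat config.log && false)
make --jobs=8
make check --jobs=8 VERBOSE=1
```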
kvazaar-1.2.0.tar.gz/.travis.yml -> kvazaar-1.3.0.tar.gz/.travis.yml
Changed
@@ -19,7 +19,16 @@ include: - compiler: clang + env: KVZ_TEST_VALGRIND=1 + + - compiler: clang + env: CFLAGS='-fsanitize=thread' + + - compiler: clang + env: CFLAGS='-fsanitize=undefined -fno-sanitize-recover=all -fno-sanitize=alignment' + - compiler: gcc-4.8 + env: CFLAGS='-fsanitize=address' # We have some Mac specific code and Mac sometimes has odd build issues. - os: osx @@ -27,14 +36,15 @@ install: true script: - ./autogen.sh - - ./configure --enable-werror + - ./configure --enable-werror || (cat config.log && false) - make --jobs=2 V=1 + - make check TESTS=kvazaar_tests install: bash .travis-install.bash script: - ./autogen.sh - - ./configure --enable-werror + - ./configure --enable-werror || (cat config.log && false) - make --jobs=2 V=1 - make check VERBOSE=1
kvazaar-1.2.0.tar.gz/Dockerfile -> kvazaar-1.3.0.tar.gz/Dockerfile
Changed
@@ -9,34 +9,35 @@ # # RESOLUTION=`avconv -i input.avi 2>&1 | grep Stream | grep -oP ', \K[0-9]+x[0-9]+'` # avconv -i input.avi -an -f rawvideo -pix_fmt yuv420p - | docker run -i -a STDIN -a STDOUT kvazaar -i - --wpp --threads=8 --input-res=$RESOLUTION --preset=ultrafast -o - > output.265 -# or +# or # RESOLUTION=`ffmpeg -i input.avi 2>&1 | grep Stream | grep -oP ', \K[0-9]+x[0-9]+'` # ffmpeg -i input.avi -an -f rawvideo -pix_fmt yuv420p - | docker run -i -a STDIN -a STDOUT kvazaar -i - --wpp --threads=8 --input-res=$RESOLUTION --preset=ultrafast -o - > output.265 # -# Use Ubuntu 15.10 as a base for now, it's around 136MB -FROM ubuntu:15.10 +# Use Ubuntu 18.04 as a base for now, it's around 88MB +FROM ubuntu:18.04 MAINTAINER Marko Viitanen <fador@iki.fi> - # List of needed packages to be able to build kvazaar with autotools - ENV REQUIRED_PACKAGES automake autoconf libtool m4 build-essential git yasm pkgconf - - # Run all the commands in one RUN so we don't have any extra history - # data in the image. - RUN apt-get update \ +# List of needed packages to be able to build kvazaar with autotools +ENV REQUIRED_PACKAGES automake autoconf libtool m4 build-essential git yasm pkgconf + +ADD . kvazaar +# Run all the commands in one RUN so we don't have any extra history +# data in the image. +RUN apt-get update \ && apt-get install -y $REQUIRED_PACKAGES \ && apt-get clean \ - && git clone --depth=1 git://github.com/ultravideo/kvazaar.git; \ - cd kvazaar; \ - ./autogen.sh; \ - ./configure --disable-shared;\ - make;\ - make install; \ - AUTOINSTALLED_PACKAGES=`apt-mark showauto`; \ - apt-get remove --purge --force-yes -y $REQUIRED_PACKAGES $AUTOINSTALLED_PACKAGES; \ - apt-get clean autoclean; \ - apt-get autoremove -y; \ - rm -rf /var/lib/{apt,dpkg,cache,log}/ + && cd kvazaar \ + && ./autogen.sh \ + && ./configure --disable-shared \ + && make\ + && make install \ + && AUTOINSTALLED_PACKAGES=`apt-mark showauto` \ + && apt-get remove --purge --force-yes -y $REQUIRED_PACKAGES $AUTOINSTALLED_PACKAGES \ + && apt-get clean autoclean \ + && apt-get autoremove -y \ + && rm -rf /var/lib/{apt,dpkg,cache,log}/ + ENTRYPOINT ["kvazaar"] CMD ["--help"]
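Since the rewritten Dockerfile now copies the working tree into the image ("ADD . kvazaar") instead of cloning from GitHub, a local build-and-run sketch could look like the following. The run command is taken from the Dockerfile's own header comments; input.avi is a placeholder.

```
# Build the image from the current checkout.
docker build -t kvazaar .

# Encode with it, as shown in the Dockerfile header; input.avi is hypothetical.
RESOLUTION=`ffmpeg -i input.avi 2>&1 | grep Stream | grep -oP ', \K[0-9]+x[0-9]+'`
ffmpeg -i input.avi -an -f rawvideo -pix_fmt yuv420p - | \
  docker run -i -a STDIN -a STDOUT kvazaar -i - --wpp --threads=8 \
  --input-res=$RESOLUTION --preset=ultrafast -o - > output.265
```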
kvazaar-1.2.0.tar.gz/README.md -> kvazaar-1.3.0.tar.gz/README.md
Changed
@@ -11,6 +11,29 @@ - Linux/Mac [](https://travis-ci.org/ultravideo/kvazaar) - Windows [](https://ci.appveyor.com/project/Ultravideo/kvazaar) +## Table of Contents + +- [Using Kvazaar](#using-kvazaar) + - [Example:](#example) + - [Parameters](#parameters) + - [LP-GOP syntax](#lp-gop-syntax) +- [Presets](#presets) +- [Kvazaar library](#kvazaar-library) +- [Compiling Kvazaar](#compiling-kvazaar) + - [Required libraries](#required-libraries) + - [Autotools](#autotools) + - [OS X](#os-x) + - [Visual Studio](#visual-studio) + - [Docker](#docker) + - [Visualization (Windows only)](#visualization-windows-only) +- [Paper](#paper) +- [Contributing to Kvazaar](#contributing-to-kvazaar) + - [Code documentation](#code-documentation) + - [For version control we try to follow these conventions:](#for-version-control-we-try-to-follow-these-conventions) + - [Testing](#testing) + - [Unit tests](#unit-tests) + - [Code style](#code-style) + ## Using Kvazaar ### Example: @@ -31,14 +54,14 @@ kvazaar -i <input> --input-res <width>x<height> -o <output> Required: - -i, --input : Input file + -i, --input <filename> : Input file --input-res <res> : Input resolution [auto] - auto: detect from file name - <int>x<int>: width times height - -o, --output : Output file + - auto: Detect from file name. + - <int>x<int>: width times height + -o, --output <filename> : Output file Presets: - --preset=<preset> : Set options to a preset [medium] + --preset <preset> : Set options to a preset [medium] - ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow placebo @@ -46,144 +69,190 @@ Input: -n, --frames <integer> : Number of frames to code [all] --seek <integer> : First frame to code [0] - --input-fps <num>/<denom> : Framerate of the input video [25.0] - --source-scan-type <string> : Set source scan type [progressive]. - - progressive: progressive scan - - tff: top field first - - bff: bottom field first - --input-format : P420 or P400 - --input-bitdepth : 8-16 - --loop-input : Re-read input file forever + --input-fps <num>[/<denom>] : Frame rate of the input video [25] + --source-scan-type <string> : Source scan type [progressive] + - progressive: Progressive scan + - tff: Top field first + - bff: Bottom field first + --input-format <string> : P420 or P400 [P420] + --input-bitdepth <int> : 8-16 [8] + --loop-input : Re-read input file forever. Options: - --help : Print this help message and exit - --version : Print version information and exit - --aud : Use access unit delimiters - --debug <string> : Output encoders reconstruction. - --cpuid <integer> : Disable runtime cpu optimizations with value 0. - --hash : Decoded picture hash [checksum] + --help : Print this help message and exit. + --version : Print version information and exit. + --(no-)aud : Use access unit delimiters. [disabled] + --debug <filename> : Output internal reconstruction. + --(no-)cpuid : Enable runtime CPU optimizations. [enabled] + --hash <string> : Decoded picture hash [checksum] - none: 0 bytes - checksum: 18 bytes - md5: 56 bytes - --no-psnr : Don't calculate PSNR for frames - --no-info : Don't add encoder info SEI. + --(no-)psnr : Calculate PSNR for frames. [enabled] + --(no-)info : Add encoder info SEI. [enabled] + --crypto <string> : Selective encryption. Crypto support must be + enabled at compile-time. Can be 'on' or 'off' or + a list of features separated with a '+'. [off] + - on: Enable all encryption features. + - off: Disable selective encryption. + - mvs: Motion vector magnitudes. + - mv_signs: Motion vector signs. 
+ - trans_coeffs: Coefficient magnitudes. + - trans_coeff_signs: Coefficient signs. + - intra_pred_modes: Intra prediction modes. + --key <string> : Encryption key [16,213,27,56,255,127,242,112, + 97,126,197,204,25,59,38,30] Video structure: - -q, --qp <integer> : Quantization Parameter [32] - -p, --period <integer> : Period of intra pictures [0] - - 0: only first picture is intra - - 1: all pictures are intra - - 2-N: every Nth picture is intra - --vps-period <integer> : Specify how often the video parameter set is - re-sent. [0] - - 0: only send VPS with the first frame - - N: send VPS with every Nth intra frame - -r, --ref <integer> : Reference frames, range 1..15 [3] - --gop <string> : Definition of GOP structure [0] - - 0: disabled + -q, --qp <integer> : Quantization parameter [22] + -p, --period <integer> : Period of intra pictures [64] + - 0: Only first picture is intra. + - 1: All pictures are intra. + - N: Every Nth picture is intra. + --vps-period <integer> : How often the video parameter set is re-sent [0] + - 0: Only send VPS with the first frame. + - N: Send VPS with every Nth intra frame. + -r, --ref <integer> : Number of reference frames, in range 1..15 [4] + --gop <string> : GOP structure [8] + - 0: Disabled - 8: B-frame pyramid of length 8 - - lp-<string>: lp-gop definition - (e.g. lp-g8d4t2, see README) - --cqmfile <string> : Custom Quantization Matrices from a file - --bitrate <integer> : Target bitrate. [0] - - 0: disable rate-control - - N: target N bits per second - --lossless : Use lossless coding - --mv-constraint : Constrain movement vectors - - none: no constraint - - frametile: constrain within the tile - - frametilemargin: constrain even more - --roi <string> : Use a delta QP map for region of interest - Read an array of delta QP values from - a file, where the first two values are the - width and height, followed by width*height - delta QP values in raster order. - The delta QP map can be any size or aspect - ratio, and will be mapped to LCU's. - --(no-)erp-aqp : Use adaptive QP for 360 video with - equirectangular projection + - lp-<string>: Low-delay P-frame GOP + (e.g. lp-g8d4t2, see README) + --(no-)open-gop : Use open GOP configuration. [enabled] + --cqmfile <filename> : Read custom quantization matrices from a file. + --scaling-list <string>: Set scaling list mode. [off] + - off: Disable scaling lists. + - custom: use custom list (with --cqmfile). + - default: Use default lists. + --bitrate <integer> : Target bitrate [0] + - 0: Disable rate control. + - N: Target N bits per second. + --(no-)lossless : Use lossless coding. [disabled] + --mv-constraint <string> : Constrain movement vectors. [none] + - none: No constraint + - frametile: Constrain within the tile. + - frametilemargin: Constrain even more. + --roi <filename> : Use a delta QP map for region of interest. + Reads an array of delta QP values from a text + file. The file format is: width and height of + the QP delta map followed by width*height delta + QP values in raster order. The map can be of any + size and will be scaled to the video size. + --set-qp-in-cu : Set QP at CU level keeping pic_init_qp_minus26. + in PPS and slice_qp_delta in slize header zero. + --(no-)erp-aqp : Use adaptive QP for 360 degree video with + equirectangular projection. [disabled] + --level <number> : Use the given HEVC level in the output and give + an error if level limits are exceeded. 
[6.2] + - 1, 2, 2.1, 3, 3.1, 4, 4.1, 5, 5.1, 5.2, 6, + 6.1, 6.2 + --force-level <number> : Same as --level but warnings instead of errors. + --high-tier : Used with --level. Use high tier bitrate limits + instead of the main tier limits during encoding. + High tier requires level 4 or higher. Compression tools: - --deblock [<beta:tc>] : Deblocking - - beta: between -6 and 6 - - tc: between -6 and 6 - --(no-)sao : Sample Adaptive Offset - --(no-)rdoq : Rate-Distortion Optimized Quantization - --(no-)signhide : Sign Hiding - --(no-)smp : Symmetric Motion Partition - --(no-)amp : Asymmetric Motion Partition - --rd <integer> : Intra mode search complexity - - 0: skip intra if inter is good enough - - 1: rough intra mode search with SATD - - 2: refine intra mode search with SSE - --(no-)mv-rdo : Rate-Distortion Optimized motion vector costs - --(no-)full-intra-search - : Try all intra modes during rough search. - --(no-)transform-skip : Transform skip - --me <string> : Integer motion estimation + --(no-)deblock <beta:tc> : Deblocking filter. [0:0] + - beta: Between -6 and 6 + - tc: Between -6 and 6 + --sao <string> : Sample Adaptive Offset [full] + - off: SAO disabled + - band: Band offset only + - edge: Edge offset only + - full: Full SAO + --(no-)rdoq : Rate-distortion optimized quantization [enabled] + --(no-)rdoq-skip : Skip RDOQ for 4x4 blocks. [disabled] + --(no-)signhide : Sign hiding [disabled] + --(no-)smp : Symmetric motion partition [disabled] + --(no-)amp : Asymmetric motion partition [disabled] + --rd <integer> : Intra mode search complexity [0] + - 0: Skip intra if inter is good enough. + - 1: Rough intra mode search with SATD. + - 2: Refine intra mode search with SSE. + - 3: Try all intra modes and enable intra + chroma mode search. + --(no-)mv-rdo : Rate-distortion optimized motion vector costs + [disabled] + --(no-)full-intra-search : Try all intra modes during rough search. + [disabled] + --(no-)transform-skip : Try transform skip [disabled] + --me <string> : Integer motion estimation algorithm [hexbs] - hexbs: Hexagon Based Search - tz: Test Zone Search - full: Full Search - full8, full16, full32, full64 - --subme <integer> : Set fractional pixel motion estimation level - - 0: only integer motion estimation + - dia: Diamond Search + --me-steps <integer> : Motion estimation search step limit. Only + affects 'hexbs' and 'dia'. [-1] + --subme <integer> : Fractional pixel motion estimation level [4] + - 0: Integer motion estimation only - 1: + 1/2-pixel horizontal and vertical - 2: + 1/2-pixel diagonal - 3: + 1/4-pixel horizontal and vertical - 4: + 1/4-pixel diagonal - --pu-depth-inter <int>-<int> - : Range for sizes for inter predictions + --pu-depth-inter <int>-<int> : Inter prediction units sizes [0-3] - 0, 1, 2, 3: from 64x64 to 8x8 - --pu-depth-intra <int>-<int> : Range for sizes for intra predictions + --pu-depth-intra <int>-<int> : Intra prediction units sizes [1-4] - 0, 1, 2, 3, 4: from 64x64 to 4x4 - --(no-)bipred : Bi-prediction - --(no-)cu-split-termination - : CU split search termination condition - - off: Never terminate cu-split search - - zero: Terminate with zero residual - --(no-)me-early-termination : ME early termination condition - - off: Don't terminate early - - on: Terminate early - - sensitive: Terminate even earlier - --(no-)implicit-rdpcm : Implicit residual DPCM - Currently only supported with lossless coding. 
- --(no-)tmvp : Temporal Motion Vector Prediction - --(no-)rdoq-skip : Skips RDOQ for 4x4 blocks + --tr-depth-intra <int> : Transform split depth for intra blocks [0] + --(no-)bipred : Bi-prediction [disabled] + --cu-split-termination <string> : CU split search termination [zero] + - off: Don't terminate early. + - zero: Terminate when residual is zero. + --me-early-termination <string> : Motion estimation termination [on] + - off: Don't terminate early. + - on: Terminate early. + - sensitive: Terminate even earlier. + --fast-residual-cost <int> : Skip CABAC cost for residual coefficients + when QP is below the limit. [0] + --(no-)intra-rdo-et : Check intra modes in rdo stage only until + a zero coefficient CU is found. [disabled] + --(no-)early-skip : Try to find skip cu from merge candidates. + Perform no further search if skip is found. + For rd=0..1: Try the first candidate. + For rd=2.. : Try the best candidate based + on luma satd cost. [enabled] + --max-merge <integer> : Maximum number of merge candidates, 1..5 [5] + --(no-)implicit-rdpcm : Implicit residual DPCM. Currently only supported + with lossless coding. [disabled] + --(no-)tmvp : Temporal motion vector prediction [enabled] Parallel processing: --threads <integer> : Number of threads to use [auto] - - 0: process everything with main thread - - N: use N threads for encoding - - auto: select based on number of cores - --owf <integer> : Frame parallelism [auto] - - N: Process N-1 frames at a time - - auto: Select automatically - --(no-)wpp : Wavefront parallel processing [enabled] + - 0: Process everything with main thread. + - N: Use N threads for encoding. + - auto: Select automatically. + --owf <integer> : Frame-level parallelism [auto] + - N: Process N+1 frames at a time. + - auto: Select automatically. + --(no-)wpp : Wavefront parallel processing. [enabled] Enabling tiles automatically disables WPP. To enable WPP with tiles, re-enable it after - enabling tiles. + enabling tiles. Enabling wpp with tiles is, + however, an experimental feature since it is + not supported in any HEVC profile. --tiles <int>x<int> : Split picture into width x height uniform tiles. --tiles-width-split <string>|u<int> : - Specifies a comma separated list of pixel - positions of tiles columns separation coordinates. - Can also be u followed by and a single int n, - in which case it produces columns of uniform width. + - <string>: A comma-separated list of tile + column pixel coordinates. + - u<int>: Number of tile columns of uniform + width. --tiles-height-split <string>|u<int> : - Specifies a comma separated list of pixel - positions of tiles rows separation coordinates. - Can also be u followed by and a single int n, - in which case it produces rows of uniform height. - --slices <string> : Control how slices are used - - tiles: put tiles in independent slices - - wpp: put rows in dependent slices - - tiles+wpp: do both + - <string>: A comma-separated list of tile row + column pixel coordinates. + - u<int>: Number of tile rows of uniform + height. + --slices <string> : Control how slices are used. + - tiles: Put tiles in independent slices. + - wpp: Put rows in dependent slices. + - tiles+wpp: Do both. 
Video Usability Information: - --sar <width:height> : Specify Sample Aspect Ratio + --sar <width:height> : Specify sample aspect ratio --overscan <string> : Specify crop overscan setting [undef] - undef, show, crop --videoformat <string> : Specify video format [undef] - - component, pal, ntsc, secam, mac, undef + - undef, component, pal, ntsc, secam, mac --range <string> : Specify color range [tv] - tv, pc --colorprim <string> : Specify color primaries [undef] @@ -200,8 +269,8 @@ --chromaloc <integer> : Specify chroma sample location (0 to 5) [0] Deprecated parameters: (might be removed at some point) - -w, --width : Use --input-res - -h, --height : Use --input-res + -w, --width <integer> : Use --input-res. + -h, --height <integer> : Use --input-res. ``` [comment]: # (END KVAZAAR HELP MESSAGE) @@ -230,24 +299,30 @@ | | 0-uf | 1-sf | 2-vf | 3-fr | 4-f | 5-m | 6-s | 7-sr | 8-vs | 9-p | | -------------------- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | -| rd | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | -| pu-depth-intra | 2-3 | 2-3 | 2-3 | 2-3 | 2-3 | 1-3 | 1-3 | 1-3 | 1-4 | 1-4 | -| pu-depth-inter | 2-3 | 2-3 | 2-3 | 1-3 | 1-3 | 1-3 | 1-3 | 0-3 | 0-3 | 0-3 | +| rd | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | +| pu-depth-intra | 2-3 | 2-3 | 2-3 | 2-3 | 1-3 | 1-4 | 1-4 | 1-4 | 1-4 | 1-4 | +| pu-depth-inter | 2-3 | 2-3 | 1-3 | 1-3 | 1-3 | 0-3 | 0-3 | 0-3 | 0-3 | 0-3 | | me | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | tz | -| ref | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 2 | 3 | 4 | +| gop | g4d4t1| g4d4t1| g4d4t1| g4d4t1| g4d4t1| 8 | 8 | 8 | 8 | 8 | +| ref | 1 | 1 | 1 | 1 | 2 | 4 | 4 | 4 | 4 | 4 | +| bipred | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | | deblock | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | -| signhide | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | -| subme | 0 | 0 | 2 | 2 | 4 | 4 | 4 | 4 | 4 | 4 | -| sao | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | +| signhide | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | +| subme | 2 | 2 | 2 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | +| sao | off | full | full | full | full | full | full | full | full | full | | rdoq | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | -| rdoq-skip | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | +| rdoq-skip | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | | transform-skip | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | mv-rdo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | full-intra-search | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| smp | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| smp | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | | amp | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | cu-split-termination | zero | zero | zero | zero | zero | zero | zero | zero | zero | off | -| me-early-termination | sens. | sens. | sens. | sens. | on | on | on | on | on | off | +| me-early-termination | sens. | sens. | sens. | sens. | sens. | on | on | off | off | off | +| intra-rdo-et | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | +| early-skip | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | +| fast-residual-cost | 28 | 28 | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | +| max-merge | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | ## Kvazaar library @@ -268,16 +343,6 @@ possible. -### Required libraries -- For Visual Studio, the pthreads-w32 library is required. Platforms - with native POSIX thread support don't need anything. - - The project file expects the library to be in ../pthreads.2/ - relative to Kvazaar. You can just extract the pre-built library - there. - - The executable needs pthreadVC2.dll to be present. 
Either install it - somewhere or ship it with the executable. - - ### Autotools Depending on the platform, some additional tools are required for compiling Kvazaar with autotools. For Ubuntu, the required packages are `automake autoconf libtool m4 build-essential yasm`. Yasm is @@ -300,7 +365,7 @@ ### Visual Studio -- At least VisualStudio 2013 is required. +- At least VisualStudio 2015 is required. - Project files can be found under build/. - Requires external [vsyasm.exe](http://yasm.tortall.net/Download.html) in %PATH% @@ -314,18 +379,37 @@ ### Visualization (Windows only) -Branch `visualizer` has a visual studio project, which can be compiled to enable visualization feature in Kvazaar. +Compiling `kvazaar_cli` project in the `visualizer` branch results in a Kvazaar executable with visualization enabled. Additional Requirements: [`SDL2`](https://www.libsdl.org/download-2.0.php), [`SDL2-ttf`](https://www.libsdl.org/projects/SDL_ttf/). -Directory `visualizer_extras` is expected to be found from the same directory level as the kvazaar project directory. Inside should be directories `include` and `lib` found from the development library zip packages. +Directory `visualizer_extras` has to be added into the same directory level as the kvazaar project directory. Inside should be directories `include` and `lib` found from the development library zip packages. -`SDL2.dll`, `SDL2_ttf.dll`, `libfreetype-6.dll`, `zlib1.dll`, and `pthreadVC2.dll` should be placed in the working directory (i.e. the folder the `kvazaar.exe` is in after compiling the `kvazaar_cli` project/solution) when running the visualizer. The required `.dll` can be found in the aforementioned `lib`-folder (`lib\x64`) and the dll folder inside the pthreads folder (see `Required libraries`). +`SDL2.dll`, `SDL2_ttf.dll`, `libfreetype-6.dll`, and `zlib1.dll` should be placed in the working directory (i.e. the folder the `kvazaar.exe` is in after compiling the `kvazaar_cli` project/solution) when running the visualizer. The required `.dll` can be found in the aforementioned `lib`-folder (`lib\x64`). Note: The solution should be compiled on the x64 platform in visual studio. Optional font file `arial.ttf` is to be placed in the working directory, if block info tool is used. +## Paper + +Please cite [this paper](https://dl.acm.org/citation.cfm?doid=2964284.2973796) for Kvazaar: + +```M. Viitanen, A. Koivula, A. Lemmetti, A. Ylä-Outinen, J. Vanne, and T. D. Hämäläinen, “Kvazaar: open-source HEVC/H.265 encoder,” in Proc. ACM Int. Conf. Multimedia, Amsterdam, The Netherlands, Oct. 2016.``` + +Or in BibTex: + +``` +@inproceedings{Kvazaar2016, + author = {Viitanen, Marko and Koivula, Ari and Lemmetti, Ari and Yl\"{a}-Outinen, Arttu and Vanne, Jarno and H\"{a}m\"{a}l\"{a}inen, Timo D.}, + title = {Kvazaar: Open-Source HEVC/H.265 Encoder}, + booktitle = {Proceedings of the 24th ACM International Conference on Multimedia}, + year = {2016}, + isbn = {978-1-4503-3603-1}, + location = {Amsterdam, The Netherlands}, + url = {http://doi.acm.org/10.1145/2964284.2973796}, +} +``` ## Contributing to Kvazaar We are happy to look at pull requests in Github. There is still lots of work to be done. @@ -353,7 +437,7 @@ - Uninitialized variables and such are checked with Valgrind. - Bitstream validity is checked with HM. - Compilation is checked on GCC and Clang on Linux, and Clang on OSX. -- Windows msys2 build is checked automatically on Appveyor. +- Windows msys2 and msvc builds are checked automatically on Appveyor. 
- If your changes change the bitstream, decode with HM to check that it doesn't throw checksum errors or asserts. - If your changes shouldn't alter the bitstream, check that they don't. @@ -377,7 +461,7 @@ ### Code style We try to follow the following conventions: -- C99 without features not supported by Visual Studio 2013 (VLAs). +- C99 without features not supported by Visual Studio 2015 (VLAs). - // comments allowed and encouraged. - Follow overall conventions already established in the code. - Indent by 2 spaces. (no tabs)
kvazaar-1.2.0.tar.gz/appveyor.yml -> kvazaar-1.3.0.tar.gz/appveyor.yml
Changed
@@ -1,28 +1,85 @@ +# Only the whitelisted branches get built, regardless of build config branches: only: - master - - appveyor +# Email the author if their commit either failed to build or fixed a failed build +# good -> bad, bad -> bad, bad -> good but not good -> good +notifications: + - provider: Email + to: + - '{{commitAuthorEmail}}' + on_build_success: false + on_build_failure: true + on_build_status_changed: true + +# Skip commits that don't affect the code / compiling the code +skip_commits: + files: + - .gitignore + - .gitlab-ci.yml + - .travis-install.bash + - .travis.yml + - COPYING + - CREDITS + - README.md + - docs.doxy + +# Download only a zip file of the latest commit +# Downloading the whole history of the repository would be unnecessary +shallow_clone: true + +# Only try building the app, don't run any tests +test: off + +# Don't bother with debug builds +configuration: + - Release + +# Build with multiple compilers / build suites +image: Visual Studio 2015 environment: matrix: - - MSYSTEM: MINGW64 + - platform: Win32 + - platform: x64 - MSYSTEM: MINGW32 + - MSYSTEM: MINGW64 -shallow_clone: true -test: off +for: +- + # MinGW builds need all kinds of build scripts + matrix: + only: + - MSYSTEM: MINGW32 + - MSYSTEM: MINGW64 + + install: + # Update core packages + - C:\msys64\usr\bin\pacman -Syyuu --noconfirm --noprogressbar + # Update non-core packages + - C:\msys64\usr\bin\pacman -Suu --noconfirm --noprogressbar + # Install required MSYS2 packages + - C:\msys64\usr\bin\pacman -S --noconfirm --noprogressbar --needed automake-wrapper make + # Now MSYS2 is up to date, do the rest of the install from a bash script + - C:\msys64\usr\bin\bash -lc "cd \"$APPVEYOR_BUILD_FOLDER\" && exec ./tools/appveyor-install.sh" + + build_script: + - C:\msys64\usr\bin\bash -lc "cd \"$APPVEYOR_BUILD_FOLDER\" && exec ./tools/appveyor-build.sh" + + cache: + - C:\msys64\var\cache\pacman\pkg +- + # MSVC builds only need vsyasm and the solution file + matrix: + except: + - MSYSTEM: MINGW32 + - MSYSTEM: MINGW64 + + install: + - ps: $url = "http://ultravideo.cs.tut.fi/vsyasm.exe" + - ps: $output = "C:\Tools\vsyasm.exe" + - ps: "(New-Object System.Net.WebClient).DownloadFile($url, $output)" + - ps: '$env:Path += ";$output\.."' -install: - # Update core packages - - C:\msys64\usr\bin\pacman -Syyuu --noconfirm --noprogressbar - # Update non-core packages - - C:\msys64\usr\bin\pacman -Suu --noconfirm --noprogressbar - # Install required MSYS2 packages - - C:\msys64\usr\bin\pacman -S --noconfirm --noprogressbar --needed automake-wrapper make - # Now MSYS2 is up to date, do the rest of the install from a bash script - - C:\msys64\usr\bin\bash -lc "cd \"$APPVEYOR_BUILD_FOLDER\" && exec ./tools/appveyor-install.sh" - -build_script: - - C:\msys64\usr\bin\bash -lc "cd \"$APPVEYOR_BUILD_FOLDER\" && exec ./tools/appveyor-build.sh" - -cache: - - C:\msys64\var\cache\pacman\pkg + build: + project: .\build\kvazaar_VS2015.sln
kvazaar-1.2.0.tar.gz/autogen.sh -> kvazaar-1.3.0.tar.gz/autogen.sh
Changed
@@ -1,5 +1,4 @@ #!/bin/sh -git submodule init -git submodule update +git submodule update --init --depth 1 autoreconf -if
kvazaar-1.2.0.tar.gz/build/C_Properties.props -> kvazaar-1.3.0.tar.gz/build/C_Properties.props
Changed
@@ -13,7 +13,7 @@ <AssemblerOutput>AssemblyAndSourceCode</AssemblerOutput> <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> <PreprocessorDefinitions>KVZ_DLL_EXPORTS;KVZ_COMPILE_ASM;WIN32_LEAN_AND_MEAN;WIN32;WIN64;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions> - <AdditionalIncludeDirectories>$(SolutionDir)..\..\pthreads.2\include;$(SolutionDir)..\src;$(SolutionDir)..\src\extras;$(SolutionDir)..\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <AdditionalIncludeDirectories>$(SolutionDir)..\src\threadwrapper\include;$(SolutionDir)..\src;$(SolutionDir)..\src\extras;$(SolutionDir)..\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> <DisableSpecificWarnings>4244;4204;4206;4028;4152;4996;4018;4456;4389;4100;4131;4459;4706;4214;4127;4201</DisableSpecificWarnings> <OpenMPSupport>false</OpenMPSupport> <TreatSpecificWarningsAsErrors>4013;4029;4047;4716;4700;4020;4021;4133</TreatSpecificWarningsAsErrors>
kvazaar-1.3.0.tar.gz/build/kvazaar_VS2015.sln
Added
@@ -0,0 +1,55 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 14 +VisualStudioVersion = 12.0.30723.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_lib", "kvazaar_lib\kvazaar_lib.vcxproj", "{EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{50AB7A17-4885-4D20-BF01-376DE4417FCD}" + ProjectSection(SolutionItems) = preProject + kvazaar_VS2010.vsd = kvazaar_VS2010.vsd + kvazaar_VS2010.vsmdi = kvazaar_VS2010.vsmdi + Local.testsettings = Local.testsettings + TraceAndTestImpact.testsettings = TraceAndTestImpact.testsettings + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_tests", "kvazaar_tests\kvazaar_tests.vcxproj", "{3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_cli", "kvazaar_cli\kvazaar_cli.vcxproj", "{C755308D-9B3E-4712-99AB-7F6F4E2DA567}" + ProjectSection(ProjectDependencies) = postProject + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF} = {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF} + EndProjectSection +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Debug|x64 = Debug|x64 + Release|Win32 = Release|Win32 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|Win32.ActiveCfg = Debug|Win32 + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|Win32.Build.0 = Debug|Win32 + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|x64.ActiveCfg = Debug|x64 + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|x64.Build.0 = Debug|x64 + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|Win32.ActiveCfg = Release|Win32 + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|Win32.Build.0 = Release|Win32 + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|x64.ActiveCfg = Release|x64 + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|x64.Build.0 = Release|x64 + {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Debug|Win32.ActiveCfg = Debug|Win32 + {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Debug|x64.ActiveCfg = Debug|x64 + {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Release|Win32.ActiveCfg = Release|Win32 + {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Release|x64.ActiveCfg = Release|x64 + {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|Win32.ActiveCfg = Debug|Win32 + {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|Win32.Build.0 = Debug|Win32 + {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|x64.ActiveCfg = Debug|x64 + {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|x64.Build.0 = Debug|x64 + {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|Win32.ActiveCfg = Release|Win32 + {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|Win32.Build.0 = Release|Win32 + {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|x64.ActiveCfg = Release|x64 + {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal
kvazaar-1.2.0.tar.gz/build/kvazaar_cli/kvazaar_cli.vcxproj -> kvazaar-1.3.0.tar.gz/build/kvazaar_cli/kvazaar_cli.vcxproj
Changed
@@ -22,23 +22,24 @@ <ProjectGuid>{C755308D-9B3E-4712-99AB-7F6F4E2DA567}</ProjectGuid> <Keyword>Win32Proj</Keyword> <RootNamespace>kvazaar_cli</RootNamespace> + <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion> </PropertyGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <ImportGroup Label="ExtensionSettings">
kvazaar-1.2.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj -> kvazaar-1.3.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj
Changed
@@ -22,27 +22,28 @@ <ProjectGuid>{EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}</ProjectGuid> <Keyword>Win32Proj</Keyword> <RootNamespace>kvazaar_lib</RootNamespace> + <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion> </PropertyGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> <ConfigurationType>StaticLibrary</ConfigurationType> <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> <ConfigurationType>StaticLibrary</ConfigurationType> <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> <ConfigurationType>StaticLibrary</ConfigurationType> <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> <ConfigurationType>StaticLibrary</ConfigurationType> <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <ImportGroup Label="ExtensionSettings"> @@ -78,20 +79,26 @@ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <YASM /> <Lib> - <AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x64</AdditionalLibraryDirectories> - <AdditionalDependencies>pthreadVC2.lib</AdditionalDependencies> + <AdditionalLibraryDirectories> + </AdditionalLibraryDirectories> + <AdditionalDependencies> + </AdditionalDependencies> </Lib> <YASM> <Defines>ARCH_X86_64=1;%(Defines)</Defines> + <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths> </YASM> </ItemDefinitionGroup> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <YASM> <Defines>ARCH_X86_64=0;PREFIX</Defines> + <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths> </YASM> <Lib> - <AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x86</AdditionalLibraryDirectories> - <AdditionalDependencies>pthreadVC2.lib</AdditionalDependencies> + <AdditionalLibraryDirectories> + </AdditionalLibraryDirectories> + <AdditionalDependencies> + </AdditionalDependencies> </Lib> <ClCompile> <UndefinePreprocessorDefinitions> @@ -101,10 +108,13 @@ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <YASM> <Defines>ARCH_X86_64=0;PREFIX</Defines> + <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths> </YASM> <Lib> - <AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x86</AdditionalLibraryDirectories> - <AdditionalDependencies>pthreadVC2.lib</AdditionalDependencies> + <AdditionalLibraryDirectories> + </AdditionalLibraryDirectories> + <AdditionalDependencies> + </AdditionalDependencies> </Lib> <ClCompile> <UndefinePreprocessorDefinitions> @@ -114,10 +124,13 @@ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <YASM> 
<Defines>ARCH_X86_64=1;%(Defines)</Defines> + <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths> </YASM> <Lib> - <AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x64</AdditionalLibraryDirectories> - <AdditionalDependencies>pthreadVC2.lib</AdditionalDependencies> + <AdditionalLibraryDirectories> + </AdditionalLibraryDirectories> + <AdditionalDependencies> + </AdditionalDependencies> </Lib> <ClCompile> <UndefinePreprocessorDefinitions> @@ -154,6 +167,12 @@ <ClCompile Include="..\..\src\search.c" /> <ClCompile Include="..\..\src\search_inter.c" /> <ClCompile Include="..\..\src\search_intra.c" /> + <ClCompile Include="..\..\src\strategies\avx2\encode_coding_tree-avx2.c"> + <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> + <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> + <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> + <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> + </ClCompile> <ClCompile Include="..\..\src\strategies\avx2\intra-avx2.c"> <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> @@ -172,9 +191,11 @@ <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> </ClCompile> + <ClCompile Include="..\..\src\strategies\generic\encode_coding_tree-generic.c" /> <ClCompile Include="..\..\src\strategies\generic\intra-generic.c" /> <ClCompile Include="..\..\src\strategies\generic\quant-generic.c" /> <ClCompile Include="..\..\src\strategies\generic\sao-generic.c" /> + <ClCompile Include="..\..\src\strategies\strategies-encode.c" /> <ClCompile Include="..\..\src\strategies\strategies-intra.c" /> <ClCompile Include="..\..\src\strategies\strategies-quant.c" /> <ClInclude Include="..\..\src\checkpoint.h" /> @@ -214,6 +235,18 @@ <ClCompile Include="..\..\src\strategies\strategies-picture.c" /> <ClCompile Include="..\..\src\strategies\strategies-sao.c" /> <ClCompile Include="..\..\src\strategies\x86_asm\picture-x86-asm.c" /> + <ClCompile Include="..\..\src\threadwrapper\src\pthread.cpp"> + <CompileAs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">CompileAsCpp</CompileAs> + <CompileAs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">CompileAsCpp</CompileAs> + <CompileAs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CompileAsCpp</CompileAs> + <CompileAs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CompileAsCpp</CompileAs> + </ClCompile> + <ClCompile Include="..\..\src\threadwrapper\src\semaphore.cpp"> + <CompileAs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">CompileAsCpp</CompileAs> + <CompileAs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">CompileAsCpp</CompileAs> + <CompileAs 
Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CompileAsCpp</CompileAs> + <CompileAs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CompileAsCpp</CompileAs> + </ClCompile> <ClCompile Include="..\..\src\videoframe.c" /> <ClInclude Include="..\..\src\encoder_state-bitstream.h" /> <ClInclude Include="..\..\src\encoder_state-ctors_dtors.h" /> @@ -228,13 +261,19 @@ <ClInclude Include="..\..\src\kvz_math.h" /> <ClInclude Include="..\..\src\search_inter.h" /> <ClInclude Include="..\..\src\search_intra.h" /> + <ClInclude Include="..\..\src\strategies\avx2\avx2_common_functions.h" /> + <ClInclude Include="..\..\src\strategies\avx2\encode_coding_tree-avx2.h" /> <ClInclude Include="..\..\src\strategies\avx2\intra-avx2.h" /> + <ClInclude Include="..\..\src\strategies\avx2\reg_sad_pow2_widths-avx2.h" /> <ClInclude Include="..\..\src\strategies\avx2\sao-avx2.h" /> + <ClInclude Include="..\..\src\strategies\generic\encode_coding_tree-generic.h" /> <ClInclude Include="..\..\src\strategies\generic\intra-generic.h" /> <ClInclude Include="..\..\src\strategies\generic\sao-generic.h" /> + <ClInclude Include="..\..\src\strategies\sse41\reg_sad_pow2_widths-sse41.h" /> <ClInclude Include="..\..\src\strategies\strategies-common.h" /> <ClInclude Include="..\..\src\strategies\avx2\quant-avx2.h" /> <ClInclude Include="..\..\src\strategies\generic\quant-generic.h" /> + <ClInclude Include="..\..\src\strategies\strategies-encode.h" /> <ClInclude Include="..\..\src\strategies\strategies-intra.h" /> <ClInclude Include="..\..\src\strategies\strategies-quant.h" /> </ItemGroup> @@ -279,6 +318,8 @@ <ClInclude Include="..\..\src\tables.h" /> <ClInclude Include="..\..\src\threadqueue.h" /> <ClInclude Include="..\..\src\threads.h" /> + <ClInclude Include="..\..\src\threadwrapper\include\pthread.h" /> + <ClInclude Include="..\..\src\threadwrapper\include\semaphore.h" /> <ClInclude Include="..\..\src\transform.h" /> <ClInclude Include="..\..\src\videoframe.h" /> </ItemGroup> @@ -296,4 +337,4 @@ <ImportGroup Label="ExtensionTargets"> <Import Project="..\yasm\vsyasm.targets" /> </ImportGroup> -</Project> +</Project> \ No newline at end of file
kvazaar-1.2.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters -> kvazaar-1.3.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
Changed
@@ -49,6 +49,9 @@ <Filter Include="Threading"> <UniqueIdentifier>{63c21cb2-b379-4d38-bcb8-173786c2466d}</UniqueIdentifier> </Filter> + <Filter Include="Threadwrapper"> + <UniqueIdentifier>{f4abece9-e209-4817-a57e-c64ca7c5e05c}</UniqueIdentifier> + </Filter> </ItemGroup> <ItemGroup> <ClCompile Include="..\..\src\strategies\strategies-nal.c"> @@ -221,6 +224,21 @@ </ClCompile> <ClCompile Include="..\..\src\extras\libmd5.c" /> <ClCompile Include="..\..\src\extras\crypto.cpp" /> + <ClCompile Include="..\..\src\strategies\avx2\encode_coding_tree-avx2.c"> + <Filter>Optimization\strategies\avx2</Filter> + </ClCompile> + <ClCompile Include="..\..\src\strategies\generic\encode_coding_tree-generic.c"> + <Filter>Optimization\strategies\generic</Filter> + </ClCompile> + <ClCompile Include="..\..\src\strategies\strategies-encode.c"> + <Filter>Optimization\strategies</Filter> + </ClCompile> + <ClCompile Include="..\..\src\threadwrapper\src\pthread.cpp"> + <Filter>Threadwrapper</Filter> + </ClCompile> + <ClCompile Include="..\..\src\threadwrapper\src\semaphore.cpp"> + <Filter>Threadwrapper</Filter> + </ClCompile> </ItemGroup> <ItemGroup> <ClInclude Include="..\..\src\bitstream.h"> @@ -411,6 +429,30 @@ </ClInclude> <ClInclude Include="..\..\src\extras\libmd5.h" /> <ClInclude Include="..\..\src\extras\crypto.h" /> + <ClInclude Include="..\..\src\strategies\avx2\encode_coding_tree-avx2.h"> + <Filter>Optimization\strategies\avx2</Filter> + </ClInclude> + <ClInclude Include="..\..\src\strategies\generic\encode_coding_tree-generic.h"> + <Filter>Optimization\strategies\generic</Filter> + </ClInclude> + <ClInclude Include="..\..\src\strategies\avx2\avx2_common_functions.h"> + <Filter>Optimization\strategies\avx2</Filter> + </ClInclude> + <ClInclude Include="..\..\src\strategies\strategies-encode.h"> + <Filter>Optimization\strategies</Filter> + </ClInclude> + <ClInclude Include="..\..\src\strategies\avx2\reg_sad_pow2_widths-avx2.h"> + <Filter>Optimization\strategies\avx2</Filter> + </ClInclude> + <ClInclude Include="..\..\src\strategies\sse41\reg_sad_pow2_widths-sse41.h"> + <Filter>Optimization\strategies\sse41</Filter> + </ClInclude> + <ClInclude Include="..\..\src\threadwrapper\include\pthread.h"> + <Filter>Threadwrapper</Filter> + </ClInclude> + <ClInclude Include="..\..\src\threadwrapper\include\semaphore.h"> + <Filter>Threadwrapper</Filter> + </ClInclude> </ItemGroup> <ItemGroup> <YASM Include="..\..\src\extras\x86inc.asm"> @@ -423,4 +465,4 @@ <Filter>Optimization\strategies\x86_asm</Filter> </YASM> </ItemGroup> -</Project> +</Project> \ No newline at end of file
kvazaar-1.2.0.tar.gz/build/kvazaar_tests/kvazaar_tests.vcxproj -> kvazaar-1.3.0.tar.gz/build/kvazaar_tests/kvazaar_tests.vcxproj
Changed
@@ -22,23 +22,24 @@ <ProjectGuid>{3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}</ProjectGuid> <Keyword>Win32Proj</Keyword> <RootNamespace>kvazaar_tests</RootNamespace> + <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion> </PropertyGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <ImportGroup Label="ExtensionSettings"> @@ -115,4 +116,4 @@ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <ImportGroup Label="ExtensionTargets"> </ImportGroup> -</Project> +</Project> \ No newline at end of file
kvazaar-1.2.0.tar.gz/build/yasm/vsyasm.targets -> kvazaar-1.3.0.tar.gz/build/yasm/vsyasm.targets
Changed
@@ -20,7 +20,7 @@ AfterTargets="$(YASMAfterTargets)" Condition="'@(YASM)' != ''" DependsOnTargets="$(YASMDependsOn);ComputeYASMOutput" - Outputs="@(YASM->'%(ObjectFile)')" + Outputs="@(YASM->'$(ProjectDir)$(IntDir)%(Filename).obj')" Inputs="@(YASM);%(YASM.AdditionalDependencies);$(MSBuildProjectFile)"> <ItemGroup Condition="'@(SelectedFiles)' != ''"> @@ -32,7 +32,7 @@ <YASM_tlog Include="%(YASM.ObjectFile)" Condition="'%(YASM.ObjectFile)' != '' and '%(YASM.ExcludedFromBuild)' != 'true'"> - <Source>@(YASM, '|')</Source> + <Source>@(YASM->'%(FullPath)', '|')</Source> </YASM_tlog> </ItemGroup> <Message @@ -40,8 +40,9 @@ Text="%(YASM.ExecutionDescription)" /> <WriteLinesToFile Condition="'@(YASM_tlog)' != '' and '%(YASM_tlog.ExcludedFromBuild)' != 'true'" - File="$(IntDir)$(ProjectName).write.1.tlog" - Lines="^%(YASM_tlog.Source);@(YASM_tlog->'%(Fullpath)')" /> + File="$(TLogLocation)$(ProjectName).write.1.tlog" + Lines="^%(YASM_tlog.Source);@(YASM->'$(ProjectDir)$(IntDir)%(Filename).obj')" + Encoding="Unicode" /> <YASM Condition="'@(YASM)' != '' and '%(YASM.ExcludedFromBuild)' != 'true'" CommandLineTemplate="%(YASM.CommandLineTemplate)"
View file
kvazaar-1.2.0.tar.gz/configure.ac -> kvazaar-1.3.0.tar.gz/configure.ac
Changed
@@ -23,7 +23,7 @@ # # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html ver_major=4 -ver_minor=0 +ver_minor=2 ver_release=0 # Prevents configure from adding a lot of defines to the CFLAGS @@ -45,15 +45,20 @@ LT_INIT([win32-dll]) +AX_CHECK_COMPILE_FLAG([-maltivec],[flag_altivec="true"]) AX_CHECK_COMPILE_FLAG([-mavx2], [flag_avx2="true"]) AX_CHECK_COMPILE_FLAG([-msse4.1], [flag_sse4_1="true"]) AX_CHECK_COMPILE_FLAG([-msse2], [flag_sse2="true"]) +AX_CHECK_COMPILE_FLAG([-mbmi], [flag_bmi="true"]) +AX_CHECK_COMPILE_FLAG([-mabm], [flag_abm="true"]) +AX_CHECK_COMPILE_FLAG([-mbmi2], [flag_bmi2="true"]) -AM_CONDITIONAL([HAVE_AVX2], [test x"$flag_avx2" = x"true"]) +AM_CONDITIONAL([HAVE_ALTIVEC], [test x"$flag_altivec" = x"true"]) +AM_CONDITIONAL([HAVE_AVX2], [test x"$flag_avx2" = x"true" -a x"$flag_bmi" = x"true" -a x"$flag_abm" = x"true" -a x"$flag_bmi2" = x"true"]) AM_CONDITIONAL([HAVE_SSE4_1], [test x"$flag_sse4_1" = x"true"]) AM_CONDITIONAL([HAVE_SSE2], [test x"$flag_sse2" = x"true"]) -KVZ_CFLAGS="-Wall -Wtype-limits -Wvla -I$srcdir/src -I$srcdir/src/extras -ftree-vectorize -fvisibility=hidden" +KVZ_CFLAGS="-Wall -Wextra -Wvla -Wno-sign-compare -Wno-unused-parameter -I$srcdir/src -I$srcdir/src/extras -ftree-vectorize -fvisibility=hidden" CFLAGS="$KVZ_CFLAGS $CFLAGS" AC_SEARCH_LIBS([log], [m c], [], [exit 1]) @@ -68,7 +73,10 @@ [AC_DEFINE([KVZ_SEL_ENCRYPTION], [1], [With cryptopp])], [PKG_CHECK_MODULES([cryptopp], [libcrypto++], [AC_DEFINE([KVZ_SEL_ENCRYPTION], [1], [With cryptopp])], - [AC_MSG_ERROR([neither cryptopp nor libcrypto++ found with pkg-config])] + [PKG_CHECK_MODULES([cryptopp], [libcryptopp], + [AC_DEFINE([KVZ_SEL_ENCRYPTION], [1], [With cryptopp])], + [AC_MSG_ERROR([neither cryptopp, libcrypto++ nor libcryptopp found with pkg-config])] + )] )] )] )
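Note: with this change HAVE_AVX2 is only defined when the compiler also accepts -mbmi, -mabm and -mbmi2 (the matching libavx2_la_CFLAGS change is in src/Makefile.am below), presumably because the AVX2 strategy code relies on instructions those flags unlock. As a hedged, stand-alone illustration (not kvazaar code) of what the extra flags gate at the intrinsic level:

    /* Builds only with something like: gcc -mavx2 -mbmi -mbmi2 -mabm demo.c */
    #include <immintrin.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned x = 0xF0F0u;
        unsigned lo_bit = _tzcnt_u32(x);           /* BMI1 intrinsic: 4 */
        unsigned packed = _pext_u32(x, 0xFF00u);   /* BMI2 intrinsic: 0xF0 */
        __m256i v = _mm256_add_epi32(_mm256_set1_epi32((int)packed),
                                     _mm256_set1_epi32((int)lo_bit)); /* AVX2 */
        printf("%d\n", _mm256_extract_epi32(v, 0)); /* 0xF0 + 4 = 244 */
        return 0;
    }

Without the -mbmi/-mbmi2 flags the _tzcnt_u32/_pext_u32 calls are rejected at compile time, which is why configure now requires all of the flags before enabling the AVX2 objects.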
View file
kvazaar-1.2.0.tar.gz/doc/kvazaar.1 -> kvazaar-1.3.0.tar.gz/doc/kvazaar.1
Changed
@@ -1,24 +1,24 @@ -.TH KVAZAAR "1" "November 2017" "kvazaar v1.2.0" "User Commands" +.TH KVAZAAR "1" "July 2019" "kvazaar v1.3.0" "User Commands" .SH NAME kvazaar \- open source HEVC encoder .SH SYNOPSIS \fBkvazaar \fR\-i <input> \-\-input\-res <width>x<height> \-o <output> .SH DESCRIPTION .TP -\fB\-i\fR, \fB\-\-input +\fB\-i\fR, \fB\-\-input <filename> Input file .TP \fB\-\-input\-res <res> Input resolution [auto] -auto: detect from file name -<int>x<int>: width times height + \- auto: Detect from file name. + \- <int>x<int>: width times height .TP -\fB\-o\fR, \fB\-\-output +\fB\-o\fR, \fB\-\-output <filename> Output file .SS "Presets:" .TP -\fB\-\-preset=<preset> +\fB\-\-preset <preset> Set options to a preset [medium] \- ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow @@ -32,241 +32,315 @@ \fB\-\-seek <integer> First frame to code [0] .TP -\fB\-\-input\-fps <num>/<denom> -Framerate of the input video [25.0] +\fB\-\-input\-fps <num>[/<denom>] +Frame rate of the input video [25] .TP \fB\-\-source\-scan\-type <string> -Set source scan type [progressive]. - \- progressive: progressive scan - \- tff: top field first - \- bff: bottom field first +Source scan type [progressive] + \- progressive: Progressive scan + \- tff: Top field first + \- bff: Bottom field first .TP -\fB\-\-input\-format -P420 or P400 +\fB\-\-input\-format <string> +P420 or P400 [P420] .TP -\fB\-\-input\-bitdepth -8\-16 +\fB\-\-input\-bitdepth <int> +8\-16 [8] .TP \fB\-\-loop\-input -Re\-read input file forever +Re\-read input file forever. .SS "Options:" .TP \fB\-\-help -Print this help message and exit +Print this help message and exit. .TP \fB\-\-version -Print version information and exit +Print version information and exit. .TP -\fB\-\-aud -Use access unit delimiters +\fB\-\-(no\-)aud +Use access unit delimiters. [disabled] .TP -\fB\-\-debug <string> -Output encoders reconstruction. +\fB\-\-debug <filename> +Output internal reconstruction. .TP -\fB\-\-cpuid <integer> -Disable runtime cpu optimizations with value 0. +\fB\-\-(no\-)cpuid +Enable runtime CPU optimizations. [enabled] .TP -\fB\-\-hash +\fB\-\-hash <string> Decoded picture hash [checksum] \- none: 0 bytes \- checksum: 18 bytes \- md5: 56 bytes .TP -\fB\-\-no\-psnr -Don't calculate PSNR for frames -.TP -\fB\-\-no\-info -Don't add encoder info SEI. +\fB\-\-(no\-)psnr +Calculate PSNR for frames. [enabled] +.TP +\fB\-\-(no\-)info +Add encoder info SEI. [enabled] +.TP +\fB\-\-crypto <string> +Selective encryption. Crypto support must be +enabled at compile\-time. Can be 'on' or 'off' or +a list of features separated with a '+'. [off] + \- on: Enable all encryption features. + \- off: Disable selective encryption. + \- mvs: Motion vector magnitudes. + \- mv_signs: Motion vector signs. + \- trans_coeffs: Coefficient magnitudes. + \- trans_coeff_signs: Coefficient signs. + \- intra_pred_modes: Intra prediction modes. +.TP +\fB\-\-key <string> +Encryption key [16,213,27,56,255,127,242,112, + 97,126,197,204,25,59,38,30] .SS "Video structure:" .TP \fB\-q\fR, \fB\-\-qp <integer> -Quantization Parameter [32] +Quantization parameter [22] .TP \fB\-p\fR, \fB\-\-period <integer> -Period of intra pictures [0] -\- 0: only first picture is intra -\- 1: all pictures are intra -\- 2\-N: every Nth picture is intra +Period of intra pictures [64] + \- 0: Only first picture is intra. + \- 1: All pictures are intra. + \- N: Every Nth picture is intra. .TP \fB\-\-vps\-period <integer> -Specify how often the video parameter set is -re\-sent. 
[0] - \- 0: only send VPS with the first frame - \- N: send VPS with every Nth intra frame +How often the video parameter set is re\-sent [0] + \- 0: Only send VPS with the first frame. + \- N: Send VPS with every Nth intra frame. .TP \fB\-r\fR, \fB\-\-ref <integer> -Reference frames, range 1..15 [3] +Number of reference frames, in range 1..15 [4] .TP \fB\-\-gop <string> -Definition of GOP structure [0] - \- 0: disabled +GOP structure [8] + \- 0: Disabled \- 8: B\-frame pyramid of length 8 - \- lp\-<string>: lp\-gop definition - (e.g. lp\-g8d4t2, see README) + \- lp\-<string>: Low\-delay P\-frame GOP + (e.g. lp\-g8d4t2, see README) +.TP +\fB\-\-(no\-)open\-gop +Use open GOP configuration. [enabled] .TP -\fB\-\-cqmfile <string> -Custom Quantization Matrices from a file +\fB\-\-cqmfile <filename> +Read custom quantization matrices from a file. +.TP +\fB\-\-scaling-list <string> +Set scaling list mode. [off] + \- off: Disable scaling lists. + \- custom: use custom list (with \-\-cqmfile). + \- default: Use default lists. .TP \fB\-\-bitrate <integer> -Target bitrate. [0] - \- 0: disable rate\-control - \- N: target N bits per second -.TP -\fB\-\-lossless -Use lossless coding -.TP -\fB\-\-mv\-constraint -Constrain movement vectors - \- none: no constraint - \- frametile: constrain within the tile - \- frametilemargin: constrain even more -.TP -\fB\-\-roi <string> -Use a delta QP map for region of interest - Read an array of delta QP values from - a file, where the first two values are the - width and height, followed by width*height - delta QP values in raster order. - The delta QP map can be any size or aspect - ratio, and will be mapped to LCU's. +Target bitrate [0] + \- 0: Disable rate control. + \- N: Target N bits per second. +.TP +\fB\-\-(no\-)lossless +Use lossless coding. [disabled] +.TP +\fB\-\-mv\-constraint <string> +Constrain movement vectors. [none] + \- none: No constraint + \- frametile: Constrain within the tile. + \- frametilemargin: Constrain even more. +.TP +\fB\-\-roi <filename> +Use a delta QP map for region of interest. +Reads an array of delta QP values from a text +file. The file format is: width and height of +the QP delta map followed by width*height delta +QP values in raster order. The map can be of any +size and will be scaled to the video size. +.TP +\fB\-\-set\-qp\-in\-cu +Set QP at CU level keeping pic_init_qp_minus26. +in PPS and slice_qp_delta in slize header zero. .TP \fB\-\-(no\-)erp\-aqp -Use adaptive QP for 360 video with -equirectangular projection +Use adaptive QP for 360 degree video with +equirectangular projection. [disabled] +.TP +\fB\-\-level <number> +Use the given HEVC level in the output and give +an error if level limits are exceeded. [6.2] + \- 1, 2, 2.1, 3, 3.1, 4, 4.1, 5, 5.1, 5.2, 6, + 6.1, 6.2 +.TP +\fB\-\-force\-level <number> +Same as \-\-level but warnings instead of errors. +.TP +\fB\-\-high\-tier +Used with \-\-level. Use high tier bitrate limits +instead of the main tier limits during encoding. +High tier requires level 4 or higher. .SS "Compression tools:" .TP -\fB\-\-deblock [<beta:tc>] -Deblocking - \- beta: between \-6 and 6 - \- tc: between \-6 and 6 +\fB\-\-(no\-)deblock <beta:tc> +Deblocking filter. 
[0:0] + \- beta: Between \-6 and 6 + \- tc: Between \-6 and 6 .TP -\fB\-\-(no\-)sao -Sample Adaptive Offset +\fB\-\-sao <string> +Sample Adaptive Offset [full] + \- off: SAO disabled + \- band: Band offset only + \- edge: Edge offset only + \- full: Full SAO .TP \fB\-\-(no\-)rdoq -Rate\-Distortion Optimized Quantization +Rate\-distortion optimized quantization [enabled] +.TP +\fB\-\-(no\-)rdoq\-skip +Skip RDOQ for 4x4 blocks. [disabled] .TP \fB\-\-(no\-)signhide -Sign Hiding +Sign hiding [disabled] .TP \fB\-\-(no\-)smp -Symmetric Motion Partition +Symmetric motion partition [disabled] .TP \fB\-\-(no\-)amp -Asymmetric Motion Partition +Asymmetric motion partition [disabled] .TP \fB\-\-rd <integer> -Intra mode search complexity - \- 0: skip intra if inter is good enough - \- 1: rough intra mode search with SATD - \- 2: refine intra mode search with SSE +Intra mode search complexity [0] + \- 0: Skip intra if inter is good enough. + \- 1: Rough intra mode search with SATD. + \- 2: Refine intra mode search with SSE. + \- 3: Try all intra modes and enable intra + chroma mode search. .TP \fB\-\-(no\-)mv\-rdo -Rate\-Distortion Optimized motion vector costs +Rate\-distortion optimized motion vector costs +[disabled] .TP \fB\-\-(no\-)full\-intra\-search - Try all intra modes during rough search. +[disabled] .TP \fB\-\-(no\-)transform\-skip -Transform skip +Try transform skip [disabled] .TP \fB\-\-me <string> -Integer motion estimation +Integer motion estimation algorithm [hexbs] \- hexbs: Hexagon Based Search \- tz: Test Zone Search \- full: Full Search \- full8, full16, full32, full64 + \- dia: Diamond Search +.TP +\fB\-\-me\-steps <integer> +Motion estimation search step limit. Only +affects 'hexbs' and 'dia'. [\-1] .TP \fB\-\-subme <integer> -Set fractional pixel motion estimation level - \- 0: only integer motion estimation +Fractional pixel motion estimation level [4] + \- 0: Integer motion estimation only \- 1: + 1/2\-pixel horizontal and vertical \- 2: + 1/2\-pixel diagonal \- 3: + 1/4\-pixel horizontal and vertical \- 4: + 1/4\-pixel diagonal .TP \fB\-\-pu\-depth\-inter <int>\-<int> - -Range for sizes for inter predictions +Inter prediction units sizes [0\-3] \- 0, 1, 2, 3: from 64x64 to 8x8 .TP \fB\-\-pu\-depth\-intra <int>\-<int> -Range for sizes for intra predictions +Intra prediction units sizes [1\-4] \- 0, 1, 2, 3, 4: from 64x64 to 4x4 .TP +\fB\-\-tr\-depth\-intra <int> +Transform split depth for intra blocks [0] +.TP \fB\-\-(no\-)bipred -Bi\-prediction +Bi\-prediction [disabled] .TP -\fB\-\-(no\-)cu\-split\-termination - -CU split search termination condition - \- off: Never terminate cu\-split search - \- zero: Terminate with zero residual +\fB\-\-cu\-split\-termination <string> +CU split search termination [zero] + \- off: Don't terminate early. + \- zero: Terminate when residual is zero. .TP -\fB\-\-(no\-)me\-early\-termination -ME early termination condition - \- off: Don't terminate early - \- on: Terminate early - \- sensitive: Terminate even earlier +\fB\-\-me\-early\-termination <string> +Motion estimation termination [on] + \- off: Don't terminate early. + \- on: Terminate early. + \- sensitive: Terminate even earlier. +.TP +\fB\-\-fast\-residual\-cost <int> +Skip CABAC cost for residual coefficients + when QP is below the limit. [0] +.TP +\fB\-\-(no\-)intra\-rdo\-et +Check intra modes in rdo stage only until +a zero coefficient CU is found. [disabled] +.TP +\fB\-\-(no\-)early\-skip +Try to find skip cu from merge candidates. +Perform no further search if skip is found. 
+For rd=0..1: Try the first candidate. +For rd=2.. : Try the best candidate based + on luma satd cost. [enabled] +.TP +\fB\-\-max\-merge <integer> +Maximum number of merge candidates, 1..5 [5] .TP \fB\-\-(no\-)implicit\-rdpcm -Implicit residual DPCM -Currently only supported with lossless coding. +Implicit residual DPCM. Currently only supported +with lossless coding. [disabled] .TP \fB\-\-(no\-)tmvp -Temporal Motion Vector Prediction -.TP -\fB\-\-(no\-)rdoq\-skip -Skips RDOQ for 4x4 blocks +Temporal motion vector prediction [enabled] .SS "Parallel processing:" .TP \fB\-\-threads <integer> Number of threads to use [auto] - \- 0: process everything with main thread - \- N: use N threads for encoding - \- auto: select based on number of cores + \- 0: Process everything with main thread. + \- N: Use N threads for encoding. + \- auto: Select automatically. .TP \fB\-\-owf <integer> -Frame parallelism [auto] - \- N: Process N\-1 frames at a time - \- auto: Select automatically +Frame\-level parallelism [auto] + \- N: Process N+1 frames at a time. + \- auto: Select automatically. .TP \fB\-\-(no\-)wpp -Wavefront parallel processing [enabled] +Wavefront parallel processing. [enabled] Enabling tiles automatically disables WPP. To enable WPP with tiles, re\-enable it after -enabling tiles. +enabling tiles. Enabling wpp with tiles is, +however, an experimental feature since it is +not supported in any HEVC profile. .TP \fB\-\-tiles <int>x<int> Split picture into width x height uniform tiles. .TP \fB\-\-tiles\-width\-split <string>|u<int> -Specifies a comma separated list of pixel -positions of tiles columns separation coordinates. -Can also be u followed by and a single int n, -in which case it produces columns of uniform width. + \- <string>: A comma\-separated list of tile + column pixel coordinates. + \- u<int>: Number of tile columns of uniform + width. .TP \fB\-\-tiles\-height\-split <string>|u<int> -Specifies a comma separated list of pixel -positions of tiles rows separation coordinates. -Can also be u followed by and a single int n, -in which case it produces rows of uniform height. + \- <string>: A comma\-separated list of tile row + column pixel coordinates. + \- u<int>: Number of tile rows of uniform + height. .TP \fB\-\-slices <string> -Control how slices are used - \- tiles: put tiles in independent slices - \- wpp: put rows in dependent slices - \- tiles+wpp: do both +Control how slices are used. + \- tiles: Put tiles in independent slices. + \- wpp: Put rows in dependent slices. + \- tiles+wpp: Do both. .SS "Video Usability Information:" .TP \fB\-\-sar <width:height> -Specify Sample Aspect Ratio +Specify sample aspect ratio .TP \fB\-\-overscan <string> Specify crop overscan setting [undef] @@ -274,7 +348,7 @@ .TP \fB\-\-videoformat <string> Specify video format [undef] - \- component, pal, ntsc, secam, mac, undef + \- undef, component, pal, ntsc, secam, mac .TP \fB\-\-range <string> Specify color range [tv]
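Note: several of the options documented above (--me=dia, --me-steps, --level/--force-level/--high-tier, --intra-rdo-et, --max-merge) are new in 1.3.0. A minimal sketch of driving them through the public libkvazaar API (assuming the kvz_api/kvz_api_get interface declared in kvazaar.h, which wraps the kvz_config_parse() shown in src/cfg.c below); option names and values are passed exactly as on the command line, without the leading dashes:

    #include <kvazaar.h>
    #include <stdio.h>

    int main(void)
    {
        const kvz_api *api = kvz_api_get(8);      /* 8-bit encoder */
        kvz_config *cfg = api->config_alloc();
        if (!cfg || !api->config_init(cfg)) return 1;

        int ok = 1;
        ok &= api->config_parse(cfg, "me", "dia");       /* diamond search */
        ok &= api->config_parse(cfg, "me-steps", "32");  /* cap hexbs/dia steps */
        ok &= api->config_parse(cfg, "level", "5.1");    /* enforce level 5.1 limits */
        ok &= api->config_parse(cfg, "high-tier", "1");  /* high-tier bitrate limits */
        ok &= api->config_parse(cfg, "intra-rdo-et", "1");

        printf(ok ? "config accepted\n" : "config rejected\n");
        api->config_destroy(cfg);
        return ok ? 0 : 1;
    }

Each config_parse() call returns 1 on success and 0 when the value is rejected, matching the validation added in cfg.c.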
View file
kvazaar-1.2.0.tar.gz/src/Makefile.am -> kvazaar-1.3.0.tar.gz/src/Makefile.am
Changed
@@ -124,6 +124,8 @@ strategies/generic/quant-generic.h \ strategies/generic/sao-generic.c \ strategies/generic/sao-generic.h \ + strategies/generic/encode_coding_tree-generic.c \ + strategies/generic/encode_coding_tree-generic.h \ strategies/strategies-common.h \ strategies/strategies-dct.c \ strategies/strategies-dct.h \ @@ -139,6 +141,8 @@ strategies/strategies-quant.h \ strategies/strategies-sao.c \ strategies/strategies-sao.h \ + strategies/strategies-encode.c \ + strategies/strategies-encode.h \ strategies/x86_asm/picture-x86-asm.c \ strategies/x86_asm/picture-x86-asm.h \ strategyselector.c \ @@ -186,7 +190,9 @@ strategies/avx2/quant-avx2.c \ strategies/avx2/quant-avx2.h \ strategies/avx2/sao-avx2.c \ - strategies/avx2/sao-avx2.h + strategies/avx2/sao-avx2.h \ + strategies/avx2/encode_coding_tree-avx2.c \ + strategies/avx2/encode_coding_tree-avx2.h libsse2_la_SOURCES = \ strategies/sse2/picture-sse2.c \ @@ -197,13 +203,17 @@ strategies/sse41/picture-sse41.h if HAVE_PPC + +if HAVE_ALTIVEC libaltivec_la_CFLAGS = -maltivec endif +endif #HAVE_PPC + if HAVE_X86 if HAVE_AVX2 -libavx2_la_CFLAGS = -mavx2 +libavx2_la_CFLAGS = -mavx2 -mbmi -mabm -mbmi2 endif if HAVE_SSE4_1 libsse41_la_CFLAGS = -msse4.1
View file
kvazaar-1.2.0.tar.gz/src/cfg.c -> kvazaar-1.3.0.tar.gz/src/cfg.c
Changed
@@ -24,6 +24,7 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <math.h> kvz_config *kvz_config_alloc(void) @@ -36,7 +37,7 @@ cfg->width = 0; cfg->height = 0; cfg->framerate = 25; // deprecated and will be removed. - cfg->framerate_num = 0; + cfg->framerate_num = 25; cfg->framerate_denom = 1; cfg->qp = 22; cfg->intra_period = 64; @@ -78,6 +79,7 @@ cfg->lossless = false; cfg->tmvp_enable = true; cfg->implicit_rdpcm = false; + cfg->fast_residual_cost_limit = 0; cfg->cu_split_termination = KVZ_CU_SPLIT_TERMINATION_ZERO; @@ -85,13 +87,13 @@ cfg->tiles_height_count = 1; cfg->tiles_width_split = NULL; cfg->tiles_height_split = NULL; - + cfg->wpp = 1; cfg->owf = -1; cfg->slice_count = 1; cfg->slice_addresses_in_ts = MALLOC(int32_t, 1); cfg->slice_addresses_in_ts[0] = 0; - + cfg->threads = -1; cfg->cpuid = 1; @@ -108,16 +110,19 @@ cfg->crypto_features = KVZ_CRYPTO_OFF; cfg->me_early_termination = 1; + cfg->intra_rdo_et = 0; cfg->input_format = KVZ_FORMAT_P420; cfg->input_bitdepth = 8; cfg->gop_lp_definition.d = 3; cfg->gop_lp_definition.t = 1; + cfg->open_gop = true; cfg->roi.width = 0; cfg->roi.height = 0; cfg->roi.dqps = NULL; + cfg->set_qp_in_cu = false; cfg->erp_aqp = false; @@ -125,6 +130,17 @@ cfg->optional_key = NULL; + cfg->level = 62; // default hevc level, 6.2 (the highest) + cfg->force_level = true; // don't care about level limits by-default + cfg->high_tier = false; + + cfg->me_max_steps = (uint32_t)-1; + + cfg->scaling_list = KVZ_SCALING_LIST_OFF; + + cfg->max_merge = 5; + cfg->early_skip = true; + return 1; } @@ -178,14 +194,14 @@ const char* current_arg = NULL; int32_t current_value; int32_t values[MAX_TILES_PER_DIM]; - + int i; - + //Free pointer in any case if (*array) { FREE_POINTER(*array); } - + //If the arg starts with u, we want an uniform split if (arg[0]=='u') { *ntiles = atoi(arg + 1); @@ -196,7 +212,7 @@ //Done with parsing return 1; } - + //We have a comma-separated list of int for the split... current_arg = arg; *ntiles = 1; @@ -213,27 +229,27 @@ ++(*ntiles); if (MAX_TILES_PER_DIM <= *ntiles) break; } while (current_arg); - + if (MAX_TILES_PER_DIM <= *ntiles || 1 >= *ntiles) { fprintf(stderr, "Invalid number of tiles (1 <= %d <= %d = MAX_TILES_PER_DIM)!\n", *ntiles, MAX_TILES_PER_DIM); return 0; } - + *array = MALLOC(int32_t, *ntiles - 1); if (!*array) { fprintf(stderr, "Could not allocate array for tiles\n"); return 0; } - + //TODO: memcpy? for (i = 0; i < *ntiles - 1; ++i) { (*array)[i] = values[i]; } - + return 1; } -static int parse_uint8(const char *numstr,uint8_t* number,int min, int max) +static int parse_uint8(const char *numstr,uint8_t* number,int min, int max) { char *tail; int d = strtol(numstr, &tail, 10); @@ -285,14 +301,14 @@ const char* current_arg = NULL; int32_t current_value; int32_t values[MAX_SLICES]; - + int i; - + //Free pointer in any case if (*array) { FREE_POINTER(*array); } - + //If the arg starts with u, we want an uniform split if (arg[0]=='u') { *nslices = atoi(arg+1); @@ -303,7 +319,7 @@ //Done with parsing return 1; } - + //We have a comma-separated list of int for the split... 
current_arg = arg; //We always have a slice starting at 0 @@ -322,29 +338,29 @@ ++(*nslices); if (MAX_SLICES <= *nslices) break; } while (current_arg); - + if (MAX_SLICES <= *nslices || 0 >= *nslices) { fprintf(stderr, "Invalid number of slices (0 < %d <= %d = MAX_SLICES)!\n", *nslices, MAX_SLICES); return 0; } - + *array = MALLOC(int32_t, *nslices); if (!*array) { fprintf(stderr, "Could not allocate array for slices\n"); return 0; } - + //TODO: memcpy? for (i = 0; i < *nslices; ++i) { (*array)[i] = values[i]; } - + return 1; } int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) { - static const char * const me_names[] = { "hexbs", "tz", "full", "full8", "full16", "full32", "full64", NULL }; + static const char * const me_names[] = { "hexbs", "tz", "full", "full8", "full16", "full32", "full64", "dia", NULL }; static const char * const source_scan_type_names[] = { "progressive", "tff", "bff", NULL }; static const char * const overscan_names[] = { "undef", "show", "crop", NULL }; @@ -368,221 +384,270 @@ static const char * const sao_names[] = { "off", "edge", "band", "full", NULL }; - static const char * const preset_values[11][20*2] = { - { - "ultrafast", + static const char * const scaling_list_names[] = { "off", "custom", "default", NULL }; + + static const char * const preset_values[11][25*2] = { + { + "ultrafast", + "rd", "0", "pu-depth-intra", "2-3", "pu-depth-inter", "2-3", - "rd", "0", "me", "hexbs", + "gop", "lp-g4d4t1", "ref", "1", + "bipred", "0", "deblock", "0:0", "signhide", "0", - "subme", "0", + "subme", "2", "sao", "off", "rdoq", "0", - "rdoq-skip", "1", - "transform-skip", "0", - "full-intra-search", "0", + "rdoq-skip", "0", + "transform-skip", "0", "mv-rdo", "0", + "full-intra-search", "0", "smp", "0", "amp", "0", "cu-split-termination", "zero", "me-early-termination", "sensitive", - "gop", "lp-g4d3t1", - NULL + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "28", + "max-merge", "5", + NULL }, - { + { "superfast", + "rd", "0", "pu-depth-intra", "2-3", "pu-depth-inter", "2-3", - "rd", "0", "me", "hexbs", + "gop", "lp-g4d4t1", "ref", "1", + "bipred", "0", "deblock", "0:0", "signhide", "0", - "subme", "0", + "subme", "2", "sao", "full", "rdoq", "0", - "rdoq-skip", "1", + "rdoq-skip", "0", "transform-skip", "0", - "full-intra-search", "0", "mv-rdo", "0", + "full-intra-search", "0", "smp", "0", "amp", "0", "cu-split-termination", "zero", "me-early-termination", "sensitive", - "gop", "lp-g4d3t1", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "28", + "max-merge", "5", NULL }, { "veryfast", - "pu-depth-intra", "2-3", - "pu-depth-inter", "2-3", "rd", "0", + "pu-depth-intra", "2-3", + "pu-depth-inter", "1-3", "me", "hexbs", + "gop", "lp-g4d4t1", "ref", "1", + "bipred", "0", "deblock", "0:0", "signhide", "0", "subme", "2", "sao", "full", "rdoq", "0", - "rdoq-skip", "1", + "rdoq-skip", "0", "transform-skip", "0", - "full-intra-search", "0", "mv-rdo", "0", + "full-intra-search", "0", "smp", "0", "amp", "0", "cu-split-termination", "zero", "me-early-termination", "sensitive", - "gop", "lp-g4d3t1", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "28", + "max-merge", "5", NULL }, { "faster", + "rd", "0", "pu-depth-intra", "2-3", "pu-depth-inter", "1-3", - "rd", "1", "me", "hexbs", + "gop", "lp-g4d4t1", "ref", "1", + "bipred", "0", "deblock", "0:0", "signhide", "0", - "subme", "2", + "subme", "4", "sao", "full", "rdoq", "0", - "rdoq-skip", "1", + "rdoq-skip", "0", "transform-skip", "0", - 
"full-intra-search", "0", "mv-rdo", "0", + "full-intra-search", "0", "smp", "0", "amp", "0", "cu-split-termination", "zero", "me-early-termination", "sensitive", - "gop", "lp-g4d3t1", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "0", + "max-merge", "5", NULL }, { "fast", - "pu-depth-intra", "2-3", + "rd", "0", + "pu-depth-intra", "1-3", "pu-depth-inter", "1-3", - "rd", "1", "me", "hexbs", - "ref", "1", + "gop", "lp-g4d4t1", + "ref", "2", + "bipred", "0", "deblock", "0:0", "signhide", "0", "subme", "4", "sao", "full", "rdoq", "0", - "rdoq-skip", "1", + "rdoq-skip", "0", "transform-skip", "0", - "full-intra-search", "0", "mv-rdo", "0", + "full-intra-search", "0", "smp", "0", "amp", "0", "cu-split-termination", "zero", - "me-early-termination", "on", - "gop", "lp-g4d3t1", + "me-early-termination", "sensitive", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "0", + "max-merge", "5", NULL }, { "medium", - "pu-depth-intra", "1-3", - "pu-depth-inter", "1-3", - "rd", "1", + "rd", "0", + "pu-depth-intra", "1-4", + "pu-depth-inter", "0-3", "me", "hexbs", - "ref", "1", + "gop", "8", + "ref", "4", + "bipred", "0", "deblock", "0:0", "signhide", "0", "subme", "4", "sao", "full", "rdoq", "1", - "rdoq-skip", "1", + "rdoq-skip", "0", "transform-skip", "0", - "full-intra-search", "0", "mv-rdo", "0", + "full-intra-search", "0", "smp", "0", "amp", "0", "cu-split-termination", "zero", "me-early-termination", "on", - "gop", "lp-g4d3t1", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "0", + "max-merge", "5", NULL }, { "slow", - "pu-depth-intra", "1-3", - "pu-depth-inter", "1-3", - "rd", "1", + "rd", "0", + "pu-depth-intra", "1-4", + "pu-depth-inter", "0-3", "me", "hexbs", - "ref", "2", + "gop", "8", + "ref", "4", + "bipred", "1", "deblock", "0:0", - "signhide", "1", + "signhide", "0", "subme", "4", "sao", "full", "rdoq", "1", - "rdoq-skip", "1", + "rdoq-skip", "0", "transform-skip", "0", - "full-intra-search", "0", "mv-rdo", "0", + "full-intra-search", "0", "smp", "0", "amp", "0", "cu-split-termination", "zero", "me-early-termination", "on", - "gop", "lp-g4d2t1", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "0", + "max-merge", "5", NULL }, { "slower", - "pu-depth-intra", "1-3", + "rd", "2", + "pu-depth-intra", "1-4", "pu-depth-inter", "0-3", - "rd", "1", "me", "hexbs", - "ref", "2", + "gop", "8", + "ref", "4", + "bipred", "1", "deblock", "0:0", "signhide", "1", "subme", "4", "sao", "full", "rdoq", "1", - "rdoq-skip", "1", + "rdoq-skip", "0", "transform-skip", "0", - "full-intra-search", "0", "mv-rdo", "0", + "full-intra-search", "0", "smp", "0", "amp", "0", "cu-split-termination", "zero", - "me-early-termination", "on", - "gop", "lp-g4d2t1", + "me-early-termination", "off", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "0", + "max-merge", "5", NULL }, { "veryslow", + "rd", "2", "pu-depth-intra", "1-4", "pu-depth-inter", "0-3", - "rd", "1", "me", "hexbs", - "ref", "3", + "gop", "8", + "ref", "4", + "bipred", "1", "deblock", "0:0", "signhide", "1", "subme", "4", "sao", "full", "rdoq", "1", - "rdoq-skip", "1", + "rdoq-skip", "0", "transform-skip", "0", - "full-intra-search", "0", "mv-rdo", "0", - "smp", "0", + "full-intra-search", "0", + "smp", "1", "amp", "0", "cu-split-termination", "zero", - "me-early-termination", "on", - "gop", "lp-g4d2t1", + "me-early-termination", "off", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "0", + "max-merge", "5", NULL }, { "placebo", + "rd", "2", 
"pu-depth-intra", "1-4", "pu-depth-inter", "0-3", - "rd", "1", "me", "tz", + "gop", "8", "ref", "4", + "bipred", "1", "deblock", "0:0", "signhide", "1", "subme", "4", @@ -590,13 +655,16 @@ "rdoq", "1", "rdoq-skip", "0", "transform-skip", "1", - "full-intra-search", "0", "mv-rdo", "1", + "full-intra-search", "0", "smp", "1", "amp", "1", "cu-split-termination", "off", "me-early-termination", "off", - "gop", "lp-g4d2t1", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "0", + "max-merge", "5", NULL }, { NULL } @@ -717,10 +785,17 @@ } FREE_POINTER(cfg->cqmfile); cfg->cqmfile = cqmfile; + cfg->scaling_list = KVZ_SCALING_LIST_CUSTOM; + } + else if OPT("scaling-list") { + int8_t scaling_list = KVZ_SCALING_LIST_OFF; + int result = parse_enum(value, scaling_list_names, &scaling_list); + cfg->scaling_list = scaling_list; + return result; } else if OPT("tiles-width-split") { int retval = parse_tiles_specification(value, &cfg->tiles_width_count, &cfg->tiles_width_split); - + if (cfg->tiles_width_count > 1 && cfg->tmvp_enable) { cfg->tmvp_enable = false; fprintf(stderr, "Disabling TMVP because tiles are used.\n"); @@ -735,7 +810,7 @@ } else if OPT("tiles-height-split") { int retval = parse_tiles_specification(value, &cfg->tiles_height_count, &cfg->tiles_height_split); - + if (cfg->tiles_height_count > 1 && cfg->tmvp_enable) { cfg->tmvp_enable = false; fprintf(stderr, "Disabling TMVP because tiles are used.\n"); @@ -815,7 +890,7 @@ } } else if OPT("cpuid") - cfg->cpuid = atoi(value); + cfg->cpuid = atobool(value); else if OPT("pu-depth-inter") return sscanf(value, "%d-%d", &cfg->pu_depth_inter.min, &cfg->pu_depth_inter.max) == 2; else if OPT("pu-depth-intra") @@ -899,6 +974,9 @@ return 0; } } + else if OPT("open-gop") { + cfg->open_gop = (bool)atobool(value); + } else if OPT("bipred") cfg->bipred = atobool(value); else if OPT("bitrate") @@ -1015,6 +1093,8 @@ cfg->me_early_termination = mode; return result; } + else if OPT("intra-rdo-et") + cfg->intra_rdo_et = (bool)atobool(value); else if OPT("lossless") cfg->lossless = (bool)atobool(value); else if OPT("tmvp") { @@ -1081,6 +1161,7 @@ if (width > 10000 || height > 10000) { fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n"); + fclose(f); return 0; } @@ -1109,10 +1190,79 @@ fclose(f); } - else if OPT("erp-aqp") + else if OPT("set-qp-in-cu") { + cfg->set_qp_in_cu = (bool)atobool(value); + } + else if OPT("erp-aqp") { cfg->erp_aqp = (bool)atobool(value); - else + } + else if (OPT("level") || OPT("force-level")) { + if OPT("force-level") { + cfg->force_level = true; + } else { + cfg->force_level = false; + } + + unsigned int num_first, num_second, level; + int matched_amount = sscanf(value, "%u.%u", &num_first, &num_second); + + if (matched_amount == 2) { + // of form x.y + level = num_first * 10 + num_second; + } else if (matched_amount == 1) { + // no dot + if (num_first < 10) { + // of form x + level = num_first * 10; + } else { + // of form xx + level = num_first; + } + } else { + fprintf(stderr, "Invalid level value: \"%s\"\n", value); + return 0; + } + if (level < 10 || level > 62) { + fprintf(stderr, "Level value of %s is out of bounds\n", value); + return 0; + } + + cfg->level = level; + } + else if (OPT("high-tier")) { + cfg->high_tier = true; + } + else if (OPT("me-steps")) { + char * tailptr = NULL; + long steps = strtol(value, &tailptr, 0); + + if (*tailptr != '\0') { + fprintf(stderr, "Invalid me-steps value: \"%s\"", value); + return 0; + } + if (steps < -1 || steps > UINT32_MAX) { + fprintf(stderr, 
"me-steps value is out of bounds: \"%s\"", value); + return 0; + } + + cfg->me_max_steps = (uint32_t)steps; + } + else if (OPT("fast-residual-cost")) + cfg->fast_residual_cost_limit = atoi(value); + else if (OPT("max-merge")) { + int max_merge = atoi(value); + if (max_merge < 1 || max_merge > 5) { + fprintf(stderr, "max-merge needs to be between 1 and 5\n"); + return 0; + } + cfg->max_merge = (uint8_t)max_merge; + } + else if OPT("early-skip") { + cfg->early_skip = (bool)atobool(value); + } + else { return 0; + } #undef OPT return 1; @@ -1209,6 +1359,9 @@ cfg->gop[gop.g - 1].qp_factor = 0.578; // from HM } +// forward declaration +static int validate_hevc_level(kvz_config *const cfg); + /** * \brief Check that configuration is sensible. * @@ -1267,7 +1420,9 @@ error = 1; } - if (cfg->gop_len && cfg->intra_period && !cfg->gop_lowdelay && + if (cfg->gop_len && + cfg->intra_period > 1 && + !cfg->gop_lowdelay && cfg->intra_period % cfg->gop_len != 0) { fprintf(stderr, @@ -1328,7 +1483,7 @@ } if (!WITHIN(cfg->pu_depth_inter.min, PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX) || - !WITHIN(cfg->pu_depth_inter.max, PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX)) + !WITHIN(cfg->pu_depth_inter.max, PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX)) { fprintf(stderr, "Input error: illegal value for --pu-depth-inter (%d-%d)\n", cfg->pu_depth_inter.min, cfg->pu_depth_inter.max); @@ -1407,5 +1562,174 @@ error = 1; } + if ((cfg->scaling_list == KVZ_SCALING_LIST_CUSTOM) && !cfg->cqmfile) { + fprintf(stderr, "Input error: --scaling-list=custom does not work without --cqmfile=<FILE>.\n"); + error = 1; + } + + if (validate_hevc_level((kvz_config *const) cfg)) { + // a level error found and it's not okay + error = 1; + } + return !error; } + +static int validate_hevc_level(kvz_config *const cfg) { + static const struct { uint32_t lsr; uint32_t lps; uint32_t main_bitrate; } LEVEL_CONSTRAINTS[13] = { + { 552960, 36864, 128 }, // 1 + + { 3686400, 122880, 1500 }, // 2 + { 7372800, 245760, 3000 }, // 2.1 + + { 16588800, 552960, 6000 }, // 3 + { 33177600, 983040, 10000 }, // 3.1 + + { 66846720, 2228224, 12000 }, // 4 + { 133693440, 2228224, 20000 }, // 4.1 + + { 267386880, 8912896, 25000 }, // 5 + { 534773760, 8912896, 40000 }, // 5.1 + { 1069547520, 8912896, 60000 }, // 5.2 + + { 1069547520, 35651584, 60000 }, // 6 + { 2139095040, 35651584, 120000 }, // 6.1 + { 4278190080, 35651584, 240000 }, // 6.2 + }; + + // bit rates for the high-tiers of the levels from 4 to 6.2 + static const uint32_t HIGH_TIER_BITRATES[8] = { + 30000, 50000, 100000, 160000, 240000, 240000, 480000, 800000 + }; + + int level_error = 0; + + const char* level_err_prefix; + if (cfg->force_level) { + level_err_prefix = "Level warning"; + } else { + level_err_prefix = "Level error"; + } + + uint8_t lvl_idx; + + // for nicer error print + float lvl = ((float)cfg->level) / 10.0f; + + // check if the level is valid and get it's lsr and lps values + switch (cfg->level) { + case 10: + lvl_idx = 0; + break; + case 20: + lvl_idx = 1; + break; + case 21: + lvl_idx = 2; + break; + case 30: + lvl_idx = 3; + break; + case 31: + lvl_idx = 4; + break; + case 40: + lvl_idx = 5; + break; + case 41: + lvl_idx = 6; + break; + case 50: + lvl_idx = 7; + break; + case 51: + lvl_idx = 8; + break; + case 52: + lvl_idx = 9; + break; + case 60: + lvl_idx = 10; + break; + case 61: + lvl_idx = 11; + break; + case 62: + lvl_idx = 12; + break; + + default: + fprintf(stderr, "Input error: %g is an invalid level value\n", lvl); + return 1; + } + + if (cfg->high_tier && cfg->level < 40) { + 
fprintf(stderr, "Input error: high tier requires at least level 4\n"); + return 1; + } + + // max luma sample rate + uint32_t max_lsr = LEVEL_CONSTRAINTS[lvl_idx].lsr; + + // max luma picture size + uint32_t max_lps = LEVEL_CONSTRAINTS[lvl_idx].lps; + + if (cfg->high_tier) { + cfg->max_bitrate = HIGH_TIER_BITRATES[lvl_idx - 5] * 1000; + } else { + cfg->max_bitrate = LEVEL_CONSTRAINTS[lvl_idx].main_bitrate * 1000; + } + + if (cfg->target_bitrate > cfg->max_bitrate) { + fprintf(stderr, "%s: target bitrate exceeds %i, which is the maximum %s tier level %g bitrate\n", + level_err_prefix, cfg->max_bitrate, cfg->high_tier?"high":"main", lvl); + level_error = 1; + } + + // check the conformance to the level limits + + // luma samples + uint64_t cfg_samples = cfg->width * cfg->height; + + // luma sample rate + double framerate = ((double)cfg->framerate_num) / ((double)cfg->framerate_denom); + uint64_t cfg_sample_rate = cfg_samples * (uint64_t) framerate; + + // square of the maximum allowed dimension + uint32_t max_dimension_squared = 8 * max_lps; + + // check maximum dimensions + if (cfg->width * cfg->width > max_dimension_squared) { + uint32_t max_dim = sqrtf(max_dimension_squared); + fprintf(stderr, "%s: picture width of %i is too large for this level (%g), maximum dimension is %i\n", + level_err_prefix, cfg->width, lvl, max_dim); + level_error = 1; + } + if (cfg->height * cfg->height > max_dimension_squared) { + uint32_t max_dim = sqrtf(max_dimension_squared); + fprintf(stderr, "%s: picture height of %i is too large for this level (%g), maximum dimension is %i\n", + level_err_prefix, cfg->height, lvl, max_dim); + level_error = 1; + } + + // check luma picture size + if (cfg_samples > max_lps) { + fprintf(stderr, "%s: picture resolution of %ix%i is too large for this level (%g) (it has %llu samples, maximum is %u samples)\n", + level_err_prefix, cfg->width, cfg->height, lvl, (unsigned long long) cfg_samples, max_lps); + level_error = 1; + } + + // check luma sample rate + if (cfg_sample_rate > max_lsr) { + fprintf(stderr, "%s: framerate of %g is too big for this level (%g) and picture resolution (it has the sample rate of %llu, maximum is %u\n", + level_err_prefix, framerate, lvl, (unsigned long long) cfg_sample_rate, max_lsr); + level_error = 1; + } + + if (cfg->force_level) { + // we wanted to print warnings, not get errors + return 0; + } else { + return level_error; + } +}
View file
kvazaar-1.2.0.tar.gz/src/cfg.h -> kvazaar-1.3.0.tar.gz/src/cfg.h
Changed
@@ -30,7 +30,6 @@ #include "kvazaar.h" - /* Function definitions */ kvz_config *kvz_config_alloc(void); int kvz_config_init(kvz_config *cfg);
View file
kvazaar-1.2.0.tar.gz/src/cli.c -> kvazaar-1.3.0.tar.gz/src/cli.c
Changed
@@ -36,9 +36,9 @@ { "input", required_argument, NULL, 'i' }, { "output", required_argument, NULL, 'o' }, { "debug", required_argument, NULL, 'd' }, - { "width", required_argument, NULL, 'w' }, + { "width", required_argument, NULL, 'w' }, // deprecated { "height", required_argument, NULL, 'h' }, // deprecated - { "frames", required_argument, NULL, 'n' }, // deprecated + { "frames", required_argument, NULL, 'n' }, { "qp", required_argument, NULL, 'q' }, { "period", required_argument, NULL, 'p' }, { "ref", required_argument, NULL, 'r' }, @@ -86,7 +86,8 @@ { "owf", required_argument, NULL, 0 }, { "slices", required_argument, NULL, 0 }, { "threads", required_argument, NULL, 0 }, - { "cpuid", required_argument, NULL, 0 }, + { "cpuid", optional_argument, NULL, 0 }, + { "no-cpuid", no_argument, NULL, 0 }, { "pu-depth-inter", required_argument, NULL, 0 }, { "pu-depth-intra", required_argument, NULL, 0 }, { "info", no_argument, NULL, 0 }, @@ -109,6 +110,8 @@ { "crypto", required_argument, NULL, 0 }, { "key", required_argument, NULL, 0 }, { "me-early-termination",required_argument, NULL, 0 }, + { "intra-rdo-et", no_argument, NULL, 0 }, + { "no-intra-rdo-et", no_argument, NULL, 0 }, { "lossless", no_argument, NULL, 0 }, { "no-lossless", no_argument, NULL, 0 }, { "tmvp", no_argument, NULL, 0 }, @@ -122,6 +125,18 @@ { "roi", required_argument, NULL, 0 }, { "erp-aqp", no_argument, NULL, 0 }, { "no-erp-aqp", no_argument, NULL, 0 }, + { "level", required_argument, NULL, 0 }, + { "force-level", required_argument, NULL, 0 }, + { "high-tier", no_argument, NULL, 0 }, + { "me-steps", required_argument, NULL, 0 }, + { "fast-residual-cost", required_argument, NULL, 0 }, + { "set-qp-in-cu", no_argument, NULL, 0 }, + { "open-gop", no_argument, NULL, 0 }, + { "no-open-gop", no_argument, NULL, 0 }, + { "scaling-list", required_argument, NULL, 0 }, + { "max-merge", required_argument, NULL, 0 }, + { "early-skip", no_argument, NULL, 0 }, + { "no-early-skip", no_argument, NULL, 0 }, {0, 0, 0, 0} }; @@ -316,168 +331,214 @@ "Usage:\n" "kvazaar -i <input> --input-res <width>x<height> -o <output>\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Required:\n" - " -i, --input : Input file\n" + " -i, --input <filename> : Input file\n" " --input-res <res> : Input resolution [auto]\n" - " auto: detect from file name\n" - " <int>x<int>: width times height\n" - " -o, --output : Output file\n" + " - auto: Detect from file name.\n" + " - <int>x<int>: width times height\n" + " -o, --output <filename> : Output file\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Presets:\n" - " --preset=<preset> : Set options to a preset [medium]\n" + " --preset <preset> : Set options to a preset [medium]\n" " - ultrafast, superfast, veryfast, faster,\n" " fast, medium, slow, slower, veryslow\n" " placebo\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Input:\n" " -n, --frames <integer> : Number of frames to code [all]\n" " --seek <integer> : First frame to code [0]\n" - " --input-fps <num>/<denom> : Framerate of the input video [25.0]\n" - " --source-scan-type <string> : Set source scan type [progressive].\n" - " - progressive: progressive scan\n" - " - tff: top 
field first\n" - " - bff: bottom field first\n" - " --input-format : P420 or P400\n" - " --input-bitdepth : 8-16\n" - " --loop-input : Re-read input file forever\n" + " --input-fps <num>[/<denom>] : Frame rate of the input video [25]\n" + " --source-scan-type <string> : Source scan type [progressive]\n" + " - progressive: Progressive scan\n" + " - tff: Top field first\n" + " - bff: Bottom field first\n" + " --input-format <string> : P420 or P400 [P420]\n" + " --input-bitdepth <int> : 8-16 [8]\n" + " --loop-input : Re-read input file forever.\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Options:\n" - " --help : Print this help message and exit\n" - " --version : Print version information and exit\n" - " --aud : Use access unit delimiters\n" - " --debug <string> : Output encoders reconstruction.\n" - " --cpuid <integer> : Disable runtime cpu optimizations with value 0.\n" - " --hash : Decoded picture hash [checksum]\n" + " --help : Print this help message and exit.\n" + " --version : Print version information and exit.\n" + " --(no-)aud : Use access unit delimiters. [disabled]\n" + " --debug <filename> : Output internal reconstruction.\n" + " --(no-)cpuid : Enable runtime CPU optimizations. [enabled]\n" + " --hash <string> : Decoded picture hash [checksum]\n" " - none: 0 bytes\n" " - checksum: 18 bytes\n" " - md5: 56 bytes\n" - " --no-psnr : Don't calculate PSNR for frames\n" - " --no-info : Don't add encoder info SEI.\n" + " --(no-)psnr : Calculate PSNR for frames. [enabled]\n" + " --(no-)info : Add encoder info SEI. [enabled]\n" + " --crypto <string> : Selective encryption. Crypto support must be\n" + " enabled at compile-time. Can be 'on' or 'off' or\n" + " a list of features separated with a '+'. [off]\n" + " - on: Enable all encryption features.\n" + " - off: Disable selective encryption.\n" + " - mvs: Motion vector magnitudes.\n" + " - mv_signs: Motion vector signs.\n" + " - trans_coeffs: Coefficient magnitudes.\n" + " - trans_coeff_signs: Coefficient signs.\n" + " - intra_pred_modes: Intra prediction modes.\n" + " --key <string> : Encryption key [16,213,27,56,255,127,242,112,\n" + " 97,126,197,204,25,59,38,30]\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Video structure:\n" - " -q, --qp <integer> : Quantization Parameter [32]\n" - " -p, --period <integer> : Period of intra pictures [0]\n" - " - 0: only first picture is intra\n" - " - 1: all pictures are intra\n" - " - 2-N: every Nth picture is intra\n" - " --vps-period <integer> : Specify how often the video parameter set is\n" - " re-sent. 
[0]\n" - " - 0: only send VPS with the first frame\n" - " - N: send VPS with every Nth intra frame\n" - " -r, --ref <integer> : Reference frames, range 1..15 [3]\n" - " --gop <string> : Definition of GOP structure [0]\n" - " - 0: disabled\n" + " -q, --qp <integer> : Quantization parameter [22]\n" + " -p, --period <integer> : Period of intra pictures [64]\n" + " - 0: Only first picture is intra.\n" + " - 1: All pictures are intra.\n" + " - N: Every Nth picture is intra.\n" + " --vps-period <integer> : How often the video parameter set is re-sent [0]\n" + " - 0: Only send VPS with the first frame.\n" + " - N: Send VPS with every Nth intra frame.\n" + " -r, --ref <integer> : Number of reference frames, in range 1..15 [4]\n" + " --gop <string> : GOP structure [8]\n" + " - 0: Disabled\n" " - 8: B-frame pyramid of length 8\n" - " - lp-<string>: lp-gop definition\n" - " (e.g. lp-g8d4t2, see README)\n" - " --cqmfile <string> : Custom Quantization Matrices from a file\n" - " --bitrate <integer> : Target bitrate. [0]\n" - " - 0: disable rate-control\n" - " - N: target N bits per second\n" - " --lossless : Use lossless coding\n" - " --mv-constraint : Constrain movement vectors\n" - " - none: no constraint\n" - " - frametile: constrain within the tile\n" - " - frametilemargin: constrain even more\n" - " --roi <string> : Use a delta QP map for region of interest\n" - " Read an array of delta QP values from\n" - " a file, where the first two values are the\n" - " width and height, followed by width*height\n" - " delta QP values in raster order.\n" - " The delta QP map can be any size or aspect\n" - " ratio, and will be mapped to LCU's.\n" - " --(no-)erp-aqp : Use adaptive QP for 360 video with\n" - " equirectangular projection\n" + " - lp-<string>: Low-delay P-frame GOP\n" + " (e.g. lp-g8d4t2, see README)\n" + " --(no-)open-gop : Use open GOP configuration. [enabled]\n" + " --cqmfile <filename> : Read custom quantization matrices from a file.\n" + " --scaling-list <string>: Set scaling list mode. [off]\n" + " - off: Disable scaling lists.\n" + " - custom: use custom list (with --cqmfile).\n" + " - default: Use default lists.\n" + " --bitrate <integer> : Target bitrate [0]\n" + " - 0: Disable rate control.\n" + " - N: Target N bits per second.\n" + " --(no-)lossless : Use lossless coding. [disabled]\n" + " --mv-constraint <string> : Constrain movement vectors. [none]\n" + " - none: No constraint\n" + " - frametile: Constrain within the tile.\n" + " - frametilemargin: Constrain even more.\n" + " --roi <filename> : Use a delta QP map for region of interest.\n" + " Reads an array of delta QP values from a text\n" + " file. The file format is: width and height of\n" + " the QP delta map followed by width*height delta\n" + " QP values in raster order. The map can be of any\n" + " size and will be scaled to the video size.\n" + " --set-qp-in-cu : Set QP at CU level keeping pic_init_qp_minus26.\n" + " in PPS and slice_qp_delta in slize header zero.\n" + " --(no-)erp-aqp : Use adaptive QP for 360 degree video with\n" + " equirectangular projection. [disabled]\n" + " --level <number> : Use the given HEVC level in the output and give\n" + " an error if level limits are exceeded. [6.2]\n" + " - 1, 2, 2.1, 3, 3.1, 4, 4.1, 5, 5.1, 5.2, 6,\n" + " 6.1, 6.2\n" + " --force-level <number> : Same as --level but warnings instead of errors.\n" + " --high-tier : Used with --level. 
Use high tier bitrate limits\n" + " instead of the main tier limits during encoding.\n" + " High tier requires level 4 or higher.\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Compression tools:\n" - " --deblock [<beta:tc>] : Deblocking\n" - " - beta: between -6 and 6\n" - " - tc: between -6 and 6\n" - " --(no-)sao : Sample Adaptive Offset\n" - " --(no-)rdoq : Rate-Distortion Optimized Quantization\n" - " --(no-)signhide : Sign Hiding\n" - " --(no-)smp : Symmetric Motion Partition\n" - " --(no-)amp : Asymmetric Motion Partition\n" - " --rd <integer> : Intra mode search complexity\n" - " - 0: skip intra if inter is good enough\n" - " - 1: rough intra mode search with SATD\n" - " - 2: refine intra mode search with SSE\n" - " --(no-)mv-rdo : Rate-Distortion Optimized motion vector costs\n" - " --(no-)full-intra-search\n" - " : Try all intra modes during rough search.\n" - " --(no-)transform-skip : Transform skip\n" - " --me <string> : Integer motion estimation\n" + " --(no-)deblock <beta:tc> : Deblocking filter. [0:0]\n" + " - beta: Between -6 and 6\n" + " - tc: Between -6 and 6\n" + " --sao <string> : Sample Adaptive Offset [full]\n" + " - off: SAO disabled\n" + " - band: Band offset only\n" + " - edge: Edge offset only\n" + " - full: Full SAO\n" + " --(no-)rdoq : Rate-distortion optimized quantization [enabled]\n" + " --(no-)rdoq-skip : Skip RDOQ for 4x4 blocks. [disabled]\n" + " --(no-)signhide : Sign hiding [disabled]\n" + " --(no-)smp : Symmetric motion partition [disabled]\n" + " --(no-)amp : Asymmetric motion partition [disabled]\n" + " --rd <integer> : Intra mode search complexity [0]\n" + " - 0: Skip intra if inter is good enough.\n" + " - 1: Rough intra mode search with SATD.\n" + " - 2: Refine intra mode search with SSE.\n" + " - 3: Try all intra modes and enable intra\n" + " chroma mode search.\n" + " --(no-)mv-rdo : Rate-distortion optimized motion vector costs\n" + " [disabled]\n" + " --(no-)full-intra-search : Try all intra modes during rough search.\n" + " [disabled]\n" + " --(no-)transform-skip : Try transform skip [disabled]\n" + " --me <string> : Integer motion estimation algorithm [hexbs]\n" " - hexbs: Hexagon Based Search\n" " - tz: Test Zone Search\n" " - full: Full Search\n" " - full8, full16, full32, full64\n" - " --subme <integer> : Set fractional pixel motion estimation level\n" - " - 0: only integer motion estimation\n" + " - dia: Diamond Search\n" + " --me-steps <integer> : Motion estimation search step limit. Only\n" + " affects 'hexbs' and 'dia'. 
[-1]\n" + " --subme <integer> : Fractional pixel motion estimation level [4]\n" + " - 0: Integer motion estimation only\n" " - 1: + 1/2-pixel horizontal and vertical\n" " - 2: + 1/2-pixel diagonal\n" " - 3: + 1/4-pixel horizontal and vertical\n" " - 4: + 1/4-pixel diagonal\n" - " --pu-depth-inter <int>-<int>\n" - " : Range for sizes for inter predictions\n" + " --pu-depth-inter <int>-<int> : Inter prediction units sizes [0-3]\n" " - 0, 1, 2, 3: from 64x64 to 8x8\n" - " --pu-depth-intra <int>-<int> : Range for sizes for intra predictions\n" + " --pu-depth-intra <int>-<int> : Intra prediction units sizes [1-4]\n" " - 0, 1, 2, 3, 4: from 64x64 to 4x4\n" - " --(no-)bipred : Bi-prediction\n" - " --(no-)cu-split-termination\n" - " : CU split search termination condition\n" - " - off: Never terminate cu-split search\n" - " - zero: Terminate with zero residual\n" - " --(no-)me-early-termination : ME early termination condition\n" - " - off: Don't terminate early\n" - " - on: Terminate early\n" - " - sensitive: Terminate even earlier\n" - " --(no-)implicit-rdpcm : Implicit residual DPCM\n" - " Currently only supported with lossless coding.\n" - " --(no-)tmvp : Temporal Motion Vector Prediction\n" - " --(no-)rdoq-skip : Skips RDOQ for 4x4 blocks\n" + " --tr-depth-intra <int> : Transform split depth for intra blocks [0]\n" + " --(no-)bipred : Bi-prediction [disabled]\n" + " --cu-split-termination <string> : CU split search termination [zero]\n" + " - off: Don't terminate early.\n" + " - zero: Terminate when residual is zero.\n" + " --me-early-termination <string> : Motion estimation termination [on]\n" + " - off: Don't terminate early.\n" + " - on: Terminate early.\n" + " - sensitive: Terminate even earlier.\n" + " --fast-residual-cost <int> : Skip CABAC cost for residual coefficients\n" + " when QP is below the limit. [0]\n" + " --(no-)intra-rdo-et : Check intra modes in rdo stage only until\n" + " a zero coefficient CU is found. [disabled]\n" + " --(no-)early-skip : Try to find skip cu from merge candidates.\n" + " Perform no further search if skip is found.\n" + " For rd=0..1: Try the first candidate.\n" + " For rd=2.. : Try the best candidate based\n" + " on luma satd cost. [enabled]\n" + " --max-merge <integer> : Maximum number of merge candidates, 1..5 [5]\n" + " --(no-)implicit-rdpcm : Implicit residual DPCM. Currently only supported\n" + " with lossless coding. [disabled]\n" + " --(no-)tmvp : Temporal motion vector prediction [enabled]\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Parallel processing:\n" " --threads <integer> : Number of threads to use [auto]\n" - " - 0: process everything with main thread\n" - " - N: use N threads for encoding\n" - " - auto: select based on number of cores\n" - " --owf <integer> : Frame parallelism [auto]\n" - " - N: Process N-1 frames at a time\n" - " - auto: Select automatically\n" - " --(no-)wpp : Wavefront parallel processing [enabled]\n" + " - 0: Process everything with main thread.\n" + " - N: Use N threads for encoding.\n" + " - auto: Select automatically.\n" + " --owf <integer> : Frame-level parallelism [auto]\n" + " - N: Process N+1 frames at a time.\n" + " - auto: Select automatically.\n" + " --(no-)wpp : Wavefront parallel processing. [enabled]\n" " Enabling tiles automatically disables WPP.\n" " To enable WPP with tiles, re-enable it after\n" - " enabling tiles.\n" + " enabling tiles. 
Enabling wpp with tiles is,\n" + " however, an experimental feature since it is\n" + " not supported in any HEVC profile.\n" " --tiles <int>x<int> : Split picture into width x height uniform tiles.\n" " --tiles-width-split <string>|u<int> :\n" - " Specifies a comma separated list of pixel\n" - " positions of tiles columns separation coordinates.\n" - " Can also be u followed by and a single int n,\n" - " in which case it produces columns of uniform width.\n" + " - <string>: A comma-separated list of tile\n" + " column pixel coordinates.\n" + " - u<int>: Number of tile columns of uniform\n" + " width.\n" " --tiles-height-split <string>|u<int> :\n" - " Specifies a comma separated list of pixel\n" - " positions of tiles rows separation coordinates.\n" - " Can also be u followed by and a single int n,\n" - " in which case it produces rows of uniform height.\n" - " --slices <string> : Control how slices are used\n" - " - tiles: put tiles in independent slices\n" - " - wpp: put rows in dependent slices\n" - " - tiles+wpp: do both\n" + " - <string>: A comma-separated list of tile row\n" + " column pixel coordinates.\n" + " - u<int>: Number of tile rows of uniform\n" + " height.\n" + " --slices <string> : Control how slices are used.\n" + " - tiles: Put tiles in independent slices.\n" + " - wpp: Put rows in dependent slices.\n" + " - tiles+wpp: Do both.\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Video Usability Information:\n" - " --sar <width:height> : Specify Sample Aspect Ratio\n" + " --sar <width:height> : Specify sample aspect ratio\n" " --overscan <string> : Specify crop overscan setting [undef]\n" " - undef, show, crop\n" " --videoformat <string> : Specify video format [undef]\n" - " - component, pal, ntsc, secam, mac, undef\n" + " - undef, component, pal, ntsc, secam, mac\n" " --range <string> : Specify color range [tv]\n" " - tv, pc\n" " --colorprim <string> : Specify color primaries [undef]\n" @@ -493,10 +554,10 @@ " smpte240m, GBR, YCgCo, bt2020nc, bt2020c\n" " --chromaloc <integer> : Specify chroma sample location (0 to 5) [0]\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Deprecated parameters: (might be removed at some point)\n" - " -w, --width : Use --input-res\n" - " -h, --height : Use --input-res\n"); + " -w, --width <integer> : Use --input-res.\n" + " -h, --height <integer> : Use --input-res.\n"); }
View file
kvazaar-1.2.0.tar.gz/src/cu.c -> kvazaar-1.3.0.tar.gz/src/cu.c
Changed
@@ -184,9 +184,10 @@ */ cu_array_t * kvz_cu_array_copy_ref(cu_array_t* cua) { - // The caller should have had another reference. - assert(cua->refcount > 0); - KVZ_ATOMIC_INC(&cua->refcount); + int32_t new_refcount = KVZ_ATOMIC_INC(&cua->refcount); + // The caller should have had another reference and we added one + // reference so refcount should be at least 2. + assert(new_refcount >= 2); return cua; }
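Note: the old code asserted on cua->refcount before incrementing it, so the value it checked was not the value the increment acted on; the new code asserts on the value returned by KVZ_ATOMIC_INC instead. KVZ_ATOMIC_INC itself is defined elsewhere in the tree, but the same increment-then-check pattern can be sketched with C11 atomics:

    #include <assert.h>
    #include <stdatomic.h>

    typedef struct { atomic_int refcount; } object_t;

    static object_t *copy_ref(object_t *obj)
    {
        /* atomic_fetch_add returns the value before the increment, so a
         * caller that already holds a reference implies old >= 1 and the
         * new count is at least 2 -- checked on the value returned by the
         * atomic operation, not on a separately loaded one. */
        int old = atomic_fetch_add(&obj->refcount, 1);
        assert(old >= 1);
        return obj;
    }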
View file
kvazaar-1.2.0.tar.gz/src/cu.h -> kvazaar-1.3.0.tar.gz/src/cu.h
Changed
@@ -123,6 +123,7 @@ uint8_t skipped : 1; //!< \brief flag to indicate this block is skipped uint8_t merged : 1; //!< \brief flag to indicate this block is merged uint8_t merge_idx : 3; //!< \brief merge index + uint8_t tr_skip : 1; //!< \brief transform skip flag uint16_t cbf; @@ -137,7 +138,6 @@ struct { int8_t mode; int8_t mode_chroma; - int8_t tr_skip; //!< \brief transform skip flag #if KVZ_SEL_ENCRYPTION int8_t mode_encry; #endif
View file
kvazaar-1.2.0.tar.gz/src/encmain.c -> kvazaar-1.3.0.tar.gz/src/encmain.c
Changed
@@ -27,6 +27,9 @@ /* The following two defines must be located before the inclusion of any system header files. */ #define WINVER 0x0500 #define _WIN32_WINNT 0x0500 + +#include "global.h" // IWYU pragma: keep + #include <fcntl.h> /* _O_BINARY */ #include <io.h> /* _setmode() */ #endif @@ -41,7 +44,6 @@ #include "checkpoint.h" #include "cli.h" #include "encoder.h" -#include "global.h" // IWYU pragma: keep #include "kvazaar.h" #include "kvazaar_internal.h" #include "threads.h" @@ -431,6 +433,12 @@ uint32_t frames_done = 0; double psnr_sum[3] = { 0.0, 0.0, 0.0 }; + // how many bits have been written this second? used for checking if framerate exceeds level's limits + uint64_t bits_this_second = 0; + // the amount of frames have been encoded in this second of video. can be non-integer value if framerate is non-integer value + unsigned frames_this_second = 0; + const float framerate = ((float)encoder->cfg.framerate_num) / ((float)encoder->cfg.framerate_denom); + uint8_t padding_x = get_padding(opts->config->width); uint8_t padding_y = get_padding(opts->config->height); @@ -527,6 +535,39 @@ fflush(output); bitstream_length += len_out; + + // the level's bitrate check + frames_this_second += 1; + + if ((float)frames_this_second >= framerate) { + // if framerate <= 1 then we go here always + + // how much of the bits of the last frame belonged to the next second + uint64_t leftover_bits = (uint64_t)((double)len_out * ((double)frames_this_second - framerate)); + + // the latest frame is counted for the amount that it contributed to this current second + bits_this_second += len_out - leftover_bits; + + if (bits_this_second > encoder->cfg.max_bitrate) { + fprintf(stderr, "Level warning: This %s's bitrate (%llu bits/s) reached the maximum bitrate (%u bits/s) of %s tier level %g.", + framerate >= 1.0f ? "second" : "frame", + (unsigned long long) bits_this_second, + encoder->cfg.max_bitrate, + encoder->cfg.high_tier ? "high" : "main", + (float)encoder->cfg.level / 10.0f ); + } + + if (framerate > 1.0f) { + // leftovers for the next second + bits_this_second = leftover_bits; + } else { + // one or more next seconds are from this frame and their bitrate is the same or less as this frame's + bits_this_second = 0; + } + frames_this_second = 0; + } else { + bits_this_second += len_out; + } // Compute and print stats.
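Note: the new code in encmain.c accumulates the bits written per second of video and prints a level warning when they exceed the configured level's maximum bitrate; when a frame straddles a second boundary, its bits are split proportionally and the remainder is carried into the next second. A small stand-alone illustration of that split, assuming a hypothetical 29.97 fps stream whose 30th frame is 120000 bits:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const float framerate = 30000.0f / 1001.0f;   /* ~29.97 fps */
        unsigned frames_this_second = 30;             /* 30 >= 29.97: second is full */
        uint64_t len_out = 120000;                    /* bits of the 30th frame */

        /* Same expression as in encmain.c above. */
        uint64_t leftover = (uint64_t)((double)len_out *
                                       ((double)frames_this_second - framerate));
        printf("counted this second: %llu, carried over: %llu\n",
               (unsigned long long)(len_out - leftover),
               (unsigned long long)leftover);         /* 116404 and 3596 */
        return 0;
    }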
View file
kvazaar-1.2.0.tar.gz/src/encode_coding_tree.c -> kvazaar-1.3.0.tar.gz/src/encode_coding_tree.c
Changed
@@ -30,6 +30,7 @@ #include "intra.h" #include "kvazaar.h" #include "kvz_math.h" +#include "strategyselector.h" #include "tables.h" #include "videoframe.h" @@ -46,10 +47,10 @@ * This method encodes the X and Y component within a block of the last * significant coefficient. */ -static void encode_last_significant_xy(cabac_data_t * const cabac, - uint8_t lastpos_x, uint8_t lastpos_y, - uint8_t width, uint8_t height, - uint8_t type, uint8_t scan) +void kvz_encode_last_significant_xy(cabac_data_t * const cabac, + uint8_t lastpos_x, uint8_t lastpos_y, + uint8_t width, uint8_t height, + uint8_t type, uint8_t scan) { const int index = kvz_math_floor_log2(width) - 2; uint8_t ctx_offset = type ? 0 : (index * 3 + (index + 1) / 4); @@ -100,250 +101,6 @@ } } -void kvz_encode_coeff_nxn(encoder_state_t * const state, - cabac_data_t * const cabac, - const coeff_t *coeff, - uint8_t width, - uint8_t type, - int8_t scan_mode, - int8_t tr_skip) -{ - const encoder_control_t * const encoder = state->encoder_control; - int c1 = 1; - uint8_t last_coeff_x = 0; - uint8_t last_coeff_y = 0; - int32_t i; - uint32_t sig_coeffgroup_flag[8 * 8] = { 0 }; - - int8_t be_valid = encoder->cfg.signhide_enable; - int32_t scan_pos_sig; - uint32_t go_rice_param = 0; - uint32_t blk_pos, pos_y, pos_x, sig, ctx_sig; - - // CONSTANTS - const uint32_t num_blk_side = width >> TR_MIN_LOG2_SIZE; - const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2; - const uint32_t *scan = - kvz_g_sig_last_scan[scan_mode][log2_block_size - 1]; - const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 2][scan_mode]; - - // Init base contexts according to block type - cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.cu_sig_coeff_group_model[type]); - cabac_ctx_t *baseCtx = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0]) : - &(cabac->ctx.cu_sig_model_chroma[0]); - - // Scan all coeff groups to find out which of them have coeffs. - // Populate sig_coeffgroup_flag with that info. - - unsigned sig_cg_cnt = 0; - for (int cg_y = 0; cg_y < width / 4; ++cg_y) { - for (int cg_x = 0; cg_x < width / 4; ++cg_x) { - unsigned cg_pos = cg_y * width * 4 + cg_x * 4; - for (int coeff_row = 0; coeff_row < 4; ++coeff_row) { - // Load four 16-bit coeffs and see if any of them are non-zero. - unsigned coeff_pos = cg_pos + coeff_row * width; - uint64_t four_coeffs = *(uint64_t*)(&coeff[coeff_pos]); - if (four_coeffs) { - ++sig_cg_cnt; - unsigned cg_pos_y = (cg_pos >> log2_block_size) >> TR_MIN_LOG2_SIZE; - unsigned cg_pos_x = (cg_pos & (width - 1)) >> TR_MIN_LOG2_SIZE; - sig_coeffgroup_flag[cg_pos_x + cg_pos_y * num_blk_side] = 1; - break; - } - } - } - } - - // Rest of the code assumes at least one non-zero coeff. - assert(sig_cg_cnt > 0); - - // Find the last coeff group by going backwards in scan order. - unsigned scan_cg_last = num_blk_side * num_blk_side - 1; - while (!sig_coeffgroup_flag[scan_cg[scan_cg_last]]) { - --scan_cg_last; - } - - // Find the last coeff by going backwards in scan order. - unsigned scan_pos_last = scan_cg_last * 16 + 15; - while (!coeff[scan[scan_pos_last]]) { - --scan_pos_last; - } - - int pos_last = scan[scan_pos_last]; - - // transform skip flag - if(width == 4 && encoder->cfg.trskip_enable) { - cabac->cur_ctx = (type == 0) ? 
&(cabac->ctx.transform_skip_model_luma) : &(cabac->ctx.transform_skip_model_chroma); - CABAC_BIN(cabac, tr_skip, "transform_skip_flag"); - } - - last_coeff_x = pos_last & (width - 1); - last_coeff_y = (uint8_t)(pos_last >> log2_block_size); - - // Code last_coeff_x and last_coeff_y - encode_last_significant_xy(cabac, - last_coeff_x, - last_coeff_y, - width, - width, - type, - scan_mode); - - scan_pos_sig = scan_pos_last; - - // significant_coeff_flag - for (i = scan_cg_last; i >= 0; i--) { - int32_t sub_pos = i << 4; // LOG2_SCAN_SET_SIZE; - int32_t abs_coeff[16]; - int32_t cg_blk_pos = scan_cg[i]; - int32_t cg_pos_y = cg_blk_pos / num_blk_side; - int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * num_blk_side); - - uint32_t coeff_signs = 0; - int32_t last_nz_pos_in_cg = -1; - int32_t first_nz_pos_in_cg = 16; - int32_t num_non_zero = 0; - go_rice_param = 0; - - if (scan_pos_sig == scan_pos_last) { - abs_coeff[0] = abs(coeff[pos_last]); - coeff_signs = (coeff[pos_last] < 0); - num_non_zero = 1; - last_nz_pos_in_cg = scan_pos_sig; - first_nz_pos_in_cg = scan_pos_sig; - scan_pos_sig--; - } - - if (i == scan_cg_last || i == 0) { - sig_coeffgroup_flag[cg_blk_pos] = 1; - } else { - uint32_t sig_coeff_group = (sig_coeffgroup_flag[cg_blk_pos] != 0); - uint32_t ctx_sig = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, - cg_pos_y, width); - cabac->cur_ctx = &base_coeff_group_ctx[ctx_sig]; - CABAC_BIN(cabac, sig_coeff_group, "coded_sub_block_flag"); - } - - if (sig_coeffgroup_flag[cg_blk_pos]) { - int32_t pattern_sig_ctx = kvz_context_calc_pattern_sig_ctx(sig_coeffgroup_flag, - cg_pos_x, cg_pos_y, width); - - for (; scan_pos_sig >= sub_pos; scan_pos_sig--) { - blk_pos = scan[scan_pos_sig]; - pos_y = blk_pos >> log2_block_size; - pos_x = blk_pos - (pos_y << log2_block_size); - sig = (coeff[blk_pos] != 0) ? 1 : 0; - - if (scan_pos_sig > sub_pos || i == 0 || num_non_zero) { - ctx_sig = kvz_context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y, - log2_block_size, type); - cabac->cur_ctx = &baseCtx[ctx_sig]; - CABAC_BIN(cabac, sig, "sig_coeff_flag"); - } - - if (sig) { - abs_coeff[num_non_zero] = abs(coeff[blk_pos]); - coeff_signs = 2 * coeff_signs + (coeff[blk_pos] < 0); - num_non_zero++; - - if (last_nz_pos_in_cg == -1) { - last_nz_pos_in_cg = scan_pos_sig; - } - - first_nz_pos_in_cg = scan_pos_sig; - } - } - } else { - scan_pos_sig = sub_pos - 1; - } - - if (num_non_zero > 0) { - bool sign_hidden = last_nz_pos_in_cg - first_nz_pos_in_cg >= 4 /* SBH_THRESHOLD */ - && !encoder->cfg.lossless; - uint32_t ctx_set = (i > 0 && type == 0) ? 2 : 0; - cabac_ctx_t *base_ctx_mod; - int32_t num_c1_flag, first_c2_flag_idx, idx, first_coeff2; - - if (c1 == 0) { - ctx_set++; - } - - c1 = 1; - - base_ctx_mod = (type == 0) ? &(cabac->ctx.cu_one_model_luma[4 * ctx_set]) : - &(cabac->ctx.cu_one_model_chroma[4 * ctx_set]); - num_c1_flag = MIN(num_non_zero, C1FLAG_NUMBER); - first_c2_flag_idx = -1; - - for (idx = 0; idx < num_c1_flag; idx++) { - uint32_t symbol = (abs_coeff[idx] > 1) ? 1 : 0; - cabac->cur_ctx = &base_ctx_mod[c1]; - CABAC_BIN(cabac, symbol, "coeff_abs_level_greater1_flag"); - - if (symbol) { - c1 = 0; - - if (first_c2_flag_idx == -1) { - first_c2_flag_idx = idx; - } - } else if ((c1 < 3) && (c1 > 0)) { - c1++; - } - } - - if (c1 == 0) { - base_ctx_mod = (type == 0) ? &(cabac->ctx.cu_abs_model_luma[ctx_set]) : - &(cabac->ctx.cu_abs_model_chroma[ctx_set]); - - if (first_c2_flag_idx != -1) { - uint8_t symbol = (abs_coeff[first_c2_flag_idx] > 2) ? 
1 : 0; - cabac->cur_ctx = &base_ctx_mod[0]; - CABAC_BIN(cabac, symbol, "coeff_abs_level_greater2_flag"); - } - } - if (be_valid && sign_hidden) { - coeff_signs = coeff_signs >> 1; - if (!cabac->only_count) - if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) { - coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero-1); - } - CABAC_BINS_EP(cabac, coeff_signs , (num_non_zero - 1), "coeff_sign_flag"); - } else { - if (!cabac->only_count) - if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) - coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero); - CABAC_BINS_EP(cabac, coeff_signs, num_non_zero, "coeff_sign_flag"); - } - - if (c1 == 0 || num_non_zero > C1FLAG_NUMBER) { - first_coeff2 = 1; - - for (idx = 0; idx < num_non_zero; idx++) { - int32_t base_level = (idx < C1FLAG_NUMBER) ? (2 + first_coeff2) : 1; - - if (abs_coeff[idx] >= base_level) { - if (!cabac->only_count) { - if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFFS) - kvz_cabac_write_coeff_remain_encry(state, cabac, abs_coeff[idx] - base_level, go_rice_param, base_level); - else - kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param); - } else - kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param); - - if (abs_coeff[idx] > 3 * (1 << go_rice_param)) { - go_rice_param = MIN(go_rice_param + 1, 4); - } - } - - if (abs_coeff[idx] >= 2) { - first_coeff2 = 0; - } - } - } - } - } -} - static void encode_transform_unit(encoder_state_t * const state, int x, int y, int depth) { @@ -372,7 +129,7 @@ width, 0, scan_idx, - cur_pu->intra.tr_skip); + cur_pu->tr_skip); } if (depth == MAX_DEPTH + 1) { @@ -435,7 +192,9 @@ const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, x, y); // Round coordinates down to a multiple of 8 to get the location of the // containing CU. - const cu_info_t *cur_cu = kvz_cu_array_at_const(frame->cu_array, x & ~7, y & ~7); + const int x_cu = 8 * (x / 8); + const int y_cu = 8 * (y / 8); + const cu_info_t *cur_cu = kvz_cu_array_at_const(frame->cu_array, x_cu, y_cu); // NxN signifies implicit transform split at the first transform level. 
// There is a similar implicit split for inter, but it is only used when @@ -508,9 +267,10 @@ if (cb_flag_y | cb_flag_u | cb_flag_v) { if (state->must_code_qp_delta) { - const int qp_delta = state->qp - state->ref_qp; - const int qp_delta_abs = ABS(qp_delta); - cabac_data_t* cabac = &state->cabac; + const int qp_pred = kvz_get_cu_ref_qp(state, x_cu, y_cu, state->last_qp); + const int qp_delta = cur_cu->qp - qp_pred; + const int qp_delta_abs = ABS(qp_delta); + cabac_data_t* cabac = &state->cabac; // cu_qp_delta_abs prefix cabac->cur_ctx = &cabac->ctx.cu_qp_delta_abs[0]; @@ -526,7 +286,6 @@ } state->must_code_qp_delta = false; - state->ref_qp = state->qp; } encode_transform_unit(state, x, y, depth); @@ -543,7 +302,7 @@ int16_t num_cand = 0; cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model); CABAC_BIN(cabac, cur_cu->merged, "MergeFlag"); - num_cand = MRG_MAX_NUM_CANDS; + num_cand = state->encoder_control->cfg.max_merge; if (cur_cu->merged) { //merge if (num_cand > 1) { int32_t ui; @@ -559,122 +318,76 @@ } } } else { - uint32_t ref_list_idx; - - // Void TEncSbac::codeInterDir( TComDataCU* pcCU, UInt uiAbsPartIdx ) - if (state->frame->slicetype == KVZ_SLICE_B) - { + if (state->frame->slicetype == KVZ_SLICE_B) { // Code Inter Dir uint8_t inter_dir = cur_cu->inter.mv_dir-1; - uint8_t ctx = depth; - - if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 8) - { - cabac->cur_ctx = &(cabac->ctx.inter_dir[ctx]); + if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 8) { + cabac->cur_ctx = &(cabac->ctx.inter_dir[depth]); CABAC_BIN(cabac, (inter_dir == 2), "inter_pred_idc"); } - if (inter_dir < 2) - { + if (inter_dir < 2) { cabac->cur_ctx = &(cabac->ctx.inter_dir[4]); CABAC_BIN(cabac, inter_dir, "inter_pred_idc"); } } - for (ref_list_idx = 0; ref_list_idx < 2; ref_list_idx++) { - if (cur_cu->inter.mv_dir & (1 << ref_list_idx)) { + for (uint32_t ref_list_idx = 0; ref_list_idx < 2; ref_list_idx++) { + if (!(cur_cu->inter.mv_dir & (1 << ref_list_idx))) { + continue; + } - // size of the current reference index list (L0/L1) - uint8_t ref_LX_size = state->frame->ref_LX_size[ref_list_idx]; + // size of the current reference index list (L0/L1) + uint8_t ref_LX_size = state->frame->ref_LX_size[ref_list_idx]; - if (ref_LX_size > 1) { - // parseRefFrmIdx - int32_t ref_frame = cur_cu->inter.mv_ref[ref_list_idx]; + if (ref_LX_size > 1) { + // parseRefFrmIdx + int32_t ref_frame = cur_cu->inter.mv_ref[ref_list_idx]; - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]); - CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX"); + cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]); + CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX"); - if (ref_frame > 0) { - int32_t i; - int32_t ref_num = ref_LX_size - 2; + if (ref_frame > 0) { + ref_frame--; - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]); - ref_frame--; + int32_t ref_num = ref_LX_size - 2; - for (i = 0; i < ref_num; ++i) { - const uint32_t symbol = (i == ref_frame) ? 0 : 1; + for (int32_t i = 0; i < ref_num; ++i) { + const uint32_t symbol = (i == ref_frame) ? 
0 : 1; - if (i == 0) { - CABAC_BIN(cabac, symbol, "ref_idx_lX"); - } else { - CABAC_BIN_EP(cabac, symbol, "ref_idx_lX"); - } - if (symbol == 0) break; + if (i == 0) { + cabac->cur_ctx = &cabac->ctx.cu_ref_pic_model[1]; + CABAC_BIN(cabac, symbol, "ref_idx_lX"); + } else { + CABAC_BIN_EP(cabac, symbol, "ref_idx_lX"); } + if (symbol == 0) break; } } + } - if (!(/*pcCU->getSlice()->getMvdL1ZeroFlag() &&*/ state->frame->ref_list == REF_PIC_LIST_1 && cur_cu->inter.mv_dir == 3)) { - - int16_t mv_cand[2][2]; - kvz_inter_get_mv_cand_cua( - state, - x, y, width, height, - mv_cand, cur_cu, ref_list_idx); - - uint8_t cu_mv_cand = CU_GET_MV_CAND(cur_cu, ref_list_idx); - - const int32_t mvd_hor = cur_cu->inter.mv[ref_list_idx][0] - mv_cand[cu_mv_cand][0]; - const int32_t mvd_ver = cur_cu->inter.mv[ref_list_idx][1] - mv_cand[cu_mv_cand][1]; - const int8_t hor_abs_gr0 = mvd_hor != 0; - const int8_t ver_abs_gr0 = mvd_ver != 0; - const uint32_t mvd_hor_abs = abs(mvd_hor); - const uint32_t mvd_ver_abs = abs(mvd_ver); + if (state->frame->ref_list != REF_PIC_LIST_1 || cur_cu->inter.mv_dir != 3) { + int16_t mv_cand[2][2]; + kvz_inter_get_mv_cand_cua( + state, + x, y, width, height, + mv_cand, cur_cu, ref_list_idx); - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[0]); - CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor"); - CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver"); + uint8_t cu_mv_cand = CU_GET_MV_CAND(cur_cu, ref_list_idx); + const int32_t mvd_hor = cur_cu->inter.mv[ref_list_idx][0] - mv_cand[cu_mv_cand][0]; + const int32_t mvd_ver = cur_cu->inter.mv[ref_list_idx][1] - mv_cand[cu_mv_cand][1]; - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[1]); + kvz_encode_mvd(state, cabac, mvd_hor, mvd_ver); + } - if (hor_abs_gr0) { - CABAC_BIN(cabac, (mvd_hor_abs>1), "abs_mvd_greater1_flag_hor"); - } + // Signal which candidate MV to use + kvz_cabac_write_unary_max_symbol(cabac, + cabac->ctx.mvp_idx_model, + CU_GET_MV_CAND(cur_cu, ref_list_idx), + 1, + AMVP_MAX_NUM_CANDS - 1); - if (ver_abs_gr0) { - CABAC_BIN(cabac, (mvd_ver_abs>1), "abs_mvd_greater1_flag_ver"); - } - - if (hor_abs_gr0) { - if (mvd_hor_abs > 1) { - kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs-2, 1); - } - uint32_t mvd_hor_sign = (mvd_hor>0)?0:1; - if(!state->cabac.only_count) - if (state->encoder_control->cfg.crypto_features & KVZ_CRYPTO_MV_SIGNS) - mvd_hor_sign = mvd_hor_sign^kvz_crypto_get_key(state->crypto_hdl, 1); - CABAC_BIN_EP(cabac, mvd_hor_sign, "mvd_sign_flag_hor"); - } - if (ver_abs_gr0) { - if (mvd_ver_abs > 1) { - kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs-2, 1); - } - uint32_t mvd_ver_sign = (mvd_ver>0)?0:1; - if(!state->cabac.only_count) - if (state->encoder_control->cfg.crypto_features & KVZ_CRYPTO_MV_SIGNS) - mvd_ver_sign = mvd_ver_sign^kvz_crypto_get_key(state->crypto_hdl, 1); - CABAC_BIN_EP(cabac, mvd_ver_sign, "mvd_sign_flag_ver"); - } - } - - // Signal which candidate MV to use - kvz_cabac_write_unary_max_symbol(cabac, - cabac->ctx.mvp_idx_model, - CU_GET_MV_CAND(cur_cu, ref_list_idx), - 1, - AMVP_MAX_NUM_CANDS - 1); - } } // for ref_list } // if !merge } @@ -1003,6 +716,9 @@ const videoframe_t * const frame = state->tile->frame; const cu_info_t *cur_cu = kvz_cu_array_at_const(frame->cu_array, x, y); + const int cu_width = LCU_WIDTH >> depth; + const int half_cu = cu_width >> 1; + const cu_info_t *left_cu = NULL; if (x > 0) { left_cu = kvz_cu_array_at_const(frame->cu_array, x - 1, y); @@ -1019,13 +735,17 @@ uint16_t abs_x = x + state->tile->offset_x; uint16_t abs_y = y + 
state->tile->offset_y; - // Check for slice border FIXME - bool border_x = ctrl->in.width < abs_x + (LCU_WIDTH >> depth); - bool border_y = ctrl->in.height < abs_y + (LCU_WIDTH >> depth); - bool border_split_x = ctrl->in.width >= abs_x + (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1)); - bool border_split_y = ctrl->in.height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1)); + // Check for slice border + bool border_x = ctrl->in.width < abs_x + cu_width; + bool border_y = ctrl->in.height < abs_y + cu_width; + bool border_split_x = ctrl->in.width >= abs_x + (LCU_WIDTH >> MAX_DEPTH) + half_cu; + bool border_split_y = ctrl->in.height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + half_cu; bool border = border_x || border_y; /*!< are we in any border CU */ + if (depth <= ctrl->max_qp_delta_depth) { + state->must_code_qp_delta = true; + } + // When not in MAX_DEPTH, insert split flag and split the blocks if needed if (depth != MAX_DEPTH) { // Implisit split flag when on border @@ -1045,25 +765,22 @@ if (split_flag || border) { // Split blocks and remember to change x and y block positions - int offset = LCU_WIDTH >> (depth + 1); - kvz_encode_coding_tree(state, x, y, depth + 1); - // TODO: fix when other half of the block would not be completely over the border if (!border_x || border_split_x) { - kvz_encode_coding_tree(state, x + offset, y, depth + 1); + kvz_encode_coding_tree(state, x + half_cu, y, depth + 1); } if (!border_y || border_split_y) { - kvz_encode_coding_tree(state, x, y + offset, depth + 1); + kvz_encode_coding_tree(state, x, y + half_cu, depth + 1); } if (!border || (border_split_x && border_split_y)) { - kvz_encode_coding_tree(state, x + offset, y + offset, depth + 1); + kvz_encode_coding_tree(state, x + half_cu, y + half_cu, depth + 1); } return; } } - if (state->encoder_control->cfg.lossless) { + if (ctrl->cfg.lossless) { cabac->cur_ctx = &cabac->ctx.cu_transquant_bypass; CABAC_BIN(cabac, 1, "cu_transquant_bypass_flag"); } @@ -1084,7 +801,7 @@ CABAC_BIN(cabac, cur_cu->skipped, "SkipFlag"); if (cur_cu->skipped) { - int16_t num_cand = MRG_MAX_NUM_CANDS; + int16_t num_cand = state->encoder_control->cfg.max_merge; if (num_cand > 1) { for (int ui = 0; ui < num_cand - 1; ui++) { int32_t symbol = (ui != cur_cu->merge_idx); @@ -1099,7 +816,7 @@ } } } - return; + goto end; } } @@ -1114,7 +831,6 @@ if (cur_cu->type == CU_INTER) { const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size]; - const int cu_width = LCU_WIDTH >> depth; for (int i = 0; i < num_pu; ++i) { const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i); @@ -1185,4 +901,59 @@ assert(0); exit(1); } + +end: + + if (is_last_cu_in_qg(state, x, y, depth)) { + state->last_qp = cur_cu->qp; + } +} + + +void kvz_encode_mvd(encoder_state_t * const state, + cabac_data_t *cabac, + int32_t mvd_hor, + int32_t mvd_ver) +{ + const int8_t hor_abs_gr0 = mvd_hor != 0; + const int8_t ver_abs_gr0 = mvd_ver != 0; + const uint32_t mvd_hor_abs = abs(mvd_hor); + const uint32_t mvd_ver_abs = abs(mvd_ver); + + cabac->cur_ctx = &cabac->ctx.cu_mvd_model[0]; + CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor"); + CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver"); + + cabac->cur_ctx = &cabac->ctx.cu_mvd_model[1]; + if (hor_abs_gr0) { + CABAC_BIN(cabac, (mvd_hor_abs>1), "abs_mvd_greater1_flag_hor"); + } + if (ver_abs_gr0) { + CABAC_BIN(cabac, (mvd_ver_abs>1), "abs_mvd_greater1_flag_ver"); + } + + if (hor_abs_gr0) { + if (mvd_hor_abs > 1) { + kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1); + } 
+ uint32_t mvd_hor_sign = (mvd_hor > 0) ? 0 : 1; + if (!state->cabac.only_count && + state->encoder_control->cfg.crypto_features & KVZ_CRYPTO_MV_SIGNS) + { + mvd_hor_sign = mvd_hor_sign ^ kvz_crypto_get_key(state->crypto_hdl, 1); + } + CABAC_BIN_EP(cabac, mvd_hor_sign, "mvd_sign_flag_hor"); + } + if (ver_abs_gr0) { + if (mvd_ver_abs > 1) { + kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1); + } + uint32_t mvd_ver_sign = mvd_ver > 0 ? 0 : 1; + if (!state->cabac.only_count && + state->encoder_control->cfg.crypto_features & KVZ_CRYPTO_MV_SIGNS) + { + mvd_ver_sign = mvd_ver_sign^kvz_crypto_get_key(state->crypto_hdl, 1); + } + CABAC_BIN_EP(cabac, mvd_ver_sign, "mvd_sign_flag_ver"); + } }
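kvz_encode_mvd above writes the greater0 and greater1 flags for both MVD components first and then the remainders and signs. The sketch below shows the binarization of a single component in isolation: abs_mvd_greater0_flag, abs_mvd_greater1_flag, |mvd| - 2 as first-order Exp-Golomb, and a bypass-coded sign bit. CABAC context modelling and the encryption path are left out; this is an illustration, not the kvazaar API.

#include <stdio.h>
#include <stdlib.h>

/* First-order Exp-Golomb, same bin structure as the ep_ex_golomb writer
 * used by kvz_encode_mvd: unary '1' prefix, '0' terminator, suffix bits. */
static void put_eg1(unsigned symbol)
{
  unsigned count = 1;
  while (symbol >= (1u << count)) {
    putchar('1');
    symbol -= 1u << count;
    ++count;
  }
  putchar('0');
  while (count--) putchar(((symbol >> count) & 1) ? '1' : '0');
}

static void print_mvd_bins(int mvd)
{
  unsigned mvd_abs = abs(mvd);
  putchar(mvd_abs > 0 ? '1' : '0');          /* abs_mvd_greater0_flag */
  if (mvd_abs == 0) { putchar('\n'); return; }
  putchar(mvd_abs > 1 ? '1' : '0');          /* abs_mvd_greater1_flag */
  if (mvd_abs > 1) put_eg1(mvd_abs - 2);     /* abs_mvd_minus2 as EG1 */
  putchar(mvd < 0 ? '1' : '0');              /* mvd_sign_flag         */
  putchar('\n');
}

int main(void)
{
  print_mvd_bins(0);    /* "0"                                    */
  print_mvd_bins(1);    /* "100": >0, not >1, positive            */
  print_mvd_bins(-3);   /* "11011": >0, >1, EG1(1) = "01", sign 1 */
  return 0;
}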
View file
kvazaar-1.2.0.tar.gz/src/encode_coding_tree.h -> kvazaar-1.3.0.tar.gz/src/encode_coding_tree.h
Changed
@@ -34,12 +34,14 @@ uint16_t y_ctb, uint8_t depth); -void kvz_encode_coeff_nxn(encoder_state_t * const state, - cabac_data_t * const cabac, - const coeff_t *coeff, - uint8_t width, - uint8_t type, - int8_t scan_mode, - int8_t tr_skip); +void kvz_encode_mvd(encoder_state_t * const state, + cabac_data_t *cabac, + int32_t mvd_hor, + int32_t mvd_ver); + +void kvz_encode_last_significant_xy(cabac_data_t * const cabac, + uint8_t lastpos_x, uint8_t lastpos_y, + uint8_t width, uint8_t height, + uint8_t type, uint8_t scan); #endif // ENCODE_CODING_TREE_H_
View file
kvazaar-1.2.0.tar.gz/src/encoder.c -> kvazaar-1.3.0.tar.gz/src/encoder.c
Changed
@@ -305,7 +305,7 @@ kvz_scalinglist_init(&encoder->scaling_list); // CQM - if (cfg->cqmfile) { + if (cfg->scaling_list == KVZ_SCALING_LIST_CUSTOM && cfg->cqmfile) { FILE* cqmfile = fopen(cfg->cqmfile, "rb"); if (cqmfile) { kvz_scalinglist_parse(&encoder->scaling_list, cqmfile); @@ -314,7 +314,12 @@ fprintf(stderr, "Could not open CQM file.\n"); goto init_failed; } + } else if (cfg->scaling_list == KVZ_SCALING_LIST_DEFAULT) { + // Enable scaling lists if default lists are used + encoder->scaling_list.enable = 1; + encoder->scaling_list.use_default_list = 1; } + kvz_scalinglist_process(&encoder->scaling_list, encoder->bitdepth); kvz_encoder_control_input_init(encoder, encoder->cfg.width, encoder->cfg.height); @@ -347,13 +352,15 @@ } - encoder->lcu_dqp_enabled = cfg->target_bitrate > 0 || encoder->cfg.roi.dqps; + // NOTE: When tr_depth_inter is equal to 0, the transform is still split + // for SMP and AMP partition units. + encoder->tr_depth_inter = 0; - // When tr_depth_inter is equal to 0, inter transform split flag defaults - // to 1 for SMP and AMP partition units. We want to avoid the extra - // transform split so we set tr_depth_inter to 1 when SMP or AMP - // partition modes are enabled. - encoder->tr_depth_inter = (encoder->cfg.smp_enable || encoder->cfg.amp_enable) ? 1 : 0; + if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu) { + encoder->max_qp_delta_depth = 0; + } else { + encoder->max_qp_delta_depth = -1; + } //Tiles encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 || @@ -731,6 +738,7 @@ switch (num_layers) { case 0: case 1: + encoder->gop_layer_weights[0] = 1; break; // Use the first layers of the 4-layer weights.
View file
kvazaar-1.2.0.tar.gz/src/encoder.h -> kvazaar-1.3.0.tar.gz/src/encoder.h
Changed
@@ -118,7 +118,7 @@ //! Picture weights when GOP is used. double gop_layer_weights[MAX_GOP_LAYERS]; - bool lcu_dqp_enabled; + int8_t max_qp_delta_depth; int tr_depth_inter;
View file
kvazaar-1.2.0.tar.gz/src/encoder_state-bitstream.c -> kvazaar-1.3.0.tar.gz/src/encoder_state-bitstream.c
Changed
@@ -60,7 +60,7 @@ // PTL // Profile Tier WRITE_U(stream, 0, 2, "general_profile_space"); - WRITE_U(stream, 0, 1, "general_tier_flag"); + WRITE_U(stream, state->encoder_control->cfg.high_tier, 1, "general_tier_flag"); // Main Profile == 1, Main 10 profile == 2 WRITE_U(stream, (state->encoder_control->bitdepth == 8)?1:2, 5, "general_profile_idc"); /* Compatibility flags should be set at general_profile_idc @@ -80,8 +80,8 @@ // end Profile Tier - // Level 6.2 (general_level_idc is 30 * 6.2) - WRITE_U(stream, 186, 8, "general_level_idc"); + uint8_t level = state->encoder_control->cfg.level; + WRITE_U(stream, level * 3, 8, "general_level_idc"); WRITE_U(stream, 0, 1, "sub_layer_profile_present_flag"); WRITE_U(stream, 0, 1, "sub_layer_level_present_flag"); @@ -395,8 +395,11 @@ // scaling list WRITE_U(stream, encoder->scaling_list.enable, 1, "scaling_list_enable_flag"); if (encoder->scaling_list.enable) { - WRITE_U(stream, 1, 1, "sps_scaling_list_data_present_flag"); - encoder_state_write_bitstream_scaling_list(stream, state); + // Signal scaling list data for custom lists + WRITE_U(stream, (encoder->cfg.scaling_list == KVZ_SCALING_LIST_CUSTOM) ? 1 : 0, 1, "sps_scaling_list_data_present_flag"); + if (encoder->cfg.scaling_list == KVZ_SCALING_LIST_CUSTOM) { + encoder_state_write_bitstream_scaling_list(stream, state); + } } WRITE_U(stream, (encoder->cfg.amp_enable ? 1 : 0), 1, "amp_enabled_flag"); @@ -451,16 +454,21 @@ WRITE_UE(stream, 0, "num_ref_idx_l0_default_active_minus1"); WRITE_UE(stream, 0, "num_ref_idx_l1_default_active_minus1"); - WRITE_SE(stream, ((int8_t)encoder->cfg.qp) - 26, "pic_init_qp_minus26"); + + // If tiles and slices = tiles is enabled, signal QP in the slice header. Keeping the PPS constant for OMAF etc + // Keep QP constant here also if it will be only set at CU level. + bool constant_qp_in_pps = ((encoder->cfg.slices & KVZ_SLICES_TILES) && encoder->tiles_enable) || encoder->cfg.set_qp_in_cu; + WRITE_SE(stream, constant_qp_in_pps ? 0 : (((int8_t)encoder->cfg.qp) - 26), "pic_init_qp_minus26"); + WRITE_U(stream, 0, 1, "constrained_intra_pred_flag"); WRITE_U(stream, encoder->cfg.trskip_enable, 1, "transform_skip_enabled_flag"); - if (encoder->lcu_dqp_enabled) { + if (encoder->max_qp_delta_depth >= 0) { // Use separate QP for each LCU when rate control is enabled. WRITE_U(stream, 1, 1, "cu_qp_delta_enabled_flag"); - WRITE_UE(stream, 0, "diff_cu_qp_delta_depth"); + WRITE_UE(stream, encoder->max_qp_delta_depth, "diff_cu_qp_delta_depth"); } else { - WRITE_U(stream, 0, 1, "cu_qp_delta_enabled_flag"); + WRITE_U(stream, 0, 1, "cu_qp_delta_enabled_flag"); } //TODO: add QP offsets @@ -777,12 +785,12 @@ WRITE_U(stream, 1, 1, "slice_sao_chroma_flag"); } } - + if (state->frame->slicetype != KVZ_SLICE_I) { WRITE_U(stream, 1, 1, "num_ref_idx_active_override_flag"); - WRITE_UE(stream, ref_negative != 0 ? ref_negative - 1: 0, "num_ref_idx_l0_active_minus1"); + WRITE_UE(stream, MAX(0, ((int)state->frame->ref_LX_size[0]) - 1), "num_ref_idx_l0_active_minus1"); if (state->frame->slicetype == KVZ_SLICE_B) { - WRITE_UE(stream, ref_positive != 0 ? 
ref_positive - 1 : 0, "num_ref_idx_l1_active_minus1"); + WRITE_UE(stream, MAX(0, ((int)state->frame->ref_LX_size[1]) - 1), "num_ref_idx_l1_active_minus1"); WRITE_U(stream, 0, 1, "mvd_l1_zero_flag"); } @@ -799,12 +807,16 @@ WRITE_UE(stream, 0, "collocated_ref_idx"); } } - - WRITE_UE(stream, 5-MRG_MAX_NUM_CANDS, "five_minus_max_num_merge_cand"); + const uint8_t max_merge_cands = state->encoder_control->cfg.max_merge; + WRITE_UE(stream, 5- max_merge_cands, "five_minus_max_num_merge_cand"); } { - int slice_qp_delta = state->frame->QP - encoder->cfg.qp; + // If tiles are enabled, signal the full QP here (relative to the base value of 26) + // If QP is to be set only at CU level, force slice_qp_delta zero + bool signal_qp_in_slice_header = (encoder->cfg.slices & KVZ_SLICES_TILES) && encoder->tiles_enable; + int slice_qp_delta = state->frame->QP - (signal_qp_in_slice_header ? 26 : encoder->cfg.qp); + if(encoder->cfg.set_qp_in_cu) slice_qp_delta = 0; WRITE_SE(stream, slice_qp_delta, "slice_qp_delta"); } }
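In the hunk above general_level_idc is now written as cfg.level * 3. The removed constant 186 was documented as 30 * 6.2, so cfg.level evidently stores ten times the level number (62 for level 6.2) and multiplying by three yields the spec value of thirty times the level. A quick standalone check of that mapping (not part of kvazaar):

#include <stdio.h>

int main(void)
{
  const double levels[] = { 3.1, 4.0, 5.1, 6.2 };
  for (int i = 0; i < 4; i++) {
    int cfg_level = (int)(levels[i] * 10 + 0.5);  /* e.g. 62 for level 6.2 */
    printf("level %.1f -> cfg.level %d -> general_level_idc %d\n",
           levels[i], cfg_level, cfg_level * 3);
  }
  return 0;
}

This prints 93, 120, 153 and 186 for levels 3.1, 4.0, 5.1 and 6.2 respectively.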
View file
kvazaar-1.2.0.tar.gz/src/encoder_state-ctors_dtors.c -> kvazaar-1.3.0.tar.gz/src/encoder_state-ctors_dtors.c
Changed
@@ -312,6 +312,7 @@ child_state->children = MALLOC(encoder_state_t, 1); child_state->children[0].encoder_control = NULL; child_state->crypto_hdl = NULL; + child_state->must_code_qp_delta = false; child_state->tqj_bitstream_written = NULL; child_state->tqj_recon_done = NULL;
View file
kvazaar-1.2.0.tar.gz/src/encoderstate.c -> kvazaar-1.3.0.tar.gz/src/encoderstate.c
Changed
@@ -37,9 +37,6 @@ #include "tables.h" #include "threadqueue.h" -#define SAO_BUF_WIDTH (LCU_WIDTH + SAO_DELAY_PX + 2) -#define SAO_BUF_WIDTH_C (SAO_BUF_WIDTH / 2) - int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) { int i; @@ -250,10 +247,18 @@ { videoframe_t *const frame = state->tile->frame; - // Temporary buffers for SAO input pixels. - kvz_pixel sao_buf_y_array[SAO_BUF_WIDTH * SAO_BUF_WIDTH]; - kvz_pixel sao_buf_u_array[SAO_BUF_WIDTH_C * SAO_BUF_WIDTH_C]; - kvz_pixel sao_buf_v_array[SAO_BUF_WIDTH_C * SAO_BUF_WIDTH_C]; + + // Temporary buffers for SAO input pixels. The buffers cover the pixels + // inside the LCU (LCU_WIDTH x LCU_WIDTH), SAO_DELAY_PX wide bands to the + // left and above the LCU, and one pixel border on the left and top + // sides. We add two extra pixels to the buffers because the AVX2 SAO + // reconstruction reads up to two extra bytes when using edge SAO in the + // horizontal direction. +#define SAO_BUF_WIDTH (1 + SAO_DELAY_PX + LCU_WIDTH) +#define SAO_BUF_WIDTH_C (1 + SAO_DELAY_PX/2 + LCU_WIDTH_C) + kvz_pixel sao_buf_y_array[SAO_BUF_WIDTH * SAO_BUF_WIDTH + 2]; + kvz_pixel sao_buf_u_array[SAO_BUF_WIDTH_C * SAO_BUF_WIDTH_C + 2]; + kvz_pixel sao_buf_v_array[SAO_BUF_WIDTH_C * SAO_BUF_WIDTH_C + 2]; // Pointers to the top-left pixel of the LCU in the buffers. kvz_pixel *const sao_buf_y = &sao_buf_y_array[(SAO_DELAY_PX + 1) * (SAO_BUF_WIDTH + 1)]; @@ -526,68 +531,81 @@ /** * \brief Sets the QP for each CU in state->tile->frame->cu_array. * - * The QPs are used in deblocking. + * The QPs are used in deblocking and QP prediction. * - * The delta QP for an LCU is coded when the first CU with coded block flag - * set is encountered. Hence, for the purposes of deblocking, all CUs - * before the first one with cbf set use state->ref_qp and all CUs after - * that use state->qp. + * The QP delta for a quantization group is coded when the first CU with + * coded block flag set is encountered. Hence, for the purposes of + * deblocking and QP prediction, all CUs in before the first one that has + * cbf set use the QP predictor and all CUs after that use (QP predictor + * + QP delta). * * \param state encoder state * \param x x-coordinate of the left edge of the root CU * \param y y-coordinate of the top edge of the root CU * \param depth depth in the CU quadtree - * \param coeffs_coded Used for tracking whether a CU with a residual - * has been encountered. Should be set to false at - * the top level. - * \return Whether there were any CUs with residual or not. + * \param last_qp QP of the last CU in the last quantization group + * \param prev_qp -1 if QP delta has not been coded in current QG, + * otherwise the QP of the current QG */ -static bool set_cu_qps(encoder_state_t *state, int x, int y, int depth, bool coeffs_coded) +static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *last_qp, int *prev_qp) { - if (state->qp == state->ref_qp) { - // If the QPs are equal there is no need to care about the residuals. - coeffs_coded = true; - } + + // Stop recursion if the CU is completely outside the frame. 
+ if (x >= state->tile->frame->width || y >= state->tile->frame->height) return; cu_info_t *cu = kvz_cu_array_at(state->tile->frame->cu_array, x, y); const int cu_width = LCU_WIDTH >> depth; - coeffs_coded = coeffs_coded || cbf_is_set_any(cu->cbf, cu->depth); - if (!coeffs_coded && cu->depth > depth) { + if (depth <= state->encoder_control->max_qp_delta_depth) { + *prev_qp = -1; + } + + if (cu->depth > depth) { // Recursively process sub-CUs. const int d = cu_width >> 1; - coeffs_coded = set_cu_qps(state, x, y, depth + 1, coeffs_coded); - coeffs_coded = set_cu_qps(state, x + d, y, depth + 1, coeffs_coded); - coeffs_coded = set_cu_qps(state, x, y + d, depth + 1, coeffs_coded); - coeffs_coded = set_cu_qps(state, x + d, y + d, depth + 1, coeffs_coded); + set_cu_qps(state, x, y, depth + 1, last_qp, prev_qp); + set_cu_qps(state, x + d, y, depth + 1, last_qp, prev_qp); + set_cu_qps(state, x, y + d, depth + 1, last_qp, prev_qp); + set_cu_qps(state, x + d, y + d, depth + 1, last_qp, prev_qp); } else { - if (!coeffs_coded && cu->tr_depth > depth) { + bool cbf_found = *prev_qp >= 0; + + if (cu->tr_depth > depth) { // The CU is split into smaller transform units. Check whether coded // block flag is set for any of the TUs. const int tu_width = LCU_WIDTH >> cu->tr_depth; - for (int y_scu = y; y_scu < y + cu_width; y_scu += tu_width) { - for (int x_scu = x; x_scu < x + cu_width; x_scu += tu_width) { + for (int y_scu = y; !cbf_found && y_scu < y + cu_width; y_scu += tu_width) { + for (int x_scu = x; !cbf_found && x_scu < x + cu_width; x_scu += tu_width) { cu_info_t *tu = kvz_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu); if (cbf_is_set_any(tu->cbf, cu->depth)) { - coeffs_coded = true; + cbf_found = true; } } } + } else if (cbf_is_set_any(cu->cbf, cu->depth)) { + cbf_found = true; + } + + int8_t qp; + if (cbf_found) { + *prev_qp = qp = cu->qp; + } else { + qp = kvz_get_cu_ref_qp(state, x, y, *last_qp); } // Set the correct QP for all state->tile->frame->cu_array elements in // the area covered by the CU. - const int8_t qp = coeffs_coded ? state->qp : state->ref_qp; - for (int y_scu = y; y_scu < y + cu_width; y_scu += SCU_WIDTH) { for (int x_scu = x; x_scu < x + cu_width; x_scu += SCU_WIDTH) { kvz_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu)->qp = qp; } } - } - return coeffs_coded; + if (is_last_cu_in_qg(state, x, y, depth)) { + *last_qp = cu->qp; + } + } } @@ -608,11 +626,13 @@ encoder_state_recdata_to_bufs(state, lcu, state->tile->hor_buf_search, state->tile->ver_buf_search); - if (encoder->cfg.deblock_enable) { - if (encoder->lcu_dqp_enabled) { - set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, false); - } + if (encoder->max_qp_delta_depth >= 0) { + int last_qp = state->last_qp; + int prev_qp = -1; + set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp); + } + if (encoder->cfg.deblock_enable) { kvz_filter_deblock_lcu(state, lcu->position_px.x, lcu->position_px.y); } @@ -635,9 +655,6 @@ encode_sao(state, lcu->position.x, lcu->position.y, &frame->sao_luma[lcu->position.y * frame->width_in_lcu + lcu->position.x], &frame->sao_chroma[lcu->position.y * frame->width_in_lcu + lcu->position.x]); } - // QP delta is not used when rate control is turned off. 
- state->must_code_qp_delta = encoder->lcu_dqp_enabled; - //Encode coding tree kvz_encode_coding_tree(state, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, 0); @@ -709,7 +726,8 @@ const encoder_control_t *ctrl = state->encoder_control; const kvz_config *cfg = &ctrl->cfg; - state->ref_qp = state->frame->QP; + // Signaled slice QP may be different to frame QP with set-qp-in-cu enabled. + state->last_qp = ctrl->cfg.set_qp_in_cu ? 26 : state->frame->QP; if (cfg->crypto_features) { state->crypto_hdl = kvz_crypto_create(cfg); @@ -784,6 +802,21 @@ dep_lcu = dep_lcu->right; } kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_jobs[dep_lcu->id]); + + // Very spesific bug that happens when owf length is longer than the + // gop length. Takes care of that. + if(!state->encoder_control->cfg.gop_lowdelay && + state->encoder_control->cfg.open_gop && + state->encoder_control->cfg.gop_len != 0 && + state->encoder_control->cfg.owf > state->encoder_control->cfg.gop_len && + ref_state->frame->slicetype == KVZ_SLICE_I && + ref_state->frame->num != 0){ + + while (ref_state->frame->poc != state->frame->poc - state->encoder_control->cfg.gop_len){ + ref_state = ref_state->previous_encoder_state; + } + kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_jobs[dep_lcu->id]); + } } // Add local WPP dependancy to the LCU on the left. @@ -945,13 +978,19 @@ } -static void encoder_ref_insertion_sort(const encoder_state_t *const state, uint8_t reflist[16], uint8_t length) { +static void encoder_ref_insertion_sort(const encoder_state_t *const state, + uint8_t reflist[16], + uint8_t length, + bool reverse) +{ for (uint8_t i = 1; i < length; ++i) { const uint8_t cur_idx = reflist[i]; const int32_t cur_poc = state->frame->ref->pocs[cur_idx]; int8_t j = i; - while (j > 0 && cur_poc > state->frame->ref->pocs[reflist[j - 1]]) { + while ((j > 0 && !reverse && cur_poc > state->frame->ref->pocs[reflist[j - 1]]) || + (j > 0 && reverse && cur_poc < state->frame->ref->pocs[reflist[j - 1]])) + { reflist[j] = reflist[j - 1]; --j; } @@ -966,29 +1005,54 @@ */ void kvz_encoder_create_ref_lists(const encoder_state_t *const state) { - // TODO check possibility to add L0 references to L1 list also - + const kvz_config *cfg = &state->encoder_control->cfg; + FILL_ARRAY(state->frame->ref_LX_size, 0, 2); - // List all pocs of lists - int j = 0; - for (j = 0; j < state->frame->ref->used_size; j++) { - if (state->frame->ref->pocs[j] < state->frame->poc) { - state->frame->ref_LX[0][state->frame->ref_LX_size[0]] = j; - state->frame->ref_LX_size[0] += 1; - } else { - state->frame->ref_LX[1][state->frame->ref_LX_size[1]] = j; + int num_negative = 0; + int num_positive = 0; + + // Add positive references to L1 list + for (int i = 0; i < state->frame->ref->used_size; i++) { + if (state->frame->ref->pocs[i] > state->frame->poc) { + state->frame->ref_LX[1][state->frame->ref_LX_size[1]] = i; state->frame->ref_LX_size[1] += 1; + num_positive++; + } + } + + // Add negative references to L1 list when bipred is enabled and GOP is + // either disabled or does not use picture reordering. + bool l1_negative_refs = + (cfg->bipred && (cfg->gop_len == 0 || cfg->gop_lowdelay)); + + // Add negative references to L0 and L1 lists. 
+ for (int i = 0; i < state->frame->ref->used_size; i++) { + if (state->frame->ref->pocs[i] < state->frame->poc) { + state->frame->ref_LX[0][state->frame->ref_LX_size[0]] = i; + state->frame->ref_LX_size[0] += 1; + if (l1_negative_refs) { + state->frame->ref_LX[1][state->frame->ref_LX_size[1]] = i; + state->frame->ref_LX_size[1] += 1; + } + num_negative++; } } - // Fill the rest with -1s. - for (; j < 16; j++) { - state->frame->ref_LX[0][j] = (uint8_t) -1; - state->frame->ref_LX[1][j] = (uint8_t) -1; + // Fill the rest with -1. + for (int i = state->frame->ref_LX_size[0]; i < 16; i++) { + state->frame->ref_LX[0][i] = 0xff; + } + for (int i = state->frame->ref_LX_size[1]; i < 16; i++) { + state->frame->ref_LX[1][i] = 0xff; } - encoder_ref_insertion_sort(state, state->frame->ref_LX[0], state->frame->ref_LX_size[0]); + // Sort reference lists. + encoder_ref_insertion_sort(state, state->frame->ref_LX[0], num_negative, false); + encoder_ref_insertion_sort(state, state->frame->ref_LX[1], num_positive, true); + if (l1_negative_refs) { + encoder_ref_insertion_sort(state, state->frame->ref_LX[1] + num_positive, num_negative, false); + } } /** @@ -1092,7 +1156,7 @@ if (state->is_leaf) { //Leaf states have cabac and context kvz_cabac_start(&state->cabac); - kvz_init_contexts(state, state->frame->QP, state->frame->slicetype); + kvz_init_contexts(state, state->encoder_control->cfg.set_qp_in_cu ? 26 : state->frame->QP, state->frame->slicetype); } //Clear the jobs @@ -1133,14 +1197,38 @@ state->tile->frame->height ); + // Use this flag to handle closed gop irap picture selection. + // If set to true, irap is already set and we avoid + // setting it based on the intra period + bool is_closed_normal_gop = false; + // Set POC. if (state->frame->num == 0) { state->frame->poc = 0; } else if (cfg->gop_len && !cfg->gop_lowdelay) { - // Calculate POC according to the global frame counter and GOP structure - int32_t poc = state->frame->num - 1; - int32_t poc_offset = cfg->gop[state->frame->gop_offset].poc_offset; - state->frame->poc = poc - poc % cfg->gop_len + poc_offset; + + int32_t framenum = state->frame->num - 1; + // Handle closed GOP + // Closed GOP structure has an extra IDR between the GOPs + if (cfg->intra_period > 0 && !cfg->open_gop) { + is_closed_normal_gop = true; + if (framenum % (cfg->intra_period + 1) == cfg->intra_period) { + // Insert IDR before each new GOP after intra period in closed GOP configuration + state->frame->poc = 0; + } else { + // Calculate frame number again and use that for the POC + framenum = framenum % (cfg->intra_period + 1); + int32_t poc_offset = cfg->gop[state->frame->gop_offset].poc_offset; + state->frame->poc = framenum - framenum % cfg->gop_len + poc_offset; + // This should not be an irap picture in closed GOP + state->frame->is_irap = false; + } + } else { // Open GOP + // Calculate POC according to the global frame counter and GOP structure + int32_t poc_offset = cfg->gop[state->frame->gop_offset].poc_offset; + state->frame->poc = framenum - framenum % cfg->gop_len + poc_offset; + } + kvz_videoframe_set_poc(state->tile->frame, state->frame->poc); } else if (cfg->intra_period > 0) { state->frame->poc = state->frame->num % cfg->intra_period; @@ -1149,9 +1237,9 @@ } // Check whether the frame is a keyframe or not. 
- if (state->frame->num == 0) { + if (state->frame->num == 0 || state->frame->poc == 0) { state->frame->is_irap = true; - } else { + } else if(!is_closed_normal_gop) { // In closed-GOP IDR frames are poc==0 so skip this check state->frame->is_irap = cfg->intra_period > 0 && (state->frame->poc % cfg->intra_period) == 0; @@ -1165,7 +1253,8 @@ if (state->frame->num == 0 || cfg->intra_period == 1 || cfg->gop_len == 0 || - cfg->gop_lowdelay) + cfg->gop_lowdelay || + !cfg->open_gop) // Closed GOP uses IDR pictures { state->frame->pictype = KVZ_NAL_IDR_W_RADL; } else { @@ -1331,3 +1420,27 @@ state->encoder_control->in.width_in_lcu; return &state->frame->lcu_stats[index]; } + +int kvz_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp) +{ + const encoder_control_t *ctrl = state->encoder_control; + const cu_array_t *cua = state->tile->frame->cu_array; + // Quantization group width + const int qg_width = LCU_WIDTH >> MIN(ctrl->max_qp_delta_depth, kvz_cu_array_at_const(cua, x, y)->depth); + + // Coordinates of the top-left corner of the quantization group + const int x_qg = x & ~(qg_width - 1); + const int y_qg = y & ~(qg_width - 1); + + int qp_pred_a = last_qp; + if (x_qg % LCU_WIDTH > 0) { + qp_pred_a = kvz_cu_array_at_const(cua, x_qg - 1, y_qg)->qp; + } + + int qp_pred_b = last_qp; + if (y_qg % LCU_WIDTH > 0) { + qp_pred_b = kvz_cu_array_at_const(cua, x_qg, y_qg - 1)->qp; + } + + return ((qp_pred_a + qp_pred_b + 1) >> 1); +}
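kvz_get_cu_ref_qp at the end of the hunk above predicts the QP of a quantization group as the rounded average of its left and above neighbours, with last_qp standing in for a neighbour that lies outside the current CTU. A compact sketch of just that arithmetic (hypothetical helper, not the kvazaar function):

#include <stdio.h>

/* Rounded average of the left and above predictors; last_qp fills in
 * for a missing neighbour, mirroring the hunk above. */
static int predict_qg_qp(int qp_left, int qp_above, int last_qp,
                         int has_left, int has_above)
{
  int a = has_left  ? qp_left  : last_qp;
  int b = has_above ? qp_above : last_qp;
  return (a + b + 1) >> 1;
}

int main(void)
{
  printf("%d\n", predict_qg_qp(24, 29, 26, 1, 1));  /* (24+29+1)>>1 = 27 */
  printf("%d\n", predict_qg_qp(24, 29, 26, 0, 1));  /* (26+29+1)>>1 = 28 */
  return 0;
}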
View file
kvazaar-1.2.0.tar.gz/src/encoderstate.h -> kvazaar-1.3.0.tar.gz/src/encoderstate.h
Changed
@@ -268,10 +268,17 @@ bool must_code_qp_delta; /** - * \brief Reference for computing QP delta for the next LCU that is coded - * next. Updated whenever a QP delta is coded. + * \brief QP value of the last CU in the last coded quantization group. + * + * A quantization group is a square of width + * (LCU_WIDTH >> encoder_control->max_qp_delta_depth). All CUs of in the + * same quantization group share the QP predictor value, but may have + * different QP values. + * + * Set to the frame QP at the beginning of a wavefront row or a tile and + * updated when the last CU of a quantization group is coded. */ - int8_t ref_qp; + int8_t last_qp; /** * \brief Coeffs for the LCU. @@ -297,6 +304,8 @@ lcu_stats_t* kvz_get_lcu_stats(encoder_state_t *state, int lcu_x, int lcu_y); +int kvz_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp); + /** * Whether the parameter sets should be written with the current frame. */ @@ -309,6 +318,30 @@ (vps_period >= 0 && frame == 0); } + +/** + * \brief Returns true if the CU is the last CU in its containing + * quantization group. + * + * \param state encoder state + * \param x x-coordinate of the left edge of the CU + * \param y y-cooradinate of the top edge of the CU + * \param depth depth in the CU tree + * \return true, if it's the last CU in its QG, otherwise false + */ +static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth) +{ + if (state->encoder_control->max_qp_delta_depth < 0) return false; + + const int cu_width = LCU_WIDTH >> depth; + const int qg_width = LCU_WIDTH >> state->encoder_control->max_qp_delta_depth; + const int right = x + cu_width; + const int bottom = y + cu_width; + return (right % qg_width == 0 || right >= state->tile->frame->width) && + (bottom % qg_width == 0 || bottom >= state->tile->frame->height); +} + + static const uint8_t g_group_idx[32] = { 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8,
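is_last_cu_in_qg above decides when last_qp is updated: a CU closes its quantization group when its right and bottom edges land on the QG grid or on the frame border. A simplified version of that test which ignores the frame-border case, assuming LCU_WIDTH is 64 as in kvazaar:

#include <stdbool.h>
#include <stdio.h>

#define LCU_WIDTH 64

static bool last_in_qg(int x, int y, int cu_width, int max_qp_delta_depth)
{
  /* QG width as in the header above; frame-border handling omitted. */
  const int qg_width = LCU_WIDTH >> max_qp_delta_depth;
  return ((x + cu_width) % qg_width == 0) &&
         ((y + cu_width) % qg_width == 0);
}

int main(void)
{
  /* With max_qp_delta_depth = 1 the QG is 32x32. */
  printf("%d\n", last_in_qg(16, 16, 16, 1));  /* 1: the CU ends at (32, 32) */
  printf("%d\n", last_in_qg(0, 0, 16, 1));    /* 0 */
  return 0;
}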
View file
kvazaar-1.2.0.tar.gz/src/extras/crypto.cpp -> kvazaar-1.3.0.tar.gz/src/extras/crypto.cpp
Changed
@@ -16,10 +16,10 @@ struct crypto_handle_t { cipher_t *cipher; - byte key[CryptoPP::AES::DEFAULT_KEYLENGTH]; - byte iv[CryptoPP::AES::BLOCKSIZE]; - byte out_stream_counter[CryptoPP::AES::BLOCKSIZE]; - byte counter[CryptoPP::AES::BLOCKSIZE]; + unsigned char key[CryptoPP::AES::DEFAULT_KEYLENGTH]; + unsigned char iv[CryptoPP::AES::BLOCKSIZE]; + unsigned char out_stream_counter[CryptoPP::AES::BLOCKSIZE]; + unsigned char counter[CryptoPP::AES::BLOCKSIZE]; int couter_avail; int counter_index; int counter_index_pos;
View file
kvazaar-1.2.0.tar.gz/src/filter.c -> kvazaar-1.3.0.tar.gz/src/filter.c
Changed
@@ -262,7 +262,7 @@ static int8_t get_qp_y_pred(const encoder_state_t* state, int x, int y, edge_dir dir) { - if (!state->encoder_control->lcu_dqp_enabled) { + if (state->encoder_control->max_qp_delta_depth < 0) { return state->qp; } @@ -272,7 +272,8 @@ } else if (dir == EDGE_VER && x > 0) { qp_p = kvz_cu_array_at_const(state->tile->frame->cu_array, x - 1, y)->qp; } else { - qp_p = state->frame->QP; + // TODO: This seems to be dead code. Investigate. + qp_p = state->encoder_control->cfg.set_qp_in_cu ? 26 : state->frame->QP; } const int32_t qp_q =
View file
kvazaar-1.2.0.tar.gz/src/global.h -> kvazaar-1.3.0.tar.gz/src/global.h
Changed
@@ -78,6 +78,12 @@ * Stuff related to multi-threading using pthreads */ + // Pthreads-win32 tries to define timespec even if it has already been defined. + // In Visual Studio 2015 timespec is defined in time.h so we may need to define + // HAVE_STRUCT_TIMESPEC. +#if _MSC_VER >= 1900 && !defined(HAVE_STRUCT_TIMESPEC) +# define HAVE_STRUCT_TIMESPEC +#endif #if defined(_MSC_VER) && defined(_M_AMD64) #define X86_64 @@ -200,7 +206,7 @@ // NOTE: When making a release, check to see if incrementing libversion in // configure.ac is necessary. #ifndef KVZ_VERSION -#define KVZ_VERSION 1.2.0 +#define KVZ_VERSION 1.3.0 #endif #define VERSION_STRING QUOTE_EXPAND(KVZ_VERSION) @@ -233,8 +239,10 @@ #ifdef _MSC_VER // Buggy VS2010 throws intellisense warnings if void* is not casted. #define MALLOC(type, num) (type *)malloc(sizeof(type) * (num)) + #define MALLOC_SIMD_PADDED(type, num, padding) (type *)malloc(sizeof(type) * (num) + (padding)) #else #define MALLOC(type, num) malloc(sizeof(type) * (num)) + #define MALLOC_SIMD_PADDED(type, num, padding) malloc(sizeof(type) * (num) + (padding)) #endif // Use memset through FILL and FILL_ARRAY when appropriate, such as when
View file
kvazaar-1.2.0.tar.gz/src/image.c -> kvazaar-1.3.0.tar.gz/src/image.c
Changed
@@ -47,6 +47,8 @@ assert((width % 2) == 0); assert((height % 2) == 0); + const size_t simd_padding_width = 64; + kvz_picture *im = MALLOC(kvz_picture, 1); if (!im) return NULL; @@ -56,12 +58,13 @@ im->chroma_format = chroma_format; - //Allocate memory - im->fulldata = MALLOC(kvz_pixel, (luma_size + 2 * chroma_size)); - if (!im->fulldata) { + //Allocate memory, pad the full data buffer from both ends + im->fulldata_buf = MALLOC_SIMD_PADDED(kvz_pixel, (luma_size + 2 * chroma_size), simd_padding_width * 2); + if (!im->fulldata_buf) { free(im); return NULL; } + im->fulldata = im->fulldata_buf + simd_padding_width / sizeof(kvz_pixel); im->base_image = im; im->refcount = 1; //We give a reference to caller @@ -110,11 +113,12 @@ // Free our reference to the base image. kvz_image_free(im->base_image); } else { - free(im->fulldata); + free(im->fulldata_buf); } // Make sure freed data won't be used. im->base_image = NULL; + im->fulldata_buf = NULL; im->fulldata = NULL; im->y = im->u = im->v = NULL; im->data[COLOR_Y] = im->data[COLOR_U] = im->data[COLOR_V] = NULL; @@ -128,10 +132,10 @@ */ kvz_picture *kvz_image_copy_ref(kvz_picture *im) { - // The caller should have had another reference. - assert(im->refcount > 0); - KVZ_ATOMIC_INC(&(im->refcount)); - + int32_t new_refcount = KVZ_ATOMIC_INC(&im->refcount); + // The caller should have had another reference and we added one + // reference so refcount should be at least 2. + assert(new_refcount >= 2); return im; } @@ -223,6 +227,15 @@ free(yuv); } +static INLINE uint32_t reg_sad_maybe_optimized(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t width, const int32_t height, const uint32_t stride1, + const uint32_t stride2, optimized_sad_func_ptr_t optimized_sad) +{ + if (optimized_sad != NULL) + return optimized_sad(data1, data2, height, stride1, stride2); + else + return kvz_reg_sad(data1, data2, width, height, stride1, stride2); +} /** * \brief Diagonally interpolate SAD outside the frame. @@ -251,58 +264,6 @@ return sad; } -/** - * \brief Vertically interpolate SAD outside the frame. - * - * \param data1 Starting point of the first picture. - * \param data2 Starting point of the second picture. - * \param width Width of the region for which SAD is calculated. - * \param height Height of the region for which SAD is calculated. - * \param width Width of the pixel array. - * - * \returns Sum of Absolute Differences - */ -static unsigned ver_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data, - int block_width, int block_height, unsigned pic_stride) -{ - int x, y; - unsigned sad = 0; - - for (y = 0; y < block_height; ++y) { - for (x = 0; x < block_width; ++x) { - sad += abs(pic_data[y * pic_stride + x] - ref_data[x]); - } - } - - return sad; -} - -/** - * \brief Horizontally interpolate SAD outside the frame. - * - * \param data1 Starting point of the first picture. - * \param data2 Starting point of the second picture. - * \param width Width of the region for which SAD is calculated. - * \param height Height of the region for which SAD is calculated. - * \param width Width of the pixel array. 
- * - * \returns Sum of Absolute Differences - */ -static unsigned hor_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data, - int block_width, int block_height, unsigned pic_stride, unsigned ref_stride) -{ - int x, y; - unsigned sad = 0; - - for (y = 0; y < block_height; ++y) { - for (x = 0; x < block_width; ++x) { - sad += abs(pic_data[y * pic_stride + x] - ref_data[y * ref_stride]); - } - } - - return sad; -} - /** * \brief Handle special cases of comparing blocks that are not completely @@ -319,7 +280,8 @@ */ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture *ref, int pic_x, int pic_y, int ref_x, int ref_y, - int block_width, int block_height) + int block_width, int block_height, + optimized_sad_func_ptr_t optimized_sad) { kvz_pixel *pic_data, *ref_data; @@ -356,94 +318,86 @@ // that we compare the right part of the block to the ref_data. // - Reduce block_width and block_height so that the the size of the area // being compared is correct. + // + // NOTE: No more correct since hor_sad was modified to be a separate + // strategy if (top && left) { result += cor_sad(pic_data, &ref_data[top * ref->stride + left], left, top, pic->stride); - result += ver_sad(&pic_data[left], + result += kvz_ver_sad(&pic_data[left], &ref_data[top * ref->stride + left], block_width - left, top, pic->stride); - result += hor_sad(&pic_data[top * pic->stride], - &ref_data[top * ref->stride + left], - left, block_height - top, pic->stride, ref->stride); - result += kvz_reg_sad(&pic_data[top * pic->stride + left], - &ref_data[top * ref->stride + left], - block_width - left, block_height - top, pic->stride, ref->stride); + + result += kvz_hor_sad(pic_data + top * pic->stride, + ref_data + top * ref->stride, + block_width, block_height - top, + pic->stride, ref->stride, + left, right); + } else if (top && right) { - result += ver_sad(pic_data, + result += kvz_ver_sad(pic_data, &ref_data[top * ref->stride], block_width - right, top, pic->stride); result += cor_sad(&pic_data[block_width - right], &ref_data[top * ref->stride + (block_width - right - 1)], right, top, pic->stride); - result += kvz_reg_sad(&pic_data[top * pic->stride], - &ref_data[top * ref->stride], - block_width - right, block_height - top, pic->stride, ref->stride); - result += hor_sad(&pic_data[top * pic->stride + (block_width - right)], - &ref_data[top * ref->stride + (block_width - right - 1)], - right, block_height - top, pic->stride, ref->stride); + + result += kvz_hor_sad(pic_data + top * pic->stride, + ref_data + top * ref->stride, + block_width, block_height - top, + pic->stride, ref->stride, + left, right); + } else if (bottom && left) { - result += hor_sad(pic_data, - &ref_data[left], - left, block_height - bottom, pic->stride, ref->stride); - result += kvz_reg_sad(&pic_data[left], - &ref_data[left], - block_width - left, block_height - bottom, pic->stride, ref->stride); + result += kvz_hor_sad(pic_data, ref_data, block_width, block_height - bottom, + pic->stride, ref->stride, left, right); + result += cor_sad(&pic_data[(block_height - bottom) * pic->stride], &ref_data[(block_height - bottom - 1) * ref->stride + left], left, bottom, pic->stride); - result += ver_sad(&pic_data[(block_height - bottom) * pic->stride + left], + result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride + left], &ref_data[(block_height - bottom - 1) * ref->stride + left], block_width - left, bottom, pic->stride); } else if (bottom && right) { - result += kvz_reg_sad(pic_data, - ref_data, - block_width - right, 
block_height - bottom, pic->stride, ref->stride); - result += hor_sad(&pic_data[block_width - right], - &ref_data[block_width - right - 1], - right, block_height - bottom, pic->stride, ref->stride); - result += ver_sad(&pic_data[(block_height - bottom) * pic->stride], + result += kvz_hor_sad(pic_data, ref_data, block_width, block_height - bottom, + pic->stride, ref->stride, left, right); + + result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride], &ref_data[(block_height - bottom - 1) * ref->stride], block_width - right, bottom, pic->stride); result += cor_sad(&pic_data[(block_height - bottom) * pic->stride + block_width - right], &ref_data[(block_height - bottom - 1) * ref->stride + block_width - right - 1], right, bottom, pic->stride); } else if (top) { - result += ver_sad(pic_data, + result += kvz_ver_sad(pic_data, &ref_data[top * ref->stride], block_width, top, pic->stride); - result += kvz_reg_sad(&pic_data[top * pic->stride], + result += reg_sad_maybe_optimized(&pic_data[top * pic->stride], &ref_data[top * ref->stride], - block_width, block_height - top, pic->stride, ref->stride); + block_width, block_height - top, pic->stride, ref->stride, + optimized_sad); } else if (bottom) { - result += kvz_reg_sad(pic_data, + result += reg_sad_maybe_optimized(pic_data, ref_data, - block_width, block_height - bottom, pic->stride, ref->stride); - result += ver_sad(&pic_data[(block_height - bottom) * pic->stride], + block_width, block_height - bottom, pic->stride, ref->stride, + optimized_sad); + result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride], &ref_data[(block_height - bottom - 1) * ref->stride], block_width, bottom, pic->stride); - } else if (left) { - result += hor_sad(pic_data, - &ref_data[left], - left, block_height, pic->stride, ref->stride); - result += kvz_reg_sad(&pic_data[left], - &ref_data[left], - block_width - left, block_height, pic->stride, ref->stride); - } else if (right) { - result += kvz_reg_sad(pic_data, - ref_data, - block_width - right, block_height, pic->stride, ref->stride); - result += hor_sad(&pic_data[block_width - right], - &ref_data[block_width - right - 1], - right, block_height, pic->stride, ref->stride); + } else if (left | right) { + result += kvz_hor_sad(pic_data, ref_data, + block_width, block_height, pic->stride, + ref->stride, left, right); } else { - result += kvz_reg_sad(pic_data, ref_data, block_width, block_height, pic->stride, ref->stride); + result += reg_sad_maybe_optimized(pic_data, ref_data, + block_width, block_height, + pic->stride, ref->stride, + optimized_sad); } - return result; } - /** * \brief Calculate interpolated SAD between two blocks. * @@ -459,11 +413,14 @@ int ref_x, int ref_y, int block_width, - int block_height) + int block_height, + optimized_sad_func_ptr_t optimized_sad) { assert(pic_x >= 0 && pic_x <= pic->width - block_width); assert(pic_y >= 0 && pic_y <= pic->height - block_height); + uint32_t res; + if (ref_x >= 0 && ref_x <= ref->width - block_width && ref_y >= 0 && ref_y <= ref->height - block_height) { @@ -471,11 +428,19 @@ // SAD directly. This is the most common case, which is why it's first. 
const kvz_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x]; const kvz_pixel *ref_data = &ref->y[ref_y * ref->stride + ref_x]; - return kvz_reg_sad(pic_data, ref_data, block_width, block_height, pic->stride, ref->stride)>>(KVZ_BIT_DEPTH-8); + + res = reg_sad_maybe_optimized(pic_data, + ref_data, + block_width, + block_height, + pic->stride, + ref->stride, + optimized_sad); } else { // Call a routine that knows how to interpolate pixels outside the frame. - return image_interpolated_sad(pic, ref, pic_x, pic_y, ref_x, ref_y, block_width, block_height) >> (KVZ_BIT_DEPTH - 8); + res = image_interpolated_sad(pic, ref, pic_x, pic_y, ref_x, ref_y, block_width, block_height, optimized_sad); } + return res >> (KVZ_BIT_DEPTH - 8); }
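The image allocation change earlier in this hunk over-allocates the pixel buffer by 64 bytes at each end (fulldata_buf) and hands out an offset pointer (fulldata), so SIMD SAD routines that read slightly past a row edge stay inside the allocation; kvz_image_free then releases fulldata_buf rather than fulldata. A minimal sketch of that pattern with hypothetical helper names:

#include <stdint.h>
#include <stdlib.h>

#define SIMD_PAD 64

/* Returns a pointer SIMD_PAD bytes into an over-allocated buffer and
 * reports the base pointer, which is the one that must be freed. */
static uint8_t *alloc_padded(size_t num_pixels, uint8_t **base_out)
{
  uint8_t *base = malloc(num_pixels + 2 * SIMD_PAD);
  if (!base) return NULL;
  *base_out = base;
  return base + SIMD_PAD;
}

int main(void)
{
  uint8_t *base = NULL;
  uint8_t *pixels = alloc_padded(64 * 64, &base);
  if (pixels) free(base);  /* free the base pointer, not the offset one */
  return 0;
}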
View file
kvazaar-1.2.0.tar.gz/src/image.h -> kvazaar-1.3.0.tar.gz/src/image.h
Changed
@@ -29,6 +29,7 @@
 #include "global.h" // IWYU pragma: keep
 #include "kvazaar.h"
+#include "strategies/optimized_sad_func_ptr_t.h"

 typedef struct {
@@ -81,7 +82,8 @@
                            int ref_x,
                            int ref_y,
                            int block_width,
-                           int block_height);
+                           int block_height,
+                           optimized_sad_func_ptr_t optimized_sad);

 unsigned kvz_image_calc_satd(const kvz_picture *pic,
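The only change to the public helper here is the extra optimized_sad argument on kvz_image_calc_sad; the dispatch it enables is sketched after the image.c diff above, so no separate example is needed for the header itself beyond noting that callers now pass either a specialised SAD routine or NULL.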
View file
kvazaar-1.2.0.tar.gz/src/input_frame_buffer.c -> kvazaar-1.3.0.tar.gz/src/input_frame_buffer.c
Changed
@@ -58,6 +58,11 @@ const int gop_buf_size = 3 * cfg->gop_len; + bool is_closed_gop = false; + + // Check for closed gop, we need an extra frame in the buffer in this case + if (!cfg->open_gop && cfg->intra_period > 0 && cfg->gop_len > 0) is_closed_gop = true; + if (cfg->gop_len == 0 || cfg->gop_lowdelay) { // No reordering of output pictures necessary. @@ -94,11 +99,11 @@ buf->pts_buffer[buf_idx] = img_in->pts; buf->num_in++; - if (buf->num_in < cfg->gop_len) { + if (buf->num_in < cfg->gop_len + is_closed_gop ? 1 : 0) { // Not enough frames to start output. return 0; - } else if (buf->num_in == cfg->gop_len) { + } else if (buf->num_in == cfg->gop_len + is_closed_gop ? 1 : 0) { // Now we known the PTSs that are needed to compute the delay. buf->delay = buf->pts_buffer[gop_buf_size - 1] - img_in->pts; } @@ -109,7 +114,7 @@ return NULL; } - if (img_in == NULL && buf->num_in < cfg->gop_len) { + if (img_in == NULL && buf->num_in < cfg->gop_len + is_closed_gop ? 1 : 0) { // End of the sequence but we have less than a single GOP of frames. Use // the difference between the PTSs of the first and the last frame as the // delay. @@ -137,22 +142,35 @@ } else { gop_offset = (buf->num_out - 1) % cfg->gop_len; + + // For closed gop, calculate the gop_offset again + if (!cfg->open_gop && cfg->intra_period > 0) { + // Offset the GOP position for each extra I-frame added to the structure + // in closed gop case + int num_extra_frames = (buf->num_out - 1) / (cfg->intra_period + 1); + gop_offset = (buf->num_out - 1 - num_extra_frames) % cfg->gop_len; + } // Index of the first picture in the GOP that is being output. int gop_start_idx = buf->num_out - 1 - gop_offset; // Skip pictures until we find an available one. gop_offset += buf->gop_skipped; - for (;;) { - assert(gop_offset < cfg->gop_len); - idx_out = gop_start_idx + cfg->gop[gop_offset].poc_offset - 1; - if (idx_out < buf->num_in - 1) { - // An available picture found. - break; + // Every closed-gop IRAP handled here + if (is_closed_gop && (!cfg->open_gop && ((buf->num_out - 1) % (cfg->intra_period + 1)) == cfg->intra_period)) { + idx_out = gop_start_idx; + } else { + for (;;) { + assert(gop_offset < cfg->gop_len + is_closed_gop ? 1 : 0); + idx_out = gop_start_idx + cfg->gop[gop_offset].poc_offset - 1; + if (idx_out < buf->num_in - 1) { + // An available picture found. + break; + } + buf->gop_skipped++; + gop_offset++; } - buf->gop_skipped++; - gop_offset++; } if (buf->num_out < cfg->gop_len - 1) {
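The input_frame_buffer.c change makes a closed-GOP configuration (open_gop disabled with a nonzero intra period) buffer one extra input frame before output can start. A minimal stand-alone illustration of that threshold rule, under the assumption that this is all the new is_closed_gop flag has to express:

#include <stdbool.h>
#include <stdio.h>

/* Simplified view of the fill rule: with a closed GOP, one extra input frame
 * must be buffered before the reordering delay can be computed and output
 * can begin. */
static int frames_needed_before_output(int gop_len, bool open_gop, int intra_period)
{
  bool is_closed_gop = !open_gop && intra_period > 0 && gop_len > 0;
  return gop_len + (is_closed_gop ? 1 : 0);
}

int main(void)
{
  printf("open GOP,   gop_len 8: need %d frames\n", frames_needed_before_output(8, true, 64));
  printf("closed GOP, gop_len 8: need %d frames\n", frames_needed_before_output(8, false, 64));
  return 0;
}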
View file
kvazaar-1.2.0.tar.gz/src/inter.c -> kvazaar-1.3.0.tar.gz/src/inter.c
Changed
@@ -29,6 +29,7 @@ #include "strategies/generic/picture-generic.h" #include "strategies/strategies-ipol.h" #include "videoframe.h" +#include "strategies/strategies-picture.h" typedef struct { @@ -51,8 +52,6 @@ int mv_frac_x = (mv_param[0] & 3); int mv_frac_y = (mv_param[1] & 3); - #define FILTER_SIZE_Y 8 //Luma filter size - // Fractional luma 1/4-pel kvz_extended_block src = {0, 0, 0, 0}; @@ -66,7 +65,7 @@ ref->y, ref->width, ref->height, - FILTER_SIZE_Y, + KVZ_LUMA_FILTER_TAPS, block_width, block_height, &src); @@ -75,7 +74,7 @@ src.stride, block_width, block_height, - lcu->rec.y + (ypos%LCU_WIDTH)*LCU_WIDTH + (xpos%LCU_WIDTH), + lcu->rec.y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH), LCU_WIDTH, mv_frac_x, mv_frac_y, @@ -96,8 +95,6 @@ int mv_frac_x = (mv_param[0] & 3); int mv_frac_y = (mv_param[1] & 3); -#define FILTER_SIZE_Y 8 //Luma filter size - // Fractional luma 1/4-pel kvz_extended_block src = { 0, 0, 0, 0 }; @@ -111,7 +108,7 @@ ref->y, ref->width, ref->height, - FILTER_SIZE_Y, + KVZ_LUMA_FILTER_TAPS, block_width, block_height, &src); @@ -120,7 +117,7 @@ src.stride, block_width, block_height, - hi_prec_out->y + (ypos%LCU_WIDTH)*LCU_WIDTH + (xpos%LCU_WIDTH), + hi_prec_out->y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH), LCU_WIDTH, mv_frac_x, mv_frac_y, @@ -147,8 +144,6 @@ block_width >>= 1; block_height >>= 1; -#define FILTER_SIZE_C 4 //Chroma filter size - // Fractional chroma 1/8-pel kvz_extended_block src_u = { 0, 0, 0, 0 }; kvz_extended_block src_v = { 0, 0, 0, 0 }; @@ -162,7 +157,7 @@ ref->u, ref->width >> 1, ref->height >> 1, - FILTER_SIZE_C, + KVZ_CHROMA_FILTER_TAPS, block_width, block_height, &src_u); @@ -178,12 +173,12 @@ ref->v, ref->width >> 1, ref->height >> 1, - FILTER_SIZE_C, + KVZ_CHROMA_FILTER_TAPS, block_width, block_height, &src_v); kvz_sample_octpel_chroma(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width, - block_height, lcu->rec.v + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param); + block_height, lcu->rec.v + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param); if (src_u.malloc_used) free(src_u.buffer); if (src_v.malloc_used) free(src_v.buffer); @@ -207,8 +202,6 @@ block_width >>= 1; block_height >>= 1; -#define FILTER_SIZE_C 4 //Chroma filter size - // Fractional chroma 1/8-pel kvz_extended_block src_u = { 0, 0, 0, 0 }; kvz_extended_block src_v = { 0, 0, 0, 0 }; @@ -223,7 +216,7 @@ ref->u, ref->width >> 1, ref->height >> 1, - FILTER_SIZE_C, + KVZ_CHROMA_FILTER_TAPS, block_width, block_height, &src_u); @@ -232,7 +225,7 @@ src_u.stride, block_width, block_height, - hi_prec_out->u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), + hi_prec_out->u + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, @@ -248,7 +241,7 @@ ref->v, ref->width >> 1, ref->height >> 1, - FILTER_SIZE_C, + KVZ_CHROMA_FILTER_TAPS, block_width, block_height, &src_v); @@ -257,7 +250,7 @@ src_v.stride, block_width, block_height, - hi_prec_out->v + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), + hi_prec_out->v + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, @@ -306,27 +299,27 @@ /** - * \brief Reconstruct inter block + * \brief Reconstruct an inter PU using uniprediction. 
* * \param state encoder state * \param ref picture to copy the data from - * \param xpos block x position - * \param ypos block y position - * \param width block width - * \param height block height + * \param xpos PU x position + * \param ypos PU y position + * \param width PU width + * \param height PU height * \param mv_param motion vector * \param lcu destination lcu - * \param hi_prec_out destination of high precision output (null if not needed) + * \param hi_prec_out destination of high precision output, or NULL if not needed */ -void kvz_inter_recon_lcu(const encoder_state_t * const state, - const kvz_picture * const ref, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - const int16_t mv_param[2], - lcu_t *lcu, - hi_prec_buf_t *hi_prec_out) +static void inter_recon_unipred(const encoder_state_t * const state, + const kvz_picture * const ref, + int32_t xpos, + int32_t ypos, + int32_t width, + int32_t height, + const int16_t mv_param[2], + lcu_t *lcu, + hi_prec_buf_t *hi_prec_out) { const vector2d_t pu_in_tile = { xpos, ypos }; const vector2d_t pu_in_lcu = { xpos % LCU_WIDTH, ypos % LCU_WIDTH }; @@ -426,36 +419,32 @@ } } } - /** - * \brief Reconstruct bi-pred inter block + * \brief Reconstruct bi-pred inter PU * * \param state encoder state * \param ref1 reference picture to copy the data from * \param ref2 other reference picture to copy the data from - * \param xpos block x position - * \param ypos block y position - * \param width block width - * \param height block height + * \param xpos PU x position + * \param ypos PU y position + * \param width PU width + * \param height PU height * \param mv_param motion vectors * \param lcu destination lcu */ -void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state, - const kvz_picture * ref1, - const kvz_picture * ref2, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - int16_t mv_param[2][2], - lcu_t* lcu) +void kvz_inter_recon_bipred(const encoder_state_t * const state, + const kvz_picture * ref1, + const kvz_picture * ref2, + int32_t xpos, + int32_t ypos, + int32_t width, + int32_t height, + int16_t mv_param[2][2], + lcu_t* lcu) { kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH]; kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C]; kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C]; - int temp_x, temp_y; - int shift = 15 - KVZ_BIT_DEPTH; - int offset = 1 << (shift - 1); const int hi_prec_luma_rec0 = mv_param[0][0] & 3 || mv_param[0][1] & 3; const int hi_prec_luma_rec1 = mv_param[1][0] & 3 || mv_param[1][1] & 3; @@ -467,43 +456,87 @@ hi_prec_buf_t* high_precision_rec1 = 0; if (hi_prec_chroma_rec0) high_precision_rec0 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH); if (hi_prec_chroma_rec1) high_precision_rec1 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH); + + //Reconstruct both predictors - kvz_inter_recon_lcu(state, ref1, xpos, ypos, width, height, mv_param[0], lcu, high_precision_rec0); + inter_recon_unipred(state, ref1, xpos, ypos, width, height, mv_param[0], lcu, high_precision_rec0); if (!hi_prec_luma_rec0){ - memcpy(temp_lcu_y, lcu->rec.y, sizeof(kvz_pixel) * 64 * 64); + memcpy(temp_lcu_y, lcu->rec.y, sizeof(kvz_pixel) * 64 * 64); // copy to temp_lcu_y } if (!hi_prec_chroma_rec0){ - memcpy(temp_lcu_u, lcu->rec.u, sizeof(kvz_pixel) * 32 * 32); - memcpy(temp_lcu_v, lcu->rec.v, sizeof(kvz_pixel) * 32 * 32); + memcpy(temp_lcu_u, lcu->rec.u, sizeof(kvz_pixel) * 32 * 32); // copy to temp_lcu_u + memcpy(temp_lcu_v, lcu->rec.v, sizeof(kvz_pixel) * 32 * 32); // copy to temp_lcu_v } - kvz_inter_recon_lcu(state, 
ref2, xpos, ypos, width, height, mv_param[1], lcu, high_precision_rec1); + inter_recon_unipred(state, ref2, xpos, ypos, width, height, mv_param[1], lcu, high_precision_rec1); // After reconstruction, merge the predictors by taking an average of each pixel - for (temp_y = 0; temp_y < height; ++temp_y) { - int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); - for (temp_x = 0; temp_x < width; ++temp_x) { - int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); - int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); - } + kvz_inter_recon_bipred_blend(hi_prec_luma_rec0, hi_prec_luma_rec1, hi_prec_chroma_rec0, hi_prec_chroma_rec1, height, width, ypos, xpos, high_precision_rec0, high_precision_rec1, lcu, temp_lcu_y, temp_lcu_u, temp_lcu_v); + + if (high_precision_rec0 != 0) kvz_hi_prec_buf_t_free(high_precision_rec0); + if (high_precision_rec1 != 0) kvz_hi_prec_buf_t_free(high_precision_rec1); +} - } - for (temp_y = 0; temp_y < height >> 1; ++temp_y) { - int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); - for (temp_x = 0; temp_x < width >> 1; ++temp_x) { - int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); - int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); - - int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); + +/** + * Reconstruct a single CU. + * + * The CU may consist of multiple PUs, each of which can use either + * uniprediction or biprediction. 
+ * + * \param state encoder state + * \param lcu containing LCU + * \param x x-coordinate of the CU in pixels + * \param y y-coordinate of the CU in pixels + * \param width CU width + */ +void kvz_inter_recon_cu(const encoder_state_t * const state, + lcu_t *lcu, + int32_t x, + int32_t y, + int32_t width) +{ + cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + + const int num_pu = kvz_part_mode_num_parts[cu->part_size]; + for (int i = 0; i < num_pu; ++i) { + const int pu_x = PU_GET_X(cu->part_size, width, x, i); + const int pu_y = PU_GET_Y(cu->part_size, width, y, i); + const int pu_w = PU_GET_W(cu->part_size, width, i); + const int pu_h = PU_GET_H(cu->part_size, width, i); + + cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); + + if (pu->inter.mv_dir == 3) { + const kvz_picture *const refs[2] = { + state->frame->ref->images[ + state->frame->ref_LX[0][ + pu->inter.mv_ref[0]]], + state->frame->ref->images[ + state->frame->ref_LX[1][ + pu->inter.mv_ref[1]]], + }; + kvz_inter_recon_bipred(state, + refs[0], refs[1], + pu_x, pu_y, + pu_w, pu_h, + pu->inter.mv, + lcu); + } else { + const int mv_idx = pu->inter.mv_dir - 1; + const kvz_picture *const ref = + state->frame->ref->images[ + state->frame->ref_LX[mv_idx][ + pu->inter.mv_ref[mv_idx]]]; + + inter_recon_unipred(state, + ref, + pu_x, pu_y, + pu_w, pu_h, + pu->inter.mv[mv_idx], + lcu, + NULL); } } - if (high_precision_rec0 != 0) kvz_hi_prec_buf_t_free(high_precision_rec0); - if (high_precision_rec1 != 0) kvz_hi_prec_buf_t_free(high_precision_rec1); } /** @@ -996,11 +1029,15 @@ // in L0 or L1, the primary list for the colocated PU is the inverse of // collocated_from_l0_flag. Otherwise it is equal to reflist. // - // In Kvazaar, the L1 list is only used for future pictures and the slice - // type is set to KVZ_SLICE_B if and only if L1 is used. Therefore we can - // simply check the slice type here. Kvazaar always sets - // collocated_from_l0_flag so the list is L1 for B-slices. - int col_list = state->frame->slicetype == KVZ_SLICE_P ? reflist : 1; + // Kvazaar always sets collocated_from_l0_flag so the list is L1 when + // there are future references. + int col_list = reflist; + for (int i = 0; i < state->frame->ref->used_size; i++) { + if (state->frame->ref->pocs[i] > state->frame->poc) { + col_list = 1; + break; + } + } if ((colocated->inter.mv_dir & (col_list + 1)) == 0) { // Use the other list if the colocated PU does not have a MV for the @@ -1033,22 +1070,27 @@ if (!cand) return false; assert(cand->inter.mv_dir != 0); - const int cand_list = cand->inter.mv_dir & (1 << reflist) ? reflist : !reflist; - if (scaling) { - mv_cand_out[0] = cand->inter.mv[cand_list][0]; - mv_cand_out[1] = cand->inter.mv[cand_list][1]; - apply_mv_scaling(state, cur_cu, cand, reflist, cand_list, mv_cand_out); - return true; - } + for (int i = 0; i < 2; i++) { + const int cand_list = i == 0 ? 
reflist : !reflist; - if (cand->inter.mv_dir & (1 << cand_list) && - state->frame->ref_LX[cand_list][cand->inter.mv_ref[cand_list]] == - state->frame->ref_LX[reflist][cur_cu->inter.mv_ref[reflist]]) - { - mv_cand_out[0] = cand->inter.mv[cand_list][0]; - mv_cand_out[1] = cand->inter.mv[cand_list][1]; - return true; + if ((cand->inter.mv_dir & (1 << cand_list)) == 0) continue; + + if (scaling) { + mv_cand_out[0] = cand->inter.mv[cand_list][0]; + mv_cand_out[1] = cand->inter.mv[cand_list][1]; + apply_mv_scaling(state, cur_cu, cand, reflist, cand_list, mv_cand_out); + return true; + } + + if (cand->inter.mv_dir & (1 << cand_list) && + state->frame->ref_LX[cand_list][cand->inter.mv_ref[cand_list]] == + state->frame->ref_LX[reflist][cur_cu->inter.mv_ref[reflist]]) + { + mv_cand_out[0] = cand->inter.mv[cand_list][0]; + mv_cand_out[1] = cand->inter.mv[cand_list][1]; + return true; + } } return false; @@ -1238,11 +1280,14 @@ static bool add_merge_candidate(const cu_info_t *cand, const cu_info_t *possible_duplicate1, const cu_info_t *possible_duplicate2, - inter_merge_cand_t *merge_cand_out) + inter_merge_cand_t *merge_cand_out, + uint8_t candidates, + uint8_t max_num_cands) { if (!cand || is_duplicate_candidate(cand, possible_duplicate1) || - is_duplicate_candidate(cand, possible_duplicate2)) { + is_duplicate_candidate(cand, possible_duplicate2) || + candidates >= max_num_cands) { return false; } @@ -1280,7 +1325,7 @@ int8_t zero_idx = 0; merge_candidates_t merge_cand = { {0, 0}, {0, 0, 0}, 0, 0 }; - + const uint8_t max_num_cands = state->encoder_control->cfg.max_merge; get_spatial_merge_candidates(x, y, width, height, state->tile->frame->width, state->tile->frame->height, @@ -1293,16 +1338,16 @@ if (!use_a1) a[1] = NULL; if (!use_b1) b[1] = NULL; - if (add_merge_candidate(a[1], NULL, NULL, &mv_cand[candidates])) candidates++; - if (add_merge_candidate(b[1], a[1], NULL, &mv_cand[candidates])) candidates++; - if (add_merge_candidate(b[0], b[1], NULL, &mv_cand[candidates])) candidates++; - if (add_merge_candidate(a[0], a[1], NULL, &mv_cand[candidates])) candidates++; + if (add_merge_candidate(a[1], NULL, NULL, &mv_cand[candidates], candidates, max_num_cands)) candidates++; + if (add_merge_candidate(b[1], a[1], NULL, &mv_cand[candidates], candidates, max_num_cands)) candidates++; + if (add_merge_candidate(b[0], b[1], NULL, &mv_cand[candidates], candidates, max_num_cands)) candidates++; + if (add_merge_candidate(a[0], a[1], NULL, &mv_cand[candidates], candidates, max_num_cands)) candidates++; if (candidates < 4 && - add_merge_candidate(b[2], a[1], b[1], &mv_cand[candidates])) candidates++; + add_merge_candidate(b[2], a[1], b[1], &mv_cand[candidates], candidates, max_num_cands)) candidates++; bool can_use_tmvp = state->encoder_control->cfg.tmvp_enable && - candidates < MRG_MAX_NUM_CANDS && + candidates < max_num_cands && state->frame->ref->used_size; if (can_use_tmvp) { @@ -1333,12 +1378,12 @@ if (mv_cand[candidates].dir != 0) candidates++; } - if (candidates < MRG_MAX_NUM_CANDS && state->frame->slicetype == KVZ_SLICE_B) { + if (candidates < max_num_cands && state->frame->slicetype == KVZ_SLICE_B) { #define NUM_PRIORITY_LIST 12; static const uint8_t priorityList0[] = { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 }; static const uint8_t priorityList1[] = { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 }; uint8_t cutoff = candidates; - for (int32_t idx = 0; idx<cutoff*(cutoff - 1) && candidates != MRG_MAX_NUM_CANDS; idx++) { + for (int32_t idx = 0; idx<cutoff*(cutoff - 1) && candidates != max_num_cands; idx++) { uint8_t i = 
priorityList0[idx]; uint8_t j = priorityList1[idx]; if (i >= candidates || j >= candidates) break; @@ -1370,7 +1415,7 @@ int num_ref = state->frame->ref->used_size; - if (candidates < MRG_MAX_NUM_CANDS && state->frame->slicetype == KVZ_SLICE_B) { + if (candidates < max_num_cands && state->frame->slicetype == KVZ_SLICE_B) { int j; int ref_negative = 0; int ref_positive = 0; @@ -1385,7 +1430,7 @@ } // Add (0,0) prediction - while (candidates != MRG_MAX_NUM_CANDS) { + while (candidates != max_num_cands) { mv_cand[candidates].mv[0][0] = 0; mv_cand[candidates].mv[0][1] = 0; mv_cand[candidates].ref[0] = (zero_idx >= num_ref - 1) ? 0 : zero_idx;
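The per-pixel averaging loop removed from the bi-prediction function above now sits behind kvz_inter_recon_bipred_blend, but the arithmetic is unchanged: both predictions are brought to 14-bit precision and averaged with a rounding offset before clipping. A small self-contained version of that blend for a single sample, assuming 8-bit content:

#include <stdint.h>
#include <stdio.h>

#define BIT_DEPTH 8 /* assumption: 8-bit pixels, matching KVZ_BIT_DEPTH = 8 */

static uint8_t clip_pixel(int32_t v)
{
  if (v < 0) return 0;
  if (v > 255) return 255;
  return (uint8_t)v;
}

/* Mirrors the removed loop: (sample0 + sample1 + offset) >> shift, where both
 * samples are already at 14-bit precision. */
static uint8_t bipred_blend_sample(int16_t sample0_14bit, int16_t sample1_14bit)
{
  const int shift = 15 - BIT_DEPTH;    /* 7 for 8-bit content */
  const int offset = 1 << (shift - 1); /* rounding term */
  return clip_pixel((sample0_14bit + sample1_14bit + offset) >> shift);
}

int main(void)
{
  /* A plain 8-bit sample is promoted by (14 - BIT_DEPTH) bits before blending. */
  int16_t a = 100 << (14 - BIT_DEPTH);
  int16_t b = 103 << (14 - BIT_DEPTH);
  printf("blended: %d\n", bipred_blend_sample(a, b)); /* 102, the rounded average */
  return 0;
}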
View file
kvazaar-1.2.0.tar.gz/src/inter.h -> kvazaar-1.3.0.tar.gz/src/inter.h
Changed
@@ -40,26 +40,22 @@ } inter_merge_cand_t; +void kvz_inter_recon_cu(const encoder_state_t * const state, + lcu_t *lcu, + int32_t x, + int32_t y, + int32_t width); -void kvz_inter_recon_lcu(const encoder_state_t * const state, - const kvz_picture * ref, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - const int16_t mv_param[2], - lcu_t* lcu, - hi_prec_buf_t *hi_prec_out); +void kvz_inter_recon_bipred(const encoder_state_t * const state, + const kvz_picture * ref1, + const kvz_picture * ref2, + int32_t xpos, + int32_t ypos, + int32_t width, + int32_t height, + int16_t mv_param[2][2], + lcu_t* lcu); -void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state, - const kvz_picture * ref1, - const kvz_picture * ref2, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - int16_t mv_param[2][2], - lcu_t* lcu); void kvz_inter_get_mv_cand(const encoder_state_t * const state, int32_t x,
View file
kvazaar-1.2.0.tar.gz/src/kvazaar.c -> kvazaar-1.3.0.tar.gz/src/kvazaar.c
Changed
@@ -142,8 +142,8 @@
   info->nal_unit_type = state->frame->pictype;
   info->slice_type = state->frame->slicetype;
-  memset(info->ref_list[0], 0, 16);
-  memset(info->ref_list[1], 0, 16);
+  memset(info->ref_list[0], 0, 16 * sizeof(int));
+  memset(info->ref_list[1], 0, 16 * sizeof(int));
   for (size_t i = 0; i < state->frame->ref_LX_size[0]; i++) {
     info->ref_list[0][i] = state->frame->ref->pocs[state->frame->ref_LX[0][i]];
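The kvazaar.c fix above corrects a classic memset pitfall: the third argument is a byte count, so clearing sixteen int entries takes 16 * sizeof(int) bytes, not 16. A tiny demonstration:

#include <stdio.h>
#include <string.h>

int main(void)
{
  int ref_list[16];

  /* memset counts bytes, not elements: passing 16 would zero only the first
   * four ints on a platform with 4-byte int. */
  memset(ref_list, 0, 16 * sizeof(int));

  printf("last entry: %d\n", ref_list[15]); /* 0, the whole array is cleared */
  return 0;
}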
View file
kvazaar-1.2.0.tar.gz/src/kvazaar.h -> kvazaar-1.3.0.tar.gz/src/kvazaar.h
Changed
@@ -92,6 +92,7 @@ KVZ_IME_FULL16 = 4, //! \since 3.6.0 KVZ_IME_FULL32 = 5, //! \since 3.6.0 KVZ_IME_FULL64 = 6, //! \since 3.6.0 + KVZ_IME_DIA = 7, // Experimental. TODO: change into a proper doc comment }; /** @@ -206,6 +207,12 @@ KVZ_SAO_FULL = 3 }; +enum kvz_scalinglist { + KVZ_SCALING_LIST_OFF = 0, + KVZ_SCALING_LIST_CUSTOM = 1, + KVZ_SCALING_LIST_DEFAULT = 2, +}; + // Map from input format to chroma format. #define KVZ_FORMAT2CSP(format) ((enum kvz_chroma_format)"\0\1\2\3"[format]) @@ -322,6 +329,7 @@ uint8_t *optional_key; enum kvz_me_early_termination me_early_termination; /*!< \since 3.8.0 \brief Mode of me early termination. */ + int32_t intra_rdo_et; /*!< \since 4.1.0 \brief Use early termination in intra rdo. */ int32_t lossless; /*!< \brief Use lossless coding. */ @@ -351,6 +359,37 @@ * \brief Use adaptive QP for 360 video with equirectangular projection. */ int32_t erp_aqp; + + /** \brief The HEVC level */ + uint8_t level; + /** \brief Whether we ignore and just warn from all of the errors about the output not conforming to the level's requirements. */ + uint8_t force_level; + /** \brief Whether we use the high tier bitrates. Requires the level to be 4 or higher. */ + uint8_t high_tier; + /** \brief The maximum allowed bitrate for this level and tier. */ + uint32_t max_bitrate; + + /** \brief Maximum steps that hexagonal and diagonal motion estimation can use. -1 to disable */ + uint32_t me_max_steps; + + /** \brief Minimum QP that uses CABAC for residual cost instead of a fast estimate. */ + int8_t fast_residual_cost_limit; + + /** \brief Set QP at CU level keeping pic_init_qp_minus26 in PPS zero */ + int8_t set_qp_in_cu; + + /** \brief Flag to enable/disable open GOP configuration */ + int8_t open_gop; + + /** \brief Type of scaling lists to use */ + int8_t scaling_list; + + /** \brief Maximum number of merge cadidates */ + uint8_t max_merge; + + /** \brief Enable Early Skip Mode Decision */ + uint8_t early_skip; + } kvz_config; /** @@ -359,7 +398,8 @@ * Function picture_alloc in kvz_api must be used for allocation. */ typedef struct kvz_picture { - kvz_pixel *fulldata; //!< \brief Allocated buffer (only used in the base_image) + kvz_pixel *fulldata_buf; //!< \brief Allocated buffer with padding (only used in the base_image) + kvz_pixel *fulldata; //!< \brief Allocated buffer portion that's actually used kvz_pixel *y; //!< \brief Pointer to luma pixel array. kvz_pixel *u; //!< \brief Pointer to chroma U pixel array.
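Among the kvazaar.h additions, the split of fulldata into fulldata_buf (the raw allocation) and fulldata (the portion actually used) follows a common padded-allocation pattern: keep the pointer returned by malloc for freeing, and a separate adjusted pointer for use. The sketch below illustrates that pattern only; the alignment motive, the 64-byte figure and all names are assumptions for illustration, not Kvazaar's allocator.

#include <stdint.h>
#include <stdlib.h>

typedef struct {
  uint8_t *fulldata_buf; /* what malloc returned; this is what gets freed */
  uint8_t *fulldata;     /* adjusted view actually used for pixel data */
} padded_buf_t;

/* alignment is assumed to be a power of two. */
static int padded_buf_alloc(padded_buf_t *buf, size_t size, size_t alignment)
{
  buf->fulldata_buf = malloc(size + alignment);
  if (!buf->fulldata_buf) return 0;

  uintptr_t addr = (uintptr_t)buf->fulldata_buf;
  uintptr_t aligned = (addr + alignment - 1) & ~(uintptr_t)(alignment - 1);
  buf->fulldata = (uint8_t *)aligned;
  return 1;
}

static void padded_buf_free(padded_buf_t *buf)
{
  free(buf->fulldata_buf); /* never free(fulldata): it may not be the malloc'd address */
  buf->fulldata_buf = NULL;
  buf->fulldata = NULL;
}

int main(void)
{
  padded_buf_t buf;
  if (padded_buf_alloc(&buf, 1920 * 1080, 64)) {
    /* ... use buf.fulldata for pixel data ... */
    padded_buf_free(&buf);
  }
  return 0;
}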
View file
kvazaar-1.2.0.tar.gz/src/rate_control.c -> kvazaar-1.3.0.tar.gz/src/rate_control.c
Changed
@@ -79,8 +79,8 @@ int pictures_coded = MAX(0, state->frame->num - encoder->cfg.owf); int gop_offset = (state->frame->gop_offset - encoder->cfg.owf) % MAX(1, encoder->cfg.gop_len); - // Only take fully coded GOPs into account. - if (encoder->cfg.gop_len > 0 && gop_offset != encoder->cfg.gop_len - 1) { + + if (encoder->cfg.gop_len > 0 && gop_offset != encoder->cfg.gop_len - 1 && encoder->cfg.gop_lp_definition.d == 0) { // Subtract number of bits in the partially coded GOP. bits_coded -= state->frame->cur_gop_bits_coded; // Subtract number of pictures in the partially coded GOP. @@ -293,7 +293,7 @@ int dqp = ctrl->cfg.roi.dqps[roi_index]; state->qp = CLIP_TO_QP(state->frame->QP + dqp); state->lambda = qp_to_lamba(state, state->qp); - state->lambda_sqrt = sqrt(state->frame->lambda); + state->lambda_sqrt = sqrt(state->lambda); } else if (ctrl->cfg.target_bitrate > 0) { lcu_stats_t *lcu = kvz_get_lcu_stats(state, pos.x, pos.y);
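The rate_control.c fix makes lambda_sqrt be derived from the lambda just computed for the ROI-adjusted QP instead of the frame-level value. For context, a hedged sketch of the usual HM-style QP-to-lambda mapping and the two forms in which such encoders keep it; the 0.57 weight is the common HM default, not necessarily Kvazaar's exact tuning.

#include <math.h>
#include <stdio.h>

/* HM-style exponential QP-to-lambda mapping. Real encoders scale the weight
 * per slice type and GOP position; 0.57 is only the common default. */
static double qp_to_lambda(int qp)
{
  return 0.57 * pow(2.0, (qp - 12) / 3.0);
}

int main(void)
{
  int qp = 32;
  double lambda = qp_to_lambda(qp);
  /* SAD-based costs are weighted by sqrt(lambda); SSE-based costs by lambda. */
  double lambda_sqrt = sqrt(lambda);
  printf("QP %d: lambda = %.3f, sqrt(lambda) = %.3f\n", qp, lambda, lambda_sqrt);
  return 0;
}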
View file
kvazaar-1.2.0.tar.gz/src/rdo.c -> kvazaar-1.3.0.tar.gz/src/rdo.c
Changed
@@ -30,6 +30,7 @@ #include "imagelist.h" #include "inter.h" #include "scalinglist.h" +#include "strategyselector.h" #include "tables.h" #include "transform.h" @@ -41,8 +42,6 @@ #define LOG2_SCAN_SET_SIZE 4 #define SBH_THRESHOLD 4 -static const double COEFF_SUM_MULTIPLIER = 1.9; - const uint32_t kvz_g_go_rice_range[5] = { 7, 14, 26, 46, 78 }; const uint32_t kvz_g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 }; @@ -195,7 +194,6 @@ return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3); } - /** * \brief Estimate bitcost for coding coefficients. * @@ -211,15 +209,17 @@ int32_t type, int8_t scan_mode) { - if (state->encoder_control->cfg.rdo > 0) { + if (state->qp >= state->encoder_control->cfg.fast_residual_cost_limit) { return get_coeff_cabac_cost(state, coeff, width, type, scan_mode); } else { - return COEFF_SUM_MULTIPLIER * kvz_coeff_abs_sum(coeff, width * width) + 0.5; + // Estimate coeff coding cost based on QP and sum of absolute coeffs. + // const uint32_t sum = kvz_coeff_abs_sum(coeff, width * width); + // return (uint32_t)(sum * (state->qp * COEFF_COST_QP_FACTOR + COEFF_COST_BIAS) + 0.5); + return kvz_fast_coeff_cost(coeff, width, state->qp); } } - #define COEF_REMAIN_BIN_REDUCTION 3 /** Calculates the cost for specific absolute transform level * \param abs_level scaled quantized level @@ -879,52 +879,23 @@ } } -/** MVD cost calculation with CABAC -* \returns int -* Calculates cost of actual motion vectors using CABAC coding -*/ +/** + * Calculate cost of actual motion vectors using CABAC coding + */ uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state, - vector2d_t *mvd, - const cabac_data_t* real_cabac) + const cabac_data_t* cabac, + const int32_t mvd_hor, + const int32_t mvd_ver) { - uint32_t bitcost = 0; - const int32_t mvd_hor = mvd->x; - const int32_t mvd_ver = mvd->y; - const int8_t hor_abs_gr0 = mvd_hor != 0; - const int8_t ver_abs_gr0 = mvd_ver != 0; - const uint32_t mvd_hor_abs = abs(mvd_hor); - const uint32_t mvd_ver_abs = abs(mvd_ver); + cabac_data_t cabac_copy = *cabac; + cabac_copy.only_count = 1; - cabac_data_t cabac_copy; - memcpy(&cabac_copy, real_cabac, sizeof(cabac_data_t)); - cabac_data_t *cabac = &cabac_copy; - cabac->only_count = 1; - - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[0]); - CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor"); - CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver"); - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[1]); - if (hor_abs_gr0) { - CABAC_BIN(cabac, (mvd_hor_abs > 1), "abs_mvd_greater1_flag_hor"); - } - if (ver_abs_gr0) { - CABAC_BIN(cabac, (mvd_ver_abs > 1), "abs_mvd_greater1_flag_ver"); - } - if (hor_abs_gr0) { - if (mvd_hor_abs > 1) { - // It is safe to drop const here because cabac->only_count is set. - kvz_cabac_write_ep_ex_golomb((encoder_state_t*)state, cabac, mvd_hor_abs - 2, 1); - } - CABAC_BIN_EP(cabac, (mvd_hor > 0) ? 0 : 1, "mvd_sign_flag_hor"); - } - if (ver_abs_gr0) { - if (mvd_ver_abs > 1) { - // It is safe to drop const here because cabac->only_count is set. - kvz_cabac_write_ep_ex_golomb((encoder_state_t*)state, cabac, mvd_ver_abs - 2, 1); - } - CABAC_BIN_EP(cabac, (mvd_ver > 0) ? 0 : 1, "mvd_sign_flag_ver"); - } - bitcost = ((23 - cabac->bits_left) + (cabac->num_buffered_bytes << 3)) - ((23 - real_cabac->bits_left) + (real_cabac->num_buffered_bytes << 3)); + // It is safe to drop const here because cabac->only_count is set. 
+ kvz_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver); + + uint32_t bitcost = + ((23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3)) - + ((23 - cabac->bits_left) + (cabac->num_buffered_bytes << 3)); return bitcost; } @@ -946,8 +917,7 @@ cabac_data_t state_cabac_copy; cabac_data_t* cabac; uint32_t merge_idx; - int cand1_cost, cand2_cost; - vector2d_t mvd_temp1, mvd_temp2, mvd = { 0, 0 }; + vector2d_t mvd = { 0, 0 }; int8_t merged = 0; int8_t cur_mv_cand = 0; @@ -979,27 +949,30 @@ cabac = &state_cabac_copy; if (!merged) { - mvd_temp1.x = x - mv_cand[0][0]; - mvd_temp1.y = y - mv_cand[0][1]; - cand1_cost = kvz_get_mvd_coding_cost_cabac(state, &mvd_temp1, cabac); - - mvd_temp2.x = x - mv_cand[1][0]; - mvd_temp2.y = y - mv_cand[1][1]; - cand2_cost = kvz_get_mvd_coding_cost_cabac(state, &mvd_temp2, cabac); + vector2d_t mvd1 = { + x - mv_cand[0][0], + y - mv_cand[0][1], + }; + vector2d_t mvd2 = { + x - mv_cand[1][0], + y - mv_cand[1][1], + }; + uint32_t cand1_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y); + uint32_t cand2_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y); // Select candidate 1 if it has lower cost if (cand2_cost < cand1_cost) { cur_mv_cand = 1; - mvd = mvd_temp2; + mvd = mvd2; } else { - mvd = mvd_temp1; + mvd = mvd1; } } cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model); CABAC_BIN(cabac, merged, "MergeFlag"); - num_cand = MRG_MAX_NUM_CANDS; + num_cand = state->encoder_control->cfg.max_merge; if (merged) { if (num_cand > 1) { int32_t ui; @@ -1058,51 +1031,18 @@ // ToDo: Bidir vector support if (!(state->frame->ref_list == REF_PIC_LIST_1 && /*cur_cu->inter.mv_dir == 3*/ 0)) { - const int32_t mvd_hor = mvd.x; - const int32_t mvd_ver = mvd.y; - const int8_t hor_abs_gr0 = mvd_hor != 0; - const int8_t ver_abs_gr0 = mvd_ver != 0; - const uint32_t mvd_hor_abs = abs(mvd_hor); - const uint32_t mvd_ver_abs = abs(mvd_ver); - - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[0]); - CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor"); - CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver"); - - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[1]); - - if (hor_abs_gr0) { - CABAC_BIN(cabac, (mvd_hor_abs > 1), "abs_mvd_greater1_flag_hor"); - } - - if (ver_abs_gr0) { - CABAC_BIN(cabac, (mvd_ver_abs > 1), "abs_mvd_greater1_flag_ver"); - } - - if (hor_abs_gr0) { - if (mvd_hor_abs > 1) { - // It is safe to drop const because cabac->only_count is set. - kvz_cabac_write_ep_ex_golomb((encoder_state_t*)state, cabac, mvd_hor_abs - 2, 1); - } - - CABAC_BIN_EP(cabac, (mvd_hor > 0) ? 0 : 1, "mvd_sign_flag_hor"); - } - - if (ver_abs_gr0) { - if (mvd_ver_abs > 1) { - // It is safe to drop const because cabac->only_count is set. - kvz_cabac_write_ep_ex_golomb((encoder_state_t*)state, cabac, mvd_ver_abs - 2, 1); - } - - CABAC_BIN_EP(cabac, (mvd_ver > 0) ? 0 : 1, "mvd_sign_flag_ver"); - } + // It is safe to drop const here because cabac->only_count is set. + kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y); } // Signal which candidate MV to use - kvz_cabac_write_unary_max_symbol(cabac, cabac->ctx.mvp_idx_model, cur_mv_cand, 1, - AMVP_MAX_NUM_CANDS - 1); + kvz_cabac_write_unary_max_symbol( + cabac, + cabac->ctx.mvp_idx_model, + cur_mv_cand, + 1, + AMVP_MAX_NUM_CANDS - 1); } - } }
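kvz_get_mvd_coding_cost_cabac above now measures cost by encoding the MVD into a copy of the CABAC state with only_count set and differencing the bit counters. The toy program below reproduces that counting pattern with simplified stand-in types; the field names and the one-bit-per-bin encoder are assumptions, not the real cabac_data_t.

#include <stdint.h>
#include <stdio.h>

typedef struct {
  int bits_left;          /* bits still free in the low-level register */
  int num_buffered_bytes; /* whole bytes already produced */
  int only_count;         /* in the real coder this suppresses output;
                             the toy coder never emits, it is kept only to
                             mirror the pattern */
} toy_cabac_t;

static int bits_written(const toy_cabac_t *c)
{
  return (23 - c->bits_left) + (c->num_buffered_bytes << 3);
}

/* Stand-in for the syntax-element encoder (kvz_encode_mvd in rdo.c):
 * pretend each bin costs exactly one bit. */
static void encode_something(toy_cabac_t *c, int nbins)
{
  for (int i = 0; i < nbins; i++) {
    if (--c->bits_left < 0) {
      c->num_buffered_bytes += 1;
      c->bits_left += 8;
    }
  }
}

static uint32_t coding_cost_in_bits(const toy_cabac_t *real, int value)
{
  toy_cabac_t copy = *real; /* work on a copy so the real state is untouched */
  copy.only_count = 1;      /* count bits instead of emitting them */
  encode_something(&copy, value);
  return (uint32_t)(bits_written(&copy) - bits_written(real));
}

int main(void)
{
  toy_cabac_t cabac = { .bits_left = 23, .num_buffered_bytes = 0, .only_count = 0 };
  printf("cost of a 30-bin element: %u bits\n", coding_cost_in_bits(&cabac, 30));
  return 0;
}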
View file
kvazaar-1.2.0.tar.gz/src/rdo.h -> kvazaar-1.3.0.tar.gz/src/rdo.h
Changed
@@ -39,7 +39,7 @@
 void kvz_rdoq(encoder_state_t *state, coeff_t *coef, coeff_t *dest_coeff, int32_t width, int32_t height, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth);

-uint32_t kvz_get_coeff_cost(const encoder_state_t *state,
+uint32_t kvz_get_coeff_cost(const encoder_state_t * const state,
                             const coeff_t *coeff,
                             int32_t width,
                             int32_t type,
@@ -57,8 +57,9 @@
 kvz_mvd_cost_func kvz_calc_mvd_cost_cabac;

 uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state,
-                                       vector2d_t *mvd,
-                                       const cabac_data_t* cabac);
+                                       const cabac_data_t* cabac,
+                                       int32_t mvd_hor,
+                                       int32_t mvd_ver);

 // Number of fixed point fractional bits used in the fractional bit table.
 #define CTX_FRAC_BITS 15
View file
kvazaar-1.2.0.tar.gz/src/scalinglist.c -> kvazaar-1.3.0.tar.gz/src/scalinglist.c
Changed
@@ -102,6 +102,7 @@ } scaling_list->enable = 0; + scaling_list->use_default_list = 0; } /** @@ -397,9 +398,9 @@ for (size = 0; size < SCALING_LIST_SIZE_NUM; size++) { for (list = 0; list < kvz_g_scaling_list_num[size]; list++) { - const int32_t * const list_ptr = scaling_list->enable ? - scaling_list->scaling_list_coeff[size][list] : - kvz_scalinglist_get_default(size, list); + const int32_t * const list_ptr = scaling_list->use_default_list ? + kvz_scalinglist_get_default(size, list) : + scaling_list->scaling_list_coeff[size][list]; for (qp = 0; qp < SCALING_LIST_REM_NUM; qp++) { kvz_scalinglist_set(scaling_list, list_ptr, list, size, qp);
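scalinglist.c now picks between the default HEVC lists and user-supplied coefficients through the new use_default_list flag, which pairs with the KVZ_SCALING_LIST_* enum added to kvazaar.h. The mapping sketched below is inferred from these diffs rather than copied from Kvazaar's option parser.

#include <stdio.h>

enum scalinglist_mode { SCALING_LIST_OFF = 0, SCALING_LIST_CUSTOM = 1, SCALING_LIST_DEFAULT = 2 };

struct scaling_flags { int enable; int use_default_list; };

/* Inferred mapping: any non-off mode enables scaling lists; only the default
 * mode routes quantization through kvz_scalinglist_get_default(). */
static struct scaling_flags scaling_flags_from_mode(enum scalinglist_mode mode)
{
  struct scaling_flags f = { 0, 0 };
  f.enable = (mode != SCALING_LIST_OFF);
  f.use_default_list = (mode == SCALING_LIST_DEFAULT);
  return f;
}

int main(void)
{
  struct scaling_flags f = scaling_flags_from_mode(SCALING_LIST_DEFAULT);
  printf("enable=%d use_default_list=%d\n", f.enable, f.use_default_list);
  return 0;
}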
View file
kvazaar-1.2.0.tar.gz/src/scalinglist.h -> kvazaar-1.3.0.tar.gz/src/scalinglist.h
Changed
@@ -33,6 +33,7 @@
 typedef struct {
   int8_t enable;
+  int8_t use_default_list;
   int32_t scaling_list_dc [SCALING_LIST_SIZE_NUM][SCALING_LIST_NUM];
   const int32_t *scaling_list_coeff[SCALING_LIST_SIZE_NUM][SCALING_LIST_NUM];
   const int32_t *quant_coeff[4][6][6];
View file
kvazaar-1.2.0.tar.gz/src/search.c -> kvazaar-1.3.0.tar.gz/src/search.c
Changed
@@ -116,7 +116,7 @@ } } -void kvz_lcu_set_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth) +void kvz_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth) { const int x_local = SUB_SCU(x_px); const int y_local = SUB_SCU(y_px); @@ -138,6 +138,7 @@ to->type = cu->type; to->depth = cu->depth; to->part_size = cu->part_size; + to->qp = cu->qp; if (cu->type == CU_INTRA) { to->intra.mode = cu->intra.mode; @@ -152,7 +153,7 @@ } } -static void lcu_set_inter(lcu_t *lcu, int x_local, int y_local, int cu_width) +static void lcu_fill_inter(lcu_t *lcu, int x_local, int y_local, int cu_width) { const part_mode_t part_mode = LCU_GET_CU_AT_PX(lcu, x_local, y_local)->part_size; const int num_pu = kvz_part_mode_num_parts[part_mode]; @@ -169,7 +170,7 @@ } } -static void lcu_set_coeff(lcu_t *lcu, int x_local, int y_local, int width, cu_info_t *cur_cu) +static void lcu_fill_cbf(lcu_t *lcu, int x_local, int y_local, int width, cu_info_t *cur_cu) { const uint32_t tr_split = cur_cu->tr_depth - cur_cu->depth; const uint32_t mask = ~((width >> tr_split)-1); @@ -189,6 +190,40 @@ } +//Calculates cost for all zero coeffs +static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, const int x, const int y, + const int depth) +{ + int x_local = SUB_SCU(x); + int y_local = SUB_SCU(y); + int cu_width = LCU_WIDTH >> depth; + lcu_t *const lcu = &work_tree[depth]; + + const int luma_index = y_local * LCU_WIDTH + x_local; + const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2); + + double ssd = 0.0; + ssd += LUMA_MULT * kvz_pixels_calc_ssd( + &lcu->ref.y[luma_index], &lcu->rec.y[luma_index], + LCU_WIDTH, LCU_WIDTH, cu_width + ); + if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != KVZ_CSP_400) { + ssd += CHROMA_MULT * kvz_pixels_calc_ssd( + &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index], + LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 + ); + ssd += CHROMA_MULT * kvz_pixels_calc_ssd( + &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index], + LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 + ); + } + // Save the pixels at a lower level of the working tree. + copy_cu_pixels(x_local, y_local, cu_width, lcu, &work_tree[depth + 1]); + + return ssd; +} + + /** * Calculate RD cost for a Coding Unit. * \return Cost of block @@ -368,6 +403,30 @@ } +/** + * \brief Sort modes and costs to ascending order according to costs. + */ +void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length) +{ + // Length for intra is always between 5 and 23, and is either 21, 17, 9 or 8 about + // 60% of the time, so there should be no need for anything more complex + // than insertion sort. + // Length for merge is 5 or less. + for (uint8_t i = 1; i < length; ++i) { + const double cur_cost = costs[i]; + const int8_t cur_mode = modes[i]; + uint8_t j = i; + while (j > 0 && cur_cost < costs[j - 1]) { + costs[j] = costs[j - 1]; + modes[j] = modes[j - 1]; + --j; + } + costs[j] = cur_cost; + modes[j] = cur_mode; + } +} + + static uint8_t get_ctx_cu_split_model(const lcu_t *lcu, int x, int y, int depth) { vector2d_t lcu_cu = { SUB_SCU(x), SUB_SCU(y) }; @@ -392,6 +451,7 @@ const videoframe_t * const frame = state->tile->frame; int cu_width = LCU_WIDTH >> depth; double cost = MAX_INT; + double inter_zero_coeff_cost = MAX_INT; uint32_t inter_bitcost = MAX_INT; cu_info_t *cur_cu; @@ -412,6 +472,7 @@ cur_cu->tr_depth = depth > 0 ? 
depth : 1; cur_cu->type = CU_NOTSET; cur_cu->part_size = SIZE_2Nx2N; + cur_cu->qp = state->qp; // If the CU is completely inside the frame at this depth, search for // prediction modes at this depth. @@ -419,14 +480,17 @@ y + cu_width <= frame->height) { int cu_width_inter_min = LCU_WIDTH >> ctrl->cfg.pu_depth_inter.max; - bool can_use_inter = state->frame->slicetype != KVZ_SLICE_I && ( - WITHIN(depth, ctrl->cfg.pu_depth_inter.min, ctrl->cfg.pu_depth_inter.max) || - // When the split was forced because the CTU is partially outside the - // frame, we permit inter coding even if pu_depth_inter would - // otherwise forbid it. - (x & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame->width || - (y & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame->height - ); + bool can_use_inter = + state->frame->slicetype != KVZ_SLICE_I && + depth <= MAX_DEPTH && + ( + WITHIN(depth, ctrl->cfg.pu_depth_inter.min, ctrl->cfg.pu_depth_inter.max) || + // When the split was forced because the CTU is partially outside the + // frame, we permit inter coding even if pu_depth_inter would + // otherwise forbid it. + (x & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame->width || + (y & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame->height + ); if (can_use_inter) { double mode_cost; @@ -442,30 +506,31 @@ cur_cu->type = CU_INTER; } - // Try SMP and AMP partitioning. - static const part_mode_t mp_modes[] = { - // SMP - SIZE_2NxN, SIZE_Nx2N, - // AMP - SIZE_2NxnU, SIZE_2NxnD, - SIZE_nLx2N, SIZE_nRx2N, - }; - - const int first_mode = ctrl->cfg.smp_enable ? 0 : 2; - const int last_mode = (ctrl->cfg.amp_enable && cu_width >= 16) ? 5 : 1; - for (int i = first_mode; i <= last_mode; ++i) { - kvz_search_cu_smp(state, - x, y, - depth, - mp_modes[i], - &work_tree[depth + 1], - &mode_cost, &mode_bitcost); - // TODO: take cost of coding part mode into account - if (mode_cost < cost) { - cost = mode_cost; - inter_bitcost = mode_bitcost; - // TODO: only copy inter prediction info, not pixels - work_tree_copy_up(x_local, y_local, depth, work_tree); + if (!(ctrl->cfg.early_skip && cur_cu->skipped)) { + // Try SMP and AMP partitioning. + static const part_mode_t mp_modes[] = { + // SMP + SIZE_2NxN, SIZE_Nx2N, + // AMP + SIZE_2NxnU, SIZE_2NxnD, + SIZE_nLx2N, SIZE_nRx2N, + }; + + const int first_mode = ctrl->cfg.smp_enable ? 0 : 2; + const int last_mode = (ctrl->cfg.amp_enable && cu_width >= 16) ? 5 : 1; + for (int i = first_mode; i <= last_mode; ++i) { + kvz_search_cu_smp(state, + x, y, + depth, + mp_modes[i], + &work_tree[depth + 1], + &mode_cost, &mode_bitcost); + if (mode_cost < cost) { + cost = mode_cost; + inter_bitcost = mode_bitcost; + // Copy inter prediction info to current level. + copy_cu_info(x_local, y_local, cu_width, &work_tree[depth + 1], lcu); + } } } } @@ -473,9 +538,10 @@ // Try to skip intra search in rd==0 mode. // This can be quite severe on bdrate. It might be better to do this // decision after reconstructing the inter frame. - bool skip_intra = state->encoder_control->cfg.rdo == 0 + bool skip_intra = (state->encoder_control->cfg.rdo == 0 && cur_cu->type != CU_NOTSET - && cost / (cu_width * cu_width) < INTRA_THRESHOLD; + && cost / (cu_width * cu_width) < INTRA_THRESHOLD) + || (ctrl->cfg.early_skip && cur_cu->skipped); int32_t cu_width_intra_min = LCU_WIDTH >> ctrl->cfg.pu_depth_intra.max; bool can_use_intra = @@ -516,7 +582,7 @@ // rd2. Possibly because the luma mode search already takes chroma // into account, so there is less of a chanse of luma mode being // really bad for chroma. 
- if (state->encoder_control->cfg.rdo == 3) { + if (ctrl->cfg.rdo == 3) { cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu); lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); } @@ -528,74 +594,47 @@ NULL, lcu); } } else if (cur_cu->type == CU_INTER) { - // Reset transform depth because intra messes with them. - // This will no longer be necessary if the transform depths are not shared. - int tr_depth = depth > 0 ? depth : 1; - kvz_lcu_set_trdepth(lcu, x, y, depth, tr_depth); - - const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size]; - for (int i = 0; i < num_pu; ++i) { - const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i); - const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i); - const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i); - const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); - - cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); - - if (cur_pu->inter.mv_dir == 3) { - const kvz_picture *const refs[2] = { - state->frame->ref->images[ - state->frame->ref_LX[0][ - cur_pu->inter.mv_ref[0]]], - state->frame->ref->images[ - state->frame->ref_LX[1][ - cur_pu->inter.mv_ref[1]]], - }; - kvz_inter_recon_lcu_bipred(state, - refs[0], refs[1], - pu_x, pu_y, - pu_w, pu_h, - cur_pu->inter.mv, - lcu); - } else { - const int mv_idx = cur_pu->inter.mv_dir - 1; - - const kvz_picture *const ref = - state->frame->ref->images[ - state->frame->ref_LX[mv_idx][ - cur_pu->inter.mv_ref[mv_idx]]]; - - kvz_inter_recon_lcu(state, - ref, - pu_x, pu_y, - pu_w, pu_h, - cur_pu->inter.mv[mv_idx], - lcu, - 0); + + if (!cur_cu->skipped) { + // Reset transform depth because intra messes with them. + // This will no longer be necessary if the transform depths are not shared. 
+ int tr_depth = MAX(1, depth); + if (cur_cu->part_size != SIZE_2Nx2N) { + tr_depth = depth + 1; } - } + kvz_lcu_fill_trdepth(lcu, x, y, depth, tr_depth); - const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; - kvz_quantize_lcu_residual(state, - true, has_chroma, - x, y, depth, - NULL, - lcu); + kvz_inter_recon_cu(state, lcu, x, y, cu_width); - int cbf = cbf_is_set_any(cur_cu->cbf, depth); + if (!ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) { + //Calculate cost for zero coeffs + inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, depth) + inter_bitcost * state->lambda; - if(cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) { - cur_cu->merged = 0; - cur_cu->skipped = 1; - // Selecting skip reduces bits needed to code the CU - if (inter_bitcost > 1) { - inter_bitcost -= 1; + } + + const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; + kvz_quantize_lcu_residual(state, + true, has_chroma, + x, y, depth, + NULL, + lcu); + + int cbf = cbf_is_set_any(cur_cu->cbf, depth); + + if (cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) { + cur_cu->merged = 0; + cur_cu->skipped = 1; + // Selecting skip reduces bits needed to code the CU + if (inter_bitcost > 1) { + inter_bitcost -= 1; + } } } - lcu_set_inter(lcu, x_local, y_local, cu_width); - lcu_set_coeff(lcu, x_local, y_local, cu_width, cur_cu); + lcu_fill_inter(lcu, x_local, y_local, cu_width); + lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); } } + if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { cost = kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu); if (state->encoder_control->chroma_format != KVZ_CSP_400) { @@ -610,6 +649,28 @@ } cost += mode_bits * state->lambda; + + if (inter_zero_coeff_cost <= cost) { + cost = inter_zero_coeff_cost; + + // Restore saved pixels from lower level of the working tree. + copy_cu_pixels(x_local, y_local, cu_width, &work_tree[depth + 1], lcu); + + if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + cur_cu->merged = 0; + cur_cu->skipped = 1; + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); + } + + if (cur_cu->tr_depth != depth) { + // Reset transform depth since there are no coefficients. This + // ensures that CBF is cleared for the whole area of the CU. + kvz_lcu_fill_trdepth(lcu, x, y, depth, depth); + } + + cur_cu->cbf = 0; + lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); + } } bool can_split_cu = @@ -672,7 +733,7 @@ cur_cu->type = CU_INTRA; cur_cu->part_size = SIZE_2Nx2N; - kvz_lcu_set_trdepth(lcu, x, y, depth, cur_cu->tr_depth); + kvz_lcu_fill_trdepth(lcu, x, y, depth, cur_cu->tr_depth); lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
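search.c gains kvz_sort_modes, a plain insertion sort that keeps prediction modes and their RD costs in lockstep; the comment in the diff explains why insertion sort suffices for the short lists involved. A stand-alone usage example follows (the sort body is repeated here only so the program compiles on its own).

#include <stdint.h>
#include <stdio.h>

/* Same insertion sort as kvz_sort_modes above. */
static void sort_modes(int8_t *modes, double *costs, uint8_t length)
{
  for (uint8_t i = 1; i < length; ++i) {
    const double cur_cost = costs[i];
    const int8_t cur_mode = modes[i];
    uint8_t j = i;
    while (j > 0 && cur_cost < costs[j - 1]) {
      costs[j] = costs[j - 1];
      modes[j] = modes[j - 1];
      --j;
    }
    costs[j] = cur_cost;
    modes[j] = cur_mode;
  }
}

int main(void)
{
  int8_t modes[5] = { 0, 1, 10, 26, 34 };          /* e.g. intra prediction modes */
  double costs[5] = { 7.5, 3.25, 9.0, 1.5, 3.25 }; /* their RD costs */

  sort_modes(modes, costs, 5);

  for (int i = 0; i < 5; ++i) {
    printf("mode %2d  cost %.2f\n", modes[i], costs[i]);
  }
  return 0; /* prints modes in order 26, 1, 34, 0, 10 */
}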
View file
kvazaar-1.2.0.tar.gz/src/search.h -> kvazaar-1.3.0.tar.gz/src/search.h
Changed
@@ -31,6 +31,7 @@
 #include "global.h" // IWYU pragma: keep
 #include "image.h"

+void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length);

 void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf);
@@ -42,7 +43,7 @@
                            const int x_px, const int y_px, const int depth,
                            const cu_info_t *const pred_cu, lcu_t *const lcu);
-void kvz_lcu_set_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth);
+void kvz_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth);
 void kvz_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);
 void kvz_intra_recon_lcu_chroma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);
View file
kvazaar-1.2.0.tar.gz/src/search_inter.c -> kvazaar-1.3.0.tar.gz/src/search_inter.c
Changed
@@ -30,11 +30,12 @@ #include "inter.h" #include "kvazaar.h" #include "rdo.h" +#include "search.h" #include "strategies/strategies-ipol.h" #include "strategies/strategies-picture.h" +#include "transform.h" #include "videoframe.h" - typedef struct { encoder_state_t *state; @@ -77,6 +78,13 @@ * \brief Bit cost of best_mv */ uint32_t best_bitcost; + + /** + * \brief Possible optimized SAD implementation for the width, leave as + * NULL for arbitrary-width blocks + */ + optimized_sad_func_ptr_t optimized_sad; + } inter_search_info_t; @@ -204,7 +212,8 @@ info->state->tile->offset_x + info->origin.x + x, info->state->tile->offset_y + info->origin.y + y, info->width, - info->height + info->height, + info->optimized_sad ); if (cost >= info->best_cost) return false; @@ -261,8 +270,8 @@ for (int i = 0; i < info->num_merge_cand; ++i) { if (info->merge_cand[i].dir == 3) continue; const vector2d_t merge_mv = { - info->merge_cand[i].mv[info->merge_cand[i].dir - 1][0] >> 2, - info->merge_cand[i].mv[info->merge_cand[i].dir - 1][1] >> 2 + (info->merge_cand[i].mv[info->merge_cand[i].dir - 1][0] + 2) >> 2, + (info->merge_cand[i].mv[info->merge_cand[i].dir - 1][1] + 2) >> 2 }; if (merge_mv.x == mv.x && merge_mv.y == mv.y) { return true; @@ -296,8 +305,8 @@ for (unsigned i = 0; i < info->num_merge_cand; ++i) { if (info->merge_cand[i].dir == 3) continue; - int x = info->merge_cand[i].mv[info->merge_cand[i].dir - 1][0] >> 2; - int y = info->merge_cand[i].mv[info->merge_cand[i].dir - 1][1] >> 2; + int x = (info->merge_cand[i].mv[info->merge_cand[i].dir - 1][0] + 2) >> 2; + int y = (info->merge_cand[i].mv[info->merge_cand[i].dir - 1][1] + 2) >> 2; if (x == 0 && y == 0) continue; @@ -307,32 +316,65 @@ static uint32_t get_mvd_coding_cost(const encoder_state_t *state, - vector2d_t *mvd, - const cabac_data_t* cabac) + const cabac_data_t* cabac, + const int32_t mvd_hor, + const int32_t mvd_ver) { unsigned bitcost = 0; - const vector2d_t abs_mvd = { abs(mvd->x), abs(mvd->y) }; + const vector2d_t abs_mvd = { abs(mvd_hor), abs(mvd_ver) }; - bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[0], abs_mvd.x > 0); - if (abs_mvd.x > 0) { - bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[1], abs_mvd.x > 1); - if (abs_mvd.x > 1) { - bitcost += get_ep_ex_golomb_bitcost(abs_mvd.x - 2) << CTX_FRAC_BITS; - } - bitcost += CTX_FRAC_ONE_BIT; // sign + bitcost += get_ep_ex_golomb_bitcost(abs_mvd.x) << CTX_FRAC_BITS; + bitcost += get_ep_ex_golomb_bitcost(abs_mvd.y) << CTX_FRAC_BITS; + + // Round and shift back to integer bits. + return (bitcost + CTX_FRAC_HALF_BIT) >> CTX_FRAC_BITS; +} + + +static int select_mv_cand(const encoder_state_t *state, + int16_t mv_cand[2][2], + int32_t mv_x, + int32_t mv_y, + uint32_t *cost_out) +{ + const bool same_cand = + (mv_cand[0][0] == mv_cand[1][0] && mv_cand[0][1] == mv_cand[1][1]); + + if (same_cand && !cost_out) { + // Pick the first one if both candidates are the same. + return 0; } - bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[0], abs_mvd.y > 0); - if (abs_mvd.y > 0) { - bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[1], abs_mvd.y > 1); - if (abs_mvd.y > 1) { - bitcost += get_ep_ex_golomb_bitcost(abs_mvd.y - 2) << CTX_FRAC_BITS; - } - bitcost += CTX_FRAC_ONE_BIT; // sign + uint32_t (*mvd_coding_cost)(const encoder_state_t * const state, + const cabac_data_t*, + int32_t, int32_t); + if (state->encoder_control->cfg.mv_rdo) { + mvd_coding_cost = kvz_get_mvd_coding_cost_cabac; + } else { + mvd_coding_cost = get_mvd_coding_cost; } - // Round and shift back to integer bits. 
- return (bitcost + CTX_FRAC_HALF_BIT) >> CTX_FRAC_BITS; + uint32_t cand1_cost = mvd_coding_cost( + state, &state->cabac, + mv_x - mv_cand[0][0], + mv_y - mv_cand[0][1]); + + uint32_t cand2_cost; + if (same_cand) { + cand2_cost = cand1_cost; + } else { + cand2_cost = mvd_coding_cost( + state, &state->cabac, + mv_x - mv_cand[1][0], + mv_y - mv_cand[1][1]); + } + + if (cost_out) { + *cost_out = MIN(cand1_cost, cand2_cost); + } + + // Pick the second candidate if it has lower cost. + return cand2_cost < cand1_cost ? 1 : 0; } @@ -348,10 +390,7 @@ { uint32_t temp_bitcost = 0; uint32_t merge_idx; - int cand1_cost,cand2_cost; - vector2d_t mvd_temp1, mvd_temp2; int8_t merged = 0; - int8_t cur_mv_cand = 0; x *= 1 << mv_shift; y *= 1 << mv_shift; @@ -371,20 +410,10 @@ } // Check mvd cost only if mv is not merged - if(!merged) { - mvd_temp1.x = x - mv_cand[0][0]; - mvd_temp1.y = y - mv_cand[0][1]; - cand1_cost = get_mvd_coding_cost(state, &mvd_temp1, &state->cabac); - - mvd_temp2.x = x - mv_cand[1][0]; - mvd_temp2.y = y - mv_cand[1][1]; - cand2_cost = get_mvd_coding_cost(state, &mvd_temp2, &state->cabac); - - // Select candidate 1 if it has lower cost - if (cand2_cost < cand1_cost) { - cur_mv_cand = 1; - } - temp_bitcost += cur_mv_cand ? cand2_cost : cand1_cost; + if (!merged) { + uint32_t mvd_cost = 0; + select_mv_cand(state, mv_cand, x, y, &mvd_cost); + temp_bitcost += mvd_cost; } *bitcost = temp_bitcost; return temp_bitcost*(int32_t)(state->lambda_sqrt + 0.5); @@ -442,6 +471,7 @@ void kvz_tz_pattern_search(inter_search_info_t *info, unsigned pattern_type, const int iDist, + vector2d_t mv, int *best_dist) { assert(pattern_type < 4); @@ -537,8 +567,6 @@ }; } - const vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; - // Compute SAD values for all chosen points. int best_index = -1; for (int i = 0; i < n_points; i++) { @@ -579,8 +607,9 @@ const int iRaster = 5; // search distance limit and downsampling factor for step 3 const unsigned step2_type = 0; // search patterns for steps 2 and 4 const unsigned step4_type = 0; - const bool bRasterRefinementEnable = true; // enable step 4 mode 1 - const bool bStarRefinementEnable = false; // enable step 4 mode 2 (only one mode will be executed) + const bool use_raster_scan = false; // enable step 3 + const bool use_raster_refinement = false; // enable step 4 mode 1 + const bool use_star_refinement = true; // enable step 4 mode 2 (only one mode will be executed) int best_dist = 0; info->best_cost = UINT32_MAX; @@ -596,13 +625,33 @@ return; } - //step 2, grid search + vector2d_t start = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; + + // step 2, grid search + int rounds_without_improvement = 0; for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) { - kvz_tz_pattern_search(info, step2_type, iDist, &best_dist); + kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist); + + // Break the loop if the last three rounds didn't produce a better MV. 
+ if (best_dist != iDist) rounds_without_improvement++; + if (rounds_without_improvement >= 3) break; + } + + if (start.x != 0 || start.y != 0) { + // repeat step 2 starting from the zero MV + start.x = 0; + start.y = 0; + rounds_without_improvement = 0; + for (int iDist = 1; iDist <= iSearchRange/2; iDist *= 2) { + kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist); + + if (best_dist != iDist) rounds_without_improvement++; + if (rounds_without_improvement >= 3) break; + } } //step 3, raster scan - if (best_dist > iRaster) { + if (use_raster_scan && best_dist > iRaster) { best_dist = iRaster; kvz_tz_raster_search(info, iSearchRange, iRaster); } @@ -610,16 +659,21 @@ //step 4 //raster refinement - if (bRasterRefinementEnable && best_dist > 0) { + if (use_raster_refinement && best_dist > 0) { for (int iDist = best_dist >> 1; iDist > 0; iDist >>= 1) { - kvz_tz_pattern_search(info, step4_type, iDist, &best_dist); + start.x = info->best_mv.x >> 2; + start.y = info->best_mv.y >> 2; + kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist); } } //star refinement (repeat step 2 for the current starting point) - if (bStarRefinementEnable && best_dist > 0) { + while (use_star_refinement && best_dist > 0) { + best_dist = 0; + start.x = info->best_mv.x >> 2; + start.y = info->best_mv.y >> 2; for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) { - kvz_tz_pattern_search(info, step4_type, iDist, &best_dist); + kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist); } } } @@ -630,6 +684,7 @@ * * \param info search info * \param extra_mv extra motion vector to check + * \param steps how many steps are done at maximum before exiting, does not affect the final step * * Motion vector is searched by first searching iteratively with the large * hexagon pattern until the best match is at the center of the hexagon. @@ -640,7 +695,7 @@ * the predicted motion vector is way off. In the future even more additional * points like 0,0 might be used, such as vectors from top or left. */ -static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv) +static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint32_t steps) { // The start of the hexagonal pattern has been repeated at the end so that // the indices between 1-6 can be used as the start of a 3-point list of new @@ -659,9 +714,10 @@ // 1 // 2 0 3 // 4 - static const vector2d_t small_hexbs[5] = { + static const vector2d_t small_hexbs[9] = { { 0, 0 }, - { 0, -1 }, { -1, 0 }, { 1, 0 }, { 0, 1 } + { 0, -1 }, { -1, 0 }, { 1, 0 }, { 0, 1 }, + { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; info->best_cost = UINT32_MAX; @@ -679,7 +735,7 @@ vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; - // Current best index, either to merge_cands, large_hebx or small_hexbs. + // Current best index, either to merge_cands, large_hexbs or small_hexbs. int best_index = 0; // Search the initial 7 points of the hexagon. @@ -691,7 +747,10 @@ // Iteratively search the 3 new points around the best match, until the best // match is in the center. - while (best_index != 0) { + while (best_index != 0 && steps != 0) { + // decrement count if enabled + if (steps > 0) steps -= 1; + // Starting point of the 3 offsets to be searched. unsigned start; if (best_index == 1) { @@ -717,16 +776,120 @@ } // Move the center to the best match. 
- mv.x += large_hexbs[best_index].x; - mv.y += large_hexbs[best_index].y; - best_index = 0; + //mv.x += large_hexbs[best_index].x; + //mv.y += large_hexbs[best_index].y; // Do the final step of the search with a small pattern. - for (int i = 1; i < 5; ++i) { + for (int i = 1; i < 9; ++i) { check_mv_cost(info, mv.x + small_hexbs[i].x, mv.y + small_hexbs[i].y); } } +/** +* \brief Do motion search using the diamond algorithm. +* +* \param info search info +* \param extra_mv extra motion vector to check +* \param steps how many steps are done at maximum before exiting +* +* Motion vector is searched by searching iteratively with a diamond-shaped +* pattern. We take care of not checking the direction we came from, but +* further checking for avoiding visits to already visited points is not done. +* +* If a non 0,0 predicted motion vector predictor is given as extra_mv, +* the 0,0 vector is also tried. This is hoped to help in the case where +* the predicted motion vector is way off. In the future even more additional +* points like 0,0 might be used, such as vectors from top or left. +**/ +static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint32_t steps) +{ + enum diapos { + DIA_UP = 0, + DIA_RIGHT = 1, + DIA_LEFT = 2, + DIA_DOWN = 3, + DIA_CENTER = 4, + }; + + // a diamond shape with the center included + // 0 + // 2 4 1 + // 3 + static const vector2d_t diamond[5] = { + {0, -1}, {1, 0}, {0, 1}, {-1, 0}, + {0, 0} + }; + + info->best_cost = UINT32_MAX; + + // Select starting point from among merge candidates. These should + // include both mv_cand vectors and (0, 0). + select_starting_point(info, extra_mv); + + // Check if we should stop search + if (info->state->encoder_control->cfg.me_early_termination && + early_terminate(info)) + { + return; + } + + // current motion vector + vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; + + // current best index + enum diapos best_index = DIA_CENTER; + + // initial search of the points of the diamond + for (int i = 0; i < 5; ++i) { + if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y)) { + best_index = i; + } + } + + if (best_index == DIA_CENTER) { + // the center point was the best in initial check + return; + } + + // Move the center to the best match. + mv.x += diamond[best_index].x; + mv.y += diamond[best_index].y; + + // the arrival direction, the index of the diamond member that will be excluded + enum diapos from_dir = DIA_CENTER; + + // whether we found a better candidate this iteration + uint8_t better_found; + + do { + better_found = 0; + // decrement count if enabled + if (steps > 0) steps -= 1; + + // search the points of the diamond + for (int i = 0; i < 4; ++i) { + // this is where we came from so it's checked already + if (i == from_dir) continue; + + if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y)) { + best_index = i; + better_found = 1; + } + } + + if (better_found) { + // Move the center to the best match. 
+ mv.x += diamond[best_index].x; + mv.y += diamond[best_index].y; + + // record where we came from to the next iteration + // the xor operation flips the orientation + from_dir = best_index ^ 0x3; + } + } while (better_found && steps != 0); + // and we're done +} + static void search_mv_full(inter_search_info_t *info, int32_t search_range, @@ -830,65 +993,39 @@ unsigned costs[4] = { 0 }; kvz_extended_block src = { 0, 0, 0, 0 }; + ALIGNED(64) kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH]; - // Buffers for interpolated fractional pixels one - // for each position excluding the integer position. - // Has one extra column on left and row on top because - // samples are used also from those integer pixels when - // searching positions to the left and up. - frac_search_block fracpel_blocks[15]; - - kvz_pixel *hpel_pos[8]; - - // Horizontal hpel positions - hpel_pos[0] = fracpel_blocks[HPEL_POS_HOR] + (LCU_WIDTH + 1); - hpel_pos[1] = fracpel_blocks[HPEL_POS_HOR] + (LCU_WIDTH + 1) + 1; - - // Vertical hpel positions - hpel_pos[2] = fracpel_blocks[HPEL_POS_VER] + 1; - hpel_pos[3] = fracpel_blocks[HPEL_POS_VER] + (LCU_WIDTH + 1) + 1; - - // Diagonal hpel positions - hpel_pos[4] = fracpel_blocks[HPEL_POS_DIA]; - hpel_pos[5] = fracpel_blocks[HPEL_POS_DIA] + 1; - hpel_pos[6] = fracpel_blocks[HPEL_POS_DIA] + (LCU_WIDTH + 1); - hpel_pos[7] = fracpel_blocks[HPEL_POS_DIA] + (LCU_WIDTH + 1) + 1; + // Storage buffers for intermediate horizontally filtered results. + // Have the first columns in contiguous memory for vectorization. + ALIGNED(64) int16_t intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH]; + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1]; const kvz_picture *ref = info->ref; const kvz_picture *pic = info->pic; vector2d_t orig = info->origin; const int width = info->width; const int height = info->height; + const int internal_width = ((width + 7) >> 3) << 3; // Round up to closest 8 + const int internal_height = ((height + 7) >> 3) << 3; const encoder_state_t *state = info->state; int fme_level = state->encoder_control->cfg.fme_level; + int8_t sample_off_x = 0; + int8_t sample_off_y = 0; kvz_get_extended_block(orig.x, orig.y, mv.x - 1, mv.y - 1, state->tile->offset_x, state->tile->offset_y, - ref->y, ref->width, ref->height, FILTER_SIZE, - width+1, height+1, + ref->y, ref->width, ref->height, KVZ_LUMA_FILTER_TAPS, + internal_width+1, internal_height+1, &src); - kvz_filter_frac_blocks_luma(state->encoder_control, - src.orig_topleft, - src.stride, - width, - height, - fracpel_blocks, - fme_level); - - kvz_pixel tmp_pic[LCU_WIDTH*LCU_WIDTH]; - kvz_pixels_blit(pic->y + orig.y * pic->stride + orig.x, - tmp_pic, - width, - height, - pic->stride, - width); - + kvz_pixel *tmp_pic = pic->y + orig.y * pic->stride + orig.x; + int tmp_stride = pic->stride; + // Search integer position costs[0] = kvz_satd_any_size(width, height, - tmp_pic, width, + tmp_pic, tmp_stride, src.orig_topleft + src.stride + 1, src.stride); costs[0] += info->mvd_cost_func(state, @@ -900,31 +1037,51 @@ &bitcosts[0]); best_cost = costs[0]; best_bitcost = bitcosts[0]; - - int last_hpel_index = (fme_level == 1) ? 
4 : 8; - + //Set mv to half-pixel precision mv.x *= 2; mv.y *= 2; + ipol_blocks_func * filter_steps[4] = { + kvz_filter_hpel_blocks_hor_ver_luma, + kvz_filter_hpel_blocks_diag_luma, + kvz_filter_qpel_blocks_hor_ver_luma, + kvz_filter_qpel_blocks_diag_luma, + }; + // Search halfpel positions around best integer mv - for (int i = 1; i <= last_hpel_index; i += 4) { + int i = 1; + for (int step = 0; step < fme_level; ++step){ + + const int mv_shift = (step < 2) ? 1 : 0; + + filter_steps[step](state->encoder_control, + src.orig_topleft, + src.stride, + internal_width, + internal_height, + filtered, + intermediate, + fme_level, + hor_first_cols, + sample_off_x, + sample_off_y); + const vector2d_t *pattern[4] = { &square[i], &square[i + 1], &square[i + 2], &square[i + 3] }; int8_t within_tile[4]; for (int j = 0; j < 4; j++) { within_tile[j] = - fracmv_within_tile(info, (mv.x + pattern[j]->x) * 2, (mv.y + pattern[j]->y) * 2); + fracmv_within_tile(info, (mv.x + pattern[j]->x) * (1 << mv_shift), (mv.y + pattern[j]->y) * (1 << mv_shift)); }; - int hpel_strides[4] = { - (LCU_WIDTH + 1), - (LCU_WIDTH + 1), - (LCU_WIDTH + 1), - (LCU_WIDTH + 1) - }; + kvz_pixel *filtered_pos[4] = { 0 }; + filtered_pos[0] = &filtered[0][0]; + filtered_pos[1] = &filtered[1][0]; + filtered_pos[2] = &filtered[2][0]; + filtered_pos[3] = &filtered[3][0]; - kvz_satd_any_size_quad(width, height, (const kvz_pixel**)(hpel_pos + i - 1), hpel_strides, tmp_pic, width, 4, costs, within_tile); + kvz_satd_any_size_quad(width, height, (const kvz_pixel **)filtered_pos, LCU_WIDTH, tmp_pic, tmp_stride, 4, costs, within_tile); for (int j = 0; j < 4; j++) { if (within_tile[j]) { @@ -932,7 +1089,7 @@ state, mv.x + pattern[j]->x, mv.y + pattern[j]->y, - 1, + mv_shift, info->mv_cand, info->merge_cand, info->num_merge_cand, @@ -949,108 +1106,26 @@ best_index = i + j; } } - } - - unsigned int best_hpel_index = best_index; - // Move search to best_index - mv.x += square[best_index].x; - mv.y += square[best_index].y; - - //Set mv to quarterpel precision - mv.x *= 2; - mv.y *= 2; - - if (fme_level >= 3) { - - best_index = 0; - - int last_qpel_index = (fme_level == 3) ? 
4 : 8; - - //Search quarterpel points around best halfpel mv - for (int i = 1; i <= last_qpel_index; i += 4) { - const vector2d_t *pattern[4] = { &square[i], &square[i + 1], &square[i + 2], &square[i + 3] }; - - int8_t within_tile[4]; - for (int j = 0; j < 4; j++) { - within_tile[j] = - fracmv_within_tile(info, mv.x + pattern[j]->x, mv.y + pattern[j]->y); - } - - int qpel_indices[4] = { 0 }; - int int_offset_x[4] = { 0 }; - int int_offset_y[4] = { 0 }; - - for (int j = 0; j < 4; ++j) { - int hpel_offset_x = square[best_hpel_index].x; - int hpel_offset_y = square[best_hpel_index].y; - - int qpel_offset_x = 2 * hpel_offset_x + pattern[j]->x; - int qpel_offset_y = 2 * hpel_offset_y + pattern[j]->y; - - unsigned qpel_filter_x = (qpel_offset_x + 4) % 4; - unsigned qpel_filter_y = (qpel_offset_y + 4) % 4; - - // The first value (-1) is for the integer position and - // it will not be used - int filters_to_block_idx[4][4] = { - { -1, 3, 0, 4 }, - { 7, 11, 8, 12 }, - { 1, 5, 2, 6 }, - { 9, 13, 10, 14 } - }; - - qpel_indices[j] = filters_to_block_idx[qpel_filter_y][qpel_filter_x]; - - // Select values filtered from correct integer samples - int_offset_x[j] = qpel_offset_x >= 0; - int_offset_y[j] = qpel_offset_y >= 0; - } - - kvz_pixel *qpel_pos[4] = { - fracpel_blocks[qpel_indices[0]] + int_offset_y[0] * (LCU_WIDTH + 1) + int_offset_x[0], - fracpel_blocks[qpel_indices[1]] + int_offset_y[1] * (LCU_WIDTH + 1) + int_offset_x[1], - fracpel_blocks[qpel_indices[2]] + int_offset_y[2] * (LCU_WIDTH + 1) + int_offset_x[2], - fracpel_blocks[qpel_indices[3]] + int_offset_y[3] * (LCU_WIDTH + 1) + int_offset_x[3] - }; - - int qpel_strides[4] = { - (LCU_WIDTH + 1), - (LCU_WIDTH + 1), - (LCU_WIDTH + 1), - (LCU_WIDTH + 1) - }; - - kvz_satd_any_size_quad(width, height, (const kvz_pixel**)qpel_pos, qpel_strides, tmp_pic, width, 4, costs, within_tile); - - for (int j = 0; j < 4; j++) { - if (within_tile[j]) { - costs[j] += info->mvd_cost_func( - state, - mv.x + pattern[j]->x, - mv.y + pattern[j]->y, - 0, - info->mv_cand, - info->merge_cand, - info->num_merge_cand, - info->ref_idx, - &bitcosts[j] - ); - } - } - - for (int j = 0; j < 4; ++j) { - if (within_tile[j] && costs[j] < best_cost) { - best_cost = costs[j]; - best_bitcost = bitcosts[j]; - best_index = i + j; - } + i += 4; + + // Update mv for the best position on current precision + if (step == 1 || step == fme_level - 1) { + // Move search to best_index + mv.x += square[best_index].x; + mv.y += square[best_index].y; + + // On last hpel step... + if (step == MIN(fme_level - 1, 1)) { + //Set mv to quarterpel precision + mv.x *= 2; + mv.y *= 2; + sample_off_x = square[best_index].x; + sample_off_y = square[best_index].y; + best_index = 0; + i = 1; } } - - //Set mv to best final best match - mv.x += square[best_index].x; - mv.y += square[best_index].y; } info->best_mv = mv; @@ -1162,8 +1237,12 @@ search_mv_full(info, search_range, mv); break; + case KVZ_IME_DIA: + diamond_search(info, mv, info->state->encoder_control->cfg.me_max_steps); + break; + default: - hexagon_search(info, mv); + hexagon_search(info, mv, info->state->encoder_control->cfg.me_max_steps); break; } @@ -1203,30 +1282,9 @@ // Only check when candidates are different int cu_mv_cand = 0; - if (!merged && ( - info->mv_cand[0][0] != info->mv_cand[1][0] || - info->mv_cand[0][1] != info->mv_cand[1][1])) - { - uint32_t (*mvd_coding_cost)(const encoder_state_t * const state, - vector2d_t *, - const cabac_data_t*) = - cfg->mv_rdo ? 
kvz_get_mvd_coding_cost_cabac : get_mvd_coding_cost; - - vector2d_t mvd_temp1, mvd_temp2; - int cand1_cost,cand2_cost; - - mvd_temp1.x = mv.x - info->mv_cand[0][0]; - mvd_temp1.y = mv.y - info->mv_cand[0][1]; - cand1_cost = mvd_coding_cost(info->state, &mvd_temp1, &info->state->cabac); - - mvd_temp2.x = mv.x - info->mv_cand[1][0]; - mvd_temp2.y = mv.y - info->mv_cand[1][1]; - cand2_cost = mvd_coding_cost(info->state, &mvd_temp2, &info->state->cabac); - - // Select candidate 1 if it has lower cost - if (cand2_cost < cand1_cost) { - cu_mv_cand = 1; - } + if (!merged) { + cu_mv_cand = + select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL); } if (info->best_cost < *inter_cost) { @@ -1249,6 +1307,141 @@ /** + * \brief Search bipred modes for a PU. + */ +static void search_pu_inter_bipred(inter_search_info_t *info, + int depth, + lcu_t *lcu, cu_info_t *cur_cu, + double *inter_cost, + uint32_t *inter_bitcost) +{ + const image_list_t *const ref = info->state->frame->ref; + uint8_t (*ref_LX)[16] = info->state->frame->ref_LX; + const videoframe_t * const frame = info->state->tile->frame; + const int x = info->origin.x; + const int y = info->origin.y; + const int width = info->width; + const int height = info->height; + + static const uint8_t priorityList0[] = { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 }; + static const uint8_t priorityList1[] = { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 }; + const unsigned num_cand_pairs = + MIN(info->num_merge_cand * (info->num_merge_cand - 1), 12); + + inter_merge_cand_t *merge_cand = info->merge_cand; + + for (int32_t idx = 0; idx < num_cand_pairs; idx++) { + uint8_t i = priorityList0[idx]; + uint8_t j = priorityList1[idx]; + if (i >= info->num_merge_cand || j >= info->num_merge_cand) break; + + // Find one L0 and L1 candidate according to the priority list + if (!(merge_cand[i].dir & 0x1) || !(merge_cand[j].dir & 0x2)) continue; + + if (ref_LX[0][merge_cand[i].ref[0]] == ref_LX[1][merge_cand[j].ref[1]] && + merge_cand[i].mv[0][0] == merge_cand[j].mv[1][0] && + merge_cand[i].mv[0][1] == merge_cand[j].mv[1][1]) + { + continue; + } + + int16_t mv[2][2]; + mv[0][0] = merge_cand[i].mv[0][0]; + mv[0][1] = merge_cand[i].mv[0][1]; + mv[1][0] = merge_cand[j].mv[1][0]; + mv[1][1] = merge_cand[j].mv[1][1]; + + // Don't try merge candidates that don't satisfy mv constraints. 
+ if (!fracmv_within_tile(info, mv[0][0], mv[0][1]) || + !fracmv_within_tile(info, mv[1][0], mv[1][1])) + { + continue; + } + + kvz_inter_recon_bipred(info->state, + ref->images[ref_LX[0][merge_cand[i].ref[0]]], + ref->images[ref_LX[1][merge_cand[j].ref[1]]], + x, y, + width, + height, + mv, + lcu); + + const kvz_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; + const kvz_pixel *src = &frame->source->y[x + y * frame->source->width]; + uint32_t cost = + kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, frame->source->width); + + uint32_t bitcost[2] = { 0, 0 }; + + cost += info->mvd_cost_func(info->state, + merge_cand[i].mv[0][0], + merge_cand[i].mv[0][1], + 0, + info->mv_cand, + NULL, 0, 0, + &bitcost[0]); + cost += info->mvd_cost_func(info->state, + merge_cand[i].mv[1][0], + merge_cand[i].mv[1][1], + 0, + info->mv_cand, + NULL, 0, 0, + &bitcost[1]); + + const uint8_t mv_ref_coded[2] = { + merge_cand[i].ref[0], + merge_cand[j].ref[1] + }; + const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; + cost += info->state->lambda_sqrt * extra_bits + 0.5; + + if (cost < *inter_cost) { + cur_cu->inter.mv_dir = 3; + + cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0]; + cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1]; + + cur_cu->inter.mv[0][0] = merge_cand[i].mv[0][0]; + cur_cu->inter.mv[0][1] = merge_cand[i].mv[0][1]; + cur_cu->inter.mv[1][0] = merge_cand[j].mv[1][0]; + cur_cu->inter.mv[1][1] = merge_cand[j].mv[1][1]; + cur_cu->merged = 0; + + // Check every candidate to find a match + for (int merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) { + if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] && + merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] && + merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] && + merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] && + merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] && + merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) + { + cur_cu->merged = 1; + cur_cu->merge_idx = merge_idx; + break; + } + } + + // Each motion vector has its own candidate + for (int reflist = 0; reflist < 2; reflist++) { + kvz_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, cur_cu, lcu, reflist); + int cu_mv_cand = select_mv_cand( + info->state, + info->mv_cand, + cur_cu->inter.mv[reflist][0], + cur_cu->inter.mv[reflist][1], + NULL); + CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand); + } + + *inter_cost = cost; + *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits; + } + } +} + +/** * \brief Update PU to have best modes at this depth. * * \param state encoder state @@ -1300,6 +1493,7 @@ .width = width, .height = height, .mvd_cost_func = cfg->mv_rdo ? 
kvz_calc_mvd_cost_cabac : calc_mvd_cost, + .optimized_sad = kvz_get_optimized_sad(width), }; // Search for merge mode candidates @@ -1316,6 +1510,90 @@ CU_SET_MV_CAND(cur_cu, 0, 0); CU_SET_MV_CAND(cur_cu, 1, 0); + // Early Skip Mode Decision + if (cfg->early_skip && cur_cu->part_size == SIZE_2Nx2N) { + + int num_rdo_cands = 0; + int8_t mrg_cands[MRG_MAX_NUM_CANDS] = { 0, 1, 2, 3, 4 }; + double mrg_costs[MRG_MAX_NUM_CANDS] = { MAX_DOUBLE }; + + // Check motion vector constraints and perform rough search + for (int merge_idx = 0; merge_idx < info.num_merge_cand; ++merge_idx) { + + cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir; + cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; + cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; + cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; + cur_cu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1]; + cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; + cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; + + // Don't try merge candidates that don't satisfy mv constraints. + if (!fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]) || + !fracmv_within_tile(&info, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1])) + { + continue; + } + + if (cfg->rdo >= 2) { + + kvz_lcu_fill_trdepth(lcu, x, y, depth, depth); + kvz_inter_recon_cu(state, lcu, x, y, width); + mrg_costs[merge_idx] = kvz_satd_any_size(width, height, + lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, + lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); + } + + num_rdo_cands++; + } + + + if (cfg->rdo >= 2) { + // Sort candidates by cost + kvz_sort_modes(mrg_cands, mrg_costs, num_rdo_cands); + } + + // Limit by availability + // TODO: Do not limit to just 1 + num_rdo_cands = MIN(1, num_rdo_cands); + + // RDO search + for (int merge_rdo_idx = 0; merge_rdo_idx < num_rdo_cands; ++merge_rdo_idx) { + + // Reconstruct blocks with merge candidate. + // Check luma CBF. Then, check chroma CBFs if luma CBF is not set + // and chroma exists. + // Early terminate if merge candidate with zero CBF is found. 
+ int merge_idx = mrg_cands[merge_rdo_idx]; + cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir; + cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; + cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; + cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; + cur_cu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1]; + cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; + cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; + kvz_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); + kvz_inter_recon_cu(state, lcu, x, y, width); + kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_cu, lcu); + + if (cbf_is_set(cur_cu->cbf, depth, COLOR_Y)) { + continue; + } + else if(state->encoder_control->chroma_format != KVZ_CSP_400) { + + kvz_quantize_lcu_residual(state, false, true, x, y, depth, cur_cu, lcu); + if (!cbf_is_set_any(cur_cu->cbf, depth)) { + cur_cu->type = CU_INTER; + cur_cu->merge_idx = merge_idx; + cur_cu->skipped = true; + *inter_cost = 0.0; // TODO: Check this + *inter_bitcost = 0; // TODO: Check this + return; + } + } + } + } + for (int ref_idx = 0; ref_idx < state->frame->ref->used_size; ref_idx++) { info.ref_idx = ref_idx; info.ref = state->frame->ref->images[ref_idx]; @@ -1329,159 +1607,7 @@ && width + height >= 16; // 4x8 and 8x4 PBs are restricted to unipred if (can_use_bipred) { - lcu_t *templcu = MALLOC(lcu_t, 1); - unsigned cu_width = LCU_WIDTH >> depth; - static const uint8_t priorityList0[] = { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 }; - static const uint8_t priorityList1[] = { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 }; - const unsigned num_cand_pairs = - MIN(info.num_merge_cand * (info.num_merge_cand - 1), 12); - - inter_merge_cand_t *merge_cand = info.merge_cand; - - for (int32_t idx = 0; idx < num_cand_pairs; idx++) { - uint8_t i = priorityList0[idx]; - uint8_t j = priorityList1[idx]; - if (i >= info.num_merge_cand || j >= info.num_merge_cand) break; - - // Find one L0 and L1 candidate according to the priority list - if ((merge_cand[i].dir & 0x1) && (merge_cand[j].dir & 0x2)) { - if (state->frame->ref_LX[0][merge_cand[i].ref[0]] != - state->frame->ref_LX[1][merge_cand[j].ref[1]] || - - merge_cand[i].mv[0][0] != merge_cand[j].mv[1][0] || - merge_cand[i].mv[0][1] != merge_cand[j].mv[1][1]) - { - uint32_t bitcost[2]; - uint32_t cost = 0; - int8_t cu_mv_cand = 0; - int16_t mv[2][2]; - kvz_pixel tmp_block[64 * 64]; - kvz_pixel tmp_pic[64 * 64]; - - mv[0][0] = merge_cand[i].mv[0][0]; - mv[0][1] = merge_cand[i].mv[0][1]; - mv[1][0] = merge_cand[j].mv[1][0]; - mv[1][1] = merge_cand[j].mv[1][1]; - - // Don't try merge candidates that don't satisfy mv constraints. 
- if (!fracmv_within_tile(&info, mv[0][0], mv[0][1]) || - !fracmv_within_tile(&info, mv[1][0], mv[1][1])) - { - continue; - } - - kvz_inter_recon_lcu_bipred(state, - state->frame->ref->images[ - state->frame->ref_LX[0][merge_cand[i].ref[0]] - ], - state->frame->ref->images[ - state->frame->ref_LX[1][merge_cand[j].ref[1]] - ], - x, y, - width, - height, - mv, - templcu); - - for (int ypos = 0; ypos < height; ++ypos) { - int dst_y = ypos * width; - for (int xpos = 0; xpos < width; ++xpos) { - tmp_block[dst_y + xpos] = templcu->rec.y[ - SUB_SCU(y + ypos) * LCU_WIDTH + SUB_SCU(x + xpos)]; - tmp_pic[dst_y + xpos] = frame->source->y[x + xpos + (y + ypos)*frame->source->width]; - } - } - - cost = kvz_satd_any_size(cu_width, cu_width, tmp_pic, cu_width, tmp_block, cu_width); - - cost += info.mvd_cost_func(state, - merge_cand[i].mv[0][0], - merge_cand[i].mv[0][1], - 0, - info.mv_cand, - NULL, 0, 0, - &bitcost[0]); - cost += info.mvd_cost_func(state, - merge_cand[i].mv[1][0], - merge_cand[i].mv[1][1], - 0, - info.mv_cand, - NULL, 0, 0, - &bitcost[1]); - - const uint8_t mv_ref_coded[2] = { - merge_cand[i].ref[0], - merge_cand[j].ref[1] - }; - const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; - cost += state->lambda_sqrt * extra_bits + 0.5; - - - if (cost < *inter_cost) { - cur_cu->inter.mv_dir = 3; - - cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0]; - cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1]; - - cur_cu->inter.mv[0][0] = merge_cand[i].mv[0][0]; - cur_cu->inter.mv[0][1] = merge_cand[i].mv[0][1]; - cur_cu->inter.mv[1][0] = merge_cand[j].mv[1][0]; - cur_cu->inter.mv[1][1] = merge_cand[j].mv[1][1]; - cur_cu->merged = 0; - - // Check every candidate to find a match - for (int merge_idx = 0; merge_idx < info.num_merge_cand; merge_idx++) { - if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] && - merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] && - merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] && - merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] && - merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] && - merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) - { - cur_cu->merged = 1; - cur_cu->merge_idx = merge_idx; - break; - } - } - - // Each motion vector has its own candidate - for (int reflist = 0; reflist < 2; reflist++) { - cu_mv_cand = 0; - kvz_inter_get_mv_cand(state, x, y, width, height, info.mv_cand, cur_cu, lcu, reflist); - if (info.mv_cand[0][0] != info.mv_cand[1][0] || - info.mv_cand[0][1] != info.mv_cand[1][1]) - { - uint32_t (*mvd_coding_cost)(const encoder_state_t * const state, - vector2d_t *, - const cabac_data_t*) = - cfg->mv_rdo ? 
kvz_get_mvd_coding_cost_cabac : get_mvd_coding_cost; - - vector2d_t mvd_temp1, mvd_temp2; - int cand1_cost, cand2_cost; - - mvd_temp1.x = cur_cu->inter.mv[reflist][0] - info.mv_cand[0][0]; - mvd_temp1.y = cur_cu->inter.mv[reflist][1] - info.mv_cand[0][1]; - cand1_cost = mvd_coding_cost(state, &mvd_temp1, (cabac_data_t*)&state->cabac); - - mvd_temp2.x = cur_cu->inter.mv[reflist][0] - info.mv_cand[1][0]; - mvd_temp2.y = cur_cu->inter.mv[reflist][1] - info.mv_cand[1][1]; - cand2_cost = mvd_coding_cost(state, &mvd_temp2, (cabac_data_t*)&state->cabac); - - // Select candidate 1 if it has lower cost - if (cand2_cost < cand1_cost) { - cu_mv_cand = 1; - } - } - CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand); - } - - *inter_cost = cost; - *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits; - } - } - } - } - FREE_POINTER(templcu); + search_pu_inter_bipred(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost); } if (*inter_cost < INT_MAX && cur_cu->inter.mv_dir == 1) { @@ -1489,6 +1615,53 @@ } } +/** +* \brief Calculate inter coding cost for luma and chroma CBs (--rd=2 accuracy). +* +* Calculate inter coding cost of each CB. This should match the intra coding cost +* calculation that is used on this RDO accuracy, since CU type decision is based +* on this. +* +* The cost includes SSD distortion, transform unit tree bits and motion vector bits +* for both luma and chroma if enabled. +* +* \param state encoder state +* \param x x-coordinate of the CU +* \param y y-coordinate of the CU +* \param depth depth of the CU in the quadtree +* \param lcu containing LCU +* +* \param inter_cost Return inter cost +* \param inter_bitcost Return inter bitcost +*/ +void kvz_cu_cost_inter_rd2(encoder_state_t * const state, + int x, int y, int depth, + lcu_t *lcu, + double *inter_cost, + uint32_t *inter_bitcost){ + + cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + int tr_depth = MAX(1, depth); + if (cur_cu->part_size != SIZE_2Nx2N) { + tr_depth = depth + 1; + } + kvz_lcu_fill_trdepth(lcu, x, y, depth, tr_depth); + kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth)); + + const bool reconstruct_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; + kvz_quantize_lcu_residual(state, true, reconstruct_chroma, + x, y, depth, + NULL, + lcu); + + *inter_cost = kvz_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu); + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + *inter_cost += kvz_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu); + } + + *inter_cost += *inter_bitcost * state->lambda; +} + /** * \brief Update CU to have best modes at this depth. @@ -1516,6 +1689,15 @@ lcu, inter_cost, inter_bitcost); + + // Calculate more accurate cost when needed + if (state->encoder_control->cfg.rdo >= 2) { + kvz_cu_cost_inter_rd2(state, + x, y, depth, + lcu, + inter_cost, + inter_bitcost); + } } @@ -1560,6 +1742,7 @@ cur_pu->type = CU_INTER; cur_pu->part_size = part_mode; cur_pu->depth = depth; + cur_pu->qp = state->qp; double cost = MAX_INT; uint32_t bitcost = MAX_INT; @@ -1584,4 +1767,28 @@ } } } + + // Calculate more accurate cost when needed + if (state->encoder_control->cfg.rdo >= 2) { + kvz_cu_cost_inter_rd2(state, + x, y, depth, + lcu, + inter_cost, + inter_bitcost); + } + + // Count bits spent for coding the partition mode. 
+ int smp_extra_bits = 1; // horizontal or vertical + if (state->encoder_control->cfg.amp_enable) { + smp_extra_bits += 1; // symmetric or asymmetric + if (part_mode != SIZE_2NxN && part_mode != SIZE_Nx2N) { + smp_extra_bits += 1; // U,L or D,R + } + } + // The transform is split for SMP and AMP blocks so we need more bits for + // coding the CBF. + smp_extra_bits += 6; + + *inter_cost += (state->encoder_control->cfg.rdo >= 2 ? state->lambda : state->lambda_sqrt) * smp_extra_bits; + *inter_bitcost += smp_extra_bits; }
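Note on the new --me=dia path above: diamond_search() walks a four-point diamond around the current best vector, skips the direction it arrived from, and stops either when the center turns out to be the local minimum or when the --me-steps budget runs out. The stand-alone sketch below restates only that control flow; the vec2 type and the cost callback are illustrative stand-ins, and the real function additionally seeds the search from merge candidates and applies early termination exactly as shown in the diff.

#include <stdint.h>

typedef struct { int x, y; } vec2;
typedef uint32_t (*cost_fn)(vec2 mv, void *ctx);

/* Simplified diamond walk: returns the best motion vector found. */
static vec2 diamond_walk_sketch(vec2 mv, uint32_t max_steps,
                                cost_fn cost, void *ctx)
{
  /* Up, right, down, left; opposite[i] is the geometric opposite of i,
   * i.e. the point that was the previous center and needs no re-check. */
  static const vec2 offsets[4]  = { {0, -1}, {1, 0}, {0, 1}, {-1, 0} };
  static const int  opposite[4] = { 2, 3, 0, 1 };

  uint32_t best_cost = cost(mv, ctx);
  int from_dir = -1;  /* no arrival direction on the first iteration */

  for (uint32_t step = 0; step < max_steps; ++step) {
    int best_dir = -1;

    for (int i = 0; i < 4; ++i) {
      if (i == from_dir) continue;  /* evaluated already as the old center */
      vec2 cand = { mv.x + offsets[i].x, mv.y + offsets[i].y };
      uint32_t c = cost(cand, ctx);
      if (c < best_cost) {
        best_cost = c;
        best_dir = i;
      }
    }

    if (best_dir < 0) break;        /* center is the local minimum, done */

    mv.x += offsets[best_dir].x;    /* move the center to the best match */
    mv.y += offsets[best_dir].y;
    from_dir = opposite[best_dir];
  }
  return mv;
}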
View file
kvazaar-1.2.0.tar.gz/src/search_inter.h -> kvazaar-1.3.0.tar.gz/src/search_inter.h
Changed
@@ -32,17 +32,19 @@ #include "inter.h" #include "kvazaar.h" -#define FILTER_SIZE 8 -#define HALF_FILTER (FILTER_SIZE>>1) +#define KVZ_LUMA_FILTER_TAPS 8 +#define KVZ_LUMA_FILTER_OFFSET 3 +#define KVZ_CHROMA_FILTER_TAPS 4 +#define KVZ_CHROMA_FILTER_OFFSET 1 -// Maximum extra width a block needs to filter -// a fractional pixel with positive fractional mv.x and mv.y -#define KVZ_EXT_PADDING (FILTER_SIZE - 1) + // Maximum extra width a block needs to filter + // a fractional pixel with positive fractional mv.x and mv.y +#define KVZ_EXT_PADDING_LUMA (KVZ_LUMA_FILTER_TAPS - 1) +#define KVZ_EXT_PADDING_CHROMA (KVZ_CHROMA_FILTER_TAPS - 1) -// Maximum block width for extended block -#define KVZ_EXT_BLOCK_W (LCU_WIDTH + KVZ_EXT_PADDING) - -typedef kvz_pixel frac_search_block[(LCU_WIDTH + 1) * (LCU_WIDTH + 1)]; + // Maximum block width for extended block +#define KVZ_EXT_BLOCK_W_LUMA (LCU_WIDTH + KVZ_EXT_PADDING_LUMA) +#define KVZ_EXT_BLOCK_W_CHROMA (LCU_WIDTH_C + KVZ_EXT_PADDING_CHROMA) enum hpel_position { HPEL_POS_HOR = 0,
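For reference, the renamed constants above encode a simple margin rule: an N-tap interpolation filter needs N - 1 extra integer samples per dimension (KVZ_LUMA_FILTER_OFFSET of them above and to the left, the remaining taps - 1 - offset below and to the right). A minimal worked example, assuming kvazaar's LCU_WIDTH of 64 and LCU_WIDTH_C of 32:

#include <assert.h>

#define LCU_WIDTH                64
#define LCU_WIDTH_C              (LCU_WIDTH / 2)
#define KVZ_LUMA_FILTER_TAPS     8
#define KVZ_LUMA_FILTER_OFFSET   3
#define KVZ_CHROMA_FILTER_TAPS   4
#define KVZ_CHROMA_FILTER_OFFSET 1
#define KVZ_EXT_PADDING_LUMA     (KVZ_LUMA_FILTER_TAPS - 1)
#define KVZ_EXT_PADDING_CHROMA   (KVZ_CHROMA_FILTER_TAPS - 1)
#define KVZ_EXT_BLOCK_W_LUMA     (LCU_WIDTH + KVZ_EXT_PADDING_LUMA)
#define KVZ_EXT_BLOCK_W_CHROMA   (LCU_WIDTH_C + KVZ_EXT_PADDING_CHROMA)

int main(void)
{
  /* 8-tap luma: 3 samples before + 4 after = 7 extra columns/rows. */
  assert(KVZ_EXT_BLOCK_W_LUMA   == 64 + 7);  /* 71 */
  /* 4-tap chroma: 1 before + 2 after = 3 extra columns/rows. */
  assert(KVZ_EXT_BLOCK_W_CHROMA == 32 + 3);  /* 35 */
  return 0;
}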
View file
kvazaar-1.2.0.tar.gz/src/search_intra.c -> kvazaar-1.3.0.tar.gz/src/search_intra.c
Changed
@@ -42,29 +42,6 @@ /** - * \brief Sort modes and costs to ascending order according to costs. - */ -static INLINE void sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length) -{ - // Length is always between 5 and 23, and is either 21, 17, 9 or 8 about - // 60% of the time, so there should be no need for anything more complex - // than insertion sort. - for (uint8_t i = 1; i < length; ++i) { - const double cur_cost = costs[i]; - const int8_t cur_mode = modes[i]; - uint8_t j = i; - while (j > 0 && cur_cost < costs[j - 1]) { - costs[j] = costs[j - 1]; - modes[j] = modes[j - 1]; - --j; - } - costs[j] = cur_cost; - modes[j] = cur_mode; - } -} - - -/** * \brief Select mode with the smallest cost. */ static INLINE uint8_t select_best_mode_index(const int8_t *modes, const double *costs, uint8_t length) @@ -309,7 +286,7 @@ if (depth == 0 || split_cost < nosplit_cost) { return split_cost; } else { - kvz_lcu_set_trdepth(lcu, x_px, y_px, depth, depth); + kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); pred_cu->cbf = nosplit_cbf; @@ -367,7 +344,7 @@ costs[i] += satd_func(pred, orig_block); } - sort_modes(modes, costs, 5); + kvz_sort_modes(modes, costs, 5); } @@ -617,12 +594,21 @@ FILL(pred_cu.cbf, 0); // Reset transform split data in lcu.cu for this area. - kvz_lcu_set_trdepth(lcu, x_px, y_px, depth, depth); + kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu); costs[rdo_mode] += mode_cost; + + // Early termination if no coefficients has to be coded + if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(pred_cu.cbf, depth)) { + modes_to_check = rdo_mode + 1; + break; + } } + // Update order according to new costs + kvz_sort_modes(modes, costs, modes_to_check); + // The best transform split hierarchy is not saved anywhere, so to get the // transform split hierarchy the search has to be performed again with the // best mode. @@ -844,8 +830,7 @@ } // Set transform depth to current depth, meaning no transform splits. - kvz_lcu_set_trdepth(lcu, x_px, y_px, depth, depth); - double best_rough_cost = costs[select_best_mode_index(modes, costs, number_of_modes)]; + kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); // Refine results with slower search or get some results if rough search was skipped. const int32_t rdo_level = state->encoder_control->cfg.rdo; if (rdo_level >= 2 || skip_rough_search) { @@ -860,7 +845,7 @@ } int num_modes_to_check = MIN(number_of_modes, number_of_modes_to_search); - sort_modes(modes, costs, number_of_modes); + kvz_sort_modes(modes, costs, number_of_modes); number_of_modes = search_intra_rdo(state, x_px, y_px, depth, ref_pixels, LCU_WIDTH, @@ -872,5 +857,5 @@ uint8_t best_mode_i = select_best_mode_index(modes, costs, number_of_modes); *mode_out = modes[best_mode_i]; - *cost_out = skip_rough_search ? costs[best_mode_i]:best_rough_cost; + *cost_out = costs[best_mode_i]; }
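The static sort_modes() removed above is not lost: the calls now go through a shared kvz_sort_modes() helper, which also lets the RDO loop re-sort the candidates after the new --intra-rdo-et early termination. For reference, the removed insertion sort in stand-alone form (the shared helper is assumed to behave the same way):

#include <stdint.h>

/* Sort modes and costs to ascending order according to costs. */
static void sort_modes_by_cost(int8_t *modes, double *costs, uint8_t length)
{
  /* The arrays hold at most about 23 entries, so insertion sort is
   * sufficient (same rationale as the original comment). */
  for (uint8_t i = 1; i < length; ++i) {
    const double cur_cost = costs[i];
    const int8_t cur_mode = modes[i];
    uint8_t j = i;
    while (j > 0 && cur_cost < costs[j - 1]) {
      costs[j] = costs[j - 1];
      modes[j] = modes[j - 1];
      --j;
    }
    costs[j] = cur_cost;
    modes[j] = cur_mode;
  }
}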
View file
kvazaar-1.3.0.tar.gz/src/strategies/avx2/avx2_common_functions.h
Added
@@ -0,0 +1,114 @@ +#ifndef AVX2_COMMON_FUNCTIONS_H +#define AVX2_COMMON_FUNCTIONS_H + +#include <immintrin.h> + +/* + * Reorder coefficients from raster to scan order + * Fun fact: Once upon a time, doing this in a loop looked like this: + * for (int32_t n = 0; n < width * height; n++) { + * coef_reord[n] = coef[scan[n]]; + * q_coef_reord[n] = q_coef[scan[n]]; + * } + */ +static INLINE void scanord_read_vector(const int16_t **__restrict coeffs, const uint32_t *__restrict scan, int8_t scan_mode, int32_t subpos, int32_t width, __m256i *result_vecs, const int n_bufs) +{ + // For vectorized reordering of coef and q_coef + const __m128i low128_shuffle_masks[3] = { + _mm_setr_epi8(10,11, 4, 5, 12,13, 0, 1, 6, 7, 14,15, 8, 9, 2, 3), + _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15), + _mm_setr_epi8( 4, 5, 6, 7, 0, 1, 2, 3, 12,13, 14,15, 8, 9, 10,11), + }; + + const __m128i blend_masks[3] = { + _mm_setr_epi16( 0, 0, 0, -1, 0, 0, -1, -1), + _mm_setr_epi16( 0, 0, 0, 0, 0, 0, 0, 0), + _mm_setr_epi16( 0, 0, -1, -1, 0, 0, -1, -1), + }; + + const __m128i invec_rearr_masks_upper[3] = { + _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 6, 7, 10,11, 4, 5, 12,13, 14,15), + _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15), + _mm_setr_epi8( 0, 1, 8, 9, 4, 5, 12,13, 2, 3, 10,11, 6, 7, 14,15), + }; + + const __m128i invec_rearr_masks_lower[3] = { + _mm_setr_epi8(12,13, 6, 7, 0, 1, 2, 3, 14,15, 4, 5, 8, 9, 10,11), + _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15), + _mm_setr_epi8( 4, 5, 12,13, 0, 1, 8, 9, 6, 7, 14,15, 2, 3, 10,11), + }; + + const size_t row_offsets[4] = { + scan[subpos] + width * 0, + scan[subpos] + width * 1, + scan[subpos] + width * 2, + scan[subpos] + width * 3, + }; + + for (int i = 0; i < n_bufs; i++) { + const int16_t *__restrict coeff = coeffs[i]; + + // NOTE: Upper means "higher in pixel order inside block", which implies + // lower addresses (note the difference: HIGH and LOW vs UPPER and LOWER), + // so upper 128b vector actually becomes the lower part of a 256-bit coeff + // vector and lower vector the higher part! + __m128d coeffs_d_upper; + __m128d coeffs_d_lower; + + __m128i coeffs_upper; + __m128i coeffs_lower; + + __m128i coeffs_rearr1_upper; + __m128i coeffs_rearr1_lower; + + __m128i coeffs_rearr2_upper; + __m128i coeffs_rearr2_lower; + + // Zeroing these is actually unnecessary, but the compiler will whine + // about uninitialized values otherwise + coeffs_d_upper = _mm_setzero_pd(); + coeffs_d_lower = _mm_setzero_pd(); + + coeffs_d_upper = _mm_loadl_pd(coeffs_d_upper, (double *)(coeff + row_offsets[0])); + coeffs_d_upper = _mm_loadh_pd(coeffs_d_upper, (double *)(coeff + row_offsets[1])); + + coeffs_d_lower = _mm_loadl_pd(coeffs_d_lower, (double *)(coeff + row_offsets[2])); + coeffs_d_lower = _mm_loadh_pd(coeffs_d_lower, (double *)(coeff + row_offsets[3])); + + coeffs_upper = _mm_castpd_si128(coeffs_d_upper); + coeffs_lower = _mm_castpd_si128(coeffs_d_lower); + + coeffs_lower = _mm_shuffle_epi8(coeffs_lower, low128_shuffle_masks[scan_mode]); + + coeffs_rearr1_upper = _mm_blendv_epi8(coeffs_upper, coeffs_lower, blend_masks[scan_mode]); + coeffs_rearr1_lower = _mm_blendv_epi8(coeffs_lower, coeffs_upper, blend_masks[scan_mode]); + + coeffs_rearr2_upper = _mm_shuffle_epi8(coeffs_rearr1_upper, invec_rearr_masks_upper[scan_mode]); + coeffs_rearr2_lower = _mm_shuffle_epi8(coeffs_rearr1_lower, invec_rearr_masks_lower[scan_mode]); + + // The Intel Intrinsics Guide talks about _mm256_setr_m128i but my headers + // lack such an instruction. 
What it does is essentially this anyway. + result_vecs[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(coeffs_rearr2_upper), + coeffs_rearr2_lower, + 1); + } +} + +// If ints is completely zero, returns 16 in *first and -1 in *last +static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t *last) +{ + // Note that nonzero_bytes will always have both bytes set for a set word + // even if said word only had one of its bytes set, because we're doing 16 + // bit wide comparisons. No big deal, just shift results to the right by one + // bit to have the results represent indexes of first set words, not bytes. + // Another note, it has to use right shift instead of division to preserve + // behavior on an all-zero vector (-1 / 2 == 0, but -1 >> 1 == -1) + const __m256i zero = _mm256_setzero_si256(); + + __m256i zeros = _mm256_cmpeq_epi16(ints, zero); + uint32_t nonzero_bytes = ~((uint32_t)_mm256_movemask_epi8(zeros)); + *first = ( (int32_t)_tzcnt_u32(nonzero_bytes)) >> 1; + *last = (31 - (int32_t)_lzcnt_u32(nonzero_bytes)) >> 1; +} + +#endif
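A plain-C reference for get_first_last_nz_int16() above is useful when unit-testing the vector path. The sketch below is an illustration, not part of the patch; it follows the documented convention of returning first = 16 and last = -1 for an all-zero input.

#include <stdint.h>

static void first_last_nz_ref(const int16_t v[16], int32_t *first, int32_t *last)
{
  *first = 16;   /* "not found" per the convention above */
  *last  = -1;
  for (int32_t i = 0; i < 16; ++i) {
    if (v[i] != 0) {
      if (*first == 16) *first = i;  /* remember only the first hit */
      *last = i;                     /* keep updating until the end */
    }
  }
}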
View file
kvazaar-1.3.0.tar.gz/src/strategies/avx2/encode_coding_tree-avx2.c
Added
@@ -0,0 +1,605 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "strategyselector.h" + +#if COMPILE_INTEL_AVX2 +#include "avx2_common_functions.h" +#include "cabac.h" +#include "context.h" +#include "encode_coding_tree-avx2.h" +#include "encode_coding_tree.h" +#include "strategies/missing-intel-intrinsics.h" +#include <immintrin.h> + +/* + * NOTE: Unlike SSE/AVX comparisons that would return 11 or 00 for gt/lte, + * this'll use 1x and 0x as bit patterns (x: garbage). A couple extra + * instructions will get you 11 and 00 if you need to use this as a mask + * somewhere at some point, but we don't need this right now. + * + * I'd love to draw a logic circuit here to describe this, but I can't. Two + * 2-bit uints can be compared for greaterness by first comparing their high + * bits using AND-NOT; (x AND (NOT y)) == 1 if x > y. If A_hi > B_hi, A > B. + * If A_hi == B_hi AND A_lo > B_lo, A > B. Otherwise, A <= B. It's really + * simple when drawn on paper, but quite messy on a general-purpose ALU. But + * look, just five instructions! 
+ */ +static INLINE uint32_t u32vec_cmpgt_epu2(uint32_t a, uint32_t b) +{ + uint32_t a_gt_b = _andn_u32(b, a); + uint32_t a_ne_b = a ^ b; + uint32_t a_gt_b_sh = a_gt_b << 1; + uint32_t lobit_tiebrk_hi = _andn_u32(a_ne_b, a_gt_b_sh); + uint32_t res = a_gt_b | lobit_tiebrk_hi; + return res; +} + +static INLINE uint32_t pack_16x16b_to_16x2b(__m256i src) +{ + /* + * For each 16-bit element in src: + * ABCD EFGH IJKL MNOP Original elements + * 0000 0000 0000 00XY Element clipped to [0, 3] using _mm256_min_epu16 + * 0000 000X Y000 0000 Shift word to align LSBs across byte boundary + * 0000 0001 1000 0000 Comparison mask to be compared against + * XXXX XXXX YYYY YYYY Comparison result, for movemask + */ + const __m256i threes = _mm256_set1_epi16 (3); + const __m256i cmpmask = _mm256_slli_epi16 (threes, 7); // 0x0180 (avoid set1) + + __m256i clipped = _mm256_min_epu16 (src, threes); + __m256i shifted = _mm256_slli_epi16 (clipped, 7); + __m256i cmpres = _mm256_cmpeq_epi8 (shifted, cmpmask); + uint32_t result = _mm256_movemask_epi8(cmpres); + + return result; +} + +/** + * \brief Context derivation process of coeff_abs_significant_flag, + * parallelized to handle 16 coeffs at once + * \param pattern_sig_ctx pattern for current coefficient group + * \param scan_idx pixel scan type in use + * \param pos_xs column addresses of current scan positions + * \param pos_ys row addresses of current scan positions + * \param block_type log2 value of block size if square block, or 4 otherwise + * \param width width of the block + * \param texture_type texture type (TEXT_LUMA...) + * \returns ctx_inc for current scan position + */ +static INLINE __m256i kvz_context_get_sig_ctx_inc_16x16b(int32_t pattern_sig_ctx, uint32_t scan_idx, __m256i pos_xs, + __m256i pos_ys, int32_t block_type, int8_t texture_type) +{ + const __m256i zero = _mm256_set1_epi8(0); + const __m256i ff = _mm256_set1_epi8(0xff); + + const __m256i ones = _mm256_set1_epi16(1); + const __m256i twos = _mm256_set1_epi16(2); + const __m256i threes = _mm256_set1_epi16(3); + + const __m256i ctx_ind_map[3] = { + _mm256_setr_epi16( + 0, 2, 1, 6, + 3, 4, 7, 6, + 4, 5, 7, 8, + 5, 8, 8, 8 + ), + _mm256_setr_epi16( + 0, 1, 4, 5, + 2, 3, 4, 5, + 6, 6, 8, 8, + 7, 7, 8, 8 + ), + _mm256_setr_epi16( + 0, 2, 6, 7, + 1, 3, 6, 7, + 4, 4, 8, 8, + 5, 5, 8, 8 + ), + }; + + int16_t offset; + if (block_type == 3) + if (scan_idx == SCAN_DIAG) + offset = 9; + else + offset = 15; + else + if (texture_type == 0) + offset = 21; + else + offset = 12; + + __m256i offsets = _mm256_set1_epi16(offset); + + // This will only ever be compared to 0, 1 and 2, so it's fine to cast down + // to 16b (and it should never be above 3 anyways) + __m256i pattern_sig_ctxs = _mm256_set1_epi16((int16_t)(MIN(0xffff, pattern_sig_ctx))); + __m256i pattern_sig_ctxs_eq_zero = _mm256_cmpeq_epi16(pattern_sig_ctxs, zero); + __m256i pattern_sig_ctxs_eq_one = _mm256_cmpeq_epi16(pattern_sig_ctxs, ones); + __m256i pattern_sig_ctxs_eq_two = _mm256_cmpeq_epi16(pattern_sig_ctxs, twos); + + __m256i pattern_sig_ctxs_eq_1or2 = _mm256_or_si256 (pattern_sig_ctxs_eq_one, + pattern_sig_ctxs_eq_two); + __m256i pattern_sig_ctxs_lt3 = _mm256_or_si256 (pattern_sig_ctxs_eq_1or2, + pattern_sig_ctxs_eq_zero); + __m256i pattern_sig_ctxs_other = _mm256_xor_si256(pattern_sig_ctxs_lt3, + ff); + __m256i x_plus_y = _mm256_add_epi16 (pos_xs, pos_ys); + __m256i x_plus_y_zero = _mm256_cmpeq_epi16(x_plus_y, zero); // All these should be 0, preempts block_type_two rule + + __m256i texture_types = 
_mm256_set1_epi16((int16_t)texture_type); + + __m256i block_types = _mm256_set1_epi16((int16_t)block_type); + __m256i block_type_two = _mm256_cmpeq_epi16(block_types, twos); // All these should be ctx_ind_map[4 * pos_y + pos_x]; + __m256i bt2_vals = ctx_ind_map[scan_idx]; + __m256i bt2_vals_masked = _mm256_and_si256(bt2_vals, block_type_two); + + __m256i pos_xs_in_subset = _mm256_and_si256(pos_xs, threes); + __m256i pos_ys_in_subset = _mm256_and_si256(pos_ys, threes); + + __m256i cg_pos_xs = _mm256_srli_epi16(pos_xs, 2); + __m256i cg_pos_ys = _mm256_srli_epi16(pos_ys, 2); + __m256i cg_pos_xysums = _mm256_add_epi16 (cg_pos_xs, cg_pos_ys); + + __m256i pos_xy_sums_in_subset = _mm256_add_epi16(pos_xs_in_subset, pos_ys_in_subset); + + /* + * if (pattern_sig_ctx == 0) { + * switch (pos_x_in_subset + pos_y_in_subset) { + * case 0: + * cnt = 2; + * break; + * case 1: + * case 2: + * cnt = 1; + * break; + * default: + * cnt = 0; + * } + * } + * + * Equivalent to: + * + * if (pattern_sig_ctx == 0) { + * subamt = cnt <= 1 ? 1 : 0; + * pxyis_max3 = min(3, pos_x_in_subset + pos_y_in_subset); + * cnt = (3 - pxyis_max3) - subamt; + * } + */ + __m256i pxyis_lte_1 = _mm256_cmpgt_epi16(twos, pos_xy_sums_in_subset); + __m256i subamts = _mm256_and_si256 (pxyis_lte_1, ones); + __m256i pxyis_max3 = _mm256_min_epu16 (pos_xy_sums_in_subset, threes); + __m256i cnts_tmp = _mm256_sub_epi16 (threes, pxyis_max3); + __m256i cnts_sig_ctx_0 = _mm256_sub_epi16 (cnts_tmp, subamts); + __m256i cnts_sc0_masked = _mm256_and_si256 (cnts_sig_ctx_0, pattern_sig_ctxs_eq_zero); + + /* + * if (pattern_sig_ctx == 1 || pattern_sig_ctx == 2) { + * if (pattern_sig_ctx == 1) + * subtrahend = pos_y_in_subset; + * else + * subtrahend = pos_x_in_subset; + * cnt = 2 - min(2, subtrahend); + * } + */ + __m256i pos_operands_ctx_1or2 = _mm256_blendv_epi8(pos_ys_in_subset, + pos_xs_in_subset, + pattern_sig_ctxs_eq_two); + + __m256i pos_operands_max2 = _mm256_min_epu16 (pos_operands_ctx_1or2, twos); + __m256i cnts_sig_ctx_1or2 = _mm256_sub_epi16 (twos, pos_operands_max2); + __m256i cnts_sc12_masked = _mm256_and_si256 (cnts_sig_ctx_1or2, pattern_sig_ctxs_eq_1or2); + + /* + * if (pattern_sig_ctx > 2) + * cnt = 2; + */ + __m256i cnts_scother_masked = _mm256_and_si256(twos, pattern_sig_ctxs_other); + + // Select correct count + __m256i cnts_sc012_masked = _mm256_or_si256 (cnts_sc0_masked, cnts_sc12_masked); + __m256i cnts = _mm256_or_si256 (cnts_scother_masked, cnts_sc012_masked); + + // Compute final values + __m256i textype_eq_0 = _mm256_cmpeq_epi16(texture_types, zero); + __m256i cg_pos_sums_gt_0 = _mm256_cmpgt_epi16(cg_pos_xysums, zero); + __m256i tmpcond = _mm256_and_si256 (textype_eq_0, cg_pos_sums_gt_0); + __m256i tmp = _mm256_and_si256 (tmpcond, threes); + __m256i tmp_with_offsets = _mm256_add_epi16 (tmp, offsets); + __m256i rv_noshortcirc = _mm256_add_epi16 (cnts, tmp_with_offsets); + + // Ol' sprite mask method works here! 
+ __m256i rv1 = _mm256_andnot_si256(block_type_two, rv_noshortcirc); + __m256i rv2 = _mm256_or_si256 (rv1, bt2_vals_masked); + __m256i rv = _mm256_andnot_si256(x_plus_y_zero, rv2); + return rv; +} + +void kvz_encode_coeff_nxn_avx2(encoder_state_t * const state, + cabac_data_t * const cabac, + const coeff_t *coeff, + uint8_t width, + uint8_t type, + int8_t scan_mode, + int8_t tr_skip) +{ + const encoder_control_t * const encoder = state->encoder_control; + int c1 = 1; + uint8_t last_coeff_x = 0; + uint8_t last_coeff_y = 0; + int32_t i; + uint32_t sig_coeffgroup_nzs[8 * 8] = { 0 }; + + int8_t be_valid = encoder->cfg.signhide_enable; + int32_t scan_pos_sig; + uint32_t go_rice_param = 0; + uint32_t ctx_sig; + + // CONSTANTS + const uint32_t num_blk_side = width >> TR_MIN_LOG2_SIZE; + const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2; + const uint32_t *scan = + kvz_g_sig_last_scan[scan_mode][log2_block_size - 1]; + const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 2][scan_mode]; + const uint32_t num_blocks = num_blk_side * num_blk_side; + + const __m256i zero = _mm256_set1_epi8(0); + const __m256i ones = _mm256_set1_epi16(1); + const __m256i twos = _mm256_set1_epi16(2); + + // Init base contexts according to block type + cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.cu_sig_coeff_group_model[type]); + cabac_ctx_t *baseCtx = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0]) : + &(cabac->ctx.cu_sig_model_chroma[0]); + + // Scan all coeff groups to find out which of them have coeffs. + // Populate sig_coeffgroup_nzs with that info. + + // NOTE: Modified the functionality a bit, sig_coeffgroup_flag used to be + // 1 if true and 0 if false, now it's "undefined but nonzero" if true and + // 0 if false (not actually undefined, it's a bitmask representing the + // significant coefficients' position in the group which in itself could + // be useful information) + int32_t scan_cg_last = -1; + + for (int32_t i = 0; i < num_blocks; i++) { + const uint32_t cg_id = scan_cg[i]; + const uint32_t n_xbits = log2_block_size - 2; // How many lowest bits of scan_cg represent X coord + const uint32_t cg_x = cg_id & ((1 << n_xbits) - 1); + const uint32_t cg_y = cg_id >> n_xbits; + + const uint32_t cg_pos = cg_y * width * 4 + cg_x * 4; + const uint32_t cg_pos_y = (cg_pos >> log2_block_size) >> TR_MIN_LOG2_SIZE; + const uint32_t cg_pos_x = (cg_pos & (width - 1)) >> TR_MIN_LOG2_SIZE; + const uint32_t idx = cg_pos_x + cg_pos_y * num_blk_side; + + __m128d coeffs_d_upper = _mm_setzero_pd(); + __m128d coeffs_d_lower = _mm_setzero_pd(); + __m128i coeffs_upper; + __m128i coeffs_lower; + __m256i cur_coeffs; + + coeffs_d_upper = _mm_loadl_pd(coeffs_d_upper, (double *)(coeff + cg_pos + 0 * width)); + coeffs_d_upper = _mm_loadh_pd(coeffs_d_upper, (double *)(coeff + cg_pos + 1 * width)); + coeffs_d_lower = _mm_loadl_pd(coeffs_d_lower, (double *)(coeff + cg_pos + 2 * width)); + coeffs_d_lower = _mm_loadh_pd(coeffs_d_lower, (double *)(coeff + cg_pos + 3 * width)); + + coeffs_upper = _mm_castpd_si128(coeffs_d_upper); + coeffs_lower = _mm_castpd_si128(coeffs_d_lower); + + cur_coeffs = _mm256_insertf128_si256(_mm256_castsi128_si256(coeffs_upper), + coeffs_lower, + 1); + + __m256i coeffs_zero = _mm256_cmpeq_epi16(cur_coeffs, zero); + + uint32_t nz_coeffs_2b = ~((uint32_t)_mm256_movemask_epi8(coeffs_zero)); + sig_coeffgroup_nzs[idx] = nz_coeffs_2b; + + if (nz_coeffs_2b) + scan_cg_last = i; + } + // Rest of the code assumes at least one non-zero coeff. 
+ assert(scan_cg_last >= 0); + + ALIGNED(64) int16_t coeff_reord[LCU_WIDTH * LCU_WIDTH]; + uint32_t pos_last, scan_pos_last; + + { + __m256i coeffs_r; + for (int32_t i = 0; i <= scan_cg_last; i++) { + int32_t subpos = i * 16; + scanord_read_vector(&coeff, scan, scan_mode, subpos, width, &coeffs_r, 1); + _mm256_store_si256((__m256i *)(coeff_reord + subpos), coeffs_r); + } + + // Find the last coeff by going backwards in scan order. With cmpeq_epi16 + // and movemask, we can generate a dword with 16 2-bit masks that are 11 + // for zero words in the coeff vector, and 00 for nonzero words. By + // inverting the bits and counting leading zeros, we can determine the + // number of zero bytes in the vector counting from high to low memory + // addresses; subtract that from 31 and divide by 2 to get the offset of + // the last nonzero word. + uint32_t baseaddr = scan_cg_last * 16; + __m256i cur_coeffs_zeros = _mm256_cmpeq_epi16(coeffs_r, zero); + uint32_t nz_bytes = ~(_mm256_movemask_epi8(cur_coeffs_zeros)); + scan_pos_last = baseaddr + ((31 - _lzcnt_u32(nz_bytes)) >> 1); + pos_last = scan[scan_pos_last]; + } + + // transform skip flag + if(width == 4 && encoder->cfg.trskip_enable) { + cabac->cur_ctx = (type == 0) ? &(cabac->ctx.transform_skip_model_luma) : &(cabac->ctx.transform_skip_model_chroma); + CABAC_BIN(cabac, tr_skip, "transform_skip_flag"); + } + + last_coeff_x = pos_last & (width - 1); + last_coeff_y = (uint8_t)(pos_last >> log2_block_size); + + // Code last_coeff_x and last_coeff_y + kvz_encode_last_significant_xy(cabac, + last_coeff_x, + last_coeff_y, + width, + width, + type, + scan_mode); + + scan_pos_sig = scan_pos_last; + + ALIGNED(64) uint16_t abs_coeff[16]; + ALIGNED(32) uint16_t abs_coeff_buf_sb[16]; + ALIGNED(32) int16_t pos_ys_buf[16]; + ALIGNED(32) int16_t pos_xs_buf[16]; + ALIGNED(32) int16_t ctx_sig_buf[16]; + + abs_coeff[0] = abs(coeff[pos_last]); + uint32_t coeff_signs = (coeff[pos_last] < 0); + int32_t num_non_zero = 1; + int32_t last_nz_pos_in_cg = scan_pos_sig; + int32_t first_nz_pos_in_cg = scan_pos_sig; + scan_pos_sig--; + + // significant_coeff_flag + for (i = scan_cg_last; i >= 0; i--) { + int32_t sub_pos = i << 4; // LOG2_SCAN_SET_SIZE; + int32_t cg_blk_pos = scan_cg[i]; + int32_t cg_pos_y = cg_blk_pos / num_blk_side; + int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * num_blk_side); + + go_rice_param = 0; + + if (i == scan_cg_last || i == 0) { + sig_coeffgroup_nzs[cg_blk_pos] = 1; + } else { + uint32_t sig_coeff_group = (sig_coeffgroup_nzs[cg_blk_pos] != 0); + uint32_t ctx_sig = kvz_context_get_sig_coeff_group(sig_coeffgroup_nzs, cg_pos_x, + cg_pos_y, width); + cabac->cur_ctx = &base_coeff_group_ctx[ctx_sig]; + CABAC_BIN(cabac, sig_coeff_group, "coded_sub_block_flag"); + } + + if (sig_coeffgroup_nzs[cg_blk_pos]) { + int32_t pattern_sig_ctx = kvz_context_calc_pattern_sig_ctx(sig_coeffgroup_nzs, + cg_pos_x, cg_pos_y, width); + + // A mask with the first 16-bit word unmasked (bits set ie. 
0xffff) + const __m256i coeff_pos_zero = _mm256_castsi128_si256(_mm_cvtsi32_si128(0xffff)); + + const __m128i log2_block_size_128 = _mm_cvtsi32_si128(log2_block_size); + + __m256i coeffs = _mm256_load_si256((__m256i *)(coeff_reord + sub_pos)); + __m256i sigs_inv = _mm256_cmpeq_epi16(coeffs, zero); + __m256i is = _mm256_set1_epi16(i); + __m256i is_zero = _mm256_cmpeq_epi16(is, zero); + __m256i coeffs_negative = _mm256_cmpgt_epi16(zero, coeffs); + + __m256i masked_coeffs = _mm256_andnot_si256(sigs_inv, coeffs); + __m256i abs_coeffs = _mm256_abs_epi16(masked_coeffs); + + // TODO: obtain 16-bit block positions, maybe? :P + __m256i blk_poses_hi = _mm256_loadu_si256((__m256i *)(scan + sub_pos + 8)); + __m256i blk_poses_lo = _mm256_loadu_si256((__m256i *)(scan + sub_pos + 0)); + __m256i blk_poses_tmp = _mm256_packs_epi32(blk_poses_lo, blk_poses_hi); + __m256i blk_poses = _mm256_permute4x64_epi64(blk_poses_tmp, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i pos_ys = _mm256_srl_epi16(blk_poses, log2_block_size_128); + __m256i pos_xs = _mm256_sub_epi16(blk_poses, _mm256_sll_epi16(pos_ys, log2_block_size_128)); + + _mm256_store_si256((__m256i *)pos_ys_buf, pos_ys); + _mm256_store_si256((__m256i *)pos_xs_buf, pos_xs); + + __m256i encode_sig_coeff_flags_inv = _mm256_andnot_si256(is_zero, coeff_pos_zero); + + get_first_last_nz_int16(masked_coeffs, &first_nz_pos_in_cg, &last_nz_pos_in_cg); + _mm256_store_si256((__m256i *)abs_coeff_buf_sb, abs_coeffs); + + __m256i ctx_sigs = kvz_context_get_sig_ctx_inc_16x16b(pattern_sig_ctx, scan_mode, pos_xs, pos_ys, + log2_block_size, type); + + _mm256_store_si256((__m256i *)ctx_sig_buf, ctx_sigs); + + uint32_t esc_flags = ~(_mm256_movemask_epi8(encode_sig_coeff_flags_inv)); + uint32_t sigs = ~(_mm256_movemask_epi8(sigs_inv)); + uint32_t coeff_sign_buf = _mm256_movemask_epi8(coeffs_negative); + + for (; scan_pos_sig >= sub_pos; scan_pos_sig--) { + uint32_t id = scan_pos_sig - sub_pos; + uint32_t shift = (id << 1) + 1; + + uint32_t curr_sig = (sigs >> shift) & 1; + uint32_t curr_esc_flag = (esc_flags >> shift) & 1; + uint32_t curr_coeff_sign = (coeff_sign_buf >> shift) & 1; + + if (curr_esc_flag | num_non_zero) { + ctx_sig = ctx_sig_buf[id]; + cabac->cur_ctx = &baseCtx[ctx_sig]; + CABAC_BIN(cabac, curr_sig, "sig_coeff_flag"); + } + + if (curr_sig) { + abs_coeff[num_non_zero] = abs_coeff_buf_sb[id]; + coeff_signs = 2 * coeff_signs + curr_coeff_sign; + num_non_zero++; + } + } + } else { + scan_pos_sig = sub_pos - 1; + } + + if (num_non_zero > 0) { + bool sign_hidden = last_nz_pos_in_cg - first_nz_pos_in_cg >= 4 /* SBH_THRESHOLD */ + && !encoder->cfg.lossless; + uint32_t ctx_set = (i > 0 && type == 0) ? 2 : 0; + cabac_ctx_t *base_ctx_mod; + int32_t num_c1_flag, first_c2_flag_idx, idx; + + __m256i abs_coeffs = _mm256_load_si256((__m256i *)abs_coeff); + __m256i coeffs_gt1 = _mm256_cmpgt_epi16(abs_coeffs, ones); + __m256i coeffs_gt2 = _mm256_cmpgt_epi16(abs_coeffs, twos); + uint32_t coeffs_gt1_bits = _mm256_movemask_epi8(coeffs_gt1); + uint32_t coeffs_gt2_bits = _mm256_movemask_epi8(coeffs_gt2); + + if (c1 == 0) { + ctx_set++; + } + + base_ctx_mod = (type == 0) ? &(cabac->ctx.cu_one_model_luma[4 * ctx_set]) : + &(cabac->ctx.cu_one_model_chroma[4 * ctx_set]); + num_c1_flag = MIN(num_non_zero, C1FLAG_NUMBER); + first_c2_flag_idx = -1; + + + /* + * c1s_pattern is 16 base-4 numbers: 3, 3, 3, ... , 3, 2 (c1 will never + * be less than 0 or greater than 3, so two bits per iter are enough). 
+ * It's essentially the values that c1 will be for the next iteration as + * long as we have not encountered any >1 symbols. Count how long run of + * such symbols there is in the beginning of this CG, and zero all c1's + * that are located at or after the first >1 symbol. + */ + const uint32_t c1s_pattern = 0xfffffffe; + uint32_t n_nongt1_bits = _tzcnt_u32(coeffs_gt1_bits); + uint32_t c1s_nextiter = _bzhi_u32(c1s_pattern, n_nongt1_bits); + first_c2_flag_idx = n_nongt1_bits >> 1; + + c1 = 1; + for (idx = 0; idx < num_c1_flag; idx++) { + uint32_t shift = idx << 1; + uint32_t symbol = (coeffs_gt1_bits >> shift) & 1; + + cabac->cur_ctx = &base_ctx_mod[c1]; + CABAC_BIN(cabac, symbol, "coeff_abs_level_greater1_flag"); + + c1 = (c1s_nextiter >> shift) & 3; + } + + if (c1 == 0) { + base_ctx_mod = (type == 0) ? &(cabac->ctx.cu_abs_model_luma[ctx_set]) : + &(cabac->ctx.cu_abs_model_chroma[ctx_set]); + + if (first_c2_flag_idx != -1) { + uint32_t shift = (first_c2_flag_idx << 1) + 1; + uint8_t symbol = (coeffs_gt2_bits >> shift) & 1; + cabac->cur_ctx = &base_ctx_mod[0]; + + CABAC_BIN(cabac, symbol, "coeff_abs_level_greater2_flag"); + } + } + int32_t shiftamt = (be_valid && sign_hidden) ? 1 : 0; + int32_t nnz = num_non_zero - shiftamt; + coeff_signs >>= shiftamt; + if (!cabac->only_count) { + if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) { + coeff_signs ^= kvz_crypto_get_key(state->crypto_hdl, nnz); + } + } + CABAC_BINS_EP(cabac, coeff_signs, nnz, "coeff_sign_flag"); + + if (c1 == 0 || num_non_zero > C1FLAG_NUMBER) { + + const __m256i ones = _mm256_set1_epi16(1); + + __m256i abs_coeffs_gt1 = _mm256_cmpgt_epi16 (abs_coeffs, ones); + uint32_t acgt1_bits = _mm256_movemask_epi8(abs_coeffs_gt1); + uint32_t first_acgt1_bpos = _tzcnt_u32(acgt1_bits); + + uint32_t abs_coeffs_base4 = pack_16x16b_to_16x2b(abs_coeffs); + + const uint32_t ones_base4 = 0x55555555; + const uint32_t twos_base4 = 0xaaaaaaaa; + + const uint32_t c1flag_number_mask_inv = 0xffffffff << (C1FLAG_NUMBER << 1); + const uint32_t c1flag_number_mask = ~c1flag_number_mask_inv; + + // The addition will not overflow between 2-bit atoms because + // first_coeff2s will only be 1 or 0, and the other addend is 2 + uint32_t first_coeff2s = _bzhi_u32(ones_base4, first_acgt1_bpos + 2); + uint32_t base_levels = first_coeff2s + twos_base4; + + base_levels &= c1flag_number_mask; + base_levels |= (ones_base4 & c1flag_number_mask_inv); + + uint32_t encode_decisions = u32vec_cmpgt_epu2(base_levels, abs_coeffs_base4); + + for (idx = 0; idx < num_non_zero; idx++) { + + uint32_t shift = idx << 1; + uint32_t dont_encode_curr = (encode_decisions >> shift); + int16_t base_level = (base_levels >> shift) & 3; + + uint16_t curr_abs_coeff = abs_coeff[idx]; + + if (!(dont_encode_curr & 2)) { + uint16_t level_diff = curr_abs_coeff - base_level; + if (!cabac->only_count && (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFFS)) { + kvz_cabac_write_coeff_remain_encry(state, cabac, level_diff, go_rice_param, base_level); + } else { + kvz_cabac_write_coeff_remain(cabac, level_diff, go_rice_param); + } + + if (curr_abs_coeff > 3 * (1 << go_rice_param)) { + go_rice_param = MIN(go_rice_param + 1, 4); + } + } + + } + } + } + last_nz_pos_in_cg = -1; + first_nz_pos_in_cg = 16; + num_non_zero = 0; + coeff_signs = 0; + } +} +#endif // COMPILE_INTEL_AVX2 + +int kvz_strategy_register_encode_avx2(void* opaque, uint8_t bitdepth) +{ + bool success = true; + +#if COMPILE_INTEL_AVX2 + success &= kvz_strategyselector_register(opaque, "encode_coeff_nxn", "avx2", 40, 
&kvz_encode_coeff_nxn_avx2); +#endif + + return success; +}
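The packed 2-bit compare above (u32vec_cmpgt_epu2) is easy to get wrong, so a scalar cross-check is worth keeping around. In this sketch (names are illustrative, not from the patch) only the high bit of each 2-bit atom is meaningful, since the low bit of the vectorized result is documented as garbage.

#include <stdint.h>

/* Reference: for each of the sixteen 2-bit fields, set that field's high
 * bit iff a's field is strictly greater than b's. */
static uint32_t cmpgt_epu2_ref(uint32_t a, uint32_t b)
{
  uint32_t res = 0;
  for (int i = 0; i < 16; ++i) {
    uint32_t fa = (a >> (2 * i)) & 3;
    uint32_t fb = (b >> (2 * i)) & 3;
    if (fa > fb) {
      res |= 2u << (2 * i);   /* high bit of the 2-bit atom */
    }
  }
  return res;
}

/* Example check against the bit-twiddled version, comparing only the high
 * bit of every atom:
 *   ((u32vec_cmpgt_epu2(a, b) ^ cmpgt_epu2_ref(a, b)) & 0xAAAAAAAAu) == 0
 */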
View file
kvazaar-1.3.0.tar.gz/src/strategies/avx2/encode_coding_tree-avx2.h
Added
@@ -0,0 +1,42 @@ +#ifndef ENCODE_CODING_TREE_AVX2_H_ +#define ENCODE_CODING_TREE_AVX2_H_ + +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +/** + * \file + * Functions for writing the coding quadtree and related syntax. + */ + +#include "encoderstate.h" +#include "global.h" + +void kvz_encode_coeff_nxn_avx2(encoder_state_t * const state, + cabac_data_t * const cabac, + const coeff_t *coeff, + uint8_t width, + uint8_t type, + int8_t scan_mode, + int8_t tr_skip); + +int kvz_strategy_register_encode_avx2(void* opaque, uint8_t bitdepth); + +#endif // ENCODE_CODING_TREE_AVX2_H_
View file
kvazaar-1.2.0.tar.gz/src/strategies/avx2/ipol-avx2.c -> kvazaar-1.3.0.tar.gz/src/strategies/avx2/ipol-avx2.c
Changed
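The rewritten kernels in the diff below keep HEVC's two-stage interpolation arithmetic: the horizontal pass stores 16-bit intermediates scaled by the filter gain of 64 and shifted by shift1 = KVZ_BIT_DEPTH - 8, and the vertical pass shifts by shift2 = 6, then applies the weighted-prediction rounding constants wp_offset1 and wp_shift1 = 14 - KVZ_BIT_DEPTH before clipping to pixel range. A small stand-alone sketch of that normalization for 8-bit content (not code from the patch):

#include <stdint.h>

/* Worked example of the final rounding used by the new vertical kernels,
 * for KVZ_BIT_DEPTH == 8 (shift1 = 0, shift2 = 6, wp_shift1 = 6,
 * wp_offset1 = 32).  Both filter stages have a gain of 64, so a flat area
 * of value v accumulates 64*64*v and ((4096*v >> 6) + 32) >> 6 == v. */
static uint8_t normalize_two_stage(int32_t acc /* sum after both stages */)
{
  const int shift2     = 6;           /* second-stage filter shift            */
  const int wp_shift1  = 14 - 8;      /* 14 - KVZ_BIT_DEPTH                   */
  const int wp_offset1 = 1 << (wp_shift1 - 1);
  int32_t v = acc >> shift2;
  v = (v + wp_offset1) >> wp_shift1;
  if (v < 0)   v = 0;                 /* mirrors the clip-to-pixel step       */
  if (v > 255) v = 255;
  return (uint8_t)v;
}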
@@ -31,1338 +31,1422 @@ #include "encoder.h" #include "kvazaar.h" +#include "search_inter.h" #include "strategies/generic/picture-generic.h" #include "strategies/strategies-ipol.h" #include "strategyselector.h" #include "strategies/generic/ipol-generic.h" -#define FILTER_OFFSET 3 -#define FILTER_SIZE 8 - -#define MAX_HEIGHT (4 * (LCU_WIDTH + 1) + FILTER_SIZE) -#define MAX_WIDTH ((LCU_WIDTH + 1) + FILTER_SIZE) - extern int8_t kvz_g_luma_filter[4][8]; extern int8_t kvz_g_chroma_filter[8][4]; -void kvz_eight_tap_filter_x8_and_flip(__m128i *data01, __m128i *data23, __m128i *data45, __m128i *data67, __m128i *filter, __m128i *dst) +static int32_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data) { - __m128i a, b, c, d; - __m128i fir = _mm_broadcastq_epi64(_mm_loadl_epi64(filter)); - - a = _mm_maddubs_epi16(*data01, fir); - b = _mm_maddubs_epi16(*data23, fir); - a = _mm_hadd_epi16(a, b); - - c = _mm_maddubs_epi16(*data45, fir); - d = _mm_maddubs_epi16(*data67, fir); - c = _mm_hadd_epi16(c, d); - - a = _mm_hadd_epi16(a, c); + __m128i fir = _mm_loadl_epi64((__m128i*)filter); + __m128i row = _mm_loadl_epi64((__m128i*)data); + __m128i acc; + acc = _mm_maddubs_epi16(row, fir); + __m128i temp = _mm_srli_si128(acc, 4); + acc = _mm_add_epi16(acc, temp); + temp = _mm_srli_si128(acc, 2); + acc = _mm_add_epi16(acc, temp); + int32_t filtered = _mm_cvtsi128_si32(acc); + + return filtered; +} - _mm_storeu_si128(dst, a); +static void kvz_init_shuffle_masks(__m256i *shuf_01_23, __m256i *shuf_45_67) { + // Shuffle pairs + *shuf_01_23 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + *shuf_45_67 = _mm256_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14); } -static __m128i kvz_eight_tap_filter_flip_x8_16bit_avx2(__m128i *row, int8_t *filter, int32_t offset23, int32_t shift23) -{ - __m128i temp[8]; - __m128i temp_lo; - __m128i temp_hi; - __m128i fir = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)filter)); - - temp[0] = _mm_madd_epi16(row[0], fir); - temp[1] = _mm_madd_epi16(row[1], fir); - temp_lo = _mm_unpacklo_epi32(temp[0], temp[1]); - temp_hi = _mm_unpackhi_epi32(temp[0], temp[1]); - temp[0] = _mm_add_epi32(temp_lo, temp_hi); - - temp[2] = _mm_madd_epi16(row[2], fir); - temp[3] = _mm_madd_epi16(row[3], fir); - temp_lo = _mm_unpacklo_epi32(temp[2], temp[3]); - temp_hi = _mm_unpackhi_epi32(temp[2], temp[3]); - temp[2] = _mm_add_epi32(temp_lo, temp_hi); - - temp[4] = _mm_madd_epi16(row[4], fir); - temp[5] = _mm_madd_epi16(row[5], fir); - temp_lo = _mm_unpacklo_epi32(temp[4], temp[5]); - temp_hi = _mm_unpackhi_epi32(temp[4], temp[5]); - temp[4] = _mm_add_epi32(temp_lo, temp_hi); - - temp[6] = _mm_madd_epi16(row[6], fir); - temp[7] = _mm_madd_epi16(row[7], fir); - temp_lo = _mm_unpacklo_epi32(temp[6], temp[7]); - temp_hi = _mm_unpackhi_epi32(temp[6], temp[7]); - temp[6] = _mm_add_epi32(temp_lo, temp_hi); - - temp_lo = _mm_unpacklo_epi32(temp[0], temp[2]); - temp_hi = _mm_unpackhi_epi32(temp[0], temp[2]); - temp[0] = _mm_add_epi32(temp_lo, temp_hi); - temp[0] = _mm_shuffle_epi32(temp[0], _MM_SHUFFLE(3, 1, 2, 0)); - - temp_lo = _mm_unpacklo_epi32(temp[4], temp[6]); - temp_hi = _mm_unpackhi_epi32(temp[4], temp[6]); - temp[4] = _mm_add_epi32(temp_lo, temp_hi); - temp[4] = _mm_shuffle_epi32(temp[4], _MM_SHUFFLE(3, 1, 2, 0)); - - __m128i add = _mm_set1_epi32(offset23); - temp[0] = _mm_add_epi32(temp[0], add); - temp[4] = _mm_add_epi32(temp[4], add); - temp[0] = 
_mm_srai_epi32(temp[0], shift23); - temp[4] = _mm_srai_epi32(temp[4], shift23); - - temp[0] = _mm_packus_epi32(temp[0], temp[4]); - temp[0] = _mm_packus_epi16(temp[0], temp[0]); - - return temp[0]; +static void kvz_init_shuffle_masks_chroma(__m256i *shuf_01, __m256i *shuf_23) { + // Shuffle pairs + *shuf_01 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12, + 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12); + *shuf_23 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14, + 2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14); } -static __m256i kvz_eight_tap_filter_flip_x8_16bit_dual_avx2(__m256i *row, int8_t *filter[2], int32_t offset23, int32_t shift23) -{ - __m256i temp[8]; - __m256i temp_lo; - __m256i temp_hi; - __m256i fir = _mm256_cvtepi8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)filter[0]), _mm_loadl_epi64((__m128i*)filter[1]))); - - temp[0] = _mm256_madd_epi16(row[0], fir); - temp[1] = _mm256_madd_epi16(row[1], fir); - temp_lo = _mm256_unpacklo_epi32(temp[0], temp[1]); - temp_hi = _mm256_unpackhi_epi32(temp[0], temp[1]); - temp[0] = _mm256_add_epi32(temp_lo, temp_hi); - - temp[2] = _mm256_madd_epi16(row[2], fir); - temp[3] = _mm256_madd_epi16(row[3], fir); - temp_lo = _mm256_unpacklo_epi32(temp[2], temp[3]); - temp_hi = _mm256_unpackhi_epi32(temp[2], temp[3]); - temp[2] = _mm256_add_epi32(temp_lo, temp_hi); - - temp[4] = _mm256_madd_epi16(row[4], fir); - temp[5] = _mm256_madd_epi16(row[5], fir); - temp_lo = _mm256_unpacklo_epi32(temp[4], temp[5]); - temp_hi = _mm256_unpackhi_epi32(temp[4], temp[5]); - temp[4] = _mm256_add_epi32(temp_lo, temp_hi); - - temp[6] = _mm256_madd_epi16(row[6], fir); - temp[7] = _mm256_madd_epi16(row[7], fir); - temp_lo = _mm256_unpacklo_epi32(temp[6], temp[7]); - temp_hi = _mm256_unpackhi_epi32(temp[6], temp[7]); - temp[6] = _mm256_add_epi32(temp_lo, temp_hi); - - temp_lo = _mm256_unpacklo_epi32(temp[0], temp[2]); - temp_hi = _mm256_unpackhi_epi32(temp[0], temp[2]); - temp[0] = _mm256_add_epi32(temp_lo, temp_hi); - temp[0] = _mm256_shuffle_epi32(temp[0], _MM_SHUFFLE(3, 1, 2, 0)); - - temp_lo = _mm256_unpacklo_epi32(temp[4], temp[6]); - temp_hi = _mm256_unpackhi_epi32(temp[4], temp[6]); - temp[4] = _mm256_add_epi32(temp_lo, temp_hi); - temp[4] = _mm256_shuffle_epi32(temp[4], _MM_SHUFFLE(3, 1, 2, 0)); - - __m256i add = _mm256_set1_epi32(offset23); - temp[0] = _mm256_add_epi32(temp[0], add); - temp[4] = _mm256_add_epi32(temp[4], add); - temp[0] = _mm256_srai_epi32(temp[0], shift23); - temp[4] = _mm256_srai_epi32(temp[4], shift23); - - temp[0] = _mm256_packus_epi32(temp[0], temp[4]); - temp[0] = _mm256_packus_epi16(temp[0], temp[0]); - - return temp[0]; +static void kvz_init_filter_taps(int8_t *filter, + __m256i *taps_01_23, __m256i *taps_45_67) { + // Filter weights + __m256i all_taps = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)filter)); + __m256i perm_01 = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1); + __m256i perm_23 = _mm256_setr_epi32(2, 2, 2, 2, 3, 3, 3, 3); + all_taps = _mm256_unpacklo_epi16(all_taps, all_taps); + *taps_01_23 = _mm256_permutevar8x32_epi32(all_taps, perm_01); + *taps_45_67 = _mm256_permutevar8x32_epi32(all_taps, perm_23); } -/* -static __m128i kvz_eight_tap_filter_flip_x8_avx2(__m128i *row, int8_t *filter, int32_t shift1) -{ - __m128i temp[4]; - __m128i fir = _mm_broadcastq_epi64(_mm_loadl_epi64((__m128i*)filter)); - - temp[0] = _mm_unpacklo_epi64(row[0], row[1]); - temp[0] = _mm_maddubs_epi16(temp[0], fir); +static void kvz_init_filter_taps_chroma(int8_t *filter, + __m256i 
*taps_01, __m256i *taps_23) { + // Filter weights + __m256i all_taps = _mm256_set1_epi32(*(int32_t*)filter); + all_taps = _mm256_unpacklo_epi16(all_taps, all_taps); + *taps_01 = _mm256_shuffle_epi32(all_taps, _MM_SHUFFLE(0, 0, 0, 0)); + *taps_23 = _mm256_shuffle_epi32(all_taps, _MM_SHUFFLE(1, 1, 1, 1)); +} - temp[1] = _mm_unpacklo_epi64(row[2], row[3]); - temp[1] = _mm_maddubs_epi16(temp[1], fir); +static void kvz_init_ver_filter_taps(int8_t *filter, __m256i *filters) { + for (int i = 0; i < 4; ++i) filters[i] = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&filter[2 * i])); + filters[0] = _mm256_inserti128_si256(filters[0], _mm256_castsi256_si128(filters[3]), 1); // Pairs 01 67 + filters[1] = _mm256_inserti128_si256(filters[1], _mm256_castsi256_si128(filters[0]), 1); // Pairs 23 01 + filters[2] = _mm256_inserti128_si256(filters[2], _mm256_castsi256_si128(filters[1]), 1); // Pairs 45 23 + filters[3] = _mm256_inserti128_si256(filters[3], _mm256_castsi256_si128(filters[2]), 1); // Pairs 67 45 +} - temp[0] = _mm_hadd_epi16(temp[0], temp[1]); +static void kvz_eight_tap_filter_hor_8x1_avx2(kvz_pixel *data, int16_t * out, + __m256i *shuf_01_23, __m256i *shuf_45_67, + __m256i *taps_01_23, __m256i *taps_45_67) { - temp[2] = _mm_unpacklo_epi64(row[4], row[5]); - temp[2] = _mm_maddubs_epi16(temp[2], fir); + __m256i row = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)data)); - temp[3] = _mm_unpacklo_epi64(row[6], row[7]); - temp[3] = _mm_maddubs_epi16(temp[3], fir); + __m256i pairs_01_23 = _mm256_shuffle_epi8(row, *shuf_01_23); + __m256i pairs_45_67 = _mm256_shuffle_epi8(row, *shuf_45_67); - temp[2] = _mm_hadd_epi16(temp[2], temp[3]); + __m256i temp0 = _mm256_maddubs_epi16(pairs_01_23, *taps_01_23); + __m256i temp1 = _mm256_maddubs_epi16(pairs_45_67, *taps_45_67); - temp[0] = _mm_hadd_epi16(temp[0], temp[2]); - - temp[0] = _mm_srai_epi16(temp[0], shift1); - - return temp[0]; + __m256i sum = _mm256_add_epi16(temp0, temp1); + __m128i filtered = _mm_add_epi16(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); + _mm_storeu_si128((__m128i*)out, filtered); } -*/ - -static __m256i kvz_eight_tap_filter_flip_x8_dual_avx2(__m256i *row, int8_t *filter[2], int32_t shift1) -{ - __m256i temp[4]; - __m256i fir = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)filter[0])), _mm_loadl_epi64((__m128i*)filter[1]), 1); - fir = _mm256_shuffle_epi32(fir, _MM_SHUFFLE(1, 0, 1, 0)); - - temp[0] = _mm256_unpacklo_epi64(row[0], row[1]); - temp[0] = _mm256_maddubs_epi16(temp[0], fir); - temp[1] = _mm256_unpacklo_epi64(row[2], row[3]); - temp[1] = _mm256_maddubs_epi16(temp[1], fir); +static void kvz_four_tap_filter_hor_4x4_avx2(kvz_pixel *data, int stride, int16_t * out, int out_stride, + __m256i *shuf_01, __m256i *shuf_23, + __m256i *taps_01, __m256i *taps_23) { - temp[0] = _mm256_hadd_epi16(temp[0], temp[1]); + __m256i four_rows = _mm256_setr_epi64x( + *(int64_t*)&data[0 * stride], + *(int64_t*)&data[1 * stride], + *(int64_t*)&data[2 * stride], + *(int64_t*)&data[3 * stride]); - temp[2] = _mm256_unpacklo_epi64(row[4], row[5]); - temp[2] = _mm256_maddubs_epi16(temp[2], fir); + __m256i pairs_l = _mm256_shuffle_epi8(four_rows, *shuf_01); + __m256i pairs_r = _mm256_shuffle_epi8(four_rows, *shuf_23); - temp[3] = _mm256_unpacklo_epi64(row[6], row[7]); - temp[3] = _mm256_maddubs_epi16(temp[3], fir); - - temp[2] = _mm256_hadd_epi16(temp[2], temp[3]); + __m256i temp_l = _mm256_maddubs_epi16(pairs_l, *taps_01); + __m256i temp_r = _mm256_maddubs_epi16(pairs_r, *taps_23); - temp[0] = 
_mm256_hadd_epi16(temp[0], temp[2]); + __m256i sum = _mm256_add_epi16(temp_l, temp_r); - temp[0] = _mm256_srai_epi16(temp[0], shift1); - - return temp[0]; + __m128i lower = _mm256_castsi256_si128(sum); + __m128i upper = _mm256_extracti128_si256(sum, 1); + _mm_storel_epi64((__m128i*)(out + 0 * out_stride), lower); + _mm_storeh_pd((double*)(out + 1 * out_stride), _mm_castsi128_pd(lower)); + _mm_storel_epi64((__m128i*)(out + 2 * out_stride), upper); + _mm_storeh_pd((double*)(out + 3 * out_stride), _mm_castsi128_pd(upper)); } -/* -static INLINE void kvz_filter_flip_shift_x8_avx2(kvz_pixel *src, int16_t src_stride, int8_t *filter, int32_t shift1, int16_t *dst){ - - __m128i rows[8]; - rows[0] = _mm_loadl_epi64((__m128i*)(src + 0 * src_stride)); - rows[1] = _mm_loadl_epi64((__m128i*)(src + 1 * src_stride)); - rows[2] = _mm_loadl_epi64((__m128i*)(src + 2 * src_stride)); - rows[3] = _mm_loadl_epi64((__m128i*)(src + 3 * src_stride)); - rows[4] = _mm_loadl_epi64((__m128i*)(src + 4 * src_stride)); - rows[5] = _mm_loadl_epi64((__m128i*)(src + 5 * src_stride)); - rows[6] = _mm_loadl_epi64((__m128i*)(src + 6 * src_stride)); - rows[7] = _mm_loadl_epi64((__m128i*)(src + 7 * src_stride)); - __m128i out = kvz_eight_tap_filter_flip_x8_avx2(rows, filter, shift1); - _mm_storeu_si128((__m128i*)dst, out); -} -*/ +static void kvz_four_tap_filter_hor_4xN_avx2(kvz_pixel *data, int stride, int16_t * out, int out_stride, + __m256i *shuf_01_23, __m256i *taps_01_23, + int rows) { -static INLINE void kvz_filter_flip_shift_x8_dual_avx2(kvz_pixel *src, int16_t src_stride, int8_t *firs[2], int32_t shift1, int16_t *dst[2]){ - - __m256i rows[8]; - rows[0] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 0 * src_stride))); - rows[1] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 1 * src_stride))); - rows[2] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 2 * src_stride))); - rows[3] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 3 * src_stride))); - rows[4] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 4 * src_stride))); - rows[5] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 5 * src_stride))); - rows[6] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 6 * src_stride))); - rows[7] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 7 * src_stride))); - __m256i out = kvz_eight_tap_filter_flip_x8_dual_avx2(rows, firs, shift1); - _mm_storeu_si128((__m128i*)dst[0], _mm256_castsi256_si128(out)); - _mm_storeu_si128((__m128i*)dst[1], _mm256_extracti128_si256(out, 1)); -} + for (int i = 0; i < rows; ++i) { + __m256i row = _mm256_set1_epi64x(*(int64_t*)&data[i * stride]); -static INLINE void kvz_filter_flip_round_clip_x8_16bit_avx2(int16_t *flipped_filtered, int16_t src_stride, int8_t *filter, int32_t offset23, int32_t shift23, kvz_pixel *dst){ - - __m128i rows[8]; - rows[0] = _mm_loadu_si128((__m128i*)(flipped_filtered + 0 * src_stride)); - rows[1] = _mm_loadu_si128((__m128i*)(flipped_filtered + 1 * src_stride)); - rows[2] = _mm_loadu_si128((__m128i*)(flipped_filtered + 2 * src_stride)); - rows[3] = _mm_loadu_si128((__m128i*)(flipped_filtered + 3 * src_stride)); - rows[4] = _mm_loadu_si128((__m128i*)(flipped_filtered + 4 * src_stride)); - rows[5] = _mm_loadu_si128((__m128i*)(flipped_filtered + 5 * src_stride)); - rows[6] = _mm_loadu_si128((__m128i*)(flipped_filtered + 6 * src_stride)); - rows[7] = _mm_loadu_si128((__m128i*)(flipped_filtered + 7 * src_stride)); - _mm_storel_epi64((__m128i*)dst, 
kvz_eight_tap_filter_flip_x8_16bit_avx2(rows, filter, offset23, shift23) ); -} + __m256i pairs_l_r = _mm256_shuffle_epi8(row, *shuf_01_23); + __m256i temp_l_r = _mm256_maddubs_epi16(pairs_l_r, *taps_01_23); -static INLINE void kvz_filter_flip_round_clip_x8_16bit_dual_avx2(int16_t *flipped_filtered[2], int16_t src_stride, int8_t *firs[2], int32_t offset23, int32_t shift23, kvz_pixel *dst[2]){ - - __m256i rows[8]; - rows[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 0 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 0 * src_stride)), 1); - rows[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 1 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 1 * src_stride)), 1); - rows[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 2 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 2 * src_stride)), 1); - rows[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 3 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 3 * src_stride)), 1); - rows[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 4 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 4 * src_stride)), 1); - rows[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 5 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 5 * src_stride)), 1); - rows[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 6 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 6 * src_stride)), 1); - rows[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 7 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 7 * src_stride)), 1); - __m256i out = kvz_eight_tap_filter_flip_x8_16bit_dual_avx2(rows, firs, offset23, shift23); - _mm_storel_epi64((__m128i*)dst[0], _mm256_castsi256_si128(out)); - _mm_storel_epi64((__m128i*)dst[1], _mm256_extracti128_si256(out, 1)); + __m128i temp_l = _mm256_castsi256_si128(temp_l_r); + __m128i temp_r = _mm256_extracti128_si256(temp_l_r, 1); + __m128i sum = _mm_add_epi16(temp_l, temp_r); + _mm_storel_epi64((__m128i*)(out + i * out_stride), sum); + } } -__m128i kvz_eight_tap_filter_x4_and_flip_16bit(__m128i *data0, __m128i *data1, __m128i *data2, __m128i *data3, __m128i *filter) +static int32_t kvz_eight_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data) { - __m128i a, b, c, d; - __m128i fir = _mm_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(filter))); - - a = _mm_madd_epi16(*data0, fir); - b = _mm_madd_epi16(*data1, fir); - a = _mm_hadd_epi32(a, b); - - c = _mm_madd_epi16(*data2, fir); - d = _mm_madd_epi16(*data3, fir); - c = _mm_hadd_epi32(c, d); - - a = _mm_hadd_epi32(a, c); - - return a; + __m128i fir = _mm_loadl_epi64((__m128i*)filter); + fir = _mm_cvtepi8_epi16(fir); + __m128i row = _mm_loadu_si128((__m128i*)data); + __m128i acc; + acc = _mm_madd_epi16(fir, row); + __m128i temp = _mm_srli_si128(acc, 8); + acc = _mm_add_epi32(acc, temp); + temp = _mm_srli_si128(acc, 4); + acc = _mm_add_epi32(acc, temp); + int32_t filtered = _mm_cvtsi128_si32(acc); + + return filtered; } -void kvz_eight_tap_filter_and_flip_avx2(int8_t filter[4][8], kvz_pixel *src, int16_t src_stride, int16_t* __restrict dst) +static void 
kvz_eight_tap_filter_ver_16bit_1x8_avx2(int8_t *filter, int16_t *data, int16_t stride, kvz_pixel *out) { + // Interpolation filter shifts + int32_t shift2 = 6; - //Load 2 rows per xmm register - __m128i rows01 = _mm_loadl_epi64((__m128i*)(src + 0 * src_stride)); - rows01 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows01), (double*)(src + 1 * src_stride))); - - __m128i rows23 = _mm_loadl_epi64((__m128i*)(src + 2 * src_stride)); - rows23 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows23), (double*)(src + 3 * src_stride))); - - __m128i rows45 = _mm_loadl_epi64((__m128i*)(src + 4 * src_stride)); - rows45 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows45), (double*)(src + 5 * src_stride))); + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); - __m128i rows67 = _mm_loadl_epi64((__m128i*)(src + 6 * src_stride)); - rows67 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows67), (double*)(src + 7 * src_stride))); + // Filter weights + __m256i all_taps = _mm256_castsi128_si256(_mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)filter))); + __m256i taps_01_23 = _mm256_shuffle_epi32(all_taps, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i taps_23 = _mm_shuffle_epi32(_mm256_castsi256_si128(all_taps), _MM_SHUFFLE(1, 1, 1, 1)); + __m256i taps_45_67 = _mm256_shuffle_epi32(all_taps, _MM_SHUFFLE(2, 2, 2, 2)); + __m128i taps_67 = _mm_shuffle_epi32(_mm256_castsi256_si128(all_taps), _MM_SHUFFLE(3, 3, 3, 3)); - //Filter rows - const int dst_stride = MAX_WIDTH; - kvz_eight_tap_filter_x8_and_flip(&rows01, &rows23, &rows45, &rows67, (__m128i*)(&filter[0]), (__m128i*)(dst + 0)); - kvz_eight_tap_filter_x8_and_flip(&rows01, &rows23, &rows45, &rows67, (__m128i*)(&filter[1]), (__m128i*)(dst + 1 * dst_stride)); - kvz_eight_tap_filter_x8_and_flip(&rows01, &rows23, &rows45, &rows67, (__m128i*)(&filter[2]), (__m128i*)(dst + 2 * dst_stride)); - kvz_eight_tap_filter_x8_and_flip(&rows01, &rows23, &rows45, &rows67, (__m128i*)(&filter[3]), (__m128i*)(dst + 3 * dst_stride)); -} + taps_01_23 = _mm256_inserti128_si256(taps_01_23, taps_23, 1); + taps_45_67 = _mm256_inserti128_si256(taps_45_67, taps_67, 1); -static INLINE void eight_tap_filter_and_flip_16bit_avx2(int8_t filter[4][8], int16_t *src, int16_t src_stride, int offset, int combined_shift, kvz_pixel* __restrict dst, int16_t dst_stride) -{ + __m256i rows02 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&data[0 * stride])); + __m128i row2 = _mm_loadu_si128((__m128i*)&data[2 * stride]); + rows02 = _mm256_inserti128_si256(rows02, row2, 1); - //Load a row per xmm register - __m128i row0 = _mm_loadu_si128((__m128i*)(src + 0 * src_stride)); - __m128i row1 = _mm_loadu_si128((__m128i*)(src + 1 * src_stride)); - __m128i row2 = _mm_loadu_si128((__m128i*)(src + 2 * src_stride)); - __m128i row3 = _mm_loadu_si128((__m128i*)(src + 3 * src_stride)); + __m256i rows13 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&data[1 * stride])); + __m128i row3 = _mm_loadu_si128((__m128i*)&data[3 * stride]); + rows13 = _mm256_inserti128_si256(rows13, row3, 1); - //Filter rows - union { - __m128i vector; - int32_t array[4]; - } temp[4]; + __m256i pairs_01_23_lo = _mm256_unpacklo_epi16(rows02, rows13); + __m256i pairs_01_23_hi = _mm256_unpackhi_epi16(rows02, rows13); + __m256i temp_01_23_lo = _mm256_madd_epi16(pairs_01_23_lo, taps_01_23); + __m256i temp_01_23_hi = _mm256_madd_epi16(pairs_01_23_hi, taps_01_23); - temp[0].vector = kvz_eight_tap_filter_x4_and_flip_16bit(&row0, &row1, &row2, &row3, (__m128i*)(&filter[0])); - 
temp[1].vector = kvz_eight_tap_filter_x4_and_flip_16bit(&row0, &row1, &row2, &row3, (__m128i*)(&filter[1])); - temp[2].vector = kvz_eight_tap_filter_x4_and_flip_16bit(&row0, &row1, &row2, &row3, (__m128i*)(&filter[2])); - temp[3].vector = kvz_eight_tap_filter_x4_and_flip_16bit(&row0, &row1, &row2, &row3, (__m128i*)(&filter[3])); + __m256i rows46 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&data[4 * stride])); + __m128i row6 = _mm_loadu_si128((__m128i*)&data[6 * stride]); + rows46 = _mm256_inserti128_si256(rows46, row6, 1); - __m128i packed_offset = _mm_set1_epi32(offset); + __m256i rows57 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&data[5 * stride])); + __m128i row7 = _mm_loadu_si128((__m128i*)&data[7 * stride]); + rows57 = _mm256_inserti128_si256(rows57, row7, 1); - temp[0].vector = _mm_add_epi32(temp[0].vector, packed_offset); - temp[0].vector = _mm_srai_epi32(temp[0].vector, combined_shift); - temp[1].vector = _mm_add_epi32(temp[1].vector, packed_offset); - temp[1].vector = _mm_srai_epi32(temp[1].vector, combined_shift); + __m256i pairs_45_67_lo = _mm256_unpacklo_epi16(rows46, rows57); + __m256i pairs_45_67_hi = _mm256_unpackhi_epi16(rows46, rows57); + __m256i temp_45_67_lo = _mm256_madd_epi16(pairs_45_67_lo, taps_45_67); + __m256i temp_45_67_hi = _mm256_madd_epi16(pairs_45_67_hi, taps_45_67); - temp[0].vector = _mm_packus_epi32(temp[0].vector, temp[1].vector); + __m256i sum_lo_half = _mm256_add_epi32(temp_01_23_lo, temp_45_67_lo); + __m256i sum_hi_half = _mm256_add_epi32(temp_01_23_hi, temp_45_67_hi); - temp[2].vector = _mm_add_epi32(temp[2].vector, packed_offset); - temp[2].vector = _mm_srai_epi32(temp[2].vector, combined_shift); - temp[3].vector = _mm_add_epi32(temp[3].vector, packed_offset); - temp[3].vector = _mm_srai_epi32(temp[3].vector, combined_shift); + __m128i sum_lo = _mm_add_epi32(_mm256_castsi256_si128(sum_lo_half), _mm256_extracti128_si256(sum_lo_half, 1)); + __m128i sum_hi = _mm_add_epi32(_mm256_castsi256_si128(sum_hi_half), _mm256_extracti128_si256(sum_hi_half, 1)); - temp[2].vector = _mm_packus_epi32(temp[2].vector, temp[3].vector); + sum_lo = _mm_srai_epi32(sum_lo, shift2); + sum_hi = _mm_srai_epi32(sum_hi, shift2); - temp[0].vector = _mm_packus_epi16(temp[0].vector, temp[2].vector); + __m128i offset = _mm_set1_epi32(wp_offset1); + sum_lo = _mm_add_epi32(sum_lo, offset); + sum_lo = _mm_srai_epi32(sum_lo, wp_shift1); + sum_hi = _mm_add_epi32(sum_hi, offset); + sum_hi = _mm_srai_epi32(sum_hi, wp_shift1); + __m128i filtered = _mm_packus_epi32(sum_lo, sum_hi); + filtered = _mm_packus_epi16(filtered, filtered); - int32_t* four_pixels = (int32_t*)&(dst[0 * dst_stride]); - *four_pixels = temp[0].array[0]; - four_pixels = (int32_t*)&(dst[1 * dst_stride]); - *four_pixels = _mm_extract_epi32(temp[0].vector, 1); + _mm_storel_epi64((__m128i*)out, filtered); +} - four_pixels = (int32_t*)&(dst[2 * dst_stride]); - *four_pixels = _mm_extract_epi32(temp[0].vector, 2); +static void kvz_four_tap_filter_ver_16bit_4x4_avx2(int8_t *filter, int16_t *data, int16_t stride, kvz_pixel *out, int16_t out_stride) +{ + // Interpolation filter shifts + int32_t shift2 = 6; - four_pixels = (int32_t*)&(dst[3 * dst_stride]); - *four_pixels = _mm_extract_epi32(temp[0].vector, 3); + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); + + // Filter weights + __m128i all_taps = _mm_cvtepi8_epi16(_mm_cvtsi32_si128(*(int32_t*)filter)); + __m128i taps_01 = _mm_shuffle_epi32(all_taps, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i 
taps_23 = _mm_shuffle_epi32(all_taps, _MM_SHUFFLE(1, 1, 1, 1)); + + __m128i row0 = _mm_loadl_epi64((__m128i*)&data[0 * stride]); + __m128i row1 = _mm_loadl_epi64((__m128i*)&data[1 * stride]); + __m128i row2 = _mm_loadl_epi64((__m128i*)&data[2 * stride]); + __m128i row3 = _mm_loadl_epi64((__m128i*)&data[3 * stride]); + __m128i row4 = _mm_loadl_epi64((__m128i*)&data[4 * stride]); + __m128i row5 = _mm_loadl_epi64((__m128i*)&data[5 * stride]); + __m128i row6 = _mm_loadl_epi64((__m128i*)&data[6 * stride]); + + __m128i pairs01 = _mm_unpacklo_epi16(row0, row1); + __m128i pairs23 = _mm_unpacklo_epi16(row2, row3); + __m128i temp01 = _mm_madd_epi16(pairs01, taps_01); + __m128i temp23 = _mm_madd_epi16(pairs23, taps_23); + __m128i sum0123 = _mm_add_epi32(temp01, temp23); + + __m128i pairs12 = _mm_unpacklo_epi16(row1, row2); + __m128i pairs34 = _mm_unpacklo_epi16(row3, row4); + __m128i temp12 = _mm_madd_epi16(pairs12, taps_01); + __m128i temp34 = _mm_madd_epi16(pairs34, taps_23); + __m128i sum1234 = _mm_add_epi32(temp12, temp34); + + __m128i pairs45 = _mm_unpacklo_epi16(row4, row5); + __m128i temp23_2 = _mm_madd_epi16(pairs23, taps_01); + __m128i temp45 = _mm_madd_epi16(pairs45, taps_23); + __m128i sum2345 = _mm_add_epi32(temp23_2, temp45); + + __m128i pairs56 = _mm_unpacklo_epi16(row5, row6); + __m128i temp34_2 = _mm_madd_epi16(pairs34, taps_01); + __m128i temp56 = _mm_madd_epi16(pairs56, taps_23); + __m128i sum3456 = _mm_add_epi32(temp34_2, temp56); + + sum0123 = _mm_srai_epi32(sum0123, shift2); + sum1234 = _mm_srai_epi32(sum1234, shift2); + sum2345 = _mm_srai_epi32(sum2345, shift2); + sum3456 = _mm_srai_epi32(sum3456, shift2); + + __m128i offset = _mm_set1_epi32(wp_offset1); + sum0123 = _mm_add_epi32(sum0123, offset); + sum1234 = _mm_add_epi32(sum1234, offset); + sum2345 = _mm_add_epi32(sum2345, offset); + sum3456 = _mm_add_epi32(sum3456, offset); + + sum0123 = _mm_srai_epi32(sum0123, wp_shift1); + sum1234 = _mm_srai_epi32(sum1234, wp_shift1); + sum2345 = _mm_srai_epi32(sum2345, wp_shift1); + sum3456 = _mm_srai_epi32(sum3456, wp_shift1); + + __m128i filtered01 = _mm_packs_epi32(sum0123, sum1234); + __m128i filtered23 = _mm_packs_epi32(sum2345, sum3456); + __m128i filtered = _mm_packus_epi16(filtered01, filtered23); + + *(int32_t*)&out[0 * out_stride] = _mm_cvtsi128_si32(filtered); + *(int32_t*)&out[1 * out_stride] = _mm_extract_epi32(filtered, 1); + *(int32_t*)&out[2 * out_stride] = _mm_extract_epi32(filtered, 2); + *(int32_t*)&out[3 * out_stride] = _mm_extract_epi32(filtered, 3); +} +static void kvz_four_tap_filter_ver_16bit_4x4_no_round_avx2(int8_t *filter, int16_t *data, int16_t stride, int16_t *out, int16_t out_stride) +{ + int32_t shift2 = 6; + // Filter weights + __m128i all_taps = _mm_cvtepi8_epi16(_mm_cvtsi32_si128(*(int32_t*)filter)); + __m128i taps_01 = _mm_shuffle_epi32(all_taps, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i taps_23 = _mm_shuffle_epi32(all_taps, _MM_SHUFFLE(1, 1, 1, 1)); + + __m128i row0 = _mm_loadl_epi64((__m128i*)&data[0 * stride]); + __m128i row1 = _mm_loadl_epi64((__m128i*)&data[1 * stride]); + __m128i row2 = _mm_loadl_epi64((__m128i*)&data[2 * stride]); + __m128i row3 = _mm_loadl_epi64((__m128i*)&data[3 * stride]); + __m128i row4 = _mm_loadl_epi64((__m128i*)&data[4 * stride]); + __m128i row5 = _mm_loadl_epi64((__m128i*)&data[5 * stride]); + __m128i row6 = _mm_loadl_epi64((__m128i*)&data[6 * stride]); + + __m128i pairs01 = _mm_unpacklo_epi16(row0, row1); + __m128i pairs23 = _mm_unpacklo_epi16(row2, row3); + __m128i temp01 = _mm_madd_epi16(pairs01, taps_01); + __m128i temp23 = 
_mm_madd_epi16(pairs23, taps_23); + __m128i sum0123 = _mm_add_epi32(temp01, temp23); + + __m128i pairs12 = _mm_unpacklo_epi16(row1, row2); + __m128i pairs34 = _mm_unpacklo_epi16(row3, row4); + __m128i temp12 = _mm_madd_epi16(pairs12, taps_01); + __m128i temp34 = _mm_madd_epi16(pairs34, taps_23); + __m128i sum1234 = _mm_add_epi32(temp12, temp34); + + __m128i pairs45 = _mm_unpacklo_epi16(row4, row5); + __m128i temp23_2 = _mm_madd_epi16(pairs23, taps_01); + __m128i temp45 = _mm_madd_epi16(pairs45, taps_23); + __m128i sum2345 = _mm_add_epi32(temp23_2, temp45); + + __m128i pairs56 = _mm_unpacklo_epi16(row5, row6); + __m128i temp34_2 = _mm_madd_epi16(pairs34, taps_01); + __m128i temp56 = _mm_madd_epi16(pairs56, taps_23); + __m128i sum3456 = _mm_add_epi32(temp34_2, temp56); + + sum0123 = _mm_srai_epi32(sum0123, shift2); + sum1234 = _mm_srai_epi32(sum1234, shift2); + sum2345 = _mm_srai_epi32(sum2345, shift2); + sum3456 = _mm_srai_epi32(sum3456, shift2); + + __m128i filtered01 = _mm_packs_epi32(sum0123, sum1234); + __m128i filtered23 = _mm_packs_epi32(sum2345, sum3456); + + _mm_storel_pi((__m64*)&out[0 * out_stride], _mm_castsi128_ps(filtered01)); + _mm_storeh_pi((__m64*)&out[1 * out_stride], _mm_castsi128_ps(filtered01)); + _mm_storel_pi((__m64*)&out[2 * out_stride], _mm_castsi128_ps(filtered23)); + _mm_storeh_pi((__m64*)&out[3 * out_stride], _mm_castsi128_ps(filtered23)); } -int16_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data) +INLINE static void filter_row_ver_16b_8x1_avx2(int16_t *data, int64_t stride, __m256i* taps, kvz_pixel * out, int64_t out_stride) { + // Interpolation filter shifts + int32_t shift2 = 6; - __m128i sample; + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); + + __m256i pairs_lo, pairs_hi; + + // Filter 01 later with 67 + __m256i br0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 0 * stride))); + __m256i br1 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 1 * stride))); + + __m256i br2 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 2 * stride))); + __m256i br3 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 3 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br2, br3); + pairs_hi = _mm256_unpackhi_epi16(br2, br3); + __m256i rows02_23_01_lo = _mm256_madd_epi16(pairs_lo, taps[1]); // Firs 23 01 + __m256i rows02_23_01_hi = _mm256_madd_epi16(pairs_hi, taps[1]); // Firs 23 01 + + __m256i br4 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 4 * stride))); + __m256i br5 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 5 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br4, br5); + pairs_hi = _mm256_unpackhi_epi16(br4, br5); + __m256i rows02_45_23_lo = _mm256_madd_epi16(pairs_lo, taps[2]); // Firs 45 23 + __m256i rows02_45_23_hi = _mm256_madd_epi16(pairs_hi, taps[2]); // Firs 45 23 + + __m256i br6 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 6 * stride))); + __m256i br7 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 7 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br6, br7); + pairs_hi = _mm256_unpackhi_epi16(br6, br7); + __m256i rows02_67_45_lo = _mm256_madd_epi16(pairs_lo, taps[3]); // Firs 67 45 + __m256i rows02_67_45_hi = _mm256_madd_epi16(pairs_hi, taps[3]); // Firs 67 45 + __m256i rows46_23_01_lo = _mm256_madd_epi16(pairs_lo, taps[1]); // Firs 23 01 + __m256i rows46_23_01_hi = _mm256_madd_epi16(pairs_hi, taps[1]); // Firs 23 01 + + __m256i br8 = 
_mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 8 * stride))); + __m256i br9 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 9 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br8, br9); + pairs_hi = _mm256_unpackhi_epi16(br8, br9); + // Filter rows02 later + __m256i rows46_45_23_lo = _mm256_madd_epi16(pairs_lo, taps[2]); // Firs 45 23 + __m256i rows46_45_23_hi = _mm256_madd_epi16(pairs_hi, taps[2]); // Firs 45 23 + + __m256i br10 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 10 * stride))); + __m256i br11 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 11 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br10, br11); + pairs_hi = _mm256_unpackhi_epi16(br10, br11); + __m256i rows46_67_45_lo = _mm256_madd_epi16(pairs_lo, taps[3]); // Firs 67 45 + __m256i rows46_67_45_hi = _mm256_madd_epi16(pairs_hi, taps[3]); // Firs 67 45 + + // Deferred + __m256i r08 = _mm256_permute2x128_si256(br0, br8, _MM_SHUFFLE(0, 2, 0, 0)); + __m256i r19 = _mm256_permute2x128_si256(br1, br9, _MM_SHUFFLE(0, 2, 0, 0)); + pairs_lo = _mm256_unpacklo_epi16(r08, r19); + pairs_hi = _mm256_unpackhi_epi16(r08, r19); + __m256i rows02_01_67_lo = _mm256_madd_epi16(pairs_lo, taps[0]); // Firs 01 67 + __m256i rows02_01_67_hi = _mm256_madd_epi16(pairs_hi, taps[0]); // Firs 01 67 + + __m256i br12 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 12 * stride))); + __m256i br13 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 13 * stride))); + + __m256i r412 = _mm256_permute2x128_si256(br4, br12, _MM_SHUFFLE(0, 2, 0, 0)); + __m256i r513 = _mm256_permute2x128_si256(br5, br13, _MM_SHUFFLE(0, 2, 0, 0)); + pairs_lo = _mm256_unpacklo_epi16(r412, r513); + pairs_hi = _mm256_unpackhi_epi16(r412, r513); + __m256i rows46_01_67_lo = _mm256_madd_epi16(pairs_lo, taps[0]); // Firs 01 67 + __m256i rows46_01_67_hi = _mm256_madd_epi16(pairs_hi, taps[0]); // Firs 01 67 + + __m256i accu02_lo, accu02_hi; + __m256i accu46_lo, accu46_hi; + + accu02_lo = _mm256_add_epi32(rows02_23_01_lo, rows02_45_23_lo); + accu02_lo = _mm256_add_epi32(accu02_lo, rows02_67_45_lo); + accu02_lo = _mm256_add_epi32(accu02_lo, rows02_01_67_lo); + + accu02_hi = _mm256_add_epi32(rows02_23_01_hi, rows02_45_23_hi); + accu02_hi = _mm256_add_epi32(accu02_hi, rows02_67_45_hi); + accu02_hi = _mm256_add_epi32(accu02_hi, rows02_01_67_hi); + + accu46_lo = _mm256_add_epi32(rows46_23_01_lo, rows46_45_23_lo); + accu46_lo = _mm256_add_epi32(accu46_lo, rows46_67_45_lo); + accu46_lo = _mm256_add_epi32(accu46_lo, rows46_01_67_lo); + + accu46_hi = _mm256_add_epi32(rows46_23_01_hi, rows46_45_23_hi); + accu46_hi = _mm256_add_epi32(accu46_hi, rows46_67_45_hi); + accu46_hi = _mm256_add_epi32(accu46_hi, rows46_01_67_hi); + + accu02_lo = _mm256_srai_epi32(accu02_lo, shift2); + accu02_hi = _mm256_srai_epi32(accu02_hi, shift2); + accu46_lo = _mm256_srai_epi32(accu46_lo, shift2); + accu46_hi = _mm256_srai_epi32(accu46_hi, shift2); + + __m256i offset = _mm256_set1_epi32(wp_offset1); + accu02_lo = _mm256_add_epi32(accu02_lo, offset); + accu02_hi = _mm256_add_epi32(accu02_hi, offset); + accu46_lo = _mm256_add_epi32(accu46_lo, offset); + accu46_hi = _mm256_add_epi32(accu46_hi, offset); + + accu02_lo = _mm256_srai_epi32(accu02_lo, wp_shift1); + accu02_hi = _mm256_srai_epi32(accu02_hi, wp_shift1); + accu46_lo = _mm256_srai_epi32(accu46_lo, wp_shift1); + accu46_hi = _mm256_srai_epi32(accu46_hi, wp_shift1); + + __m256i rows02 = _mm256_packs_epi32(accu02_lo, accu02_hi); + __m256i rows46 = _mm256_packs_epi32(accu46_lo, 
accu46_hi); + + __m256i filtered04_26 = _mm256_packus_epi16(rows02, rows46); + __m128i filtered04 = _mm256_castsi256_si128(filtered04_26); + __m128i filtered26 = _mm256_extracti128_si256(filtered04_26, 1); + + _mm_storel_pi((__m64*)&out[0 * out_stride], _mm_castsi128_ps(filtered04)); + _mm_storel_pi((__m64*)&out[2 * out_stride], _mm_castsi128_ps(filtered26)); + _mm_storeh_pi((__m64*)&out[4 * out_stride], _mm_castsi128_ps(filtered04)); + _mm_storeh_pi((__m64*)&out[6 * out_stride], _mm_castsi128_ps(filtered26)); +} - __m128i packed_data = _mm_loadl_epi64((__m128i*)data); - __m128i packed_filter = _mm_loadl_epi64((__m128i*)filter); +INLINE static void filter_row_ver_16b_8x1_no_round_avx2(int16_t *data, int64_t stride, __m256i *taps, int16_t *out, int64_t out_stride) { - sample = _mm_maddubs_epi16(packed_data, packed_filter); - sample = _mm_add_epi16(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(0, 1, 0, 1))); - sample = _mm_add_epi16(sample, _mm_shufflelo_epi16(sample, _MM_SHUFFLE(0, 1, 0, 1))); + int32_t shift2 = 6; - return (int16_t)_mm_cvtsi128_si32(sample); + __m256i pairs_lo, pairs_hi; + + // Filter 01 later with 67 + __m256i br0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 0 * stride))); + __m256i br1 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 1 * stride))); + + __m256i br2 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 2 * stride))); + __m256i br3 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 3 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br2, br3); + pairs_hi = _mm256_unpackhi_epi16(br2, br3); + __m256i rows02_23_01_lo = _mm256_madd_epi16(pairs_lo, taps[1]); // Firs 23 01 + __m256i rows02_23_01_hi = _mm256_madd_epi16(pairs_hi, taps[1]); // Firs 23 01 + + __m256i br4 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 4 * stride))); + __m256i br5 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 5 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br4, br5); + pairs_hi = _mm256_unpackhi_epi16(br4, br5); + __m256i rows02_45_23_lo = _mm256_madd_epi16(pairs_lo, taps[2]); // Firs 45 23 + __m256i rows02_45_23_hi = _mm256_madd_epi16(pairs_hi, taps[2]); // Firs 45 23 + + __m256i br6 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 6 * stride))); + __m256i br7 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 7 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br6, br7); + pairs_hi = _mm256_unpackhi_epi16(br6, br7); + __m256i rows02_67_45_lo = _mm256_madd_epi16(pairs_lo, taps[3]); // Firs 67 45 + __m256i rows02_67_45_hi = _mm256_madd_epi16(pairs_hi, taps[3]); // Firs 67 45 + __m256i rows46_23_01_lo = _mm256_madd_epi16(pairs_lo, taps[1]); // Firs 23 01 + __m256i rows46_23_01_hi = _mm256_madd_epi16(pairs_hi, taps[1]); // Firs 23 01 + + __m256i br8 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 8 * stride))); + __m256i br9 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 9 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br8, br9); + pairs_hi = _mm256_unpackhi_epi16(br8, br9); + // Filter rows02 later + __m256i rows46_45_23_lo = _mm256_madd_epi16(pairs_lo, taps[2]); // Firs 45 23 + __m256i rows46_45_23_hi = _mm256_madd_epi16(pairs_hi, taps[2]); // Firs 45 23 + + __m256i br10 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 10 * stride))); + __m256i br11 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 11 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br10, br11); + pairs_hi = _mm256_unpackhi_epi16(br10, br11); 
+ __m256i rows46_67_45_lo = _mm256_madd_epi16(pairs_lo, taps[3]); // Firs 67 45 + __m256i rows46_67_45_hi = _mm256_madd_epi16(pairs_hi, taps[3]); // Firs 67 45 + + // Deferred + __m256i r08 = _mm256_permute2x128_si256(br0, br8, _MM_SHUFFLE(0, 2, 0, 0)); + __m256i r19 = _mm256_permute2x128_si256(br1, br9, _MM_SHUFFLE(0, 2, 0, 0)); + pairs_lo = _mm256_unpacklo_epi16(r08, r19); + pairs_hi = _mm256_unpackhi_epi16(r08, r19); + __m256i rows02_01_67_lo = _mm256_madd_epi16(pairs_lo, taps[0]); // Firs 01 67 + __m256i rows02_01_67_hi = _mm256_madd_epi16(pairs_hi, taps[0]); // Firs 01 67 + + __m256i br12 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 12 * stride))); + __m256i br13 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 13 * stride))); + + __m256i r412 = _mm256_permute2x128_si256(br4, br12, _MM_SHUFFLE(0, 2, 0, 0)); + __m256i r513 = _mm256_permute2x128_si256(br5, br13, _MM_SHUFFLE(0, 2, 0, 0)); + pairs_lo = _mm256_unpacklo_epi16(r412, r513); + pairs_hi = _mm256_unpackhi_epi16(r412, r513); + __m256i rows46_01_67_lo = _mm256_madd_epi16(pairs_lo, taps[0]); // Firs 01 67 + __m256i rows46_01_67_hi = _mm256_madd_epi16(pairs_hi, taps[0]); // Firs 01 67 + + __m256i accu02_lo, accu02_hi; + __m256i accu46_lo, accu46_hi; + + accu02_lo = _mm256_add_epi32(rows02_23_01_lo, rows02_45_23_lo); + accu02_lo = _mm256_add_epi32(accu02_lo, rows02_67_45_lo); + accu02_lo = _mm256_add_epi32(accu02_lo, rows02_01_67_lo); + + accu02_hi = _mm256_add_epi32(rows02_23_01_hi, rows02_45_23_hi); + accu02_hi = _mm256_add_epi32(accu02_hi, rows02_67_45_hi); + accu02_hi = _mm256_add_epi32(accu02_hi, rows02_01_67_hi); + + accu46_lo = _mm256_add_epi32(rows46_23_01_lo, rows46_45_23_lo); + accu46_lo = _mm256_add_epi32(accu46_lo, rows46_67_45_lo); + accu46_lo = _mm256_add_epi32(accu46_lo, rows46_01_67_lo); + + accu46_hi = _mm256_add_epi32(rows46_23_01_hi, rows46_45_23_hi); + accu46_hi = _mm256_add_epi32(accu46_hi, rows46_67_45_hi); + accu46_hi = _mm256_add_epi32(accu46_hi, rows46_01_67_hi); + + accu02_lo = _mm256_srai_epi32(accu02_lo, shift2); + accu02_hi = _mm256_srai_epi32(accu02_hi, shift2); + accu46_lo = _mm256_srai_epi32(accu46_lo, shift2); + accu46_hi = _mm256_srai_epi32(accu46_hi, shift2); + + __m256i rows02 = _mm256_packs_epi32(accu02_lo, accu02_hi); + __m256i rows46 = _mm256_packs_epi32(accu46_lo, accu46_hi); + + __m128i filtered0 = _mm256_castsi256_si128(rows02); + __m128i filtered2 = _mm256_extracti128_si256(rows02, 1); + __m128i filtered4 = _mm256_castsi256_si128(rows46); + __m128i filtered6 = _mm256_extracti128_si256(rows46, 1); + + _mm_storeu_si128((__m128i*)(out + 0 * out_stride), filtered0); + _mm_storeu_si128((__m128i*)(out + 2 * out_stride), filtered2); + _mm_storeu_si128((__m128i*)(out + 4 * out_stride), filtered4); + _mm_storeu_si128((__m128i*)(out + 6 * out_stride), filtered6); } - -int32_t kvz_eight_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data) +INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_avx2(__m256i *filters, int16_t *data, int16_t stride, kvz_pixel *out, int out_stride) { - __m128i sample; - - __m128i packed_data = _mm_loadu_si128((__m128i*)data); - __m128i packed_filter = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)filter)); + // Filter even rows + filter_row_ver_16b_8x1_avx2(data, stride, filters, out, out_stride); // 0 2 4 6 - sample = _mm_madd_epi16(packed_data, packed_filter); - sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(0, 1, 3, 2))); - sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(0, 1, 0, 1))); - - 
return _mm_extract_epi32(sample, 0); + // Filter odd rows + filter_row_ver_16b_8x1_avx2(data + stride, stride, filters, out + out_stride, out_stride); // 1 3 5 7 + } -int16_t kvz_eight_tap_filter_ver_avx2(int8_t *filter, kvz_pixel *data, int16_t stride) +INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_no_round_avx2(__m256i *filters, int16_t *data, int16_t stride, int16_t *out, int out_stride) { - int16_t temp = 0; - for (int i = 0; i < 8; ++i) - { - temp += filter[i] * data[stride * i]; - } + // Filter even rows + filter_row_ver_16b_8x1_no_round_avx2(data, stride, filters, out, out_stride); // 0 2 4 6 + + // Filter odd rows + filter_row_ver_16b_8x1_no_round_avx2(data + stride, stride, filters, out + out_stride, out_stride); // 1 3 5 7 - return temp; } -int32_t kvz_eight_tap_filter_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride) +static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + int8_t fme_level, + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t hpel_off_x, int8_t hpel_off_y) { - int32_t temp = 0; - for (int i = 0; i < 8; ++i) - { - temp += filter[i] * data[stride * i]; - } + int x, y, first_y; - return temp; -} + // Interpolation filter shifts + int16_t shift1 = KVZ_BIT_DEPTH - 8; + int32_t shift2 = 6; -int16_t kvz_four_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data) -{ - __m128i packed_data = _mm_cvtsi32_si128(*(int32_t*)data); - __m128i packed_filter = _mm_cvtsi32_si128(*(int32_t*)filter); + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); - __m128i temp = _mm_maddubs_epi16(packed_data, packed_filter); - temp = _mm_hadd_epi16(temp, temp); + int8_t *fir0 = kvz_g_luma_filter[0]; + int8_t *fir2 = kvz_g_luma_filter[2]; - return _mm_extract_epi16(temp, 0); -} + int16_t dst_stride = LCU_WIDTH; + int16_t hor_stride = LCU_WIDTH; -int32_t kvz_four_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data) -{ - __m128i packed_data = _mm_loadl_epi64((__m128i*)data); - __m128i packed_filter = _mm_cvtepi8_epi16(_mm_cvtsi32_si128(*(int32_t*)filter) ); + int16_t *hor_pos0 = hor_intermediate[0]; + int16_t *hor_pos2 = hor_intermediate[1]; + int16_t *col_pos0 = hor_first_cols[0]; + int16_t *col_pos2 = hor_first_cols[2]; - __m128i temp = _mm_madd_epi16(packed_data, packed_filter); - temp = _mm_hadd_epi32(temp, temp); + // Horizontally filtered samples from the top row are + // not needed unless samples for diagonal positions are filtered later. + first_y = fme_level > 1 ? 
0 : 1; - return _mm_cvtsi128_si32(temp); -} + // HORIZONTAL STEP + // Integer pixels + __m256i shuf_01_23, shuf_45_67; + __m256i taps_01_23, taps_45_67; -int16_t kvz_four_tap_filter_ver_avx2(int8_t *filter, kvz_pixel *data, int16_t stride) -{ - int16_t temp = 0; - for (int i = 0; i < 4; ++i) - { - temp += filter[i] * data[stride * i]; + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + + for (x = 0; x + 7 < width; x += 8) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x + 1; + __m128i* out = (__m128i*)&hor_pos0[y * hor_stride + x]; + __m128i chunk = _mm_loadl_epi64((__m128i*)&src[src_stride*ypos + xpos]); + chunk = _mm_cvtepu8_epi16(chunk); + chunk = _mm_slli_epi16(chunk, 6); // Multiply by 64 + _mm_storeu_si128(out, chunk); //TODO: >> shift1 + } } - return temp; -} + // Write the first column in contiguous memory + x = 0; + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int32_t first_sample = src[src_stride*ypos + x] << 6 >> shift1; + col_pos0[y] = first_sample; + } -int32_t kvz_four_tap_filter_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride) -{ - int32_t temp = 0; - for (int i = 0; i < 4; ++i) - { - temp += filter[i] * data[stride * i]; + // Half pixels + kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); + kvz_init_filter_taps(fir2, &taps_01_23, &taps_45_67); + + for (y = first_y; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + + for (x = 0; x + 7 < width; x += 8) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; + kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_pos2[y * hor_stride + x], + &shuf_01_23, &shuf_45_67, + &taps_01_23, &taps_45_67); //TODO: >> shift1 + } } - return temp; -} + // Write the first column in contiguous memory + x = 0; + for (y = first_y; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + col_pos2[y] = kvz_eight_tap_filter_hor_avx2(fir2, &src[src_stride*ypos + xpos]) >> shift1; + } -void kvz_eight_tap_filter_x4_hor_avx2(int8_t *filter, kvz_pixel *data, int shift, int16_t* dst) -{ - __m256i packed_data = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)data)), _mm_loadl_epi64((__m128i*)(data + 2)), 1); - __m256i packed_filter = _mm256_broadcastq_epi64(_mm_loadl_epi64((__m128i*)filter)); - __m256i idx_lookup = _mm256_broadcastsi128_si256(_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8)); + // VERTICAL STEP + kvz_pixel *out_l = filtered[0]; + kvz_pixel *out_r = filtered[1]; + kvz_pixel *out_t = filtered[2]; + kvz_pixel *out_b = filtered[3]; - __m256i temp = _mm256_shuffle_epi8(packed_data, idx_lookup); + __m256i taps[4]; + kvz_init_ver_filter_taps(fir0, taps); - temp = _mm256_maddubs_epi16(temp, packed_filter); - __m128i temp_128 = _mm_hadd_epi16(_mm256_extracti128_si256(temp, 0), _mm256_extracti128_si256(temp, 1)); - temp_128 = _mm_hadd_epi16(temp_128, temp_128); - temp_128 = _mm_srai_epi16(temp_128, shift); + // Right + for (y = 0; y + 7 < height; y+=8) { - _mm_storel_epi64((__m128i*)dst, temp_128); -} + for (x = 0; x + 7 < width ; x+=8) { + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos2[(y + 1) * hor_stride + x], hor_stride, &out_r[y * dst_stride + x], dst_stride); + } + } -void kvz_four_tap_filter_x4_hor_avx2(int8_t *filter, kvz_pixel *data, int shift, int16_t* dst) -{ - __m128i packed_data = _mm_loadl_epi64((__m128i*)data); - __m128i packed_filter = _mm_set1_epi32(*(int32_t*)filter); - __m128i 
idx_lookup = _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); + // Left + // Copy from the right filtered block and filter the extra column + for (y = 0; y < height; ++y) { + x = 0; + *(uint64_t*)&out_l[y * dst_stride + x] = *(uint64_t*)&out_r[y * dst_stride + x] << 8; + for (x = 8; x < width; x += 8) *(int64_t*)&out_l[y * dst_stride + x] = *(int64_t*)&out_r[y * dst_stride + x - 1]; + x = 0; + int16_t sample = 64 * col_pos2[y + 1 + KVZ_LUMA_FILTER_OFFSET] >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + out_l[y * dst_stride + x] = sample; + } - __m128i temp = _mm_shuffle_epi8(packed_data, idx_lookup); + kvz_init_ver_filter_taps(fir2, taps); + // Top + for (y = 0; y + 7 < height; y+=8) { + for (x = 0; x + 7 < width; x+=8) { + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos0[y * hor_stride + x], hor_stride, &out_t[y * dst_stride + x], dst_stride); + } + } - temp = _mm_maddubs_epi16(temp, packed_filter); - temp = _mm_hadd_epi16(temp, temp); - temp = _mm_srai_epi16(temp, shift); + // Bottom + // Copy what can be copied from the top filtered values. + // Then filter the last row from horizontal intermediate buffer. + for (y = 0; y < height - 1; ++y) { + for (x = 0; x + 7 < width; x += 8) { + *(int64_t*)&out_b[(y + 0) * dst_stride + x] = *(int64_t*)&out_t[(y + 1) * dst_stride + x]; + } + } - _mm_storel_epi64((__m128i*)dst, temp); + for (x = 0; x + 7 < width; x += 8) { + kvz_eight_tap_filter_ver_16bit_1x8_avx2(fir2, &hor_pos0[(y + 1) * hor_stride + x], hor_stride, &out_b[y * dst_stride + x]); + } } -void kvz_eight_tap_filter_x8_hor_avx2(int8_t *filter, kvz_pixel *data, int shift, int16_t* dst) +static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + int8_t fme_level, + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t hpel_off_x, int8_t hpel_off_y) { - __m256i packed_data = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)data)), _mm_loadu_si128((__m128i*)(data + 4)), 1); - __m256i packed_filter = _mm256_broadcastq_epi64(_mm_loadl_epi64((__m128i*)filter)); - __m256i idx_lookup0 = _mm256_broadcastsi128_si256(_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8)); - __m256i idx_lookup1 = _mm256_broadcastsi128_si256(_mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10)); - - __m256i temp0 = _mm256_shuffle_epi8(packed_data, idx_lookup0); - __m256i temp1 = _mm256_shuffle_epi8(packed_data, idx_lookup1); - - temp0 = _mm256_maddubs_epi16(temp0, packed_filter); - temp1 = _mm256_maddubs_epi16(temp1, packed_filter); - temp0 = _mm256_hadd_epi16(temp0, temp1); - temp0 = _mm256_hadd_epi16(temp0, temp0); + int x, y; - temp0 = _mm256_srai_epi16(temp0, shift); + // Interpolation filter shifts + int32_t shift2 = 6; - temp0 = _mm256_permute4x64_epi64(temp0, _MM_SHUFFLE(3, 1, 2, 0)); + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); - _mm_storeu_si128((__m128i*)dst, _mm256_castsi256_si128(temp0)); -} + int8_t *fir2 = kvz_g_luma_filter[2]; -void kvz_four_tap_filter_x8_hor_avx2(int8_t *filter, kvz_pixel *data, int shift, int16_t* dst) -{ - __m256i packed_data = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)data)), _mm_loadl_epi64((__m128i*)(data + 4)), 1); - __m256i packed_filter = 
_mm256_set1_epi32(*(int32_t*)filter); - __m256i idx_lookup = _mm256_broadcastsi128_si256(_mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6)); + int16_t dst_stride = LCU_WIDTH; + int16_t hor_stride = LCU_WIDTH; - __m256i temp = _mm256_shuffle_epi8(packed_data, idx_lookup); + int16_t *hor_pos2 = hor_intermediate[1]; + int16_t *col_pos2 = hor_first_cols[2]; - temp = _mm256_maddubs_epi16(temp, packed_filter); - temp = _mm256_hadd_epi16(temp, temp); - temp = _mm256_srai_epi16(temp, shift); + // VERTICAL STEP + kvz_pixel *out_tl = filtered[0]; + kvz_pixel *out_tr = filtered[1]; + kvz_pixel *out_bl = filtered[2]; + kvz_pixel *out_br = filtered[3]; - _mm_storel_epi64((__m128i*)dst, _mm256_castsi256_si128(temp)); - _mm_storel_epi64((__m128i*)(dst + 4), _mm256_extracti128_si256(temp, 1)); -} + __m256i taps[4]; + kvz_init_ver_filter_taps(fir2, taps); + // Top-Right + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos2[y * hor_stride + x], hor_stride, &out_tr[y * dst_stride + x], dst_stride); + } + } -int32_t kvz_eight_tap_filter_x4_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride, int offset, int shift2, int shift3) -{ + // Top-left + // Copy from the top-right filtered block and filter the extra column + for (y = 0; y < height; ++y) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(fir2, &col_pos2[y]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + out_tl[y * dst_stride + x] = sample; - __m128i v_filter = _mm_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[0]))); - __m128i v_data0 = _mm_loadl_epi64((__m128i*)(data + stride * 0)); - __m128i v_data1 = _mm_loadl_epi64((__m128i*)(data + stride * 1)); - __m128i v_data = _mm_unpacklo_epi16(v_data0, v_data1); - __m128i temp = _mm_madd_epi16(v_filter, v_data); + for (x = 1; x < width; ++x) out_tl[y * dst_stride + x] = out_tr[y * dst_stride + x - 1]; + } - v_filter = _mm_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[2]))); - __m128i v_data2 = _mm_loadl_epi64((__m128i*)(data + stride * 2)); - __m128i v_data3 = _mm_loadl_epi64((__m128i*)(data + stride * 3)); - v_data = _mm_unpacklo_epi16(v_data2, v_data3); - temp = _mm_add_epi32(temp, _mm_madd_epi16(v_filter, v_data) ); + // Bottom-right + // Copy what can be copied from top-right filtered values. Filter the last row. + for (y = 0; y < height - 1; ++y) { + for (x = 0; x + 7 < width; x += 8) { + memcpy(&out_br[y * dst_stride + x], &out_tr[(y + 1) * dst_stride + x], 8); + } + } - temp = _mm_add_epi32(temp, _mm_set1_epi32(offset)); - temp = _mm_srai_epi32(temp, shift2 + shift3); + for (x = 0; x + 7 < width; x += 8) { + kvz_eight_tap_filter_ver_16bit_1x8_avx2(fir2, &hor_pos2[(y + 1) * hor_stride + x], hor_stride, &out_br[y * dst_stride + x]); + } - temp = _mm_packus_epi32(temp, temp); - temp = _mm_packus_epi16(temp, temp); + // Bottom-left + // Copy what can be copied from the top-left filtered values. + // Copy what can be copied from the bottom-right filtered values. + // Finally filter the last pixel from the column array. 
+ for (y = 0; y < height - 1; ++y) { + for (x = 0; x + 7 < width; x += 8) { + memcpy(&out_bl[y * dst_stride + x], &out_tl[(y + 1) * dst_stride + x], 8); + } + } - return _mm_cvtsi128_si32(temp); + for (x = 1; x < width; ++x) out_bl[y * dst_stride + x] = out_br[y * dst_stride + x - 1]; + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(fir2, &col_pos2[(y + 1)]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + out_bl[y * dst_stride + x] = sample; } -int32_t kvz_four_tap_filter_x4_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride, int offset, int shift2, int shift3) +static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + int8_t fme_level, + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t hpel_off_x, int8_t hpel_off_y) { + int x, y; - __m128i v_filter = _mm_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[0]))); - __m128i v_data0 = _mm_loadl_epi64((__m128i*)(data + stride * 0)); - __m128i v_data1 = _mm_loadl_epi64((__m128i*)(data + stride * 1)); - __m128i v_data = _mm_unpacklo_epi16(v_data0, v_data1); - __m128i temp = _mm_madd_epi16(v_filter, v_data); + // Interpolation filter shifts + int16_t shift1 = KVZ_BIT_DEPTH - 8; + int32_t shift2 = 6; - v_filter = _mm_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[2]))); - __m128i v_data2 = _mm_loadl_epi64((__m128i*)(data + stride * 2)); - __m128i v_data3 = _mm_loadl_epi64((__m128i*)(data + stride * 3)); - v_data = _mm_unpacklo_epi16(v_data2, v_data3); - temp = _mm_add_epi32(temp, _mm_madd_epi16(v_filter, v_data) ); + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); - temp = _mm_add_epi32(temp, _mm_set1_epi32(offset)); - temp = _mm_srai_epi32(temp, shift2 + shift3); + int8_t *fir0 = kvz_g_luma_filter[0]; + int8_t *fir2 = kvz_g_luma_filter[2]; + int8_t *fir1 = kvz_g_luma_filter[1]; + int8_t *fir3 = kvz_g_luma_filter[3]; - temp = _mm_packus_epi32(temp, temp); - temp = _mm_packus_epi16(temp, temp); + // Horiziontal positions. Positions 0 and 2 have already been calculated in filtered. + int16_t *hor_pos0 = hor_intermediate[0]; + int16_t *hor_pos2 = hor_intermediate[1]; + int16_t *hor_pos_l = hor_intermediate[3]; + int16_t *hor_pos_r = hor_intermediate[4]; + int8_t *hor_fir_l = hpel_off_x != 0 ? fir1 : fir3; + int8_t *hor_fir_r = hpel_off_x != 0 ? fir3 : fir1; + int16_t *col_pos_l = hor_first_cols[1]; + int16_t *col_pos_r = hor_first_cols[3]; + + int16_t dst_stride = LCU_WIDTH; + int16_t hor_stride = LCU_WIDTH; + + int16_t *hor_hpel_pos = hpel_off_x != 0 ? hor_pos2 : hor_pos0; + int16_t *col_pos_hor = hpel_off_x != 0 ? hor_first_cols[2] : hor_first_cols[0]; + + // Specify if integer pixels are filtered from left or/and top integer samples + int off_x_fir_l = hpel_off_x < 1 ? 0 : 1; + int off_x_fir_r = hpel_off_x < 0 ? 0 : 1; + int off_y_fir_t = hpel_off_y < 1 ? 0 : 1; + int off_y_fir_b = hpel_off_y < 0 ? 0 : 1; + + // HORIZONTAL STEP + __m256i shuf_01_23, shuf_45_67; + __m256i taps_01_23, taps_45_67; + + // Left QPEL + kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); + kvz_init_filter_taps(hor_fir_l, &taps_01_23, &taps_45_67); + + int sample_off_y = hpel_off_y < 0 ? 
0 : 1; - return _mm_cvtsi128_si32(temp); -} + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { -void kvz_eight_tap_filter_x8_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride, int offset, int shift2, int shift3, kvz_pixel* dst) -{ + for (x = 0; x + 7 < width; x += 8) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; + kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_pos_l[y * hor_stride + x], + &shuf_01_23, &shuf_45_67, + &taps_01_23, &taps_45_67); //TODO: >> shift1 + } + } - __m256i v_filter = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[0]))); - __m256i v_data0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 0))); - __m256i v_data1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 1))); - __m256i v_data = _mm256_or_si256(v_data0, _mm256_slli_epi32(v_data1, 16)); - __m256i temp = _mm256_madd_epi16(v_filter, v_data); - - v_filter = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[2]))); - __m256i v_data2 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 2))); - __m256i v_data3 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 3))); - v_data = _mm256_or_si256(v_data2, _mm256_slli_epi32(v_data3, 16)); - temp = _mm256_add_epi32(temp, _mm256_madd_epi16(v_filter, v_data) ); - - v_filter = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[4]))); - __m256i v_data4 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 4))); - __m256i v_data5 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 5))); - v_data = _mm256_or_si256(v_data4, _mm256_slli_epi32(v_data5, 16)); - temp = _mm256_add_epi32(temp, _mm256_madd_epi16(v_filter, v_data) ); - - v_filter = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[6]))); - __m256i v_data6 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 6))); - __m256i v_data7 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 7))); - v_data = _mm256_or_si256(v_data6, _mm256_slli_epi32(v_data7, 16)); - temp = _mm256_add_epi32(temp, _mm256_madd_epi16(v_filter, v_data) ); - - temp = _mm256_add_epi32(temp, _mm256_set1_epi32(offset)); - temp = _mm256_srai_epi32(temp, shift2 + shift3); - - temp = _mm256_packus_epi32(temp, temp); - temp = _mm256_packus_epi16(temp, temp); - - *(int32_t*)dst = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp)); - *(int32_t*)(dst + 4) = _mm_cvtsi128_si32(_mm256_extracti128_si256(temp, 1)); -} + // Write the first column in contiguous memory + x = 0; + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + col_pos_l[y] = kvz_eight_tap_filter_hor_avx2(hor_fir_l, &src[src_stride*ypos + xpos]) >> shift1; + } -void kvz_four_tap_filter_x8_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride, int offset, int shift2, int shift3, kvz_pixel* dst) -{ + // Right QPEL + kvz_init_filter_taps(hor_fir_r, &taps_01_23, &taps_45_67); - __m256i v_filter = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[0]))); - __m256i v_data0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 0))); - __m256i v_data1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 1))); - __m256i v_data = _mm256_or_si256(v_data0, _mm256_slli_epi32(v_data1, 16)); - __m256i temp = _mm256_madd_epi16(v_filter, v_data); + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { - v_filter = 
_mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[2]))); - __m256i v_data2 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 2))); - __m256i v_data3 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 3))); - v_data = _mm256_or_si256(v_data2, _mm256_slli_epi32(v_data3, 16)); - temp = _mm256_add_epi32(temp, _mm256_madd_epi16(v_filter, v_data) ); + for (x = 0; x + 7 < width; x += 8) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; + kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_pos_r[y * hor_stride + x], + &shuf_01_23, &shuf_45_67, + &taps_01_23, &taps_45_67); //TODO: >> shift1 + } + } - temp = _mm256_add_epi32(temp, _mm256_set1_epi32(offset)); - temp = _mm256_srai_epi32(temp, shift2 + shift3); + // Write the first column in contiguous memory + x = 0; + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + col_pos_r[y] = kvz_eight_tap_filter_hor_avx2(hor_fir_r, &src[src_stride*ypos + xpos]) >> shift1; + } - temp = _mm256_packus_epi32(temp, temp); - temp = _mm256_packus_epi16(temp, temp); + // VERTICAL STEP + kvz_pixel *out_l = filtered[0]; + kvz_pixel *out_r = filtered[1]; + kvz_pixel *out_t = filtered[2]; + kvz_pixel *out_b = filtered[3]; - *(int32_t*)dst = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp)); - *(int32_t*)(dst + 4) = _mm_cvtsi128_si32(_mm256_extracti128_si256(temp, 1)); -} + int8_t *ver_fir_l = hpel_off_y != 0 ? fir2 : fir0; + int8_t *ver_fir_r = hpel_off_y != 0 ? fir2 : fir0; + int8_t *ver_fir_t = hpel_off_y != 0 ? fir1 : fir3; + int8_t *ver_fir_b = hpel_off_y != 0 ? fir3 : fir1; -void kvz_filter_inter_quarterpel_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag) -{ + __m256i taps[4]; - int32_t x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); + // Left QPEL (1/4 or 3/4 x positions) + // Filter block and then filter column and align if neccessary + kvz_init_ver_filter_taps(ver_fir_l, taps); - //coefficients for 1/4, 2/4 and 3/4 positions - int8_t *c0, *c1, *c2, *c3; + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + int ypos = y + sample_off_y; + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_l[ypos * hor_stride + x], hor_stride, &out_l[y * dst_stride + x], dst_stride); + } + } - c0 = kvz_g_luma_filter[0]; - c1 = kvz_g_luma_filter[1]; - c2 = kvz_g_luma_filter[2]; - c3 = kvz_g_luma_filter[3]; + if (!off_x_fir_l) { + for (y = 0; y < height; ++y) { + for (x = width - 8; x >= 8; x -= 8) { + uint64_t chunk = *(uint64_t*)&out_l[y * dst_stride + x - 1]; + *(uint64_t*)&out_l[y * dst_stride + x] = chunk; + } - int16_t flipped_hor_filtered[MAX_HEIGHT][MAX_WIDTH]; + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(ver_fir_l, &col_pos_l[y + sample_off_y]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + uint64_t first = sample; + uint64_t rest = *(uint64_t*)&out_l[y * dst_stride + x]; + uint64_t chunk = (rest << 8) | first; + *(uint64_t*)&out_l[y * dst_stride + x] = chunk; + } + } - // Filter horizontally and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height; y += 8) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; + // Right QPEL (3/4 or 1/4 x 
positions) + // Filter block and then filter column and align if neccessary + kvz_init_ver_filter_taps(ver_fir_r, taps); - kvz_eight_tap_filter_and_flip_avx2(kvz_g_luma_filter, &src[src_stride*ypos + xpos], src_stride, (int16_t*)&(flipped_hor_filtered[4 * x + 0][y])); - + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + int ypos = y + sample_off_y; + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_r[ypos * hor_stride + x], hor_stride, &out_r[y * dst_stride + x], dst_stride); } + } - for (; y < height + FILTER_SIZE - 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped_hor_filtered[4 * x + 0][y] = kvz_eight_tap_filter_hor_avx2(c0, &src[src_stride*ypos + xpos]) << shift1; - flipped_hor_filtered[4 * x + 1][y] = kvz_eight_tap_filter_hor_avx2(c1, &src[src_stride*ypos + xpos]) << shift1; - flipped_hor_filtered[4 * x + 2][y] = kvz_eight_tap_filter_hor_avx2(c2, &src[src_stride*ypos + xpos]) << shift1; - flipped_hor_filtered[4 * x + 3][y] = kvz_eight_tap_filter_hor_avx2(c3, &src[src_stride*ypos + xpos]) << shift1; + if (!off_x_fir_r) { + for (y = 0; y < height; ++y) { + for (x = width - 8; x >= 8; x -= 8) { + uint64_t chunk = *(uint64_t*)&out_r[y * dst_stride + x - 1]; + *(uint64_t*)&out_r[y * dst_stride + x] = chunk; + } + + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(ver_fir_r, &col_pos_r[y + sample_off_y]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + uint64_t first = sample; + uint64_t rest = *(uint64_t*)&out_r[y * dst_stride + x]; + uint64_t chunk = (rest << 8) | first; + *(uint64_t*)&out_r[y * dst_stride + x] = chunk; } } - // Filter vertically and flip x and y - for (y = 0; y < height; ++y) { - for (x = 0; x < 4 * width - 3; x += 4) { + + // Top QPEL (1/4 or 3/4 y positions) + // Filter block and then filter column and align if neccessary + int sample_off_x = (hpel_off_x > -1 ? 
1 : 0); + kvz_init_ver_filter_taps(ver_fir_t, taps); + + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + int ypos = y + off_y_fir_t; + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_hpel_pos[ypos * hor_stride + x], hor_stride, &out_t[y * dst_stride + x], dst_stride); + } + } - eight_tap_filter_and_flip_16bit_avx2(kvz_g_luma_filter, &flipped_hor_filtered[x][y], MAX_WIDTH, offset23, shift2 + shift3, &(dst[(4 * y + 0)*dst_stride + x]), dst_stride); + if (!sample_off_x) { + for (y = 0; y < height; ++y) { + for (x = width - 8; x >= 8; x -= 8) { + uint64_t chunk = *(uint64_t*)&out_t[y * dst_stride + x - 1]; + *(uint64_t*)&out_t[y * dst_stride + x] = chunk; + } + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(ver_fir_t, &col_pos_hor[y + off_y_fir_t]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + uint64_t first = sample; + uint64_t rest = *(uint64_t*)&out_t[y * dst_stride + x]; + uint64_t chunk = (rest << 8) | first; + *(uint64_t*)&out_t[y * dst_stride + x] = chunk; } - } -} + // Bottom QPEL (3/4 or 1/4 y positions) + // Filter block and then filter column and align if neccessary + kvz_init_ver_filter_taps(ver_fir_b, taps); -/** -* \brief Interpolation for chroma half-pixel -* \param src source image in integer pels (-2..width+3, -2..height+3) -* \param src_stride stride of source image -* \param width width of source image block -* \param height height of source image block -* \param dst destination image in half-pixel resolution -* \param dst_stride stride of destination image -* -*/ -void kvz_filter_inter_halfpel_chroma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag) -{ - /* ____________ - * | B0,0|ae0,0| - * |ea0,0|ee0,0| - * - * ae0,0 = (-4*B-1,0 + 36*B0,0 + 36*B1,0 - 4*B2,0) >> shift1 - * ea0,0 = (-4*B0,-1 + 36*B0,0 + 36*B0,1 - 4*B0,2) >> shift1 - * ee0,0 = (-4*ae0,-1 + 36*ae0,0 + 36*ae0,1 - 4*ae0,2) >> shift2 - */ - int32_t x, y; - int32_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset3 = 1 << (shift3 - 1); - int32_t offset23 = 1 << (shift2 + shift3 - 1); - - int8_t* c = kvz_g_chroma_filter[4]; - int16_t temp[4] = {0,0,0,0}; - - // Loop source pixels and generate four filtered half-pel pixels on each round - for (y = 0; y < height; y++) { - int dst_pos_y = (y << 1)*dst_stride; - int src_pos_y = y*src_stride; - for (x = 0; x < width; x++) { - // Calculate current dst and src pixel positions - int dst_pos = dst_pos_y + (x << 1); - int src_pos = src_pos_y + x; - - // Original pixel (not really needed) - dst[dst_pos] = src[src_pos]; //B0,0 - - // ae0,0 - We need this only when hor_flag and for ee0,0 - if (hor_flag) { - temp[1] = kvz_four_tap_filter_hor_avx2(c, &src[src_pos - 1]) >> shift1; // ae0,0 - } - // ea0,0 - needed only when ver_flag - if (ver_flag) { - dst[dst_pos + 1 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_avx2(c, &src[src_pos - src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); // ea0,0 - } + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + int ypos = y + off_y_fir_b; + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_hpel_pos[ypos * hor_stride + x], hor_stride, &out_b[y * dst_stride + x], dst_stride); + } + } - // When both flags, we use _only_ this pixel (but still need ae0,0 for it) - if (hor_flag && ver_flag) { - // 
Calculate temporary values.. - src_pos -= src_stride; //0,-1 - temp[0] = (kvz_four_tap_filter_hor_avx2(c, &src[src_pos - 1]) >> shift1); // ae0,-1 - src_pos += 2 * src_stride; //0,1 - temp[2] = (kvz_four_tap_filter_hor_avx2(c, &src[src_pos - 1]) >> shift1); // ae0,1 - src_pos += src_stride; //0,2 - temp[3] = (kvz_four_tap_filter_hor_avx2(c, &src[src_pos - 1]) >> shift1); // ae0,2 - - dst[dst_pos + 1 * dst_stride + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_avx2(c, temp) + offset23) >> shift2) >> shift3); // ee0,0 + if (!sample_off_x) { + for (y = 0; y < height; ++y) { + for (x = width - 8; x >= 8; x -= 8) { + uint64_t chunk = *(uint64_t*)&out_b[y * dst_stride + x - 1]; + *(uint64_t*)&out_b[y * dst_stride + x] = chunk; } - if (hor_flag) { - dst[dst_pos + 1] = kvz_fast_clip_32bit_to_pixel((temp[1] + offset3) >> shift3); - } + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(ver_fir_b, &col_pos_hor[y + off_y_fir_b]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + uint64_t first = sample; + uint64_t rest = *(uint64_t*)&out_b[y * dst_stride + x]; + uint64_t chunk = (rest << 8) | first; + *(uint64_t*)&out_b[y * dst_stride + x] = chunk; } } } -void kvz_filter_inter_octpel_chroma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag) +static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + int8_t fme_level, + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t hpel_off_x, int8_t hpel_off_y) { + int x, y; - int32_t x, y; - int32_t shift1 = KVZ_BIT_DEPTH - 8; + // Interpolation filter shifts int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset3 = 1 << (shift3 - 1); - int32_t offset23 = 1 << (shift2 + shift3 - 1); - - //coefficients for 1/8, 2/8, 3/8, 4/8, 5/8, 6/8 and 7/8 positions - int8_t *c1, *c2, *c3, *c4, *c5, *c6, *c7; - - int i; - c1 = kvz_g_chroma_filter[1]; - c2 = kvz_g_chroma_filter[2]; - c3 = kvz_g_chroma_filter[3]; - c4 = kvz_g_chroma_filter[4]; - c5 = kvz_g_chroma_filter[5]; - c6 = kvz_g_chroma_filter[6]; - c7 = kvz_g_chroma_filter[7]; - - int16_t temp[7][4]; // Temporary horizontal values calculated from integer pixels - - - // Loop source pixels and generate 64 filtered 1/8-pel pixels on each round - for (y = 0; y < height; y++) { - int dst_pos_y = (y << 3)*dst_stride; - int src_pos_y = y*src_stride; - for (x = 0; x < width; x++) { - // Calculate current dst and src pixel positions - int dst_pos = dst_pos_y + (x << 3); - int src_pos = src_pos_y + x; - - // Original pixel - dst[dst_pos] = src[src_pos]; - - // Horizontal 1/8-values - if (hor_flag && !ver_flag) { - - temp[0][1] = (kvz_four_tap_filter_hor_avx2(c1, &src[src_pos - 1]) >> shift1); // ae0,0 h0 - temp[1][1] = (kvz_four_tap_filter_hor_avx2(c2, &src[src_pos - 1]) >> shift1); - temp[2][1] = (kvz_four_tap_filter_hor_avx2(c3, &src[src_pos - 1]) >> shift1); - temp[3][1] = (kvz_four_tap_filter_hor_avx2(c4, &src[src_pos - 1]) >> shift1); - temp[4][1] = (kvz_four_tap_filter_hor_avx2(c5, &src[src_pos - 1]) >> shift1); - temp[5][1] = (kvz_four_tap_filter_hor_avx2(c6, &src[src_pos - 1]) >> shift1); - temp[6][1] = (kvz_four_tap_filter_hor_avx2(c7, &src[src_pos - 1]) >> shift1); - } + // Weighted 
prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); - // Vertical 1/8-values - if (ver_flag) { - dst[dst_pos + 1 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_avx2(c1, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); // - dst[dst_pos + 2 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_avx2(c2, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 3 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_avx2(c3, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 4 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_avx2(c4, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 5 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_avx2(c5, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 6 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_avx2(c6, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 7 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_avx2(c7, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - } + int8_t *fir1 = kvz_g_luma_filter[1]; + int8_t *fir3 = kvz_g_luma_filter[3]; - // When both flags, interpolate values from temporary horizontal values - if (hor_flag && ver_flag) { + int16_t *hor_pos_l = hor_intermediate[3]; + int16_t *hor_pos_r = hor_intermediate[4]; - // Calculate temporary values - src_pos -= 1 * src_stride; //0,-3 - for (i = 0; i < 4; ++i) { + int16_t *col_pos_l = hor_first_cols[1]; + int16_t *col_pos_r = hor_first_cols[3]; - temp[0][i] = (kvz_four_tap_filter_hor_avx2(c1, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[1][i] = (kvz_four_tap_filter_hor_avx2(c2, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[2][i] = (kvz_four_tap_filter_hor_avx2(c3, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[3][i] = (kvz_four_tap_filter_hor_avx2(c4, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[4][i] = (kvz_four_tap_filter_hor_avx2(c5, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[5][i] = (kvz_four_tap_filter_hor_avx2(c6, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[6][i] = (kvz_four_tap_filter_hor_avx2(c7, &src[src_pos + i * src_stride - 1]) >> shift1); + int16_t dst_stride = LCU_WIDTH; + int16_t hor_stride = LCU_WIDTH; - } + // VERTICAL STEP + kvz_pixel *out_tl = filtered[0]; + kvz_pixel *out_tr = filtered[1]; + kvz_pixel *out_bl = filtered[2]; + kvz_pixel *out_br = filtered[3]; + int8_t *ver_fir_t = hpel_off_y != 0 ? fir1 : fir3; + int8_t *ver_fir_b = hpel_off_y != 0 ? 
fir3 : fir1; - //Calculate values from temporary horizontal 1/8-values - for (i = 0; i<7; ++i){ - dst[dst_pos + 1 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_avx2(c1, &temp[i][0]) + offset23) >> shift2) >> shift3); // ee0,0 - dst[dst_pos + 2 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_avx2(c2, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 3 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_avx2(c3, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 4 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_avx2(c4, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 5 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_avx2(c5, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 6 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_avx2(c6, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 7 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_avx2(c7, &temp[i][0]) + offset23) >> shift2) >> shift3); + // Specify if integer pixels are filtered from left or/and top integer samples + int off_x_fir_l = hpel_off_x < 1 ? 0 : 1; + int off_x_fir_r = hpel_off_x < 0 ? 0 : 1; + int off_y_fir_t = hpel_off_y < 1 ? 0 : 1; + int off_y_fir_b = hpel_off_y < 0 ? 0 : 1; - } + __m256i taps[4]; + // Top-left QPEL + // Filter block and then filter column and align if neccessary + kvz_init_ver_filter_taps(ver_fir_t, taps); - } + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + int ypos = y + off_y_fir_t; + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_l[ypos * hor_stride + x], hor_stride, &out_tl[y * dst_stride + x], dst_stride); + } + } - if (hor_flag) { - dst[dst_pos + 1] = kvz_fast_clip_32bit_to_pixel((temp[0][1] + offset3) >> shift3); - dst[dst_pos + 2] = kvz_fast_clip_32bit_to_pixel((temp[1][1] + offset3) >> shift3); - dst[dst_pos + 3] = kvz_fast_clip_32bit_to_pixel((temp[2][1] + offset3) >> shift3); - dst[dst_pos + 4] = kvz_fast_clip_32bit_to_pixel((temp[3][1] + offset3) >> shift3); - dst[dst_pos + 5] = kvz_fast_clip_32bit_to_pixel((temp[4][1] + offset3) >> shift3); - dst[dst_pos + 6] = kvz_fast_clip_32bit_to_pixel((temp[5][1] + offset3) >> shift3); - dst[dst_pos + 7] = kvz_fast_clip_32bit_to_pixel((temp[6][1] + offset3) >> shift3); + if (!off_x_fir_l) { + for (y = 0; y < height; ++y) { + for (x = width - 8; x >= 8; x -= 8) { + uint64_t chunk = *(uint64_t*)&out_tl[y * dst_stride + x - 1]; + *(uint64_t*)&out_tl[y * dst_stride + x] = chunk; } - + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(ver_fir_t, &col_pos_l[y + off_y_fir_t]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + uint64_t first = sample; + uint64_t rest = *(uint64_t*)&out_tl[y * dst_stride + x]; + uint64_t chunk = (rest << 8) | first; + *(uint64_t*)&out_tl[y * dst_stride + x] = chunk; } } -} -void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) -{ - int x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); + // Top-right QPEL + // Filter block and then filter column and align if neccessary - int8_t *fir0 = 
kvz_g_luma_filter[0]; - int8_t *fir2 = kvz_g_luma_filter[2]; + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + int ypos = y + off_y_fir_t; + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_r[ypos * hor_stride + x], hor_stride, &out_tr[y * dst_stride + x], dst_stride); + } + } - int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + if (!off_x_fir_r) { + for (y = 0; y < height; ++y) { + for (x = width - 8; x >= 8; x -= 8) { + uint64_t chunk = *(uint64_t*)&out_tr[y * dst_stride + x - 1]; + *(uint64_t*)&out_tr[y * dst_stride + x] = chunk; + } - int16_t temp_stride = height + KVZ_EXT_PADDING + 1; - int16_t dst_stride = (LCU_WIDTH + 1); + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(ver_fir_t, &col_pos_r[y + off_y_fir_t]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + uint64_t first = sample; + uint64_t rest = *(uint64_t*)&out_tr[y * dst_stride + x]; + uint64_t chunk = (rest << 8) | first; + *(uint64_t*)&out_tr[y * dst_stride + x] = chunk; + } + } - // Horizontal positions - for (x = 0; x < width + 1; ++x) { - for (y = 0; y + 8 < height + KVZ_EXT_PADDING + 1; y += 8) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - int8_t *firs[2] = { fir0, fir2 }; - int16_t *dsts[2] = { &flipped0[x * temp_stride + y], &flipped2[x * temp_stride + y] }; - kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], src_stride, &firs[0], shift1, &dsts[0]); + // Bottom-left QPEL + // Filter block and then filter column and align if neccessary + kvz_init_ver_filter_taps(ver_fir_b, taps); + + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + int ypos = y + off_y_fir_b; + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_l[ypos * hor_stride + x], hor_stride, &out_bl[y * dst_stride + x], dst_stride); } + } - for (; y < height + KVZ_EXT_PADDING + 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir0, &src[src_stride*ypos + xpos]) >> shift1; - flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir2, &src[src_stride*ypos + xpos]) >> shift1; + if (!off_x_fir_l) { + for (y = 0; y < height; ++y) { + for (x = width - 8; x >= 8; x -= 8) { + uint64_t chunk = *(uint64_t*)&out_bl[y * dst_stride + x - 1]; + *(uint64_t*)&out_bl[y * dst_stride + x] = chunk; + } + + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(ver_fir_b, &col_pos_l[y + off_y_fir_b]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + uint64_t first = sample; + uint64_t rest = *(uint64_t*)&out_bl[y * dst_stride + x]; + uint64_t chunk = (rest << 8) | first; + *(uint64_t*)&out_bl[y * dst_stride + x] = chunk; } } - // Filter vertically and flip x and y - for (x = 0; x + 8 < width + 1; x += 8) { - for (y = 0; y < height + 1; ++y) { - int8_t *firs[2] = { fir0, fir2 }; - kvz_pixel *dsts[2] = { &filtered[HPEL_POS_HOR][y * dst_stride + x], &filtered[HPEL_POS_VER][y * dst_stride + x]}; - int16_t *srcs[2] = {flipped2 + x * temp_stride + y, flipped0 + x * temp_stride + y }; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(srcs, temp_stride, firs, offset23, shift2 + shift3, dsts); + // Bottom-right QPEL + // Filter block and then filter column and align if neccessary + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + int ypos = y + off_y_fir_b; + 
kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_r[ypos * hor_stride + x], hor_stride, &out_br[y * dst_stride + x], dst_stride); } } - // The remaining pixels - for (; x < width + 1; ++x) { - for (y = 0; y < height + 1; ++y) { - filtered[HPEL_POS_HOR][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[HPEL_POS_VER][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + if (!off_x_fir_r) { + for (y = 0; y < height; ++y) { + for (x = width - 8; x >= 8; x -= 8) { + uint64_t chunk = *(uint64_t*)&out_br[y * dst_stride + x - 1]; + *(uint64_t*)&out_br[y * dst_stride + x] = chunk; + } + + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(ver_fir_b, &col_pos_r[y + off_y_fir_b]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + uint64_t first = sample; + uint64_t rest = *(uint64_t*)&out_br[y * dst_stride + x]; + uint64_t chunk = (rest << 8) | first; + *(uint64_t*)&out_br[y * dst_stride + x] = chunk; } } } -void kvz_filter_hpel_blocks_full_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) +static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel *dst, + int16_t dst_stride, + int8_t hor_flag, + int8_t ver_flag, + const int16_t mv[2]) { + // TODO: Optimize SMP and AMP + if (width != height) { + kvz_sample_quarterpel_luma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); + return; + } + int x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - int8_t *fir0 = kvz_g_luma_filter[0]; - int8_t *fir2 = kvz_g_luma_filter[2]; + int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; + int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; - int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t hor_stride = LCU_WIDTH; + int16_t hor_intermediate[KVZ_EXT_BLOCK_W_LUMA * LCU_WIDTH]; - int16_t temp_stride = height + KVZ_EXT_PADDING + 1; - int16_t dst_stride = (LCU_WIDTH + 1); + // HORIZONTAL STEP + __m256i shuf_01_23, shuf_45_67; + __m256i taps_01_23, taps_45_67; - // Horizontal positions - for (x = 0; x < width + 1; ++x) { - for (y = 0; y + 8 < height + KVZ_EXT_PADDING + 1; y += 8) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - int8_t *firs[2] = { fir0, fir2 }; - int16_t *dsts[2] = { &flipped0[x * temp_stride + y], &flipped2[x * temp_stride + y] }; - kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], src_stride, &firs[0], shift1, &dsts[0]); + kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); + kvz_init_filter_taps(hor_fir, &taps_01_23, &taps_45_67); - } + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) { - for (; y < height + KVZ_EXT_PADDING + 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir0, &src[src_stride*ypos + xpos]) >> shift1; - flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir2, &src[src_stride*ypos + xpos]) >> shift1; + for (x = 0; x + 7 < width; x += 8) { + int ypos = y - 
KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_intermediate[y * hor_stride + x], + &shuf_01_23, &shuf_45_67, + &taps_01_23, &taps_45_67); //TODO: >> shift1 } } - // Filter vertically and flip x and y - for (x = 0; x + 8 < width + 1; x += 8) { - for (y = 0; y < height + 1; ++y) { - int8_t *firs[2] = { fir0, fir2 }; - kvz_pixel *dsts[2] = { &filtered[HPEL_POS_HOR][y * dst_stride + x], &filtered[HPEL_POS_VER][y * dst_stride + x]}; - int16_t *srcs[2] = {flipped2 + x * temp_stride + y, flipped0 + x * temp_stride + y }; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(srcs, temp_stride, firs, offset23, shift2 + shift3, dsts); - kvz_filter_flip_round_clip_x8_16bit_avx2(flipped2 + x * temp_stride + y, temp_stride, fir2, offset23, shift2 + shift3, &filtered[HPEL_POS_DIA][y * dst_stride + x]); + // VERTICAL STEP + __m256i taps[4]; + kvz_init_ver_filter_taps(ver_fir, taps); - } - } - - // The remaining pixels - for (; x < width + 1; ++x) { - for (y = 0; y < height + 1; ++y) { - filtered[HPEL_POS_HOR][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[HPEL_POS_VER][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[HPEL_POS_DIA][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); } } } -void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) +static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * const encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + int16_t *dst, + int16_t dst_stride, + int8_t hor_flag, + int8_t ver_flag, + const int16_t mv[2]) { + // TODO: Optimize SMP and AMP + if (width != height) { + kvz_sample_14bit_quarterpel_luma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); + return; + } + // TODO: horizontal and vertical only filtering int x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - int8_t *fir0 = kvz_g_luma_filter[0]; - int8_t *fir2 = kvz_g_luma_filter[2]; - int8_t *fir1 = kvz_g_luma_filter[1]; - int8_t *fir3 = kvz_g_luma_filter[3]; + int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; + int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; - int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped1[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped3[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t hor_stride = LCU_WIDTH; + int16_t hor_intermediate[KVZ_EXT_BLOCK_W_LUMA * LCU_WIDTH]; - int16_t temp_stride = height + KVZ_EXT_PADDING + 1; - int16_t dst_stride = (LCU_WIDTH + 1); - - // Horizontal positions - for (x = 0; x < width + 1; ++x) { - for (y = 0; y + 8 < height + KVZ_EXT_PADDING + 1; y += 8) { - int ypos = y - FILTER_OFFSET; - int xpos = 
x - FILTER_OFFSET; - int8_t *firs[4] = { fir0, fir2, fir1, fir3 }; - int16_t *dsts[4] = { &flipped0[x * temp_stride + y], &flipped2[x * temp_stride + y], &flipped1[x * temp_stride + y], &flipped3[x * temp_stride + y]}; - kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], src_stride, &firs[0], shift1, &dsts[0]); - kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], src_stride, &firs[2], shift1, &dsts[2]); - } + // HORIZONTAL STEP + __m256i shuf_01_23, shuf_45_67; + __m256i taps_01_23, taps_45_67; - for (y = 0; y < height + KVZ_EXT_PADDING + 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir0, &src[src_stride*ypos + xpos]) >> shift1; - flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir2, &src[src_stride*ypos + xpos]) >> shift1; - flipped1[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir1, &src[src_stride*ypos + xpos]) >> shift1; - flipped3[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir3, &src[src_stride*ypos + xpos]) >> shift1; - } - } + kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); + kvz_init_filter_taps(hor_fir, &taps_01_23, &taps_45_67); + + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) { - // Filter vertically and flip x and y - for (x = 0; x + 8 < width + 1; x += 8) { - for (y = 0; y < height + 1; ++y) { - - // HPEL - int8_t *firs0[2] = { fir0, fir2 }; - kvz_pixel *dsts0[2] = { &filtered[0][y * dst_stride + x], &filtered[1][y * dst_stride + x]}; - int16_t *srcs0[4] = { flipped2 + x * temp_stride + y, flipped0 + x * temp_stride + y}; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(srcs0, temp_stride, firs0, offset23, shift2 + shift3, dsts0); - kvz_filter_flip_round_clip_x8_16bit_avx2(flipped2 + x * temp_stride + y, temp_stride, fir2, offset23, shift2 + shift3, &filtered[2][y * dst_stride + x]); - - // QPEL - // Horizontal - int8_t *firs1[4] = { fir0, fir0, fir2, fir2 }; - kvz_pixel *dsts1[4] = { &filtered[3][y * dst_stride + x], &filtered[4][y * dst_stride + x], - &filtered[5][y * dst_stride + x], &filtered[6][y * dst_stride + x] }; - int16_t *srcs1[4] = { flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, - flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, }; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs1[0], temp_stride, &firs1[0], offset23, shift2 + shift3, &dsts1[0]); - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs1[2], temp_stride, &firs1[2], offset23, shift2 + shift3, &dsts1[2]); - - // Vertical - int8_t *firs2[4] = { fir1, fir1, fir3, fir3 }; - kvz_pixel *dsts2[4] = { &filtered[7][y * dst_stride + x], &filtered[8][y * dst_stride + x], - &filtered[9][y * dst_stride + x], &filtered[10][y * dst_stride + x] }; - int16_t *srcs2[4] = { flipped0 + x * temp_stride + y, flipped2 + x * temp_stride + y, - flipped0 + x * temp_stride + y, flipped2 + x * temp_stride + y, }; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs2[0], temp_stride, &firs2[0], offset23, shift2 + shift3, &dsts2[0]); - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs2[2], temp_stride, &firs2[2], offset23, shift2 + shift3, &dsts2[2]); + for (x = 0; x + 7 < width; x += 8) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_intermediate[y * hor_stride + x], + &shuf_01_23, &shuf_45_67, + &taps_01_23, &taps_45_67); //TODO: >> shift1 } } - // The remaining pixels - for (; x < width + 1; ++x) { - for (y = 0; y < height + 
1; ++y) { - - // HPEL - filtered[0][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[1][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[2][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // QPEL - // Horizontal - filtered[3][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[4][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[5][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[6][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // Vertical - filtered[7][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[8][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[9][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[10][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + // VERTICAL STEP + __m256i taps[4]; + kvz_init_ver_filter_taps(ver_fir, taps); + + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + kvz_eight_tap_filter_ver_16bit_8x8_no_round_avx2(taps, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); } } } -void kvz_filter_qpel_blocks_full_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) + +static void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel *dst, + int16_t dst_stride, + int8_t hor_flag, + int8_t ver_flag, + const int16_t mv[2]) { + // TODO: Optimize SMP and AMP + if (width != height) { + kvz_sample_octpel_chroma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); + return; + } int x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - int8_t *fir0 = kvz_g_luma_filter[0]; - int8_t *fir2 = kvz_g_luma_filter[2]; - int8_t *fir1 = kvz_g_luma_filter[1]; - int8_t *fir3 = kvz_g_luma_filter[3]; + int8_t *hor_fir = kvz_g_chroma_filter[mv[0] & 7]; + int8_t *ver_fir = kvz_g_chroma_filter[mv[1] & 7]; - int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped1[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t 
flipped3[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t hor_stride = LCU_WIDTH_C; + int16_t hor_intermediate[KVZ_EXT_BLOCK_W_CHROMA * LCU_WIDTH_C]; - int16_t temp_stride = height + KVZ_EXT_PADDING + 1; - int16_t dst_stride = (LCU_WIDTH + 1); - - // Horizontal positions - for (x = 0; x < (width + 1); ++x) { - for (y = 0; y + 8 < height + KVZ_EXT_PADDING + 1; y += 8) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - int8_t *firs[4] = { fir0, fir2, fir1, fir3 }; - int16_t *dsts[4] = { &flipped0[x * temp_stride + y], &flipped2[x * temp_stride + y], &flipped1[x * temp_stride + y], &flipped3[x * temp_stride + y]}; - kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], src_stride, &firs[0], shift1, &dsts[0]); - kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], src_stride, &firs[2], shift1, &dsts[2]); - } + // HORIZONTAL STEP + __m256i shuf_01, shuf_23; + __m256i taps_01, taps_23; - for (; y < height + KVZ_EXT_PADDING + 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir0, &src[src_stride*ypos + xpos]) >> shift1; - flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir2, &src[src_stride*ypos + xpos]) >> shift1; - flipped1[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir1, &src[src_stride*ypos + xpos]) >> shift1; - flipped3[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir3, &src[src_stride*ypos + xpos]) >> shift1; - } - } + kvz_init_shuffle_masks_chroma(&shuf_01, &shuf_23); + kvz_init_filter_taps_chroma(hor_fir, &taps_01, &taps_23); + + for (y = 0; y + 3 < height + KVZ_EXT_PADDING_CHROMA; y += 4) { - // Filter vertically and flip x and y - for (x = 0; x + 8 < width + 1; x += 8) { - for (y = 0; y < height + 1; ++y) { - - // HPEL - int8_t *firs0[2] = { fir0, fir2 }; - kvz_pixel *dsts0[2] = { &filtered[0][y * dst_stride + x], &filtered[1][y * dst_stride + x]}; - int16_t *srcs0[4] = { flipped2 + x * temp_stride + y, flipped0 + x * temp_stride + y}; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(srcs0, temp_stride, firs0, offset23, shift2 + shift3, dsts0); - kvz_filter_flip_round_clip_x8_16bit_avx2(flipped2 + x * temp_stride + y, temp_stride, fir2, offset23, shift2 + shift3, &filtered[2][y * dst_stride + x]); - - // QPEL - // Horizontal - int8_t *firs1[4] = { fir0, fir0, fir2, fir2 }; - kvz_pixel *dsts1[4] = { &filtered[3][y * dst_stride + x], &filtered[4][y * dst_stride + x], - &filtered[5][y * dst_stride + x], &filtered[6][y * dst_stride + x] }; - int16_t *srcs1[4] = { flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, - flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, }; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs1[0], temp_stride, &firs1[0], offset23, shift2 + shift3, &dsts1[0]); - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs1[2], temp_stride, &firs1[2], offset23, shift2 + shift3, &dsts1[2]); - - // Vertical - int8_t *firs2[4] = { fir1, fir1, fir3, fir3 }; - kvz_pixel *dsts2[4] = { &filtered[7][y * dst_stride + x], &filtered[8][y * dst_stride + x], - &filtered[9][y * dst_stride + x], &filtered[10][y * dst_stride + x] }; - int16_t *srcs2[4] = { flipped0 + x * temp_stride + y, flipped2 + x * temp_stride + y, - flipped0 + x * temp_stride + y, flipped2 + x * temp_stride + y, }; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs2[0], temp_stride, &firs2[0], offset23, shift2 + shift3, &dsts2[0]); - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs2[2], temp_stride, &firs2[2], offset23, 
shift2 + shift3, &dsts2[2]); - - // Diagonal - int8_t *firs3[4] = { fir1, fir1, fir3, fir3 }; - kvz_pixel *dsts3[4] = { &filtered[11][y * dst_stride + x], &filtered[12][y * dst_stride + x], - &filtered[13][y * dst_stride + x], &filtered[14][y * dst_stride + x] }; - int16_t *srcs3[4] = { flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, - flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, }; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs3[0], temp_stride, &firs3[0], offset23, shift2 + shift3, &dsts3[0]); - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs3[2], temp_stride, &firs3[2], offset23, shift2 + shift3, &dsts3[2]); + for (x = 0; x + 3 < width; x += 4) { + int ypos = y - KVZ_CHROMA_FILTER_OFFSET; + int xpos = x - KVZ_CHROMA_FILTER_OFFSET; + kvz_four_tap_filter_hor_4x4_avx2(&src[src_stride*ypos + xpos], src_stride, &hor_intermediate[y * hor_stride + x], hor_stride, + &shuf_01, &shuf_23, + &taps_01, &taps_23); //TODO: >> shift1 } } - // The remaining pixels - for (; x < width + 1; ++x) { - for (y = 0; y < height + 1; ++y) { - - // HPEL - filtered[0][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[1][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[2][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // QPEL - // Horizontal - filtered[3][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[4][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[5][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[6][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // Vertical - filtered[7][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[8][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[9][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[10][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // Diagonal - filtered[11][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[12][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[13][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, 
&flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[14][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - } + __m256i shuf_01_23 = _mm256_permute2x128_si256(shuf_01, shuf_23, _MM_SHUFFLE(0, 2, 0, 0)); + __m256i taps_01_23 = _mm256_permute2x128_si256(taps_01, taps_23, _MM_SHUFFLE(0, 2, 0, 0)); + + int rows = 3; + for (x = 0; x + 3 < width; x += 4) { + int ypos = y - KVZ_CHROMA_FILTER_OFFSET; + int xpos = x - KVZ_CHROMA_FILTER_OFFSET; + kvz_four_tap_filter_hor_4xN_avx2(&src[src_stride*ypos + xpos], src_stride, &hor_intermediate[y * hor_stride + x], hor_stride, + &shuf_01_23, &taps_01_23, + rows); //TODO: >> shift1 } -} -void kvz_filter_frac_blocks_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block filtered[15], int8_t fme_level) -{ - switch (fme_level) { - case 1: - kvz_filter_hpel_blocks_hor_ver_luma_avx2(encoder, src, src_stride, width, height, filtered); - break; - case 2: - kvz_filter_hpel_blocks_full_luma_avx2(encoder, src, src_stride, width, height, filtered); - break; - case 3: - kvz_filter_qpel_blocks_hor_ver_luma_avx2(encoder, src, src_stride, width, height, filtered); - break; - default: - kvz_filter_qpel_blocks_full_luma_avx2(encoder, src, src_stride, width, height, filtered); - break; + // VERTICAL STEP + for (y = 0; y + 3 < height; y += 4) { + for (x = 0; x + 3 < width; x += 4) { + kvz_four_tap_filter_ver_16bit_4x4_avx2(ver_fir, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); + } } } -void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height,kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) +static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t * const encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + int16_t *dst, + int16_t dst_stride, + int8_t hor_flag, + int8_t ver_flag, + const int16_t mv[2]) { - //Check for amp + // TODO: Optimize SMP and AMP if (width != height) { - kvz_sample_quarterpel_luma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); + kvz_sample_14bit_octpel_chroma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); return; } - //TODO: horizontal and vertical only filtering - int32_t x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - - int8_t *hor_filter = kvz_g_luma_filter[mv[0] & 3]; - int8_t *ver_filter = kvz_g_luma_filter[mv[1] & 3]; - - int16_t hor_filtered[(LCU_WIDTH + 1) + FILTER_SIZE][(LCU_WIDTH + 1) + FILTER_SIZE]; - - if (width == 4) { - // Filter horizontally and flip x and y - for (y = 0; y < height + FILTER_SIZE - 1; ++y) { - for (x = 0; x < width; x += 4) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - int16_t *out = &(hor_filtered[y][x]); - kvz_eight_tap_filter_x4_hor_avx2(hor_filter, &src[src_stride*ypos + xpos], shift1, out); - } - } + // TODO: horizontal and vertical only filtering + int x, y; - // Filter vertically and flip x and y - for (y = 0; y < height; ++y) { - for (x = 0; x < width; x+=4) { - int ypos = y; - int xpos = x; - *(int32_t*)&(dst[y*dst_stride + x]) = kvz_eight_tap_filter_x4_ver_16bit_avx2(ver_filter, 
&hor_filtered[ypos][xpos], sizeof(hor_filtered[0])/sizeof(int16_t), offset23, shift2, shift3); - } - } + int8_t *hor_fir = kvz_g_chroma_filter[mv[0] & 7]; + int8_t *ver_fir = kvz_g_chroma_filter[mv[1] & 7]; - } else { - // Filter horizontally and flip x and y - for (y = 0; y < height + FILTER_SIZE - 1; ++y) { - for (x = 0; x < width; x+=8) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - int16_t *dst = &(hor_filtered[y][x]); - kvz_eight_tap_filter_x8_hor_avx2(hor_filter, &src[src_stride*ypos + xpos], shift1, dst); - } - } + int16_t hor_stride = LCU_WIDTH_C; + int16_t hor_intermediate[KVZ_EXT_BLOCK_W_CHROMA * LCU_WIDTH_C]; - // Filter vertically and flip x and y - for (y = 0; y < height; ++y) { - for (x = 0; x < width; x+=8) { - int ypos = y; - int xpos = x; - kvz_pixel *out = &(dst[y*dst_stride + x]); - kvz_eight_tap_filter_x8_ver_16bit_avx2(ver_filter, &hor_filtered[ypos][xpos], sizeof(hor_filtered[0])/sizeof(int16_t), offset23, shift2, shift3, out); - } - } - } -} + // HORIZONTAL STEP + __m256i shuf_01, shuf_23; + __m256i taps_01, taps_23; -void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height,kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) -{ - //Check for amp - if (width != height) { - kvz_sample_octpel_chroma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); - return; - } - //TODO: horizontal and vertical only filtering - int32_t x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - - int8_t *hor_filter = kvz_g_chroma_filter[mv[0] & 7]; - int8_t *ver_filter = kvz_g_chroma_filter[mv[1] & 7]; - -#define FILTER_SIZE_C (FILTER_SIZE / 2) -#define FILTER_OFFSET_C (FILTER_OFFSET / 2) - int16_t hor_filtered[(LCU_WIDTH_C + 1) + FILTER_SIZE_C][(LCU_WIDTH_C + 1) + FILTER_SIZE_C]; - - if (width == 4) { - // Filter horizontally and flip x and y - for (y = 0; y < height + FILTER_SIZE_C - 1; ++y) { - for (x = 0; x < width; x += 4) { - int ypos = y - FILTER_OFFSET_C; - int xpos = x - FILTER_OFFSET_C; - int16_t *out = &(hor_filtered[y][x]); - kvz_four_tap_filter_x4_hor_avx2(hor_filter, &src[src_stride*ypos + xpos], shift1, out); - } - } + kvz_init_shuffle_masks_chroma(&shuf_01, &shuf_23); + kvz_init_filter_taps_chroma(hor_fir, &taps_01, &taps_23); - // Filter vertically and flip x and y - for (y = 0; y < height; ++y) { - for (x = 0; x < width; x+=4) { - int ypos = y; - int xpos = x; - *(int32_t*)&(dst[y*dst_stride + x]) = kvz_four_tap_filter_x4_ver_16bit_avx2(ver_filter, &hor_filtered[ypos][xpos], sizeof(hor_filtered[0])/sizeof(int16_t), offset23, shift2, shift3); - } - } + for (y = 0; y + 3 < height + KVZ_EXT_PADDING_CHROMA; y += 4) { - } else { - // Filter horizontally and flip x and y - for (y = 0; y < height + FILTER_SIZE_C - 1; ++y) { - for (x = 0; x < width; x += 8) { - int ypos = y - FILTER_OFFSET_C; - int xpos = x - FILTER_OFFSET_C; - int16_t *dst = &(hor_filtered[y][x]); - kvz_four_tap_filter_x8_hor_avx2(hor_filter, &src[src_stride*ypos + xpos], shift1, dst); - } + for (x = 0; x + 3 < width; x += 4) { + int ypos = y - KVZ_CHROMA_FILTER_OFFSET; + int xpos = x - KVZ_CHROMA_FILTER_OFFSET; + kvz_four_tap_filter_hor_4x4_avx2(&src[src_stride*ypos + xpos], src_stride, &hor_intermediate[y * hor_stride + x], hor_stride, + &shuf_01, &shuf_23, + &taps_01, &taps_23); //TODO: >> shift1 } + } - // Filter vertically and flip x 
and y - for (y = 0; y < height; ++y) { - for (x = 0; x < width; x+=8) { - int ypos = y; - int xpos = x; - kvz_pixel *out = &(dst[y*dst_stride + x]); - kvz_four_tap_filter_x8_ver_16bit_avx2(ver_filter, &hor_filtered[ypos][xpos], sizeof(hor_filtered[0])/sizeof(int16_t), offset23, shift2, shift3, out); - } + __m256i shuf_01_23 = _mm256_permute2x128_si256(shuf_01, shuf_23, _MM_SHUFFLE(0, 2, 0, 0)); + __m256i taps_01_23 = _mm256_permute2x128_si256(taps_01, taps_23, _MM_SHUFFLE(0, 2, 0, 0)); + + int rows = 3; + for (x = 0; x + 3 < width; x += 4) { + int ypos = y - KVZ_CHROMA_FILTER_OFFSET; + int xpos = x - KVZ_CHROMA_FILTER_OFFSET; + kvz_four_tap_filter_hor_4xN_avx2(&src[src_stride*ypos + xpos], src_stride, &hor_intermediate[y * hor_stride + x], hor_stride, + &shuf_01_23, &taps_01_23, + rows); //TODO: >> shift1 + } + + // VERTICAL STEP + for (y = 0; y + 3 < height; y += 4) { + for (x = 0; x + 3 < width; x += 4) { + kvz_four_tap_filter_ver_16bit_4x4_no_round_avx2(ver_fir, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); } } } - void kvz_get_extended_block_avx2(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, int filter_size, int width, int height, kvz_extended_block *out) { @@ -1427,12 +1511,14 @@ bool success = true; #if COMPILE_INTEL_AVX2 if (bitdepth == 8){ - success &= kvz_strategyselector_register(opaque, "filter_inter_quarterpel_luma", "avx2", 40, &kvz_filter_inter_quarterpel_luma_avx2); - success &= kvz_strategyselector_register(opaque, "filter_inter_halfpel_chroma", "avx2", 40, &kvz_filter_inter_halfpel_chroma_avx2); - success &= kvz_strategyselector_register(opaque, "filter_inter_octpel_chroma", "avx2", 40, &kvz_filter_inter_octpel_chroma_avx2); - success &= kvz_strategyselector_register(opaque, "filter_frac_blocks_luma", "avx2", 40, &kvz_filter_frac_blocks_luma_avx2); + success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_hor_ver_luma", "avx2", 40, &kvz_filter_hpel_blocks_hor_ver_luma_avx2); + success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_diag_luma", "avx2", 40, &kvz_filter_hpel_blocks_diag_luma_avx2); + success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_hor_ver_luma", "avx2", 40, &kvz_filter_qpel_blocks_hor_ver_luma_avx2); + success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_diag_luma", "avx2", 40, &kvz_filter_qpel_blocks_diag_luma_avx2); success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma", "avx2", 40, &kvz_sample_quarterpel_luma_avx2); success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma", "avx2", 40, &kvz_sample_octpel_chroma_avx2); + success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "avx2", 40, &kvz_sample_14bit_quarterpel_luma_avx2); + success &= kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "avx2", 40, &kvz_sample_14bit_octpel_chroma_avx2); } success &= kvz_strategyselector_register(opaque, "get_extended_block", "avx2", 40, &kvz_get_extended_block_avx2); #endif //COMPILE_INTEL_AVX2
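For readers skimming the rewritten interpolation path in the hunk above: the new AVX2 kernels (kvz_eight_tap_filter_hor_8x1_avx2 followed by kvz_eight_tap_filter_ver_16bit_8x8_avx2) implement a separable two-pass 8-tap filter, with the fractional position chosen by `mv & 3` and the final rounding done with `wp_offset1`/`wp_shift1` before clipping. Below is a minimal scalar sketch of that computation, assuming 8-bit depth and the standard HEVC luma coefficients; the names (`sample_quarterpel_scalar`, `luma_fir`, `clip_pixel`) are illustrative and not part of kvazaar's API.

```c
#include <stdint.h>

/* Standard HEVC 8-tap luma coefficients for the four quarter-sample
 * phases (phase 0 is the integer position); kvazaar keeps these in
 * kvz_g_luma_filter, which is not shown in this diff. */
static const int8_t luma_fir[4][8] = {
  {  0, 0,   0, 64,  0,   0, 0,  0 },
  { -1, 4, -10, 58, 17,  -5, 1,  0 },
  { -1, 4, -11, 40, 40, -11, 4, -1 },
  {  0, 1,  -5, 17, 58, -10, 4, -1 },
};

static uint8_t clip_pixel(int v)
{
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Scalar reference for the separable two-pass filtering: pass 1 filters
 * rows into a 16-bit intermediate, pass 2 filters its columns and applies
 * the same rounding as the AVX2 kernels (>> shift2, then + wp_offset and
 * >> wp_shift). Assumes 8-bit depth, width/height <= 64, and that src has
 * at least 3 valid pixels of margin above/left and 4 below/right. */
static void sample_quarterpel_scalar(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride,
                                     int width, int height,
                                     const int16_t mv[2])
{
  const int8_t *hor_fir = luma_fir[mv[0] & 3];
  const int8_t *ver_fir = luma_fir[mv[1] & 3];
  const int shift2    = 6;
  const int wp_shift  = 14 - 8;              /* 14 - KVZ_BIT_DEPTH */
  const int wp_offset = 1 << (wp_shift - 1);
  int16_t tmp[(64 + 7) * 64];                /* horizontal intermediate */

  /* Horizontal pass over height + 7 rows so the vertical taps have data. */
  for (int y = 0; y < height + 7; ++y) {
    for (int x = 0; x < width; ++x) {
      int sum = 0;
      for (int t = 0; t < 8; ++t)
        sum += hor_fir[t] * src[(y - 3) * src_stride + (x - 3 + t)];
      tmp[y * width + x] = (int16_t)sum;     /* >> shift1 is a no-op at 8 bits */
    }
  }

  /* Vertical pass with the weighted-prediction style rounding and clip. */
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int sum = 0;
      for (int t = 0; t < 8; ++t)
        sum += ver_fir[t] * tmp[(y + t) * width + x];
      dst[y * dst_stride + x] = clip_pixel(((sum >> shift2) + wp_offset) >> wp_shift);
    }
  }
}
```

Running a block through two 64-weight filter passes scales it by 2^12, which the shift2 + wp_shift rounding (6 + 6 at 8-bit depth) undoes before clipping.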
View file
kvazaar-1.2.0.tar.gz/src/strategies/avx2/picture-avx2.c -> kvazaar-1.3.0.tar.gz/src/strategies/avx2/picture-avx2.c
Changed
@@ -21,17 +21,59 @@ /* * \file */ -#include "strategies/avx2/picture-avx2.h" + +#include "global.h" #if COMPILE_INTEL_AVX2 +#include "strategies/avx2/picture-avx2.h" +#include "strategies/avx2/reg_sad_pow2_widths-avx2.h" + #include <immintrin.h> +#include <emmintrin.h> +#include <mmintrin.h> +#include <xmmintrin.h> #include <string.h> - #include "kvazaar.h" #include "strategies/strategies-picture.h" #include "strategyselector.h" #include "strategies/generic/picture-generic.h" +/** + * \brief Calculate Sum of Absolute Differences (SAD) + * + * Calculate Sum of Absolute Differences (SAD) between two rectangular regions + * located in arbitrary points in the picture. + * + * \param data1 Starting point of the first picture. + * \param data2 Starting point of the second picture. + * \param width Width of the region for which SAD is calculated. + * \param height Height of the region for which SAD is calculated. + * \param stride Width of the pixel array. + * + * \returns Sum of Absolute Differences + */ +uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int width, const int height, const unsigned stride1, const unsigned stride2) +{ + if (width == 0) + return 0; + if (width == 4) + return reg_sad_w4(data1, data2, height, stride1, stride2); + if (width == 8) + return reg_sad_w8(data1, data2, height, stride1, stride2); + if (width == 12) + return reg_sad_w12(data1, data2, height, stride1, stride2); + if (width == 16) + return reg_sad_w16(data1, data2, height, stride1, stride2); + if (width == 24) + return reg_sad_w24(data1, data2, height, stride1, stride2); + if (width == 32) + return reg_sad_w32(data1, data2, height, stride1, stride2); + if (width == 64) + return reg_sad_w64(data1, data2, height, stride1, stride2); + else + return reg_sad_arbitrary(data1, data2, width, height, stride1, stride2); +} /** * \brief Calculate SAD for 8x8 bytes in continuous memory. @@ -484,13 +526,13 @@ } static void kvz_satd_4x4_subblock_quad_avx2(const kvz_pixel *preds[4], - const int strides[4], + const int stride, const kvz_pixel *orig, const int orig_stride, unsigned costs[4]) { // TODO: AVX2 implementation - kvz_satd_4x4_subblock_quad_generic(preds, strides, orig, orig_stride, costs); + kvz_satd_4x4_subblock_quad_generic(preds, stride, orig, orig_stride, costs); } static unsigned satd_8x8_subblock_8bit_avx2(const kvz_pixel * buf1, unsigned stride1, const kvz_pixel * buf2, unsigned stride2) @@ -508,13 +550,13 @@ } static void satd_8x8_subblock_quad_avx2(const kvz_pixel **preds, - const int *strides, + const int stride, const kvz_pixel *orig, const int orig_stride, unsigned *costs) { - kvz_satd_8bit_8x8_general_dual_avx2(preds[0], strides[0], preds[1], strides[1], orig, orig_stride, &costs[0], &costs[1]); - kvz_satd_8bit_8x8_general_dual_avx2(preds[2], strides[2], preds[3], strides[3], orig, orig_stride, &costs[2], &costs[3]); + kvz_satd_8bit_8x8_general_dual_avx2(preds[0], stride, preds[1], stride, orig, orig_stride, &costs[0], &costs[1]); + kvz_satd_8bit_8x8_general_dual_avx2(preds[2], stride, preds[3], stride, orig, orig_stride, &costs[2], &costs[3]); } SATD_NxN(8bit_avx2, 8) @@ -577,7 +619,7 @@ static void satd_any_size_ ## suffix ( \ int width, int height, \ const kvz_pixel **preds, \ - const int *strides, \ + const int stride, \ const kvz_pixel *orig, \ const int orig_stride, \ unsigned num_modes, \ @@ -591,7 +633,7 @@ if (width % 8 != 0) { \ /* Process the first column using 4x4 blocks. 
*/ \ for (int y = 0; y < height; y += 4) { \ - kvz_satd_4x4_subblock_ ## suffix(preds, strides, orig, orig_stride, sums); \ + kvz_satd_4x4_subblock_ ## suffix(preds, stride, orig, orig_stride, sums); \ } \ orig_ptr += 4; \ for(int blk = 0; blk < num_parallel_blocks; ++blk){\ @@ -602,23 +644,23 @@ if (height % 8 != 0) { \ /* Process the first row using 4x4 blocks. */ \ for (int x = 0; x < width; x += 4 ) { \ - kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \ + kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \ } \ orig_ptr += 4 * orig_stride; \ for(int blk = 0; blk < num_parallel_blocks; ++blk){\ - pred_ptrs[blk] += 4 * strides[blk]; \ + pred_ptrs[blk] += 4 * stride; \ }\ height -= 4; \ } \ /* The rest can now be processed with 8x8 blocks. */ \ for (int y = 0; y < height; y += 8) { \ orig_ptr = &orig[y * orig_stride]; \ - pred_ptrs[0] = &preds[0][y * strides[0]]; \ - pred_ptrs[1] = &preds[1][y * strides[1]]; \ - pred_ptrs[2] = &preds[2][y * strides[2]]; \ - pred_ptrs[3] = &preds[3][y * strides[3]]; \ + pred_ptrs[0] = &preds[0][y * stride]; \ + pred_ptrs[1] = &preds[1][y * stride]; \ + pred_ptrs[2] = &preds[2][y * stride]; \ + pred_ptrs[3] = &preds[3][y * stride]; \ for (int x = 0; x < width; x += 8) { \ - satd_8x8_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \ + satd_8x8_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \ orig_ptr += 8; \ pred_ptrs[0] += 8; \ pred_ptrs[1] += 8; \ @@ -714,8 +756,570 @@ } } -#endif //COMPILE_INTEL_AVX2 +static void inter_recon_bipred_no_mov_avx2( + const int height, + const int width, + const int ypos, + const int xpos, + const hi_prec_buf_t*high_precision_rec0, + const hi_prec_buf_t*high_precision_rec1, + lcu_t* lcu, + kvz_pixel* temp_lcu_y, + kvz_pixel* temp_lcu_u, + kvz_pixel* temp_lcu_v) { + + // This function is used only when kvazaar can't find any movement from the current block + int y_in_lcu, x_in_lcu; + __m256i sample0_epi8, sample1_epi8, temp_y_epi8; + int32_t * pointer = 0; + + for (int temp_y = 0; temp_y < height; temp_y += 1) { + y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); + + for (int temp_x = 0; temp_x < width; temp_x += 32) { + + x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); + + switch (width) + { + + case 4: + + sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + + temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_y_epi8)); + + break; + + case 8: + + sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + + temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + // Store 64-bits from vector to memory + _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_y_epi8)); + + break; + + case 12: + sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + + temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + // 
Store 64-bits from vector to memory + _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_y_epi8)); + + x_in_lcu = ((xpos + temp_x + 8) & ((LCU_WIDTH)-1)); + + sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + + temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_y_epi8)); + break; + + + case 16: + + sample0_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + + temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + // Store 128-bit to memory + _mm_storeu_si128((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_y_epi8)); + + break; + + case 32: + + sample0_epi8 = _mm256_loadu_si256((__m256i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + sample1_epi8 = _mm256_loadu_si256((__m256i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + + temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + + // Store 256-bit integers to memory + _mm256_storeu_si256((__m256i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), temp_y_epi8); + break; + + default: + // If width is something strange size, use this + for (int temp_i = 0; temp_i < width; ++temp_i) { + x_in_lcu = ((xpos + temp_i) & ((LCU_WIDTH)-1)); + + int sample0_y = (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + int sample1_y = (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + + lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y) >> 1); + } + + + } + + if (temp_x < width >> 1 && temp_y < height >> 1) { + y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); + x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); + + __m256i temp_u_epi8; + __m256i temp_v_epi8; + + + switch (width) + { + + case 8: + + + sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_u_epi8)); + + pointer = (int32_t*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_v_epi8)); + + break; + + case 12: + + sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + sample0_epi8 = 
_mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_u_epi8)); + + pointer = (int32_t*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_v_epi8)); + + // This is used only with odd shaped objects + for (int temp_i = 4; temp_i < width >> 1; ++temp_i) { + int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1)); + int16_t sample0_u = (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + int16_t sample1_u = (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u) >> 1); + + int16_t sample0_v = (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + int16_t sample1_v = (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v) >> 1); + } + + break; + + case 16: + + sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + // Store 64-bit integer into memory + _mm_storel_epi64((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_u_epi8)); + + // Store 64-bit integer into memory + _mm_storel_epi64((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_v_epi8)); + + break; + + case 32: + + sample0_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + sample0_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + // Fill 128 bit vector with packed data and store it to memory + _mm_storeu_si128((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_u_epi8)); + + // Fill 128 bit vector with packed data and store it to memory + _mm_storeu_si128((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_v_epi8)); + + + break; + + case 64: + + sample0_epi8 = _mm256_loadu_si256((__m256i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_loadu_si256((__m256i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_u_epi8 = 
_mm256_avg_epu8(sample0_epi8, sample1_epi8); + + sample0_epi8 = _mm256_loadu_si256((__m256i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_loadu_si256((__m256i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + _mm256_storeu_si256((__m256i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), temp_u_epi8); + _mm256_storeu_si256((__m256i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), temp_v_epi8); + break; + + default: + // This is used only with odd shaped objects + for (int temp_i = 0; temp_i < width >> 1; ++temp_i) { + int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1)); + int16_t sample0_u = (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + int16_t sample1_u = (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u) >> 1); + + int16_t sample0_v = (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + int16_t sample1_v = (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v) >> 1); + } + + break; + } + y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); + } + } + } + + +} + +static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0, + const int hi_prec_luma_rec1, + const int hi_prec_chroma_rec0, + const int hi_prec_chroma_rec1, + const int height, + const int width, + const int ypos, + const int xpos, + const hi_prec_buf_t*high_precision_rec0, + const hi_prec_buf_t*high_precision_rec1, + lcu_t* lcu, + kvz_pixel* temp_lcu_y, + kvz_pixel* temp_lcu_u, + kvz_pixel* temp_lcu_v) +{ + if(hi_prec_luma_rec0 == 0 && hi_prec_luma_rec1 == 0 && hi_prec_chroma_rec0 == 0 && hi_prec_chroma_rec1 == 0) + { + inter_recon_bipred_no_mov_avx2(height, width, ypos, xpos, high_precision_rec0, high_precision_rec1, lcu, temp_lcu_y, temp_lcu_u, temp_lcu_v); + } + + else + { + + int y_in_lcu, x_in_lcu; + int shift = 15 - KVZ_BIT_DEPTH; + int offset = 1 << (shift - 1); + __m256i temp_epi8, temp_y_epi32, sample0_epi32, sample1_epi32, temp_epi16; + int32_t * pointer = 0; + __m256i offset_epi32 = _mm256_set1_epi32(offset); + + for (int temp_y = 0; temp_y < height; ++temp_y) { + + y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); + + for (int temp_x = 0; temp_x < width; temp_x += 8) { + x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); + + bool use_8_elements = ((temp_x + 8) <= width); + + switch (use_8_elements) + { + + case false: + + if (width < 4) { + // If width is smaller than 4 there's no need to use SIMD + for (int temp_i = 0; temp_i < width; ++temp_i) { + x_in_lcu = ((xpos + temp_i) & ((LCU_WIDTH)-1)); + + int sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + + lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); + } + } + + else{ + // Load total of 4 elements from memory to vector + sample0_epi32 = hi_prec_luma_rec0 ? 
_mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)&(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + + sample1_epi32 = hi_prec_luma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + + // (sample1 + sample2 + offset)>>shift + temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32); + temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift); + + // Pack the bits from 32-bit to 8-bit + temp_epi16 = _mm256_packs_epi32(temp_y_epi32, temp_y_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + + pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8)); + + + + for (int temp_i = temp_x + 4; temp_i < width; ++temp_i) { + x_in_lcu = ((xpos + temp_i) & ((LCU_WIDTH)-1)); + + int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + + lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); + } + + } + break; + + default: + // Load total of 8 elements from memory to vector + sample0_epi32 = hi_prec_luma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), 14 - KVZ_BIT_DEPTH); + + sample1_epi32 = hi_prec_luma_rec1 ? 
_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), 14 - KVZ_BIT_DEPTH); + + // (sample1 + sample2 + offset)>>shift + temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32); + temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift); + + // Pack the bits from 32-bit to 8-bit + temp_epi16 = _mm256_packs_epi32(temp_y_epi32, temp_y_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + + // Store 64-bits from vector to memory + _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8)); + + break; + } + + + } + } + for (int temp_y = 0; temp_y < height >> 1; ++temp_y) { + int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); + + for (int temp_x = 0; temp_x < width >> 1; temp_x += 8) { + + int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); + + if ((width >> 1) < 4) { + // If width>>1 is smaller than 4 there's no need to use SIMD + + for (int temp_i = 0; temp_i < width >> 1; ++temp_i) { + int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1)); + int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); + + int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); + } + } + + else{ + + bool use_8_elements = ((temp_x + 8) <= (width>>1)); + + __m256i temp_u_epi32, temp_v_epi32; + + switch (use_8_elements) + { + + case false: + // Load 4 pixels to vector + sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + sample1_epi32 = hi_prec_chroma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + // (sample1 + sample2 + offset)>>shift + temp_u_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_u_epi32 = _mm256_add_epi32(temp_u_epi32, offset_epi32); + temp_u_epi32 = _mm256_srai_epi32(temp_u_epi32, shift); + + + + sample0_epi32 = hi_prec_chroma_rec0 ? 
_mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + sample1_epi32 = hi_prec_chroma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + + // (sample1 + sample2 + offset)>>shift + temp_v_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_v_epi32 = _mm256_add_epi32(temp_v_epi32, offset_epi32); + temp_v_epi32 = _mm256_srai_epi32(temp_v_epi32, shift); + + + temp_epi16 = _mm256_packs_epi32(temp_u_epi32, temp_u_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + + pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8)); + + + temp_epi16 = _mm256_packs_epi32(temp_v_epi32, temp_v_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + + pointer = (int32_t*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8)); + + for (int temp_i = 4; temp_i < width >> 1; ++temp_i) { + + // Use only if width>>1 is not divideble by 4 + int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1)); + int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); + + int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); + } + + + break; + + default: + // Load 8 pixels to vector + sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + sample1_epi32 = hi_prec_chroma_rec1 ? 
_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + // (sample1 + sample2 + offset)>>shift + temp_u_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_u_epi32 = _mm256_add_epi32(temp_u_epi32, offset_epi32); + temp_u_epi32 = _mm256_srai_epi32(temp_u_epi32, shift); + + sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + sample1_epi32 = hi_prec_chroma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + + // (sample1 + sample2 + offset)>>shift + temp_v_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_v_epi32 = _mm256_add_epi32(temp_v_epi32, offset_epi32); + temp_v_epi32 = _mm256_srai_epi32(temp_v_epi32, shift); + + temp_epi16 = _mm256_packs_epi32(temp_u_epi32, temp_u_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + + // Store 64-bit integer into memory + _mm_storel_epi64((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8)); + + temp_epi16 = _mm256_packs_epi32(temp_v_epi32, temp_v_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + + // Store 64-bit integer into memory + _mm_storel_epi64((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8)); + + + break; + } + } + } + } + } +} + +static optimized_sad_func_ptr_t get_optimized_sad_avx2(int32_t width) +{ + if (width == 0) + return reg_sad_w0; + if (width == 4) + return reg_sad_w4; + if (width == 8) + return reg_sad_w8; + if (width == 12) + return reg_sad_w12; + if (width == 16) + return reg_sad_w16; + if (width == 24) + return reg_sad_w24; + if (width == 32) + return reg_sad_w32; + if (width == 64) + return reg_sad_w64; + else + return NULL; +} + +static uint32_t ver_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t stride) +{ + if (width == 0) + return 0; + if (width == 4) + return ver_sad_w4(pic_data, ref_data, height, stride); + if (width == 8) + return ver_sad_w8(pic_data, ref_data, height, stride); + if (width == 12) + return ver_sad_w12(pic_data, ref_data, height, stride); + if (width == 16) + return ver_sad_w16(pic_data, ref_data, height, stride); + else + return ver_sad_arbitrary(pic_data, ref_data, width, height, stride); +} + +static uint32_t hor_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t pic_stride, + uint32_t ref_stride, uint32_t left, uint32_t right) +{ + if (width == 4) + return hor_sad_sse41_w4(pic_data, ref_data, height, + pic_stride, ref_stride, left, right); + if (width == 8) + return hor_sad_sse41_w8(pic_data, ref_data, height, + pic_stride, ref_stride, left, right); + if (width == 16) + return hor_sad_sse41_w16(pic_data, ref_data, height, + pic_stride, 
ref_stride, left, right); + if (width == 32) + return hor_sad_avx2_w32 (pic_data, ref_data, height, + pic_stride, ref_stride, left, right); + else + return hor_sad_sse41_arbitrary(pic_data, ref_data, width, height, + pic_stride, ref_stride, left, right); +} + +#endif //COMPILE_INTEL_AVX2 int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth) { @@ -726,6 +1330,8 @@ // simplest code to look at for anyone interested in doing more // optimizations, so it's worth it to keep this maintained. if (bitdepth == 8){ + + success &= kvz_strategyselector_register(opaque, "reg_sad", "avx2", 40, &kvz_reg_sad_avx2); success &= kvz_strategyselector_register(opaque, "sad_8x8", "avx2", 40, &sad_8bit_8x8_avx2); success &= kvz_strategyselector_register(opaque, "sad_16x16", "avx2", 40, &sad_8bit_16x16_avx2); success &= kvz_strategyselector_register(opaque, "sad_32x32", "avx2", 40, &sad_8bit_32x32_avx2); @@ -746,6 +1352,11 @@ success &= kvz_strategyselector_register(opaque, "satd_any_size_quad", "avx2", 40, &satd_any_size_quad_avx2); success &= kvz_strategyselector_register(opaque, "pixels_calc_ssd", "avx2", 40, &pixels_calc_ssd_avx2); + success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "avx2", 40, &inter_recon_bipred_avx2); + success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "avx2", 40, &get_optimized_sad_avx2); + success &= kvz_strategyselector_register(opaque, "ver_sad", "avx2", 40, &ver_sad_avx2); + success &= kvz_strategyselector_register(opaque, "hor_sad", "avx2", 40, &hor_sad_avx2); + } #endif return success;
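Two of the additions registered above read most easily as their scalar equivalents: kvz_reg_sad_avx2 only dispatches on block width so each reg_sad_wN kernel can run a branch-free inner loop, and the no-motion bi-prediction path leans on _mm256_avg_epu8, a rounded 8-bit average. A rough scalar sketch of both (names here are illustrative, not part of Kvazaar's API):

#include <stdint.h>

/* Scalar reference for the width-dispatched SAD: every reg_sad_wN kernel
 * computes exactly this, just without per-pixel branching. */
static uint32_t reg_sad_ref(const uint8_t *data1, const uint8_t *data2,
                            int width, int height,
                            unsigned stride1, unsigned stride2)
{
  uint32_t sad = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int d = (int)data1[y * stride1 + x] - (int)data2[y * stride2 + x];
      sad += (uint32_t)(d < 0 ? -d : d);
    }
  }
  return sad;
}

/* Per-pixel meaning of _mm256_avg_epu8 in the no-motion bi-prediction path:
 * a rounded average of the two 8-bit prediction samples. */
static inline uint8_t avg_round_u8(uint8_t a, uint8_t b)
{
  return (uint8_t)((a + b + 1) >> 1);
}

get_optimized_sad_avx2 applies the same idea one level up: it hands the caller a pointer to the fixed-width kernel (or NULL for unsupported widths) so the width dispatch can be hoisted out of the motion-search inner loop.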
View file
kvazaar-1.2.0.tar.gz/src/strategies/avx2/quant-avx2.c -> kvazaar-1.3.0.tar.gz/src/strategies/avx2/quant-avx2.c
Changed
@@ -28,6 +28,7 @@ #include <immintrin.h> #include <stdlib.h> +#include "avx2_common_functions.h" #include "cu.h" #include "encoder.h" #include "encoderstate.h" @@ -40,17 +41,316 @@ #include "tables.h" #include "transform.h" +static INLINE int32_t hsum32_8x32i(__m256i src) +{ + __m128i a = _mm256_extracti128_si256(src, 0); + __m128i b = _mm256_extracti128_si256(src, 1); + + a = _mm_add_epi32(a, b); + b = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)); + + a = _mm_add_epi32(a, b); + b = _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1)); + + a = _mm_add_epi32(a, b); + return _mm_cvtsi128_si32(a); +} + +static INLINE int32_t hsum32_16x16i(__m256i src) +{ + __m128i a = _mm256_extracti128_si256(src, 0); + __m128i b = _mm256_extracti128_si256(src, 1); + __m256i c = _mm256_cvtepi16_epi32(a); + __m256i d = _mm256_cvtepi16_epi32(b); + + c = _mm256_add_epi32(c, d); + return hsum32_8x32i(c); +} + +// Rearranges a 16x32b double vector into a format suitable for a stable SIMD +// max algorithm: +// (abcd|efgh) (ijkl|mnop) => (aceg|ikmo) (bdfh|jlnp) +static INLINE void rearrange_512(__m256i *hi, __m256i *lo) +{ + const __m256i perm8x32mask = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); + + __m256i tmphi = _mm256_permutevar8x32_epi32(*hi, perm8x32mask); + __m256i tmplo = _mm256_permutevar8x32_epi32(*lo, perm8x32mask); + + *hi = _mm256_permute2x128_si256(tmplo, tmphi, 0x31); + *lo = _mm256_permute2x128_si256(tmplo, tmphi, 0x20); +} + +static INLINE void get_cheapest_alternative(__m256i costs_hi, __m256i costs_lo, + __m256i ns, __m256i changes, + int16_t *final_change, int32_t *min_pos) +{ + // Interleave ns and lo into 32-bit variables and to two 256-bit wide vecs, + // to have the same data layout as in costs. Zero extend to 32b width, shift + // changes 16 bits to the left, and store them into the same vectors. + __m256i tmp1hi = _mm256_unpackhi_epi16(ns, changes); + __m256i tmp1lo = _mm256_unpacklo_epi16(ns, changes); + + __m256i pl1hi = _mm256_permute2x128_si256(tmp1lo, tmp1hi, 0x31); + __m256i pl1lo = _mm256_permute2x128_si256(tmp1lo, tmp1hi, 0x20); + + // Reorder to afford result stability (if multiple atoms tie for cheapest, + // rightmost ie. 
the highest is the wanted one) + rearrange_512(&costs_hi, &costs_lo); + rearrange_512(&pl1hi, &pl1lo); + + // 0: pick hi, 1: pick lo (equality evaluates as 0) + __m256i cmpmask1 = _mm256_cmpgt_epi32(costs_hi, costs_lo); + __m256i cost1 = _mm256_blendv_epi8(costs_hi, costs_lo, cmpmask1); + __m256i pl1_1 = _mm256_blendv_epi8(pl1hi, pl1lo, cmpmask1); + + __m256i cost2 = _mm256_shuffle_epi32(cost1, _MM_SHUFFLE(2, 3, 0, 1)); + __m256i pl1_2 = _mm256_shuffle_epi32(pl1_1, _MM_SHUFFLE(2, 3, 0, 1)); + + __m256i cmpmask2 = _mm256_cmpgt_epi32(cost2, cost1); + __m256i cost3 = _mm256_blendv_epi8(cost2, cost1, cmpmask2); + __m256i pl1_3 = _mm256_blendv_epi8(pl1_2, pl1_1, cmpmask2); + + __m256i cost4 = _mm256_shuffle_epi32(cost3, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i pl1_4 = _mm256_shuffle_epi32(pl1_3, _MM_SHUFFLE(1, 0, 3, 2)); + + __m256i cmpmask3 = _mm256_cmpgt_epi32(cost4, cost3); + __m256i cost5 = _mm256_blendv_epi8(cost4, cost3, cmpmask3); + __m256i pl1_5 = _mm256_blendv_epi8(pl1_4, pl1_3, cmpmask3); + + __m256i cost6 = _mm256_permute4x64_epi64(cost5, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i pl1_6 = _mm256_permute4x64_epi64(pl1_5, _MM_SHUFFLE(1, 0, 3, 2)); + + __m256i cmpmask4 = _mm256_cmpgt_epi32(cost6, cost5); + __m256i pl1_7 = _mm256_blendv_epi8(pl1_6, pl1_5, cmpmask4); + + __m128i res1_128 = _mm256_castsi256_si128(pl1_7); + uint32_t tmp1 = (uint32_t)_mm_extract_epi32(res1_128, 0); + uint16_t n = (uint16_t)(tmp1 & 0xffff); + uint16_t chng = (uint16_t)(tmp1 >> 16); + + *final_change = (int16_t)chng; + *min_pos = (int32_t)n; +} + +static INLINE __m256i concatenate_2x128i(__m128i lo, __m128i hi) +{ + __m256i v = _mm256_castsi128_si256(lo); + return _mm256_inserti128_si256(v, hi, 1); +} + +static INLINE void scanord_read_vector_32(const int32_t *__restrict quant_coeff, + const uint32_t *__restrict scan, + int8_t scan_mode, + int32_t subpos, + int32_t width, + __m256i *__restrict v_quant_coeffs) +{ + const size_t row_offsets[4] = { + scan[subpos] + width * 0, + scan[subpos] + width * 1, + scan[subpos] + width * 2, + scan[subpos] + width * 3, + }; + + const __m256i shufmasks[3] = { + _mm256_setr_epi32(5, 2, 6, 0, 3, 7, 4, 1), + _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7), + _mm256_setr_epi32(2, 3, 0, 1, 6, 7, 4, 5), + }; + + const __m256i blend_masks[3] = { + _mm256_setr_epi32( 0, 0, 0, -1, 0, 0, -1, -1), + _mm256_setr_epi32( 0, 0, 0, 0, 0, 0, 0, 0), + _mm256_setr_epi32( 0, 0, -1, -1, 0, 0, -1, -1), + }; + + const __m256i rearr_masks_lo[3] = { + _mm256_setr_epi32(0, 4, 1, 3, 5, 2, 6, 7), + _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7), + _mm256_setr_epi32(0, 4, 2, 6, 1, 5, 3, 7), + }; + + const __m256i rearr_masks_hi[3] = { + _mm256_setr_epi32(6, 3, 0, 1, 7, 2, 4, 5), + _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7), + _mm256_setr_epi32(2, 6, 0, 4, 3, 7, 1, 5), + }; + + __m128i coeffs[4] = { + _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets[0])), + _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets[1])), + _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets[2])), + _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets[3])), + }; + + __m256i coeffs_upper = concatenate_2x128i(coeffs[0], coeffs[1]); + __m256i coeffs_lower = concatenate_2x128i(coeffs[2], coeffs[3]); + + __m256i lower_shuffled = _mm256_permutevar8x32_epi32(coeffs_lower, shufmasks[scan_mode]); + + __m256i upper_blended = _mm256_blendv_epi8(coeffs_upper, lower_shuffled, blend_masks[scan_mode]); + __m256i lower_blended = _mm256_blendv_epi8(lower_shuffled, coeffs_upper, blend_masks[scan_mode]); + + __m256i result_lo = 
_mm256_permutevar8x32_epi32(upper_blended, rearr_masks_lo[scan_mode]); + __m256i result_hi = _mm256_permutevar8x32_epi32(lower_blended, rearr_masks_hi[scan_mode]); + + v_quant_coeffs[0] = result_lo; + v_quant_coeffs[1] = result_hi; +} + +#define VEC_WIDTH 16 +#define SCAN_SET_SIZE 16 +#define LOG2_SCAN_SET_SIZE 4 + +static INLINE int32_t hide_block_sign(__m256i coefs, __m256i q_coefs, __m256i deltas_h, __m256i deltas_l, coeff_t * __restrict q_coef, const uint32_t * __restrict scan, int32_t subpos, int32_t last_cg) +{ + assert(SCAN_SET_SIZE == 16); + + int32_t first_nz_pos_in_cg, last_nz_pos_in_cg; + int32_t abssum = 0; + + // Find first and last nonzero coeffs + get_first_last_nz_int16(q_coefs, &first_nz_pos_in_cg, &last_nz_pos_in_cg); + + // Sum all kvz_quant coeffs between first and last + abssum = hsum32_16x16i(q_coefs); + + if (last_nz_pos_in_cg >= 0 && last_cg == -1) { + last_cg = 1; + } + + if (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4) { + + uint32_t q_coef_signbits = _mm256_movemask_epi8(q_coefs); + int32_t signbit = (q_coef_signbits >> (2 * first_nz_pos_in_cg + 1)) & 0x1; + + if (signbit != (abssum & 0x1)) { // compare signbit with sum_parity + int32_t min_pos; + int16_t final_change; + int16_t cheapest_q; + + const int32_t mask_max = (last_cg == 1) ? last_nz_pos_in_cg : SCAN_SET_SIZE - 1; + + const __m256i zero = _mm256_setzero_si256(); + const __m256i ones = _mm256_set1_epi16(1); + const __m256i maxiters = _mm256_set1_epi16(mask_max); + const __m256i ff = _mm256_set1_epi8(0xff); + + const __m256i fnpics = _mm256_set1_epi16((int16_t)first_nz_pos_in_cg); + const __m256i ns = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + __m256i block_signbit = _mm256_set1_epi16(((int16_t)signbit) * -1); + __m256i coef_signbits = _mm256_cmpgt_epi16(zero, coefs); + __m256i signbits_equal_block = _mm256_cmpeq_epi16(coef_signbits, block_signbit); + + __m256i q_coefs_zero = _mm256_cmpeq_epi16(q_coefs, zero); + + __m256i dus_packed = _mm256_packs_epi32(deltas_l, deltas_h); + __m256i dus_ordered = _mm256_permute4x64_epi64(dus_packed, _MM_SHUFFLE(3, 1, 2, 0)); + __m256i dus_positive = _mm256_cmpgt_epi16(dus_ordered, zero); + + __m256i q_coef_abss = _mm256_abs_epi16(q_coefs); + __m256i q_coefs_plusminus_one = _mm256_cmpeq_epi16(q_coef_abss, ones); + + __m256i eq_fnpics = _mm256_cmpeq_epi16(fnpics, ns); + __m256i lt_fnpics = _mm256_cmpgt_epi16(fnpics, ns); + + __m256i maxcost_subcond1s = _mm256_and_si256(eq_fnpics, q_coefs_plusminus_one); + __m256i maxcost_subcond2s = _mm256_andnot_si256(signbits_equal_block, lt_fnpics); + __m256i elsecond1s_inv = _mm256_or_si256(dus_positive, maxcost_subcond1s); + __m256i elsecond1s = _mm256_andnot_si256(elsecond1s_inv, ff); + + __m256i outside_maxiters = _mm256_cmpgt_epi16(ns, maxiters); + + __m256i negdelta_cond1s = _mm256_andnot_si256(q_coefs_zero, dus_positive); + __m256i negdelta_cond2s = _mm256_andnot_si256(maxcost_subcond2s, q_coefs_zero); + __m256i negdelta_mask16s_part1 = _mm256_or_si256(negdelta_cond1s, negdelta_cond2s); + __m256i negdelta_mask16s = _mm256_andnot_si256(outside_maxiters, negdelta_mask16s_part1); + + __m256i posdelta_mask16s_part1 = _mm256_andnot_si256(q_coefs_zero, elsecond1s); + __m256i posdelta_mask16s = _mm256_andnot_si256(outside_maxiters, posdelta_mask16s_part1); + + __m256i maxcost_cond1_parts = _mm256_andnot_si256(dus_positive, maxcost_subcond1s); + __m256i maxcost_cond1s = _mm256_andnot_si256(q_coefs_zero, maxcost_cond1_parts); + __m256i maxcost_cond2s = _mm256_and_si256(q_coefs_zero, maxcost_subcond2s); + 
__m256i maxcost_mask16s_parts = _mm256_or_si256(maxcost_cond1s, maxcost_cond2s); + __m256i maxcost_mask16s = _mm256_or_si256(maxcost_mask16s_parts, outside_maxiters); + + __m128i tmp_l, tmp_h; + tmp_l = _mm256_extracti128_si256(negdelta_mask16s, 0); + tmp_h = _mm256_extracti128_si256(negdelta_mask16s, 1); + __m256i negdelta_mask32s_l = _mm256_cvtepi16_epi32(tmp_l); + __m256i negdelta_mask32s_h = _mm256_cvtepi16_epi32(tmp_h); + + tmp_l = _mm256_extracti128_si256(posdelta_mask16s, 0); + tmp_h = _mm256_extracti128_si256(posdelta_mask16s, 1); + __m256i posdelta_mask32s_l = _mm256_cvtepi16_epi32(tmp_l); + __m256i posdelta_mask32s_h = _mm256_cvtepi16_epi32(tmp_h); + + tmp_l = _mm256_extracti128_si256(maxcost_mask16s, 0); + tmp_h = _mm256_extracti128_si256(maxcost_mask16s, 1); + __m256i maxcost_mask32s_l = _mm256_cvtepi16_epi32(tmp_l); + __m256i maxcost_mask32s_h = _mm256_cvtepi16_epi32(tmp_h); + + // Output value generation + // cur_change_max: zero + // cur_change_negdelta: ff + // cur_change_posdelta: ones + __m256i costs_negdelta_h = _mm256_sub_epi32(zero, deltas_h); + __m256i costs_negdelta_l = _mm256_sub_epi32(zero, deltas_l); + // costs_posdelta_l and _h: deltas_l and _h + __m256i costs_max_lh = _mm256_set1_epi32(0x7fffffff); + + __m256i change_neg = _mm256_and_si256(negdelta_mask16s, ones); + __m256i change_pos = _mm256_and_si256(posdelta_mask16s, ff); + __m256i change_max = _mm256_and_si256(maxcost_mask16s, zero); + + __m256i cost_neg_l = _mm256_and_si256(negdelta_mask32s_l, costs_negdelta_l); + __m256i cost_neg_h = _mm256_and_si256(negdelta_mask32s_h, costs_negdelta_h); + __m256i cost_pos_l = _mm256_and_si256(posdelta_mask32s_l, deltas_l); + __m256i cost_pos_h = _mm256_and_si256(posdelta_mask32s_h, deltas_h); + __m256i cost_max_l = _mm256_and_si256(maxcost_mask32s_l, costs_max_lh); + __m256i cost_max_h = _mm256_and_si256(maxcost_mask32s_h, costs_max_lh); + + __m256i changes = _mm256_or_si256(change_neg, _mm256_or_si256(change_pos, change_max)); + __m256i costs_l = _mm256_or_si256(cost_neg_l, _mm256_or_si256(cost_pos_l, cost_max_l)); + __m256i costs_h = _mm256_or_si256(cost_neg_h, _mm256_or_si256(cost_pos_h, cost_max_h)); + + get_cheapest_alternative(costs_h, costs_l, ns, changes, &final_change, &min_pos); + const int32_t best_id = scan[min_pos + subpos]; + + cheapest_q = q_coef[best_id]; + if (cheapest_q == 32767 || cheapest_q == -32768) + final_change = -1; + + uint32_t coef_signs = _mm256_movemask_epi8(coef_signbits); + uint32_t cheapest_coef_sign_mask = (uint32_t)(1 << (2 * min_pos)); + + if (!(coef_signs & cheapest_coef_sign_mask)) + cheapest_q += final_change; + else + cheapest_q -= final_change; + + q_coef[best_id] = cheapest_q; + } // Hide + } + if (last_cg == 1) + last_cg = 0; + + return last_cg; +} /** * \brief quantize transformed coefficents * */ -void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width, +void kvz_quant_avx2(const encoder_state_t * const state, const coeff_t * __restrict coef, coeff_t * __restrict q_coef, int32_t width, int32_t height, int8_t type, int8_t scan_idx, int8_t block_type) { const encoder_control_t * const encoder = state->encoder_control; const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2; - const uint32_t * const scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1]; + const uint32_t * const __restrict scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1]; int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6); const uint32_t log2_tr_size = 
kvz_g_convert_to_bit[width] + 2; @@ -61,28 +361,58 @@ const int32_t add = ((state->frame->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9); const int32_t q_bits8 = q_bits - 8; - assert(quant_coeff[0] <= (1 << 15) - 1 && quant_coeff[0] >= -(1 << 15)); //Assuming flat values to fit int16_t - uint32_t ac_sum = 0; + int32_t last_cg = -1; __m256i v_ac_sum = _mm256_setzero_si256(); - __m256i v_quant_coeff = _mm256_set1_epi16(quant_coeff[0]); - for (int32_t n = 0; n < width * height; n += 16) { + // Loading once is enough if scaling lists are not off + __m256i low_b = _mm256_setzero_si256(), high_b = _mm256_setzero_si256(); + if (!(state->encoder_control->scaling_list.enable)) { + low_b = _mm256_set1_epi32(quant_coeff[0]); + high_b = low_b; + } + + for (int32_t n = 0; n < width * height; n += VEC_WIDTH) { - __m256i v_level = _mm256_loadu_si256((__m256i*)&(coef[n])); + __m256i v_level = _mm256_loadu_si256((__m256i *)(coef + n)); __m256i v_sign = _mm256_cmpgt_epi16(_mm256_setzero_si256(), v_level); v_sign = _mm256_or_si256(v_sign, _mm256_set1_epi16(1)); - v_level = _mm256_abs_epi16(v_level); - __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0)); - __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0)); + if (state->encoder_control->scaling_list.enable) { + __m256i v_quant_coeff_lo = _mm256_loadu_si256(((__m256i *)(quant_coeff + n)) + 0); + __m256i v_quant_coeff_hi = _mm256_loadu_si256(((__m256i *)(quant_coeff + n)) + 1); - __m256i low_b = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0)); - __m256i high_b = _mm256_unpackhi_epi16(v_quant_coeff, _mm256_set1_epi16(0)); + low_b = _mm256_permute2x128_si256(v_quant_coeff_lo, + v_quant_coeff_hi, + 0x20); + + high_b = _mm256_permute2x128_si256(v_quant_coeff_lo, + v_quant_coeff_hi, + 0x31); + } - __m256i v_level32_a = _mm256_madd_epi16(low_a, low_b); - __m256i v_level32_b = _mm256_madd_epi16(high_a, high_b); +// TODO: do we need to have this? 
+// #define CHECK_QUANT_COEFFS +#ifdef CHECK_QUANT_COEFFS + __m256i abs_vq_lo = _mm256_abs_epi32(v_quant_coeff_lo); + __m256i abs_vq_hi = _mm256_abs_epi32(v_quant_coeff_hi); + + __m256i vq_over_16b_lo = _mm256_cmpgt_epi32(abs_vq_lo, _mm256_set1_epi32(0x7fff)); + __m256i vq_over_16b_hi = _mm256_cmpgt_epi32(abs_vq_hi, _mm256_set1_epi32(0x7fff)); + + uint32_t over_16b_mask_lo = _mm256_movemask_epi8(vq_over_16b_lo); + uint32_t over_16b_mask_hi = _mm256_movemask_epi8(vq_over_16b_hi); + + assert(!(over_16b_mask_lo || over_16b_mask_hi)); +#endif + + v_level = _mm256_abs_epi16(v_level); + __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_setzero_si256()); + __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_setzero_si256()); + + __m256i v_level32_a = _mm256_mullo_epi32(low_a, low_b); + __m256i v_level32_b = _mm256_mullo_epi32(high_a, high_b); v_level32_a = _mm256_add_epi32(v_level32_a, _mm256_set1_epi32(add)); v_level32_b = _mm256_add_epi32(v_level32_b, _mm256_set1_epi32(add)); @@ -93,7 +423,7 @@ v_level = _mm256_packs_epi32(v_level32_a, v_level32_b); v_level = _mm256_sign_epi16(v_level, v_sign); - _mm256_storeu_si256((__m256i*)&(q_coef[n]), v_level); + _mm256_storeu_si256((__m256i *)(q_coef + n), v_level); v_ac_sum = _mm256_add_epi32(v_ac_sum, v_level32_a); v_ac_sum = _mm256_add_epi32(v_ac_sum, v_level32_b); @@ -104,23 +434,47 @@ temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, _MM_SHUFFLE(0, 1, 0, 1))); ac_sum += _mm_cvtsi128_si32(temp); - if (!encoder->cfg.signhide_enable || ac_sum < 2) return; + if (!encoder->cfg.signhide_enable || ac_sum < 2) + return; - int32_t delta_u[LCU_WIDTH*LCU_WIDTH >> 2]; + assert(VEC_WIDTH == SCAN_SET_SIZE); + for (int32_t subpos = (width * height - 1) & (~(VEC_WIDTH - 1)); subpos >= 0; subpos -= VEC_WIDTH) { + const int16_t *coeffs[2] = {coef, q_coef}; + __m256i result_coeffs[2]; + __m256i v_quant_coeffs[2]; - for (int32_t n = 0; n < width * height; n += 16) { + __m256i v_coef, q_coefs; + __m256i v_quant_coeff_lo, v_quant_coeff_hi; - __m256i v_level = _mm256_loadu_si256((__m256i*)&(coef[n])); + scanord_read_vector(coeffs, scan, scan_idx, subpos, width, result_coeffs, 2); - v_level = _mm256_abs_epi16(v_level); - __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0)); - __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0)); + v_coef = result_coeffs[0]; + q_coefs = result_coeffs[1]; + + if (state->encoder_control->scaling_list.enable) { + scanord_read_vector_32(quant_coeff, scan, scan_idx, subpos, width, v_quant_coeffs); - __m256i low_b = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0)); - __m256i high_b = _mm256_unpackhi_epi16(v_quant_coeff, _mm256_set1_epi16(0)); + v_quant_coeff_lo = v_quant_coeffs[0]; + v_quant_coeff_hi = v_quant_coeffs[1]; - __m256i v_level32_a = _mm256_madd_epi16(low_a, low_b); - __m256i v_level32_b = _mm256_madd_epi16(high_a, high_b); + low_b = _mm256_permute2x128_si256(v_quant_coeff_lo, + v_quant_coeff_hi, + 0x20); + + high_b = _mm256_permute2x128_si256(v_quant_coeff_lo, + v_quant_coeff_hi, + 0x31); + } + + __m256i v_level = _mm256_abs_epi16(v_coef); + __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_setzero_si256()); + __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_setzero_si256()); + + __m256i v_quant_coeff_a = _mm256_or_si256(low_b, _mm256_setzero_si256()); + __m256i v_quant_coeff_b = _mm256_or_si256(high_b, _mm256_setzero_si256()); + + __m256i v_level32_a = _mm256_mullo_epi32(low_a, low_b); + __m256i v_level32_b = _mm256_mullo_epi32(high_a, high_b); v_level32_a = 
_mm256_add_epi32(v_level32_a, _mm256_set1_epi32(add)); v_level32_b = _mm256_add_epi32(v_level32_b, _mm256_set1_epi32(add)); @@ -130,107 +484,26 @@ v_level = _mm256_packs_epi32(v_level32_a, v_level32_b); - __m256i v_coef = _mm256_loadu_si256((__m256i*)&(coef[n])); __m256i v_coef_a = _mm256_unpacklo_epi16(_mm256_abs_epi16(v_coef), _mm256_set1_epi16(0)); __m256i v_coef_b = _mm256_unpackhi_epi16(_mm256_abs_epi16(v_coef), _mm256_set1_epi16(0)); - __m256i v_quant_coeff_a = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0)); - __m256i v_quant_coeff_b = _mm256_unpackhi_epi16(v_quant_coeff, _mm256_set1_epi16(0)); - v_coef_a = _mm256_madd_epi16(v_coef_a, v_quant_coeff_a); - v_coef_b = _mm256_madd_epi16(v_coef_b, v_quant_coeff_b); + + v_coef_a = _mm256_mullo_epi32(v_coef_a, v_quant_coeff_a); + v_coef_b = _mm256_mullo_epi32(v_coef_b, v_quant_coeff_b); + v_coef_a = _mm256_sub_epi32(v_coef_a, _mm256_slli_epi32(_mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0)), q_bits) ); v_coef_b = _mm256_sub_epi32(v_coef_b, _mm256_slli_epi32(_mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0)), q_bits) ); v_coef_a = _mm256_srai_epi32(v_coef_a, q_bits8); v_coef_b = _mm256_srai_epi32(v_coef_b, q_bits8); - _mm_storeu_si128((__m128i*)&(delta_u[n+0*4]), _mm256_castsi256_si128(v_coef_a)); - _mm_storeu_si128((__m128i*)&(delta_u[n+2*4]), _mm256_extracti128_si256(v_coef_a, 1)); - _mm_storeu_si128((__m128i*)&(delta_u[n+1*4]), _mm256_castsi256_si128(v_coef_b)); - _mm_storeu_si128((__m128i*)&(delta_u[n+3*4]), _mm256_extracti128_si256(v_coef_b, 1)); - } - - if (ac_sum >= 2) { -#define SCAN_SET_SIZE 16 -#define LOG2_SCAN_SET_SIZE 4 - int32_t n, last_cg = -1, abssum = 0, subset, subpos; - for (subset = (width*height - 1) >> LOG2_SCAN_SET_SIZE; subset >= 0; subset--) { - int32_t first_nz_pos_in_cg = SCAN_SET_SIZE, last_nz_pos_in_cg = -1; - subpos = subset << LOG2_SCAN_SET_SIZE; - abssum = 0; - - // Find last coeff pos - for (n = SCAN_SET_SIZE - 1; n >= 0; n--) { - if (q_coef[scan[n + subpos]]) { - last_nz_pos_in_cg = n; - break; - } - } + __m256i deltas_h = _mm256_permute2x128_si256(v_coef_a, v_coef_b, 0x31); + __m256i deltas_l = _mm256_permute2x128_si256(v_coef_a, v_coef_b, 0x20); - // First coeff pos - for (n = 0; n <SCAN_SET_SIZE; n++) { - if (q_coef[scan[n + subpos]]) { - first_nz_pos_in_cg = n; - break; - } - } - - // Sum all kvz_quant coeffs between first and last - for (n = first_nz_pos_in_cg; n <= last_nz_pos_in_cg; n++) { - abssum += q_coef[scan[n + subpos]]; - } - - if (last_nz_pos_in_cg >= 0 && last_cg == -1) { - last_cg = 1; - } - - if (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4) { - int32_t signbit = (q_coef[scan[subpos + first_nz_pos_in_cg]] > 0 ? 0 : 1); - if (signbit != (abssum & 0x1)) { // compare signbit with sum_parity - int32_t min_cost_inc = 0x7fffffff, min_pos = -1, cur_cost = 0x7fffffff; - int16_t final_change = 0, cur_change = 0; - for (n = (last_cg == 1 ? last_nz_pos_in_cg : SCAN_SET_SIZE - 1); n >= 0; n--) { - uint32_t blkPos = scan[n + subpos]; - if (q_coef[blkPos] != 0) { - if (delta_u[blkPos] > 0) { - cur_cost = -delta_u[blkPos]; - cur_change = 1; - } - else if (n == first_nz_pos_in_cg && abs(q_coef[blkPos]) == 1) { - cur_cost = 0x7fffffff; - } - else { - cur_cost = delta_u[blkPos]; - cur_change = -1; - } - } - else if (n < first_nz_pos_in_cg && ((coef[blkPos] >= 0) ? 
0 : 1) != signbit) { - cur_cost = 0x7fffffff; - } - else { - cur_cost = -delta_u[blkPos]; - cur_change = 1; - } - - if (cur_cost < min_cost_inc) { - min_cost_inc = cur_cost; - final_change = cur_change; - min_pos = blkPos; - } - } // CG loop - - if (q_coef[min_pos] == 32767 || q_coef[min_pos] == -32768) { - final_change = -1; - } - - if (coef[min_pos] >= 0) q_coef[min_pos] += final_change; - else q_coef[min_pos] -= final_change; - } // Hide - } - if (last_cg == 1) last_cg = 0; - } + last_cg = hide_block_sign(v_coef, q_coefs, deltas_h, deltas_l, q_coef, scan, subpos, last_cg); + } +#undef VEC_WIDTH #undef SCAN_SET_SIZE #undef LOG2_SCAN_SET_SIZE - } } static INLINE __m128i get_residual_4x1_avx2(const kvz_pixel *a_in, const kvz_pixel *b_in){ @@ -375,7 +648,7 @@ kvz_transformskip(state->encoder_control, residual, coeff, width); } else { - kvz_transform2d(state->encoder_control, residual, coeff, width, (color == COLOR_Y ? 0 : 65535)); + kvz_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu->type); } // Quantize coeffs. (coeff -> coeff_out) @@ -408,7 +681,7 @@ kvz_itransformskip(state->encoder_control, residual, coeff, width); } else { - kvz_itransform2d(state->encoder_control, residual, coeff, width, (color == COLOR_Y ? 0 : 65535)); + kvz_itransform2d(state->encoder_control, residual, coeff, width, color, cur_cu->type); } // Get quantized reconstruction. (residual + pred_in -> rec_out) @@ -429,17 +702,6 @@ return has_coeffs; } -void kvz_quant_avx2(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width, - int32_t height, int8_t type, int8_t scan_idx, int8_t block_type) -{ - if (state->encoder_control->scaling_list.enable){ - kvz_quant_generic(state, coef, q_coef, width, height, type, scan_idx, block_type); - } - else { - kvz_quant_flat_avx2(state, coef, q_coef, width, height, type, scan_idx, block_type); - } -} - /** * \brief inverse quantize transformed and quantized coefficents * @@ -524,6 +786,81 @@ return parts[0] + parts[1] + parts[2] + parts[3]; } +#define TO_Q88(f) ((int16_t)((f) * 256.0f)) + +static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, int32_t qp) +{ +#define NUM_BUCKETS 5 + static const int16_t wt_m[NUM_BUCKETS] = { + TO_Q88(-0.004916), + TO_Q88( 0.010806), + TO_Q88( 0.055562), + TO_Q88( 0.033436), + TO_Q88(-0.007690), + }; + static const int16_t wt_c[NUM_BUCKETS] = { + TO_Q88( 0.172024), + TO_Q88( 3.421462), + TO_Q88( 2.879506), + TO_Q88( 5.585471), + TO_Q88( 0.256772), + }; + + const __m256i zero = _mm256_setzero_si256(); + const __m256i threes = _mm256_set1_epi16(3); + const __m256i ones = _mm256_srli_epi16(threes, 1); + const __m256i twos = _mm256_slli_epi16(ones, 1); + + __m256i wt[NUM_BUCKETS - 1]; + for (int32_t i = 0; i < NUM_BUCKETS - 1; i++) + wt[i] = _mm256_set1_epi16(wt_m[i] * qp + wt_c[i]); + + uint32_t wid_wt = width * (wt_m[NUM_BUCKETS - 1] * qp + wt_c[NUM_BUCKETS - 1]); + __m256i avx_inc = _mm256_setzero_si256(); + + for (int32_t i = 0; i < width * width; i += 16) { + __m256i curr = _mm256_loadu_si256((__m256i *)(coeff + i)); + __m256i curr_abs = _mm256_abs_epi16 (curr); + __m256i curr_max3 = _mm256_min_epi16 (curr_abs, threes); + + __m256i curr_eq_0 = _mm256_cmpeq_epi16(curr_max3, zero); + __m256i curr_eq_1 = _mm256_cmpeq_epi16(curr_max3, ones); + __m256i curr_eq_2 = _mm256_cmpeq_epi16(curr_max3, twos); + __m256i curr_eq_3 = _mm256_cmpeq_epi16(curr_max3, threes); + + __m256i curr_0_wt = _mm256_and_si256 (curr_eq_0, wt[0]); + __m256i curr_1_wt = _mm256_and_si256 (curr_eq_1, wt[1]); + 
__m256i curr_2_wt = _mm256_and_si256 (curr_eq_2, wt[2]); + __m256i curr_3_wt = _mm256_and_si256 (curr_eq_3, wt[3]); + + // Use madd to horizontally sum 16-bit weights into 32-bit atoms + __m256i wt_0_32b = _mm256_madd_epi16(curr_0_wt, ones); + __m256i wt_1_32b = _mm256_madd_epi16(curr_1_wt, ones); + __m256i wt_2_32b = _mm256_madd_epi16(curr_2_wt, ones); + __m256i wt_3_32b = _mm256_madd_epi16(curr_3_wt, ones); + + __m256i wt_01 = _mm256_add_epi32(wt_0_32b, wt_1_32b); + __m256i wt_23 = _mm256_add_epi32(wt_2_32b, wt_3_32b); + __m256i curr_wts = _mm256_add_epi32(wt_01, wt_23); + avx_inc = _mm256_add_epi32(avx_inc, curr_wts); + } + __m128i inchi = _mm256_extracti128_si256(avx_inc, 1); + __m128i inclo = _mm256_castsi256_si128 (avx_inc); + + __m128i sum_1 = _mm_add_epi32 (inclo, inchi); + __m128i sum_2 = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sum_3 = _mm_add_epi32 (sum_1, sum_2); + __m128i sum_4 = _mm_shuffle_epi32(sum_3, _MM_SHUFFLE(2, 3, 0, 1)); + __m128i sum = _mm_add_epi32 (sum_3, sum_4); + + uint32_t sum_u32 = _mm_cvtsi128_si32(sum); + uint32_t sum_total = sum_u32 + wid_wt; + return sum_total >> 8; +#undef NUM_BUCKETS +} + +#undef TO_Q88 + #endif //COMPILE_INTEL_AVX2 && defined X86_64 int kvz_strategy_register_quant_avx2(void* opaque, uint8_t bitdepth) @@ -537,6 +874,7 @@ success &= kvz_strategyselector_register(opaque, "dequant", "avx2", 40, &kvz_dequant_avx2); } success &= kvz_strategyselector_register(opaque, "coeff_abs_sum", "avx2", 0, &coeff_abs_sum_avx2); + success &= kvz_strategyselector_register(opaque, "fast_coeff_cost", "avx2", 40, &fast_coeff_cost_avx2); #endif //COMPILE_INTEL_AVX2 && defined X86_64 return success;
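The new fast_coeff_cost_avx2 registered above estimates coefficient coding cost with a small linear model in Q8.8 fixed point: each coefficient is bucketed by min(|c|, 3), every bucket weight is wt_m * qp + wt_c from the TO_Q88 tables, and the accumulated Q8.8 sum is shifted right by 8. A scalar sketch of the same model; passing the tables as parameters is purely an illustration, the real code uses the static wt_m/wt_c arrays shown in the diff:

#include <stdint.h>
#include <stdlib.h>

/* Illustrative scalar version of the Q8.8 coefficient-cost model. */
static uint32_t fast_coeff_cost_ref(const int16_t *coeff, int32_t width, int32_t qp,
                                    const int16_t wt_m[5], const int16_t wt_c[5])
{
  int32_t wt[4];
  for (int i = 0; i < 4; ++i)
    wt[i] = wt_m[i] * qp + wt_c[i];          /* per-bucket weight, Q8.8 */

  /* Constant per-block term, scaled by block width (not width * width). */
  int64_t sum = (int64_t)width * (wt_m[4] * qp + wt_c[4]);

  for (int32_t i = 0; i < width * width; ++i) {
    int32_t level = abs((int)coeff[i]);
    if (level > 3) level = 3;                /* same clamp as _mm256_min_epi16 */
    sum += wt[level];                        /* zero coefficients also pay wt[0] */
  }
  return (uint32_t)(sum >> 8);               /* drop the 8 fractional bits */
}

In the vector version the 16-bit bucket weights are widened with _mm256_madd_epi16 against a vector of ones before accumulation, which is what keeps the running sum from overflowing 16 bits.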
View file
kvazaar-1.3.0.tar.gz/src/strategies/avx2/reg_sad_pow2_widths-avx2.h
Added
@@ -0,0 +1,209 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#ifndef REG_SAD_POW2_WIDTHS_AVX2_H_ +#define REG_SAD_POW2_WIDTHS_AVX2_H_ + +#include "strategies/sse41/reg_sad_pow2_widths-sse41.h" +#include "kvazaar.h" + +static INLINE uint32_t reg_sad_w32(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + __m256i avx_inc = _mm256_setzero_si256(); + int32_t y; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + for (y = 0; y < height_fourline_groups; y += 4) { + __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1)); + __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2)); + __m256i c = _mm256_loadu_si256((const __m256i *)(data1 + (y + 1) * stride1)); + __m256i d = _mm256_loadu_si256((const __m256i *)(data2 + (y + 1) * stride2)); + __m256i e = _mm256_loadu_si256((const __m256i *)(data1 + (y + 2) * stride1)); + __m256i f = _mm256_loadu_si256((const __m256i *)(data2 + (y + 2) * stride2)); + __m256i g = _mm256_loadu_si256((const __m256i *)(data1 + (y + 3) * stride1)); + __m256i h = _mm256_loadu_si256((const __m256i *)(data2 + (y + 3) * stride2)); + + __m256i curr_sads_ab = _mm256_sad_epu8(a, b); + __m256i curr_sads_cd = _mm256_sad_epu8(c, d); + __m256i curr_sads_ef = _mm256_sad_epu8(e, f); + __m256i curr_sads_gh = _mm256_sad_epu8(g, h); + + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_cd); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ef); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1)); + __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2)); + + __m256i curr_sads = _mm256_sad_epu8(a, b); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads); + } + } + + __m128i inchi = _mm256_extracti128_si256(avx_inc, 1); + __m128i inclo = _mm256_castsi256_si128 (avx_inc); + + __m128i sum_1 = _mm_add_epi64 (inclo, inchi); + __m128i sum_2 = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sum_1, sum_2); + + return _mm_cvtsi128_si32(sad); +} + +static INLINE uint32_t reg_sad_w64(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + __m256i avx_inc = _mm256_setzero_si256(); + int32_t y; + + const int32_t height_twoline_groups = height & ~1; + const int32_t height_residual_lines 
= height & 1; + + for (y = 0; y < height_twoline_groups; y += 2) { + __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1)); + __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2)); + __m256i c = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1 + 32)); + __m256i d = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2 + 32)); + + __m256i e = _mm256_loadu_si256((const __m256i *)(data1 + (y + 1) * stride1)); + __m256i f = _mm256_loadu_si256((const __m256i *)(data2 + (y + 1) * stride2)); + __m256i g = _mm256_loadu_si256((const __m256i *)(data1 + (y + 1) * stride1 + 32)); + __m256i h = _mm256_loadu_si256((const __m256i *)(data2 + (y + 1) * stride2 + 32)); + + __m256i curr_sads_ab = _mm256_sad_epu8(a, b); + __m256i curr_sads_cd = _mm256_sad_epu8(c, d); + __m256i curr_sads_ef = _mm256_sad_epu8(e, f); + __m256i curr_sads_gh = _mm256_sad_epu8(g, h); + + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_cd); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ef); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1)); + __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2)); + __m256i c = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1 + 32)); + __m256i d = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2 + 32)); + + __m256i curr_sads_ab = _mm256_sad_epu8(a, b); + __m256i curr_sads_cd = _mm256_sad_epu8(c, d); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_cd); + } + } + + __m128i inchi = _mm256_extracti128_si256(avx_inc, 1); + __m128i inclo = _mm256_castsi256_si128 (avx_inc); + + __m128i sum_1 = _mm_add_epi64 (inclo, inchi); + __m128i sum_2 = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sum_1, sum_2); + + return _mm_cvtsi128_si32(sad); +} + +static uint32_t hor_sad_avx2_w32(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t pic_stride, uint32_t ref_stride, + const uint32_t left, const uint32_t right) +{ + __m256i avx_inc = _mm256_setzero_si256(); + + const size_t block_width = 32; + const size_t block_width_log2 = 5; + const size_t lane_width = 16; + + const int32_t left_eq_wid = left >> block_width_log2; + const int32_t left_clamped = left - left_eq_wid; + const int32_t right_eq_wid = right >> block_width_log2; + const int32_t right_clamped = right - right_eq_wid; + + const __m256i zero = _mm256_setzero_si256(); + const __m256i lane_widths = _mm256_set1_epi8((uint8_t)lane_width); + const __m256i lefts = _mm256_set1_epi8((uint8_t)left_clamped); + const __m256i rights = _mm256_set1_epi8((uint8_t)right_clamped); + const __m256i unsign_mask = _mm256_set1_epi8(0x7f); + const __m256i ns = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + + const __m256i rightmost_good_idx = _mm256_set1_epi8((uint8_t)(block_width - right - 1)); + + const __m256i shufmask1_l = _mm256_sub_epi8 (ns, lefts); + const __m256i shufmask1_r = _mm256_add_epi8 (shufmask1_l, rights); + const __m256i shufmask1 = _mm256_and_si256 (shufmask1_r, unsign_mask); + + const __m256i epol_mask_r = _mm256_min_epi8 (ns, rightmost_good_idx); + const __m256i epol_mask = _mm256_max_epi8 (lefts, epol_mask_r); + + const __m256i mlo2hi_mask_l = 
_mm256_cmpgt_epi8(lefts, ns); + const __m256i mlo2hi_imask_r = _mm256_cmpgt_epi8(lane_widths, shufmask1); + const __m256i mlo2hi_mask_r = _mm256_cmpeq_epi8(mlo2hi_imask_r, zero); + + // For left != 0, use low lane of mlo2hi_mask_l as blend mask for high lane. + // For right != 0, use low lane of mlo2hi_mask_r as blend mask for low lane. + const __m256i xchg_mask1 = _mm256_permute2x128_si256(mlo2hi_mask_l, mlo2hi_mask_r, 0x02); + + // If left != 0 (ie. right == 0), the xchg should only affect high lane, + // if right != 0 (ie. left == 0), the low lane. Set bits on the lane that + // the xchg should affect. left == right == 0 should never happen, this'll + // break if it does. + const __m256i lanes_llo_rhi = _mm256_blend_epi32(lefts, rights, 0xf0); + const __m256i xchg_lane_mask = _mm256_cmpeq_epi32(lanes_llo_rhi, zero); + + const __m256i xchg_data_mask = _mm256_and_si256(xchg_mask1, xchg_lane_mask); + + // If we're straddling the left border, start from the left border instead, + // and if right border, end on the border + const int32_t ld_offset = left - right; + + int32_t y; + for (y = 0; y < height; y++) { + __m256i a = _mm256_loadu_si256((__m256i *)(pic_data + (y + 0) * pic_stride + 0)); + __m256i b = _mm256_loadu_si256((__m256i *)(ref_data + (y + 0) * ref_stride + 0 + ld_offset)); + + __m256i b_shifted = _mm256_shuffle_epi8 (b, shufmask1); + __m256i b_lanes_reversed = _mm256_permute4x64_epi64(b_shifted, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i b_data_transfered = _mm256_blendv_epi8 (b_shifted, b_lanes_reversed, xchg_data_mask); + __m256i b_epoled = _mm256_shuffle_epi8 (b_data_transfered, epol_mask); + + __m256i curr_sads_ab = _mm256_sad_epu8(a, b_epoled); + + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab); + } + __m128i inchi = _mm256_extracti128_si256(avx_inc, 1); + __m128i inclo = _mm256_castsi256_si128 (avx_inc); + + __m128i sum_1 = _mm_add_epi64 (inclo, inchi); + __m128i sum_2 = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sum_1, sum_2); + + return _mm_cvtsi128_si32(sad); +} + +#endif
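Editor's note on reg_sad_pow2_widths-avx2.h: reg_sad_w32 and reg_sad_w64 compute a plain sum of absolute differences, one _mm256_sad_epu8 per 32 pixels, with the per-lane partial sums folded together at the end, while hor_sad_avx2_w32 additionally appears to extrapolate pixels across the left/right picture border with byte shuffles before taking the SAD. A scalar sketch of what reg_sad_w32 computes, assuming the 8-bit kvz_pixel build (the _ref name is illustrative):

#include <stdint.h>
#include <stdlib.h>

typedef uint8_t kvz_pixel;   /* 8-bit build; kvazaar defines this in kvazaar.h */

/* Scalar reference for reg_sad_w32: SAD over a 32-pixel-wide block. */
static uint32_t reg_sad_w32_ref(const kvz_pixel *data1, const kvz_pixel *data2,
                                int32_t height, uint32_t stride1, uint32_t stride2)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < height; y++) {
    for (int32_t x = 0; x < 32; x++) {
      sad += (uint32_t)abs((int)data1[y * stride1 + x] - (int)data2[y * stride2 + x]);
    }
  }
  return sad;
}

The AVX2 version only changes how many rows are consumed per loop iteration (four for w32, two for w64, plus a residual-line loop); the arithmetic result is the same.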
View file
kvazaar-1.3.0.tar.gz/src/strategies/generic/encode_coding_tree-generic.c
Added
@@ -0,0 +1,279 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "strategyselector.h" + +#include "cabac.h" +#include "context.h" +#include "encode_coding_tree-generic.h" +#include "encode_coding_tree.h" + +void kvz_encode_coeff_nxn_generic(encoder_state_t * const state, + cabac_data_t * const cabac, + const coeff_t *coeff, + uint8_t width, + uint8_t type, + int8_t scan_mode, + int8_t tr_skip) +{ + const encoder_control_t * const encoder = state->encoder_control; + int c1 = 1; + uint8_t last_coeff_x = 0; + uint8_t last_coeff_y = 0; + int32_t i; + uint32_t sig_coeffgroup_flag[8 * 8] = { 0 }; + + int8_t be_valid = encoder->cfg.signhide_enable; + int32_t scan_pos_sig; + uint32_t go_rice_param = 0; + uint32_t blk_pos, pos_y, pos_x, sig, ctx_sig; + + // CONSTANTS + const uint32_t num_blk_side = width >> TR_MIN_LOG2_SIZE; + const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2; + const uint32_t *scan = + kvz_g_sig_last_scan[scan_mode][log2_block_size - 1]; + const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 2][scan_mode]; + + // Init base contexts according to block type + cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.cu_sig_coeff_group_model[type]); + cabac_ctx_t *baseCtx = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0]) : + &(cabac->ctx.cu_sig_model_chroma[0]); + + // Scan all coeff groups to find out which of them have coeffs. + // Populate sig_coeffgroup_flag with that info. + + unsigned sig_cg_cnt = 0; + for (int cg_y = 0; cg_y < width / 4; ++cg_y) { + for (int cg_x = 0; cg_x < width / 4; ++cg_x) { + unsigned cg_pos = cg_y * width * 4 + cg_x * 4; + for (int coeff_row = 0; coeff_row < 4; ++coeff_row) { + // Load four 16-bit coeffs and see if any of them are non-zero. + unsigned coeff_pos = cg_pos + coeff_row * width; + uint64_t four_coeffs = *(uint64_t*)(&coeff[coeff_pos]); + if (four_coeffs) { + ++sig_cg_cnt; + unsigned cg_pos_y = (cg_pos >> log2_block_size) >> TR_MIN_LOG2_SIZE; + unsigned cg_pos_x = (cg_pos & (width - 1)) >> TR_MIN_LOG2_SIZE; + sig_coeffgroup_flag[cg_pos_x + cg_pos_y * num_blk_side] = 1; + break; + } + } + } + } + + // Rest of the code assumes at least one non-zero coeff. + assert(sig_cg_cnt > 0); + + // Find the last coeff group by going backwards in scan order. + unsigned scan_cg_last = num_blk_side * num_blk_side - 1; + while (!sig_coeffgroup_flag[scan_cg[scan_cg_last]]) { + --scan_cg_last; + } + + // Find the last coeff by going backwards in scan order. 
+ unsigned scan_pos_last = scan_cg_last * 16 + 15; + while (!coeff[scan[scan_pos_last]]) { + --scan_pos_last; + } + + int pos_last = scan[scan_pos_last]; + + // transform skip flag + if(width == 4 && encoder->cfg.trskip_enable) { + cabac->cur_ctx = (type == 0) ? &(cabac->ctx.transform_skip_model_luma) : &(cabac->ctx.transform_skip_model_chroma); + CABAC_BIN(cabac, tr_skip, "transform_skip_flag"); + } + + last_coeff_x = pos_last & (width - 1); + last_coeff_y = (uint8_t)(pos_last >> log2_block_size); + + // Code last_coeff_x and last_coeff_y + kvz_encode_last_significant_xy(cabac, + last_coeff_x, + last_coeff_y, + width, + width, + type, + scan_mode); + + scan_pos_sig = scan_pos_last; + + // significant_coeff_flag + for (i = scan_cg_last; i >= 0; i--) { + int32_t sub_pos = i << 4; // LOG2_SCAN_SET_SIZE; + int32_t abs_coeff[16]; + int32_t cg_blk_pos = scan_cg[i]; + int32_t cg_pos_y = cg_blk_pos / num_blk_side; + int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * num_blk_side); + + uint32_t coeff_signs = 0; + int32_t last_nz_pos_in_cg = -1; + int32_t first_nz_pos_in_cg = 16; + int32_t num_non_zero = 0; + go_rice_param = 0; + + if (scan_pos_sig == scan_pos_last) { + abs_coeff[0] = abs(coeff[pos_last]); + coeff_signs = (coeff[pos_last] < 0); + num_non_zero = 1; + last_nz_pos_in_cg = scan_pos_sig; + first_nz_pos_in_cg = scan_pos_sig; + scan_pos_sig--; + } + + if (i == scan_cg_last || i == 0) { + sig_coeffgroup_flag[cg_blk_pos] = 1; + } else { + uint32_t sig_coeff_group = (sig_coeffgroup_flag[cg_blk_pos] != 0); + uint32_t ctx_sig = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, + cg_pos_y, width); + cabac->cur_ctx = &base_coeff_group_ctx[ctx_sig]; + CABAC_BIN(cabac, sig_coeff_group, "coded_sub_block_flag"); + } + + if (sig_coeffgroup_flag[cg_blk_pos]) { + int32_t pattern_sig_ctx = kvz_context_calc_pattern_sig_ctx(sig_coeffgroup_flag, + cg_pos_x, cg_pos_y, width); + + for (; scan_pos_sig >= sub_pos; scan_pos_sig--) { + blk_pos = scan[scan_pos_sig]; + pos_y = blk_pos >> log2_block_size; + pos_x = blk_pos - (pos_y << log2_block_size); + sig = (coeff[blk_pos] != 0) ? 1 : 0; + + if (scan_pos_sig > sub_pos || i == 0 || num_non_zero) { + ctx_sig = kvz_context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y, + log2_block_size, type); + cabac->cur_ctx = &baseCtx[ctx_sig]; + CABAC_BIN(cabac, sig, "sig_coeff_flag"); + } + + if (sig) { + abs_coeff[num_non_zero] = abs(coeff[blk_pos]); + coeff_signs = 2 * coeff_signs + (coeff[blk_pos] < 0); + num_non_zero++; + + if (last_nz_pos_in_cg == -1) { + last_nz_pos_in_cg = scan_pos_sig; + } + + first_nz_pos_in_cg = scan_pos_sig; + } + } + } else { + scan_pos_sig = sub_pos - 1; + } + + if (num_non_zero > 0) { + bool sign_hidden = last_nz_pos_in_cg - first_nz_pos_in_cg >= 4 /* SBH_THRESHOLD */ + && !encoder->cfg.lossless; + uint32_t ctx_set = (i > 0 && type == 0) ? 2 : 0; + cabac_ctx_t *base_ctx_mod; + int32_t num_c1_flag, first_c2_flag_idx, idx, first_coeff2; + + if (c1 == 0) { + ctx_set++; + } + + c1 = 1; + + base_ctx_mod = (type == 0) ? &(cabac->ctx.cu_one_model_luma[4 * ctx_set]) : + &(cabac->ctx.cu_one_model_chroma[4 * ctx_set]); + num_c1_flag = MIN(num_non_zero, C1FLAG_NUMBER); + first_c2_flag_idx = -1; + + for (idx = 0; idx < num_c1_flag; idx++) { + uint32_t symbol = (abs_coeff[idx] > 1) ? 
1 : 0; + cabac->cur_ctx = &base_ctx_mod[c1]; + CABAC_BIN(cabac, symbol, "coeff_abs_level_greater1_flag"); + + if (symbol) { + c1 = 0; + + if (first_c2_flag_idx == -1) { + first_c2_flag_idx = idx; + } + } else if ((c1 < 3) && (c1 > 0)) { + c1++; + } + } + + if (c1 == 0) { + base_ctx_mod = (type == 0) ? &(cabac->ctx.cu_abs_model_luma[ctx_set]) : + &(cabac->ctx.cu_abs_model_chroma[ctx_set]); + + if (first_c2_flag_idx != -1) { + uint8_t symbol = (abs_coeff[first_c2_flag_idx] > 2) ? 1 : 0; + cabac->cur_ctx = &base_ctx_mod[0]; + CABAC_BIN(cabac, symbol, "coeff_abs_level_greater2_flag"); + } + } + if (be_valid && sign_hidden) { + coeff_signs = coeff_signs >> 1; + if (!cabac->only_count) + if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) { + coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero-1); + } + CABAC_BINS_EP(cabac, coeff_signs , (num_non_zero - 1), "coeff_sign_flag"); + } else { + if (!cabac->only_count) + if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) + coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero); + CABAC_BINS_EP(cabac, coeff_signs, num_non_zero, "coeff_sign_flag"); + } + + if (c1 == 0 || num_non_zero > C1FLAG_NUMBER) { + first_coeff2 = 1; + + for (idx = 0; idx < num_non_zero; idx++) { + int32_t base_level = (idx < C1FLAG_NUMBER) ? (2 + first_coeff2) : 1; + + if (abs_coeff[idx] >= base_level) { + if (!cabac->only_count) { + if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFFS) + kvz_cabac_write_coeff_remain_encry(state, cabac, abs_coeff[idx] - base_level, go_rice_param, base_level); + else + kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param); + } else + kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param); + + if (abs_coeff[idx] > 3 * (1 << go_rice_param)) { + go_rice_param = MIN(go_rice_param + 1, 4); + } + } + + if (abs_coeff[idx] >= 2) { + first_coeff2 = 0; + } + } + } + } + } +} + +int kvz_strategy_register_encode_generic(void* opaque, uint8_t bitdepth) +{ + bool success = true; + + success &= kvz_strategyselector_register(opaque, "encode_coeff_nxn", "generic", 0, &kvz_encode_coeff_nxn_generic); + + return success; +}
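Editor's note on kvz_encode_coeff_nxn_generic above: before any CABAC coding it marks which 4x4 coefficient groups contain non-zero values by reading four 16-bit coefficients per row as one 64-bit word and testing it against zero. A small standalone sketch of just that detection step, with coeff_t taken as int16_t as in kvazaar; the helper name is illustrative, and memcpy stands in for the type-punned load used in the diff:

#include <stdint.h>
#include <string.h>
#include <stdbool.h>

typedef int16_t coeff_t;   /* kvazaar's coefficient type */

/* Returns true if the 4x4 coefficient group whose top-left corner is at
 * (cg_x*4, cg_y*4) contains any non-zero coefficient. */
static bool cg_has_coeffs(const coeff_t *coeff, int width, int cg_x, int cg_y)
{
  for (int row = 0; row < 4; ++row) {
    uint64_t four_coeffs;
    memcpy(&four_coeffs, &coeff[(cg_y * 4 + row) * width + cg_x * 4],
           sizeof(four_coeffs));
    if (four_coeffs) {
      return true;   /* at least one of the four 16-bit coefficients is non-zero */
    }
  }
  return false;
}

In the diff this test populates sig_coeffgroup_flag, which later drives both the coded_sub_block_flag signalling and the per-group significance scan.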
View file
kvazaar-1.3.0.tar.gz/src/strategies/generic/encode_coding_tree-generic.h
Added
@@ -0,0 +1,42 @@ +#ifndef ENCODE_CODING_TREE_GENERIC_H_ +#define ENCODE_CODING_TREE_GENERIC_H_ + +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +/** + * \file + * Functions for writing the coding quadtree and related syntax. + */ + +#include "encoderstate.h" +#include "global.h" + +void kvz_encode_coeff_nxn_generic(encoder_state_t * const state, + cabac_data_t * const cabac, + const coeff_t *coeff, + uint8_t width, + uint8_t type, + int8_t scan_mode, + int8_t tr_skip); + +int kvz_strategy_register_encode_generic(void* opaque, uint8_t bitdepth); + +#endif // ENCODE_CODING_TREE_GENERIC_H_
View file
kvazaar-1.2.0.tar.gz/src/strategies/generic/ipol-generic.c -> kvazaar-1.3.0.tar.gz/src/strategies/generic/ipol-generic.c
Changed
@@ -119,510 +119,541 @@ return temp; } -void kvz_filter_inter_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag) +void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) { //TODO: horizontal and vertical only filtering int32_t x, y; + + // Interpolation filter shifts int16_t shift1 = KVZ_BIT_DEPTH - 8; int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - - //coefficients for 1/4, 2/4 and 3/4 positions - int8_t *c0, *c1, *c2, *c3; - c0 = kvz_g_luma_filter[0]; - c1 = kvz_g_luma_filter[1]; - c2 = kvz_g_luma_filter[2]; - c3 = kvz_g_luma_filter[3]; + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); - #define FILTER_OFFSET 3 - #define FILTER_SIZE 8 + // Select filters according to the fractional part of the x and y mv components + int8_t *hor_filter = kvz_g_luma_filter[mv[0] & 3]; + int8_t *ver_filter = kvz_g_luma_filter[mv[1] & 3]; - int16_t flipped_hor_filtered[4 * (LCU_WIDTH + 1) + FILTER_SIZE][(LCU_WIDTH + 1) + FILTER_SIZE]; + int16_t hor_filtered[KVZ_EXT_BLOCK_W_LUMA][LCU_WIDTH]; + int16_t hor_stride = LCU_WIDTH; - // Filter horizontally and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height + FILTER_SIZE - 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - // Original pixel - flipped_hor_filtered[4 * x + 0][y] = (c0[FILTER_OFFSET] * src[src_stride*ypos + xpos + FILTER_OFFSET]) >> shift1; - flipped_hor_filtered[4 * x + 1][y] = kvz_eight_tap_filter_hor_generic(c1, &src[src_stride*ypos + xpos]) >> shift1; - flipped_hor_filtered[4 * x + 2][y] = kvz_eight_tap_filter_hor_generic(c2, &src[src_stride*ypos + xpos]) >> shift1; - flipped_hor_filtered[4 * x + 3][y] = kvz_eight_tap_filter_hor_generic(c3, &src[src_stride*ypos + xpos]) >> shift1; + // Filter horizontally + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) { + for (x = 0; x < width; ++x) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + hor_filtered[y][x] = kvz_eight_tap_filter_hor_generic(hor_filter, &src[src_stride * ypos + xpos]) >> shift1; } } - // Filter vertically and flip x and y - for (x = 0; x < 4 * width; ++x) { - for (y = 0; y < height; ++y) { - int ypos = y; - int xpos = x; - dst[(4 * y + 0)*dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((c0[FILTER_OFFSET] * flipped_hor_filtered[xpos][ypos + FILTER_OFFSET] + offset23) >> shift2) >> shift3); - dst[(4 * y + 1)*dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(c1, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3); - dst[(4 * y + 2)*dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(c2, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3); - dst[(4 * y + 3)*dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(c3, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3); - + // Filter vertically + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + dst[y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_ver_16bit_generic(ver_filter, &hor_filtered[y][x], hor_stride) >> 
shift2) + wp_offset1) >> wp_shift1); } } } -void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) +void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) { //TODO: horizontal and vertical only filtering int32_t x, y; + + // Interpolation filter shifts int16_t shift1 = KVZ_BIT_DEPTH - 8; int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - //coefficients for 1/4, 2/4 and 3/4 positions - int8_t *hor_filter = kvz_g_luma_filter[mv[0]&3]; - int8_t *ver_filter = kvz_g_luma_filter[mv[1]&3]; + // Select filters according to the fractional part of the x and y mv components + int8_t *hor_filter = kvz_g_luma_filter[mv[0] & 3]; + int8_t *ver_filter = kvz_g_luma_filter[mv[1] & 3]; - int16_t flipped_hor_filtered[(LCU_WIDTH + 1) + FILTER_SIZE][(LCU_WIDTH + 1) + FILTER_SIZE]; + int16_t hor_filtered[KVZ_EXT_BLOCK_W_LUMA][LCU_WIDTH]; + int16_t hor_stride = LCU_WIDTH; - // Filter horizontally and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height + FILTER_SIZE - 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped_hor_filtered[x][y] = kvz_eight_tap_filter_hor_generic(hor_filter, &src[src_stride*ypos + xpos]) >> shift1; + // Filter horizontally + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) { + for (x = 0; x < width; ++x) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + hor_filtered[y][x] = kvz_eight_tap_filter_hor_generic(hor_filter, &src[src_stride * ypos + xpos]) >> shift1; } } - // Filter vertically and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height; ++y) { - int ypos = y; - int xpos = x; - dst[y*dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(ver_filter, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3); + // Filter vertically + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + dst[y * dst_stride + x] = kvz_eight_tap_filter_ver_16bit_generic(ver_filter, &hor_filtered[y][x], hor_stride) >> shift2; } } } -void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) +void kvz_filter_hpel_blocks_hor_ver_luma_generic(const encoder_control_t * encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + int8_t fme_level, + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t hpel_off_x, int8_t hpel_off_y) { - //TODO: horizontal and vertical only filtering - int32_t x, y; + int x, y, first_y; + + // Interpolation filter shifts int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - //coefficients for 1/4, 2/4 and 3/4 positions - int8_t *hor_filter = kvz_g_luma_filter[mv[0] & 3]; - int8_t *ver_filter = kvz_g_luma_filter[mv[1] & 3]; + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); - int16_t flipped_hor_filtered[(LCU_WIDTH + 
1) + FILTER_SIZE][(LCU_WIDTH + 1) + FILTER_SIZE]; + int8_t *fir0 = kvz_g_luma_filter[0]; + int8_t *fir2 = kvz_g_luma_filter[2]; - // Filter horizontally and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height + FILTER_SIZE - 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped_hor_filtered[x][y] = kvz_eight_tap_filter_hor_generic(hor_filter, &src[src_stride*ypos + xpos]) >> shift1; + int16_t dst_stride = LCU_WIDTH; + int16_t hor_stride = LCU_WIDTH; + int32_t first_row_offset = (KVZ_LUMA_FILTER_OFFSET + 1) * hor_stride; + + int16_t *col_pos0 = hor_first_cols[0]; + int16_t *col_pos2 = hor_first_cols[2]; + + // Horizontally filtered samples from the top row are + // not needed unless samples for diagonal positions are filtered later. + first_y = fme_level > 1 ? 0 : 1; + + // HORIZONTAL STEP + // Integer pixels + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + for (x = 0; x < width; ++x) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; + hor_intermediate[0][y * hor_stride + x] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1; } } - // Filter vertically and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height; ++y) { - int ypos = y; - int xpos = x; - dst[y*dst_stride + x] = (kvz_eight_tap_filter_hor_16bit_generic(ver_filter, &flipped_hor_filtered[xpos][ypos])) >> shift2; + // Write the first column in contiguous memory + x = 0; + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + col_pos0[y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1; + } + + // Half pixels + for (y = first_y; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + for (x = 0; x < width; ++x) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; + hor_intermediate[1][y * hor_stride + x] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1; } } -} -/** - * \brief Interpolation for chroma half-pixel - * \param src source image in integer pels (-2..width+3, -2..height+3) - * \param src_stride stride of source image - * \param width width of source image block - * \param height height of source image block - * \param dst destination image in half-pixel resolution - * \param dst_stride stride of destination image - * - */ -void kvz_filter_inter_halfpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag) -{ - /* ____________ - * | B0,0|ae0,0| - * |ea0,0|ee0,0| - * - * ae0,0 = (-4*B-1,0 + 36*B0,0 + 36*B1,0 - 4*B2,0) >> shift1 - * ea0,0 = (-4*B0,-1 + 36*B0,0 + 36*B0,1 - 4*B0,2) >> shift1 - * ee0,0 = (-4*ae0,-1 + 36*ae0,0 + 36*ae0,1 - 4*ae0,2) >> shift2 - */ - int32_t x, y; - int32_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset3 = 1 << (shift3 - 1); - int32_t offset23 = 1 << (shift2 + shift3 - 1); - - int8_t* c = kvz_g_chroma_filter[4]; - int16_t temp[4] = {0,0,0,0}; - - // Loop source pixels and generate four filtered half-pel pixels on each round - for (y = 0; y < height; y++) { - int dst_pos_y = (y << 1)*dst_stride; - int src_pos_y = y*src_stride; - for (x = 0; x < width; x++) { - // Calculate current dst and src pixel positions - int dst_pos = dst_pos_y + (x << 1); - int src_pos = src_pos_y + x; - - // Original pixel 
(not really needed) - dst[dst_pos] = src[src_pos]; //B0,0 - - // ae0,0 - We need this only when hor_flag and for ee0,0 - if (hor_flag) { - temp[1] = kvz_four_tap_filter_hor_generic(c, &src[src_pos - 1]) >> shift1; // ae0,0 - } - // ea0,0 - needed only when ver_flag - if (ver_flag) { - dst[dst_pos + 1 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_generic(c, &src[src_pos - src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); // ea0,0 - } + // Write the first column in contiguous memory + x = 0; + for (y = first_y; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + col_pos2[y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1; + } - // When both flags, we use _only_ this pixel (but still need ae0,0 for it) - if (hor_flag && ver_flag) { - // Calculate temporary values.. - src_pos -= src_stride; //0,-1 - temp[0] = (kvz_four_tap_filter_hor_generic(c, &src[src_pos - 1]) >> shift1); // ae0,-1 - src_pos += 2 * src_stride; //0,1 - temp[2] = (kvz_four_tap_filter_hor_generic(c, &src[src_pos - 1]) >> shift1); // ae0,1 - src_pos += src_stride; //0,2 - temp[3] = (kvz_four_tap_filter_hor_generic(c, &src[src_pos - 1]) >> shift1); // ae0,2 - - dst[dst_pos + 1 * dst_stride + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(c, temp) + offset23) >> shift2) >> shift3); // ee0,0 - } + // VERTICAL STEP - if (hor_flag) { - dst[dst_pos + 1] = kvz_fast_clip_32bit_to_pixel((temp[1] + offset3) >> shift3); - } + // Right + // Only horizontal filter + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + filtered[1][y * dst_stride + x] = kvz_fast_clip_16bit_to_pixel((hor_intermediate[1][first_row_offset + y * hor_stride + x] + wp_offset1) >> wp_shift1); + } + } + + // Left + // Copy from the right filtered block and the extra column + for (y = 0; y < height; ++y) { + x = 0; + filtered[0][y * dst_stride + x] = kvz_fast_clip_16bit_to_pixel((col_pos2[y + KVZ_LUMA_FILTER_OFFSET + 1] + wp_offset1) >> wp_shift1); + for (x = 1; x < width; ++x) filtered[0][y * dst_stride + x] = filtered[1][y * dst_stride + x - 1]; + } + + // Top + // Only vertical filter + for (y = 0; y < height; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + for (x = 0; x < width; ++x) { + int xpos = x; + int16_t sample = kvz_eight_tap_filter_ver_generic(fir2, &src[src_stride*ypos + xpos + 1], src_stride) >> shift1; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[2][y * dst_stride + x] = sample; } } + + // Bottom + // Copy what can be copied from the top filtered values. + // Then filter the last row from horizontal intermediate buffer. 
+ for (y = 0; y < height - 1; ++y) { + for (x = 0; x < width; ++x) filtered[3][y * dst_stride + x] = filtered[2][(y + 1) * dst_stride + x]; + } + + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + for (x = 0; x < width; ++x) { + int xpos = x; + int16_t sample = kvz_eight_tap_filter_ver_generic(fir2, &src[src_stride*(ypos + 1) + xpos + 1], src_stride) >> shift1; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[3][y * dst_stride + x] = sample; + } } -void kvz_filter_inter_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag) +void kvz_filter_hpel_blocks_diag_luma_generic(const encoder_control_t * encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + int8_t fme_level, + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t hpel_off_x, int8_t hpel_off_y) { + int x, y; - int32_t x, y; - int32_t shift1 = KVZ_BIT_DEPTH - 8; + // Interpolation filter shifts int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset3 = 1 << (shift3 - 1); - int32_t offset23 = 1 << (shift2 + shift3 - 1); - - //coefficients for 1/8, 2/8, 3/8, 4/8, 5/8, 6/8 and 7/8 positions - int8_t *c1, *c2, *c3, *c4, *c5, *c6, *c7; - - int i; - c1 = kvz_g_chroma_filter[1]; - c2 = kvz_g_chroma_filter[2]; - c3 = kvz_g_chroma_filter[3]; - c4 = kvz_g_chroma_filter[4]; - c5 = kvz_g_chroma_filter[5]; - c6 = kvz_g_chroma_filter[6]; - c7 = kvz_g_chroma_filter[7]; - - int16_t temp[7][4]; // Temporary horizontal values calculated from integer pixels - - - // Loop source pixels and generate 64 filtered 1/8-pel pixels on each round - for (y = 0; y < height; y++) { - int dst_pos_y = (y << 3)*dst_stride; - int src_pos_y = y*src_stride; - for (x = 0; x < width; x++) { - // Calculate current dst and src pixel positions - int dst_pos = dst_pos_y + (x << 3); - int src_pos = src_pos_y + x; - - // Original pixel - dst[dst_pos] = src[src_pos]; - - // Horizontal 1/8-values - if (hor_flag && !ver_flag) { - - temp[0][1] = (kvz_four_tap_filter_hor_generic(c1, &src[src_pos - 1]) >> shift1); // ae0,0 h0 - temp[1][1] = (kvz_four_tap_filter_hor_generic(c2, &src[src_pos - 1]) >> shift1); - temp[2][1] = (kvz_four_tap_filter_hor_generic(c3, &src[src_pos - 1]) >> shift1); - temp[3][1] = (kvz_four_tap_filter_hor_generic(c4, &src[src_pos - 1]) >> shift1); - temp[4][1] = (kvz_four_tap_filter_hor_generic(c5, &src[src_pos - 1]) >> shift1); - temp[5][1] = (kvz_four_tap_filter_hor_generic(c6, &src[src_pos - 1]) >> shift1); - temp[6][1] = (kvz_four_tap_filter_hor_generic(c7, &src[src_pos - 1]) >> shift1); - } + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); - // Vertical 1/8-values - if (ver_flag) { - dst[dst_pos + 1 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_generic(c1, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); // - dst[dst_pos + 2 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_generic(c2, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 3 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_generic(c3, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); 
- dst[dst_pos + 4 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_generic(c4, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 5 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_generic(c5, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 6 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_generic(c6, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 7 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_generic(c7, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - } + int8_t *fir2 = kvz_g_luma_filter[2]; - // When both flags, interpolate values from temporary horizontal values - if (hor_flag && ver_flag) { + int16_t dst_stride = LCU_WIDTH; + int16_t hor_stride = LCU_WIDTH; - // Calculate temporary values - src_pos -= 1 * src_stride; //0,-3 - for (i = 0; i < 4; ++i) { + // Horizontal positions + int16_t *col_pos2 = hor_first_cols[2]; - temp[0][i] = (kvz_four_tap_filter_hor_generic(c1, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[1][i] = (kvz_four_tap_filter_hor_generic(c2, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[2][i] = (kvz_four_tap_filter_hor_generic(c3, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[3][i] = (kvz_four_tap_filter_hor_generic(c4, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[4][i] = (kvz_four_tap_filter_hor_generic(c5, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[5][i] = (kvz_four_tap_filter_hor_generic(c6, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[6][i] = (kvz_four_tap_filter_hor_generic(c7, &src[src_pos + i * src_stride - 1]) >> shift1); - - } + // VERTICAL STEP + // Top-right + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(fir2, &hor_intermediate[1][y * hor_stride + x], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[1][y * dst_stride + x] = sample; + } + } - //Calculate values from temporary horizontal 1/8-values - for (i = 0; i<7; ++i){ - dst[dst_pos + 1 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(c1, &temp[i][0]) + offset23) >> shift2) >> shift3); // ee0,0 - dst[dst_pos + 2 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(c2, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 3 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(c3, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 4 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(c4, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 5 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(c5, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 6 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(c6, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 7 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(c7, &temp[i][0]) + offset23) >> shift2) >> shift3); - - } + for (y = 0; y < height; ++y) { + x = 0; + filtered[0][y * dst_stride + x] = 
kvz_fast_clip_16bit_to_pixel((col_pos2[y + KVZ_LUMA_FILTER_OFFSET + 1] + wp_offset1) >> wp_shift1); + for (x = 1; x < width; ++x) filtered[0][y * dst_stride + x] = filtered[1][y * dst_stride + x - 1]; + } - } + // Top-left + // Copy what can be copied from top-right filtered values. Filter the first column from the column array. + for (y = 0; y < height; ++y) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(fir2, &col_pos2[y]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[0][y * dst_stride + x] = sample; + for (x = 1; x < width; ++x) filtered[0][y * dst_stride + x] = filtered[1][y * dst_stride + x - 1]; + } - if (hor_flag) { - dst[dst_pos + 1] = kvz_fast_clip_32bit_to_pixel((temp[0][1] + offset3) >> shift3); - dst[dst_pos + 2] = kvz_fast_clip_32bit_to_pixel((temp[1][1] + offset3) >> shift3); - dst[dst_pos + 3] = kvz_fast_clip_32bit_to_pixel((temp[2][1] + offset3) >> shift3); - dst[dst_pos + 4] = kvz_fast_clip_32bit_to_pixel((temp[3][1] + offset3) >> shift3); - dst[dst_pos + 5] = kvz_fast_clip_32bit_to_pixel((temp[4][1] + offset3) >> shift3); - dst[dst_pos + 6] = kvz_fast_clip_32bit_to_pixel((temp[5][1] + offset3) >> shift3); - dst[dst_pos + 7] = kvz_fast_clip_32bit_to_pixel((temp[6][1] + offset3) >> shift3); - } + // Bottom-right + // Copy what can be copied from top-right filtered values. Filter the last row. + for (y = 0; y < height - 1; ++y) { + for (x = 0; x < width; ++x) filtered[3][y* dst_stride + x] = filtered[1][(y + 1) * dst_stride + x]; + } + for (x = 0; x < width; ++x) { + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(fir2, &hor_intermediate[1][(y + 1) * hor_stride + x], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[3][y * dst_stride + x] = sample; + } - } + // Bottom-left + // Copy what can be copied from the top-left filtered values. + // Copy what can be copied from the bottom-right filtered values. + // Finally filter the last pixel from the column array. 
+ for (y = 0; y < height - 1; ++y) { + for (x = 0; x < width; ++x) filtered[2][y * dst_stride + x] = filtered[0][(y + 1) * dst_stride + x]; } + for (x = 1; x < width; ++x) filtered[2][y * dst_stride + x] = filtered[3][y * dst_stride + x - 1]; + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(fir2, &col_pos2[(y + 1)]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[2][y * dst_stride + x] = sample; } -void kvz_filter_hpel_blocks_hor_ver_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) +void kvz_filter_qpel_blocks_hor_ver_luma_generic(const encoder_control_t * encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + int8_t fme_level, + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t hpel_off_x, int8_t hpel_off_y) { int x, y; + + // Interpolation filter shifts int16_t shift1 = KVZ_BIT_DEPTH - 8; int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); + + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); int8_t *fir0 = kvz_g_luma_filter[0]; int8_t *fir2 = kvz_g_luma_filter[2]; + int8_t *fir1 = kvz_g_luma_filter[1]; + int8_t *fir3 = kvz_g_luma_filter[3]; - int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - - int16_t temp_stride = height + KVZ_EXT_PADDING + 1; - int16_t dst_stride = (LCU_WIDTH + 1); - - // Horizontal positions - for (x = 0; x < width + 1; ++x) { - for (y = 0; y < height + KVZ_EXT_PADDING + 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1; - flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1; + // Horiziontal positions. Positions 0 and 2 have already been calculated in filtered. + int16_t *hor_pos0 = hor_intermediate[0]; + int16_t *hor_pos2 = hor_intermediate[1]; + int16_t *hor_pos_l = hor_intermediate[3]; + int16_t *hor_pos_r = hor_intermediate[4]; + int8_t *hor_fir_l = hpel_off_x != 0 ? fir1 : fir3; + int8_t *hor_fir_r = hpel_off_x != 0 ? fir3 : fir1; + int16_t *col_pos_l = hor_first_cols[1]; + int16_t *col_pos_r = hor_first_cols[3]; + + int16_t dst_stride = LCU_WIDTH; + int16_t hor_stride = LCU_WIDTH; + + int16_t *hor_hpel_pos = hpel_off_x != 0 ? hor_pos2 : hor_pos0; + int16_t *col_pos_hor = hpel_off_x != 0 ? hor_first_cols[2] : hor_first_cols[0]; + + // Specify if integer pixels are filtered from left or/and top integer samples + int off_x_fir_l = hpel_off_x < 1 ? 0 : 1; + int off_x_fir_r = hpel_off_x < 0 ? 0 : 1; + int off_y_fir_t = hpel_off_y < 1 ? 0 : 1; + int off_y_fir_b = hpel_off_y < 0 ? 0 : 1; + + // HORIZONTAL STEP + // Left QPEL + int sample_off_y = hpel_off_y < 0 ? 
0 : 1; + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + for (x = 0; x < width; ++x) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; + hor_pos_l[y * hor_stride + x] = kvz_eight_tap_filter_hor_generic(hor_fir_l, &src[src_stride*ypos + xpos]) >> shift1; } } - // Filter vertically and flip x and y - for (x = 0; x < width + 1; ++x) { - for (y = 0; y < height + 1; ++y) { - filtered[HPEL_POS_HOR][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[HPEL_POS_VER][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - } + // Write the first column in contiguous memory + x = 0; + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + col_pos_l[y] = kvz_eight_tap_filter_hor_generic(hor_fir_l, &src[src_stride*ypos + xpos]) >> shift1; } -} -void kvz_filter_hpel_blocks_full_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) -{ - int x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); + // Right QPEL + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + for (x = 0; x < width; ++x) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; + hor_pos_r[y * hor_stride + x] = kvz_eight_tap_filter_hor_generic(hor_fir_r, &src[src_stride*ypos + xpos]) >> shift1; + } + } - int8_t *fir0 = kvz_g_luma_filter[0]; - int8_t *fir2 = kvz_g_luma_filter[2]; + // Write the first column in contiguous memory + x = 0; + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + col_pos_r[y] = kvz_eight_tap_filter_hor_generic(hor_fir_r, &src[src_stride*ypos + xpos]) >> shift1; + } - int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + // VERTICAL STEP + int8_t *ver_fir_l = hpel_off_y != 0 ? fir2 : fir0; + int8_t *ver_fir_r = hpel_off_y != 0 ? fir2 : fir0; + int8_t *ver_fir_t = hpel_off_y != 0 ? fir1 : fir3; + int8_t *ver_fir_b = hpel_off_y != 0 ? 
fir3 : fir1; + + // Left QPEL (1/4 or 3/4 x positions) + for (y = 0; y < height; ++y) { + if (!off_x_fir_l) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(ver_fir_l, &col_pos_l[y + sample_off_y]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[0][y * dst_stride + x] = sample; + } + for (x = !off_x_fir_l; x < width; ++x) { + int ypos = y + sample_off_y; + int xpos = x - !off_x_fir_l; + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(ver_fir_l, &hor_pos_l[ypos * hor_stride + xpos], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[0][y * dst_stride + x] = sample; + } + } - int16_t temp_stride = height + KVZ_EXT_PADDING + 1; - int16_t dst_stride = (LCU_WIDTH + 1); + // Right QPEL (3/4 or 1/4 x positions) + for (y = 0; y < height; ++y) { + if (!off_x_fir_r) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(ver_fir_r, &col_pos_r[y + sample_off_y]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[1][y * dst_stride + x] = sample; + } + for (x = !off_x_fir_r; x < width; ++x) { + int ypos = y + sample_off_y; + int xpos = x - !off_x_fir_r; + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(ver_fir_r, &hor_pos_r[ypos * hor_stride + xpos], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[1][y * dst_stride + x] = sample; + } + } - // Horizontal positions - for (x = 0; x < width + 1; ++x) { - for (y = 0; y < height + KVZ_EXT_PADDING + 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1; - flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1; + // Top QPEL (1/4 or 3/4 y positions) + int sample_off_x = (hpel_off_x > -1 ? 
1 : 0); + for (y = 0; y < height; ++y) { + if (!sample_off_x) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(ver_fir_t, &col_pos_hor[y + off_y_fir_t]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[2][y * dst_stride + x] = sample; + } + for (x = !sample_off_x; x < width; ++x) { + int ypos = y + off_y_fir_t; + int xpos = x - !sample_off_x; + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(ver_fir_t, &hor_hpel_pos[ypos * hor_stride + xpos], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[2][y * dst_stride + x] = sample; } } - // Filter vertically and flip x and y - for (x = 0; x < width + 1; ++x) { - for (y = 0; y < height + 1; ++y) { - filtered[HPEL_POS_HOR][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[HPEL_POS_VER][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[HPEL_POS_DIA][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + // Bottom QPEL (3/4 or 1/4 y positions) + for (y = 0; y < height; ++y) { + if (!sample_off_x) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(ver_fir_b, &col_pos_hor[y + off_y_fir_b]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[3][y * dst_stride + x] = sample; + } + for (x = !sample_off_x; x < width; ++x) { + int ypos = y + off_y_fir_b; + int xpos = x - !sample_off_x; + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(ver_fir_b, &hor_hpel_pos[ypos * hor_stride + xpos], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[3][y * dst_stride + x] = sample; } } } -void kvz_filter_qpel_blocks_hor_ver_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) +void kvz_filter_qpel_blocks_diag_luma_generic(const encoder_control_t * encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + int8_t fme_level, + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t hpel_off_x, int8_t hpel_off_y) { int x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; + + // Interpolation filter shifts int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - int8_t *fir0 = kvz_g_luma_filter[0]; - int8_t *fir2 = kvz_g_luma_filter[2]; + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); + int8_t *fir1 = kvz_g_luma_filter[1]; int8_t *fir3 = kvz_g_luma_filter[3]; - int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped1[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped3[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - - int16_t temp_stride = height + KVZ_EXT_PADDING + 1; - int16_t dst_stride = (LCU_WIDTH + 1); - - // Horizontal positions - for (x = 0; x < width + 1; ++x) { - for (y = 0; y < height + KVZ_EXT_PADDING + 1; 
++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1; - flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1; - flipped1[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir1, &src[src_stride*ypos + xpos]) >> shift1; - flipped3[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir3, &src[src_stride*ypos + xpos]) >> shift1; - } - } - - // Filter vertically and flip x and y - for (x = 0; x < width + 1; ++x) { - for (y = 0; y < height + 1; ++y) { - - // HPEL - filtered[ 0][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 1][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 2][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // QPEL - // Horizontal - filtered[ 3][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 4][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 5][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 6][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // Vertical - filtered[ 7][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 8][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 9][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[10][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + // Horiziontal positions. + int16_t *hor_pos_l = hor_intermediate[3]; + int16_t *hor_pos_r = hor_intermediate[4]; + + int16_t *col_pos_l = hor_first_cols[1]; + int16_t *col_pos_r = hor_first_cols[3]; + + int16_t dst_stride = LCU_WIDTH; + int16_t hor_stride = LCU_WIDTH; + + // VERTICAL STEP + int8_t *ver_fir_t = hpel_off_y != 0 ? fir1 : fir3; + int8_t *ver_fir_b = hpel_off_y != 0 ? fir3 : fir1; + + // Specify if integer pixels are filtered from left or/and top integer samples + int off_x_fir_l = hpel_off_x < 1 ? 0 : 1; + int off_x_fir_r = hpel_off_x < 0 ? 0 : 1; + int off_y_fir_t = hpel_off_y < 1 ? 0 : 1; + int off_y_fir_b = hpel_off_y < 0 ? 
0 : 1; + + // Top-left QPEL + for (y = 0; y < height; ++y) { + if (!off_x_fir_l) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(ver_fir_t, &col_pos_l[y + off_y_fir_t]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[0][y * dst_stride + x] = sample; + } + for (x = !off_x_fir_l; x < width; ++x) { + int ypos = y + off_y_fir_t; + int xpos = x - !off_x_fir_l; + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(ver_fir_t, &hor_pos_l[ypos * hor_stride + xpos], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[0][y * dst_stride + x] = sample; } } -} -void kvz_filter_qpel_blocks_full_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) -{ - int x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - - int8_t *fir0 = kvz_g_luma_filter[0]; - int8_t *fir2 = kvz_g_luma_filter[2]; - int8_t *fir1 = kvz_g_luma_filter[1]; - int8_t *fir3 = kvz_g_luma_filter[3]; - - int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped1[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped3[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + // Top-right QPEL + for (y = 0; y < height; ++y) { + if (!off_x_fir_r) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(ver_fir_t, &col_pos_r[y + off_y_fir_t]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[1][y * dst_stride + x] = sample; + } + for (x = !off_x_fir_r; x < width; ++x) { + int ypos = y + off_y_fir_t; + int xpos = x - !off_x_fir_r; + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(ver_fir_t, &hor_pos_r[ypos * hor_stride + xpos], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[1][y * dst_stride + x] = sample; + } + } - int16_t temp_stride = height + KVZ_EXT_PADDING + 1; - int16_t dst_stride = (LCU_WIDTH + 1); - - // Horizontal positions - for (x = 0; x < width + 1; ++x) { - for (y = 0; y < height + KVZ_EXT_PADDING + 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1; - flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1; - flipped1[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir1, &src[src_stride*ypos + xpos]) >> shift1; - flipped3[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir3, &src[src_stride*ypos + xpos]) >> shift1; - } - } - - // Filter vertically and flip x and y - for (x = 0; x < width + 1; ++x) { - for (y = 0; y < height + 1; ++y) { - - // HPEL - filtered[ 0][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 1][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 2][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // QPEL - // Horizontal - 
filtered[ 3][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 4][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 5][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 6][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // Vertical - filtered[ 7][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 8][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 9][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[10][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // Diagonal - filtered[11][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[12][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[13][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[14][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + // Bottom-left QPEL + for (y = 0; y < height; ++y) { + if (!off_x_fir_l) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(ver_fir_b, &col_pos_l[y + off_y_fir_b]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[2][y * dst_stride + x] = sample; + } + for (x = !off_x_fir_l; x < width; ++x) { + int ypos = y + off_y_fir_b; + int xpos = x - !off_x_fir_l; + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(ver_fir_b, &hor_pos_l[ypos * hor_stride + xpos], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[2][y * dst_stride + x] = sample; } } -} -void kvz_filter_frac_blocks_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block filtered[15], int8_t fme_level) -{ - switch (fme_level) { - case 1: - kvz_filter_hpel_blocks_hor_ver_luma_generic(encoder, src, src_stride, width, height, filtered); - break; - case 2: - kvz_filter_hpel_blocks_full_luma_generic(encoder, src, src_stride, width, height, filtered); - break; - case 3: - kvz_filter_qpel_blocks_hor_ver_luma_generic(encoder, src, src_stride, width, height, filtered); - break; - default: - kvz_filter_qpel_blocks_full_luma_generic(encoder, src, src_stride, width, height, filtered); - break; + 
// Bottom-right QPEL + for (y = 0; y < height; ++y) { + if (!off_x_fir_r) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(ver_fir_b, &col_pos_r[y + off_y_fir_b]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[3][y * dst_stride + x] = sample; + } + for (x = !off_x_fir_r; x < width; ++x) { + int ypos = y + off_y_fir_b; + int xpos = x - !off_x_fir_r; + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(ver_fir_b, &hor_pos_r[ypos * hor_stride + xpos], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[3][y * dst_stride + x] = sample; + } } } @@ -630,33 +661,35 @@ { //TODO: horizontal and vertical only filtering int32_t x, y; + + // Interpolation filter shifts int16_t shift1 = KVZ_BIT_DEPTH - 8; int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); + + // Select filters according to the fractional part of the x and y mv components int8_t *hor_filter = kvz_g_chroma_filter[mv[0] & 7]; int8_t *ver_filter = kvz_g_chroma_filter[mv[1] & 7]; -#define FILTER_SIZE_C (FILTER_SIZE / 2) -#define FILTER_OFFSET_C (FILTER_OFFSET / 2) - int16_t flipped_hor_filtered[(LCU_WIDTH_C + 1) + FILTER_SIZE_C][(LCU_WIDTH_C + 1) + FILTER_SIZE_C]; + int16_t hor_filtered[KVZ_EXT_BLOCK_W_CHROMA][LCU_WIDTH_C]; + int16_t hor_stride = LCU_WIDTH_C; - // Filter horizontally and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height + FILTER_SIZE_C - 1; ++y) { - int ypos = y - FILTER_OFFSET_C; - int xpos = x - FILTER_OFFSET_C; - flipped_hor_filtered[x][y] = kvz_four_tap_filter_hor_generic(hor_filter, &src[src_stride*ypos + xpos]) >> shift1; + // Filter horizontally + for (y = 0; y < height + KVZ_EXT_PADDING_CHROMA; ++y) { + for (x = 0; x < width; ++x) { + int ypos = y - KVZ_CHROMA_FILTER_OFFSET; + int xpos = x - KVZ_CHROMA_FILTER_OFFSET; + hor_filtered[y][x] = kvz_four_tap_filter_hor_generic(hor_filter, &src[src_stride * ypos + xpos]) >> shift1; } } - // Filter vertically and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height; ++y) { - int ypos = y; - int xpos = x; - dst[y*dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(ver_filter, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3); + // Filter vertically + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + dst[y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_16bit_generic(ver_filter, &hor_filtered[y][x], hor_stride) >> shift2) + wp_offset1) >> wp_shift1); } } } @@ -665,30 +698,31 @@ { //TODO: horizontal and vertical only filtering int32_t x, y; + + // Interpolation filter shifts int16_t shift1 = KVZ_BIT_DEPTH - 8; int32_t shift2 = 6; + + // Select filters according to the fractional part of the x and y mv components int8_t *hor_filter = kvz_g_chroma_filter[mv[0] & 7]; int8_t *ver_filter = kvz_g_chroma_filter[mv[1] & 7]; -#define FILTER_SIZE_C (FILTER_SIZE / 2) -#define FILTER_OFFSET_C (FILTER_OFFSET / 2) - int16_t flipped_hor_filtered[(LCU_WIDTH_C + 1) + FILTER_SIZE_C][(LCU_WIDTH_C + 1) + FILTER_SIZE_C]; + int16_t hor_filtered[KVZ_EXT_BLOCK_W_CHROMA][LCU_WIDTH_C]; + int16_t hor_stride = LCU_WIDTH_C; - // Filter horizontally and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height + FILTER_SIZE_C - 1; ++y) { - int ypos 
= y - FILTER_OFFSET_C; - int xpos = x - FILTER_OFFSET_C; - flipped_hor_filtered[x][y] = kvz_four_tap_filter_hor_generic(hor_filter, &src[src_stride*ypos + xpos]) >> shift1; + // Filter horizontally + for (y = 0; y < height + KVZ_EXT_PADDING_CHROMA; ++y) { + for (x = 0; x < width; ++x) { + int ypos = y - KVZ_CHROMA_FILTER_OFFSET; + int xpos = x - KVZ_CHROMA_FILTER_OFFSET; + hor_filtered[y][x] = kvz_four_tap_filter_hor_generic(hor_filter, &src[src_stride * ypos + xpos]) >> shift1; } } - // Filter vertically and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height; ++y) { - int ypos = y; - int xpos = x; - dst[y*dst_stride + x] = (kvz_four_tap_filter_hor_16bit_generic(ver_filter, &flipped_hor_filtered[xpos][ypos])) >> shift2; + // Filter vertically + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + dst[y * dst_stride + x] = kvz_four_tap_filter_ver_16bit_generic(ver_filter, &hor_filtered[y][x], hor_stride) >> shift2; } } } @@ -749,15 +783,14 @@ } } - int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth) { bool success = true; - success &= kvz_strategyselector_register(opaque, "filter_inter_quarterpel_luma", "generic", 0, &kvz_filter_inter_quarterpel_luma_generic); - success &= kvz_strategyselector_register(opaque, "filter_inter_halfpel_chroma", "generic", 0, &kvz_filter_inter_halfpel_chroma_generic); - success &= kvz_strategyselector_register(opaque, "filter_inter_octpel_chroma", "generic", 0, &kvz_filter_inter_octpel_chroma_generic); - success &= kvz_strategyselector_register(opaque, "filter_frac_blocks_luma", "generic", 0, &kvz_filter_frac_blocks_luma_generic); + success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_hor_ver_luma", "generic", 0, &kvz_filter_hpel_blocks_hor_ver_luma_generic); + success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_diag_luma", "generic", 0, &kvz_filter_hpel_blocks_diag_luma_generic); + success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_hor_ver_luma", "generic", 0, &kvz_filter_qpel_blocks_hor_ver_luma_generic); + success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_diag_luma", "generic", 0, &kvz_filter_qpel_blocks_diag_luma_generic); success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma", "generic", 0, &kvz_sample_quarterpel_luma_generic); success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma", "generic", 0, &kvz_sample_octpel_chroma_generic); success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "generic", 0, &kvz_sample_14bit_quarterpel_luma_generic);
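Note on the rewritten luma and chroma filters above: they follow the separable structure of HEVC fractional-pel interpolation, i.e. a horizontal FIR into a 16-bit intermediate (scaled by >> (bitdepth - 8)), then a vertical FIR, a 6-bit shift, and finally the weighted-prediction rounding and clip. A minimal scalar sketch of that pattern for one 8-tap luma sample, as hypothetical standalone code rather than the kvazaar API (it assumes src has enough border padding for the 8-tap window):

#include <stdint.h>

#define BITDEPTH 8

static int clip_pixel(int v) {
  const int max = (1 << BITDEPTH) - 1;
  return v < 0 ? 0 : (v > max ? max : v);
}

/* One output sample at integer position (x, y), with 8-tap filters fir_h and
 * fir_v selected by the fractional parts of the motion vector. */
uint8_t interp_sample(const uint8_t *src, int stride, int x, int y,
                      const int8_t fir_h[8], const int8_t fir_v[8])
{
  const int shift1    = BITDEPTH - 8;       /* horizontal scaling        */
  const int shift2    = 6;                  /* vertical filter shift     */
  const int wp_shift  = 14 - BITDEPTH;      /* weighted-prediction shift */
  const int wp_offset = 1 << (wp_shift - 1);

  int16_t col[8];                           /* horizontal intermediates  */
  for (int i = 0; i < 8; i++) {
    int acc = 0;
    for (int j = 0; j < 8; j++) {
      acc += fir_h[j] * src[(y + i - 3) * stride + (x + j - 3)];
    }
    col[i] = (int16_t)(acc >> shift1);
  }

  int acc = 0;                              /* vertical pass */
  for (int i = 0; i < 8; i++) {
    acc += fir_v[i] * col[i];
  }
  acc >>= shift2;
  return (uint8_t)clip_pixel((acc + wp_offset) >> wp_shift);
}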
View file
kvazaar-1.2.0.tar.gz/src/strategies/generic/ipol-generic.h -> kvazaar-1.3.0.tar.gz/src/strategies/generic/ipol-generic.h
Changed
@@ -32,7 +32,9 @@ int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth);
 void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
 void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
 #endif //STRATEGIES_IPOL_GENERIC_H_
View file
kvazaar-1.2.0.tar.gz/src/strategies/generic/picture-generic.c -> kvazaar-1.3.0.tar.gz/src/strategies/generic/picture-generic.c
Changed
@@ -213,7 +213,7 @@ } void kvz_satd_4x4_subblock_quad_generic(const kvz_pixel *preds[4], - const int strides[4], + const int stride, const kvz_pixel *orig, const int orig_stride, unsigned costs[4]) @@ -221,10 +221,10 @@ int32_t diff[4][4 * 4]; for (int y = 0; y < 4; y++) { for (int x = 0; x < 4; x++) { - diff[0][x + y * 4] = orig[x + y * orig_stride] - preds[0][x + y * strides[0]]; - diff[1][x + y * 4] = orig[x + y * orig_stride] - preds[1][x + y * strides[1]]; - diff[2][x + y * 4] = orig[x + y * orig_stride] - preds[2][x + y * strides[2]]; - diff[3][x + y * 4] = orig[x + y * orig_stride] - preds[3][x + y * strides[3]]; + diff[0][x + y * 4] = orig[x + y * orig_stride] - preds[0][x + y * stride]; + diff[1][x + y * 4] = orig[x + y * orig_stride] - preds[1][x + y * stride]; + diff[2][x + y * 4] = orig[x + y * orig_stride] - preds[2][x + y * stride]; + diff[3][x + y * 4] = orig[x + y * orig_stride] - preds[3][x + y * stride]; } } @@ -328,15 +328,15 @@ } static void satd_8x8_subblock_quad_generic(const kvz_pixel **preds, - const int *strides, + const int stride, const kvz_pixel *orig, const int orig_stride, unsigned *costs) { - costs[0] = satd_8x8_subblock_generic(orig, orig_stride, preds[0], strides[0]); - costs[1] = satd_8x8_subblock_generic(orig, orig_stride, preds[1], strides[1]); - costs[2] = satd_8x8_subblock_generic(orig, orig_stride, preds[2], strides[2]); - costs[3] = satd_8x8_subblock_generic(orig, orig_stride, preds[3], strides[3]); + costs[0] = satd_8x8_subblock_generic(orig, orig_stride, preds[0], stride); + costs[1] = satd_8x8_subblock_generic(orig, orig_stride, preds[1], stride); + costs[2] = satd_8x8_subblock_generic(orig, orig_stride, preds[2], stride); + costs[3] = satd_8x8_subblock_generic(orig, orig_stride, preds[3], stride); } // These macros define sadt_16bit_NxN for N = 8, 16, 32, 64 @@ -394,7 +394,7 @@ static void satd_any_size_ ## suffix ( \ int width, int height, \ const kvz_pixel **preds, \ - const int *strides, \ + const int stride, \ const kvz_pixel *orig, \ const int orig_stride, \ unsigned num_modes, \ @@ -408,7 +408,7 @@ if (width % 8 != 0) { \ /* Process the first column using 4x4 blocks. */ \ for (int y = 0; y < height; y += 4) { \ - kvz_satd_4x4_subblock_ ## suffix(preds, strides, orig, orig_stride, sums); \ + kvz_satd_4x4_subblock_ ## suffix(preds, stride, orig, orig_stride, sums); \ } \ orig_ptr += 4; \ for(int blk = 0; blk < num_parallel_blocks; ++blk){\ @@ -419,23 +419,23 @@ if (height % 8 != 0) { \ /* Process the first row using 4x4 blocks. */ \ for (int x = 0; x < width; x += 4 ) { \ - kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \ + kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \ } \ orig_ptr += 4 * orig_stride; \ for(int blk = 0; blk < num_parallel_blocks; ++blk){\ - pred_ptrs[blk] += 4 * strides[blk]; \ + pred_ptrs[blk] += 4 * stride; \ }\ height -= 4; \ } \ /* The rest can now be processed with 8x8 blocks. 
*/ \ for (int y = 0; y < height; y += 8) { \ orig_ptr = &orig[y * orig_stride]; \ - pred_ptrs[0] = &preds[0][y * strides[0]]; \ - pred_ptrs[1] = &preds[1][y * strides[1]]; \ - pred_ptrs[2] = &preds[2][y * strides[2]]; \ - pred_ptrs[3] = &preds[3][y * strides[3]]; \ + pred_ptrs[0] = &preds[0][y * stride]; \ + pred_ptrs[1] = &preds[1][y * stride]; \ + pred_ptrs[2] = &preds[2][y * stride]; \ + pred_ptrs[3] = &preds[3][y * stride]; \ for (int x = 0; x < width; x += 8) { \ - satd_8x8_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \ + satd_8x8_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \ orig_ptr += 8; \ pred_ptrs[0] += 8; \ pred_ptrs[1] += 8; \ @@ -535,6 +535,141 @@ return ssd >> (2*(KVZ_BIT_DEPTH-8)); } +static void inter_recon_bipred_generic(const int hi_prec_luma_rec0, + const int hi_prec_luma_rec1, + const int hi_prec_chroma_rec0, + const int hi_prec_chroma_rec1, + int32_t height, + int32_t width, + int32_t ypos, + int32_t xpos, + const hi_prec_buf_t*high_precision_rec0, + const hi_prec_buf_t*high_precision_rec1, + lcu_t* lcu, + kvz_pixel* temp_lcu_y, + kvz_pixel* temp_lcu_u, + kvz_pixel* temp_lcu_v) { + + int shift = 15 - KVZ_BIT_DEPTH; + int offset = 1 << (shift - 1); + + int y_in_lcu; + int x_in_lcu; + + //After reconstruction, merge the predictors by taking an average of each pixel + for (int temp_y = 0; temp_y < height; ++temp_y) { + + + for (int temp_x = 0; temp_x < width; ++temp_x) { + y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); + x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); + + int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + + lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); + + if (temp_x < width >> 1 && temp_y < height >> 1) { + + y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); + x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); + + int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); + + int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); + } + } + } + +} + + +static optimized_sad_func_ptr_t get_optimized_sad_generic(int32_t width) +{ + return NULL; +} + +/** + * \brief Vertically interpolate SAD outside the frame. + * + * \param data1 Starting point of the first picture. + * \param data2 Starting point of the second picture. 
+ * \param width Width of the region for which SAD is calculated. + * \param height Height of the region for which SAD is calculated. + * \param width Width of the pixel array. + * + * \returns Sum of Absolute Differences + */ +static uint32_t ver_sad_generic(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int block_width, int block_height, unsigned pic_stride) +{ + int x, y; + unsigned sad = 0; + + for (y = 0; y < block_height; ++y) { + for (x = 0; x < block_width; ++x) { + sad += abs(pic_data[y * pic_stride + x] - ref_data[x]); + } + } + + return sad; +} + +/** + * \brief Horizontally interpolate SAD outside the frame. + * + * \param data1 Starting point of the first picture. + * \param data2 Starting point of the second picture. + * \param width Width of the region for which SAD is calculated. + * \param height Height of the region for which SAD is calculated. + * \param width Width of the pixel array. + * + * \returns Sum of Absolute Differences + */ +static unsigned hor_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int block_width, int block_height, unsigned pic_stride, unsigned ref_stride) +{ + int x, y; + unsigned sad = 0; + + for (y = 0; y < block_height; ++y) { + for (x = 0; x < block_width; ++x) { + sad += abs(pic_data[y * pic_stride + x] - ref_data[y * ref_stride]); + } + } + + return sad; +} + + +static uint32_t hor_sad_generic(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t pic_stride, + uint32_t ref_stride, uint32_t left, uint32_t right) +{ + uint32_t result = 0; + if (left) { + result += hor_sad (pic_data, ref_data + left, left, + height, pic_stride, ref_stride); + + result += kvz_reg_sad(pic_data + left, ref_data + left, width - left, + height, pic_stride, ref_stride); + } else if (right) { + result += kvz_reg_sad(pic_data, ref_data, width - right, + height, pic_stride, ref_stride); + + result += hor_sad (pic_data + width - right, + ref_data + width - right - 1, + right, height, pic_stride, ref_stride); + } else { + result += kvz_reg_sad(pic_data, ref_data, width, + height, pic_stride, ref_stride); + } + return result; +} int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth) { @@ -569,6 +704,11 @@ success &= kvz_strategyselector_register(opaque, "satd_any_size_quad", "generic", 0, &satd_any_size_quad_generic); success &= kvz_strategyselector_register(opaque, "pixels_calc_ssd", "generic", 0, &pixels_calc_ssd_generic); + success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "generic", 0, &inter_recon_bipred_generic); + + success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "generic", 0, &get_optimized_sad_generic); + success &= kvz_strategyselector_register(opaque, "ver_sad", "generic", 0, &ver_sad_generic); + success &= kvz_strategyselector_register(opaque, "hor_sad", "generic", 0, &hor_sad_generic); return success; }
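The inter_recon_bipred_generic function added above merges two motion-compensated predictions at 14-bit precision: samples that were reconstructed at pixel precision are shifted left by 14 - bitdepth, and the pair is then averaged with shift = 15 - bitdepth plus a rounding offset. A small self-contained check of that arithmetic (hypothetical test code, not part of the package):

#include <assert.h>
#include <stdint.h>

#define BITDEPTH 8

static uint8_t bipred_average(int16_t sample0_14bit, int16_t sample1_14bit)
{
  const int shift  = 15 - BITDEPTH;      /* 7 for 8-bit content */
  const int offset = 1 << (shift - 1);   /* rounding offset, 64 */
  int v = (sample0_14bit + sample1_14bit + offset) >> shift;
  if (v < 0) v = 0;
  if (v > (1 << BITDEPTH) - 1) v = (1 << BITDEPTH) - 1;
  return (uint8_t)v;
}

int main(void)
{
  /* 8-bit predictions 100 and 103 are first brought to 14-bit precision by
   * << (14 - BITDEPTH), exactly as the low-precision path above does; the
   * result is their rounded average. */
  assert(bipred_average(100 << 6, 103 << 6) == 102);
  return 0;
}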
View file
kvazaar-1.2.0.tar.gz/src/strategies/generic/picture-generic.h -> kvazaar-1.3.0.tar.gz/src/strategies/generic/picture-generic.h
Changed
@@ -45,9 +45,11 @@
 void kvz_satd_4x4_subblock_quad_generic(const kvz_pixel *preds[4],
-  const int strides[4],
+  const int stride,
   const kvz_pixel *orig,
   const int orig_stride,
   unsigned costs[4]);
+
+
 #endif //STRATEGIES_PICTURE_GENERIC_H_
View file
kvazaar-1.2.0.tar.gz/src/strategies/generic/quant-generic.c -> kvazaar-1.3.0.tar.gz/src/strategies/generic/quant-generic.c
Changed
@@ -53,17 +53,19 @@ uint32_t ac_sum = 0; for (int32_t n = 0; n < width * height; n++) { - int32_t level; + int32_t level = coef[n]; + int64_t abs_level = (int64_t)abs(level); int32_t sign; - level = coef[n]; sign = (level < 0 ? -1 : 1); - level = ((int64_t)abs(level) * quant_coeff[n] + add) >> q_bits; + int32_t curr_quant_coeff = quant_coeff[n]; + level = (abs_level * curr_quant_coeff + add) >> q_bits; ac_sum += level; level *= sign; q_coef[n] = (coeff_t)(CLIP(-32768, 32767, level)); + } if (!encoder->cfg.signhide_enable || ac_sum < 2) return; @@ -71,10 +73,12 @@ int32_t delta_u[LCU_WIDTH*LCU_WIDTH >> 2]; for (int32_t n = 0; n < width * height; n++) { - int32_t level; - level = coef[n]; - level = ((int64_t)abs(level) * quant_coeff[n] + add) >> q_bits; - delta_u[n] = (int32_t)(((int64_t)abs(coef[n]) * quant_coeff[n] - (level << q_bits)) >> q_bits8); + int32_t level = coef[n]; + int64_t abs_level = (int64_t)abs(level); + int32_t curr_quant_coeff = quant_coeff[n]; + + level = (abs_level * curr_quant_coeff + add) >> q_bits; + delta_u[n] = (int32_t)((abs_level * curr_quant_coeff - (level << q_bits)) >> q_bits8); } if (ac_sum >= 2) { @@ -208,7 +212,7 @@ kvz_transformskip(state->encoder_control, residual, coeff, width); } else { - kvz_transform2d(state->encoder_control, residual, coeff, width, (color == COLOR_Y ? 0 : 65535)); + kvz_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu->type); } // Quantize coeffs. (coeff -> coeff_out) @@ -246,7 +250,7 @@ kvz_itransformskip(state->encoder_control, residual, coeff, width); } else { - kvz_itransform2d(state->encoder_control, residual, coeff, width, (color == COLOR_Y ? 0 : 65535)); + kvz_itransform2d(state->encoder_control, residual, coeff, width, color, cur_cu->type); } // Get quantized reconstruction. (residual + pred_in -> rec_out) @@ -329,6 +333,48 @@ return sum; } +static INLINE int16_t to_q88(float f) +{ + return (int16_t)(f * 256.0f); +} + +static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, int32_t qp) +{ + uint32_t sum = 0; +#define NUM_BUCKETS 5 + const int16_t wt_m[NUM_BUCKETS] = { + to_q88(-0.004916), + to_q88(0.010806), + to_q88(0.055562), + to_q88(0.033436), + to_q88(-0.007690), + }; + const int16_t wt_c[NUM_BUCKETS] = { + to_q88(0.172024), + to_q88(3.421462), + to_q88(2.879506), + to_q88(5.585471), + to_q88(0.256772), + }; + + int16_t wt[NUM_BUCKETS]; + for (int32_t i = 0; i < NUM_BUCKETS; i++) + wt[i] = wt_m[i] * qp + wt_c[i]; + + for (int32_t i = 0; i < width * width; i++) { + int16_t curr = coeff[i]; + int16_t signmask = curr >> 15; + int16_t curr_abs = (curr ^ signmask) - signmask; + if (curr_abs > 3) + curr_abs = 3; + + sum += wt[curr_abs]; + } + sum += wt[NUM_BUCKETS - 1] * width; + return sum >> 8; +#undef NUM_BUCKETS +} + int kvz_strategy_register_quant_generic(void* opaque, uint8_t bitdepth) { bool success = true; @@ -337,6 +383,7 @@ success &= kvz_strategyselector_register(opaque, "quantize_residual", "generic", 0, &kvz_quantize_residual_generic); success &= kvz_strategyselector_register(opaque, "dequant", "generic", 0, &kvz_dequant_generic); success &= kvz_strategyselector_register(opaque, "coeff_abs_sum", "generic", 0, &coeff_abs_sum_generic); + success &= kvz_strategyselector_register(opaque, "fast_coeff_cost", "generic", 0, &fast_coeff_cost_generic); return success; }
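The fast_coeff_cost_generic function above works in Q8.8 fixed point: to_q88() stores value * 256, the per-bucket weight wt = wt_m * qp + wt_c stays in Q8.8 because qp is an integer, and the accumulated sum is converted back to an integer cost by the final >> 8. A short standalone illustration using the wt_m[1]/wt_c[1] pair from the table above (hypothetical example code; the meaning of the buckets is not spelled out here):

#include <stdint.h>
#include <stdio.h>

static int16_t to_q88(float f) { return (int16_t)(f * 256.0f); }

int main(void)
{
  const int32_t qp = 22;
  int16_t m = to_q88(0.010806f);   /* wt_m[1] from the table above  */
  int16_t c = to_q88(3.421462f);   /* wt_c[1] from the table above  */
  int32_t wt = m * qp + c;         /* per-bucket weight, still Q8.8 */

  /* cost contribution of ten coefficients that fall into this bucket */
  uint32_t sum = 10u * (uint32_t)wt;
  printf("approximate cost: %u\n", (unsigned)(sum >> 8));
  return 0;
}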
View file
kvazaar-1.3.0.tar.gz/src/strategies/missing-intel-intrinsics.h
Added
@@ -0,0 +1,21 @@
+#ifndef MISSING_INTEL_INTRINSICS_H_
+#define MISSING_INTEL_INTRINSICS_H_
+
+#include <immintrin.h>
+
+// Old Visual Studio headers lack the bsrli variant
+#ifndef _mm_bsrli_si128
+  #define _mm_bsrli_si128(a, imm8) _mm_srli_si128((a), (imm8))
+#endif
+
+// GCC headers apparently won't have this at all.. sigh
+#ifndef _andn_u32
+  // VS2015 headers apparently won't have this at all.. sigh
+  #ifdef __andn_u32
+    #define _andn_u32(x, y) (__andn_u32((x), (y)))
+  #else
+    #define _andn_u32(x, y) ((~(x)) & (y))
+  #endif // __andn_u32
+#endif // _andn_u32
+
+#endif
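The _andn_u32 fallback above relies on the identity andn(x, y) = (~x) & y, i.e. y with every bit that is set in x cleared. A quick check of that identity (hypothetical test code, not part of the package):

#include <assert.h>
#include <stdint.h>

int main(void)
{
  uint32_t x = 0x0000ffffu, y = 0x12345678u;
  /* clearing the low 16 bits of y leaves only its high half */
  assert(((~x) & y) == 0x12340000u);
  return 0;
}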
View file
kvazaar-1.3.0.tar.gz/src/strategies/optimized_sad_func_ptr_t.h
Added
@@ -0,0 +1,19 @@
+#ifndef OPTIMIZED_SAD_FUNC_T_H_
+#define OPTIMIZED_SAD_FUNC_T_H_
+
+#include "kvazaar.h"
+
+/**
+ * \param data1: Picture block pointer
+ * \param data2: Reference block pointer
+ * \param height: Scan block height
+ * \param stride1: Picture block stride
+ * \param stride2: Reference block stride
+ */
+typedef uint32_t (*optimized_sad_func_ptr_t)(const kvz_pixel * const,
+                                             const kvz_pixel * const,
+                                             const int32_t,
+                                             const uint32_t,
+                                             const uint32_t);
+
+#endif
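A sketch of how this function-pointer type is meant to be consumed, based on the registrations elsewhere in this revision: get_optimized_sad returns a width-specialized kernel or NULL (the generic strategy always returns NULL), so a caller resolves the pointer per block width and otherwise falls back to the general reg_sad strategy. The kvz_get_optimized_sad / kvz_reg_sad pointer names and the surrounding kvazaar headers are assumed here; this is illustrative code, not code from the package:

/* Assumes the kvazaar strategy headers declaring kvz_get_optimized_sad and
 * kvz_reg_sad are in scope. */
static uint32_t block_sad(const kvz_pixel *pic, const kvz_pixel *ref,
                          int32_t width, int32_t height,
                          uint32_t pic_stride, uint32_t ref_stride)
{
  /* Resolve a width-specialized SAD kernel once; the pointer's argument
   * order is (data1, data2, height, stride1, stride2) as documented above. */
  optimized_sad_func_ptr_t sad_fn = kvz_get_optimized_sad(width);
  if (sad_fn) {
    return sad_fn(pic, ref, height, pic_stride, ref_stride);
  }
  /* No specialized kernel for this width: use the general implementation. */
  return kvz_reg_sad(pic, ref, width, height, pic_stride, ref_stride);
}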
View file
kvazaar-1.2.0.tar.gz/src/strategies/sse41/picture-sse41.c -> kvazaar-1.3.0.tar.gz/src/strategies/sse41/picture-sse41.c
Changed
@@ -18,73 +18,201 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ -#include "strategies/sse41/picture-sse41.h" +#include "global.h" #if COMPILE_INTEL_SSE41 +#include "strategies/sse41/picture-sse41.h" +#include "strategies/sse41/reg_sad_pow2_widths-sse41.h" + #include <immintrin.h> #include <stdlib.h> #include "kvazaar.h" #include "strategyselector.h" +uint32_t kvz_reg_sad_sse41(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t width, const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + if (width == 0) + return 0; + if (width == 4) + return reg_sad_w4(data1, data2, height, stride1, stride2); + if (width == 8) + return reg_sad_w8(data1, data2, height, stride1, stride2); + if (width == 12) + return reg_sad_w12(data1, data2, height, stride1, stride2); + if (width == 16) + return reg_sad_w16(data1, data2, height, stride1, stride2); + if (width == 24) + return reg_sad_w24(data1, data2, height, stride1, stride2); + else + return reg_sad_arbitrary(data1, data2, width, height, stride1, stride2); +} -unsigned kvz_reg_sad_sse41(const kvz_pixel * const data1, const kvz_pixel * const data2, - const int width, const int height, const unsigned stride1, const unsigned stride2) +static optimized_sad_func_ptr_t get_optimized_sad_sse41(int32_t width) { - int y, x; - unsigned sad = 0; - __m128i sse_inc = _mm_setzero_si128 (); - long long int sse_inc_array[2]; - - for (y = 0; y < height; ++y) { - for (x = 0; x <= width-16; x+=16) { - const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]); - const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]); - sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a,b)); - } - - { - const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]); - const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]); - switch (((width - (width%2)) - x)/2) { - case 0: - break; - case 1: - sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x01))); - break; - case 2: - sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x03))); - break; - case 3: - sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x07))); - break; - case 4: - sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x0f))); - break; - case 5: - sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x1f))); - break; - case 6: - sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x3f))); - break; - case 7: - sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x7f))); - break; - default: - //Should not happen - assert(0); - } - x = (width - (width%2)); - } - - for (; x < width; ++x) { - sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]); - } + if (width == 0) + return reg_sad_w0; + if (width == 4) + return reg_sad_w4; + if (width == 8) + return reg_sad_w8; + if (width == 12) + return reg_sad_w12; + if (width == 16) + return reg_sad_w16; + if (width == 24) + return reg_sad_w24; + else + return NULL; +} + +static uint32_t ver_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t stride) +{ + if (width == 0) + return 0; + if (width == 4) + return ver_sad_w4(pic_data, ref_data, height, stride); + if (width == 8) + return ver_sad_w8(pic_data, ref_data, height, stride); + if (width == 12) + return ver_sad_w12(pic_data, ref_data, 
height, stride); + if (width == 16) + return ver_sad_w16(pic_data, ref_data, height, stride); + else + return ver_sad_arbitrary(pic_data, ref_data, width, height, stride); +} + +static uint32_t hor_sad_sse41_w32(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t pic_stride, uint32_t ref_stride, + uint32_t left, uint32_t right) +{ + const size_t vec_width = 16; + const uint32_t blkwidth_log2 = 5; + const uint32_t left_eq_wid = left >> blkwidth_log2; + const uint32_t right_eq_wid = right >> blkwidth_log2; + const int32_t left_clamped = left - left_eq_wid; + const int32_t right_clamped = right - right_eq_wid; + + const int32_t height_twoline_groups = height & ~1; + const int32_t height_residual_lines = height & 1; + + const __m128i zero = _mm_setzero_si128(); + const __m128i vec_widths = _mm_set1_epi8((uint8_t)vec_width); + const __m128i lefts = _mm_set1_epi8((uint8_t)left_clamped); + const __m128i rights = _mm_set1_epi8((uint8_t)right_clamped); + const __m128i nslo = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const __m128i nshi = _mm_add_epi8 (nslo, vec_widths); + + const __m128i rightmost_good_idx = _mm_set1_epi8((uint8_t)((vec_width << 1) - right - 1)); + + const __m128i epol_mask_right_lo = _mm_min_epi8 (nslo, rightmost_good_idx); + const __m128i epol_mask_right_hi = _mm_min_epi8 (nshi, rightmost_good_idx); + const __m128i epol_mask_lo = _mm_max_epi8 (lefts, epol_mask_right_lo); + const __m128i epol_mask_hi = _mm_max_epi8 (lefts, epol_mask_right_hi); + + const __m128i is_left = _mm_cmpeq_epi8(rights, zero); + const __m128i vecwid_for_left = _mm_and_si128 (is_left, vec_widths); + const __m128i ns_for_shufmask = _mm_or_si128 (nslo, vecwid_for_left); + + const __m128i shufmask1_right = _mm_add_epi8 (ns_for_shufmask, rights); + const __m128i shufmask1 = _mm_sub_epi8 (shufmask1_right, lefts); + + const __m128i md2bimask = _mm_cmpgt_epi8(vec_widths, shufmask1); + const __m128i move_d_to_b_imask = _mm_or_si128 (is_left, md2bimask); + const __m128i move_b_to_d_mask = _mm_cmpgt_epi8(lefts, nslo); + + // If we're straddling the left border, start from the left border instead, + // and if right border, end on the border + const int32_t ld_offset = left - right; + + int32_t y; + __m128i sse_inc = _mm_setzero_si128(); + for (y = 0; y < height_twoline_groups; y += 2) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride + 0)); + __m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + 0 + ld_offset)); + __m128i c = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride + 16)); + __m128i d = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + 16 + ld_offset)); + __m128i e = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * pic_stride + 0)); + __m128i f = _mm_loadu_si128((__m128i *)(ref_data + (y + 1) * ref_stride + 0 + ld_offset)); + __m128i g = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * pic_stride + 16)); + __m128i h = _mm_loadu_si128((__m128i *)(ref_data + (y + 1) * ref_stride + 16 + ld_offset)); + + __m128i b_shifted = _mm_shuffle_epi8(b, shufmask1); + __m128i d_shifted = _mm_shuffle_epi8(d, shufmask1); + __m128i f_shifted = _mm_shuffle_epi8(f, shufmask1); + __m128i h_shifted = _mm_shuffle_epi8(h, shufmask1); + + // TODO: could these be optimized for two-operand efficiency? Only one of + // these ever does useful work, the other should leave the vector untouched, + // so could the first result be used in the second calculation or something? 
+ __m128i b_with_d_data = _mm_blendv_epi8(d_shifted, b_shifted, move_d_to_b_imask); + __m128i d_with_b_data = _mm_blendv_epi8(d_shifted, b_shifted, move_b_to_d_mask); + __m128i f_with_h_data = _mm_blendv_epi8(h_shifted, f_shifted, move_d_to_b_imask); + __m128i h_with_f_data = _mm_blendv_epi8(h_shifted, f_shifted, move_b_to_d_mask); + + __m128i b_final = _mm_shuffle_epi8(b_with_d_data, epol_mask_lo); + __m128i d_final = _mm_shuffle_epi8(d_with_b_data, epol_mask_hi); + __m128i f_final = _mm_shuffle_epi8(f_with_h_data, epol_mask_lo); + __m128i h_final = _mm_shuffle_epi8(h_with_f_data, epol_mask_hi); + + __m128i curr_sads_ab = _mm_sad_epu8 (a, b_final); + __m128i curr_sads_cd = _mm_sad_epu8 (c, d_final); + __m128i curr_sads_ef = _mm_sad_epu8 (e, f_final); + __m128i curr_sads_gh = _mm_sad_epu8 (g, h_final); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh); } - _mm_storeu_si128((__m128i*) sse_inc_array, sse_inc); - sad += sse_inc_array[0] + sse_inc_array[1]; + if (height_residual_lines) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride + 0)); + __m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + 0 + ld_offset)); + __m128i c = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride + 16)); + __m128i d = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + 16 + ld_offset)); - return sad; + __m128i b_shifted = _mm_shuffle_epi8(b, shufmask1); + __m128i d_shifted = _mm_shuffle_epi8(d, shufmask1); + + __m128i b_with_d_data = _mm_blendv_epi8(d_shifted, b_shifted, move_d_to_b_imask); + __m128i d_with_b_data = _mm_blendv_epi8(d_shifted, b_shifted, move_b_to_d_mask); + + __m128i b_final = _mm_shuffle_epi8(b_with_d_data, epol_mask_lo); + __m128i d_final = _mm_shuffle_epi8(d_with_b_data, epol_mask_hi); + + __m128i curr_sads_ab = _mm_sad_epu8 (a, b_final); + __m128i curr_sads_cd = _mm_sad_epu8 (c, d_final); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + return _mm_cvtsi128_si32(sad); +} + +static uint32_t hor_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t pic_stride, + uint32_t ref_stride, uint32_t left, uint32_t right) +{ + if (width == 4) + return hor_sad_sse41_w4(pic_data, ref_data, height, + pic_stride, ref_stride, left, right); + if (width == 8) + return hor_sad_sse41_w8(pic_data, ref_data, height, + pic_stride, ref_stride, left, right); + if (width == 16) + return hor_sad_sse41_w16(pic_data, ref_data, height, + pic_stride, ref_stride, left, right); + if (width == 32) + return hor_sad_sse41_w32(pic_data, ref_data, height, + pic_stride, ref_stride, left, right); + else + return hor_sad_sse41_arbitrary(pic_data, ref_data, width, height, + pic_stride, ref_stride, left, right); } #endif //COMPILE_INTEL_SSE41 @@ -95,6 +223,9 @@ #if COMPILE_INTEL_SSE41 if (bitdepth == 8){ success &= kvz_strategyselector_register(opaque, "reg_sad", "sse41", 20, &kvz_reg_sad_sse41); + success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "sse41", 20, &get_optimized_sad_sse41); + success &= kvz_strategyselector_register(opaque, "ver_sad", "sse41", 20, &ver_sad_sse41); + success &= kvz_strategyselector_register(opaque, "hor_sad", "sse41", 20, &hor_sad_sse41); } 
#endif return success;
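Each kernel above accumulates _mm_sad_epu8 results, which produce two 64-bit partial sums per 128-bit vector; the closing shuffle / add / _mm_cvtsi128_si32 sequence folds those halves into a single 32-bit SAD. A plain scalar reference that any of the width-specialized kernels can be checked against (hypothetical test code, mirroring the generic implementation):

#include <stdint.h>
#include <stdlib.h>

/* Plain sum of absolute differences over a width x height block. */
uint32_t reg_sad_ref(const uint8_t *data1, const uint8_t *data2,
                     int32_t width, int32_t height,
                     uint32_t stride1, uint32_t stride2)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < height; y++) {
    for (int32_t x = 0; x < width; x++) {
      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
    }
  }
  return sad;
}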
View file
kvazaar-1.3.0.tar.gz/src/strategies/sse41/reg_sad_pow2_widths-sse41.h
Added
@@ -0,0 +1,1027 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#ifndef REG_SAD_POW2_WIDTHS_SSE41_H_ +#define REG_SAD_POW2_WIDTHS_SSE41_H_ + +#include "kvazaar.h" +#include "strategies/missing-intel-intrinsics.h" +#include <immintrin.h> + +static INLINE uint32_t reg_sad_w0(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + return 0; +} + +static INLINE uint32_t reg_sad_w4(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_cvtsi32_si128(*(uint32_t *)(data1 + y * stride1)); + __m128i b = _mm_cvtsi32_si128(*(uint32_t *)(data2 + y * stride2)); + + a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 1) * stride1), 1); + b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 1) * stride2), 1); + a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 2) * stride1), 2); + b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 2) * stride2), 2); + a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 3) * stride1), 3); + b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 3) * stride2), 3); + + __m128i curr_sads = _mm_sad_epu8(a, b); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(data1 + y * stride1)); + __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(data2 + y * stride2)); + + __m128i curr_sads = _mm_sad_epu8(a, b); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + + return _mm_cvtsi128_si32(sad); +} + +static INLINE uint32_t reg_sad_w8(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + for (y = 0; y < height_fourline_groups; y += 4) { + __m128d a_d = _mm_setzero_pd(); + __m128d b_d = _mm_setzero_pd(); + __m128d c_d = _mm_setzero_pd(); + __m128d d_d = _mm_setzero_pd(); + + a_d = _mm_loadl_pd(a_d, (const double *)(data1 + (y + 0) * stride1)); + b_d = _mm_loadl_pd(b_d, 
(const double *)(data2 + (y + 0) * stride2)); + a_d = _mm_loadh_pd(a_d, (const double *)(data1 + (y + 1) * stride1)); + b_d = _mm_loadh_pd(b_d, (const double *)(data2 + (y + 1) * stride2)); + + c_d = _mm_loadl_pd(c_d, (const double *)(data1 + (y + 2) * stride1)); + d_d = _mm_loadl_pd(d_d, (const double *)(data2 + (y + 2) * stride2)); + c_d = _mm_loadh_pd(c_d, (const double *)(data1 + (y + 3) * stride1)); + d_d = _mm_loadh_pd(d_d, (const double *)(data2 + (y + 3) * stride2)); + + __m128i a = _mm_castpd_si128(a_d); + __m128i b = _mm_castpd_si128(b_d); + __m128i c = _mm_castpd_si128(c_d); + __m128i d = _mm_castpd_si128(d_d); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b); + __m128i curr_sads_cd = _mm_sad_epu8(c, d); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_loadl_epi64((__m128i *)(data1 + y * stride1)); + __m128i b = _mm_loadl_epi64((__m128i *)(data2 + y * stride2)); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + + return _mm_cvtsi128_si32(sad); +} + +static INLINE uint32_t reg_sad_w12(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + for (y = 0; y < height; y++) { + __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1)); + __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2)); + + __m128i b_masked = _mm_blend_epi16(a, b, 0x3f); + __m128i curr_sads = _mm_sad_epu8 (a, b_masked); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + return _mm_cvtsi128_si32(sad); +} + +static INLINE uint32_t reg_sad_w16(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1)); + __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2)); + __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1)); + __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2)); + __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1)); + __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2)); + __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1)); + __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2)); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b); + __m128i curr_sads_cd = _mm_sad_epu8(c, d); + __m128i curr_sads_ef = _mm_sad_epu8(e, f); + __m128i curr_sads_gh = _mm_sad_epu8(g, h); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1)); + __m128i b = 
_mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2)); + + __m128i curr_sads = _mm_sad_epu8(a, b); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + return _mm_cvtsi128_si32(sad); +} + +static INLINE uint32_t reg_sad_w24(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + + const int32_t height_doublelines = height & ~1; + const int32_t height_parity = height & 1; + + for (y = 0; y < height_doublelines; y += 2) { + __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1)); + __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2)); + __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1)); + __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2)); + + __m128d e_d = _mm_setzero_pd(); + __m128d f_d = _mm_setzero_pd(); + + e_d = _mm_loadl_pd(e_d, (const double *)(data1 + (y + 0) * stride1 + 16)); + f_d = _mm_loadl_pd(f_d, (const double *)(data2 + (y + 0) * stride2 + 16)); + e_d = _mm_loadh_pd(e_d, (const double *)(data1 + (y + 1) * stride1 + 16)); + f_d = _mm_loadh_pd(f_d, (const double *)(data2 + (y + 1) * stride2 + 16)); + + __m128i e = _mm_castpd_si128(e_d); + __m128i f = _mm_castpd_si128(f_d); + + __m128i curr_sads_1 = _mm_sad_epu8(a, b); + __m128i curr_sads_2 = _mm_sad_epu8(c, d); + __m128i curr_sads_3 = _mm_sad_epu8(e, f); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_1); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_2); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_3); + } + if (height_parity) { + __m128i a = _mm_loadu_si128 ((const __m128i *)(data1 + y * stride1)); + __m128i b = _mm_loadu_si128 ((const __m128i *)(data2 + y * stride2)); + __m128i c = _mm_loadl_epi64 ((const __m128i *)(data1 + y * stride1 + 16)); + __m128i d = _mm_loadl_epi64 ((const __m128i *)(data2 + y * stride2 + 16)); + + __m128i curr_sads_1 = _mm_sad_epu8(a, b); + __m128i curr_sads_2 = _mm_sad_epu8(c, d); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_1); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_2); + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + return _mm_cvtsi128_si32(sad); +} + +static INLINE uint32_t reg_sad_arbitrary(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t width, const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + int32_t y, x; + __m128i sse_inc = _mm_setzero_si128(); + + // Bytes in block in 128-bit blocks per each scanline, and remainder + const int32_t width_xmms = width & ~15; + const int32_t width_residual_pixels = width & 15; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + const __m128i rds = _mm_set1_epi8 (width_residual_pixels); + const __m128i ns = _mm_setr_epi8 (0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + const __m128i rdmask = _mm_cmpgt_epi8(rds, ns); + + for (x = 0; x < width_xmms; x += 16) { + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1 + x)); + __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2 + x)); + __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1 + x)); + __m128i d = _mm_loadu_si128((const 
__m128i *)(data2 + (y + 1) * stride2 + x)); + __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1 + x)); + __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2 + x)); + __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1 + x)); + __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2 + x)); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b); + __m128i curr_sads_cd = _mm_sad_epu8(c, d); + __m128i curr_sads_ef = _mm_sad_epu8(e, f); + __m128i curr_sads_gh = _mm_sad_epu8(g, h); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1 + x)); + __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2 + x)); + + __m128i curr_sads = _mm_sad_epu8(a, b); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + } + + if (width_residual_pixels) { + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1 + x)); + __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2 + x)); + __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1 + x)); + __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2 + x)); + __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1 + x)); + __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2 + x)); + __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1 + x)); + __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2 + x)); + + __m128i b_masked = _mm_blendv_epi8(a, b, rdmask); + __m128i d_masked = _mm_blendv_epi8(c, d, rdmask); + __m128i f_masked = _mm_blendv_epi8(e, f, rdmask); + __m128i h_masked = _mm_blendv_epi8(g, h, rdmask); + + __m128i curr_sads_ab = _mm_sad_epu8 (a, b_masked); + __m128i curr_sads_cd = _mm_sad_epu8 (c, d_masked); + __m128i curr_sads_ef = _mm_sad_epu8 (e, f_masked); + __m128i curr_sads_gh = _mm_sad_epu8 (g, h_masked); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1 + x)); + __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2 + x)); + + __m128i b_masked = _mm_blendv_epi8(a, b, rdmask); + __m128i curr_sads = _mm_sad_epu8 (a, b_masked); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + + return _mm_cvtsi128_si32(sad); +} + +static uint32_t ver_sad_w4(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t stride) +{ + __m128i ref_row = _mm_set1_epi32(*(const uint32_t *)ref_data); + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_cvtsi32_si128(*(uint32_t *)(pic_data + y * stride)); + + a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 1) * stride), 1); + a = _mm_insert_epi32(a, 
*(const uint32_t *)(pic_data + (y + 2) * stride), 2); + a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 3) * stride), 3); + + __m128i curr_sads = _mm_sad_epu8(a, ref_row); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + if (height_residual_lines) { + // Only pick the last dword, because we're comparing single dwords (lines) + ref_row = _mm_bsrli_si128(ref_row, 12); + + for (; y < height; y++) { + __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * stride)); + + __m128i curr_sads = _mm_sad_epu8(a, ref_row); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + + return _mm_cvtsi128_si32(sad); +} + +static uint32_t ver_sad_w8(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t stride) +{ + const __m128i ref_row = _mm_set1_epi64x(*(const uint64_t *)ref_data); + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + for (y = 0; y < height_fourline_groups; y += 4) { + __m128d a_d = _mm_setzero_pd(); + __m128d c_d = _mm_setzero_pd(); + + a_d = _mm_loadl_pd(a_d, (const double *)(pic_data + (y + 0) * stride)); + a_d = _mm_loadh_pd(a_d, (const double *)(pic_data + (y + 1) * stride)); + + c_d = _mm_loadl_pd(c_d, (const double *)(pic_data + (y + 2) * stride)); + c_d = _mm_loadh_pd(c_d, (const double *)(pic_data + (y + 3) * stride)); + + __m128i a = _mm_castpd_si128(a_d); + __m128i c = _mm_castpd_si128(c_d); + + __m128i curr_sads_ab = _mm_sad_epu8(a, ref_row); + __m128i curr_sads_cd = _mm_sad_epu8(c, ref_row); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + } + if (height_residual_lines) { + __m128i b = _mm_move_epi64(ref_row); + + for (; y < height; y++) { + __m128i a = _mm_loadl_epi64((__m128i *)(pic_data + y * stride)); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + + return _mm_cvtsi128_si32(sad); +} + +static uint32_t ver_sad_w12(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t stride) +{ + const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data); + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + + for (y = 0; y < height; y++) { + __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride)); + + __m128i a_masked = _mm_blend_epi16(ref_row, a, 0x3f); + __m128i curr_sads = _mm_sad_epu8 (ref_row, a_masked); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + return _mm_cvtsi128_si32(sad); +} + +static uint32_t ver_sad_w16(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t stride) +{ + const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data); + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i pic_row_1 = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride)); + __m128i pic_row_2 = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * stride)); + __m128i pic_row_3 = _mm_loadu_si128((__m128i 
*)(pic_data + (y + 2) * stride)); + __m128i pic_row_4 = _mm_loadu_si128((__m128i *)(pic_data + (y + 3) * stride)); + + __m128i curr_sads_1 = _mm_sad_epu8 (pic_row_1, ref_row); + __m128i curr_sads_2 = _mm_sad_epu8 (pic_row_2, ref_row); + __m128i curr_sads_3 = _mm_sad_epu8 (pic_row_3, ref_row); + __m128i curr_sads_4 = _mm_sad_epu8 (pic_row_4, ref_row); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_1); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_2); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_3); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_4); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i pic_row = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride)); + __m128i curr_sads = _mm_sad_epu8 (pic_row, ref_row); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + + return _mm_cvtsi128_si32(sad); +} + +static uint32_t ver_sad_arbitrary(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t stride) +{ + int32_t y, x; + __m128i sse_inc = _mm_setzero_si128(); + + // Bytes in block in 128-bit blocks per each scanline, and remainder + const int32_t width_xmms = width & ~15; + const int32_t width_residual_pixels = width & 15; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + const __m128i rds = _mm_set1_epi8 (width_residual_pixels); + const __m128i ns = _mm_setr_epi8 (0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + const __m128i rdmask = _mm_cmpgt_epi8(rds, ns); + + for (x = 0; x < width_xmms; x += 16) { + const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x)); + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x)); + __m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x)); + __m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x)); + __m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x)); + + __m128i curr_sads_ab = _mm_sad_epu8(ref_row, a); + __m128i curr_sads_cd = _mm_sad_epu8(ref_row, c); + __m128i curr_sads_ef = _mm_sad_epu8(ref_row, e); + __m128i curr_sads_gh = _mm_sad_epu8(ref_row, g); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x)); + + __m128i curr_sads = _mm_sad_epu8(a, ref_row); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + } + + if (width_residual_pixels) { + const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x)); + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x)); + __m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x)); + __m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x)); + __m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x)); + + __m128i a_masked = _mm_blendv_epi8(ref_row, a, rdmask); + __m128i c_masked = _mm_blendv_epi8(ref_row, c, rdmask); + __m128i e_masked = _mm_blendv_epi8(ref_row, e, rdmask); + __m128i g_masked = _mm_blendv_epi8(ref_row, g, rdmask); + + __m128i curr_sads_ab 
= _mm_sad_epu8 (ref_row, a_masked); + __m128i curr_sads_cd = _mm_sad_epu8 (ref_row, c_masked); + __m128i curr_sads_ef = _mm_sad_epu8 (ref_row, e_masked); + __m128i curr_sads_gh = _mm_sad_epu8 (ref_row, g_masked); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x)); + + __m128i a_masked = _mm_blendv_epi8(ref_row, a, rdmask); + __m128i curr_sads = _mm_sad_epu8 (ref_row, a_masked); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + + return _mm_cvtsi128_si32(sad); +} + +static uint32_t hor_sad_sse41_w4(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t pic_stride, uint32_t ref_stride, + uint32_t left, uint32_t right) +{ + const int32_t right_border_idx = 3 - right; + const int32_t border_idx = left ? left : right_border_idx; + + const __m128i ns = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + + const int32_t border_idx_negative = border_idx >> 31; + const int32_t leftoff = border_idx_negative | left; + + // Dualword (ie. line) base indexes, ie. the edges the lines read will be + // clamped towards + const __m128i dwbaseids = _mm_setr_epi8(0, 0, 0, 0, 4, 4, 4, 4, + 8, 8, 8, 8, 12, 12, 12, 12); + + __m128i right_border_idxs = _mm_set1_epi8((int8_t)right_border_idx); + __m128i left_128 = _mm_set1_epi8((int8_t)left); + + right_border_idxs = _mm_add_epi8 (right_border_idxs, dwbaseids); + + __m128i mask_right = _mm_min_epi8 (ns, right_border_idxs); + __m128i mask1 = _mm_sub_epi8 (mask_right, left_128); + + const __m128i epol_mask = _mm_max_epi8(mask1, dwbaseids); + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * pic_stride)); + __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(ref_data + y * ref_stride + leftoff)); + + a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 1) * pic_stride), 1); + b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 1) * ref_stride + leftoff), 1); + a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 2) * pic_stride), 2); + b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 2) * ref_stride + leftoff), 2); + a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 3) * pic_stride), 3); + b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 3) * ref_stride + leftoff), 3); + + __m128i b_epol = _mm_shuffle_epi8(b, epol_mask); + __m128i curr_sads = _mm_sad_epu8 (a, b_epol); + sse_inc = _mm_add_epi64 (sse_inc, curr_sads); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * pic_stride)); + __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(ref_data + y * ref_stride + leftoff)); + + __m128i b_epol = _mm_shuffle_epi8(b, epol_mask); + __m128i curr_sads = _mm_sad_epu8 (a, b_epol); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); 
+ + return _mm_cvtsi128_si32(sad); +} + +static uint32_t hor_sad_sse41_w8(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t pic_stride, uint32_t ref_stride, + uint32_t left, uint32_t right) +{ + // right is the number of overhanging pixels in the vector, so it has to be + // handled this way to produce the index of last valid (border) pixel + const int32_t right_border_idx = 7 - right; + const int32_t border_idx = left ? left : right_border_idx; + + const __m128i ns = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + + // Quadword (ie. line) base indexes, ie. the edges the lines read will be + // clamped towards; higher qword (lower line) bytes tend towards 8 and lower + // qword (higher line) bytes towards 0 + const __m128i qwbaseids = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, + 8, 8, 8, 8, 8, 8, 8, 8); + + // Dirty hack alert! If right == block_width (ie. the entire vector is + // outside the frame), move the block offset one pixel to the left (so + // that the leftmost pixel in vector is actually the valid border pixel + // from which we want to extrapolate), and use an epol mask that will + // simply stretch the pixel all over the vector. + // + // To avoid a branch here: + // The mask will be -1 (0xffffffff) for border_idx -1 and 0 for >= 0 + const int32_t border_idx_negative = border_idx >> 31; + const int32_t leftoff = border_idx_negative | left; + + __m128i right_border_idxs = _mm_set1_epi8((int8_t)right_border_idx); + __m128i left_128 = _mm_set1_epi8((int8_t)left); + + right_border_idxs = _mm_add_epi8 (right_border_idxs, qwbaseids); + + // If we're straddling the left border, right_border_idx is 7 and the first + // operation does nothing. If right border, left is 0 and the second + // operation does nothing. + __m128i mask_right = _mm_min_epi8 (ns, right_border_idxs); + __m128i mask1 = _mm_sub_epi8 (mask_right, left_128); + + // If right == 8 (we're completely outside the frame), right_border_idx is + // -1 and so is mask1. Clamp negative values to qwbaseid and as discussed + // earlier, adjust the load offset instead to load the "-1'st" pixels and + // using qwbaseids as the shuffle mask, broadcast it all over the rows. 
+ const __m128i epol_mask = _mm_max_epi8(mask1, qwbaseids); + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + for (y = 0; y < height_fourline_groups; y += 4) { + __m128d a_d = _mm_setzero_pd(); + __m128d b_d = _mm_setzero_pd(); + __m128d c_d = _mm_setzero_pd(); + __m128d d_d = _mm_setzero_pd(); + + a_d = _mm_loadl_pd(a_d, (const double *)(pic_data + (y + 0) * pic_stride)); + b_d = _mm_loadl_pd(b_d, (const double *)(ref_data + (y + 0) * ref_stride + leftoff)); + a_d = _mm_loadh_pd(a_d, (const double *)(pic_data + (y + 1) * pic_stride)); + b_d = _mm_loadh_pd(b_d, (const double *)(ref_data + (y + 1) * ref_stride + leftoff)); + + c_d = _mm_loadl_pd(c_d, (const double *)(pic_data + (y + 2) * pic_stride)); + d_d = _mm_loadl_pd(d_d, (const double *)(ref_data + (y + 2) * ref_stride + leftoff)); + c_d = _mm_loadh_pd(c_d, (const double *)(pic_data + (y + 3) * pic_stride)); + d_d = _mm_loadh_pd(d_d, (const double *)(ref_data + (y + 3) * ref_stride + leftoff)); + + __m128i a = _mm_castpd_si128(a_d); + __m128i b = _mm_castpd_si128(b_d); + __m128i c = _mm_castpd_si128(c_d); + __m128i d = _mm_castpd_si128(d_d); + + __m128i b_epol = _mm_shuffle_epi8(b, epol_mask); + __m128i d_epol = _mm_shuffle_epi8(d, epol_mask); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b_epol); + __m128i curr_sads_cd = _mm_sad_epu8(c, d_epol); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_loadl_epi64((__m128i *)(pic_data + y * pic_stride)); + __m128i b = _mm_loadl_epi64((__m128i *)(ref_data + y * ref_stride + leftoff)); + + __m128i b_epol = _mm_shuffle_epi8(b, epol_mask); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b_epol); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + return _mm_cvtsi128_si32(sad); +} + +/* + * left and right measure how many pixels of one horizontal scanline will be + * outside either the left or the right screen border. For blocks straddling + * the left border, read the scanlines starting from the left border instead, + * and use the extrapolation mask to essentially move the pixels right while + * copying the left border pixel to the vector positions that logically point + * outside of the buffer. + * + * For blocks straddling the right border, just read over the right border, + * and extrapolate all pixels beyond the border idx to copy the value of the + * border pixel. An exception is right == width (leftmost reference pixel is + * one place right from the right border, it's ugly because the pixel to + * extrapolate from is located at relative X offset -1), abuse the left border + * aligning functionality instead to actually read starting from the valid + * border pixel, and use a suitable mask to fill all the other pixels with + * that value. + */ +static uint32_t hor_sad_sse41_w16(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t pic_stride, uint32_t ref_stride, + const uint32_t left, const uint32_t right) +{ + // right is the number of overhanging pixels in the vector, so it has to be + // handled this way to produce the index of last valid (border) pixel + const int32_t right_border_idx = 15 - right; + const int32_t border_idx = left ? 
left : right_border_idx; + + const __m128i ns = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + const __m128i zero = _mm_setzero_si128(); + + // Dirty hack alert! If right == block_width (ie. the entire vector is + // outside the frame), move the block offset one pixel to the left (so + // that the leftmost pixel in vector is actually the valid border pixel + // from which we want to extrapolate), and use an epol mask that will + // simply stretch the pixel all over the vector. + // + // To avoid a branch here: + // The mask will be -1 (0xffffffff) for border_idx -1 and 0 for >= 0 + const int32_t border_idx_negative = border_idx >> 31; + const int32_t leftoff = border_idx_negative | left; + + __m128i right_border_idxs = _mm_set1_epi8((int8_t)right_border_idx); + __m128i left_128 = _mm_set1_epi8((int8_t)left); + + // If we're straddling the left border, right_border_idx is 15 and the first + // operation does nothing. If right border, left is 0 and the second + // operation does nothing. + __m128i mask_right = _mm_min_epi8 (ns, right_border_idxs); + __m128i mask1 = _mm_sub_epi8 (mask_right, left_128); + + // If right == 16 (we're completely outside the frame), right_border_idx is + // -1 and so is mask1. Clamp negative values to zero and as discussed + // earlier, adjust the load offset instead to load the "-1'st" pixel and + // using an all-zero shuffle mask, broadcast it all over the vector. + const __m128i epol_mask = _mm_max_epi8(mask1, zero); + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride)); + __m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + leftoff)); + __m128i c = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * pic_stride)); + __m128i d = _mm_loadu_si128((__m128i *)(ref_data + (y + 1) * ref_stride + leftoff)); + __m128i e = _mm_loadu_si128((__m128i *)(pic_data + (y + 2) * pic_stride)); + __m128i f = _mm_loadu_si128((__m128i *)(ref_data + (y + 2) * ref_stride + leftoff)); + __m128i g = _mm_loadu_si128((__m128i *)(pic_data + (y + 3) * pic_stride)); + __m128i h = _mm_loadu_si128((__m128i *)(ref_data + (y + 3) * ref_stride + leftoff)); + + __m128i b_epol = _mm_shuffle_epi8(b, epol_mask); + __m128i d_epol = _mm_shuffle_epi8(d, epol_mask); + __m128i f_epol = _mm_shuffle_epi8(f, epol_mask); + __m128i h_epol = _mm_shuffle_epi8(h, epol_mask); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b_epol); + __m128i curr_sads_cd = _mm_sad_epu8(c, d_epol); + __m128i curr_sads_ef = _mm_sad_epu8(e, f_epol); + __m128i curr_sads_gh = _mm_sad_epu8(g, h_epol); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride)); + __m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + leftoff)); + __m128i b_epol = _mm_shuffle_epi8(b, epol_mask); + __m128i curr_sads = _mm_sad_epu8(a, b_epol); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + return _mm_cvtsi128_si32(sad); +} + +static INLINE uint32_t 
hor_sad_sse41_arbitrary(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t pic_stride, + uint32_t ref_stride, uint32_t left, uint32_t right) +{ + __m128i sse_inc = _mm_setzero_si128(); + + const size_t vec_width = 16; + const size_t vecwid_bitmask = 15; + const size_t vec_width_log2 = 4; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + const __m128i rights = _mm_set1_epi8((uint8_t)right); + const __m128i blk_widths = _mm_set1_epi8((uint8_t)width); + const __m128i vec_widths = _mm_set1_epi8((uint8_t)vec_width); + const __m128i nslo = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + uint32_t outside_vecs, inside_vecs, left_offset, is_left_bm; + int32_t outside_width, inside_width, border_off, invec_lstart, + invec_lend, invec_linc; + if (left) { + outside_vecs = left >> vec_width_log2; + inside_vecs = (( width + vecwid_bitmask) >> vec_width_log2) - outside_vecs; + outside_width = outside_vecs * vec_width; + inside_width = inside_vecs * vec_width; + left_offset = left; + border_off = left; + invec_lstart = 0; + invec_lend = inside_vecs; + invec_linc = 1; + is_left_bm = -1; + } else { + inside_vecs = ((width - right) + vecwid_bitmask) >> vec_width_log2; + outside_vecs = (( width + vecwid_bitmask) >> vec_width_log2) - inside_vecs; + outside_width = outside_vecs * vec_width; + inside_width = inside_vecs * vec_width; + left_offset = right - width; + border_off = width - 1 - right; + invec_lstart = inside_vecs - 1; + invec_lend = -1; + invec_linc = -1; + is_left_bm = 0; + } + left_offset &= vecwid_bitmask; + + const __m128i left_offsets = _mm_set1_epi8 ((uint8_t)left_offset); + const __m128i is_left = _mm_cmpeq_epi8(rights, _mm_setzero_si128()); + const __m128i vw_for_left = _mm_and_si128 (is_left, vec_widths); + + // -x == (x ^ 0xff) + 1 = (x ^ 0xff) - 0xff. Also x == (x ^ 0x00) - 0x00. + // in other words, calculate inverse of left_offsets if is_left is true. 
+ const __m128i offs_neg = _mm_xor_si128 (left_offsets, is_left); + const __m128i offs_for_sm1 = _mm_sub_epi8 (offs_neg, is_left); + + const __m128i ns_for_sm1 = _mm_or_si128 (vw_for_left, nslo); + const __m128i shufmask1 = _mm_add_epi8 (ns_for_sm1, offs_for_sm1); + + const __m128i mo2bmask_l = _mm_cmpgt_epi8(left_offsets, nslo); + const __m128i mo2bimask_l = _mm_cmpeq_epi8(mo2bmask_l, _mm_setzero_si128()); + const __m128i mo2bimask_r = _mm_cmpgt_epi8(vec_widths, shufmask1); + const __m128i move_old_to_b_imask = _mm_blendv_epi8(mo2bimask_r, mo2bimask_l, is_left); + + const int32_t outvec_offset = (~is_left_bm) & inside_width; + int32_t x, y; + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i borderpx_vec_b = _mm_set1_epi8(ref_data[(int32_t)((y + 0) * ref_stride + border_off)]); + __m128i borderpx_vec_d = _mm_set1_epi8(ref_data[(int32_t)((y + 1) * ref_stride + border_off)]); + __m128i borderpx_vec_f = _mm_set1_epi8(ref_data[(int32_t)((y + 2) * ref_stride + border_off)]); + __m128i borderpx_vec_h = _mm_set1_epi8(ref_data[(int32_t)((y + 3) * ref_stride + border_off)]); + + for (x = 0; x < outside_vecs; x++) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + outvec_offset)); + __m128i c = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 1) * pic_stride + outvec_offset)); + __m128i e = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 2) * pic_stride + outvec_offset)); + __m128i g = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 3) * pic_stride + outvec_offset)); + + __m128i startoffs = _mm_set1_epi8 ((x + inside_vecs) << vec_width_log2); + __m128i ns = _mm_add_epi8 (startoffs, nslo); + + // Unread imask is (is_left NOR unrd_imask_for_right), do the maths etc + __m128i unrd_imask = _mm_cmpgt_epi8 (blk_widths, ns); + unrd_imask = _mm_or_si128 (unrd_imask, is_left); + __m128i unrd_mask = _mm_cmpeq_epi8 (unrd_imask, _mm_setzero_si128()); + + __m128i b_unread = _mm_blendv_epi8(borderpx_vec_b, a, unrd_mask); + __m128i d_unread = _mm_blendv_epi8(borderpx_vec_d, c, unrd_mask); + __m128i f_unread = _mm_blendv_epi8(borderpx_vec_f, e, unrd_mask); + __m128i h_unread = _mm_blendv_epi8(borderpx_vec_h, g, unrd_mask); + + __m128i sad_ab = _mm_sad_epu8 (a, b_unread); + __m128i sad_cd = _mm_sad_epu8 (c, d_unread); + __m128i sad_ef = _mm_sad_epu8 (e, f_unread); + __m128i sad_gh = _mm_sad_epu8 (g, h_unread); + + sse_inc = _mm_add_epi64(sse_inc, sad_ab); + sse_inc = _mm_add_epi64(sse_inc, sad_cd); + sse_inc = _mm_add_epi64(sse_inc, sad_ef); + sse_inc = _mm_add_epi64(sse_inc, sad_gh); + } + int32_t a_off = outside_width & is_left_bm; + int32_t leftoff_with_sign_neg = (left_offset ^ is_left_bm) - is_left_bm; + + __m128i old_b = borderpx_vec_b; + __m128i old_d = borderpx_vec_d; + __m128i old_f = borderpx_vec_f; + __m128i old_h = borderpx_vec_h; + + for (x = invec_lstart; x != invec_lend; x += invec_linc) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + a_off)); + __m128i c = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 1) * pic_stride + a_off)); + __m128i e = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 2) * pic_stride + a_off)); + __m128i g = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 3) * pic_stride + a_off)); + __m128i b = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 0) * ref_stride + a_off - leftoff_with_sign_neg)); + __m128i d = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 1) * ref_stride + a_off - 
leftoff_with_sign_neg)); + __m128i f = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 2) * ref_stride + a_off - leftoff_with_sign_neg)); + __m128i h = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 3) * ref_stride + a_off - leftoff_with_sign_neg)); + + __m128i b_shifted = _mm_shuffle_epi8(b, shufmask1); + __m128i d_shifted = _mm_shuffle_epi8(d, shufmask1); + __m128i f_shifted = _mm_shuffle_epi8(f, shufmask1); + __m128i h_shifted = _mm_shuffle_epi8(h, shufmask1); + + __m128i b_with_old = _mm_blendv_epi8 (old_b, b_shifted, move_old_to_b_imask); + __m128i d_with_old = _mm_blendv_epi8 (old_d, d_shifted, move_old_to_b_imask); + __m128i f_with_old = _mm_blendv_epi8 (old_f, f_shifted, move_old_to_b_imask); + __m128i h_with_old = _mm_blendv_epi8 (old_h, h_shifted, move_old_to_b_imask); + + uint8_t startoff = (x << vec_width_log2) + a_off; + __m128i startoffs = _mm_set1_epi8 (startoff); + __m128i curr_ns = _mm_add_epi8 (startoffs, nslo); + __m128i unrd_imask = _mm_cmpgt_epi8 (blk_widths, curr_ns); + __m128i unrd_mask = _mm_cmpeq_epi8 (unrd_imask, _mm_setzero_si128()); + + __m128i b_unread = _mm_blendv_epi8 (b_with_old, a, unrd_mask); + __m128i d_unread = _mm_blendv_epi8 (d_with_old, c, unrd_mask); + __m128i f_unread = _mm_blendv_epi8 (f_with_old, e, unrd_mask); + __m128i h_unread = _mm_blendv_epi8 (h_with_old, g, unrd_mask); + + old_b = b_shifted; + old_d = d_shifted; + old_f = f_shifted; + old_h = h_shifted; + + __m128i sad_ab = _mm_sad_epu8(a, b_unread); + __m128i sad_cd = _mm_sad_epu8(c, d_unread); + __m128i sad_ef = _mm_sad_epu8(e, f_unread); + __m128i sad_gh = _mm_sad_epu8(g, h_unread); + + sse_inc = _mm_add_epi64(sse_inc, sad_ab); + sse_inc = _mm_add_epi64(sse_inc, sad_cd); + sse_inc = _mm_add_epi64(sse_inc, sad_ef); + sse_inc = _mm_add_epi64(sse_inc, sad_gh); + } + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i borderpx_vec = _mm_set1_epi8(ref_data[(int32_t)((y + 0) * ref_stride + border_off)]); + for (x = 0; x < outside_vecs; x++) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + outvec_offset)); + + __m128i startoffs = _mm_set1_epi8 ((x + inside_vecs) << vec_width_log2); + __m128i ns = _mm_add_epi8 (startoffs, nslo); + + // Unread imask is (is_left NOR unrd_imask_for_right), do the maths etc + __m128i unrd_imask = _mm_cmpgt_epi8 (blk_widths, ns); + unrd_imask = _mm_or_si128 (unrd_imask, is_left); + __m128i unrd_mask = _mm_cmpeq_epi8 (unrd_imask, _mm_setzero_si128()); + __m128i b_unread = _mm_blendv_epi8(borderpx_vec, a, unrd_mask); + + __m128i sad_ab = _mm_sad_epu8 (a, b_unread); + sse_inc = _mm_add_epi64(sse_inc, sad_ab); + } + int32_t a_off = outside_width & is_left_bm; + int32_t leftoff_with_sign_neg = (left_offset ^ is_left_bm) - is_left_bm; + + __m128i old_b = borderpx_vec; + for (x = invec_lstart; x != invec_lend; x += invec_linc) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + a_off)); + __m128i b = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 0) * ref_stride + a_off - leftoff_with_sign_neg)); + + __m128i b_shifted = _mm_shuffle_epi8(b, shufmask1); + __m128i b_with_old = _mm_blendv_epi8 (old_b, b_shifted, move_old_to_b_imask); + + uint8_t startoff = (x << vec_width_log2) + a_off; + __m128i startoffs = _mm_set1_epi8 (startoff); + __m128i curr_ns = _mm_add_epi8 (startoffs, nslo); + __m128i unrd_imask = _mm_cmpgt_epi8 (blk_widths, curr_ns); + __m128i unrd_mask = _mm_cmpeq_epi8 (unrd_imask, _mm_setzero_si128()); + __m128i b_unread = 
_mm_blendv_epi8 (b_with_old, a, unrd_mask); + + old_b = b_shifted; + + __m128i sad_ab = _mm_sad_epu8(a, b_unread); + sse_inc = _mm_add_epi64(sse_inc, sad_ab); + } + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + return _mm_cvtsi128_si32(sad); +} + +#endif
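The hor_sad_sse41_* kernels above vectorize the border extrapolation described in the comment block: any reference pixel that falls outside the frame is replaced by the nearest valid border pixel on the same scanline before the absolute differences are summed. As a point of reference, the scalar computation they correspond to looks roughly like the sketch below (plain C with uint8_t pixels instead of kvz_pixel; it assumes the block is not entirely outside the frame, i.e. left < width and right < width, so it does not model the "dirty hack" full-overhang path):

#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of the horizontal-border SAD: clamp the reference column to
 * the last valid pixel on each side, then accumulate absolute differences. */
static uint32_t hor_sad_scalar(const uint8_t *pic_data, const uint8_t *ref_data,
                               int32_t width, int32_t height,
                               uint32_t pic_stride, uint32_t ref_stride,
                               int32_t left, int32_t right)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < height; y++) {
    for (int32_t x = 0; x < width; x++) {
      int32_t rx = x;
      if (rx < left)              rx = left;              /* left overhang  */
      if (rx > width - 1 - right) rx = width - 1 - right; /* right overhang */
      sad += abs((int)pic_data[y * pic_stride + x] -
                 (int)ref_data[y * ref_stride + rx]);
    }
  }
  return sad;
}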
kvazaar-1.2.0.tar.gz/src/strategies/strategies-dct.c -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-dct.c
Changed
@@ -55,22 +55,23 @@ /** -* \brief Get a function that calculates SAD for NxN block. -* -* \param n Width of the region for which SAD is calculated. -* -* \returns Pointer to cost_16bit_nxn_func. -*/ -dct_func * kvz_get_dct_func(int8_t width, int32_t mode) + * \brief Get a function that performs the transform for a block. + * + * \param width Width of the region + * \param color Color plane + * \param type Prediction type + * + * \returns Pointer to the function. + */ +dct_func * kvz_get_dct_func(int8_t width, color_t color, cu_type_t type) { switch (width) { case 4: - switch (mode){ - case 65535: - return kvz_dct_4x4; - default: + if (color == COLOR_Y && type == CU_INTRA) { return kvz_fast_forward_dst_4x4; - } + } else { + return kvz_dct_4x4; + } case 8: return kvz_dct_8x8; case 16: @@ -83,21 +84,22 @@ } /** -* \brief Get a function that calculates SAD for NxN block. -* -* \param n Width of the region for which SAD is calculated. -* -* \returns Pointer to cost_16bit_nxn_func. -*/ -dct_func * kvz_get_idct_func(int8_t width, int32_t mode) + * \brief Get a function that performs the inverse transform for a block. + * + * \param width Width of the region + * \param color Color plane + * \param type Prediction type + * + * \returns Pointer to the function. + */ +dct_func * kvz_get_idct_func(int8_t width, color_t color, cu_type_t type) { switch (width) { case 4: - switch (mode){ - case 65535: - return kvz_idct_4x4; - default: + if (color == COLOR_Y && type == CU_INTRA) { return kvz_fast_inverse_dst_4x4; + } else { + return kvz_idct_4x4; } case 8: return kvz_idct_8x8;
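The change above drops the old magic mode value (65535) in favour of explicit color and prediction-type arguments, so the 4x4 DST is selected only for intra luma blocks and everything else gets the plain DCT of matching width. A usage sketch under that reading (COLOR_U and CU_INTER are assumed to come from kvazaar's cu.h, which strategies-dct.h now includes):

#include "strategies/strategies-dct.h"

/* Illustrative only: which transform the selector is expected to return
 * for a few block types under the new signature. */
static void pick_transform_examples(void)
{
  dct_func *dst4 = kvz_get_dct_func(4, COLOR_Y, CU_INTRA); /* kvz_fast_forward_dst_4x4 */
  dct_func *dct4 = kvz_get_dct_func(4, COLOR_U, CU_INTRA); /* kvz_dct_4x4 */
  dct_func *dct8 = kvz_get_dct_func(8, COLOR_Y, CU_INTER); /* kvz_dct_8x8 */
  (void)dst4; (void)dct4; (void)dct8;
}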
kvazaar-1.2.0.tar.gz/src/strategies/strategies-dct.h -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-dct.h
Changed
@@ -27,7 +27,7 @@ */ #include "global.h" // IWYU pragma: keep - +#include "cu.h" typedef unsigned (dct_func)(int8_t bitdepth, const int16_t *input, int16_t *output); @@ -49,8 +49,9 @@ int kvz_strategy_register_dct(void* opaque, uint8_t bitdepth); -dct_func * kvz_get_dct_func(int8_t width, int32_t mode); -dct_func * kvz_get_idct_func(int8_t width, int32_t mode); +dct_func * kvz_get_dct_func(int8_t width, color_t color, cu_type_t type); +dct_func * kvz_get_idct_func(int8_t width, color_t color, cu_type_t type); + #define STRATEGIES_DCT_EXPORTS \
kvazaar-1.3.0.tar.gz/src/strategies/strategies-encode.c
Added
@@ -0,0 +1,41 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "strategies/strategies-encode.h" + +#include "strategies/avx2/encode_coding_tree-avx2.h" +#include "strategies/generic/encode_coding_tree-generic.h" +#include "strategyselector.h" + + +// Define function pointers. +encode_coeff_nxn_func *kvz_encode_coeff_nxn; + + +int kvz_strategy_register_encode(void* opaque, uint8_t bitdepth) { + bool success = true; + + success &= kvz_strategy_register_encode_generic(opaque, bitdepth); + + if (kvz_g_hardware_flags.intel_flags.avx2) { + success &= kvz_strategy_register_encode_avx2(opaque, bitdepth); + } + return success; +}
kvazaar-1.3.0.tar.gz/src/strategies/strategies-encode.h
Added
@@ -0,0 +1,56 @@ +#ifndef STRATEGIES_ENCODE_H_ +#define STRATEGIES_ENCODE_H_ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Interface for quantization functions. + */ + +#include "cu.h" +#include "encoderstate.h" +#include "global.h" // IWYU pragma: keep +#include "kvazaar.h" +#include "tables.h" + + +// Declare function pointers. +typedef unsigned (encode_coeff_nxn_func)(encoder_state_t * const state, + cabac_data_t * const cabac, + const coeff_t *coeff, + uint8_t width, + uint8_t type, + int8_t scan_mode, + int8_t tr_skip); + +// Declare function pointers. +extern encode_coeff_nxn_func *kvz_encode_coeff_nxn; + +int kvz_strategy_register_encode(void* opaque, uint8_t bitdepth); + + +#define STRATEGIES_ENCODE_EXPORTS \ + {"encode_coeff_nxn", (void**) &kvz_encode_coeff_nxn}, \ + + + +#endif //STRATEGIES_ENCODE_H_
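The new strategies-encode.c/h pair follows the same strategy pattern as the rest of this directory: a global function pointer (kvz_encode_coeff_nxn), a *_EXPORTS table of name/pointer pairs, and a register function that installs the generic implementation and then lets the AVX2 version override it when the CPU flag is set. A minimal standalone sketch of that dispatch idea is below; the names are hypothetical, and the real code routes registration through strategyselector.h and the opaque handle rather than a local table:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

typedef int (demo_func)(int);

static int demo_generic(int x) { return x + 1; }
static int demo_avx2(int x)    { return x + 2; }  /* pretend this one is vectorized */

static demo_func *demo_ptr;  /* analogous to kvz_encode_coeff_nxn */

typedef struct { const char *name; void **fptr; } export_t;
static export_t exports[] = { { "demo", (void **)&demo_ptr } };

static bool register_impl(const char *name, void *impl)
{
  for (size_t i = 0; i < sizeof(exports) / sizeof(exports[0]); i++) {
    if (strcmp(exports[i].name, name) == 0) { *exports[i].fptr = impl; return true; }
  }
  return false;
}

int main(void)
{
  bool have_avx2 = false;                       /* stand-in for kvz_g_hardware_flags */
  register_impl("demo", (void *)demo_generic);  /* generic version always registered  */
  if (have_avx2) register_impl("demo", (void *)demo_avx2);
  printf("%d\n", demo_ptr(41));                 /* dispatches through the selected strategy */
  return 0;
}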
kvazaar-1.2.0.tar.gz/src/strategies/strategies-ipol.c -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-ipol.c
Changed
@@ -26,10 +26,10 @@ // Define function pointers. -ipol_func *kvz_filter_inter_quarterpel_luma; -ipol_func *kvz_filter_inter_halfpel_chroma; -ipol_func *kvz_filter_inter_octpel_chroma; -ipol_frac_blocks_func *kvz_filter_frac_blocks_luma; +ipol_blocks_func * kvz_filter_hpel_blocks_hor_ver_luma; +ipol_blocks_func * kvz_filter_hpel_blocks_diag_luma; +ipol_blocks_func * kvz_filter_qpel_blocks_hor_ver_luma; +ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma; epol_func *kvz_get_extended_block; kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma; kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma;
kvazaar-1.2.0.tar.gz/src/strategies/strategies-ipol.h -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-ipol.h
Changed
@@ -34,11 +34,9 @@ typedef struct { kvz_pixel *buffer; kvz_pixel *orig_topleft; unsigned stride; unsigned malloc_used; } kvz_extended_block; -typedef unsigned(ipol_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, - int16_t dst_stride, int8_t hor_flag, int8_t ver_flag); - -typedef unsigned(ipol_frac_blocks_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height, - frac_search_block filtered_out[15], int8_t fme_level); +typedef void(ipol_blocks_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t sample_off_x, int8_t sample_off_y); typedef unsigned(epol_func)(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, int filter_size, int width, int height, kvz_extended_block *out); @@ -50,10 +48,10 @@ typedef void(kvz_sample_14bit_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); // Declare function pointers. -extern ipol_func * kvz_filter_inter_quarterpel_luma; -extern ipol_func * kvz_filter_inter_halfpel_chroma; -extern ipol_func * kvz_filter_inter_octpel_chroma; -extern ipol_frac_blocks_func *kvz_filter_frac_blocks_luma; +extern ipol_blocks_func * kvz_filter_hpel_blocks_hor_ver_luma; +extern ipol_blocks_func * kvz_filter_hpel_blocks_diag_luma; +extern ipol_blocks_func * kvz_filter_qpel_blocks_hor_ver_luma; +extern ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma; extern epol_func * kvz_get_extended_block; extern kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma; extern kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma; @@ -65,10 +63,10 @@ #define STRATEGIES_IPOL_EXPORTS \ - {"filter_inter_quarterpel_luma", (void**) &kvz_filter_inter_quarterpel_luma}, \ - {"filter_inter_halfpel_chroma", (void**) &kvz_filter_inter_halfpel_chroma}, \ - {"filter_inter_octpel_chroma", (void**) &kvz_filter_inter_octpel_chroma}, \ - {"filter_frac_blocks_luma", (void**) &kvz_filter_frac_blocks_luma}, \ + {"filter_hpel_blocks_hor_ver_luma", (void**) &kvz_filter_hpel_blocks_hor_ver_luma}, \ + {"filter_hpel_blocks_diag_luma", (void**) &kvz_filter_hpel_blocks_diag_luma}, \ + {"filter_qpel_blocks_hor_ver_luma", (void**) &kvz_filter_qpel_blocks_hor_ver_luma}, \ + {"filter_qpel_blocks_diag_luma", (void**) &kvz_filter_qpel_blocks_diag_luma}, \ {"sample_quarterpel_luma", (void**) &kvz_sample_quarterpel_luma}, \ {"sample_octpel_chroma", (void**) &kvz_sample_octpel_chroma}, \ {"sample_14bit_quarterpel_luma", (void**) &kvz_sample_14bit_quarterpel_luma}, \
kvazaar-1.2.0.tar.gz/src/strategies/strategies-picture.c -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-picture.c
Changed
@@ -61,6 +61,12 @@ pixels_calc_ssd_func * kvz_pixels_calc_ssd = 0; +inter_recon_bipred_func * kvz_inter_recon_bipred_blend = 0; + +get_optimized_sad_func *kvz_get_optimized_sad = 0; +ver_sad_func *kvz_ver_sad = 0; +hor_sad_func *kvz_hor_sad = 0; + int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth) { bool success = true;
kvazaar-1.2.0.tar.gz/src/strategies/strategies-picture.h -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-picture.h
Changed
@@ -28,11 +28,12 @@ #include "global.h" // IWYU pragma: keep #include "kvazaar.h" +#include "encoderstate.h" +#include "strategies/optimized_sad_func_ptr_t.h" typedef kvz_pixel (*pred_buffer)[32 * 32]; - // Function macro for defining hadamard calculating functions // for fixed size blocks. They calculate hadamard for integer // multiples of 8x8 with the 8x8 hadamard function. @@ -108,9 +109,33 @@ const kvz_pixel *block2, int stride2 ); typedef void (cost_pixel_nxn_multi_func)(const pred_buffer preds, const kvz_pixel *orig, unsigned num_modes, unsigned *costs_out); -typedef void (cost_pixel_any_size_multi_func)(int width, int height, const kvz_pixel **preds, const int *strides, const kvz_pixel *orig, const int orig_stride, unsigned num_modes, unsigned *costs_out, int8_t *valid); +typedef void (cost_pixel_any_size_multi_func)(int width, int height, const kvz_pixel **preds, const int stride, const kvz_pixel *orig, const int orig_stride, unsigned num_modes, unsigned *costs_out, int8_t *valid); typedef unsigned (pixels_calc_ssd_func)(const kvz_pixel *const ref, const kvz_pixel *const rec, const int ref_stride, const int rec_stride, const int width); +typedef optimized_sad_func_ptr_t (get_optimized_sad_func)(int32_t); +typedef uint32_t (ver_sad_func)(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t block_width, int32_t block_height, + uint32_t pic_stride); +typedef uint32_t (hor_sad_func)(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t pic_stride, + uint32_t ref_stride, uint32_t left, uint32_t right); + +typedef void (inter_recon_bipred_func)(const int hi_prec_luma_rec0, + const int hi_prec_luma_rec1, + const int hi_prec_chroma_rec0, + const int hi_prec_chroma_rec1, + int height, + int width, + int ypos, + int xpos, + const hi_prec_buf_t*high_precision_rec0, + const hi_prec_buf_t*high_precision_rec1, + lcu_t* lcu, + kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH], + kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C], + kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C]); + + // Declare function pointers. extern reg_sad_func * kvz_reg_sad; @@ -144,6 +169,12 @@ extern pixels_calc_ssd_func *kvz_pixels_calc_ssd; +extern inter_recon_bipred_func * kvz_inter_recon_bipred_blend; + +extern get_optimized_sad_func *kvz_get_optimized_sad; +extern ver_sad_func *kvz_ver_sad; +extern hor_sad_func *kvz_hor_sad; + int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth); cost_pixel_nxn_func * kvz_pixels_get_satd_func(unsigned n); cost_pixel_nxn_func * kvz_pixels_get_sad_func(unsigned n); @@ -175,6 +206,10 @@ {"satd_64x64_dual", (void**) &kvz_satd_64x64_dual}, \ {"satd_any_size_quad", (void**) &kvz_satd_any_size_quad}, \ {"pixels_calc_ssd", (void**) &kvz_pixels_calc_ssd}, \ + {"inter_recon_bipred", (void**) &kvz_inter_recon_bipred_blend}, \ + {"get_optimized_sad", (void**) &kvz_get_optimized_sad}, \ + {"ver_sad", (void**) &kvz_ver_sad}, \ + {"hor_sad", (void**) &kvz_hor_sad}, \
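Among the new pointers, ver_sad and hor_sad appear geared towards reference blocks that overhang the frame border, judging by the left/right parameters and the border extrapolation in the kernels above. The ver_sad_* kernels earlier in this revision compare every row of the picture block against one and the same reference scanline; a scalar sketch of that computation (plain C with uint8_t pixels, not kvazaar code):

#include <stdint.h>
#include <stdlib.h>

/* Scalar model of ver_sad: each picture row is matched against the single
 * reference row, i.e. the border row extrapolated over the block height. */
static uint32_t ver_sad_scalar(const uint8_t *pic_data, const uint8_t *ref_data,
                               int32_t width, int32_t height, uint32_t pic_stride)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < height; y++) {
    for (int32_t x = 0; x < width; x++) {
      sad += abs((int)pic_data[y * pic_stride + x] - (int)ref_data[x]);
    }
  }
  return sad;
}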
kvazaar-1.2.0.tar.gz/src/strategies/strategies-quant.c -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-quant.c
Changed
@@ -30,6 +30,7 @@ quant_residual_func *kvz_quantize_residual; dequant_func *kvz_dequant; coeff_abs_sum_func *kvz_coeff_abs_sum; +fast_coeff_cost_func *kvz_fast_coeff_cost; int kvz_strategy_register_quant(void* opaque, uint8_t bitdepth) {
kvazaar-1.2.0.tar.gz/src/strategies/strategies-quant.h -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-quant.h
Changed
@@ -44,6 +44,7 @@ kvz_pixel *rec_out, coeff_t *coeff_out); typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height, int8_t type, int8_t block_type); +typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, int32_t qp); typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length); @@ -52,6 +53,7 @@ extern quant_residual_func * kvz_quantize_residual; extern dequant_func *kvz_dequant; extern coeff_abs_sum_func *kvz_coeff_abs_sum; +extern fast_coeff_cost_func *kvz_fast_coeff_cost; int kvz_strategy_register_quant(void* opaque, uint8_t bitdepth); @@ -61,6 +63,7 @@ {"quantize_residual", (void**) &kvz_quantize_residual}, \ {"dequant", (void**) &kvz_dequant}, \ {"coeff_abs_sum", (void**) &kvz_coeff_abs_sum}, \ + {"fast_coeff_cost", (void**) &kvz_fast_coeff_cost}, \
kvazaar-1.3.0.tar.gz/src/strategies/x86_asm/x86inc.asm
Added
@@ -0,0 +1,1466 @@ +;***************************************************************************** +;* x86inc.asm: x264asm abstraction layer +;***************************************************************************** +;* Copyright (C) 2005-2014 x264 project +;* +;* Authors: Loren Merritt <lorenm@u.washington.edu> +;* Anton Mitrofanov <BugMaster@narod.ru> +;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Henrik Gramner <henrik@gramner.com> +;* +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. +;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;***************************************************************************** + +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. Send patches or ideas +; to x264-devel@videolan.org . + +%ifndef private_prefix + %define private_prefix kvz +%endif + +%ifndef public_prefix + %define public_prefix private_prefix +%endif + +%define WIN64 0 +%define UNIX64 0 +%if ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,x64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +%macro SECTION_RODATA 0-1 16 + SECTION .rodata align=%1 +%endmacro + +%macro SECTION_TEXT 0-1 16 + SECTION .text align=%1 +%endmacro + +%if WIN64 + %define PIC +%elif ARCH_X86_64 == 0 +; x86_32 doesn't require PIC. +; Some distros prefer shared objects to be PIC, but nothing breaks if +; the code contains a few textrels, so we'll skip that complexity. + %undef PIC +%endif +%ifdef PIC + default rel +%endif + +%macro CPUNOP 1 + %ifdef __YASM_MAJOR__ + CPU %1 + %endif +%endmacro + +; Always use long nops (reduces 0x90 spam in disassembly on x86_32) +CPUNOP amdnop + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. 
pushes callee-saved xmm regs if needed. +; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x, +; MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes), +; and an extra register will be allocated to hold the original stack +; pointer (to not invalidate r0m etc.). To prevent the use of an extra +; register as stack pointer, request a negative stack size. +; %4+/%5+ = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,0, dst, src, tmp +; declares a function (foo), taking two args (dst and src) and one local variable (tmp) + +; TODO Some functions can use some args directly from the stack. If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Use this instead of RET if it's a branch target. + +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif ARCH_X86_64 ; memory + %define r%1m [rstk + stack_offset + %3] + %define r%1mp qword r %+ %1 %+ m + %else + %define r%1m [rstk + stack_offset + %3] + %define r%1mp dword r %+ %1 %+ m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 +%if ARCH_X86_64 == 0 + %define r%1 e%1 +%endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +%macro PUSH 1 + push %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset+gprsize + %endif +%endmacro + +%macro POP 1 + pop %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset-gprsize + %endif +%endmacro + +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif 
+%endmacro + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assert failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, h + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m + CAT_UNDEF arg_name %+ %%i, mp + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %xdefine %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine + %assign %%i 0 + %rep %0 + %xdefine %1q r %+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1h r %+ %%i %+ h + %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m + %xdefine %1mp r %+ %%i %+ mp + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + %rotate 1 + %endrep + %xdefine stack_offset %%stack_offset + %assign n_arg_names %0 +%endmacro + +%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) + %ifnum %1 + %if %1 != 0 + %assign %%stack_alignment ((mmsize + 15) & ~15) + %assign stack_size %1 + %if stack_size < 0 + %assign stack_size -stack_size + %endif + %assign stack_size_padded stack_size + %if WIN64 + %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space + %if mmsize != 8 + %assign xmm_regs_used %2 + %if xmm_regs_used > 8 + %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16 + %endif + %endif + %endif + %if mmsize <= 16 && HAVE_ALIGNED_STACK + %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) + SUB rsp, stack_size_padded + %else + %assign %%reg_num (regs_used - 1) + %xdefine rstk r %+ %%reg_num + ; align stack, and save original stack location directly above + ; it, i.e. in [rsp+stack_size_padded], so we can restore the + ; stack in a single instruction (i.e. mov rsp, rstk or mov + ; rsp, [rsp+stack_size_padded]) + mov rstk, rsp + %if %1 < 0 ; need to store rsp on stack + sub rsp, gprsize+stack_size_padded + and rsp, ~(%%stack_alignment-1) + %xdefine rstkm [rsp+stack_size_padded] + mov rstkm, rstk + %else ; can keep rsp in rstk during whole function + sub rsp, stack_size_padded + and rsp, ~(%%stack_alignment-1) + %xdefine rstkm rstk + %endif + %endif + WIN64_PUSH_XMM + %endif + %endif +%endmacro + +%macro SETUP_STACK_POINTER 1 + %ifnum %1 + %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32) + %if %1 > 0 + %assign regs_used (regs_used + 1) + %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 + %warning "Stack pointer will overwrite register argument" + %endif + %endif + %endif +%endmacro + +%macro DEFINE_ARGS_INTERNAL 3+ + %ifnum %2 + DEFINE_ARGS %3 + %elif %1 == 4 + DEFINE_ARGS %2 + %elif %1 > 4 + DEFINE_ARGS %2, %3 + %endif +%endmacro + +%if WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R12, 96 +DECLARE_REG 12, R13, 104 +DECLARE_REG 13, R14, 112 +DECLARE_REG 14, R15, 120 + +%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4, %3 + %if mmsize != 8 && stack_size == 0 + WIN64_SPILL_XMM %3 + %endif + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%macro WIN64_PUSH_XMM 0 + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. + %if xmm_regs_used > 6 + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if xmm_regs_used > 7 + movaps [rstk + stack_offset + 24], xmm7 + %endif + %if xmm_regs_used > 8 + %assign %%i 8 + %rep xmm_regs_used-8 + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + %if xmm_regs_used > 8 + %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32 + SUB rsp, stack_size_padded + %endif + WIN64_PUSH_XMM +%endmacro + +%macro WIN64_RESTORE_XMM_INTERNAL 1 + %assign %%pad_size 0 + %if xmm_regs_used > 8 + %assign %%i xmm_regs_used + %rep xmm_regs_used-8 + %assign %%i %%i-1 + movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] + %endrep + %endif + %if stack_size_padded > 0 + %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0) + mov rsp, rstkm + %else + add %1, stack_size_padded + %assign %%pad_size stack_size_padded + %endif + %endif + %if xmm_regs_used > 7 + movaps xmm7, [%1 + stack_offset - %%pad_size + 24] + %endif + %if xmm_regs_used > 6 + movaps xmm6, [%1 + stack_offset - %%pad_size + 8] + %endif +%endmacro + +%macro WIN64_RESTORE_XMM 1 + WIN64_RESTORE_XMM_INTERNAL %1 + %assign stack_offset (stack_offset-stack_size_padded) + %assign xmm_regs_used 0 +%endmacro + +%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + WIN64_RESTORE_XMM_INTERNAL rsp + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 +%if mmsize == 32 + vzeroupper +%endif + AUTO_REP_RET +%endmacro + +%elif ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi +DECLARE_REG 1, rsi +DECLARE_REG 2, rdx +DECLARE_REG 3, rcx +DECLARE_REG 4, R8 +DECLARE_REG 5, R9 +DECLARE_REG 6, rax, 8 +DECLARE_REG 7, R10, 16 +DECLARE_REG 8, R11, 24 +DECLARE_REG 9, rbx, 32 +DECLARE_REG 10, rbp, 40 +DECLARE_REG 11, R12, 48 +DECLARE_REG 12, R13, 56 +DECLARE_REG 13, R14, 64 +DECLARE_REG 14, R15, 72 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 + +%macro RET 0 +%if stack_size_padded > 0 +%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 + mov rsp, rstkm +%else + add rsp, stack_size_padded +%endif +%endif + POP_IF_USED 14, 13, 12, 11, 10, 9 +%if mmsize == 32 + vzeroupper +%endif + AUTO_REP_RET +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, 4 +DECLARE_REG 1, ecx, 8 +DECLARE_REG 2, edx, 12 +DECLARE_REG 3, ebx, 16 +DECLARE_REG 4, esi, 20 +DECLARE_REG 5, edi, 24 +DECLARE_REG 6, ebp, 28 +%define rsp esp + +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [rstk + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep +%endmacro + +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + %if num_args > 7 + %assign num_args 7 + %endif + %if regs_used > 7 + %assign regs_used 7 + %endif + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 7 + PUSH_IF_USED 3, 4, 5, 6 + ALLOC_STACK %4 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 + +%macro RET 0 +%if stack_size_padded > 0 +%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 + mov rsp, rstkm +%else + add rsp, stack_size_padded +%endif +%endif + POP_IF_USED 6, 5, 4, 3 +%if mmsize == 32 + vzeroupper +%endif + AUTO_REP_RET +%endmacro + +%endif ;====================================================================== + +%if WIN64 == 0 +%macro WIN64_SPILL_XMM 1 +%endmacro +%macro WIN64_RESTORE_XMM 1 +%endmacro +%macro WIN64_PUSH_XMM 0 +%endmacro +%endif + +; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either +; a branch or a branch target. So switch to a 2-byte form of ret in that case. +; We can automatically detect "follows a branch", but not a branch target. +; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) +%macro REP_RET 0 + %if has_epilogue + RET + %else + rep ret + %endif +%endmacro + +%define last_branch_adr $$ +%macro AUTO_REP_RET 0 + %ifndef cpuflags + times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr. + %elif notcpuflag(ssse3) + times ((last_branch_adr-$)>>31)+1 rep + %endif + ret +%endmacro + +%macro BRANCH_INSTR 0-* + %rep %0 + %macro %1 1-2 %1 + %2 %1 + %%branch_instr: + %xdefine last_branch_adr %%branch_instr + %endmacro + %rotate 1 + %endrep +%endmacro + +BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp + +%macro TAIL_CALL 2 ; callee, is_nonadjacent + %if has_epilogue + call %1 + RET + %elif %2 + jmp %1 + %endif +%endmacro + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Begin a function. +; Applies any symbol mangling needed for C linkage, and sets up a define such that +; subsequent uses of the function name automatically refer to the mangled version. 
+; Appends cpuflags to the function name if cpuflags has been specified. +; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX +; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). +%macro cglobal 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 1, %1 %+ SUFFIX, %2 +%endmacro +%macro cvisible 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 0, %1 %+ SUFFIX, %2 +%endmacro +%macro cglobal_internal 2-3+ + %if %1 + %xdefine %%FUNCTION_PREFIX private_prefix + %xdefine %%VISIBILITY hidden + %else + %xdefine %%FUNCTION_PREFIX public_prefix + %xdefine %%VISIBILITY + %endif + %ifndef cglobaled_%2 + %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) + %xdefine %2.skip_prologue %2 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %2, 1 + %endif + %xdefine current_function %2 + %ifidn __OUTPUT_FORMAT__,elf + global %2:function %%VISIBILITY + %else + global %2 + %endif + align function_align + %2: + RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer + %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required + %assign stack_offset 0 ; stack pointer offset relative to the return address + %assign stack_size 0 ; amount of stack space that can be freely used inside a function + %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 + %ifnidn %3, "" + PROLOGUE %3 + %endif +%endmacro + +%macro cextern 1 + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +; like cextern, but without the prefix +%macro cextern_naked 1 + %xdefine %1 mangle(%1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +%macro const 1-2+ + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + %ifidn __OUTPUT_FORMAT__,elf + global %1:data hidden + %else + global %1 + %endif + %1: %2 +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is +; executable by default. +%ifidn __OUTPUT_FORMAT__,elf +SECTION .note.GNU-stack noalloc noexec nowrite progbits +%endif + +; cpuflags + +%assign cpuflags_mmx (1<<0) +%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx +%assign cpuflags_3dnow (1<<2) | cpuflags_mmx +%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow +%assign cpuflags_sse (1<<4) | cpuflags_mmx2 +%assign cpuflags_sse2 (1<<5) | cpuflags_sse +%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 +%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 +%assign cpuflags_avx (1<<11)| cpuflags_sse42 +%assign cpuflags_xop (1<<12)| cpuflags_avx +%assign cpuflags_fma4 (1<<13)| cpuflags_avx +%assign cpuflags_avx2 (1<<14)| cpuflags_avx +%assign cpuflags_fma3 (1<<15)| cpuflags_avx + +%assign cpuflags_cache32 (1<<16) +%assign cpuflags_cache64 (1<<17) +%assign cpuflags_slowctz (1<<18) +%assign cpuflags_lzcnt (1<<19) +%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<21) +%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 + +%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) +%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) + +; Takes up to 2 cpuflags from the above list. 
+; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. +; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. +%macro INIT_CPUFLAGS 0-2 + CPUNOP amdnop + %if %0 >= 1 + %xdefine cpuname %1 + %assign cpuflags cpuflags_%1 + %if %0 >= 2 + %xdefine cpuname %1_%2 + %assign cpuflags cpuflags | cpuflags_%2 + %endif + %xdefine SUFFIX _ %+ cpuname + %if cpuflag(avx) + %assign avx_enabled 1 + %endif + %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) + %define mova movaps + %define movu movups + %define movnta movntps + %endif + %if cpuflag(aligned) + %define movu mova + %elifidn %1, sse3 + %define movu lddqu + %endif + %if ARCH_X86_64 == 0 && notcpuflag(sse2) + CPUNOP basicnop + %endif + %else + %xdefine SUFFIX + %undef cpuname + %undef cpuflags + %endif +%endmacro + +; Merge mmx and sse* +; m# is a simd register of the currently selected size +; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# +; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# +; (All 3 remain in sync through SWAP.) + +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro INIT_MMX 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_MMX %1 + %define mmsize 8 + %define num_mmregs 8 + %define mova movq + %define movu movq + %define movh movd + %define movnta movntq + %assign %%i 0 + %rep 8 + CAT_XDEFINE m, %%i, mm %+ %%i + CAT_XDEFINE nmm, %%i, %%i + %assign %%i %%i+1 + %endrep + %rep 8 + CAT_UNDEF m, %%i + CAT_UNDEF nmm, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_XMM 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_XMM %1 + %define mmsize 16 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, xmm %+ %%i + CAT_XDEFINE nxmm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_YMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_YMM %1 + %define mmsize 32 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, ymm %+ %%i + CAT_XDEFINE nymm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +INIT_XMM + +%macro DECLARE_MMCAST 1 + %define mmmm%1 mm%1 + %define mmxmm%1 mm%1 + %define mmymm%1 mm%1 + %define xmmmm%1 mm%1 + %define xmmxmm%1 xmm%1 + %define xmmymm%1 xmm%1 + %define ymmmm%1 mm%1 + %define ymmxmm%1 xmm%1 + %define ymmymm%1 ymm%1 + %define xm%1 xmm %+ m%1 + %define ym%1 ymm %+ m%1 +%endmacro + +%assign i 0 +%rep 16 + DECLARE_MMCAST i +%assign i i+1 +%endrep + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. +; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. 
It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. + +%macro PERMUTE 2-* ; takes a list of pairs to swap +%rep %0/2 + %xdefine %%tmp%2 m%2 + %rotate 2 +%endrep +%rep %0/2 + %xdefine m%1 %%tmp%2 + CAT_XDEFINE n, m%1, %1 + %rotate 2 +%endrep +%endmacro + +%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) +%ifnum %1 ; SWAP 0, 1, ... + SWAP_INTERNAL_NUM %1, %2 +%else ; SWAP m0, m1, ... + SWAP_INTERNAL_NAME %1, %2 +%endif +%endmacro + +%macro SWAP_INTERNAL_NUM 2-* + %rep %0-1 + %xdefine %%tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 %%tmp + CAT_XDEFINE n, m%1, %1 + CAT_XDEFINE n, m%2, %2 + %rotate 1 + %endrep +%endmacro + +%macro SWAP_INTERNAL_NAME 2-* + %xdefine %%args n %+ %1 + %rep %0-1 + %xdefine %%args %%args, n %+ %2 + %rotate 1 + %endrep + SWAP_INTERNAL_NUM %%args +%endmacro + +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later +; calls to that function will automatically load the permutation, so values can +; be returned in mmregs. +%macro SAVE_MM_PERMUTATION 0-1 + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE %%f, %%i, m %+ %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 1 ; name to load from + %ifdef %1_m0 + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1_m %+ %%i + CAT_XDEFINE n, m %+ %%i, %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't +%macro call 1 + call_internal %1, %1 %+ SUFFIX +%endmacro +%macro call_internal 2 + %xdefine %%i %1 + %ifndef cglobaled_%1 + %ifdef cglobaled_%2 + %xdefine %%i %2 + %endif + %endif + call %%i + LOAD_MM_PERMUTATION %%i +%endmacro + +; Substitutions that reduce instruction size but are functionally equivalent +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + %else + sub %1, %2 + %endif +%endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%assign i 0 +%rep 16 + %if i < 8 + CAT_XDEFINE sizeofmm, i, 8 + %endif + CAT_XDEFINE sizeofxmm, i, 16 + CAT_XDEFINE sizeofymm, i, 32 +%assign i i+1 +%endrep +%undef i + +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-avx emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + +;%1 == instruction +;%2 == 1 if float, 0 if int +;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise +;%4 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not +;%5+: operands +%macro RUN_AVX_INSTR 5-8+ + %ifnum sizeof%6 + %assign %%sizeofreg sizeof%6 + %elifnum sizeof%5 + %assign %%sizeofreg sizeof%5 + %else + %assign %%sizeofreg mmsize + %endif + %assign %%emulate_avx 0 + %if avx_enabled && %%sizeofreg >= 16 + %xdefine %%instr v%1 + %else + %xdefine %%instr %1 + %if %0 >= 7+%3 + %assign %%emulate_avx 1 + %endif + %endif + + %if %%emulate_avx + %xdefine %%src1 %6 + %xdefine %%src2 %7 + %ifnidn %5, %6 + %if %0 >= 8 + CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8 + %else + CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7 + %endif + %if %4 && %3 == 0 + %ifnid %7 + ; 3-operand AVX instructions with a memory arg can only have it in src2, + ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). + ; So, if the instruction is commutative with a memory arg, swap them. + %xdefine %%src1 %7 + %xdefine %%src2 %6 + %endif + %endif + %if %%sizeofreg == 8 + MOVQ %5, %%src1 + %elif %2 + MOVAPS %5, %%src1 + %else + MOVDQA %5, %%src1 + %endif + %endif + %if %0 >= 8 + %1 %5, %%src2, %8 + %else + %1 %5, %%src2 + %endif + %elif %0 >= 8 + %%instr %5, %6, %7, %8 + %elif %0 == 7 + %%instr %5, %6, %7 + %elif %0 == 6 + %%instr %5, %6 + %else + %%instr %5 + %endif +%endmacro + +;%1 == instruction +;%2 == 1 if float, 0 if int +;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise +;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 1-4 0, 1, 0 + %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4 + %ifidn %2, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %1 + %elifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +; Instructions with both VEX and non-VEX encodings +; Non-destructive instructions are written without parameters +AVX_INSTR addpd, 1, 0, 1 +AVX_INSTR addps, 1, 0, 1 +AVX_INSTR addsd, 1, 0, 1 +AVX_INSTR addss, 1, 0, 1 +AVX_INSTR addsubpd, 1, 0, 0 +AVX_INSTR addsubps, 1, 0, 0 +AVX_INSTR aesdec, 0, 0, 0 +AVX_INSTR aesdeclast, 0, 0, 0 +AVX_INSTR aesenc, 0, 0, 0 +AVX_INSTR aesenclast, 0, 0, 0 +AVX_INSTR aesimc +AVX_INSTR aeskeygenassist +AVX_INSTR andnpd, 1, 0, 0 +AVX_INSTR andnps, 1, 0, 0 +AVX_INSTR andpd, 1, 0, 1 +AVX_INSTR andps, 1, 0, 1 +AVX_INSTR blendpd, 1, 0, 0 +AVX_INSTR blendps, 1, 0, 0 +AVX_INSTR blendvpd, 1, 0, 0 +AVX_INSTR blendvps, 1, 0, 0 +AVX_INSTR cmppd, 1, 1, 0 +AVX_INSTR cmpps, 1, 1, 0 +AVX_INSTR cmpsd, 1, 1, 0 +AVX_INSTR cmpss, 1, 1, 0 +AVX_INSTR comisd +AVX_INSTR comiss +AVX_INSTR cvtdq2pd +AVX_INSTR cvtdq2ps +AVX_INSTR cvtpd2dq +AVX_INSTR cvtpd2ps +AVX_INSTR cvtps2dq +AVX_INSTR cvtps2pd +AVX_INSTR cvtsd2si +AVX_INSTR cvtsd2ss +AVX_INSTR cvtsi2sd +AVX_INSTR cvtsi2ss +AVX_INSTR cvtss2sd +AVX_INSTR cvtss2si +AVX_INSTR cvttpd2dq +AVX_INSTR cvttps2dq +AVX_INSTR cvttsd2si +AVX_INSTR cvttss2si +AVX_INSTR divpd, 1, 0, 0 +AVX_INSTR divps, 1, 0, 0 +AVX_INSTR divsd, 1, 0, 0 +AVX_INSTR divss, 1, 0, 0 +AVX_INSTR dppd, 1, 1, 0 +AVX_INSTR dpps, 1, 1, 0 +AVX_INSTR extractps +AVX_INSTR haddpd, 1, 0, 0 +AVX_INSTR haddps, 1, 0, 0 +AVX_INSTR hsubpd, 1, 0, 0 +AVX_INSTR hsubps, 1, 0, 0 +AVX_INSTR insertps, 1, 1, 0 +AVX_INSTR lddqu +AVX_INSTR ldmxcsr +AVX_INSTR maskmovdqu +AVX_INSTR maxpd, 1, 0, 1 +AVX_INSTR maxps, 1, 0, 1 +AVX_INSTR maxsd, 1, 0, 1 +AVX_INSTR maxss, 1, 0, 1 +AVX_INSTR minpd, 1, 0, 1 +AVX_INSTR minps, 1, 0, 
1 +AVX_INSTR minsd, 1, 0, 1 +AVX_INSTR minss, 1, 0, 1 +AVX_INSTR movapd +AVX_INSTR movaps +AVX_INSTR movd +AVX_INSTR movddup +AVX_INSTR movdqa +AVX_INSTR movdqu +AVX_INSTR movhlps, 1, 0, 0 +AVX_INSTR movhpd, 1, 0, 0 +AVX_INSTR movhps, 1, 0, 0 +AVX_INSTR movlhps, 1, 0, 0 +AVX_INSTR movlpd, 1, 0, 0 +AVX_INSTR movlps, 1, 0, 0 +AVX_INSTR movmskpd +AVX_INSTR movmskps +AVX_INSTR movntdq +AVX_INSTR movntdqa +AVX_INSTR movntpd +AVX_INSTR movntps +AVX_INSTR movq +AVX_INSTR movsd, 1, 0, 0 +AVX_INSTR movshdup +AVX_INSTR movsldup +AVX_INSTR movss, 1, 0, 0 +AVX_INSTR movupd +AVX_INSTR movups +AVX_INSTR mpsadbw, 0, 1, 0 +AVX_INSTR mulpd, 1, 0, 1 +AVX_INSTR mulps, 1, 0, 1 +AVX_INSTR mulsd, 1, 0, 1 +AVX_INSTR mulss, 1, 0, 1 +AVX_INSTR orpd, 1, 0, 1 +AVX_INSTR orps, 1, 0, 1 +AVX_INSTR pabsb +AVX_INSTR pabsd +AVX_INSTR pabsw +AVX_INSTR packsswb, 0, 0, 0 +AVX_INSTR packssdw, 0, 0, 0 +AVX_INSTR packuswb, 0, 0, 0 +AVX_INSTR packusdw, 0, 0, 0 +AVX_INSTR paddb, 0, 0, 1 +AVX_INSTR paddw, 0, 0, 1 +AVX_INSTR paddd, 0, 0, 1 +AVX_INSTR paddq, 0, 0, 1 +AVX_INSTR paddsb, 0, 0, 1 +AVX_INSTR paddsw, 0, 0, 1 +AVX_INSTR paddusb, 0, 0, 1 +AVX_INSTR paddusw, 0, 0, 1 +AVX_INSTR palignr, 0, 1, 0 +AVX_INSTR pand, 0, 0, 1 +AVX_INSTR pandn, 0, 0, 0 +AVX_INSTR pavgb, 0, 0, 1 +AVX_INSTR pavgw, 0, 0, 1 +AVX_INSTR pblendvb, 0, 0, 0 +AVX_INSTR pblendw, 0, 1, 0 +AVX_INSTR pclmulqdq, 0, 1, 0 +AVX_INSTR pcmpestri +AVX_INSTR pcmpestrm +AVX_INSTR pcmpistri +AVX_INSTR pcmpistrm +AVX_INSTR pcmpeqb, 0, 0, 1 +AVX_INSTR pcmpeqw, 0, 0, 1 +AVX_INSTR pcmpeqd, 0, 0, 1 +AVX_INSTR pcmpeqq, 0, 0, 1 +AVX_INSTR pcmpgtb, 0, 0, 0 +AVX_INSTR pcmpgtw, 0, 0, 0 +AVX_INSTR pcmpgtd, 0, 0, 0 +AVX_INSTR pcmpgtq, 0, 0, 0 +AVX_INSTR pextrb +AVX_INSTR pextrd +AVX_INSTR pextrq +AVX_INSTR pextrw +AVX_INSTR phaddw, 0, 0, 0 +AVX_INSTR phaddd, 0, 0, 0 +AVX_INSTR phaddsw, 0, 0, 0 +AVX_INSTR phminposuw +AVX_INSTR phsubw, 0, 0, 0 +AVX_INSTR phsubd, 0, 0, 0 +AVX_INSTR phsubsw, 0, 0, 0 +AVX_INSTR pinsrb, 0, 1, 0 +AVX_INSTR pinsrd, 0, 1, 0 +AVX_INSTR pinsrq, 0, 1, 0 +AVX_INSTR pinsrw, 0, 1, 0 +AVX_INSTR pmaddwd, 0, 0, 1 +AVX_INSTR pmaddubsw, 0, 0, 0 +AVX_INSTR pmaxsb, 0, 0, 1 +AVX_INSTR pmaxsw, 0, 0, 1 +AVX_INSTR pmaxsd, 0, 0, 1 +AVX_INSTR pmaxub, 0, 0, 1 +AVX_INSTR pmaxuw, 0, 0, 1 +AVX_INSTR pmaxud, 0, 0, 1 +AVX_INSTR pminsb, 0, 0, 1 +AVX_INSTR pminsw, 0, 0, 1 +AVX_INSTR pminsd, 0, 0, 1 +AVX_INSTR pminub, 0, 0, 1 +AVX_INSTR pminuw, 0, 0, 1 +AVX_INSTR pminud, 0, 0, 1 +AVX_INSTR pmovmskb +AVX_INSTR pmovsxbw +AVX_INSTR pmovsxbd +AVX_INSTR pmovsxbq +AVX_INSTR pmovsxwd +AVX_INSTR pmovsxwq +AVX_INSTR pmovsxdq +AVX_INSTR pmovzxbw +AVX_INSTR pmovzxbd +AVX_INSTR pmovzxbq +AVX_INSTR pmovzxwd +AVX_INSTR pmovzxwq +AVX_INSTR pmovzxdq +AVX_INSTR pmuldq, 0, 0, 1 +AVX_INSTR pmulhrsw, 0, 0, 1 +AVX_INSTR pmulhuw, 0, 0, 1 +AVX_INSTR pmulhw, 0, 0, 1 +AVX_INSTR pmullw, 0, 0, 1 +AVX_INSTR pmulld, 0, 0, 1 +AVX_INSTR pmuludq, 0, 0, 1 +AVX_INSTR por, 0, 0, 1 +AVX_INSTR psadbw, 0, 0, 1 +AVX_INSTR pshufb, 0, 0, 0 +AVX_INSTR pshufd +AVX_INSTR pshufhw +AVX_INSTR pshuflw +AVX_INSTR psignb, 0, 0, 0 +AVX_INSTR psignw, 0, 0, 0 +AVX_INSTR psignd, 0, 0, 0 +AVX_INSTR psllw, 0, 0, 0 +AVX_INSTR pslld, 0, 0, 0 +AVX_INSTR psllq, 0, 0, 0 +AVX_INSTR pslldq, 0, 0, 0 +AVX_INSTR psraw, 0, 0, 0 +AVX_INSTR psrad, 0, 0, 0 +AVX_INSTR psrlw, 0, 0, 0 +AVX_INSTR psrld, 0, 0, 0 +AVX_INSTR psrlq, 0, 0, 0 +AVX_INSTR psrldq, 0, 0, 0 +AVX_INSTR psubb, 0, 0, 0 +AVX_INSTR psubw, 0, 0, 0 +AVX_INSTR psubd, 0, 0, 0 +AVX_INSTR psubq, 0, 0, 0 +AVX_INSTR psubsb, 0, 0, 0 +AVX_INSTR psubsw, 0, 0, 0 +AVX_INSTR psubusb, 0, 0, 0 +AVX_INSTR 
psubusw, 0, 0, 0 +AVX_INSTR ptest +AVX_INSTR punpckhbw, 0, 0, 0 +AVX_INSTR punpckhwd, 0, 0, 0 +AVX_INSTR punpckhdq, 0, 0, 0 +AVX_INSTR punpckhqdq, 0, 0, 0 +AVX_INSTR punpcklbw, 0, 0, 0 +AVX_INSTR punpcklwd, 0, 0, 0 +AVX_INSTR punpckldq, 0, 0, 0 +AVX_INSTR punpcklqdq, 0, 0, 0 +AVX_INSTR pxor, 0, 0, 1 +AVX_INSTR rcpps, 1, 0, 0 +AVX_INSTR rcpss, 1, 0, 0 +AVX_INSTR roundpd +AVX_INSTR roundps +AVX_INSTR roundsd +AVX_INSTR roundss +AVX_INSTR rsqrtps, 1, 0, 0 +AVX_INSTR rsqrtss, 1, 0, 0 +AVX_INSTR shufpd, 1, 1, 0 +AVX_INSTR shufps, 1, 1, 0 +AVX_INSTR sqrtpd, 1, 0, 0 +AVX_INSTR sqrtps, 1, 0, 0 +AVX_INSTR sqrtsd, 1, 0, 0 +AVX_INSTR sqrtss, 1, 0, 0 +AVX_INSTR stmxcsr +AVX_INSTR subpd, 1, 0, 0 +AVX_INSTR subps, 1, 0, 0 +AVX_INSTR subsd, 1, 0, 0 +AVX_INSTR subss, 1, 0, 0 +AVX_INSTR ucomisd +AVX_INSTR ucomiss +AVX_INSTR unpckhpd, 1, 0, 0 +AVX_INSTR unpckhps, 1, 0, 0 +AVX_INSTR unpcklpd, 1, 0, 0 +AVX_INSTR unpcklps, 1, 0, 0 +AVX_INSTR xorpd, 1, 0, 1 +AVX_INSTR xorps, 1, 0, 1 + +; 3DNow instructions, for sharing code between AVX, SSE and 3DN +AVX_INSTR pfadd, 1, 0, 1 +AVX_INSTR pfsub, 1, 0, 0 +AVX_INSTR pfmul, 1, 0, 1 + +; base-4 constants for shuffles +%assign i 0 +%rep 256 + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) + %if j < 10 + CAT_XDEFINE q000, j, i + %elif j < 100 + CAT_XDEFINE q00, j, i + %elif j < 1000 + CAT_XDEFINE q0, j, i + %else + CAT_XDEFINE q, j, i + %endif +%assign i i+1 +%endrep +%undef i +%undef j + +%macro FMA_INSTR 3 + %macro %1 4-7 %1, %2, %3 + %if cpuflag(xop) + v%5 %1, %2, %3, %4 + %else + %6 %1, %2, %3 + %7 %1, %4 + %endif + %endmacro +%endmacro + +FMA_INSTR pmacsdd, pmulld, paddd +FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmadcswd, pmaddwd, paddd + +; convert FMA4 to FMA3 if possible +%macro FMA4_INSTR 4 + %macro %1 4-8 %1, %2, %3, %4 + %if cpuflag(fma4) + v%5 %1, %2, %3, %4 + %elifidn %1, %2 + v%6 %1, %4, %3 ; %1 = %1 * %3 + %4 + %elifidn %1, %3 + v%7 %1, %2, %4 ; %1 = %2 * %1 + %4 + %elifidn %1, %4 + v%8 %1, %2, %3 ; %1 = %2 * %3 + %1 + %else + %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported + %endif + %endmacro +%endmacro + +FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd +FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps +FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd +FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss + +FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd +FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps +FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd +FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps + +FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd +FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps +FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd +FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss + +FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd +FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps +FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd +FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss + +FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd +FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps +FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd +FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss + +; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug +%if ARCH_X86_64 == 0 +%macro vpbroadcastq 2 +%if sizeof%1 == 16 + movddup %1, %2 +%else + vbroadcastsd %1, %2 +%endif +%endmacro +%endif + +%ifidn 
__OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif
View file
kvazaar-1.2.0.tar.gz/src/strategyselector.c -> kvazaar-1.3.0.tar.gz/src/strategyselector.c
Changed
@@ -26,9 +26,6 @@ #ifdef _WIN32 #include <windows.h> -#elif MACOS -#include <sys/param.h> -#include <sys/sysctl.h> #else #include <unistd.h> #endif @@ -89,6 +86,11 @@ return 0; } + if (!kvz_strategy_register_encode(&strategies, bitdepth)) { + fprintf(stderr, "kvz_strategy_register_encode failed!\n"); + return 0; + } + while(cur_strategy_to_select->fptr) { *(cur_strategy_to_select->fptr) = strategyselector_choose_for(&strategies, cur_strategy_to_select->strategy_type); @@ -372,40 +374,67 @@ #endif // COMPILE_INTEL #if COMPILE_POWERPC -#include <fcntl.h> -#include <unistd.h> -#include <linux/auxvec.h> +# if defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD__ >= 12) +#ifdef __linux__ #include <asm/cputable.h> +#else +#include <machine/cpu.h> +#endif +#include <sys/auxv.h> -//Source: http://freevec.org/function/altivec_runtime_detection_linux static int altivec_available(void) { - int result = 0; - unsigned long buf[64]; - ssize_t count; - int fd, i; - - fd = open("/proc/self/auxv", O_RDONLY); - if (fd < 0) { - return 0; - } - // loop on reading - do { - count = read(fd, buf, sizeof(buf)); - if (count < 0) - break; - for (i=0; i < (count / sizeof(unsigned long)); i += 2) { - if (buf[i] == AT_HWCAP) { - result = !!(buf[i+1] & PPC_FEATURE_HAS_ALTIVEC); - goto out_close; - } else if (buf[i] == AT_NULL) - goto out_close; - } - } while (count == sizeof(buf)); -out_close: - close(fd); - return result; + unsigned long hwcap = 0; +#ifdef __linux__ + hwcap = getauxval(AT_HWCAP); +#else + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); +#endif + return !!(hwcap & PPC_FEATURE_HAS_ALTIVEC); } +# elif defined(__FreeBSD__) +#include <sys/types.h> +#include <sys/sysctl.h> +#include <machine/cpu.h> + +static int altivec_available(void) +{ + u_long cpu_features = 0; + size_t len = sizeof(cpu_features); + + sysctlbyname("hw.cpu_features", &cpu_features, &len, NULL, 0); + return !!(cpu_features & PPC_FEATURE_HAS_ALTIVEC); +} +# elif defined(__APPLE__) || defined(__NetBSD__) || defined(__OpenBSD__) +#include <sys/param.h> +#include <sys/sysctl.h> +#ifndef __APPLE__ +#include <machine/cpu.h> +#endif + +static int altivec_available(void) +{ + int cpu_altivec = 0; + size_t len = sizeof(cpu_altivec); +#ifdef HW_VECTORUNIT + int mib[] = { CTL_HW, HW_VECTORUNIT }; +#else + int mib[] = { CTL_MACHDEP, CPU_ALTIVEC }; +#endif + + sysctl(mib, sizeof(mib)/sizeof(mib[0]), &cpu_altivec, &len, NULL, 0); + return cpu_altivec; +} +# else +static int altivec_available(void) +{ +#if COMPILE_POWERPC_ALTIVEC + return 1; +#else + return 0; +#endif +} +# endif #endif //COMPILE_POWERPC static void set_hardware_flags(int32_t cpuid) {
View file
kvazaar-1.2.0.tar.gz/src/strategyselector.h -> kvazaar-1.3.0.tar.gz/src/strategyselector.h
Changed
@@ -95,6 +95,7 @@
 #include "strategies/strategies-quant.h"
 #include "strategies/strategies-intra.h"
 #include "strategies/strategies-sao.h"
+#include "strategies/strategies-encode.h"
 
 static const strategy_to_select_t strategies_to_select[] = {
   STRATEGIES_NAL_EXPORTS
@@ -104,6 +105,7 @@
   STRATEGIES_QUANT_EXPORTS
   STRATEGIES_INTRA_EXPORTS
   STRATEGIES_SAO_EXPORTS
+  STRATEGIES_ENCODE_EXPORTS
   { NULL, NULL },
 };
View file
kvazaar-1.2.0.tar.gz/src/threadqueue.c -> kvazaar-1.3.0.tar.gz/src/threadqueue.c
Changed
@@ -18,6 +18,7 @@
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
+#include "global.h"
 #include "threadqueue.h"
 
 #include <errno.h> // ETIMEDOUT
@@ -26,7 +27,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "global.h"
 #include "threads.h"
@@ -500,9 +500,10 @@
  */
 threadqueue_job_t *kvz_threadqueue_copy_ref(threadqueue_job_t *job)
 {
-  // The caller should have had another reference.
-  assert(job->refcount > 0);
-  KVZ_ATOMIC_INC(&job->refcount);
+  int32_t new_refcount = KVZ_ATOMIC_INC(&job->refcount);
+  // The caller should have had another reference and we added one
+  // reference so refcount should be at least 2.
+  assert(new_refcount >= 2);
   return job;
 }
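The refcount fix above asserts on the value returned by the atomic increment instead of checking job->refcount before incrementing, which closes the window in which another thread could change the count between the check and the increment. A minimal sketch of the same pattern using C11 atomics rather than Kvazaar's KVZ_ATOMIC_INC macro (job_t and copy_ref are illustrative names, not Kvazaar's):

    #include <assert.h>
    #include <stdatomic.h>

    typedef struct job_t {
        atomic_int refcount;
        /* ... payload ... */
    } job_t;

    job_t *copy_ref(job_t *job)
    {
        /* atomic_fetch_add returns the value *before* the increment, so the
           new count is old + 1; the caller must already hold one reference. */
        int new_refcount = atomic_fetch_add(&job->refcount, 1) + 1;
        assert(new_refcount >= 2);
        return job;
    }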
View file
kvazaar-1.2.0.tar.gz/src/threadqueue.h -> kvazaar-1.3.0.tar.gz/src/threadqueue.h
Changed
@@ -26,10 +26,10 @@
  * Container for worker tasks.
  */
 
-#include <pthread.h>
-
 #include "global.h" // IWYU pragma: keep
 
+#include <pthread.h>
+
 typedef struct threadqueue_job_t threadqueue_job_t;
 typedef struct threadqueue_queue_t threadqueue_queue_t;
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper
Added
+(directory)
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/LICENSE
Added
@@ -0,0 +1,5 @@
+Copyright 2019 Tampere University
+
+Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
\ No newline at end of file
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/README.md
Added
@@ -0,0 +1,6 @@
+ThreadWrapper
+=======
+Wraps pthread functions so that they actually call C++ standard functions.
+
+Only functions used by Kvazaar, an open-source HEVC encoder, are implemented.
+People are free to contribute if they implement other functions.
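The wrapper leaves Kvazaar's C sources untouched: the encoder keeps calling the POSIX thread API declared in the headers below, and on builds that use the wrapper those calls land on std::thread, std::mutex and std::condition_variable. A minimal caller-side sketch of the kind of code this supports, assuming the wrapper's pthread.h is on the include path; the worker function and names here are illustrative, not taken from Kvazaar:

    #include <pthread.h>  /* resolves to threadwrapper/include/pthread.h on such a build */
    #include <stdio.h>

    static pthread_mutex_t lock;

    static void *worker(void *arg)
    {
        pthread_mutex_lock(&lock);
        printf("hello from %s\n", (const char *)arg);
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t thread;
        pthread_mutex_init(&lock, NULL);
        pthread_create(&thread, NULL, worker, "worker 0");
        pthread_join(thread, NULL);
        pthread_mutex_destroy(&lock);
        return 0;
    }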
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/include
Added
+(directory)
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/include/pthread.h
Added
@@ -0,0 +1,53 @@
+/*
+Copyright 2019 Tampere University
+
+Permission to use, copy, modify, and/or distribute this software for
+any purpose with or without fee is hereby granted, provided that the
+above copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void* pthread_cond_t;
+typedef void* pthread_cond_t;
+typedef void* pthread_mutex_t;
+typedef void* pthread_t;
+typedef void*(voidp_voidp_func)(void*);
+
+typedef void pthread_attr_t;
+typedef void pthread_condattr_t;
+typedef void pthread_mutexattr_t;
+
+// Parameter names that have been commented away do nothing,
+// as they are always null when the functions are used in Kvazaar.
+
+int pthread_cond_broadcast(pthread_cond_t* cond);
+int pthread_cond_destroy(pthread_cond_t* cond);
+int pthread_cond_init(pthread_cond_t* cond, const pthread_condattr_t* /*attr*/);
+int pthread_cond_signal(pthread_cond_t* cond);
+int pthread_cond_wait(pthread_cond_t* cond, pthread_mutex_t* mutex);
+
+int pthread_create(pthread_t* thread, const pthread_attr_t* /*attr*/, voidp_voidp_func executee, void* arg);
+void pthread_exit(void* /*value_ptr*/);
+int pthread_join(pthread_t thread, void** /*value_ptr*/);
+
+int pthread_mutex_destroy(pthread_mutex_t* mutex);
+int pthread_mutex_init(pthread_mutex_t* mutex, const pthread_mutexattr_t* /*attr*/);
+int pthread_mutex_lock(pthread_mutex_t* mutex);
+int pthread_mutex_unlock(pthread_mutex_t* mutex);
+
+#ifdef __cplusplus
+}
+#endif
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/include/semaphore.h
Added
@@ -0,0 +1,33 @@
+/*
+Copyright 2019 Tampere University
+
+Permission to use, copy, modify, and/or distribute this software for
+any purpose with or without fee is hereby granted, provided that the
+above copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void* sem_t;
+
+int sem_destroy(sem_t* sem);
+// pshared is always 0 in Kvazaar on w32.
+int sem_init(sem_t* sem, int /*pshared*/, unsigned int value);
+int sem_post(sem_t* sem);
+int sem_wait(sem_t* sem);
+
+#ifdef __cplusplus
+}
+#endif
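As with the pthread header, this keeps POSIX semaphore calls compiling where no native sem_t exists; the sem_t handle is an opaque void* that the C++ side points at a small counting-semaphore object. A hedged usage sketch of the declared interface; the initial count of 2 is arbitrary, chosen only for illustration:

    #include <semaphore.h>  /* threadwrapper/include/semaphore.h on such a build */

    void example(void)
    {
        sem_t available;

        sem_init(&available, 0, 2);  /* pshared is always 0 in Kvazaar */
        sem_wait(&available);        /* take one slot */
        /* ... do work ... */
        sem_post(&available);        /* release the slot */
        sem_destroy(&available);
    }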
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/src
Added
+(directory)
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/src/pthread.cpp
Added
@@ -0,0 +1,88 @@ +/* +Copyright 2019 Tampere University + +Permission to use, copy, modify, and/or distribute this software for +any purpose with or without fee is hereby granted, provided that the +above copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#include "pthread.h" +#include <condition_variable> +#include <mutex> +#include <thread> + + +int pthread_cond_broadcast(pthread_cond_t* cond) { + static_cast<std::condition_variable*>(*cond)->notify_all(); + return 0; +} + +int pthread_cond_destroy(pthread_cond_t* cond) { + delete static_cast<std::condition_variable*>(*cond); + *cond = nullptr; + return 0; +} + +int pthread_cond_init(pthread_cond_t* cond, const pthread_condattr_t*) { + *cond = new std::condition_variable(); + return 0; +} + +int pthread_cond_signal(pthread_cond_t* cond) { + static_cast<std::condition_variable*>(*cond)->notify_one(); + return 0; +} + +int pthread_cond_wait(pthread_cond_t* cond, pthread_mutex_t* mutex) { + std::mutex* real_mutex = static_cast<std::mutex*>(*mutex); + std::unique_lock<std::mutex> lock(*real_mutex, std::adopt_lock); + static_cast<std::condition_variable*>(*cond)->wait(lock); + lock.release(); + return 0; +} + +int pthread_create(pthread_t* thread, const pthread_attr_t*, voidp_voidp_func executee, void* arg) { + *thread = new std::thread(executee, arg); + return 0; +} + +void pthread_exit(void*) { + // It might be enough to do nothing here + // considering Kvazaar's current use of pthread_exit +} + +int pthread_join(pthread_t thread, void**) { + std::thread* real_thread = static_cast<std::thread*>(thread); + real_thread->join(); + delete real_thread; + return 0; +} + +int pthread_mutex_destroy(pthread_mutex_t* mutex) { + delete static_cast<std::mutex*>(*mutex); + *mutex = nullptr; + return 0; +} + +int pthread_mutex_init(pthread_mutex_t* mutex, const pthread_mutexattr_t*) { + *mutex = new std::mutex(); + return 0; +} + +int pthread_mutex_lock(pthread_mutex_t* mutex) { + static_cast<std::mutex*>(*mutex)->lock(); + return 0; +} + +int pthread_mutex_unlock(pthread_mutex_t* mutex) { + static_cast<std::mutex*>(*mutex)->unlock(); + return 0; +}
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/src/semaphore.cpp
Added
@@ -0,0 +1,72 @@ +/* +Copyright 2019 Tampere University + +Permission to use, copy, modify, and/or distribute this software for +any purpose with or without fee is hereby granted, provided that the +above copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#include "semaphore.h" +#include <condition_variable> +#include <mutex> + + +class Semaphore { +public: + + Semaphore(int value): + val_(value) { + } + + void post() { + std::unique_lock<std::mutex> lck(mtx_); + if (++val_ <= 0) { + cvar_.notify_one(); + } + } + + void wait() { + std::unique_lock<std::mutex> lck(mtx_); + if (--val_ < 0) { + cvar_.wait(lck); + } + } + + +private: + + int val_; + std::condition_variable cvar_; + std::mutex mtx_; + +}; // class Semaphore + + +int sem_destroy(sem_t* sem) { + delete static_cast<Semaphore*>(*sem); + *sem = nullptr; + return 0; +} + +int sem_init(sem_t* sem, int, unsigned int value) { + *sem = new Semaphore(value); + return 0; +} + +int sem_post(sem_t* sem) { + static_cast<Semaphore*>(*sem)->post(); + return 0; +} + +int sem_wait(sem_t* sem) { + static_cast<Semaphore*>(*sem)->wait(); + return 0; +}
View file
kvazaar-1.2.0.tar.gz/src/transform.c -> kvazaar-1.3.0.tar.gz/src/transform.c
Changed
@@ -186,15 +186,25 @@ * \param coeff transform coefficients * \param block_size width of transform */ -void kvz_transform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, int8_t block_size, int32_t mode) +void kvz_transform2d(const encoder_control_t * const encoder, + int16_t *block, + int16_t *coeff, + int8_t block_size, + color_t color, + cu_type_t type) { - dct_func *dct_func = kvz_get_dct_func(block_size, mode); + dct_func *dct_func = kvz_get_dct_func(block_size, color, type); dct_func(encoder->bitdepth, block, coeff); } -void kvz_itransform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, int8_t block_size, int32_t mode) +void kvz_itransform2d(const encoder_control_t * const encoder, + int16_t *block, + int16_t *coeff, + int8_t block_size, + color_t color, + cu_type_t type) { - dct_func *idct_func = kvz_get_idct_func(block_size, mode); + dct_func *idct_func = kvz_get_idct_func(block_size, color, type); idct_func(encoder->bitdepth, coeff, block); } @@ -359,19 +369,22 @@ } } else if (can_use_trskip) { + int8_t tr_skip = 0; + // Try quantization with trskip and use it if it's better. has_coeffs = kvz_quantize_residual_trskip(state, cur_pu, tr_width, color, scan_idx, - &cur_pu->intra.tr_skip, + &tr_skip, lcu_width, lcu_width, ref, pred, pred, coeff); + cur_pu->tr_skip = tr_skip; } else { has_coeffs = kvz_quantize_residual(state, cur_pu, @@ -450,10 +463,8 @@ LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf, }; - if (luma && depth < MAX_DEPTH) { + if (depth <= MAX_DEPTH) { cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_Y); - } - if (chroma && depth <= MAX_DEPTH) { cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_U); cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_V); }
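The transform entry points now take the color plane and CU type instead of the old int32_t mode, so DCT/DST selection is driven by those two values. A hedged sketch of a call site under the new signatures; COLOR_Y appears elsewhere in this diff, while CU_INTRA and the 4x4-intra-luma DST behaviour are assumed from kvazaar's sources rather than shown here:

    #include "transform.h"

    /* Hedged sketch: forward and inverse transform of a 4x4 luma residual
       from an intra CU, using the new (color, type) arguments. */
    static void transform_4x4_luma_intra(const encoder_control_t *encoder,
                                         int16_t residual[4 * 4],
                                         int16_t coeff[4 * 4])
    {
      /* kvz_get_dct_func() is expected to return the DST variant for this
         combination (4x4, luma, intra). */
      kvz_transform2d(encoder, residual, coeff, 4, COLOR_Y, CU_INTRA);
      kvz_itransform2d(encoder, residual, coeff, 4, COLOR_Y, CU_INTRA);
    }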
View file
kvazaar-1.2.0.tar.gz/src/transform.h -> kvazaar-1.3.0.tar.gz/src/transform.h
Changed
@@ -38,8 +38,18 @@
 void kvz_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size);
 void kvz_itransformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size);
 
-void kvz_transform2d(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size, int32_t mode);
-void kvz_itransform2d(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size, int32_t mode);
+void kvz_transform2d(const encoder_control_t * const encoder,
+                     int16_t *block,
+                     int16_t *coeff,
+                     int8_t block_size,
+                     color_t color,
+                     cu_type_t type);
+void kvz_itransform2d(const encoder_control_t * const encoder,
+                      int16_t *block,
+                      int16_t *coeff,
+                      int8_t block_size,
+                      color_t color,
+                      cu_type_t type);
 
 int32_t kvz_get_scaled_qp(int8_t type, int8_t qp, int8_t qp_offset);
View file
kvazaar-1.2.0.tar.gz/tests/Makefile.am -> kvazaar-1.3.0.tar.gz/tests/Makefile.am
Changed
@@ -13,6 +13,21 @@
 	test_tools.sh \
 	test_weird_shapes.sh
 
+EXTRA_DIST = \
+	test_external_symbols.sh \
+	test_gop.sh \
+	test_interlace.sh \
+	test_intra.sh \
+	test_invalid_input.sh \
+	test_mv_constraint.sh \
+	test_owf_wpp_tiles.sh \
+	test_rate_control.sh \
+	test_slices.sh \
+	test_smp.sh \
+	test_tools.sh \
+	test_weird_shapes.sh \
+	util.sh
+
 check_PROGRAMS = kvazaar_tests
 
 kvazaar_tests_SOURCES = \
@@ -35,6 +50,8 @@
 nodist_EXTRA_kvazaar_tests_SOURCES = cpp.cpp
 
 if USE_CRYPTOPP
+XFAIL_TESTS = \
+	test_external_symbols.sh
 kvazaar_tests_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(kvazaar_tests_CFLAGS) $(CXXFLAGS) \
 	$(kvazaar_tests_LDFLAGS) $(LDFLAGS) -o $@
View file
kvazaar-1.2.0.tar.gz/tests/dct_tests.c -> kvazaar-1.3.0.tar.gz/tests/dct_tests.c
Changed
@@ -186,7 +186,7 @@
   // Loop through all strategies picking out the intra sad ones and run
   // select strategies though all tests
-  for (unsigned i = 0; i < strategies.count; ++i) {
+  for (volatile unsigned i = 0; i < strategies.count; ++i) {
     const strategy_t * strategy = &strategies.strategies[i];
 
     // Select buffer width according to function name for dct function.
View file
kvazaar-1.3.0.tar.gz/tests/inter_recon_bipred_tests.c
Added
@@ -0,0 +1,184 @@ +/***************************************************************************** +* This file is part of Kvazaar HEVC encoder. +* +* Copyright (C) 2017 Tampere University of Technology and others (see +* COPYING file). +* +* Kvazaar is free software: you can redistribute it and/or modify +* it under the terms of the GNU Lesser General Public License version 2.1 as +* published by the Free Software Foundation. +* +* Kvazaar is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* Lesser General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with Kvazaar. If not, see <http://www.gnu.org/licenses/>. +****************************************************************************/ + +#include "greatest/greatest.h" + +#include "test_strategies.h" +#include "strategies/generic/picture-generic.h" +#include <string.h> +#include <stdlib.h> + + +static lcu_t expected_test_result; +static lcu_t result; + +static lcu_t lcu1; + +int temp1, temp2, temp3, temp4; + +int16_t mv_param[2][2] = { { 3,3 },{ 3,3 } }; +int width = 16; +int height = 16; +int xpos = 0; +int ypos = 0; + + +kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH]; +kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C]; +kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C]; + +int hi_prec_luma_rec0; +int hi_prec_luma_rec1; +int hi_prec_chroma_rec0; +int hi_prec_chroma_rec1; + +hi_prec_buf_t* high_precision_rec0 = 0; +hi_prec_buf_t* high_precision_rec1 = 0; + +int temp_x, temp_y; + + + +static void setup() +{ + + memset(lcu1.rec.y, 0, sizeof(kvz_pixel) * 64 * 64); + memset(lcu1.rec.u, 0, sizeof(kvz_pixel) * 32 * 32); + memset(lcu1.rec.v, 0, sizeof(kvz_pixel) * 32 * 32); + + + memset(expected_test_result.rec.y, 0, sizeof(kvz_pixel) * 64 * 64); + memset(expected_test_result.rec.u, 0, sizeof(kvz_pixel) * 32 * 32); + memset(expected_test_result.rec.v, 0, sizeof(kvz_pixel) * 32 * 32); + + memcpy(expected_test_result.rec.y, lcu1.rec.y, sizeof(kvz_pixel) * 64 * 64); + memcpy(expected_test_result.rec.u, lcu1.rec.u, sizeof(kvz_pixel) * 32 * 32); + memcpy(expected_test_result.rec.v, lcu1.rec.v, sizeof(kvz_pixel) * 32 * 32); + + // Setup is not optimized working function from picture-generic.c. + + + int shift = 15 - KVZ_BIT_DEPTH; + int offset = 1 << (shift - 1); + + hi_prec_luma_rec0 = mv_param[0][0] & 3 || mv_param[0][1] & 3; + hi_prec_luma_rec1 = mv_param[1][0] & 3 || mv_param[1][1] & 3; + + hi_prec_chroma_rec0 = mv_param[0][0] & 7 || mv_param[0][1] & 7; + hi_prec_chroma_rec1 = mv_param[1][0] & 7 || mv_param[1][1] & 7; + + if (hi_prec_chroma_rec0) high_precision_rec0 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH); + if (hi_prec_chroma_rec1) high_precision_rec1 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH); + + + + + for (temp_y = 0; temp_y < height; ++temp_y) { + int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); + for (temp_x = 0; temp_x < width; ++temp_x) { + int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); + int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_y = (hi_prec_luma_rec1 ? 
high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (expected_test_result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + expected_test_result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); + } + + } + for (temp_y = 0; temp_y < height >> 1; ++temp_y) { + int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); + for (temp_x = 0; temp_x < width >> 1; ++temp_x) { + int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); + int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (expected_test_result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + expected_test_result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); + + int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (expected_test_result.rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + expected_test_result.rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); + + + + } + } +} + + +TEST test_inter_recon_bipred() +{ + + + memcpy(result.rec.y, lcu1.rec.y, sizeof(kvz_pixel) * 64 * 64); + memcpy(result.rec.u, lcu1.rec.u, sizeof(kvz_pixel) * 32 * 32); + memcpy(result.rec.v, lcu1.rec.v, sizeof(kvz_pixel) * 32 * 32); + + + kvz_inter_recon_bipred_blend(hi_prec_luma_rec0, hi_prec_luma_rec1, hi_prec_chroma_rec0, hi_prec_chroma_rec1, width, height, xpos, ypos, high_precision_rec0, high_precision_rec1, &result, temp_lcu_y, temp_lcu_u, temp_lcu_v); + + for (temp_y = 0; temp_y < height; ++temp_y) { + int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); + for (temp_x = 0; temp_x < width; temp_x += 1) { + int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); + printf("%d ", result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]); + } + } + printf("\n"); + + /* + for (temp_y = 0; temp_y < height >> 1; ++temp_y) { + int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); + for (temp_x = 0; temp_x < width >> 1; ++temp_x) { + int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); + printf("%d ", result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]); + } + } + printf("\n"); + */ + + for (temp_y = 0; temp_y < height; ++temp_y) { + int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); + for (temp_x = 0; temp_x < width; temp_x+=1) { + int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); + ASSERT_EQ_FMT(expected_test_result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu], result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu], "%d"); + } + } + + for (temp_y = 0; temp_y < height >> 1; ++temp_y) { + int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); + for (temp_x = 0; temp_x < width >> 1; ++temp_x) { + int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); + ASSERT_EQ_FMT(expected_test_result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu], result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu], "%d"); + ASSERT_EQ_FMT(expected_test_result.rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu], result.rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu], "%d"); + } + } 
+ + PASS(); +} + +SUITE(inter_recon_bipred_tests) +{ + setup(); + + for (volatile int i = 0; i < strategies.count; ++i) { + if (strcmp(strategies.strategies[i].type, "inter_recon_bipred") != 0) { + continue; + } + + kvz_inter_recon_bipred_blend = strategies.strategies[i].fptr; + RUN_TEST(test_inter_recon_bipred); + } +}
View file
kvazaar-1.2.0.tar.gz/tests/sad_tests.c -> kvazaar-1.3.0.tar.gz/tests/sad_tests.c
Changed
@@ -31,7 +31,7 @@
 //////////////////////////////////////////////////////////////////////////
 // DEFINES
-#define TEST_SAD(X, Y) kvz_image_calc_sad(g_pic, g_ref, 0, 0, (X), (Y), 8, 8)
+#define TEST_SAD(X, Y) kvz_image_calc_sad(g_pic, g_ref, 0, 0, (X), (Y), 8, 8, NULL)
 
 //////////////////////////////////////////////////////////////////////////
 // GLOBALS
@@ -329,7 +329,7 @@
   setup_tests();
 
-  for (unsigned i = 0; i < strategies.count; ++i) {
+  for (volatile unsigned i = 0; i < strategies.count; ++i) {
     if (strcmp(strategies.strategies[i].type, "reg_sad") != 0) {
       continue;
     }
View file
kvazaar-1.2.0.tar.gz/tests/speed_tests.c -> kvazaar-1.3.0.tar.gz/tests/speed_tests.c
Changed
@@ -355,7 +355,7 @@
   // Loop through all strategies picking out the intra sad ones and run
   // selectec strategies though all tests
-  for (unsigned i = 0; i < strategies.count; ++i) {
+  for (volatile unsigned i = 0; i < strategies.count; ++i) {
     const strategy_t * strategy = &strategies.strategies[i];
 
     // Select buffer width according to function name.
View file
kvazaar-1.2.0.tar.gz/tests/test_gop.sh -> kvazaar-1.3.0.tar.gz/tests/test_gop.sh
Changed
@@ -9,4 +9,13 @@
 valgrind_test 264x130 10 $common_args --gop=8 -p0 --owf=1
 valgrind_test 264x130 10 $common_args --gop=8 -p0 --owf=4
 valgrind_test 264x130 20 $common_args --gop=8 -p16 --owf=0
+valgrind_test 264x130 10 $common_args --gop=8 -p1 --owf=4
 valgrind_test 264x130 10 $common_args --gop=lp-g4d3t1 -p5 --owf=4
+valgrind_test 264x130 10 $common_args --gop=8 -p8 --owf=4 --no-open-gop
+valgrind_test 264x130 30 $common_args --gop=8 -p16 --owf=16
+# Do more extensive tests in a private gitlab CI runner
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 20 $common_args --gop=8 -p8 --owf=0 --no-open-gop; fi
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 40 $common_args --gop=8 -p32 --owf=4 --no-open-gop; fi
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 70 $common_args --gop=8 -p64 --owf=4 --no-open-gop; fi
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 50 $common_args --gop=8 -p40 --owf=4 --no-open-gop; fi
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 10 $common_args --gop=8 -p8 --owf=0 --no-open-gop --bipred; fi
View file
kvazaar-1.2.0.tar.gz/tests/test_owf_wpp_tiles.sh -> kvazaar-1.3.0.tar.gz/tests/test_owf_wpp_tiles.sh
Changed
@@ -16,3 +16,4 @@
 valgrind_test 264x130 10 $common_args -r2 --owf=0 --threads=2 --tiles-height-split=u2 --no-wpp
 valgrind_test 512x512 3 $common_args -r2 --owf=1 --threads=2 --tiles=2x2 --no-wpp
 valgrind_test 512x512 3 $common_args -r2 --owf=0 --threads=2 --tiles=2x2 --no-wpp
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 512x512 30 $common_args -r2 --owf=0 --threads=2 --tiles=2x2 --no-wpp --bipred; fi
View file
kvazaar-1.2.0.tar.gz/tests/test_rate_control.sh -> kvazaar-1.3.0.tar.gz/tests/test_rate_control.sh
Changed
@@ -4,3 +4,4 @@
 . "${0%/*}/util.sh"
 
 valgrind_test 264x130 10 --bitrate=500000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 512x512 30 --bitrate=100000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=2 --pu-depth-inter=1-3 --pu-depth-intra=2-3 --bipred; fi
View file
kvazaar-1.2.0.tar.gz/tests/test_slices.sh -> kvazaar-1.3.0.tar.gz/tests/test_slices.sh
Changed
@@ -5,3 +5,4 @@
 valgrind_test 512x256 10 --threads=2 --owf=1 --preset=ultrafast --tiles=2x2 --slices=tiles
 
 valgrind_test 264x130 10 --threads=2 --owf=1 --preset=ultrafast --slices=wpp
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 20 --threads=2 --owf=1 --preset=fast --slices=wpp --no-open-gop; fi
View file
kvazaar-1.2.0.tar.gz/tests/test_smp.sh -> kvazaar-1.3.0.tar.gz/tests/test_smp.sh
Changed
@@ -8,3 +8,4 @@
 valgrind_test 264x130 4 --threads=2 --owf=1 --wpp --smp
 valgrind_test 264x130 4 --threads=2 --owf=1 --wpp --amp
 valgrind_test 264x130 4 --threads=2 --owf=1 --wpp --smp --amp
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 16 --gop=8 --threads=2 --owf=1 --wpp --smp --amp --bipred; fi
View file
kvazaar-1.2.0.tar.gz/tests/tests_main.c -> kvazaar-1.3.0.tar.gz/tests/tests_main.c
Changed
@@ -32,6 +32,7 @@
 extern SUITE(coeff_sum_tests);
 
 extern SUITE(mv_cand_tests);
+extern SUITE(inter_recon_bipred_tests);
 
 int main(int argc, char **argv)
 {
@@ -57,5 +58,8 @@
   RUN_SUITE(mv_cand_tests);
 
+  // Doesn't work in git
+  //RUN_SUITE(inter_recon_bipred_tests);
+
   GREATEST_MAIN_END();
 }
View file
kvazaar-1.2.0.tar.gz/tests/util.sh -> kvazaar-1.3.0.tar.gz/tests/util.sh
Changed
@@ -34,9 +34,18 @@
     prepare "${dimensions}" "${frames}"
 
+    # If $KVZ_TEST_VALGRIND is defined and equal to "1", run the test with
+    # valgrind. Otherwise, run without valgrind.
+    if [ "${KVZ_TEST_VALGRIND:-0}" = '1' ]; then
+        valgrind='valgrind --leak-check=full --error-exitcode=1 --'
+    else
+        valgrind=''
+    fi
+
+    # No quotes for $valgrind because it expands to multiple (or zero)
+    # arguments.
     print_and_run \
-        libtool execute \
-        valgrind --leak-check=full --error-exitcode=1 -- \
+        libtool execute $valgrind \
         ../src/kvazaar -i "${yuvfile}" "--input-res=${dimensions}" -o "${hevcfile}" "$@"
 
     print_and_run \