kvazaar
Changes of Revision 14
kvazaar.changes
Changed
@@ -1,4 +1,47 @@ ------------------------------------------------------------------- +Tue Jul 9 20:15:25 UTC 2019 - Luigi Baldoni <aloisio@gmx.com> + +- Update to version 1.3.0 + Features: + * Add release notes like this (#159, cf85d52) + * Changed --rd=2 to use SSD metric for CU mode decision + (662430d) + * Changed inter search to check the cost of flushing residual + to zero (75a8700) + * Changed rectangular and asymmetric blocks to use a transform + split (774c666) + * Added diamond search ME algorithm (4e13608) + * Enabled low delay B GOP structure with --bipred + --gop=lp-g4d3t1 (7155dd0) + * Added termination of intra search at zero residual with + --intra-rdo-et (4fb1c16) + Optimization: + * Made TZ search faster and slightly better (c136044) + * Optimized bi-prediction (69756e2) + Fixes: + * Fixed transform skip with rectangular inter blocks (fb462b2) + * Fixed accidental inter search for 4x4 blocks (649113a) + User Interface: + * Changed options for all preset levels (f033ad0) + * Added an option for limiting the number of steps in motion + estimation with --me-steps (39ed368) + * Added --me=dia (4e13608) + * Added --level, --force-level and --high-tier for setting + bitstream level and tier (bac0745) + Building: + * Fixed issue with struct timespec redefinition with Visual + Studio 2015 and later (713e694) + * Fixed building .asm files in Visual Studio 2017 (6be8195) + * Fixed compatibility with crypto++ 6.0 (4b24cd0) + * Added support for crypto++ with the name libcryptopp + (411276d) + * Dockerfile base image was updated to Ubuntu 18.04 (8380b6c) + * Enabled -Wextra by default (ff17e0b) + Refactoring: + * Inter motion vector cost functions (c73cce3) + * Dockerfile (0164291) + +------------------------------------------------------------------- Fri Nov 17 14:01:40 UTC 2017 - aloisio@gmx.com - Update to version 1.2.0
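As a quick, hedged illustration of the new 1.3.0 command-line options listed in the changelog above (--me=dia, --me-steps, --level with --high-tier, --intra-rdo-et, and the low-delay GOP together with --bipred), the sketch below shows how they might be invoked. The input file name and resolution are placeholders, not part of the changelog.

```
# Hypothetical invocation of the new 1.3.0 options; input_1080p.yuv and the
# resolution are made-up placeholders.
kvazaar -i input_1080p.yuv --input-res 1920x1080 \
        --me dia --me-steps 32 \
        --level 4.1 --high-tier \
        --intra-rdo-et \
        -o out_dia.hevc

# Low-delay B GOP structure enabled together with --bipred, as noted above.
kvazaar -i input_1080p.yuv --input-res 1920x1080 \
        --gop lp-g4d3t1 --bipred \
        -o out_lowdelay.hevc
```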
kvazaar.spec
Changed
@@ -1,8 +1,8 @@ # # spec file for package kvazaar # +# Copyright (c) 2019 Packman Team <packman@links2linux.de> # Copyright (c) 2017 SUSE LINUX GmbH, Nuernberg, Germany. -# Copyright (c) 2017 Packman Team <packman@links2linux.de> # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -13,19 +13,19 @@ # license that conforms to the Open Source Definition (Version 1.9) # published by the Open Source Initiative. -# Please submit bugfixes or comments via http://bugs.opensuse.org/ +# Please submit bugfixes or comments via https://bugs.links2linux.org/ # %define libname libkvazaar %define libmver 4 Name: kvazaar -Version: 1.2.0 +Version: 1.3.0 Release: 0 Summary: HEVC encoder -License: LGPL-2.1 +License: LGPL-2.1-or-later Group: Productivity/Multimedia/Video/Editors and Convertors -Url: http://ultravideo.cs.tut.fi/#encoder +URL: http://ultravideo.cs.tut.fi/#encoder Source0: https://github.com/ultravideo/%{name}/archive/v%{version}.tar.gz#/%{name}-%{version}.tar.gz Patch0: kvazaar.memset.patch BuildRequires: automake @@ -33,7 +33,6 @@ BuildRequires: gcc >= 4.4 BuildRequires: gcc-c++ BuildRequires: libtool -BuildRequires: make BuildRequires: pkgconfig Requires: %{libname}%{libmver} = %{version} %ifnarch %{arm} @@ -66,34 +65,32 @@ autoreconf -fvi %configure \ --disable-static \ - --disable-silent-rules + --disable-silent-rules \ + --docdir=%{_defaultdocdir}/%{name} make %{?_smp_mflags} %install %make_install find %{buildroot} -type f -name "*.la" -delete -print +rm %{buildroot}%{_defaultdocdir}/%{name}/COPYING %post -n %{libname}%{libmver} -p /sbin/ldconfig %postun -n %{libname}%{libmver} -p /sbin/ldconfig %files -%defattr(-,root,root) -%dir %{_datadir}/doc/%{name} -%doc %{_datadir}/doc/%{name}/COPYING -%doc %{_datadir}/doc/%{name}/CREDITS -%doc %{_datadir}/doc/%{name}/README.md -%{_bindir}/kvazaar -%{_mandir}/man1/kvazaar.1%{ext_man} +%license COPYING +%doc CREDITS README.md +%{_bindir}/%{name} +%{_mandir}/man1/%{name}.1%{ext_man} %files -n %{libname}%{libmver} -%defattr(-,root,root) -%doc COPYING CREDITS README.md +%license COPYING +%doc CREDITS README.md %{_libdir}/%{libname}.so.%{libmver}* %files -n %{libname}-devel -%defattr(-,root,root) -%{_includedir}/kvazaar.h +%{_includedir}/%{name}.h %{_libdir}/%{libname}.so -%{_libdir}/pkgconfig/kvazaar.pc +%{_libdir}/pkgconfig/%{name}.pc %changelog
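For anyone who wants to rebuild the updated package outside the build service, a minimal local sketch, assuming a standard ~/rpmbuild tree with the 1.3.0 tarball and kvazaar.memset.patch available, might look like this:

```
# Assumed layout: spec in ~/rpmbuild/SPECS, kvazaar-1.3.0.tar.gz and
# kvazaar.memset.patch in ~/rpmbuild/SOURCES.
cp kvazaar.spec ~/rpmbuild/SPECS/
cp kvazaar-1.3.0.tar.gz kvazaar.memset.patch ~/rpmbuild/SOURCES/
rpmbuild -ba ~/rpmbuild/SPECS/kvazaar.spec
```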
kvazaar-1.2.0.tar.gz/build/kvazaar_VS2013.sln
Deleted
@@ -1,55 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 2013 -VisualStudioVersion = 12.0.30723.0 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_lib", "kvazaar_lib\kvazaar_lib.vcxproj", "{EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}" -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{50AB7A17-4885-4D20-BF01-376DE4417FCD}" - ProjectSection(SolutionItems) = preProject - kvazaar_VS2010.vsd = kvazaar_VS2010.vsd - kvazaar_VS2010.vsmdi = kvazaar_VS2010.vsmdi - Local.testsettings = Local.testsettings - TraceAndTestImpact.testsettings = TraceAndTestImpact.testsettings - EndProjectSection -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_tests", "kvazaar_tests\kvazaar_tests.vcxproj", "{3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_cli", "kvazaar_cli\kvazaar_cli.vcxproj", "{C755308D-9B3E-4712-99AB-7F6F4E2DA567}" - ProjectSection(ProjectDependencies) = postProject - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF} = {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF} - EndProjectSection -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Win32 = Debug|Win32 - Debug|x64 = Debug|x64 - Release|Win32 = Release|Win32 - Release|x64 = Release|x64 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|Win32.ActiveCfg = Debug|Win32 - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|Win32.Build.0 = Debug|Win32 - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|x64.ActiveCfg = Debug|x64 - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|x64.Build.0 = Debug|x64 - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|Win32.ActiveCfg = Release|Win32 - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|Win32.Build.0 = Release|Win32 - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|x64.ActiveCfg = Release|x64 - {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|x64.Build.0 = Release|x64 - {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Debug|Win32.ActiveCfg = Debug|Win32 - {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Debug|x64.ActiveCfg = Debug|x64 - {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Release|Win32.ActiveCfg = Release|Win32 - {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Release|x64.ActiveCfg = Release|x64 - {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|Win32.ActiveCfg = Debug|Win32 - {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|Win32.Build.0 = Debug|Win32 - {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|x64.ActiveCfg = Debug|x64 - {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|x64.Build.0 = Debug|x64 - {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|Win32.ActiveCfg = Release|Win32 - {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|Win32.Build.0 = Release|Win32 - {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|x64.ActiveCfg = Release|x64 - {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|x64.Build.0 = Release|x64 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection -EndGlobal
kvazaar-1.2.0.tar.gz/.gitignore -> kvazaar-1.3.0.tar.gz/.gitignore
Changed
@@ -42,6 +42,7 @@ *.lo *.o *.trs +.*.swp *.log .kdev4
kvazaar-1.3.0.tar.gz/.gitlab-ci.yml
Added
@@ -0,0 +1,47 @@ +# Use Kvazaar CI base image which includes the build tools and ffmpeg + hmdec in ${HOME}/bin +image: ultravideo/kvazaar_ci_base:latest + +# Build and test kvazaar +test-kvazaar: &test-template + stage: test + script: + - export PATH="${HOME}/bin:${PATH}" + - ./autogen.sh + - ./configure --enable-werror || (cat config.log && false) + - make --jobs=8 + - make check --jobs=8 VERBOSE=1 + artifacts: + paths: + - src/kvazaar + - src/.libs + expire_in: 1 week + +test-asan: + <<: *test-template + variables: + CFLAGS: '-fsanitize=address' + # LeakSanitizer doesn't work inside the container because it requires + # ptrace so we disable it. + ASAN_OPTIONS: 'detect_leaks=0' + # AddressSanitizer adds some extra symbols so we expect a failure from + # the external symbols test. + XFAIL_TESTS: test_external_symbols.sh + +test-tsan: + <<: *test-template + variables: + CFLAGS: '-fsanitize=thread' + +test-ubsan: + <<: *test-template + variables: + CFLAGS: '-fsanitize=undefined -fno-sanitize-recover=all -fno-sanitize=alignment' + +test-valgrind: + <<: *test-template + variables: + KVAZAAR_OVERRIDE_angular_pred: generic + KVAZAAR_OVERRIDE_sao_band_ddistortion: generic + KVAZAAR_OVERRIDE_sao_edge_ddistortion: generic + KVAZAAR_OVERRIDE_calc_sao_edge_dir: generic + KVZ_TEST_VALGRIND: 1
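The new CI file above can also be reproduced locally without GitLab. A rough sketch of the test-asan job, assuming the build tools from the ultravideo/kvazaar_ci_base image (autotools, yasm, ffmpeg, hmdec) are already installed:

```
# Rough local equivalent of the test-asan job; all commands and variables
# are taken from the .gitlab-ci.yml added above.
export CFLAGS='-fsanitize=address'
export ASAN_OPTIONS='detect_leaks=0'        # LeakSanitizer needs ptrace, disabled
export XFAIL_TESTS=test_external_symbols.sh # ASan adds extra external symbols
./autogen.sh
./configure --enable-werror || (cat config.log && false)
make --jobs=8
make check --jobs=8 VERBOSE=1
```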
kvazaar-1.2.0.tar.gz/.travis.yml -> kvazaar-1.3.0.tar.gz/.travis.yml
Changed
@@ -19,7 +19,16 @@ include: - compiler: clang + env: KVZ_TEST_VALGRIND=1 + + - compiler: clang + env: CFLAGS='-fsanitize=thread' + + - compiler: clang + env: CFLAGS='-fsanitize=undefined -fno-sanitize-recover=all -fno-sanitize=alignment' + - compiler: gcc-4.8 + env: CFLAGS='-fsanitize=address' # We have some Mac specific code and Mac sometimes has odd build issues. - os: osx @@ -27,14 +36,15 @@ install: true script: - ./autogen.sh - - ./configure --enable-werror + - ./configure --enable-werror || (cat config.log && false) - make --jobs=2 V=1 + - make check TESTS=kvazaar_tests install: bash .travis-install.bash script: - ./autogen.sh - - ./configure --enable-werror + - ./configure --enable-werror || (cat config.log && false) - make --jobs=2 V=1 - make check VERBOSE=1
kvazaar-1.2.0.tar.gz/Dockerfile -> kvazaar-1.3.0.tar.gz/Dockerfile
Changed
@@ -9,34 +9,35 @@ # # RESOLUTION=`avconv -i input.avi 2>&1 | grep Stream | grep -oP ', \K[0-9]+x[0-9]+'` # avconv -i input.avi -an -f rawvideo -pix_fmt yuv420p - | docker run -i -a STDIN -a STDOUT kvazaar -i - --wpp --threads=8 --input-res=$RESOLUTION --preset=ultrafast -o - > output.265 -# or +# or # RESOLUTION=`ffmpeg -i input.avi 2>&1 | grep Stream | grep -oP ', \K[0-9]+x[0-9]+'` # ffmpeg -i input.avi -an -f rawvideo -pix_fmt yuv420p - | docker run -i -a STDIN -a STDOUT kvazaar -i - --wpp --threads=8 --input-res=$RESOLUTION --preset=ultrafast -o - > output.265 # -# Use Ubuntu 15.10 as a base for now, it's around 136MB -FROM ubuntu:15.10 +# Use Ubuntu 18.04 as a base for now, it's around 88MB +FROM ubuntu:18.04 MAINTAINER Marko Viitanen <fador@iki.fi> - # List of needed packages to be able to build kvazaar with autotools - ENV REQUIRED_PACKAGES automake autoconf libtool m4 build-essential git yasm pkgconf - - # Run all the commands in one RUN so we don't have any extra history - # data in the image. - RUN apt-get update \ +# List of needed packages to be able to build kvazaar with autotools +ENV REQUIRED_PACKAGES automake autoconf libtool m4 build-essential git yasm pkgconf + +ADD . kvazaar +# Run all the commands in one RUN so we don't have any extra history +# data in the image. +RUN apt-get update \ && apt-get install -y $REQUIRED_PACKAGES \ && apt-get clean \ - && git clone --depth=1 git://github.com/ultravideo/kvazaar.git; \ - cd kvazaar; \ - ./autogen.sh; \ - ./configure --disable-shared;\ - make;\ - make install; \ - AUTOINSTALLED_PACKAGES=`apt-mark showauto`; \ - apt-get remove --purge --force-yes -y $REQUIRED_PACKAGES $AUTOINSTALLED_PACKAGES; \ - apt-get clean autoclean; \ - apt-get autoremove -y; \ - rm -rf /var/lib/{apt,dpkg,cache,log}/ + && cd kvazaar \ + && ./autogen.sh \ + && ./configure --disable-shared \ + && make\ + && make install \ + && AUTOINSTALLED_PACKAGES=`apt-mark showauto` \ + && apt-get remove --purge --force-yes -y $REQUIRED_PACKAGES $AUTOINSTALLED_PACKAGES \ + && apt-get clean autoclean \ + && apt-get autoremove -y \ + && rm -rf /var/lib/{apt,dpkg,cache,log}/ + ENTRYPOINT ["kvazaar"] CMD ["--help"]
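Since the rewritten Dockerfile now copies the working tree into the image ("ADD . kvazaar") instead of cloning from GitHub, a local build-and-run sketch could look like the following. The run command is taken from the Dockerfile's own header comments; input.avi is a placeholder.

```
# Build the image from the current checkout.
docker build -t kvazaar .

# Encode with it, as shown in the Dockerfile header; input.avi is hypothetical.
RESOLUTION=`ffmpeg -i input.avi 2>&1 | grep Stream | grep -oP ', \K[0-9]+x[0-9]+'`
ffmpeg -i input.avi -an -f rawvideo -pix_fmt yuv420p - | \
  docker run -i -a STDIN -a STDOUT kvazaar -i - --wpp --threads=8 \
  --input-res=$RESOLUTION --preset=ultrafast -o - > output.265
```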
kvazaar-1.2.0.tar.gz/README.md -> kvazaar-1.3.0.tar.gz/README.md
Changed
@@ -11,6 +11,29 @@ - Linux/Mac [](https://travis-ci.org/ultravideo/kvazaar) - Windows [](https://ci.appveyor.com/project/Ultravideo/kvazaar) +## Table of Contents + +- [Using Kvazaar](#using-kvazaar) + - [Example:](#example) + - [Parameters](#parameters) + - [LP-GOP syntax](#lp-gop-syntax) +- [Presets](#presets) +- [Kvazaar library](#kvazaar-library) +- [Compiling Kvazaar](#compiling-kvazaar) + - [Required libraries](#required-libraries) + - [Autotools](#autotools) + - [OS X](#os-x) + - [Visual Studio](#visual-studio) + - [Docker](#docker) + - [Visualization (Windows only)](#visualization-windows-only) +- [Paper](#paper) +- [Contributing to Kvazaar](#contributing-to-kvazaar) + - [Code documentation](#code-documentation) + - [For version control we try to follow these conventions:](#for-version-control-we-try-to-follow-these-conventions) + - [Testing](#testing) + - [Unit tests](#unit-tests) + - [Code style](#code-style) + ## Using Kvazaar ### Example: @@ -31,14 +54,14 @@ kvazaar -i <input> --input-res <width>x<height> -o <output> Required: - -i, --input : Input file + -i, --input <filename> : Input file --input-res <res> : Input resolution [auto] - auto: detect from file name - <int>x<int>: width times height - -o, --output : Output file + - auto: Detect from file name. + - <int>x<int>: width times height + -o, --output <filename> : Output file Presets: - --preset=<preset> : Set options to a preset [medium] + --preset <preset> : Set options to a preset [medium] - ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow placebo @@ -46,144 +69,190 @@ Input: -n, --frames <integer> : Number of frames to code [all] --seek <integer> : First frame to code [0] - --input-fps <num>/<denom> : Framerate of the input video [25.0] - --source-scan-type <string> : Set source scan type [progressive]. - - progressive: progressive scan - - tff: top field first - - bff: bottom field first - --input-format : P420 or P400 - --input-bitdepth : 8-16 - --loop-input : Re-read input file forever + --input-fps <num>[/<denom>] : Frame rate of the input video [25] + --source-scan-type <string> : Source scan type [progressive] + - progressive: Progressive scan + - tff: Top field first + - bff: Bottom field first + --input-format <string> : P420 or P400 [P420] + --input-bitdepth <int> : 8-16 [8] + --loop-input : Re-read input file forever. Options: - --help : Print this help message and exit - --version : Print version information and exit - --aud : Use access unit delimiters - --debug <string> : Output encoders reconstruction. - --cpuid <integer> : Disable runtime cpu optimizations with value 0. - --hash : Decoded picture hash [checksum] + --help : Print this help message and exit. + --version : Print version information and exit. + --(no-)aud : Use access unit delimiters. [disabled] + --debug <filename> : Output internal reconstruction. + --(no-)cpuid : Enable runtime CPU optimizations. [enabled] + --hash <string> : Decoded picture hash [checksum] - none: 0 bytes - checksum: 18 bytes - md5: 56 bytes - --no-psnr : Don't calculate PSNR for frames - --no-info : Don't add encoder info SEI. + --(no-)psnr : Calculate PSNR for frames. [enabled] + --(no-)info : Add encoder info SEI. [enabled] + --crypto <string> : Selective encryption. Crypto support must be + enabled at compile-time. Can be 'on' or 'off' or + a list of features separated with a '+'. [off] + - on: Enable all encryption features. + - off: Disable selective encryption. + - mvs: Motion vector magnitudes. + - mv_signs: Motion vector signs. 
+ - trans_coeffs: Coefficient magnitudes. + - trans_coeff_signs: Coefficient signs. + - intra_pred_modes: Intra prediction modes. + --key <string> : Encryption key [16,213,27,56,255,127,242,112, + 97,126,197,204,25,59,38,30] Video structure: - -q, --qp <integer> : Quantization Parameter [32] - -p, --period <integer> : Period of intra pictures [0] - - 0: only first picture is intra - - 1: all pictures are intra - - 2-N: every Nth picture is intra - --vps-period <integer> : Specify how often the video parameter set is - re-sent. [0] - - 0: only send VPS with the first frame - - N: send VPS with every Nth intra frame - -r, --ref <integer> : Reference frames, range 1..15 [3] - --gop <string> : Definition of GOP structure [0] - - 0: disabled + -q, --qp <integer> : Quantization parameter [22] + -p, --period <integer> : Period of intra pictures [64] + - 0: Only first picture is intra. + - 1: All pictures are intra. + - N: Every Nth picture is intra. + --vps-period <integer> : How often the video parameter set is re-sent [0] + - 0: Only send VPS with the first frame. + - N: Send VPS with every Nth intra frame. + -r, --ref <integer> : Number of reference frames, in range 1..15 [4] + --gop <string> : GOP structure [8] + - 0: Disabled - 8: B-frame pyramid of length 8 - - lp-<string>: lp-gop definition - (e.g. lp-g8d4t2, see README) - --cqmfile <string> : Custom Quantization Matrices from a file - --bitrate <integer> : Target bitrate. [0] - - 0: disable rate-control - - N: target N bits per second - --lossless : Use lossless coding - --mv-constraint : Constrain movement vectors - - none: no constraint - - frametile: constrain within the tile - - frametilemargin: constrain even more - --roi <string> : Use a delta QP map for region of interest - Read an array of delta QP values from - a file, where the first two values are the - width and height, followed by width*height - delta QP values in raster order. - The delta QP map can be any size or aspect - ratio, and will be mapped to LCU's. - --(no-)erp-aqp : Use adaptive QP for 360 video with - equirectangular projection + - lp-<string>: Low-delay P-frame GOP + (e.g. lp-g8d4t2, see README) + --(no-)open-gop : Use open GOP configuration. [enabled] + --cqmfile <filename> : Read custom quantization matrices from a file. + --scaling-list <string>: Set scaling list mode. [off] + - off: Disable scaling lists. + - custom: use custom list (with --cqmfile). + - default: Use default lists. + --bitrate <integer> : Target bitrate [0] + - 0: Disable rate control. + - N: Target N bits per second. + --(no-)lossless : Use lossless coding. [disabled] + --mv-constraint <string> : Constrain movement vectors. [none] + - none: No constraint + - frametile: Constrain within the tile. + - frametilemargin: Constrain even more. + --roi <filename> : Use a delta QP map for region of interest. + Reads an array of delta QP values from a text + file. The file format is: width and height of + the QP delta map followed by width*height delta + QP values in raster order. The map can be of any + size and will be scaled to the video size. + --set-qp-in-cu : Set QP at CU level keeping pic_init_qp_minus26. + in PPS and slice_qp_delta in slize header zero. + --(no-)erp-aqp : Use adaptive QP for 360 degree video with + equirectangular projection. [disabled] + --level <number> : Use the given HEVC level in the output and give + an error if level limits are exceeded. 
[6.2] + - 1, 2, 2.1, 3, 3.1, 4, 4.1, 5, 5.1, 5.2, 6, + 6.1, 6.2 + --force-level <number> : Same as --level but warnings instead of errors. + --high-tier : Used with --level. Use high tier bitrate limits + instead of the main tier limits during encoding. + High tier requires level 4 or higher. Compression tools: - --deblock [<beta:tc>] : Deblocking - - beta: between -6 and 6 - - tc: between -6 and 6 - --(no-)sao : Sample Adaptive Offset - --(no-)rdoq : Rate-Distortion Optimized Quantization - --(no-)signhide : Sign Hiding - --(no-)smp : Symmetric Motion Partition - --(no-)amp : Asymmetric Motion Partition - --rd <integer> : Intra mode search complexity - - 0: skip intra if inter is good enough - - 1: rough intra mode search with SATD - - 2: refine intra mode search with SSE - --(no-)mv-rdo : Rate-Distortion Optimized motion vector costs - --(no-)full-intra-search - : Try all intra modes during rough search. - --(no-)transform-skip : Transform skip - --me <string> : Integer motion estimation + --(no-)deblock <beta:tc> : Deblocking filter. [0:0] + - beta: Between -6 and 6 + - tc: Between -6 and 6 + --sao <string> : Sample Adaptive Offset [full] + - off: SAO disabled + - band: Band offset only + - edge: Edge offset only + - full: Full SAO + --(no-)rdoq : Rate-distortion optimized quantization [enabled] + --(no-)rdoq-skip : Skip RDOQ for 4x4 blocks. [disabled] + --(no-)signhide : Sign hiding [disabled] + --(no-)smp : Symmetric motion partition [disabled] + --(no-)amp : Asymmetric motion partition [disabled] + --rd <integer> : Intra mode search complexity [0] + - 0: Skip intra if inter is good enough. + - 1: Rough intra mode search with SATD. + - 2: Refine intra mode search with SSE. + - 3: Try all intra modes and enable intra + chroma mode search. + --(no-)mv-rdo : Rate-distortion optimized motion vector costs + [disabled] + --(no-)full-intra-search : Try all intra modes during rough search. + [disabled] + --(no-)transform-skip : Try transform skip [disabled] + --me <string> : Integer motion estimation algorithm [hexbs] - hexbs: Hexagon Based Search - tz: Test Zone Search - full: Full Search - full8, full16, full32, full64 - --subme <integer> : Set fractional pixel motion estimation level - - 0: only integer motion estimation + - dia: Diamond Search + --me-steps <integer> : Motion estimation search step limit. Only + affects 'hexbs' and 'dia'. [-1] + --subme <integer> : Fractional pixel motion estimation level [4] + - 0: Integer motion estimation only - 1: + 1/2-pixel horizontal and vertical - 2: + 1/2-pixel diagonal - 3: + 1/4-pixel horizontal and vertical - 4: + 1/4-pixel diagonal - --pu-depth-inter <int>-<int> - : Range for sizes for inter predictions + --pu-depth-inter <int>-<int> : Inter prediction units sizes [0-3] - 0, 1, 2, 3: from 64x64 to 8x8 - --pu-depth-intra <int>-<int> : Range for sizes for intra predictions + --pu-depth-intra <int>-<int> : Intra prediction units sizes [1-4] - 0, 1, 2, 3, 4: from 64x64 to 4x4 - --(no-)bipred : Bi-prediction - --(no-)cu-split-termination - : CU split search termination condition - - off: Never terminate cu-split search - - zero: Terminate with zero residual - --(no-)me-early-termination : ME early termination condition - - off: Don't terminate early - - on: Terminate early - - sensitive: Terminate even earlier - --(no-)implicit-rdpcm : Implicit residual DPCM - Currently only supported with lossless coding. 
- --(no-)tmvp : Temporal Motion Vector Prediction - --(no-)rdoq-skip : Skips RDOQ for 4x4 blocks + --tr-depth-intra <int> : Transform split depth for intra blocks [0] + --(no-)bipred : Bi-prediction [disabled] + --cu-split-termination <string> : CU split search termination [zero] + - off: Don't terminate early. + - zero: Terminate when residual is zero. + --me-early-termination <string> : Motion estimation termination [on] + - off: Don't terminate early. + - on: Terminate early. + - sensitive: Terminate even earlier. + --fast-residual-cost <int> : Skip CABAC cost for residual coefficients + when QP is below the limit. [0] + --(no-)intra-rdo-et : Check intra modes in rdo stage only until + a zero coefficient CU is found. [disabled] + --(no-)early-skip : Try to find skip cu from merge candidates. + Perform no further search if skip is found. + For rd=0..1: Try the first candidate. + For rd=2.. : Try the best candidate based + on luma satd cost. [enabled] + --max-merge <integer> : Maximum number of merge candidates, 1..5 [5] + --(no-)implicit-rdpcm : Implicit residual DPCM. Currently only supported + with lossless coding. [disabled] + --(no-)tmvp : Temporal motion vector prediction [enabled] Parallel processing: --threads <integer> : Number of threads to use [auto] - - 0: process everything with main thread - - N: use N threads for encoding - - auto: select based on number of cores - --owf <integer> : Frame parallelism [auto] - - N: Process N-1 frames at a time - - auto: Select automatically - --(no-)wpp : Wavefront parallel processing [enabled] + - 0: Process everything with main thread. + - N: Use N threads for encoding. + - auto: Select automatically. + --owf <integer> : Frame-level parallelism [auto] + - N: Process N+1 frames at a time. + - auto: Select automatically. + --(no-)wpp : Wavefront parallel processing. [enabled] Enabling tiles automatically disables WPP. To enable WPP with tiles, re-enable it after - enabling tiles. + enabling tiles. Enabling wpp with tiles is, + however, an experimental feature since it is + not supported in any HEVC profile. --tiles <int>x<int> : Split picture into width x height uniform tiles. --tiles-width-split <string>|u<int> : - Specifies a comma separated list of pixel - positions of tiles columns separation coordinates. - Can also be u followed by and a single int n, - in which case it produces columns of uniform width. + - <string>: A comma-separated list of tile + column pixel coordinates. + - u<int>: Number of tile columns of uniform + width. --tiles-height-split <string>|u<int> : - Specifies a comma separated list of pixel - positions of tiles rows separation coordinates. - Can also be u followed by and a single int n, - in which case it produces rows of uniform height. - --slices <string> : Control how slices are used - - tiles: put tiles in independent slices - - wpp: put rows in dependent slices - - tiles+wpp: do both + - <string>: A comma-separated list of tile row + column pixel coordinates. + - u<int>: Number of tile rows of uniform + height. + --slices <string> : Control how slices are used. + - tiles: Put tiles in independent slices. + - wpp: Put rows in dependent slices. + - tiles+wpp: Do both. 
Video Usability Information: - --sar <width:height> : Specify Sample Aspect Ratio + --sar <width:height> : Specify sample aspect ratio --overscan <string> : Specify crop overscan setting [undef] - undef, show, crop --videoformat <string> : Specify video format [undef] - - component, pal, ntsc, secam, mac, undef + - undef, component, pal, ntsc, secam, mac --range <string> : Specify color range [tv] - tv, pc --colorprim <string> : Specify color primaries [undef] @@ -200,8 +269,8 @@ --chromaloc <integer> : Specify chroma sample location (0 to 5) [0] Deprecated parameters: (might be removed at some point) - -w, --width : Use --input-res - -h, --height : Use --input-res + -w, --width <integer> : Use --input-res. + -h, --height <integer> : Use --input-res. ``` [comment]: # (END KVAZAAR HELP MESSAGE) @@ -230,24 +299,30 @@ | | 0-uf | 1-sf | 2-vf | 3-fr | 4-f | 5-m | 6-s | 7-sr | 8-vs | 9-p | | -------------------- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | -| rd | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | -| pu-depth-intra | 2-3 | 2-3 | 2-3 | 2-3 | 2-3 | 1-3 | 1-3 | 1-3 | 1-4 | 1-4 | -| pu-depth-inter | 2-3 | 2-3 | 2-3 | 1-3 | 1-3 | 1-3 | 1-3 | 0-3 | 0-3 | 0-3 | +| rd | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | +| pu-depth-intra | 2-3 | 2-3 | 2-3 | 2-3 | 1-3 | 1-4 | 1-4 | 1-4 | 1-4 | 1-4 | +| pu-depth-inter | 2-3 | 2-3 | 1-3 | 1-3 | 1-3 | 0-3 | 0-3 | 0-3 | 0-3 | 0-3 | | me | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | hexbs | tz | -| ref | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 2 | 3 | 4 | +| gop | g4d4t1| g4d4t1| g4d4t1| g4d4t1| g4d4t1| 8 | 8 | 8 | 8 | 8 | +| ref | 1 | 1 | 1 | 1 | 2 | 4 | 4 | 4 | 4 | 4 | +| bipred | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | | deblock | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | -| signhide | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | -| subme | 0 | 0 | 2 | 2 | 4 | 4 | 4 | 4 | 4 | 4 | -| sao | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | +| signhide | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | +| subme | 2 | 2 | 2 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | +| sao | off | full | full | full | full | full | full | full | full | full | | rdoq | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | -| rdoq-skip | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | +| rdoq-skip | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | | transform-skip | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | mv-rdo | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | full-intra-search | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -| smp | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| smp | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | | amp | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | cu-split-termination | zero | zero | zero | zero | zero | zero | zero | zero | zero | off | -| me-early-termination | sens. | sens. | sens. | sens. | on | on | on | on | on | off | +| me-early-termination | sens. | sens. | sens. | sens. | sens. | on | on | off | off | off | +| intra-rdo-et | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | +| early-skip | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | +| fast-residual-cost | 28 | 28 | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | +| max-merge | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | 5 | ## Kvazaar library @@ -268,16 +343,6 @@ possible. -### Required libraries -- For Visual Studio, the pthreads-w32 library is required. Platforms - with native POSIX thread support don't need anything. - - The project file expects the library to be in ../pthreads.2/ - relative to Kvazaar. You can just extract the pre-built library - there. - - The executable needs pthreadVC2.dll to be present. 
Either install it - somewhere or ship it with the executable. - - ### Autotools Depending on the platform, some additional tools are required for compiling Kvazaar with autotools. For Ubuntu, the required packages are `automake autoconf libtool m4 build-essential yasm`. Yasm is @@ -300,7 +365,7 @@ ### Visual Studio -- At least VisualStudio 2013 is required. +- At least VisualStudio 2015 is required. - Project files can be found under build/. - Requires external [vsyasm.exe](http://yasm.tortall.net/Download.html) in %PATH% @@ -314,18 +379,37 @@ ### Visualization (Windows only) -Branch `visualizer` has a visual studio project, which can be compiled to enable visualization feature in Kvazaar. +Compiling `kvazaar_cli` project in the `visualizer` branch results in a Kvazaar executable with visualization enabled. Additional Requirements: [`SDL2`](https://www.libsdl.org/download-2.0.php), [`SDL2-ttf`](https://www.libsdl.org/projects/SDL_ttf/). -Directory `visualizer_extras` is expected to be found from the same directory level as the kvazaar project directory. Inside should be directories `include` and `lib` found from the development library zip packages. +Directory `visualizer_extras` has to be added into the same directory level as the kvazaar project directory. Inside should be directories `include` and `lib` found from the development library zip packages. -`SDL2.dll`, `SDL2_ttf.dll`, `libfreetype-6.dll`, `zlib1.dll`, and `pthreadVC2.dll` should be placed in the working directory (i.e. the folder the `kvazaar.exe` is in after compiling the `kvazaar_cli` project/solution) when running the visualizer. The required `.dll` can be found in the aforementioned `lib`-folder (`lib\x64`) and the dll folder inside the pthreads folder (see `Required libraries`). +`SDL2.dll`, `SDL2_ttf.dll`, `libfreetype-6.dll`, and `zlib1.dll` should be placed in the working directory (i.e. the folder the `kvazaar.exe` is in after compiling the `kvazaar_cli` project/solution) when running the visualizer. The required `.dll` can be found in the aforementioned `lib`-folder (`lib\x64`). Note: The solution should be compiled on the x64 platform in visual studio. Optional font file `arial.ttf` is to be placed in the working directory, if block info tool is used. +## Paper + +Please cite [this paper](https://dl.acm.org/citation.cfm?doid=2964284.2973796) for Kvazaar: + +```M. Viitanen, A. Koivula, A. Lemmetti, A. Ylä-Outinen, J. Vanne, and T. D. Hämäläinen, “Kvazaar: open-source HEVC/H.265 encoder,” in Proc. ACM Int. Conf. Multimedia, Amsterdam, The Netherlands, Oct. 2016.``` + +Or in BibTex: + +``` +@inproceedings{Kvazaar2016, + author = {Viitanen, Marko and Koivula, Ari and Lemmetti, Ari and Yl\"{a}-Outinen, Arttu and Vanne, Jarno and H\"{a}m\"{a}l\"{a}inen, Timo D.}, + title = {Kvazaar: Open-Source HEVC/H.265 Encoder}, + booktitle = {Proceedings of the 24th ACM International Conference on Multimedia}, + year = {2016}, + isbn = {978-1-4503-3603-1}, + location = {Amsterdam, The Netherlands}, + url = {http://doi.acm.org/10.1145/2964284.2973796}, +} +``` ## Contributing to Kvazaar We are happy to look at pull requests in Github. There is still lots of work to be done. @@ -353,7 +437,7 @@ - Uninitialized variables and such are checked with Valgrind. - Bitstream validity is checked with HM. - Compilation is checked on GCC and Clang on Linux, and Clang on OSX. -- Windows msys2 build is checked automatically on Appveyor. +- Windows msys2 and msvc builds are checked automatically on Appveyor. 
- If your changes change the bitstream, decode with HM to check that it doesn't throw checksum errors or asserts. - If your changes shouldn't alter the bitstream, check that they don't. @@ -377,7 +461,7 @@ ### Code style We try to follow the following conventions: -- C99 without features not supported by Visual Studio 2013 (VLAs). +- C99 without features not supported by Visual Studio 2015 (VLAs). - // comments allowed and encouraged. - Follow overall conventions already established in the code. - Indent by 2 spaces. (no tabs)
kvazaar-1.2.0.tar.gz/appveyor.yml -> kvazaar-1.3.0.tar.gz/appveyor.yml
Changed
@@ -1,28 +1,85 @@ +# Only the whitelisted branches get built, regardless of build config branches: only: - master - - appveyor +# Email the author if their commit either failed to build or fixed a failed build +# good -> bad, bad -> bad, bad -> good but not good -> good +notifications: + - provider: Email + to: + - '{{commitAuthorEmail}}' + on_build_success: false + on_build_failure: true + on_build_status_changed: true + +# Skip commits that don't affect the code / compiling the code +skip_commits: + files: + - .gitignore + - .gitlab-ci.yml + - .travis-install.bash + - .travis.yml + - COPYING + - CREDITS + - README.md + - docs.doxy + +# Download only a zip file of the latest commit +# Downloading the whole history of the repository would be unnecessary +shallow_clone: true + +# Only try building the app, don't run any tests +test: off + +# Don't bother with debug builds +configuration: + - Release + +# Build with multiple compilers / build suites +image: Visual Studio 2015 environment: matrix: - - MSYSTEM: MINGW64 + - platform: Win32 + - platform: x64 - MSYSTEM: MINGW32 + - MSYSTEM: MINGW64 -shallow_clone: true -test: off +for: +- + # MinGW builds need all kinds of build scripts + matrix: + only: + - MSYSTEM: MINGW32 + - MSYSTEM: MINGW64 + + install: + # Update core packages + - C:\msys64\usr\bin\pacman -Syyuu --noconfirm --noprogressbar + # Update non-core packages + - C:\msys64\usr\bin\pacman -Suu --noconfirm --noprogressbar + # Install required MSYS2 packages + - C:\msys64\usr\bin\pacman -S --noconfirm --noprogressbar --needed automake-wrapper make + # Now MSYS2 is up to date, do the rest of the install from a bash script + - C:\msys64\usr\bin\bash -lc "cd \"$APPVEYOR_BUILD_FOLDER\" && exec ./tools/appveyor-install.sh" + + build_script: + - C:\msys64\usr\bin\bash -lc "cd \"$APPVEYOR_BUILD_FOLDER\" && exec ./tools/appveyor-build.sh" + + cache: + - C:\msys64\var\cache\pacman\pkg +- + # MSVC builds only need vsyasm and the solution file + matrix: + except: + - MSYSTEM: MINGW32 + - MSYSTEM: MINGW64 + + install: + - ps: $url = "http://ultravideo.cs.tut.fi/vsyasm.exe" + - ps: $output = "C:\Tools\vsyasm.exe" + - ps: "(New-Object System.Net.WebClient).DownloadFile($url, $output)" + - ps: '$env:Path += ";$output\.."' -install: - # Update core packages - - C:\msys64\usr\bin\pacman -Syyuu --noconfirm --noprogressbar - # Update non-core packages - - C:\msys64\usr\bin\pacman -Suu --noconfirm --noprogressbar - # Install required MSYS2 packages - - C:\msys64\usr\bin\pacman -S --noconfirm --noprogressbar --needed automake-wrapper make - # Now MSYS2 is up to date, do the rest of the install from a bash script - - C:\msys64\usr\bin\bash -lc "cd \"$APPVEYOR_BUILD_FOLDER\" && exec ./tools/appveyor-install.sh" - -build_script: - - C:\msys64\usr\bin\bash -lc "cd \"$APPVEYOR_BUILD_FOLDER\" && exec ./tools/appveyor-build.sh" - -cache: - - C:\msys64\var\cache\pacman\pkg + build: + project: .\build\kvazaar_VS2015.sln
kvazaar-1.2.0.tar.gz/autogen.sh -> kvazaar-1.3.0.tar.gz/autogen.sh
Changed
@@ -1,5 +1,4 @@ #!/bin/sh -git submodule init -git submodule update +git submodule update --init --depth 1 autoreconf -if
kvazaar-1.2.0.tar.gz/build/C_Properties.props -> kvazaar-1.3.0.tar.gz/build/C_Properties.props
Changed
@@ -13,7 +13,7 @@ <AssemblerOutput>AssemblyAndSourceCode</AssemblerOutput> <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> <PreprocessorDefinitions>KVZ_DLL_EXPORTS;KVZ_COMPILE_ASM;WIN32_LEAN_AND_MEAN;WIN32;WIN64;_CONSOLE;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions> - <AdditionalIncludeDirectories>$(SolutionDir)..\..\pthreads.2\include;$(SolutionDir)..\src;$(SolutionDir)..\src\extras;$(SolutionDir)..\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <AdditionalIncludeDirectories>$(SolutionDir)..\src\threadwrapper\include;$(SolutionDir)..\src;$(SolutionDir)..\src\extras;$(SolutionDir)..\;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> <DisableSpecificWarnings>4244;4204;4206;4028;4152;4996;4018;4456;4389;4100;4131;4459;4706;4214;4127;4201</DisableSpecificWarnings> <OpenMPSupport>false</OpenMPSupport> <TreatSpecificWarningsAsErrors>4013;4029;4047;4716;4700;4020;4021;4133</TreatSpecificWarningsAsErrors>
kvazaar-1.3.0.tar.gz/build/kvazaar_VS2015.sln
Added
@@ -0,0 +1,55 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 14 +VisualStudioVersion = 12.0.30723.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_lib", "kvazaar_lib\kvazaar_lib.vcxproj", "{EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{50AB7A17-4885-4D20-BF01-376DE4417FCD}" + ProjectSection(SolutionItems) = preProject + kvazaar_VS2010.vsd = kvazaar_VS2010.vsd + kvazaar_VS2010.vsmdi = kvazaar_VS2010.vsmdi + Local.testsettings = Local.testsettings + TraceAndTestImpact.testsettings = TraceAndTestImpact.testsettings + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_tests", "kvazaar_tests\kvazaar_tests.vcxproj", "{3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kvazaar_cli", "kvazaar_cli\kvazaar_cli.vcxproj", "{C755308D-9B3E-4712-99AB-7F6F4E2DA567}" + ProjectSection(ProjectDependencies) = postProject + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF} = {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF} + EndProjectSection +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Debug|x64 = Debug|x64 + Release|Win32 = Release|Win32 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|Win32.ActiveCfg = Debug|Win32 + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|Win32.Build.0 = Debug|Win32 + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|x64.ActiveCfg = Debug|x64 + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Debug|x64.Build.0 = Debug|x64 + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|Win32.ActiveCfg = Release|Win32 + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|Win32.Build.0 = Release|Win32 + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|x64.ActiveCfg = Release|x64 + {EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}.Release|x64.Build.0 = Release|x64 + {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Debug|Win32.ActiveCfg = Debug|Win32 + {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Debug|x64.ActiveCfg = Debug|x64 + {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Release|Win32.ActiveCfg = Release|Win32 + {3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}.Release|x64.ActiveCfg = Release|x64 + {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|Win32.ActiveCfg = Debug|Win32 + {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|Win32.Build.0 = Debug|Win32 + {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|x64.ActiveCfg = Debug|x64 + {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Debug|x64.Build.0 = Debug|x64 + {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|Win32.ActiveCfg = Release|Win32 + {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|Win32.Build.0 = Release|Win32 + {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|x64.ActiveCfg = Release|x64 + {C755308D-9B3E-4712-99AB-7F6F4E2DA567}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal
kvazaar-1.2.0.tar.gz/build/kvazaar_cli/kvazaar_cli.vcxproj -> kvazaar-1.3.0.tar.gz/build/kvazaar_cli/kvazaar_cli.vcxproj
Changed
@@ -22,23 +22,24 @@ <ProjectGuid>{C755308D-9B3E-4712-99AB-7F6F4E2DA567}</ProjectGuid> <Keyword>Win32Proj</Keyword> <RootNamespace>kvazaar_cli</RootNamespace> + <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion> </PropertyGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <ImportGroup Label="ExtensionSettings">
kvazaar-1.2.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj -> kvazaar-1.3.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj
Changed
@@ -22,27 +22,28 @@ <ProjectGuid>{EEA3BDD1-8A08-41C1-BA57-E05D5C2CD8FF}</ProjectGuid> <Keyword>Win32Proj</Keyword> <RootNamespace>kvazaar_lib</RootNamespace> + <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion> </PropertyGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> <ConfigurationType>StaticLibrary</ConfigurationType> <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> <ConfigurationType>StaticLibrary</ConfigurationType> <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> <ConfigurationType>StaticLibrary</ConfigurationType> <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> <ConfigurationType>StaticLibrary</ConfigurationType> <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <ImportGroup Label="ExtensionSettings"> @@ -78,20 +79,26 @@ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <YASM /> <Lib> - <AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x64</AdditionalLibraryDirectories> - <AdditionalDependencies>pthreadVC2.lib</AdditionalDependencies> + <AdditionalLibraryDirectories> + </AdditionalLibraryDirectories> + <AdditionalDependencies> + </AdditionalDependencies> </Lib> <YASM> <Defines>ARCH_X86_64=1;%(Defines)</Defines> + <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths> </YASM> </ItemDefinitionGroup> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <YASM> <Defines>ARCH_X86_64=0;PREFIX</Defines> + <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths> </YASM> <Lib> - <AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x86</AdditionalLibraryDirectories> - <AdditionalDependencies>pthreadVC2.lib</AdditionalDependencies> + <AdditionalLibraryDirectories> + </AdditionalLibraryDirectories> + <AdditionalDependencies> + </AdditionalDependencies> </Lib> <ClCompile> <UndefinePreprocessorDefinitions> @@ -101,10 +108,13 @@ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <YASM> <Defines>ARCH_X86_64=0;PREFIX</Defines> + <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths> </YASM> <Lib> - <AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x86</AdditionalLibraryDirectories> - <AdditionalDependencies>pthreadVC2.lib</AdditionalDependencies> + <AdditionalLibraryDirectories> + </AdditionalLibraryDirectories> + <AdditionalDependencies> + </AdditionalDependencies> </Lib> <ClCompile> <UndefinePreprocessorDefinitions> @@ -114,10 +124,13 @@ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <YASM> 
<Defines>ARCH_X86_64=1;%(Defines)</Defines> + <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths> </YASM> <Lib> - <AdditionalLibraryDirectories>$(SolutionDir)..\..\pthreads.2\lib\x64</AdditionalLibraryDirectories> - <AdditionalDependencies>pthreadVC2.lib</AdditionalDependencies> + <AdditionalLibraryDirectories> + </AdditionalLibraryDirectories> + <AdditionalDependencies> + </AdditionalDependencies> </Lib> <ClCompile> <UndefinePreprocessorDefinitions> @@ -154,6 +167,12 @@ <ClCompile Include="..\..\src\search.c" /> <ClCompile Include="..\..\src\search_inter.c" /> <ClCompile Include="..\..\src\search_intra.c" /> + <ClCompile Include="..\..\src\strategies\avx2\encode_coding_tree-avx2.c"> + <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> + <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> + <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> + <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> + </ClCompile> <ClCompile Include="..\..\src\strategies\avx2\intra-avx2.c"> <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> @@ -172,9 +191,11 @@ <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet> </ClCompile> + <ClCompile Include="..\..\src\strategies\generic\encode_coding_tree-generic.c" /> <ClCompile Include="..\..\src\strategies\generic\intra-generic.c" /> <ClCompile Include="..\..\src\strategies\generic\quant-generic.c" /> <ClCompile Include="..\..\src\strategies\generic\sao-generic.c" /> + <ClCompile Include="..\..\src\strategies\strategies-encode.c" /> <ClCompile Include="..\..\src\strategies\strategies-intra.c" /> <ClCompile Include="..\..\src\strategies\strategies-quant.c" /> <ClInclude Include="..\..\src\checkpoint.h" /> @@ -214,6 +235,18 @@ <ClCompile Include="..\..\src\strategies\strategies-picture.c" /> <ClCompile Include="..\..\src\strategies\strategies-sao.c" /> <ClCompile Include="..\..\src\strategies\x86_asm\picture-x86-asm.c" /> + <ClCompile Include="..\..\src\threadwrapper\src\pthread.cpp"> + <CompileAs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">CompileAsCpp</CompileAs> + <CompileAs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">CompileAsCpp</CompileAs> + <CompileAs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CompileAsCpp</CompileAs> + <CompileAs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CompileAsCpp</CompileAs> + </ClCompile> + <ClCompile Include="..\..\src\threadwrapper\src\semaphore.cpp"> + <CompileAs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">CompileAsCpp</CompileAs> + <CompileAs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">CompileAsCpp</CompileAs> + <CompileAs 
Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CompileAsCpp</CompileAs> + <CompileAs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CompileAsCpp</CompileAs> + </ClCompile> <ClCompile Include="..\..\src\videoframe.c" /> <ClInclude Include="..\..\src\encoder_state-bitstream.h" /> <ClInclude Include="..\..\src\encoder_state-ctors_dtors.h" /> @@ -228,13 +261,19 @@ <ClInclude Include="..\..\src\kvz_math.h" /> <ClInclude Include="..\..\src\search_inter.h" /> <ClInclude Include="..\..\src\search_intra.h" /> + <ClInclude Include="..\..\src\strategies\avx2\avx2_common_functions.h" /> + <ClInclude Include="..\..\src\strategies\avx2\encode_coding_tree-avx2.h" /> <ClInclude Include="..\..\src\strategies\avx2\intra-avx2.h" /> + <ClInclude Include="..\..\src\strategies\avx2\reg_sad_pow2_widths-avx2.h" /> <ClInclude Include="..\..\src\strategies\avx2\sao-avx2.h" /> + <ClInclude Include="..\..\src\strategies\generic\encode_coding_tree-generic.h" /> <ClInclude Include="..\..\src\strategies\generic\intra-generic.h" /> <ClInclude Include="..\..\src\strategies\generic\sao-generic.h" /> + <ClInclude Include="..\..\src\strategies\sse41\reg_sad_pow2_widths-sse41.h" /> <ClInclude Include="..\..\src\strategies\strategies-common.h" /> <ClInclude Include="..\..\src\strategies\avx2\quant-avx2.h" /> <ClInclude Include="..\..\src\strategies\generic\quant-generic.h" /> + <ClInclude Include="..\..\src\strategies\strategies-encode.h" /> <ClInclude Include="..\..\src\strategies\strategies-intra.h" /> <ClInclude Include="..\..\src\strategies\strategies-quant.h" /> </ItemGroup> @@ -279,6 +318,8 @@ <ClInclude Include="..\..\src\tables.h" /> <ClInclude Include="..\..\src\threadqueue.h" /> <ClInclude Include="..\..\src\threads.h" /> + <ClInclude Include="..\..\src\threadwrapper\include\pthread.h" /> + <ClInclude Include="..\..\src\threadwrapper\include\semaphore.h" /> <ClInclude Include="..\..\src\transform.h" /> <ClInclude Include="..\..\src\videoframe.h" /> </ItemGroup> @@ -296,4 +337,4 @@ <ImportGroup Label="ExtensionTargets"> <Import Project="..\yasm\vsyasm.targets" /> </ImportGroup> -</Project> +</Project> \ No newline at end of file
kvazaar-1.2.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters -> kvazaar-1.3.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
Changed
@@ -49,6 +49,9 @@ <Filter Include="Threading"> <UniqueIdentifier>{63c21cb2-b379-4d38-bcb8-173786c2466d}</UniqueIdentifier> </Filter> + <Filter Include="Threadwrapper"> + <UniqueIdentifier>{f4abece9-e209-4817-a57e-c64ca7c5e05c}</UniqueIdentifier> + </Filter> </ItemGroup> <ItemGroup> <ClCompile Include="..\..\src\strategies\strategies-nal.c"> @@ -221,6 +224,21 @@ </ClCompile> <ClCompile Include="..\..\src\extras\libmd5.c" /> <ClCompile Include="..\..\src\extras\crypto.cpp" /> + <ClCompile Include="..\..\src\strategies\avx2\encode_coding_tree-avx2.c"> + <Filter>Optimization\strategies\avx2</Filter> + </ClCompile> + <ClCompile Include="..\..\src\strategies\generic\encode_coding_tree-generic.c"> + <Filter>Optimization\strategies\generic</Filter> + </ClCompile> + <ClCompile Include="..\..\src\strategies\strategies-encode.c"> + <Filter>Optimization\strategies</Filter> + </ClCompile> + <ClCompile Include="..\..\src\threadwrapper\src\pthread.cpp"> + <Filter>Threadwrapper</Filter> + </ClCompile> + <ClCompile Include="..\..\src\threadwrapper\src\semaphore.cpp"> + <Filter>Threadwrapper</Filter> + </ClCompile> </ItemGroup> <ItemGroup> <ClInclude Include="..\..\src\bitstream.h"> @@ -411,6 +429,30 @@ </ClInclude> <ClInclude Include="..\..\src\extras\libmd5.h" /> <ClInclude Include="..\..\src\extras\crypto.h" /> + <ClInclude Include="..\..\src\strategies\avx2\encode_coding_tree-avx2.h"> + <Filter>Optimization\strategies\avx2</Filter> + </ClInclude> + <ClInclude Include="..\..\src\strategies\generic\encode_coding_tree-generic.h"> + <Filter>Optimization\strategies\generic</Filter> + </ClInclude> + <ClInclude Include="..\..\src\strategies\avx2\avx2_common_functions.h"> + <Filter>Optimization\strategies\avx2</Filter> + </ClInclude> + <ClInclude Include="..\..\src\strategies\strategies-encode.h"> + <Filter>Optimization\strategies</Filter> + </ClInclude> + <ClInclude Include="..\..\src\strategies\avx2\reg_sad_pow2_widths-avx2.h"> + <Filter>Optimization\strategies\avx2</Filter> + </ClInclude> + <ClInclude Include="..\..\src\strategies\sse41\reg_sad_pow2_widths-sse41.h"> + <Filter>Optimization\strategies\sse41</Filter> + </ClInclude> + <ClInclude Include="..\..\src\threadwrapper\include\pthread.h"> + <Filter>Threadwrapper</Filter> + </ClInclude> + <ClInclude Include="..\..\src\threadwrapper\include\semaphore.h"> + <Filter>Threadwrapper</Filter> + </ClInclude> </ItemGroup> <ItemGroup> <YASM Include="..\..\src\extras\x86inc.asm"> @@ -423,4 +465,4 @@ <Filter>Optimization\strategies\x86_asm</Filter> </YASM> </ItemGroup> -</Project> +</Project> \ No newline at end of file
kvazaar-1.2.0.tar.gz/build/kvazaar_tests/kvazaar_tests.vcxproj -> kvazaar-1.3.0.tar.gz/build/kvazaar_tests/kvazaar_tests.vcxproj
Changed
@@ -22,23 +22,24 @@ <ProjectGuid>{3CD1C68B-542C-46D8-9B8A-6C91C5A3F312}</ProjectGuid> <Keyword>Win32Proj</Keyword> <RootNamespace>kvazaar_tests</RootNamespace> + <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion> </PropertyGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> <UseDebugLibraries>true</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> <UseDebugLibraries>false</UseDebugLibraries> - <PlatformToolset>v120</PlatformToolset> + <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <ImportGroup Label="ExtensionSettings"> @@ -115,4 +116,4 @@ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <ImportGroup Label="ExtensionTargets"> </ImportGroup> -</Project> +</Project> \ No newline at end of file
kvazaar-1.2.0.tar.gz/build/yasm/vsyasm.targets -> kvazaar-1.3.0.tar.gz/build/yasm/vsyasm.targets
Changed
@@ -20,7 +20,7 @@ AfterTargets="$(YASMAfterTargets)" Condition="'@(YASM)' != ''" DependsOnTargets="$(YASMDependsOn);ComputeYASMOutput" - Outputs="@(YASM->'%(ObjectFile)')" + Outputs="@(YASM->'$(ProjectDir)$(IntDir)%(Filename).obj')" Inputs="@(YASM);%(YASM.AdditionalDependencies);$(MSBuildProjectFile)"> <ItemGroup Condition="'@(SelectedFiles)' != ''"> @@ -32,7 +32,7 @@ <YASM_tlog Include="%(YASM.ObjectFile)" Condition="'%(YASM.ObjectFile)' != '' and '%(YASM.ExcludedFromBuild)' != 'true'"> - <Source>@(YASM, '|')</Source> + <Source>@(YASM->'%(FullPath)', '|')</Source> </YASM_tlog> </ItemGroup> <Message @@ -40,8 +40,9 @@ Text="%(YASM.ExecutionDescription)" /> <WriteLinesToFile Condition="'@(YASM_tlog)' != '' and '%(YASM_tlog.ExcludedFromBuild)' != 'true'" - File="$(IntDir)$(ProjectName).write.1.tlog" - Lines="^%(YASM_tlog.Source);@(YASM_tlog->'%(Fullpath)')" /> + File="$(TLogLocation)$(ProjectName).write.1.tlog" + Lines="^%(YASM_tlog.Source);@(YASM->'$(ProjectDir)$(IntDir)%(Filename).obj')" + Encoding="Unicode" /> <YASM Condition="'@(YASM)' != '' and '%(YASM.ExcludedFromBuild)' != 'true'" CommandLineTemplate="%(YASM.CommandLineTemplate)"
View file
kvazaar-1.2.0.tar.gz/configure.ac -> kvazaar-1.3.0.tar.gz/configure.ac
Changed
@@ -23,7 +23,7 @@ # # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html ver_major=4 -ver_minor=0 +ver_minor=2 ver_release=0 # Prevents configure from adding a lot of defines to the CFLAGS @@ -45,15 +45,20 @@ LT_INIT([win32-dll]) +AX_CHECK_COMPILE_FLAG([-maltivec],[flag_altivec="true"]) AX_CHECK_COMPILE_FLAG([-mavx2], [flag_avx2="true"]) AX_CHECK_COMPILE_FLAG([-msse4.1], [flag_sse4_1="true"]) AX_CHECK_COMPILE_FLAG([-msse2], [flag_sse2="true"]) +AX_CHECK_COMPILE_FLAG([-mbmi], [flag_bmi="true"]) +AX_CHECK_COMPILE_FLAG([-mabm], [flag_abm="true"]) +AX_CHECK_COMPILE_FLAG([-mbmi2], [flag_bmi2="true"]) -AM_CONDITIONAL([HAVE_AVX2], [test x"$flag_avx2" = x"true"]) +AM_CONDITIONAL([HAVE_ALTIVEC], [test x"$flag_altivec" = x"true"]) +AM_CONDITIONAL([HAVE_AVX2], [test x"$flag_avx2" = x"true" -a x"$flag_bmi" = x"true" -a x"$flag_abm" = x"true" -a x"$flag_bmi2" = x"true"]) AM_CONDITIONAL([HAVE_SSE4_1], [test x"$flag_sse4_1" = x"true"]) AM_CONDITIONAL([HAVE_SSE2], [test x"$flag_sse2" = x"true"]) -KVZ_CFLAGS="-Wall -Wtype-limits -Wvla -I$srcdir/src -I$srcdir/src/extras -ftree-vectorize -fvisibility=hidden" +KVZ_CFLAGS="-Wall -Wextra -Wvla -Wno-sign-compare -Wno-unused-parameter -I$srcdir/src -I$srcdir/src/extras -ftree-vectorize -fvisibility=hidden" CFLAGS="$KVZ_CFLAGS $CFLAGS" AC_SEARCH_LIBS([log], [m c], [], [exit 1]) @@ -68,7 +73,10 @@ [AC_DEFINE([KVZ_SEL_ENCRYPTION], [1], [With cryptopp])], [PKG_CHECK_MODULES([cryptopp], [libcrypto++], [AC_DEFINE([KVZ_SEL_ENCRYPTION], [1], [With cryptopp])], - [AC_MSG_ERROR([neither cryptopp nor libcrypto++ found with pkg-config])] + [PKG_CHECK_MODULES([cryptopp], [libcryptopp], + [AC_DEFINE([KVZ_SEL_ENCRYPTION], [1], [With cryptopp])], + [AC_MSG_ERROR([neither cryptopp, libcrypto++ nor libcryptopp found with pkg-config])] + )] )] )] )
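Note: with this change HAVE_AVX2 is only defined when the compiler also accepts -mbmi, -mabm and -mbmi2 (the matching libavx2_la_CFLAGS change is in src/Makefile.am below), presumably because the AVX2 strategy code relies on instructions those flags unlock. As a hedged, stand-alone illustration (not kvazaar code) of what the extra flags gate at the intrinsic level:

    /* Builds only with something like: gcc -mavx2 -mbmi -mbmi2 -mabm demo.c */
    #include <immintrin.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned x = 0xF0F0u;
        unsigned lo_bit = _tzcnt_u32(x);           /* BMI1 intrinsic: 4 */
        unsigned packed = _pext_u32(x, 0xFF00u);   /* BMI2 intrinsic: 0xF0 */
        __m256i v = _mm256_add_epi32(_mm256_set1_epi32((int)packed),
                                     _mm256_set1_epi32((int)lo_bit)); /* AVX2 */
        printf("%d\n", _mm256_extract_epi32(v, 0)); /* 0xF0 + 4 = 244 */
        return 0;
    }

Without the -mbmi/-mbmi2 flags the _tzcnt_u32/_pext_u32 calls are rejected at compile time, which is why configure now requires all of the flags before enabling the AVX2 objects.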
View file
kvazaar-1.2.0.tar.gz/doc/kvazaar.1 -> kvazaar-1.3.0.tar.gz/doc/kvazaar.1
Changed
@@ -1,24 +1,24 @@ -.TH KVAZAAR "1" "November 2017" "kvazaar v1.2.0" "User Commands" +.TH KVAZAAR "1" "July 2019" "kvazaar v1.3.0" "User Commands" .SH NAME kvazaar \- open source HEVC encoder .SH SYNOPSIS \fBkvazaar \fR\-i <input> \-\-input\-res <width>x<height> \-o <output> .SH DESCRIPTION .TP -\fB\-i\fR, \fB\-\-input +\fB\-i\fR, \fB\-\-input <filename> Input file .TP \fB\-\-input\-res <res> Input resolution [auto] -auto: detect from file name -<int>x<int>: width times height + \- auto: Detect from file name. + \- <int>x<int>: width times height .TP -\fB\-o\fR, \fB\-\-output +\fB\-o\fR, \fB\-\-output <filename> Output file .SS "Presets:" .TP -\fB\-\-preset=<preset> +\fB\-\-preset <preset> Set options to a preset [medium] \- ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow @@ -32,241 +32,315 @@ \fB\-\-seek <integer> First frame to code [0] .TP -\fB\-\-input\-fps <num>/<denom> -Framerate of the input video [25.0] +\fB\-\-input\-fps <num>[/<denom>] +Frame rate of the input video [25] .TP \fB\-\-source\-scan\-type <string> -Set source scan type [progressive]. - \- progressive: progressive scan - \- tff: top field first - \- bff: bottom field first +Source scan type [progressive] + \- progressive: Progressive scan + \- tff: Top field first + \- bff: Bottom field first .TP -\fB\-\-input\-format -P420 or P400 +\fB\-\-input\-format <string> +P420 or P400 [P420] .TP -\fB\-\-input\-bitdepth -8\-16 +\fB\-\-input\-bitdepth <int> +8\-16 [8] .TP \fB\-\-loop\-input -Re\-read input file forever +Re\-read input file forever. .SS "Options:" .TP \fB\-\-help -Print this help message and exit +Print this help message and exit. .TP \fB\-\-version -Print version information and exit +Print version information and exit. .TP -\fB\-\-aud -Use access unit delimiters +\fB\-\-(no\-)aud +Use access unit delimiters. [disabled] .TP -\fB\-\-debug <string> -Output encoders reconstruction. +\fB\-\-debug <filename> +Output internal reconstruction. .TP -\fB\-\-cpuid <integer> -Disable runtime cpu optimizations with value 0. +\fB\-\-(no\-)cpuid +Enable runtime CPU optimizations. [enabled] .TP -\fB\-\-hash +\fB\-\-hash <string> Decoded picture hash [checksum] \- none: 0 bytes \- checksum: 18 bytes \- md5: 56 bytes .TP -\fB\-\-no\-psnr -Don't calculate PSNR for frames -.TP -\fB\-\-no\-info -Don't add encoder info SEI. +\fB\-\-(no\-)psnr +Calculate PSNR for frames. [enabled] +.TP +\fB\-\-(no\-)info +Add encoder info SEI. [enabled] +.TP +\fB\-\-crypto <string> +Selective encryption. Crypto support must be +enabled at compile\-time. Can be 'on' or 'off' or +a list of features separated with a '+'. [off] + \- on: Enable all encryption features. + \- off: Disable selective encryption. + \- mvs: Motion vector magnitudes. + \- mv_signs: Motion vector signs. + \- trans_coeffs: Coefficient magnitudes. + \- trans_coeff_signs: Coefficient signs. + \- intra_pred_modes: Intra prediction modes. +.TP +\fB\-\-key <string> +Encryption key [16,213,27,56,255,127,242,112, + 97,126,197,204,25,59,38,30] .SS "Video structure:" .TP \fB\-q\fR, \fB\-\-qp <integer> -Quantization Parameter [32] +Quantization parameter [22] .TP \fB\-p\fR, \fB\-\-period <integer> -Period of intra pictures [0] -\- 0: only first picture is intra -\- 1: all pictures are intra -\- 2\-N: every Nth picture is intra +Period of intra pictures [64] + \- 0: Only first picture is intra. + \- 1: All pictures are intra. + \- N: Every Nth picture is intra. .TP \fB\-\-vps\-period <integer> -Specify how often the video parameter set is -re\-sent. 
[0] - \- 0: only send VPS with the first frame - \- N: send VPS with every Nth intra frame +How often the video parameter set is re\-sent [0] + \- 0: Only send VPS with the first frame. + \- N: Send VPS with every Nth intra frame. .TP \fB\-r\fR, \fB\-\-ref <integer> -Reference frames, range 1..15 [3] +Number of reference frames, in range 1..15 [4] .TP \fB\-\-gop <string> -Definition of GOP structure [0] - \- 0: disabled +GOP structure [8] + \- 0: Disabled \- 8: B\-frame pyramid of length 8 - \- lp\-<string>: lp\-gop definition - (e.g. lp\-g8d4t2, see README) + \- lp\-<string>: Low\-delay P\-frame GOP + (e.g. lp\-g8d4t2, see README) +.TP +\fB\-\-(no\-)open\-gop +Use open GOP configuration. [enabled] .TP -\fB\-\-cqmfile <string> -Custom Quantization Matrices from a file +\fB\-\-cqmfile <filename> +Read custom quantization matrices from a file. +.TP +\fB\-\-scaling-list <string> +Set scaling list mode. [off] + \- off: Disable scaling lists. + \- custom: use custom list (with \-\-cqmfile). + \- default: Use default lists. .TP \fB\-\-bitrate <integer> -Target bitrate. [0] - \- 0: disable rate\-control - \- N: target N bits per second -.TP -\fB\-\-lossless -Use lossless coding -.TP -\fB\-\-mv\-constraint -Constrain movement vectors - \- none: no constraint - \- frametile: constrain within the tile - \- frametilemargin: constrain even more -.TP -\fB\-\-roi <string> -Use a delta QP map for region of interest - Read an array of delta QP values from - a file, where the first two values are the - width and height, followed by width*height - delta QP values in raster order. - The delta QP map can be any size or aspect - ratio, and will be mapped to LCU's. +Target bitrate [0] + \- 0: Disable rate control. + \- N: Target N bits per second. +.TP +\fB\-\-(no\-)lossless +Use lossless coding. [disabled] +.TP +\fB\-\-mv\-constraint <string> +Constrain movement vectors. [none] + \- none: No constraint + \- frametile: Constrain within the tile. + \- frametilemargin: Constrain even more. +.TP +\fB\-\-roi <filename> +Use a delta QP map for region of interest. +Reads an array of delta QP values from a text +file. The file format is: width and height of +the QP delta map followed by width*height delta +QP values in raster order. The map can be of any +size and will be scaled to the video size. +.TP +\fB\-\-set\-qp\-in\-cu +Set QP at CU level keeping pic_init_qp_minus26. +in PPS and slice_qp_delta in slize header zero. .TP \fB\-\-(no\-)erp\-aqp -Use adaptive QP for 360 video with -equirectangular projection +Use adaptive QP for 360 degree video with +equirectangular projection. [disabled] +.TP +\fB\-\-level <number> +Use the given HEVC level in the output and give +an error if level limits are exceeded. [6.2] + \- 1, 2, 2.1, 3, 3.1, 4, 4.1, 5, 5.1, 5.2, 6, + 6.1, 6.2 +.TP +\fB\-\-force\-level <number> +Same as \-\-level but warnings instead of errors. +.TP +\fB\-\-high\-tier +Used with \-\-level. Use high tier bitrate limits +instead of the main tier limits during encoding. +High tier requires level 4 or higher. .SS "Compression tools:" .TP -\fB\-\-deblock [<beta:tc>] -Deblocking - \- beta: between \-6 and 6 - \- tc: between \-6 and 6 +\fB\-\-(no\-)deblock <beta:tc> +Deblocking filter. 
[0:0] + \- beta: Between \-6 and 6 + \- tc: Between \-6 and 6 .TP -\fB\-\-(no\-)sao -Sample Adaptive Offset +\fB\-\-sao <string> +Sample Adaptive Offset [full] + \- off: SAO disabled + \- band: Band offset only + \- edge: Edge offset only + \- full: Full SAO .TP \fB\-\-(no\-)rdoq -Rate\-Distortion Optimized Quantization +Rate\-distortion optimized quantization [enabled] +.TP +\fB\-\-(no\-)rdoq\-skip +Skip RDOQ for 4x4 blocks. [disabled] .TP \fB\-\-(no\-)signhide -Sign Hiding +Sign hiding [disabled] .TP \fB\-\-(no\-)smp -Symmetric Motion Partition +Symmetric motion partition [disabled] .TP \fB\-\-(no\-)amp -Asymmetric Motion Partition +Asymmetric motion partition [disabled] .TP \fB\-\-rd <integer> -Intra mode search complexity - \- 0: skip intra if inter is good enough - \- 1: rough intra mode search with SATD - \- 2: refine intra mode search with SSE +Intra mode search complexity [0] + \- 0: Skip intra if inter is good enough. + \- 1: Rough intra mode search with SATD. + \- 2: Refine intra mode search with SSE. + \- 3: Try all intra modes and enable intra + chroma mode search. .TP \fB\-\-(no\-)mv\-rdo -Rate\-Distortion Optimized motion vector costs +Rate\-distortion optimized motion vector costs +[disabled] .TP \fB\-\-(no\-)full\-intra\-search - Try all intra modes during rough search. +[disabled] .TP \fB\-\-(no\-)transform\-skip -Transform skip +Try transform skip [disabled] .TP \fB\-\-me <string> -Integer motion estimation +Integer motion estimation algorithm [hexbs] \- hexbs: Hexagon Based Search \- tz: Test Zone Search \- full: Full Search \- full8, full16, full32, full64 + \- dia: Diamond Search +.TP +\fB\-\-me\-steps <integer> +Motion estimation search step limit. Only +affects 'hexbs' and 'dia'. [\-1] .TP \fB\-\-subme <integer> -Set fractional pixel motion estimation level - \- 0: only integer motion estimation +Fractional pixel motion estimation level [4] + \- 0: Integer motion estimation only \- 1: + 1/2\-pixel horizontal and vertical \- 2: + 1/2\-pixel diagonal \- 3: + 1/4\-pixel horizontal and vertical \- 4: + 1/4\-pixel diagonal .TP \fB\-\-pu\-depth\-inter <int>\-<int> - -Range for sizes for inter predictions +Inter prediction units sizes [0\-3] \- 0, 1, 2, 3: from 64x64 to 8x8 .TP \fB\-\-pu\-depth\-intra <int>\-<int> -Range for sizes for intra predictions +Intra prediction units sizes [1\-4] \- 0, 1, 2, 3, 4: from 64x64 to 4x4 .TP +\fB\-\-tr\-depth\-intra <int> +Transform split depth for intra blocks [0] +.TP \fB\-\-(no\-)bipred -Bi\-prediction +Bi\-prediction [disabled] .TP -\fB\-\-(no\-)cu\-split\-termination - -CU split search termination condition - \- off: Never terminate cu\-split search - \- zero: Terminate with zero residual +\fB\-\-cu\-split\-termination <string> +CU split search termination [zero] + \- off: Don't terminate early. + \- zero: Terminate when residual is zero. .TP -\fB\-\-(no\-)me\-early\-termination -ME early termination condition - \- off: Don't terminate early - \- on: Terminate early - \- sensitive: Terminate even earlier +\fB\-\-me\-early\-termination <string> +Motion estimation termination [on] + \- off: Don't terminate early. + \- on: Terminate early. + \- sensitive: Terminate even earlier. +.TP +\fB\-\-fast\-residual\-cost <int> +Skip CABAC cost for residual coefficients + when QP is below the limit. [0] +.TP +\fB\-\-(no\-)intra\-rdo\-et +Check intra modes in rdo stage only until +a zero coefficient CU is found. [disabled] +.TP +\fB\-\-(no\-)early\-skip +Try to find skip cu from merge candidates. +Perform no further search if skip is found. 
+For rd=0..1: Try the first candidate. +For rd=2.. : Try the best candidate based + on luma satd cost. [enabled] +.TP +\fB\-\-max\-merge <integer> +Maximum number of merge candidates, 1..5 [5] .TP \fB\-\-(no\-)implicit\-rdpcm -Implicit residual DPCM -Currently only supported with lossless coding. +Implicit residual DPCM. Currently only supported +with lossless coding. [disabled] .TP \fB\-\-(no\-)tmvp -Temporal Motion Vector Prediction -.TP -\fB\-\-(no\-)rdoq\-skip -Skips RDOQ for 4x4 blocks +Temporal motion vector prediction [enabled] .SS "Parallel processing:" .TP \fB\-\-threads <integer> Number of threads to use [auto] - \- 0: process everything with main thread - \- N: use N threads for encoding - \- auto: select based on number of cores + \- 0: Process everything with main thread. + \- N: Use N threads for encoding. + \- auto: Select automatically. .TP \fB\-\-owf <integer> -Frame parallelism [auto] - \- N: Process N\-1 frames at a time - \- auto: Select automatically +Frame\-level parallelism [auto] + \- N: Process N+1 frames at a time. + \- auto: Select automatically. .TP \fB\-\-(no\-)wpp -Wavefront parallel processing [enabled] +Wavefront parallel processing. [enabled] Enabling tiles automatically disables WPP. To enable WPP with tiles, re\-enable it after -enabling tiles. +enabling tiles. Enabling wpp with tiles is, +however, an experimental feature since it is +not supported in any HEVC profile. .TP \fB\-\-tiles <int>x<int> Split picture into width x height uniform tiles. .TP \fB\-\-tiles\-width\-split <string>|u<int> -Specifies a comma separated list of pixel -positions of tiles columns separation coordinates. -Can also be u followed by and a single int n, -in which case it produces columns of uniform width. + \- <string>: A comma\-separated list of tile + column pixel coordinates. + \- u<int>: Number of tile columns of uniform + width. .TP \fB\-\-tiles\-height\-split <string>|u<int> -Specifies a comma separated list of pixel -positions of tiles rows separation coordinates. -Can also be u followed by and a single int n, -in which case it produces rows of uniform height. + \- <string>: A comma\-separated list of tile row + column pixel coordinates. + \- u<int>: Number of tile rows of uniform + height. .TP \fB\-\-slices <string> -Control how slices are used - \- tiles: put tiles in independent slices - \- wpp: put rows in dependent slices - \- tiles+wpp: do both +Control how slices are used. + \- tiles: Put tiles in independent slices. + \- wpp: Put rows in dependent slices. + \- tiles+wpp: Do both. .SS "Video Usability Information:" .TP \fB\-\-sar <width:height> -Specify Sample Aspect Ratio +Specify sample aspect ratio .TP \fB\-\-overscan <string> Specify crop overscan setting [undef] @@ -274,7 +348,7 @@ .TP \fB\-\-videoformat <string> Specify video format [undef] - \- component, pal, ntsc, secam, mac, undef + \- undef, component, pal, ntsc, secam, mac .TP \fB\-\-range <string> Specify color range [tv]
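Note: several of the options documented above (--me=dia, --me-steps, --level/--force-level/--high-tier, --intra-rdo-et, --max-merge) are new in 1.3.0. A minimal sketch of driving them through the public libkvazaar API (assuming the kvz_api/kvz_api_get interface declared in kvazaar.h, which wraps the kvz_config_parse() shown in src/cfg.c below); option names and values are passed exactly as on the command line, without the leading dashes:

    #include <kvazaar.h>
    #include <stdio.h>

    int main(void)
    {
        const kvz_api *api = kvz_api_get(8);      /* 8-bit encoder */
        kvz_config *cfg = api->config_alloc();
        if (!cfg || !api->config_init(cfg)) return 1;

        int ok = 1;
        ok &= api->config_parse(cfg, "me", "dia");       /* diamond search */
        ok &= api->config_parse(cfg, "me-steps", "32");  /* cap hexbs/dia steps */
        ok &= api->config_parse(cfg, "level", "5.1");    /* enforce level 5.1 limits */
        ok &= api->config_parse(cfg, "high-tier", "1");  /* high-tier bitrate limits */
        ok &= api->config_parse(cfg, "intra-rdo-et", "1");

        printf(ok ? "config accepted\n" : "config rejected\n");
        api->config_destroy(cfg);
        return ok ? 0 : 1;
    }

Each config_parse() call returns 1 on success and 0 when the value is rejected, matching the validation added in cfg.c.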
View file
kvazaar-1.2.0.tar.gz/src/Makefile.am -> kvazaar-1.3.0.tar.gz/src/Makefile.am
Changed
@@ -124,6 +124,8 @@ strategies/generic/quant-generic.h \ strategies/generic/sao-generic.c \ strategies/generic/sao-generic.h \ + strategies/generic/encode_coding_tree-generic.c \ + strategies/generic/encode_coding_tree-generic.h \ strategies/strategies-common.h \ strategies/strategies-dct.c \ strategies/strategies-dct.h \ @@ -139,6 +141,8 @@ strategies/strategies-quant.h \ strategies/strategies-sao.c \ strategies/strategies-sao.h \ + strategies/strategies-encode.c \ + strategies/strategies-encode.h \ strategies/x86_asm/picture-x86-asm.c \ strategies/x86_asm/picture-x86-asm.h \ strategyselector.c \ @@ -186,7 +190,9 @@ strategies/avx2/quant-avx2.c \ strategies/avx2/quant-avx2.h \ strategies/avx2/sao-avx2.c \ - strategies/avx2/sao-avx2.h + strategies/avx2/sao-avx2.h \ + strategies/avx2/encode_coding_tree-avx2.c \ + strategies/avx2/encode_coding_tree-avx2.h libsse2_la_SOURCES = \ strategies/sse2/picture-sse2.c \ @@ -197,13 +203,17 @@ strategies/sse41/picture-sse41.h if HAVE_PPC + +if HAVE_ALTIVEC libaltivec_la_CFLAGS = -maltivec endif +endif #HAVE_PPC + if HAVE_X86 if HAVE_AVX2 -libavx2_la_CFLAGS = -mavx2 +libavx2_la_CFLAGS = -mavx2 -mbmi -mabm -mbmi2 endif if HAVE_SSE4_1 libsse41_la_CFLAGS = -msse4.1
View file
kvazaar-1.2.0.tar.gz/src/cfg.c -> kvazaar-1.3.0.tar.gz/src/cfg.c
Changed
@@ -24,6 +24,7 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <math.h> kvz_config *kvz_config_alloc(void) @@ -36,7 +37,7 @@ cfg->width = 0; cfg->height = 0; cfg->framerate = 25; // deprecated and will be removed. - cfg->framerate_num = 0; + cfg->framerate_num = 25; cfg->framerate_denom = 1; cfg->qp = 22; cfg->intra_period = 64; @@ -78,6 +79,7 @@ cfg->lossless = false; cfg->tmvp_enable = true; cfg->implicit_rdpcm = false; + cfg->fast_residual_cost_limit = 0; cfg->cu_split_termination = KVZ_CU_SPLIT_TERMINATION_ZERO; @@ -85,13 +87,13 @@ cfg->tiles_height_count = 1; cfg->tiles_width_split = NULL; cfg->tiles_height_split = NULL; - + cfg->wpp = 1; cfg->owf = -1; cfg->slice_count = 1; cfg->slice_addresses_in_ts = MALLOC(int32_t, 1); cfg->slice_addresses_in_ts[0] = 0; - + cfg->threads = -1; cfg->cpuid = 1; @@ -108,16 +110,19 @@ cfg->crypto_features = KVZ_CRYPTO_OFF; cfg->me_early_termination = 1; + cfg->intra_rdo_et = 0; cfg->input_format = KVZ_FORMAT_P420; cfg->input_bitdepth = 8; cfg->gop_lp_definition.d = 3; cfg->gop_lp_definition.t = 1; + cfg->open_gop = true; cfg->roi.width = 0; cfg->roi.height = 0; cfg->roi.dqps = NULL; + cfg->set_qp_in_cu = false; cfg->erp_aqp = false; @@ -125,6 +130,17 @@ cfg->optional_key = NULL; + cfg->level = 62; // default hevc level, 6.2 (the highest) + cfg->force_level = true; // don't care about level limits by-default + cfg->high_tier = false; + + cfg->me_max_steps = (uint32_t)-1; + + cfg->scaling_list = KVZ_SCALING_LIST_OFF; + + cfg->max_merge = 5; + cfg->early_skip = true; + return 1; } @@ -178,14 +194,14 @@ const char* current_arg = NULL; int32_t current_value; int32_t values[MAX_TILES_PER_DIM]; - + int i; - + //Free pointer in any case if (*array) { FREE_POINTER(*array); } - + //If the arg starts with u, we want an uniform split if (arg[0]=='u') { *ntiles = atoi(arg + 1); @@ -196,7 +212,7 @@ //Done with parsing return 1; } - + //We have a comma-separated list of int for the split... current_arg = arg; *ntiles = 1; @@ -213,27 +229,27 @@ ++(*ntiles); if (MAX_TILES_PER_DIM <= *ntiles) break; } while (current_arg); - + if (MAX_TILES_PER_DIM <= *ntiles || 1 >= *ntiles) { fprintf(stderr, "Invalid number of tiles (1 <= %d <= %d = MAX_TILES_PER_DIM)!\n", *ntiles, MAX_TILES_PER_DIM); return 0; } - + *array = MALLOC(int32_t, *ntiles - 1); if (!*array) { fprintf(stderr, "Could not allocate array for tiles\n"); return 0; } - + //TODO: memcpy? for (i = 0; i < *ntiles - 1; ++i) { (*array)[i] = values[i]; } - + return 1; } -static int parse_uint8(const char *numstr,uint8_t* number,int min, int max) +static int parse_uint8(const char *numstr,uint8_t* number,int min, int max) { char *tail; int d = strtol(numstr, &tail, 10); @@ -285,14 +301,14 @@ const char* current_arg = NULL; int32_t current_value; int32_t values[MAX_SLICES]; - + int i; - + //Free pointer in any case if (*array) { FREE_POINTER(*array); } - + //If the arg starts with u, we want an uniform split if (arg[0]=='u') { *nslices = atoi(arg+1); @@ -303,7 +319,7 @@ //Done with parsing return 1; } - + //We have a comma-separated list of int for the split... 
current_arg = arg; //We always have a slice starting at 0 @@ -322,29 +338,29 @@ ++(*nslices); if (MAX_SLICES <= *nslices) break; } while (current_arg); - + if (MAX_SLICES <= *nslices || 0 >= *nslices) { fprintf(stderr, "Invalid number of slices (0 < %d <= %d = MAX_SLICES)!\n", *nslices, MAX_SLICES); return 0; } - + *array = MALLOC(int32_t, *nslices); if (!*array) { fprintf(stderr, "Could not allocate array for slices\n"); return 0; } - + //TODO: memcpy? for (i = 0; i < *nslices; ++i) { (*array)[i] = values[i]; } - + return 1; } int kvz_config_parse(kvz_config *cfg, const char *name, const char *value) { - static const char * const me_names[] = { "hexbs", "tz", "full", "full8", "full16", "full32", "full64", NULL }; + static const char * const me_names[] = { "hexbs", "tz", "full", "full8", "full16", "full32", "full64", "dia", NULL }; static const char * const source_scan_type_names[] = { "progressive", "tff", "bff", NULL }; static const char * const overscan_names[] = { "undef", "show", "crop", NULL }; @@ -368,221 +384,270 @@ static const char * const sao_names[] = { "off", "edge", "band", "full", NULL }; - static const char * const preset_values[11][20*2] = { - { - "ultrafast", + static const char * const scaling_list_names[] = { "off", "custom", "default", NULL }; + + static const char * const preset_values[11][25*2] = { + { + "ultrafast", + "rd", "0", "pu-depth-intra", "2-3", "pu-depth-inter", "2-3", - "rd", "0", "me", "hexbs", + "gop", "lp-g4d4t1", "ref", "1", + "bipred", "0", "deblock", "0:0", "signhide", "0", - "subme", "0", + "subme", "2", "sao", "off", "rdoq", "0", - "rdoq-skip", "1", - "transform-skip", "0", - "full-intra-search", "0", + "rdoq-skip", "0", + "transform-skip", "0", "mv-rdo", "0", + "full-intra-search", "0", "smp", "0", "amp", "0", "cu-split-termination", "zero", "me-early-termination", "sensitive", - "gop", "lp-g4d3t1", - NULL + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "28", + "max-merge", "5", + NULL }, - { + { "superfast", + "rd", "0", "pu-depth-intra", "2-3", "pu-depth-inter", "2-3", - "rd", "0", "me", "hexbs", + "gop", "lp-g4d4t1", "ref", "1", + "bipred", "0", "deblock", "0:0", "signhide", "0", - "subme", "0", + "subme", "2", "sao", "full", "rdoq", "0", - "rdoq-skip", "1", + "rdoq-skip", "0", "transform-skip", "0", - "full-intra-search", "0", "mv-rdo", "0", + "full-intra-search", "0", "smp", "0", "amp", "0", "cu-split-termination", "zero", "me-early-termination", "sensitive", - "gop", "lp-g4d3t1", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "28", + "max-merge", "5", NULL }, { "veryfast", - "pu-depth-intra", "2-3", - "pu-depth-inter", "2-3", "rd", "0", + "pu-depth-intra", "2-3", + "pu-depth-inter", "1-3", "me", "hexbs", + "gop", "lp-g4d4t1", "ref", "1", + "bipred", "0", "deblock", "0:0", "signhide", "0", "subme", "2", "sao", "full", "rdoq", "0", - "rdoq-skip", "1", + "rdoq-skip", "0", "transform-skip", "0", - "full-intra-search", "0", "mv-rdo", "0", + "full-intra-search", "0", "smp", "0", "amp", "0", "cu-split-termination", "zero", "me-early-termination", "sensitive", - "gop", "lp-g4d3t1", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "28", + "max-merge", "5", NULL }, { "faster", + "rd", "0", "pu-depth-intra", "2-3", "pu-depth-inter", "1-3", - "rd", "1", "me", "hexbs", + "gop", "lp-g4d4t1", "ref", "1", + "bipred", "0", "deblock", "0:0", "signhide", "0", - "subme", "2", + "subme", "4", "sao", "full", "rdoq", "0", - "rdoq-skip", "1", + "rdoq-skip", "0", "transform-skip", "0", - 
"full-intra-search", "0", "mv-rdo", "0", + "full-intra-search", "0", "smp", "0", "amp", "0", "cu-split-termination", "zero", "me-early-termination", "sensitive", - "gop", "lp-g4d3t1", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "0", + "max-merge", "5", NULL }, { "fast", - "pu-depth-intra", "2-3", + "rd", "0", + "pu-depth-intra", "1-3", "pu-depth-inter", "1-3", - "rd", "1", "me", "hexbs", - "ref", "1", + "gop", "lp-g4d4t1", + "ref", "2", + "bipred", "0", "deblock", "0:0", "signhide", "0", "subme", "4", "sao", "full", "rdoq", "0", - "rdoq-skip", "1", + "rdoq-skip", "0", "transform-skip", "0", - "full-intra-search", "0", "mv-rdo", "0", + "full-intra-search", "0", "smp", "0", "amp", "0", "cu-split-termination", "zero", - "me-early-termination", "on", - "gop", "lp-g4d3t1", + "me-early-termination", "sensitive", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "0", + "max-merge", "5", NULL }, { "medium", - "pu-depth-intra", "1-3", - "pu-depth-inter", "1-3", - "rd", "1", + "rd", "0", + "pu-depth-intra", "1-4", + "pu-depth-inter", "0-3", "me", "hexbs", - "ref", "1", + "gop", "8", + "ref", "4", + "bipred", "0", "deblock", "0:0", "signhide", "0", "subme", "4", "sao", "full", "rdoq", "1", - "rdoq-skip", "1", + "rdoq-skip", "0", "transform-skip", "0", - "full-intra-search", "0", "mv-rdo", "0", + "full-intra-search", "0", "smp", "0", "amp", "0", "cu-split-termination", "zero", "me-early-termination", "on", - "gop", "lp-g4d3t1", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "0", + "max-merge", "5", NULL }, { "slow", - "pu-depth-intra", "1-3", - "pu-depth-inter", "1-3", - "rd", "1", + "rd", "0", + "pu-depth-intra", "1-4", + "pu-depth-inter", "0-3", "me", "hexbs", - "ref", "2", + "gop", "8", + "ref", "4", + "bipred", "1", "deblock", "0:0", - "signhide", "1", + "signhide", "0", "subme", "4", "sao", "full", "rdoq", "1", - "rdoq-skip", "1", + "rdoq-skip", "0", "transform-skip", "0", - "full-intra-search", "0", "mv-rdo", "0", + "full-intra-search", "0", "smp", "0", "amp", "0", "cu-split-termination", "zero", "me-early-termination", "on", - "gop", "lp-g4d2t1", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "0", + "max-merge", "5", NULL }, { "slower", - "pu-depth-intra", "1-3", + "rd", "2", + "pu-depth-intra", "1-4", "pu-depth-inter", "0-3", - "rd", "1", "me", "hexbs", - "ref", "2", + "gop", "8", + "ref", "4", + "bipred", "1", "deblock", "0:0", "signhide", "1", "subme", "4", "sao", "full", "rdoq", "1", - "rdoq-skip", "1", + "rdoq-skip", "0", "transform-skip", "0", - "full-intra-search", "0", "mv-rdo", "0", + "full-intra-search", "0", "smp", "0", "amp", "0", "cu-split-termination", "zero", - "me-early-termination", "on", - "gop", "lp-g4d2t1", + "me-early-termination", "off", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "0", + "max-merge", "5", NULL }, { "veryslow", + "rd", "2", "pu-depth-intra", "1-4", "pu-depth-inter", "0-3", - "rd", "1", "me", "hexbs", - "ref", "3", + "gop", "8", + "ref", "4", + "bipred", "1", "deblock", "0:0", "signhide", "1", "subme", "4", "sao", "full", "rdoq", "1", - "rdoq-skip", "1", + "rdoq-skip", "0", "transform-skip", "0", - "full-intra-search", "0", "mv-rdo", "0", - "smp", "0", + "full-intra-search", "0", + "smp", "1", "amp", "0", "cu-split-termination", "zero", - "me-early-termination", "on", - "gop", "lp-g4d2t1", + "me-early-termination", "off", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "0", + "max-merge", "5", NULL }, { "placebo", + "rd", "2", 
"pu-depth-intra", "1-4", "pu-depth-inter", "0-3", - "rd", "1", "me", "tz", + "gop", "8", "ref", "4", + "bipred", "1", "deblock", "0:0", "signhide", "1", "subme", "4", @@ -590,13 +655,16 @@ "rdoq", "1", "rdoq-skip", "0", "transform-skip", "1", - "full-intra-search", "0", "mv-rdo", "1", + "full-intra-search", "0", "smp", "1", "amp", "1", "cu-split-termination", "off", "me-early-termination", "off", - "gop", "lp-g4d2t1", + "intra-rdo-et", "0", + "early-skip", "1", + "fast-residual-cost", "0", + "max-merge", "5", NULL }, { NULL } @@ -717,10 +785,17 @@ } FREE_POINTER(cfg->cqmfile); cfg->cqmfile = cqmfile; + cfg->scaling_list = KVZ_SCALING_LIST_CUSTOM; + } + else if OPT("scaling-list") { + int8_t scaling_list = KVZ_SCALING_LIST_OFF; + int result = parse_enum(value, scaling_list_names, &scaling_list); + cfg->scaling_list = scaling_list; + return result; } else if OPT("tiles-width-split") { int retval = parse_tiles_specification(value, &cfg->tiles_width_count, &cfg->tiles_width_split); - + if (cfg->tiles_width_count > 1 && cfg->tmvp_enable) { cfg->tmvp_enable = false; fprintf(stderr, "Disabling TMVP because tiles are used.\n"); @@ -735,7 +810,7 @@ } else if OPT("tiles-height-split") { int retval = parse_tiles_specification(value, &cfg->tiles_height_count, &cfg->tiles_height_split); - + if (cfg->tiles_height_count > 1 && cfg->tmvp_enable) { cfg->tmvp_enable = false; fprintf(stderr, "Disabling TMVP because tiles are used.\n"); @@ -815,7 +890,7 @@ } } else if OPT("cpuid") - cfg->cpuid = atoi(value); + cfg->cpuid = atobool(value); else if OPT("pu-depth-inter") return sscanf(value, "%d-%d", &cfg->pu_depth_inter.min, &cfg->pu_depth_inter.max) == 2; else if OPT("pu-depth-intra") @@ -899,6 +974,9 @@ return 0; } } + else if OPT("open-gop") { + cfg->open_gop = (bool)atobool(value); + } else if OPT("bipred") cfg->bipred = atobool(value); else if OPT("bitrate") @@ -1015,6 +1093,8 @@ cfg->me_early_termination = mode; return result; } + else if OPT("intra-rdo-et") + cfg->intra_rdo_et = (bool)atobool(value); else if OPT("lossless") cfg->lossless = (bool)atobool(value); else if OPT("tmvp") { @@ -1081,6 +1161,7 @@ if (width > 10000 || height > 10000) { fprintf(stderr, "ROI dimensions exceed arbitrary value of 10000.\n"); + fclose(f); return 0; } @@ -1109,10 +1190,79 @@ fclose(f); } - else if OPT("erp-aqp") + else if OPT("set-qp-in-cu") { + cfg->set_qp_in_cu = (bool)atobool(value); + } + else if OPT("erp-aqp") { cfg->erp_aqp = (bool)atobool(value); - else + } + else if (OPT("level") || OPT("force-level")) { + if OPT("force-level") { + cfg->force_level = true; + } else { + cfg->force_level = false; + } + + unsigned int num_first, num_second, level; + int matched_amount = sscanf(value, "%u.%u", &num_first, &num_second); + + if (matched_amount == 2) { + // of form x.y + level = num_first * 10 + num_second; + } else if (matched_amount == 1) { + // no dot + if (num_first < 10) { + // of form x + level = num_first * 10; + } else { + // of form xx + level = num_first; + } + } else { + fprintf(stderr, "Invalid level value: \"%s\"\n", value); + return 0; + } + if (level < 10 || level > 62) { + fprintf(stderr, "Level value of %s is out of bounds\n", value); + return 0; + } + + cfg->level = level; + } + else if (OPT("high-tier")) { + cfg->high_tier = true; + } + else if (OPT("me-steps")) { + char * tailptr = NULL; + long steps = strtol(value, &tailptr, 0); + + if (*tailptr != '\0') { + fprintf(stderr, "Invalid me-steps value: \"%s\"", value); + return 0; + } + if (steps < -1 || steps > UINT32_MAX) { + fprintf(stderr, 
"me-steps value is out of bounds: \"%s\"", value); + return 0; + } + + cfg->me_max_steps = (uint32_t)steps; + } + else if (OPT("fast-residual-cost")) + cfg->fast_residual_cost_limit = atoi(value); + else if (OPT("max-merge")) { + int max_merge = atoi(value); + if (max_merge < 1 || max_merge > 5) { + fprintf(stderr, "max-merge needs to be between 1 and 5\n"); + return 0; + } + cfg->max_merge = (uint8_t)max_merge; + } + else if OPT("early-skip") { + cfg->early_skip = (bool)atobool(value); + } + else { return 0; + } #undef OPT return 1; @@ -1209,6 +1359,9 @@ cfg->gop[gop.g - 1].qp_factor = 0.578; // from HM } +// forward declaration +static int validate_hevc_level(kvz_config *const cfg); + /** * \brief Check that configuration is sensible. * @@ -1267,7 +1420,9 @@ error = 1; } - if (cfg->gop_len && cfg->intra_period && !cfg->gop_lowdelay && + if (cfg->gop_len && + cfg->intra_period > 1 && + !cfg->gop_lowdelay && cfg->intra_period % cfg->gop_len != 0) { fprintf(stderr, @@ -1328,7 +1483,7 @@ } if (!WITHIN(cfg->pu_depth_inter.min, PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX) || - !WITHIN(cfg->pu_depth_inter.max, PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX)) + !WITHIN(cfg->pu_depth_inter.max, PU_DEPTH_INTER_MIN, PU_DEPTH_INTER_MAX)) { fprintf(stderr, "Input error: illegal value for --pu-depth-inter (%d-%d)\n", cfg->pu_depth_inter.min, cfg->pu_depth_inter.max); @@ -1407,5 +1562,174 @@ error = 1; } + if ((cfg->scaling_list == KVZ_SCALING_LIST_CUSTOM) && !cfg->cqmfile) { + fprintf(stderr, "Input error: --scaling-list=custom does not work without --cqmfile=<FILE>.\n"); + error = 1; + } + + if (validate_hevc_level((kvz_config *const) cfg)) { + // a level error found and it's not okay + error = 1; + } + return !error; } + +static int validate_hevc_level(kvz_config *const cfg) { + static const struct { uint32_t lsr; uint32_t lps; uint32_t main_bitrate; } LEVEL_CONSTRAINTS[13] = { + { 552960, 36864, 128 }, // 1 + + { 3686400, 122880, 1500 }, // 2 + { 7372800, 245760, 3000 }, // 2.1 + + { 16588800, 552960, 6000 }, // 3 + { 33177600, 983040, 10000 }, // 3.1 + + { 66846720, 2228224, 12000 }, // 4 + { 133693440, 2228224, 20000 }, // 4.1 + + { 267386880, 8912896, 25000 }, // 5 + { 534773760, 8912896, 40000 }, // 5.1 + { 1069547520, 8912896, 60000 }, // 5.2 + + { 1069547520, 35651584, 60000 }, // 6 + { 2139095040, 35651584, 120000 }, // 6.1 + { 4278190080, 35651584, 240000 }, // 6.2 + }; + + // bit rates for the high-tiers of the levels from 4 to 6.2 + static const uint32_t HIGH_TIER_BITRATES[8] = { + 30000, 50000, 100000, 160000, 240000, 240000, 480000, 800000 + }; + + int level_error = 0; + + const char* level_err_prefix; + if (cfg->force_level) { + level_err_prefix = "Level warning"; + } else { + level_err_prefix = "Level error"; + } + + uint8_t lvl_idx; + + // for nicer error print + float lvl = ((float)cfg->level) / 10.0f; + + // check if the level is valid and get it's lsr and lps values + switch (cfg->level) { + case 10: + lvl_idx = 0; + break; + case 20: + lvl_idx = 1; + break; + case 21: + lvl_idx = 2; + break; + case 30: + lvl_idx = 3; + break; + case 31: + lvl_idx = 4; + break; + case 40: + lvl_idx = 5; + break; + case 41: + lvl_idx = 6; + break; + case 50: + lvl_idx = 7; + break; + case 51: + lvl_idx = 8; + break; + case 52: + lvl_idx = 9; + break; + case 60: + lvl_idx = 10; + break; + case 61: + lvl_idx = 11; + break; + case 62: + lvl_idx = 12; + break; + + default: + fprintf(stderr, "Input error: %g is an invalid level value\n", lvl); + return 1; + } + + if (cfg->high_tier && cfg->level < 40) { + 
fprintf(stderr, "Input error: high tier requires at least level 4\n"); + return 1; + } + + // max luma sample rate + uint32_t max_lsr = LEVEL_CONSTRAINTS[lvl_idx].lsr; + + // max luma picture size + uint32_t max_lps = LEVEL_CONSTRAINTS[lvl_idx].lps; + + if (cfg->high_tier) { + cfg->max_bitrate = HIGH_TIER_BITRATES[lvl_idx - 5] * 1000; + } else { + cfg->max_bitrate = LEVEL_CONSTRAINTS[lvl_idx].main_bitrate * 1000; + } + + if (cfg->target_bitrate > cfg->max_bitrate) { + fprintf(stderr, "%s: target bitrate exceeds %i, which is the maximum %s tier level %g bitrate\n", + level_err_prefix, cfg->max_bitrate, cfg->high_tier?"high":"main", lvl); + level_error = 1; + } + + // check the conformance to the level limits + + // luma samples + uint64_t cfg_samples = cfg->width * cfg->height; + + // luma sample rate + double framerate = ((double)cfg->framerate_num) / ((double)cfg->framerate_denom); + uint64_t cfg_sample_rate = cfg_samples * (uint64_t) framerate; + + // square of the maximum allowed dimension + uint32_t max_dimension_squared = 8 * max_lps; + + // check maximum dimensions + if (cfg->width * cfg->width > max_dimension_squared) { + uint32_t max_dim = sqrtf(max_dimension_squared); + fprintf(stderr, "%s: picture width of %i is too large for this level (%g), maximum dimension is %i\n", + level_err_prefix, cfg->width, lvl, max_dim); + level_error = 1; + } + if (cfg->height * cfg->height > max_dimension_squared) { + uint32_t max_dim = sqrtf(max_dimension_squared); + fprintf(stderr, "%s: picture height of %i is too large for this level (%g), maximum dimension is %i\n", + level_err_prefix, cfg->height, lvl, max_dim); + level_error = 1; + } + + // check luma picture size + if (cfg_samples > max_lps) { + fprintf(stderr, "%s: picture resolution of %ix%i is too large for this level (%g) (it has %llu samples, maximum is %u samples)\n", + level_err_prefix, cfg->width, cfg->height, lvl, (unsigned long long) cfg_samples, max_lps); + level_error = 1; + } + + // check luma sample rate + if (cfg_sample_rate > max_lsr) { + fprintf(stderr, "%s: framerate of %g is too big for this level (%g) and picture resolution (it has the sample rate of %llu, maximum is %u\n", + level_err_prefix, framerate, lvl, (unsigned long long) cfg_sample_rate, max_lsr); + level_error = 1; + } + + if (cfg->force_level) { + // we wanted to print warnings, not get errors + return 0; + } else { + return level_error; + } +}
View file
kvazaar-1.2.0.tar.gz/src/cfg.h -> kvazaar-1.3.0.tar.gz/src/cfg.h
Changed
@@ -30,7 +30,6 @@ #include "kvazaar.h" - /* Function definitions */ kvz_config *kvz_config_alloc(void); int kvz_config_init(kvz_config *cfg);
View file
kvazaar-1.2.0.tar.gz/src/cli.c -> kvazaar-1.3.0.tar.gz/src/cli.c
Changed
@@ -36,9 +36,9 @@ { "input", required_argument, NULL, 'i' }, { "output", required_argument, NULL, 'o' }, { "debug", required_argument, NULL, 'd' }, - { "width", required_argument, NULL, 'w' }, + { "width", required_argument, NULL, 'w' }, // deprecated { "height", required_argument, NULL, 'h' }, // deprecated - { "frames", required_argument, NULL, 'n' }, // deprecated + { "frames", required_argument, NULL, 'n' }, { "qp", required_argument, NULL, 'q' }, { "period", required_argument, NULL, 'p' }, { "ref", required_argument, NULL, 'r' }, @@ -86,7 +86,8 @@ { "owf", required_argument, NULL, 0 }, { "slices", required_argument, NULL, 0 }, { "threads", required_argument, NULL, 0 }, - { "cpuid", required_argument, NULL, 0 }, + { "cpuid", optional_argument, NULL, 0 }, + { "no-cpuid", no_argument, NULL, 0 }, { "pu-depth-inter", required_argument, NULL, 0 }, { "pu-depth-intra", required_argument, NULL, 0 }, { "info", no_argument, NULL, 0 }, @@ -109,6 +110,8 @@ { "crypto", required_argument, NULL, 0 }, { "key", required_argument, NULL, 0 }, { "me-early-termination",required_argument, NULL, 0 }, + { "intra-rdo-et", no_argument, NULL, 0 }, + { "no-intra-rdo-et", no_argument, NULL, 0 }, { "lossless", no_argument, NULL, 0 }, { "no-lossless", no_argument, NULL, 0 }, { "tmvp", no_argument, NULL, 0 }, @@ -122,6 +125,18 @@ { "roi", required_argument, NULL, 0 }, { "erp-aqp", no_argument, NULL, 0 }, { "no-erp-aqp", no_argument, NULL, 0 }, + { "level", required_argument, NULL, 0 }, + { "force-level", required_argument, NULL, 0 }, + { "high-tier", no_argument, NULL, 0 }, + { "me-steps", required_argument, NULL, 0 }, + { "fast-residual-cost", required_argument, NULL, 0 }, + { "set-qp-in-cu", no_argument, NULL, 0 }, + { "open-gop", no_argument, NULL, 0 }, + { "no-open-gop", no_argument, NULL, 0 }, + { "scaling-list", required_argument, NULL, 0 }, + { "max-merge", required_argument, NULL, 0 }, + { "early-skip", no_argument, NULL, 0 }, + { "no-early-skip", no_argument, NULL, 0 }, {0, 0, 0, 0} }; @@ -316,168 +331,214 @@ "Usage:\n" "kvazaar -i <input> --input-res <width>x<height> -o <output>\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Required:\n" - " -i, --input : Input file\n" + " -i, --input <filename> : Input file\n" " --input-res <res> : Input resolution [auto]\n" - " auto: detect from file name\n" - " <int>x<int>: width times height\n" - " -o, --output : Output file\n" + " - auto: Detect from file name.\n" + " - <int>x<int>: width times height\n" + " -o, --output <filename> : Output file\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Presets:\n" - " --preset=<preset> : Set options to a preset [medium]\n" + " --preset <preset> : Set options to a preset [medium]\n" " - ultrafast, superfast, veryfast, faster,\n" " fast, medium, slow, slower, veryslow\n" " placebo\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Input:\n" " -n, --frames <integer> : Number of frames to code [all]\n" " --seek <integer> : First frame to code [0]\n" - " --input-fps <num>/<denom> : Framerate of the input video [25.0]\n" - " --source-scan-type <string> : Set source scan type [progressive].\n" - " - progressive: progressive scan\n" - " - tff: top 
field first\n" - " - bff: bottom field first\n" - " --input-format : P420 or P400\n" - " --input-bitdepth : 8-16\n" - " --loop-input : Re-read input file forever\n" + " --input-fps <num>[/<denom>] : Frame rate of the input video [25]\n" + " --source-scan-type <string> : Source scan type [progressive]\n" + " - progressive: Progressive scan\n" + " - tff: Top field first\n" + " - bff: Bottom field first\n" + " --input-format <string> : P420 or P400 [P420]\n" + " --input-bitdepth <int> : 8-16 [8]\n" + " --loop-input : Re-read input file forever.\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Options:\n" - " --help : Print this help message and exit\n" - " --version : Print version information and exit\n" - " --aud : Use access unit delimiters\n" - " --debug <string> : Output encoders reconstruction.\n" - " --cpuid <integer> : Disable runtime cpu optimizations with value 0.\n" - " --hash : Decoded picture hash [checksum]\n" + " --help : Print this help message and exit.\n" + " --version : Print version information and exit.\n" + " --(no-)aud : Use access unit delimiters. [disabled]\n" + " --debug <filename> : Output internal reconstruction.\n" + " --(no-)cpuid : Enable runtime CPU optimizations. [enabled]\n" + " --hash <string> : Decoded picture hash [checksum]\n" " - none: 0 bytes\n" " - checksum: 18 bytes\n" " - md5: 56 bytes\n" - " --no-psnr : Don't calculate PSNR for frames\n" - " --no-info : Don't add encoder info SEI.\n" + " --(no-)psnr : Calculate PSNR for frames. [enabled]\n" + " --(no-)info : Add encoder info SEI. [enabled]\n" + " --crypto <string> : Selective encryption. Crypto support must be\n" + " enabled at compile-time. Can be 'on' or 'off' or\n" + " a list of features separated with a '+'. [off]\n" + " - on: Enable all encryption features.\n" + " - off: Disable selective encryption.\n" + " - mvs: Motion vector magnitudes.\n" + " - mv_signs: Motion vector signs.\n" + " - trans_coeffs: Coefficient magnitudes.\n" + " - trans_coeff_signs: Coefficient signs.\n" + " - intra_pred_modes: Intra prediction modes.\n" + " --key <string> : Encryption key [16,213,27,56,255,127,242,112,\n" + " 97,126,197,204,25,59,38,30]\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Video structure:\n" - " -q, --qp <integer> : Quantization Parameter [32]\n" - " -p, --period <integer> : Period of intra pictures [0]\n" - " - 0: only first picture is intra\n" - " - 1: all pictures are intra\n" - " - 2-N: every Nth picture is intra\n" - " --vps-period <integer> : Specify how often the video parameter set is\n" - " re-sent. 
[0]\n" - " - 0: only send VPS with the first frame\n" - " - N: send VPS with every Nth intra frame\n" - " -r, --ref <integer> : Reference frames, range 1..15 [3]\n" - " --gop <string> : Definition of GOP structure [0]\n" - " - 0: disabled\n" + " -q, --qp <integer> : Quantization parameter [22]\n" + " -p, --period <integer> : Period of intra pictures [64]\n" + " - 0: Only first picture is intra.\n" + " - 1: All pictures are intra.\n" + " - N: Every Nth picture is intra.\n" + " --vps-period <integer> : How often the video parameter set is re-sent [0]\n" + " - 0: Only send VPS with the first frame.\n" + " - N: Send VPS with every Nth intra frame.\n" + " -r, --ref <integer> : Number of reference frames, in range 1..15 [4]\n" + " --gop <string> : GOP structure [8]\n" + " - 0: Disabled\n" " - 8: B-frame pyramid of length 8\n" - " - lp-<string>: lp-gop definition\n" - " (e.g. lp-g8d4t2, see README)\n" - " --cqmfile <string> : Custom Quantization Matrices from a file\n" - " --bitrate <integer> : Target bitrate. [0]\n" - " - 0: disable rate-control\n" - " - N: target N bits per second\n" - " --lossless : Use lossless coding\n" - " --mv-constraint : Constrain movement vectors\n" - " - none: no constraint\n" - " - frametile: constrain within the tile\n" - " - frametilemargin: constrain even more\n" - " --roi <string> : Use a delta QP map for region of interest\n" - " Read an array of delta QP values from\n" - " a file, where the first two values are the\n" - " width and height, followed by width*height\n" - " delta QP values in raster order.\n" - " The delta QP map can be any size or aspect\n" - " ratio, and will be mapped to LCU's.\n" - " --(no-)erp-aqp : Use adaptive QP for 360 video with\n" - " equirectangular projection\n" + " - lp-<string>: Low-delay P-frame GOP\n" + " (e.g. lp-g8d4t2, see README)\n" + " --(no-)open-gop : Use open GOP configuration. [enabled]\n" + " --cqmfile <filename> : Read custom quantization matrices from a file.\n" + " --scaling-list <string>: Set scaling list mode. [off]\n" + " - off: Disable scaling lists.\n" + " - custom: use custom list (with --cqmfile).\n" + " - default: Use default lists.\n" + " --bitrate <integer> : Target bitrate [0]\n" + " - 0: Disable rate control.\n" + " - N: Target N bits per second.\n" + " --(no-)lossless : Use lossless coding. [disabled]\n" + " --mv-constraint <string> : Constrain movement vectors. [none]\n" + " - none: No constraint\n" + " - frametile: Constrain within the tile.\n" + " - frametilemargin: Constrain even more.\n" + " --roi <filename> : Use a delta QP map for region of interest.\n" + " Reads an array of delta QP values from a text\n" + " file. The file format is: width and height of\n" + " the QP delta map followed by width*height delta\n" + " QP values in raster order. The map can be of any\n" + " size and will be scaled to the video size.\n" + " --set-qp-in-cu : Set QP at CU level keeping pic_init_qp_minus26.\n" + " in PPS and slice_qp_delta in slize header zero.\n" + " --(no-)erp-aqp : Use adaptive QP for 360 degree video with\n" + " equirectangular projection. [disabled]\n" + " --level <number> : Use the given HEVC level in the output and give\n" + " an error if level limits are exceeded. [6.2]\n" + " - 1, 2, 2.1, 3, 3.1, 4, 4.1, 5, 5.1, 5.2, 6,\n" + " 6.1, 6.2\n" + " --force-level <number> : Same as --level but warnings instead of errors.\n" + " --high-tier : Used with --level. 
Use high tier bitrate limits\n" + " instead of the main tier limits during encoding.\n" + " High tier requires level 4 or higher.\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Compression tools:\n" - " --deblock [<beta:tc>] : Deblocking\n" - " - beta: between -6 and 6\n" - " - tc: between -6 and 6\n" - " --(no-)sao : Sample Adaptive Offset\n" - " --(no-)rdoq : Rate-Distortion Optimized Quantization\n" - " --(no-)signhide : Sign Hiding\n" - " --(no-)smp : Symmetric Motion Partition\n" - " --(no-)amp : Asymmetric Motion Partition\n" - " --rd <integer> : Intra mode search complexity\n" - " - 0: skip intra if inter is good enough\n" - " - 1: rough intra mode search with SATD\n" - " - 2: refine intra mode search with SSE\n" - " --(no-)mv-rdo : Rate-Distortion Optimized motion vector costs\n" - " --(no-)full-intra-search\n" - " : Try all intra modes during rough search.\n" - " --(no-)transform-skip : Transform skip\n" - " --me <string> : Integer motion estimation\n" + " --(no-)deblock <beta:tc> : Deblocking filter. [0:0]\n" + " - beta: Between -6 and 6\n" + " - tc: Between -6 and 6\n" + " --sao <string> : Sample Adaptive Offset [full]\n" + " - off: SAO disabled\n" + " - band: Band offset only\n" + " - edge: Edge offset only\n" + " - full: Full SAO\n" + " --(no-)rdoq : Rate-distortion optimized quantization [enabled]\n" + " --(no-)rdoq-skip : Skip RDOQ for 4x4 blocks. [disabled]\n" + " --(no-)signhide : Sign hiding [disabled]\n" + " --(no-)smp : Symmetric motion partition [disabled]\n" + " --(no-)amp : Asymmetric motion partition [disabled]\n" + " --rd <integer> : Intra mode search complexity [0]\n" + " - 0: Skip intra if inter is good enough.\n" + " - 1: Rough intra mode search with SATD.\n" + " - 2: Refine intra mode search with SSE.\n" + " - 3: Try all intra modes and enable intra\n" + " chroma mode search.\n" + " --(no-)mv-rdo : Rate-distortion optimized motion vector costs\n" + " [disabled]\n" + " --(no-)full-intra-search : Try all intra modes during rough search.\n" + " [disabled]\n" + " --(no-)transform-skip : Try transform skip [disabled]\n" + " --me <string> : Integer motion estimation algorithm [hexbs]\n" " - hexbs: Hexagon Based Search\n" " - tz: Test Zone Search\n" " - full: Full Search\n" " - full8, full16, full32, full64\n" - " --subme <integer> : Set fractional pixel motion estimation level\n" - " - 0: only integer motion estimation\n" + " - dia: Diamond Search\n" + " --me-steps <integer> : Motion estimation search step limit. Only\n" + " affects 'hexbs' and 'dia'. 
[-1]\n" + " --subme <integer> : Fractional pixel motion estimation level [4]\n" + " - 0: Integer motion estimation only\n" " - 1: + 1/2-pixel horizontal and vertical\n" " - 2: + 1/2-pixel diagonal\n" " - 3: + 1/4-pixel horizontal and vertical\n" " - 4: + 1/4-pixel diagonal\n" - " --pu-depth-inter <int>-<int>\n" - " : Range for sizes for inter predictions\n" + " --pu-depth-inter <int>-<int> : Inter prediction units sizes [0-3]\n" " - 0, 1, 2, 3: from 64x64 to 8x8\n" - " --pu-depth-intra <int>-<int> : Range for sizes for intra predictions\n" + " --pu-depth-intra <int>-<int> : Intra prediction units sizes [1-4]\n" " - 0, 1, 2, 3, 4: from 64x64 to 4x4\n" - " --(no-)bipred : Bi-prediction\n" - " --(no-)cu-split-termination\n" - " : CU split search termination condition\n" - " - off: Never terminate cu-split search\n" - " - zero: Terminate with zero residual\n" - " --(no-)me-early-termination : ME early termination condition\n" - " - off: Don't terminate early\n" - " - on: Terminate early\n" - " - sensitive: Terminate even earlier\n" - " --(no-)implicit-rdpcm : Implicit residual DPCM\n" - " Currently only supported with lossless coding.\n" - " --(no-)tmvp : Temporal Motion Vector Prediction\n" - " --(no-)rdoq-skip : Skips RDOQ for 4x4 blocks\n" + " --tr-depth-intra <int> : Transform split depth for intra blocks [0]\n" + " --(no-)bipred : Bi-prediction [disabled]\n" + " --cu-split-termination <string> : CU split search termination [zero]\n" + " - off: Don't terminate early.\n" + " - zero: Terminate when residual is zero.\n" + " --me-early-termination <string> : Motion estimation termination [on]\n" + " - off: Don't terminate early.\n" + " - on: Terminate early.\n" + " - sensitive: Terminate even earlier.\n" + " --fast-residual-cost <int> : Skip CABAC cost for residual coefficients\n" + " when QP is below the limit. [0]\n" + " --(no-)intra-rdo-et : Check intra modes in rdo stage only until\n" + " a zero coefficient CU is found. [disabled]\n" + " --(no-)early-skip : Try to find skip cu from merge candidates.\n" + " Perform no further search if skip is found.\n" + " For rd=0..1: Try the first candidate.\n" + " For rd=2.. : Try the best candidate based\n" + " on luma satd cost. [enabled]\n" + " --max-merge <integer> : Maximum number of merge candidates, 1..5 [5]\n" + " --(no-)implicit-rdpcm : Implicit residual DPCM. Currently only supported\n" + " with lossless coding. [disabled]\n" + " --(no-)tmvp : Temporal motion vector prediction [enabled]\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Parallel processing:\n" " --threads <integer> : Number of threads to use [auto]\n" - " - 0: process everything with main thread\n" - " - N: use N threads for encoding\n" - " - auto: select based on number of cores\n" - " --owf <integer> : Frame parallelism [auto]\n" - " - N: Process N-1 frames at a time\n" - " - auto: Select automatically\n" - " --(no-)wpp : Wavefront parallel processing [enabled]\n" + " - 0: Process everything with main thread.\n" + " - N: Use N threads for encoding.\n" + " - auto: Select automatically.\n" + " --owf <integer> : Frame-level parallelism [auto]\n" + " - N: Process N+1 frames at a time.\n" + " - auto: Select automatically.\n" + " --(no-)wpp : Wavefront parallel processing. [enabled]\n" " Enabling tiles automatically disables WPP.\n" " To enable WPP with tiles, re-enable it after\n" - " enabling tiles.\n" + " enabling tiles. 
Enabling wpp with tiles is,\n" + " however, an experimental feature since it is\n" + " not supported in any HEVC profile.\n" " --tiles <int>x<int> : Split picture into width x height uniform tiles.\n" " --tiles-width-split <string>|u<int> :\n" - " Specifies a comma separated list of pixel\n" - " positions of tiles columns separation coordinates.\n" - " Can also be u followed by and a single int n,\n" - " in which case it produces columns of uniform width.\n" + " - <string>: A comma-separated list of tile\n" + " column pixel coordinates.\n" + " - u<int>: Number of tile columns of uniform\n" + " width.\n" " --tiles-height-split <string>|u<int> :\n" - " Specifies a comma separated list of pixel\n" - " positions of tiles rows separation coordinates.\n" - " Can also be u followed by and a single int n,\n" - " in which case it produces rows of uniform height.\n" - " --slices <string> : Control how slices are used\n" - " - tiles: put tiles in independent slices\n" - " - wpp: put rows in dependent slices\n" - " - tiles+wpp: do both\n" + " - <string>: A comma-separated list of tile row\n" + " column pixel coordinates.\n" + " - u<int>: Number of tile rows of uniform\n" + " height.\n" + " --slices <string> : Control how slices are used.\n" + " - tiles: Put tiles in independent slices.\n" + " - wpp: Put rows in dependent slices.\n" + " - tiles+wpp: Do both.\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Video Usability Information:\n" - " --sar <width:height> : Specify Sample Aspect Ratio\n" + " --sar <width:height> : Specify sample aspect ratio\n" " --overscan <string> : Specify crop overscan setting [undef]\n" " - undef, show, crop\n" " --videoformat <string> : Specify video format [undef]\n" - " - component, pal, ntsc, secam, mac, undef\n" + " - undef, component, pal, ntsc, secam, mac\n" " --range <string> : Specify color range [tv]\n" " - tv, pc\n" " --colorprim <string> : Specify color primaries [undef]\n" @@ -493,10 +554,10 @@ " smpte240m, GBR, YCgCo, bt2020nc, bt2020c\n" " --chromaloc <integer> : Specify chroma sample location (0 to 5) [0]\n" "\n" - /* Word wrap to this width to stay under 80 characters (including ") ************/ + /* Word wrap to this width to stay under 80 characters (including ") *************/ "Deprecated parameters: (might be removed at some point)\n" - " -w, --width : Use --input-res\n" - " -h, --height : Use --input-res\n"); + " -w, --width <integer> : Use --input-res.\n" + " -h, --height <integer> : Use --input-res.\n"); }
View file
kvazaar-1.2.0.tar.gz/src/cu.c -> kvazaar-1.3.0.tar.gz/src/cu.c
Changed
@@ -184,9 +184,10 @@ */ cu_array_t * kvz_cu_array_copy_ref(cu_array_t* cua) { - // The caller should have had another reference. - assert(cua->refcount > 0); - KVZ_ATOMIC_INC(&cua->refcount); + int32_t new_refcount = KVZ_ATOMIC_INC(&cua->refcount); + // The caller should have had another reference and we added one + // reference so refcount should be at least 2. + assert(new_refcount >= 2); return cua; }
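Note: the old code asserted on cua->refcount before incrementing it, so the value it checked was not the value the increment acted on; the new code asserts on the value returned by KVZ_ATOMIC_INC instead. KVZ_ATOMIC_INC itself is defined elsewhere in the tree, but the same increment-then-check pattern can be sketched with C11 atomics:

    #include <assert.h>
    #include <stdatomic.h>

    typedef struct { atomic_int refcount; } object_t;

    static object_t *copy_ref(object_t *obj)
    {
        /* atomic_fetch_add returns the value before the increment, so a
         * caller that already holds a reference implies old >= 1 and the
         * new count is at least 2 -- checked on the value returned by the
         * atomic operation, not on a separately loaded one. */
        int old = atomic_fetch_add(&obj->refcount, 1);
        assert(old >= 1);
        return obj;
    }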
View file
kvazaar-1.2.0.tar.gz/src/cu.h -> kvazaar-1.3.0.tar.gz/src/cu.h
Changed
@@ -123,6 +123,7 @@ uint8_t skipped : 1; //!< \brief flag to indicate this block is skipped uint8_t merged : 1; //!< \brief flag to indicate this block is merged uint8_t merge_idx : 3; //!< \brief merge index + uint8_t tr_skip : 1; //!< \brief transform skip flag uint16_t cbf; @@ -137,7 +138,6 @@ struct { int8_t mode; int8_t mode_chroma; - int8_t tr_skip; //!< \brief transform skip flag #if KVZ_SEL_ENCRYPTION int8_t mode_encry; #endif
View file
kvazaar-1.2.0.tar.gz/src/encmain.c -> kvazaar-1.3.0.tar.gz/src/encmain.c
Changed
@@ -27,6 +27,9 @@ /* The following two defines must be located before the inclusion of any system header files. */ #define WINVER 0x0500 #define _WIN32_WINNT 0x0500 + +#include "global.h" // IWYU pragma: keep + #include <fcntl.h> /* _O_BINARY */ #include <io.h> /* _setmode() */ #endif @@ -41,7 +44,6 @@ #include "checkpoint.h" #include "cli.h" #include "encoder.h" -#include "global.h" // IWYU pragma: keep #include "kvazaar.h" #include "kvazaar_internal.h" #include "threads.h" @@ -431,6 +433,12 @@ uint32_t frames_done = 0; double psnr_sum[3] = { 0.0, 0.0, 0.0 }; + // how many bits have been written this second? used for checking if framerate exceeds level's limits + uint64_t bits_this_second = 0; + // the amount of frames have been encoded in this second of video. can be non-integer value if framerate is non-integer value + unsigned frames_this_second = 0; + const float framerate = ((float)encoder->cfg.framerate_num) / ((float)encoder->cfg.framerate_denom); + uint8_t padding_x = get_padding(opts->config->width); uint8_t padding_y = get_padding(opts->config->height); @@ -527,6 +535,39 @@ fflush(output); bitstream_length += len_out; + + // the level's bitrate check + frames_this_second += 1; + + if ((float)frames_this_second >= framerate) { + // if framerate <= 1 then we go here always + + // how much of the bits of the last frame belonged to the next second + uint64_t leftover_bits = (uint64_t)((double)len_out * ((double)frames_this_second - framerate)); + + // the latest frame is counted for the amount that it contributed to this current second + bits_this_second += len_out - leftover_bits; + + if (bits_this_second > encoder->cfg.max_bitrate) { + fprintf(stderr, "Level warning: This %s's bitrate (%llu bits/s) reached the maximum bitrate (%u bits/s) of %s tier level %g.", + framerate >= 1.0f ? "second" : "frame", + (unsigned long long) bits_this_second, + encoder->cfg.max_bitrate, + encoder->cfg.high_tier ? "high" : "main", + (float)encoder->cfg.level / 10.0f ); + } + + if (framerate > 1.0f) { + // leftovers for the next second + bits_this_second = leftover_bits; + } else { + // one or more next seconds are from this frame and their bitrate is the same or less as this frame's + bits_this_second = 0; + } + frames_this_second = 0; + } else { + bits_this_second += len_out; + } // Compute and print stats.
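Note: the new code in encmain.c accumulates the bits written per second of video and prints a level warning when they exceed the configured level's maximum bitrate; when a frame straddles a second boundary, its bits are split proportionally and the remainder is carried into the next second. A small stand-alone illustration of that split, assuming a hypothetical 29.97 fps stream whose 30th frame is 120000 bits:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const float framerate = 30000.0f / 1001.0f;   /* ~29.97 fps */
        unsigned frames_this_second = 30;             /* 30 >= 29.97: second is full */
        uint64_t len_out = 120000;                    /* bits of the 30th frame */

        /* Same expression as in encmain.c above. */
        uint64_t leftover = (uint64_t)((double)len_out *
                                       ((double)frames_this_second - framerate));
        printf("counted this second: %llu, carried over: %llu\n",
               (unsigned long long)(len_out - leftover),
               (unsigned long long)leftover);         /* 116404 and 3596 */
        return 0;
    }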
View file
kvazaar-1.2.0.tar.gz/src/encode_coding_tree.c -> kvazaar-1.3.0.tar.gz/src/encode_coding_tree.c
Changed
@@ -30,6 +30,7 @@ #include "intra.h" #include "kvazaar.h" #include "kvz_math.h" +#include "strategyselector.h" #include "tables.h" #include "videoframe.h" @@ -46,10 +47,10 @@ * This method encodes the X and Y component within a block of the last * significant coefficient. */ -static void encode_last_significant_xy(cabac_data_t * const cabac, - uint8_t lastpos_x, uint8_t lastpos_y, - uint8_t width, uint8_t height, - uint8_t type, uint8_t scan) +void kvz_encode_last_significant_xy(cabac_data_t * const cabac, + uint8_t lastpos_x, uint8_t lastpos_y, + uint8_t width, uint8_t height, + uint8_t type, uint8_t scan) { const int index = kvz_math_floor_log2(width) - 2; uint8_t ctx_offset = type ? 0 : (index * 3 + (index + 1) / 4); @@ -100,250 +101,6 @@ } } -void kvz_encode_coeff_nxn(encoder_state_t * const state, - cabac_data_t * const cabac, - const coeff_t *coeff, - uint8_t width, - uint8_t type, - int8_t scan_mode, - int8_t tr_skip) -{ - const encoder_control_t * const encoder = state->encoder_control; - int c1 = 1; - uint8_t last_coeff_x = 0; - uint8_t last_coeff_y = 0; - int32_t i; - uint32_t sig_coeffgroup_flag[8 * 8] = { 0 }; - - int8_t be_valid = encoder->cfg.signhide_enable; - int32_t scan_pos_sig; - uint32_t go_rice_param = 0; - uint32_t blk_pos, pos_y, pos_x, sig, ctx_sig; - - // CONSTANTS - const uint32_t num_blk_side = width >> TR_MIN_LOG2_SIZE; - const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2; - const uint32_t *scan = - kvz_g_sig_last_scan[scan_mode][log2_block_size - 1]; - const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 2][scan_mode]; - - // Init base contexts according to block type - cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.cu_sig_coeff_group_model[type]); - cabac_ctx_t *baseCtx = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0]) : - &(cabac->ctx.cu_sig_model_chroma[0]); - - // Scan all coeff groups to find out which of them have coeffs. - // Populate sig_coeffgroup_flag with that info. - - unsigned sig_cg_cnt = 0; - for (int cg_y = 0; cg_y < width / 4; ++cg_y) { - for (int cg_x = 0; cg_x < width / 4; ++cg_x) { - unsigned cg_pos = cg_y * width * 4 + cg_x * 4; - for (int coeff_row = 0; coeff_row < 4; ++coeff_row) { - // Load four 16-bit coeffs and see if any of them are non-zero. - unsigned coeff_pos = cg_pos + coeff_row * width; - uint64_t four_coeffs = *(uint64_t*)(&coeff[coeff_pos]); - if (four_coeffs) { - ++sig_cg_cnt; - unsigned cg_pos_y = (cg_pos >> log2_block_size) >> TR_MIN_LOG2_SIZE; - unsigned cg_pos_x = (cg_pos & (width - 1)) >> TR_MIN_LOG2_SIZE; - sig_coeffgroup_flag[cg_pos_x + cg_pos_y * num_blk_side] = 1; - break; - } - } - } - } - - // Rest of the code assumes at least one non-zero coeff. - assert(sig_cg_cnt > 0); - - // Find the last coeff group by going backwards in scan order. - unsigned scan_cg_last = num_blk_side * num_blk_side - 1; - while (!sig_coeffgroup_flag[scan_cg[scan_cg_last]]) { - --scan_cg_last; - } - - // Find the last coeff by going backwards in scan order. - unsigned scan_pos_last = scan_cg_last * 16 + 15; - while (!coeff[scan[scan_pos_last]]) { - --scan_pos_last; - } - - int pos_last = scan[scan_pos_last]; - - // transform skip flag - if(width == 4 && encoder->cfg.trskip_enable) { - cabac->cur_ctx = (type == 0) ? 
&(cabac->ctx.transform_skip_model_luma) : &(cabac->ctx.transform_skip_model_chroma); - CABAC_BIN(cabac, tr_skip, "transform_skip_flag"); - } - - last_coeff_x = pos_last & (width - 1); - last_coeff_y = (uint8_t)(pos_last >> log2_block_size); - - // Code last_coeff_x and last_coeff_y - encode_last_significant_xy(cabac, - last_coeff_x, - last_coeff_y, - width, - width, - type, - scan_mode); - - scan_pos_sig = scan_pos_last; - - // significant_coeff_flag - for (i = scan_cg_last; i >= 0; i--) { - int32_t sub_pos = i << 4; // LOG2_SCAN_SET_SIZE; - int32_t abs_coeff[16]; - int32_t cg_blk_pos = scan_cg[i]; - int32_t cg_pos_y = cg_blk_pos / num_blk_side; - int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * num_blk_side); - - uint32_t coeff_signs = 0; - int32_t last_nz_pos_in_cg = -1; - int32_t first_nz_pos_in_cg = 16; - int32_t num_non_zero = 0; - go_rice_param = 0; - - if (scan_pos_sig == scan_pos_last) { - abs_coeff[0] = abs(coeff[pos_last]); - coeff_signs = (coeff[pos_last] < 0); - num_non_zero = 1; - last_nz_pos_in_cg = scan_pos_sig; - first_nz_pos_in_cg = scan_pos_sig; - scan_pos_sig--; - } - - if (i == scan_cg_last || i == 0) { - sig_coeffgroup_flag[cg_blk_pos] = 1; - } else { - uint32_t sig_coeff_group = (sig_coeffgroup_flag[cg_blk_pos] != 0); - uint32_t ctx_sig = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, - cg_pos_y, width); - cabac->cur_ctx = &base_coeff_group_ctx[ctx_sig]; - CABAC_BIN(cabac, sig_coeff_group, "coded_sub_block_flag"); - } - - if (sig_coeffgroup_flag[cg_blk_pos]) { - int32_t pattern_sig_ctx = kvz_context_calc_pattern_sig_ctx(sig_coeffgroup_flag, - cg_pos_x, cg_pos_y, width); - - for (; scan_pos_sig >= sub_pos; scan_pos_sig--) { - blk_pos = scan[scan_pos_sig]; - pos_y = blk_pos >> log2_block_size; - pos_x = blk_pos - (pos_y << log2_block_size); - sig = (coeff[blk_pos] != 0) ? 1 : 0; - - if (scan_pos_sig > sub_pos || i == 0 || num_non_zero) { - ctx_sig = kvz_context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y, - log2_block_size, type); - cabac->cur_ctx = &baseCtx[ctx_sig]; - CABAC_BIN(cabac, sig, "sig_coeff_flag"); - } - - if (sig) { - abs_coeff[num_non_zero] = abs(coeff[blk_pos]); - coeff_signs = 2 * coeff_signs + (coeff[blk_pos] < 0); - num_non_zero++; - - if (last_nz_pos_in_cg == -1) { - last_nz_pos_in_cg = scan_pos_sig; - } - - first_nz_pos_in_cg = scan_pos_sig; - } - } - } else { - scan_pos_sig = sub_pos - 1; - } - - if (num_non_zero > 0) { - bool sign_hidden = last_nz_pos_in_cg - first_nz_pos_in_cg >= 4 /* SBH_THRESHOLD */ - && !encoder->cfg.lossless; - uint32_t ctx_set = (i > 0 && type == 0) ? 2 : 0; - cabac_ctx_t *base_ctx_mod; - int32_t num_c1_flag, first_c2_flag_idx, idx, first_coeff2; - - if (c1 == 0) { - ctx_set++; - } - - c1 = 1; - - base_ctx_mod = (type == 0) ? &(cabac->ctx.cu_one_model_luma[4 * ctx_set]) : - &(cabac->ctx.cu_one_model_chroma[4 * ctx_set]); - num_c1_flag = MIN(num_non_zero, C1FLAG_NUMBER); - first_c2_flag_idx = -1; - - for (idx = 0; idx < num_c1_flag; idx++) { - uint32_t symbol = (abs_coeff[idx] > 1) ? 1 : 0; - cabac->cur_ctx = &base_ctx_mod[c1]; - CABAC_BIN(cabac, symbol, "coeff_abs_level_greater1_flag"); - - if (symbol) { - c1 = 0; - - if (first_c2_flag_idx == -1) { - first_c2_flag_idx = idx; - } - } else if ((c1 < 3) && (c1 > 0)) { - c1++; - } - } - - if (c1 == 0) { - base_ctx_mod = (type == 0) ? &(cabac->ctx.cu_abs_model_luma[ctx_set]) : - &(cabac->ctx.cu_abs_model_chroma[ctx_set]); - - if (first_c2_flag_idx != -1) { - uint8_t symbol = (abs_coeff[first_c2_flag_idx] > 2) ? 
1 : 0; - cabac->cur_ctx = &base_ctx_mod[0]; - CABAC_BIN(cabac, symbol, "coeff_abs_level_greater2_flag"); - } - } - if (be_valid && sign_hidden) { - coeff_signs = coeff_signs >> 1; - if (!cabac->only_count) - if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) { - coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero-1); - } - CABAC_BINS_EP(cabac, coeff_signs , (num_non_zero - 1), "coeff_sign_flag"); - } else { - if (!cabac->only_count) - if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) - coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero); - CABAC_BINS_EP(cabac, coeff_signs, num_non_zero, "coeff_sign_flag"); - } - - if (c1 == 0 || num_non_zero > C1FLAG_NUMBER) { - first_coeff2 = 1; - - for (idx = 0; idx < num_non_zero; idx++) { - int32_t base_level = (idx < C1FLAG_NUMBER) ? (2 + first_coeff2) : 1; - - if (abs_coeff[idx] >= base_level) { - if (!cabac->only_count) { - if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFFS) - kvz_cabac_write_coeff_remain_encry(state, cabac, abs_coeff[idx] - base_level, go_rice_param, base_level); - else - kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param); - } else - kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param); - - if (abs_coeff[idx] > 3 * (1 << go_rice_param)) { - go_rice_param = MIN(go_rice_param + 1, 4); - } - } - - if (abs_coeff[idx] >= 2) { - first_coeff2 = 0; - } - } - } - } - } -} - static void encode_transform_unit(encoder_state_t * const state, int x, int y, int depth) { @@ -372,7 +129,7 @@ width, 0, scan_idx, - cur_pu->intra.tr_skip); + cur_pu->tr_skip); } if (depth == MAX_DEPTH + 1) { @@ -435,7 +192,9 @@ const cu_info_t *cur_pu = kvz_cu_array_at_const(frame->cu_array, x, y); // Round coordinates down to a multiple of 8 to get the location of the // containing CU. - const cu_info_t *cur_cu = kvz_cu_array_at_const(frame->cu_array, x & ~7, y & ~7); + const int x_cu = 8 * (x / 8); + const int y_cu = 8 * (y / 8); + const cu_info_t *cur_cu = kvz_cu_array_at_const(frame->cu_array, x_cu, y_cu); // NxN signifies implicit transform split at the first transform level. 
// There is a similar implicit split for inter, but it is only used when @@ -508,9 +267,10 @@ if (cb_flag_y | cb_flag_u | cb_flag_v) { if (state->must_code_qp_delta) { - const int qp_delta = state->qp - state->ref_qp; - const int qp_delta_abs = ABS(qp_delta); - cabac_data_t* cabac = &state->cabac; + const int qp_pred = kvz_get_cu_ref_qp(state, x_cu, y_cu, state->last_qp); + const int qp_delta = cur_cu->qp - qp_pred; + const int qp_delta_abs = ABS(qp_delta); + cabac_data_t* cabac = &state->cabac; // cu_qp_delta_abs prefix cabac->cur_ctx = &cabac->ctx.cu_qp_delta_abs[0]; @@ -526,7 +286,6 @@ } state->must_code_qp_delta = false; - state->ref_qp = state->qp; } encode_transform_unit(state, x, y, depth); @@ -543,7 +302,7 @@ int16_t num_cand = 0; cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model); CABAC_BIN(cabac, cur_cu->merged, "MergeFlag"); - num_cand = MRG_MAX_NUM_CANDS; + num_cand = state->encoder_control->cfg.max_merge; if (cur_cu->merged) { //merge if (num_cand > 1) { int32_t ui; @@ -559,122 +318,76 @@ } } } else { - uint32_t ref_list_idx; - - // Void TEncSbac::codeInterDir( TComDataCU* pcCU, UInt uiAbsPartIdx ) - if (state->frame->slicetype == KVZ_SLICE_B) - { + if (state->frame->slicetype == KVZ_SLICE_B) { // Code Inter Dir uint8_t inter_dir = cur_cu->inter.mv_dir-1; - uint8_t ctx = depth; - - if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 8) - { - cabac->cur_ctx = &(cabac->ctx.inter_dir[ctx]); + if (cur_cu->part_size == SIZE_2Nx2N || (LCU_WIDTH >> depth) != 8) { + cabac->cur_ctx = &(cabac->ctx.inter_dir[depth]); CABAC_BIN(cabac, (inter_dir == 2), "inter_pred_idc"); } - if (inter_dir < 2) - { + if (inter_dir < 2) { cabac->cur_ctx = &(cabac->ctx.inter_dir[4]); CABAC_BIN(cabac, inter_dir, "inter_pred_idc"); } } - for (ref_list_idx = 0; ref_list_idx < 2; ref_list_idx++) { - if (cur_cu->inter.mv_dir & (1 << ref_list_idx)) { + for (uint32_t ref_list_idx = 0; ref_list_idx < 2; ref_list_idx++) { + if (!(cur_cu->inter.mv_dir & (1 << ref_list_idx))) { + continue; + } - // size of the current reference index list (L0/L1) - uint8_t ref_LX_size = state->frame->ref_LX_size[ref_list_idx]; + // size of the current reference index list (L0/L1) + uint8_t ref_LX_size = state->frame->ref_LX_size[ref_list_idx]; - if (ref_LX_size > 1) { - // parseRefFrmIdx - int32_t ref_frame = cur_cu->inter.mv_ref[ref_list_idx]; + if (ref_LX_size > 1) { + // parseRefFrmIdx + int32_t ref_frame = cur_cu->inter.mv_ref[ref_list_idx]; - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]); - CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX"); + cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]); + CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX"); - if (ref_frame > 0) { - int32_t i; - int32_t ref_num = ref_LX_size - 2; + if (ref_frame > 0) { + ref_frame--; - cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]); - ref_frame--; + int32_t ref_num = ref_LX_size - 2; - for (i = 0; i < ref_num; ++i) { - const uint32_t symbol = (i == ref_frame) ? 0 : 1; + for (int32_t i = 0; i < ref_num; ++i) { + const uint32_t symbol = (i == ref_frame) ? 
0 : 1; - if (i == 0) { - CABAC_BIN(cabac, symbol, "ref_idx_lX"); - } else { - CABAC_BIN_EP(cabac, symbol, "ref_idx_lX"); - } - if (symbol == 0) break; + if (i == 0) { + cabac->cur_ctx = &cabac->ctx.cu_ref_pic_model[1]; + CABAC_BIN(cabac, symbol, "ref_idx_lX"); + } else { + CABAC_BIN_EP(cabac, symbol, "ref_idx_lX"); } + if (symbol == 0) break; } } + } - if (!(/*pcCU->getSlice()->getMvdL1ZeroFlag() &&*/ state->frame->ref_list == REF_PIC_LIST_1 && cur_cu->inter.mv_dir == 3)) { - - int16_t mv_cand[2][2]; - kvz_inter_get_mv_cand_cua( - state, - x, y, width, height, - mv_cand, cur_cu, ref_list_idx); - - uint8_t cu_mv_cand = CU_GET_MV_CAND(cur_cu, ref_list_idx); - - const int32_t mvd_hor = cur_cu->inter.mv[ref_list_idx][0] - mv_cand[cu_mv_cand][0]; - const int32_t mvd_ver = cur_cu->inter.mv[ref_list_idx][1] - mv_cand[cu_mv_cand][1]; - const int8_t hor_abs_gr0 = mvd_hor != 0; - const int8_t ver_abs_gr0 = mvd_ver != 0; - const uint32_t mvd_hor_abs = abs(mvd_hor); - const uint32_t mvd_ver_abs = abs(mvd_ver); + if (state->frame->ref_list != REF_PIC_LIST_1 || cur_cu->inter.mv_dir != 3) { + int16_t mv_cand[2][2]; + kvz_inter_get_mv_cand_cua( + state, + x, y, width, height, + mv_cand, cur_cu, ref_list_idx); - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[0]); - CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor"); - CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver"); + uint8_t cu_mv_cand = CU_GET_MV_CAND(cur_cu, ref_list_idx); + const int32_t mvd_hor = cur_cu->inter.mv[ref_list_idx][0] - mv_cand[cu_mv_cand][0]; + const int32_t mvd_ver = cur_cu->inter.mv[ref_list_idx][1] - mv_cand[cu_mv_cand][1]; - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[1]); + kvz_encode_mvd(state, cabac, mvd_hor, mvd_ver); + } - if (hor_abs_gr0) { - CABAC_BIN(cabac, (mvd_hor_abs>1), "abs_mvd_greater1_flag_hor"); - } + // Signal which candidate MV to use + kvz_cabac_write_unary_max_symbol(cabac, + cabac->ctx.mvp_idx_model, + CU_GET_MV_CAND(cur_cu, ref_list_idx), + 1, + AMVP_MAX_NUM_CANDS - 1); - if (ver_abs_gr0) { - CABAC_BIN(cabac, (mvd_ver_abs>1), "abs_mvd_greater1_flag_ver"); - } - - if (hor_abs_gr0) { - if (mvd_hor_abs > 1) { - kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs-2, 1); - } - uint32_t mvd_hor_sign = (mvd_hor>0)?0:1; - if(!state->cabac.only_count) - if (state->encoder_control->cfg.crypto_features & KVZ_CRYPTO_MV_SIGNS) - mvd_hor_sign = mvd_hor_sign^kvz_crypto_get_key(state->crypto_hdl, 1); - CABAC_BIN_EP(cabac, mvd_hor_sign, "mvd_sign_flag_hor"); - } - if (ver_abs_gr0) { - if (mvd_ver_abs > 1) { - kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs-2, 1); - } - uint32_t mvd_ver_sign = (mvd_ver>0)?0:1; - if(!state->cabac.only_count) - if (state->encoder_control->cfg.crypto_features & KVZ_CRYPTO_MV_SIGNS) - mvd_ver_sign = mvd_ver_sign^kvz_crypto_get_key(state->crypto_hdl, 1); - CABAC_BIN_EP(cabac, mvd_ver_sign, "mvd_sign_flag_ver"); - } - } - - // Signal which candidate MV to use - kvz_cabac_write_unary_max_symbol(cabac, - cabac->ctx.mvp_idx_model, - CU_GET_MV_CAND(cur_cu, ref_list_idx), - 1, - AMVP_MAX_NUM_CANDS - 1); - } } // for ref_list } // if !merge } @@ -1003,6 +716,9 @@ const videoframe_t * const frame = state->tile->frame; const cu_info_t *cur_cu = kvz_cu_array_at_const(frame->cu_array, x, y); + const int cu_width = LCU_WIDTH >> depth; + const int half_cu = cu_width >> 1; + const cu_info_t *left_cu = NULL; if (x > 0) { left_cu = kvz_cu_array_at_const(frame->cu_array, x - 1, y); @@ -1019,13 +735,17 @@ uint16_t abs_x = x + state->tile->offset_x; uint16_t abs_y = y + 
state->tile->offset_y; - // Check for slice border FIXME - bool border_x = ctrl->in.width < abs_x + (LCU_WIDTH >> depth); - bool border_y = ctrl->in.height < abs_y + (LCU_WIDTH >> depth); - bool border_split_x = ctrl->in.width >= abs_x + (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1)); - bool border_split_y = ctrl->in.height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + (LCU_WIDTH >> (depth + 1)); + // Check for slice border + bool border_x = ctrl->in.width < abs_x + cu_width; + bool border_y = ctrl->in.height < abs_y + cu_width; + bool border_split_x = ctrl->in.width >= abs_x + (LCU_WIDTH >> MAX_DEPTH) + half_cu; + bool border_split_y = ctrl->in.height >= abs_y + (LCU_WIDTH >> MAX_DEPTH) + half_cu; bool border = border_x || border_y; /*!< are we in any border CU */ + if (depth <= ctrl->max_qp_delta_depth) { + state->must_code_qp_delta = true; + } + // When not in MAX_DEPTH, insert split flag and split the blocks if needed if (depth != MAX_DEPTH) { // Implisit split flag when on border @@ -1045,25 +765,22 @@ if (split_flag || border) { // Split blocks and remember to change x and y block positions - int offset = LCU_WIDTH >> (depth + 1); - kvz_encode_coding_tree(state, x, y, depth + 1); - // TODO: fix when other half of the block would not be completely over the border if (!border_x || border_split_x) { - kvz_encode_coding_tree(state, x + offset, y, depth + 1); + kvz_encode_coding_tree(state, x + half_cu, y, depth + 1); } if (!border_y || border_split_y) { - kvz_encode_coding_tree(state, x, y + offset, depth + 1); + kvz_encode_coding_tree(state, x, y + half_cu, depth + 1); } if (!border || (border_split_x && border_split_y)) { - kvz_encode_coding_tree(state, x + offset, y + offset, depth + 1); + kvz_encode_coding_tree(state, x + half_cu, y + half_cu, depth + 1); } return; } } - if (state->encoder_control->cfg.lossless) { + if (ctrl->cfg.lossless) { cabac->cur_ctx = &cabac->ctx.cu_transquant_bypass; CABAC_BIN(cabac, 1, "cu_transquant_bypass_flag"); } @@ -1084,7 +801,7 @@ CABAC_BIN(cabac, cur_cu->skipped, "SkipFlag"); if (cur_cu->skipped) { - int16_t num_cand = MRG_MAX_NUM_CANDS; + int16_t num_cand = state->encoder_control->cfg.max_merge; if (num_cand > 1) { for (int ui = 0; ui < num_cand - 1; ui++) { int32_t symbol = (ui != cur_cu->merge_idx); @@ -1099,7 +816,7 @@ } } } - return; + goto end; } } @@ -1114,7 +831,6 @@ if (cur_cu->type == CU_INTER) { const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size]; - const int cu_width = LCU_WIDTH >> depth; for (int i = 0; i < num_pu; ++i) { const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i); @@ -1185,4 +901,59 @@ assert(0); exit(1); } + +end: + + if (is_last_cu_in_qg(state, x, y, depth)) { + state->last_qp = cur_cu->qp; + } +} + + +void kvz_encode_mvd(encoder_state_t * const state, + cabac_data_t *cabac, + int32_t mvd_hor, + int32_t mvd_ver) +{ + const int8_t hor_abs_gr0 = mvd_hor != 0; + const int8_t ver_abs_gr0 = mvd_ver != 0; + const uint32_t mvd_hor_abs = abs(mvd_hor); + const uint32_t mvd_ver_abs = abs(mvd_ver); + + cabac->cur_ctx = &cabac->ctx.cu_mvd_model[0]; + CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor"); + CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver"); + + cabac->cur_ctx = &cabac->ctx.cu_mvd_model[1]; + if (hor_abs_gr0) { + CABAC_BIN(cabac, (mvd_hor_abs>1), "abs_mvd_greater1_flag_hor"); + } + if (ver_abs_gr0) { + CABAC_BIN(cabac, (mvd_ver_abs>1), "abs_mvd_greater1_flag_ver"); + } + + if (hor_abs_gr0) { + if (mvd_hor_abs > 1) { + kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_hor_abs - 2, 1); + } 
+ uint32_t mvd_hor_sign = (mvd_hor > 0) ? 0 : 1; + if (!state->cabac.only_count && + state->encoder_control->cfg.crypto_features & KVZ_CRYPTO_MV_SIGNS) + { + mvd_hor_sign = mvd_hor_sign ^ kvz_crypto_get_key(state->crypto_hdl, 1); + } + CABAC_BIN_EP(cabac, mvd_hor_sign, "mvd_sign_flag_hor"); + } + if (ver_abs_gr0) { + if (mvd_ver_abs > 1) { + kvz_cabac_write_ep_ex_golomb(state, cabac, mvd_ver_abs - 2, 1); + } + uint32_t mvd_ver_sign = mvd_ver > 0 ? 0 : 1; + if (!state->cabac.only_count && + state->encoder_control->cfg.crypto_features & KVZ_CRYPTO_MV_SIGNS) + { + mvd_ver_sign = mvd_ver_sign^kvz_crypto_get_key(state->crypto_hdl, 1); + } + CABAC_BIN_EP(cabac, mvd_ver_sign, "mvd_sign_flag_ver"); + } }
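kvz_encode_mvd above writes the greater0 and greater1 flags for both MVD components first and then the remainders and signs. The sketch below shows the binarization of a single component in isolation: abs_mvd_greater0_flag, abs_mvd_greater1_flag, |mvd| - 2 as first-order Exp-Golomb, and a bypass-coded sign bit. CABAC context modelling and the encryption path are left out; this is an illustration, not the kvazaar API.

#include <stdio.h>
#include <stdlib.h>

/* First-order Exp-Golomb, same bin structure as the ep_ex_golomb writer
 * used by kvz_encode_mvd: unary '1' prefix, '0' terminator, suffix bits. */
static void put_eg1(unsigned symbol)
{
  unsigned count = 1;
  while (symbol >= (1u << count)) {
    putchar('1');
    symbol -= 1u << count;
    ++count;
  }
  putchar('0');
  while (count--) putchar(((symbol >> count) & 1) ? '1' : '0');
}

static void print_mvd_bins(int mvd)
{
  unsigned mvd_abs = abs(mvd);
  putchar(mvd_abs > 0 ? '1' : '0');          /* abs_mvd_greater0_flag */
  if (mvd_abs == 0) { putchar('\n'); return; }
  putchar(mvd_abs > 1 ? '1' : '0');          /* abs_mvd_greater1_flag */
  if (mvd_abs > 1) put_eg1(mvd_abs - 2);     /* abs_mvd_minus2 as EG1 */
  putchar(mvd < 0 ? '1' : '0');              /* mvd_sign_flag         */
  putchar('\n');
}

int main(void)
{
  print_mvd_bins(0);    /* "0"                                    */
  print_mvd_bins(1);    /* "100": >0, not >1, positive            */
  print_mvd_bins(-3);   /* "11011": >0, >1, EG1(1) = "01", sign 1 */
  return 0;
}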
View file
kvazaar-1.2.0.tar.gz/src/encode_coding_tree.h -> kvazaar-1.3.0.tar.gz/src/encode_coding_tree.h
Changed
@@ -34,12 +34,14 @@ uint16_t y_ctb, uint8_t depth); -void kvz_encode_coeff_nxn(encoder_state_t * const state, - cabac_data_t * const cabac, - const coeff_t *coeff, - uint8_t width, - uint8_t type, - int8_t scan_mode, - int8_t tr_skip); +void kvz_encode_mvd(encoder_state_t * const state, + cabac_data_t *cabac, + int32_t mvd_hor, + int32_t mvd_ver); + +void kvz_encode_last_significant_xy(cabac_data_t * const cabac, + uint8_t lastpos_x, uint8_t lastpos_y, + uint8_t width, uint8_t height, + uint8_t type, uint8_t scan); #endif // ENCODE_CODING_TREE_H_
View file
kvazaar-1.2.0.tar.gz/src/encoder.c -> kvazaar-1.3.0.tar.gz/src/encoder.c
Changed
@@ -305,7 +305,7 @@ kvz_scalinglist_init(&encoder->scaling_list); // CQM - if (cfg->cqmfile) { + if (cfg->scaling_list == KVZ_SCALING_LIST_CUSTOM && cfg->cqmfile) { FILE* cqmfile = fopen(cfg->cqmfile, "rb"); if (cqmfile) { kvz_scalinglist_parse(&encoder->scaling_list, cqmfile); @@ -314,7 +314,12 @@ fprintf(stderr, "Could not open CQM file.\n"); goto init_failed; } + } else if (cfg->scaling_list == KVZ_SCALING_LIST_DEFAULT) { + // Enable scaling lists if default lists are used + encoder->scaling_list.enable = 1; + encoder->scaling_list.use_default_list = 1; } + kvz_scalinglist_process(&encoder->scaling_list, encoder->bitdepth); kvz_encoder_control_input_init(encoder, encoder->cfg.width, encoder->cfg.height); @@ -347,13 +352,15 @@ } - encoder->lcu_dqp_enabled = cfg->target_bitrate > 0 || encoder->cfg.roi.dqps; + // NOTE: When tr_depth_inter is equal to 0, the transform is still split + // for SMP and AMP partition units. + encoder->tr_depth_inter = 0; - // When tr_depth_inter is equal to 0, inter transform split flag defaults - // to 1 for SMP and AMP partition units. We want to avoid the extra - // transform split so we set tr_depth_inter to 1 when SMP or AMP - // partition modes are enabled. - encoder->tr_depth_inter = (encoder->cfg.smp_enable || encoder->cfg.amp_enable) ? 1 : 0; + if (encoder->cfg.target_bitrate > 0 || encoder->cfg.roi.dqps || encoder->cfg.set_qp_in_cu) { + encoder->max_qp_delta_depth = 0; + } else { + encoder->max_qp_delta_depth = -1; + } //Tiles encoder->tiles_enable = encoder->cfg.tiles_width_count > 1 || @@ -731,6 +738,7 @@ switch (num_layers) { case 0: case 1: + encoder->gop_layer_weights[0] = 1; break; // Use the first layers of the 4-layer weights.
View file
kvazaar-1.2.0.tar.gz/src/encoder.h -> kvazaar-1.3.0.tar.gz/src/encoder.h
Changed
@@ -118,7 +118,7 @@ //! Picture weights when GOP is used. double gop_layer_weights[MAX_GOP_LAYERS]; - bool lcu_dqp_enabled; + int8_t max_qp_delta_depth; int tr_depth_inter;
View file
kvazaar-1.2.0.tar.gz/src/encoder_state-bitstream.c -> kvazaar-1.3.0.tar.gz/src/encoder_state-bitstream.c
Changed
@@ -60,7 +60,7 @@ // PTL // Profile Tier WRITE_U(stream, 0, 2, "general_profile_space"); - WRITE_U(stream, 0, 1, "general_tier_flag"); + WRITE_U(stream, state->encoder_control->cfg.high_tier, 1, "general_tier_flag"); // Main Profile == 1, Main 10 profile == 2 WRITE_U(stream, (state->encoder_control->bitdepth == 8)?1:2, 5, "general_profile_idc"); /* Compatibility flags should be set at general_profile_idc @@ -80,8 +80,8 @@ // end Profile Tier - // Level 6.2 (general_level_idc is 30 * 6.2) - WRITE_U(stream, 186, 8, "general_level_idc"); + uint8_t level = state->encoder_control->cfg.level; + WRITE_U(stream, level * 3, 8, "general_level_idc"); WRITE_U(stream, 0, 1, "sub_layer_profile_present_flag"); WRITE_U(stream, 0, 1, "sub_layer_level_present_flag"); @@ -395,8 +395,11 @@ // scaling list WRITE_U(stream, encoder->scaling_list.enable, 1, "scaling_list_enable_flag"); if (encoder->scaling_list.enable) { - WRITE_U(stream, 1, 1, "sps_scaling_list_data_present_flag"); - encoder_state_write_bitstream_scaling_list(stream, state); + // Signal scaling list data for custom lists + WRITE_U(stream, (encoder->cfg.scaling_list == KVZ_SCALING_LIST_CUSTOM) ? 1 : 0, 1, "sps_scaling_list_data_present_flag"); + if (encoder->cfg.scaling_list == KVZ_SCALING_LIST_CUSTOM) { + encoder_state_write_bitstream_scaling_list(stream, state); + } } WRITE_U(stream, (encoder->cfg.amp_enable ? 1 : 0), 1, "amp_enabled_flag"); @@ -451,16 +454,21 @@ WRITE_UE(stream, 0, "num_ref_idx_l0_default_active_minus1"); WRITE_UE(stream, 0, "num_ref_idx_l1_default_active_minus1"); - WRITE_SE(stream, ((int8_t)encoder->cfg.qp) - 26, "pic_init_qp_minus26"); + + // If tiles and slices = tiles is enabled, signal QP in the slice header. Keeping the PPS constant for OMAF etc + // Keep QP constant here also if it will be only set at CU level. + bool constant_qp_in_pps = ((encoder->cfg.slices & KVZ_SLICES_TILES) && encoder->tiles_enable) || encoder->cfg.set_qp_in_cu; + WRITE_SE(stream, constant_qp_in_pps ? 0 : (((int8_t)encoder->cfg.qp) - 26), "pic_init_qp_minus26"); + WRITE_U(stream, 0, 1, "constrained_intra_pred_flag"); WRITE_U(stream, encoder->cfg.trskip_enable, 1, "transform_skip_enabled_flag"); - if (encoder->lcu_dqp_enabled) { + if (encoder->max_qp_delta_depth >= 0) { // Use separate QP for each LCU when rate control is enabled. WRITE_U(stream, 1, 1, "cu_qp_delta_enabled_flag"); - WRITE_UE(stream, 0, "diff_cu_qp_delta_depth"); + WRITE_UE(stream, encoder->max_qp_delta_depth, "diff_cu_qp_delta_depth"); } else { - WRITE_U(stream, 0, 1, "cu_qp_delta_enabled_flag"); + WRITE_U(stream, 0, 1, "cu_qp_delta_enabled_flag"); } //TODO: add QP offsets @@ -777,12 +785,12 @@ WRITE_U(stream, 1, 1, "slice_sao_chroma_flag"); } } - + if (state->frame->slicetype != KVZ_SLICE_I) { WRITE_U(stream, 1, 1, "num_ref_idx_active_override_flag"); - WRITE_UE(stream, ref_negative != 0 ? ref_negative - 1: 0, "num_ref_idx_l0_active_minus1"); + WRITE_UE(stream, MAX(0, ((int)state->frame->ref_LX_size[0]) - 1), "num_ref_idx_l0_active_minus1"); if (state->frame->slicetype == KVZ_SLICE_B) { - WRITE_UE(stream, ref_positive != 0 ? 
ref_positive - 1 : 0, "num_ref_idx_l1_active_minus1"); + WRITE_UE(stream, MAX(0, ((int)state->frame->ref_LX_size[1]) - 1), "num_ref_idx_l1_active_minus1"); WRITE_U(stream, 0, 1, "mvd_l1_zero_flag"); } @@ -799,12 +807,16 @@ WRITE_UE(stream, 0, "collocated_ref_idx"); } } - - WRITE_UE(stream, 5-MRG_MAX_NUM_CANDS, "five_minus_max_num_merge_cand"); + const uint8_t max_merge_cands = state->encoder_control->cfg.max_merge; + WRITE_UE(stream, 5- max_merge_cands, "five_minus_max_num_merge_cand"); } { - int slice_qp_delta = state->frame->QP - encoder->cfg.qp; + // If tiles are enabled, signal the full QP here (relative to the base value of 26) + // If QP is to be set only at CU level, force slice_qp_delta zero + bool signal_qp_in_slice_header = (encoder->cfg.slices & KVZ_SLICES_TILES) && encoder->tiles_enable; + int slice_qp_delta = state->frame->QP - (signal_qp_in_slice_header ? 26 : encoder->cfg.qp); + if(encoder->cfg.set_qp_in_cu) slice_qp_delta = 0; WRITE_SE(stream, slice_qp_delta, "slice_qp_delta"); } }
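In the hunk above general_level_idc is now written as cfg.level * 3. The removed constant 186 was documented as 30 * 6.2, so cfg.level evidently stores ten times the level number (62 for level 6.2) and multiplying by three yields the spec value of thirty times the level. A quick standalone check of that mapping (not part of kvazaar):

#include <stdio.h>

int main(void)
{
  const double levels[] = { 3.1, 4.0, 5.1, 6.2 };
  for (int i = 0; i < 4; i++) {
    int cfg_level = (int)(levels[i] * 10 + 0.5);  /* e.g. 62 for level 6.2 */
    printf("level %.1f -> cfg.level %d -> general_level_idc %d\n",
           levels[i], cfg_level, cfg_level * 3);
  }
  return 0;
}

This prints 93, 120, 153 and 186 for levels 3.1, 4.0, 5.1 and 6.2 respectively.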
View file
kvazaar-1.2.0.tar.gz/src/encoder_state-ctors_dtors.c -> kvazaar-1.3.0.tar.gz/src/encoder_state-ctors_dtors.c
Changed
@@ -312,6 +312,7 @@ child_state->children = MALLOC(encoder_state_t, 1); child_state->children[0].encoder_control = NULL; child_state->crypto_hdl = NULL; + child_state->must_code_qp_delta = false; child_state->tqj_bitstream_written = NULL; child_state->tqj_recon_done = NULL;
View file
kvazaar-1.2.0.tar.gz/src/encoderstate.c -> kvazaar-1.3.0.tar.gz/src/encoderstate.c
Changed
@@ -37,9 +37,6 @@ #include "tables.h" #include "threadqueue.h" -#define SAO_BUF_WIDTH (LCU_WIDTH + SAO_DELAY_PX + 2) -#define SAO_BUF_WIDTH_C (SAO_BUF_WIDTH / 2) - int kvz_encoder_state_match_children_of_previous_frame(encoder_state_t * const state) { int i; @@ -250,10 +247,18 @@ { videoframe_t *const frame = state->tile->frame; - // Temporary buffers for SAO input pixels. - kvz_pixel sao_buf_y_array[SAO_BUF_WIDTH * SAO_BUF_WIDTH]; - kvz_pixel sao_buf_u_array[SAO_BUF_WIDTH_C * SAO_BUF_WIDTH_C]; - kvz_pixel sao_buf_v_array[SAO_BUF_WIDTH_C * SAO_BUF_WIDTH_C]; + + // Temporary buffers for SAO input pixels. The buffers cover the pixels + // inside the LCU (LCU_WIDTH x LCU_WIDTH), SAO_DELAY_PX wide bands to the + // left and above the LCU, and one pixel border on the left and top + // sides. We add two extra pixels to the buffers because the AVX2 SAO + // reconstruction reads up to two extra bytes when using edge SAO in the + // horizontal direction. +#define SAO_BUF_WIDTH (1 + SAO_DELAY_PX + LCU_WIDTH) +#define SAO_BUF_WIDTH_C (1 + SAO_DELAY_PX/2 + LCU_WIDTH_C) + kvz_pixel sao_buf_y_array[SAO_BUF_WIDTH * SAO_BUF_WIDTH + 2]; + kvz_pixel sao_buf_u_array[SAO_BUF_WIDTH_C * SAO_BUF_WIDTH_C + 2]; + kvz_pixel sao_buf_v_array[SAO_BUF_WIDTH_C * SAO_BUF_WIDTH_C + 2]; // Pointers to the top-left pixel of the LCU in the buffers. kvz_pixel *const sao_buf_y = &sao_buf_y_array[(SAO_DELAY_PX + 1) * (SAO_BUF_WIDTH + 1)]; @@ -526,68 +531,81 @@ /** * \brief Sets the QP for each CU in state->tile->frame->cu_array. * - * The QPs are used in deblocking. + * The QPs are used in deblocking and QP prediction. * - * The delta QP for an LCU is coded when the first CU with coded block flag - * set is encountered. Hence, for the purposes of deblocking, all CUs - * before the first one with cbf set use state->ref_qp and all CUs after - * that use state->qp. + * The QP delta for a quantization group is coded when the first CU with + * coded block flag set is encountered. Hence, for the purposes of + * deblocking and QP prediction, all CUs in before the first one that has + * cbf set use the QP predictor and all CUs after that use (QP predictor + * + QP delta). * * \param state encoder state * \param x x-coordinate of the left edge of the root CU * \param y y-coordinate of the top edge of the root CU * \param depth depth in the CU quadtree - * \param coeffs_coded Used for tracking whether a CU with a residual - * has been encountered. Should be set to false at - * the top level. - * \return Whether there were any CUs with residual or not. + * \param last_qp QP of the last CU in the last quantization group + * \param prev_qp -1 if QP delta has not been coded in current QG, + * otherwise the QP of the current QG */ -static bool set_cu_qps(encoder_state_t *state, int x, int y, int depth, bool coeffs_coded) +static void set_cu_qps(encoder_state_t *state, int x, int y, int depth, int *last_qp, int *prev_qp) { - if (state->qp == state->ref_qp) { - // If the QPs are equal there is no need to care about the residuals. - coeffs_coded = true; - } + + // Stop recursion if the CU is completely outside the frame. 
+ if (x >= state->tile->frame->width || y >= state->tile->frame->height) return; cu_info_t *cu = kvz_cu_array_at(state->tile->frame->cu_array, x, y); const int cu_width = LCU_WIDTH >> depth; - coeffs_coded = coeffs_coded || cbf_is_set_any(cu->cbf, cu->depth); - if (!coeffs_coded && cu->depth > depth) { + if (depth <= state->encoder_control->max_qp_delta_depth) { + *prev_qp = -1; + } + + if (cu->depth > depth) { // Recursively process sub-CUs. const int d = cu_width >> 1; - coeffs_coded = set_cu_qps(state, x, y, depth + 1, coeffs_coded); - coeffs_coded = set_cu_qps(state, x + d, y, depth + 1, coeffs_coded); - coeffs_coded = set_cu_qps(state, x, y + d, depth + 1, coeffs_coded); - coeffs_coded = set_cu_qps(state, x + d, y + d, depth + 1, coeffs_coded); + set_cu_qps(state, x, y, depth + 1, last_qp, prev_qp); + set_cu_qps(state, x + d, y, depth + 1, last_qp, prev_qp); + set_cu_qps(state, x, y + d, depth + 1, last_qp, prev_qp); + set_cu_qps(state, x + d, y + d, depth + 1, last_qp, prev_qp); } else { - if (!coeffs_coded && cu->tr_depth > depth) { + bool cbf_found = *prev_qp >= 0; + + if (cu->tr_depth > depth) { // The CU is split into smaller transform units. Check whether coded // block flag is set for any of the TUs. const int tu_width = LCU_WIDTH >> cu->tr_depth; - for (int y_scu = y; y_scu < y + cu_width; y_scu += tu_width) { - for (int x_scu = x; x_scu < x + cu_width; x_scu += tu_width) { + for (int y_scu = y; !cbf_found && y_scu < y + cu_width; y_scu += tu_width) { + for (int x_scu = x; !cbf_found && x_scu < x + cu_width; x_scu += tu_width) { cu_info_t *tu = kvz_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu); if (cbf_is_set_any(tu->cbf, cu->depth)) { - coeffs_coded = true; + cbf_found = true; } } } + } else if (cbf_is_set_any(cu->cbf, cu->depth)) { + cbf_found = true; + } + + int8_t qp; + if (cbf_found) { + *prev_qp = qp = cu->qp; + } else { + qp = kvz_get_cu_ref_qp(state, x, y, *last_qp); } // Set the correct QP for all state->tile->frame->cu_array elements in // the area covered by the CU. - const int8_t qp = coeffs_coded ? state->qp : state->ref_qp; - for (int y_scu = y; y_scu < y + cu_width; y_scu += SCU_WIDTH) { for (int x_scu = x; x_scu < x + cu_width; x_scu += SCU_WIDTH) { kvz_cu_array_at(state->tile->frame->cu_array, x_scu, y_scu)->qp = qp; } } - } - return coeffs_coded; + if (is_last_cu_in_qg(state, x, y, depth)) { + *last_qp = cu->qp; + } + } } @@ -608,11 +626,13 @@ encoder_state_recdata_to_bufs(state, lcu, state->tile->hor_buf_search, state->tile->ver_buf_search); - if (encoder->cfg.deblock_enable) { - if (encoder->lcu_dqp_enabled) { - set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, false); - } + if (encoder->max_qp_delta_depth >= 0) { + int last_qp = state->last_qp; + int prev_qp = -1; + set_cu_qps(state, lcu->position_px.x, lcu->position_px.y, 0, &last_qp, &prev_qp); + } + if (encoder->cfg.deblock_enable) { kvz_filter_deblock_lcu(state, lcu->position_px.x, lcu->position_px.y); } @@ -635,9 +655,6 @@ encode_sao(state, lcu->position.x, lcu->position.y, &frame->sao_luma[lcu->position.y * frame->width_in_lcu + lcu->position.x], &frame->sao_chroma[lcu->position.y * frame->width_in_lcu + lcu->position.x]); } - // QP delta is not used when rate control is turned off. 
- state->must_code_qp_delta = encoder->lcu_dqp_enabled; - //Encode coding tree kvz_encode_coding_tree(state, lcu->position.x * LCU_WIDTH, lcu->position.y * LCU_WIDTH, 0); @@ -709,7 +726,8 @@ const encoder_control_t *ctrl = state->encoder_control; const kvz_config *cfg = &ctrl->cfg; - state->ref_qp = state->frame->QP; + // Signaled slice QP may be different to frame QP with set-qp-in-cu enabled. + state->last_qp = ctrl->cfg.set_qp_in_cu ? 26 : state->frame->QP; if (cfg->crypto_features) { state->crypto_hdl = kvz_crypto_create(cfg); @@ -784,6 +802,21 @@ dep_lcu = dep_lcu->right; } kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_jobs[dep_lcu->id]); + + // Very spesific bug that happens when owf length is longer than the + // gop length. Takes care of that. + if(!state->encoder_control->cfg.gop_lowdelay && + state->encoder_control->cfg.open_gop && + state->encoder_control->cfg.gop_len != 0 && + state->encoder_control->cfg.owf > state->encoder_control->cfg.gop_len && + ref_state->frame->slicetype == KVZ_SLICE_I && + ref_state->frame->num != 0){ + + while (ref_state->frame->poc != state->frame->poc - state->encoder_control->cfg.gop_len){ + ref_state = ref_state->previous_encoder_state; + } + kvz_threadqueue_job_dep_add(job[0], ref_state->tile->wf_jobs[dep_lcu->id]); + } } // Add local WPP dependancy to the LCU on the left. @@ -945,13 +978,19 @@ } -static void encoder_ref_insertion_sort(const encoder_state_t *const state, uint8_t reflist[16], uint8_t length) { +static void encoder_ref_insertion_sort(const encoder_state_t *const state, + uint8_t reflist[16], + uint8_t length, + bool reverse) +{ for (uint8_t i = 1; i < length; ++i) { const uint8_t cur_idx = reflist[i]; const int32_t cur_poc = state->frame->ref->pocs[cur_idx]; int8_t j = i; - while (j > 0 && cur_poc > state->frame->ref->pocs[reflist[j - 1]]) { + while ((j > 0 && !reverse && cur_poc > state->frame->ref->pocs[reflist[j - 1]]) || + (j > 0 && reverse && cur_poc < state->frame->ref->pocs[reflist[j - 1]])) + { reflist[j] = reflist[j - 1]; --j; } @@ -966,29 +1005,54 @@ */ void kvz_encoder_create_ref_lists(const encoder_state_t *const state) { - // TODO check possibility to add L0 references to L1 list also - + const kvz_config *cfg = &state->encoder_control->cfg; + FILL_ARRAY(state->frame->ref_LX_size, 0, 2); - // List all pocs of lists - int j = 0; - for (j = 0; j < state->frame->ref->used_size; j++) { - if (state->frame->ref->pocs[j] < state->frame->poc) { - state->frame->ref_LX[0][state->frame->ref_LX_size[0]] = j; - state->frame->ref_LX_size[0] += 1; - } else { - state->frame->ref_LX[1][state->frame->ref_LX_size[1]] = j; + int num_negative = 0; + int num_positive = 0; + + // Add positive references to L1 list + for (int i = 0; i < state->frame->ref->used_size; i++) { + if (state->frame->ref->pocs[i] > state->frame->poc) { + state->frame->ref_LX[1][state->frame->ref_LX_size[1]] = i; state->frame->ref_LX_size[1] += 1; + num_positive++; + } + } + + // Add negative references to L1 list when bipred is enabled and GOP is + // either disabled or does not use picture reordering. + bool l1_negative_refs = + (cfg->bipred && (cfg->gop_len == 0 || cfg->gop_lowdelay)); + + // Add negative references to L0 and L1 lists. 
+ for (int i = 0; i < state->frame->ref->used_size; i++) { + if (state->frame->ref->pocs[i] < state->frame->poc) { + state->frame->ref_LX[0][state->frame->ref_LX_size[0]] = i; + state->frame->ref_LX_size[0] += 1; + if (l1_negative_refs) { + state->frame->ref_LX[1][state->frame->ref_LX_size[1]] = i; + state->frame->ref_LX_size[1] += 1; + } + num_negative++; } } - // Fill the rest with -1s. - for (; j < 16; j++) { - state->frame->ref_LX[0][j] = (uint8_t) -1; - state->frame->ref_LX[1][j] = (uint8_t) -1; + // Fill the rest with -1. + for (int i = state->frame->ref_LX_size[0]; i < 16; i++) { + state->frame->ref_LX[0][i] = 0xff; + } + for (int i = state->frame->ref_LX_size[1]; i < 16; i++) { + state->frame->ref_LX[1][i] = 0xff; } - encoder_ref_insertion_sort(state, state->frame->ref_LX[0], state->frame->ref_LX_size[0]); + // Sort reference lists. + encoder_ref_insertion_sort(state, state->frame->ref_LX[0], num_negative, false); + encoder_ref_insertion_sort(state, state->frame->ref_LX[1], num_positive, true); + if (l1_negative_refs) { + encoder_ref_insertion_sort(state, state->frame->ref_LX[1] + num_positive, num_negative, false); + } } /** @@ -1092,7 +1156,7 @@ if (state->is_leaf) { //Leaf states have cabac and context kvz_cabac_start(&state->cabac); - kvz_init_contexts(state, state->frame->QP, state->frame->slicetype); + kvz_init_contexts(state, state->encoder_control->cfg.set_qp_in_cu ? 26 : state->frame->QP, state->frame->slicetype); } //Clear the jobs @@ -1133,14 +1197,38 @@ state->tile->frame->height ); + // Use this flag to handle closed gop irap picture selection. + // If set to true, irap is already set and we avoid + // setting it based on the intra period + bool is_closed_normal_gop = false; + // Set POC. if (state->frame->num == 0) { state->frame->poc = 0; } else if (cfg->gop_len && !cfg->gop_lowdelay) { - // Calculate POC according to the global frame counter and GOP structure - int32_t poc = state->frame->num - 1; - int32_t poc_offset = cfg->gop[state->frame->gop_offset].poc_offset; - state->frame->poc = poc - poc % cfg->gop_len + poc_offset; + + int32_t framenum = state->frame->num - 1; + // Handle closed GOP + // Closed GOP structure has an extra IDR between the GOPs + if (cfg->intra_period > 0 && !cfg->open_gop) { + is_closed_normal_gop = true; + if (framenum % (cfg->intra_period + 1) == cfg->intra_period) { + // Insert IDR before each new GOP after intra period in closed GOP configuration + state->frame->poc = 0; + } else { + // Calculate frame number again and use that for the POC + framenum = framenum % (cfg->intra_period + 1); + int32_t poc_offset = cfg->gop[state->frame->gop_offset].poc_offset; + state->frame->poc = framenum - framenum % cfg->gop_len + poc_offset; + // This should not be an irap picture in closed GOP + state->frame->is_irap = false; + } + } else { // Open GOP + // Calculate POC according to the global frame counter and GOP structure + int32_t poc_offset = cfg->gop[state->frame->gop_offset].poc_offset; + state->frame->poc = framenum - framenum % cfg->gop_len + poc_offset; + } + kvz_videoframe_set_poc(state->tile->frame, state->frame->poc); } else if (cfg->intra_period > 0) { state->frame->poc = state->frame->num % cfg->intra_period; @@ -1149,9 +1237,9 @@ } // Check whether the frame is a keyframe or not. 
- if (state->frame->num == 0) { + if (state->frame->num == 0 || state->frame->poc == 0) { state->frame->is_irap = true; - } else { + } else if(!is_closed_normal_gop) { // In closed-GOP IDR frames are poc==0 so skip this check state->frame->is_irap = cfg->intra_period > 0 && (state->frame->poc % cfg->intra_period) == 0; @@ -1165,7 +1253,8 @@ if (state->frame->num == 0 || cfg->intra_period == 1 || cfg->gop_len == 0 || - cfg->gop_lowdelay) + cfg->gop_lowdelay || + !cfg->open_gop) // Closed GOP uses IDR pictures { state->frame->pictype = KVZ_NAL_IDR_W_RADL; } else { @@ -1331,3 +1420,27 @@ state->encoder_control->in.width_in_lcu; return &state->frame->lcu_stats[index]; } + +int kvz_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp) +{ + const encoder_control_t *ctrl = state->encoder_control; + const cu_array_t *cua = state->tile->frame->cu_array; + // Quantization group width + const int qg_width = LCU_WIDTH >> MIN(ctrl->max_qp_delta_depth, kvz_cu_array_at_const(cua, x, y)->depth); + + // Coordinates of the top-left corner of the quantization group + const int x_qg = x & ~(qg_width - 1); + const int y_qg = y & ~(qg_width - 1); + + int qp_pred_a = last_qp; + if (x_qg % LCU_WIDTH > 0) { + qp_pred_a = kvz_cu_array_at_const(cua, x_qg - 1, y_qg)->qp; + } + + int qp_pred_b = last_qp; + if (y_qg % LCU_WIDTH > 0) { + qp_pred_b = kvz_cu_array_at_const(cua, x_qg, y_qg - 1)->qp; + } + + return ((qp_pred_a + qp_pred_b + 1) >> 1); +}
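kvz_get_cu_ref_qp at the end of the hunk above predicts the QP of a quantization group as the rounded average of its left and above neighbours, with last_qp standing in for a neighbour that lies outside the current CTU. A compact sketch of just that arithmetic (hypothetical helper, not the kvazaar function):

#include <stdio.h>

/* Rounded average of the left and above predictors; last_qp fills in
 * for a missing neighbour, mirroring the hunk above. */
static int predict_qg_qp(int qp_left, int qp_above, int last_qp,
                         int has_left, int has_above)
{
  int a = has_left  ? qp_left  : last_qp;
  int b = has_above ? qp_above : last_qp;
  return (a + b + 1) >> 1;
}

int main(void)
{
  printf("%d\n", predict_qg_qp(24, 29, 26, 1, 1));  /* (24+29+1)>>1 = 27 */
  printf("%d\n", predict_qg_qp(24, 29, 26, 0, 1));  /* (26+29+1)>>1 = 28 */
  return 0;
}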
View file
kvazaar-1.2.0.tar.gz/src/encoderstate.h -> kvazaar-1.3.0.tar.gz/src/encoderstate.h
Changed
@@ -268,10 +268,17 @@ bool must_code_qp_delta; /** - * \brief Reference for computing QP delta for the next LCU that is coded - * next. Updated whenever a QP delta is coded. + * \brief QP value of the last CU in the last coded quantization group. + * + * A quantization group is a square of width + * (LCU_WIDTH >> encoder_control->max_qp_delta_depth). All CUs of in the + * same quantization group share the QP predictor value, but may have + * different QP values. + * + * Set to the frame QP at the beginning of a wavefront row or a tile and + * updated when the last CU of a quantization group is coded. */ - int8_t ref_qp; + int8_t last_qp; /** * \brief Coeffs for the LCU. @@ -297,6 +304,8 @@ lcu_stats_t* kvz_get_lcu_stats(encoder_state_t *state, int lcu_x, int lcu_y); +int kvz_get_cu_ref_qp(const encoder_state_t *state, int x, int y, int last_qp); + /** * Whether the parameter sets should be written with the current frame. */ @@ -309,6 +318,30 @@ (vps_period >= 0 && frame == 0); } + +/** + * \brief Returns true if the CU is the last CU in its containing + * quantization group. + * + * \param state encoder state + * \param x x-coordinate of the left edge of the CU + * \param y y-cooradinate of the top edge of the CU + * \param depth depth in the CU tree + * \return true, if it's the last CU in its QG, otherwise false + */ +static INLINE bool is_last_cu_in_qg(const encoder_state_t *state, int x, int y, int depth) +{ + if (state->encoder_control->max_qp_delta_depth < 0) return false; + + const int cu_width = LCU_WIDTH >> depth; + const int qg_width = LCU_WIDTH >> state->encoder_control->max_qp_delta_depth; + const int right = x + cu_width; + const int bottom = y + cu_width; + return (right % qg_width == 0 || right >= state->tile->frame->width) && + (bottom % qg_width == 0 || bottom >= state->tile->frame->height); +} + + static const uint8_t g_group_idx[32] = { 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8,
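is_last_cu_in_qg above decides when last_qp is updated: a CU closes its quantization group when its right and bottom edges land on the QG grid or on the frame border. A simplified version of that test which ignores the frame-border case, assuming LCU_WIDTH is 64 as in kvazaar:

#include <stdbool.h>
#include <stdio.h>

#define LCU_WIDTH 64

static bool last_in_qg(int x, int y, int cu_width, int max_qp_delta_depth)
{
  /* QG width as in the header above; frame-border handling omitted. */
  const int qg_width = LCU_WIDTH >> max_qp_delta_depth;
  return ((x + cu_width) % qg_width == 0) &&
         ((y + cu_width) % qg_width == 0);
}

int main(void)
{
  /* With max_qp_delta_depth = 1 the QG is 32x32. */
  printf("%d\n", last_in_qg(16, 16, 16, 1));  /* 1: the CU ends at (32, 32) */
  printf("%d\n", last_in_qg(0, 0, 16, 1));    /* 0 */
  return 0;
}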
View file
kvazaar-1.2.0.tar.gz/src/extras/crypto.cpp -> kvazaar-1.3.0.tar.gz/src/extras/crypto.cpp
Changed
@@ -16,10 +16,10 @@ struct crypto_handle_t { cipher_t *cipher; - byte key[CryptoPP::AES::DEFAULT_KEYLENGTH]; - byte iv[CryptoPP::AES::BLOCKSIZE]; - byte out_stream_counter[CryptoPP::AES::BLOCKSIZE]; - byte counter[CryptoPP::AES::BLOCKSIZE]; + unsigned char key[CryptoPP::AES::DEFAULT_KEYLENGTH]; + unsigned char iv[CryptoPP::AES::BLOCKSIZE]; + unsigned char out_stream_counter[CryptoPP::AES::BLOCKSIZE]; + unsigned char counter[CryptoPP::AES::BLOCKSIZE]; int couter_avail; int counter_index; int counter_index_pos;
View file
kvazaar-1.2.0.tar.gz/src/filter.c -> kvazaar-1.3.0.tar.gz/src/filter.c
Changed
@@ -262,7 +262,7 @@ static int8_t get_qp_y_pred(const encoder_state_t* state, int x, int y, edge_dir dir) { - if (!state->encoder_control->lcu_dqp_enabled) { + if (state->encoder_control->max_qp_delta_depth < 0) { return state->qp; } @@ -272,7 +272,8 @@ } else if (dir == EDGE_VER && x > 0) { qp_p = kvz_cu_array_at_const(state->tile->frame->cu_array, x - 1, y)->qp; } else { - qp_p = state->frame->QP; + // TODO: This seems to be dead code. Investigate. + qp_p = state->encoder_control->cfg.set_qp_in_cu ? 26 : state->frame->QP; } const int32_t qp_q =
View file
kvazaar-1.2.0.tar.gz/src/global.h -> kvazaar-1.3.0.tar.gz/src/global.h
Changed
@@ -78,6 +78,12 @@ * Stuff related to multi-threading using pthreads */ + // Pthreads-win32 tries to define timespec even if it has already been defined. + // In Visual Studio 2015 timespec is defined in time.h so we may need to define + // HAVE_STRUCT_TIMESPEC. +#if _MSC_VER >= 1900 && !defined(HAVE_STRUCT_TIMESPEC) +# define HAVE_STRUCT_TIMESPEC +#endif #if defined(_MSC_VER) && defined(_M_AMD64) #define X86_64 @@ -200,7 +206,7 @@ // NOTE: When making a release, check to see if incrementing libversion in // configure.ac is necessary. #ifndef KVZ_VERSION -#define KVZ_VERSION 1.2.0 +#define KVZ_VERSION 1.3.0 #endif #define VERSION_STRING QUOTE_EXPAND(KVZ_VERSION) @@ -233,8 +239,10 @@ #ifdef _MSC_VER // Buggy VS2010 throws intellisense warnings if void* is not casted. #define MALLOC(type, num) (type *)malloc(sizeof(type) * (num)) + #define MALLOC_SIMD_PADDED(type, num, padding) (type *)malloc(sizeof(type) * (num) + (padding)) #else #define MALLOC(type, num) malloc(sizeof(type) * (num)) + #define MALLOC_SIMD_PADDED(type, num, padding) malloc(sizeof(type) * (num) + (padding)) #endif // Use memset through FILL and FILL_ARRAY when appropriate, such as when
View file
kvazaar-1.2.0.tar.gz/src/image.c -> kvazaar-1.3.0.tar.gz/src/image.c
Changed
@@ -47,6 +47,8 @@ assert((width % 2) == 0); assert((height % 2) == 0); + const size_t simd_padding_width = 64; + kvz_picture *im = MALLOC(kvz_picture, 1); if (!im) return NULL; @@ -56,12 +58,13 @@ im->chroma_format = chroma_format; - //Allocate memory - im->fulldata = MALLOC(kvz_pixel, (luma_size + 2 * chroma_size)); - if (!im->fulldata) { + //Allocate memory, pad the full data buffer from both ends + im->fulldata_buf = MALLOC_SIMD_PADDED(kvz_pixel, (luma_size + 2 * chroma_size), simd_padding_width * 2); + if (!im->fulldata_buf) { free(im); return NULL; } + im->fulldata = im->fulldata_buf + simd_padding_width / sizeof(kvz_pixel); im->base_image = im; im->refcount = 1; //We give a reference to caller @@ -110,11 +113,12 @@ // Free our reference to the base image. kvz_image_free(im->base_image); } else { - free(im->fulldata); + free(im->fulldata_buf); } // Make sure freed data won't be used. im->base_image = NULL; + im->fulldata_buf = NULL; im->fulldata = NULL; im->y = im->u = im->v = NULL; im->data[COLOR_Y] = im->data[COLOR_U] = im->data[COLOR_V] = NULL; @@ -128,10 +132,10 @@ */ kvz_picture *kvz_image_copy_ref(kvz_picture *im) { - // The caller should have had another reference. - assert(im->refcount > 0); - KVZ_ATOMIC_INC(&(im->refcount)); - + int32_t new_refcount = KVZ_ATOMIC_INC(&im->refcount); + // The caller should have had another reference and we added one + // reference so refcount should be at least 2. + assert(new_refcount >= 2); return im; } @@ -223,6 +227,15 @@ free(yuv); } +static INLINE uint32_t reg_sad_maybe_optimized(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t width, const int32_t height, const uint32_t stride1, + const uint32_t stride2, optimized_sad_func_ptr_t optimized_sad) +{ + if (optimized_sad != NULL) + return optimized_sad(data1, data2, height, stride1, stride2); + else + return kvz_reg_sad(data1, data2, width, height, stride1, stride2); +} /** * \brief Diagonally interpolate SAD outside the frame. @@ -251,58 +264,6 @@ return sad; } -/** - * \brief Vertically interpolate SAD outside the frame. - * - * \param data1 Starting point of the first picture. - * \param data2 Starting point of the second picture. - * \param width Width of the region for which SAD is calculated. - * \param height Height of the region for which SAD is calculated. - * \param width Width of the pixel array. - * - * \returns Sum of Absolute Differences - */ -static unsigned ver_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data, - int block_width, int block_height, unsigned pic_stride) -{ - int x, y; - unsigned sad = 0; - - for (y = 0; y < block_height; ++y) { - for (x = 0; x < block_width; ++x) { - sad += abs(pic_data[y * pic_stride + x] - ref_data[x]); - } - } - - return sad; -} - -/** - * \brief Horizontally interpolate SAD outside the frame. - * - * \param data1 Starting point of the first picture. - * \param data2 Starting point of the second picture. - * \param width Width of the region for which SAD is calculated. - * \param height Height of the region for which SAD is calculated. - * \param width Width of the pixel array. 
- * - * \returns Sum of Absolute Differences - */ -static unsigned hor_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data, - int block_width, int block_height, unsigned pic_stride, unsigned ref_stride) -{ - int x, y; - unsigned sad = 0; - - for (y = 0; y < block_height; ++y) { - for (x = 0; x < block_width; ++x) { - sad += abs(pic_data[y * pic_stride + x] - ref_data[y * ref_stride]); - } - } - - return sad; -} - /** * \brief Handle special cases of comparing blocks that are not completely @@ -319,7 +280,8 @@ */ static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture *ref, int pic_x, int pic_y, int ref_x, int ref_y, - int block_width, int block_height) + int block_width, int block_height, + optimized_sad_func_ptr_t optimized_sad) { kvz_pixel *pic_data, *ref_data; @@ -356,94 +318,86 @@ // that we compare the right part of the block to the ref_data. // - Reduce block_width and block_height so that the the size of the area // being compared is correct. + // + // NOTE: No more correct since hor_sad was modified to be a separate + // strategy if (top && left) { result += cor_sad(pic_data, &ref_data[top * ref->stride + left], left, top, pic->stride); - result += ver_sad(&pic_data[left], + result += kvz_ver_sad(&pic_data[left], &ref_data[top * ref->stride + left], block_width - left, top, pic->stride); - result += hor_sad(&pic_data[top * pic->stride], - &ref_data[top * ref->stride + left], - left, block_height - top, pic->stride, ref->stride); - result += kvz_reg_sad(&pic_data[top * pic->stride + left], - &ref_data[top * ref->stride + left], - block_width - left, block_height - top, pic->stride, ref->stride); + + result += kvz_hor_sad(pic_data + top * pic->stride, + ref_data + top * ref->stride, + block_width, block_height - top, + pic->stride, ref->stride, + left, right); + } else if (top && right) { - result += ver_sad(pic_data, + result += kvz_ver_sad(pic_data, &ref_data[top * ref->stride], block_width - right, top, pic->stride); result += cor_sad(&pic_data[block_width - right], &ref_data[top * ref->stride + (block_width - right - 1)], right, top, pic->stride); - result += kvz_reg_sad(&pic_data[top * pic->stride], - &ref_data[top * ref->stride], - block_width - right, block_height - top, pic->stride, ref->stride); - result += hor_sad(&pic_data[top * pic->stride + (block_width - right)], - &ref_data[top * ref->stride + (block_width - right - 1)], - right, block_height - top, pic->stride, ref->stride); + + result += kvz_hor_sad(pic_data + top * pic->stride, + ref_data + top * ref->stride, + block_width, block_height - top, + pic->stride, ref->stride, + left, right); + } else if (bottom && left) { - result += hor_sad(pic_data, - &ref_data[left], - left, block_height - bottom, pic->stride, ref->stride); - result += kvz_reg_sad(&pic_data[left], - &ref_data[left], - block_width - left, block_height - bottom, pic->stride, ref->stride); + result += kvz_hor_sad(pic_data, ref_data, block_width, block_height - bottom, + pic->stride, ref->stride, left, right); + result += cor_sad(&pic_data[(block_height - bottom) * pic->stride], &ref_data[(block_height - bottom - 1) * ref->stride + left], left, bottom, pic->stride); - result += ver_sad(&pic_data[(block_height - bottom) * pic->stride + left], + result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride + left], &ref_data[(block_height - bottom - 1) * ref->stride + left], block_width - left, bottom, pic->stride); } else if (bottom && right) { - result += kvz_reg_sad(pic_data, - ref_data, - block_width - right, 
block_height - bottom, pic->stride, ref->stride); - result += hor_sad(&pic_data[block_width - right], - &ref_data[block_width - right - 1], - right, block_height - bottom, pic->stride, ref->stride); - result += ver_sad(&pic_data[(block_height - bottom) * pic->stride], + result += kvz_hor_sad(pic_data, ref_data, block_width, block_height - bottom, + pic->stride, ref->stride, left, right); + + result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride], &ref_data[(block_height - bottom - 1) * ref->stride], block_width - right, bottom, pic->stride); result += cor_sad(&pic_data[(block_height - bottom) * pic->stride + block_width - right], &ref_data[(block_height - bottom - 1) * ref->stride + block_width - right - 1], right, bottom, pic->stride); } else if (top) { - result += ver_sad(pic_data, + result += kvz_ver_sad(pic_data, &ref_data[top * ref->stride], block_width, top, pic->stride); - result += kvz_reg_sad(&pic_data[top * pic->stride], + result += reg_sad_maybe_optimized(&pic_data[top * pic->stride], &ref_data[top * ref->stride], - block_width, block_height - top, pic->stride, ref->stride); + block_width, block_height - top, pic->stride, ref->stride, + optimized_sad); } else if (bottom) { - result += kvz_reg_sad(pic_data, + result += reg_sad_maybe_optimized(pic_data, ref_data, - block_width, block_height - bottom, pic->stride, ref->stride); - result += ver_sad(&pic_data[(block_height - bottom) * pic->stride], + block_width, block_height - bottom, pic->stride, ref->stride, + optimized_sad); + result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride], &ref_data[(block_height - bottom - 1) * ref->stride], block_width, bottom, pic->stride); - } else if (left) { - result += hor_sad(pic_data, - &ref_data[left], - left, block_height, pic->stride, ref->stride); - result += kvz_reg_sad(&pic_data[left], - &ref_data[left], - block_width - left, block_height, pic->stride, ref->stride); - } else if (right) { - result += kvz_reg_sad(pic_data, - ref_data, - block_width - right, block_height, pic->stride, ref->stride); - result += hor_sad(&pic_data[block_width - right], - &ref_data[block_width - right - 1], - right, block_height, pic->stride, ref->stride); + } else if (left | right) { + result += kvz_hor_sad(pic_data, ref_data, + block_width, block_height, pic->stride, + ref->stride, left, right); } else { - result += kvz_reg_sad(pic_data, ref_data, block_width, block_height, pic->stride, ref->stride); + result += reg_sad_maybe_optimized(pic_data, ref_data, + block_width, block_height, + pic->stride, ref->stride, + optimized_sad); } - return result; } - /** * \brief Calculate interpolated SAD between two blocks. * @@ -459,11 +413,14 @@ int ref_x, int ref_y, int block_width, - int block_height) + int block_height, + optimized_sad_func_ptr_t optimized_sad) { assert(pic_x >= 0 && pic_x <= pic->width - block_width); assert(pic_y >= 0 && pic_y <= pic->height - block_height); + uint32_t res; + if (ref_x >= 0 && ref_x <= ref->width - block_width && ref_y >= 0 && ref_y <= ref->height - block_height) { @@ -471,11 +428,19 @@ // SAD directly. This is the most common case, which is why it's first. 
const kvz_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x]; const kvz_pixel *ref_data = &ref->y[ref_y * ref->stride + ref_x]; - return kvz_reg_sad(pic_data, ref_data, block_width, block_height, pic->stride, ref->stride)>>(KVZ_BIT_DEPTH-8); + + res = reg_sad_maybe_optimized(pic_data, + ref_data, + block_width, + block_height, + pic->stride, + ref->stride, + optimized_sad); } else { // Call a routine that knows how to interpolate pixels outside the frame. - return image_interpolated_sad(pic, ref, pic_x, pic_y, ref_x, ref_y, block_width, block_height) >> (KVZ_BIT_DEPTH - 8); + res = image_interpolated_sad(pic, ref, pic_x, pic_y, ref_x, ref_y, block_width, block_height, optimized_sad); } + return res >> (KVZ_BIT_DEPTH - 8); }
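The image allocation change earlier in this hunk over-allocates the pixel buffer by 64 bytes at each end (fulldata_buf) and hands out an offset pointer (fulldata), so SIMD SAD routines that read slightly past a row edge stay inside the allocation; kvz_image_free then releases fulldata_buf rather than fulldata. A minimal sketch of that pattern with hypothetical helper names:

#include <stdint.h>
#include <stdlib.h>

#define SIMD_PAD 64

/* Returns a pointer SIMD_PAD bytes into an over-allocated buffer and
 * reports the base pointer, which is the one that must be freed. */
static uint8_t *alloc_padded(size_t num_pixels, uint8_t **base_out)
{
  uint8_t *base = malloc(num_pixels + 2 * SIMD_PAD);
  if (!base) return NULL;
  *base_out = base;
  return base + SIMD_PAD;
}

int main(void)
{
  uint8_t *base = NULL;
  uint8_t *pixels = alloc_padded(64 * 64, &base);
  if (pixels) free(base);  /* free the base pointer, not the offset one */
  return 0;
}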
View file
kvazaar-1.2.0.tar.gz/src/image.h -> kvazaar-1.3.0.tar.gz/src/image.h
Changed
@@ -29,6 +29,7 @@
 #include "global.h" // IWYU pragma: keep
 #include "kvazaar.h"
+#include "strategies/optimized_sad_func_ptr_t.h"

 typedef struct {
@@ -81,7 +82,8 @@
                            int ref_x,
                            int ref_y,
                            int block_width,
-                           int block_height);
+                           int block_height,
+                           optimized_sad_func_ptr_t optimized_sad);

 unsigned kvz_image_calc_satd(const kvz_picture *pic,
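The only change to the public helper here is the extra optimized_sad argument on kvz_image_calc_sad; the dispatch it enables is sketched after the image.c diff above, so no separate example is needed for the header itself beyond noting that callers now pass either a specialised SAD routine or NULL.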
View file
kvazaar-1.2.0.tar.gz/src/input_frame_buffer.c -> kvazaar-1.3.0.tar.gz/src/input_frame_buffer.c
Changed
@@ -58,6 +58,11 @@ const int gop_buf_size = 3 * cfg->gop_len; + bool is_closed_gop = false; + + // Check for closed gop, we need an extra frame in the buffer in this case + if (!cfg->open_gop && cfg->intra_period > 0 && cfg->gop_len > 0) is_closed_gop = true; + if (cfg->gop_len == 0 || cfg->gop_lowdelay) { // No reordering of output pictures necessary. @@ -94,11 +99,11 @@ buf->pts_buffer[buf_idx] = img_in->pts; buf->num_in++; - if (buf->num_in < cfg->gop_len) { + if (buf->num_in < cfg->gop_len + is_closed_gop ? 1 : 0) { // Not enough frames to start output. return 0; - } else if (buf->num_in == cfg->gop_len) { + } else if (buf->num_in == cfg->gop_len + is_closed_gop ? 1 : 0) { // Now we known the PTSs that are needed to compute the delay. buf->delay = buf->pts_buffer[gop_buf_size - 1] - img_in->pts; } @@ -109,7 +114,7 @@ return NULL; } - if (img_in == NULL && buf->num_in < cfg->gop_len) { + if (img_in == NULL && buf->num_in < cfg->gop_len + is_closed_gop ? 1 : 0) { // End of the sequence but we have less than a single GOP of frames. Use // the difference between the PTSs of the first and the last frame as the // delay. @@ -137,22 +142,35 @@ } else { gop_offset = (buf->num_out - 1) % cfg->gop_len; + + // For closed gop, calculate the gop_offset again + if (!cfg->open_gop && cfg->intra_period > 0) { + // Offset the GOP position for each extra I-frame added to the structure + // in closed gop case + int num_extra_frames = (buf->num_out - 1) / (cfg->intra_period + 1); + gop_offset = (buf->num_out - 1 - num_extra_frames) % cfg->gop_len; + } // Index of the first picture in the GOP that is being output. int gop_start_idx = buf->num_out - 1 - gop_offset; // Skip pictures until we find an available one. gop_offset += buf->gop_skipped; - for (;;) { - assert(gop_offset < cfg->gop_len); - idx_out = gop_start_idx + cfg->gop[gop_offset].poc_offset - 1; - if (idx_out < buf->num_in - 1) { - // An available picture found. - break; + // Every closed-gop IRAP handled here + if (is_closed_gop && (!cfg->open_gop && ((buf->num_out - 1) % (cfg->intra_period + 1)) == cfg->intra_period)) { + idx_out = gop_start_idx; + } else { + for (;;) { + assert(gop_offset < cfg->gop_len + is_closed_gop ? 1 : 0); + idx_out = gop_start_idx + cfg->gop[gop_offset].poc_offset - 1; + if (idx_out < buf->num_in - 1) { + // An available picture found. + break; + } + buf->gop_skipped++; + gop_offset++; } - buf->gop_skipped++; - gop_offset++; } if (buf->num_out < cfg->gop_len - 1) {
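The input_frame_buffer.c change makes a closed-GOP configuration (open_gop disabled with a nonzero intra period) buffer one extra input frame before output can start. A minimal stand-alone illustration of that threshold rule, under the assumption that this is all the new is_closed_gop flag has to express:

#include <stdbool.h>
#include <stdio.h>

/* Simplified view of the fill rule: with a closed GOP, one extra input frame
 * must be buffered before the reordering delay can be computed and output
 * can begin. */
static int frames_needed_before_output(int gop_len, bool open_gop, int intra_period)
{
  bool is_closed_gop = !open_gop && intra_period > 0 && gop_len > 0;
  return gop_len + (is_closed_gop ? 1 : 0);
}

int main(void)
{
  printf("open GOP,   gop_len 8: need %d frames\n", frames_needed_before_output(8, true, 64));
  printf("closed GOP, gop_len 8: need %d frames\n", frames_needed_before_output(8, false, 64));
  return 0;
}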
View file
kvazaar-1.2.0.tar.gz/src/inter.c -> kvazaar-1.3.0.tar.gz/src/inter.c
Changed
@@ -29,6 +29,7 @@ #include "strategies/generic/picture-generic.h" #include "strategies/strategies-ipol.h" #include "videoframe.h" +#include "strategies/strategies-picture.h" typedef struct { @@ -51,8 +52,6 @@ int mv_frac_x = (mv_param[0] & 3); int mv_frac_y = (mv_param[1] & 3); - #define FILTER_SIZE_Y 8 //Luma filter size - // Fractional luma 1/4-pel kvz_extended_block src = {0, 0, 0, 0}; @@ -66,7 +65,7 @@ ref->y, ref->width, ref->height, - FILTER_SIZE_Y, + KVZ_LUMA_FILTER_TAPS, block_width, block_height, &src); @@ -75,7 +74,7 @@ src.stride, block_width, block_height, - lcu->rec.y + (ypos%LCU_WIDTH)*LCU_WIDTH + (xpos%LCU_WIDTH), + lcu->rec.y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH), LCU_WIDTH, mv_frac_x, mv_frac_y, @@ -96,8 +95,6 @@ int mv_frac_x = (mv_param[0] & 3); int mv_frac_y = (mv_param[1] & 3); -#define FILTER_SIZE_Y 8 //Luma filter size - // Fractional luma 1/4-pel kvz_extended_block src = { 0, 0, 0, 0 }; @@ -111,7 +108,7 @@ ref->y, ref->width, ref->height, - FILTER_SIZE_Y, + KVZ_LUMA_FILTER_TAPS, block_width, block_height, &src); @@ -120,7 +117,7 @@ src.stride, block_width, block_height, - hi_prec_out->y + (ypos%LCU_WIDTH)*LCU_WIDTH + (xpos%LCU_WIDTH), + hi_prec_out->y + (ypos % LCU_WIDTH) * LCU_WIDTH + (xpos % LCU_WIDTH), LCU_WIDTH, mv_frac_x, mv_frac_y, @@ -147,8 +144,6 @@ block_width >>= 1; block_height >>= 1; -#define FILTER_SIZE_C 4 //Chroma filter size - // Fractional chroma 1/8-pel kvz_extended_block src_u = { 0, 0, 0, 0 }; kvz_extended_block src_v = { 0, 0, 0, 0 }; @@ -162,7 +157,7 @@ ref->u, ref->width >> 1, ref->height >> 1, - FILTER_SIZE_C, + KVZ_CHROMA_FILTER_TAPS, block_width, block_height, &src_u); @@ -178,12 +173,12 @@ ref->v, ref->width >> 1, ref->height >> 1, - FILTER_SIZE_C, + KVZ_CHROMA_FILTER_TAPS, block_width, block_height, &src_v); kvz_sample_octpel_chroma(state->encoder_control, src_v.orig_topleft, src_v.stride, block_width, - block_height, lcu->rec.v + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param); + block_height, lcu->rec.v + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, mv_param); if (src_u.malloc_used) free(src_u.buffer); if (src_v.malloc_used) free(src_v.buffer); @@ -207,8 +202,6 @@ block_width >>= 1; block_height >>= 1; -#define FILTER_SIZE_C 4 //Chroma filter size - // Fractional chroma 1/8-pel kvz_extended_block src_u = { 0, 0, 0, 0 }; kvz_extended_block src_v = { 0, 0, 0, 0 }; @@ -223,7 +216,7 @@ ref->u, ref->width >> 1, ref->height >> 1, - FILTER_SIZE_C, + KVZ_CHROMA_FILTER_TAPS, block_width, block_height, &src_u); @@ -232,7 +225,7 @@ src_u.stride, block_width, block_height, - hi_prec_out->u + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), + hi_prec_out->u + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, @@ -248,7 +241,7 @@ ref->v, ref->width >> 1, ref->height >> 1, - FILTER_SIZE_C, + KVZ_CHROMA_FILTER_TAPS, block_width, block_height, &src_v); @@ -257,7 +250,7 @@ src_v.stride, block_width, block_height, - hi_prec_out->v + (ypos % LCU_WIDTH_C)*LCU_WIDTH_C + (xpos % LCU_WIDTH_C), + hi_prec_out->v + (ypos % LCU_WIDTH_C) * LCU_WIDTH_C + (xpos % LCU_WIDTH_C), LCU_WIDTH_C, mv_frac_x, mv_frac_y, @@ -306,27 +299,27 @@ /** - * \brief Reconstruct inter block + * \brief Reconstruct an inter PU using uniprediction. 
* * \param state encoder state * \param ref picture to copy the data from - * \param xpos block x position - * \param ypos block y position - * \param width block width - * \param height block height + * \param xpos PU x position + * \param ypos PU y position + * \param width PU width + * \param height PU height * \param mv_param motion vector * \param lcu destination lcu - * \param hi_prec_out destination of high precision output (null if not needed) + * \param hi_prec_out destination of high precision output, or NULL if not needed */ -void kvz_inter_recon_lcu(const encoder_state_t * const state, - const kvz_picture * const ref, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - const int16_t mv_param[2], - lcu_t *lcu, - hi_prec_buf_t *hi_prec_out) +static void inter_recon_unipred(const encoder_state_t * const state, + const kvz_picture * const ref, + int32_t xpos, + int32_t ypos, + int32_t width, + int32_t height, + const int16_t mv_param[2], + lcu_t *lcu, + hi_prec_buf_t *hi_prec_out) { const vector2d_t pu_in_tile = { xpos, ypos }; const vector2d_t pu_in_lcu = { xpos % LCU_WIDTH, ypos % LCU_WIDTH }; @@ -426,36 +419,32 @@ } } } - /** - * \brief Reconstruct bi-pred inter block + * \brief Reconstruct bi-pred inter PU * * \param state encoder state * \param ref1 reference picture to copy the data from * \param ref2 other reference picture to copy the data from - * \param xpos block x position - * \param ypos block y position - * \param width block width - * \param height block height + * \param xpos PU x position + * \param ypos PU y position + * \param width PU width + * \param height PU height * \param mv_param motion vectors * \param lcu destination lcu */ -void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state, - const kvz_picture * ref1, - const kvz_picture * ref2, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - int16_t mv_param[2][2], - lcu_t* lcu) +void kvz_inter_recon_bipred(const encoder_state_t * const state, + const kvz_picture * ref1, + const kvz_picture * ref2, + int32_t xpos, + int32_t ypos, + int32_t width, + int32_t height, + int16_t mv_param[2][2], + lcu_t* lcu) { kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH]; kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C]; kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C]; - int temp_x, temp_y; - int shift = 15 - KVZ_BIT_DEPTH; - int offset = 1 << (shift - 1); const int hi_prec_luma_rec0 = mv_param[0][0] & 3 || mv_param[0][1] & 3; const int hi_prec_luma_rec1 = mv_param[1][0] & 3 || mv_param[1][1] & 3; @@ -467,43 +456,87 @@ hi_prec_buf_t* high_precision_rec1 = 0; if (hi_prec_chroma_rec0) high_precision_rec0 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH); if (hi_prec_chroma_rec1) high_precision_rec1 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH); + + //Reconstruct both predictors - kvz_inter_recon_lcu(state, ref1, xpos, ypos, width, height, mv_param[0], lcu, high_precision_rec0); + inter_recon_unipred(state, ref1, xpos, ypos, width, height, mv_param[0], lcu, high_precision_rec0); if (!hi_prec_luma_rec0){ - memcpy(temp_lcu_y, lcu->rec.y, sizeof(kvz_pixel) * 64 * 64); + memcpy(temp_lcu_y, lcu->rec.y, sizeof(kvz_pixel) * 64 * 64); // copy to temp_lcu_y } if (!hi_prec_chroma_rec0){ - memcpy(temp_lcu_u, lcu->rec.u, sizeof(kvz_pixel) * 32 * 32); - memcpy(temp_lcu_v, lcu->rec.v, sizeof(kvz_pixel) * 32 * 32); + memcpy(temp_lcu_u, lcu->rec.u, sizeof(kvz_pixel) * 32 * 32); // copy to temp_lcu_u + memcpy(temp_lcu_v, lcu->rec.v, sizeof(kvz_pixel) * 32 * 32); // copy to temp_lcu_v } - kvz_inter_recon_lcu(state, 
ref2, xpos, ypos, width, height, mv_param[1], lcu, high_precision_rec1); + inter_recon_unipred(state, ref2, xpos, ypos, width, height, mv_param[1], lcu, high_precision_rec1); // After reconstruction, merge the predictors by taking an average of each pixel - for (temp_y = 0; temp_y < height; ++temp_y) { - int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); - for (temp_x = 0; temp_x < width; ++temp_x) { - int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); - int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); - } + kvz_inter_recon_bipred_blend(hi_prec_luma_rec0, hi_prec_luma_rec1, hi_prec_chroma_rec0, hi_prec_chroma_rec1, height, width, ypos, xpos, high_precision_rec0, high_precision_rec1, lcu, temp_lcu_y, temp_lcu_u, temp_lcu_v); + + if (high_precision_rec0 != 0) kvz_hi_prec_buf_t_free(high_precision_rec0); + if (high_precision_rec1 != 0) kvz_hi_prec_buf_t_free(high_precision_rec1); +} - } - for (temp_y = 0; temp_y < height >> 1; ++temp_y) { - int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); - for (temp_x = 0; temp_x < width >> 1; ++temp_x) { - int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); - int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); - - int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); - lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); + +/** + * Reconstruct a single CU. + * + * The CU may consist of multiple PUs, each of which can use either + * uniprediction or biprediction. 
+ * + * \param state encoder state + * \param lcu containing LCU + * \param x x-coordinate of the CU in pixels + * \param y y-coordinate of the CU in pixels + * \param width CU width + */ +void kvz_inter_recon_cu(const encoder_state_t * const state, + lcu_t *lcu, + int32_t x, + int32_t y, + int32_t width) +{ + cu_info_t *cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + + const int num_pu = kvz_part_mode_num_parts[cu->part_size]; + for (int i = 0; i < num_pu; ++i) { + const int pu_x = PU_GET_X(cu->part_size, width, x, i); + const int pu_y = PU_GET_Y(cu->part_size, width, y, i); + const int pu_w = PU_GET_W(cu->part_size, width, i); + const int pu_h = PU_GET_H(cu->part_size, width, i); + + cu_info_t *pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); + + if (pu->inter.mv_dir == 3) { + const kvz_picture *const refs[2] = { + state->frame->ref->images[ + state->frame->ref_LX[0][ + pu->inter.mv_ref[0]]], + state->frame->ref->images[ + state->frame->ref_LX[1][ + pu->inter.mv_ref[1]]], + }; + kvz_inter_recon_bipred(state, + refs[0], refs[1], + pu_x, pu_y, + pu_w, pu_h, + pu->inter.mv, + lcu); + } else { + const int mv_idx = pu->inter.mv_dir - 1; + const kvz_picture *const ref = + state->frame->ref->images[ + state->frame->ref_LX[mv_idx][ + pu->inter.mv_ref[mv_idx]]]; + + inter_recon_unipred(state, + ref, + pu_x, pu_y, + pu_w, pu_h, + pu->inter.mv[mv_idx], + lcu, + NULL); } } - if (high_precision_rec0 != 0) kvz_hi_prec_buf_t_free(high_precision_rec0); - if (high_precision_rec1 != 0) kvz_hi_prec_buf_t_free(high_precision_rec1); } /** @@ -996,11 +1029,15 @@ // in L0 or L1, the primary list for the colocated PU is the inverse of // collocated_from_l0_flag. Otherwise it is equal to reflist. // - // In Kvazaar, the L1 list is only used for future pictures and the slice - // type is set to KVZ_SLICE_B if and only if L1 is used. Therefore we can - // simply check the slice type here. Kvazaar always sets - // collocated_from_l0_flag so the list is L1 for B-slices. - int col_list = state->frame->slicetype == KVZ_SLICE_P ? reflist : 1; + // Kvazaar always sets collocated_from_l0_flag so the list is L1 when + // there are future references. + int col_list = reflist; + for (int i = 0; i < state->frame->ref->used_size; i++) { + if (state->frame->ref->pocs[i] > state->frame->poc) { + col_list = 1; + break; + } + } if ((colocated->inter.mv_dir & (col_list + 1)) == 0) { // Use the other list if the colocated PU does not have a MV for the @@ -1033,22 +1070,27 @@ if (!cand) return false; assert(cand->inter.mv_dir != 0); - const int cand_list = cand->inter.mv_dir & (1 << reflist) ? reflist : !reflist; - if (scaling) { - mv_cand_out[0] = cand->inter.mv[cand_list][0]; - mv_cand_out[1] = cand->inter.mv[cand_list][1]; - apply_mv_scaling(state, cur_cu, cand, reflist, cand_list, mv_cand_out); - return true; - } + for (int i = 0; i < 2; i++) { + const int cand_list = i == 0 ? 
reflist : !reflist; - if (cand->inter.mv_dir & (1 << cand_list) && - state->frame->ref_LX[cand_list][cand->inter.mv_ref[cand_list]] == - state->frame->ref_LX[reflist][cur_cu->inter.mv_ref[reflist]]) - { - mv_cand_out[0] = cand->inter.mv[cand_list][0]; - mv_cand_out[1] = cand->inter.mv[cand_list][1]; - return true; + if ((cand->inter.mv_dir & (1 << cand_list)) == 0) continue; + + if (scaling) { + mv_cand_out[0] = cand->inter.mv[cand_list][0]; + mv_cand_out[1] = cand->inter.mv[cand_list][1]; + apply_mv_scaling(state, cur_cu, cand, reflist, cand_list, mv_cand_out); + return true; + } + + if (cand->inter.mv_dir & (1 << cand_list) && + state->frame->ref_LX[cand_list][cand->inter.mv_ref[cand_list]] == + state->frame->ref_LX[reflist][cur_cu->inter.mv_ref[reflist]]) + { + mv_cand_out[0] = cand->inter.mv[cand_list][0]; + mv_cand_out[1] = cand->inter.mv[cand_list][1]; + return true; + } } return false; @@ -1238,11 +1280,14 @@ static bool add_merge_candidate(const cu_info_t *cand, const cu_info_t *possible_duplicate1, const cu_info_t *possible_duplicate2, - inter_merge_cand_t *merge_cand_out) + inter_merge_cand_t *merge_cand_out, + uint8_t candidates, + uint8_t max_num_cands) { if (!cand || is_duplicate_candidate(cand, possible_duplicate1) || - is_duplicate_candidate(cand, possible_duplicate2)) { + is_duplicate_candidate(cand, possible_duplicate2) || + candidates >= max_num_cands) { return false; } @@ -1280,7 +1325,7 @@ int8_t zero_idx = 0; merge_candidates_t merge_cand = { {0, 0}, {0, 0, 0}, 0, 0 }; - + const uint8_t max_num_cands = state->encoder_control->cfg.max_merge; get_spatial_merge_candidates(x, y, width, height, state->tile->frame->width, state->tile->frame->height, @@ -1293,16 +1338,16 @@ if (!use_a1) a[1] = NULL; if (!use_b1) b[1] = NULL; - if (add_merge_candidate(a[1], NULL, NULL, &mv_cand[candidates])) candidates++; - if (add_merge_candidate(b[1], a[1], NULL, &mv_cand[candidates])) candidates++; - if (add_merge_candidate(b[0], b[1], NULL, &mv_cand[candidates])) candidates++; - if (add_merge_candidate(a[0], a[1], NULL, &mv_cand[candidates])) candidates++; + if (add_merge_candidate(a[1], NULL, NULL, &mv_cand[candidates], candidates, max_num_cands)) candidates++; + if (add_merge_candidate(b[1], a[1], NULL, &mv_cand[candidates], candidates, max_num_cands)) candidates++; + if (add_merge_candidate(b[0], b[1], NULL, &mv_cand[candidates], candidates, max_num_cands)) candidates++; + if (add_merge_candidate(a[0], a[1], NULL, &mv_cand[candidates], candidates, max_num_cands)) candidates++; if (candidates < 4 && - add_merge_candidate(b[2], a[1], b[1], &mv_cand[candidates])) candidates++; + add_merge_candidate(b[2], a[1], b[1], &mv_cand[candidates], candidates, max_num_cands)) candidates++; bool can_use_tmvp = state->encoder_control->cfg.tmvp_enable && - candidates < MRG_MAX_NUM_CANDS && + candidates < max_num_cands && state->frame->ref->used_size; if (can_use_tmvp) { @@ -1333,12 +1378,12 @@ if (mv_cand[candidates].dir != 0) candidates++; } - if (candidates < MRG_MAX_NUM_CANDS && state->frame->slicetype == KVZ_SLICE_B) { + if (candidates < max_num_cands && state->frame->slicetype == KVZ_SLICE_B) { #define NUM_PRIORITY_LIST 12; static const uint8_t priorityList0[] = { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 }; static const uint8_t priorityList1[] = { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 }; uint8_t cutoff = candidates; - for (int32_t idx = 0; idx<cutoff*(cutoff - 1) && candidates != MRG_MAX_NUM_CANDS; idx++) { + for (int32_t idx = 0; idx<cutoff*(cutoff - 1) && candidates != max_num_cands; idx++) { uint8_t i = 
priorityList0[idx]; uint8_t j = priorityList1[idx]; if (i >= candidates || j >= candidates) break; @@ -1370,7 +1415,7 @@ int num_ref = state->frame->ref->used_size; - if (candidates < MRG_MAX_NUM_CANDS && state->frame->slicetype == KVZ_SLICE_B) { + if (candidates < max_num_cands && state->frame->slicetype == KVZ_SLICE_B) { int j; int ref_negative = 0; int ref_positive = 0; @@ -1385,7 +1430,7 @@ } // Add (0,0) prediction - while (candidates != MRG_MAX_NUM_CANDS) { + while (candidates != max_num_cands) { mv_cand[candidates].mv[0][0] = 0; mv_cand[candidates].mv[0][1] = 0; mv_cand[candidates].ref[0] = (zero_idx >= num_ref - 1) ? 0 : zero_idx;
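The per-pixel averaging loop removed from the bi-prediction function above now sits behind kvz_inter_recon_bipred_blend, but the arithmetic is unchanged: both predictions are brought to 14-bit precision and averaged with a rounding offset before clipping. A small self-contained version of that blend for a single sample, assuming 8-bit content:

#include <stdint.h>
#include <stdio.h>

#define BIT_DEPTH 8 /* assumption: 8-bit pixels, matching KVZ_BIT_DEPTH = 8 */

static uint8_t clip_pixel(int32_t v)
{
  if (v < 0) return 0;
  if (v > 255) return 255;
  return (uint8_t)v;
}

/* Mirrors the removed loop: (sample0 + sample1 + offset) >> shift, where both
 * samples are already at 14-bit precision. */
static uint8_t bipred_blend_sample(int16_t sample0_14bit, int16_t sample1_14bit)
{
  const int shift = 15 - BIT_DEPTH;    /* 7 for 8-bit content */
  const int offset = 1 << (shift - 1); /* rounding term */
  return clip_pixel((sample0_14bit + sample1_14bit + offset) >> shift);
}

int main(void)
{
  /* A plain 8-bit sample is promoted by (14 - BIT_DEPTH) bits before blending. */
  int16_t a = 100 << (14 - BIT_DEPTH);
  int16_t b = 103 << (14 - BIT_DEPTH);
  printf("blended: %d\n", bipred_blend_sample(a, b)); /* 102, the rounded average */
  return 0;
}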
View file
kvazaar-1.2.0.tar.gz/src/inter.h -> kvazaar-1.3.0.tar.gz/src/inter.h
Changed
@@ -40,26 +40,22 @@ } inter_merge_cand_t; +void kvz_inter_recon_cu(const encoder_state_t * const state, + lcu_t *lcu, + int32_t x, + int32_t y, + int32_t width); -void kvz_inter_recon_lcu(const encoder_state_t * const state, - const kvz_picture * ref, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - const int16_t mv_param[2], - lcu_t* lcu, - hi_prec_buf_t *hi_prec_out); +void kvz_inter_recon_bipred(const encoder_state_t * const state, + const kvz_picture * ref1, + const kvz_picture * ref2, + int32_t xpos, + int32_t ypos, + int32_t width, + int32_t height, + int16_t mv_param[2][2], + lcu_t* lcu); -void kvz_inter_recon_lcu_bipred(const encoder_state_t * const state, - const kvz_picture * ref1, - const kvz_picture * ref2, - int32_t xpos, - int32_t ypos, - int32_t width, - int32_t height, - int16_t mv_param[2][2], - lcu_t* lcu); void kvz_inter_get_mv_cand(const encoder_state_t * const state, int32_t x,
View file
kvazaar-1.2.0.tar.gz/src/kvazaar.c -> kvazaar-1.3.0.tar.gz/src/kvazaar.c
Changed
@@ -142,8 +142,8 @@
   info->nal_unit_type = state->frame->pictype;
   info->slice_type = state->frame->slicetype;
-  memset(info->ref_list[0], 0, 16);
-  memset(info->ref_list[1], 0, 16);
+  memset(info->ref_list[0], 0, 16 * sizeof(int));
+  memset(info->ref_list[1], 0, 16 * sizeof(int));
   for (size_t i = 0; i < state->frame->ref_LX_size[0]; i++) {
     info->ref_list[0][i] = state->frame->ref->pocs[state->frame->ref_LX[0][i]];
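The kvazaar.c fix above corrects a classic memset pitfall: the third argument is a byte count, so clearing sixteen int entries takes 16 * sizeof(int) bytes, not 16. A tiny demonstration:

#include <stdio.h>
#include <string.h>

int main(void)
{
  int ref_list[16];

  /* memset counts bytes, not elements: passing 16 would zero only the first
   * four ints on a platform with 4-byte int. */
  memset(ref_list, 0, 16 * sizeof(int));

  printf("last entry: %d\n", ref_list[15]); /* 0, the whole array is cleared */
  return 0;
}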
View file
kvazaar-1.2.0.tar.gz/src/kvazaar.h -> kvazaar-1.3.0.tar.gz/src/kvazaar.h
Changed
@@ -92,6 +92,7 @@ KVZ_IME_FULL16 = 4, //! \since 3.6.0 KVZ_IME_FULL32 = 5, //! \since 3.6.0 KVZ_IME_FULL64 = 6, //! \since 3.6.0 + KVZ_IME_DIA = 7, // Experimental. TODO: change into a proper doc comment }; /** @@ -206,6 +207,12 @@ KVZ_SAO_FULL = 3 }; +enum kvz_scalinglist { + KVZ_SCALING_LIST_OFF = 0, + KVZ_SCALING_LIST_CUSTOM = 1, + KVZ_SCALING_LIST_DEFAULT = 2, +}; + // Map from input format to chroma format. #define KVZ_FORMAT2CSP(format) ((enum kvz_chroma_format)"\0\1\2\3"[format]) @@ -322,6 +329,7 @@ uint8_t *optional_key; enum kvz_me_early_termination me_early_termination; /*!< \since 3.8.0 \brief Mode of me early termination. */ + int32_t intra_rdo_et; /*!< \since 4.1.0 \brief Use early termination in intra rdo. */ int32_t lossless; /*!< \brief Use lossless coding. */ @@ -351,6 +359,37 @@ * \brief Use adaptive QP for 360 video with equirectangular projection. */ int32_t erp_aqp; + + /** \brief The HEVC level */ + uint8_t level; + /** \brief Whether we ignore and just warn from all of the errors about the output not conforming to the level's requirements. */ + uint8_t force_level; + /** \brief Whether we use the high tier bitrates. Requires the level to be 4 or higher. */ + uint8_t high_tier; + /** \brief The maximum allowed bitrate for this level and tier. */ + uint32_t max_bitrate; + + /** \brief Maximum steps that hexagonal and diagonal motion estimation can use. -1 to disable */ + uint32_t me_max_steps; + + /** \brief Minimum QP that uses CABAC for residual cost instead of a fast estimate. */ + int8_t fast_residual_cost_limit; + + /** \brief Set QP at CU level keeping pic_init_qp_minus26 in PPS zero */ + int8_t set_qp_in_cu; + + /** \brief Flag to enable/disable open GOP configuration */ + int8_t open_gop; + + /** \brief Type of scaling lists to use */ + int8_t scaling_list; + + /** \brief Maximum number of merge cadidates */ + uint8_t max_merge; + + /** \brief Enable Early Skip Mode Decision */ + uint8_t early_skip; + } kvz_config; /** @@ -359,7 +398,8 @@ * Function picture_alloc in kvz_api must be used for allocation. */ typedef struct kvz_picture { - kvz_pixel *fulldata; //!< \brief Allocated buffer (only used in the base_image) + kvz_pixel *fulldata_buf; //!< \brief Allocated buffer with padding (only used in the base_image) + kvz_pixel *fulldata; //!< \brief Allocated buffer portion that's actually used kvz_pixel *y; //!< \brief Pointer to luma pixel array. kvz_pixel *u; //!< \brief Pointer to chroma U pixel array.
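Among the kvazaar.h additions, the split of fulldata into fulldata_buf (the raw allocation) and fulldata (the portion actually used) follows a common padded-allocation pattern: keep the pointer returned by malloc for freeing, and a separate adjusted pointer for use. The sketch below illustrates that pattern only; the alignment motive, the 64-byte figure and all names are assumptions for illustration, not Kvazaar's allocator.

#include <stdint.h>
#include <stdlib.h>

typedef struct {
  uint8_t *fulldata_buf; /* what malloc returned; this is what gets freed */
  uint8_t *fulldata;     /* adjusted view actually used for pixel data */
} padded_buf_t;

/* alignment is assumed to be a power of two. */
static int padded_buf_alloc(padded_buf_t *buf, size_t size, size_t alignment)
{
  buf->fulldata_buf = malloc(size + alignment);
  if (!buf->fulldata_buf) return 0;

  uintptr_t addr = (uintptr_t)buf->fulldata_buf;
  uintptr_t aligned = (addr + alignment - 1) & ~(uintptr_t)(alignment - 1);
  buf->fulldata = (uint8_t *)aligned;
  return 1;
}

static void padded_buf_free(padded_buf_t *buf)
{
  free(buf->fulldata_buf); /* never free(fulldata): it may not be the malloc'd address */
  buf->fulldata_buf = NULL;
  buf->fulldata = NULL;
}

int main(void)
{
  padded_buf_t buf;
  if (padded_buf_alloc(&buf, 1920 * 1080, 64)) {
    /* ... use buf.fulldata for pixel data ... */
    padded_buf_free(&buf);
  }
  return 0;
}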
View file
kvazaar-1.2.0.tar.gz/src/rate_control.c -> kvazaar-1.3.0.tar.gz/src/rate_control.c
Changed
@@ -79,8 +79,8 @@ int pictures_coded = MAX(0, state->frame->num - encoder->cfg.owf); int gop_offset = (state->frame->gop_offset - encoder->cfg.owf) % MAX(1, encoder->cfg.gop_len); - // Only take fully coded GOPs into account. - if (encoder->cfg.gop_len > 0 && gop_offset != encoder->cfg.gop_len - 1) { + + if (encoder->cfg.gop_len > 0 && gop_offset != encoder->cfg.gop_len - 1 && encoder->cfg.gop_lp_definition.d == 0) { // Subtract number of bits in the partially coded GOP. bits_coded -= state->frame->cur_gop_bits_coded; // Subtract number of pictures in the partially coded GOP. @@ -293,7 +293,7 @@ int dqp = ctrl->cfg.roi.dqps[roi_index]; state->qp = CLIP_TO_QP(state->frame->QP + dqp); state->lambda = qp_to_lamba(state, state->qp); - state->lambda_sqrt = sqrt(state->frame->lambda); + state->lambda_sqrt = sqrt(state->lambda); } else if (ctrl->cfg.target_bitrate > 0) { lcu_stats_t *lcu = kvz_get_lcu_stats(state, pos.x, pos.y);
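The rate_control.c fix makes lambda_sqrt be derived from the lambda just computed for the ROI-adjusted QP instead of the frame-level value. For context, a hedged sketch of the usual HM-style QP-to-lambda mapping and the two forms in which such encoders keep it; the 0.57 weight is the common HM default, not necessarily Kvazaar's exact tuning.

#include <math.h>
#include <stdio.h>

/* HM-style exponential QP-to-lambda mapping. Real encoders scale the weight
 * per slice type and GOP position; 0.57 is only the common default. */
static double qp_to_lambda(int qp)
{
  return 0.57 * pow(2.0, (qp - 12) / 3.0);
}

int main(void)
{
  int qp = 32;
  double lambda = qp_to_lambda(qp);
  /* SAD-based costs are weighted by sqrt(lambda); SSE-based costs by lambda. */
  double lambda_sqrt = sqrt(lambda);
  printf("QP %d: lambda = %.3f, sqrt(lambda) = %.3f\n", qp, lambda, lambda_sqrt);
  return 0;
}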
View file
kvazaar-1.2.0.tar.gz/src/rdo.c -> kvazaar-1.3.0.tar.gz/src/rdo.c
Changed
@@ -30,6 +30,7 @@ #include "imagelist.h" #include "inter.h" #include "scalinglist.h" +#include "strategyselector.h" #include "tables.h" #include "transform.h" @@ -41,8 +42,6 @@ #define LOG2_SCAN_SET_SIZE 4 #define SBH_THRESHOLD 4 -static const double COEFF_SUM_MULTIPLIER = 1.9; - const uint32_t kvz_g_go_rice_range[5] = { 7, 14, 26, 46, 78 }; const uint32_t kvz_g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 }; @@ -195,7 +194,6 @@ return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3); } - /** * \brief Estimate bitcost for coding coefficients. * @@ -211,15 +209,17 @@ int32_t type, int8_t scan_mode) { - if (state->encoder_control->cfg.rdo > 0) { + if (state->qp >= state->encoder_control->cfg.fast_residual_cost_limit) { return get_coeff_cabac_cost(state, coeff, width, type, scan_mode); } else { - return COEFF_SUM_MULTIPLIER * kvz_coeff_abs_sum(coeff, width * width) + 0.5; + // Estimate coeff coding cost based on QP and sum of absolute coeffs. + // const uint32_t sum = kvz_coeff_abs_sum(coeff, width * width); + // return (uint32_t)(sum * (state->qp * COEFF_COST_QP_FACTOR + COEFF_COST_BIAS) + 0.5); + return kvz_fast_coeff_cost(coeff, width, state->qp); } } - #define COEF_REMAIN_BIN_REDUCTION 3 /** Calculates the cost for specific absolute transform level * \param abs_level scaled quantized level @@ -879,52 +879,23 @@ } } -/** MVD cost calculation with CABAC -* \returns int -* Calculates cost of actual motion vectors using CABAC coding -*/ +/** + * Calculate cost of actual motion vectors using CABAC coding + */ uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state, - vector2d_t *mvd, - const cabac_data_t* real_cabac) + const cabac_data_t* cabac, + const int32_t mvd_hor, + const int32_t mvd_ver) { - uint32_t bitcost = 0; - const int32_t mvd_hor = mvd->x; - const int32_t mvd_ver = mvd->y; - const int8_t hor_abs_gr0 = mvd_hor != 0; - const int8_t ver_abs_gr0 = mvd_ver != 0; - const uint32_t mvd_hor_abs = abs(mvd_hor); - const uint32_t mvd_ver_abs = abs(mvd_ver); + cabac_data_t cabac_copy = *cabac; + cabac_copy.only_count = 1; - cabac_data_t cabac_copy; - memcpy(&cabac_copy, real_cabac, sizeof(cabac_data_t)); - cabac_data_t *cabac = &cabac_copy; - cabac->only_count = 1; - - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[0]); - CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor"); - CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver"); - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[1]); - if (hor_abs_gr0) { - CABAC_BIN(cabac, (mvd_hor_abs > 1), "abs_mvd_greater1_flag_hor"); - } - if (ver_abs_gr0) { - CABAC_BIN(cabac, (mvd_ver_abs > 1), "abs_mvd_greater1_flag_ver"); - } - if (hor_abs_gr0) { - if (mvd_hor_abs > 1) { - // It is safe to drop const here because cabac->only_count is set. - kvz_cabac_write_ep_ex_golomb((encoder_state_t*)state, cabac, mvd_hor_abs - 2, 1); - } - CABAC_BIN_EP(cabac, (mvd_hor > 0) ? 0 : 1, "mvd_sign_flag_hor"); - } - if (ver_abs_gr0) { - if (mvd_ver_abs > 1) { - // It is safe to drop const here because cabac->only_count is set. - kvz_cabac_write_ep_ex_golomb((encoder_state_t*)state, cabac, mvd_ver_abs - 2, 1); - } - CABAC_BIN_EP(cabac, (mvd_ver > 0) ? 0 : 1, "mvd_sign_flag_ver"); - } - bitcost = ((23 - cabac->bits_left) + (cabac->num_buffered_bytes << 3)) - ((23 - real_cabac->bits_left) + (real_cabac->num_buffered_bytes << 3)); + // It is safe to drop const here because cabac->only_count is set. 
+ kvz_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver); + + uint32_t bitcost = + ((23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3)) - + ((23 - cabac->bits_left) + (cabac->num_buffered_bytes << 3)); return bitcost; } @@ -946,8 +917,7 @@ cabac_data_t state_cabac_copy; cabac_data_t* cabac; uint32_t merge_idx; - int cand1_cost, cand2_cost; - vector2d_t mvd_temp1, mvd_temp2, mvd = { 0, 0 }; + vector2d_t mvd = { 0, 0 }; int8_t merged = 0; int8_t cur_mv_cand = 0; @@ -979,27 +949,30 @@ cabac = &state_cabac_copy; if (!merged) { - mvd_temp1.x = x - mv_cand[0][0]; - mvd_temp1.y = y - mv_cand[0][1]; - cand1_cost = kvz_get_mvd_coding_cost_cabac(state, &mvd_temp1, cabac); - - mvd_temp2.x = x - mv_cand[1][0]; - mvd_temp2.y = y - mv_cand[1][1]; - cand2_cost = kvz_get_mvd_coding_cost_cabac(state, &mvd_temp2, cabac); + vector2d_t mvd1 = { + x - mv_cand[0][0], + y - mv_cand[0][1], + }; + vector2d_t mvd2 = { + x - mv_cand[1][0], + y - mv_cand[1][1], + }; + uint32_t cand1_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y); + uint32_t cand2_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y); // Select candidate 1 if it has lower cost if (cand2_cost < cand1_cost) { cur_mv_cand = 1; - mvd = mvd_temp2; + mvd = mvd2; } else { - mvd = mvd_temp1; + mvd = mvd1; } } cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model); CABAC_BIN(cabac, merged, "MergeFlag"); - num_cand = MRG_MAX_NUM_CANDS; + num_cand = state->encoder_control->cfg.max_merge; if (merged) { if (num_cand > 1) { int32_t ui; @@ -1058,51 +1031,18 @@ // ToDo: Bidir vector support if (!(state->frame->ref_list == REF_PIC_LIST_1 && /*cur_cu->inter.mv_dir == 3*/ 0)) { - const int32_t mvd_hor = mvd.x; - const int32_t mvd_ver = mvd.y; - const int8_t hor_abs_gr0 = mvd_hor != 0; - const int8_t ver_abs_gr0 = mvd_ver != 0; - const uint32_t mvd_hor_abs = abs(mvd_hor); - const uint32_t mvd_ver_abs = abs(mvd_ver); - - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[0]); - CABAC_BIN(cabac, (mvd_hor != 0), "abs_mvd_greater0_flag_hor"); - CABAC_BIN(cabac, (mvd_ver != 0), "abs_mvd_greater0_flag_ver"); - - cabac->cur_ctx = &(cabac->ctx.cu_mvd_model[1]); - - if (hor_abs_gr0) { - CABAC_BIN(cabac, (mvd_hor_abs > 1), "abs_mvd_greater1_flag_hor"); - } - - if (ver_abs_gr0) { - CABAC_BIN(cabac, (mvd_ver_abs > 1), "abs_mvd_greater1_flag_ver"); - } - - if (hor_abs_gr0) { - if (mvd_hor_abs > 1) { - // It is safe to drop const because cabac->only_count is set. - kvz_cabac_write_ep_ex_golomb((encoder_state_t*)state, cabac, mvd_hor_abs - 2, 1); - } - - CABAC_BIN_EP(cabac, (mvd_hor > 0) ? 0 : 1, "mvd_sign_flag_hor"); - } - - if (ver_abs_gr0) { - if (mvd_ver_abs > 1) { - // It is safe to drop const because cabac->only_count is set. - kvz_cabac_write_ep_ex_golomb((encoder_state_t*)state, cabac, mvd_ver_abs - 2, 1); - } - - CABAC_BIN_EP(cabac, (mvd_ver > 0) ? 0 : 1, "mvd_sign_flag_ver"); - } + // It is safe to drop const here because cabac->only_count is set. + kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y); } // Signal which candidate MV to use - kvz_cabac_write_unary_max_symbol(cabac, cabac->ctx.mvp_idx_model, cur_mv_cand, 1, - AMVP_MAX_NUM_CANDS - 1); + kvz_cabac_write_unary_max_symbol( + cabac, + cabac->ctx.mvp_idx_model, + cur_mv_cand, + 1, + AMVP_MAX_NUM_CANDS - 1); } - } }
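kvz_get_mvd_coding_cost_cabac above now measures cost by encoding the MVD into a copy of the CABAC state with only_count set and differencing the bit counters. The toy program below reproduces that counting pattern with simplified stand-in types; the field names and the one-bit-per-bin encoder are assumptions, not the real cabac_data_t.

#include <stdint.h>
#include <stdio.h>

typedef struct {
  int bits_left;          /* bits still free in the low-level register */
  int num_buffered_bytes; /* whole bytes already produced */
  int only_count;         /* in the real coder this suppresses output;
                             the toy coder never emits, it is kept only to
                             mirror the pattern */
} toy_cabac_t;

static int bits_written(const toy_cabac_t *c)
{
  return (23 - c->bits_left) + (c->num_buffered_bytes << 3);
}

/* Stand-in for the syntax-element encoder (kvz_encode_mvd in rdo.c):
 * pretend each bin costs exactly one bit. */
static void encode_something(toy_cabac_t *c, int nbins)
{
  for (int i = 0; i < nbins; i++) {
    if (--c->bits_left < 0) {
      c->num_buffered_bytes += 1;
      c->bits_left += 8;
    }
  }
}

static uint32_t coding_cost_in_bits(const toy_cabac_t *real, int value)
{
  toy_cabac_t copy = *real; /* work on a copy so the real state is untouched */
  copy.only_count = 1;      /* count bits instead of emitting them */
  encode_something(&copy, value);
  return (uint32_t)(bits_written(&copy) - bits_written(real));
}

int main(void)
{
  toy_cabac_t cabac = { .bits_left = 23, .num_buffered_bytes = 0, .only_count = 0 };
  printf("cost of a 30-bin element: %u bits\n", coding_cost_in_bits(&cabac, 30));
  return 0;
}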
View file
kvazaar-1.2.0.tar.gz/src/rdo.h -> kvazaar-1.3.0.tar.gz/src/rdo.h
Changed
@@ -39,7 +39,7 @@
 void kvz_rdoq(encoder_state_t *state, coeff_t *coef, coeff_t *dest_coeff, int32_t width, int32_t height, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth);

-uint32_t kvz_get_coeff_cost(const encoder_state_t *state,
+uint32_t kvz_get_coeff_cost(const encoder_state_t * const state,
                             const coeff_t *coeff,
                             int32_t width,
                             int32_t type,
@@ -57,8 +57,9 @@
 kvz_mvd_cost_func kvz_calc_mvd_cost_cabac;

 uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state,
-                                       vector2d_t *mvd,
-                                       const cabac_data_t* cabac);
+                                       const cabac_data_t* cabac,
+                                       int32_t mvd_hor,
+                                       int32_t mvd_ver);

 // Number of fixed point fractional bits used in the fractional bit table.
 #define CTX_FRAC_BITS 15
View file
kvazaar-1.2.0.tar.gz/src/scalinglist.c -> kvazaar-1.3.0.tar.gz/src/scalinglist.c
Changed
@@ -102,6 +102,7 @@ } scaling_list->enable = 0; + scaling_list->use_default_list = 0; } /** @@ -397,9 +398,9 @@ for (size = 0; size < SCALING_LIST_SIZE_NUM; size++) { for (list = 0; list < kvz_g_scaling_list_num[size]; list++) { - const int32_t * const list_ptr = scaling_list->enable ? - scaling_list->scaling_list_coeff[size][list] : - kvz_scalinglist_get_default(size, list); + const int32_t * const list_ptr = scaling_list->use_default_list ? + kvz_scalinglist_get_default(size, list) : + scaling_list->scaling_list_coeff[size][list]; for (qp = 0; qp < SCALING_LIST_REM_NUM; qp++) { kvz_scalinglist_set(scaling_list, list_ptr, list, size, qp);
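scalinglist.c now picks between the default HEVC lists and user-supplied coefficients through the new use_default_list flag, which pairs with the KVZ_SCALING_LIST_* enum added to kvazaar.h. The mapping sketched below is inferred from these diffs rather than copied from Kvazaar's option parser.

#include <stdio.h>

enum scalinglist_mode { SCALING_LIST_OFF = 0, SCALING_LIST_CUSTOM = 1, SCALING_LIST_DEFAULT = 2 };

struct scaling_flags { int enable; int use_default_list; };

/* Inferred mapping: any non-off mode enables scaling lists; only the default
 * mode routes quantization through kvz_scalinglist_get_default(). */
static struct scaling_flags scaling_flags_from_mode(enum scalinglist_mode mode)
{
  struct scaling_flags f = { 0, 0 };
  f.enable = (mode != SCALING_LIST_OFF);
  f.use_default_list = (mode == SCALING_LIST_DEFAULT);
  return f;
}

int main(void)
{
  struct scaling_flags f = scaling_flags_from_mode(SCALING_LIST_DEFAULT);
  printf("enable=%d use_default_list=%d\n", f.enable, f.use_default_list);
  return 0;
}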
View file
kvazaar-1.2.0.tar.gz/src/scalinglist.h -> kvazaar-1.3.0.tar.gz/src/scalinglist.h
Changed
@@ -33,6 +33,7 @@
 typedef struct {
   int8_t enable;
+  int8_t use_default_list;
   int32_t scaling_list_dc [SCALING_LIST_SIZE_NUM][SCALING_LIST_NUM];
   const int32_t *scaling_list_coeff[SCALING_LIST_SIZE_NUM][SCALING_LIST_NUM];
   const int32_t *quant_coeff[4][6][6];
View file
kvazaar-1.2.0.tar.gz/src/search.c -> kvazaar-1.3.0.tar.gz/src/search.c
Changed
@@ -116,7 +116,7 @@ } } -void kvz_lcu_set_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth) +void kvz_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth) { const int x_local = SUB_SCU(x_px); const int y_local = SUB_SCU(y_px); @@ -138,6 +138,7 @@ to->type = cu->type; to->depth = cu->depth; to->part_size = cu->part_size; + to->qp = cu->qp; if (cu->type == CU_INTRA) { to->intra.mode = cu->intra.mode; @@ -152,7 +153,7 @@ } } -static void lcu_set_inter(lcu_t *lcu, int x_local, int y_local, int cu_width) +static void lcu_fill_inter(lcu_t *lcu, int x_local, int y_local, int cu_width) { const part_mode_t part_mode = LCU_GET_CU_AT_PX(lcu, x_local, y_local)->part_size; const int num_pu = kvz_part_mode_num_parts[part_mode]; @@ -169,7 +170,7 @@ } } -static void lcu_set_coeff(lcu_t *lcu, int x_local, int y_local, int width, cu_info_t *cur_cu) +static void lcu_fill_cbf(lcu_t *lcu, int x_local, int y_local, int width, cu_info_t *cur_cu) { const uint32_t tr_split = cur_cu->tr_depth - cur_cu->depth; const uint32_t mask = ~((width >> tr_split)-1); @@ -189,6 +190,40 @@ } +//Calculates cost for all zero coeffs +static double cu_zero_coeff_cost(const encoder_state_t *state, lcu_t *work_tree, const int x, const int y, + const int depth) +{ + int x_local = SUB_SCU(x); + int y_local = SUB_SCU(y); + int cu_width = LCU_WIDTH >> depth; + lcu_t *const lcu = &work_tree[depth]; + + const int luma_index = y_local * LCU_WIDTH + x_local; + const int chroma_index = (y_local / 2) * LCU_WIDTH_C + (x_local / 2); + + double ssd = 0.0; + ssd += LUMA_MULT * kvz_pixels_calc_ssd( + &lcu->ref.y[luma_index], &lcu->rec.y[luma_index], + LCU_WIDTH, LCU_WIDTH, cu_width + ); + if (x % 8 == 0 && y % 8 == 0 && state->encoder_control->chroma_format != KVZ_CSP_400) { + ssd += CHROMA_MULT * kvz_pixels_calc_ssd( + &lcu->ref.u[chroma_index], &lcu->rec.u[chroma_index], + LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 + ); + ssd += CHROMA_MULT * kvz_pixels_calc_ssd( + &lcu->ref.v[chroma_index], &lcu->rec.v[chroma_index], + LCU_WIDTH_C, LCU_WIDTH_C, cu_width / 2 + ); + } + // Save the pixels at a lower level of the working tree. + copy_cu_pixels(x_local, y_local, cu_width, lcu, &work_tree[depth + 1]); + + return ssd; +} + + /** * Calculate RD cost for a Coding Unit. * \return Cost of block @@ -368,6 +403,30 @@ } +/** + * \brief Sort modes and costs to ascending order according to costs. + */ +void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length) +{ + // Length for intra is always between 5 and 23, and is either 21, 17, 9 or 8 about + // 60% of the time, so there should be no need for anything more complex + // than insertion sort. + // Length for merge is 5 or less. + for (uint8_t i = 1; i < length; ++i) { + const double cur_cost = costs[i]; + const int8_t cur_mode = modes[i]; + uint8_t j = i; + while (j > 0 && cur_cost < costs[j - 1]) { + costs[j] = costs[j - 1]; + modes[j] = modes[j - 1]; + --j; + } + costs[j] = cur_cost; + modes[j] = cur_mode; + } +} + + static uint8_t get_ctx_cu_split_model(const lcu_t *lcu, int x, int y, int depth) { vector2d_t lcu_cu = { SUB_SCU(x), SUB_SCU(y) }; @@ -392,6 +451,7 @@ const videoframe_t * const frame = state->tile->frame; int cu_width = LCU_WIDTH >> depth; double cost = MAX_INT; + double inter_zero_coeff_cost = MAX_INT; uint32_t inter_bitcost = MAX_INT; cu_info_t *cur_cu; @@ -412,6 +472,7 @@ cur_cu->tr_depth = depth > 0 ? 
depth : 1; cur_cu->type = CU_NOTSET; cur_cu->part_size = SIZE_2Nx2N; + cur_cu->qp = state->qp; // If the CU is completely inside the frame at this depth, search for // prediction modes at this depth. @@ -419,14 +480,17 @@ y + cu_width <= frame->height) { int cu_width_inter_min = LCU_WIDTH >> ctrl->cfg.pu_depth_inter.max; - bool can_use_inter = state->frame->slicetype != KVZ_SLICE_I && ( - WITHIN(depth, ctrl->cfg.pu_depth_inter.min, ctrl->cfg.pu_depth_inter.max) || - // When the split was forced because the CTU is partially outside the - // frame, we permit inter coding even if pu_depth_inter would - // otherwise forbid it. - (x & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame->width || - (y & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame->height - ); + bool can_use_inter = + state->frame->slicetype != KVZ_SLICE_I && + depth <= MAX_DEPTH && + ( + WITHIN(depth, ctrl->cfg.pu_depth_inter.min, ctrl->cfg.pu_depth_inter.max) || + // When the split was forced because the CTU is partially outside the + // frame, we permit inter coding even if pu_depth_inter would + // otherwise forbid it. + (x & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame->width || + (y & ~(cu_width_inter_min - 1)) + cu_width_inter_min > frame->height + ); if (can_use_inter) { double mode_cost; @@ -442,30 +506,31 @@ cur_cu->type = CU_INTER; } - // Try SMP and AMP partitioning. - static const part_mode_t mp_modes[] = { - // SMP - SIZE_2NxN, SIZE_Nx2N, - // AMP - SIZE_2NxnU, SIZE_2NxnD, - SIZE_nLx2N, SIZE_nRx2N, - }; - - const int first_mode = ctrl->cfg.smp_enable ? 0 : 2; - const int last_mode = (ctrl->cfg.amp_enable && cu_width >= 16) ? 5 : 1; - for (int i = first_mode; i <= last_mode; ++i) { - kvz_search_cu_smp(state, - x, y, - depth, - mp_modes[i], - &work_tree[depth + 1], - &mode_cost, &mode_bitcost); - // TODO: take cost of coding part mode into account - if (mode_cost < cost) { - cost = mode_cost; - inter_bitcost = mode_bitcost; - // TODO: only copy inter prediction info, not pixels - work_tree_copy_up(x_local, y_local, depth, work_tree); + if (!(ctrl->cfg.early_skip && cur_cu->skipped)) { + // Try SMP and AMP partitioning. + static const part_mode_t mp_modes[] = { + // SMP + SIZE_2NxN, SIZE_Nx2N, + // AMP + SIZE_2NxnU, SIZE_2NxnD, + SIZE_nLx2N, SIZE_nRx2N, + }; + + const int first_mode = ctrl->cfg.smp_enable ? 0 : 2; + const int last_mode = (ctrl->cfg.amp_enable && cu_width >= 16) ? 5 : 1; + for (int i = first_mode; i <= last_mode; ++i) { + kvz_search_cu_smp(state, + x, y, + depth, + mp_modes[i], + &work_tree[depth + 1], + &mode_cost, &mode_bitcost); + if (mode_cost < cost) { + cost = mode_cost; + inter_bitcost = mode_bitcost; + // Copy inter prediction info to current level. + copy_cu_info(x_local, y_local, cu_width, &work_tree[depth + 1], lcu); + } } } } @@ -473,9 +538,10 @@ // Try to skip intra search in rd==0 mode. // This can be quite severe on bdrate. It might be better to do this // decision after reconstructing the inter frame. - bool skip_intra = state->encoder_control->cfg.rdo == 0 + bool skip_intra = (state->encoder_control->cfg.rdo == 0 && cur_cu->type != CU_NOTSET - && cost / (cu_width * cu_width) < INTRA_THRESHOLD; + && cost / (cu_width * cu_width) < INTRA_THRESHOLD) + || (ctrl->cfg.early_skip && cur_cu->skipped); int32_t cu_width_intra_min = LCU_WIDTH >> ctrl->cfg.pu_depth_intra.max; bool can_use_intra = @@ -516,7 +582,7 @@ // rd2. Possibly because the luma mode search already takes chroma // into account, so there is less of a chanse of luma mode being // really bad for chroma. 
- if (state->encoder_control->cfg.rdo == 3) { + if (ctrl->cfg.rdo == 3) { cur_cu->intra.mode_chroma = kvz_search_cu_intra_chroma(state, x, y, depth, lcu); lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); } @@ -528,74 +594,47 @@ NULL, lcu); } } else if (cur_cu->type == CU_INTER) { - // Reset transform depth because intra messes with them. - // This will no longer be necessary if the transform depths are not shared. - int tr_depth = depth > 0 ? depth : 1; - kvz_lcu_set_trdepth(lcu, x, y, depth, tr_depth); - - const int num_pu = kvz_part_mode_num_parts[cur_cu->part_size]; - for (int i = 0; i < num_pu; ++i) { - const int pu_x = PU_GET_X(cur_cu->part_size, cu_width, x, i); - const int pu_y = PU_GET_Y(cur_cu->part_size, cu_width, y, i); - const int pu_w = PU_GET_W(cur_cu->part_size, cu_width, i); - const int pu_h = PU_GET_H(cur_cu->part_size, cu_width, i); - - cu_info_t *cur_pu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(pu_x), SUB_SCU(pu_y)); - - if (cur_pu->inter.mv_dir == 3) { - const kvz_picture *const refs[2] = { - state->frame->ref->images[ - state->frame->ref_LX[0][ - cur_pu->inter.mv_ref[0]]], - state->frame->ref->images[ - state->frame->ref_LX[1][ - cur_pu->inter.mv_ref[1]]], - }; - kvz_inter_recon_lcu_bipred(state, - refs[0], refs[1], - pu_x, pu_y, - pu_w, pu_h, - cur_pu->inter.mv, - lcu); - } else { - const int mv_idx = cur_pu->inter.mv_dir - 1; - - const kvz_picture *const ref = - state->frame->ref->images[ - state->frame->ref_LX[mv_idx][ - cur_pu->inter.mv_ref[mv_idx]]]; - - kvz_inter_recon_lcu(state, - ref, - pu_x, pu_y, - pu_w, pu_h, - cur_pu->inter.mv[mv_idx], - lcu, - 0); + + if (!cur_cu->skipped) { + // Reset transform depth because intra messes with them. + // This will no longer be necessary if the transform depths are not shared. 
+ int tr_depth = MAX(1, depth); + if (cur_cu->part_size != SIZE_2Nx2N) { + tr_depth = depth + 1; } - } + kvz_lcu_fill_trdepth(lcu, x, y, depth, tr_depth); - const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; - kvz_quantize_lcu_residual(state, - true, has_chroma, - x, y, depth, - NULL, - lcu); + kvz_inter_recon_cu(state, lcu, x, y, cu_width); - int cbf = cbf_is_set_any(cur_cu->cbf, depth); + if (!ctrl->cfg.lossless && !ctrl->cfg.rdoq_enable) { + //Calculate cost for zero coeffs + inter_zero_coeff_cost = cu_zero_coeff_cost(state, work_tree, x, y, depth) + inter_bitcost * state->lambda; - if(cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) { - cur_cu->merged = 0; - cur_cu->skipped = 1; - // Selecting skip reduces bits needed to code the CU - if (inter_bitcost > 1) { - inter_bitcost -= 1; + } + + const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; + kvz_quantize_lcu_residual(state, + true, has_chroma, + x, y, depth, + NULL, + lcu); + + int cbf = cbf_is_set_any(cur_cu->cbf, depth); + + if (cur_cu->merged && !cbf && cur_cu->part_size == SIZE_2Nx2N) { + cur_cu->merged = 0; + cur_cu->skipped = 1; + // Selecting skip reduces bits needed to code the CU + if (inter_bitcost > 1) { + inter_bitcost -= 1; + } } } - lcu_set_inter(lcu, x_local, y_local, cu_width); - lcu_set_coeff(lcu, x_local, y_local, cu_width, cur_cu); + lcu_fill_inter(lcu, x_local, y_local, cu_width); + lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); } } + if (cur_cu->type == CU_INTRA || cur_cu->type == CU_INTER) { cost = kvz_cu_rd_cost_luma(state, x_local, y_local, depth, cur_cu, lcu); if (state->encoder_control->chroma_format != KVZ_CSP_400) { @@ -610,6 +649,28 @@ } cost += mode_bits * state->lambda; + + if (inter_zero_coeff_cost <= cost) { + cost = inter_zero_coeff_cost; + + // Restore saved pixels from lower level of the working tree. + copy_cu_pixels(x_local, y_local, cu_width, &work_tree[depth + 1], lcu); + + if (cur_cu->merged && cur_cu->part_size == SIZE_2Nx2N) { + cur_cu->merged = 0; + cur_cu->skipped = 1; + lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); + } + + if (cur_cu->tr_depth != depth) { + // Reset transform depth since there are no coefficients. This + // ensures that CBF is cleared for the whole area of the CU. + kvz_lcu_fill_trdepth(lcu, x, y, depth, depth); + } + + cur_cu->cbf = 0; + lcu_fill_cbf(lcu, x_local, y_local, cu_width, cur_cu); + } } bool can_split_cu = @@ -672,7 +733,7 @@ cur_cu->type = CU_INTRA; cur_cu->part_size = SIZE_2Nx2N; - kvz_lcu_set_trdepth(lcu, x, y, depth, cur_cu->tr_depth); + kvz_lcu_fill_trdepth(lcu, x, y, depth, cur_cu->tr_depth); lcu_fill_cu_info(lcu, x_local, y_local, cu_width, cu_width, cur_cu); const bool has_chroma = state->encoder_control->chroma_format != KVZ_CSP_400;
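search.c gains kvz_sort_modes, a plain insertion sort that keeps prediction modes and their RD costs in lockstep; the comment in the diff explains why insertion sort suffices for the short lists involved. A stand-alone usage example follows (the sort body is repeated here only so the program compiles on its own).

#include <stdint.h>
#include <stdio.h>

/* Same insertion sort as kvz_sort_modes above. */
static void sort_modes(int8_t *modes, double *costs, uint8_t length)
{
  for (uint8_t i = 1; i < length; ++i) {
    const double cur_cost = costs[i];
    const int8_t cur_mode = modes[i];
    uint8_t j = i;
    while (j > 0 && cur_cost < costs[j - 1]) {
      costs[j] = costs[j - 1];
      modes[j] = modes[j - 1];
      --j;
    }
    costs[j] = cur_cost;
    modes[j] = cur_mode;
  }
}

int main(void)
{
  int8_t modes[5] = { 0, 1, 10, 26, 34 };          /* e.g. intra prediction modes */
  double costs[5] = { 7.5, 3.25, 9.0, 1.5, 3.25 }; /* their RD costs */

  sort_modes(modes, costs, 5);

  for (int i = 0; i < 5; ++i) {
    printf("mode %2d  cost %.2f\n", modes[i], costs[i]);
  }
  return 0; /* prints modes in order 26, 1, 34, 0, 10 */
}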
View file
kvazaar-1.2.0.tar.gz/src/search.h -> kvazaar-1.3.0.tar.gz/src/search.h
Changed
@@ -31,6 +31,7 @@
 #include "global.h" // IWYU pragma: keep
 #include "image.h"

+void kvz_sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length);

 void kvz_search_lcu(encoder_state_t *state, int x, int y, const yuv_t *hor_buf, const yuv_t *ver_buf);
@@ -42,7 +43,7 @@
                            const int x_px, const int y_px, const int depth,
                            const cu_info_t *const pred_cu, lcu_t *const lcu);
-void kvz_lcu_set_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth);
+void kvz_lcu_fill_trdepth(lcu_t *lcu, int x_px, int y_px, int depth, int tr_depth);
 void kvz_intra_recon_lcu_luma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);
 void kvz_intra_recon_lcu_chroma(encoder_state_t * const state, int x, int y, int depth, int8_t intra_mode, cu_info_t *cur_cu, lcu_t *lcu);
View file
kvazaar-1.2.0.tar.gz/src/search_inter.c -> kvazaar-1.3.0.tar.gz/src/search_inter.c
Changed
@@ -30,11 +30,12 @@ #include "inter.h" #include "kvazaar.h" #include "rdo.h" +#include "search.h" #include "strategies/strategies-ipol.h" #include "strategies/strategies-picture.h" +#include "transform.h" #include "videoframe.h" - typedef struct { encoder_state_t *state; @@ -77,6 +78,13 @@ * \brief Bit cost of best_mv */ uint32_t best_bitcost; + + /** + * \brief Possible optimized SAD implementation for the width, leave as + * NULL for arbitrary-width blocks + */ + optimized_sad_func_ptr_t optimized_sad; + } inter_search_info_t; @@ -204,7 +212,8 @@ info->state->tile->offset_x + info->origin.x + x, info->state->tile->offset_y + info->origin.y + y, info->width, - info->height + info->height, + info->optimized_sad ); if (cost >= info->best_cost) return false; @@ -261,8 +270,8 @@ for (int i = 0; i < info->num_merge_cand; ++i) { if (info->merge_cand[i].dir == 3) continue; const vector2d_t merge_mv = { - info->merge_cand[i].mv[info->merge_cand[i].dir - 1][0] >> 2, - info->merge_cand[i].mv[info->merge_cand[i].dir - 1][1] >> 2 + (info->merge_cand[i].mv[info->merge_cand[i].dir - 1][0] + 2) >> 2, + (info->merge_cand[i].mv[info->merge_cand[i].dir - 1][1] + 2) >> 2 }; if (merge_mv.x == mv.x && merge_mv.y == mv.y) { return true; @@ -296,8 +305,8 @@ for (unsigned i = 0; i < info->num_merge_cand; ++i) { if (info->merge_cand[i].dir == 3) continue; - int x = info->merge_cand[i].mv[info->merge_cand[i].dir - 1][0] >> 2; - int y = info->merge_cand[i].mv[info->merge_cand[i].dir - 1][1] >> 2; + int x = (info->merge_cand[i].mv[info->merge_cand[i].dir - 1][0] + 2) >> 2; + int y = (info->merge_cand[i].mv[info->merge_cand[i].dir - 1][1] + 2) >> 2; if (x == 0 && y == 0) continue; @@ -307,32 +316,65 @@ static uint32_t get_mvd_coding_cost(const encoder_state_t *state, - vector2d_t *mvd, - const cabac_data_t* cabac) + const cabac_data_t* cabac, + const int32_t mvd_hor, + const int32_t mvd_ver) { unsigned bitcost = 0; - const vector2d_t abs_mvd = { abs(mvd->x), abs(mvd->y) }; + const vector2d_t abs_mvd = { abs(mvd_hor), abs(mvd_ver) }; - bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[0], abs_mvd.x > 0); - if (abs_mvd.x > 0) { - bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[1], abs_mvd.x > 1); - if (abs_mvd.x > 1) { - bitcost += get_ep_ex_golomb_bitcost(abs_mvd.x - 2) << CTX_FRAC_BITS; - } - bitcost += CTX_FRAC_ONE_BIT; // sign + bitcost += get_ep_ex_golomb_bitcost(abs_mvd.x) << CTX_FRAC_BITS; + bitcost += get_ep_ex_golomb_bitcost(abs_mvd.y) << CTX_FRAC_BITS; + + // Round and shift back to integer bits. + return (bitcost + CTX_FRAC_HALF_BIT) >> CTX_FRAC_BITS; +} + + +static int select_mv_cand(const encoder_state_t *state, + int16_t mv_cand[2][2], + int32_t mv_x, + int32_t mv_y, + uint32_t *cost_out) +{ + const bool same_cand = + (mv_cand[0][0] == mv_cand[1][0] && mv_cand[0][1] == mv_cand[1][1]); + + if (same_cand && !cost_out) { + // Pick the first one if both candidates are the same. + return 0; } - bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[0], abs_mvd.y > 0); - if (abs_mvd.y > 0) { - bitcost += CTX_ENTROPY_BITS(&cabac->ctx.cu_mvd_model[1], abs_mvd.y > 1); - if (abs_mvd.y > 1) { - bitcost += get_ep_ex_golomb_bitcost(abs_mvd.y - 2) << CTX_FRAC_BITS; - } - bitcost += CTX_FRAC_ONE_BIT; // sign + uint32_t (*mvd_coding_cost)(const encoder_state_t * const state, + const cabac_data_t*, + int32_t, int32_t); + if (state->encoder_control->cfg.mv_rdo) { + mvd_coding_cost = kvz_get_mvd_coding_cost_cabac; + } else { + mvd_coding_cost = get_mvd_coding_cost; } - // Round and shift back to integer bits. 
- return (bitcost + CTX_FRAC_HALF_BIT) >> CTX_FRAC_BITS; + uint32_t cand1_cost = mvd_coding_cost( + state, &state->cabac, + mv_x - mv_cand[0][0], + mv_y - mv_cand[0][1]); + + uint32_t cand2_cost; + if (same_cand) { + cand2_cost = cand1_cost; + } else { + cand2_cost = mvd_coding_cost( + state, &state->cabac, + mv_x - mv_cand[1][0], + mv_y - mv_cand[1][1]); + } + + if (cost_out) { + *cost_out = MIN(cand1_cost, cand2_cost); + } + + // Pick the second candidate if it has lower cost. + return cand2_cost < cand1_cost ? 1 : 0; } @@ -348,10 +390,7 @@ { uint32_t temp_bitcost = 0; uint32_t merge_idx; - int cand1_cost,cand2_cost; - vector2d_t mvd_temp1, mvd_temp2; int8_t merged = 0; - int8_t cur_mv_cand = 0; x *= 1 << mv_shift; y *= 1 << mv_shift; @@ -371,20 +410,10 @@ } // Check mvd cost only if mv is not merged - if(!merged) { - mvd_temp1.x = x - mv_cand[0][0]; - mvd_temp1.y = y - mv_cand[0][1]; - cand1_cost = get_mvd_coding_cost(state, &mvd_temp1, &state->cabac); - - mvd_temp2.x = x - mv_cand[1][0]; - mvd_temp2.y = y - mv_cand[1][1]; - cand2_cost = get_mvd_coding_cost(state, &mvd_temp2, &state->cabac); - - // Select candidate 1 if it has lower cost - if (cand2_cost < cand1_cost) { - cur_mv_cand = 1; - } - temp_bitcost += cur_mv_cand ? cand2_cost : cand1_cost; + if (!merged) { + uint32_t mvd_cost = 0; + select_mv_cand(state, mv_cand, x, y, &mvd_cost); + temp_bitcost += mvd_cost; } *bitcost = temp_bitcost; return temp_bitcost*(int32_t)(state->lambda_sqrt + 0.5); @@ -442,6 +471,7 @@ void kvz_tz_pattern_search(inter_search_info_t *info, unsigned pattern_type, const int iDist, + vector2d_t mv, int *best_dist) { assert(pattern_type < 4); @@ -537,8 +567,6 @@ }; } - const vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; - // Compute SAD values for all chosen points. int best_index = -1; for (int i = 0; i < n_points; i++) { @@ -579,8 +607,9 @@ const int iRaster = 5; // search distance limit and downsampling factor for step 3 const unsigned step2_type = 0; // search patterns for steps 2 and 4 const unsigned step4_type = 0; - const bool bRasterRefinementEnable = true; // enable step 4 mode 1 - const bool bStarRefinementEnable = false; // enable step 4 mode 2 (only one mode will be executed) + const bool use_raster_scan = false; // enable step 3 + const bool use_raster_refinement = false; // enable step 4 mode 1 + const bool use_star_refinement = true; // enable step 4 mode 2 (only one mode will be executed) int best_dist = 0; info->best_cost = UINT32_MAX; @@ -596,13 +625,33 @@ return; } - //step 2, grid search + vector2d_t start = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; + + // step 2, grid search + int rounds_without_improvement = 0; for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) { - kvz_tz_pattern_search(info, step2_type, iDist, &best_dist); + kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist); + + // Break the loop if the last three rounds didn't produce a better MV. 
+ if (best_dist != iDist) rounds_without_improvement++; + if (rounds_without_improvement >= 3) break; + } + + if (start.x != 0 || start.y != 0) { + // repeat step 2 starting from the zero MV + start.x = 0; + start.y = 0; + rounds_without_improvement = 0; + for (int iDist = 1; iDist <= iSearchRange/2; iDist *= 2) { + kvz_tz_pattern_search(info, step2_type, iDist, start, &best_dist); + + if (best_dist != iDist) rounds_without_improvement++; + if (rounds_without_improvement >= 3) break; + } } //step 3, raster scan - if (best_dist > iRaster) { + if (use_raster_scan && best_dist > iRaster) { best_dist = iRaster; kvz_tz_raster_search(info, iSearchRange, iRaster); } @@ -610,16 +659,21 @@ //step 4 //raster refinement - if (bRasterRefinementEnable && best_dist > 0) { + if (use_raster_refinement && best_dist > 0) { for (int iDist = best_dist >> 1; iDist > 0; iDist >>= 1) { - kvz_tz_pattern_search(info, step4_type, iDist, &best_dist); + start.x = info->best_mv.x >> 2; + start.y = info->best_mv.y >> 2; + kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist); } } //star refinement (repeat step 2 for the current starting point) - if (bStarRefinementEnable && best_dist > 0) { + while (use_star_refinement && best_dist > 0) { + best_dist = 0; + start.x = info->best_mv.x >> 2; + start.y = info->best_mv.y >> 2; for (int iDist = 1; iDist <= iSearchRange; iDist *= 2) { - kvz_tz_pattern_search(info, step4_type, iDist, &best_dist); + kvz_tz_pattern_search(info, step4_type, iDist, start, &best_dist); } } } @@ -630,6 +684,7 @@ * * \param info search info * \param extra_mv extra motion vector to check + * \param steps how many steps are done at maximum before exiting, does not affect the final step * * Motion vector is searched by first searching iteratively with the large * hexagon pattern until the best match is at the center of the hexagon. @@ -640,7 +695,7 @@ * the predicted motion vector is way off. In the future even more additional * points like 0,0 might be used, such as vectors from top or left. */ -static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv) +static void hexagon_search(inter_search_info_t *info, vector2d_t extra_mv, uint32_t steps) { // The start of the hexagonal pattern has been repeated at the end so that // the indices between 1-6 can be used as the start of a 3-point list of new @@ -659,9 +714,10 @@ // 1 // 2 0 3 // 4 - static const vector2d_t small_hexbs[5] = { + static const vector2d_t small_hexbs[9] = { { 0, 0 }, - { 0, -1 }, { -1, 0 }, { 1, 0 }, { 0, 1 } + { 0, -1 }, { -1, 0 }, { 1, 0 }, { 0, 1 }, + { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; info->best_cost = UINT32_MAX; @@ -679,7 +735,7 @@ vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; - // Current best index, either to merge_cands, large_hebx or small_hexbs. + // Current best index, either to merge_cands, large_hexbs or small_hexbs. int best_index = 0; // Search the initial 7 points of the hexagon. @@ -691,7 +747,10 @@ // Iteratively search the 3 new points around the best match, until the best // match is in the center. - while (best_index != 0) { + while (best_index != 0 && steps != 0) { + // decrement count if enabled + if (steps > 0) steps -= 1; + // Starting point of the 3 offsets to be searched. unsigned start; if (best_index == 1) { @@ -717,16 +776,120 @@ } // Move the center to the best match. 
- mv.x += large_hexbs[best_index].x; - mv.y += large_hexbs[best_index].y; - best_index = 0; + //mv.x += large_hexbs[best_index].x; + //mv.y += large_hexbs[best_index].y; // Do the final step of the search with a small pattern. - for (int i = 1; i < 5; ++i) { + for (int i = 1; i < 9; ++i) { check_mv_cost(info, mv.x + small_hexbs[i].x, mv.y + small_hexbs[i].y); } } +/** +* \brief Do motion search using the diamond algorithm. +* +* \param info search info +* \param extra_mv extra motion vector to check +* \param steps how many steps are done at maximum before exiting +* +* Motion vector is searched by searching iteratively with a diamond-shaped +* pattern. We take care of not checking the direction we came from, but +* further checking for avoiding visits to already visited points is not done. +* +* If a non 0,0 predicted motion vector predictor is given as extra_mv, +* the 0,0 vector is also tried. This is hoped to help in the case where +* the predicted motion vector is way off. In the future even more additional +* points like 0,0 might be used, such as vectors from top or left. +**/ +static void diamond_search(inter_search_info_t *info, vector2d_t extra_mv, uint32_t steps) +{ + enum diapos { + DIA_UP = 0, + DIA_RIGHT = 1, + DIA_LEFT = 2, + DIA_DOWN = 3, + DIA_CENTER = 4, + }; + + // a diamond shape with the center included + // 0 + // 2 4 1 + // 3 + static const vector2d_t diamond[5] = { + {0, -1}, {1, 0}, {0, 1}, {-1, 0}, + {0, 0} + }; + + info->best_cost = UINT32_MAX; + + // Select starting point from among merge candidates. These should + // include both mv_cand vectors and (0, 0). + select_starting_point(info, extra_mv); + + // Check if we should stop search + if (info->state->encoder_control->cfg.me_early_termination && + early_terminate(info)) + { + return; + } + + // current motion vector + vector2d_t mv = { info->best_mv.x >> 2, info->best_mv.y >> 2 }; + + // current best index + enum diapos best_index = DIA_CENTER; + + // initial search of the points of the diamond + for (int i = 0; i < 5; ++i) { + if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y)) { + best_index = i; + } + } + + if (best_index == DIA_CENTER) { + // the center point was the best in initial check + return; + } + + // Move the center to the best match. + mv.x += diamond[best_index].x; + mv.y += diamond[best_index].y; + + // the arrival direction, the index of the diamond member that will be excluded + enum diapos from_dir = DIA_CENTER; + + // whether we found a better candidate this iteration + uint8_t better_found; + + do { + better_found = 0; + // decrement count if enabled + if (steps > 0) steps -= 1; + + // search the points of the diamond + for (int i = 0; i < 4; ++i) { + // this is where we came from so it's checked already + if (i == from_dir) continue; + + if (check_mv_cost(info, mv.x + diamond[i].x, mv.y + diamond[i].y)) { + best_index = i; + better_found = 1; + } + } + + if (better_found) { + // Move the center to the best match. 
+ mv.x += diamond[best_index].x; + mv.y += diamond[best_index].y; + + // record where we came from to the next iteration + // the xor operation flips the orientation + from_dir = best_index ^ 0x3; + } + } while (better_found && steps != 0); + // and we're done +} + static void search_mv_full(inter_search_info_t *info, int32_t search_range, @@ -830,65 +993,39 @@ unsigned costs[4] = { 0 }; kvz_extended_block src = { 0, 0, 0, 0 }; + ALIGNED(64) kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH]; - // Buffers for interpolated fractional pixels one - // for each position excluding the integer position. - // Has one extra column on left and row on top because - // samples are used also from those integer pixels when - // searching positions to the left and up. - frac_search_block fracpel_blocks[15]; - - kvz_pixel *hpel_pos[8]; - - // Horizontal hpel positions - hpel_pos[0] = fracpel_blocks[HPEL_POS_HOR] + (LCU_WIDTH + 1); - hpel_pos[1] = fracpel_blocks[HPEL_POS_HOR] + (LCU_WIDTH + 1) + 1; - - // Vertical hpel positions - hpel_pos[2] = fracpel_blocks[HPEL_POS_VER] + 1; - hpel_pos[3] = fracpel_blocks[HPEL_POS_VER] + (LCU_WIDTH + 1) + 1; - - // Diagonal hpel positions - hpel_pos[4] = fracpel_blocks[HPEL_POS_DIA]; - hpel_pos[5] = fracpel_blocks[HPEL_POS_DIA] + 1; - hpel_pos[6] = fracpel_blocks[HPEL_POS_DIA] + (LCU_WIDTH + 1); - hpel_pos[7] = fracpel_blocks[HPEL_POS_DIA] + (LCU_WIDTH + 1) + 1; + // Storage buffers for intermediate horizontally filtered results. + // Have the first columns in contiguous memory for vectorization. + ALIGNED(64) int16_t intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH]; + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1]; const kvz_picture *ref = info->ref; const kvz_picture *pic = info->pic; vector2d_t orig = info->origin; const int width = info->width; const int height = info->height; + const int internal_width = ((width + 7) >> 3) << 3; // Round up to closest 8 + const int internal_height = ((height + 7) >> 3) << 3; const encoder_state_t *state = info->state; int fme_level = state->encoder_control->cfg.fme_level; + int8_t sample_off_x = 0; + int8_t sample_off_y = 0; kvz_get_extended_block(orig.x, orig.y, mv.x - 1, mv.y - 1, state->tile->offset_x, state->tile->offset_y, - ref->y, ref->width, ref->height, FILTER_SIZE, - width+1, height+1, + ref->y, ref->width, ref->height, KVZ_LUMA_FILTER_TAPS, + internal_width+1, internal_height+1, &src); - kvz_filter_frac_blocks_luma(state->encoder_control, - src.orig_topleft, - src.stride, - width, - height, - fracpel_blocks, - fme_level); - - kvz_pixel tmp_pic[LCU_WIDTH*LCU_WIDTH]; - kvz_pixels_blit(pic->y + orig.y * pic->stride + orig.x, - tmp_pic, - width, - height, - pic->stride, - width); - + kvz_pixel *tmp_pic = pic->y + orig.y * pic->stride + orig.x; + int tmp_stride = pic->stride; + // Search integer position costs[0] = kvz_satd_any_size(width, height, - tmp_pic, width, + tmp_pic, tmp_stride, src.orig_topleft + src.stride + 1, src.stride); costs[0] += info->mvd_cost_func(state, @@ -900,31 +1037,51 @@ &bitcosts[0]); best_cost = costs[0]; best_bitcost = bitcosts[0]; - - int last_hpel_index = (fme_level == 1) ? 
4 : 8; - + //Set mv to half-pixel precision mv.x *= 2; mv.y *= 2; + ipol_blocks_func * filter_steps[4] = { + kvz_filter_hpel_blocks_hor_ver_luma, + kvz_filter_hpel_blocks_diag_luma, + kvz_filter_qpel_blocks_hor_ver_luma, + kvz_filter_qpel_blocks_diag_luma, + }; + // Search halfpel positions around best integer mv - for (int i = 1; i <= last_hpel_index; i += 4) { + int i = 1; + for (int step = 0; step < fme_level; ++step){ + + const int mv_shift = (step < 2) ? 1 : 0; + + filter_steps[step](state->encoder_control, + src.orig_topleft, + src.stride, + internal_width, + internal_height, + filtered, + intermediate, + fme_level, + hor_first_cols, + sample_off_x, + sample_off_y); + const vector2d_t *pattern[4] = { &square[i], &square[i + 1], &square[i + 2], &square[i + 3] }; int8_t within_tile[4]; for (int j = 0; j < 4; j++) { within_tile[j] = - fracmv_within_tile(info, (mv.x + pattern[j]->x) * 2, (mv.y + pattern[j]->y) * 2); + fracmv_within_tile(info, (mv.x + pattern[j]->x) * (1 << mv_shift), (mv.y + pattern[j]->y) * (1 << mv_shift)); }; - int hpel_strides[4] = { - (LCU_WIDTH + 1), - (LCU_WIDTH + 1), - (LCU_WIDTH + 1), - (LCU_WIDTH + 1) - }; + kvz_pixel *filtered_pos[4] = { 0 }; + filtered_pos[0] = &filtered[0][0]; + filtered_pos[1] = &filtered[1][0]; + filtered_pos[2] = &filtered[2][0]; + filtered_pos[3] = &filtered[3][0]; - kvz_satd_any_size_quad(width, height, (const kvz_pixel**)(hpel_pos + i - 1), hpel_strides, tmp_pic, width, 4, costs, within_tile); + kvz_satd_any_size_quad(width, height, (const kvz_pixel **)filtered_pos, LCU_WIDTH, tmp_pic, tmp_stride, 4, costs, within_tile); for (int j = 0; j < 4; j++) { if (within_tile[j]) { @@ -932,7 +1089,7 @@ state, mv.x + pattern[j]->x, mv.y + pattern[j]->y, - 1, + mv_shift, info->mv_cand, info->merge_cand, info->num_merge_cand, @@ -949,108 +1106,26 @@ best_index = i + j; } } - } - - unsigned int best_hpel_index = best_index; - // Move search to best_index - mv.x += square[best_index].x; - mv.y += square[best_index].y; - - //Set mv to quarterpel precision - mv.x *= 2; - mv.y *= 2; - - if (fme_level >= 3) { - - best_index = 0; - - int last_qpel_index = (fme_level == 3) ? 
4 : 8; - - //Search quarterpel points around best halfpel mv - for (int i = 1; i <= last_qpel_index; i += 4) { - const vector2d_t *pattern[4] = { &square[i], &square[i + 1], &square[i + 2], &square[i + 3] }; - - int8_t within_tile[4]; - for (int j = 0; j < 4; j++) { - within_tile[j] = - fracmv_within_tile(info, mv.x + pattern[j]->x, mv.y + pattern[j]->y); - } - - int qpel_indices[4] = { 0 }; - int int_offset_x[4] = { 0 }; - int int_offset_y[4] = { 0 }; - - for (int j = 0; j < 4; ++j) { - int hpel_offset_x = square[best_hpel_index].x; - int hpel_offset_y = square[best_hpel_index].y; - - int qpel_offset_x = 2 * hpel_offset_x + pattern[j]->x; - int qpel_offset_y = 2 * hpel_offset_y + pattern[j]->y; - - unsigned qpel_filter_x = (qpel_offset_x + 4) % 4; - unsigned qpel_filter_y = (qpel_offset_y + 4) % 4; - - // The first value (-1) is for the integer position and - // it will not be used - int filters_to_block_idx[4][4] = { - { -1, 3, 0, 4 }, - { 7, 11, 8, 12 }, - { 1, 5, 2, 6 }, - { 9, 13, 10, 14 } - }; - - qpel_indices[j] = filters_to_block_idx[qpel_filter_y][qpel_filter_x]; - - // Select values filtered from correct integer samples - int_offset_x[j] = qpel_offset_x >= 0; - int_offset_y[j] = qpel_offset_y >= 0; - } - - kvz_pixel *qpel_pos[4] = { - fracpel_blocks[qpel_indices[0]] + int_offset_y[0] * (LCU_WIDTH + 1) + int_offset_x[0], - fracpel_blocks[qpel_indices[1]] + int_offset_y[1] * (LCU_WIDTH + 1) + int_offset_x[1], - fracpel_blocks[qpel_indices[2]] + int_offset_y[2] * (LCU_WIDTH + 1) + int_offset_x[2], - fracpel_blocks[qpel_indices[3]] + int_offset_y[3] * (LCU_WIDTH + 1) + int_offset_x[3] - }; - - int qpel_strides[4] = { - (LCU_WIDTH + 1), - (LCU_WIDTH + 1), - (LCU_WIDTH + 1), - (LCU_WIDTH + 1) - }; - - kvz_satd_any_size_quad(width, height, (const kvz_pixel**)qpel_pos, qpel_strides, tmp_pic, width, 4, costs, within_tile); - - for (int j = 0; j < 4; j++) { - if (within_tile[j]) { - costs[j] += info->mvd_cost_func( - state, - mv.x + pattern[j]->x, - mv.y + pattern[j]->y, - 0, - info->mv_cand, - info->merge_cand, - info->num_merge_cand, - info->ref_idx, - &bitcosts[j] - ); - } - } - - for (int j = 0; j < 4; ++j) { - if (within_tile[j] && costs[j] < best_cost) { - best_cost = costs[j]; - best_bitcost = bitcosts[j]; - best_index = i + j; - } + i += 4; + + // Update mv for the best position on current precision + if (step == 1 || step == fme_level - 1) { + // Move search to best_index + mv.x += square[best_index].x; + mv.y += square[best_index].y; + + // On last hpel step... + if (step == MIN(fme_level - 1, 1)) { + //Set mv to quarterpel precision + mv.x *= 2; + mv.y *= 2; + sample_off_x = square[best_index].x; + sample_off_y = square[best_index].y; + best_index = 0; + i = 1; } } - - //Set mv to best final best match - mv.x += square[best_index].x; - mv.y += square[best_index].y; } info->best_mv = mv; @@ -1162,8 +1237,12 @@ search_mv_full(info, search_range, mv); break; + case KVZ_IME_DIA: + diamond_search(info, mv, info->state->encoder_control->cfg.me_max_steps); + break; + default: - hexagon_search(info, mv); + hexagon_search(info, mv, info->state->encoder_control->cfg.me_max_steps); break; } @@ -1203,30 +1282,9 @@ // Only check when candidates are different int cu_mv_cand = 0; - if (!merged && ( - info->mv_cand[0][0] != info->mv_cand[1][0] || - info->mv_cand[0][1] != info->mv_cand[1][1])) - { - uint32_t (*mvd_coding_cost)(const encoder_state_t * const state, - vector2d_t *, - const cabac_data_t*) = - cfg->mv_rdo ? 
kvz_get_mvd_coding_cost_cabac : get_mvd_coding_cost; - - vector2d_t mvd_temp1, mvd_temp2; - int cand1_cost,cand2_cost; - - mvd_temp1.x = mv.x - info->mv_cand[0][0]; - mvd_temp1.y = mv.y - info->mv_cand[0][1]; - cand1_cost = mvd_coding_cost(info->state, &mvd_temp1, &info->state->cabac); - - mvd_temp2.x = mv.x - info->mv_cand[1][0]; - mvd_temp2.y = mv.y - info->mv_cand[1][1]; - cand2_cost = mvd_coding_cost(info->state, &mvd_temp2, &info->state->cabac); - - // Select candidate 1 if it has lower cost - if (cand2_cost < cand1_cost) { - cu_mv_cand = 1; - } + if (!merged) { + cu_mv_cand = + select_mv_cand(info->state, info->mv_cand, mv.x, mv.y, NULL); } if (info->best_cost < *inter_cost) { @@ -1249,6 +1307,141 @@ /** + * \brief Search bipred modes for a PU. + */ +static void search_pu_inter_bipred(inter_search_info_t *info, + int depth, + lcu_t *lcu, cu_info_t *cur_cu, + double *inter_cost, + uint32_t *inter_bitcost) +{ + const image_list_t *const ref = info->state->frame->ref; + uint8_t (*ref_LX)[16] = info->state->frame->ref_LX; + const videoframe_t * const frame = info->state->tile->frame; + const int x = info->origin.x; + const int y = info->origin.y; + const int width = info->width; + const int height = info->height; + + static const uint8_t priorityList0[] = { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 }; + static const uint8_t priorityList1[] = { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 }; + const unsigned num_cand_pairs = + MIN(info->num_merge_cand * (info->num_merge_cand - 1), 12); + + inter_merge_cand_t *merge_cand = info->merge_cand; + + for (int32_t idx = 0; idx < num_cand_pairs; idx++) { + uint8_t i = priorityList0[idx]; + uint8_t j = priorityList1[idx]; + if (i >= info->num_merge_cand || j >= info->num_merge_cand) break; + + // Find one L0 and L1 candidate according to the priority list + if (!(merge_cand[i].dir & 0x1) || !(merge_cand[j].dir & 0x2)) continue; + + if (ref_LX[0][merge_cand[i].ref[0]] == ref_LX[1][merge_cand[j].ref[1]] && + merge_cand[i].mv[0][0] == merge_cand[j].mv[1][0] && + merge_cand[i].mv[0][1] == merge_cand[j].mv[1][1]) + { + continue; + } + + int16_t mv[2][2]; + mv[0][0] = merge_cand[i].mv[0][0]; + mv[0][1] = merge_cand[i].mv[0][1]; + mv[1][0] = merge_cand[j].mv[1][0]; + mv[1][1] = merge_cand[j].mv[1][1]; + + // Don't try merge candidates that don't satisfy mv constraints. 
+ if (!fracmv_within_tile(info, mv[0][0], mv[0][1]) || + !fracmv_within_tile(info, mv[1][0], mv[1][1])) + { + continue; + } + + kvz_inter_recon_bipred(info->state, + ref->images[ref_LX[0][merge_cand[i].ref[0]]], + ref->images[ref_LX[1][merge_cand[j].ref[1]]], + x, y, + width, + height, + mv, + lcu); + + const kvz_pixel *rec = &lcu->rec.y[SUB_SCU(y) * LCU_WIDTH + SUB_SCU(x)]; + const kvz_pixel *src = &frame->source->y[x + y * frame->source->width]; + uint32_t cost = + kvz_satd_any_size(width, height, rec, LCU_WIDTH, src, frame->source->width); + + uint32_t bitcost[2] = { 0, 0 }; + + cost += info->mvd_cost_func(info->state, + merge_cand[i].mv[0][0], + merge_cand[i].mv[0][1], + 0, + info->mv_cand, + NULL, 0, 0, + &bitcost[0]); + cost += info->mvd_cost_func(info->state, + merge_cand[i].mv[1][0], + merge_cand[i].mv[1][1], + 0, + info->mv_cand, + NULL, 0, 0, + &bitcost[1]); + + const uint8_t mv_ref_coded[2] = { + merge_cand[i].ref[0], + merge_cand[j].ref[1] + }; + const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; + cost += info->state->lambda_sqrt * extra_bits + 0.5; + + if (cost < *inter_cost) { + cur_cu->inter.mv_dir = 3; + + cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0]; + cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1]; + + cur_cu->inter.mv[0][0] = merge_cand[i].mv[0][0]; + cur_cu->inter.mv[0][1] = merge_cand[i].mv[0][1]; + cur_cu->inter.mv[1][0] = merge_cand[j].mv[1][0]; + cur_cu->inter.mv[1][1] = merge_cand[j].mv[1][1]; + cur_cu->merged = 0; + + // Check every candidate to find a match + for (int merge_idx = 0; merge_idx < info->num_merge_cand; merge_idx++) { + if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] && + merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] && + merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] && + merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] && + merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] && + merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) + { + cur_cu->merged = 1; + cur_cu->merge_idx = merge_idx; + break; + } + } + + // Each motion vector has its own candidate + for (int reflist = 0; reflist < 2; reflist++) { + kvz_inter_get_mv_cand(info->state, x, y, width, height, info->mv_cand, cur_cu, lcu, reflist); + int cu_mv_cand = select_mv_cand( + info->state, + info->mv_cand, + cur_cu->inter.mv[reflist][0], + cur_cu->inter.mv[reflist][1], + NULL); + CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand); + } + + *inter_cost = cost; + *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits; + } + } +} + +/** * \brief Update PU to have best modes at this depth. * * \param state encoder state @@ -1300,6 +1493,7 @@ .width = width, .height = height, .mvd_cost_func = cfg->mv_rdo ? 
kvz_calc_mvd_cost_cabac : calc_mvd_cost, + .optimized_sad = kvz_get_optimized_sad(width), }; // Search for merge mode candidates @@ -1316,6 +1510,90 @@ CU_SET_MV_CAND(cur_cu, 0, 0); CU_SET_MV_CAND(cur_cu, 1, 0); + // Early Skip Mode Decision + if (cfg->early_skip && cur_cu->part_size == SIZE_2Nx2N) { + + int num_rdo_cands = 0; + int8_t mrg_cands[MRG_MAX_NUM_CANDS] = { 0, 1, 2, 3, 4 }; + double mrg_costs[MRG_MAX_NUM_CANDS] = { MAX_DOUBLE }; + + // Check motion vector constraints and perform rough search + for (int merge_idx = 0; merge_idx < info.num_merge_cand; ++merge_idx) { + + cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir; + cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; + cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; + cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; + cur_cu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1]; + cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; + cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; + + // Don't try merge candidates that don't satisfy mv constraints. + if (!fracmv_within_tile(&info, cur_cu->inter.mv[0][0], cur_cu->inter.mv[0][1]) || + !fracmv_within_tile(&info, cur_cu->inter.mv[1][0], cur_cu->inter.mv[1][1])) + { + continue; + } + + if (cfg->rdo >= 2) { + + kvz_lcu_fill_trdepth(lcu, x, y, depth, depth); + kvz_inter_recon_cu(state, lcu, x, y, width); + mrg_costs[merge_idx] = kvz_satd_any_size(width, height, + lcu->rec.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH, + lcu->ref.y + y_local * LCU_WIDTH + x_local, LCU_WIDTH); + } + + num_rdo_cands++; + } + + + if (cfg->rdo >= 2) { + // Sort candidates by cost + kvz_sort_modes(mrg_cands, mrg_costs, num_rdo_cands); + } + + // Limit by availability + // TODO: Do not limit to just 1 + num_rdo_cands = MIN(1, num_rdo_cands); + + // RDO search + for (int merge_rdo_idx = 0; merge_rdo_idx < num_rdo_cands; ++merge_rdo_idx) { + + // Reconstruct blocks with merge candidate. + // Check luma CBF. Then, check chroma CBFs if luma CBF is not set + // and chroma exists. + // Early terminate if merge candidate with zero CBF is found. 
+ int merge_idx = mrg_cands[merge_rdo_idx]; + cur_cu->inter.mv_dir = info.merge_cand[merge_idx].dir; + cur_cu->inter.mv_ref[0] = info.merge_cand[merge_idx].ref[0]; + cur_cu->inter.mv_ref[1] = info.merge_cand[merge_idx].ref[1]; + cur_cu->inter.mv[0][0] = info.merge_cand[merge_idx].mv[0][0]; + cur_cu->inter.mv[0][1] = info.merge_cand[merge_idx].mv[0][1]; + cur_cu->inter.mv[1][0] = info.merge_cand[merge_idx].mv[1][0]; + cur_cu->inter.mv[1][1] = info.merge_cand[merge_idx].mv[1][1]; + kvz_lcu_fill_trdepth(lcu, x, y, depth, MAX(1, depth)); + kvz_inter_recon_cu(state, lcu, x, y, width); + kvz_quantize_lcu_residual(state, true, false, x, y, depth, cur_cu, lcu); + + if (cbf_is_set(cur_cu->cbf, depth, COLOR_Y)) { + continue; + } + else if(state->encoder_control->chroma_format != KVZ_CSP_400) { + + kvz_quantize_lcu_residual(state, false, true, x, y, depth, cur_cu, lcu); + if (!cbf_is_set_any(cur_cu->cbf, depth)) { + cur_cu->type = CU_INTER; + cur_cu->merge_idx = merge_idx; + cur_cu->skipped = true; + *inter_cost = 0.0; // TODO: Check this + *inter_bitcost = 0; // TODO: Check this + return; + } + } + } + } + for (int ref_idx = 0; ref_idx < state->frame->ref->used_size; ref_idx++) { info.ref_idx = ref_idx; info.ref = state->frame->ref->images[ref_idx]; @@ -1329,159 +1607,7 @@ && width + height >= 16; // 4x8 and 8x4 PBs are restricted to unipred if (can_use_bipred) { - lcu_t *templcu = MALLOC(lcu_t, 1); - unsigned cu_width = LCU_WIDTH >> depth; - static const uint8_t priorityList0[] = { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 }; - static const uint8_t priorityList1[] = { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 }; - const unsigned num_cand_pairs = - MIN(info.num_merge_cand * (info.num_merge_cand - 1), 12); - - inter_merge_cand_t *merge_cand = info.merge_cand; - - for (int32_t idx = 0; idx < num_cand_pairs; idx++) { - uint8_t i = priorityList0[idx]; - uint8_t j = priorityList1[idx]; - if (i >= info.num_merge_cand || j >= info.num_merge_cand) break; - - // Find one L0 and L1 candidate according to the priority list - if ((merge_cand[i].dir & 0x1) && (merge_cand[j].dir & 0x2)) { - if (state->frame->ref_LX[0][merge_cand[i].ref[0]] != - state->frame->ref_LX[1][merge_cand[j].ref[1]] || - - merge_cand[i].mv[0][0] != merge_cand[j].mv[1][0] || - merge_cand[i].mv[0][1] != merge_cand[j].mv[1][1]) - { - uint32_t bitcost[2]; - uint32_t cost = 0; - int8_t cu_mv_cand = 0; - int16_t mv[2][2]; - kvz_pixel tmp_block[64 * 64]; - kvz_pixel tmp_pic[64 * 64]; - - mv[0][0] = merge_cand[i].mv[0][0]; - mv[0][1] = merge_cand[i].mv[0][1]; - mv[1][0] = merge_cand[j].mv[1][0]; - mv[1][1] = merge_cand[j].mv[1][1]; - - // Don't try merge candidates that don't satisfy mv constraints. 
- if (!fracmv_within_tile(&info, mv[0][0], mv[0][1]) || - !fracmv_within_tile(&info, mv[1][0], mv[1][1])) - { - continue; - } - - kvz_inter_recon_lcu_bipred(state, - state->frame->ref->images[ - state->frame->ref_LX[0][merge_cand[i].ref[0]] - ], - state->frame->ref->images[ - state->frame->ref_LX[1][merge_cand[j].ref[1]] - ], - x, y, - width, - height, - mv, - templcu); - - for (int ypos = 0; ypos < height; ++ypos) { - int dst_y = ypos * width; - for (int xpos = 0; xpos < width; ++xpos) { - tmp_block[dst_y + xpos] = templcu->rec.y[ - SUB_SCU(y + ypos) * LCU_WIDTH + SUB_SCU(x + xpos)]; - tmp_pic[dst_y + xpos] = frame->source->y[x + xpos + (y + ypos)*frame->source->width]; - } - } - - cost = kvz_satd_any_size(cu_width, cu_width, tmp_pic, cu_width, tmp_block, cu_width); - - cost += info.mvd_cost_func(state, - merge_cand[i].mv[0][0], - merge_cand[i].mv[0][1], - 0, - info.mv_cand, - NULL, 0, 0, - &bitcost[0]); - cost += info.mvd_cost_func(state, - merge_cand[i].mv[1][0], - merge_cand[i].mv[1][1], - 0, - info.mv_cand, - NULL, 0, 0, - &bitcost[1]); - - const uint8_t mv_ref_coded[2] = { - merge_cand[i].ref[0], - merge_cand[j].ref[1] - }; - const int extra_bits = mv_ref_coded[0] + mv_ref_coded[1] + 2 /* mv dir cost */; - cost += state->lambda_sqrt * extra_bits + 0.5; - - - if (cost < *inter_cost) { - cur_cu->inter.mv_dir = 3; - - cur_cu->inter.mv_ref[0] = merge_cand[i].ref[0]; - cur_cu->inter.mv_ref[1] = merge_cand[j].ref[1]; - - cur_cu->inter.mv[0][0] = merge_cand[i].mv[0][0]; - cur_cu->inter.mv[0][1] = merge_cand[i].mv[0][1]; - cur_cu->inter.mv[1][0] = merge_cand[j].mv[1][0]; - cur_cu->inter.mv[1][1] = merge_cand[j].mv[1][1]; - cur_cu->merged = 0; - - // Check every candidate to find a match - for (int merge_idx = 0; merge_idx < info.num_merge_cand; merge_idx++) { - if (merge_cand[merge_idx].mv[0][0] == cur_cu->inter.mv[0][0] && - merge_cand[merge_idx].mv[0][1] == cur_cu->inter.mv[0][1] && - merge_cand[merge_idx].mv[1][0] == cur_cu->inter.mv[1][0] && - merge_cand[merge_idx].mv[1][1] == cur_cu->inter.mv[1][1] && - merge_cand[merge_idx].ref[0] == cur_cu->inter.mv_ref[0] && - merge_cand[merge_idx].ref[1] == cur_cu->inter.mv_ref[1]) - { - cur_cu->merged = 1; - cur_cu->merge_idx = merge_idx; - break; - } - } - - // Each motion vector has its own candidate - for (int reflist = 0; reflist < 2; reflist++) { - cu_mv_cand = 0; - kvz_inter_get_mv_cand(state, x, y, width, height, info.mv_cand, cur_cu, lcu, reflist); - if (info.mv_cand[0][0] != info.mv_cand[1][0] || - info.mv_cand[0][1] != info.mv_cand[1][1]) - { - uint32_t (*mvd_coding_cost)(const encoder_state_t * const state, - vector2d_t *, - const cabac_data_t*) = - cfg->mv_rdo ? 
kvz_get_mvd_coding_cost_cabac : get_mvd_coding_cost; - - vector2d_t mvd_temp1, mvd_temp2; - int cand1_cost, cand2_cost; - - mvd_temp1.x = cur_cu->inter.mv[reflist][0] - info.mv_cand[0][0]; - mvd_temp1.y = cur_cu->inter.mv[reflist][1] - info.mv_cand[0][1]; - cand1_cost = mvd_coding_cost(state, &mvd_temp1, (cabac_data_t*)&state->cabac); - - mvd_temp2.x = cur_cu->inter.mv[reflist][0] - info.mv_cand[1][0]; - mvd_temp2.y = cur_cu->inter.mv[reflist][1] - info.mv_cand[1][1]; - cand2_cost = mvd_coding_cost(state, &mvd_temp2, (cabac_data_t*)&state->cabac); - - // Select candidate 1 if it has lower cost - if (cand2_cost < cand1_cost) { - cu_mv_cand = 1; - } - } - CU_SET_MV_CAND(cur_cu, reflist, cu_mv_cand); - } - - *inter_cost = cost; - *inter_bitcost = bitcost[0] + bitcost[1] + extra_bits; - } - } - } - } - FREE_POINTER(templcu); + search_pu_inter_bipred(&info, depth, lcu, cur_cu, inter_cost, inter_bitcost); } if (*inter_cost < INT_MAX && cur_cu->inter.mv_dir == 1) { @@ -1489,6 +1615,53 @@ } } +/** +* \brief Calculate inter coding cost for luma and chroma CBs (--rd=2 accuracy). +* +* Calculate inter coding cost of each CB. This should match the intra coding cost +* calculation that is used on this RDO accuracy, since CU type decision is based +* on this. +* +* The cost includes SSD distortion, transform unit tree bits and motion vector bits +* for both luma and chroma if enabled. +* +* \param state encoder state +* \param x x-coordinate of the CU +* \param y y-coordinate of the CU +* \param depth depth of the CU in the quadtree +* \param lcu containing LCU +* +* \param inter_cost Return inter cost +* \param inter_bitcost Return inter bitcost +*/ +void kvz_cu_cost_inter_rd2(encoder_state_t * const state, + int x, int y, int depth, + lcu_t *lcu, + double *inter_cost, + uint32_t *inter_bitcost){ + + cu_info_t *cur_cu = LCU_GET_CU_AT_PX(lcu, SUB_SCU(x), SUB_SCU(y)); + int tr_depth = MAX(1, depth); + if (cur_cu->part_size != SIZE_2Nx2N) { + tr_depth = depth + 1; + } + kvz_lcu_fill_trdepth(lcu, x, y, depth, tr_depth); + kvz_inter_recon_cu(state, lcu, x, y, CU_WIDTH_FROM_DEPTH(depth)); + + const bool reconstruct_chroma = state->encoder_control->chroma_format != KVZ_CSP_400; + kvz_quantize_lcu_residual(state, true, reconstruct_chroma, + x, y, depth, + NULL, + lcu); + + *inter_cost = kvz_cu_rd_cost_luma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu); + if (state->encoder_control->chroma_format != KVZ_CSP_400) { + *inter_cost += kvz_cu_rd_cost_chroma(state, SUB_SCU(x), SUB_SCU(y), depth, cur_cu, lcu); + } + + *inter_cost += *inter_bitcost * state->lambda; +} + /** * \brief Update CU to have best modes at this depth. @@ -1516,6 +1689,15 @@ lcu, inter_cost, inter_bitcost); + + // Calculate more accurate cost when needed + if (state->encoder_control->cfg.rdo >= 2) { + kvz_cu_cost_inter_rd2(state, + x, y, depth, + lcu, + inter_cost, + inter_bitcost); + } } @@ -1560,6 +1742,7 @@ cur_pu->type = CU_INTER; cur_pu->part_size = part_mode; cur_pu->depth = depth; + cur_pu->qp = state->qp; double cost = MAX_INT; uint32_t bitcost = MAX_INT; @@ -1584,4 +1767,28 @@ } } } + + // Calculate more accurate cost when needed + if (state->encoder_control->cfg.rdo >= 2) { + kvz_cu_cost_inter_rd2(state, + x, y, depth, + lcu, + inter_cost, + inter_bitcost); + } + + // Count bits spent for coding the partition mode. 
+ int smp_extra_bits = 1; // horizontal or vertical + if (state->encoder_control->cfg.amp_enable) { + smp_extra_bits += 1; // symmetric or asymmetric + if (part_mode != SIZE_2NxN && part_mode != SIZE_Nx2N) { + smp_extra_bits += 1; // U,L or D,R + } + } + // The transform is split for SMP and AMP blocks so we need more bits for + // coding the CBF. + smp_extra_bits += 6; + + *inter_cost += (state->encoder_control->cfg.rdo >= 2 ? state->lambda : state->lambda_sqrt) * smp_extra_bits; + *inter_bitcost += smp_extra_bits; }
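Note on the new --me=dia path above: diamond_search() walks a four-point diamond around the current best vector, skips the direction it arrived from, and stops either when the center turns out to be the local minimum or when the --me-steps budget runs out. The stand-alone sketch below restates only that control flow; the vec2 type and the cost callback are illustrative stand-ins, and the real function additionally seeds the search from merge candidates and applies early termination exactly as shown in the diff.

#include <stdint.h>

typedef struct { int x, y; } vec2;
typedef uint32_t (*cost_fn)(vec2 mv, void *ctx);

/* Simplified diamond walk: returns the best motion vector found. */
static vec2 diamond_walk_sketch(vec2 mv, uint32_t max_steps,
                                cost_fn cost, void *ctx)
{
  /* Up, right, down, left; opposite[i] is the geometric opposite of i,
   * i.e. the point that was the previous center and needs no re-check. */
  static const vec2 offsets[4]  = { {0, -1}, {1, 0}, {0, 1}, {-1, 0} };
  static const int  opposite[4] = { 2, 3, 0, 1 };

  uint32_t best_cost = cost(mv, ctx);
  int from_dir = -1;  /* no arrival direction on the first iteration */

  for (uint32_t step = 0; step < max_steps; ++step) {
    int best_dir = -1;

    for (int i = 0; i < 4; ++i) {
      if (i == from_dir) continue;  /* evaluated already as the old center */
      vec2 cand = { mv.x + offsets[i].x, mv.y + offsets[i].y };
      uint32_t c = cost(cand, ctx);
      if (c < best_cost) {
        best_cost = c;
        best_dir = i;
      }
    }

    if (best_dir < 0) break;        /* center is the local minimum, done */

    mv.x += offsets[best_dir].x;    /* move the center to the best match */
    mv.y += offsets[best_dir].y;
    from_dir = opposite[best_dir];
  }
  return mv;
}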
View file
kvazaar-1.2.0.tar.gz/src/search_inter.h -> kvazaar-1.3.0.tar.gz/src/search_inter.h
Changed
@@ -32,17 +32,19 @@ #include "inter.h" #include "kvazaar.h" -#define FILTER_SIZE 8 -#define HALF_FILTER (FILTER_SIZE>>1) +#define KVZ_LUMA_FILTER_TAPS 8 +#define KVZ_LUMA_FILTER_OFFSET 3 +#define KVZ_CHROMA_FILTER_TAPS 4 +#define KVZ_CHROMA_FILTER_OFFSET 1 -// Maximum extra width a block needs to filter -// a fractional pixel with positive fractional mv.x and mv.y -#define KVZ_EXT_PADDING (FILTER_SIZE - 1) + // Maximum extra width a block needs to filter + // a fractional pixel with positive fractional mv.x and mv.y +#define KVZ_EXT_PADDING_LUMA (KVZ_LUMA_FILTER_TAPS - 1) +#define KVZ_EXT_PADDING_CHROMA (KVZ_CHROMA_FILTER_TAPS - 1) -// Maximum block width for extended block -#define KVZ_EXT_BLOCK_W (LCU_WIDTH + KVZ_EXT_PADDING) - -typedef kvz_pixel frac_search_block[(LCU_WIDTH + 1) * (LCU_WIDTH + 1)]; + // Maximum block width for extended block +#define KVZ_EXT_BLOCK_W_LUMA (LCU_WIDTH + KVZ_EXT_PADDING_LUMA) +#define KVZ_EXT_BLOCK_W_CHROMA (LCU_WIDTH_C + KVZ_EXT_PADDING_CHROMA) enum hpel_position { HPEL_POS_HOR = 0,
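For reference, the renamed constants above encode a simple margin rule: an N-tap interpolation filter needs N - 1 extra integer samples per dimension (KVZ_LUMA_FILTER_OFFSET of them above and to the left, the remaining taps - 1 - offset below and to the right). A minimal worked example, assuming kvazaar's LCU_WIDTH of 64 and LCU_WIDTH_C of 32:

#include <assert.h>

#define LCU_WIDTH                64
#define LCU_WIDTH_C              (LCU_WIDTH / 2)
#define KVZ_LUMA_FILTER_TAPS     8
#define KVZ_LUMA_FILTER_OFFSET   3
#define KVZ_CHROMA_FILTER_TAPS   4
#define KVZ_CHROMA_FILTER_OFFSET 1
#define KVZ_EXT_PADDING_LUMA     (KVZ_LUMA_FILTER_TAPS - 1)
#define KVZ_EXT_PADDING_CHROMA   (KVZ_CHROMA_FILTER_TAPS - 1)
#define KVZ_EXT_BLOCK_W_LUMA     (LCU_WIDTH + KVZ_EXT_PADDING_LUMA)
#define KVZ_EXT_BLOCK_W_CHROMA   (LCU_WIDTH_C + KVZ_EXT_PADDING_CHROMA)

int main(void)
{
  /* 8-tap luma: 3 samples before + 4 after = 7 extra columns/rows. */
  assert(KVZ_EXT_BLOCK_W_LUMA   == 64 + 7);  /* 71 */
  /* 4-tap chroma: 1 before + 2 after = 3 extra columns/rows. */
  assert(KVZ_EXT_BLOCK_W_CHROMA == 32 + 3);  /* 35 */
  return 0;
}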
View file
kvazaar-1.2.0.tar.gz/src/search_intra.c -> kvazaar-1.3.0.tar.gz/src/search_intra.c
Changed
@@ -42,29 +42,6 @@ /** - * \brief Sort modes and costs to ascending order according to costs. - */ -static INLINE void sort_modes(int8_t *__restrict modes, double *__restrict costs, uint8_t length) -{ - // Length is always between 5 and 23, and is either 21, 17, 9 or 8 about - // 60% of the time, so there should be no need for anything more complex - // than insertion sort. - for (uint8_t i = 1; i < length; ++i) { - const double cur_cost = costs[i]; - const int8_t cur_mode = modes[i]; - uint8_t j = i; - while (j > 0 && cur_cost < costs[j - 1]) { - costs[j] = costs[j - 1]; - modes[j] = modes[j - 1]; - --j; - } - costs[j] = cur_cost; - modes[j] = cur_mode; - } -} - - -/** * \brief Select mode with the smallest cost. */ static INLINE uint8_t select_best_mode_index(const int8_t *modes, const double *costs, uint8_t length) @@ -309,7 +286,7 @@ if (depth == 0 || split_cost < nosplit_cost) { return split_cost; } else { - kvz_lcu_set_trdepth(lcu, x_px, y_px, depth, depth); + kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); pred_cu->cbf = nosplit_cbf; @@ -367,7 +344,7 @@ costs[i] += satd_func(pred, orig_block); } - sort_modes(modes, costs, 5); + kvz_sort_modes(modes, costs, 5); } @@ -617,12 +594,21 @@ FILL(pred_cu.cbf, 0); // Reset transform split data in lcu.cu for this area. - kvz_lcu_set_trdepth(lcu, x_px, y_px, depth, depth); + kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); double mode_cost = search_intra_trdepth(state, x_px, y_px, depth, tr_depth, modes[rdo_mode], MAX_INT, &pred_cu, lcu); costs[rdo_mode] += mode_cost; + + // Early termination if no coefficients has to be coded + if (state->encoder_control->cfg.intra_rdo_et && !cbf_is_set_any(pred_cu.cbf, depth)) { + modes_to_check = rdo_mode + 1; + break; + } } + // Update order according to new costs + kvz_sort_modes(modes, costs, modes_to_check); + // The best transform split hierarchy is not saved anywhere, so to get the // transform split hierarchy the search has to be performed again with the // best mode. @@ -844,8 +830,7 @@ } // Set transform depth to current depth, meaning no transform splits. - kvz_lcu_set_trdepth(lcu, x_px, y_px, depth, depth); - double best_rough_cost = costs[select_best_mode_index(modes, costs, number_of_modes)]; + kvz_lcu_fill_trdepth(lcu, x_px, y_px, depth, depth); // Refine results with slower search or get some results if rough search was skipped. const int32_t rdo_level = state->encoder_control->cfg.rdo; if (rdo_level >= 2 || skip_rough_search) { @@ -860,7 +845,7 @@ } int num_modes_to_check = MIN(number_of_modes, number_of_modes_to_search); - sort_modes(modes, costs, number_of_modes); + kvz_sort_modes(modes, costs, number_of_modes); number_of_modes = search_intra_rdo(state, x_px, y_px, depth, ref_pixels, LCU_WIDTH, @@ -872,5 +857,5 @@ uint8_t best_mode_i = select_best_mode_index(modes, costs, number_of_modes); *mode_out = modes[best_mode_i]; - *cost_out = skip_rough_search ? costs[best_mode_i]:best_rough_cost; + *cost_out = costs[best_mode_i]; }
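The static sort_modes() removed above is not lost: the calls now go through a shared kvz_sort_modes() helper, which also lets the RDO loop re-sort the candidates after the new --intra-rdo-et early termination. For reference, the removed insertion sort in stand-alone form (the shared helper is assumed to behave the same way):

#include <stdint.h>

/* Sort modes and costs to ascending order according to costs. */
static void sort_modes_by_cost(int8_t *modes, double *costs, uint8_t length)
{
  /* The arrays hold at most about 23 entries, so insertion sort is
   * sufficient (same rationale as the original comment). */
  for (uint8_t i = 1; i < length; ++i) {
    const double cur_cost = costs[i];
    const int8_t cur_mode = modes[i];
    uint8_t j = i;
    while (j > 0 && cur_cost < costs[j - 1]) {
      costs[j] = costs[j - 1];
      modes[j] = modes[j - 1];
      --j;
    }
    costs[j] = cur_cost;
    modes[j] = cur_mode;
  }
}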
View file
kvazaar-1.3.0.tar.gz/src/strategies/avx2/avx2_common_functions.h
Added
@@ -0,0 +1,114 @@ +#ifndef AVX2_COMMON_FUNCTIONS_H +#define AVX2_COMMON_FUNCTIONS_H + +#include <immintrin.h> + +/* + * Reorder coefficients from raster to scan order + * Fun fact: Once upon a time, doing this in a loop looked like this: + * for (int32_t n = 0; n < width * height; n++) { + * coef_reord[n] = coef[scan[n]]; + * q_coef_reord[n] = q_coef[scan[n]]; + * } + */ +static INLINE void scanord_read_vector(const int16_t **__restrict coeffs, const uint32_t *__restrict scan, int8_t scan_mode, int32_t subpos, int32_t width, __m256i *result_vecs, const int n_bufs) +{ + // For vectorized reordering of coef and q_coef + const __m128i low128_shuffle_masks[3] = { + _mm_setr_epi8(10,11, 4, 5, 12,13, 0, 1, 6, 7, 14,15, 8, 9, 2, 3), + _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15), + _mm_setr_epi8( 4, 5, 6, 7, 0, 1, 2, 3, 12,13, 14,15, 8, 9, 10,11), + }; + + const __m128i blend_masks[3] = { + _mm_setr_epi16( 0, 0, 0, -1, 0, 0, -1, -1), + _mm_setr_epi16( 0, 0, 0, 0, 0, 0, 0, 0), + _mm_setr_epi16( 0, 0, -1, -1, 0, 0, -1, -1), + }; + + const __m128i invec_rearr_masks_upper[3] = { + _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 6, 7, 10,11, 4, 5, 12,13, 14,15), + _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15), + _mm_setr_epi8( 0, 1, 8, 9, 4, 5, 12,13, 2, 3, 10,11, 6, 7, 14,15), + }; + + const __m128i invec_rearr_masks_lower[3] = { + _mm_setr_epi8(12,13, 6, 7, 0, 1, 2, 3, 14,15, 4, 5, 8, 9, 10,11), + _mm_setr_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12,13, 14,15), + _mm_setr_epi8( 4, 5, 12,13, 0, 1, 8, 9, 6, 7, 14,15, 2, 3, 10,11), + }; + + const size_t row_offsets[4] = { + scan[subpos] + width * 0, + scan[subpos] + width * 1, + scan[subpos] + width * 2, + scan[subpos] + width * 3, + }; + + for (int i = 0; i < n_bufs; i++) { + const int16_t *__restrict coeff = coeffs[i]; + + // NOTE: Upper means "higher in pixel order inside block", which implies + // lower addresses (note the difference: HIGH and LOW vs UPPER and LOWER), + // so upper 128b vector actually becomes the lower part of a 256-bit coeff + // vector and lower vector the higher part! + __m128d coeffs_d_upper; + __m128d coeffs_d_lower; + + __m128i coeffs_upper; + __m128i coeffs_lower; + + __m128i coeffs_rearr1_upper; + __m128i coeffs_rearr1_lower; + + __m128i coeffs_rearr2_upper; + __m128i coeffs_rearr2_lower; + + // Zeroing these is actually unnecessary, but the compiler will whine + // about uninitialized values otherwise + coeffs_d_upper = _mm_setzero_pd(); + coeffs_d_lower = _mm_setzero_pd(); + + coeffs_d_upper = _mm_loadl_pd(coeffs_d_upper, (double *)(coeff + row_offsets[0])); + coeffs_d_upper = _mm_loadh_pd(coeffs_d_upper, (double *)(coeff + row_offsets[1])); + + coeffs_d_lower = _mm_loadl_pd(coeffs_d_lower, (double *)(coeff + row_offsets[2])); + coeffs_d_lower = _mm_loadh_pd(coeffs_d_lower, (double *)(coeff + row_offsets[3])); + + coeffs_upper = _mm_castpd_si128(coeffs_d_upper); + coeffs_lower = _mm_castpd_si128(coeffs_d_lower); + + coeffs_lower = _mm_shuffle_epi8(coeffs_lower, low128_shuffle_masks[scan_mode]); + + coeffs_rearr1_upper = _mm_blendv_epi8(coeffs_upper, coeffs_lower, blend_masks[scan_mode]); + coeffs_rearr1_lower = _mm_blendv_epi8(coeffs_lower, coeffs_upper, blend_masks[scan_mode]); + + coeffs_rearr2_upper = _mm_shuffle_epi8(coeffs_rearr1_upper, invec_rearr_masks_upper[scan_mode]); + coeffs_rearr2_lower = _mm_shuffle_epi8(coeffs_rearr1_lower, invec_rearr_masks_lower[scan_mode]); + + // The Intel Intrinsics Guide talks about _mm256_setr_m128i but my headers + // lack such an instruction. 
What it does is essentially this anyway. + result_vecs[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(coeffs_rearr2_upper), + coeffs_rearr2_lower, + 1); + } +} + +// If ints is completely zero, returns 16 in *first and -1 in *last +static INLINE void get_first_last_nz_int16(__m256i ints, int32_t *first, int32_t *last) +{ + // Note that nonzero_bytes will always have both bytes set for a set word + // even if said word only had one of its bytes set, because we're doing 16 + // bit wide comparisons. No big deal, just shift results to the right by one + // bit to have the results represent indexes of first set words, not bytes. + // Another note, it has to use right shift instead of division to preserve + // behavior on an all-zero vector (-1 / 2 == 0, but -1 >> 1 == -1) + const __m256i zero = _mm256_setzero_si256(); + + __m256i zeros = _mm256_cmpeq_epi16(ints, zero); + uint32_t nonzero_bytes = ~((uint32_t)_mm256_movemask_epi8(zeros)); + *first = ( (int32_t)_tzcnt_u32(nonzero_bytes)) >> 1; + *last = (31 - (int32_t)_lzcnt_u32(nonzero_bytes)) >> 1; +} + +#endif
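A plain-C reference for get_first_last_nz_int16() above is useful when unit-testing the vector path. The sketch below is an illustration, not part of the patch; it follows the documented convention of returning first = 16 and last = -1 for an all-zero input.

#include <stdint.h>

static void first_last_nz_ref(const int16_t v[16], int32_t *first, int32_t *last)
{
  *first = 16;   /* "not found" per the convention above */
  *last  = -1;
  for (int32_t i = 0; i < 16; ++i) {
    if (v[i] != 0) {
      if (*first == 16) *first = i;  /* remember only the first hit */
      *last = i;                     /* keep updating until the end */
    }
  }
}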
View file
kvazaar-1.3.0.tar.gz/src/strategies/avx2/encode_coding_tree-avx2.c
Added
@@ -0,0 +1,605 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "strategyselector.h" + +#if COMPILE_INTEL_AVX2 +#include "avx2_common_functions.h" +#include "cabac.h" +#include "context.h" +#include "encode_coding_tree-avx2.h" +#include "encode_coding_tree.h" +#include "strategies/missing-intel-intrinsics.h" +#include <immintrin.h> + +/* + * NOTE: Unlike SSE/AVX comparisons that would return 11 or 00 for gt/lte, + * this'll use 1x and 0x as bit patterns (x: garbage). A couple extra + * instructions will get you 11 and 00 if you need to use this as a mask + * somewhere at some point, but we don't need this right now. + * + * I'd love to draw a logic circuit here to describe this, but I can't. Two + * 2-bit uints can be compared for greaterness by first comparing their high + * bits using AND-NOT; (x AND (NOT y)) == 1 if x > y. If A_hi > B_hi, A > B. + * If A_hi == B_hi AND A_lo > B_lo, A > B. Otherwise, A <= B. It's really + * simple when drawn on paper, but quite messy on a general-purpose ALU. But + * look, just five instructions! 
+ */ +static INLINE uint32_t u32vec_cmpgt_epu2(uint32_t a, uint32_t b) +{ + uint32_t a_gt_b = _andn_u32(b, a); + uint32_t a_ne_b = a ^ b; + uint32_t a_gt_b_sh = a_gt_b << 1; + uint32_t lobit_tiebrk_hi = _andn_u32(a_ne_b, a_gt_b_sh); + uint32_t res = a_gt_b | lobit_tiebrk_hi; + return res; +} + +static INLINE uint32_t pack_16x16b_to_16x2b(__m256i src) +{ + /* + * For each 16-bit element in src: + * ABCD EFGH IJKL MNOP Original elements + * 0000 0000 0000 00XY Element clipped to [0, 3] using _mm256_min_epu16 + * 0000 000X Y000 0000 Shift word to align LSBs across byte boundary + * 0000 0001 1000 0000 Comparison mask to be compared against + * XXXX XXXX YYYY YYYY Comparison result, for movemask + */ + const __m256i threes = _mm256_set1_epi16 (3); + const __m256i cmpmask = _mm256_slli_epi16 (threes, 7); // 0x0180 (avoid set1) + + __m256i clipped = _mm256_min_epu16 (src, threes); + __m256i shifted = _mm256_slli_epi16 (clipped, 7); + __m256i cmpres = _mm256_cmpeq_epi8 (shifted, cmpmask); + uint32_t result = _mm256_movemask_epi8(cmpres); + + return result; +} + +/** + * \brief Context derivation process of coeff_abs_significant_flag, + * parallelized to handle 16 coeffs at once + * \param pattern_sig_ctx pattern for current coefficient group + * \param scan_idx pixel scan type in use + * \param pos_xs column addresses of current scan positions + * \param pos_ys row addresses of current scan positions + * \param block_type log2 value of block size if square block, or 4 otherwise + * \param width width of the block + * \param texture_type texture type (TEXT_LUMA...) + * \returns ctx_inc for current scan position + */ +static INLINE __m256i kvz_context_get_sig_ctx_inc_16x16b(int32_t pattern_sig_ctx, uint32_t scan_idx, __m256i pos_xs, + __m256i pos_ys, int32_t block_type, int8_t texture_type) +{ + const __m256i zero = _mm256_set1_epi8(0); + const __m256i ff = _mm256_set1_epi8(0xff); + + const __m256i ones = _mm256_set1_epi16(1); + const __m256i twos = _mm256_set1_epi16(2); + const __m256i threes = _mm256_set1_epi16(3); + + const __m256i ctx_ind_map[3] = { + _mm256_setr_epi16( + 0, 2, 1, 6, + 3, 4, 7, 6, + 4, 5, 7, 8, + 5, 8, 8, 8 + ), + _mm256_setr_epi16( + 0, 1, 4, 5, + 2, 3, 4, 5, + 6, 6, 8, 8, + 7, 7, 8, 8 + ), + _mm256_setr_epi16( + 0, 2, 6, 7, + 1, 3, 6, 7, + 4, 4, 8, 8, + 5, 5, 8, 8 + ), + }; + + int16_t offset; + if (block_type == 3) + if (scan_idx == SCAN_DIAG) + offset = 9; + else + offset = 15; + else + if (texture_type == 0) + offset = 21; + else + offset = 12; + + __m256i offsets = _mm256_set1_epi16(offset); + + // This will only ever be compared to 0, 1 and 2, so it's fine to cast down + // to 16b (and it should never be above 3 anyways) + __m256i pattern_sig_ctxs = _mm256_set1_epi16((int16_t)(MIN(0xffff, pattern_sig_ctx))); + __m256i pattern_sig_ctxs_eq_zero = _mm256_cmpeq_epi16(pattern_sig_ctxs, zero); + __m256i pattern_sig_ctxs_eq_one = _mm256_cmpeq_epi16(pattern_sig_ctxs, ones); + __m256i pattern_sig_ctxs_eq_two = _mm256_cmpeq_epi16(pattern_sig_ctxs, twos); + + __m256i pattern_sig_ctxs_eq_1or2 = _mm256_or_si256 (pattern_sig_ctxs_eq_one, + pattern_sig_ctxs_eq_two); + __m256i pattern_sig_ctxs_lt3 = _mm256_or_si256 (pattern_sig_ctxs_eq_1or2, + pattern_sig_ctxs_eq_zero); + __m256i pattern_sig_ctxs_other = _mm256_xor_si256(pattern_sig_ctxs_lt3, + ff); + __m256i x_plus_y = _mm256_add_epi16 (pos_xs, pos_ys); + __m256i x_plus_y_zero = _mm256_cmpeq_epi16(x_plus_y, zero); // All these should be 0, preempts block_type_two rule + + __m256i texture_types = 
_mm256_set1_epi16((int16_t)texture_type); + + __m256i block_types = _mm256_set1_epi16((int16_t)block_type); + __m256i block_type_two = _mm256_cmpeq_epi16(block_types, twos); // All these should be ctx_ind_map[4 * pos_y + pos_x]; + __m256i bt2_vals = ctx_ind_map[scan_idx]; + __m256i bt2_vals_masked = _mm256_and_si256(bt2_vals, block_type_two); + + __m256i pos_xs_in_subset = _mm256_and_si256(pos_xs, threes); + __m256i pos_ys_in_subset = _mm256_and_si256(pos_ys, threes); + + __m256i cg_pos_xs = _mm256_srli_epi16(pos_xs, 2); + __m256i cg_pos_ys = _mm256_srli_epi16(pos_ys, 2); + __m256i cg_pos_xysums = _mm256_add_epi16 (cg_pos_xs, cg_pos_ys); + + __m256i pos_xy_sums_in_subset = _mm256_add_epi16(pos_xs_in_subset, pos_ys_in_subset); + + /* + * if (pattern_sig_ctx == 0) { + * switch (pos_x_in_subset + pos_y_in_subset) { + * case 0: + * cnt = 2; + * break; + * case 1: + * case 2: + * cnt = 1; + * break; + * default: + * cnt = 0; + * } + * } + * + * Equivalent to: + * + * if (pattern_sig_ctx == 0) { + * subamt = cnt <= 1 ? 1 : 0; + * pxyis_max3 = min(3, pos_x_in_subset + pos_y_in_subset); + * cnt = (3 - pxyis_max3) - subamt; + * } + */ + __m256i pxyis_lte_1 = _mm256_cmpgt_epi16(twos, pos_xy_sums_in_subset); + __m256i subamts = _mm256_and_si256 (pxyis_lte_1, ones); + __m256i pxyis_max3 = _mm256_min_epu16 (pos_xy_sums_in_subset, threes); + __m256i cnts_tmp = _mm256_sub_epi16 (threes, pxyis_max3); + __m256i cnts_sig_ctx_0 = _mm256_sub_epi16 (cnts_tmp, subamts); + __m256i cnts_sc0_masked = _mm256_and_si256 (cnts_sig_ctx_0, pattern_sig_ctxs_eq_zero); + + /* + * if (pattern_sig_ctx == 1 || pattern_sig_ctx == 2) { + * if (pattern_sig_ctx == 1) + * subtrahend = pos_y_in_subset; + * else + * subtrahend = pos_x_in_subset; + * cnt = 2 - min(2, subtrahend); + * } + */ + __m256i pos_operands_ctx_1or2 = _mm256_blendv_epi8(pos_ys_in_subset, + pos_xs_in_subset, + pattern_sig_ctxs_eq_two); + + __m256i pos_operands_max2 = _mm256_min_epu16 (pos_operands_ctx_1or2, twos); + __m256i cnts_sig_ctx_1or2 = _mm256_sub_epi16 (twos, pos_operands_max2); + __m256i cnts_sc12_masked = _mm256_and_si256 (cnts_sig_ctx_1or2, pattern_sig_ctxs_eq_1or2); + + /* + * if (pattern_sig_ctx > 2) + * cnt = 2; + */ + __m256i cnts_scother_masked = _mm256_and_si256(twos, pattern_sig_ctxs_other); + + // Select correct count + __m256i cnts_sc012_masked = _mm256_or_si256 (cnts_sc0_masked, cnts_sc12_masked); + __m256i cnts = _mm256_or_si256 (cnts_scother_masked, cnts_sc012_masked); + + // Compute final values + __m256i textype_eq_0 = _mm256_cmpeq_epi16(texture_types, zero); + __m256i cg_pos_sums_gt_0 = _mm256_cmpgt_epi16(cg_pos_xysums, zero); + __m256i tmpcond = _mm256_and_si256 (textype_eq_0, cg_pos_sums_gt_0); + __m256i tmp = _mm256_and_si256 (tmpcond, threes); + __m256i tmp_with_offsets = _mm256_add_epi16 (tmp, offsets); + __m256i rv_noshortcirc = _mm256_add_epi16 (cnts, tmp_with_offsets); + + // Ol' sprite mask method works here! 
+ __m256i rv1 = _mm256_andnot_si256(block_type_two, rv_noshortcirc); + __m256i rv2 = _mm256_or_si256 (rv1, bt2_vals_masked); + __m256i rv = _mm256_andnot_si256(x_plus_y_zero, rv2); + return rv; +} + +void kvz_encode_coeff_nxn_avx2(encoder_state_t * const state, + cabac_data_t * const cabac, + const coeff_t *coeff, + uint8_t width, + uint8_t type, + int8_t scan_mode, + int8_t tr_skip) +{ + const encoder_control_t * const encoder = state->encoder_control; + int c1 = 1; + uint8_t last_coeff_x = 0; + uint8_t last_coeff_y = 0; + int32_t i; + uint32_t sig_coeffgroup_nzs[8 * 8] = { 0 }; + + int8_t be_valid = encoder->cfg.signhide_enable; + int32_t scan_pos_sig; + uint32_t go_rice_param = 0; + uint32_t ctx_sig; + + // CONSTANTS + const uint32_t num_blk_side = width >> TR_MIN_LOG2_SIZE; + const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2; + const uint32_t *scan = + kvz_g_sig_last_scan[scan_mode][log2_block_size - 1]; + const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 2][scan_mode]; + const uint32_t num_blocks = num_blk_side * num_blk_side; + + const __m256i zero = _mm256_set1_epi8(0); + const __m256i ones = _mm256_set1_epi16(1); + const __m256i twos = _mm256_set1_epi16(2); + + // Init base contexts according to block type + cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.cu_sig_coeff_group_model[type]); + cabac_ctx_t *baseCtx = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0]) : + &(cabac->ctx.cu_sig_model_chroma[0]); + + // Scan all coeff groups to find out which of them have coeffs. + // Populate sig_coeffgroup_nzs with that info. + + // NOTE: Modified the functionality a bit, sig_coeffgroup_flag used to be + // 1 if true and 0 if false, now it's "undefined but nonzero" if true and + // 0 if false (not actually undefined, it's a bitmask representing the + // significant coefficients' position in the group which in itself could + // be useful information) + int32_t scan_cg_last = -1; + + for (int32_t i = 0; i < num_blocks; i++) { + const uint32_t cg_id = scan_cg[i]; + const uint32_t n_xbits = log2_block_size - 2; // How many lowest bits of scan_cg represent X coord + const uint32_t cg_x = cg_id & ((1 << n_xbits) - 1); + const uint32_t cg_y = cg_id >> n_xbits; + + const uint32_t cg_pos = cg_y * width * 4 + cg_x * 4; + const uint32_t cg_pos_y = (cg_pos >> log2_block_size) >> TR_MIN_LOG2_SIZE; + const uint32_t cg_pos_x = (cg_pos & (width - 1)) >> TR_MIN_LOG2_SIZE; + const uint32_t idx = cg_pos_x + cg_pos_y * num_blk_side; + + __m128d coeffs_d_upper = _mm_setzero_pd(); + __m128d coeffs_d_lower = _mm_setzero_pd(); + __m128i coeffs_upper; + __m128i coeffs_lower; + __m256i cur_coeffs; + + coeffs_d_upper = _mm_loadl_pd(coeffs_d_upper, (double *)(coeff + cg_pos + 0 * width)); + coeffs_d_upper = _mm_loadh_pd(coeffs_d_upper, (double *)(coeff + cg_pos + 1 * width)); + coeffs_d_lower = _mm_loadl_pd(coeffs_d_lower, (double *)(coeff + cg_pos + 2 * width)); + coeffs_d_lower = _mm_loadh_pd(coeffs_d_lower, (double *)(coeff + cg_pos + 3 * width)); + + coeffs_upper = _mm_castpd_si128(coeffs_d_upper); + coeffs_lower = _mm_castpd_si128(coeffs_d_lower); + + cur_coeffs = _mm256_insertf128_si256(_mm256_castsi128_si256(coeffs_upper), + coeffs_lower, + 1); + + __m256i coeffs_zero = _mm256_cmpeq_epi16(cur_coeffs, zero); + + uint32_t nz_coeffs_2b = ~((uint32_t)_mm256_movemask_epi8(coeffs_zero)); + sig_coeffgroup_nzs[idx] = nz_coeffs_2b; + + if (nz_coeffs_2b) + scan_cg_last = i; + } + // Rest of the code assumes at least one non-zero coeff. 
+ assert(scan_cg_last >= 0); + + ALIGNED(64) int16_t coeff_reord[LCU_WIDTH * LCU_WIDTH]; + uint32_t pos_last, scan_pos_last; + + { + __m256i coeffs_r; + for (int32_t i = 0; i <= scan_cg_last; i++) { + int32_t subpos = i * 16; + scanord_read_vector(&coeff, scan, scan_mode, subpos, width, &coeffs_r, 1); + _mm256_store_si256((__m256i *)(coeff_reord + subpos), coeffs_r); + } + + // Find the last coeff by going backwards in scan order. With cmpeq_epi16 + // and movemask, we can generate a dword with 16 2-bit masks that are 11 + // for zero words in the coeff vector, and 00 for nonzero words. By + // inverting the bits and counting leading zeros, we can determine the + // number of zero bytes in the vector counting from high to low memory + // addresses; subtract that from 31 and divide by 2 to get the offset of + // the last nonzero word. + uint32_t baseaddr = scan_cg_last * 16; + __m256i cur_coeffs_zeros = _mm256_cmpeq_epi16(coeffs_r, zero); + uint32_t nz_bytes = ~(_mm256_movemask_epi8(cur_coeffs_zeros)); + scan_pos_last = baseaddr + ((31 - _lzcnt_u32(nz_bytes)) >> 1); + pos_last = scan[scan_pos_last]; + } + + // transform skip flag + if(width == 4 && encoder->cfg.trskip_enable) { + cabac->cur_ctx = (type == 0) ? &(cabac->ctx.transform_skip_model_luma) : &(cabac->ctx.transform_skip_model_chroma); + CABAC_BIN(cabac, tr_skip, "transform_skip_flag"); + } + + last_coeff_x = pos_last & (width - 1); + last_coeff_y = (uint8_t)(pos_last >> log2_block_size); + + // Code last_coeff_x and last_coeff_y + kvz_encode_last_significant_xy(cabac, + last_coeff_x, + last_coeff_y, + width, + width, + type, + scan_mode); + + scan_pos_sig = scan_pos_last; + + ALIGNED(64) uint16_t abs_coeff[16]; + ALIGNED(32) uint16_t abs_coeff_buf_sb[16]; + ALIGNED(32) int16_t pos_ys_buf[16]; + ALIGNED(32) int16_t pos_xs_buf[16]; + ALIGNED(32) int16_t ctx_sig_buf[16]; + + abs_coeff[0] = abs(coeff[pos_last]); + uint32_t coeff_signs = (coeff[pos_last] < 0); + int32_t num_non_zero = 1; + int32_t last_nz_pos_in_cg = scan_pos_sig; + int32_t first_nz_pos_in_cg = scan_pos_sig; + scan_pos_sig--; + + // significant_coeff_flag + for (i = scan_cg_last; i >= 0; i--) { + int32_t sub_pos = i << 4; // LOG2_SCAN_SET_SIZE; + int32_t cg_blk_pos = scan_cg[i]; + int32_t cg_pos_y = cg_blk_pos / num_blk_side; + int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * num_blk_side); + + go_rice_param = 0; + + if (i == scan_cg_last || i == 0) { + sig_coeffgroup_nzs[cg_blk_pos] = 1; + } else { + uint32_t sig_coeff_group = (sig_coeffgroup_nzs[cg_blk_pos] != 0); + uint32_t ctx_sig = kvz_context_get_sig_coeff_group(sig_coeffgroup_nzs, cg_pos_x, + cg_pos_y, width); + cabac->cur_ctx = &base_coeff_group_ctx[ctx_sig]; + CABAC_BIN(cabac, sig_coeff_group, "coded_sub_block_flag"); + } + + if (sig_coeffgroup_nzs[cg_blk_pos]) { + int32_t pattern_sig_ctx = kvz_context_calc_pattern_sig_ctx(sig_coeffgroup_nzs, + cg_pos_x, cg_pos_y, width); + + // A mask with the first 16-bit word unmasked (bits set ie. 
0xffff) + const __m256i coeff_pos_zero = _mm256_castsi128_si256(_mm_cvtsi32_si128(0xffff)); + + const __m128i log2_block_size_128 = _mm_cvtsi32_si128(log2_block_size); + + __m256i coeffs = _mm256_load_si256((__m256i *)(coeff_reord + sub_pos)); + __m256i sigs_inv = _mm256_cmpeq_epi16(coeffs, zero); + __m256i is = _mm256_set1_epi16(i); + __m256i is_zero = _mm256_cmpeq_epi16(is, zero); + __m256i coeffs_negative = _mm256_cmpgt_epi16(zero, coeffs); + + __m256i masked_coeffs = _mm256_andnot_si256(sigs_inv, coeffs); + __m256i abs_coeffs = _mm256_abs_epi16(masked_coeffs); + + // TODO: obtain 16-bit block positions, maybe? :P + __m256i blk_poses_hi = _mm256_loadu_si256((__m256i *)(scan + sub_pos + 8)); + __m256i blk_poses_lo = _mm256_loadu_si256((__m256i *)(scan + sub_pos + 0)); + __m256i blk_poses_tmp = _mm256_packs_epi32(blk_poses_lo, blk_poses_hi); + __m256i blk_poses = _mm256_permute4x64_epi64(blk_poses_tmp, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i pos_ys = _mm256_srl_epi16(blk_poses, log2_block_size_128); + __m256i pos_xs = _mm256_sub_epi16(blk_poses, _mm256_sll_epi16(pos_ys, log2_block_size_128)); + + _mm256_store_si256((__m256i *)pos_ys_buf, pos_ys); + _mm256_store_si256((__m256i *)pos_xs_buf, pos_xs); + + __m256i encode_sig_coeff_flags_inv = _mm256_andnot_si256(is_zero, coeff_pos_zero); + + get_first_last_nz_int16(masked_coeffs, &first_nz_pos_in_cg, &last_nz_pos_in_cg); + _mm256_store_si256((__m256i *)abs_coeff_buf_sb, abs_coeffs); + + __m256i ctx_sigs = kvz_context_get_sig_ctx_inc_16x16b(pattern_sig_ctx, scan_mode, pos_xs, pos_ys, + log2_block_size, type); + + _mm256_store_si256((__m256i *)ctx_sig_buf, ctx_sigs); + + uint32_t esc_flags = ~(_mm256_movemask_epi8(encode_sig_coeff_flags_inv)); + uint32_t sigs = ~(_mm256_movemask_epi8(sigs_inv)); + uint32_t coeff_sign_buf = _mm256_movemask_epi8(coeffs_negative); + + for (; scan_pos_sig >= sub_pos; scan_pos_sig--) { + uint32_t id = scan_pos_sig - sub_pos; + uint32_t shift = (id << 1) + 1; + + uint32_t curr_sig = (sigs >> shift) & 1; + uint32_t curr_esc_flag = (esc_flags >> shift) & 1; + uint32_t curr_coeff_sign = (coeff_sign_buf >> shift) & 1; + + if (curr_esc_flag | num_non_zero) { + ctx_sig = ctx_sig_buf[id]; + cabac->cur_ctx = &baseCtx[ctx_sig]; + CABAC_BIN(cabac, curr_sig, "sig_coeff_flag"); + } + + if (curr_sig) { + abs_coeff[num_non_zero] = abs_coeff_buf_sb[id]; + coeff_signs = 2 * coeff_signs + curr_coeff_sign; + num_non_zero++; + } + } + } else { + scan_pos_sig = sub_pos - 1; + } + + if (num_non_zero > 0) { + bool sign_hidden = last_nz_pos_in_cg - first_nz_pos_in_cg >= 4 /* SBH_THRESHOLD */ + && !encoder->cfg.lossless; + uint32_t ctx_set = (i > 0 && type == 0) ? 2 : 0; + cabac_ctx_t *base_ctx_mod; + int32_t num_c1_flag, first_c2_flag_idx, idx; + + __m256i abs_coeffs = _mm256_load_si256((__m256i *)abs_coeff); + __m256i coeffs_gt1 = _mm256_cmpgt_epi16(abs_coeffs, ones); + __m256i coeffs_gt2 = _mm256_cmpgt_epi16(abs_coeffs, twos); + uint32_t coeffs_gt1_bits = _mm256_movemask_epi8(coeffs_gt1); + uint32_t coeffs_gt2_bits = _mm256_movemask_epi8(coeffs_gt2); + + if (c1 == 0) { + ctx_set++; + } + + base_ctx_mod = (type == 0) ? &(cabac->ctx.cu_one_model_luma[4 * ctx_set]) : + &(cabac->ctx.cu_one_model_chroma[4 * ctx_set]); + num_c1_flag = MIN(num_non_zero, C1FLAG_NUMBER); + first_c2_flag_idx = -1; + + + /* + * c1s_pattern is 16 base-4 numbers: 3, 3, 3, ... , 3, 2 (c1 will never + * be less than 0 or greater than 3, so two bits per iter are enough). 
+ * It's essentially the values that c1 will be for the next iteration as + * long as we have not encountered any >1 symbols. Count how long run of + * such symbols there is in the beginning of this CG, and zero all c1's + * that are located at or after the first >1 symbol. + */ + const uint32_t c1s_pattern = 0xfffffffe; + uint32_t n_nongt1_bits = _tzcnt_u32(coeffs_gt1_bits); + uint32_t c1s_nextiter = _bzhi_u32(c1s_pattern, n_nongt1_bits); + first_c2_flag_idx = n_nongt1_bits >> 1; + + c1 = 1; + for (idx = 0; idx < num_c1_flag; idx++) { + uint32_t shift = idx << 1; + uint32_t symbol = (coeffs_gt1_bits >> shift) & 1; + + cabac->cur_ctx = &base_ctx_mod[c1]; + CABAC_BIN(cabac, symbol, "coeff_abs_level_greater1_flag"); + + c1 = (c1s_nextiter >> shift) & 3; + } + + if (c1 == 0) { + base_ctx_mod = (type == 0) ? &(cabac->ctx.cu_abs_model_luma[ctx_set]) : + &(cabac->ctx.cu_abs_model_chroma[ctx_set]); + + if (first_c2_flag_idx != -1) { + uint32_t shift = (first_c2_flag_idx << 1) + 1; + uint8_t symbol = (coeffs_gt2_bits >> shift) & 1; + cabac->cur_ctx = &base_ctx_mod[0]; + + CABAC_BIN(cabac, symbol, "coeff_abs_level_greater2_flag"); + } + } + int32_t shiftamt = (be_valid && sign_hidden) ? 1 : 0; + int32_t nnz = num_non_zero - shiftamt; + coeff_signs >>= shiftamt; + if (!cabac->only_count) { + if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) { + coeff_signs ^= kvz_crypto_get_key(state->crypto_hdl, nnz); + } + } + CABAC_BINS_EP(cabac, coeff_signs, nnz, "coeff_sign_flag"); + + if (c1 == 0 || num_non_zero > C1FLAG_NUMBER) { + + const __m256i ones = _mm256_set1_epi16(1); + + __m256i abs_coeffs_gt1 = _mm256_cmpgt_epi16 (abs_coeffs, ones); + uint32_t acgt1_bits = _mm256_movemask_epi8(abs_coeffs_gt1); + uint32_t first_acgt1_bpos = _tzcnt_u32(acgt1_bits); + + uint32_t abs_coeffs_base4 = pack_16x16b_to_16x2b(abs_coeffs); + + const uint32_t ones_base4 = 0x55555555; + const uint32_t twos_base4 = 0xaaaaaaaa; + + const uint32_t c1flag_number_mask_inv = 0xffffffff << (C1FLAG_NUMBER << 1); + const uint32_t c1flag_number_mask = ~c1flag_number_mask_inv; + + // The addition will not overflow between 2-bit atoms because + // first_coeff2s will only be 1 or 0, and the other addend is 2 + uint32_t first_coeff2s = _bzhi_u32(ones_base4, first_acgt1_bpos + 2); + uint32_t base_levels = first_coeff2s + twos_base4; + + base_levels &= c1flag_number_mask; + base_levels |= (ones_base4 & c1flag_number_mask_inv); + + uint32_t encode_decisions = u32vec_cmpgt_epu2(base_levels, abs_coeffs_base4); + + for (idx = 0; idx < num_non_zero; idx++) { + + uint32_t shift = idx << 1; + uint32_t dont_encode_curr = (encode_decisions >> shift); + int16_t base_level = (base_levels >> shift) & 3; + + uint16_t curr_abs_coeff = abs_coeff[idx]; + + if (!(dont_encode_curr & 2)) { + uint16_t level_diff = curr_abs_coeff - base_level; + if (!cabac->only_count && (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFFS)) { + kvz_cabac_write_coeff_remain_encry(state, cabac, level_diff, go_rice_param, base_level); + } else { + kvz_cabac_write_coeff_remain(cabac, level_diff, go_rice_param); + } + + if (curr_abs_coeff > 3 * (1 << go_rice_param)) { + go_rice_param = MIN(go_rice_param + 1, 4); + } + } + + } + } + } + last_nz_pos_in_cg = -1; + first_nz_pos_in_cg = 16; + num_non_zero = 0; + coeff_signs = 0; + } +} +#endif // COMPILE_INTEL_AVX2 + +int kvz_strategy_register_encode_avx2(void* opaque, uint8_t bitdepth) +{ + bool success = true; + +#if COMPILE_INTEL_AVX2 + success &= kvz_strategyselector_register(opaque, "encode_coeff_nxn", "avx2", 40, 
&kvz_encode_coeff_nxn_avx2); +#endif + + return success; +}
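The packed 2-bit compare above (u32vec_cmpgt_epu2) is easy to get wrong, so a scalar cross-check is worth keeping around. In this sketch (names are illustrative, not from the patch) only the high bit of each 2-bit atom is meaningful, since the low bit of the vectorized result is documented as garbage.

#include <stdint.h>

/* Reference: for each of the sixteen 2-bit fields, set that field's high
 * bit iff a's field is strictly greater than b's. */
static uint32_t cmpgt_epu2_ref(uint32_t a, uint32_t b)
{
  uint32_t res = 0;
  for (int i = 0; i < 16; ++i) {
    uint32_t fa = (a >> (2 * i)) & 3;
    uint32_t fb = (b >> (2 * i)) & 3;
    if (fa > fb) {
      res |= 2u << (2 * i);   /* high bit of the 2-bit atom */
    }
  }
  return res;
}

/* Example check against the bit-twiddled version, comparing only the high
 * bit of every atom:
 *   ((u32vec_cmpgt_epu2(a, b) ^ cmpgt_epu2_ref(a, b)) & 0xAAAAAAAAu) == 0
 */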
View file
kvazaar-1.3.0.tar.gz/src/strategies/avx2/encode_coding_tree-avx2.h
Added
@@ -0,0 +1,42 @@ +#ifndef ENCODE_CODING_TREE_AVX2_H_ +#define ENCODE_CODING_TREE_AVX2_H_ + +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +/** + * \file + * Functions for writing the coding quadtree and related syntax. + */ + +#include "encoderstate.h" +#include "global.h" + +void kvz_encode_coeff_nxn_avx2(encoder_state_t * const state, + cabac_data_t * const cabac, + const coeff_t *coeff, + uint8_t width, + uint8_t type, + int8_t scan_mode, + int8_t tr_skip); + +int kvz_strategy_register_encode_avx2(void* opaque, uint8_t bitdepth); + +#endif // ENCODE_CODING_TREE_AVX2_H_
View file
kvazaar-1.2.0.tar.gz/src/strategies/avx2/ipol-avx2.c -> kvazaar-1.3.0.tar.gz/src/strategies/avx2/ipol-avx2.c
Changed
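The rewritten kernels in the diff below keep HEVC's two-stage interpolation arithmetic: the horizontal pass stores 16-bit intermediates scaled by the filter gain of 64 and shifted by shift1 = KVZ_BIT_DEPTH - 8, and the vertical pass shifts by shift2 = 6, then applies the weighted-prediction rounding constants wp_offset1 and wp_shift1 = 14 - KVZ_BIT_DEPTH before clipping to pixel range. A small stand-alone sketch of that normalization for 8-bit content (not code from the patch):

#include <stdint.h>

/* Worked example of the final rounding used by the new vertical kernels,
 * for KVZ_BIT_DEPTH == 8 (shift1 = 0, shift2 = 6, wp_shift1 = 6,
 * wp_offset1 = 32).  Both filter stages have a gain of 64, so a flat area
 * of value v accumulates 64*64*v and ((4096*v >> 6) + 32) >> 6 == v. */
static uint8_t normalize_two_stage(int32_t acc /* sum after both stages */)
{
  const int shift2     = 6;           /* second-stage filter shift            */
  const int wp_shift1  = 14 - 8;      /* 14 - KVZ_BIT_DEPTH                   */
  const int wp_offset1 = 1 << (wp_shift1 - 1);
  int32_t v = acc >> shift2;
  v = (v + wp_offset1) >> wp_shift1;
  if (v < 0)   v = 0;                 /* mirrors the clip-to-pixel step       */
  if (v > 255) v = 255;
  return (uint8_t)v;
}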
@@ -31,1338 +31,1422 @@ #include "encoder.h" #include "kvazaar.h" +#include "search_inter.h" #include "strategies/generic/picture-generic.h" #include "strategies/strategies-ipol.h" #include "strategyselector.h" #include "strategies/generic/ipol-generic.h" -#define FILTER_OFFSET 3 -#define FILTER_SIZE 8 - -#define MAX_HEIGHT (4 * (LCU_WIDTH + 1) + FILTER_SIZE) -#define MAX_WIDTH ((LCU_WIDTH + 1) + FILTER_SIZE) - extern int8_t kvz_g_luma_filter[4][8]; extern int8_t kvz_g_chroma_filter[8][4]; -void kvz_eight_tap_filter_x8_and_flip(__m128i *data01, __m128i *data23, __m128i *data45, __m128i *data67, __m128i *filter, __m128i *dst) +static int32_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data) { - __m128i a, b, c, d; - __m128i fir = _mm_broadcastq_epi64(_mm_loadl_epi64(filter)); - - a = _mm_maddubs_epi16(*data01, fir); - b = _mm_maddubs_epi16(*data23, fir); - a = _mm_hadd_epi16(a, b); - - c = _mm_maddubs_epi16(*data45, fir); - d = _mm_maddubs_epi16(*data67, fir); - c = _mm_hadd_epi16(c, d); - - a = _mm_hadd_epi16(a, c); + __m128i fir = _mm_loadl_epi64((__m128i*)filter); + __m128i row = _mm_loadl_epi64((__m128i*)data); + __m128i acc; + acc = _mm_maddubs_epi16(row, fir); + __m128i temp = _mm_srli_si128(acc, 4); + acc = _mm_add_epi16(acc, temp); + temp = _mm_srli_si128(acc, 2); + acc = _mm_add_epi16(acc, temp); + int32_t filtered = _mm_cvtsi128_si32(acc); + + return filtered; +} - _mm_storeu_si128(dst, a); +static void kvz_init_shuffle_masks(__m256i *shuf_01_23, __m256i *shuf_45_67) { + // Shuffle pairs + *shuf_01_23 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + *shuf_45_67 = _mm256_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14); } -static __m128i kvz_eight_tap_filter_flip_x8_16bit_avx2(__m128i *row, int8_t *filter, int32_t offset23, int32_t shift23) -{ - __m128i temp[8]; - __m128i temp_lo; - __m128i temp_hi; - __m128i fir = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)filter)); - - temp[0] = _mm_madd_epi16(row[0], fir); - temp[1] = _mm_madd_epi16(row[1], fir); - temp_lo = _mm_unpacklo_epi32(temp[0], temp[1]); - temp_hi = _mm_unpackhi_epi32(temp[0], temp[1]); - temp[0] = _mm_add_epi32(temp_lo, temp_hi); - - temp[2] = _mm_madd_epi16(row[2], fir); - temp[3] = _mm_madd_epi16(row[3], fir); - temp_lo = _mm_unpacklo_epi32(temp[2], temp[3]); - temp_hi = _mm_unpackhi_epi32(temp[2], temp[3]); - temp[2] = _mm_add_epi32(temp_lo, temp_hi); - - temp[4] = _mm_madd_epi16(row[4], fir); - temp[5] = _mm_madd_epi16(row[5], fir); - temp_lo = _mm_unpacklo_epi32(temp[4], temp[5]); - temp_hi = _mm_unpackhi_epi32(temp[4], temp[5]); - temp[4] = _mm_add_epi32(temp_lo, temp_hi); - - temp[6] = _mm_madd_epi16(row[6], fir); - temp[7] = _mm_madd_epi16(row[7], fir); - temp_lo = _mm_unpacklo_epi32(temp[6], temp[7]); - temp_hi = _mm_unpackhi_epi32(temp[6], temp[7]); - temp[6] = _mm_add_epi32(temp_lo, temp_hi); - - temp_lo = _mm_unpacklo_epi32(temp[0], temp[2]); - temp_hi = _mm_unpackhi_epi32(temp[0], temp[2]); - temp[0] = _mm_add_epi32(temp_lo, temp_hi); - temp[0] = _mm_shuffle_epi32(temp[0], _MM_SHUFFLE(3, 1, 2, 0)); - - temp_lo = _mm_unpacklo_epi32(temp[4], temp[6]); - temp_hi = _mm_unpackhi_epi32(temp[4], temp[6]); - temp[4] = _mm_add_epi32(temp_lo, temp_hi); - temp[4] = _mm_shuffle_epi32(temp[4], _MM_SHUFFLE(3, 1, 2, 0)); - - __m128i add = _mm_set1_epi32(offset23); - temp[0] = _mm_add_epi32(temp[0], add); - temp[4] = _mm_add_epi32(temp[4], add); - temp[0] = 
_mm_srai_epi32(temp[0], shift23); - temp[4] = _mm_srai_epi32(temp[4], shift23); - - temp[0] = _mm_packus_epi32(temp[0], temp[4]); - temp[0] = _mm_packus_epi16(temp[0], temp[0]); - - return temp[0]; +static void kvz_init_shuffle_masks_chroma(__m256i *shuf_01, __m256i *shuf_23) { + // Shuffle pairs + *shuf_01 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12, + 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12); + *shuf_23 = _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14, + 2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14); } -static __m256i kvz_eight_tap_filter_flip_x8_16bit_dual_avx2(__m256i *row, int8_t *filter[2], int32_t offset23, int32_t shift23) -{ - __m256i temp[8]; - __m256i temp_lo; - __m256i temp_hi; - __m256i fir = _mm256_cvtepi8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)filter[0]), _mm_loadl_epi64((__m128i*)filter[1]))); - - temp[0] = _mm256_madd_epi16(row[0], fir); - temp[1] = _mm256_madd_epi16(row[1], fir); - temp_lo = _mm256_unpacklo_epi32(temp[0], temp[1]); - temp_hi = _mm256_unpackhi_epi32(temp[0], temp[1]); - temp[0] = _mm256_add_epi32(temp_lo, temp_hi); - - temp[2] = _mm256_madd_epi16(row[2], fir); - temp[3] = _mm256_madd_epi16(row[3], fir); - temp_lo = _mm256_unpacklo_epi32(temp[2], temp[3]); - temp_hi = _mm256_unpackhi_epi32(temp[2], temp[3]); - temp[2] = _mm256_add_epi32(temp_lo, temp_hi); - - temp[4] = _mm256_madd_epi16(row[4], fir); - temp[5] = _mm256_madd_epi16(row[5], fir); - temp_lo = _mm256_unpacklo_epi32(temp[4], temp[5]); - temp_hi = _mm256_unpackhi_epi32(temp[4], temp[5]); - temp[4] = _mm256_add_epi32(temp_lo, temp_hi); - - temp[6] = _mm256_madd_epi16(row[6], fir); - temp[7] = _mm256_madd_epi16(row[7], fir); - temp_lo = _mm256_unpacklo_epi32(temp[6], temp[7]); - temp_hi = _mm256_unpackhi_epi32(temp[6], temp[7]); - temp[6] = _mm256_add_epi32(temp_lo, temp_hi); - - temp_lo = _mm256_unpacklo_epi32(temp[0], temp[2]); - temp_hi = _mm256_unpackhi_epi32(temp[0], temp[2]); - temp[0] = _mm256_add_epi32(temp_lo, temp_hi); - temp[0] = _mm256_shuffle_epi32(temp[0], _MM_SHUFFLE(3, 1, 2, 0)); - - temp_lo = _mm256_unpacklo_epi32(temp[4], temp[6]); - temp_hi = _mm256_unpackhi_epi32(temp[4], temp[6]); - temp[4] = _mm256_add_epi32(temp_lo, temp_hi); - temp[4] = _mm256_shuffle_epi32(temp[4], _MM_SHUFFLE(3, 1, 2, 0)); - - __m256i add = _mm256_set1_epi32(offset23); - temp[0] = _mm256_add_epi32(temp[0], add); - temp[4] = _mm256_add_epi32(temp[4], add); - temp[0] = _mm256_srai_epi32(temp[0], shift23); - temp[4] = _mm256_srai_epi32(temp[4], shift23); - - temp[0] = _mm256_packus_epi32(temp[0], temp[4]); - temp[0] = _mm256_packus_epi16(temp[0], temp[0]); - - return temp[0]; +static void kvz_init_filter_taps(int8_t *filter, + __m256i *taps_01_23, __m256i *taps_45_67) { + // Filter weights + __m256i all_taps = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)filter)); + __m256i perm_01 = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1); + __m256i perm_23 = _mm256_setr_epi32(2, 2, 2, 2, 3, 3, 3, 3); + all_taps = _mm256_unpacklo_epi16(all_taps, all_taps); + *taps_01_23 = _mm256_permutevar8x32_epi32(all_taps, perm_01); + *taps_45_67 = _mm256_permutevar8x32_epi32(all_taps, perm_23); } -/* -static __m128i kvz_eight_tap_filter_flip_x8_avx2(__m128i *row, int8_t *filter, int32_t shift1) -{ - __m128i temp[4]; - __m128i fir = _mm_broadcastq_epi64(_mm_loadl_epi64((__m128i*)filter)); - - temp[0] = _mm_unpacklo_epi64(row[0], row[1]); - temp[0] = _mm_maddubs_epi16(temp[0], fir); +static void kvz_init_filter_taps_chroma(int8_t *filter, + __m256i 
*taps_01, __m256i *taps_23) { + // Filter weights + __m256i all_taps = _mm256_set1_epi32(*(int32_t*)filter); + all_taps = _mm256_unpacklo_epi16(all_taps, all_taps); + *taps_01 = _mm256_shuffle_epi32(all_taps, _MM_SHUFFLE(0, 0, 0, 0)); + *taps_23 = _mm256_shuffle_epi32(all_taps, _MM_SHUFFLE(1, 1, 1, 1)); +} - temp[1] = _mm_unpacklo_epi64(row[2], row[3]); - temp[1] = _mm_maddubs_epi16(temp[1], fir); +static void kvz_init_ver_filter_taps(int8_t *filter, __m256i *filters) { + for (int i = 0; i < 4; ++i) filters[i] = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&filter[2 * i])); + filters[0] = _mm256_inserti128_si256(filters[0], _mm256_castsi256_si128(filters[3]), 1); // Pairs 01 67 + filters[1] = _mm256_inserti128_si256(filters[1], _mm256_castsi256_si128(filters[0]), 1); // Pairs 23 01 + filters[2] = _mm256_inserti128_si256(filters[2], _mm256_castsi256_si128(filters[1]), 1); // Pairs 45 23 + filters[3] = _mm256_inserti128_si256(filters[3], _mm256_castsi256_si128(filters[2]), 1); // Pairs 67 45 +} - temp[0] = _mm_hadd_epi16(temp[0], temp[1]); +static void kvz_eight_tap_filter_hor_8x1_avx2(kvz_pixel *data, int16_t * out, + __m256i *shuf_01_23, __m256i *shuf_45_67, + __m256i *taps_01_23, __m256i *taps_45_67) { - temp[2] = _mm_unpacklo_epi64(row[4], row[5]); - temp[2] = _mm_maddubs_epi16(temp[2], fir); + __m256i row = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)data)); - temp[3] = _mm_unpacklo_epi64(row[6], row[7]); - temp[3] = _mm_maddubs_epi16(temp[3], fir); + __m256i pairs_01_23 = _mm256_shuffle_epi8(row, *shuf_01_23); + __m256i pairs_45_67 = _mm256_shuffle_epi8(row, *shuf_45_67); - temp[2] = _mm_hadd_epi16(temp[2], temp[3]); + __m256i temp0 = _mm256_maddubs_epi16(pairs_01_23, *taps_01_23); + __m256i temp1 = _mm256_maddubs_epi16(pairs_45_67, *taps_45_67); - temp[0] = _mm_hadd_epi16(temp[0], temp[2]); - - temp[0] = _mm_srai_epi16(temp[0], shift1); - - return temp[0]; + __m256i sum = _mm256_add_epi16(temp0, temp1); + __m128i filtered = _mm_add_epi16(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); + _mm_storeu_si128((__m128i*)out, filtered); } -*/ - -static __m256i kvz_eight_tap_filter_flip_x8_dual_avx2(__m256i *row, int8_t *filter[2], int32_t shift1) -{ - __m256i temp[4]; - __m256i fir = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)filter[0])), _mm_loadl_epi64((__m128i*)filter[1]), 1); - fir = _mm256_shuffle_epi32(fir, _MM_SHUFFLE(1, 0, 1, 0)); - - temp[0] = _mm256_unpacklo_epi64(row[0], row[1]); - temp[0] = _mm256_maddubs_epi16(temp[0], fir); - temp[1] = _mm256_unpacklo_epi64(row[2], row[3]); - temp[1] = _mm256_maddubs_epi16(temp[1], fir); +static void kvz_four_tap_filter_hor_4x4_avx2(kvz_pixel *data, int stride, int16_t * out, int out_stride, + __m256i *shuf_01, __m256i *shuf_23, + __m256i *taps_01, __m256i *taps_23) { - temp[0] = _mm256_hadd_epi16(temp[0], temp[1]); + __m256i four_rows = _mm256_setr_epi64x( + *(int64_t*)&data[0 * stride], + *(int64_t*)&data[1 * stride], + *(int64_t*)&data[2 * stride], + *(int64_t*)&data[3 * stride]); - temp[2] = _mm256_unpacklo_epi64(row[4], row[5]); - temp[2] = _mm256_maddubs_epi16(temp[2], fir); + __m256i pairs_l = _mm256_shuffle_epi8(four_rows, *shuf_01); + __m256i pairs_r = _mm256_shuffle_epi8(four_rows, *shuf_23); - temp[3] = _mm256_unpacklo_epi64(row[6], row[7]); - temp[3] = _mm256_maddubs_epi16(temp[3], fir); - - temp[2] = _mm256_hadd_epi16(temp[2], temp[3]); + __m256i temp_l = _mm256_maddubs_epi16(pairs_l, *taps_01); + __m256i temp_r = _mm256_maddubs_epi16(pairs_r, *taps_23); - temp[0] = 
_mm256_hadd_epi16(temp[0], temp[2]); + __m256i sum = _mm256_add_epi16(temp_l, temp_r); - temp[0] = _mm256_srai_epi16(temp[0], shift1); - - return temp[0]; + __m128i lower = _mm256_castsi256_si128(sum); + __m128i upper = _mm256_extracti128_si256(sum, 1); + _mm_storel_epi64((__m128i*)(out + 0 * out_stride), lower); + _mm_storeh_pd((double*)(out + 1 * out_stride), _mm_castsi128_pd(lower)); + _mm_storel_epi64((__m128i*)(out + 2 * out_stride), upper); + _mm_storeh_pd((double*)(out + 3 * out_stride), _mm_castsi128_pd(upper)); } -/* -static INLINE void kvz_filter_flip_shift_x8_avx2(kvz_pixel *src, int16_t src_stride, int8_t *filter, int32_t shift1, int16_t *dst){ - - __m128i rows[8]; - rows[0] = _mm_loadl_epi64((__m128i*)(src + 0 * src_stride)); - rows[1] = _mm_loadl_epi64((__m128i*)(src + 1 * src_stride)); - rows[2] = _mm_loadl_epi64((__m128i*)(src + 2 * src_stride)); - rows[3] = _mm_loadl_epi64((__m128i*)(src + 3 * src_stride)); - rows[4] = _mm_loadl_epi64((__m128i*)(src + 4 * src_stride)); - rows[5] = _mm_loadl_epi64((__m128i*)(src + 5 * src_stride)); - rows[6] = _mm_loadl_epi64((__m128i*)(src + 6 * src_stride)); - rows[7] = _mm_loadl_epi64((__m128i*)(src + 7 * src_stride)); - __m128i out = kvz_eight_tap_filter_flip_x8_avx2(rows, filter, shift1); - _mm_storeu_si128((__m128i*)dst, out); -} -*/ +static void kvz_four_tap_filter_hor_4xN_avx2(kvz_pixel *data, int stride, int16_t * out, int out_stride, + __m256i *shuf_01_23, __m256i *taps_01_23, + int rows) { -static INLINE void kvz_filter_flip_shift_x8_dual_avx2(kvz_pixel *src, int16_t src_stride, int8_t *firs[2], int32_t shift1, int16_t *dst[2]){ - - __m256i rows[8]; - rows[0] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 0 * src_stride))); - rows[1] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 1 * src_stride))); - rows[2] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 2 * src_stride))); - rows[3] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 3 * src_stride))); - rows[4] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 4 * src_stride))); - rows[5] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 5 * src_stride))); - rows[6] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 6 * src_stride))); - rows[7] = _mm256_broadcastsi128_si256(_mm_loadl_epi64((__m128i*)(src + 7 * src_stride))); - __m256i out = kvz_eight_tap_filter_flip_x8_dual_avx2(rows, firs, shift1); - _mm_storeu_si128((__m128i*)dst[0], _mm256_castsi256_si128(out)); - _mm_storeu_si128((__m128i*)dst[1], _mm256_extracti128_si256(out, 1)); -} + for (int i = 0; i < rows; ++i) { + __m256i row = _mm256_set1_epi64x(*(int64_t*)&data[i * stride]); -static INLINE void kvz_filter_flip_round_clip_x8_16bit_avx2(int16_t *flipped_filtered, int16_t src_stride, int8_t *filter, int32_t offset23, int32_t shift23, kvz_pixel *dst){ - - __m128i rows[8]; - rows[0] = _mm_loadu_si128((__m128i*)(flipped_filtered + 0 * src_stride)); - rows[1] = _mm_loadu_si128((__m128i*)(flipped_filtered + 1 * src_stride)); - rows[2] = _mm_loadu_si128((__m128i*)(flipped_filtered + 2 * src_stride)); - rows[3] = _mm_loadu_si128((__m128i*)(flipped_filtered + 3 * src_stride)); - rows[4] = _mm_loadu_si128((__m128i*)(flipped_filtered + 4 * src_stride)); - rows[5] = _mm_loadu_si128((__m128i*)(flipped_filtered + 5 * src_stride)); - rows[6] = _mm_loadu_si128((__m128i*)(flipped_filtered + 6 * src_stride)); - rows[7] = _mm_loadu_si128((__m128i*)(flipped_filtered + 7 * src_stride)); - _mm_storel_epi64((__m128i*)dst, 
kvz_eight_tap_filter_flip_x8_16bit_avx2(rows, filter, offset23, shift23) ); -} + __m256i pairs_l_r = _mm256_shuffle_epi8(row, *shuf_01_23); + __m256i temp_l_r = _mm256_maddubs_epi16(pairs_l_r, *taps_01_23); -static INLINE void kvz_filter_flip_round_clip_x8_16bit_dual_avx2(int16_t *flipped_filtered[2], int16_t src_stride, int8_t *firs[2], int32_t offset23, int32_t shift23, kvz_pixel *dst[2]){ - - __m256i rows[8]; - rows[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 0 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 0 * src_stride)), 1); - rows[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 1 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 1 * src_stride)), 1); - rows[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 2 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 2 * src_stride)), 1); - rows[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 3 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 3 * src_stride)), 1); - rows[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 4 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 4 * src_stride)), 1); - rows[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 5 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 5 * src_stride)), 1); - rows[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 6 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 6 * src_stride)), 1); - rows[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(flipped_filtered[0] + 7 * src_stride))), _mm_loadu_si128((__m128i*)(flipped_filtered[1] + 7 * src_stride)), 1); - __m256i out = kvz_eight_tap_filter_flip_x8_16bit_dual_avx2(rows, firs, offset23, shift23); - _mm_storel_epi64((__m128i*)dst[0], _mm256_castsi256_si128(out)); - _mm_storel_epi64((__m128i*)dst[1], _mm256_extracti128_si256(out, 1)); + __m128i temp_l = _mm256_castsi256_si128(temp_l_r); + __m128i temp_r = _mm256_extracti128_si256(temp_l_r, 1); + __m128i sum = _mm_add_epi16(temp_l, temp_r); + _mm_storel_epi64((__m128i*)(out + i * out_stride), sum); + } } -__m128i kvz_eight_tap_filter_x4_and_flip_16bit(__m128i *data0, __m128i *data1, __m128i *data2, __m128i *data3, __m128i *filter) +static int32_t kvz_eight_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data) { - __m128i a, b, c, d; - __m128i fir = _mm_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(filter))); - - a = _mm_madd_epi16(*data0, fir); - b = _mm_madd_epi16(*data1, fir); - a = _mm_hadd_epi32(a, b); - - c = _mm_madd_epi16(*data2, fir); - d = _mm_madd_epi16(*data3, fir); - c = _mm_hadd_epi32(c, d); - - a = _mm_hadd_epi32(a, c); - - return a; + __m128i fir = _mm_loadl_epi64((__m128i*)filter); + fir = _mm_cvtepi8_epi16(fir); + __m128i row = _mm_loadu_si128((__m128i*)data); + __m128i acc; + acc = _mm_madd_epi16(fir, row); + __m128i temp = _mm_srli_si128(acc, 8); + acc = _mm_add_epi32(acc, temp); + temp = _mm_srli_si128(acc, 4); + acc = _mm_add_epi32(acc, temp); + int32_t filtered = _mm_cvtsi128_si32(acc); + + return filtered; } -void kvz_eight_tap_filter_and_flip_avx2(int8_t filter[4][8], kvz_pixel *src, int16_t src_stride, int16_t* __restrict dst) +static void 
kvz_eight_tap_filter_ver_16bit_1x8_avx2(int8_t *filter, int16_t *data, int16_t stride, kvz_pixel *out) { + // Interpolation filter shifts + int32_t shift2 = 6; - //Load 2 rows per xmm register - __m128i rows01 = _mm_loadl_epi64((__m128i*)(src + 0 * src_stride)); - rows01 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows01), (double*)(src + 1 * src_stride))); - - __m128i rows23 = _mm_loadl_epi64((__m128i*)(src + 2 * src_stride)); - rows23 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows23), (double*)(src + 3 * src_stride))); - - __m128i rows45 = _mm_loadl_epi64((__m128i*)(src + 4 * src_stride)); - rows45 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows45), (double*)(src + 5 * src_stride))); + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); - __m128i rows67 = _mm_loadl_epi64((__m128i*)(src + 6 * src_stride)); - rows67 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows67), (double*)(src + 7 * src_stride))); + // Filter weights + __m256i all_taps = _mm256_castsi128_si256(_mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)filter))); + __m256i taps_01_23 = _mm256_shuffle_epi32(all_taps, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i taps_23 = _mm_shuffle_epi32(_mm256_castsi256_si128(all_taps), _MM_SHUFFLE(1, 1, 1, 1)); + __m256i taps_45_67 = _mm256_shuffle_epi32(all_taps, _MM_SHUFFLE(2, 2, 2, 2)); + __m128i taps_67 = _mm_shuffle_epi32(_mm256_castsi256_si128(all_taps), _MM_SHUFFLE(3, 3, 3, 3)); - //Filter rows - const int dst_stride = MAX_WIDTH; - kvz_eight_tap_filter_x8_and_flip(&rows01, &rows23, &rows45, &rows67, (__m128i*)(&filter[0]), (__m128i*)(dst + 0)); - kvz_eight_tap_filter_x8_and_flip(&rows01, &rows23, &rows45, &rows67, (__m128i*)(&filter[1]), (__m128i*)(dst + 1 * dst_stride)); - kvz_eight_tap_filter_x8_and_flip(&rows01, &rows23, &rows45, &rows67, (__m128i*)(&filter[2]), (__m128i*)(dst + 2 * dst_stride)); - kvz_eight_tap_filter_x8_and_flip(&rows01, &rows23, &rows45, &rows67, (__m128i*)(&filter[3]), (__m128i*)(dst + 3 * dst_stride)); -} + taps_01_23 = _mm256_inserti128_si256(taps_01_23, taps_23, 1); + taps_45_67 = _mm256_inserti128_si256(taps_45_67, taps_67, 1); -static INLINE void eight_tap_filter_and_flip_16bit_avx2(int8_t filter[4][8], int16_t *src, int16_t src_stride, int offset, int combined_shift, kvz_pixel* __restrict dst, int16_t dst_stride) -{ + __m256i rows02 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&data[0 * stride])); + __m128i row2 = _mm_loadu_si128((__m128i*)&data[2 * stride]); + rows02 = _mm256_inserti128_si256(rows02, row2, 1); - //Load a row per xmm register - __m128i row0 = _mm_loadu_si128((__m128i*)(src + 0 * src_stride)); - __m128i row1 = _mm_loadu_si128((__m128i*)(src + 1 * src_stride)); - __m128i row2 = _mm_loadu_si128((__m128i*)(src + 2 * src_stride)); - __m128i row3 = _mm_loadu_si128((__m128i*)(src + 3 * src_stride)); + __m256i rows13 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&data[1 * stride])); + __m128i row3 = _mm_loadu_si128((__m128i*)&data[3 * stride]); + rows13 = _mm256_inserti128_si256(rows13, row3, 1); - //Filter rows - union { - __m128i vector; - int32_t array[4]; - } temp[4]; + __m256i pairs_01_23_lo = _mm256_unpacklo_epi16(rows02, rows13); + __m256i pairs_01_23_hi = _mm256_unpackhi_epi16(rows02, rows13); + __m256i temp_01_23_lo = _mm256_madd_epi16(pairs_01_23_lo, taps_01_23); + __m256i temp_01_23_hi = _mm256_madd_epi16(pairs_01_23_hi, taps_01_23); - temp[0].vector = kvz_eight_tap_filter_x4_and_flip_16bit(&row0, &row1, &row2, &row3, (__m128i*)(&filter[0])); - 
temp[1].vector = kvz_eight_tap_filter_x4_and_flip_16bit(&row0, &row1, &row2, &row3, (__m128i*)(&filter[1])); - temp[2].vector = kvz_eight_tap_filter_x4_and_flip_16bit(&row0, &row1, &row2, &row3, (__m128i*)(&filter[2])); - temp[3].vector = kvz_eight_tap_filter_x4_and_flip_16bit(&row0, &row1, &row2, &row3, (__m128i*)(&filter[3])); + __m256i rows46 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&data[4 * stride])); + __m128i row6 = _mm_loadu_si128((__m128i*)&data[6 * stride]); + rows46 = _mm256_inserti128_si256(rows46, row6, 1); - __m128i packed_offset = _mm_set1_epi32(offset); + __m256i rows57 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&data[5 * stride])); + __m128i row7 = _mm_loadu_si128((__m128i*)&data[7 * stride]); + rows57 = _mm256_inserti128_si256(rows57, row7, 1); - temp[0].vector = _mm_add_epi32(temp[0].vector, packed_offset); - temp[0].vector = _mm_srai_epi32(temp[0].vector, combined_shift); - temp[1].vector = _mm_add_epi32(temp[1].vector, packed_offset); - temp[1].vector = _mm_srai_epi32(temp[1].vector, combined_shift); + __m256i pairs_45_67_lo = _mm256_unpacklo_epi16(rows46, rows57); + __m256i pairs_45_67_hi = _mm256_unpackhi_epi16(rows46, rows57); + __m256i temp_45_67_lo = _mm256_madd_epi16(pairs_45_67_lo, taps_45_67); + __m256i temp_45_67_hi = _mm256_madd_epi16(pairs_45_67_hi, taps_45_67); - temp[0].vector = _mm_packus_epi32(temp[0].vector, temp[1].vector); + __m256i sum_lo_half = _mm256_add_epi32(temp_01_23_lo, temp_45_67_lo); + __m256i sum_hi_half = _mm256_add_epi32(temp_01_23_hi, temp_45_67_hi); - temp[2].vector = _mm_add_epi32(temp[2].vector, packed_offset); - temp[2].vector = _mm_srai_epi32(temp[2].vector, combined_shift); - temp[3].vector = _mm_add_epi32(temp[3].vector, packed_offset); - temp[3].vector = _mm_srai_epi32(temp[3].vector, combined_shift); + __m128i sum_lo = _mm_add_epi32(_mm256_castsi256_si128(sum_lo_half), _mm256_extracti128_si256(sum_lo_half, 1)); + __m128i sum_hi = _mm_add_epi32(_mm256_castsi256_si128(sum_hi_half), _mm256_extracti128_si256(sum_hi_half, 1)); - temp[2].vector = _mm_packus_epi32(temp[2].vector, temp[3].vector); + sum_lo = _mm_srai_epi32(sum_lo, shift2); + sum_hi = _mm_srai_epi32(sum_hi, shift2); - temp[0].vector = _mm_packus_epi16(temp[0].vector, temp[2].vector); + __m128i offset = _mm_set1_epi32(wp_offset1); + sum_lo = _mm_add_epi32(sum_lo, offset); + sum_lo = _mm_srai_epi32(sum_lo, wp_shift1); + sum_hi = _mm_add_epi32(sum_hi, offset); + sum_hi = _mm_srai_epi32(sum_hi, wp_shift1); + __m128i filtered = _mm_packus_epi32(sum_lo, sum_hi); + filtered = _mm_packus_epi16(filtered, filtered); - int32_t* four_pixels = (int32_t*)&(dst[0 * dst_stride]); - *four_pixels = temp[0].array[0]; - four_pixels = (int32_t*)&(dst[1 * dst_stride]); - *four_pixels = _mm_extract_epi32(temp[0].vector, 1); + _mm_storel_epi64((__m128i*)out, filtered); +} - four_pixels = (int32_t*)&(dst[2 * dst_stride]); - *four_pixels = _mm_extract_epi32(temp[0].vector, 2); +static void kvz_four_tap_filter_ver_16bit_4x4_avx2(int8_t *filter, int16_t *data, int16_t stride, kvz_pixel *out, int16_t out_stride) +{ + // Interpolation filter shifts + int32_t shift2 = 6; - four_pixels = (int32_t*)&(dst[3 * dst_stride]); - *four_pixels = _mm_extract_epi32(temp[0].vector, 3); + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); + + // Filter weights + __m128i all_taps = _mm_cvtepi8_epi16(_mm_cvtsi32_si128(*(int32_t*)filter)); + __m128i taps_01 = _mm_shuffle_epi32(all_taps, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i 
taps_23 = _mm_shuffle_epi32(all_taps, _MM_SHUFFLE(1, 1, 1, 1)); + + __m128i row0 = _mm_loadl_epi64((__m128i*)&data[0 * stride]); + __m128i row1 = _mm_loadl_epi64((__m128i*)&data[1 * stride]); + __m128i row2 = _mm_loadl_epi64((__m128i*)&data[2 * stride]); + __m128i row3 = _mm_loadl_epi64((__m128i*)&data[3 * stride]); + __m128i row4 = _mm_loadl_epi64((__m128i*)&data[4 * stride]); + __m128i row5 = _mm_loadl_epi64((__m128i*)&data[5 * stride]); + __m128i row6 = _mm_loadl_epi64((__m128i*)&data[6 * stride]); + + __m128i pairs01 = _mm_unpacklo_epi16(row0, row1); + __m128i pairs23 = _mm_unpacklo_epi16(row2, row3); + __m128i temp01 = _mm_madd_epi16(pairs01, taps_01); + __m128i temp23 = _mm_madd_epi16(pairs23, taps_23); + __m128i sum0123 = _mm_add_epi32(temp01, temp23); + + __m128i pairs12 = _mm_unpacklo_epi16(row1, row2); + __m128i pairs34 = _mm_unpacklo_epi16(row3, row4); + __m128i temp12 = _mm_madd_epi16(pairs12, taps_01); + __m128i temp34 = _mm_madd_epi16(pairs34, taps_23); + __m128i sum1234 = _mm_add_epi32(temp12, temp34); + + __m128i pairs45 = _mm_unpacklo_epi16(row4, row5); + __m128i temp23_2 = _mm_madd_epi16(pairs23, taps_01); + __m128i temp45 = _mm_madd_epi16(pairs45, taps_23); + __m128i sum2345 = _mm_add_epi32(temp23_2, temp45); + + __m128i pairs56 = _mm_unpacklo_epi16(row5, row6); + __m128i temp34_2 = _mm_madd_epi16(pairs34, taps_01); + __m128i temp56 = _mm_madd_epi16(pairs56, taps_23); + __m128i sum3456 = _mm_add_epi32(temp34_2, temp56); + + sum0123 = _mm_srai_epi32(sum0123, shift2); + sum1234 = _mm_srai_epi32(sum1234, shift2); + sum2345 = _mm_srai_epi32(sum2345, shift2); + sum3456 = _mm_srai_epi32(sum3456, shift2); + + __m128i offset = _mm_set1_epi32(wp_offset1); + sum0123 = _mm_add_epi32(sum0123, offset); + sum1234 = _mm_add_epi32(sum1234, offset); + sum2345 = _mm_add_epi32(sum2345, offset); + sum3456 = _mm_add_epi32(sum3456, offset); + + sum0123 = _mm_srai_epi32(sum0123, wp_shift1); + sum1234 = _mm_srai_epi32(sum1234, wp_shift1); + sum2345 = _mm_srai_epi32(sum2345, wp_shift1); + sum3456 = _mm_srai_epi32(sum3456, wp_shift1); + + __m128i filtered01 = _mm_packs_epi32(sum0123, sum1234); + __m128i filtered23 = _mm_packs_epi32(sum2345, sum3456); + __m128i filtered = _mm_packus_epi16(filtered01, filtered23); + + *(int32_t*)&out[0 * out_stride] = _mm_cvtsi128_si32(filtered); + *(int32_t*)&out[1 * out_stride] = _mm_extract_epi32(filtered, 1); + *(int32_t*)&out[2 * out_stride] = _mm_extract_epi32(filtered, 2); + *(int32_t*)&out[3 * out_stride] = _mm_extract_epi32(filtered, 3); +} +static void kvz_four_tap_filter_ver_16bit_4x4_no_round_avx2(int8_t *filter, int16_t *data, int16_t stride, int16_t *out, int16_t out_stride) +{ + int32_t shift2 = 6; + // Filter weights + __m128i all_taps = _mm_cvtepi8_epi16(_mm_cvtsi32_si128(*(int32_t*)filter)); + __m128i taps_01 = _mm_shuffle_epi32(all_taps, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i taps_23 = _mm_shuffle_epi32(all_taps, _MM_SHUFFLE(1, 1, 1, 1)); + + __m128i row0 = _mm_loadl_epi64((__m128i*)&data[0 * stride]); + __m128i row1 = _mm_loadl_epi64((__m128i*)&data[1 * stride]); + __m128i row2 = _mm_loadl_epi64((__m128i*)&data[2 * stride]); + __m128i row3 = _mm_loadl_epi64((__m128i*)&data[3 * stride]); + __m128i row4 = _mm_loadl_epi64((__m128i*)&data[4 * stride]); + __m128i row5 = _mm_loadl_epi64((__m128i*)&data[5 * stride]); + __m128i row6 = _mm_loadl_epi64((__m128i*)&data[6 * stride]); + + __m128i pairs01 = _mm_unpacklo_epi16(row0, row1); + __m128i pairs23 = _mm_unpacklo_epi16(row2, row3); + __m128i temp01 = _mm_madd_epi16(pairs01, taps_01); + __m128i temp23 = 
_mm_madd_epi16(pairs23, taps_23); + __m128i sum0123 = _mm_add_epi32(temp01, temp23); + + __m128i pairs12 = _mm_unpacklo_epi16(row1, row2); + __m128i pairs34 = _mm_unpacklo_epi16(row3, row4); + __m128i temp12 = _mm_madd_epi16(pairs12, taps_01); + __m128i temp34 = _mm_madd_epi16(pairs34, taps_23); + __m128i sum1234 = _mm_add_epi32(temp12, temp34); + + __m128i pairs45 = _mm_unpacklo_epi16(row4, row5); + __m128i temp23_2 = _mm_madd_epi16(pairs23, taps_01); + __m128i temp45 = _mm_madd_epi16(pairs45, taps_23); + __m128i sum2345 = _mm_add_epi32(temp23_2, temp45); + + __m128i pairs56 = _mm_unpacklo_epi16(row5, row6); + __m128i temp34_2 = _mm_madd_epi16(pairs34, taps_01); + __m128i temp56 = _mm_madd_epi16(pairs56, taps_23); + __m128i sum3456 = _mm_add_epi32(temp34_2, temp56); + + sum0123 = _mm_srai_epi32(sum0123, shift2); + sum1234 = _mm_srai_epi32(sum1234, shift2); + sum2345 = _mm_srai_epi32(sum2345, shift2); + sum3456 = _mm_srai_epi32(sum3456, shift2); + + __m128i filtered01 = _mm_packs_epi32(sum0123, sum1234); + __m128i filtered23 = _mm_packs_epi32(sum2345, sum3456); + + _mm_storel_pi((__m64*)&out[0 * out_stride], _mm_castsi128_ps(filtered01)); + _mm_storeh_pi((__m64*)&out[1 * out_stride], _mm_castsi128_ps(filtered01)); + _mm_storel_pi((__m64*)&out[2 * out_stride], _mm_castsi128_ps(filtered23)); + _mm_storeh_pi((__m64*)&out[3 * out_stride], _mm_castsi128_ps(filtered23)); } -int16_t kvz_eight_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data) +INLINE static void filter_row_ver_16b_8x1_avx2(int16_t *data, int64_t stride, __m256i* taps, kvz_pixel * out, int64_t out_stride) { + // Interpolation filter shifts + int32_t shift2 = 6; - __m128i sample; + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); + + __m256i pairs_lo, pairs_hi; + + // Filter 01 later with 67 + __m256i br0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 0 * stride))); + __m256i br1 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 1 * stride))); + + __m256i br2 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 2 * stride))); + __m256i br3 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 3 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br2, br3); + pairs_hi = _mm256_unpackhi_epi16(br2, br3); + __m256i rows02_23_01_lo = _mm256_madd_epi16(pairs_lo, taps[1]); // Firs 23 01 + __m256i rows02_23_01_hi = _mm256_madd_epi16(pairs_hi, taps[1]); // Firs 23 01 + + __m256i br4 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 4 * stride))); + __m256i br5 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 5 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br4, br5); + pairs_hi = _mm256_unpackhi_epi16(br4, br5); + __m256i rows02_45_23_lo = _mm256_madd_epi16(pairs_lo, taps[2]); // Firs 45 23 + __m256i rows02_45_23_hi = _mm256_madd_epi16(pairs_hi, taps[2]); // Firs 45 23 + + __m256i br6 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 6 * stride))); + __m256i br7 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 7 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br6, br7); + pairs_hi = _mm256_unpackhi_epi16(br6, br7); + __m256i rows02_67_45_lo = _mm256_madd_epi16(pairs_lo, taps[3]); // Firs 67 45 + __m256i rows02_67_45_hi = _mm256_madd_epi16(pairs_hi, taps[3]); // Firs 67 45 + __m256i rows46_23_01_lo = _mm256_madd_epi16(pairs_lo, taps[1]); // Firs 23 01 + __m256i rows46_23_01_hi = _mm256_madd_epi16(pairs_hi, taps[1]); // Firs 23 01 + + __m256i br8 = 
_mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 8 * stride))); + __m256i br9 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 9 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br8, br9); + pairs_hi = _mm256_unpackhi_epi16(br8, br9); + // Filter rows02 later + __m256i rows46_45_23_lo = _mm256_madd_epi16(pairs_lo, taps[2]); // Firs 45 23 + __m256i rows46_45_23_hi = _mm256_madd_epi16(pairs_hi, taps[2]); // Firs 45 23 + + __m256i br10 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 10 * stride))); + __m256i br11 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 11 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br10, br11); + pairs_hi = _mm256_unpackhi_epi16(br10, br11); + __m256i rows46_67_45_lo = _mm256_madd_epi16(pairs_lo, taps[3]); // Firs 67 45 + __m256i rows46_67_45_hi = _mm256_madd_epi16(pairs_hi, taps[3]); // Firs 67 45 + + // Deferred + __m256i r08 = _mm256_permute2x128_si256(br0, br8, _MM_SHUFFLE(0, 2, 0, 0)); + __m256i r19 = _mm256_permute2x128_si256(br1, br9, _MM_SHUFFLE(0, 2, 0, 0)); + pairs_lo = _mm256_unpacklo_epi16(r08, r19); + pairs_hi = _mm256_unpackhi_epi16(r08, r19); + __m256i rows02_01_67_lo = _mm256_madd_epi16(pairs_lo, taps[0]); // Firs 01 67 + __m256i rows02_01_67_hi = _mm256_madd_epi16(pairs_hi, taps[0]); // Firs 01 67 + + __m256i br12 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 12 * stride))); + __m256i br13 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 13 * stride))); + + __m256i r412 = _mm256_permute2x128_si256(br4, br12, _MM_SHUFFLE(0, 2, 0, 0)); + __m256i r513 = _mm256_permute2x128_si256(br5, br13, _MM_SHUFFLE(0, 2, 0, 0)); + pairs_lo = _mm256_unpacklo_epi16(r412, r513); + pairs_hi = _mm256_unpackhi_epi16(r412, r513); + __m256i rows46_01_67_lo = _mm256_madd_epi16(pairs_lo, taps[0]); // Firs 01 67 + __m256i rows46_01_67_hi = _mm256_madd_epi16(pairs_hi, taps[0]); // Firs 01 67 + + __m256i accu02_lo, accu02_hi; + __m256i accu46_lo, accu46_hi; + + accu02_lo = _mm256_add_epi32(rows02_23_01_lo, rows02_45_23_lo); + accu02_lo = _mm256_add_epi32(accu02_lo, rows02_67_45_lo); + accu02_lo = _mm256_add_epi32(accu02_lo, rows02_01_67_lo); + + accu02_hi = _mm256_add_epi32(rows02_23_01_hi, rows02_45_23_hi); + accu02_hi = _mm256_add_epi32(accu02_hi, rows02_67_45_hi); + accu02_hi = _mm256_add_epi32(accu02_hi, rows02_01_67_hi); + + accu46_lo = _mm256_add_epi32(rows46_23_01_lo, rows46_45_23_lo); + accu46_lo = _mm256_add_epi32(accu46_lo, rows46_67_45_lo); + accu46_lo = _mm256_add_epi32(accu46_lo, rows46_01_67_lo); + + accu46_hi = _mm256_add_epi32(rows46_23_01_hi, rows46_45_23_hi); + accu46_hi = _mm256_add_epi32(accu46_hi, rows46_67_45_hi); + accu46_hi = _mm256_add_epi32(accu46_hi, rows46_01_67_hi); + + accu02_lo = _mm256_srai_epi32(accu02_lo, shift2); + accu02_hi = _mm256_srai_epi32(accu02_hi, shift2); + accu46_lo = _mm256_srai_epi32(accu46_lo, shift2); + accu46_hi = _mm256_srai_epi32(accu46_hi, shift2); + + __m256i offset = _mm256_set1_epi32(wp_offset1); + accu02_lo = _mm256_add_epi32(accu02_lo, offset); + accu02_hi = _mm256_add_epi32(accu02_hi, offset); + accu46_lo = _mm256_add_epi32(accu46_lo, offset); + accu46_hi = _mm256_add_epi32(accu46_hi, offset); + + accu02_lo = _mm256_srai_epi32(accu02_lo, wp_shift1); + accu02_hi = _mm256_srai_epi32(accu02_hi, wp_shift1); + accu46_lo = _mm256_srai_epi32(accu46_lo, wp_shift1); + accu46_hi = _mm256_srai_epi32(accu46_hi, wp_shift1); + + __m256i rows02 = _mm256_packs_epi32(accu02_lo, accu02_hi); + __m256i rows46 = _mm256_packs_epi32(accu46_lo, 
accu46_hi); + + __m256i filtered04_26 = _mm256_packus_epi16(rows02, rows46); + __m128i filtered04 = _mm256_castsi256_si128(filtered04_26); + __m128i filtered26 = _mm256_extracti128_si256(filtered04_26, 1); + + _mm_storel_pi((__m64*)&out[0 * out_stride], _mm_castsi128_ps(filtered04)); + _mm_storel_pi((__m64*)&out[2 * out_stride], _mm_castsi128_ps(filtered26)); + _mm_storeh_pi((__m64*)&out[4 * out_stride], _mm_castsi128_ps(filtered04)); + _mm_storeh_pi((__m64*)&out[6 * out_stride], _mm_castsi128_ps(filtered26)); +} - __m128i packed_data = _mm_loadl_epi64((__m128i*)data); - __m128i packed_filter = _mm_loadl_epi64((__m128i*)filter); +INLINE static void filter_row_ver_16b_8x1_no_round_avx2(int16_t *data, int64_t stride, __m256i *taps, int16_t *out, int64_t out_stride) { - sample = _mm_maddubs_epi16(packed_data, packed_filter); - sample = _mm_add_epi16(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(0, 1, 0, 1))); - sample = _mm_add_epi16(sample, _mm_shufflelo_epi16(sample, _MM_SHUFFLE(0, 1, 0, 1))); + int32_t shift2 = 6; - return (int16_t)_mm_cvtsi128_si32(sample); + __m256i pairs_lo, pairs_hi; + + // Filter 01 later with 67 + __m256i br0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 0 * stride))); + __m256i br1 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 1 * stride))); + + __m256i br2 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 2 * stride))); + __m256i br3 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 3 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br2, br3); + pairs_hi = _mm256_unpackhi_epi16(br2, br3); + __m256i rows02_23_01_lo = _mm256_madd_epi16(pairs_lo, taps[1]); // Firs 23 01 + __m256i rows02_23_01_hi = _mm256_madd_epi16(pairs_hi, taps[1]); // Firs 23 01 + + __m256i br4 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 4 * stride))); + __m256i br5 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 5 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br4, br5); + pairs_hi = _mm256_unpackhi_epi16(br4, br5); + __m256i rows02_45_23_lo = _mm256_madd_epi16(pairs_lo, taps[2]); // Firs 45 23 + __m256i rows02_45_23_hi = _mm256_madd_epi16(pairs_hi, taps[2]); // Firs 45 23 + + __m256i br6 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 6 * stride))); + __m256i br7 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 7 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br6, br7); + pairs_hi = _mm256_unpackhi_epi16(br6, br7); + __m256i rows02_67_45_lo = _mm256_madd_epi16(pairs_lo, taps[3]); // Firs 67 45 + __m256i rows02_67_45_hi = _mm256_madd_epi16(pairs_hi, taps[3]); // Firs 67 45 + __m256i rows46_23_01_lo = _mm256_madd_epi16(pairs_lo, taps[1]); // Firs 23 01 + __m256i rows46_23_01_hi = _mm256_madd_epi16(pairs_hi, taps[1]); // Firs 23 01 + + __m256i br8 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 8 * stride))); + __m256i br9 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 9 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br8, br9); + pairs_hi = _mm256_unpackhi_epi16(br8, br9); + // Filter rows02 later + __m256i rows46_45_23_lo = _mm256_madd_epi16(pairs_lo, taps[2]); // Firs 45 23 + __m256i rows46_45_23_hi = _mm256_madd_epi16(pairs_hi, taps[2]); // Firs 45 23 + + __m256i br10 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 10 * stride))); + __m256i br11 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 11 * stride))); + pairs_lo = _mm256_unpacklo_epi16(br10, br11); + pairs_hi = _mm256_unpackhi_epi16(br10, br11); 
+ __m256i rows46_67_45_lo = _mm256_madd_epi16(pairs_lo, taps[3]); // Firs 67 45 + __m256i rows46_67_45_hi = _mm256_madd_epi16(pairs_hi, taps[3]); // Firs 67 45 + + // Deferred + __m256i r08 = _mm256_permute2x128_si256(br0, br8, _MM_SHUFFLE(0, 2, 0, 0)); + __m256i r19 = _mm256_permute2x128_si256(br1, br9, _MM_SHUFFLE(0, 2, 0, 0)); + pairs_lo = _mm256_unpacklo_epi16(r08, r19); + pairs_hi = _mm256_unpackhi_epi16(r08, r19); + __m256i rows02_01_67_lo = _mm256_madd_epi16(pairs_lo, taps[0]); // Firs 01 67 + __m256i rows02_01_67_hi = _mm256_madd_epi16(pairs_hi, taps[0]); // Firs 01 67 + + __m256i br12 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 12 * stride))); + __m256i br13 = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)(data + 13 * stride))); + + __m256i r412 = _mm256_permute2x128_si256(br4, br12, _MM_SHUFFLE(0, 2, 0, 0)); + __m256i r513 = _mm256_permute2x128_si256(br5, br13, _MM_SHUFFLE(0, 2, 0, 0)); + pairs_lo = _mm256_unpacklo_epi16(r412, r513); + pairs_hi = _mm256_unpackhi_epi16(r412, r513); + __m256i rows46_01_67_lo = _mm256_madd_epi16(pairs_lo, taps[0]); // Firs 01 67 + __m256i rows46_01_67_hi = _mm256_madd_epi16(pairs_hi, taps[0]); // Firs 01 67 + + __m256i accu02_lo, accu02_hi; + __m256i accu46_lo, accu46_hi; + + accu02_lo = _mm256_add_epi32(rows02_23_01_lo, rows02_45_23_lo); + accu02_lo = _mm256_add_epi32(accu02_lo, rows02_67_45_lo); + accu02_lo = _mm256_add_epi32(accu02_lo, rows02_01_67_lo); + + accu02_hi = _mm256_add_epi32(rows02_23_01_hi, rows02_45_23_hi); + accu02_hi = _mm256_add_epi32(accu02_hi, rows02_67_45_hi); + accu02_hi = _mm256_add_epi32(accu02_hi, rows02_01_67_hi); + + accu46_lo = _mm256_add_epi32(rows46_23_01_lo, rows46_45_23_lo); + accu46_lo = _mm256_add_epi32(accu46_lo, rows46_67_45_lo); + accu46_lo = _mm256_add_epi32(accu46_lo, rows46_01_67_lo); + + accu46_hi = _mm256_add_epi32(rows46_23_01_hi, rows46_45_23_hi); + accu46_hi = _mm256_add_epi32(accu46_hi, rows46_67_45_hi); + accu46_hi = _mm256_add_epi32(accu46_hi, rows46_01_67_hi); + + accu02_lo = _mm256_srai_epi32(accu02_lo, shift2); + accu02_hi = _mm256_srai_epi32(accu02_hi, shift2); + accu46_lo = _mm256_srai_epi32(accu46_lo, shift2); + accu46_hi = _mm256_srai_epi32(accu46_hi, shift2); + + __m256i rows02 = _mm256_packs_epi32(accu02_lo, accu02_hi); + __m256i rows46 = _mm256_packs_epi32(accu46_lo, accu46_hi); + + __m128i filtered0 = _mm256_castsi256_si128(rows02); + __m128i filtered2 = _mm256_extracti128_si256(rows02, 1); + __m128i filtered4 = _mm256_castsi256_si128(rows46); + __m128i filtered6 = _mm256_extracti128_si256(rows46, 1); + + _mm_storeu_si128((__m128i*)(out + 0 * out_stride), filtered0); + _mm_storeu_si128((__m128i*)(out + 2 * out_stride), filtered2); + _mm_storeu_si128((__m128i*)(out + 4 * out_stride), filtered4); + _mm_storeu_si128((__m128i*)(out + 6 * out_stride), filtered6); } - -int32_t kvz_eight_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data) +INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_avx2(__m256i *filters, int16_t *data, int16_t stride, kvz_pixel *out, int out_stride) { - __m128i sample; - - __m128i packed_data = _mm_loadu_si128((__m128i*)data); - __m128i packed_filter = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)filter)); + // Filter even rows + filter_row_ver_16b_8x1_avx2(data, stride, filters, out, out_stride); // 0 2 4 6 - sample = _mm_madd_epi16(packed_data, packed_filter); - sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(0, 1, 3, 2))); - sample = _mm_add_epi32(sample, _mm_shuffle_epi32(sample, _MM_SHUFFLE(0, 1, 0, 1))); - - 
return _mm_extract_epi32(sample, 0); + // Filter odd rows + filter_row_ver_16b_8x1_avx2(data + stride, stride, filters, out + out_stride, out_stride); // 1 3 5 7 + } -int16_t kvz_eight_tap_filter_ver_avx2(int8_t *filter, kvz_pixel *data, int16_t stride) +INLINE static void kvz_eight_tap_filter_ver_16bit_8x8_no_round_avx2(__m256i *filters, int16_t *data, int16_t stride, int16_t *out, int out_stride) { - int16_t temp = 0; - for (int i = 0; i < 8; ++i) - { - temp += filter[i] * data[stride * i]; - } + // Filter even rows + filter_row_ver_16b_8x1_no_round_avx2(data, stride, filters, out, out_stride); // 0 2 4 6 + + // Filter odd rows + filter_row_ver_16b_8x1_no_round_avx2(data + stride, stride, filters, out + out_stride, out_stride); // 1 3 5 7 - return temp; } -int32_t kvz_eight_tap_filter_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride) +static void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + int8_t fme_level, + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t hpel_off_x, int8_t hpel_off_y) { - int32_t temp = 0; - for (int i = 0; i < 8; ++i) - { - temp += filter[i] * data[stride * i]; - } + int x, y, first_y; - return temp; -} + // Interpolation filter shifts + int16_t shift1 = KVZ_BIT_DEPTH - 8; + int32_t shift2 = 6; -int16_t kvz_four_tap_filter_hor_avx2(int8_t *filter, kvz_pixel *data) -{ - __m128i packed_data = _mm_cvtsi32_si128(*(int32_t*)data); - __m128i packed_filter = _mm_cvtsi32_si128(*(int32_t*)filter); + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); - __m128i temp = _mm_maddubs_epi16(packed_data, packed_filter); - temp = _mm_hadd_epi16(temp, temp); + int8_t *fir0 = kvz_g_luma_filter[0]; + int8_t *fir2 = kvz_g_luma_filter[2]; - return _mm_extract_epi16(temp, 0); -} + int16_t dst_stride = LCU_WIDTH; + int16_t hor_stride = LCU_WIDTH; -int32_t kvz_four_tap_filter_hor_16bit_avx2(int8_t *filter, int16_t *data) -{ - __m128i packed_data = _mm_loadl_epi64((__m128i*)data); - __m128i packed_filter = _mm_cvtepi8_epi16(_mm_cvtsi32_si128(*(int32_t*)filter) ); + int16_t *hor_pos0 = hor_intermediate[0]; + int16_t *hor_pos2 = hor_intermediate[1]; + int16_t *col_pos0 = hor_first_cols[0]; + int16_t *col_pos2 = hor_first_cols[2]; - __m128i temp = _mm_madd_epi16(packed_data, packed_filter); - temp = _mm_hadd_epi32(temp, temp); + // Horizontally filtered samples from the top row are + // not needed unless samples for diagonal positions are filtered later. + first_y = fme_level > 1 ? 
0 : 1; - return _mm_cvtsi128_si32(temp); -} + // HORIZONTAL STEP + // Integer pixels + __m256i shuf_01_23, shuf_45_67; + __m256i taps_01_23, taps_45_67; -int16_t kvz_four_tap_filter_ver_avx2(int8_t *filter, kvz_pixel *data, int16_t stride) -{ - int16_t temp = 0; - for (int i = 0; i < 4; ++i) - { - temp += filter[i] * data[stride * i]; + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + + for (x = 0; x + 7 < width; x += 8) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x + 1; + __m128i* out = (__m128i*)&hor_pos0[y * hor_stride + x]; + __m128i chunk = _mm_loadl_epi64((__m128i*)&src[src_stride*ypos + xpos]); + chunk = _mm_cvtepu8_epi16(chunk); + chunk = _mm_slli_epi16(chunk, 6); // Multiply by 64 + _mm_storeu_si128(out, chunk); //TODO: >> shift1 + } } - return temp; -} + // Write the first column in contiguous memory + x = 0; + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int32_t first_sample = src[src_stride*ypos + x] << 6 >> shift1; + col_pos0[y] = first_sample; + } -int32_t kvz_four_tap_filter_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride) -{ - int32_t temp = 0; - for (int i = 0; i < 4; ++i) - { - temp += filter[i] * data[stride * i]; + // Half pixels + kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); + kvz_init_filter_taps(fir2, &taps_01_23, &taps_45_67); + + for (y = first_y; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + + for (x = 0; x + 7 < width; x += 8) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; + kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_pos2[y * hor_stride + x], + &shuf_01_23, &shuf_45_67, + &taps_01_23, &taps_45_67); //TODO: >> shift1 + } } - return temp; -} + // Write the first column in contiguous memory + x = 0; + for (y = first_y; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + col_pos2[y] = kvz_eight_tap_filter_hor_avx2(fir2, &src[src_stride*ypos + xpos]) >> shift1; + } -void kvz_eight_tap_filter_x4_hor_avx2(int8_t *filter, kvz_pixel *data, int shift, int16_t* dst) -{ - __m256i packed_data = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)data)), _mm_loadl_epi64((__m128i*)(data + 2)), 1); - __m256i packed_filter = _mm256_broadcastq_epi64(_mm_loadl_epi64((__m128i*)filter)); - __m256i idx_lookup = _mm256_broadcastsi128_si256(_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8)); + // VERTICAL STEP + kvz_pixel *out_l = filtered[0]; + kvz_pixel *out_r = filtered[1]; + kvz_pixel *out_t = filtered[2]; + kvz_pixel *out_b = filtered[3]; - __m256i temp = _mm256_shuffle_epi8(packed_data, idx_lookup); + __m256i taps[4]; + kvz_init_ver_filter_taps(fir0, taps); - temp = _mm256_maddubs_epi16(temp, packed_filter); - __m128i temp_128 = _mm_hadd_epi16(_mm256_extracti128_si256(temp, 0), _mm256_extracti128_si256(temp, 1)); - temp_128 = _mm_hadd_epi16(temp_128, temp_128); - temp_128 = _mm_srai_epi16(temp_128, shift); + // Right + for (y = 0; y + 7 < height; y+=8) { - _mm_storel_epi64((__m128i*)dst, temp_128); -} + for (x = 0; x + 7 < width ; x+=8) { + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos2[(y + 1) * hor_stride + x], hor_stride, &out_r[y * dst_stride + x], dst_stride); + } + } -void kvz_four_tap_filter_x4_hor_avx2(int8_t *filter, kvz_pixel *data, int shift, int16_t* dst) -{ - __m128i packed_data = _mm_loadl_epi64((__m128i*)data); - __m128i packed_filter = _mm_set1_epi32(*(int32_t*)filter); - __m128i 
idx_lookup = _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); + // Left + // Copy from the right filtered block and filter the extra column + for (y = 0; y < height; ++y) { + x = 0; + *(uint64_t*)&out_l[y * dst_stride + x] = *(uint64_t*)&out_r[y * dst_stride + x] << 8; + for (x = 8; x < width; x += 8) *(int64_t*)&out_l[y * dst_stride + x] = *(int64_t*)&out_r[y * dst_stride + x - 1]; + x = 0; + int16_t sample = 64 * col_pos2[y + 1 + KVZ_LUMA_FILTER_OFFSET] >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + out_l[y * dst_stride + x] = sample; + } - __m128i temp = _mm_shuffle_epi8(packed_data, idx_lookup); + kvz_init_ver_filter_taps(fir2, taps); + // Top + for (y = 0; y + 7 < height; y+=8) { + for (x = 0; x + 7 < width; x+=8) { + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos0[y * hor_stride + x], hor_stride, &out_t[y * dst_stride + x], dst_stride); + } + } - temp = _mm_maddubs_epi16(temp, packed_filter); - temp = _mm_hadd_epi16(temp, temp); - temp = _mm_srai_epi16(temp, shift); + // Bottom + // Copy what can be copied from the top filtered values. + // Then filter the last row from horizontal intermediate buffer. + for (y = 0; y < height - 1; ++y) { + for (x = 0; x + 7 < width; x += 8) { + *(int64_t*)&out_b[(y + 0) * dst_stride + x] = *(int64_t*)&out_t[(y + 1) * dst_stride + x]; + } + } - _mm_storel_epi64((__m128i*)dst, temp); + for (x = 0; x + 7 < width; x += 8) { + kvz_eight_tap_filter_ver_16bit_1x8_avx2(fir2, &hor_pos0[(y + 1) * hor_stride + x], hor_stride, &out_b[y * dst_stride + x]); + } } -void kvz_eight_tap_filter_x8_hor_avx2(int8_t *filter, kvz_pixel *data, int shift, int16_t* dst) +static void kvz_filter_hpel_blocks_diag_luma_avx2(const encoder_control_t * encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + int8_t fme_level, + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t hpel_off_x, int8_t hpel_off_y) { - __m256i packed_data = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)data)), _mm_loadu_si128((__m128i*)(data + 4)), 1); - __m256i packed_filter = _mm256_broadcastq_epi64(_mm_loadl_epi64((__m128i*)filter)); - __m256i idx_lookup0 = _mm256_broadcastsi128_si256(_mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8)); - __m256i idx_lookup1 = _mm256_broadcastsi128_si256(_mm_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10)); - - __m256i temp0 = _mm256_shuffle_epi8(packed_data, idx_lookup0); - __m256i temp1 = _mm256_shuffle_epi8(packed_data, idx_lookup1); - - temp0 = _mm256_maddubs_epi16(temp0, packed_filter); - temp1 = _mm256_maddubs_epi16(temp1, packed_filter); - temp0 = _mm256_hadd_epi16(temp0, temp1); - temp0 = _mm256_hadd_epi16(temp0, temp0); + int x, y; - temp0 = _mm256_srai_epi16(temp0, shift); + // Interpolation filter shifts + int32_t shift2 = 6; - temp0 = _mm256_permute4x64_epi64(temp0, _MM_SHUFFLE(3, 1, 2, 0)); + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); - _mm_storeu_si128((__m128i*)dst, _mm256_castsi256_si128(temp0)); -} + int8_t *fir2 = kvz_g_luma_filter[2]; -void kvz_four_tap_filter_x8_hor_avx2(int8_t *filter, kvz_pixel *data, int shift, int16_t* dst) -{ - __m256i packed_data = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)data)), _mm_loadl_epi64((__m128i*)(data + 4)), 1); - __m256i packed_filter = 
_mm256_set1_epi32(*(int32_t*)filter); - __m256i idx_lookup = _mm256_broadcastsi128_si256(_mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6)); + int16_t dst_stride = LCU_WIDTH; + int16_t hor_stride = LCU_WIDTH; - __m256i temp = _mm256_shuffle_epi8(packed_data, idx_lookup); + int16_t *hor_pos2 = hor_intermediate[1]; + int16_t *col_pos2 = hor_first_cols[2]; - temp = _mm256_maddubs_epi16(temp, packed_filter); - temp = _mm256_hadd_epi16(temp, temp); - temp = _mm256_srai_epi16(temp, shift); + // VERTICAL STEP + kvz_pixel *out_tl = filtered[0]; + kvz_pixel *out_tr = filtered[1]; + kvz_pixel *out_bl = filtered[2]; + kvz_pixel *out_br = filtered[3]; - _mm_storel_epi64((__m128i*)dst, _mm256_castsi256_si128(temp)); - _mm_storel_epi64((__m128i*)(dst + 4), _mm256_extracti128_si256(temp, 1)); -} + __m256i taps[4]; + kvz_init_ver_filter_taps(fir2, taps); + // Top-Right + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos2[y * hor_stride + x], hor_stride, &out_tr[y * dst_stride + x], dst_stride); + } + } -int32_t kvz_eight_tap_filter_x4_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride, int offset, int shift2, int shift3) -{ + // Top-left + // Copy from the top-right filtered block and filter the extra column + for (y = 0; y < height; ++y) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(fir2, &col_pos2[y]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + out_tl[y * dst_stride + x] = sample; - __m128i v_filter = _mm_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[0]))); - __m128i v_data0 = _mm_loadl_epi64((__m128i*)(data + stride * 0)); - __m128i v_data1 = _mm_loadl_epi64((__m128i*)(data + stride * 1)); - __m128i v_data = _mm_unpacklo_epi16(v_data0, v_data1); - __m128i temp = _mm_madd_epi16(v_filter, v_data); + for (x = 1; x < width; ++x) out_tl[y * dst_stride + x] = out_tr[y * dst_stride + x - 1]; + } - v_filter = _mm_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[2]))); - __m128i v_data2 = _mm_loadl_epi64((__m128i*)(data + stride * 2)); - __m128i v_data3 = _mm_loadl_epi64((__m128i*)(data + stride * 3)); - v_data = _mm_unpacklo_epi16(v_data2, v_data3); - temp = _mm_add_epi32(temp, _mm_madd_epi16(v_filter, v_data) ); + // Bottom-right + // Copy what can be copied from top-right filtered values. Filter the last row. + for (y = 0; y < height - 1; ++y) { + for (x = 0; x + 7 < width; x += 8) { + memcpy(&out_br[y * dst_stride + x], &out_tr[(y + 1) * dst_stride + x], 8); + } + } - temp = _mm_add_epi32(temp, _mm_set1_epi32(offset)); - temp = _mm_srai_epi32(temp, shift2 + shift3); + for (x = 0; x + 7 < width; x += 8) { + kvz_eight_tap_filter_ver_16bit_1x8_avx2(fir2, &hor_pos2[(y + 1) * hor_stride + x], hor_stride, &out_br[y * dst_stride + x]); + } - temp = _mm_packus_epi32(temp, temp); - temp = _mm_packus_epi16(temp, temp); + // Bottom-left + // Copy what can be copied from the top-left filtered values. + // Copy what can be copied from the bottom-right filtered values. + // Finally filter the last pixel from the column array. 
+ for (y = 0; y < height - 1; ++y) { + for (x = 0; x + 7 < width; x += 8) { + memcpy(&out_bl[y * dst_stride + x], &out_tl[(y + 1) * dst_stride + x], 8); + } + } - return _mm_cvtsi128_si32(temp); + for (x = 1; x < width; ++x) out_bl[y * dst_stride + x] = out_br[y * dst_stride + x - 1]; + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(fir2, &col_pos2[(y + 1)]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + out_bl[y * dst_stride + x] = sample; } -int32_t kvz_four_tap_filter_x4_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride, int offset, int shift2, int shift3) +static void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + int8_t fme_level, + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t hpel_off_x, int8_t hpel_off_y) { + int x, y; - __m128i v_filter = _mm_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[0]))); - __m128i v_data0 = _mm_loadl_epi64((__m128i*)(data + stride * 0)); - __m128i v_data1 = _mm_loadl_epi64((__m128i*)(data + stride * 1)); - __m128i v_data = _mm_unpacklo_epi16(v_data0, v_data1); - __m128i temp = _mm_madd_epi16(v_filter, v_data); + // Interpolation filter shifts + int16_t shift1 = KVZ_BIT_DEPTH - 8; + int32_t shift2 = 6; - v_filter = _mm_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[2]))); - __m128i v_data2 = _mm_loadl_epi64((__m128i*)(data + stride * 2)); - __m128i v_data3 = _mm_loadl_epi64((__m128i*)(data + stride * 3)); - v_data = _mm_unpacklo_epi16(v_data2, v_data3); - temp = _mm_add_epi32(temp, _mm_madd_epi16(v_filter, v_data) ); + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); - temp = _mm_add_epi32(temp, _mm_set1_epi32(offset)); - temp = _mm_srai_epi32(temp, shift2 + shift3); + int8_t *fir0 = kvz_g_luma_filter[0]; + int8_t *fir2 = kvz_g_luma_filter[2]; + int8_t *fir1 = kvz_g_luma_filter[1]; + int8_t *fir3 = kvz_g_luma_filter[3]; - temp = _mm_packus_epi32(temp, temp); - temp = _mm_packus_epi16(temp, temp); + // Horiziontal positions. Positions 0 and 2 have already been calculated in filtered. + int16_t *hor_pos0 = hor_intermediate[0]; + int16_t *hor_pos2 = hor_intermediate[1]; + int16_t *hor_pos_l = hor_intermediate[3]; + int16_t *hor_pos_r = hor_intermediate[4]; + int8_t *hor_fir_l = hpel_off_x != 0 ? fir1 : fir3; + int8_t *hor_fir_r = hpel_off_x != 0 ? fir3 : fir1; + int16_t *col_pos_l = hor_first_cols[1]; + int16_t *col_pos_r = hor_first_cols[3]; + + int16_t dst_stride = LCU_WIDTH; + int16_t hor_stride = LCU_WIDTH; + + int16_t *hor_hpel_pos = hpel_off_x != 0 ? hor_pos2 : hor_pos0; + int16_t *col_pos_hor = hpel_off_x != 0 ? hor_first_cols[2] : hor_first_cols[0]; + + // Specify if integer pixels are filtered from left or/and top integer samples + int off_x_fir_l = hpel_off_x < 1 ? 0 : 1; + int off_x_fir_r = hpel_off_x < 0 ? 0 : 1; + int off_y_fir_t = hpel_off_y < 1 ? 0 : 1; + int off_y_fir_b = hpel_off_y < 0 ? 0 : 1; + + // HORIZONTAL STEP + __m256i shuf_01_23, shuf_45_67; + __m256i taps_01_23, taps_45_67; + + // Left QPEL + kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); + kvz_init_filter_taps(hor_fir_l, &taps_01_23, &taps_45_67); + + int sample_off_y = hpel_off_y < 0 ? 
0 : 1; - return _mm_cvtsi128_si32(temp); -} + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { -void kvz_eight_tap_filter_x8_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride, int offset, int shift2, int shift3, kvz_pixel* dst) -{ + for (x = 0; x + 7 < width; x += 8) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; + kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_pos_l[y * hor_stride + x], + &shuf_01_23, &shuf_45_67, + &taps_01_23, &taps_45_67); //TODO: >> shift1 + } + } - __m256i v_filter = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[0]))); - __m256i v_data0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 0))); - __m256i v_data1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 1))); - __m256i v_data = _mm256_or_si256(v_data0, _mm256_slli_epi32(v_data1, 16)); - __m256i temp = _mm256_madd_epi16(v_filter, v_data); - - v_filter = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[2]))); - __m256i v_data2 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 2))); - __m256i v_data3 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 3))); - v_data = _mm256_or_si256(v_data2, _mm256_slli_epi32(v_data3, 16)); - temp = _mm256_add_epi32(temp, _mm256_madd_epi16(v_filter, v_data) ); - - v_filter = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[4]))); - __m256i v_data4 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 4))); - __m256i v_data5 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 5))); - v_data = _mm256_or_si256(v_data4, _mm256_slli_epi32(v_data5, 16)); - temp = _mm256_add_epi32(temp, _mm256_madd_epi16(v_filter, v_data) ); - - v_filter = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[6]))); - __m256i v_data6 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 6))); - __m256i v_data7 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 7))); - v_data = _mm256_or_si256(v_data6, _mm256_slli_epi32(v_data7, 16)); - temp = _mm256_add_epi32(temp, _mm256_madd_epi16(v_filter, v_data) ); - - temp = _mm256_add_epi32(temp, _mm256_set1_epi32(offset)); - temp = _mm256_srai_epi32(temp, shift2 + shift3); - - temp = _mm256_packus_epi32(temp, temp); - temp = _mm256_packus_epi16(temp, temp); - - *(int32_t*)dst = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp)); - *(int32_t*)(dst + 4) = _mm_cvtsi128_si32(_mm256_extracti128_si256(temp, 1)); -} + // Write the first column in contiguous memory + x = 0; + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + col_pos_l[y] = kvz_eight_tap_filter_hor_avx2(hor_fir_l, &src[src_stride*ypos + xpos]) >> shift1; + } -void kvz_four_tap_filter_x8_ver_16bit_avx2(int8_t *filter, int16_t *data, int16_t stride, int offset, int shift2, int shift3, kvz_pixel* dst) -{ + // Right QPEL + kvz_init_filter_taps(hor_fir_r, &taps_01_23, &taps_45_67); - __m256i v_filter = _mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[0]))); - __m256i v_data0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 0))); - __m256i v_data1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 1))); - __m256i v_data = _mm256_or_si256(v_data0, _mm256_slli_epi32(v_data1, 16)); - __m256i temp = _mm256_madd_epi16(v_filter, v_data); + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { - v_filter = 
_mm256_cvtepi8_epi16(_mm_set1_epi16(*(int16_t*)&(filter[2]))); - __m256i v_data2 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 2))); - __m256i v_data3 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(data + stride * 3))); - v_data = _mm256_or_si256(v_data2, _mm256_slli_epi32(v_data3, 16)); - temp = _mm256_add_epi32(temp, _mm256_madd_epi16(v_filter, v_data) ); + for (x = 0; x + 7 < width; x += 8) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; + kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_pos_r[y * hor_stride + x], + &shuf_01_23, &shuf_45_67, + &taps_01_23, &taps_45_67); //TODO: >> shift1 + } + } - temp = _mm256_add_epi32(temp, _mm256_set1_epi32(offset)); - temp = _mm256_srai_epi32(temp, shift2 + shift3); + // Write the first column in contiguous memory + x = 0; + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + col_pos_r[y] = kvz_eight_tap_filter_hor_avx2(hor_fir_r, &src[src_stride*ypos + xpos]) >> shift1; + } - temp = _mm256_packus_epi32(temp, temp); - temp = _mm256_packus_epi16(temp, temp); + // VERTICAL STEP + kvz_pixel *out_l = filtered[0]; + kvz_pixel *out_r = filtered[1]; + kvz_pixel *out_t = filtered[2]; + kvz_pixel *out_b = filtered[3]; - *(int32_t*)dst = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp)); - *(int32_t*)(dst + 4) = _mm_cvtsi128_si32(_mm256_extracti128_si256(temp, 1)); -} + int8_t *ver_fir_l = hpel_off_y != 0 ? fir2 : fir0; + int8_t *ver_fir_r = hpel_off_y != 0 ? fir2 : fir0; + int8_t *ver_fir_t = hpel_off_y != 0 ? fir1 : fir3; + int8_t *ver_fir_b = hpel_off_y != 0 ? fir3 : fir1; -void kvz_filter_inter_quarterpel_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag) -{ + __m256i taps[4]; - int32_t x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); + // Left QPEL (1/4 or 3/4 x positions) + // Filter block and then filter column and align if neccessary + kvz_init_ver_filter_taps(ver_fir_l, taps); - //coefficients for 1/4, 2/4 and 3/4 positions - int8_t *c0, *c1, *c2, *c3; + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + int ypos = y + sample_off_y; + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_l[ypos * hor_stride + x], hor_stride, &out_l[y * dst_stride + x], dst_stride); + } + } - c0 = kvz_g_luma_filter[0]; - c1 = kvz_g_luma_filter[1]; - c2 = kvz_g_luma_filter[2]; - c3 = kvz_g_luma_filter[3]; + if (!off_x_fir_l) { + for (y = 0; y < height; ++y) { + for (x = width - 8; x >= 8; x -= 8) { + uint64_t chunk = *(uint64_t*)&out_l[y * dst_stride + x - 1]; + *(uint64_t*)&out_l[y * dst_stride + x] = chunk; + } - int16_t flipped_hor_filtered[MAX_HEIGHT][MAX_WIDTH]; + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(ver_fir_l, &col_pos_l[y + sample_off_y]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + uint64_t first = sample; + uint64_t rest = *(uint64_t*)&out_l[y * dst_stride + x]; + uint64_t chunk = (rest << 8) | first; + *(uint64_t*)&out_l[y * dst_stride + x] = chunk; + } + } - // Filter horizontally and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height; y += 8) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; + // Right QPEL (3/4 or 1/4 x 
positions) + // Filter block and then filter column and align if neccessary + kvz_init_ver_filter_taps(ver_fir_r, taps); - kvz_eight_tap_filter_and_flip_avx2(kvz_g_luma_filter, &src[src_stride*ypos + xpos], src_stride, (int16_t*)&(flipped_hor_filtered[4 * x + 0][y])); - + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + int ypos = y + sample_off_y; + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_r[ypos * hor_stride + x], hor_stride, &out_r[y * dst_stride + x], dst_stride); } + } - for (; y < height + FILTER_SIZE - 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped_hor_filtered[4 * x + 0][y] = kvz_eight_tap_filter_hor_avx2(c0, &src[src_stride*ypos + xpos]) << shift1; - flipped_hor_filtered[4 * x + 1][y] = kvz_eight_tap_filter_hor_avx2(c1, &src[src_stride*ypos + xpos]) << shift1; - flipped_hor_filtered[4 * x + 2][y] = kvz_eight_tap_filter_hor_avx2(c2, &src[src_stride*ypos + xpos]) << shift1; - flipped_hor_filtered[4 * x + 3][y] = kvz_eight_tap_filter_hor_avx2(c3, &src[src_stride*ypos + xpos]) << shift1; + if (!off_x_fir_r) { + for (y = 0; y < height; ++y) { + for (x = width - 8; x >= 8; x -= 8) { + uint64_t chunk = *(uint64_t*)&out_r[y * dst_stride + x - 1]; + *(uint64_t*)&out_r[y * dst_stride + x] = chunk; + } + + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(ver_fir_r, &col_pos_r[y + sample_off_y]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + uint64_t first = sample; + uint64_t rest = *(uint64_t*)&out_r[y * dst_stride + x]; + uint64_t chunk = (rest << 8) | first; + *(uint64_t*)&out_r[y * dst_stride + x] = chunk; } } - // Filter vertically and flip x and y - for (y = 0; y < height; ++y) { - for (x = 0; x < 4 * width - 3; x += 4) { + + // Top QPEL (1/4 or 3/4 y positions) + // Filter block and then filter column and align if neccessary + int sample_off_x = (hpel_off_x > -1 ? 
1 : 0); + kvz_init_ver_filter_taps(ver_fir_t, taps); + + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + int ypos = y + off_y_fir_t; + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_hpel_pos[ypos * hor_stride + x], hor_stride, &out_t[y * dst_stride + x], dst_stride); + } + } - eight_tap_filter_and_flip_16bit_avx2(kvz_g_luma_filter, &flipped_hor_filtered[x][y], MAX_WIDTH, offset23, shift2 + shift3, &(dst[(4 * y + 0)*dst_stride + x]), dst_stride); + if (!sample_off_x) { + for (y = 0; y < height; ++y) { + for (x = width - 8; x >= 8; x -= 8) { + uint64_t chunk = *(uint64_t*)&out_t[y * dst_stride + x - 1]; + *(uint64_t*)&out_t[y * dst_stride + x] = chunk; + } + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(ver_fir_t, &col_pos_hor[y + off_y_fir_t]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + uint64_t first = sample; + uint64_t rest = *(uint64_t*)&out_t[y * dst_stride + x]; + uint64_t chunk = (rest << 8) | first; + *(uint64_t*)&out_t[y * dst_stride + x] = chunk; } - } -} + // Bottom QPEL (3/4 or 1/4 y positions) + // Filter block and then filter column and align if neccessary + kvz_init_ver_filter_taps(ver_fir_b, taps); -/** -* \brief Interpolation for chroma half-pixel -* \param src source image in integer pels (-2..width+3, -2..height+3) -* \param src_stride stride of source image -* \param width width of source image block -* \param height height of source image block -* \param dst destination image in half-pixel resolution -* \param dst_stride stride of destination image -* -*/ -void kvz_filter_inter_halfpel_chroma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag) -{ - /* ____________ - * | B0,0|ae0,0| - * |ea0,0|ee0,0| - * - * ae0,0 = (-4*B-1,0 + 36*B0,0 + 36*B1,0 - 4*B2,0) >> shift1 - * ea0,0 = (-4*B0,-1 + 36*B0,0 + 36*B0,1 - 4*B0,2) >> shift1 - * ee0,0 = (-4*ae0,-1 + 36*ae0,0 + 36*ae0,1 - 4*ae0,2) >> shift2 - */ - int32_t x, y; - int32_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset3 = 1 << (shift3 - 1); - int32_t offset23 = 1 << (shift2 + shift3 - 1); - - int8_t* c = kvz_g_chroma_filter[4]; - int16_t temp[4] = {0,0,0,0}; - - // Loop source pixels and generate four filtered half-pel pixels on each round - for (y = 0; y < height; y++) { - int dst_pos_y = (y << 1)*dst_stride; - int src_pos_y = y*src_stride; - for (x = 0; x < width; x++) { - // Calculate current dst and src pixel positions - int dst_pos = dst_pos_y + (x << 1); - int src_pos = src_pos_y + x; - - // Original pixel (not really needed) - dst[dst_pos] = src[src_pos]; //B0,0 - - // ae0,0 - We need this only when hor_flag and for ee0,0 - if (hor_flag) { - temp[1] = kvz_four_tap_filter_hor_avx2(c, &src[src_pos - 1]) >> shift1; // ae0,0 - } - // ea0,0 - needed only when ver_flag - if (ver_flag) { - dst[dst_pos + 1 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_avx2(c, &src[src_pos - src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); // ea0,0 - } + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + int ypos = y + off_y_fir_b; + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_hpel_pos[ypos * hor_stride + x], hor_stride, &out_b[y * dst_stride + x], dst_stride); + } + } - // When both flags, we use _only_ this pixel (but still need ae0,0 for it) - if (hor_flag && ver_flag) { - // 
Calculate temporary values.. - src_pos -= src_stride; //0,-1 - temp[0] = (kvz_four_tap_filter_hor_avx2(c, &src[src_pos - 1]) >> shift1); // ae0,-1 - src_pos += 2 * src_stride; //0,1 - temp[2] = (kvz_four_tap_filter_hor_avx2(c, &src[src_pos - 1]) >> shift1); // ae0,1 - src_pos += src_stride; //0,2 - temp[3] = (kvz_four_tap_filter_hor_avx2(c, &src[src_pos - 1]) >> shift1); // ae0,2 - - dst[dst_pos + 1 * dst_stride + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_avx2(c, temp) + offset23) >> shift2) >> shift3); // ee0,0 + if (!sample_off_x) { + for (y = 0; y < height; ++y) { + for (x = width - 8; x >= 8; x -= 8) { + uint64_t chunk = *(uint64_t*)&out_b[y * dst_stride + x - 1]; + *(uint64_t*)&out_b[y * dst_stride + x] = chunk; } - if (hor_flag) { - dst[dst_pos + 1] = kvz_fast_clip_32bit_to_pixel((temp[1] + offset3) >> shift3); - } + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(ver_fir_b, &col_pos_hor[y + off_y_fir_b]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + uint64_t first = sample; + uint64_t rest = *(uint64_t*)&out_b[y * dst_stride + x]; + uint64_t chunk = (rest << 8) | first; + *(uint64_t*)&out_b[y * dst_stride + x] = chunk; } } } -void kvz_filter_inter_octpel_chroma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag) +static void kvz_filter_qpel_blocks_diag_luma_avx2(const encoder_control_t * encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + int8_t fme_level, + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t hpel_off_x, int8_t hpel_off_y) { + int x, y; - int32_t x, y; - int32_t shift1 = KVZ_BIT_DEPTH - 8; + // Interpolation filter shifts int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset3 = 1 << (shift3 - 1); - int32_t offset23 = 1 << (shift2 + shift3 - 1); - - //coefficients for 1/8, 2/8, 3/8, 4/8, 5/8, 6/8 and 7/8 positions - int8_t *c1, *c2, *c3, *c4, *c5, *c6, *c7; - - int i; - c1 = kvz_g_chroma_filter[1]; - c2 = kvz_g_chroma_filter[2]; - c3 = kvz_g_chroma_filter[3]; - c4 = kvz_g_chroma_filter[4]; - c5 = kvz_g_chroma_filter[5]; - c6 = kvz_g_chroma_filter[6]; - c7 = kvz_g_chroma_filter[7]; - - int16_t temp[7][4]; // Temporary horizontal values calculated from integer pixels - - - // Loop source pixels and generate 64 filtered 1/8-pel pixels on each round - for (y = 0; y < height; y++) { - int dst_pos_y = (y << 3)*dst_stride; - int src_pos_y = y*src_stride; - for (x = 0; x < width; x++) { - // Calculate current dst and src pixel positions - int dst_pos = dst_pos_y + (x << 3); - int src_pos = src_pos_y + x; - - // Original pixel - dst[dst_pos] = src[src_pos]; - - // Horizontal 1/8-values - if (hor_flag && !ver_flag) { - - temp[0][1] = (kvz_four_tap_filter_hor_avx2(c1, &src[src_pos - 1]) >> shift1); // ae0,0 h0 - temp[1][1] = (kvz_four_tap_filter_hor_avx2(c2, &src[src_pos - 1]) >> shift1); - temp[2][1] = (kvz_four_tap_filter_hor_avx2(c3, &src[src_pos - 1]) >> shift1); - temp[3][1] = (kvz_four_tap_filter_hor_avx2(c4, &src[src_pos - 1]) >> shift1); - temp[4][1] = (kvz_four_tap_filter_hor_avx2(c5, &src[src_pos - 1]) >> shift1); - temp[5][1] = (kvz_four_tap_filter_hor_avx2(c6, &src[src_pos - 1]) >> shift1); - temp[6][1] = (kvz_four_tap_filter_hor_avx2(c7, &src[src_pos - 1]) >> shift1); - } + // Weighted 
prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); - // Vertical 1/8-values - if (ver_flag) { - dst[dst_pos + 1 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_avx2(c1, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); // - dst[dst_pos + 2 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_avx2(c2, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 3 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_avx2(c3, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 4 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_avx2(c4, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 5 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_avx2(c5, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 6 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_avx2(c6, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 7 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_avx2(c7, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - } + int8_t *fir1 = kvz_g_luma_filter[1]; + int8_t *fir3 = kvz_g_luma_filter[3]; - // When both flags, interpolate values from temporary horizontal values - if (hor_flag && ver_flag) { + int16_t *hor_pos_l = hor_intermediate[3]; + int16_t *hor_pos_r = hor_intermediate[4]; - // Calculate temporary values - src_pos -= 1 * src_stride; //0,-3 - for (i = 0; i < 4; ++i) { + int16_t *col_pos_l = hor_first_cols[1]; + int16_t *col_pos_r = hor_first_cols[3]; - temp[0][i] = (kvz_four_tap_filter_hor_avx2(c1, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[1][i] = (kvz_four_tap_filter_hor_avx2(c2, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[2][i] = (kvz_four_tap_filter_hor_avx2(c3, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[3][i] = (kvz_four_tap_filter_hor_avx2(c4, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[4][i] = (kvz_four_tap_filter_hor_avx2(c5, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[5][i] = (kvz_four_tap_filter_hor_avx2(c6, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[6][i] = (kvz_four_tap_filter_hor_avx2(c7, &src[src_pos + i * src_stride - 1]) >> shift1); + int16_t dst_stride = LCU_WIDTH; + int16_t hor_stride = LCU_WIDTH; - } + // VERTICAL STEP + kvz_pixel *out_tl = filtered[0]; + kvz_pixel *out_tr = filtered[1]; + kvz_pixel *out_bl = filtered[2]; + kvz_pixel *out_br = filtered[3]; + int8_t *ver_fir_t = hpel_off_y != 0 ? fir1 : fir3; + int8_t *ver_fir_b = hpel_off_y != 0 ? 
fir3 : fir1; - //Calculate values from temporary horizontal 1/8-values - for (i = 0; i<7; ++i){ - dst[dst_pos + 1 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_avx2(c1, &temp[i][0]) + offset23) >> shift2) >> shift3); // ee0,0 - dst[dst_pos + 2 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_avx2(c2, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 3 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_avx2(c3, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 4 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_avx2(c4, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 5 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_avx2(c5, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 6 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_avx2(c6, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 7 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_avx2(c7, &temp[i][0]) + offset23) >> shift2) >> shift3); + // Specify if integer pixels are filtered from left or/and top integer samples + int off_x_fir_l = hpel_off_x < 1 ? 0 : 1; + int off_x_fir_r = hpel_off_x < 0 ? 0 : 1; + int off_y_fir_t = hpel_off_y < 1 ? 0 : 1; + int off_y_fir_b = hpel_off_y < 0 ? 0 : 1; - } + __m256i taps[4]; + // Top-left QPEL + // Filter block and then filter column and align if neccessary + kvz_init_ver_filter_taps(ver_fir_t, taps); - } + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + int ypos = y + off_y_fir_t; + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_l[ypos * hor_stride + x], hor_stride, &out_tl[y * dst_stride + x], dst_stride); + } + } - if (hor_flag) { - dst[dst_pos + 1] = kvz_fast_clip_32bit_to_pixel((temp[0][1] + offset3) >> shift3); - dst[dst_pos + 2] = kvz_fast_clip_32bit_to_pixel((temp[1][1] + offset3) >> shift3); - dst[dst_pos + 3] = kvz_fast_clip_32bit_to_pixel((temp[2][1] + offset3) >> shift3); - dst[dst_pos + 4] = kvz_fast_clip_32bit_to_pixel((temp[3][1] + offset3) >> shift3); - dst[dst_pos + 5] = kvz_fast_clip_32bit_to_pixel((temp[4][1] + offset3) >> shift3); - dst[dst_pos + 6] = kvz_fast_clip_32bit_to_pixel((temp[5][1] + offset3) >> shift3); - dst[dst_pos + 7] = kvz_fast_clip_32bit_to_pixel((temp[6][1] + offset3) >> shift3); + if (!off_x_fir_l) { + for (y = 0; y < height; ++y) { + for (x = width - 8; x >= 8; x -= 8) { + uint64_t chunk = *(uint64_t*)&out_tl[y * dst_stride + x - 1]; + *(uint64_t*)&out_tl[y * dst_stride + x] = chunk; } - + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(ver_fir_t, &col_pos_l[y + off_y_fir_t]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + uint64_t first = sample; + uint64_t rest = *(uint64_t*)&out_tl[y * dst_stride + x]; + uint64_t chunk = (rest << 8) | first; + *(uint64_t*)&out_tl[y * dst_stride + x] = chunk; } } -} -void kvz_filter_hpel_blocks_hor_ver_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) -{ - int x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); + // Top-right QPEL + // Filter block and then filter column and align if neccessary - int8_t *fir0 = 
kvz_g_luma_filter[0]; - int8_t *fir2 = kvz_g_luma_filter[2]; + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + int ypos = y + off_y_fir_t; + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_r[ypos * hor_stride + x], hor_stride, &out_tr[y * dst_stride + x], dst_stride); + } + } - int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + if (!off_x_fir_r) { + for (y = 0; y < height; ++y) { + for (x = width - 8; x >= 8; x -= 8) { + uint64_t chunk = *(uint64_t*)&out_tr[y * dst_stride + x - 1]; + *(uint64_t*)&out_tr[y * dst_stride + x] = chunk; + } - int16_t temp_stride = height + KVZ_EXT_PADDING + 1; - int16_t dst_stride = (LCU_WIDTH + 1); + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(ver_fir_t, &col_pos_r[y + off_y_fir_t]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + uint64_t first = sample; + uint64_t rest = *(uint64_t*)&out_tr[y * dst_stride + x]; + uint64_t chunk = (rest << 8) | first; + *(uint64_t*)&out_tr[y * dst_stride + x] = chunk; + } + } - // Horizontal positions - for (x = 0; x < width + 1; ++x) { - for (y = 0; y + 8 < height + KVZ_EXT_PADDING + 1; y += 8) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - int8_t *firs[2] = { fir0, fir2 }; - int16_t *dsts[2] = { &flipped0[x * temp_stride + y], &flipped2[x * temp_stride + y] }; - kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], src_stride, &firs[0], shift1, &dsts[0]); + // Bottom-left QPEL + // Filter block and then filter column and align if neccessary + kvz_init_ver_filter_taps(ver_fir_b, taps); + + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + int ypos = y + off_y_fir_b; + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_l[ypos * hor_stride + x], hor_stride, &out_bl[y * dst_stride + x], dst_stride); } + } - for (; y < height + KVZ_EXT_PADDING + 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir0, &src[src_stride*ypos + xpos]) >> shift1; - flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir2, &src[src_stride*ypos + xpos]) >> shift1; + if (!off_x_fir_l) { + for (y = 0; y < height; ++y) { + for (x = width - 8; x >= 8; x -= 8) { + uint64_t chunk = *(uint64_t*)&out_bl[y * dst_stride + x - 1]; + *(uint64_t*)&out_bl[y * dst_stride + x] = chunk; + } + + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(ver_fir_b, &col_pos_l[y + off_y_fir_b]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + uint64_t first = sample; + uint64_t rest = *(uint64_t*)&out_bl[y * dst_stride + x]; + uint64_t chunk = (rest << 8) | first; + *(uint64_t*)&out_bl[y * dst_stride + x] = chunk; } } - // Filter vertically and flip x and y - for (x = 0; x + 8 < width + 1; x += 8) { - for (y = 0; y < height + 1; ++y) { - int8_t *firs[2] = { fir0, fir2 }; - kvz_pixel *dsts[2] = { &filtered[HPEL_POS_HOR][y * dst_stride + x], &filtered[HPEL_POS_VER][y * dst_stride + x]}; - int16_t *srcs[2] = {flipped2 + x * temp_stride + y, flipped0 + x * temp_stride + y }; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(srcs, temp_stride, firs, offset23, shift2 + shift3, dsts); + // Bottom-right QPEL + // Filter block and then filter column and align if neccessary + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + int ypos = y + off_y_fir_b; + 
kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_pos_r[ypos * hor_stride + x], hor_stride, &out_br[y * dst_stride + x], dst_stride); } } - // The remaining pixels - for (; x < width + 1; ++x) { - for (y = 0; y < height + 1; ++y) { - filtered[HPEL_POS_HOR][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[HPEL_POS_VER][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + if (!off_x_fir_r) { + for (y = 0; y < height; ++y) { + for (x = width - 8; x >= 8; x -= 8) { + uint64_t chunk = *(uint64_t*)&out_br[y * dst_stride + x - 1]; + *(uint64_t*)&out_br[y * dst_stride + x] = chunk; + } + + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_avx2(ver_fir_b, &col_pos_r[y + off_y_fir_b]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + uint64_t first = sample; + uint64_t rest = *(uint64_t*)&out_br[y * dst_stride + x]; + uint64_t chunk = (rest << 8) | first; + *(uint64_t*)&out_br[y * dst_stride + x] = chunk; } } } -void kvz_filter_hpel_blocks_full_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) +static void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel *dst, + int16_t dst_stride, + int8_t hor_flag, + int8_t ver_flag, + const int16_t mv[2]) { + // TODO: Optimize SMP and AMP + if (width != height) { + kvz_sample_quarterpel_luma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); + return; + } + int x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - int8_t *fir0 = kvz_g_luma_filter[0]; - int8_t *fir2 = kvz_g_luma_filter[2]; + int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; + int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; - int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t hor_stride = LCU_WIDTH; + int16_t hor_intermediate[KVZ_EXT_BLOCK_W_LUMA * LCU_WIDTH]; - int16_t temp_stride = height + KVZ_EXT_PADDING + 1; - int16_t dst_stride = (LCU_WIDTH + 1); + // HORIZONTAL STEP + __m256i shuf_01_23, shuf_45_67; + __m256i taps_01_23, taps_45_67; - // Horizontal positions - for (x = 0; x < width + 1; ++x) { - for (y = 0; y + 8 < height + KVZ_EXT_PADDING + 1; y += 8) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - int8_t *firs[2] = { fir0, fir2 }; - int16_t *dsts[2] = { &flipped0[x * temp_stride + y], &flipped2[x * temp_stride + y] }; - kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], src_stride, &firs[0], shift1, &dsts[0]); + kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); + kvz_init_filter_taps(hor_fir, &taps_01_23, &taps_45_67); - } + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) { - for (; y < height + KVZ_EXT_PADDING + 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir0, &src[src_stride*ypos + xpos]) >> shift1; - flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir2, &src[src_stride*ypos + xpos]) >> shift1; + for (x = 0; x + 7 < width; x += 8) { + int ypos = y - 
KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_intermediate[y * hor_stride + x], + &shuf_01_23, &shuf_45_67, + &taps_01_23, &taps_45_67); //TODO: >> shift1 } } - // Filter vertically and flip x and y - for (x = 0; x + 8 < width + 1; x += 8) { - for (y = 0; y < height + 1; ++y) { - int8_t *firs[2] = { fir0, fir2 }; - kvz_pixel *dsts[2] = { &filtered[HPEL_POS_HOR][y * dst_stride + x], &filtered[HPEL_POS_VER][y * dst_stride + x]}; - int16_t *srcs[2] = {flipped2 + x * temp_stride + y, flipped0 + x * temp_stride + y }; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(srcs, temp_stride, firs, offset23, shift2 + shift3, dsts); - kvz_filter_flip_round_clip_x8_16bit_avx2(flipped2 + x * temp_stride + y, temp_stride, fir2, offset23, shift2 + shift3, &filtered[HPEL_POS_DIA][y * dst_stride + x]); + // VERTICAL STEP + __m256i taps[4]; + kvz_init_ver_filter_taps(ver_fir, taps); - } - } - - // The remaining pixels - for (; x < width + 1; ++x) { - for (y = 0; y < height + 1; ++y) { - filtered[HPEL_POS_HOR][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[HPEL_POS_VER][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[HPEL_POS_DIA][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + kvz_eight_tap_filter_ver_16bit_8x8_avx2(taps, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); } } } -void kvz_filter_qpel_blocks_hor_ver_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) +static void kvz_sample_14bit_quarterpel_luma_avx2(const encoder_control_t * const encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + int16_t *dst, + int16_t dst_stride, + int8_t hor_flag, + int8_t ver_flag, + const int16_t mv[2]) { + // TODO: Optimize SMP and AMP + if (width != height) { + kvz_sample_14bit_quarterpel_luma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); + return; + } + // TODO: horizontal and vertical only filtering int x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - int8_t *fir0 = kvz_g_luma_filter[0]; - int8_t *fir2 = kvz_g_luma_filter[2]; - int8_t *fir1 = kvz_g_luma_filter[1]; - int8_t *fir3 = kvz_g_luma_filter[3]; + int8_t *hor_fir = kvz_g_luma_filter[mv[0] & 3]; + int8_t *ver_fir = kvz_g_luma_filter[mv[1] & 3]; - int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped1[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped3[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t hor_stride = LCU_WIDTH; + int16_t hor_intermediate[KVZ_EXT_BLOCK_W_LUMA * LCU_WIDTH]; - int16_t temp_stride = height + KVZ_EXT_PADDING + 1; - int16_t dst_stride = (LCU_WIDTH + 1); - - // Horizontal positions - for (x = 0; x < width + 1; ++x) { - for (y = 0; y + 8 < height + KVZ_EXT_PADDING + 1; y += 8) { - int ypos = y - FILTER_OFFSET; - int xpos = 
x - FILTER_OFFSET; - int8_t *firs[4] = { fir0, fir2, fir1, fir3 }; - int16_t *dsts[4] = { &flipped0[x * temp_stride + y], &flipped2[x * temp_stride + y], &flipped1[x * temp_stride + y], &flipped3[x * temp_stride + y]}; - kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], src_stride, &firs[0], shift1, &dsts[0]); - kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], src_stride, &firs[2], shift1, &dsts[2]); - } + // HORIZONTAL STEP + __m256i shuf_01_23, shuf_45_67; + __m256i taps_01_23, taps_45_67; - for (y = 0; y < height + KVZ_EXT_PADDING + 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir0, &src[src_stride*ypos + xpos]) >> shift1; - flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir2, &src[src_stride*ypos + xpos]) >> shift1; - flipped1[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir1, &src[src_stride*ypos + xpos]) >> shift1; - flipped3[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir3, &src[src_stride*ypos + xpos]) >> shift1; - } - } + kvz_init_shuffle_masks(&shuf_01_23, &shuf_45_67); + kvz_init_filter_taps(hor_fir, &taps_01_23, &taps_45_67); + + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) { - // Filter vertically and flip x and y - for (x = 0; x + 8 < width + 1; x += 8) { - for (y = 0; y < height + 1; ++y) { - - // HPEL - int8_t *firs0[2] = { fir0, fir2 }; - kvz_pixel *dsts0[2] = { &filtered[0][y * dst_stride + x], &filtered[1][y * dst_stride + x]}; - int16_t *srcs0[4] = { flipped2 + x * temp_stride + y, flipped0 + x * temp_stride + y}; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(srcs0, temp_stride, firs0, offset23, shift2 + shift3, dsts0); - kvz_filter_flip_round_clip_x8_16bit_avx2(flipped2 + x * temp_stride + y, temp_stride, fir2, offset23, shift2 + shift3, &filtered[2][y * dst_stride + x]); - - // QPEL - // Horizontal - int8_t *firs1[4] = { fir0, fir0, fir2, fir2 }; - kvz_pixel *dsts1[4] = { &filtered[3][y * dst_stride + x], &filtered[4][y * dst_stride + x], - &filtered[5][y * dst_stride + x], &filtered[6][y * dst_stride + x] }; - int16_t *srcs1[4] = { flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, - flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, }; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs1[0], temp_stride, &firs1[0], offset23, shift2 + shift3, &dsts1[0]); - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs1[2], temp_stride, &firs1[2], offset23, shift2 + shift3, &dsts1[2]); - - // Vertical - int8_t *firs2[4] = { fir1, fir1, fir3, fir3 }; - kvz_pixel *dsts2[4] = { &filtered[7][y * dst_stride + x], &filtered[8][y * dst_stride + x], - &filtered[9][y * dst_stride + x], &filtered[10][y * dst_stride + x] }; - int16_t *srcs2[4] = { flipped0 + x * temp_stride + y, flipped2 + x * temp_stride + y, - flipped0 + x * temp_stride + y, flipped2 + x * temp_stride + y, }; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs2[0], temp_stride, &firs2[0], offset23, shift2 + shift3, &dsts2[0]); - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs2[2], temp_stride, &firs2[2], offset23, shift2 + shift3, &dsts2[2]); + for (x = 0; x + 7 < width; x += 8) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + kvz_eight_tap_filter_hor_8x1_avx2(&src[src_stride*ypos + xpos], &hor_intermediate[y * hor_stride + x], + &shuf_01_23, &shuf_45_67, + &taps_01_23, &taps_45_67); //TODO: >> shift1 } } - // The remaining pixels - for (; x < width + 1; ++x) { - for (y = 0; y < height + 
1; ++y) { - - // HPEL - filtered[0][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[1][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[2][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // QPEL - // Horizontal - filtered[3][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[4][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[5][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[6][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // Vertical - filtered[7][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[8][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[9][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[10][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + // VERTICAL STEP + __m256i taps[4]; + kvz_init_ver_filter_taps(ver_fir, taps); + + for (y = 0; y + 7 < height; y += 8) { + for (x = 0; x + 7 < width; x += 8) { + kvz_eight_tap_filter_ver_16bit_8x8_no_round_avx2(taps, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); } } } -void kvz_filter_qpel_blocks_full_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) + +static void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel *dst, + int16_t dst_stride, + int8_t hor_flag, + int8_t ver_flag, + const int16_t mv[2]) { + // TODO: Optimize SMP and AMP + if (width != height) { + kvz_sample_octpel_chroma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); + return; + } int x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - int8_t *fir0 = kvz_g_luma_filter[0]; - int8_t *fir2 = kvz_g_luma_filter[2]; - int8_t *fir1 = kvz_g_luma_filter[1]; - int8_t *fir3 = kvz_g_luma_filter[3]; + int8_t *hor_fir = kvz_g_chroma_filter[mv[0] & 7]; + int8_t *ver_fir = kvz_g_chroma_filter[mv[1] & 7]; - int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped1[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t 
flipped3[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + int16_t hor_stride = LCU_WIDTH_C; + int16_t hor_intermediate[KVZ_EXT_BLOCK_W_CHROMA * LCU_WIDTH_C]; - int16_t temp_stride = height + KVZ_EXT_PADDING + 1; - int16_t dst_stride = (LCU_WIDTH + 1); - - // Horizontal positions - for (x = 0; x < (width + 1); ++x) { - for (y = 0; y + 8 < height + KVZ_EXT_PADDING + 1; y += 8) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - int8_t *firs[4] = { fir0, fir2, fir1, fir3 }; - int16_t *dsts[4] = { &flipped0[x * temp_stride + y], &flipped2[x * temp_stride + y], &flipped1[x * temp_stride + y], &flipped3[x * temp_stride + y]}; - kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], src_stride, &firs[0], shift1, &dsts[0]); - kvz_filter_flip_shift_x8_dual_avx2(&src[src_stride*ypos + xpos], src_stride, &firs[2], shift1, &dsts[2]); - } + // HORIZONTAL STEP + __m256i shuf_01, shuf_23; + __m256i taps_01, taps_23; - for (; y < height + KVZ_EXT_PADDING + 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir0, &src[src_stride*ypos + xpos]) >> shift1; - flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir2, &src[src_stride*ypos + xpos]) >> shift1; - flipped1[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir1, &src[src_stride*ypos + xpos]) >> shift1; - flipped3[x * temp_stride + y] = kvz_eight_tap_filter_hor_avx2(fir3, &src[src_stride*ypos + xpos]) >> shift1; - } - } + kvz_init_shuffle_masks_chroma(&shuf_01, &shuf_23); + kvz_init_filter_taps_chroma(hor_fir, &taps_01, &taps_23); + + for (y = 0; y + 3 < height + KVZ_EXT_PADDING_CHROMA; y += 4) { - // Filter vertically and flip x and y - for (x = 0; x + 8 < width + 1; x += 8) { - for (y = 0; y < height + 1; ++y) { - - // HPEL - int8_t *firs0[2] = { fir0, fir2 }; - kvz_pixel *dsts0[2] = { &filtered[0][y * dst_stride + x], &filtered[1][y * dst_stride + x]}; - int16_t *srcs0[4] = { flipped2 + x * temp_stride + y, flipped0 + x * temp_stride + y}; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(srcs0, temp_stride, firs0, offset23, shift2 + shift3, dsts0); - kvz_filter_flip_round_clip_x8_16bit_avx2(flipped2 + x * temp_stride + y, temp_stride, fir2, offset23, shift2 + shift3, &filtered[2][y * dst_stride + x]); - - // QPEL - // Horizontal - int8_t *firs1[4] = { fir0, fir0, fir2, fir2 }; - kvz_pixel *dsts1[4] = { &filtered[3][y * dst_stride + x], &filtered[4][y * dst_stride + x], - &filtered[5][y * dst_stride + x], &filtered[6][y * dst_stride + x] }; - int16_t *srcs1[4] = { flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, - flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, }; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs1[0], temp_stride, &firs1[0], offset23, shift2 + shift3, &dsts1[0]); - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs1[2], temp_stride, &firs1[2], offset23, shift2 + shift3, &dsts1[2]); - - // Vertical - int8_t *firs2[4] = { fir1, fir1, fir3, fir3 }; - kvz_pixel *dsts2[4] = { &filtered[7][y * dst_stride + x], &filtered[8][y * dst_stride + x], - &filtered[9][y * dst_stride + x], &filtered[10][y * dst_stride + x] }; - int16_t *srcs2[4] = { flipped0 + x * temp_stride + y, flipped2 + x * temp_stride + y, - flipped0 + x * temp_stride + y, flipped2 + x * temp_stride + y, }; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs2[0], temp_stride, &firs2[0], offset23, shift2 + shift3, &dsts2[0]); - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs2[2], temp_stride, &firs2[2], offset23, 
shift2 + shift3, &dsts2[2]); - - // Diagonal - int8_t *firs3[4] = { fir1, fir1, fir3, fir3 }; - kvz_pixel *dsts3[4] = { &filtered[11][y * dst_stride + x], &filtered[12][y * dst_stride + x], - &filtered[13][y * dst_stride + x], &filtered[14][y * dst_stride + x] }; - int16_t *srcs3[4] = { flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, - flipped1 + x * temp_stride + y, flipped3 + x * temp_stride + y, }; - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs3[0], temp_stride, &firs3[0], offset23, shift2 + shift3, &dsts3[0]); - kvz_filter_flip_round_clip_x8_16bit_dual_avx2(&srcs3[2], temp_stride, &firs3[2], offset23, shift2 + shift3, &dsts3[2]); + for (x = 0; x + 3 < width; x += 4) { + int ypos = y - KVZ_CHROMA_FILTER_OFFSET; + int xpos = x - KVZ_CHROMA_FILTER_OFFSET; + kvz_four_tap_filter_hor_4x4_avx2(&src[src_stride*ypos + xpos], src_stride, &hor_intermediate[y * hor_stride + x], hor_stride, + &shuf_01, &shuf_23, + &taps_01, &taps_23); //TODO: >> shift1 } } - // The remaining pixels - for (; x < width + 1; ++x) { - for (y = 0; y < height + 1; ++y) { - - // HPEL - filtered[0][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[1][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[2][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // QPEL - // Horizontal - filtered[3][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[4][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir0, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[5][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[6][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir2, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // Vertical - filtered[7][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[8][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[9][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[10][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // Diagonal - filtered[11][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[12][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir1, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[13][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, 
&flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[14][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_avx2(fir3, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - } + __m256i shuf_01_23 = _mm256_permute2x128_si256(shuf_01, shuf_23, _MM_SHUFFLE(0, 2, 0, 0)); + __m256i taps_01_23 = _mm256_permute2x128_si256(taps_01, taps_23, _MM_SHUFFLE(0, 2, 0, 0)); + + int rows = 3; + for (x = 0; x + 3 < width; x += 4) { + int ypos = y - KVZ_CHROMA_FILTER_OFFSET; + int xpos = x - KVZ_CHROMA_FILTER_OFFSET; + kvz_four_tap_filter_hor_4xN_avx2(&src[src_stride*ypos + xpos], src_stride, &hor_intermediate[y * hor_stride + x], hor_stride, + &shuf_01_23, &taps_01_23, + rows); //TODO: >> shift1 } -} -void kvz_filter_frac_blocks_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block filtered[15], int8_t fme_level) -{ - switch (fme_level) { - case 1: - kvz_filter_hpel_blocks_hor_ver_luma_avx2(encoder, src, src_stride, width, height, filtered); - break; - case 2: - kvz_filter_hpel_blocks_full_luma_avx2(encoder, src, src_stride, width, height, filtered); - break; - case 3: - kvz_filter_qpel_blocks_hor_ver_luma_avx2(encoder, src, src_stride, width, height, filtered); - break; - default: - kvz_filter_qpel_blocks_full_luma_avx2(encoder, src, src_stride, width, height, filtered); - break; + // VERTICAL STEP + for (y = 0; y + 3 < height; y += 4) { + for (x = 0; x + 3 < width; x += 4) { + kvz_four_tap_filter_ver_16bit_4x4_avx2(ver_fir, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); + } } } -void kvz_sample_quarterpel_luma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height,kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) +static void kvz_sample_14bit_octpel_chroma_avx2(const encoder_control_t * const encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + int16_t *dst, + int16_t dst_stride, + int8_t hor_flag, + int8_t ver_flag, + const int16_t mv[2]) { - //Check for amp + // TODO: Optimize SMP and AMP if (width != height) { - kvz_sample_quarterpel_luma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); + kvz_sample_14bit_octpel_chroma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); return; } - //TODO: horizontal and vertical only filtering - int32_t x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - - int8_t *hor_filter = kvz_g_luma_filter[mv[0] & 3]; - int8_t *ver_filter = kvz_g_luma_filter[mv[1] & 3]; - - int16_t hor_filtered[(LCU_WIDTH + 1) + FILTER_SIZE][(LCU_WIDTH + 1) + FILTER_SIZE]; - - if (width == 4) { - // Filter horizontally and flip x and y - for (y = 0; y < height + FILTER_SIZE - 1; ++y) { - for (x = 0; x < width; x += 4) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - int16_t *out = &(hor_filtered[y][x]); - kvz_eight_tap_filter_x4_hor_avx2(hor_filter, &src[src_stride*ypos + xpos], shift1, out); - } - } + // TODO: horizontal and vertical only filtering + int x, y; - // Filter vertically and flip x and y - for (y = 0; y < height; ++y) { - for (x = 0; x < width; x+=4) { - int ypos = y; - int xpos = x; - *(int32_t*)&(dst[y*dst_stride + x]) = kvz_eight_tap_filter_x4_ver_16bit_avx2(ver_filter, 
&hor_filtered[ypos][xpos], sizeof(hor_filtered[0])/sizeof(int16_t), offset23, shift2, shift3); - } - } + int8_t *hor_fir = kvz_g_chroma_filter[mv[0] & 7]; + int8_t *ver_fir = kvz_g_chroma_filter[mv[1] & 7]; - } else { - // Filter horizontally and flip x and y - for (y = 0; y < height + FILTER_SIZE - 1; ++y) { - for (x = 0; x < width; x+=8) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - int16_t *dst = &(hor_filtered[y][x]); - kvz_eight_tap_filter_x8_hor_avx2(hor_filter, &src[src_stride*ypos + xpos], shift1, dst); - } - } + int16_t hor_stride = LCU_WIDTH_C; + int16_t hor_intermediate[KVZ_EXT_BLOCK_W_CHROMA * LCU_WIDTH_C]; - // Filter vertically and flip x and y - for (y = 0; y < height; ++y) { - for (x = 0; x < width; x+=8) { - int ypos = y; - int xpos = x; - kvz_pixel *out = &(dst[y*dst_stride + x]); - kvz_eight_tap_filter_x8_ver_16bit_avx2(ver_filter, &hor_filtered[ypos][xpos], sizeof(hor_filtered[0])/sizeof(int16_t), offset23, shift2, shift3, out); - } - } - } -} + // HORIZONTAL STEP + __m256i shuf_01, shuf_23; + __m256i taps_01, taps_23; -void kvz_sample_octpel_chroma_avx2(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height,kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) -{ - //Check for amp - if (width != height) { - kvz_sample_octpel_chroma_generic(encoder, src, src_stride, width, height, dst, dst_stride, hor_flag, ver_flag, mv); - return; - } - //TODO: horizontal and vertical only filtering - int32_t x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - - int8_t *hor_filter = kvz_g_chroma_filter[mv[0] & 7]; - int8_t *ver_filter = kvz_g_chroma_filter[mv[1] & 7]; - -#define FILTER_SIZE_C (FILTER_SIZE / 2) -#define FILTER_OFFSET_C (FILTER_OFFSET / 2) - int16_t hor_filtered[(LCU_WIDTH_C + 1) + FILTER_SIZE_C][(LCU_WIDTH_C + 1) + FILTER_SIZE_C]; - - if (width == 4) { - // Filter horizontally and flip x and y - for (y = 0; y < height + FILTER_SIZE_C - 1; ++y) { - for (x = 0; x < width; x += 4) { - int ypos = y - FILTER_OFFSET_C; - int xpos = x - FILTER_OFFSET_C; - int16_t *out = &(hor_filtered[y][x]); - kvz_four_tap_filter_x4_hor_avx2(hor_filter, &src[src_stride*ypos + xpos], shift1, out); - } - } + kvz_init_shuffle_masks_chroma(&shuf_01, &shuf_23); + kvz_init_filter_taps_chroma(hor_fir, &taps_01, &taps_23); - // Filter vertically and flip x and y - for (y = 0; y < height; ++y) { - for (x = 0; x < width; x+=4) { - int ypos = y; - int xpos = x; - *(int32_t*)&(dst[y*dst_stride + x]) = kvz_four_tap_filter_x4_ver_16bit_avx2(ver_filter, &hor_filtered[ypos][xpos], sizeof(hor_filtered[0])/sizeof(int16_t), offset23, shift2, shift3); - } - } + for (y = 0; y + 3 < height + KVZ_EXT_PADDING_CHROMA; y += 4) { - } else { - // Filter horizontally and flip x and y - for (y = 0; y < height + FILTER_SIZE_C - 1; ++y) { - for (x = 0; x < width; x += 8) { - int ypos = y - FILTER_OFFSET_C; - int xpos = x - FILTER_OFFSET_C; - int16_t *dst = &(hor_filtered[y][x]); - kvz_four_tap_filter_x8_hor_avx2(hor_filter, &src[src_stride*ypos + xpos], shift1, dst); - } + for (x = 0; x + 3 < width; x += 4) { + int ypos = y - KVZ_CHROMA_FILTER_OFFSET; + int xpos = x - KVZ_CHROMA_FILTER_OFFSET; + kvz_four_tap_filter_hor_4x4_avx2(&src[src_stride*ypos + xpos], src_stride, &hor_intermediate[y * hor_stride + x], hor_stride, + &shuf_01, &shuf_23, + &taps_01, &taps_23); //TODO: >> shift1 } + } - // Filter vertically and flip x 
and y - for (y = 0; y < height; ++y) { - for (x = 0; x < width; x+=8) { - int ypos = y; - int xpos = x; - kvz_pixel *out = &(dst[y*dst_stride + x]); - kvz_four_tap_filter_x8_ver_16bit_avx2(ver_filter, &hor_filtered[ypos][xpos], sizeof(hor_filtered[0])/sizeof(int16_t), offset23, shift2, shift3, out); - } + __m256i shuf_01_23 = _mm256_permute2x128_si256(shuf_01, shuf_23, _MM_SHUFFLE(0, 2, 0, 0)); + __m256i taps_01_23 = _mm256_permute2x128_si256(taps_01, taps_23, _MM_SHUFFLE(0, 2, 0, 0)); + + int rows = 3; + for (x = 0; x + 3 < width; x += 4) { + int ypos = y - KVZ_CHROMA_FILTER_OFFSET; + int xpos = x - KVZ_CHROMA_FILTER_OFFSET; + kvz_four_tap_filter_hor_4xN_avx2(&src[src_stride*ypos + xpos], src_stride, &hor_intermediate[y * hor_stride + x], hor_stride, + &shuf_01_23, &taps_01_23, + rows); //TODO: >> shift1 + } + + // VERTICAL STEP + for (y = 0; y + 3 < height; y += 4) { + for (x = 0; x + 3 < width; x += 4) { + kvz_four_tap_filter_ver_16bit_4x4_no_round_avx2(ver_fir, &hor_intermediate[y * hor_stride + x], hor_stride, &dst[y * dst_stride + x], dst_stride); } } } - void kvz_get_extended_block_avx2(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, int filter_size, int width, int height, kvz_extended_block *out) { @@ -1427,12 +1511,14 @@ bool success = true; #if COMPILE_INTEL_AVX2 if (bitdepth == 8){ - success &= kvz_strategyselector_register(opaque, "filter_inter_quarterpel_luma", "avx2", 40, &kvz_filter_inter_quarterpel_luma_avx2); - success &= kvz_strategyselector_register(opaque, "filter_inter_halfpel_chroma", "avx2", 40, &kvz_filter_inter_halfpel_chroma_avx2); - success &= kvz_strategyselector_register(opaque, "filter_inter_octpel_chroma", "avx2", 40, &kvz_filter_inter_octpel_chroma_avx2); - success &= kvz_strategyselector_register(opaque, "filter_frac_blocks_luma", "avx2", 40, &kvz_filter_frac_blocks_luma_avx2); + success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_hor_ver_luma", "avx2", 40, &kvz_filter_hpel_blocks_hor_ver_luma_avx2); + success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_diag_luma", "avx2", 40, &kvz_filter_hpel_blocks_diag_luma_avx2); + success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_hor_ver_luma", "avx2", 40, &kvz_filter_qpel_blocks_hor_ver_luma_avx2); + success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_diag_luma", "avx2", 40, &kvz_filter_qpel_blocks_diag_luma_avx2); success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma", "avx2", 40, &kvz_sample_quarterpel_luma_avx2); success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma", "avx2", 40, &kvz_sample_octpel_chroma_avx2); + success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "avx2", 40, &kvz_sample_14bit_quarterpel_luma_avx2); + success &= kvz_strategyselector_register(opaque, "sample_14bit_octpel_chroma", "avx2", 40, &kvz_sample_14bit_octpel_chroma_avx2); } success &= kvz_strategyselector_register(opaque, "get_extended_block", "avx2", 40, &kvz_get_extended_block_avx2); #endif //COMPILE_INTEL_AVX2
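For readers skimming the rewritten interpolation path in the hunk above: the new AVX2 kernels (kvz_eight_tap_filter_hor_8x1_avx2 followed by kvz_eight_tap_filter_ver_16bit_8x8_avx2) implement a separable two-pass 8-tap filter, with the fractional position chosen by `mv & 3` and the final rounding done with `wp_offset1`/`wp_shift1` before clipping. Below is a minimal scalar sketch of that computation, assuming 8-bit depth and the standard HEVC luma coefficients; the names (`sample_quarterpel_scalar`, `luma_fir`, `clip_pixel`) are illustrative and not part of kvazaar's API.

```c
#include <stdint.h>

/* Standard HEVC 8-tap luma coefficients for the four quarter-sample
 * phases (phase 0 is the integer position); kvazaar keeps these in
 * kvz_g_luma_filter, which is not shown in this diff. */
static const int8_t luma_fir[4][8] = {
  {  0, 0,   0, 64,  0,   0, 0,  0 },
  { -1, 4, -10, 58, 17,  -5, 1,  0 },
  { -1, 4, -11, 40, 40, -11, 4, -1 },
  {  0, 1,  -5, 17, 58, -10, 4, -1 },
};

static uint8_t clip_pixel(int v)
{
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Scalar reference for the separable two-pass filtering: pass 1 filters
 * rows into a 16-bit intermediate, pass 2 filters its columns and applies
 * the same rounding as the AVX2 kernels (>> shift2, then + wp_offset and
 * >> wp_shift). Assumes 8-bit depth, width/height <= 64, and that src has
 * at least 3 valid pixels of margin above/left and 4 below/right. */
static void sample_quarterpel_scalar(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride,
                                     int width, int height,
                                     const int16_t mv[2])
{
  const int8_t *hor_fir = luma_fir[mv[0] & 3];
  const int8_t *ver_fir = luma_fir[mv[1] & 3];
  const int shift2    = 6;
  const int wp_shift  = 14 - 8;              /* 14 - KVZ_BIT_DEPTH */
  const int wp_offset = 1 << (wp_shift - 1);
  int16_t tmp[(64 + 7) * 64];                /* horizontal intermediate */

  /* Horizontal pass over height + 7 rows so the vertical taps have data. */
  for (int y = 0; y < height + 7; ++y) {
    for (int x = 0; x < width; ++x) {
      int sum = 0;
      for (int t = 0; t < 8; ++t)
        sum += hor_fir[t] * src[(y - 3) * src_stride + (x - 3 + t)];
      tmp[y * width + x] = (int16_t)sum;     /* >> shift1 is a no-op at 8 bits */
    }
  }

  /* Vertical pass with the weighted-prediction style rounding and clip. */
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int sum = 0;
      for (int t = 0; t < 8; ++t)
        sum += ver_fir[t] * tmp[(y + t) * width + x];
      dst[y * dst_stride + x] = clip_pixel(((sum >> shift2) + wp_offset) >> wp_shift);
    }
  }
}
```

Running a block through two 64-weight filter passes scales it by 2^12, which the shift2 + wp_shift rounding (6 + 6 at 8-bit depth) undoes before clipping.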
View file
kvazaar-1.2.0.tar.gz/src/strategies/avx2/picture-avx2.c -> kvazaar-1.3.0.tar.gz/src/strategies/avx2/picture-avx2.c
Changed
@@ -21,17 +21,59 @@ /* * \file */ -#include "strategies/avx2/picture-avx2.h" + +#include "global.h" #if COMPILE_INTEL_AVX2 +#include "strategies/avx2/picture-avx2.h" +#include "strategies/avx2/reg_sad_pow2_widths-avx2.h" + #include <immintrin.h> +#include <emmintrin.h> +#include <mmintrin.h> +#include <xmmintrin.h> #include <string.h> - #include "kvazaar.h" #include "strategies/strategies-picture.h" #include "strategyselector.h" #include "strategies/generic/picture-generic.h" +/** + * \brief Calculate Sum of Absolute Differences (SAD) + * + * Calculate Sum of Absolute Differences (SAD) between two rectangular regions + * located in arbitrary points in the picture. + * + * \param data1 Starting point of the first picture. + * \param data2 Starting point of the second picture. + * \param width Width of the region for which SAD is calculated. + * \param height Height of the region for which SAD is calculated. + * \param stride Width of the pixel array. + * + * \returns Sum of Absolute Differences + */ +uint32_t kvz_reg_sad_avx2(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int width, const int height, const unsigned stride1, const unsigned stride2) +{ + if (width == 0) + return 0; + if (width == 4) + return reg_sad_w4(data1, data2, height, stride1, stride2); + if (width == 8) + return reg_sad_w8(data1, data2, height, stride1, stride2); + if (width == 12) + return reg_sad_w12(data1, data2, height, stride1, stride2); + if (width == 16) + return reg_sad_w16(data1, data2, height, stride1, stride2); + if (width == 24) + return reg_sad_w24(data1, data2, height, stride1, stride2); + if (width == 32) + return reg_sad_w32(data1, data2, height, stride1, stride2); + if (width == 64) + return reg_sad_w64(data1, data2, height, stride1, stride2); + else + return reg_sad_arbitrary(data1, data2, width, height, stride1, stride2); +} /** * \brief Calculate SAD for 8x8 bytes in continuous memory. @@ -484,13 +526,13 @@ } static void kvz_satd_4x4_subblock_quad_avx2(const kvz_pixel *preds[4], - const int strides[4], + const int stride, const kvz_pixel *orig, const int orig_stride, unsigned costs[4]) { // TODO: AVX2 implementation - kvz_satd_4x4_subblock_quad_generic(preds, strides, orig, orig_stride, costs); + kvz_satd_4x4_subblock_quad_generic(preds, stride, orig, orig_stride, costs); } static unsigned satd_8x8_subblock_8bit_avx2(const kvz_pixel * buf1, unsigned stride1, const kvz_pixel * buf2, unsigned stride2) @@ -508,13 +550,13 @@ } static void satd_8x8_subblock_quad_avx2(const kvz_pixel **preds, - const int *strides, + const int stride, const kvz_pixel *orig, const int orig_stride, unsigned *costs) { - kvz_satd_8bit_8x8_general_dual_avx2(preds[0], strides[0], preds[1], strides[1], orig, orig_stride, &costs[0], &costs[1]); - kvz_satd_8bit_8x8_general_dual_avx2(preds[2], strides[2], preds[3], strides[3], orig, orig_stride, &costs[2], &costs[3]); + kvz_satd_8bit_8x8_general_dual_avx2(preds[0], stride, preds[1], stride, orig, orig_stride, &costs[0], &costs[1]); + kvz_satd_8bit_8x8_general_dual_avx2(preds[2], stride, preds[3], stride, orig, orig_stride, &costs[2], &costs[3]); } SATD_NxN(8bit_avx2, 8) @@ -577,7 +619,7 @@ static void satd_any_size_ ## suffix ( \ int width, int height, \ const kvz_pixel **preds, \ - const int *strides, \ + const int stride, \ const kvz_pixel *orig, \ const int orig_stride, \ unsigned num_modes, \ @@ -591,7 +633,7 @@ if (width % 8 != 0) { \ /* Process the first column using 4x4 blocks. 
*/ \ for (int y = 0; y < height; y += 4) { \ - kvz_satd_4x4_subblock_ ## suffix(preds, strides, orig, orig_stride, sums); \ + kvz_satd_4x4_subblock_ ## suffix(preds, stride, orig, orig_stride, sums); \ } \ orig_ptr += 4; \ for(int blk = 0; blk < num_parallel_blocks; ++blk){\ @@ -602,23 +644,23 @@ if (height % 8 != 0) { \ /* Process the first row using 4x4 blocks. */ \ for (int x = 0; x < width; x += 4 ) { \ - kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \ + kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \ } \ orig_ptr += 4 * orig_stride; \ for(int blk = 0; blk < num_parallel_blocks; ++blk){\ - pred_ptrs[blk] += 4 * strides[blk]; \ + pred_ptrs[blk] += 4 * stride; \ }\ height -= 4; \ } \ /* The rest can now be processed with 8x8 blocks. */ \ for (int y = 0; y < height; y += 8) { \ orig_ptr = &orig[y * orig_stride]; \ - pred_ptrs[0] = &preds[0][y * strides[0]]; \ - pred_ptrs[1] = &preds[1][y * strides[1]]; \ - pred_ptrs[2] = &preds[2][y * strides[2]]; \ - pred_ptrs[3] = &preds[3][y * strides[3]]; \ + pred_ptrs[0] = &preds[0][y * stride]; \ + pred_ptrs[1] = &preds[1][y * stride]; \ + pred_ptrs[2] = &preds[2][y * stride]; \ + pred_ptrs[3] = &preds[3][y * stride]; \ for (int x = 0; x < width; x += 8) { \ - satd_8x8_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \ + satd_8x8_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \ orig_ptr += 8; \ pred_ptrs[0] += 8; \ pred_ptrs[1] += 8; \ @@ -714,8 +756,570 @@ } } -#endif //COMPILE_INTEL_AVX2 +static void inter_recon_bipred_no_mov_avx2( + const int height, + const int width, + const int ypos, + const int xpos, + const hi_prec_buf_t*high_precision_rec0, + const hi_prec_buf_t*high_precision_rec1, + lcu_t* lcu, + kvz_pixel* temp_lcu_y, + kvz_pixel* temp_lcu_u, + kvz_pixel* temp_lcu_v) { + + // This function is used only when kvazaar can't find any movement from the current block + int y_in_lcu, x_in_lcu; + __m256i sample0_epi8, sample1_epi8, temp_y_epi8; + int32_t * pointer = 0; + + for (int temp_y = 0; temp_y < height; temp_y += 1) { + y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); + + for (int temp_x = 0; temp_x < width; temp_x += 32) { + + x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); + + switch (width) + { + + case 4: + + sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + + temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_y_epi8)); + + break; + + case 8: + + sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + + temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + // Store 64-bits from vector to memory + _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_y_epi8)); + + break; + + case 12: + sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + + temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + // 
Store 64-bits from vector to memory + _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_y_epi8)); + + x_in_lcu = ((xpos + temp_x + 8) & ((LCU_WIDTH)-1)); + + sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + + temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_y_epi8)); + break; + + + case 16: + + sample0_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + + temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + // Store 128-bit to memory + _mm_storeu_si128((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_y_epi8)); + + break; + + case 32: + + sample0_epi8 = _mm256_loadu_si256((__m256i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + sample1_epi8 = _mm256_loadu_si256((__m256i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])); + + temp_y_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + + // Store 256-bit integers to memory + _mm256_storeu_si256((__m256i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), temp_y_epi8); + break; + + default: + // If width is something strange size, use this + for (int temp_i = 0; temp_i < width; ++temp_i) { + x_in_lcu = ((xpos + temp_i) & ((LCU_WIDTH)-1)); + + int sample0_y = (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + int sample1_y = (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + + lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y) >> 1); + } + + + } + + if (temp_x < width >> 1 && temp_y < height >> 1) { + y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); + x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); + + __m256i temp_u_epi8; + __m256i temp_v_epi8; + + + switch (width) + { + + case 8: + + + sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_u_epi8)); + + pointer = (int32_t*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_v_epi8)); + + break; + + case 12: + + sample0_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + sample0_epi8 = 
_mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_cvtsi32_si128(*(int32_t*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_u_epi8)); + + pointer = (int32_t*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_v_epi8)); + + // This is used only with odd shaped objects + for (int temp_i = 4; temp_i < width >> 1; ++temp_i) { + int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1)); + int16_t sample0_u = (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + int16_t sample1_u = (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u) >> 1); + + int16_t sample0_v = (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + int16_t sample1_v = (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v) >> 1); + } + + break; + + case 16: + + sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + sample0_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + // Store 64-bit integer into memory + _mm_storel_epi64((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_u_epi8)); + + // Store 64-bit integer into memory + _mm_storel_epi64((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_v_epi8)); + + break; + + case 32: + + sample0_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_u_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + sample0_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + // Fill 128 bit vector with packed data and store it to memory + _mm_storeu_si128((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_u_epi8)); + + // Fill 128 bit vector with packed data and store it to memory + _mm_storeu_si128((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_v_epi8)); + + + break; + + case 64: + + sample0_epi8 = _mm256_loadu_si256((__m256i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_loadu_si256((__m256i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_u_epi8 = 
_mm256_avg_epu8(sample0_epi8, sample1_epi8); + + sample0_epi8 = _mm256_loadu_si256((__m256i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + sample1_epi8 = _mm256_loadu_si256((__m256i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu])); + temp_v_epi8 = _mm256_avg_epu8(sample0_epi8, sample1_epi8); + + _mm256_storeu_si256((__m256i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), temp_u_epi8); + _mm256_storeu_si256((__m256i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), temp_v_epi8); + break; + + default: + // This is used only with odd shaped objects + for (int temp_i = 0; temp_i < width >> 1; ++temp_i) { + int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1)); + int16_t sample0_u = (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + int16_t sample1_u = (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u) >> 1); + + int16_t sample0_v = (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + int16_t sample1_v = (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH)); + lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v) >> 1); + } + + break; + } + y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); + } + } + } + + +} + +static void inter_recon_bipred_avx2(const int hi_prec_luma_rec0, + const int hi_prec_luma_rec1, + const int hi_prec_chroma_rec0, + const int hi_prec_chroma_rec1, + const int height, + const int width, + const int ypos, + const int xpos, + const hi_prec_buf_t*high_precision_rec0, + const hi_prec_buf_t*high_precision_rec1, + lcu_t* lcu, + kvz_pixel* temp_lcu_y, + kvz_pixel* temp_lcu_u, + kvz_pixel* temp_lcu_v) +{ + if(hi_prec_luma_rec0 == 0 && hi_prec_luma_rec1 == 0 && hi_prec_chroma_rec0 == 0 && hi_prec_chroma_rec1 == 0) + { + inter_recon_bipred_no_mov_avx2(height, width, ypos, xpos, high_precision_rec0, high_precision_rec1, lcu, temp_lcu_y, temp_lcu_u, temp_lcu_v); + } + + else + { + + int y_in_lcu, x_in_lcu; + int shift = 15 - KVZ_BIT_DEPTH; + int offset = 1 << (shift - 1); + __m256i temp_epi8, temp_y_epi32, sample0_epi32, sample1_epi32, temp_epi16; + int32_t * pointer = 0; + __m256i offset_epi32 = _mm256_set1_epi32(offset); + + for (int temp_y = 0; temp_y < height; ++temp_y) { + + y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); + + for (int temp_x = 0; temp_x < width; temp_x += 8) { + x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); + + bool use_8_elements = ((temp_x + 8) <= width); + + switch (use_8_elements) + { + + case false: + + if (width < 4) { + // If width is smaller than 4 there's no need to use SIMD + for (int temp_i = 0; temp_i < width; ++temp_i) { + x_in_lcu = ((xpos + temp_i) & ((LCU_WIDTH)-1)); + + int sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + + lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); + } + } + + else{ + // Load total of 4 elements from memory to vector + sample0_epi32 = hi_prec_luma_rec0 ? 
_mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*)&(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + + sample1_epi32 = hi_prec_luma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + + // (sample1 + sample2 + offset)>>shift + temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32); + temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift); + + // Pack the bits from 32-bit to 8-bit + temp_epi16 = _mm256_packs_epi32(temp_y_epi32, temp_y_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + + pointer = (int32_t*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8)); + + + + for (int temp_i = temp_x + 4; temp_i < width; ++temp_i) { + x_in_lcu = ((xpos + temp_i) & ((LCU_WIDTH)-1)); + + int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + + lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); + } + + } + break; + + default: + // Load total of 8 elements from memory to vector + sample0_epi32 = hi_prec_luma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), 14 - KVZ_BIT_DEPTH); + + sample1_epi32 = hi_prec_luma_rec1 ? 
_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32((_mm_loadl_epi64((__m128i*) &(lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu])))), 14 - KVZ_BIT_DEPTH); + + // (sample1 + sample2 + offset)>>shift + temp_y_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_y_epi32 = _mm256_add_epi32(temp_y_epi32, offset_epi32); + temp_y_epi32 = _mm256_srai_epi32(temp_y_epi32, shift); + + // Pack the bits from 32-bit to 8-bit + temp_epi16 = _mm256_packs_epi32(temp_y_epi32, temp_y_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + + // Store 64-bits from vector to memory + _mm_storel_epi64((__m128i*)&(lcu->rec.y[(y_in_lcu)* LCU_WIDTH + x_in_lcu]), _mm256_castsi256_si128(temp_epi8)); + + break; + } + + + } + } + for (int temp_y = 0; temp_y < height >> 1; ++temp_y) { + int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); + + for (int temp_x = 0; temp_x < width >> 1; temp_x += 8) { + + int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); + + if ((width >> 1) < 4) { + // If width>>1 is smaller than 4 there's no need to use SIMD + + for (int temp_i = 0; temp_i < width >> 1; ++temp_i) { + int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1)); + int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); + + int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); + } + } + + else{ + + bool use_8_elements = ((temp_x + 8) <= (width>>1)); + + __m256i temp_u_epi32, temp_v_epi32; + + switch (use_8_elements) + { + + case false: + // Load 4 pixels to vector + sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + sample1_epi32 = hi_prec_chroma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + // (sample1 + sample2 + offset)>>shift + temp_u_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_u_epi32 = _mm256_add_epi32(temp_u_epi32, offset_epi32); + temp_u_epi32 = _mm256_srai_epi32(temp_u_epi32, shift); + + + + sample0_epi32 = hi_prec_chroma_rec0 ? 
_mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + sample1_epi32 = hi_prec_chroma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadl_epi64((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + + // (sample1 + sample2 + offset)>>shift + temp_v_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_v_epi32 = _mm256_add_epi32(temp_v_epi32, offset_epi32); + temp_v_epi32 = _mm256_srai_epi32(temp_v_epi32, shift); + + + temp_epi16 = _mm256_packs_epi32(temp_u_epi32, temp_u_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + + pointer = (int32_t*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8)); + + + temp_epi16 = _mm256_packs_epi32(temp_v_epi32, temp_v_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + + pointer = (int32_t*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]); + *pointer = _mm_cvtsi128_si32(_mm256_castsi256_si128(temp_epi8)); + + for (int temp_i = 4; temp_i < width >> 1; ++temp_i) { + + // Use only if width>>1 is not divideble by 4 + int temp_x_in_lcu = (((xpos >> 1) + temp_i) & (LCU_WIDTH_C - 1)); + int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.u[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); + + int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.v[y_in_lcu * LCU_WIDTH_C + temp_x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); + } + + + break; + + default: + // Load 8 pixels to vector + sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + sample1_epi32 = hi_prec_chroma_rec1 ? 
_mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + // (sample1 + sample2 + offset)>>shift + temp_u_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_u_epi32 = _mm256_add_epi32(temp_u_epi32, offset_epi32); + temp_u_epi32 = _mm256_srai_epi32(temp_u_epi32, shift); + + sample0_epi32 = hi_prec_chroma_rec0 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + sample1_epi32 = hi_prec_chroma_rec1 ? _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*) &(high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))) : + _mm256_slli_epi32(_mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i*) &(lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu]))), 14 - KVZ_BIT_DEPTH); + + + // (sample1 + sample2 + offset)>>shift + temp_v_epi32 = _mm256_add_epi32(sample0_epi32, sample1_epi32); + temp_v_epi32 = _mm256_add_epi32(temp_v_epi32, offset_epi32); + temp_v_epi32 = _mm256_srai_epi32(temp_v_epi32, shift); + + temp_epi16 = _mm256_packs_epi32(temp_u_epi32, temp_u_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + + // Store 64-bit integer into memory + _mm_storel_epi64((__m128i*)&(lcu->rec.u[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8)); + + temp_epi16 = _mm256_packs_epi32(temp_v_epi32, temp_v_epi32); + temp_epi16 = _mm256_permute4x64_epi64(temp_epi16, _MM_SHUFFLE(3, 1, 2, 0)); + temp_epi8 = _mm256_packus_epi16(temp_epi16, temp_epi16); + + // Store 64-bit integer into memory + _mm_storel_epi64((__m128i*)&(lcu->rec.v[(y_in_lcu)* LCU_WIDTH_C + x_in_lcu]), _mm256_castsi256_si128(temp_epi8)); + + + break; + } + } + } + } + } +} + +static optimized_sad_func_ptr_t get_optimized_sad_avx2(int32_t width) +{ + if (width == 0) + return reg_sad_w0; + if (width == 4) + return reg_sad_w4; + if (width == 8) + return reg_sad_w8; + if (width == 12) + return reg_sad_w12; + if (width == 16) + return reg_sad_w16; + if (width == 24) + return reg_sad_w24; + if (width == 32) + return reg_sad_w32; + if (width == 64) + return reg_sad_w64; + else + return NULL; +} + +static uint32_t ver_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t stride) +{ + if (width == 0) + return 0; + if (width == 4) + return ver_sad_w4(pic_data, ref_data, height, stride); + if (width == 8) + return ver_sad_w8(pic_data, ref_data, height, stride); + if (width == 12) + return ver_sad_w12(pic_data, ref_data, height, stride); + if (width == 16) + return ver_sad_w16(pic_data, ref_data, height, stride); + else + return ver_sad_arbitrary(pic_data, ref_data, width, height, stride); +} + +static uint32_t hor_sad_avx2(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t pic_stride, + uint32_t ref_stride, uint32_t left, uint32_t right) +{ + if (width == 4) + return hor_sad_sse41_w4(pic_data, ref_data, height, + pic_stride, ref_stride, left, right); + if (width == 8) + return hor_sad_sse41_w8(pic_data, ref_data, height, + pic_stride, ref_stride, left, right); + if (width == 16) + return hor_sad_sse41_w16(pic_data, ref_data, height, + pic_stride, 
ref_stride, left, right); + if (width == 32) + return hor_sad_avx2_w32 (pic_data, ref_data, height, + pic_stride, ref_stride, left, right); + else + return hor_sad_sse41_arbitrary(pic_data, ref_data, width, height, + pic_stride, ref_stride, left, right); +} + +#endif //COMPILE_INTEL_AVX2 int kvz_strategy_register_picture_avx2(void* opaque, uint8_t bitdepth) { @@ -726,6 +1330,8 @@ // simplest code to look at for anyone interested in doing more // optimizations, so it's worth it to keep this maintained. if (bitdepth == 8){ + + success &= kvz_strategyselector_register(opaque, "reg_sad", "avx2", 40, &kvz_reg_sad_avx2); success &= kvz_strategyselector_register(opaque, "sad_8x8", "avx2", 40, &sad_8bit_8x8_avx2); success &= kvz_strategyselector_register(opaque, "sad_16x16", "avx2", 40, &sad_8bit_16x16_avx2); success &= kvz_strategyselector_register(opaque, "sad_32x32", "avx2", 40, &sad_8bit_32x32_avx2); @@ -746,6 +1352,11 @@ success &= kvz_strategyselector_register(opaque, "satd_any_size_quad", "avx2", 40, &satd_any_size_quad_avx2); success &= kvz_strategyselector_register(opaque, "pixels_calc_ssd", "avx2", 40, &pixels_calc_ssd_avx2); + success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "avx2", 40, &inter_recon_bipred_avx2); + success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "avx2", 40, &get_optimized_sad_avx2); + success &= kvz_strategyselector_register(opaque, "ver_sad", "avx2", 40, &ver_sad_avx2); + success &= kvz_strategyselector_register(opaque, "hor_sad", "avx2", 40, &hor_sad_avx2); + } #endif return success;
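Two of the additions registered above read most easily as their scalar equivalents: kvz_reg_sad_avx2 only dispatches on block width so each reg_sad_wN kernel can run a branch-free inner loop, and the no-motion bi-prediction path leans on _mm256_avg_epu8, a rounded 8-bit average. A rough scalar sketch of both (names here are illustrative, not part of Kvazaar's API):

#include <stdint.h>

/* Scalar reference for the width-dispatched SAD: every reg_sad_wN kernel
 * computes exactly this, just without per-pixel branching. */
static uint32_t reg_sad_ref(const uint8_t *data1, const uint8_t *data2,
                            int width, int height,
                            unsigned stride1, unsigned stride2)
{
  uint32_t sad = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int d = (int)data1[y * stride1 + x] - (int)data2[y * stride2 + x];
      sad += (uint32_t)(d < 0 ? -d : d);
    }
  }
  return sad;
}

/* Per-pixel meaning of _mm256_avg_epu8 in the no-motion bi-prediction path:
 * a rounded average of the two 8-bit prediction samples. */
static inline uint8_t avg_round_u8(uint8_t a, uint8_t b)
{
  return (uint8_t)((a + b + 1) >> 1);
}

get_optimized_sad_avx2 applies the same idea one level up: it hands the caller a pointer to the fixed-width kernel (or NULL for unsupported widths) so the width dispatch can be hoisted out of the motion-search inner loop.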
View file
kvazaar-1.2.0.tar.gz/src/strategies/avx2/quant-avx2.c -> kvazaar-1.3.0.tar.gz/src/strategies/avx2/quant-avx2.c
Changed
@@ -28,6 +28,7 @@ #include <immintrin.h> #include <stdlib.h> +#include "avx2_common_functions.h" #include "cu.h" #include "encoder.h" #include "encoderstate.h" @@ -40,17 +41,316 @@ #include "tables.h" #include "transform.h" +static INLINE int32_t hsum32_8x32i(__m256i src) +{ + __m128i a = _mm256_extracti128_si256(src, 0); + __m128i b = _mm256_extracti128_si256(src, 1); + + a = _mm_add_epi32(a, b); + b = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)); + + a = _mm_add_epi32(a, b); + b = _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1)); + + a = _mm_add_epi32(a, b); + return _mm_cvtsi128_si32(a); +} + +static INLINE int32_t hsum32_16x16i(__m256i src) +{ + __m128i a = _mm256_extracti128_si256(src, 0); + __m128i b = _mm256_extracti128_si256(src, 1); + __m256i c = _mm256_cvtepi16_epi32(a); + __m256i d = _mm256_cvtepi16_epi32(b); + + c = _mm256_add_epi32(c, d); + return hsum32_8x32i(c); +} + +// Rearranges a 16x32b double vector into a format suitable for a stable SIMD +// max algorithm: +// (abcd|efgh) (ijkl|mnop) => (aceg|ikmo) (bdfh|jlnp) +static INLINE void rearrange_512(__m256i *hi, __m256i *lo) +{ + const __m256i perm8x32mask = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); + + __m256i tmphi = _mm256_permutevar8x32_epi32(*hi, perm8x32mask); + __m256i tmplo = _mm256_permutevar8x32_epi32(*lo, perm8x32mask); + + *hi = _mm256_permute2x128_si256(tmplo, tmphi, 0x31); + *lo = _mm256_permute2x128_si256(tmplo, tmphi, 0x20); +} + +static INLINE void get_cheapest_alternative(__m256i costs_hi, __m256i costs_lo, + __m256i ns, __m256i changes, + int16_t *final_change, int32_t *min_pos) +{ + // Interleave ns and lo into 32-bit variables and to two 256-bit wide vecs, + // to have the same data layout as in costs. Zero extend to 32b width, shift + // changes 16 bits to the left, and store them into the same vectors. + __m256i tmp1hi = _mm256_unpackhi_epi16(ns, changes); + __m256i tmp1lo = _mm256_unpacklo_epi16(ns, changes); + + __m256i pl1hi = _mm256_permute2x128_si256(tmp1lo, tmp1hi, 0x31); + __m256i pl1lo = _mm256_permute2x128_si256(tmp1lo, tmp1hi, 0x20); + + // Reorder to afford result stability (if multiple atoms tie for cheapest, + // rightmost ie. 
the highest is the wanted one) + rearrange_512(&costs_hi, &costs_lo); + rearrange_512(&pl1hi, &pl1lo); + + // 0: pick hi, 1: pick lo (equality evaluates as 0) + __m256i cmpmask1 = _mm256_cmpgt_epi32(costs_hi, costs_lo); + __m256i cost1 = _mm256_blendv_epi8(costs_hi, costs_lo, cmpmask1); + __m256i pl1_1 = _mm256_blendv_epi8(pl1hi, pl1lo, cmpmask1); + + __m256i cost2 = _mm256_shuffle_epi32(cost1, _MM_SHUFFLE(2, 3, 0, 1)); + __m256i pl1_2 = _mm256_shuffle_epi32(pl1_1, _MM_SHUFFLE(2, 3, 0, 1)); + + __m256i cmpmask2 = _mm256_cmpgt_epi32(cost2, cost1); + __m256i cost3 = _mm256_blendv_epi8(cost2, cost1, cmpmask2); + __m256i pl1_3 = _mm256_blendv_epi8(pl1_2, pl1_1, cmpmask2); + + __m256i cost4 = _mm256_shuffle_epi32(cost3, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i pl1_4 = _mm256_shuffle_epi32(pl1_3, _MM_SHUFFLE(1, 0, 3, 2)); + + __m256i cmpmask3 = _mm256_cmpgt_epi32(cost4, cost3); + __m256i cost5 = _mm256_blendv_epi8(cost4, cost3, cmpmask3); + __m256i pl1_5 = _mm256_blendv_epi8(pl1_4, pl1_3, cmpmask3); + + __m256i cost6 = _mm256_permute4x64_epi64(cost5, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i pl1_6 = _mm256_permute4x64_epi64(pl1_5, _MM_SHUFFLE(1, 0, 3, 2)); + + __m256i cmpmask4 = _mm256_cmpgt_epi32(cost6, cost5); + __m256i pl1_7 = _mm256_blendv_epi8(pl1_6, pl1_5, cmpmask4); + + __m128i res1_128 = _mm256_castsi256_si128(pl1_7); + uint32_t tmp1 = (uint32_t)_mm_extract_epi32(res1_128, 0); + uint16_t n = (uint16_t)(tmp1 & 0xffff); + uint16_t chng = (uint16_t)(tmp1 >> 16); + + *final_change = (int16_t)chng; + *min_pos = (int32_t)n; +} + +static INLINE __m256i concatenate_2x128i(__m128i lo, __m128i hi) +{ + __m256i v = _mm256_castsi128_si256(lo); + return _mm256_inserti128_si256(v, hi, 1); +} + +static INLINE void scanord_read_vector_32(const int32_t *__restrict quant_coeff, + const uint32_t *__restrict scan, + int8_t scan_mode, + int32_t subpos, + int32_t width, + __m256i *__restrict v_quant_coeffs) +{ + const size_t row_offsets[4] = { + scan[subpos] + width * 0, + scan[subpos] + width * 1, + scan[subpos] + width * 2, + scan[subpos] + width * 3, + }; + + const __m256i shufmasks[3] = { + _mm256_setr_epi32(5, 2, 6, 0, 3, 7, 4, 1), + _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7), + _mm256_setr_epi32(2, 3, 0, 1, 6, 7, 4, 5), + }; + + const __m256i blend_masks[3] = { + _mm256_setr_epi32( 0, 0, 0, -1, 0, 0, -1, -1), + _mm256_setr_epi32( 0, 0, 0, 0, 0, 0, 0, 0), + _mm256_setr_epi32( 0, 0, -1, -1, 0, 0, -1, -1), + }; + + const __m256i rearr_masks_lo[3] = { + _mm256_setr_epi32(0, 4, 1, 3, 5, 2, 6, 7), + _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7), + _mm256_setr_epi32(0, 4, 2, 6, 1, 5, 3, 7), + }; + + const __m256i rearr_masks_hi[3] = { + _mm256_setr_epi32(6, 3, 0, 1, 7, 2, 4, 5), + _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7), + _mm256_setr_epi32(2, 6, 0, 4, 3, 7, 1, 5), + }; + + __m128i coeffs[4] = { + _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets[0])), + _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets[1])), + _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets[2])), + _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets[3])), + }; + + __m256i coeffs_upper = concatenate_2x128i(coeffs[0], coeffs[1]); + __m256i coeffs_lower = concatenate_2x128i(coeffs[2], coeffs[3]); + + __m256i lower_shuffled = _mm256_permutevar8x32_epi32(coeffs_lower, shufmasks[scan_mode]); + + __m256i upper_blended = _mm256_blendv_epi8(coeffs_upper, lower_shuffled, blend_masks[scan_mode]); + __m256i lower_blended = _mm256_blendv_epi8(lower_shuffled, coeffs_upper, blend_masks[scan_mode]); + + __m256i result_lo = 
_mm256_permutevar8x32_epi32(upper_blended, rearr_masks_lo[scan_mode]); + __m256i result_hi = _mm256_permutevar8x32_epi32(lower_blended, rearr_masks_hi[scan_mode]); + + v_quant_coeffs[0] = result_lo; + v_quant_coeffs[1] = result_hi; +} + +#define VEC_WIDTH 16 +#define SCAN_SET_SIZE 16 +#define LOG2_SCAN_SET_SIZE 4 + +static INLINE int32_t hide_block_sign(__m256i coefs, __m256i q_coefs, __m256i deltas_h, __m256i deltas_l, coeff_t * __restrict q_coef, const uint32_t * __restrict scan, int32_t subpos, int32_t last_cg) +{ + assert(SCAN_SET_SIZE == 16); + + int32_t first_nz_pos_in_cg, last_nz_pos_in_cg; + int32_t abssum = 0; + + // Find first and last nonzero coeffs + get_first_last_nz_int16(q_coefs, &first_nz_pos_in_cg, &last_nz_pos_in_cg); + + // Sum all kvz_quant coeffs between first and last + abssum = hsum32_16x16i(q_coefs); + + if (last_nz_pos_in_cg >= 0 && last_cg == -1) { + last_cg = 1; + } + + if (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4) { + + uint32_t q_coef_signbits = _mm256_movemask_epi8(q_coefs); + int32_t signbit = (q_coef_signbits >> (2 * first_nz_pos_in_cg + 1)) & 0x1; + + if (signbit != (abssum & 0x1)) { // compare signbit with sum_parity + int32_t min_pos; + int16_t final_change; + int16_t cheapest_q; + + const int32_t mask_max = (last_cg == 1) ? last_nz_pos_in_cg : SCAN_SET_SIZE - 1; + + const __m256i zero = _mm256_setzero_si256(); + const __m256i ones = _mm256_set1_epi16(1); + const __m256i maxiters = _mm256_set1_epi16(mask_max); + const __m256i ff = _mm256_set1_epi8(0xff); + + const __m256i fnpics = _mm256_set1_epi16((int16_t)first_nz_pos_in_cg); + const __m256i ns = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + __m256i block_signbit = _mm256_set1_epi16(((int16_t)signbit) * -1); + __m256i coef_signbits = _mm256_cmpgt_epi16(zero, coefs); + __m256i signbits_equal_block = _mm256_cmpeq_epi16(coef_signbits, block_signbit); + + __m256i q_coefs_zero = _mm256_cmpeq_epi16(q_coefs, zero); + + __m256i dus_packed = _mm256_packs_epi32(deltas_l, deltas_h); + __m256i dus_ordered = _mm256_permute4x64_epi64(dus_packed, _MM_SHUFFLE(3, 1, 2, 0)); + __m256i dus_positive = _mm256_cmpgt_epi16(dus_ordered, zero); + + __m256i q_coef_abss = _mm256_abs_epi16(q_coefs); + __m256i q_coefs_plusminus_one = _mm256_cmpeq_epi16(q_coef_abss, ones); + + __m256i eq_fnpics = _mm256_cmpeq_epi16(fnpics, ns); + __m256i lt_fnpics = _mm256_cmpgt_epi16(fnpics, ns); + + __m256i maxcost_subcond1s = _mm256_and_si256(eq_fnpics, q_coefs_plusminus_one); + __m256i maxcost_subcond2s = _mm256_andnot_si256(signbits_equal_block, lt_fnpics); + __m256i elsecond1s_inv = _mm256_or_si256(dus_positive, maxcost_subcond1s); + __m256i elsecond1s = _mm256_andnot_si256(elsecond1s_inv, ff); + + __m256i outside_maxiters = _mm256_cmpgt_epi16(ns, maxiters); + + __m256i negdelta_cond1s = _mm256_andnot_si256(q_coefs_zero, dus_positive); + __m256i negdelta_cond2s = _mm256_andnot_si256(maxcost_subcond2s, q_coefs_zero); + __m256i negdelta_mask16s_part1 = _mm256_or_si256(negdelta_cond1s, negdelta_cond2s); + __m256i negdelta_mask16s = _mm256_andnot_si256(outside_maxiters, negdelta_mask16s_part1); + + __m256i posdelta_mask16s_part1 = _mm256_andnot_si256(q_coefs_zero, elsecond1s); + __m256i posdelta_mask16s = _mm256_andnot_si256(outside_maxiters, posdelta_mask16s_part1); + + __m256i maxcost_cond1_parts = _mm256_andnot_si256(dus_positive, maxcost_subcond1s); + __m256i maxcost_cond1s = _mm256_andnot_si256(q_coefs_zero, maxcost_cond1_parts); + __m256i maxcost_cond2s = _mm256_and_si256(q_coefs_zero, maxcost_subcond2s); + 
__m256i maxcost_mask16s_parts = _mm256_or_si256(maxcost_cond1s, maxcost_cond2s); + __m256i maxcost_mask16s = _mm256_or_si256(maxcost_mask16s_parts, outside_maxiters); + + __m128i tmp_l, tmp_h; + tmp_l = _mm256_extracti128_si256(negdelta_mask16s, 0); + tmp_h = _mm256_extracti128_si256(negdelta_mask16s, 1); + __m256i negdelta_mask32s_l = _mm256_cvtepi16_epi32(tmp_l); + __m256i negdelta_mask32s_h = _mm256_cvtepi16_epi32(tmp_h); + + tmp_l = _mm256_extracti128_si256(posdelta_mask16s, 0); + tmp_h = _mm256_extracti128_si256(posdelta_mask16s, 1); + __m256i posdelta_mask32s_l = _mm256_cvtepi16_epi32(tmp_l); + __m256i posdelta_mask32s_h = _mm256_cvtepi16_epi32(tmp_h); + + tmp_l = _mm256_extracti128_si256(maxcost_mask16s, 0); + tmp_h = _mm256_extracti128_si256(maxcost_mask16s, 1); + __m256i maxcost_mask32s_l = _mm256_cvtepi16_epi32(tmp_l); + __m256i maxcost_mask32s_h = _mm256_cvtepi16_epi32(tmp_h); + + // Output value generation + // cur_change_max: zero + // cur_change_negdelta: ff + // cur_change_posdelta: ones + __m256i costs_negdelta_h = _mm256_sub_epi32(zero, deltas_h); + __m256i costs_negdelta_l = _mm256_sub_epi32(zero, deltas_l); + // costs_posdelta_l and _h: deltas_l and _h + __m256i costs_max_lh = _mm256_set1_epi32(0x7fffffff); + + __m256i change_neg = _mm256_and_si256(negdelta_mask16s, ones); + __m256i change_pos = _mm256_and_si256(posdelta_mask16s, ff); + __m256i change_max = _mm256_and_si256(maxcost_mask16s, zero); + + __m256i cost_neg_l = _mm256_and_si256(negdelta_mask32s_l, costs_negdelta_l); + __m256i cost_neg_h = _mm256_and_si256(negdelta_mask32s_h, costs_negdelta_h); + __m256i cost_pos_l = _mm256_and_si256(posdelta_mask32s_l, deltas_l); + __m256i cost_pos_h = _mm256_and_si256(posdelta_mask32s_h, deltas_h); + __m256i cost_max_l = _mm256_and_si256(maxcost_mask32s_l, costs_max_lh); + __m256i cost_max_h = _mm256_and_si256(maxcost_mask32s_h, costs_max_lh); + + __m256i changes = _mm256_or_si256(change_neg, _mm256_or_si256(change_pos, change_max)); + __m256i costs_l = _mm256_or_si256(cost_neg_l, _mm256_or_si256(cost_pos_l, cost_max_l)); + __m256i costs_h = _mm256_or_si256(cost_neg_h, _mm256_or_si256(cost_pos_h, cost_max_h)); + + get_cheapest_alternative(costs_h, costs_l, ns, changes, &final_change, &min_pos); + const int32_t best_id = scan[min_pos + subpos]; + + cheapest_q = q_coef[best_id]; + if (cheapest_q == 32767 || cheapest_q == -32768) + final_change = -1; + + uint32_t coef_signs = _mm256_movemask_epi8(coef_signbits); + uint32_t cheapest_coef_sign_mask = (uint32_t)(1 << (2 * min_pos)); + + if (!(coef_signs & cheapest_coef_sign_mask)) + cheapest_q += final_change; + else + cheapest_q -= final_change; + + q_coef[best_id] = cheapest_q; + } // Hide + } + if (last_cg == 1) + last_cg = 0; + + return last_cg; +} /** * \brief quantize transformed coefficents * */ -void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width, +void kvz_quant_avx2(const encoder_state_t * const state, const coeff_t * __restrict coef, coeff_t * __restrict q_coef, int32_t width, int32_t height, int8_t type, int8_t scan_idx, int8_t block_type) { const encoder_control_t * const encoder = state->encoder_control; const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2; - const uint32_t * const scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1]; + const uint32_t * const __restrict scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1]; int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6); const uint32_t log2_tr_size = 
kvz_g_convert_to_bit[width] + 2; @@ -61,28 +361,58 @@ const int32_t add = ((state->frame->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9); const int32_t q_bits8 = q_bits - 8; - assert(quant_coeff[0] <= (1 << 15) - 1 && quant_coeff[0] >= -(1 << 15)); //Assuming flat values to fit int16_t - uint32_t ac_sum = 0; + int32_t last_cg = -1; __m256i v_ac_sum = _mm256_setzero_si256(); - __m256i v_quant_coeff = _mm256_set1_epi16(quant_coeff[0]); - for (int32_t n = 0; n < width * height; n += 16) { + // Loading once is enough if scaling lists are not off + __m256i low_b = _mm256_setzero_si256(), high_b = _mm256_setzero_si256(); + if (!(state->encoder_control->scaling_list.enable)) { + low_b = _mm256_set1_epi32(quant_coeff[0]); + high_b = low_b; + } + + for (int32_t n = 0; n < width * height; n += VEC_WIDTH) { - __m256i v_level = _mm256_loadu_si256((__m256i*)&(coef[n])); + __m256i v_level = _mm256_loadu_si256((__m256i *)(coef + n)); __m256i v_sign = _mm256_cmpgt_epi16(_mm256_setzero_si256(), v_level); v_sign = _mm256_or_si256(v_sign, _mm256_set1_epi16(1)); - v_level = _mm256_abs_epi16(v_level); - __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0)); - __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0)); + if (state->encoder_control->scaling_list.enable) { + __m256i v_quant_coeff_lo = _mm256_loadu_si256(((__m256i *)(quant_coeff + n)) + 0); + __m256i v_quant_coeff_hi = _mm256_loadu_si256(((__m256i *)(quant_coeff + n)) + 1); - __m256i low_b = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0)); - __m256i high_b = _mm256_unpackhi_epi16(v_quant_coeff, _mm256_set1_epi16(0)); + low_b = _mm256_permute2x128_si256(v_quant_coeff_lo, + v_quant_coeff_hi, + 0x20); + + high_b = _mm256_permute2x128_si256(v_quant_coeff_lo, + v_quant_coeff_hi, + 0x31); + } - __m256i v_level32_a = _mm256_madd_epi16(low_a, low_b); - __m256i v_level32_b = _mm256_madd_epi16(high_a, high_b); +// TODO: do we need to have this? 
+// #define CHECK_QUANT_COEFFS +#ifdef CHECK_QUANT_COEFFS + __m256i abs_vq_lo = _mm256_abs_epi32(v_quant_coeff_lo); + __m256i abs_vq_hi = _mm256_abs_epi32(v_quant_coeff_hi); + + __m256i vq_over_16b_lo = _mm256_cmpgt_epi32(abs_vq_lo, _mm256_set1_epi32(0x7fff)); + __m256i vq_over_16b_hi = _mm256_cmpgt_epi32(abs_vq_hi, _mm256_set1_epi32(0x7fff)); + + uint32_t over_16b_mask_lo = _mm256_movemask_epi8(vq_over_16b_lo); + uint32_t over_16b_mask_hi = _mm256_movemask_epi8(vq_over_16b_hi); + + assert(!(over_16b_mask_lo || over_16b_mask_hi)); +#endif + + v_level = _mm256_abs_epi16(v_level); + __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_setzero_si256()); + __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_setzero_si256()); + + __m256i v_level32_a = _mm256_mullo_epi32(low_a, low_b); + __m256i v_level32_b = _mm256_mullo_epi32(high_a, high_b); v_level32_a = _mm256_add_epi32(v_level32_a, _mm256_set1_epi32(add)); v_level32_b = _mm256_add_epi32(v_level32_b, _mm256_set1_epi32(add)); @@ -93,7 +423,7 @@ v_level = _mm256_packs_epi32(v_level32_a, v_level32_b); v_level = _mm256_sign_epi16(v_level, v_sign); - _mm256_storeu_si256((__m256i*)&(q_coef[n]), v_level); + _mm256_storeu_si256((__m256i *)(q_coef + n), v_level); v_ac_sum = _mm256_add_epi32(v_ac_sum, v_level32_a); v_ac_sum = _mm256_add_epi32(v_ac_sum, v_level32_b); @@ -104,23 +434,47 @@ temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, _MM_SHUFFLE(0, 1, 0, 1))); ac_sum += _mm_cvtsi128_si32(temp); - if (!encoder->cfg.signhide_enable || ac_sum < 2) return; + if (!encoder->cfg.signhide_enable || ac_sum < 2) + return; - int32_t delta_u[LCU_WIDTH*LCU_WIDTH >> 2]; + assert(VEC_WIDTH == SCAN_SET_SIZE); + for (int32_t subpos = (width * height - 1) & (~(VEC_WIDTH - 1)); subpos >= 0; subpos -= VEC_WIDTH) { + const int16_t *coeffs[2] = {coef, q_coef}; + __m256i result_coeffs[2]; + __m256i v_quant_coeffs[2]; - for (int32_t n = 0; n < width * height; n += 16) { + __m256i v_coef, q_coefs; + __m256i v_quant_coeff_lo, v_quant_coeff_hi; - __m256i v_level = _mm256_loadu_si256((__m256i*)&(coef[n])); + scanord_read_vector(coeffs, scan, scan_idx, subpos, width, result_coeffs, 2); - v_level = _mm256_abs_epi16(v_level); - __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0)); - __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0)); + v_coef = result_coeffs[0]; + q_coefs = result_coeffs[1]; + + if (state->encoder_control->scaling_list.enable) { + scanord_read_vector_32(quant_coeff, scan, scan_idx, subpos, width, v_quant_coeffs); - __m256i low_b = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0)); - __m256i high_b = _mm256_unpackhi_epi16(v_quant_coeff, _mm256_set1_epi16(0)); + v_quant_coeff_lo = v_quant_coeffs[0]; + v_quant_coeff_hi = v_quant_coeffs[1]; - __m256i v_level32_a = _mm256_madd_epi16(low_a, low_b); - __m256i v_level32_b = _mm256_madd_epi16(high_a, high_b); + low_b = _mm256_permute2x128_si256(v_quant_coeff_lo, + v_quant_coeff_hi, + 0x20); + + high_b = _mm256_permute2x128_si256(v_quant_coeff_lo, + v_quant_coeff_hi, + 0x31); + } + + __m256i v_level = _mm256_abs_epi16(v_coef); + __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_setzero_si256()); + __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_setzero_si256()); + + __m256i v_quant_coeff_a = _mm256_or_si256(low_b, _mm256_setzero_si256()); + __m256i v_quant_coeff_b = _mm256_or_si256(high_b, _mm256_setzero_si256()); + + __m256i v_level32_a = _mm256_mullo_epi32(low_a, low_b); + __m256i v_level32_b = _mm256_mullo_epi32(high_a, high_b); v_level32_a = 
_mm256_add_epi32(v_level32_a, _mm256_set1_epi32(add)); v_level32_b = _mm256_add_epi32(v_level32_b, _mm256_set1_epi32(add)); @@ -130,107 +484,26 @@ v_level = _mm256_packs_epi32(v_level32_a, v_level32_b); - __m256i v_coef = _mm256_loadu_si256((__m256i*)&(coef[n])); __m256i v_coef_a = _mm256_unpacklo_epi16(_mm256_abs_epi16(v_coef), _mm256_set1_epi16(0)); __m256i v_coef_b = _mm256_unpackhi_epi16(_mm256_abs_epi16(v_coef), _mm256_set1_epi16(0)); - __m256i v_quant_coeff_a = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0)); - __m256i v_quant_coeff_b = _mm256_unpackhi_epi16(v_quant_coeff, _mm256_set1_epi16(0)); - v_coef_a = _mm256_madd_epi16(v_coef_a, v_quant_coeff_a); - v_coef_b = _mm256_madd_epi16(v_coef_b, v_quant_coeff_b); + + v_coef_a = _mm256_mullo_epi32(v_coef_a, v_quant_coeff_a); + v_coef_b = _mm256_mullo_epi32(v_coef_b, v_quant_coeff_b); + v_coef_a = _mm256_sub_epi32(v_coef_a, _mm256_slli_epi32(_mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0)), q_bits) ); v_coef_b = _mm256_sub_epi32(v_coef_b, _mm256_slli_epi32(_mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0)), q_bits) ); v_coef_a = _mm256_srai_epi32(v_coef_a, q_bits8); v_coef_b = _mm256_srai_epi32(v_coef_b, q_bits8); - _mm_storeu_si128((__m128i*)&(delta_u[n+0*4]), _mm256_castsi256_si128(v_coef_a)); - _mm_storeu_si128((__m128i*)&(delta_u[n+2*4]), _mm256_extracti128_si256(v_coef_a, 1)); - _mm_storeu_si128((__m128i*)&(delta_u[n+1*4]), _mm256_castsi256_si128(v_coef_b)); - _mm_storeu_si128((__m128i*)&(delta_u[n+3*4]), _mm256_extracti128_si256(v_coef_b, 1)); - } - - if (ac_sum >= 2) { -#define SCAN_SET_SIZE 16 -#define LOG2_SCAN_SET_SIZE 4 - int32_t n, last_cg = -1, abssum = 0, subset, subpos; - for (subset = (width*height - 1) >> LOG2_SCAN_SET_SIZE; subset >= 0; subset--) { - int32_t first_nz_pos_in_cg = SCAN_SET_SIZE, last_nz_pos_in_cg = -1; - subpos = subset << LOG2_SCAN_SET_SIZE; - abssum = 0; - - // Find last coeff pos - for (n = SCAN_SET_SIZE - 1; n >= 0; n--) { - if (q_coef[scan[n + subpos]]) { - last_nz_pos_in_cg = n; - break; - } - } + __m256i deltas_h = _mm256_permute2x128_si256(v_coef_a, v_coef_b, 0x31); + __m256i deltas_l = _mm256_permute2x128_si256(v_coef_a, v_coef_b, 0x20); - // First coeff pos - for (n = 0; n <SCAN_SET_SIZE; n++) { - if (q_coef[scan[n + subpos]]) { - first_nz_pos_in_cg = n; - break; - } - } - - // Sum all kvz_quant coeffs between first and last - for (n = first_nz_pos_in_cg; n <= last_nz_pos_in_cg; n++) { - abssum += q_coef[scan[n + subpos]]; - } - - if (last_nz_pos_in_cg >= 0 && last_cg == -1) { - last_cg = 1; - } - - if (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4) { - int32_t signbit = (q_coef[scan[subpos + first_nz_pos_in_cg]] > 0 ? 0 : 1); - if (signbit != (abssum & 0x1)) { // compare signbit with sum_parity - int32_t min_cost_inc = 0x7fffffff, min_pos = -1, cur_cost = 0x7fffffff; - int16_t final_change = 0, cur_change = 0; - for (n = (last_cg == 1 ? last_nz_pos_in_cg : SCAN_SET_SIZE - 1); n >= 0; n--) { - uint32_t blkPos = scan[n + subpos]; - if (q_coef[blkPos] != 0) { - if (delta_u[blkPos] > 0) { - cur_cost = -delta_u[blkPos]; - cur_change = 1; - } - else if (n == first_nz_pos_in_cg && abs(q_coef[blkPos]) == 1) { - cur_cost = 0x7fffffff; - } - else { - cur_cost = delta_u[blkPos]; - cur_change = -1; - } - } - else if (n < first_nz_pos_in_cg && ((coef[blkPos] >= 0) ? 
0 : 1) != signbit) { - cur_cost = 0x7fffffff; - } - else { - cur_cost = -delta_u[blkPos]; - cur_change = 1; - } - - if (cur_cost < min_cost_inc) { - min_cost_inc = cur_cost; - final_change = cur_change; - min_pos = blkPos; - } - } // CG loop - - if (q_coef[min_pos] == 32767 || q_coef[min_pos] == -32768) { - final_change = -1; - } - - if (coef[min_pos] >= 0) q_coef[min_pos] += final_change; - else q_coef[min_pos] -= final_change; - } // Hide - } - if (last_cg == 1) last_cg = 0; - } + last_cg = hide_block_sign(v_coef, q_coefs, deltas_h, deltas_l, q_coef, scan, subpos, last_cg); + } +#undef VEC_WIDTH #undef SCAN_SET_SIZE #undef LOG2_SCAN_SET_SIZE - } } static INLINE __m128i get_residual_4x1_avx2(const kvz_pixel *a_in, const kvz_pixel *b_in){ @@ -375,7 +648,7 @@ kvz_transformskip(state->encoder_control, residual, coeff, width); } else { - kvz_transform2d(state->encoder_control, residual, coeff, width, (color == COLOR_Y ? 0 : 65535)); + kvz_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu->type); } // Quantize coeffs. (coeff -> coeff_out) @@ -408,7 +681,7 @@ kvz_itransformskip(state->encoder_control, residual, coeff, width); } else { - kvz_itransform2d(state->encoder_control, residual, coeff, width, (color == COLOR_Y ? 0 : 65535)); + kvz_itransform2d(state->encoder_control, residual, coeff, width, color, cur_cu->type); } // Get quantized reconstruction. (residual + pred_in -> rec_out) @@ -429,17 +702,6 @@ return has_coeffs; } -void kvz_quant_avx2(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width, - int32_t height, int8_t type, int8_t scan_idx, int8_t block_type) -{ - if (state->encoder_control->scaling_list.enable){ - kvz_quant_generic(state, coef, q_coef, width, height, type, scan_idx, block_type); - } - else { - kvz_quant_flat_avx2(state, coef, q_coef, width, height, type, scan_idx, block_type); - } -} - /** * \brief inverse quantize transformed and quantized coefficents * @@ -524,6 +786,81 @@ return parts[0] + parts[1] + parts[2] + parts[3]; } +#define TO_Q88(f) ((int16_t)((f) * 256.0f)) + +static uint32_t fast_coeff_cost_avx2(const coeff_t *coeff, int32_t width, int32_t qp) +{ +#define NUM_BUCKETS 5 + static const int16_t wt_m[NUM_BUCKETS] = { + TO_Q88(-0.004916), + TO_Q88( 0.010806), + TO_Q88( 0.055562), + TO_Q88( 0.033436), + TO_Q88(-0.007690), + }; + static const int16_t wt_c[NUM_BUCKETS] = { + TO_Q88( 0.172024), + TO_Q88( 3.421462), + TO_Q88( 2.879506), + TO_Q88( 5.585471), + TO_Q88( 0.256772), + }; + + const __m256i zero = _mm256_setzero_si256(); + const __m256i threes = _mm256_set1_epi16(3); + const __m256i ones = _mm256_srli_epi16(threes, 1); + const __m256i twos = _mm256_slli_epi16(ones, 1); + + __m256i wt[NUM_BUCKETS - 1]; + for (int32_t i = 0; i < NUM_BUCKETS - 1; i++) + wt[i] = _mm256_set1_epi16(wt_m[i] * qp + wt_c[i]); + + uint32_t wid_wt = width * (wt_m[NUM_BUCKETS - 1] * qp + wt_c[NUM_BUCKETS - 1]); + __m256i avx_inc = _mm256_setzero_si256(); + + for (int32_t i = 0; i < width * width; i += 16) { + __m256i curr = _mm256_loadu_si256((__m256i *)(coeff + i)); + __m256i curr_abs = _mm256_abs_epi16 (curr); + __m256i curr_max3 = _mm256_min_epi16 (curr_abs, threes); + + __m256i curr_eq_0 = _mm256_cmpeq_epi16(curr_max3, zero); + __m256i curr_eq_1 = _mm256_cmpeq_epi16(curr_max3, ones); + __m256i curr_eq_2 = _mm256_cmpeq_epi16(curr_max3, twos); + __m256i curr_eq_3 = _mm256_cmpeq_epi16(curr_max3, threes); + + __m256i curr_0_wt = _mm256_and_si256 (curr_eq_0, wt[0]); + __m256i curr_1_wt = _mm256_and_si256 (curr_eq_1, wt[1]); + 
__m256i curr_2_wt = _mm256_and_si256 (curr_eq_2, wt[2]); + __m256i curr_3_wt = _mm256_and_si256 (curr_eq_3, wt[3]); + + // Use madd to horizontally sum 16-bit weights into 32-bit atoms + __m256i wt_0_32b = _mm256_madd_epi16(curr_0_wt, ones); + __m256i wt_1_32b = _mm256_madd_epi16(curr_1_wt, ones); + __m256i wt_2_32b = _mm256_madd_epi16(curr_2_wt, ones); + __m256i wt_3_32b = _mm256_madd_epi16(curr_3_wt, ones); + + __m256i wt_01 = _mm256_add_epi32(wt_0_32b, wt_1_32b); + __m256i wt_23 = _mm256_add_epi32(wt_2_32b, wt_3_32b); + __m256i curr_wts = _mm256_add_epi32(wt_01, wt_23); + avx_inc = _mm256_add_epi32(avx_inc, curr_wts); + } + __m128i inchi = _mm256_extracti128_si256(avx_inc, 1); + __m128i inclo = _mm256_castsi256_si128 (avx_inc); + + __m128i sum_1 = _mm_add_epi32 (inclo, inchi); + __m128i sum_2 = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sum_3 = _mm_add_epi32 (sum_1, sum_2); + __m128i sum_4 = _mm_shuffle_epi32(sum_3, _MM_SHUFFLE(2, 3, 0, 1)); + __m128i sum = _mm_add_epi32 (sum_3, sum_4); + + uint32_t sum_u32 = _mm_cvtsi128_si32(sum); + uint32_t sum_total = sum_u32 + wid_wt; + return sum_total >> 8; +#undef NUM_BUCKETS +} + +#undef TO_Q88 + #endif //COMPILE_INTEL_AVX2 && defined X86_64 int kvz_strategy_register_quant_avx2(void* opaque, uint8_t bitdepth) @@ -537,6 +874,7 @@ success &= kvz_strategyselector_register(opaque, "dequant", "avx2", 40, &kvz_dequant_avx2); } success &= kvz_strategyselector_register(opaque, "coeff_abs_sum", "avx2", 0, &coeff_abs_sum_avx2); + success &= kvz_strategyselector_register(opaque, "fast_coeff_cost", "avx2", 40, &fast_coeff_cost_avx2); #endif //COMPILE_INTEL_AVX2 && defined X86_64 return success;
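The new fast_coeff_cost_avx2 registered above estimates coefficient coding cost with a small linear model in Q8.8 fixed point: each coefficient is bucketed by min(|c|, 3), every bucket weight is wt_m * qp + wt_c from the TO_Q88 tables, and the accumulated Q8.8 sum is shifted right by 8. A scalar sketch of the same model; passing the tables as parameters is purely an illustration, the real code uses the static wt_m/wt_c arrays shown in the diff:

#include <stdint.h>
#include <stdlib.h>

/* Illustrative scalar version of the Q8.8 coefficient-cost model. */
static uint32_t fast_coeff_cost_ref(const int16_t *coeff, int32_t width, int32_t qp,
                                    const int16_t wt_m[5], const int16_t wt_c[5])
{
  int32_t wt[4];
  for (int i = 0; i < 4; ++i)
    wt[i] = wt_m[i] * qp + wt_c[i];          /* per-bucket weight, Q8.8 */

  /* Constant per-block term, scaled by block width (not width * width). */
  int64_t sum = (int64_t)width * (wt_m[4] * qp + wt_c[4]);

  for (int32_t i = 0; i < width * width; ++i) {
    int32_t level = abs((int)coeff[i]);
    if (level > 3) level = 3;                /* same clamp as _mm256_min_epi16 */
    sum += wt[level];                        /* zero coefficients also pay wt[0] */
  }
  return (uint32_t)(sum >> 8);               /* drop the 8 fractional bits */
}

In the vector version the 16-bit bucket weights are widened with _mm256_madd_epi16 against a vector of ones before accumulation, which is what keeps the running sum from overflowing 16 bits.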
View file
kvazaar-1.3.0.tar.gz/src/strategies/avx2/reg_sad_pow2_widths-avx2.h
Added
@@ -0,0 +1,209 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#ifndef REG_SAD_POW2_WIDTHS_AVX2_H_ +#define REG_SAD_POW2_WIDTHS_AVX2_H_ + +#include "strategies/sse41/reg_sad_pow2_widths-sse41.h" +#include "kvazaar.h" + +static INLINE uint32_t reg_sad_w32(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + __m256i avx_inc = _mm256_setzero_si256(); + int32_t y; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + for (y = 0; y < height_fourline_groups; y += 4) { + __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1)); + __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2)); + __m256i c = _mm256_loadu_si256((const __m256i *)(data1 + (y + 1) * stride1)); + __m256i d = _mm256_loadu_si256((const __m256i *)(data2 + (y + 1) * stride2)); + __m256i e = _mm256_loadu_si256((const __m256i *)(data1 + (y + 2) * stride1)); + __m256i f = _mm256_loadu_si256((const __m256i *)(data2 + (y + 2) * stride2)); + __m256i g = _mm256_loadu_si256((const __m256i *)(data1 + (y + 3) * stride1)); + __m256i h = _mm256_loadu_si256((const __m256i *)(data2 + (y + 3) * stride2)); + + __m256i curr_sads_ab = _mm256_sad_epu8(a, b); + __m256i curr_sads_cd = _mm256_sad_epu8(c, d); + __m256i curr_sads_ef = _mm256_sad_epu8(e, f); + __m256i curr_sads_gh = _mm256_sad_epu8(g, h); + + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_cd); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ef); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1)); + __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2)); + + __m256i curr_sads = _mm256_sad_epu8(a, b); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads); + } + } + + __m128i inchi = _mm256_extracti128_si256(avx_inc, 1); + __m128i inclo = _mm256_castsi256_si128 (avx_inc); + + __m128i sum_1 = _mm_add_epi64 (inclo, inchi); + __m128i sum_2 = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sum_1, sum_2); + + return _mm_cvtsi128_si32(sad); +} + +static INLINE uint32_t reg_sad_w64(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + __m256i avx_inc = _mm256_setzero_si256(); + int32_t y; + + const int32_t height_twoline_groups = height & ~1; + const int32_t height_residual_lines 
= height & 1; + + for (y = 0; y < height_twoline_groups; y += 2) { + __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1)); + __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2)); + __m256i c = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1 + 32)); + __m256i d = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2 + 32)); + + __m256i e = _mm256_loadu_si256((const __m256i *)(data1 + (y + 1) * stride1)); + __m256i f = _mm256_loadu_si256((const __m256i *)(data2 + (y + 1) * stride2)); + __m256i g = _mm256_loadu_si256((const __m256i *)(data1 + (y + 1) * stride1 + 32)); + __m256i h = _mm256_loadu_si256((const __m256i *)(data2 + (y + 1) * stride2 + 32)); + + __m256i curr_sads_ab = _mm256_sad_epu8(a, b); + __m256i curr_sads_cd = _mm256_sad_epu8(c, d); + __m256i curr_sads_ef = _mm256_sad_epu8(e, f); + __m256i curr_sads_gh = _mm256_sad_epu8(g, h); + + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_cd); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ef); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1)); + __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2)); + __m256i c = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1 + 32)); + __m256i d = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2 + 32)); + + __m256i curr_sads_ab = _mm256_sad_epu8(a, b); + __m256i curr_sads_cd = _mm256_sad_epu8(c, d); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab); + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_cd); + } + } + + __m128i inchi = _mm256_extracti128_si256(avx_inc, 1); + __m128i inclo = _mm256_castsi256_si128 (avx_inc); + + __m128i sum_1 = _mm_add_epi64 (inclo, inchi); + __m128i sum_2 = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sum_1, sum_2); + + return _mm_cvtsi128_si32(sad); +} + +static uint32_t hor_sad_avx2_w32(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t pic_stride, uint32_t ref_stride, + const uint32_t left, const uint32_t right) +{ + __m256i avx_inc = _mm256_setzero_si256(); + + const size_t block_width = 32; + const size_t block_width_log2 = 5; + const size_t lane_width = 16; + + const int32_t left_eq_wid = left >> block_width_log2; + const int32_t left_clamped = left - left_eq_wid; + const int32_t right_eq_wid = right >> block_width_log2; + const int32_t right_clamped = right - right_eq_wid; + + const __m256i zero = _mm256_setzero_si256(); + const __m256i lane_widths = _mm256_set1_epi8((uint8_t)lane_width); + const __m256i lefts = _mm256_set1_epi8((uint8_t)left_clamped); + const __m256i rights = _mm256_set1_epi8((uint8_t)right_clamped); + const __m256i unsign_mask = _mm256_set1_epi8(0x7f); + const __m256i ns = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + + const __m256i rightmost_good_idx = _mm256_set1_epi8((uint8_t)(block_width - right - 1)); + + const __m256i shufmask1_l = _mm256_sub_epi8 (ns, lefts); + const __m256i shufmask1_r = _mm256_add_epi8 (shufmask1_l, rights); + const __m256i shufmask1 = _mm256_and_si256 (shufmask1_r, unsign_mask); + + const __m256i epol_mask_r = _mm256_min_epi8 (ns, rightmost_good_idx); + const __m256i epol_mask = _mm256_max_epi8 (lefts, epol_mask_r); + + const __m256i mlo2hi_mask_l = 
_mm256_cmpgt_epi8(lefts, ns); + const __m256i mlo2hi_imask_r = _mm256_cmpgt_epi8(lane_widths, shufmask1); + const __m256i mlo2hi_mask_r = _mm256_cmpeq_epi8(mlo2hi_imask_r, zero); + + // For left != 0, use low lane of mlo2hi_mask_l as blend mask for high lane. + // For right != 0, use low lane of mlo2hi_mask_r as blend mask for low lane. + const __m256i xchg_mask1 = _mm256_permute2x128_si256(mlo2hi_mask_l, mlo2hi_mask_r, 0x02); + + // If left != 0 (ie. right == 0), the xchg should only affect high lane, + // if right != 0 (ie. left == 0), the low lane. Set bits on the lane that + // the xchg should affect. left == right == 0 should never happen, this'll + // break if it does. + const __m256i lanes_llo_rhi = _mm256_blend_epi32(lefts, rights, 0xf0); + const __m256i xchg_lane_mask = _mm256_cmpeq_epi32(lanes_llo_rhi, zero); + + const __m256i xchg_data_mask = _mm256_and_si256(xchg_mask1, xchg_lane_mask); + + // If we're straddling the left border, start from the left border instead, + // and if right border, end on the border + const int32_t ld_offset = left - right; + + int32_t y; + for (y = 0; y < height; y++) { + __m256i a = _mm256_loadu_si256((__m256i *)(pic_data + (y + 0) * pic_stride + 0)); + __m256i b = _mm256_loadu_si256((__m256i *)(ref_data + (y + 0) * ref_stride + 0 + ld_offset)); + + __m256i b_shifted = _mm256_shuffle_epi8 (b, shufmask1); + __m256i b_lanes_reversed = _mm256_permute4x64_epi64(b_shifted, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i b_data_transfered = _mm256_blendv_epi8 (b_shifted, b_lanes_reversed, xchg_data_mask); + __m256i b_epoled = _mm256_shuffle_epi8 (b_data_transfered, epol_mask); + + __m256i curr_sads_ab = _mm256_sad_epu8(a, b_epoled); + + avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab); + } + __m128i inchi = _mm256_extracti128_si256(avx_inc, 1); + __m128i inclo = _mm256_castsi256_si128 (avx_inc); + + __m128i sum_1 = _mm_add_epi64 (inclo, inchi); + __m128i sum_2 = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sum_1, sum_2); + + return _mm_cvtsi128_si32(sad); +} + +#endif
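Editor's note on reg_sad_pow2_widths-avx2.h: reg_sad_w32 and reg_sad_w64 compute a plain sum of absolute differences, one _mm256_sad_epu8 per 32 pixels, with the per-lane partial sums folded together at the end, while hor_sad_avx2_w32 additionally appears to extrapolate pixels across the left/right picture border with byte shuffles before taking the SAD. A scalar sketch of what reg_sad_w32 computes, assuming the 8-bit kvz_pixel build (the _ref name is illustrative):

#include <stdint.h>
#include <stdlib.h>

typedef uint8_t kvz_pixel;   /* 8-bit build; kvazaar defines this in kvazaar.h */

/* Scalar reference for reg_sad_w32: SAD over a 32-pixel-wide block. */
static uint32_t reg_sad_w32_ref(const kvz_pixel *data1, const kvz_pixel *data2,
                                int32_t height, uint32_t stride1, uint32_t stride2)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < height; y++) {
    for (int32_t x = 0; x < 32; x++) {
      sad += (uint32_t)abs((int)data1[y * stride1 + x] - (int)data2[y * stride2 + x]);
    }
  }
  return sad;
}

The AVX2 version only changes how many rows are consumed per loop iteration (four for w32, two for w64, plus a residual-line loop); the arithmetic result is the same.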
View file
kvazaar-1.3.0.tar.gz/src/strategies/generic/encode_coding_tree-generic.c
Added
@@ -0,0 +1,279 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "strategyselector.h" + +#include "cabac.h" +#include "context.h" +#include "encode_coding_tree-generic.h" +#include "encode_coding_tree.h" + +void kvz_encode_coeff_nxn_generic(encoder_state_t * const state, + cabac_data_t * const cabac, + const coeff_t *coeff, + uint8_t width, + uint8_t type, + int8_t scan_mode, + int8_t tr_skip) +{ + const encoder_control_t * const encoder = state->encoder_control; + int c1 = 1; + uint8_t last_coeff_x = 0; + uint8_t last_coeff_y = 0; + int32_t i; + uint32_t sig_coeffgroup_flag[8 * 8] = { 0 }; + + int8_t be_valid = encoder->cfg.signhide_enable; + int32_t scan_pos_sig; + uint32_t go_rice_param = 0; + uint32_t blk_pos, pos_y, pos_x, sig, ctx_sig; + + // CONSTANTS + const uint32_t num_blk_side = width >> TR_MIN_LOG2_SIZE; + const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2; + const uint32_t *scan = + kvz_g_sig_last_scan[scan_mode][log2_block_size - 1]; + const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 2][scan_mode]; + + // Init base contexts according to block type + cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.cu_sig_coeff_group_model[type]); + cabac_ctx_t *baseCtx = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0]) : + &(cabac->ctx.cu_sig_model_chroma[0]); + + // Scan all coeff groups to find out which of them have coeffs. + // Populate sig_coeffgroup_flag with that info. + + unsigned sig_cg_cnt = 0; + for (int cg_y = 0; cg_y < width / 4; ++cg_y) { + for (int cg_x = 0; cg_x < width / 4; ++cg_x) { + unsigned cg_pos = cg_y * width * 4 + cg_x * 4; + for (int coeff_row = 0; coeff_row < 4; ++coeff_row) { + // Load four 16-bit coeffs and see if any of them are non-zero. + unsigned coeff_pos = cg_pos + coeff_row * width; + uint64_t four_coeffs = *(uint64_t*)(&coeff[coeff_pos]); + if (four_coeffs) { + ++sig_cg_cnt; + unsigned cg_pos_y = (cg_pos >> log2_block_size) >> TR_MIN_LOG2_SIZE; + unsigned cg_pos_x = (cg_pos & (width - 1)) >> TR_MIN_LOG2_SIZE; + sig_coeffgroup_flag[cg_pos_x + cg_pos_y * num_blk_side] = 1; + break; + } + } + } + } + + // Rest of the code assumes at least one non-zero coeff. + assert(sig_cg_cnt > 0); + + // Find the last coeff group by going backwards in scan order. + unsigned scan_cg_last = num_blk_side * num_blk_side - 1; + while (!sig_coeffgroup_flag[scan_cg[scan_cg_last]]) { + --scan_cg_last; + } + + // Find the last coeff by going backwards in scan order. 
+ unsigned scan_pos_last = scan_cg_last * 16 + 15; + while (!coeff[scan[scan_pos_last]]) { + --scan_pos_last; + } + + int pos_last = scan[scan_pos_last]; + + // transform skip flag + if(width == 4 && encoder->cfg.trskip_enable) { + cabac->cur_ctx = (type == 0) ? &(cabac->ctx.transform_skip_model_luma) : &(cabac->ctx.transform_skip_model_chroma); + CABAC_BIN(cabac, tr_skip, "transform_skip_flag"); + } + + last_coeff_x = pos_last & (width - 1); + last_coeff_y = (uint8_t)(pos_last >> log2_block_size); + + // Code last_coeff_x and last_coeff_y + kvz_encode_last_significant_xy(cabac, + last_coeff_x, + last_coeff_y, + width, + width, + type, + scan_mode); + + scan_pos_sig = scan_pos_last; + + // significant_coeff_flag + for (i = scan_cg_last; i >= 0; i--) { + int32_t sub_pos = i << 4; // LOG2_SCAN_SET_SIZE; + int32_t abs_coeff[16]; + int32_t cg_blk_pos = scan_cg[i]; + int32_t cg_pos_y = cg_blk_pos / num_blk_side; + int32_t cg_pos_x = cg_blk_pos - (cg_pos_y * num_blk_side); + + uint32_t coeff_signs = 0; + int32_t last_nz_pos_in_cg = -1; + int32_t first_nz_pos_in_cg = 16; + int32_t num_non_zero = 0; + go_rice_param = 0; + + if (scan_pos_sig == scan_pos_last) { + abs_coeff[0] = abs(coeff[pos_last]); + coeff_signs = (coeff[pos_last] < 0); + num_non_zero = 1; + last_nz_pos_in_cg = scan_pos_sig; + first_nz_pos_in_cg = scan_pos_sig; + scan_pos_sig--; + } + + if (i == scan_cg_last || i == 0) { + sig_coeffgroup_flag[cg_blk_pos] = 1; + } else { + uint32_t sig_coeff_group = (sig_coeffgroup_flag[cg_blk_pos] != 0); + uint32_t ctx_sig = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x, + cg_pos_y, width); + cabac->cur_ctx = &base_coeff_group_ctx[ctx_sig]; + CABAC_BIN(cabac, sig_coeff_group, "coded_sub_block_flag"); + } + + if (sig_coeffgroup_flag[cg_blk_pos]) { + int32_t pattern_sig_ctx = kvz_context_calc_pattern_sig_ctx(sig_coeffgroup_flag, + cg_pos_x, cg_pos_y, width); + + for (; scan_pos_sig >= sub_pos; scan_pos_sig--) { + blk_pos = scan[scan_pos_sig]; + pos_y = blk_pos >> log2_block_size; + pos_x = blk_pos - (pos_y << log2_block_size); + sig = (coeff[blk_pos] != 0) ? 1 : 0; + + if (scan_pos_sig > sub_pos || i == 0 || num_non_zero) { + ctx_sig = kvz_context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y, + log2_block_size, type); + cabac->cur_ctx = &baseCtx[ctx_sig]; + CABAC_BIN(cabac, sig, "sig_coeff_flag"); + } + + if (sig) { + abs_coeff[num_non_zero] = abs(coeff[blk_pos]); + coeff_signs = 2 * coeff_signs + (coeff[blk_pos] < 0); + num_non_zero++; + + if (last_nz_pos_in_cg == -1) { + last_nz_pos_in_cg = scan_pos_sig; + } + + first_nz_pos_in_cg = scan_pos_sig; + } + } + } else { + scan_pos_sig = sub_pos - 1; + } + + if (num_non_zero > 0) { + bool sign_hidden = last_nz_pos_in_cg - first_nz_pos_in_cg >= 4 /* SBH_THRESHOLD */ + && !encoder->cfg.lossless; + uint32_t ctx_set = (i > 0 && type == 0) ? 2 : 0; + cabac_ctx_t *base_ctx_mod; + int32_t num_c1_flag, first_c2_flag_idx, idx, first_coeff2; + + if (c1 == 0) { + ctx_set++; + } + + c1 = 1; + + base_ctx_mod = (type == 0) ? &(cabac->ctx.cu_one_model_luma[4 * ctx_set]) : + &(cabac->ctx.cu_one_model_chroma[4 * ctx_set]); + num_c1_flag = MIN(num_non_zero, C1FLAG_NUMBER); + first_c2_flag_idx = -1; + + for (idx = 0; idx < num_c1_flag; idx++) { + uint32_t symbol = (abs_coeff[idx] > 1) ? 
1 : 0; + cabac->cur_ctx = &base_ctx_mod[c1]; + CABAC_BIN(cabac, symbol, "coeff_abs_level_greater1_flag"); + + if (symbol) { + c1 = 0; + + if (first_c2_flag_idx == -1) { + first_c2_flag_idx = idx; + } + } else if ((c1 < 3) && (c1 > 0)) { + c1++; + } + } + + if (c1 == 0) { + base_ctx_mod = (type == 0) ? &(cabac->ctx.cu_abs_model_luma[ctx_set]) : + &(cabac->ctx.cu_abs_model_chroma[ctx_set]); + + if (first_c2_flag_idx != -1) { + uint8_t symbol = (abs_coeff[first_c2_flag_idx] > 2) ? 1 : 0; + cabac->cur_ctx = &base_ctx_mod[0]; + CABAC_BIN(cabac, symbol, "coeff_abs_level_greater2_flag"); + } + } + if (be_valid && sign_hidden) { + coeff_signs = coeff_signs >> 1; + if (!cabac->only_count) + if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) { + coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero-1); + } + CABAC_BINS_EP(cabac, coeff_signs , (num_non_zero - 1), "coeff_sign_flag"); + } else { + if (!cabac->only_count) + if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFF_SIGNS) + coeff_signs = coeff_signs ^ kvz_crypto_get_key(state->crypto_hdl, num_non_zero); + CABAC_BINS_EP(cabac, coeff_signs, num_non_zero, "coeff_sign_flag"); + } + + if (c1 == 0 || num_non_zero > C1FLAG_NUMBER) { + first_coeff2 = 1; + + for (idx = 0; idx < num_non_zero; idx++) { + int32_t base_level = (idx < C1FLAG_NUMBER) ? (2 + first_coeff2) : 1; + + if (abs_coeff[idx] >= base_level) { + if (!cabac->only_count) { + if (encoder->cfg.crypto_features & KVZ_CRYPTO_TRANSF_COEFFS) + kvz_cabac_write_coeff_remain_encry(state, cabac, abs_coeff[idx] - base_level, go_rice_param, base_level); + else + kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param); + } else + kvz_cabac_write_coeff_remain(cabac, abs_coeff[idx] - base_level, go_rice_param); + + if (abs_coeff[idx] > 3 * (1 << go_rice_param)) { + go_rice_param = MIN(go_rice_param + 1, 4); + } + } + + if (abs_coeff[idx] >= 2) { + first_coeff2 = 0; + } + } + } + } + } +} + +int kvz_strategy_register_encode_generic(void* opaque, uint8_t bitdepth) +{ + bool success = true; + + success &= kvz_strategyselector_register(opaque, "encode_coeff_nxn", "generic", 0, &kvz_encode_coeff_nxn_generic); + + return success; +}
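Editor's note on kvz_encode_coeff_nxn_generic above: before any CABAC coding it marks which 4x4 coefficient groups contain non-zero values by reading four 16-bit coefficients per row as one 64-bit word and testing it against zero. A small standalone sketch of just that detection step, with coeff_t taken as int16_t as in kvazaar; the helper name is illustrative, and memcpy stands in for the type-punned load used in the diff:

#include <stdint.h>
#include <string.h>
#include <stdbool.h>

typedef int16_t coeff_t;   /* kvazaar's coefficient type */

/* Returns true if the 4x4 coefficient group whose top-left corner is at
 * (cg_x*4, cg_y*4) contains any non-zero coefficient. */
static bool cg_has_coeffs(const coeff_t *coeff, int width, int cg_x, int cg_y)
{
  for (int row = 0; row < 4; ++row) {
    uint64_t four_coeffs;
    memcpy(&four_coeffs, &coeff[(cg_y * 4 + row) * width + cg_x * 4],
           sizeof(four_coeffs));
    if (four_coeffs) {
      return true;   /* at least one of the four 16-bit coefficients is non-zero */
    }
  }
  return false;
}

In the diff this test populates sig_coeffgroup_flag, which later drives both the coded_sub_block_flag signalling and the per-group significance scan.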
View file
kvazaar-1.3.0.tar.gz/src/strategies/generic/encode_coding_tree-generic.h
Added
@@ -0,0 +1,42 @@ +#ifndef ENCODE_CODING_TREE_GENERIC_H_ +#define ENCODE_CODING_TREE_GENERIC_H_ + +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +/** + * \file + * Functions for writing the coding quadtree and related syntax. + */ + +#include "encoderstate.h" +#include "global.h" + +void kvz_encode_coeff_nxn_generic(encoder_state_t * const state, + cabac_data_t * const cabac, + const coeff_t *coeff, + uint8_t width, + uint8_t type, + int8_t scan_mode, + int8_t tr_skip); + +int kvz_strategy_register_encode_generic(void* opaque, uint8_t bitdepth); + +#endif // ENCODE_CODING_TREE_GENERIC_H_
View file
kvazaar-1.2.0.tar.gz/src/strategies/generic/ipol-generic.c -> kvazaar-1.3.0.tar.gz/src/strategies/generic/ipol-generic.c
Changed
@@ -119,510 +119,541 @@ return temp; } -void kvz_filter_inter_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag) +void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) { //TODO: horizontal and vertical only filtering int32_t x, y; + + // Interpolation filter shifts int16_t shift1 = KVZ_BIT_DEPTH - 8; int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - - //coefficients for 1/4, 2/4 and 3/4 positions - int8_t *c0, *c1, *c2, *c3; - c0 = kvz_g_luma_filter[0]; - c1 = kvz_g_luma_filter[1]; - c2 = kvz_g_luma_filter[2]; - c3 = kvz_g_luma_filter[3]; + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); - #define FILTER_OFFSET 3 - #define FILTER_SIZE 8 + // Select filters according to the fractional part of the x and y mv components + int8_t *hor_filter = kvz_g_luma_filter[mv[0] & 3]; + int8_t *ver_filter = kvz_g_luma_filter[mv[1] & 3]; - int16_t flipped_hor_filtered[4 * (LCU_WIDTH + 1) + FILTER_SIZE][(LCU_WIDTH + 1) + FILTER_SIZE]; + int16_t hor_filtered[KVZ_EXT_BLOCK_W_LUMA][LCU_WIDTH]; + int16_t hor_stride = LCU_WIDTH; - // Filter horizontally and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height + FILTER_SIZE - 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - // Original pixel - flipped_hor_filtered[4 * x + 0][y] = (c0[FILTER_OFFSET] * src[src_stride*ypos + xpos + FILTER_OFFSET]) >> shift1; - flipped_hor_filtered[4 * x + 1][y] = kvz_eight_tap_filter_hor_generic(c1, &src[src_stride*ypos + xpos]) >> shift1; - flipped_hor_filtered[4 * x + 2][y] = kvz_eight_tap_filter_hor_generic(c2, &src[src_stride*ypos + xpos]) >> shift1; - flipped_hor_filtered[4 * x + 3][y] = kvz_eight_tap_filter_hor_generic(c3, &src[src_stride*ypos + xpos]) >> shift1; + // Filter horizontally + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) { + for (x = 0; x < width; ++x) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + hor_filtered[y][x] = kvz_eight_tap_filter_hor_generic(hor_filter, &src[src_stride * ypos + xpos]) >> shift1; } } - // Filter vertically and flip x and y - for (x = 0; x < 4 * width; ++x) { - for (y = 0; y < height; ++y) { - int ypos = y; - int xpos = x; - dst[(4 * y + 0)*dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((c0[FILTER_OFFSET] * flipped_hor_filtered[xpos][ypos + FILTER_OFFSET] + offset23) >> shift2) >> shift3); - dst[(4 * y + 1)*dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(c1, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3); - dst[(4 * y + 2)*dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(c2, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3); - dst[(4 * y + 3)*dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(c3, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3); - + // Filter vertically + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + dst[y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_ver_16bit_generic(ver_filter, &hor_filtered[y][x], hor_stride) >> 
shift2) + wp_offset1) >> wp_shift1); } } } -void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) +void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) { //TODO: horizontal and vertical only filtering int32_t x, y; + + // Interpolation filter shifts int16_t shift1 = KVZ_BIT_DEPTH - 8; int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - //coefficients for 1/4, 2/4 and 3/4 positions - int8_t *hor_filter = kvz_g_luma_filter[mv[0]&3]; - int8_t *ver_filter = kvz_g_luma_filter[mv[1]&3]; + // Select filters according to the fractional part of the x and y mv components + int8_t *hor_filter = kvz_g_luma_filter[mv[0] & 3]; + int8_t *ver_filter = kvz_g_luma_filter[mv[1] & 3]; - int16_t flipped_hor_filtered[(LCU_WIDTH + 1) + FILTER_SIZE][(LCU_WIDTH + 1) + FILTER_SIZE]; + int16_t hor_filtered[KVZ_EXT_BLOCK_W_LUMA][LCU_WIDTH]; + int16_t hor_stride = LCU_WIDTH; - // Filter horizontally and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height + FILTER_SIZE - 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped_hor_filtered[x][y] = kvz_eight_tap_filter_hor_generic(hor_filter, &src[src_stride*ypos + xpos]) >> shift1; + // Filter horizontally + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA; ++y) { + for (x = 0; x < width; ++x) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + hor_filtered[y][x] = kvz_eight_tap_filter_hor_generic(hor_filter, &src[src_stride * ypos + xpos]) >> shift1; } } - // Filter vertically and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height; ++y) { - int ypos = y; - int xpos = x; - dst[y*dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(ver_filter, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3); + // Filter vertically + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + dst[y * dst_stride + x] = kvz_eight_tap_filter_ver_16bit_generic(ver_filter, &hor_filtered[y][x], hor_stride) >> shift2; } } } -void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]) +void kvz_filter_hpel_blocks_hor_ver_luma_generic(const encoder_control_t * encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + int8_t fme_level, + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t hpel_off_x, int8_t hpel_off_y) { - //TODO: horizontal and vertical only filtering - int32_t x, y; + int x, y, first_y; + + // Interpolation filter shifts int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - //coefficients for 1/4, 2/4 and 3/4 positions - int8_t *hor_filter = kvz_g_luma_filter[mv[0] & 3]; - int8_t *ver_filter = kvz_g_luma_filter[mv[1] & 3]; + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); - int16_t flipped_hor_filtered[(LCU_WIDTH + 
1) + FILTER_SIZE][(LCU_WIDTH + 1) + FILTER_SIZE]; + int8_t *fir0 = kvz_g_luma_filter[0]; + int8_t *fir2 = kvz_g_luma_filter[2]; - // Filter horizontally and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height + FILTER_SIZE - 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped_hor_filtered[x][y] = kvz_eight_tap_filter_hor_generic(hor_filter, &src[src_stride*ypos + xpos]) >> shift1; + int16_t dst_stride = LCU_WIDTH; + int16_t hor_stride = LCU_WIDTH; + int32_t first_row_offset = (KVZ_LUMA_FILTER_OFFSET + 1) * hor_stride; + + int16_t *col_pos0 = hor_first_cols[0]; + int16_t *col_pos2 = hor_first_cols[2]; + + // Horizontally filtered samples from the top row are + // not needed unless samples for diagonal positions are filtered later. + first_y = fme_level > 1 ? 0 : 1; + + // HORIZONTAL STEP + // Integer pixels + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + for (x = 0; x < width; ++x) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; + hor_intermediate[0][y * hor_stride + x] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1; } } - // Filter vertically and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height; ++y) { - int ypos = y; - int xpos = x; - dst[y*dst_stride + x] = (kvz_eight_tap_filter_hor_16bit_generic(ver_filter, &flipped_hor_filtered[xpos][ypos])) >> shift2; + // Write the first column in contiguous memory + x = 0; + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + col_pos0[y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1; + } + + // Half pixels + for (y = first_y; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + for (x = 0; x < width; ++x) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; + hor_intermediate[1][y * hor_stride + x] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1; } } -} -/** - * \brief Interpolation for chroma half-pixel - * \param src source image in integer pels (-2..width+3, -2..height+3) - * \param src_stride stride of source image - * \param width width of source image block - * \param height height of source image block - * \param dst destination image in half-pixel resolution - * \param dst_stride stride of destination image - * - */ -void kvz_filter_inter_halfpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag) -{ - /* ____________ - * | B0,0|ae0,0| - * |ea0,0|ee0,0| - * - * ae0,0 = (-4*B-1,0 + 36*B0,0 + 36*B1,0 - 4*B2,0) >> shift1 - * ea0,0 = (-4*B0,-1 + 36*B0,0 + 36*B0,1 - 4*B0,2) >> shift1 - * ee0,0 = (-4*ae0,-1 + 36*ae0,0 + 36*ae0,1 - 4*ae0,2) >> shift2 - */ - int32_t x, y; - int32_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset3 = 1 << (shift3 - 1); - int32_t offset23 = 1 << (shift2 + shift3 - 1); - - int8_t* c = kvz_g_chroma_filter[4]; - int16_t temp[4] = {0,0,0,0}; - - // Loop source pixels and generate four filtered half-pel pixels on each round - for (y = 0; y < height; y++) { - int dst_pos_y = (y << 1)*dst_stride; - int src_pos_y = y*src_stride; - for (x = 0; x < width; x++) { - // Calculate current dst and src pixel positions - int dst_pos = dst_pos_y + (x << 1); - int src_pos = src_pos_y + x; - - // Original pixel 
(not really needed) - dst[dst_pos] = src[src_pos]; //B0,0 - - // ae0,0 - We need this only when hor_flag and for ee0,0 - if (hor_flag) { - temp[1] = kvz_four_tap_filter_hor_generic(c, &src[src_pos - 1]) >> shift1; // ae0,0 - } - // ea0,0 - needed only when ver_flag - if (ver_flag) { - dst[dst_pos + 1 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_generic(c, &src[src_pos - src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); // ea0,0 - } + // Write the first column in contiguous memory + x = 0; + for (y = first_y; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + col_pos2[y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1; + } - // When both flags, we use _only_ this pixel (but still need ae0,0 for it) - if (hor_flag && ver_flag) { - // Calculate temporary values.. - src_pos -= src_stride; //0,-1 - temp[0] = (kvz_four_tap_filter_hor_generic(c, &src[src_pos - 1]) >> shift1); // ae0,-1 - src_pos += 2 * src_stride; //0,1 - temp[2] = (kvz_four_tap_filter_hor_generic(c, &src[src_pos - 1]) >> shift1); // ae0,1 - src_pos += src_stride; //0,2 - temp[3] = (kvz_four_tap_filter_hor_generic(c, &src[src_pos - 1]) >> shift1); // ae0,2 - - dst[dst_pos + 1 * dst_stride + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(c, temp) + offset23) >> shift2) >> shift3); // ee0,0 - } + // VERTICAL STEP - if (hor_flag) { - dst[dst_pos + 1] = kvz_fast_clip_32bit_to_pixel((temp[1] + offset3) >> shift3); - } + // Right + // Only horizontal filter + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + filtered[1][y * dst_stride + x] = kvz_fast_clip_16bit_to_pixel((hor_intermediate[1][first_row_offset + y * hor_stride + x] + wp_offset1) >> wp_shift1); + } + } + + // Left + // Copy from the right filtered block and the extra column + for (y = 0; y < height; ++y) { + x = 0; + filtered[0][y * dst_stride + x] = kvz_fast_clip_16bit_to_pixel((col_pos2[y + KVZ_LUMA_FILTER_OFFSET + 1] + wp_offset1) >> wp_shift1); + for (x = 1; x < width; ++x) filtered[0][y * dst_stride + x] = filtered[1][y * dst_stride + x - 1]; + } + + // Top + // Only vertical filter + for (y = 0; y < height; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + for (x = 0; x < width; ++x) { + int xpos = x; + int16_t sample = kvz_eight_tap_filter_ver_generic(fir2, &src[src_stride*ypos + xpos + 1], src_stride) >> shift1; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[2][y * dst_stride + x] = sample; } } + + // Bottom + // Copy what can be copied from the top filtered values. + // Then filter the last row from horizontal intermediate buffer. 
+ for (y = 0; y < height - 1; ++y) { + for (x = 0; x < width; ++x) filtered[3][y * dst_stride + x] = filtered[2][(y + 1) * dst_stride + x]; + } + + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + for (x = 0; x < width; ++x) { + int xpos = x; + int16_t sample = kvz_eight_tap_filter_ver_generic(fir2, &src[src_stride*(ypos + 1) + xpos + 1], src_stride) >> shift1; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[3][y * dst_stride + x] = sample; + } } -void kvz_filter_inter_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag) +void kvz_filter_hpel_blocks_diag_luma_generic(const encoder_control_t * encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + int8_t fme_level, + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t hpel_off_x, int8_t hpel_off_y) { + int x, y; - int32_t x, y; - int32_t shift1 = KVZ_BIT_DEPTH - 8; + // Interpolation filter shifts int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset3 = 1 << (shift3 - 1); - int32_t offset23 = 1 << (shift2 + shift3 - 1); - - //coefficients for 1/8, 2/8, 3/8, 4/8, 5/8, 6/8 and 7/8 positions - int8_t *c1, *c2, *c3, *c4, *c5, *c6, *c7; - - int i; - c1 = kvz_g_chroma_filter[1]; - c2 = kvz_g_chroma_filter[2]; - c3 = kvz_g_chroma_filter[3]; - c4 = kvz_g_chroma_filter[4]; - c5 = kvz_g_chroma_filter[5]; - c6 = kvz_g_chroma_filter[6]; - c7 = kvz_g_chroma_filter[7]; - - int16_t temp[7][4]; // Temporary horizontal values calculated from integer pixels - - - // Loop source pixels and generate 64 filtered 1/8-pel pixels on each round - for (y = 0; y < height; y++) { - int dst_pos_y = (y << 3)*dst_stride; - int src_pos_y = y*src_stride; - for (x = 0; x < width; x++) { - // Calculate current dst and src pixel positions - int dst_pos = dst_pos_y + (x << 3); - int src_pos = src_pos_y + x; - - // Original pixel - dst[dst_pos] = src[src_pos]; - - // Horizontal 1/8-values - if (hor_flag && !ver_flag) { - - temp[0][1] = (kvz_four_tap_filter_hor_generic(c1, &src[src_pos - 1]) >> shift1); // ae0,0 h0 - temp[1][1] = (kvz_four_tap_filter_hor_generic(c2, &src[src_pos - 1]) >> shift1); - temp[2][1] = (kvz_four_tap_filter_hor_generic(c3, &src[src_pos - 1]) >> shift1); - temp[3][1] = (kvz_four_tap_filter_hor_generic(c4, &src[src_pos - 1]) >> shift1); - temp[4][1] = (kvz_four_tap_filter_hor_generic(c5, &src[src_pos - 1]) >> shift1); - temp[5][1] = (kvz_four_tap_filter_hor_generic(c6, &src[src_pos - 1]) >> shift1); - temp[6][1] = (kvz_four_tap_filter_hor_generic(c7, &src[src_pos - 1]) >> shift1); - } + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); - // Vertical 1/8-values - if (ver_flag) { - dst[dst_pos + 1 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_generic(c1, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); // - dst[dst_pos + 2 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_generic(c2, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 3 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_generic(c3, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); 
- dst[dst_pos + 4 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_generic(c4, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 5 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_generic(c5, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 6 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_generic(c6, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - dst[dst_pos + 7 * dst_stride] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_generic(c7, &src[src_pos - 1 * src_stride], src_stride) >> shift1) + (1 << (shift3 - 1))) >> shift3); - } + int8_t *fir2 = kvz_g_luma_filter[2]; - // When both flags, interpolate values from temporary horizontal values - if (hor_flag && ver_flag) { + int16_t dst_stride = LCU_WIDTH; + int16_t hor_stride = LCU_WIDTH; - // Calculate temporary values - src_pos -= 1 * src_stride; //0,-3 - for (i = 0; i < 4; ++i) { + // Horizontal positions + int16_t *col_pos2 = hor_first_cols[2]; - temp[0][i] = (kvz_four_tap_filter_hor_generic(c1, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[1][i] = (kvz_four_tap_filter_hor_generic(c2, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[2][i] = (kvz_four_tap_filter_hor_generic(c3, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[3][i] = (kvz_four_tap_filter_hor_generic(c4, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[4][i] = (kvz_four_tap_filter_hor_generic(c5, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[5][i] = (kvz_four_tap_filter_hor_generic(c6, &src[src_pos + i * src_stride - 1]) >> shift1); - temp[6][i] = (kvz_four_tap_filter_hor_generic(c7, &src[src_pos + i * src_stride - 1]) >> shift1); - - } + // VERTICAL STEP + // Top-right + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(fir2, &hor_intermediate[1][y * hor_stride + x], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[1][y * dst_stride + x] = sample; + } + } - //Calculate values from temporary horizontal 1/8-values - for (i = 0; i<7; ++i){ - dst[dst_pos + 1 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(c1, &temp[i][0]) + offset23) >> shift2) >> shift3); // ee0,0 - dst[dst_pos + 2 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(c2, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 3 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(c3, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 4 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(c4, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 5 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(c5, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 6 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(c6, &temp[i][0]) + offset23) >> shift2) >> shift3); - dst[dst_pos + 7 * dst_stride + i + 1] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(c7, &temp[i][0]) + offset23) >> shift2) >> shift3); - - } + for (y = 0; y < height; ++y) { + x = 0; + filtered[0][y * dst_stride + x] = 
kvz_fast_clip_16bit_to_pixel((col_pos2[y + KVZ_LUMA_FILTER_OFFSET + 1] + wp_offset1) >> wp_shift1); + for (x = 1; x < width; ++x) filtered[0][y * dst_stride + x] = filtered[1][y * dst_stride + x - 1]; + } - } + // Top-left + // Copy what can be copied from top-right filtered values. Filter the first column from the column array. + for (y = 0; y < height; ++y) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(fir2, &col_pos2[y]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[0][y * dst_stride + x] = sample; + for (x = 1; x < width; ++x) filtered[0][y * dst_stride + x] = filtered[1][y * dst_stride + x - 1]; + } - if (hor_flag) { - dst[dst_pos + 1] = kvz_fast_clip_32bit_to_pixel((temp[0][1] + offset3) >> shift3); - dst[dst_pos + 2] = kvz_fast_clip_32bit_to_pixel((temp[1][1] + offset3) >> shift3); - dst[dst_pos + 3] = kvz_fast_clip_32bit_to_pixel((temp[2][1] + offset3) >> shift3); - dst[dst_pos + 4] = kvz_fast_clip_32bit_to_pixel((temp[3][1] + offset3) >> shift3); - dst[dst_pos + 5] = kvz_fast_clip_32bit_to_pixel((temp[4][1] + offset3) >> shift3); - dst[dst_pos + 6] = kvz_fast_clip_32bit_to_pixel((temp[5][1] + offset3) >> shift3); - dst[dst_pos + 7] = kvz_fast_clip_32bit_to_pixel((temp[6][1] + offset3) >> shift3); - } + // Bottom-right + // Copy what can be copied from top-right filtered values. Filter the last row. + for (y = 0; y < height - 1; ++y) { + for (x = 0; x < width; ++x) filtered[3][y* dst_stride + x] = filtered[1][(y + 1) * dst_stride + x]; + } + for (x = 0; x < width; ++x) { + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(fir2, &hor_intermediate[1][(y + 1) * hor_stride + x], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[3][y * dst_stride + x] = sample; + } - } + // Bottom-left + // Copy what can be copied from the top-left filtered values. + // Copy what can be copied from the bottom-right filtered values. + // Finally filter the last pixel from the column array. 
+ for (y = 0; y < height - 1; ++y) { + for (x = 0; x < width; ++x) filtered[2][y * dst_stride + x] = filtered[0][(y + 1) * dst_stride + x]; } + for (x = 1; x < width; ++x) filtered[2][y * dst_stride + x] = filtered[3][y * dst_stride + x - 1]; + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(fir2, &col_pos2[(y + 1)]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[2][y * dst_stride + x] = sample; } -void kvz_filter_hpel_blocks_hor_ver_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) +void kvz_filter_qpel_blocks_hor_ver_luma_generic(const encoder_control_t * encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + int8_t fme_level, + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t hpel_off_x, int8_t hpel_off_y) { int x, y; + + // Interpolation filter shifts int16_t shift1 = KVZ_BIT_DEPTH - 8; int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); + + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); int8_t *fir0 = kvz_g_luma_filter[0]; int8_t *fir2 = kvz_g_luma_filter[2]; + int8_t *fir1 = kvz_g_luma_filter[1]; + int8_t *fir3 = kvz_g_luma_filter[3]; - int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - - int16_t temp_stride = height + KVZ_EXT_PADDING + 1; - int16_t dst_stride = (LCU_WIDTH + 1); - - // Horizontal positions - for (x = 0; x < width + 1; ++x) { - for (y = 0; y < height + KVZ_EXT_PADDING + 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1; - flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1; + // Horiziontal positions. Positions 0 and 2 have already been calculated in filtered. + int16_t *hor_pos0 = hor_intermediate[0]; + int16_t *hor_pos2 = hor_intermediate[1]; + int16_t *hor_pos_l = hor_intermediate[3]; + int16_t *hor_pos_r = hor_intermediate[4]; + int8_t *hor_fir_l = hpel_off_x != 0 ? fir1 : fir3; + int8_t *hor_fir_r = hpel_off_x != 0 ? fir3 : fir1; + int16_t *col_pos_l = hor_first_cols[1]; + int16_t *col_pos_r = hor_first_cols[3]; + + int16_t dst_stride = LCU_WIDTH; + int16_t hor_stride = LCU_WIDTH; + + int16_t *hor_hpel_pos = hpel_off_x != 0 ? hor_pos2 : hor_pos0; + int16_t *col_pos_hor = hpel_off_x != 0 ? hor_first_cols[2] : hor_first_cols[0]; + + // Specify if integer pixels are filtered from left or/and top integer samples + int off_x_fir_l = hpel_off_x < 1 ? 0 : 1; + int off_x_fir_r = hpel_off_x < 0 ? 0 : 1; + int off_y_fir_t = hpel_off_y < 1 ? 0 : 1; + int off_y_fir_b = hpel_off_y < 0 ? 0 : 1; + + // HORIZONTAL STEP + // Left QPEL + int sample_off_y = hpel_off_y < 0 ? 
0 : 1; + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + for (x = 0; x < width; ++x) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; + hor_pos_l[y * hor_stride + x] = kvz_eight_tap_filter_hor_generic(hor_fir_l, &src[src_stride*ypos + xpos]) >> shift1; } } - // Filter vertically and flip x and y - for (x = 0; x < width + 1; ++x) { - for (y = 0; y < height + 1; ++y) { - filtered[HPEL_POS_HOR][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[HPEL_POS_VER][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - } + // Write the first column in contiguous memory + x = 0; + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + col_pos_l[y] = kvz_eight_tap_filter_hor_generic(hor_fir_l, &src[src_stride*ypos + xpos]) >> shift1; } -} -void kvz_filter_hpel_blocks_full_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) -{ - int x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); + // Right QPEL + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + for (x = 0; x < width; ++x) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET + 1; + hor_pos_r[y * hor_stride + x] = kvz_eight_tap_filter_hor_generic(hor_fir_r, &src[src_stride*ypos + xpos]) >> shift1; + } + } - int8_t *fir0 = kvz_g_luma_filter[0]; - int8_t *fir2 = kvz_g_luma_filter[2]; + // Write the first column in contiguous memory + x = 0; + for (y = 0; y < height + KVZ_EXT_PADDING_LUMA + 1; ++y) { + int ypos = y - KVZ_LUMA_FILTER_OFFSET; + int xpos = x - KVZ_LUMA_FILTER_OFFSET; + col_pos_r[y] = kvz_eight_tap_filter_hor_generic(hor_fir_r, &src[src_stride*ypos + xpos]) >> shift1; + } - int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + // VERTICAL STEP + int8_t *ver_fir_l = hpel_off_y != 0 ? fir2 : fir0; + int8_t *ver_fir_r = hpel_off_y != 0 ? fir2 : fir0; + int8_t *ver_fir_t = hpel_off_y != 0 ? fir1 : fir3; + int8_t *ver_fir_b = hpel_off_y != 0 ? 
fir3 : fir1; + + // Left QPEL (1/4 or 3/4 x positions) + for (y = 0; y < height; ++y) { + if (!off_x_fir_l) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(ver_fir_l, &col_pos_l[y + sample_off_y]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[0][y * dst_stride + x] = sample; + } + for (x = !off_x_fir_l; x < width; ++x) { + int ypos = y + sample_off_y; + int xpos = x - !off_x_fir_l; + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(ver_fir_l, &hor_pos_l[ypos * hor_stride + xpos], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[0][y * dst_stride + x] = sample; + } + } - int16_t temp_stride = height + KVZ_EXT_PADDING + 1; - int16_t dst_stride = (LCU_WIDTH + 1); + // Right QPEL (3/4 or 1/4 x positions) + for (y = 0; y < height; ++y) { + if (!off_x_fir_r) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(ver_fir_r, &col_pos_r[y + sample_off_y]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[1][y * dst_stride + x] = sample; + } + for (x = !off_x_fir_r; x < width; ++x) { + int ypos = y + sample_off_y; + int xpos = x - !off_x_fir_r; + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(ver_fir_r, &hor_pos_r[ypos * hor_stride + xpos], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[1][y * dst_stride + x] = sample; + } + } - // Horizontal positions - for (x = 0; x < width + 1; ++x) { - for (y = 0; y < height + KVZ_EXT_PADDING + 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1; - flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1; + // Top QPEL (1/4 or 3/4 y positions) + int sample_off_x = (hpel_off_x > -1 ? 
1 : 0); + for (y = 0; y < height; ++y) { + if (!sample_off_x) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(ver_fir_t, &col_pos_hor[y + off_y_fir_t]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[2][y * dst_stride + x] = sample; + } + for (x = !sample_off_x; x < width; ++x) { + int ypos = y + off_y_fir_t; + int xpos = x - !sample_off_x; + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(ver_fir_t, &hor_hpel_pos[ypos * hor_stride + xpos], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[2][y * dst_stride + x] = sample; } } - // Filter vertically and flip x and y - for (x = 0; x < width + 1; ++x) { - for (y = 0; y < height + 1; ++y) { - filtered[HPEL_POS_HOR][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[HPEL_POS_VER][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[HPEL_POS_DIA][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + // Bottom QPEL (3/4 or 1/4 y positions) + for (y = 0; y < height; ++y) { + if (!sample_off_x) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(ver_fir_b, &col_pos_hor[y + off_y_fir_b]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[3][y * dst_stride + x] = sample; + } + for (x = !sample_off_x; x < width; ++x) { + int ypos = y + off_y_fir_b; + int xpos = x - !sample_off_x; + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(ver_fir_b, &hor_hpel_pos[ypos * hor_stride + xpos], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[3][y * dst_stride + x] = sample; } } } -void kvz_filter_qpel_blocks_hor_ver_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) +void kvz_filter_qpel_blocks_diag_luma_generic(const encoder_control_t * encoder, + kvz_pixel *src, + int16_t src_stride, + int width, + int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], + int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], + int8_t fme_level, + int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t hpel_off_x, int8_t hpel_off_y) { int x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; + + // Interpolation filter shifts int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - int8_t *fir0 = kvz_g_luma_filter[0]; - int8_t *fir2 = kvz_g_luma_filter[2]; + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); + int8_t *fir1 = kvz_g_luma_filter[1]; int8_t *fir3 = kvz_g_luma_filter[3]; - int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped1[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped3[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - - int16_t temp_stride = height + KVZ_EXT_PADDING + 1; - int16_t dst_stride = (LCU_WIDTH + 1); - - // Horizontal positions - for (x = 0; x < width + 1; ++x) { - for (y = 0; y < height + KVZ_EXT_PADDING + 1; 
++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1; - flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1; - flipped1[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir1, &src[src_stride*ypos + xpos]) >> shift1; - flipped3[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir3, &src[src_stride*ypos + xpos]) >> shift1; - } - } - - // Filter vertically and flip x and y - for (x = 0; x < width + 1; ++x) { - for (y = 0; y < height + 1; ++y) { - - // HPEL - filtered[ 0][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 1][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 2][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // QPEL - // Horizontal - filtered[ 3][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 4][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 5][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 6][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // Vertical - filtered[ 7][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 8][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 9][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[10][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + // Horiziontal positions. + int16_t *hor_pos_l = hor_intermediate[3]; + int16_t *hor_pos_r = hor_intermediate[4]; + + int16_t *col_pos_l = hor_first_cols[1]; + int16_t *col_pos_r = hor_first_cols[3]; + + int16_t dst_stride = LCU_WIDTH; + int16_t hor_stride = LCU_WIDTH; + + // VERTICAL STEP + int8_t *ver_fir_t = hpel_off_y != 0 ? fir1 : fir3; + int8_t *ver_fir_b = hpel_off_y != 0 ? fir3 : fir1; + + // Specify if integer pixels are filtered from left or/and top integer samples + int off_x_fir_l = hpel_off_x < 1 ? 0 : 1; + int off_x_fir_r = hpel_off_x < 0 ? 0 : 1; + int off_y_fir_t = hpel_off_y < 1 ? 0 : 1; + int off_y_fir_b = hpel_off_y < 0 ? 
0 : 1; + + // Top-left QPEL + for (y = 0; y < height; ++y) { + if (!off_x_fir_l) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(ver_fir_t, &col_pos_l[y + off_y_fir_t]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[0][y * dst_stride + x] = sample; + } + for (x = !off_x_fir_l; x < width; ++x) { + int ypos = y + off_y_fir_t; + int xpos = x - !off_x_fir_l; + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(ver_fir_t, &hor_pos_l[ypos * hor_stride + xpos], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[0][y * dst_stride + x] = sample; } } -} -void kvz_filter_qpel_blocks_full_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block *filtered) -{ - int x, y; - int16_t shift1 = KVZ_BIT_DEPTH - 8; - int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); - - int8_t *fir0 = kvz_g_luma_filter[0]; - int8_t *fir2 = kvz_g_luma_filter[2]; - int8_t *fir1 = kvz_g_luma_filter[1]; - int8_t *fir3 = kvz_g_luma_filter[3]; - - int16_t flipped0[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped2[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped1[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; - int16_t flipped3[(LCU_WIDTH + 1) * (KVZ_EXT_BLOCK_W + 1)]; + // Top-right QPEL + for (y = 0; y < height; ++y) { + if (!off_x_fir_r) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(ver_fir_t, &col_pos_r[y + off_y_fir_t]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[1][y * dst_stride + x] = sample; + } + for (x = !off_x_fir_r; x < width; ++x) { + int ypos = y + off_y_fir_t; + int xpos = x - !off_x_fir_r; + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(ver_fir_t, &hor_pos_r[ypos * hor_stride + xpos], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[1][y * dst_stride + x] = sample; + } + } - int16_t temp_stride = height + KVZ_EXT_PADDING + 1; - int16_t dst_stride = (LCU_WIDTH + 1); - - // Horizontal positions - for (x = 0; x < width + 1; ++x) { - for (y = 0; y < height + KVZ_EXT_PADDING + 1; ++y) { - int ypos = y - FILTER_OFFSET; - int xpos = x - FILTER_OFFSET; - flipped0[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir0, &src[src_stride*ypos + xpos]) >> shift1; - flipped2[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir2, &src[src_stride*ypos + xpos]) >> shift1; - flipped1[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir1, &src[src_stride*ypos + xpos]) >> shift1; - flipped3[x * temp_stride + y] = kvz_eight_tap_filter_hor_generic(fir3, &src[src_stride*ypos + xpos]) >> shift1; - } - } - - // Filter vertically and flip x and y - for (x = 0; x < width + 1; ++x) { - for (y = 0; y < height + 1; ++y) { - - // HPEL - filtered[ 0][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 1][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 2][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // QPEL - // Horizontal - 
filtered[ 3][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 4][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir0, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 5][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 6][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir2, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // Vertical - filtered[ 7][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 8][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[ 9][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped0[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[10][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped2[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - - // Diagonal - filtered[11][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[12][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir1, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[13][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped1[x * temp_stride + y]) + offset23) >> shift2) >> shift3); - filtered[14][y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_eight_tap_filter_hor_16bit_generic(fir3, &flipped3[x * temp_stride + y]) + offset23) >> shift2) >> shift3); + // Bottom-left QPEL + for (y = 0; y < height; ++y) { + if (!off_x_fir_l) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(ver_fir_b, &col_pos_l[y + off_y_fir_b]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[2][y * dst_stride + x] = sample; + } + for (x = !off_x_fir_l; x < width; ++x) { + int ypos = y + off_y_fir_b; + int xpos = x - !off_x_fir_l; + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(ver_fir_b, &hor_pos_l[ypos * hor_stride + xpos], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[2][y * dst_stride + x] = sample; } } -} -void kvz_filter_frac_blocks_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, frac_search_block filtered[15], int8_t fme_level) -{ - switch (fme_level) { - case 1: - kvz_filter_hpel_blocks_hor_ver_luma_generic(encoder, src, src_stride, width, height, filtered); - break; - case 2: - kvz_filter_hpel_blocks_full_luma_generic(encoder, src, src_stride, width, height, filtered); - break; - case 3: - kvz_filter_qpel_blocks_hor_ver_luma_generic(encoder, src, src_stride, width, height, filtered); - break; - default: - kvz_filter_qpel_blocks_full_luma_generic(encoder, src, src_stride, width, height, filtered); - break; + 
// Bottom-right QPEL + for (y = 0; y < height; ++y) { + if (!off_x_fir_r) { + x = 0; + int16_t sample = kvz_eight_tap_filter_hor_16bit_generic(ver_fir_b, &col_pos_r[y + off_y_fir_b]) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[3][y * dst_stride + x] = sample; + } + for (x = !off_x_fir_r; x < width; ++x) { + int ypos = y + off_y_fir_b; + int xpos = x - !off_x_fir_r; + int16_t sample = kvz_eight_tap_filter_ver_16bit_generic(ver_fir_b, &hor_pos_r[ypos * hor_stride + xpos], hor_stride) >> shift2; + sample = kvz_fast_clip_16bit_to_pixel((sample + wp_offset1) >> wp_shift1); + filtered[3][y * dst_stride + x] = sample; + } } } @@ -630,33 +661,35 @@ { //TODO: horizontal and vertical only filtering int32_t x, y; + + // Interpolation filter shifts int16_t shift1 = KVZ_BIT_DEPTH - 8; int32_t shift2 = 6; - int32_t shift3 = 14 - KVZ_BIT_DEPTH; - int32_t offset23 = 1 << (shift2 + shift3 - 1); + // Weighted prediction offset and shift + int32_t wp_shift1 = 14 - KVZ_BIT_DEPTH; + int32_t wp_offset1 = 1 << (wp_shift1 - 1); + + // Select filters according to the fractional part of the x and y mv components int8_t *hor_filter = kvz_g_chroma_filter[mv[0] & 7]; int8_t *ver_filter = kvz_g_chroma_filter[mv[1] & 7]; -#define FILTER_SIZE_C (FILTER_SIZE / 2) -#define FILTER_OFFSET_C (FILTER_OFFSET / 2) - int16_t flipped_hor_filtered[(LCU_WIDTH_C + 1) + FILTER_SIZE_C][(LCU_WIDTH_C + 1) + FILTER_SIZE_C]; + int16_t hor_filtered[KVZ_EXT_BLOCK_W_CHROMA][LCU_WIDTH_C]; + int16_t hor_stride = LCU_WIDTH_C; - // Filter horizontally and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height + FILTER_SIZE_C - 1; ++y) { - int ypos = y - FILTER_OFFSET_C; - int xpos = x - FILTER_OFFSET_C; - flipped_hor_filtered[x][y] = kvz_four_tap_filter_hor_generic(hor_filter, &src[src_stride*ypos + xpos]) >> shift1; + // Filter horizontally + for (y = 0; y < height + KVZ_EXT_PADDING_CHROMA; ++y) { + for (x = 0; x < width; ++x) { + int ypos = y - KVZ_CHROMA_FILTER_OFFSET; + int xpos = x - KVZ_CHROMA_FILTER_OFFSET; + hor_filtered[y][x] = kvz_four_tap_filter_hor_generic(hor_filter, &src[src_stride * ypos + xpos]) >> shift1; } } - // Filter vertically and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height; ++y) { - int ypos = y; - int xpos = x; - dst[y*dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_hor_16bit_generic(ver_filter, &flipped_hor_filtered[xpos][ypos]) + offset23) >> shift2) >> shift3); + // Filter vertically + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + dst[y * dst_stride + x] = kvz_fast_clip_32bit_to_pixel(((kvz_four_tap_filter_ver_16bit_generic(ver_filter, &hor_filtered[y][x], hor_stride) >> shift2) + wp_offset1) >> wp_shift1); } } } @@ -665,30 +698,31 @@ { //TODO: horizontal and vertical only filtering int32_t x, y; + + // Interpolation filter shifts int16_t shift1 = KVZ_BIT_DEPTH - 8; int32_t shift2 = 6; + + // Select filters according to the fractional part of the x and y mv components int8_t *hor_filter = kvz_g_chroma_filter[mv[0] & 7]; int8_t *ver_filter = kvz_g_chroma_filter[mv[1] & 7]; -#define FILTER_SIZE_C (FILTER_SIZE / 2) -#define FILTER_OFFSET_C (FILTER_OFFSET / 2) - int16_t flipped_hor_filtered[(LCU_WIDTH_C + 1) + FILTER_SIZE_C][(LCU_WIDTH_C + 1) + FILTER_SIZE_C]; + int16_t hor_filtered[KVZ_EXT_BLOCK_W_CHROMA][LCU_WIDTH_C]; + int16_t hor_stride = LCU_WIDTH_C; - // Filter horizontally and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height + FILTER_SIZE_C - 1; ++y) { - int ypos 
= y - FILTER_OFFSET_C; - int xpos = x - FILTER_OFFSET_C; - flipped_hor_filtered[x][y] = kvz_four_tap_filter_hor_generic(hor_filter, &src[src_stride*ypos + xpos]) >> shift1; + // Filter horizontally + for (y = 0; y < height + KVZ_EXT_PADDING_CHROMA; ++y) { + for (x = 0; x < width; ++x) { + int ypos = y - KVZ_CHROMA_FILTER_OFFSET; + int xpos = x - KVZ_CHROMA_FILTER_OFFSET; + hor_filtered[y][x] = kvz_four_tap_filter_hor_generic(hor_filter, &src[src_stride * ypos + xpos]) >> shift1; } } - // Filter vertically and flip x and y - for (x = 0; x < width; ++x) { - for (y = 0; y < height; ++y) { - int ypos = y; - int xpos = x; - dst[y*dst_stride + x] = (kvz_four_tap_filter_hor_16bit_generic(ver_filter, &flipped_hor_filtered[xpos][ypos])) >> shift2; + // Filter vertically + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + dst[y * dst_stride + x] = kvz_four_tap_filter_ver_16bit_generic(ver_filter, &hor_filtered[y][x], hor_stride) >> shift2; } } } @@ -749,15 +783,14 @@ } } - int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth) { bool success = true; - success &= kvz_strategyselector_register(opaque, "filter_inter_quarterpel_luma", "generic", 0, &kvz_filter_inter_quarterpel_luma_generic); - success &= kvz_strategyselector_register(opaque, "filter_inter_halfpel_chroma", "generic", 0, &kvz_filter_inter_halfpel_chroma_generic); - success &= kvz_strategyselector_register(opaque, "filter_inter_octpel_chroma", "generic", 0, &kvz_filter_inter_octpel_chroma_generic); - success &= kvz_strategyselector_register(opaque, "filter_frac_blocks_luma", "generic", 0, &kvz_filter_frac_blocks_luma_generic); + success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_hor_ver_luma", "generic", 0, &kvz_filter_hpel_blocks_hor_ver_luma_generic); + success &= kvz_strategyselector_register(opaque, "filter_hpel_blocks_diag_luma", "generic", 0, &kvz_filter_hpel_blocks_diag_luma_generic); + success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_hor_ver_luma", "generic", 0, &kvz_filter_qpel_blocks_hor_ver_luma_generic); + success &= kvz_strategyselector_register(opaque, "filter_qpel_blocks_diag_luma", "generic", 0, &kvz_filter_qpel_blocks_diag_luma_generic); success &= kvz_strategyselector_register(opaque, "sample_quarterpel_luma", "generic", 0, &kvz_sample_quarterpel_luma_generic); success &= kvz_strategyselector_register(opaque, "sample_octpel_chroma", "generic", 0, &kvz_sample_octpel_chroma_generic); success &= kvz_strategyselector_register(opaque, "sample_14bit_quarterpel_luma", "generic", 0, &kvz_sample_14bit_quarterpel_luma_generic);
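Note on the rewritten luma and chroma filters above: they follow the separable structure of HEVC fractional-pel interpolation, i.e. a horizontal FIR into a 16-bit intermediate (scaled by >> (bitdepth - 8)), then a vertical FIR, a 6-bit shift, and finally the weighted-prediction rounding and clip. A minimal scalar sketch of that pattern for one 8-tap luma sample, as hypothetical standalone code rather than the kvazaar API (it assumes src has enough border padding for the 8-tap window):

#include <stdint.h>

#define BITDEPTH 8

static int clip_pixel(int v) {
  const int max = (1 << BITDEPTH) - 1;
  return v < 0 ? 0 : (v > max ? max : v);
}

/* One output sample at integer position (x, y), with 8-tap filters fir_h and
 * fir_v selected by the fractional parts of the motion vector. */
uint8_t interp_sample(const uint8_t *src, int stride, int x, int y,
                      const int8_t fir_h[8], const int8_t fir_v[8])
{
  const int shift1    = BITDEPTH - 8;       /* horizontal scaling        */
  const int shift2    = 6;                  /* vertical filter shift     */
  const int wp_shift  = 14 - BITDEPTH;      /* weighted-prediction shift */
  const int wp_offset = 1 << (wp_shift - 1);

  int16_t col[8];                           /* horizontal intermediates  */
  for (int i = 0; i < 8; i++) {
    int acc = 0;
    for (int j = 0; j < 8; j++) {
      acc += fir_h[j] * src[(y + i - 3) * stride + (x + j - 3)];
    }
    col[i] = (int16_t)(acc >> shift1);
  }

  int acc = 0;                              /* vertical pass */
  for (int i = 0; i < 8; i++) {
    acc += fir_v[i] * col[i];
  }
  acc >>= shift2;
  return (uint8_t)clip_pixel((acc + wp_offset) >> wp_shift);
}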
View file
kvazaar-1.2.0.tar.gz/src/strategies/generic/ipol-generic.h -> kvazaar-1.3.0.tar.gz/src/strategies/generic/ipol-generic.h
Changed
@@ -32,7 +32,9 @@ int kvz_strategy_register_ipol_generic(void* opaque, uint8_t bitdepth);
 void kvz_sample_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+void kvz_sample_14bit_quarterpel_luma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
 void kvz_sample_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
+void kvz_sample_14bit_octpel_chroma_generic(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]);
 #endif //STRATEGIES_IPOL_GENERIC_H_
View file
kvazaar-1.2.0.tar.gz/src/strategies/generic/picture-generic.c -> kvazaar-1.3.0.tar.gz/src/strategies/generic/picture-generic.c
Changed
@@ -213,7 +213,7 @@ } void kvz_satd_4x4_subblock_quad_generic(const kvz_pixel *preds[4], - const int strides[4], + const int stride, const kvz_pixel *orig, const int orig_stride, unsigned costs[4]) @@ -221,10 +221,10 @@ int32_t diff[4][4 * 4]; for (int y = 0; y < 4; y++) { for (int x = 0; x < 4; x++) { - diff[0][x + y * 4] = orig[x + y * orig_stride] - preds[0][x + y * strides[0]]; - diff[1][x + y * 4] = orig[x + y * orig_stride] - preds[1][x + y * strides[1]]; - diff[2][x + y * 4] = orig[x + y * orig_stride] - preds[2][x + y * strides[2]]; - diff[3][x + y * 4] = orig[x + y * orig_stride] - preds[3][x + y * strides[3]]; + diff[0][x + y * 4] = orig[x + y * orig_stride] - preds[0][x + y * stride]; + diff[1][x + y * 4] = orig[x + y * orig_stride] - preds[1][x + y * stride]; + diff[2][x + y * 4] = orig[x + y * orig_stride] - preds[2][x + y * stride]; + diff[3][x + y * 4] = orig[x + y * orig_stride] - preds[3][x + y * stride]; } } @@ -328,15 +328,15 @@ } static void satd_8x8_subblock_quad_generic(const kvz_pixel **preds, - const int *strides, + const int stride, const kvz_pixel *orig, const int orig_stride, unsigned *costs) { - costs[0] = satd_8x8_subblock_generic(orig, orig_stride, preds[0], strides[0]); - costs[1] = satd_8x8_subblock_generic(orig, orig_stride, preds[1], strides[1]); - costs[2] = satd_8x8_subblock_generic(orig, orig_stride, preds[2], strides[2]); - costs[3] = satd_8x8_subblock_generic(orig, orig_stride, preds[3], strides[3]); + costs[0] = satd_8x8_subblock_generic(orig, orig_stride, preds[0], stride); + costs[1] = satd_8x8_subblock_generic(orig, orig_stride, preds[1], stride); + costs[2] = satd_8x8_subblock_generic(orig, orig_stride, preds[2], stride); + costs[3] = satd_8x8_subblock_generic(orig, orig_stride, preds[3], stride); } // These macros define sadt_16bit_NxN for N = 8, 16, 32, 64 @@ -394,7 +394,7 @@ static void satd_any_size_ ## suffix ( \ int width, int height, \ const kvz_pixel **preds, \ - const int *strides, \ + const int stride, \ const kvz_pixel *orig, \ const int orig_stride, \ unsigned num_modes, \ @@ -408,7 +408,7 @@ if (width % 8 != 0) { \ /* Process the first column using 4x4 blocks. */ \ for (int y = 0; y < height; y += 4) { \ - kvz_satd_4x4_subblock_ ## suffix(preds, strides, orig, orig_stride, sums); \ + kvz_satd_4x4_subblock_ ## suffix(preds, stride, orig, orig_stride, sums); \ } \ orig_ptr += 4; \ for(int blk = 0; blk < num_parallel_blocks; ++blk){\ @@ -419,23 +419,23 @@ if (height % 8 != 0) { \ /* Process the first row using 4x4 blocks. */ \ for (int x = 0; x < width; x += 4 ) { \ - kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \ + kvz_satd_4x4_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \ } \ orig_ptr += 4 * orig_stride; \ for(int blk = 0; blk < num_parallel_blocks; ++blk){\ - pred_ptrs[blk] += 4 * strides[blk]; \ + pred_ptrs[blk] += 4 * stride; \ }\ height -= 4; \ } \ /* The rest can now be processed with 8x8 blocks. 
*/ \ for (int y = 0; y < height; y += 8) { \ orig_ptr = &orig[y * orig_stride]; \ - pred_ptrs[0] = &preds[0][y * strides[0]]; \ - pred_ptrs[1] = &preds[1][y * strides[1]]; \ - pred_ptrs[2] = &preds[2][y * strides[2]]; \ - pred_ptrs[3] = &preds[3][y * strides[3]]; \ + pred_ptrs[0] = &preds[0][y * stride]; \ + pred_ptrs[1] = &preds[1][y * stride]; \ + pred_ptrs[2] = &preds[2][y * stride]; \ + pred_ptrs[3] = &preds[3][y * stride]; \ for (int x = 0; x < width; x += 8) { \ - satd_8x8_subblock_ ## suffix(pred_ptrs, strides, orig_ptr, orig_stride, sums); \ + satd_8x8_subblock_ ## suffix(pred_ptrs, stride, orig_ptr, orig_stride, sums); \ orig_ptr += 8; \ pred_ptrs[0] += 8; \ pred_ptrs[1] += 8; \ @@ -535,6 +535,141 @@ return ssd >> (2*(KVZ_BIT_DEPTH-8)); } +static void inter_recon_bipred_generic(const int hi_prec_luma_rec0, + const int hi_prec_luma_rec1, + const int hi_prec_chroma_rec0, + const int hi_prec_chroma_rec1, + int32_t height, + int32_t width, + int32_t ypos, + int32_t xpos, + const hi_prec_buf_t*high_precision_rec0, + const hi_prec_buf_t*high_precision_rec1, + lcu_t* lcu, + kvz_pixel* temp_lcu_y, + kvz_pixel* temp_lcu_u, + kvz_pixel* temp_lcu_v) { + + int shift = 15 - KVZ_BIT_DEPTH; + int offset = 1 << (shift - 1); + + int y_in_lcu; + int x_in_lcu; + + //After reconstruction, merge the predictors by taking an average of each pixel + for (int temp_y = 0; temp_y < height; ++temp_y) { + + + for (int temp_x = 0; temp_x < width; ++temp_x) { + y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); + x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); + + int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_y = (hi_prec_luma_rec1 ? high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + + lcu->rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); + + if (temp_x < width >> 1 && temp_y < height >> 1) { + + y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); + x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); + + int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); + + int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + lcu->rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); + } + } + } + +} + + +static optimized_sad_func_ptr_t get_optimized_sad_generic(int32_t width) +{ + return NULL; +} + +/** + * \brief Vertically interpolate SAD outside the frame. + * + * \param data1 Starting point of the first picture. + * \param data2 Starting point of the second picture. 
+ * \param width Width of the region for which SAD is calculated. + * \param height Height of the region for which SAD is calculated. + * \param width Width of the pixel array. + * + * \returns Sum of Absolute Differences + */ +static uint32_t ver_sad_generic(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int block_width, int block_height, unsigned pic_stride) +{ + int x, y; + unsigned sad = 0; + + for (y = 0; y < block_height; ++y) { + for (x = 0; x < block_width; ++x) { + sad += abs(pic_data[y * pic_stride + x] - ref_data[x]); + } + } + + return sad; +} + +/** + * \brief Horizontally interpolate SAD outside the frame. + * + * \param data1 Starting point of the first picture. + * \param data2 Starting point of the second picture. + * \param width Width of the region for which SAD is calculated. + * \param height Height of the region for which SAD is calculated. + * \param width Width of the pixel array. + * + * \returns Sum of Absolute Differences + */ +static unsigned hor_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int block_width, int block_height, unsigned pic_stride, unsigned ref_stride) +{ + int x, y; + unsigned sad = 0; + + for (y = 0; y < block_height; ++y) { + for (x = 0; x < block_width; ++x) { + sad += abs(pic_data[y * pic_stride + x] - ref_data[y * ref_stride]); + } + } + + return sad; +} + + +static uint32_t hor_sad_generic(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t pic_stride, + uint32_t ref_stride, uint32_t left, uint32_t right) +{ + uint32_t result = 0; + if (left) { + result += hor_sad (pic_data, ref_data + left, left, + height, pic_stride, ref_stride); + + result += kvz_reg_sad(pic_data + left, ref_data + left, width - left, + height, pic_stride, ref_stride); + } else if (right) { + result += kvz_reg_sad(pic_data, ref_data, width - right, + height, pic_stride, ref_stride); + + result += hor_sad (pic_data + width - right, + ref_data + width - right - 1, + right, height, pic_stride, ref_stride); + } else { + result += kvz_reg_sad(pic_data, ref_data, width, + height, pic_stride, ref_stride); + } + return result; +} int kvz_strategy_register_picture_generic(void* opaque, uint8_t bitdepth) { @@ -569,6 +704,11 @@ success &= kvz_strategyselector_register(opaque, "satd_any_size_quad", "generic", 0, &satd_any_size_quad_generic); success &= kvz_strategyselector_register(opaque, "pixels_calc_ssd", "generic", 0, &pixels_calc_ssd_generic); + success &= kvz_strategyselector_register(opaque, "inter_recon_bipred", "generic", 0, &inter_recon_bipred_generic); + + success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "generic", 0, &get_optimized_sad_generic); + success &= kvz_strategyselector_register(opaque, "ver_sad", "generic", 0, &ver_sad_generic); + success &= kvz_strategyselector_register(opaque, "hor_sad", "generic", 0, &hor_sad_generic); return success; }
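The inter_recon_bipred_generic function added above merges two motion-compensated predictions at 14-bit precision: samples that were reconstructed at pixel precision are shifted left by 14 - bitdepth, and the pair is then averaged with shift = 15 - bitdepth plus a rounding offset. A small self-contained check of that arithmetic (hypothetical test code, not part of the package):

#include <assert.h>
#include <stdint.h>

#define BITDEPTH 8

static uint8_t bipred_average(int16_t sample0_14bit, int16_t sample1_14bit)
{
  const int shift  = 15 - BITDEPTH;      /* 7 for 8-bit content */
  const int offset = 1 << (shift - 1);   /* rounding offset, 64 */
  int v = (sample0_14bit + sample1_14bit + offset) >> shift;
  if (v < 0) v = 0;
  if (v > (1 << BITDEPTH) - 1) v = (1 << BITDEPTH) - 1;
  return (uint8_t)v;
}

int main(void)
{
  /* 8-bit predictions 100 and 103 are first brought to 14-bit precision by
   * << (14 - BITDEPTH), exactly as the low-precision path above does; the
   * result is their rounded average. */
  assert(bipred_average(100 << 6, 103 << 6) == 102);
  return 0;
}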
View file
kvazaar-1.2.0.tar.gz/src/strategies/generic/picture-generic.h -> kvazaar-1.3.0.tar.gz/src/strategies/generic/picture-generic.h
Changed
@@ -45,9 +45,11 @@
 void kvz_satd_4x4_subblock_quad_generic(const kvz_pixel *preds[4],
-  const int strides[4],
+  const int stride,
   const kvz_pixel *orig,
   const int orig_stride,
   unsigned costs[4]);
+
+
 #endif //STRATEGIES_PICTURE_GENERIC_H_
View file
kvazaar-1.2.0.tar.gz/src/strategies/generic/quant-generic.c -> kvazaar-1.3.0.tar.gz/src/strategies/generic/quant-generic.c
Changed
@@ -53,17 +53,19 @@ uint32_t ac_sum = 0; for (int32_t n = 0; n < width * height; n++) { - int32_t level; + int32_t level = coef[n]; + int64_t abs_level = (int64_t)abs(level); int32_t sign; - level = coef[n]; sign = (level < 0 ? -1 : 1); - level = ((int64_t)abs(level) * quant_coeff[n] + add) >> q_bits; + int32_t curr_quant_coeff = quant_coeff[n]; + level = (abs_level * curr_quant_coeff + add) >> q_bits; ac_sum += level; level *= sign; q_coef[n] = (coeff_t)(CLIP(-32768, 32767, level)); + } if (!encoder->cfg.signhide_enable || ac_sum < 2) return; @@ -71,10 +73,12 @@ int32_t delta_u[LCU_WIDTH*LCU_WIDTH >> 2]; for (int32_t n = 0; n < width * height; n++) { - int32_t level; - level = coef[n]; - level = ((int64_t)abs(level) * quant_coeff[n] + add) >> q_bits; - delta_u[n] = (int32_t)(((int64_t)abs(coef[n]) * quant_coeff[n] - (level << q_bits)) >> q_bits8); + int32_t level = coef[n]; + int64_t abs_level = (int64_t)abs(level); + int32_t curr_quant_coeff = quant_coeff[n]; + + level = (abs_level * curr_quant_coeff + add) >> q_bits; + delta_u[n] = (int32_t)((abs_level * curr_quant_coeff - (level << q_bits)) >> q_bits8); } if (ac_sum >= 2) { @@ -208,7 +212,7 @@ kvz_transformskip(state->encoder_control, residual, coeff, width); } else { - kvz_transform2d(state->encoder_control, residual, coeff, width, (color == COLOR_Y ? 0 : 65535)); + kvz_transform2d(state->encoder_control, residual, coeff, width, color, cur_cu->type); } // Quantize coeffs. (coeff -> coeff_out) @@ -246,7 +250,7 @@ kvz_itransformskip(state->encoder_control, residual, coeff, width); } else { - kvz_itransform2d(state->encoder_control, residual, coeff, width, (color == COLOR_Y ? 0 : 65535)); + kvz_itransform2d(state->encoder_control, residual, coeff, width, color, cur_cu->type); } // Get quantized reconstruction. (residual + pred_in -> rec_out) @@ -329,6 +333,48 @@ return sum; } +static INLINE int16_t to_q88(float f) +{ + return (int16_t)(f * 256.0f); +} + +static uint32_t fast_coeff_cost_generic(const coeff_t *coeff, int32_t width, int32_t qp) +{ + uint32_t sum = 0; +#define NUM_BUCKETS 5 + const int16_t wt_m[NUM_BUCKETS] = { + to_q88(-0.004916), + to_q88(0.010806), + to_q88(0.055562), + to_q88(0.033436), + to_q88(-0.007690), + }; + const int16_t wt_c[NUM_BUCKETS] = { + to_q88(0.172024), + to_q88(3.421462), + to_q88(2.879506), + to_q88(5.585471), + to_q88(0.256772), + }; + + int16_t wt[NUM_BUCKETS]; + for (int32_t i = 0; i < NUM_BUCKETS; i++) + wt[i] = wt_m[i] * qp + wt_c[i]; + + for (int32_t i = 0; i < width * width; i++) { + int16_t curr = coeff[i]; + int16_t signmask = curr >> 15; + int16_t curr_abs = (curr ^ signmask) - signmask; + if (curr_abs > 3) + curr_abs = 3; + + sum += wt[curr_abs]; + } + sum += wt[NUM_BUCKETS - 1] * width; + return sum >> 8; +#undef NUM_BUCKETS +} + int kvz_strategy_register_quant_generic(void* opaque, uint8_t bitdepth) { bool success = true; @@ -337,6 +383,7 @@ success &= kvz_strategyselector_register(opaque, "quantize_residual", "generic", 0, &kvz_quantize_residual_generic); success &= kvz_strategyselector_register(opaque, "dequant", "generic", 0, &kvz_dequant_generic); success &= kvz_strategyselector_register(opaque, "coeff_abs_sum", "generic", 0, &coeff_abs_sum_generic); + success &= kvz_strategyselector_register(opaque, "fast_coeff_cost", "generic", 0, &fast_coeff_cost_generic); return success; }
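The fast_coeff_cost_generic function above works in Q8.8 fixed point: to_q88() stores value * 256, the per-bucket weight wt = wt_m * qp + wt_c stays in Q8.8 because qp is an integer, and the accumulated sum is converted back to an integer cost by the final >> 8. A short standalone illustration using the wt_m[1]/wt_c[1] pair from the table above (hypothetical example code; the meaning of the buckets is not spelled out here):

#include <stdint.h>
#include <stdio.h>

static int16_t to_q88(float f) { return (int16_t)(f * 256.0f); }

int main(void)
{
  const int32_t qp = 22;
  int16_t m = to_q88(0.010806f);   /* wt_m[1] from the table above  */
  int16_t c = to_q88(3.421462f);   /* wt_c[1] from the table above  */
  int32_t wt = m * qp + c;         /* per-bucket weight, still Q8.8 */

  /* cost contribution of ten coefficients that fall into this bucket */
  uint32_t sum = 10u * (uint32_t)wt;
  printf("approximate cost: %u\n", (unsigned)(sum >> 8));
  return 0;
}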
View file
kvazaar-1.3.0.tar.gz/src/strategies/missing-intel-intrinsics.h
Added
@@ -0,0 +1,21 @@
+#ifndef MISSING_INTEL_INTRINSICS_H_
+#define MISSING_INTEL_INTRINSICS_H_
+
+#include <immintrin.h>
+
+// Old Visual Studio headers lack the bsrli variant
+#ifndef _mm_bsrli_si128
+  #define _mm_bsrli_si128(a, imm8) _mm_srli_si128((a), (imm8))
+#endif
+
+// GCC headers apparently won't have this at all.. sigh
+#ifndef _andn_u32
+  // VS2015 headers apparently won't have this at all.. sigh
+  #ifdef __andn_u32
+    #define _andn_u32(x, y) (__andn_u32((x), (y)))
+  #else
+    #define _andn_u32(x, y) ((~(x)) & (y))
+  #endif // __andn_u32
+#endif // _andn_u32
+
+#endif
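The _andn_u32 fallback above relies on the identity andn(x, y) = (~x) & y, i.e. y with every bit that is set in x cleared. A quick check of that identity (hypothetical test code, not part of the package):

#include <assert.h>
#include <stdint.h>

int main(void)
{
  uint32_t x = 0x0000ffffu, y = 0x12345678u;
  /* clearing the low 16 bits of y leaves only its high half */
  assert(((~x) & y) == 0x12340000u);
  return 0;
}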
View file
kvazaar-1.3.0.tar.gz/src/strategies/optimized_sad_func_ptr_t.h
Added
@@ -0,0 +1,19 @@
+#ifndef OPTIMIZED_SAD_FUNC_T_H_
+#define OPTIMIZED_SAD_FUNC_T_H_
+
+#include "kvazaar.h"
+
+/**
+ * \param data1: Picture block pointer
+ * \param data2: Reference block pointer
+ * \param height: Scan block height
+ * \param stride1: Picture block stride
+ * \param stride2: Reference block stride
+ */
+typedef uint32_t (*optimized_sad_func_ptr_t)(const kvz_pixel * const,
+                                             const kvz_pixel * const,
+                                             const int32_t,
+                                             const uint32_t,
+                                             const uint32_t);
+
+#endif
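A sketch of how this function-pointer type is meant to be consumed, based on the registrations elsewhere in this revision: get_optimized_sad returns a width-specialized kernel or NULL (the generic strategy always returns NULL), so a caller resolves the pointer per block width and otherwise falls back to the general reg_sad strategy. The kvz_get_optimized_sad / kvz_reg_sad pointer names and the surrounding kvazaar headers are assumed here; this is illustrative code, not code from the package:

/* Assumes the kvazaar strategy headers declaring kvz_get_optimized_sad and
 * kvz_reg_sad are in scope. */
static uint32_t block_sad(const kvz_pixel *pic, const kvz_pixel *ref,
                          int32_t width, int32_t height,
                          uint32_t pic_stride, uint32_t ref_stride)
{
  /* Resolve a width-specialized SAD kernel once; the pointer's argument
   * order is (data1, data2, height, stride1, stride2) as documented above. */
  optimized_sad_func_ptr_t sad_fn = kvz_get_optimized_sad(width);
  if (sad_fn) {
    return sad_fn(pic, ref, height, pic_stride, ref_stride);
  }
  /* No specialized kernel for this width: use the general implementation. */
  return kvz_reg_sad(pic, ref, width, height, pic_stride, ref_stride);
}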
View file
kvazaar-1.2.0.tar.gz/src/strategies/sse41/picture-sse41.c -> kvazaar-1.3.0.tar.gz/src/strategies/sse41/picture-sse41.c
Changed
@@ -18,73 +18,201 @@ * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. ****************************************************************************/ -#include "strategies/sse41/picture-sse41.h" +#include "global.h" #if COMPILE_INTEL_SSE41 +#include "strategies/sse41/picture-sse41.h" +#include "strategies/sse41/reg_sad_pow2_widths-sse41.h" + #include <immintrin.h> #include <stdlib.h> #include "kvazaar.h" #include "strategyselector.h" +uint32_t kvz_reg_sad_sse41(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t width, const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + if (width == 0) + return 0; + if (width == 4) + return reg_sad_w4(data1, data2, height, stride1, stride2); + if (width == 8) + return reg_sad_w8(data1, data2, height, stride1, stride2); + if (width == 12) + return reg_sad_w12(data1, data2, height, stride1, stride2); + if (width == 16) + return reg_sad_w16(data1, data2, height, stride1, stride2); + if (width == 24) + return reg_sad_w24(data1, data2, height, stride1, stride2); + else + return reg_sad_arbitrary(data1, data2, width, height, stride1, stride2); +} -unsigned kvz_reg_sad_sse41(const kvz_pixel * const data1, const kvz_pixel * const data2, - const int width, const int height, const unsigned stride1, const unsigned stride2) +static optimized_sad_func_ptr_t get_optimized_sad_sse41(int32_t width) { - int y, x; - unsigned sad = 0; - __m128i sse_inc = _mm_setzero_si128 (); - long long int sse_inc_array[2]; - - for (y = 0; y < height; ++y) { - for (x = 0; x <= width-16; x+=16) { - const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]); - const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]); - sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a,b)); - } - - { - const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]); - const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]); - switch (((width - (width%2)) - x)/2) { - case 0: - break; - case 1: - sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x01))); - break; - case 2: - sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x03))); - break; - case 3: - sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x07))); - break; - case 4: - sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x0f))); - break; - case 5: - sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x1f))); - break; - case 6: - sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x3f))); - break; - case 7: - sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x7f))); - break; - default: - //Should not happen - assert(0); - } - x = (width - (width%2)); - } - - for (; x < width; ++x) { - sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]); - } + if (width == 0) + return reg_sad_w0; + if (width == 4) + return reg_sad_w4; + if (width == 8) + return reg_sad_w8; + if (width == 12) + return reg_sad_w12; + if (width == 16) + return reg_sad_w16; + if (width == 24) + return reg_sad_w24; + else + return NULL; +} + +static uint32_t ver_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t stride) +{ + if (width == 0) + return 0; + if (width == 4) + return ver_sad_w4(pic_data, ref_data, height, stride); + if (width == 8) + return ver_sad_w8(pic_data, ref_data, height, stride); + if (width == 12) + return ver_sad_w12(pic_data, ref_data, 
height, stride); + if (width == 16) + return ver_sad_w16(pic_data, ref_data, height, stride); + else + return ver_sad_arbitrary(pic_data, ref_data, width, height, stride); +} + +static uint32_t hor_sad_sse41_w32(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t pic_stride, uint32_t ref_stride, + uint32_t left, uint32_t right) +{ + const size_t vec_width = 16; + const uint32_t blkwidth_log2 = 5; + const uint32_t left_eq_wid = left >> blkwidth_log2; + const uint32_t right_eq_wid = right >> blkwidth_log2; + const int32_t left_clamped = left - left_eq_wid; + const int32_t right_clamped = right - right_eq_wid; + + const int32_t height_twoline_groups = height & ~1; + const int32_t height_residual_lines = height & 1; + + const __m128i zero = _mm_setzero_si128(); + const __m128i vec_widths = _mm_set1_epi8((uint8_t)vec_width); + const __m128i lefts = _mm_set1_epi8((uint8_t)left_clamped); + const __m128i rights = _mm_set1_epi8((uint8_t)right_clamped); + const __m128i nslo = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const __m128i nshi = _mm_add_epi8 (nslo, vec_widths); + + const __m128i rightmost_good_idx = _mm_set1_epi8((uint8_t)((vec_width << 1) - right - 1)); + + const __m128i epol_mask_right_lo = _mm_min_epi8 (nslo, rightmost_good_idx); + const __m128i epol_mask_right_hi = _mm_min_epi8 (nshi, rightmost_good_idx); + const __m128i epol_mask_lo = _mm_max_epi8 (lefts, epol_mask_right_lo); + const __m128i epol_mask_hi = _mm_max_epi8 (lefts, epol_mask_right_hi); + + const __m128i is_left = _mm_cmpeq_epi8(rights, zero); + const __m128i vecwid_for_left = _mm_and_si128 (is_left, vec_widths); + const __m128i ns_for_shufmask = _mm_or_si128 (nslo, vecwid_for_left); + + const __m128i shufmask1_right = _mm_add_epi8 (ns_for_shufmask, rights); + const __m128i shufmask1 = _mm_sub_epi8 (shufmask1_right, lefts); + + const __m128i md2bimask = _mm_cmpgt_epi8(vec_widths, shufmask1); + const __m128i move_d_to_b_imask = _mm_or_si128 (is_left, md2bimask); + const __m128i move_b_to_d_mask = _mm_cmpgt_epi8(lefts, nslo); + + // If we're straddling the left border, start from the left border instead, + // and if right border, end on the border + const int32_t ld_offset = left - right; + + int32_t y; + __m128i sse_inc = _mm_setzero_si128(); + for (y = 0; y < height_twoline_groups; y += 2) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride + 0)); + __m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + 0 + ld_offset)); + __m128i c = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride + 16)); + __m128i d = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + 16 + ld_offset)); + __m128i e = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * pic_stride + 0)); + __m128i f = _mm_loadu_si128((__m128i *)(ref_data + (y + 1) * ref_stride + 0 + ld_offset)); + __m128i g = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * pic_stride + 16)); + __m128i h = _mm_loadu_si128((__m128i *)(ref_data + (y + 1) * ref_stride + 16 + ld_offset)); + + __m128i b_shifted = _mm_shuffle_epi8(b, shufmask1); + __m128i d_shifted = _mm_shuffle_epi8(d, shufmask1); + __m128i f_shifted = _mm_shuffle_epi8(f, shufmask1); + __m128i h_shifted = _mm_shuffle_epi8(h, shufmask1); + + // TODO: could these be optimized for two-operand efficiency? Only one of + // these ever does useful work, the other should leave the vector untouched, + // so could the first result be used in the second calculation or something? 
+ __m128i b_with_d_data = _mm_blendv_epi8(d_shifted, b_shifted, move_d_to_b_imask); + __m128i d_with_b_data = _mm_blendv_epi8(d_shifted, b_shifted, move_b_to_d_mask); + __m128i f_with_h_data = _mm_blendv_epi8(h_shifted, f_shifted, move_d_to_b_imask); + __m128i h_with_f_data = _mm_blendv_epi8(h_shifted, f_shifted, move_b_to_d_mask); + + __m128i b_final = _mm_shuffle_epi8(b_with_d_data, epol_mask_lo); + __m128i d_final = _mm_shuffle_epi8(d_with_b_data, epol_mask_hi); + __m128i f_final = _mm_shuffle_epi8(f_with_h_data, epol_mask_lo); + __m128i h_final = _mm_shuffle_epi8(h_with_f_data, epol_mask_hi); + + __m128i curr_sads_ab = _mm_sad_epu8 (a, b_final); + __m128i curr_sads_cd = _mm_sad_epu8 (c, d_final); + __m128i curr_sads_ef = _mm_sad_epu8 (e, f_final); + __m128i curr_sads_gh = _mm_sad_epu8 (g, h_final); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh); } - _mm_storeu_si128((__m128i*) sse_inc_array, sse_inc); - sad += sse_inc_array[0] + sse_inc_array[1]; + if (height_residual_lines) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride + 0)); + __m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + 0 + ld_offset)); + __m128i c = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride + 16)); + __m128i d = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + 16 + ld_offset)); - return sad; + __m128i b_shifted = _mm_shuffle_epi8(b, shufmask1); + __m128i d_shifted = _mm_shuffle_epi8(d, shufmask1); + + __m128i b_with_d_data = _mm_blendv_epi8(d_shifted, b_shifted, move_d_to_b_imask); + __m128i d_with_b_data = _mm_blendv_epi8(d_shifted, b_shifted, move_b_to_d_mask); + + __m128i b_final = _mm_shuffle_epi8(b_with_d_data, epol_mask_lo); + __m128i d_final = _mm_shuffle_epi8(d_with_b_data, epol_mask_hi); + + __m128i curr_sads_ab = _mm_sad_epu8 (a, b_final); + __m128i curr_sads_cd = _mm_sad_epu8 (c, d_final); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + return _mm_cvtsi128_si32(sad); +} + +static uint32_t hor_sad_sse41(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t pic_stride, + uint32_t ref_stride, uint32_t left, uint32_t right) +{ + if (width == 4) + return hor_sad_sse41_w4(pic_data, ref_data, height, + pic_stride, ref_stride, left, right); + if (width == 8) + return hor_sad_sse41_w8(pic_data, ref_data, height, + pic_stride, ref_stride, left, right); + if (width == 16) + return hor_sad_sse41_w16(pic_data, ref_data, height, + pic_stride, ref_stride, left, right); + if (width == 32) + return hor_sad_sse41_w32(pic_data, ref_data, height, + pic_stride, ref_stride, left, right); + else + return hor_sad_sse41_arbitrary(pic_data, ref_data, width, height, + pic_stride, ref_stride, left, right); } #endif //COMPILE_INTEL_SSE41 @@ -95,6 +223,9 @@ #if COMPILE_INTEL_SSE41 if (bitdepth == 8){ success &= kvz_strategyselector_register(opaque, "reg_sad", "sse41", 20, &kvz_reg_sad_sse41); + success &= kvz_strategyselector_register(opaque, "get_optimized_sad", "sse41", 20, &get_optimized_sad_sse41); + success &= kvz_strategyselector_register(opaque, "ver_sad", "sse41", 20, &ver_sad_sse41); + success &= kvz_strategyselector_register(opaque, "hor_sad", "sse41", 20, &hor_sad_sse41); } 
#endif return success;
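Each kernel above accumulates _mm_sad_epu8 results, which produce two 64-bit partial sums per 128-bit vector; the closing shuffle / add / _mm_cvtsi128_si32 sequence folds those halves into a single 32-bit SAD. A plain scalar reference that any of the width-specialized kernels can be checked against (hypothetical test code, mirroring the generic implementation):

#include <stdint.h>
#include <stdlib.h>

/* Plain sum of absolute differences over a width x height block. */
uint32_t reg_sad_ref(const uint8_t *data1, const uint8_t *data2,
                     int32_t width, int32_t height,
                     uint32_t stride1, uint32_t stride2)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < height; y++) {
    for (int32_t x = 0; x < width; x++) {
      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
    }
  }
  return sad;
}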
View file
kvazaar-1.3.0.tar.gz/src/strategies/sse41/reg_sad_pow2_widths-sse41.h
Added
@@ -0,0 +1,1027 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#ifndef REG_SAD_POW2_WIDTHS_SSE41_H_ +#define REG_SAD_POW2_WIDTHS_SSE41_H_ + +#include "kvazaar.h" +#include "strategies/missing-intel-intrinsics.h" +#include <immintrin.h> + +static INLINE uint32_t reg_sad_w0(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + return 0; +} + +static INLINE uint32_t reg_sad_w4(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_cvtsi32_si128(*(uint32_t *)(data1 + y * stride1)); + __m128i b = _mm_cvtsi32_si128(*(uint32_t *)(data2 + y * stride2)); + + a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 1) * stride1), 1); + b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 1) * stride2), 1); + a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 2) * stride1), 2); + b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 2) * stride2), 2); + a = _mm_insert_epi32(a, *(const uint32_t *)(data1 + (y + 3) * stride1), 3); + b = _mm_insert_epi32(b, *(const uint32_t *)(data2 + (y + 3) * stride2), 3); + + __m128i curr_sads = _mm_sad_epu8(a, b); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(data1 + y * stride1)); + __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(data2 + y * stride2)); + + __m128i curr_sads = _mm_sad_epu8(a, b); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + + return _mm_cvtsi128_si32(sad); +} + +static INLINE uint32_t reg_sad_w8(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + for (y = 0; y < height_fourline_groups; y += 4) { + __m128d a_d = _mm_setzero_pd(); + __m128d b_d = _mm_setzero_pd(); + __m128d c_d = _mm_setzero_pd(); + __m128d d_d = _mm_setzero_pd(); + + a_d = _mm_loadl_pd(a_d, (const double *)(data1 + (y + 0) * stride1)); + b_d = _mm_loadl_pd(b_d, 
(const double *)(data2 + (y + 0) * stride2)); + a_d = _mm_loadh_pd(a_d, (const double *)(data1 + (y + 1) * stride1)); + b_d = _mm_loadh_pd(b_d, (const double *)(data2 + (y + 1) * stride2)); + + c_d = _mm_loadl_pd(c_d, (const double *)(data1 + (y + 2) * stride1)); + d_d = _mm_loadl_pd(d_d, (const double *)(data2 + (y + 2) * stride2)); + c_d = _mm_loadh_pd(c_d, (const double *)(data1 + (y + 3) * stride1)); + d_d = _mm_loadh_pd(d_d, (const double *)(data2 + (y + 3) * stride2)); + + __m128i a = _mm_castpd_si128(a_d); + __m128i b = _mm_castpd_si128(b_d); + __m128i c = _mm_castpd_si128(c_d); + __m128i d = _mm_castpd_si128(d_d); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b); + __m128i curr_sads_cd = _mm_sad_epu8(c, d); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_loadl_epi64((__m128i *)(data1 + y * stride1)); + __m128i b = _mm_loadl_epi64((__m128i *)(data2 + y * stride2)); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + + return _mm_cvtsi128_si32(sad); +} + +static INLINE uint32_t reg_sad_w12(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + for (y = 0; y < height; y++) { + __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1)); + __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2)); + + __m128i b_masked = _mm_blend_epi16(a, b, 0x3f); + __m128i curr_sads = _mm_sad_epu8 (a, b_masked); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + return _mm_cvtsi128_si32(sad); +} + +static INLINE uint32_t reg_sad_w16(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1)); + __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2)); + __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1)); + __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2)); + __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1)); + __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2)); + __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1)); + __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2)); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b); + __m128i curr_sads_cd = _mm_sad_epu8(c, d); + __m128i curr_sads_ef = _mm_sad_epu8(e, f); + __m128i curr_sads_gh = _mm_sad_epu8(g, h); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1)); + __m128i b = 
_mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2)); + + __m128i curr_sads = _mm_sad_epu8(a, b); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + return _mm_cvtsi128_si32(sad); +} + +static INLINE uint32_t reg_sad_w24(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + + const int32_t height_doublelines = height & ~1; + const int32_t height_parity = height & 1; + + for (y = 0; y < height_doublelines; y += 2) { + __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1)); + __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2)); + __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1)); + __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2)); + + __m128d e_d = _mm_setzero_pd(); + __m128d f_d = _mm_setzero_pd(); + + e_d = _mm_loadl_pd(e_d, (const double *)(data1 + (y + 0) * stride1 + 16)); + f_d = _mm_loadl_pd(f_d, (const double *)(data2 + (y + 0) * stride2 + 16)); + e_d = _mm_loadh_pd(e_d, (const double *)(data1 + (y + 1) * stride1 + 16)); + f_d = _mm_loadh_pd(f_d, (const double *)(data2 + (y + 1) * stride2 + 16)); + + __m128i e = _mm_castpd_si128(e_d); + __m128i f = _mm_castpd_si128(f_d); + + __m128i curr_sads_1 = _mm_sad_epu8(a, b); + __m128i curr_sads_2 = _mm_sad_epu8(c, d); + __m128i curr_sads_3 = _mm_sad_epu8(e, f); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_1); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_2); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_3); + } + if (height_parity) { + __m128i a = _mm_loadu_si128 ((const __m128i *)(data1 + y * stride1)); + __m128i b = _mm_loadu_si128 ((const __m128i *)(data2 + y * stride2)); + __m128i c = _mm_loadl_epi64 ((const __m128i *)(data1 + y * stride1 + 16)); + __m128i d = _mm_loadl_epi64 ((const __m128i *)(data2 + y * stride2 + 16)); + + __m128i curr_sads_1 = _mm_sad_epu8(a, b); + __m128i curr_sads_2 = _mm_sad_epu8(c, d); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_1); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_2); + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + return _mm_cvtsi128_si32(sad); +} + +static INLINE uint32_t reg_sad_arbitrary(const kvz_pixel * const data1, const kvz_pixel * const data2, + const int32_t width, const int32_t height, const uint32_t stride1, + const uint32_t stride2) +{ + int32_t y, x; + __m128i sse_inc = _mm_setzero_si128(); + + // Bytes in block in 128-bit blocks per each scanline, and remainder + const int32_t width_xmms = width & ~15; + const int32_t width_residual_pixels = width & 15; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + const __m128i rds = _mm_set1_epi8 (width_residual_pixels); + const __m128i ns = _mm_setr_epi8 (0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + const __m128i rdmask = _mm_cmpgt_epi8(rds, ns); + + for (x = 0; x < width_xmms; x += 16) { + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1 + x)); + __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2 + x)); + __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1 + x)); + __m128i d = _mm_loadu_si128((const 
__m128i *)(data2 + (y + 1) * stride2 + x)); + __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1 + x)); + __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2 + x)); + __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1 + x)); + __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2 + x)); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b); + __m128i curr_sads_cd = _mm_sad_epu8(c, d); + __m128i curr_sads_ef = _mm_sad_epu8(e, f); + __m128i curr_sads_gh = _mm_sad_epu8(g, h); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1 + x)); + __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2 + x)); + + __m128i curr_sads = _mm_sad_epu8(a, b); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + } + + if (width_residual_pixels) { + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_loadu_si128((const __m128i *)(data1 + (y + 0) * stride1 + x)); + __m128i b = _mm_loadu_si128((const __m128i *)(data2 + (y + 0) * stride2 + x)); + __m128i c = _mm_loadu_si128((const __m128i *)(data1 + (y + 1) * stride1 + x)); + __m128i d = _mm_loadu_si128((const __m128i *)(data2 + (y + 1) * stride2 + x)); + __m128i e = _mm_loadu_si128((const __m128i *)(data1 + (y + 2) * stride1 + x)); + __m128i f = _mm_loadu_si128((const __m128i *)(data2 + (y + 2) * stride2 + x)); + __m128i g = _mm_loadu_si128((const __m128i *)(data1 + (y + 3) * stride1 + x)); + __m128i h = _mm_loadu_si128((const __m128i *)(data2 + (y + 3) * stride2 + x)); + + __m128i b_masked = _mm_blendv_epi8(a, b, rdmask); + __m128i d_masked = _mm_blendv_epi8(c, d, rdmask); + __m128i f_masked = _mm_blendv_epi8(e, f, rdmask); + __m128i h_masked = _mm_blendv_epi8(g, h, rdmask); + + __m128i curr_sads_ab = _mm_sad_epu8 (a, b_masked); + __m128i curr_sads_cd = _mm_sad_epu8 (c, d_masked); + __m128i curr_sads_ef = _mm_sad_epu8 (e, f_masked); + __m128i curr_sads_gh = _mm_sad_epu8 (g, h_masked); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_loadu_si128((const __m128i *)(data1 + y * stride1 + x)); + __m128i b = _mm_loadu_si128((const __m128i *)(data2 + y * stride2 + x)); + + __m128i b_masked = _mm_blendv_epi8(a, b, rdmask); + __m128i curr_sads = _mm_sad_epu8 (a, b_masked); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + + return _mm_cvtsi128_si32(sad); +} + +static uint32_t ver_sad_w4(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t stride) +{ + __m128i ref_row = _mm_set1_epi32(*(const uint32_t *)ref_data); + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_cvtsi32_si128(*(uint32_t *)(pic_data + y * stride)); + + a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 1) * stride), 1); + a = _mm_insert_epi32(a, 
*(const uint32_t *)(pic_data + (y + 2) * stride), 2); + a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 3) * stride), 3); + + __m128i curr_sads = _mm_sad_epu8(a, ref_row); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + if (height_residual_lines) { + // Only pick the last dword, because we're comparing single dwords (lines) + ref_row = _mm_bsrli_si128(ref_row, 12); + + for (; y < height; y++) { + __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * stride)); + + __m128i curr_sads = _mm_sad_epu8(a, ref_row); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + + return _mm_cvtsi128_si32(sad); +} + +static uint32_t ver_sad_w8(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t stride) +{ + const __m128i ref_row = _mm_set1_epi64x(*(const uint64_t *)ref_data); + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + for (y = 0; y < height_fourline_groups; y += 4) { + __m128d a_d = _mm_setzero_pd(); + __m128d c_d = _mm_setzero_pd(); + + a_d = _mm_loadl_pd(a_d, (const double *)(pic_data + (y + 0) * stride)); + a_d = _mm_loadh_pd(a_d, (const double *)(pic_data + (y + 1) * stride)); + + c_d = _mm_loadl_pd(c_d, (const double *)(pic_data + (y + 2) * stride)); + c_d = _mm_loadh_pd(c_d, (const double *)(pic_data + (y + 3) * stride)); + + __m128i a = _mm_castpd_si128(a_d); + __m128i c = _mm_castpd_si128(c_d); + + __m128i curr_sads_ab = _mm_sad_epu8(a, ref_row); + __m128i curr_sads_cd = _mm_sad_epu8(c, ref_row); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + } + if (height_residual_lines) { + __m128i b = _mm_move_epi64(ref_row); + + for (; y < height; y++) { + __m128i a = _mm_loadl_epi64((__m128i *)(pic_data + y * stride)); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + + return _mm_cvtsi128_si32(sad); +} + +static uint32_t ver_sad_w12(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t stride) +{ + const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data); + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + + for (y = 0; y < height; y++) { + __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride)); + + __m128i a_masked = _mm_blend_epi16(ref_row, a, 0x3f); + __m128i curr_sads = _mm_sad_epu8 (ref_row, a_masked); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + return _mm_cvtsi128_si32(sad); +} + +static uint32_t ver_sad_w16(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t stride) +{ + const __m128i ref_row = _mm_loadu_si128((__m128i *)ref_data); + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i pic_row_1 = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride)); + __m128i pic_row_2 = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * stride)); + __m128i pic_row_3 = _mm_loadu_si128((__m128i 
*)(pic_data + (y + 2) * stride)); + __m128i pic_row_4 = _mm_loadu_si128((__m128i *)(pic_data + (y + 3) * stride)); + + __m128i curr_sads_1 = _mm_sad_epu8 (pic_row_1, ref_row); + __m128i curr_sads_2 = _mm_sad_epu8 (pic_row_2, ref_row); + __m128i curr_sads_3 = _mm_sad_epu8 (pic_row_3, ref_row); + __m128i curr_sads_4 = _mm_sad_epu8 (pic_row_4, ref_row); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_1); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_2); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_3); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_4); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i pic_row = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * stride)); + __m128i curr_sads = _mm_sad_epu8 (pic_row, ref_row); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + + return _mm_cvtsi128_si32(sad); +} + +static uint32_t ver_sad_arbitrary(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t stride) +{ + int32_t y, x; + __m128i sse_inc = _mm_setzero_si128(); + + // Bytes in block in 128-bit blocks per each scanline, and remainder + const int32_t width_xmms = width & ~15; + const int32_t width_residual_pixels = width & 15; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + const __m128i rds = _mm_set1_epi8 (width_residual_pixels); + const __m128i ns = _mm_setr_epi8 (0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + const __m128i rdmask = _mm_cmpgt_epi8(rds, ns); + + for (x = 0; x < width_xmms; x += 16) { + const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x)); + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x)); + __m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x)); + __m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x)); + __m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x)); + + __m128i curr_sads_ab = _mm_sad_epu8(ref_row, a); + __m128i curr_sads_cd = _mm_sad_epu8(ref_row, c); + __m128i curr_sads_ef = _mm_sad_epu8(ref_row, e); + __m128i curr_sads_gh = _mm_sad_epu8(ref_row, g); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x)); + + __m128i curr_sads = _mm_sad_epu8(a, ref_row); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + } + + if (width_residual_pixels) { + const __m128i ref_row = _mm_loadu_si128((__m128i *)(ref_data + x)); + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + (y + 0) * stride + x)); + __m128i c = _mm_loadu_si128((const __m128i *)(pic_data + (y + 1) * stride + x)); + __m128i e = _mm_loadu_si128((const __m128i *)(pic_data + (y + 2) * stride + x)); + __m128i g = _mm_loadu_si128((const __m128i *)(pic_data + (y + 3) * stride + x)); + + __m128i a_masked = _mm_blendv_epi8(ref_row, a, rdmask); + __m128i c_masked = _mm_blendv_epi8(ref_row, c, rdmask); + __m128i e_masked = _mm_blendv_epi8(ref_row, e, rdmask); + __m128i g_masked = _mm_blendv_epi8(ref_row, g, rdmask); + + __m128i curr_sads_ab 
= _mm_sad_epu8 (ref_row, a_masked); + __m128i curr_sads_cd = _mm_sad_epu8 (ref_row, c_masked); + __m128i curr_sads_ef = _mm_sad_epu8 (ref_row, e_masked); + __m128i curr_sads_gh = _mm_sad_epu8 (ref_row, g_masked); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_loadu_si128((const __m128i *)(pic_data + y * stride + x)); + + __m128i a_masked = _mm_blendv_epi8(ref_row, a, rdmask); + __m128i curr_sads = _mm_sad_epu8 (ref_row, a_masked); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + + return _mm_cvtsi128_si32(sad); +} + +static uint32_t hor_sad_sse41_w4(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t pic_stride, uint32_t ref_stride, + uint32_t left, uint32_t right) +{ + const int32_t right_border_idx = 3 - right; + const int32_t border_idx = left ? left : right_border_idx; + + const __m128i ns = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + + const int32_t border_idx_negative = border_idx >> 31; + const int32_t leftoff = border_idx_negative | left; + + // Dualword (ie. line) base indexes, ie. the edges the lines read will be + // clamped towards + const __m128i dwbaseids = _mm_setr_epi8(0, 0, 0, 0, 4, 4, 4, 4, + 8, 8, 8, 8, 12, 12, 12, 12); + + __m128i right_border_idxs = _mm_set1_epi8((int8_t)right_border_idx); + __m128i left_128 = _mm_set1_epi8((int8_t)left); + + right_border_idxs = _mm_add_epi8 (right_border_idxs, dwbaseids); + + __m128i mask_right = _mm_min_epi8 (ns, right_border_idxs); + __m128i mask1 = _mm_sub_epi8 (mask_right, left_128); + + const __m128i epol_mask = _mm_max_epi8(mask1, dwbaseids); + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * pic_stride)); + __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(ref_data + y * ref_stride + leftoff)); + + a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 1) * pic_stride), 1); + b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 1) * ref_stride + leftoff), 1); + a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 2) * pic_stride), 2); + b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 2) * ref_stride + leftoff), 2); + a = _mm_insert_epi32(a, *(const uint32_t *)(pic_data + (y + 3) * pic_stride), 3); + b = _mm_insert_epi32(b, *(const uint32_t *)(ref_data + (y + 3) * ref_stride + leftoff), 3); + + __m128i b_epol = _mm_shuffle_epi8(b, epol_mask); + __m128i curr_sads = _mm_sad_epu8 (a, b_epol); + sse_inc = _mm_add_epi64 (sse_inc, curr_sads); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_cvtsi32_si128(*(const uint32_t *)(pic_data + y * pic_stride)); + __m128i b = _mm_cvtsi32_si128(*(const uint32_t *)(ref_data + y * ref_stride + leftoff)); + + __m128i b_epol = _mm_shuffle_epi8(b, epol_mask); + __m128i curr_sads = _mm_sad_epu8 (a, b_epol); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); 
+ + return _mm_cvtsi128_si32(sad); +} + +static uint32_t hor_sad_sse41_w8(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t pic_stride, uint32_t ref_stride, + uint32_t left, uint32_t right) +{ + // right is the number of overhanging pixels in the vector, so it has to be + // handled this way to produce the index of last valid (border) pixel + const int32_t right_border_idx = 7 - right; + const int32_t border_idx = left ? left : right_border_idx; + + const __m128i ns = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + + // Quadword (ie. line) base indexes, ie. the edges the lines read will be + // clamped towards; higher qword (lower line) bytes tend towards 8 and lower + // qword (higher line) bytes towards 0 + const __m128i qwbaseids = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, + 8, 8, 8, 8, 8, 8, 8, 8); + + // Dirty hack alert! If right == block_width (ie. the entire vector is + // outside the frame), move the block offset one pixel to the left (so + // that the leftmost pixel in vector is actually the valid border pixel + // from which we want to extrapolate), and use an epol mask that will + // simply stretch the pixel all over the vector. + // + // To avoid a branch here: + // The mask will be -1 (0xffffffff) for border_idx -1 and 0 for >= 0 + const int32_t border_idx_negative = border_idx >> 31; + const int32_t leftoff = border_idx_negative | left; + + __m128i right_border_idxs = _mm_set1_epi8((int8_t)right_border_idx); + __m128i left_128 = _mm_set1_epi8((int8_t)left); + + right_border_idxs = _mm_add_epi8 (right_border_idxs, qwbaseids); + + // If we're straddling the left border, right_border_idx is 7 and the first + // operation does nothing. If right border, left is 0 and the second + // operation does nothing. + __m128i mask_right = _mm_min_epi8 (ns, right_border_idxs); + __m128i mask1 = _mm_sub_epi8 (mask_right, left_128); + + // If right == 8 (we're completely outside the frame), right_border_idx is + // -1 and so is mask1. Clamp negative values to qwbaseid and as discussed + // earlier, adjust the load offset instead to load the "-1'st" pixels and + // using qwbaseids as the shuffle mask, broadcast it all over the rows. 
+ const __m128i epol_mask = _mm_max_epi8(mask1, qwbaseids); + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + for (y = 0; y < height_fourline_groups; y += 4) { + __m128d a_d = _mm_setzero_pd(); + __m128d b_d = _mm_setzero_pd(); + __m128d c_d = _mm_setzero_pd(); + __m128d d_d = _mm_setzero_pd(); + + a_d = _mm_loadl_pd(a_d, (const double *)(pic_data + (y + 0) * pic_stride)); + b_d = _mm_loadl_pd(b_d, (const double *)(ref_data + (y + 0) * ref_stride + leftoff)); + a_d = _mm_loadh_pd(a_d, (const double *)(pic_data + (y + 1) * pic_stride)); + b_d = _mm_loadh_pd(b_d, (const double *)(ref_data + (y + 1) * ref_stride + leftoff)); + + c_d = _mm_loadl_pd(c_d, (const double *)(pic_data + (y + 2) * pic_stride)); + d_d = _mm_loadl_pd(d_d, (const double *)(ref_data + (y + 2) * ref_stride + leftoff)); + c_d = _mm_loadh_pd(c_d, (const double *)(pic_data + (y + 3) * pic_stride)); + d_d = _mm_loadh_pd(d_d, (const double *)(ref_data + (y + 3) * ref_stride + leftoff)); + + __m128i a = _mm_castpd_si128(a_d); + __m128i b = _mm_castpd_si128(b_d); + __m128i c = _mm_castpd_si128(c_d); + __m128i d = _mm_castpd_si128(d_d); + + __m128i b_epol = _mm_shuffle_epi8(b, epol_mask); + __m128i d_epol = _mm_shuffle_epi8(d, epol_mask); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b_epol); + __m128i curr_sads_cd = _mm_sad_epu8(c, d_epol); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_loadl_epi64((__m128i *)(pic_data + y * pic_stride)); + __m128i b = _mm_loadl_epi64((__m128i *)(ref_data + y * ref_stride + leftoff)); + + __m128i b_epol = _mm_shuffle_epi8(b, epol_mask); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b_epol); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + return _mm_cvtsi128_si32(sad); +} + +/* + * left and right measure how many pixels of one horizontal scanline will be + * outside either the left or the right screen border. For blocks straddling + * the left border, read the scanlines starting from the left border instead, + * and use the extrapolation mask to essentially move the pixels right while + * copying the left border pixel to the vector positions that logically point + * outside of the buffer. + * + * For blocks straddling the right border, just read over the right border, + * and extrapolate all pixels beyond the border idx to copy the value of the + * border pixel. An exception is right == width (leftmost reference pixel is + * one place right from the right border, it's ugly because the pixel to + * extrapolate from is located at relative X offset -1), abuse the left border + * aligning functionality instead to actually read starting from the valid + * border pixel, and use a suitable mask to fill all the other pixels with + * that value. + */ +static uint32_t hor_sad_sse41_w16(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t height, uint32_t pic_stride, uint32_t ref_stride, + const uint32_t left, const uint32_t right) +{ + // right is the number of overhanging pixels in the vector, so it has to be + // handled this way to produce the index of last valid (border) pixel + const int32_t right_border_idx = 15 - right; + const int32_t border_idx = left ? 
left : right_border_idx; + + const __m128i ns = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + const __m128i zero = _mm_setzero_si128(); + + // Dirty hack alert! If right == block_width (ie. the entire vector is + // outside the frame), move the block offset one pixel to the left (so + // that the leftmost pixel in vector is actually the valid border pixel + // from which we want to extrapolate), and use an epol mask that will + // simply stretch the pixel all over the vector. + // + // To avoid a branch here: + // The mask will be -1 (0xffffffff) for border_idx -1 and 0 for >= 0 + const int32_t border_idx_negative = border_idx >> 31; + const int32_t leftoff = border_idx_negative | left; + + __m128i right_border_idxs = _mm_set1_epi8((int8_t)right_border_idx); + __m128i left_128 = _mm_set1_epi8((int8_t)left); + + // If we're straddling the left border, right_border_idx is 15 and the first + // operation does nothing. If right border, left is 0 and the second + // operation does nothing. + __m128i mask_right = _mm_min_epi8 (ns, right_border_idxs); + __m128i mask1 = _mm_sub_epi8 (mask_right, left_128); + + // If right == 16 (we're completely outside the frame), right_border_idx is + // -1 and so is mask1. Clamp negative values to zero and as discussed + // earlier, adjust the load offset instead to load the "-1'st" pixel and + // using an all-zero shuffle mask, broadcast it all over the vector. + const __m128i epol_mask = _mm_max_epi8(mask1, zero); + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + __m128i sse_inc = _mm_setzero_si128(); + int32_t y; + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride)); + __m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + leftoff)); + __m128i c = _mm_loadu_si128((__m128i *)(pic_data + (y + 1) * pic_stride)); + __m128i d = _mm_loadu_si128((__m128i *)(ref_data + (y + 1) * ref_stride + leftoff)); + __m128i e = _mm_loadu_si128((__m128i *)(pic_data + (y + 2) * pic_stride)); + __m128i f = _mm_loadu_si128((__m128i *)(ref_data + (y + 2) * ref_stride + leftoff)); + __m128i g = _mm_loadu_si128((__m128i *)(pic_data + (y + 3) * pic_stride)); + __m128i h = _mm_loadu_si128((__m128i *)(ref_data + (y + 3) * ref_stride + leftoff)); + + __m128i b_epol = _mm_shuffle_epi8(b, epol_mask); + __m128i d_epol = _mm_shuffle_epi8(d, epol_mask); + __m128i f_epol = _mm_shuffle_epi8(f, epol_mask); + __m128i h_epol = _mm_shuffle_epi8(h, epol_mask); + + __m128i curr_sads_ab = _mm_sad_epu8(a, b_epol); + __m128i curr_sads_cd = _mm_sad_epu8(c, d_epol); + __m128i curr_sads_ef = _mm_sad_epu8(e, f_epol); + __m128i curr_sads_gh = _mm_sad_epu8(g, h_epol); + + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ab); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_cd); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_ef); + sse_inc = _mm_add_epi64(sse_inc, curr_sads_gh); + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + (y + 0) * pic_stride)); + __m128i b = _mm_loadu_si128((__m128i *)(ref_data + (y + 0) * ref_stride + leftoff)); + __m128i b_epol = _mm_shuffle_epi8(b, epol_mask); + __m128i curr_sads = _mm_sad_epu8(a, b_epol); + sse_inc = _mm_add_epi64(sse_inc, curr_sads); + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + return _mm_cvtsi128_si32(sad); +} + +static INLINE uint32_t 
hor_sad_sse41_arbitrary(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t pic_stride, + uint32_t ref_stride, uint32_t left, uint32_t right) +{ + __m128i sse_inc = _mm_setzero_si128(); + + const size_t vec_width = 16; + const size_t vecwid_bitmask = 15; + const size_t vec_width_log2 = 4; + + const int32_t height_fourline_groups = height & ~3; + const int32_t height_residual_lines = height & 3; + + const __m128i rights = _mm_set1_epi8((uint8_t)right); + const __m128i blk_widths = _mm_set1_epi8((uint8_t)width); + const __m128i vec_widths = _mm_set1_epi8((uint8_t)vec_width); + const __m128i nslo = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + + uint32_t outside_vecs, inside_vecs, left_offset, is_left_bm; + int32_t outside_width, inside_width, border_off, invec_lstart, + invec_lend, invec_linc; + if (left) { + outside_vecs = left >> vec_width_log2; + inside_vecs = (( width + vecwid_bitmask) >> vec_width_log2) - outside_vecs; + outside_width = outside_vecs * vec_width; + inside_width = inside_vecs * vec_width; + left_offset = left; + border_off = left; + invec_lstart = 0; + invec_lend = inside_vecs; + invec_linc = 1; + is_left_bm = -1; + } else { + inside_vecs = ((width - right) + vecwid_bitmask) >> vec_width_log2; + outside_vecs = (( width + vecwid_bitmask) >> vec_width_log2) - inside_vecs; + outside_width = outside_vecs * vec_width; + inside_width = inside_vecs * vec_width; + left_offset = right - width; + border_off = width - 1 - right; + invec_lstart = inside_vecs - 1; + invec_lend = -1; + invec_linc = -1; + is_left_bm = 0; + } + left_offset &= vecwid_bitmask; + + const __m128i left_offsets = _mm_set1_epi8 ((uint8_t)left_offset); + const __m128i is_left = _mm_cmpeq_epi8(rights, _mm_setzero_si128()); + const __m128i vw_for_left = _mm_and_si128 (is_left, vec_widths); + + // -x == (x ^ 0xff) + 1 = (x ^ 0xff) - 0xff. Also x == (x ^ 0x00) - 0x00. + // in other words, calculate inverse of left_offsets if is_left is true. 
+ const __m128i offs_neg = _mm_xor_si128 (left_offsets, is_left); + const __m128i offs_for_sm1 = _mm_sub_epi8 (offs_neg, is_left); + + const __m128i ns_for_sm1 = _mm_or_si128 (vw_for_left, nslo); + const __m128i shufmask1 = _mm_add_epi8 (ns_for_sm1, offs_for_sm1); + + const __m128i mo2bmask_l = _mm_cmpgt_epi8(left_offsets, nslo); + const __m128i mo2bimask_l = _mm_cmpeq_epi8(mo2bmask_l, _mm_setzero_si128()); + const __m128i mo2bimask_r = _mm_cmpgt_epi8(vec_widths, shufmask1); + const __m128i move_old_to_b_imask = _mm_blendv_epi8(mo2bimask_r, mo2bimask_l, is_left); + + const int32_t outvec_offset = (~is_left_bm) & inside_width; + int32_t x, y; + for (y = 0; y < height_fourline_groups; y += 4) { + __m128i borderpx_vec_b = _mm_set1_epi8(ref_data[(int32_t)((y + 0) * ref_stride + border_off)]); + __m128i borderpx_vec_d = _mm_set1_epi8(ref_data[(int32_t)((y + 1) * ref_stride + border_off)]); + __m128i borderpx_vec_f = _mm_set1_epi8(ref_data[(int32_t)((y + 2) * ref_stride + border_off)]); + __m128i borderpx_vec_h = _mm_set1_epi8(ref_data[(int32_t)((y + 3) * ref_stride + border_off)]); + + for (x = 0; x < outside_vecs; x++) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + outvec_offset)); + __m128i c = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 1) * pic_stride + outvec_offset)); + __m128i e = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 2) * pic_stride + outvec_offset)); + __m128i g = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 3) * pic_stride + outvec_offset)); + + __m128i startoffs = _mm_set1_epi8 ((x + inside_vecs) << vec_width_log2); + __m128i ns = _mm_add_epi8 (startoffs, nslo); + + // Unread imask is (is_left NOR unrd_imask_for_right), do the maths etc + __m128i unrd_imask = _mm_cmpgt_epi8 (blk_widths, ns); + unrd_imask = _mm_or_si128 (unrd_imask, is_left); + __m128i unrd_mask = _mm_cmpeq_epi8 (unrd_imask, _mm_setzero_si128()); + + __m128i b_unread = _mm_blendv_epi8(borderpx_vec_b, a, unrd_mask); + __m128i d_unread = _mm_blendv_epi8(borderpx_vec_d, c, unrd_mask); + __m128i f_unread = _mm_blendv_epi8(borderpx_vec_f, e, unrd_mask); + __m128i h_unread = _mm_blendv_epi8(borderpx_vec_h, g, unrd_mask); + + __m128i sad_ab = _mm_sad_epu8 (a, b_unread); + __m128i sad_cd = _mm_sad_epu8 (c, d_unread); + __m128i sad_ef = _mm_sad_epu8 (e, f_unread); + __m128i sad_gh = _mm_sad_epu8 (g, h_unread); + + sse_inc = _mm_add_epi64(sse_inc, sad_ab); + sse_inc = _mm_add_epi64(sse_inc, sad_cd); + sse_inc = _mm_add_epi64(sse_inc, sad_ef); + sse_inc = _mm_add_epi64(sse_inc, sad_gh); + } + int32_t a_off = outside_width & is_left_bm; + int32_t leftoff_with_sign_neg = (left_offset ^ is_left_bm) - is_left_bm; + + __m128i old_b = borderpx_vec_b; + __m128i old_d = borderpx_vec_d; + __m128i old_f = borderpx_vec_f; + __m128i old_h = borderpx_vec_h; + + for (x = invec_lstart; x != invec_lend; x += invec_linc) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + a_off)); + __m128i c = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 1) * pic_stride + a_off)); + __m128i e = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 2) * pic_stride + a_off)); + __m128i g = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 3) * pic_stride + a_off)); + __m128i b = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 0) * ref_stride + a_off - leftoff_with_sign_neg)); + __m128i d = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 1) * ref_stride + a_off - 
leftoff_with_sign_neg)); + __m128i f = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 2) * ref_stride + a_off - leftoff_with_sign_neg)); + __m128i h = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 3) * ref_stride + a_off - leftoff_with_sign_neg)); + + __m128i b_shifted = _mm_shuffle_epi8(b, shufmask1); + __m128i d_shifted = _mm_shuffle_epi8(d, shufmask1); + __m128i f_shifted = _mm_shuffle_epi8(f, shufmask1); + __m128i h_shifted = _mm_shuffle_epi8(h, shufmask1); + + __m128i b_with_old = _mm_blendv_epi8 (old_b, b_shifted, move_old_to_b_imask); + __m128i d_with_old = _mm_blendv_epi8 (old_d, d_shifted, move_old_to_b_imask); + __m128i f_with_old = _mm_blendv_epi8 (old_f, f_shifted, move_old_to_b_imask); + __m128i h_with_old = _mm_blendv_epi8 (old_h, h_shifted, move_old_to_b_imask); + + uint8_t startoff = (x << vec_width_log2) + a_off; + __m128i startoffs = _mm_set1_epi8 (startoff); + __m128i curr_ns = _mm_add_epi8 (startoffs, nslo); + __m128i unrd_imask = _mm_cmpgt_epi8 (blk_widths, curr_ns); + __m128i unrd_mask = _mm_cmpeq_epi8 (unrd_imask, _mm_setzero_si128()); + + __m128i b_unread = _mm_blendv_epi8 (b_with_old, a, unrd_mask); + __m128i d_unread = _mm_blendv_epi8 (d_with_old, c, unrd_mask); + __m128i f_unread = _mm_blendv_epi8 (f_with_old, e, unrd_mask); + __m128i h_unread = _mm_blendv_epi8 (h_with_old, g, unrd_mask); + + old_b = b_shifted; + old_d = d_shifted; + old_f = f_shifted; + old_h = h_shifted; + + __m128i sad_ab = _mm_sad_epu8(a, b_unread); + __m128i sad_cd = _mm_sad_epu8(c, d_unread); + __m128i sad_ef = _mm_sad_epu8(e, f_unread); + __m128i sad_gh = _mm_sad_epu8(g, h_unread); + + sse_inc = _mm_add_epi64(sse_inc, sad_ab); + sse_inc = _mm_add_epi64(sse_inc, sad_cd); + sse_inc = _mm_add_epi64(sse_inc, sad_ef); + sse_inc = _mm_add_epi64(sse_inc, sad_gh); + } + } + if (height_residual_lines) { + for (; y < height; y++) { + __m128i borderpx_vec = _mm_set1_epi8(ref_data[(int32_t)((y + 0) * ref_stride + border_off)]); + for (x = 0; x < outside_vecs; x++) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + outvec_offset)); + + __m128i startoffs = _mm_set1_epi8 ((x + inside_vecs) << vec_width_log2); + __m128i ns = _mm_add_epi8 (startoffs, nslo); + + // Unread imask is (is_left NOR unrd_imask_for_right), do the maths etc + __m128i unrd_imask = _mm_cmpgt_epi8 (blk_widths, ns); + unrd_imask = _mm_or_si128 (unrd_imask, is_left); + __m128i unrd_mask = _mm_cmpeq_epi8 (unrd_imask, _mm_setzero_si128()); + __m128i b_unread = _mm_blendv_epi8(borderpx_vec, a, unrd_mask); + + __m128i sad_ab = _mm_sad_epu8 (a, b_unread); + sse_inc = _mm_add_epi64(sse_inc, sad_ab); + } + int32_t a_off = outside_width & is_left_bm; + int32_t leftoff_with_sign_neg = (left_offset ^ is_left_bm) - is_left_bm; + + __m128i old_b = borderpx_vec; + for (x = invec_lstart; x != invec_lend; x += invec_linc) { + __m128i a = _mm_loadu_si128((__m128i *)(pic_data + x * vec_width + (y + 0) * pic_stride + a_off)); + __m128i b = _mm_loadu_si128((__m128i *)(ref_data + x * vec_width + (y + 0) * ref_stride + a_off - leftoff_with_sign_neg)); + + __m128i b_shifted = _mm_shuffle_epi8(b, shufmask1); + __m128i b_with_old = _mm_blendv_epi8 (old_b, b_shifted, move_old_to_b_imask); + + uint8_t startoff = (x << vec_width_log2) + a_off; + __m128i startoffs = _mm_set1_epi8 (startoff); + __m128i curr_ns = _mm_add_epi8 (startoffs, nslo); + __m128i unrd_imask = _mm_cmpgt_epi8 (blk_widths, curr_ns); + __m128i unrd_mask = _mm_cmpeq_epi8 (unrd_imask, _mm_setzero_si128()); + __m128i b_unread = 
_mm_blendv_epi8 (b_with_old, a, unrd_mask); + + old_b = b_shifted; + + __m128i sad_ab = _mm_sad_epu8(a, b_unread); + sse_inc = _mm_add_epi64(sse_inc, sad_ab); + } + } + } + __m128i sse_inc_2 = _mm_shuffle_epi32(sse_inc, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i sad = _mm_add_epi64 (sse_inc, sse_inc_2); + return _mm_cvtsi128_si32(sad); +} + +#endif
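The hor_sad_sse41_* kernels above vectorize the border extrapolation described in the comment block: any reference pixel that falls outside the frame is replaced by the nearest valid border pixel on the same scanline before the absolute differences are summed. As a point of reference, the scalar computation they correspond to looks roughly like the sketch below (plain C with uint8_t pixels instead of kvz_pixel; it assumes the block is not entirely outside the frame, i.e. left < width and right < width, so it does not model the "dirty hack" full-overhang path):

#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of the horizontal-border SAD: clamp the reference column to
 * the last valid pixel on each side, then accumulate absolute differences. */
static uint32_t hor_sad_scalar(const uint8_t *pic_data, const uint8_t *ref_data,
                               int32_t width, int32_t height,
                               uint32_t pic_stride, uint32_t ref_stride,
                               int32_t left, int32_t right)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < height; y++) {
    for (int32_t x = 0; x < width; x++) {
      int32_t rx = x;
      if (rx < left)              rx = left;              /* left overhang  */
      if (rx > width - 1 - right) rx = width - 1 - right; /* right overhang */
      sad += abs((int)pic_data[y * pic_stride + x] -
                 (int)ref_data[y * ref_stride + rx]);
    }
  }
  return sad;
}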
kvazaar-1.2.0.tar.gz/src/strategies/strategies-dct.c -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-dct.c
Changed
@@ -55,22 +55,23 @@ /** -* \brief Get a function that calculates SAD for NxN block. -* -* \param n Width of the region for which SAD is calculated. -* -* \returns Pointer to cost_16bit_nxn_func. -*/ -dct_func * kvz_get_dct_func(int8_t width, int32_t mode) + * \brief Get a function that performs the transform for a block. + * + * \param width Width of the region + * \param color Color plane + * \param type Prediction type + * + * \returns Pointer to the function. + */ +dct_func * kvz_get_dct_func(int8_t width, color_t color, cu_type_t type) { switch (width) { case 4: - switch (mode){ - case 65535: - return kvz_dct_4x4; - default: + if (color == COLOR_Y && type == CU_INTRA) { return kvz_fast_forward_dst_4x4; - } + } else { + return kvz_dct_4x4; + } case 8: return kvz_dct_8x8; case 16: @@ -83,21 +84,22 @@ } /** -* \brief Get a function that calculates SAD for NxN block. -* -* \param n Width of the region for which SAD is calculated. -* -* \returns Pointer to cost_16bit_nxn_func. -*/ -dct_func * kvz_get_idct_func(int8_t width, int32_t mode) + * \brief Get a function that performs the inverse transform for a block. + * + * \param width Width of the region + * \param color Color plane + * \param type Prediction type + * + * \returns Pointer to the function. + */ +dct_func * kvz_get_idct_func(int8_t width, color_t color, cu_type_t type) { switch (width) { case 4: - switch (mode){ - case 65535: - return kvz_idct_4x4; - default: + if (color == COLOR_Y && type == CU_INTRA) { return kvz_fast_inverse_dst_4x4; + } else { + return kvz_idct_4x4; } case 8: return kvz_idct_8x8;
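The change above drops the old magic mode value (65535) in favour of explicit color and prediction-type arguments, so the 4x4 DST is selected only for intra luma blocks and everything else gets the plain DCT of matching width. A usage sketch under that reading (COLOR_U and CU_INTER are assumed to come from kvazaar's cu.h, which strategies-dct.h now includes):

#include "strategies/strategies-dct.h"

/* Illustrative only: which transform the selector is expected to return
 * for a few block types under the new signature. */
static void pick_transform_examples(void)
{
  dct_func *dst4 = kvz_get_dct_func(4, COLOR_Y, CU_INTRA); /* kvz_fast_forward_dst_4x4 */
  dct_func *dct4 = kvz_get_dct_func(4, COLOR_U, CU_INTRA); /* kvz_dct_4x4 */
  dct_func *dct8 = kvz_get_dct_func(8, COLOR_Y, CU_INTER); /* kvz_dct_8x8 */
  (void)dst4; (void)dct4; (void)dct8;
}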
kvazaar-1.2.0.tar.gz/src/strategies/strategies-dct.h -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-dct.h
Changed
@@ -27,7 +27,7 @@ */ #include "global.h" // IWYU pragma: keep - +#include "cu.h" typedef unsigned (dct_func)(int8_t bitdepth, const int16_t *input, int16_t *output); @@ -49,8 +49,9 @@ int kvz_strategy_register_dct(void* opaque, uint8_t bitdepth); -dct_func * kvz_get_dct_func(int8_t width, int32_t mode); -dct_func * kvz_get_idct_func(int8_t width, int32_t mode); +dct_func * kvz_get_dct_func(int8_t width, color_t color, cu_type_t type); +dct_func * kvz_get_idct_func(int8_t width, color_t color, cu_type_t type); + #define STRATEGIES_DCT_EXPORTS \
kvazaar-1.3.0.tar.gz/src/strategies/strategies-encode.c
Added
@@ -0,0 +1,41 @@ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +#include "strategies/strategies-encode.h" + +#include "strategies/avx2/encode_coding_tree-avx2.h" +#include "strategies/generic/encode_coding_tree-generic.h" +#include "strategyselector.h" + + +// Define function pointers. +encode_coeff_nxn_func *kvz_encode_coeff_nxn; + + +int kvz_strategy_register_encode(void* opaque, uint8_t bitdepth) { + bool success = true; + + success &= kvz_strategy_register_encode_generic(opaque, bitdepth); + + if (kvz_g_hardware_flags.intel_flags.avx2) { + success &= kvz_strategy_register_encode_avx2(opaque, bitdepth); + } + return success; +}
kvazaar-1.3.0.tar.gz/src/strategies/strategies-encode.h
Added
@@ -0,0 +1,56 @@ +#ifndef STRATEGIES_ENCODE_H_ +#define STRATEGIES_ENCODE_H_ +/***************************************************************************** + * This file is part of Kvazaar HEVC encoder. + * + * Copyright (C) 2013-2015 Tampere University of Technology and others (see + * COPYING file). + * + * Kvazaar is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the + * Free Software Foundation; either version 2.1 of the License, or (at your + * option) any later version. + * + * Kvazaar is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with Kvazaar. If not, see <http://www.gnu.org/licenses/>. + ****************************************************************************/ + +/** + * \ingroup Optimization + * \file + * Interface for quantization functions. + */ + +#include "cu.h" +#include "encoderstate.h" +#include "global.h" // IWYU pragma: keep +#include "kvazaar.h" +#include "tables.h" + + +// Declare function pointers. +typedef unsigned (encode_coeff_nxn_func)(encoder_state_t * const state, + cabac_data_t * const cabac, + const coeff_t *coeff, + uint8_t width, + uint8_t type, + int8_t scan_mode, + int8_t tr_skip); + +// Declare function pointers. +extern encode_coeff_nxn_func *kvz_encode_coeff_nxn; + +int kvz_strategy_register_encode(void* opaque, uint8_t bitdepth); + + +#define STRATEGIES_ENCODE_EXPORTS \ + {"encode_coeff_nxn", (void**) &kvz_encode_coeff_nxn}, \ + + + +#endif //STRATEGIES_ENCODE_H_
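The new strategies-encode.c/h pair follows the same strategy pattern as the rest of this directory: a global function pointer (kvz_encode_coeff_nxn), a *_EXPORTS table of name/pointer pairs, and a register function that installs the generic implementation and then lets the AVX2 version override it when the CPU flag is set. A minimal standalone sketch of that dispatch idea is below; the names are hypothetical, and the real code routes registration through strategyselector.h and the opaque handle rather than a local table:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

typedef int (demo_func)(int);

static int demo_generic(int x) { return x + 1; }
static int demo_avx2(int x)    { return x + 2; }  /* pretend this one is vectorized */

static demo_func *demo_ptr;  /* analogous to kvz_encode_coeff_nxn */

typedef struct { const char *name; void **fptr; } export_t;
static export_t exports[] = { { "demo", (void **)&demo_ptr } };

static bool register_impl(const char *name, void *impl)
{
  for (size_t i = 0; i < sizeof(exports) / sizeof(exports[0]); i++) {
    if (strcmp(exports[i].name, name) == 0) { *exports[i].fptr = impl; return true; }
  }
  return false;
}

int main(void)
{
  bool have_avx2 = false;                       /* stand-in for kvz_g_hardware_flags */
  register_impl("demo", (void *)demo_generic);  /* generic version always registered  */
  if (have_avx2) register_impl("demo", (void *)demo_avx2);
  printf("%d\n", demo_ptr(41));                 /* dispatches through the selected strategy */
  return 0;
}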
kvazaar-1.2.0.tar.gz/src/strategies/strategies-ipol.c -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-ipol.c
Changed
@@ -26,10 +26,10 @@ // Define function pointers. -ipol_func *kvz_filter_inter_quarterpel_luma; -ipol_func *kvz_filter_inter_halfpel_chroma; -ipol_func *kvz_filter_inter_octpel_chroma; -ipol_frac_blocks_func *kvz_filter_frac_blocks_luma; +ipol_blocks_func * kvz_filter_hpel_blocks_hor_ver_luma; +ipol_blocks_func * kvz_filter_hpel_blocks_diag_luma; +ipol_blocks_func * kvz_filter_qpel_blocks_hor_ver_luma; +ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma; epol_func *kvz_get_extended_block; kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma; kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma;
kvazaar-1.2.0.tar.gz/src/strategies/strategies-ipol.h -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-ipol.h
Changed
@@ -34,11 +34,9 @@ typedef struct { kvz_pixel *buffer; kvz_pixel *orig_topleft; unsigned stride; unsigned malloc_used; } kvz_extended_block; -typedef unsigned(ipol_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height, kvz_pixel *dst, - int16_t dst_stride, int8_t hor_flag, int8_t ver_flag); - -typedef unsigned(ipol_frac_blocks_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height, - frac_search_block filtered_out[15], int8_t fme_level); +typedef void(ipol_blocks_func)(const encoder_control_t * encoder, kvz_pixel *src, int16_t src_stride, int width, int height, + kvz_pixel filtered[4][LCU_WIDTH * LCU_WIDTH], int16_t hor_intermediate[5][(KVZ_EXT_BLOCK_W_LUMA + 1) * LCU_WIDTH], int8_t fme_level, int16_t hor_first_cols[5][KVZ_EXT_BLOCK_W_LUMA + 1], + int8_t sample_off_x, int8_t sample_off_y); typedef unsigned(epol_func)(int xpos, int ypos, int mv_x, int mv_y, int off_x, int off_y, kvz_pixel *ref, int ref_width, int ref_height, int filter_size, int width, int height, kvz_extended_block *out); @@ -50,10 +48,10 @@ typedef void(kvz_sample_14bit_octpel_chroma_func)(const encoder_control_t * const encoder, kvz_pixel *src, int16_t src_stride, int width, int height, int16_t *dst, int16_t dst_stride, int8_t hor_flag, int8_t ver_flag, const int16_t mv[2]); // Declare function pointers. -extern ipol_func * kvz_filter_inter_quarterpel_luma; -extern ipol_func * kvz_filter_inter_halfpel_chroma; -extern ipol_func * kvz_filter_inter_octpel_chroma; -extern ipol_frac_blocks_func *kvz_filter_frac_blocks_luma; +extern ipol_blocks_func * kvz_filter_hpel_blocks_hor_ver_luma; +extern ipol_blocks_func * kvz_filter_hpel_blocks_diag_luma; +extern ipol_blocks_func * kvz_filter_qpel_blocks_hor_ver_luma; +extern ipol_blocks_func * kvz_filter_qpel_blocks_diag_luma; extern epol_func * kvz_get_extended_block; extern kvz_sample_quarterpel_luma_func * kvz_sample_quarterpel_luma; extern kvz_sample_octpel_chroma_func * kvz_sample_octpel_chroma; @@ -65,10 +63,10 @@ #define STRATEGIES_IPOL_EXPORTS \ - {"filter_inter_quarterpel_luma", (void**) &kvz_filter_inter_quarterpel_luma}, \ - {"filter_inter_halfpel_chroma", (void**) &kvz_filter_inter_halfpel_chroma}, \ - {"filter_inter_octpel_chroma", (void**) &kvz_filter_inter_octpel_chroma}, \ - {"filter_frac_blocks_luma", (void**) &kvz_filter_frac_blocks_luma}, \ + {"filter_hpel_blocks_hor_ver_luma", (void**) &kvz_filter_hpel_blocks_hor_ver_luma}, \ + {"filter_hpel_blocks_diag_luma", (void**) &kvz_filter_hpel_blocks_diag_luma}, \ + {"filter_qpel_blocks_hor_ver_luma", (void**) &kvz_filter_qpel_blocks_hor_ver_luma}, \ + {"filter_qpel_blocks_diag_luma", (void**) &kvz_filter_qpel_blocks_diag_luma}, \ {"sample_quarterpel_luma", (void**) &kvz_sample_quarterpel_luma}, \ {"sample_octpel_chroma", (void**) &kvz_sample_octpel_chroma}, \ {"sample_14bit_quarterpel_luma", (void**) &kvz_sample_14bit_quarterpel_luma}, \
kvazaar-1.2.0.tar.gz/src/strategies/strategies-picture.c -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-picture.c
Changed
@@ -61,6 +61,12 @@ pixels_calc_ssd_func * kvz_pixels_calc_ssd = 0; +inter_recon_bipred_func * kvz_inter_recon_bipred_blend = 0; + +get_optimized_sad_func *kvz_get_optimized_sad = 0; +ver_sad_func *kvz_ver_sad = 0; +hor_sad_func *kvz_hor_sad = 0; + int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth) { bool success = true;
kvazaar-1.2.0.tar.gz/src/strategies/strategies-picture.h -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-picture.h
Changed
@@ -28,11 +28,12 @@ #include "global.h" // IWYU pragma: keep #include "kvazaar.h" +#include "encoderstate.h" +#include "strategies/optimized_sad_func_ptr_t.h" typedef kvz_pixel (*pred_buffer)[32 * 32]; - // Function macro for defining hadamard calculating functions // for fixed size blocks. They calculate hadamard for integer // multiples of 8x8 with the 8x8 hadamard function. @@ -108,9 +109,33 @@ const kvz_pixel *block2, int stride2 ); typedef void (cost_pixel_nxn_multi_func)(const pred_buffer preds, const kvz_pixel *orig, unsigned num_modes, unsigned *costs_out); -typedef void (cost_pixel_any_size_multi_func)(int width, int height, const kvz_pixel **preds, const int *strides, const kvz_pixel *orig, const int orig_stride, unsigned num_modes, unsigned *costs_out, int8_t *valid); +typedef void (cost_pixel_any_size_multi_func)(int width, int height, const kvz_pixel **preds, const int stride, const kvz_pixel *orig, const int orig_stride, unsigned num_modes, unsigned *costs_out, int8_t *valid); typedef unsigned (pixels_calc_ssd_func)(const kvz_pixel *const ref, const kvz_pixel *const rec, const int ref_stride, const int rec_stride, const int width); +typedef optimized_sad_func_ptr_t (get_optimized_sad_func)(int32_t); +typedef uint32_t (ver_sad_func)(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t block_width, int32_t block_height, + uint32_t pic_stride); +typedef uint32_t (hor_sad_func)(const kvz_pixel *pic_data, const kvz_pixel *ref_data, + int32_t width, int32_t height, uint32_t pic_stride, + uint32_t ref_stride, uint32_t left, uint32_t right); + +typedef void (inter_recon_bipred_func)(const int hi_prec_luma_rec0, + const int hi_prec_luma_rec1, + const int hi_prec_chroma_rec0, + const int hi_prec_chroma_rec1, + int height, + int width, + int ypos, + int xpos, + const hi_prec_buf_t*high_precision_rec0, + const hi_prec_buf_t*high_precision_rec1, + lcu_t* lcu, + kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH], + kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C], + kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C]); + + // Declare function pointers. extern reg_sad_func * kvz_reg_sad; @@ -144,6 +169,12 @@ extern pixels_calc_ssd_func *kvz_pixels_calc_ssd; +extern inter_recon_bipred_func * kvz_inter_recon_bipred_blend; + +extern get_optimized_sad_func *kvz_get_optimized_sad; +extern ver_sad_func *kvz_ver_sad; +extern hor_sad_func *kvz_hor_sad; + int kvz_strategy_register_picture(void* opaque, uint8_t bitdepth); cost_pixel_nxn_func * kvz_pixels_get_satd_func(unsigned n); cost_pixel_nxn_func * kvz_pixels_get_sad_func(unsigned n); @@ -175,6 +206,10 @@ {"satd_64x64_dual", (void**) &kvz_satd_64x64_dual}, \ {"satd_any_size_quad", (void**) &kvz_satd_any_size_quad}, \ {"pixels_calc_ssd", (void**) &kvz_pixels_calc_ssd}, \ + {"inter_recon_bipred", (void**) &kvz_inter_recon_bipred_blend}, \ + {"get_optimized_sad", (void**) &kvz_get_optimized_sad}, \ + {"ver_sad", (void**) &kvz_ver_sad}, \ + {"hor_sad", (void**) &kvz_hor_sad}, \
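Among the new pointers, ver_sad and hor_sad appear geared towards reference blocks that overhang the frame border, judging by the left/right parameters and the border extrapolation in the kernels above. The ver_sad_* kernels earlier in this revision compare every row of the picture block against one and the same reference scanline; a scalar sketch of that computation (plain C with uint8_t pixels, not kvazaar code):

#include <stdint.h>
#include <stdlib.h>

/* Scalar model of ver_sad: each picture row is matched against the single
 * reference row, i.e. the border row extrapolated over the block height. */
static uint32_t ver_sad_scalar(const uint8_t *pic_data, const uint8_t *ref_data,
                               int32_t width, int32_t height, uint32_t pic_stride)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < height; y++) {
    for (int32_t x = 0; x < width; x++) {
      sad += abs((int)pic_data[y * pic_stride + x] - (int)ref_data[x]);
    }
  }
  return sad;
}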
kvazaar-1.2.0.tar.gz/src/strategies/strategies-quant.c -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-quant.c
Changed
@@ -30,6 +30,7 @@ quant_residual_func *kvz_quantize_residual; dequant_func *kvz_dequant; coeff_abs_sum_func *kvz_coeff_abs_sum; +fast_coeff_cost_func *kvz_fast_coeff_cost; int kvz_strategy_register_quant(void* opaque, uint8_t bitdepth) {
kvazaar-1.2.0.tar.gz/src/strategies/strategies-quant.h -> kvazaar-1.3.0.tar.gz/src/strategies/strategies-quant.h
Changed
@@ -44,6 +44,7 @@ kvz_pixel *rec_out, coeff_t *coeff_out); typedef unsigned (dequant_func)(const encoder_state_t * const state, coeff_t *q_coef, coeff_t *coef, int32_t width, int32_t height, int8_t type, int8_t block_type); +typedef uint32_t (fast_coeff_cost_func)(const coeff_t *coeff, int32_t width, int32_t qp); typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length); @@ -52,6 +53,7 @@ extern quant_residual_func * kvz_quantize_residual; extern dequant_func *kvz_dequant; extern coeff_abs_sum_func *kvz_coeff_abs_sum; +extern fast_coeff_cost_func *kvz_fast_coeff_cost; int kvz_strategy_register_quant(void* opaque, uint8_t bitdepth); @@ -61,6 +63,7 @@ {"quantize_residual", (void**) &kvz_quantize_residual}, \ {"dequant", (void**) &kvz_dequant}, \ {"coeff_abs_sum", (void**) &kvz_coeff_abs_sum}, \ + {"fast_coeff_cost", (void**) &kvz_fast_coeff_cost}, \
kvazaar-1.3.0.tar.gz/src/strategies/x86_asm/x86inc.asm
Added
@@ -0,0 +1,1466 @@ +;***************************************************************************** +;* x86inc.asm: x264asm abstraction layer +;***************************************************************************** +;* Copyright (C) 2005-2014 x264 project +;* +;* Authors: Loren Merritt <lorenm@u.washington.edu> +;* Anton Mitrofanov <BugMaster@narod.ru> +;* Jason Garrett-Glaser <darkshikari@gmail.com> +;* Henrik Gramner <henrik@gramner.com> +;* +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. +;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;***************************************************************************** + +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. Send patches or ideas +; to x264-devel@videolan.org . + +%ifndef private_prefix + %define private_prefix kvz +%endif + +%ifndef public_prefix + %define public_prefix private_prefix +%endif + +%define WIN64 0 +%define UNIX64 0 +%if ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,x64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +%macro SECTION_RODATA 0-1 16 + SECTION .rodata align=%1 +%endmacro + +%macro SECTION_TEXT 0-1 16 + SECTION .text align=%1 +%endmacro + +%if WIN64 + %define PIC +%elif ARCH_X86_64 == 0 +; x86_32 doesn't require PIC. +; Some distros prefer shared objects to be PIC, but nothing breaks if +; the code contains a few textrels, so we'll skip that complexity. + %undef PIC +%endif +%ifdef PIC + default rel +%endif + +%macro CPUNOP 1 + %ifdef __YASM_MAJOR__ + CPU %1 + %endif +%endmacro + +; Always use long nops (reduces 0x90 spam in disassembly on x86_32) +CPUNOP amdnop + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. 
pushes callee-saved xmm regs if needed. +; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x, +; MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes), +; and an extra register will be allocated to hold the original stack +; pointer (to not invalidate r0m etc.). To prevent the use of an extra +; register as stack pointer, request a negative stack size. +; %4+/%5+ = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,0, dst, src, tmp +; declares a function (foo), taking two args (dst and src) and one local variable (tmp) + +; TODO Some functions can use some args directly from the stack. If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Use this instead of RET if it's a branch target. + +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif ARCH_X86_64 ; memory + %define r%1m [rstk + stack_offset + %3] + %define r%1mp qword r %+ %1 %+ m + %else + %define r%1m [rstk + stack_offset + %3] + %define r%1mp dword r %+ %1 %+ m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 +%if ARCH_X86_64 == 0 + %define r%1 e%1 +%endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +%macro PUSH 1 + push %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset+gprsize + %endif +%endmacro + +%macro POP 1 + pop %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset-gprsize + %endif +%endmacro + +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif 
+%endmacro + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assert failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, h + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m + CAT_UNDEF arg_name %+ %%i, mp + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %xdefine %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine + %assign %%i 0 + %rep %0 + %xdefine %1q r %+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1h r %+ %%i %+ h + %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m + %xdefine %1mp r %+ %%i %+ mp + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + %rotate 1 + %endrep + %xdefine stack_offset %%stack_offset + %assign n_arg_names %0 +%endmacro + +%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) + %ifnum %1 + %if %1 != 0 + %assign %%stack_alignment ((mmsize + 15) & ~15) + %assign stack_size %1 + %if stack_size < 0 + %assign stack_size -stack_size + %endif + %assign stack_size_padded stack_size + %if WIN64 + %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space + %if mmsize != 8 + %assign xmm_regs_used %2 + %if xmm_regs_used > 8 + %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16 + %endif + %endif + %endif + %if mmsize <= 16 && HAVE_ALIGNED_STACK + %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) + SUB rsp, stack_size_padded + %else + %assign %%reg_num (regs_used - 1) + %xdefine rstk r %+ %%reg_num + ; align stack, and save original stack location directly above + ; it, i.e. in [rsp+stack_size_padded], so we can restore the + ; stack in a single instruction (i.e. mov rsp, rstk or mov + ; rsp, [rsp+stack_size_padded]) + mov rstk, rsp + %if %1 < 0 ; need to store rsp on stack + sub rsp, gprsize+stack_size_padded + and rsp, ~(%%stack_alignment-1) + %xdefine rstkm [rsp+stack_size_padded] + mov rstkm, rstk + %else ; can keep rsp in rstk during whole function + sub rsp, stack_size_padded + and rsp, ~(%%stack_alignment-1) + %xdefine rstkm rstk + %endif + %endif + WIN64_PUSH_XMM + %endif + %endif +%endmacro + +%macro SETUP_STACK_POINTER 1 + %ifnum %1 + %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32) + %if %1 > 0 + %assign regs_used (regs_used + 1) + %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 + %warning "Stack pointer will overwrite register argument" + %endif + %endif + %endif +%endmacro + +%macro DEFINE_ARGS_INTERNAL 3+ + %ifnum %2 + DEFINE_ARGS %3 + %elif %1 == 4 + DEFINE_ARGS %2 + %elif %1 > 4 + DEFINE_ARGS %2, %3 + %endif +%endmacro + +%if WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R12, 96 +DECLARE_REG 12, R13, 104 +DECLARE_REG 13, R14, 112 +DECLARE_REG 14, R15, 120 + +%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4, %3 + %if mmsize != 8 && stack_size == 0 + WIN64_SPILL_XMM %3 + %endif + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%macro WIN64_PUSH_XMM 0 + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. + %if xmm_regs_used > 6 + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if xmm_regs_used > 7 + movaps [rstk + stack_offset + 24], xmm7 + %endif + %if xmm_regs_used > 8 + %assign %%i 8 + %rep xmm_regs_used-8 + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + %if xmm_regs_used > 8 + %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32 + SUB rsp, stack_size_padded + %endif + WIN64_PUSH_XMM +%endmacro + +%macro WIN64_RESTORE_XMM_INTERNAL 1 + %assign %%pad_size 0 + %if xmm_regs_used > 8 + %assign %%i xmm_regs_used + %rep xmm_regs_used-8 + %assign %%i %%i-1 + movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] + %endrep + %endif + %if stack_size_padded > 0 + %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0) + mov rsp, rstkm + %else + add %1, stack_size_padded + %assign %%pad_size stack_size_padded + %endif + %endif + %if xmm_regs_used > 7 + movaps xmm7, [%1 + stack_offset - %%pad_size + 24] + %endif + %if xmm_regs_used > 6 + movaps xmm6, [%1 + stack_offset - %%pad_size + 8] + %endif +%endmacro + +%macro WIN64_RESTORE_XMM 1 + WIN64_RESTORE_XMM_INTERNAL %1 + %assign stack_offset (stack_offset-stack_size_padded) + %assign xmm_regs_used 0 +%endmacro + +%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + WIN64_RESTORE_XMM_INTERNAL rsp + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 +%if mmsize == 32 + vzeroupper +%endif + AUTO_REP_RET +%endmacro + +%elif ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi +DECLARE_REG 1, rsi +DECLARE_REG 2, rdx +DECLARE_REG 3, rcx +DECLARE_REG 4, R8 +DECLARE_REG 5, R9 +DECLARE_REG 6, rax, 8 +DECLARE_REG 7, R10, 16 +DECLARE_REG 8, R11, 24 +DECLARE_REG 9, rbx, 32 +DECLARE_REG 10, rbp, 40 +DECLARE_REG 11, R12, 48 +DECLARE_REG 12, R13, 56 +DECLARE_REG 13, R14, 64 +DECLARE_REG 14, R15, 72 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 + +%macro RET 0 +%if stack_size_padded > 0 +%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 + mov rsp, rstkm +%else + add rsp, stack_size_padded +%endif +%endif + POP_IF_USED 14, 13, 12, 11, 10, 9 +%if mmsize == 32 + vzeroupper +%endif + AUTO_REP_RET +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, 4 +DECLARE_REG 1, ecx, 8 +DECLARE_REG 2, edx, 12 +DECLARE_REG 3, ebx, 16 +DECLARE_REG 4, esi, 20 +DECLARE_REG 5, edi, 24 +DECLARE_REG 6, ebp, 28 +%define rsp esp + +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [rstk + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep +%endmacro + +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + %if num_args > 7 + %assign num_args 7 + %endif + %if regs_used > 7 + %assign regs_used 7 + %endif + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 7 + PUSH_IF_USED 3, 4, 5, 6 + ALLOC_STACK %4 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 + +%macro RET 0 +%if stack_size_padded > 0 +%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 + mov rsp, rstkm +%else + add rsp, stack_size_padded +%endif +%endif + POP_IF_USED 6, 5, 4, 3 +%if mmsize == 32 + vzeroupper +%endif + AUTO_REP_RET +%endmacro + +%endif ;====================================================================== + +%if WIN64 == 0 +%macro WIN64_SPILL_XMM 1 +%endmacro +%macro WIN64_RESTORE_XMM 1 +%endmacro +%macro WIN64_PUSH_XMM 0 +%endmacro +%endif + +; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either +; a branch or a branch target. So switch to a 2-byte form of ret in that case. +; We can automatically detect "follows a branch", but not a branch target. +; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) +%macro REP_RET 0 + %if has_epilogue + RET + %else + rep ret + %endif +%endmacro + +%define last_branch_adr $$ +%macro AUTO_REP_RET 0 + %ifndef cpuflags + times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr. + %elif notcpuflag(ssse3) + times ((last_branch_adr-$)>>31)+1 rep + %endif + ret +%endmacro + +%macro BRANCH_INSTR 0-* + %rep %0 + %macro %1 1-2 %1 + %2 %1 + %%branch_instr: + %xdefine last_branch_adr %%branch_instr + %endmacro + %rotate 1 + %endrep +%endmacro + +BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp + +%macro TAIL_CALL 2 ; callee, is_nonadjacent + %if has_epilogue + call %1 + RET + %elif %2 + jmp %1 + %endif +%endmacro + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Begin a function. +; Applies any symbol mangling needed for C linkage, and sets up a define such that +; subsequent uses of the function name automatically refer to the mangled version. 
+; Appends cpuflags to the function name if cpuflags has been specified. +; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX +; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). +%macro cglobal 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 1, %1 %+ SUFFIX, %2 +%endmacro +%macro cvisible 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 0, %1 %+ SUFFIX, %2 +%endmacro +%macro cglobal_internal 2-3+ + %if %1 + %xdefine %%FUNCTION_PREFIX private_prefix + %xdefine %%VISIBILITY hidden + %else + %xdefine %%FUNCTION_PREFIX public_prefix + %xdefine %%VISIBILITY + %endif + %ifndef cglobaled_%2 + %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) + %xdefine %2.skip_prologue %2 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %2, 1 + %endif + %xdefine current_function %2 + %ifidn __OUTPUT_FORMAT__,elf + global %2:function %%VISIBILITY + %else + global %2 + %endif + align function_align + %2: + RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer + %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required + %assign stack_offset 0 ; stack pointer offset relative to the return address + %assign stack_size 0 ; amount of stack space that can be freely used inside a function + %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 + %ifnidn %3, "" + PROLOGUE %3 + %endif +%endmacro + +%macro cextern 1 + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +; like cextern, but without the prefix +%macro cextern_naked 1 + %xdefine %1 mangle(%1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +%macro const 1-2+ + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + %ifidn __OUTPUT_FORMAT__,elf + global %1:data hidden + %else + global %1 + %endif + %1: %2 +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is +; executable by default. +%ifidn __OUTPUT_FORMAT__,elf +SECTION .note.GNU-stack noalloc noexec nowrite progbits +%endif + +; cpuflags + +%assign cpuflags_mmx (1<<0) +%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx +%assign cpuflags_3dnow (1<<2) | cpuflags_mmx +%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow +%assign cpuflags_sse (1<<4) | cpuflags_mmx2 +%assign cpuflags_sse2 (1<<5) | cpuflags_sse +%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 +%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 +%assign cpuflags_avx (1<<11)| cpuflags_sse42 +%assign cpuflags_xop (1<<12)| cpuflags_avx +%assign cpuflags_fma4 (1<<13)| cpuflags_avx +%assign cpuflags_avx2 (1<<14)| cpuflags_avx +%assign cpuflags_fma3 (1<<15)| cpuflags_avx + +%assign cpuflags_cache32 (1<<16) +%assign cpuflags_cache64 (1<<17) +%assign cpuflags_slowctz (1<<18) +%assign cpuflags_lzcnt (1<<19) +%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<21) +%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 + +%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) +%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) + +; Takes up to 2 cpuflags from the above list. 
+; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. +; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. +%macro INIT_CPUFLAGS 0-2 + CPUNOP amdnop + %if %0 >= 1 + %xdefine cpuname %1 + %assign cpuflags cpuflags_%1 + %if %0 >= 2 + %xdefine cpuname %1_%2 + %assign cpuflags cpuflags | cpuflags_%2 + %endif + %xdefine SUFFIX _ %+ cpuname + %if cpuflag(avx) + %assign avx_enabled 1 + %endif + %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) + %define mova movaps + %define movu movups + %define movnta movntps + %endif + %if cpuflag(aligned) + %define movu mova + %elifidn %1, sse3 + %define movu lddqu + %endif + %if ARCH_X86_64 == 0 && notcpuflag(sse2) + CPUNOP basicnop + %endif + %else + %xdefine SUFFIX + %undef cpuname + %undef cpuflags + %endif +%endmacro + +; Merge mmx and sse* +; m# is a simd register of the currently selected size +; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# +; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# +; (All 3 remain in sync through SWAP.) + +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro INIT_MMX 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_MMX %1 + %define mmsize 8 + %define num_mmregs 8 + %define mova movq + %define movu movq + %define movh movd + %define movnta movntq + %assign %%i 0 + %rep 8 + CAT_XDEFINE m, %%i, mm %+ %%i + CAT_XDEFINE nmm, %%i, %%i + %assign %%i %%i+1 + %endrep + %rep 8 + CAT_UNDEF m, %%i + CAT_UNDEF nmm, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_XMM 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_XMM %1 + %define mmsize 16 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, xmm %+ %%i + CAT_XDEFINE nxmm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_YMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_YMM %1 + %define mmsize 32 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, ymm %+ %%i + CAT_XDEFINE nymm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +INIT_XMM + +%macro DECLARE_MMCAST 1 + %define mmmm%1 mm%1 + %define mmxmm%1 mm%1 + %define mmymm%1 mm%1 + %define xmmmm%1 mm%1 + %define xmmxmm%1 xmm%1 + %define xmmymm%1 xmm%1 + %define ymmmm%1 mm%1 + %define ymmxmm%1 xmm%1 + %define ymmymm%1 ymm%1 + %define xm%1 xmm %+ m%1 + %define ym%1 ymm %+ m%1 +%endmacro + +%assign i 0 +%rep 16 + DECLARE_MMCAST i +%assign i i+1 +%endrep + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. +; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. 
It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. + +%macro PERMUTE 2-* ; takes a list of pairs to swap +%rep %0/2 + %xdefine %%tmp%2 m%2 + %rotate 2 +%endrep +%rep %0/2 + %xdefine m%1 %%tmp%2 + CAT_XDEFINE n, m%1, %1 + %rotate 2 +%endrep +%endmacro + +%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) +%ifnum %1 ; SWAP 0, 1, ... + SWAP_INTERNAL_NUM %1, %2 +%else ; SWAP m0, m1, ... + SWAP_INTERNAL_NAME %1, %2 +%endif +%endmacro + +%macro SWAP_INTERNAL_NUM 2-* + %rep %0-1 + %xdefine %%tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 %%tmp + CAT_XDEFINE n, m%1, %1 + CAT_XDEFINE n, m%2, %2 + %rotate 1 + %endrep +%endmacro + +%macro SWAP_INTERNAL_NAME 2-* + %xdefine %%args n %+ %1 + %rep %0-1 + %xdefine %%args %%args, n %+ %2 + %rotate 1 + %endrep + SWAP_INTERNAL_NUM %%args +%endmacro + +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later +; calls to that function will automatically load the permutation, so values can +; be returned in mmregs. +%macro SAVE_MM_PERMUTATION 0-1 + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE %%f, %%i, m %+ %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 1 ; name to load from + %ifdef %1_m0 + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1_m %+ %%i + CAT_XDEFINE n, m %+ %%i, %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't +%macro call 1 + call_internal %1, %1 %+ SUFFIX +%endmacro +%macro call_internal 2 + %xdefine %%i %1 + %ifndef cglobaled_%1 + %ifdef cglobaled_%2 + %xdefine %%i %2 + %endif + %endif + call %%i + LOAD_MM_PERMUTATION %%i +%endmacro + +; Substitutions that reduce instruction size but are functionally equivalent +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + %else + sub %1, %2 + %endif +%endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%assign i 0 +%rep 16 + %if i < 8 + CAT_XDEFINE sizeofmm, i, 8 + %endif + CAT_XDEFINE sizeofxmm, i, 16 + CAT_XDEFINE sizeofymm, i, 32 +%assign i i+1 +%endrep +%undef i + +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-avx emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + +;%1 == instruction +;%2 == 1 if float, 0 if int +;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise +;%4 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not +;%5+: operands +%macro RUN_AVX_INSTR 5-8+ + %ifnum sizeof%6 + %assign %%sizeofreg sizeof%6 + %elifnum sizeof%5 + %assign %%sizeofreg sizeof%5 + %else + %assign %%sizeofreg mmsize + %endif + %assign %%emulate_avx 0 + %if avx_enabled && %%sizeofreg >= 16 + %xdefine %%instr v%1 + %else + %xdefine %%instr %1 + %if %0 >= 7+%3 + %assign %%emulate_avx 1 + %endif + %endif + + %if %%emulate_avx + %xdefine %%src1 %6 + %xdefine %%src2 %7 + %ifnidn %5, %6 + %if %0 >= 8 + CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8 + %else + CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7 + %endif + %if %4 && %3 == 0 + %ifnid %7 + ; 3-operand AVX instructions with a memory arg can only have it in src2, + ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). + ; So, if the instruction is commutative with a memory arg, swap them. + %xdefine %%src1 %7 + %xdefine %%src2 %6 + %endif + %endif + %if %%sizeofreg == 8 + MOVQ %5, %%src1 + %elif %2 + MOVAPS %5, %%src1 + %else + MOVDQA %5, %%src1 + %endif + %endif + %if %0 >= 8 + %1 %5, %%src2, %8 + %else + %1 %5, %%src2 + %endif + %elif %0 >= 8 + %%instr %5, %6, %7, %8 + %elif %0 == 7 + %%instr %5, %6, %7 + %elif %0 == 6 + %%instr %5, %6 + %else + %%instr %5 + %endif +%endmacro + +;%1 == instruction +;%2 == 1 if float, 0 if int +;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise +;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 1-4 0, 1, 0 + %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4 + %ifidn %2, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %1 + %elifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +; Instructions with both VEX and non-VEX encodings +; Non-destructive instructions are written without parameters +AVX_INSTR addpd, 1, 0, 1 +AVX_INSTR addps, 1, 0, 1 +AVX_INSTR addsd, 1, 0, 1 +AVX_INSTR addss, 1, 0, 1 +AVX_INSTR addsubpd, 1, 0, 0 +AVX_INSTR addsubps, 1, 0, 0 +AVX_INSTR aesdec, 0, 0, 0 +AVX_INSTR aesdeclast, 0, 0, 0 +AVX_INSTR aesenc, 0, 0, 0 +AVX_INSTR aesenclast, 0, 0, 0 +AVX_INSTR aesimc +AVX_INSTR aeskeygenassist +AVX_INSTR andnpd, 1, 0, 0 +AVX_INSTR andnps, 1, 0, 0 +AVX_INSTR andpd, 1, 0, 1 +AVX_INSTR andps, 1, 0, 1 +AVX_INSTR blendpd, 1, 0, 0 +AVX_INSTR blendps, 1, 0, 0 +AVX_INSTR blendvpd, 1, 0, 0 +AVX_INSTR blendvps, 1, 0, 0 +AVX_INSTR cmppd, 1, 1, 0 +AVX_INSTR cmpps, 1, 1, 0 +AVX_INSTR cmpsd, 1, 1, 0 +AVX_INSTR cmpss, 1, 1, 0 +AVX_INSTR comisd +AVX_INSTR comiss +AVX_INSTR cvtdq2pd +AVX_INSTR cvtdq2ps +AVX_INSTR cvtpd2dq +AVX_INSTR cvtpd2ps +AVX_INSTR cvtps2dq +AVX_INSTR cvtps2pd +AVX_INSTR cvtsd2si +AVX_INSTR cvtsd2ss +AVX_INSTR cvtsi2sd +AVX_INSTR cvtsi2ss +AVX_INSTR cvtss2sd +AVX_INSTR cvtss2si +AVX_INSTR cvttpd2dq +AVX_INSTR cvttps2dq +AVX_INSTR cvttsd2si +AVX_INSTR cvttss2si +AVX_INSTR divpd, 1, 0, 0 +AVX_INSTR divps, 1, 0, 0 +AVX_INSTR divsd, 1, 0, 0 +AVX_INSTR divss, 1, 0, 0 +AVX_INSTR dppd, 1, 1, 0 +AVX_INSTR dpps, 1, 1, 0 +AVX_INSTR extractps +AVX_INSTR haddpd, 1, 0, 0 +AVX_INSTR haddps, 1, 0, 0 +AVX_INSTR hsubpd, 1, 0, 0 +AVX_INSTR hsubps, 1, 0, 0 +AVX_INSTR insertps, 1, 1, 0 +AVX_INSTR lddqu +AVX_INSTR ldmxcsr +AVX_INSTR maskmovdqu +AVX_INSTR maxpd, 1, 0, 1 +AVX_INSTR maxps, 1, 0, 1 +AVX_INSTR maxsd, 1, 0, 1 +AVX_INSTR maxss, 1, 0, 1 +AVX_INSTR minpd, 1, 0, 1 +AVX_INSTR minps, 1, 0, 
1 +AVX_INSTR minsd, 1, 0, 1 +AVX_INSTR minss, 1, 0, 1 +AVX_INSTR movapd +AVX_INSTR movaps +AVX_INSTR movd +AVX_INSTR movddup +AVX_INSTR movdqa +AVX_INSTR movdqu +AVX_INSTR movhlps, 1, 0, 0 +AVX_INSTR movhpd, 1, 0, 0 +AVX_INSTR movhps, 1, 0, 0 +AVX_INSTR movlhps, 1, 0, 0 +AVX_INSTR movlpd, 1, 0, 0 +AVX_INSTR movlps, 1, 0, 0 +AVX_INSTR movmskpd +AVX_INSTR movmskps +AVX_INSTR movntdq +AVX_INSTR movntdqa +AVX_INSTR movntpd +AVX_INSTR movntps +AVX_INSTR movq +AVX_INSTR movsd, 1, 0, 0 +AVX_INSTR movshdup +AVX_INSTR movsldup +AVX_INSTR movss, 1, 0, 0 +AVX_INSTR movupd +AVX_INSTR movups +AVX_INSTR mpsadbw, 0, 1, 0 +AVX_INSTR mulpd, 1, 0, 1 +AVX_INSTR mulps, 1, 0, 1 +AVX_INSTR mulsd, 1, 0, 1 +AVX_INSTR mulss, 1, 0, 1 +AVX_INSTR orpd, 1, 0, 1 +AVX_INSTR orps, 1, 0, 1 +AVX_INSTR pabsb +AVX_INSTR pabsd +AVX_INSTR pabsw +AVX_INSTR packsswb, 0, 0, 0 +AVX_INSTR packssdw, 0, 0, 0 +AVX_INSTR packuswb, 0, 0, 0 +AVX_INSTR packusdw, 0, 0, 0 +AVX_INSTR paddb, 0, 0, 1 +AVX_INSTR paddw, 0, 0, 1 +AVX_INSTR paddd, 0, 0, 1 +AVX_INSTR paddq, 0, 0, 1 +AVX_INSTR paddsb, 0, 0, 1 +AVX_INSTR paddsw, 0, 0, 1 +AVX_INSTR paddusb, 0, 0, 1 +AVX_INSTR paddusw, 0, 0, 1 +AVX_INSTR palignr, 0, 1, 0 +AVX_INSTR pand, 0, 0, 1 +AVX_INSTR pandn, 0, 0, 0 +AVX_INSTR pavgb, 0, 0, 1 +AVX_INSTR pavgw, 0, 0, 1 +AVX_INSTR pblendvb, 0, 0, 0 +AVX_INSTR pblendw, 0, 1, 0 +AVX_INSTR pclmulqdq, 0, 1, 0 +AVX_INSTR pcmpestri +AVX_INSTR pcmpestrm +AVX_INSTR pcmpistri +AVX_INSTR pcmpistrm +AVX_INSTR pcmpeqb, 0, 0, 1 +AVX_INSTR pcmpeqw, 0, 0, 1 +AVX_INSTR pcmpeqd, 0, 0, 1 +AVX_INSTR pcmpeqq, 0, 0, 1 +AVX_INSTR pcmpgtb, 0, 0, 0 +AVX_INSTR pcmpgtw, 0, 0, 0 +AVX_INSTR pcmpgtd, 0, 0, 0 +AVX_INSTR pcmpgtq, 0, 0, 0 +AVX_INSTR pextrb +AVX_INSTR pextrd +AVX_INSTR pextrq +AVX_INSTR pextrw +AVX_INSTR phaddw, 0, 0, 0 +AVX_INSTR phaddd, 0, 0, 0 +AVX_INSTR phaddsw, 0, 0, 0 +AVX_INSTR phminposuw +AVX_INSTR phsubw, 0, 0, 0 +AVX_INSTR phsubd, 0, 0, 0 +AVX_INSTR phsubsw, 0, 0, 0 +AVX_INSTR pinsrb, 0, 1, 0 +AVX_INSTR pinsrd, 0, 1, 0 +AVX_INSTR pinsrq, 0, 1, 0 +AVX_INSTR pinsrw, 0, 1, 0 +AVX_INSTR pmaddwd, 0, 0, 1 +AVX_INSTR pmaddubsw, 0, 0, 0 +AVX_INSTR pmaxsb, 0, 0, 1 +AVX_INSTR pmaxsw, 0, 0, 1 +AVX_INSTR pmaxsd, 0, 0, 1 +AVX_INSTR pmaxub, 0, 0, 1 +AVX_INSTR pmaxuw, 0, 0, 1 +AVX_INSTR pmaxud, 0, 0, 1 +AVX_INSTR pminsb, 0, 0, 1 +AVX_INSTR pminsw, 0, 0, 1 +AVX_INSTR pminsd, 0, 0, 1 +AVX_INSTR pminub, 0, 0, 1 +AVX_INSTR pminuw, 0, 0, 1 +AVX_INSTR pminud, 0, 0, 1 +AVX_INSTR pmovmskb +AVX_INSTR pmovsxbw +AVX_INSTR pmovsxbd +AVX_INSTR pmovsxbq +AVX_INSTR pmovsxwd +AVX_INSTR pmovsxwq +AVX_INSTR pmovsxdq +AVX_INSTR pmovzxbw +AVX_INSTR pmovzxbd +AVX_INSTR pmovzxbq +AVX_INSTR pmovzxwd +AVX_INSTR pmovzxwq +AVX_INSTR pmovzxdq +AVX_INSTR pmuldq, 0, 0, 1 +AVX_INSTR pmulhrsw, 0, 0, 1 +AVX_INSTR pmulhuw, 0, 0, 1 +AVX_INSTR pmulhw, 0, 0, 1 +AVX_INSTR pmullw, 0, 0, 1 +AVX_INSTR pmulld, 0, 0, 1 +AVX_INSTR pmuludq, 0, 0, 1 +AVX_INSTR por, 0, 0, 1 +AVX_INSTR psadbw, 0, 0, 1 +AVX_INSTR pshufb, 0, 0, 0 +AVX_INSTR pshufd +AVX_INSTR pshufhw +AVX_INSTR pshuflw +AVX_INSTR psignb, 0, 0, 0 +AVX_INSTR psignw, 0, 0, 0 +AVX_INSTR psignd, 0, 0, 0 +AVX_INSTR psllw, 0, 0, 0 +AVX_INSTR pslld, 0, 0, 0 +AVX_INSTR psllq, 0, 0, 0 +AVX_INSTR pslldq, 0, 0, 0 +AVX_INSTR psraw, 0, 0, 0 +AVX_INSTR psrad, 0, 0, 0 +AVX_INSTR psrlw, 0, 0, 0 +AVX_INSTR psrld, 0, 0, 0 +AVX_INSTR psrlq, 0, 0, 0 +AVX_INSTR psrldq, 0, 0, 0 +AVX_INSTR psubb, 0, 0, 0 +AVX_INSTR psubw, 0, 0, 0 +AVX_INSTR psubd, 0, 0, 0 +AVX_INSTR psubq, 0, 0, 0 +AVX_INSTR psubsb, 0, 0, 0 +AVX_INSTR psubsw, 0, 0, 0 +AVX_INSTR psubusb, 0, 0, 0 +AVX_INSTR 
psubusw, 0, 0, 0 +AVX_INSTR ptest +AVX_INSTR punpckhbw, 0, 0, 0 +AVX_INSTR punpckhwd, 0, 0, 0 +AVX_INSTR punpckhdq, 0, 0, 0 +AVX_INSTR punpckhqdq, 0, 0, 0 +AVX_INSTR punpcklbw, 0, 0, 0 +AVX_INSTR punpcklwd, 0, 0, 0 +AVX_INSTR punpckldq, 0, 0, 0 +AVX_INSTR punpcklqdq, 0, 0, 0 +AVX_INSTR pxor, 0, 0, 1 +AVX_INSTR rcpps, 1, 0, 0 +AVX_INSTR rcpss, 1, 0, 0 +AVX_INSTR roundpd +AVX_INSTR roundps +AVX_INSTR roundsd +AVX_INSTR roundss +AVX_INSTR rsqrtps, 1, 0, 0 +AVX_INSTR rsqrtss, 1, 0, 0 +AVX_INSTR shufpd, 1, 1, 0 +AVX_INSTR shufps, 1, 1, 0 +AVX_INSTR sqrtpd, 1, 0, 0 +AVX_INSTR sqrtps, 1, 0, 0 +AVX_INSTR sqrtsd, 1, 0, 0 +AVX_INSTR sqrtss, 1, 0, 0 +AVX_INSTR stmxcsr +AVX_INSTR subpd, 1, 0, 0 +AVX_INSTR subps, 1, 0, 0 +AVX_INSTR subsd, 1, 0, 0 +AVX_INSTR subss, 1, 0, 0 +AVX_INSTR ucomisd +AVX_INSTR ucomiss +AVX_INSTR unpckhpd, 1, 0, 0 +AVX_INSTR unpckhps, 1, 0, 0 +AVX_INSTR unpcklpd, 1, 0, 0 +AVX_INSTR unpcklps, 1, 0, 0 +AVX_INSTR xorpd, 1, 0, 1 +AVX_INSTR xorps, 1, 0, 1 + +; 3DNow instructions, for sharing code between AVX, SSE and 3DN +AVX_INSTR pfadd, 1, 0, 1 +AVX_INSTR pfsub, 1, 0, 0 +AVX_INSTR pfmul, 1, 0, 1 + +; base-4 constants for shuffles +%assign i 0 +%rep 256 + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) + %if j < 10 + CAT_XDEFINE q000, j, i + %elif j < 100 + CAT_XDEFINE q00, j, i + %elif j < 1000 + CAT_XDEFINE q0, j, i + %else + CAT_XDEFINE q, j, i + %endif +%assign i i+1 +%endrep +%undef i +%undef j + +%macro FMA_INSTR 3 + %macro %1 4-7 %1, %2, %3 + %if cpuflag(xop) + v%5 %1, %2, %3, %4 + %else + %6 %1, %2, %3 + %7 %1, %4 + %endif + %endmacro +%endmacro + +FMA_INSTR pmacsdd, pmulld, paddd +FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmadcswd, pmaddwd, paddd + +; convert FMA4 to FMA3 if possible +%macro FMA4_INSTR 4 + %macro %1 4-8 %1, %2, %3, %4 + %if cpuflag(fma4) + v%5 %1, %2, %3, %4 + %elifidn %1, %2 + v%6 %1, %4, %3 ; %1 = %1 * %3 + %4 + %elifidn %1, %3 + v%7 %1, %2, %4 ; %1 = %2 * %1 + %4 + %elifidn %1, %4 + v%8 %1, %2, %3 ; %1 = %2 * %3 + %1 + %else + %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported + %endif + %endmacro +%endmacro + +FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd +FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps +FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd +FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss + +FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd +FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps +FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd +FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps + +FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd +FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps +FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd +FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss + +FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd +FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps +FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd +FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss + +FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd +FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps +FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd +FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss + +; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug +%if ARCH_X86_64 == 0 +%macro vpbroadcastq 2 +%if sizeof%1 == 16 + movddup %1, %2 +%else + vbroadcastsd %1, %2 +%endif +%endmacro +%endif + +%ifidn 
__OUTPUT_FORMAT__,elf +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif
View file
kvazaar-1.2.0.tar.gz/src/strategyselector.c -> kvazaar-1.3.0.tar.gz/src/strategyselector.c
Changed
@@ -26,9 +26,6 @@ #ifdef _WIN32 #include <windows.h> -#elif MACOS -#include <sys/param.h> -#include <sys/sysctl.h> #else #include <unistd.h> #endif @@ -89,6 +86,11 @@ return 0; } + if (!kvz_strategy_register_encode(&strategies, bitdepth)) { + fprintf(stderr, "kvz_strategy_register_encode failed!\n"); + return 0; + } + while(cur_strategy_to_select->fptr) { *(cur_strategy_to_select->fptr) = strategyselector_choose_for(&strategies, cur_strategy_to_select->strategy_type); @@ -372,40 +374,67 @@ #endif // COMPILE_INTEL #if COMPILE_POWERPC -#include <fcntl.h> -#include <unistd.h> -#include <linux/auxvec.h> +# if defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD__ >= 12) +#ifdef __linux__ #include <asm/cputable.h> +#else +#include <machine/cpu.h> +#endif +#include <sys/auxv.h> -//Source: http://freevec.org/function/altivec_runtime_detection_linux static int altivec_available(void) { - int result = 0; - unsigned long buf[64]; - ssize_t count; - int fd, i; - - fd = open("/proc/self/auxv", O_RDONLY); - if (fd < 0) { - return 0; - } - // loop on reading - do { - count = read(fd, buf, sizeof(buf)); - if (count < 0) - break; - for (i=0; i < (count / sizeof(unsigned long)); i += 2) { - if (buf[i] == AT_HWCAP) { - result = !!(buf[i+1] & PPC_FEATURE_HAS_ALTIVEC); - goto out_close; - } else if (buf[i] == AT_NULL) - goto out_close; - } - } while (count == sizeof(buf)); -out_close: - close(fd); - return result; + unsigned long hwcap = 0; +#ifdef __linux__ + hwcap = getauxval(AT_HWCAP); +#else + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); +#endif + return !!(hwcap & PPC_FEATURE_HAS_ALTIVEC); } +# elif defined(__FreeBSD__) +#include <sys/types.h> +#include <sys/sysctl.h> +#include <machine/cpu.h> + +static int altivec_available(void) +{ + u_long cpu_features = 0; + size_t len = sizeof(cpu_features); + + sysctlbyname("hw.cpu_features", &cpu_features, &len, NULL, 0); + return !!(cpu_features & PPC_FEATURE_HAS_ALTIVEC); +} +# elif defined(__APPLE__) || defined(__NetBSD__) || defined(__OpenBSD__) +#include <sys/param.h> +#include <sys/sysctl.h> +#ifndef __APPLE__ +#include <machine/cpu.h> +#endif + +static int altivec_available(void) +{ + int cpu_altivec = 0; + size_t len = sizeof(cpu_altivec); +#ifdef HW_VECTORUNIT + int mib[] = { CTL_HW, HW_VECTORUNIT }; +#else + int mib[] = { CTL_MACHDEP, CPU_ALTIVEC }; +#endif + + sysctl(mib, sizeof(mib)/sizeof(mib[0]), &cpu_altivec, &len, NULL, 0); + return cpu_altivec; +} +# else +static int altivec_available(void) +{ +#if COMPILE_POWERPC_ALTIVEC + return 1; +#else + return 0; +#endif +} +# endif #endif //COMPILE_POWERPC static void set_hardware_flags(int32_t cpuid) {
View file
kvazaar-1.2.0.tar.gz/src/strategyselector.h -> kvazaar-1.3.0.tar.gz/src/strategyselector.h
Changed
@@ -95,6 +95,7 @@
 #include "strategies/strategies-quant.h"
 #include "strategies/strategies-intra.h"
 #include "strategies/strategies-sao.h"
+#include "strategies/strategies-encode.h"
 
 static const strategy_to_select_t strategies_to_select[] = {
   STRATEGIES_NAL_EXPORTS
@@ -104,6 +105,7 @@
   STRATEGIES_QUANT_EXPORTS
   STRATEGIES_INTRA_EXPORTS
   STRATEGIES_SAO_EXPORTS
+  STRATEGIES_ENCODE_EXPORTS
   { NULL, NULL },
 };
View file
kvazaar-1.2.0.tar.gz/src/threadqueue.c -> kvazaar-1.3.0.tar.gz/src/threadqueue.c
Changed
@@ -18,6 +18,7 @@
  * with Kvazaar.  If not, see <http://www.gnu.org/licenses/>.
  ****************************************************************************/
 
+#include "global.h"
 #include "threadqueue.h"
 
 #include <errno.h> // ETIMEDOUT
@@ -26,7 +27,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "global.h"
 #include "threads.h"
@@ -500,9 +500,10 @@
  */
 threadqueue_job_t *kvz_threadqueue_copy_ref(threadqueue_job_t *job)
 {
-  // The caller should have had another reference.
-  assert(job->refcount > 0);
-  KVZ_ATOMIC_INC(&job->refcount);
+  int32_t new_refcount = KVZ_ATOMIC_INC(&job->refcount);
+  // The caller should have had another reference and we added one
+  // reference so refcount should be at least 2.
+  assert(new_refcount >= 2);
   return job;
 }
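The refcount fix above asserts on the value returned by the atomic increment instead of checking job->refcount before incrementing, which closes the window in which another thread could change the count between the check and the increment. A minimal sketch of the same pattern using C11 atomics rather than Kvazaar's KVZ_ATOMIC_INC macro (job_t and copy_ref are illustrative names, not Kvazaar's):

    #include <assert.h>
    #include <stdatomic.h>

    typedef struct job_t {
        atomic_int refcount;
        /* ... payload ... */
    } job_t;

    job_t *copy_ref(job_t *job)
    {
        /* atomic_fetch_add returns the value *before* the increment, so the
           new count is old + 1; the caller must already hold one reference. */
        int new_refcount = atomic_fetch_add(&job->refcount, 1) + 1;
        assert(new_refcount >= 2);
        return job;
    }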
View file
kvazaar-1.2.0.tar.gz/src/threadqueue.h -> kvazaar-1.3.0.tar.gz/src/threadqueue.h
Changed
@@ -26,10 +26,10 @@
  * Container for worker tasks.
  */
 
-#include <pthread.h>
-
 #include "global.h" // IWYU pragma: keep
 
+#include <pthread.h>
+
 typedef struct threadqueue_job_t threadqueue_job_t;
 typedef struct threadqueue_queue_t threadqueue_queue_t;
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper
Added
+(directory)
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/LICENSE
Added
@@ -0,0 +1,5 @@
+Copyright 2019 Tampere University
+
+Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
\ No newline at end of file
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/README.md
Added
@@ -0,0 +1,6 @@
+ThreadWrapper
+=======
+Wraps pthread functions so that they actually call C++ standard functions.
+
+Only functions used by Kvazaar, an open-source HEVC encoder, are implemented.
+People are free to contribute if they implement other functions.
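The wrapper leaves Kvazaar's C sources untouched: the encoder keeps calling the POSIX thread API declared in the headers below, and on builds that use the wrapper those calls land on std::thread, std::mutex and std::condition_variable. A minimal caller-side sketch of the kind of code this supports, assuming the wrapper's pthread.h is on the include path; the worker function and names here are illustrative, not taken from Kvazaar:

    #include <pthread.h>  /* resolves to threadwrapper/include/pthread.h on such a build */
    #include <stdio.h>

    static pthread_mutex_t lock;

    static void *worker(void *arg)
    {
        pthread_mutex_lock(&lock);
        printf("hello from %s\n", (const char *)arg);
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t thread;
        pthread_mutex_init(&lock, NULL);
        pthread_create(&thread, NULL, worker, "worker 0");
        pthread_join(thread, NULL);
        pthread_mutex_destroy(&lock);
        return 0;
    }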
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/include
Added
+(directory)
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/include/pthread.h
Added
@@ -0,0 +1,53 @@
+/*
+Copyright 2019 Tampere University
+
+Permission to use, copy, modify, and/or distribute this software for
+any purpose with or without fee is hereby granted, provided that the
+above copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void* pthread_cond_t;
+typedef void* pthread_cond_t;
+typedef void* pthread_mutex_t;
+typedef void* pthread_t;
+typedef void*(voidp_voidp_func)(void*);
+
+typedef void pthread_attr_t;
+typedef void pthread_condattr_t;
+typedef void pthread_mutexattr_t;
+
+// Parameter names that have been commented away do nothing,
+// as they are always null when the functions are used in Kvazaar.
+
+int pthread_cond_broadcast(pthread_cond_t* cond);
+int pthread_cond_destroy(pthread_cond_t* cond);
+int pthread_cond_init(pthread_cond_t* cond, const pthread_condattr_t* /*attr*/);
+int pthread_cond_signal(pthread_cond_t* cond);
+int pthread_cond_wait(pthread_cond_t* cond, pthread_mutex_t* mutex);
+
+int pthread_create(pthread_t* thread, const pthread_attr_t* /*attr*/, voidp_voidp_func executee, void* arg);
+void pthread_exit(void* /*value_ptr*/);
+int pthread_join(pthread_t thread, void** /*value_ptr*/);
+
+int pthread_mutex_destroy(pthread_mutex_t* mutex);
+int pthread_mutex_init(pthread_mutex_t* mutex, const pthread_mutexattr_t* /*attr*/);
+int pthread_mutex_lock(pthread_mutex_t* mutex);
+int pthread_mutex_unlock(pthread_mutex_t* mutex);
+
+#ifdef __cplusplus
+}
+#endif
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/include/semaphore.h
Added
@@ -0,0 +1,33 @@
+/*
+Copyright 2019 Tampere University
+
+Permission to use, copy, modify, and/or distribute this software for
+any purpose with or without fee is hereby granted, provided that the
+above copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void* sem_t;
+
+int sem_destroy(sem_t* sem);
+// pshared is always 0 in Kvazaar on w32.
+int sem_init(sem_t* sem, int /*pshared*/, unsigned int value);
+int sem_post(sem_t* sem);
+int sem_wait(sem_t* sem);
+
+#ifdef __cplusplus
+}
+#endif
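As with the pthread header, this keeps POSIX semaphore calls compiling where no native sem_t exists; the sem_t handle is an opaque void* that the C++ side points at a small counting-semaphore object. A hedged usage sketch of the declared interface; the initial count of 2 is arbitrary, chosen only for illustration:

    #include <semaphore.h>  /* threadwrapper/include/semaphore.h on such a build */

    void example(void)
    {
        sem_t available;

        sem_init(&available, 0, 2);  /* pshared is always 0 in Kvazaar */
        sem_wait(&available);        /* take one slot */
        /* ... do work ... */
        sem_post(&available);        /* release the slot */
        sem_destroy(&available);
    }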
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/src
Added
+(directory)
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/src/pthread.cpp
Added
@@ -0,0 +1,88 @@ +/* +Copyright 2019 Tampere University + +Permission to use, copy, modify, and/or distribute this software for +any purpose with or without fee is hereby granted, provided that the +above copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#include "pthread.h" +#include <condition_variable> +#include <mutex> +#include <thread> + + +int pthread_cond_broadcast(pthread_cond_t* cond) { + static_cast<std::condition_variable*>(*cond)->notify_all(); + return 0; +} + +int pthread_cond_destroy(pthread_cond_t* cond) { + delete static_cast<std::condition_variable*>(*cond); + *cond = nullptr; + return 0; +} + +int pthread_cond_init(pthread_cond_t* cond, const pthread_condattr_t*) { + *cond = new std::condition_variable(); + return 0; +} + +int pthread_cond_signal(pthread_cond_t* cond) { + static_cast<std::condition_variable*>(*cond)->notify_one(); + return 0; +} + +int pthread_cond_wait(pthread_cond_t* cond, pthread_mutex_t* mutex) { + std::mutex* real_mutex = static_cast<std::mutex*>(*mutex); + std::unique_lock<std::mutex> lock(*real_mutex, std::adopt_lock); + static_cast<std::condition_variable*>(*cond)->wait(lock); + lock.release(); + return 0; +} + +int pthread_create(pthread_t* thread, const pthread_attr_t*, voidp_voidp_func executee, void* arg) { + *thread = new std::thread(executee, arg); + return 0; +} + +void pthread_exit(void*) { + // It might be enough to do nothing here + // considering Kvazaar's current use of pthread_exit +} + +int pthread_join(pthread_t thread, void**) { + std::thread* real_thread = static_cast<std::thread*>(thread); + real_thread->join(); + delete real_thread; + return 0; +} + +int pthread_mutex_destroy(pthread_mutex_t* mutex) { + delete static_cast<std::mutex*>(*mutex); + *mutex = nullptr; + return 0; +} + +int pthread_mutex_init(pthread_mutex_t* mutex, const pthread_mutexattr_t*) { + *mutex = new std::mutex(); + return 0; +} + +int pthread_mutex_lock(pthread_mutex_t* mutex) { + static_cast<std::mutex*>(*mutex)->lock(); + return 0; +} + +int pthread_mutex_unlock(pthread_mutex_t* mutex) { + static_cast<std::mutex*>(*mutex)->unlock(); + return 0; +}
View file
kvazaar-1.3.0.tar.gz/src/threadwrapper/src/semaphore.cpp
Added
@@ -0,0 +1,72 @@ +/* +Copyright 2019 Tampere University + +Permission to use, copy, modify, and/or distribute this software for +any purpose with or without fee is hereby granted, provided that the +above copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ + +#include "semaphore.h" +#include <condition_variable> +#include <mutex> + + +class Semaphore { +public: + + Semaphore(int value): + val_(value) { + } + + void post() { + std::unique_lock<std::mutex> lck(mtx_); + if (++val_ <= 0) { + cvar_.notify_one(); + } + } + + void wait() { + std::unique_lock<std::mutex> lck(mtx_); + if (--val_ < 0) { + cvar_.wait(lck); + } + } + + +private: + + int val_; + std::condition_variable cvar_; + std::mutex mtx_; + +}; // class Semaphore + + +int sem_destroy(sem_t* sem) { + delete static_cast<Semaphore*>(*sem); + *sem = nullptr; + return 0; +} + +int sem_init(sem_t* sem, int, unsigned int value) { + *sem = new Semaphore(value); + return 0; +} + +int sem_post(sem_t* sem) { + static_cast<Semaphore*>(*sem)->post(); + return 0; +} + +int sem_wait(sem_t* sem) { + static_cast<Semaphore*>(*sem)->wait(); + return 0; +}
View file
kvazaar-1.2.0.tar.gz/src/transform.c -> kvazaar-1.3.0.tar.gz/src/transform.c
Changed
@@ -186,15 +186,25 @@ * \param coeff transform coefficients * \param block_size width of transform */ -void kvz_transform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, int8_t block_size, int32_t mode) +void kvz_transform2d(const encoder_control_t * const encoder, + int16_t *block, + int16_t *coeff, + int8_t block_size, + color_t color, + cu_type_t type) { - dct_func *dct_func = kvz_get_dct_func(block_size, mode); + dct_func *dct_func = kvz_get_dct_func(block_size, color, type); dct_func(encoder->bitdepth, block, coeff); } -void kvz_itransform2d(const encoder_control_t * const encoder, int16_t *block, int16_t *coeff, int8_t block_size, int32_t mode) +void kvz_itransform2d(const encoder_control_t * const encoder, + int16_t *block, + int16_t *coeff, + int8_t block_size, + color_t color, + cu_type_t type) { - dct_func *idct_func = kvz_get_idct_func(block_size, mode); + dct_func *idct_func = kvz_get_idct_func(block_size, color, type); idct_func(encoder->bitdepth, coeff, block); } @@ -359,19 +369,22 @@ } } else if (can_use_trskip) { + int8_t tr_skip = 0; + // Try quantization with trskip and use it if it's better. has_coeffs = kvz_quantize_residual_trskip(state, cur_pu, tr_width, color, scan_idx, - &cur_pu->intra.tr_skip, + &tr_skip, lcu_width, lcu_width, ref, pred, pred, coeff); + cur_pu->tr_skip = tr_skip; } else { has_coeffs = kvz_quantize_residual(state, cur_pu, @@ -450,10 +463,8 @@ LCU_GET_CU_AT_PX(lcu, lcu_px.x + offset, lcu_px.y + offset)->cbf, }; - if (luma && depth < MAX_DEPTH) { + if (depth <= MAX_DEPTH) { cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_Y); - } - if (chroma && depth <= MAX_DEPTH) { cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_U); cbf_set_conditionally(&cur_pu->cbf, child_cbfs, depth, COLOR_V); }
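The transform entry points now take the color plane and CU type instead of the old int32_t mode, so DCT/DST selection is driven by those two values. A hedged sketch of a call site under the new signatures; COLOR_Y appears elsewhere in this diff, while CU_INTRA and the 4x4-intra-luma DST behaviour are assumed from kvazaar's sources rather than shown here:

    #include "transform.h"

    /* Hedged sketch: forward and inverse transform of a 4x4 luma residual
       from an intra CU, using the new (color, type) arguments. */
    static void transform_4x4_luma_intra(const encoder_control_t *encoder,
                                         int16_t residual[4 * 4],
                                         int16_t coeff[4 * 4])
    {
      /* kvz_get_dct_func() is expected to return the DST variant for this
         combination (4x4, luma, intra). */
      kvz_transform2d(encoder, residual, coeff, 4, COLOR_Y, CU_INTRA);
      kvz_itransform2d(encoder, residual, coeff, 4, COLOR_Y, CU_INTRA);
    }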
View file
kvazaar-1.2.0.tar.gz/src/transform.h -> kvazaar-1.3.0.tar.gz/src/transform.h
Changed
@@ -38,8 +38,18 @@
 void kvz_transformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size);
 void kvz_itransformskip(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size);
 
-void kvz_transform2d(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size, int32_t mode);
-void kvz_itransform2d(const encoder_control_t *encoder, int16_t *block,int16_t *coeff, int8_t block_size, int32_t mode);
+void kvz_transform2d(const encoder_control_t * const encoder,
+                     int16_t *block,
+                     int16_t *coeff,
+                     int8_t block_size,
+                     color_t color,
+                     cu_type_t type);
+void kvz_itransform2d(const encoder_control_t * const encoder,
+                      int16_t *block,
+                      int16_t *coeff,
+                      int8_t block_size,
+                      color_t color,
+                      cu_type_t type);
 
 int32_t kvz_get_scaled_qp(int8_t type, int8_t qp, int8_t qp_offset);
View file
kvazaar-1.2.0.tar.gz/tests/Makefile.am -> kvazaar-1.3.0.tar.gz/tests/Makefile.am
Changed
@@ -13,6 +13,21 @@
 	test_tools.sh \
 	test_weird_shapes.sh
 
+EXTRA_DIST = \
+	test_external_symbols.sh \
+	test_gop.sh \
+	test_interlace.sh \
+	test_intra.sh \
+	test_invalid_input.sh \
+	test_mv_constraint.sh \
+	test_owf_wpp_tiles.sh \
+	test_rate_control.sh \
+	test_slices.sh \
+	test_smp.sh \
+	test_tools.sh \
+	test_weird_shapes.sh \
+	util.sh
+
 check_PROGRAMS = kvazaar_tests
 
 kvazaar_tests_SOURCES = \
@@ -35,6 +50,8 @@
 nodist_EXTRA_kvazaar_tests_SOURCES = cpp.cpp
 
 if USE_CRYPTOPP
+XFAIL_TESTS = \
+	test_external_symbols.sh
 kvazaar_tests_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(kvazaar_tests_CFLAGS) $(CXXFLAGS) \
 	$(kvazaar_tests_LDFLAGS) $(LDFLAGS) -o $@
View file
kvazaar-1.2.0.tar.gz/tests/dct_tests.c -> kvazaar-1.3.0.tar.gz/tests/dct_tests.c
Changed
@@ -186,7 +186,7 @@
   // Loop through all strategies picking out the intra sad ones and run
   // select strategies though all tests
-  for (unsigned i = 0; i < strategies.count; ++i) {
+  for (volatile unsigned i = 0; i < strategies.count; ++i) {
     const strategy_t * strategy = &strategies.strategies[i];
 
     // Select buffer width according to function name for dct function.
View file
kvazaar-1.3.0.tar.gz/tests/inter_recon_bipred_tests.c
Added
@@ -0,0 +1,184 @@ +/***************************************************************************** +* This file is part of Kvazaar HEVC encoder. +* +* Copyright (C) 2017 Tampere University of Technology and others (see +* COPYING file). +* +* Kvazaar is free software: you can redistribute it and/or modify +* it under the terms of the GNU Lesser General Public License version 2.1 as +* published by the Free Software Foundation. +* +* Kvazaar is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* Lesser General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with Kvazaar. If not, see <http://www.gnu.org/licenses/>. +****************************************************************************/ + +#include "greatest/greatest.h" + +#include "test_strategies.h" +#include "strategies/generic/picture-generic.h" +#include <string.h> +#include <stdlib.h> + + +static lcu_t expected_test_result; +static lcu_t result; + +static lcu_t lcu1; + +int temp1, temp2, temp3, temp4; + +int16_t mv_param[2][2] = { { 3,3 },{ 3,3 } }; +int width = 16; +int height = 16; +int xpos = 0; +int ypos = 0; + + +kvz_pixel temp_lcu_y[LCU_WIDTH*LCU_WIDTH]; +kvz_pixel temp_lcu_u[LCU_WIDTH_C*LCU_WIDTH_C]; +kvz_pixel temp_lcu_v[LCU_WIDTH_C*LCU_WIDTH_C]; + +int hi_prec_luma_rec0; +int hi_prec_luma_rec1; +int hi_prec_chroma_rec0; +int hi_prec_chroma_rec1; + +hi_prec_buf_t* high_precision_rec0 = 0; +hi_prec_buf_t* high_precision_rec1 = 0; + +int temp_x, temp_y; + + + +static void setup() +{ + + memset(lcu1.rec.y, 0, sizeof(kvz_pixel) * 64 * 64); + memset(lcu1.rec.u, 0, sizeof(kvz_pixel) * 32 * 32); + memset(lcu1.rec.v, 0, sizeof(kvz_pixel) * 32 * 32); + + + memset(expected_test_result.rec.y, 0, sizeof(kvz_pixel) * 64 * 64); + memset(expected_test_result.rec.u, 0, sizeof(kvz_pixel) * 32 * 32); + memset(expected_test_result.rec.v, 0, sizeof(kvz_pixel) * 32 * 32); + + memcpy(expected_test_result.rec.y, lcu1.rec.y, sizeof(kvz_pixel) * 64 * 64); + memcpy(expected_test_result.rec.u, lcu1.rec.u, sizeof(kvz_pixel) * 32 * 32); + memcpy(expected_test_result.rec.v, lcu1.rec.v, sizeof(kvz_pixel) * 32 * 32); + + // Setup is not optimized working function from picture-generic.c. + + + int shift = 15 - KVZ_BIT_DEPTH; + int offset = 1 << (shift - 1); + + hi_prec_luma_rec0 = mv_param[0][0] & 3 || mv_param[0][1] & 3; + hi_prec_luma_rec1 = mv_param[1][0] & 3 || mv_param[1][1] & 3; + + hi_prec_chroma_rec0 = mv_param[0][0] & 7 || mv_param[0][1] & 7; + hi_prec_chroma_rec1 = mv_param[1][0] & 7 || mv_param[1][1] & 7; + + if (hi_prec_chroma_rec0) high_precision_rec0 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH); + if (hi_prec_chroma_rec1) high_precision_rec1 = kvz_hi_prec_buf_t_alloc(LCU_WIDTH*LCU_WIDTH); + + + + + for (temp_y = 0; temp_y < height; ++temp_y) { + int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); + for (temp_x = 0; temp_x < width; ++temp_x) { + int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); + int16_t sample0_y = (hi_prec_luma_rec0 ? high_precision_rec0->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (temp_lcu_y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_y = (hi_prec_luma_rec1 ? 
high_precision_rec1->y[y_in_lcu * LCU_WIDTH + x_in_lcu] : (expected_test_result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + expected_test_result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_y + sample1_y + offset) >> shift); + } + + } + for (temp_y = 0; temp_y < height >> 1; ++temp_y) { + int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); + for (temp_x = 0; temp_x < width >> 1; ++temp_x) { + int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); + int16_t sample0_u = (hi_prec_chroma_rec0 ? high_precision_rec0->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_u = (hi_prec_chroma_rec1 ? high_precision_rec1->u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (expected_test_result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + expected_test_result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_u + sample1_u + offset) >> shift); + + int16_t sample0_v = (hi_prec_chroma_rec0 ? high_precision_rec0->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (temp_lcu_v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + int16_t sample1_v = (hi_prec_chroma_rec1 ? high_precision_rec1->v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] : (expected_test_result.rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] << (14 - KVZ_BIT_DEPTH))); + expected_test_result.rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu] = (kvz_pixel)kvz_fast_clip_32bit_to_pixel((sample0_v + sample1_v + offset) >> shift); + + + + } + } +} + + +TEST test_inter_recon_bipred() +{ + + + memcpy(result.rec.y, lcu1.rec.y, sizeof(kvz_pixel) * 64 * 64); + memcpy(result.rec.u, lcu1.rec.u, sizeof(kvz_pixel) * 32 * 32); + memcpy(result.rec.v, lcu1.rec.v, sizeof(kvz_pixel) * 32 * 32); + + + kvz_inter_recon_bipred_blend(hi_prec_luma_rec0, hi_prec_luma_rec1, hi_prec_chroma_rec0, hi_prec_chroma_rec1, width, height, xpos, ypos, high_precision_rec0, high_precision_rec1, &result, temp_lcu_y, temp_lcu_u, temp_lcu_v); + + for (temp_y = 0; temp_y < height; ++temp_y) { + int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); + for (temp_x = 0; temp_x < width; temp_x += 1) { + int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); + printf("%d ", result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu]); + } + } + printf("\n"); + + /* + for (temp_y = 0; temp_y < height >> 1; ++temp_y) { + int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); + for (temp_x = 0; temp_x < width >> 1; ++temp_x) { + int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); + printf("%d ", result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu]); + } + } + printf("\n"); + */ + + for (temp_y = 0; temp_y < height; ++temp_y) { + int y_in_lcu = ((ypos + temp_y) & ((LCU_WIDTH)-1)); + for (temp_x = 0; temp_x < width; temp_x+=1) { + int x_in_lcu = ((xpos + temp_x) & ((LCU_WIDTH)-1)); + ASSERT_EQ_FMT(expected_test_result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu], result.rec.y[y_in_lcu * LCU_WIDTH + x_in_lcu], "%d"); + } + } + + for (temp_y = 0; temp_y < height >> 1; ++temp_y) { + int y_in_lcu = (((ypos >> 1) + temp_y) & (LCU_WIDTH_C - 1)); + for (temp_x = 0; temp_x < width >> 1; ++temp_x) { + int x_in_lcu = (((xpos >> 1) + temp_x) & (LCU_WIDTH_C - 1)); + ASSERT_EQ_FMT(expected_test_result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu], result.rec.u[y_in_lcu * LCU_WIDTH_C + x_in_lcu], "%d"); + ASSERT_EQ_FMT(expected_test_result.rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu], result.rec.v[y_in_lcu * LCU_WIDTH_C + x_in_lcu], "%d"); + } + } 
+ + PASS(); +} + +SUITE(inter_recon_bipred_tests) +{ + setup(); + + for (volatile int i = 0; i < strategies.count; ++i) { + if (strcmp(strategies.strategies[i].type, "inter_recon_bipred") != 0) { + continue; + } + + kvz_inter_recon_bipred_blend = strategies.strategies[i].fptr; + RUN_TEST(test_inter_recon_bipred); + } +}
View file
kvazaar-1.2.0.tar.gz/tests/sad_tests.c -> kvazaar-1.3.0.tar.gz/tests/sad_tests.c
Changed
@@ -31,7 +31,7 @@
 //////////////////////////////////////////////////////////////////////////
 // DEFINES
-#define TEST_SAD(X, Y) kvz_image_calc_sad(g_pic, g_ref, 0, 0, (X), (Y), 8, 8)
+#define TEST_SAD(X, Y) kvz_image_calc_sad(g_pic, g_ref, 0, 0, (X), (Y), 8, 8, NULL)
 
 //////////////////////////////////////////////////////////////////////////
 // GLOBALS
@@ -329,7 +329,7 @@
   setup_tests();
 
-  for (unsigned i = 0; i < strategies.count; ++i) {
+  for (volatile unsigned i = 0; i < strategies.count; ++i) {
     if (strcmp(strategies.strategies[i].type, "reg_sad") != 0) {
       continue;
     }
View file
kvazaar-1.2.0.tar.gz/tests/speed_tests.c -> kvazaar-1.3.0.tar.gz/tests/speed_tests.c
Changed
@@ -355,7 +355,7 @@
   // Loop through all strategies picking out the intra sad ones and run
   // selectec strategies though all tests
-  for (unsigned i = 0; i < strategies.count; ++i) {
+  for (volatile unsigned i = 0; i < strategies.count; ++i) {
     const strategy_t * strategy = &strategies.strategies[i];
 
     // Select buffer width according to function name.
View file
kvazaar-1.2.0.tar.gz/tests/test_gop.sh -> kvazaar-1.3.0.tar.gz/tests/test_gop.sh
Changed
@@ -9,4 +9,13 @@
 valgrind_test 264x130 10 $common_args --gop=8 -p0 --owf=1
 valgrind_test 264x130 10 $common_args --gop=8 -p0 --owf=4
 valgrind_test 264x130 20 $common_args --gop=8 -p16 --owf=0
+valgrind_test 264x130 10 $common_args --gop=8 -p1 --owf=4
 valgrind_test 264x130 10 $common_args --gop=lp-g4d3t1 -p5 --owf=4
+valgrind_test 264x130 10 $common_args --gop=8 -p8 --owf=4 --no-open-gop
+valgrind_test 264x130 30 $common_args --gop=8 -p16 --owf=16
+# Do more extensive tests in a private gitlab CI runner
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 20 $common_args --gop=8 -p8 --owf=0 --no-open-gop; fi
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 40 $common_args --gop=8 -p32 --owf=4 --no-open-gop; fi
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 70 $common_args --gop=8 -p64 --owf=4 --no-open-gop; fi
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 50 $common_args --gop=8 -p40 --owf=4 --no-open-gop; fi
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 10 $common_args --gop=8 -p8 --owf=0 --no-open-gop --bipred; fi
View file
kvazaar-1.2.0.tar.gz/tests/test_owf_wpp_tiles.sh -> kvazaar-1.3.0.tar.gz/tests/test_owf_wpp_tiles.sh
Changed
@@ -16,3 +16,4 @@
 valgrind_test 264x130 10 $common_args -r2 --owf=0 --threads=2 --tiles-height-split=u2 --no-wpp
 valgrind_test 512x512 3 $common_args -r2 --owf=1 --threads=2 --tiles=2x2 --no-wpp
 valgrind_test 512x512 3 $common_args -r2 --owf=0 --threads=2 --tiles=2x2 --no-wpp
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 512x512 30 $common_args -r2 --owf=0 --threads=2 --tiles=2x2 --no-wpp --bipred; fi
View file
kvazaar-1.2.0.tar.gz/tests/test_rate_control.sh -> kvazaar-1.3.0.tar.gz/tests/test_rate_control.sh
Changed
@@ -4,3 +4,4 @@
 . "${0%/*}/util.sh"
 
 valgrind_test 264x130 10 --bitrate=500000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=0 --pu-depth-inter=1-3 --pu-depth-intra=2-3
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 512x512 30 --bitrate=100000 -p0 -r1 --owf=1 --threads=2 --rd=0 --no-rdoq --no-deblock --no-sao --no-signhide --subme=2 --pu-depth-inter=1-3 --pu-depth-intra=2-3 --bipred; fi
View file
kvazaar-1.2.0.tar.gz/tests/test_slices.sh -> kvazaar-1.3.0.tar.gz/tests/test_slices.sh
Changed
@@ -5,3 +5,4 @@
 valgrind_test 512x256 10 --threads=2 --owf=1 --preset=ultrafast --tiles=2x2 --slices=tiles
 
 valgrind_test 264x130 10 --threads=2 --owf=1 --preset=ultrafast --slices=wpp
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 20 --threads=2 --owf=1 --preset=fast --slices=wpp --no-open-gop; fi
View file
kvazaar-1.2.0.tar.gz/tests/test_smp.sh -> kvazaar-1.3.0.tar.gz/tests/test_smp.sh
Changed
@@ -8,3 +8,4 @@
 valgrind_test 264x130 4 --threads=2 --owf=1 --wpp --smp
 valgrind_test 264x130 4 --threads=2 --owf=1 --wpp --amp
 valgrind_test 264x130 4 --threads=2 --owf=1 --wpp --smp --amp
+if [ ! -z ${GITLAB_CI+x} ];then valgrind_test 264x130 16 --gop=8 --threads=2 --owf=1 --wpp --smp --amp --bipred; fi
View file
kvazaar-1.2.0.tar.gz/tests/tests_main.c -> kvazaar-1.3.0.tar.gz/tests/tests_main.c
Changed
@@ -32,6 +32,7 @@
 extern SUITE(coeff_sum_tests);
 
 extern SUITE(mv_cand_tests);
+extern SUITE(inter_recon_bipred_tests);
 
 int main(int argc, char **argv)
 {
@@ -57,5 +58,8 @@
   RUN_SUITE(mv_cand_tests);
 
+  // Doesn't work in git
+  //RUN_SUITE(inter_recon_bipred_tests);
+
   GREATEST_MAIN_END();
 }
View file
kvazaar-1.2.0.tar.gz/tests/util.sh -> kvazaar-1.3.0.tar.gz/tests/util.sh
Changed
@@ -34,9 +34,18 @@
     prepare "${dimensions}" "${frames}"
 
+    # If $KVZ_TEST_VALGRIND is defined and equal to "1", run the test with
+    # valgrind. Otherwise, run without valgrind.
+    if [ "${KVZ_TEST_VALGRIND:-0}" = '1' ]; then
+        valgrind='valgrind --leak-check=full --error-exitcode=1 --'
+    else
+        valgrind=''
+    fi
+
+    # No quotes for $valgrind because it expands to multiple (or zero)
+    # arguments.
     print_and_run \
-        libtool execute \
-        valgrind --leak-check=full --error-exitcode=1 -- \
+        libtool execute $valgrind \
         ../src/kvazaar -i "${yuvfile}" "--input-res=${dimensions}" -o "${hevcfile}" "$@"
 
     print_and_run \