Projects
Essentials
kvazaar
Sign Up
Log In
Username
Password
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
Expand all
Collapse all
Changes of Revision 19
View file
kvazaar.changes
Changed
@@ -1,4 +1,16 @@ ------------------------------------------------------------------- +Wed Jan 17 18:39:21 UTC 2024 - Luigi Baldoni <aloisio@gmx.com> + +- Update to version 2.3.0 + Too many changes to list, see + https://github.com/ultravideo/kvazaar/compare/v2.2.0...v2.3.0 +- Drop kvazaar.memset.patch (no longer necessary with gcc11) +- Add kvazaar-fix_libm_underlinking.patch, + kvazaar-add_soversion.patch, kvazaar-fix_install_libdir.patch + and kvazaar-fix_install_mandir.patch +- Use gcc11 on Leap + +------------------------------------------------------------------- Wed Jan 4 11:29:30 UTC 2023 - Luigi Baldoni <aloisio@gmx.com> - Update to version 2.2.0
View file
kvazaar.spec
Changed
@@ -1,7 +1,7 @@ # # spec file for package kvazaar # -# Copyright (c) 2023 Packman Team <packman@links2linux.de> +# Copyright (c) 2024 Packman Team <packman@links2linux.de> # Copyright (c) 2017 SUSE LINUX GmbH, Nuernberg, Germany. # # All modifications and additions to the file contributed by third parties @@ -16,24 +16,33 @@ # Please submit bugfixes or comments via https://bugs.links2linux.org/ # - %define libname libkvazaar %define libmver 7 Name: kvazaar -Version: 2.2.0 -Release: 0 +Version: 2.3.0 +Release: 0.pm.0 Summary: HEVC encoder License: BSD-3-Clause Group: Productivity/Multimedia/Video/Editors and Convertors URL: http://ultravideo.cs.tut.fi/#encoder Source0: https://github.com/ultravideo/%{name}/archive/v%{version}.tar.gz#/%{name}-%{version}.tar.gz -Patch0: kvazaar.memset.patch -BuildRequires: automake -BuildRequires: findutils +# PATCH-FIX-OPENSUSE kvazaar-fix_libm_underlinking.patch +Patch1: kvazaar-fix_libm_underlinking.patch +# PATCH-FIX-UPSTREAM kvazaar-add_soversion.patch +Patch2: kvazaar-add_soversion.patch +# PATCH-FIX-OPENSUSE kvazaar-fix_install_libdir.patch +Patch3: kvazaar-fix_install_libdir.patch +# PATCH-FIX-OPENSUSE kvazaar-fix_install_mandir.patch +Patch4: kvazaar-fix_install_mandir.patch +BuildRequires: cmake >= 3.12 BuildRequires: gcc >= 4.4 -BuildRequires: gcc-c++ -BuildRequires: libtool BuildRequires: pkgconfig +%if 0%{?suse_version} > 1500 +BuildRequires: gcc-c++ +%else +BuildRequires: gcc11 +BuildRequires: gcc11-c++ +%endif Requires: %{libname}%{libmver} = %{version} %ifnarch %{arm} BuildRequires: yasm @@ -58,21 +67,20 @@ Header files for the %{libname} library %prep -%setup -q -%patch0 -p1 +%autosetup -p1 %build -autoreconf -fvi -%configure \ - --disable-static \ - --disable-silent-rules \ - --docdir=%{_defaultdocdir}/%{name} -make %{?_smp_mflags} +export CC=gcc +export CXX=g++ +test -x "$(type -p gcc-11)" && export CC=gcc-11 +test -x "$(type -p g++-11)" && export CXX=g++-11 + +%cmake \ + -DCMAKE_SKIP_INSTALL_RPATH=ON +%cmake_build 
%install -%make_install -find %{buildroot} -type f -name "*.la" -delete -print -rm %{buildroot}%{_defaultdocdir}/%{name}/LICENSE* +%cmake_install %post -n %{libname}%{libmver} -p /sbin/ldconfig %postun -n %{libname}%{libmver} -p /sbin/ldconfig
View file
kvazaar-add_soversion.patch
Added
@@ -0,0 +1,21 @@ +From 621a2bba8f12c9fed07c266e590bc05dea2861b2 Mon Sep 17 00:00:00 2001 +From: Joose Sainio <joose.sainio@tuni.fi> +Date: Thu, 18 Jan 2024 09:14:35 +0200 +Subject: PATCH CMake versions .so file + +--- + CMakeLists.txt | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 278939d9..1f459c44 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -165,6 +165,7 @@ if(MSVC) + set_property( SOURCE ${LIB_SOURCES_STRATEGIES_AVX2} APPEND PROPERTY COMPILE_FLAGS "/arch:AVX2" ) + else() + set_target_properties(kvazaar-bin PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/src) ++ set_target_properties(kvazaar PROPERTIES SOVERSION "7" VERSION "7.3.0") + list(APPEND ALLOW_AVX2 "x86_64" "AMD64") + if(${CMAKE_SYSTEM_PROCESSOR} IN_LIST ALLOW_AVX2) + set_property( SOURCE ${LIB_SOURCES_STRATEGIES_AVX2} APPEND PROPERTY COMPILE_FLAGS "-mavx2 -mbmi -mpopcnt -mlzcnt -mbmi2" )
View file
kvazaar-fix_install_libdir.patch
Added
@@ -0,0 +1,37 @@ +Index: kvazaar-2.3.0/CMakeLists.txt +=================================================================== +--- kvazaar-2.3.0.orig/CMakeLists.txt ++++ kvazaar-2.3.0/CMakeLists.txt +@@ -128,7 +128,7 @@ if(MSVC) + endif() + + if(BUILD_SHARED_LIBS) +- list( APPEND CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib" "./" "../lib" ) ++ list( APPEND CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_FULL_LIBDIR}" "./" "../lib" ) + set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) + add_library(kvazaar SHARED ${LIB_SOURCES}) + else() +@@ -233,9 +233,9 @@ source_group( "" FILES ${SOURCE_GROUP_TO + + # ToDo: make configurable + +-install(FILES ${PROJECT_SOURCE_DIR}/src/kvazaar.pc DESTINATION ${CMAKE_INSTALL_PREFIX}/share/pkgconfig) ++install(FILES ${PROJECT_SOURCE_DIR}/src/kvazaar.pc DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/pkgconfig) + install(TARGETS kvazaar-bin DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) +-install(TARGETS kvazaar DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) ++install(TARGETS kvazaar DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}) + if(BUILD_SHARED_LIBS) # Just add the lib to the bin directory for now + if(MSVC) + install(TARGETS kvazaar DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) +Index: kvazaar-2.3.0/src/kvazaar.pc.in +=================================================================== +--- kvazaar-2.3.0.orig/src/kvazaar.pc.in ++++ kvazaar-2.3.0/src/kvazaar.pc.in +@@ -1,6 +1,6 @@ + prefix=@CMAKE_INSTALL_PREFIX@ + exec_prefix=${prefix} +-libdir=${prefix}/lib ++libdir=@CMAKE_INSTALL_FULL_LIBDIR@ + incdir=${prefix}/include + + Name: libkvazaar
View file
kvazaar-fix_install_mandir.patch
Added
@@ -0,0 +1,13 @@ +Index: kvazaar-2.3.0/CMakeLists.txt +=================================================================== +--- kvazaar-2.3.0.orig/CMakeLists.txt ++++ kvazaar-2.3.0/CMakeLists.txt +@@ -242,7 +242,7 @@ if(BUILD_SHARED_LIBS) # Just add the lib + endif() + endif() + install(FILES ${PROJECT_SOURCE_DIR}/src/kvazaar.h DESTINATION ${CMAKE_INSTALL_PREFIX}/include) +-install(FILES ${PROJECT_SOURCE_DIR}/doc/kvazaar.1 DESTINATION ${CMAKE_INSTALL_PREFIX}/share/man) ++install(FILES ${PROJECT_SOURCE_DIR}/doc/kvazaar.1 DESTINATION ${CMAKE_INSTALL_PREFIX}/share/man/man1) + + IF(UNIX) + # DIST
View file
kvazaar-fix_libm_underlinking.patch
Added
@@ -0,0 +1,12 @@ +Index: kvazaar-2.3.0/CMakeLists.txt +=================================================================== +--- kvazaar-2.3.0.orig/CMakeLists.txt ++++ kvazaar-2.3.0/CMakeLists.txt +@@ -182,6 +182,7 @@ else() + set(EXTRA_LIBS ${EXTRA_LIBS} m) + endif (HAVE_LIB_M) + ++ target_link_libraries(kvazaar PUBLIC ${EXTRA_LIBS}) + target_link_libraries(kvazaar-bin PUBLIC ${EXTRA_LIBS}) + endif() +
View file
kvazaar.memset.patch
Deleted
@@ -1,22 +0,0 @@ -Index: kvazaar-1.2.0/src/rdo.c -=================================================================== ---- kvazaar-1.2.0.orig/src/rdo.c -+++ kvazaar-1.2.0/src/rdo.c -@@ -593,6 +593,7 @@ void kvz_rdoq(encoder_state_t * const st - - uint32_t cg_num = width * height >> 4; - -+#if 0 - // Explicitly tell the only possible numbers of elements to be zeroed. - // Hope the compiler is able to utilize this information. - switch (cg_num) { -@@ -602,6 +603,9 @@ void kvz_rdoq(encoder_state_t * const st - case 64: FILL_ARRAY(sig_coeffgroup_flag, 0, 64); break; - default: assert(0 && "There should be 1, 4, 16 or 64 coefficient groups"); - } -+#else -+ memset(&sig_coeffgroup_flag, 0, sizeof(sig_coeffgroup_flag)); -+#endif - - cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.cu_sig_coeff_group_modeltype); - cabac_ctx_t *baseCtx = (type == 0) ? &(cabac->ctx.cu_sig_model_luma0) : &(cabac->ctx.cu_sig_model_chroma0);
View file
kvazaar-2.2.0.tar.gz/LICENSE.EXT.x264asm
Deleted
@@ -1,2 +0,0 @@ -Kvazaar uses x264asm abstraction layer -library (included in src/x86/x86inc.asm) -licensed under ISC license.
View file
kvazaar-2.2.0.tar.gz/build/yasm
Deleted
-(directory)
View file
kvazaar-2.2.0.tar.gz/build/yasm/vsyasm.props
Deleted
@@ -1,31 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> - <PropertyGroup - Condition="'$(YASMBeforeTargets)' == '' and '$(YASMAfterTargets)' == '' and '$(ConfigurationType)' != 'Makefile'"> - <YASMBeforeTargets>Midl</YASMBeforeTargets> - <YASMAfterTargets>CustomBuild</YASMAfterTargets> - </PropertyGroup> - <PropertyGroup> - <YASMDependsOn - Condition="'$(ConfigurationType)' != 'Makefile'">_SelectedFiles;$(YASMDependsOn)</YASMDependsOn> - </PropertyGroup> - <!-- Object format name for vsyasm must be in lower case. --> - <PropertyGroup Condition="'$(Platform)' == 'Win32'"> - <YASMFormat>win32</YASMFormat> - </PropertyGroup> - <PropertyGroup Condition="'$(Platform)' == 'x64'"> - <YASMFormat>win64</YASMFormat> - </PropertyGroup> - <ItemDefinitionGroup> - <YASM> - <Debug>False</Debug> - <ObjectFile>$(IntDir)</ObjectFile> - <PreProc>0</PreProc> - <Parser>0</Parser> - <CommandLineTemplate>vsyasm.exe -Xvc -f $(YASMFormat) AllOptions AdditionalOptions Inputs</CommandLineTemplate> - <Outputs>%(ObjectFile)</Outputs> - <ExecutionDescription>Assembling %(Filename)%(Extension)</ExecutionDescription> - <ShowOnlyRuleProperties>false</ShowOnlyRuleProperties> - </YASM> - </ItemDefinitionGroup> -</Project>
View file
kvazaar-2.2.0.tar.gz/build/yasm/vsyasm.targets
Deleted
@@ -1,109 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> - <ItemGroup> - <PropertyPageSchema - Include="$(MSBuildThisFileDirectory)$(MSBuildThisFileName).xml" /> - <AvailableItemName - Include="YASM"> - <Targets>_YASM</Targets> - </AvailableItemName> - </ItemGroup> - <UsingTask - TaskName="YASM" - TaskFactory="XamlTaskFactory" - AssemblyName="Microsoft.Build.Tasks.v4.0"> - <Task>$(MSBuildThisFileDirectory)$(MSBuildThisFileName).xml</Task> - </UsingTask> - <Target - Name="_YASM" - BeforeTargets="$(YASMBeforeTargets)" - AfterTargets="$(YASMAfterTargets)" - Condition="'@(YASM)' != ''" - DependsOnTargets="$(YASMDependsOn);ComputeYASMOutput" - Outputs="@(YASM->'$(ProjectDir)$(IntDir)%(Filename).obj')" - Inputs="@(YASM);%(YASM.AdditionalDependencies);$(MSBuildProjectFile)"> - <ItemGroup - Condition="'@(SelectedFiles)' != ''"> - <YASM - Remove="@(YASM)" - Condition="'%(Identity)' != '@(SelectedFiles)'" /> - </ItemGroup> - <ItemGroup> - <YASM_tlog - Include="%(YASM.ObjectFile)" - Condition="'%(YASM.ObjectFile)' != '' and '%(YASM.ExcludedFromBuild)' != 'true'"> - <Source>@(YASM->'%(FullPath)', '|')</Source> - </YASM_tlog> - </ItemGroup> - <Message - Importance="High" - Text="%(YASM.ExecutionDescription)" /> - <WriteLinesToFile - Condition="'@(YASM_tlog)' != '' and '%(YASM_tlog.ExcludedFromBuild)' != 'true'" - File="$(TLogLocation)$(ProjectName).write.1.tlog" - Lines="^%(YASM_tlog.Source);@(YASM->'$(ProjectDir)$(IntDir)%(Filename).obj')" - Encoding="Unicode" /> - <YASM - Condition="'@(YASM)' != '' and '%(YASM.ExcludedFromBuild)' != 'true'" - CommandLineTemplate="%(YASM.CommandLineTemplate)" - Debug="%(YASM.Debug)" - PreIncludeFile="%(YASM.PreIncludeFile)" - IncludePaths="%(YASM.IncludePaths)" - Defines="%(YASM.Defines)" - UnDefines="%(YASM.UnDefines)" - ObjectFile="%(YASM.ObjectFile)" - ListFile="%(YASM.ListFile)" - MapFile="%(YASM.MapFile)" - ErrorFile="%(YASM.ErrorFile)" - 
SymbolPrefix="%(YASM.SymbolPrefix)" - SymbolSuffix="%(YASM.SymbolSuffix)" - PreProc="%(YASM.PreProc)" - Parser="%(YASM.Parser)" - AdditionalOptions="%(YASM.AdditionalOptions)" - Inputs="@(YASM)" /> - </Target> - <PropertyGroup> - <ComputeLinkInputsTargets> - $(ComputeLinkInputsTargets); - ComputeYASMOutput; - </ComputeLinkInputsTargets> - <ComputeLibInputsTargets> - $(ComputeLibInputsTargets); - ComputeYASMOutput; - </ComputeLibInputsTargets> - </PropertyGroup> - <Target - Name="ComputeYASMOutput" - Condition="'@(YASM)' != ''"> - <ItemGroup> - <YASMDirsToMake - Condition="'@(YASM)' != '' and '%(YASM.ExcludedFromBuild)' != 'true' and !HasTrailingSlash('%(YASM.ObjectFile)')" - Include="%(YASM.ObjectFile)" /> - <Link - Include="%(YASMDirsToMake.Identity)" - Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" /> - <Lib - Include="%(YASMDirsToMake.Identity)" - Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" /> - <ImpLib - Include="%(YASMDirsToMake.Identity)" - Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" /> - </ItemGroup> - <ItemGroup> - <YASMDirsToMake - Condition="'@(YASM)' != '' and '%(YASM.ExcludedFromBuild)' != 'true' and HasTrailingSlash('%(YASM.ObjectFile)')" - Include="@(YASM->'%(ObjectFile)%(Filename).obj')" /> - <Link - Include="%(YASMDirsToMake.Identity)" - Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" /> - <Lib - Include="%(YASMDirsToMake.Identity)" - Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" /> - <ImpLib - Include="%(YASMDirsToMake.Identity)" - Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" /> - </ItemGroup> - <MakeDir - 
Directories="@(YASMDirsToMake->'%(RootDir)%(Directory)')" /> - </Target> -</Project> \ No newline at end of file
View file
kvazaar-2.2.0.tar.gz/build/yasm/vsyasm.xml
Deleted
@@ -1,283 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<ProjectSchemaDefinitions xmlns="clr-namespace:Microsoft.Build.Framework.XamlTypes;assembly=Microsoft.Build.Framework" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:sys="clr-namespace:System;assembly=mscorlib" xmlns:transformCallback="Microsoft.Cpp.Dev10.ConvertPropertyCallback"> - <Rule - Name="YASM" - PageTemplate="tool" - DisplayName="Yasm Assembler" - Order="200"> - - <Rule.DataSource> - <DataSource - Persistence="ProjectFile" - ItemType="YASM" /> - </Rule.DataSource> - - <Rule.Categories> - - <Category - Name="General"> - <Category.DisplayName> - <sys:String>General</sys:String> - </Category.DisplayName> - </Category> - - <Category - Name="Symbols"> - <Category.DisplayName> - <sys:String>Symbols</sys:String> - </Category.DisplayName> - </Category> - - <Category - Name="Files"> - <Category.DisplayName> - <sys:String>Files</sys:String> - </Category.DisplayName> - </Category> - - <Category - Name="Command Line" - Subtype="CommandLine"> - <Category.DisplayName> - <sys:String>Command Line</sys:String> - </Category.DisplayName> - </Category> - - </Rule.Categories> - - <StringListProperty - Name="Inputs" - Category="Command Line" - IsRequired="true" - Switch=" "> - <StringListProperty.DataSource> - <DataSource - Persistence="ProjectFile" - ItemType="YASM" - SourceType="Item" /> - </StringListProperty.DataSource> - </StringListProperty> - - <BoolProperty - Name="Debug" - Subcategory="Configuration" - HelpContext="0" - DisplayName="Debug Information" - Description="Generate debugging information" - Switch="-g cv8" /> - - <StringListProperty - Name="IncludePaths" - Subcategory="Configuration" - HelpContext="0" - DisplayName="Include Paths" - Description="Set the paths for any additional include files" - Switch="-i "value"" /> - - <StringListProperty - Name="Defines" - Category="Symbols" - Subcategory="Pre-Defined Symbols" - HelpContext="0" - DisplayName="Defined Symbols" - Description="Specify 
pre-defined symbols ('symbol' or 'symbol = value') " - Switch="-d "value"" /> - - <StringListProperty - Name="UnDefines" - Category="Symbols" - Subcategory="Pre-Defined Symbols" - HelpContext="0" - DisplayName="Remove Symbols" - Description="Remove pre-defined symbols " - Switch="-u "value"" /> - - <StringProperty - Name="ObjectFile" - Subcategory="Output" - HelpContext="0" - DisplayName="Object File Name" - Description="Select the output file name" - Switch="-o "value"" /> - - <StringProperty - Name="ListFile" - Category="Files" - Subcategory="Output" - HelpContext="0" - DisplayName="List File Name" - Description="Select an output listing by setting its file name" - Switch="-l "value"" /> - - <StringProperty - Name="PreIncludeFile" - Category="Files" - Subcategory="Configuration" - HelpContext="0" - DisplayName="Pre Include File" - Description="Select a pre-included file by setting its name" - Switch="-P "value"" /> - - <StringProperty - Name="MapFile" - Category="Files" - Subcategory="Output" - HelpContext="0" - DisplayName="Map File Name" - Description="Select a map output by setting its file name" - Switch="--mapdir= "value"" /> - - <StringProperty - Name="ErrorFile" - Category="Files" - Subcategory="Output" - HelpContext="0" - DisplayName="Error File Name" - Description="Send error/warning messages to a file by setting its name" - Switch="-E "value"" /> - - <StringProperty - Name="SymbolPrefix" - Category="Symbols" - Subcategory="Symbols" - HelpContext="0" - DisplayName="External Symbol Prefix" - Description="Prepend symbol to all external symbols" - Switch="--prefix "value"" /> - - <StringProperty - Name="SymbolSuffix" - Category="Symbols" - Subcategory="Symbols" - HelpContext="0" - DisplayName="External Symbol Suffix" - Description="Append symbol to all external symbols" - Switch="--suffix "value"" /> - - <EnumProperty - Name="PreProc" - Subcategory="Configuration" - HelpContext="0" - DisplayName="Pre-Processor" - Description="Select the pre-processor 
('nasm' or 'raw')"> - <EnumValue - Name="0" - DisplayName="Nasm " - Switch="-rnasm" /> - <EnumValue - Name="1" - DisplayName="Raw" - Switch="-rraw" /> - </EnumProperty> - - <EnumProperty - Name="Parser" - Subcategory="Configuration" - HelpContext="0" - DisplayName="Parser" - Description="Select the parser for Intel ('nasm') or AT&T ( 'gas') syntax"> - <EnumValue - Name="0" - DisplayName="Nasm" - Switch="-pnasm" /> - <EnumValue - Name="1" - DisplayName="Gas" - Switch="-pgas" /> - </EnumProperty> - - <StringProperty - Name="CommandLineTemplate" - DisplayName="Command Line" - Visible="False" - IncludeInCommandLine="False" /> - - <DynamicEnumProperty - Name="YASMBeforeTargets" - Category="General" - EnumProvider="Targets" - IncludeInCommandLine="False"> - <DynamicEnumProperty.DisplayName> - <sys:String>Execute Before</sys:String> - </DynamicEnumProperty.DisplayName> - <DynamicEnumProperty.Description> - <sys:String>Specifies the targets for the build customization to run before.</sys:String> - </DynamicEnumProperty.Description> - <DynamicEnumProperty.ProviderSettings> - <NameValuePair - Name="Exclude" - Value="^YASMBeforeTargets|^Compute" /> - </DynamicEnumProperty.ProviderSettings> - <DynamicEnumProperty.DataSource> - <DataSource - Persistence="ProjectFile" - HasConfigurationCondition="true" /> - </DynamicEnumProperty.DataSource> - </DynamicEnumProperty> - - <DynamicEnumProperty - Name="YASMAfterTargets" - Category="General" - EnumProvider="Targets" - IncludeInCommandLine="False"> - <DynamicEnumProperty.DisplayName> - <sys:String>Execute After</sys:String> - </DynamicEnumProperty.DisplayName> - <DynamicEnumProperty.Description> - <sys:String>Specifies the targets for the build customization to run after.</sys:String> - </DynamicEnumProperty.Description> - <DynamicEnumProperty.ProviderSettings> - <NameValuePair - Name="Exclude" - Value="^YASMAfterTargets|^Compute" /> - </DynamicEnumProperty.ProviderSettings> - <DynamicEnumProperty.DataSource> - <DataSource - 
Persistence="ProjectFile" - ItemType="" - HasConfigurationCondition="true" /> - </DynamicEnumProperty.DataSource> - </DynamicEnumProperty> - - <StringListProperty - Name="Outputs" - DisplayName="Outputs" - Visible="False" - IncludeInCommandLine="False" /> - - <StringProperty - Name="ExecutionDescription" - DisplayName="Execution Description" - Visible="False" - IncludeInCommandLine="False" /> - - <StringListProperty - Name="AdditionalDependencies" - DisplayName="Additional Dependencies" - IncludeInCommandLine="False" - Visible="true" /> - - <StringProperty - Subtype="AdditionalOptions" - Name="AdditionalOptions" - Category="Command Line"> - <StringProperty.DisplayName> - <sys:String>Additional Options</sys:String> - </StringProperty.DisplayName> - <StringProperty.Description> - <sys:String>Additional Options</sys:String> - </StringProperty.Description> - </StringProperty> - </Rule> - - <ItemType - Name="YASM" - DisplayName="Yasm Assembler" /> - <FileExtension - Name="*.asm" - ContentType="YASM" /> - <ContentType - Name="YASM" - DisplayName="Yasm Assembler" - ItemType="YASM" /> -</ProjectSchemaDefinitions> \ No newline at end of file
View file
kvazaar-2.2.0.tar.gz/src/extras/x86inc.asm
Deleted
@@ -1,1456 +0,0 @@ -;***************************************************************************** -;* x86inc.asm: x264asm abstraction layer -;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project -;* -;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Anton Mitrofanov <BugMaster@narod.ru> -;* Jason Garrett-Glaser <darkshikari@gmail.com> -;* Henrik Gramner <henrik@gramner.com> -;* -;* Permission to use, copy, modify, and/or distribute this software for any -;* purpose with or without fee is hereby granted, provided that the above -;* copyright notice and this permission notice appear in all copies. -;* -;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -;***************************************************************************** - -; This is a header file for the x264ASM assembly language, which uses -; NASM/YASM syntax combined with a large number of macros to provide easy -; abstraction between different calling conventions (x86_32, win64, linux64). -; It also has various other useful features to simplify writing the kind of -; DSP functions that are most often used in x264. - -; Unlike the rest of x264, this file is available under an ISC license, as it -; has significant usefulness outside of x264 and we want it to be available -; to the largest audience possible. Of course, if you modify it for your own -; purposes to add a new feature, we strongly encourage contributing a patch -; as this feature might be useful for others as well. 
Send patches or ideas -; to x264-devel@videolan.org . - -%ifndef private_prefix - %define private_prefix kvz -%endif - -%ifndef public_prefix - %define public_prefix private_prefix -%endif - -%define WIN64 0 -%define UNIX64 0 -%if ARCH_X86_64 - %ifidn __OUTPUT_FORMAT__,win32 - %define WIN64 1 - %elifidn __OUTPUT_FORMAT__,win64 - %define WIN64 1 - %elifidn __OUTPUT_FORMAT__,x64 - %define WIN64 1 - %else - %define UNIX64 1 - %endif -%endif - -%ifdef PREFIX - %define mangle(x) _ %+ x -%else - %define mangle(x) x -%endif - -%macro SECTION_RODATA 0-1 16 - SECTION .rodata align=%1 -%endmacro - -%macro SECTION_TEXT 0-1 16 - SECTION .text align=%1 -%endmacro - -%if WIN64 - %define PIC -%elif ARCH_X86_64 == 0 -; x86_32 doesn't require PIC. -; Some distros prefer shared objects to be PIC, but nothing breaks if -; the code contains a few textrels, so we'll skip that complexity. - %undef PIC -%endif -%ifdef PIC - default rel -%endif - -%macro CPUNOP 1 - %ifdef __YASM_MAJOR__ - CPU %1 - %endif -%endmacro - -; Always use long nops (reduces 0x90 spam in disassembly on x86_32) -CPUNOP amdnop - -; Macros to eliminate most code duplication between x86_32 and x86_64: -; Currently this works only for leaf functions which load all their arguments -; into registers at the start, and make no other use of the stack. Luckily that -; covers most of x264's asm. - -; PROLOGUE: -; %1 = number of arguments. loads them from stack if needed. -; %2 = number of registers used. pushes callee-saved regs if needed. -; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. -; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x, -; MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes), -; and an extra register will be allocated to hold the original stack -; pointer (to not invalidate r0m etc.). To prevent the use of an extra -; register as stack pointer, request a negative stack size. 
-; %4+/%5+ = list of names to define to registers -; PROLOGUE can also be invoked by adding the same options to cglobal - -; e.g. -; cglobal foo, 2,3,0, dst, src, tmp -; declares a function (foo), taking two args (dst and src) and one local variable (tmp) - -; TODO Some functions can use some args directly from the stack. If they're the -; last args then you can just not declare them, but if they're in the middle -; we need more flexible macro. - -; RET: -; Pops anything that was pushed by PROLOGUE, and returns. - -; REP_RET: -; Use this instead of RET if it's a branch target. - -; registers: -; rN and rNq are the native-size register holding function argument N -; rNd, rNw, rNb are dword, word, and byte size -; rNh is the high 8 bits of the word size -; rNm is the original location of arg N (a register or on the stack), dword -; rNmp is native size - -%macro DECLARE_REG 2-3 - %define r%1q %2 - %define r%1d %2d - %define r%1w %2w - %define r%1b %2b - %define r%1h %2h - %if %0 == 2 - %define r%1m %2d - %define r%1mp %2 - %elif ARCH_X86_64 ; memory - %define r%1m rstk + stack_offset + %3 - %define r%1mp qword r %+ %1 %+ m - %else - %define r%1m rstk + stack_offset + %3 - %define r%1mp dword r %+ %1 %+ m - %endif - %define r%1 %2 -%endmacro - -%macro DECLARE_REG_SIZE 3 - %define r%1q r%1 - %define e%1q r%1 - %define r%1d e%1 - %define e%1d e%1 - %define r%1w %1 - %define e%1w %1 - %define r%1h %3 - %define e%1h %3 - %define r%1b %2 - %define e%1b %2 -%if ARCH_X86_64 == 0 - %define r%1 e%1 -%endif -%endmacro - -DECLARE_REG_SIZE ax, al, ah -DECLARE_REG_SIZE bx, bl, bh -DECLARE_REG_SIZE cx, cl, ch -DECLARE_REG_SIZE dx, dl, dh -DECLARE_REG_SIZE si, sil, null -DECLARE_REG_SIZE di, dil, null -DECLARE_REG_SIZE bp, bpl, null - -; t# defines for when per-arch register allocation is more complex than just function arguments - -%macro DECLARE_REG_TMP 1-* - %assign %%i 0 - %rep %0 - CAT_XDEFINE t, %%i, r%1 - %assign %%i %%i+1 - %rotate 1 - %endrep -%endmacro - -%macro 
DECLARE_REG_TMP_SIZE 0-* - %rep %0 - %define t%1q t%1 %+ q - %define t%1d t%1 %+ d - %define t%1w t%1 %+ w - %define t%1h t%1 %+ h - %define t%1b t%1 %+ b - %rotate 1 - %endrep -%endmacro - -DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 - -%if ARCH_X86_64 - %define gprsize 8 -%else - %define gprsize 4 -%endif - -%macro PUSH 1 - push %1 - %ifidn rstk, rsp - %assign stack_offset stack_offset+gprsize - %endif -%endmacro - -%macro POP 1 - pop %1 - %ifidn rstk, rsp - %assign stack_offset stack_offset-gprsize - %endif -%endmacro - -%macro PUSH_IF_USED 1-* - %rep %0 - %if %1 < regs_used - PUSH r%1 - %endif - %rotate 1 - %endrep -%endmacro - -%macro POP_IF_USED 1-* - %rep %0 - %if %1 < regs_used - pop r%1 - %endif - %rotate 1 - %endrep -%endmacro - -%macro LOAD_IF_USED 1-* - %rep %0 - %if %1 < num_args - mov r%1, r %+ %1 %+ mp - %endif - %rotate 1 - %endrep -%endmacro - -%macro SUB 2 - sub %1, %2 - %ifidn %1, rstk - %assign stack_offset stack_offset+(%2) - %endif -%endmacro - -%macro ADD 2 - add %1, %2 - %ifidn %1, rstk - %assign stack_offset stack_offset-(%2) - %endif -%endmacro - -%macro movifnidn 2 - %ifnidn %1, %2 - mov %1, %2 - %endif -%endmacro - -%macro movsxdifnidn 2 - %ifnidn %1, %2 - movsxd %1, %2 - %endif -%endmacro - -%macro ASSERT 1 - %if (%1) == 0 - %error assert failed - %endif -%endmacro - -%macro DEFINE_ARGS 0-* - %ifdef n_arg_names - %assign %%i 0 - %rep n_arg_names - CAT_UNDEF arg_name %+ %%i, q - CAT_UNDEF arg_name %+ %%i, d - CAT_UNDEF arg_name %+ %%i, w - CAT_UNDEF arg_name %+ %%i, h - CAT_UNDEF arg_name %+ %%i, b - CAT_UNDEF arg_name %+ %%i, m - CAT_UNDEF arg_name %+ %%i, mp - CAT_UNDEF arg_name, %%i - %assign %%i %%i+1 - %endrep - %endif - - %xdefine %%stack_offset stack_offset - %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine - %assign %%i 0 - %rep %0 - %xdefine %1q r %+ %%i %+ q - %xdefine %1d r %+ %%i %+ d - %xdefine %1w r %+ %%i %+ w - %xdefine %1h r %+ %%i %+ h - %xdefine %1b r %+ %%i %+ 
b - %xdefine %1m r %+ %%i %+ m - %xdefine %1mp r %+ %%i %+ mp - CAT_XDEFINE arg_name, %%i, %1 - %assign %%i %%i+1 - %rotate 1 - %endrep - %xdefine stack_offset %%stack_offset - %assign n_arg_names %0 -%endmacro - -%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) - %ifnum %1 - %if %1 != 0 - %assign %%stack_alignment ((mmsize + 15) & ~15) - %assign stack_size %1 - %if stack_size < 0 - %assign stack_size -stack_size - %endif - %assign stack_size_padded stack_size - %if WIN64 - %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space - %if mmsize != 8 - %assign xmm_regs_used %2 - %if xmm_regs_used > 8 - %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16 - %endif - %endif - %endif - %if mmsize <= 16 && HAVE_ALIGNED_STACK - %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) - SUB rsp, stack_size_padded - %else - %assign %%reg_num (regs_used - 1) - %xdefine rstk r %+ %%reg_num - ; align stack, and save original stack location directly above - ; it, i.e. in rsp+stack_size_padded, so we can restore the - ; stack in a single instruction (i.e. 
mov rsp, rstk or mov - ; rsp, rsp+stack_size_padded) - mov rstk, rsp - %if %1 < 0 ; need to store rsp on stack - sub rsp, gprsize+stack_size_padded - and rsp, ~(%%stack_alignment-1) - %xdefine rstkm rsp+stack_size_padded - mov rstkm, rstk - %else ; can keep rsp in rstk during whole function - sub rsp, stack_size_padded - and rsp, ~(%%stack_alignment-1) - %xdefine rstkm rstk - %endif - %endif - WIN64_PUSH_XMM - %endif - %endif -%endmacro - -%macro SETUP_STACK_POINTER 1 - %ifnum %1 - %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32) - %if %1 > 0 - %assign regs_used (regs_used + 1) - %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 - %warning "Stack pointer will overwrite register argument" - %endif - %endif - %endif -%endmacro - -%macro DEFINE_ARGS_INTERNAL 3+ - %ifnum %2 - DEFINE_ARGS %3 - %elif %1 == 4 - DEFINE_ARGS %2 - %elif %1 > 4 - DEFINE_ARGS %2, %3 - %endif -%endmacro - -%if WIN64 ; Windows x64 ;================================================= - -DECLARE_REG 0, rcx -DECLARE_REG 1, rdx -DECLARE_REG 2, R8 -DECLARE_REG 3, R9 -DECLARE_REG 4, R10, 40 -DECLARE_REG 5, R11, 48 -DECLARE_REG 6, rax, 56 -DECLARE_REG 7, rdi, 64 -DECLARE_REG 8, rsi, 72 -DECLARE_REG 9, rbx, 80 -DECLARE_REG 10, rbp, 88 -DECLARE_REG 11, R12, 96 -DECLARE_REG 12, R13, 104 -DECLARE_REG 13, R14, 112 -DECLARE_REG 14, R15, 120 - -%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, stack_size, arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 15 - PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 - ALLOC_STACK %4, %3 - %if mmsize != 8 && stack_size == 0 - WIN64_SPILL_XMM %3 - %endif - LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%macro WIN64_PUSH_XMM 0 - ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. 
- %if xmm_regs_used > 6 - movaps rstk + stack_offset + 8, xmm6 - %endif - %if xmm_regs_used > 7 - movaps rstk + stack_offset + 24, xmm7 - %endif - %if xmm_regs_used > 8 - %assign %%i 8 - %rep xmm_regs_used-8 - movaps rsp + (%%i-8)*16 + stack_size + 32, xmm %+ %%i - %assign %%i %%i+1 - %endrep - %endif -%endmacro - -%macro WIN64_SPILL_XMM 1 - %assign xmm_regs_used %1 - ASSERT xmm_regs_used <= 16 - %if xmm_regs_used > 8 - %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32 - SUB rsp, stack_size_padded - %endif - WIN64_PUSH_XMM -%endmacro - -%macro WIN64_RESTORE_XMM_INTERNAL 1 - %assign %%pad_size 0 - %if xmm_regs_used > 8 - %assign %%i xmm_regs_used - %rep xmm_regs_used-8 - %assign %%i %%i-1 - movaps xmm %+ %%i, %1 + (%%i-8)*16 + stack_size + 32 - %endrep - %endif - %if stack_size_padded > 0 - %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0) - mov rsp, rstkm - %else - add %1, stack_size_padded - %assign %%pad_size stack_size_padded - %endif - %endif - %if xmm_regs_used > 7 - movaps xmm7, %1 + stack_offset - %%pad_size + 24 - %endif - %if xmm_regs_used > 6 - movaps xmm6, %1 + stack_offset - %%pad_size + 8 - %endif -%endmacro - -%macro WIN64_RESTORE_XMM 1 - WIN64_RESTORE_XMM_INTERNAL %1 - %assign stack_offset (stack_offset-stack_size_padded) - %assign xmm_regs_used 0 -%endmacro - -%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 - -%macro RET 0 - WIN64_RESTORE_XMM_INTERNAL rsp - POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%elif ARCH_X86_64 ; *nix x64 ;============================================= - -DECLARE_REG 0, rdi -DECLARE_REG 1, rsi -DECLARE_REG 2, rdx -DECLARE_REG 3, rcx -DECLARE_REG 4, R8 -DECLARE_REG 5, R9 -DECLARE_REG 6, rax, 8 -DECLARE_REG 7, R10, 16 -DECLARE_REG 8, R11, 24 -DECLARE_REG 9, rbx, 32 -DECLARE_REG 10, rbp, 40 -DECLARE_REG 11, R12, 48 -DECLARE_REG 12, R13, 56 -DECLARE_REG 13, R14, 64 -DECLARE_REG 14, R15, 72 - 
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, stack_size, arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 15 - PUSH_IF_USED 9, 10, 11, 12, 13, 14 - ALLOC_STACK %4 - LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 - -%macro RET 0 -%if stack_size_padded > 0 -%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 - mov rsp, rstkm -%else - add rsp, stack_size_padded -%endif -%endif - POP_IF_USED 14, 13, 12, 11, 10, 9 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%else ; X86_32 ;============================================================== - -DECLARE_REG 0, eax, 4 -DECLARE_REG 1, ecx, 8 -DECLARE_REG 2, edx, 12 -DECLARE_REG 3, ebx, 16 -DECLARE_REG 4, esi, 20 -DECLARE_REG 5, edi, 24 -DECLARE_REG 6, ebp, 28 -%define rsp esp - -%macro DECLARE_ARG 1-* - %rep %0 - %define r%1m rstk + stack_offset + 4*%1 + 4 - %define r%1mp dword r%1m - %rotate 1 - %endrep -%endmacro - -DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 - -%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, stack_size, arg_names... 
- %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - %if num_args > 7 - %assign num_args 7 - %endif - %if regs_used > 7 - %assign regs_used 7 - %endif - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 7 - PUSH_IF_USED 3, 4, 5, 6 - ALLOC_STACK %4 - LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 - -%macro RET 0 -%if stack_size_padded > 0 -%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 - mov rsp, rstkm -%else - add rsp, stack_size_padded -%endif -%endif - POP_IF_USED 6, 5, 4, 3 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%endif ;====================================================================== - -%if WIN64 == 0 -%macro WIN64_SPILL_XMM 1 -%endmacro -%macro WIN64_RESTORE_XMM 1 -%endmacro -%macro WIN64_PUSH_XMM 0 -%endmacro -%endif - -; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either -; a branch or a branch target. So switch to a 2-byte form of ret in that case. -; We can automatically detect "follows a branch", but not a branch target. -; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) -%macro REP_RET 0 - %if has_epilogue - RET - %else - rep ret - %endif -%endmacro - -%define last_branch_adr $$ -%macro AUTO_REP_RET 0 - %ifndef cpuflags - times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr. 
- %elif notcpuflag(ssse3) - times ((last_branch_adr-$)>>31)+1 rep - %endif - ret -%endmacro - -%macro BRANCH_INSTR 0-* - %rep %0 - %macro %1 1-2 %1 - %2 %1 - %%branch_instr: - %xdefine last_branch_adr %%branch_instr - %endmacro - %rotate 1 - %endrep -%endmacro - -BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp - -%macro TAIL_CALL 2 ; callee, is_nonadjacent - %if has_epilogue - call %1 - RET - %elif %2 - jmp %1 - %endif -%endmacro - -;============================================================================= -; arch-independent part -;============================================================================= - -%assign function_align 16 - -; Begin a function. -; Applies any symbol mangling needed for C linkage, and sets up a define such that -; subsequent uses of the function name automatically refer to the mangled version. -; Appends cpuflags to the function name if cpuflags has been specified. -; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX -; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
-%macro cglobal 1-2+ "" ; name, PROLOGUE args - cglobal_internal 1, %1 %+ SUFFIX, %2 -%endmacro -%macro cvisible 1-2+ "" ; name, PROLOGUE args - cglobal_internal 0, %1 %+ SUFFIX, %2 -%endmacro -%macro cglobal_internal 2-3+ - %if %1 - %xdefine %%FUNCTION_PREFIX private_prefix - %xdefine %%VISIBILITY hidden - %else - %xdefine %%FUNCTION_PREFIX public_prefix - %xdefine %%VISIBILITY - %endif - %ifndef cglobaled_%2 - %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) - %xdefine %2.skip_prologue %2 %+ .skip_prologue - CAT_XDEFINE cglobaled_, %2, 1 - %endif - %xdefine current_function %2 - %ifidn __OUTPUT_FORMAT__,elf - global %2:function %%VISIBILITY - %else - global %2 - %endif - align function_align - %2: - RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer - %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required - %assign stack_offset 0 ; stack pointer offset relative to the return address - %assign stack_size 0 ; amount of stack space that can be freely used inside a function - %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding - %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 - %ifnidn %3, "" - PROLOGUE %3 - %endif -%endmacro - -%macro cextern 1 - %xdefine %1 mangle(private_prefix %+ _ %+ %1) - CAT_XDEFINE cglobaled_, %1, 1 - extern %1 -%endmacro - -; like cextern, but without the prefix -%macro cextern_naked 1 - %xdefine %1 mangle(%1) - CAT_XDEFINE cglobaled_, %1, 1 - extern %1 -%endmacro - -%macro const 1-2+ - %xdefine %1 mangle(private_prefix %+ _ %+ %1) - %ifidn __OUTPUT_FORMAT__,elf - global %1:data hidden - %else - global %1 - %endif - %1: %2 -%endmacro - -; This is needed for ELF, otherwise the GNU linker assumes the stack is -; executable by default. 
-%ifidn __OUTPUT_FORMAT__,elf -SECTION .note.GNU-stack noalloc noexec nowrite progbits -%endif - -; cpuflags - -%assign cpuflags_mmx (1<<0) -%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx -%assign cpuflags_3dnow (1<<2) | cpuflags_mmx -%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow -%assign cpuflags_sse (1<<4) | cpuflags_mmx2 -%assign cpuflags_sse2 (1<<5) | cpuflags_sse -%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 -%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 -%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 -%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 -%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 -%assign cpuflags_avx (1<<11)| cpuflags_sse42 -%assign cpuflags_xop (1<<12)| cpuflags_avx -%assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_avx2 (1<<14)| cpuflags_avx -%assign cpuflags_fma3 (1<<15)| cpuflags_avx - -%assign cpuflags_cache32 (1<<16) -%assign cpuflags_cache64 (1<<17) -%assign cpuflags_slowctz (1<<18) -%assign cpuflags_lzcnt (1<<19) -%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<21) -%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt -%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 - -%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) -%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) - -; Takes up to 2 cpuflags from the above list. -; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. -; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. 
-%macro INIT_CPUFLAGS 0-2 - CPUNOP amdnop - %if %0 >= 1 - %xdefine cpuname %1 - %assign cpuflags cpuflags_%1 - %if %0 >= 2 - %xdefine cpuname %1_%2 - %assign cpuflags cpuflags | cpuflags_%2 - %endif - %xdefine SUFFIX _ %+ cpuname - %if cpuflag(avx) - %assign avx_enabled 1 - %endif - %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) - %define mova movaps - %define movu movups - %define movnta movntps - %endif - %if cpuflag(aligned) - %define movu mova - %elifidn %1, sse3 - %define movu lddqu - %endif - %if ARCH_X86_64 == 0 && notcpuflag(sse2) - CPUNOP basicnop - %endif - %else - %xdefine SUFFIX - %undef cpuname - %undef cpuflags - %endif -%endmacro - -; Merge mmx and sse* -; m# is a simd register of the currently selected size -; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# -; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# -; (All 3 remain in sync through SWAP.) - -%macro CAT_XDEFINE 3 - %xdefine %1%2 %3 -%endmacro - -%macro CAT_UNDEF 2 - %undef %1%2 -%endmacro - -%macro INIT_MMX 0-1+ - %assign avx_enabled 0 - %define RESET_MM_PERMUTATION INIT_MMX %1 - %define mmsize 8 - %define num_mmregs 8 - %define mova movq - %define movu movq - %define movh movd - %define movnta movntq - %assign %%i 0 - %rep 8 - CAT_XDEFINE m, %%i, mm %+ %%i - CAT_XDEFINE nmm, %%i, %%i - %assign %%i %%i+1 - %endrep - %rep 8 - CAT_UNDEF m, %%i - CAT_UNDEF nmm, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -%macro INIT_XMM 0-1+ - %assign avx_enabled 0 - %define RESET_MM_PERMUTATION INIT_XMM %1 - %define mmsize 16 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif - %define mova movdqa - %define movu movdqu - %define movh movq - %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, xmm %+ %%i - CAT_XDEFINE nxmm, %%i, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -%macro INIT_YMM 0-1+ - %assign avx_enabled 1 - %define 
RESET_MM_PERMUTATION INIT_YMM %1 - %define mmsize 32 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif - %define mova movdqa - %define movu movdqu - %undef movh - %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, ymm %+ %%i - CAT_XDEFINE nymm, %%i, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -INIT_XMM - -%macro DECLARE_MMCAST 1 - %define mmmm%1 mm%1 - %define mmxmm%1 mm%1 - %define mmymm%1 mm%1 - %define xmmmm%1 mm%1 - %define xmmxmm%1 xmm%1 - %define xmmymm%1 xmm%1 - %define ymmmm%1 mm%1 - %define ymmxmm%1 xmm%1 - %define ymmymm%1 ymm%1 - %define xm%1 xmm %+ m%1 - %define ym%1 ymm %+ m%1 -%endmacro - -%assign i 0 -%rep 16 - DECLARE_MMCAST i -%assign i i+1 -%endrep - -; I often want to use macros that permute their arguments. e.g. there's no -; efficient way to implement butterfly or transpose or dct without swapping some -; arguments. -; -; I would like to not have to manually keep track of the permutations: -; If I insert a permutation in the middle of a function, it should automatically -; change everything that follows. For more complex macros I may also have multiple -; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. -; -; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that -; permutes its arguments. It's equivalent to exchanging the contents of the -; registers, except that this way you exchange the register names instead, so it -; doesn't cost any cycles. - -%macro PERMUTE 2-* ; takes a list of pairs to swap -%rep %0/2 - %xdefine %%tmp%2 m%2 - %rotate 2 -%endrep -%rep %0/2 - %xdefine m%1 %%tmp%2 - CAT_XDEFINE n, m%1, %1 - %rotate 2 -%endrep -%endmacro - -%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) -%ifnum %1 ; SWAP 0, 1, ... - SWAP_INTERNAL_NUM %1, %2 -%else ; SWAP m0, m1, ... 
- SWAP_INTERNAL_NAME %1, %2 -%endif -%endmacro - -%macro SWAP_INTERNAL_NUM 2-* - %rep %0-1 - %xdefine %%tmp m%1 - %xdefine m%1 m%2 - %xdefine m%2 %%tmp - CAT_XDEFINE n, m%1, %1 - CAT_XDEFINE n, m%2, %2 - %rotate 1 - %endrep -%endmacro - -%macro SWAP_INTERNAL_NAME 2-* - %xdefine %%args n %+ %1 - %rep %0-1 - %xdefine %%args %%args, n %+ %2 - %rotate 1 - %endrep - SWAP_INTERNAL_NUM %%args -%endmacro - -; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later -; calls to that function will automatically load the permutation, so values can -; be returned in mmregs. -%macro SAVE_MM_PERMUTATION 0-1 - %if %0 - %xdefine %%f %1_m - %else - %xdefine %%f current_function %+ _m - %endif - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE %%f, %%i, m %+ %%i - %assign %%i %%i+1 - %endrep -%endmacro - -%macro LOAD_MM_PERMUTATION 1 ; name to load from - %ifdef %1_m0 - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, %1_m %+ %%i - CAT_XDEFINE n, m %+ %%i, %%i - %assign %%i %%i+1 - %endrep - %endif -%endmacro - -; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't -%macro call 1 - call_internal %1, %1 %+ SUFFIX -%endmacro -%macro call_internal 2 - %xdefine %%i %1 - %ifndef cglobaled_%1 - %ifdef cglobaled_%2 - %xdefine %%i %2 - %endif - %endif - call %%i - LOAD_MM_PERMUTATION %%i -%endmacro - -; Substitutions that reduce instruction size but are functionally equivalent -%macro add 2 - %ifnum %2 - %if %2==128 - sub %1, -128 - %else - add %1, %2 - %endif - %else - add %1, %2 - %endif -%endmacro - -%macro sub 2 - %ifnum %2 - %if %2==128 - add %1, -128 - %else - sub %1, %2 - %endif - %else - sub %1, %2 - %endif -%endmacro - -;============================================================================= -; AVX abstraction layer -;============================================================================= - -%assign i 0 -%rep 16 - %if i < 8 - CAT_XDEFINE sizeofmm, i, 8 - %endif - CAT_XDEFINE sizeofxmm, i, 16 - CAT_XDEFINE 
sizeofymm, i, 32 -%assign i i+1 -%endrep -%undef i - -%macro CHECK_AVX_INSTR_EMU 3-* - %xdefine %%opcode %1 - %xdefine %%dst %2 - %rep %0-2 - %ifidn %%dst, %3 - %error non-avx emulation of ``%%opcode'' is not supported - %endif - %rotate 1 - %endrep -%endmacro - -;%1 == instruction -;%2 == 1 if float, 0 if int -;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise -;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not -;%5+: operands -%macro RUN_AVX_INSTR 5-8+ - %ifnum sizeof%6 - %assign %%sizeofreg sizeof%6 - %elifnum sizeof%5 - %assign %%sizeofreg sizeof%5 - %else - %assign %%sizeofreg mmsize - %endif - %assign %%emulate_avx 0 - %if avx_enabled && %%sizeofreg >= 16 - %xdefine %%instr v%1 - %else - %xdefine %%instr %1 - %if %0 >= 7+%3 - %assign %%emulate_avx 1 - %endif - %endif - - %if %%emulate_avx - %xdefine %%src1 %6 - %xdefine %%src2 %7 - %ifnidn %5, %6 - %if %0 >= 8 - CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8 - %else - CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7 - %endif - %if %4 && %3 == 0 - %ifnid %7 - ; 3-operand AVX instructions with a memory arg can only have it in src2, - ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). - ; So, if the instruction is commutative with a memory arg, swap them. - %xdefine %%src1 %7 - %xdefine %%src2 %6 - %endif - %endif - %if %%sizeofreg == 8 - MOVQ %5, %%src1 - %elif %2 - MOVAPS %5, %%src1 - %else - MOVDQA %5, %%src1 - %endif - %endif - %if %0 >= 8 - %1 %5, %%src2, %8 - %else - %1 %5, %%src2 - %endif - %elif %0 >= 8 - %%instr %5, %6, %7, %8 - %elif %0 == 7 - %%instr %5, %6, %7 - %elif %0 == 6 - %%instr %5, %6 - %else - %%instr %5 - %endif -%endmacro - -;%1 == instruction -;%2 == 1 if float, 0 if int -;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise -;%4 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not -%macro AVX_INSTR 1-4 0, 1, 0 - %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4 - %ifidn %2, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1 - %elifidn %3, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2 - %elifidn %4, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3 - %elifidn %5, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4 - %else - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5 - %endif - %endmacro -%endmacro - -; Instructions with both VEX and non-VEX encodings -; Non-destructive instructions are written without parameters -AVX_INSTR addpd, 1, 0, 1 -AVX_INSTR addps, 1, 0, 1 -AVX_INSTR addsd, 1, 0, 1 -AVX_INSTR addss, 1, 0, 1 -AVX_INSTR addsubpd, 1, 0, 0 -AVX_INSTR addsubps, 1, 0, 0 -AVX_INSTR aesdec, 0, 0, 0 -AVX_INSTR aesdeclast, 0, 0, 0 -AVX_INSTR aesenc, 0, 0, 0 -AVX_INSTR aesenclast, 0, 0, 0 -AVX_INSTR aesimc -AVX_INSTR aeskeygenassist -AVX_INSTR andnpd, 1, 0, 0 -AVX_INSTR andnps, 1, 0, 0 -AVX_INSTR andpd, 1, 0, 1 -AVX_INSTR andps, 1, 0, 1 -AVX_INSTR blendpd, 1, 0, 0 -AVX_INSTR blendps, 1, 0, 0 -AVX_INSTR blendvpd, 1, 0, 0 -AVX_INSTR blendvps, 1, 0, 0 -AVX_INSTR cmppd, 1, 1, 0 -AVX_INSTR cmpps, 1, 1, 0 -AVX_INSTR cmpsd, 1, 1, 0 -AVX_INSTR cmpss, 1, 1, 0 -AVX_INSTR comisd -AVX_INSTR comiss -AVX_INSTR cvtdq2pd -AVX_INSTR cvtdq2ps -AVX_INSTR cvtpd2dq -AVX_INSTR cvtpd2ps -AVX_INSTR cvtps2dq -AVX_INSTR cvtps2pd -AVX_INSTR cvtsd2si -AVX_INSTR cvtsd2ss -AVX_INSTR cvtsi2sd -AVX_INSTR cvtsi2ss -AVX_INSTR cvtss2sd -AVX_INSTR cvtss2si -AVX_INSTR cvttpd2dq -AVX_INSTR cvttps2dq -AVX_INSTR cvttsd2si -AVX_INSTR cvttss2si -AVX_INSTR divpd, 1, 0, 0 -AVX_INSTR divps, 1, 0, 0 -AVX_INSTR divsd, 1, 0, 0 -AVX_INSTR divss, 1, 0, 0 -AVX_INSTR dppd, 1, 1, 0 -AVX_INSTR dpps, 1, 1, 0 -AVX_INSTR extractps -AVX_INSTR haddpd, 1, 0, 0 -AVX_INSTR haddps, 1, 0, 0 -AVX_INSTR hsubpd, 1, 0, 0 -AVX_INSTR hsubps, 1, 0, 0 -AVX_INSTR insertps, 1, 1, 0 -AVX_INSTR lddqu -AVX_INSTR ldmxcsr -AVX_INSTR maskmovdqu -AVX_INSTR maxpd, 1, 0, 1 
-AVX_INSTR maxps, 1, 0, 1 -AVX_INSTR maxsd, 1, 0, 1 -AVX_INSTR maxss, 1, 0, 1 -AVX_INSTR minpd, 1, 0, 1 -AVX_INSTR minps, 1, 0, 1 -AVX_INSTR minsd, 1, 0, 1 -AVX_INSTR minss, 1, 0, 1 -AVX_INSTR movapd -AVX_INSTR movaps -AVX_INSTR movd -AVX_INSTR movddup -AVX_INSTR movdqa -AVX_INSTR movdqu -AVX_INSTR movhlps, 1, 0, 0 -AVX_INSTR movhpd, 1, 0, 0 -AVX_INSTR movhps, 1, 0, 0 -AVX_INSTR movlhps, 1, 0, 0 -AVX_INSTR movlpd, 1, 0, 0 -AVX_INSTR movlps, 1, 0, 0 -AVX_INSTR movmskpd -AVX_INSTR movmskps -AVX_INSTR movntdq -AVX_INSTR movntdqa -AVX_INSTR movntpd -AVX_INSTR movntps -AVX_INSTR movq -AVX_INSTR movsd, 1, 0, 0 -AVX_INSTR movshdup -AVX_INSTR movsldup -AVX_INSTR movss, 1, 0, 0 -AVX_INSTR movupd -AVX_INSTR movups -AVX_INSTR mpsadbw, 0, 1, 0 -AVX_INSTR mulpd, 1, 0, 1 -AVX_INSTR mulps, 1, 0, 1 -AVX_INSTR mulsd, 1, 0, 1 -AVX_INSTR mulss, 1, 0, 1 -AVX_INSTR orpd, 1, 0, 1 -AVX_INSTR orps, 1, 0, 1 -AVX_INSTR pabsb -AVX_INSTR pabsd -AVX_INSTR pabsw -AVX_INSTR packsswb, 0, 0, 0 -AVX_INSTR packssdw, 0, 0, 0 -AVX_INSTR packuswb, 0, 0, 0 -AVX_INSTR packusdw, 0, 0, 0 -AVX_INSTR paddb, 0, 0, 1 -AVX_INSTR paddw, 0, 0, 1 -AVX_INSTR paddd, 0, 0, 1 -AVX_INSTR paddq, 0, 0, 1 -AVX_INSTR paddsb, 0, 0, 1 -AVX_INSTR paddsw, 0, 0, 1 -AVX_INSTR paddusb, 0, 0, 1 -AVX_INSTR paddusw, 0, 0, 1 -AVX_INSTR palignr, 0, 1, 0 -AVX_INSTR pand, 0, 0, 1 -AVX_INSTR pandn, 0, 0, 0 -AVX_INSTR pavgb, 0, 0, 1 -AVX_INSTR pavgw, 0, 0, 1 -AVX_INSTR pblendvb, 0, 0, 0 -AVX_INSTR pblendw, 0, 1, 0 -AVX_INSTR pclmulqdq, 0, 1, 0 -AVX_INSTR pcmpestri -AVX_INSTR pcmpestrm -AVX_INSTR pcmpistri -AVX_INSTR pcmpistrm -AVX_INSTR pcmpeqb, 0, 0, 1 -AVX_INSTR pcmpeqw, 0, 0, 1 -AVX_INSTR pcmpeqd, 0, 0, 1 -AVX_INSTR pcmpeqq, 0, 0, 1 -AVX_INSTR pcmpgtb, 0, 0, 0 -AVX_INSTR pcmpgtw, 0, 0, 0 -AVX_INSTR pcmpgtd, 0, 0, 0 -AVX_INSTR pcmpgtq, 0, 0, 0 -AVX_INSTR pextrb -AVX_INSTR pextrd -AVX_INSTR pextrq -AVX_INSTR pextrw -AVX_INSTR phaddw, 0, 0, 0 -AVX_INSTR phaddd, 0, 0, 0 -AVX_INSTR phaddsw, 0, 0, 0 -AVX_INSTR phminposuw -AVX_INSTR phsubw, 
0, 0, 0 -AVX_INSTR phsubd, 0, 0, 0 -AVX_INSTR phsubsw, 0, 0, 0 -AVX_INSTR pinsrb, 0, 1, 0 -AVX_INSTR pinsrd, 0, 1, 0 -AVX_INSTR pinsrq, 0, 1, 0 -AVX_INSTR pinsrw, 0, 1, 0 -AVX_INSTR pmaddwd, 0, 0, 1 -AVX_INSTR pmaddubsw, 0, 0, 0 -AVX_INSTR pmaxsb, 0, 0, 1 -AVX_INSTR pmaxsw, 0, 0, 1 -AVX_INSTR pmaxsd, 0, 0, 1 -AVX_INSTR pmaxub, 0, 0, 1 -AVX_INSTR pmaxuw, 0, 0, 1 -AVX_INSTR pmaxud, 0, 0, 1 -AVX_INSTR pminsb, 0, 0, 1 -AVX_INSTR pminsw, 0, 0, 1 -AVX_INSTR pminsd, 0, 0, 1 -AVX_INSTR pminub, 0, 0, 1 -AVX_INSTR pminuw, 0, 0, 1 -AVX_INSTR pminud, 0, 0, 1 -AVX_INSTR pmovmskb -AVX_INSTR pmovsxbw -AVX_INSTR pmovsxbd -AVX_INSTR pmovsxbq -AVX_INSTR pmovsxwd -AVX_INSTR pmovsxwq -AVX_INSTR pmovsxdq -AVX_INSTR pmovzxbw -AVX_INSTR pmovzxbd -AVX_INSTR pmovzxbq -AVX_INSTR pmovzxwd -AVX_INSTR pmovzxwq -AVX_INSTR pmovzxdq -AVX_INSTR pmuldq, 0, 0, 1 -AVX_INSTR pmulhrsw, 0, 0, 1 -AVX_INSTR pmulhuw, 0, 0, 1 -AVX_INSTR pmulhw, 0, 0, 1 -AVX_INSTR pmullw, 0, 0, 1 -AVX_INSTR pmulld, 0, 0, 1 -AVX_INSTR pmuludq, 0, 0, 1 -AVX_INSTR por, 0, 0, 1 -AVX_INSTR psadbw, 0, 0, 1 -AVX_INSTR pshufb, 0, 0, 0 -AVX_INSTR pshufd -AVX_INSTR pshufhw -AVX_INSTR pshuflw -AVX_INSTR psignb, 0, 0, 0 -AVX_INSTR psignw, 0, 0, 0 -AVX_INSTR psignd, 0, 0, 0 -AVX_INSTR psllw, 0, 0, 0 -AVX_INSTR pslld, 0, 0, 0 -AVX_INSTR psllq, 0, 0, 0 -AVX_INSTR pslldq, 0, 0, 0 -AVX_INSTR psraw, 0, 0, 0 -AVX_INSTR psrad, 0, 0, 0 -AVX_INSTR psrlw, 0, 0, 0 -AVX_INSTR psrld, 0, 0, 0 -AVX_INSTR psrlq, 0, 0, 0 -AVX_INSTR psrldq, 0, 0, 0 -AVX_INSTR psubb, 0, 0, 0 -AVX_INSTR psubw, 0, 0, 0 -AVX_INSTR psubd, 0, 0, 0 -AVX_INSTR psubq, 0, 0, 0 -AVX_INSTR psubsb, 0, 0, 0 -AVX_INSTR psubsw, 0, 0, 0 -AVX_INSTR psubusb, 0, 0, 0 -AVX_INSTR psubusw, 0, 0, 0 -AVX_INSTR ptest -AVX_INSTR punpckhbw, 0, 0, 0 -AVX_INSTR punpckhwd, 0, 0, 0 -AVX_INSTR punpckhdq, 0, 0, 0 -AVX_INSTR punpckhqdq, 0, 0, 0 -AVX_INSTR punpcklbw, 0, 0, 0 -AVX_INSTR punpcklwd, 0, 0, 0 -AVX_INSTR punpckldq, 0, 0, 0 -AVX_INSTR punpcklqdq, 0, 0, 0 -AVX_INSTR pxor, 0, 0, 1 -AVX_INSTR rcpps, 
1, 0, 0 -AVX_INSTR rcpss, 1, 0, 0 -AVX_INSTR roundpd -AVX_INSTR roundps -AVX_INSTR roundsd -AVX_INSTR roundss -AVX_INSTR rsqrtps, 1, 0, 0 -AVX_INSTR rsqrtss, 1, 0, 0 -AVX_INSTR shufpd, 1, 1, 0 -AVX_INSTR shufps, 1, 1, 0 -AVX_INSTR sqrtpd, 1, 0, 0 -AVX_INSTR sqrtps, 1, 0, 0 -AVX_INSTR sqrtsd, 1, 0, 0 -AVX_INSTR sqrtss, 1, 0, 0 -AVX_INSTR stmxcsr -AVX_INSTR subpd, 1, 0, 0 -AVX_INSTR subps, 1, 0, 0 -AVX_INSTR subsd, 1, 0, 0 -AVX_INSTR subss, 1, 0, 0 -AVX_INSTR ucomisd -AVX_INSTR ucomiss -AVX_INSTR unpckhpd, 1, 0, 0 -AVX_INSTR unpckhps, 1, 0, 0 -AVX_INSTR unpcklpd, 1, 0, 0 -AVX_INSTR unpcklps, 1, 0, 0 -AVX_INSTR xorpd, 1, 0, 1 -AVX_INSTR xorps, 1, 0, 1 - -; 3DNow instructions, for sharing code between AVX, SSE and 3DN -AVX_INSTR pfadd, 1, 0, 1 -AVX_INSTR pfsub, 1, 0, 0 -AVX_INSTR pfmul, 1, 0, 1 - -; base-4 constants for shuffles -%assign i 0 -%rep 256 - %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) - %if j < 10 - CAT_XDEFINE q000, j, i - %elif j < 100 - CAT_XDEFINE q00, j, i - %elif j < 1000 - CAT_XDEFINE q0, j, i - %else - CAT_XDEFINE q, j, i - %endif -%assign i i+1 -%endrep -%undef i -%undef j - -%macro FMA_INSTR 3 - %macro %1 4-7 %1, %2, %3 - %if cpuflag(xop) - v%5 %1, %2, %3, %4 - %else - %6 %1, %2, %3 - %7 %1, %4 - %endif - %endmacro -%endmacro - -FMA_INSTR pmacsdd, pmulld, paddd -FMA_INSTR pmacsww, pmullw, paddw -FMA_INSTR pmadcswd, pmaddwd, paddd - -; convert FMA4 to FMA3 if possible -%macro FMA4_INSTR 4 - %macro %1 4-8 %1, %2, %3, %4 - %if cpuflag(fma4) - v%5 %1, %2, %3, %4 - %elifidn %1, %2 - v%6 %1, %4, %3 ; %1 = %1 * %3 + %4 - %elifidn %1, %3 - v%7 %1, %2, %4 ; %1 = %2 * %1 + %4 - %elifidn %1, %4 - v%8 %1, %2, %3 ; %1 = %2 * %3 + %1 - %else - %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported - %endif - %endmacro -%endmacro - -FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd -FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps -FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd -FMA4_INSTR fmaddss, fmadd132ss, 
fmadd213ss, fmadd231ss - -FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd -FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps -FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd -FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps - -FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd -FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps -FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd -FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss - -FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd -FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps -FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd -FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss - -FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd -FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps -FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd -FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss - -; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug -%if ARCH_X86_64 == 0 -%macro vpbroadcastq 2 -%if sizeof%1 == 16 - movddup %1, %2 -%else - vbroadcastsd %1, %2 -%endif -%endmacro -%endif
View file
kvazaar-2.2.0.tar.gz/src/strategies/x86_asm
Deleted
-(directory)
View file
kvazaar-2.2.0.tar.gz/src/strategies/x86_asm/picture-x86-asm-sad.asm
Deleted
@@ -1,385 +0,0 @@ -;/***************************************************************************** -; * This file is part of Kvazaar HEVC encoder. -; * -; * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors -; * All rights reserved. -; * -; * Redistribution and use in source and binary forms, with or without modification, -; * are permitted provided that the following conditions are met: -; * -; * * Redistributions of source code must retain the above copyright notice, this -; * list of conditions and the following disclaimer. -; * -; * * Redistributions in binary form must reproduce the above copyright notice, this -; * list of conditions and the following disclaimer in the documentation and/or -; * other materials provided with the distribution. -; * -; * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its -; * contributors may be used to endorse or promote products derived from -; * this software without specific prior written permission. -; * -; * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -; * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -; * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-; * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; ****************************************************************************/ - -%include "x86inc.asm" - -;cglobal and RET macros are from the x86.inc -;they push and pop the necessary registers to -;stack depending on the operating system - -;Usage: cglobal name, %1, %2, %3 -;1%: Number of arguments -;2%: Number of registers used -;3%: Number of xmm registers used. -;More info in x86inc.asm - -SECTION .text - -;Set x86inc.asm macros to use avx and xmm registers -INIT_XMM avx - -;KVZ_SAD_4X4 -;Calculates SAD of the 16 consequtive bytes in memory -;r0 address of the first value(current frame) -;r1 address of the first value(reference frame) - -cglobal sad_4x4, 2, 2, 2 - - ;Load 16 bytes of both frames - vmovdqu m0, r0 - vmovdqu m1, r1 - - ;Calculate SAD. The results are written in - ;m015:0 and m079:64 - vpsadbw m0, m1 - - ;Sum the results - vmovhlps m1, m0 - vpaddw m0, m1 - - ;Write the result to eax - vmovd eax, m0 - - RET - - -;KVZ_SAD_4X4_STRIDE -;Calculates SAD of a 4x4 block inside a frame with stride -;r0 address of the first value(current) -;r1 address of the first value(reference) -;r2 stride - -cglobal sad_4x4_stride, 3, 3, 2 - - ;Load 4 times 4 bytes of both frames - vpinsrd m0, r0, 0 - add r0, r2 - vpinsrd m0, r0, 1 - vpinsrd m0, r0+r2, 2 - vpinsrd m0, r0+r2*2, 3 - - vpinsrd m1, r1, 0 - add r1, r2 - vpinsrd m1, r1, 1 - vpinsrd m1, r1+r2, 2 - vpinsrd m1, r1+r2*2, 3 - - vpsadbw m0, m1 - - vmovhlps m1, m0 - vpaddw m0, m1 - - vmovd eax, m0 - - RET - - -;KVZ_SAD_8X8 -;Calculates SAD of the 64 consequtive bytes in memory -;r0 address of the first value(current) -;r1 address of the first value(reference) - -cglobal sad_8x8, 2, 2, 5 - - ;Load the first half of both frames - vmovdqu m0, r0 - vmovdqu m2, r0+16 - - vmovdqu m1, r1 - vmovdqu m3, r1+16 - - ;Calculate SADs for both - vpsadbw m0, m1 - vpsadbw m2, m3 - - ;Sum - vpaddw m0, m2 - - ;Repeat for the latter half - vmovdqu m1, 
r0+16*2 - vmovdqu m3, r0+16*3 - - vmovdqu m2, r1+16*2 - vmovdqu m4, r1+16*3 - - vpsadbw m1, m2 - vpsadbw m3, m4 - - vpaddw m1, m3 - - ;Sum all the SADs - vpaddw m0, m1 - - vmovhlps m1, m0 - vpaddw m0, m1 - - vmovd eax, m0 - - RET - - -;KVZ_SAD_8X8_STRIDE -;Calculates SAD of a 8x8 block inside a frame with stride -;r0 address of the first value(current) -;r1 address of the first value(reference) -;r2 stride - -cglobal sad_8x8_stride, 3, 3, 5 - - ;Zero m0 register - vpxor m0, m0 - - ;Load the first half to m1 and m3 registers(cur) - ;Current frame - ;Load to the high 64 bits of xmm - vmovhpd m1, r0 - add r0, r2 - ;Load to the low 64 bits - vmovlpd m1, r0 - - vmovhpd m3, r0+r2 - vmovlpd m3, r0+r2*2 - ;lea calculates the address to r0, - ;but doesn't load anything from - ;the memory. Equivalent for - ;two add r0, r2 instructions. - lea r0, r0+r2*2 - add r0, r2 - - ;Reference frame - vmovhpd m2, r1 - add r1, r2 - vmovlpd m2, r1 - - vmovhpd m4, r1+r2 - vmovlpd m4, r1+r2*2 - lea r1, r1+r2*2 - add r1, r2 - - vpsadbw m1, m2 - vpsadbw m3, m4 - - vpaddw m0, m1 - vpaddw m0, m3 - - ;Repeat for the other half - vmovhpd m1, r0 - add r0, r2 - vmovlpd m1, r0 - - vmovhpd m3, r0+r2 - vmovlpd m3, r0+r2*2 - lea r0, r0+r2*2 - add r0, r2 - - vmovhpd m2, r1 - add r1, r2 - vmovlpd m2, r1 - - vmovhpd m4, r1+r2 - vmovlpd m4, r1+r2*2 - lea r1, r1+r2*2 - add r1, r2 - - vpsadbw m1, m2 - vpsadbw m3, m4 - - vpaddw m0, m1 - vpaddw m0, m3 - - vmovhlps m1, m0 - vpaddw m0, m1 - - vmovd eax, m0 - - RET - - -;KVZ_SAD_16X16 -;Calculates SAD of the 256 consequtive bytes in memory -;r0 address of the first value(current) -;r1 address of the first value(reference) - -cglobal sad_16x16, 2, 2, 5 - - ;Zero m4 - vpxor m4, m4 - - %assign i 0 - - ;Repeat 8 times. 
- %rep 8 - - ;Load the next to rows of the current frame - vmovdqu m0, r0 + 16 * i - vmovdqu m2, r0 + 16 * (i + 1) - - ;Load the next to rows of the reference frame - vmovdqu m1, r1 + 16 * i - vmovdqu m3, r1 + 16 * (i + 1) - - vpsadbw m0, m1 - vpsadbw m2, m3 - - ;Accumulate SADs to m4 - vpaddw m4, m0 - vpaddw m4, m2 - - %assign i i+2 - - %endrep - - ;Calculate the final sum - vmovhlps m0, m4 - vpaddw m4, m0 - - vmovd eax, m4 - - RET - - -;KVZ_SAD_16X16_STRIDE -;Calculates SAD of a 16x16 block inside a frame with stride -;r0 address of the first value(current) -;r1 address of the first value(reference) -;r2 stride - -cglobal sad_16x16_stride, 3, 3, 5 - - vpxor m4, m4 - - %rep 8 - - ; Load the next 2 rows from rec_buf to m0 and m2 - vmovdqu m0, r0 - vmovdqu m2, r0 + r2 - lea r0, r0 + r2*2 - - ; Load the next 2 rows from ref_buf to m1 and m3 - vmovdqu m1, r1 - vmovdqu m3, r1 + r2 - lea r1, r1 + r2*2 - - vpsadbw m0, m1 - vpsadbw m2, m3 - - vpaddw m4, m0 - vpaddw m4, m2 - - %endrep - - vmovhlps m0, m4 - vpaddw m4, m0 - - vmovd eax, m4 - - RET - - -;KVZ_SAD_32x32_STRIDE -;Calculates SAD of a 32x32 block inside a frame with stride -;r0 address of the first value(current) -;r1 address of the first value(reference) -;r2 stride -cglobal sad_32x32_stride, 3, 3, 5 - vpxor m4, m4 - - ; Handle 2 lines per iteration - %rep 16 - vmovdqu m0, r0 - vmovdqu m1, r0 + 16 - vmovdqu m2, r0 + r2 - vmovdqu m3, r0 + r2 + 16 - lea r0, r0 + 2 * r2 - - vpsadbw m0, r1 - vpsadbw m1, r1 + 16 - vpsadbw m2, r1 + r2 - vpsadbw m3, r1 + r2 + 16 - lea r1, r1 + 2 * r2 - - vpaddd m4, m0 - vpaddd m4, m1 - vpaddd m4, m2 - vpaddd m4, m3 - %endrep - - vmovhlps m0, m4 - vpaddd m4, m0 - - vmovd eax, m4 - - RET - - -;KVZ_SAD_64x64_STRIDE -;Calculates SAD of a 64x64 block inside a frame with stride -;r0 address of the first value(current) -;r1 address of the first value(reference) -;r2 stride -cglobal sad_64x64_stride, 3, 4, 5 - vpxor m4, m4 ; sum accumulation register - mov r3, 4 ; number of iterations in the 
loop - -Process16Lines: - ; Intel optimization manual says to not unroll beyond 500 instructions. - ; Didn't seem to have much of an affect on Ivy Bridge or Haswell, but - ; smaller is better, when speed is the same, right? - %rep 16 - vmovdqu m0, r0 - vmovdqu m1, r0 + 1*16 - vmovdqu m2, r0 + 2*16 - vmovdqu m3, r0 + 3*16 - - vpsadbw m0, r1 - vpsadbw m1, r1 + 1*16 - vpsadbw m2, r1 + 2*16 - vpsadbw m3, r1 + 3*16 - - lea r0, r0 + r2 - lea r1, r1 + r2 - - vpaddd m4, m0 - vpaddd m4, m1 - vpaddd m4, m2 - vpaddd m4, m3 - %endrep - - dec r3 - jnz Process16Lines - - vmovhlps m0, m4 - vpaddd m4, m0 - - vmovd eax, m4 - - RET
View file
kvazaar-2.2.0.tar.gz/src/strategies/x86_asm/picture-x86-asm-sad.h
Deleted
@@ -1,56 +0,0 @@ -#ifndef _PICTURE_X86_ASM_SAD_H_ -#define _PICTURE_X86_ASM_SAD_H_ -/***************************************************************************** - * This file is part of Kvazaar HEVC encoder. - * - * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or - * other materials provided with the distribution. - * - * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS - ****************************************************************************/ - -/** - * \ingroup Optimization - * \file - * Optimizations for AVX, utilizing ASM implementations. - */ - -#include "global.h" // IWYU pragma: keep -#include "kvazaar.h" - -#if KVZ_BIT_DEPTH == 8 -unsigned kvz_sad_4x4_avx(const uint8_t*, const uint8_t*); -unsigned kvz_sad_8x8_avx(const uint8_t*, const uint8_t*); -unsigned kvz_sad_16x16_avx(const uint8_t*, const uint8_t*); - -unsigned kvz_sad_4x4_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); -unsigned kvz_sad_8x8_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); -unsigned kvz_sad_16x16_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); -unsigned kvz_sad_32x32_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); -unsigned kvz_sad_64x64_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); -#endif // KVZ_BIT_DEPTH == 8 - -#endif
View file
kvazaar-2.2.0.tar.gz/src/strategies/x86_asm/picture-x86-asm-satd.asm
Deleted
@@ -1,575 +0,0 @@ -;/***************************************************************************** -; * This file is part of Kvazaar HEVC encoder. -; * -; * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors -; * All rights reserved. -; * -; * Redistribution and use in source and binary forms, with or without modification, -; * are permitted provided that the following conditions are met: -; * -; * * Redistributions of source code must retain the above copyright notice, this -; * list of conditions and the following disclaimer. -; * -; * * Redistributions in binary form must reproduce the above copyright notice, this -; * list of conditions and the following disclaimer in the documentation and/or -; * other materials provided with the distribution. -; * -; * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its -; * contributors may be used to endorse or promote products derived from -; * this software without specific prior written permission. -; * -; * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -; * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -; * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-; * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; ****************************************************************************/ - -%include "x86inc.asm" - -;cglobal and RET macros are from the x86.inc -;they push and pop the necessary registers to -;stack depending on the operating system - -;Usage: cglobal name, %1, %2, %3 -;1%: Number of arguments -;2%: Number of registers used -;3%: Number of xmm registers used. -;More info in x86inc.asm - -SECTION .text - -;Set x86inc.asm macros to use avx and xmm registers -INIT_XMM avx - -;KVZ_ZERO_EXTEND_WD -;zero extend all packed words in xmm to dwords in 2 xmm registers -;%1 source register -;%2 lower destination register -;%3 higher destination register - -%macro KVZ_ZERO_EXTEND_WD 3 - - ;Zero extend high 64 bits - vmovhlps %3, %1 - vpmovzxwd %3, %3 - ;Zero extend low 64 bits - vpmovzxwd %2, %1 - -%endmacro ; KVZ_ZERO_EXTEND_WD - -; Use nondestructive horizontal add and sub to calculate both at the same time. -; TODO: It would probably be possible to do this with 3 registers (destructive vphsubw). -; args: -; 1, 2: input registers -; 3, 4: output registers - -%macro SATD_HORIZONTAL_SUB_AND_ADD 4 - - ; TODO: It might be possible to do this with 3 registers? 
- - ;First stage - vphaddw %3, %1, %2 - vphsubw %4, %1, %2 - - ;Second stage - vphaddw %1, %3, %4 - vphsubw %2, %3, %4 - - ;Third stage - vphaddw %3, %1, %2 - vphsubw %4, %1, %2 - -%endmacro ; SATD_HORIZONTAL_SUB_AND_ADD - -;KVZ_SATD_8X8_STRIDE -;Calculates SATD of a 8x8 block inside a frame with stride -;r0 address of the first value(reference) -;r1 address of the first value(current) -;r2 stride -; -;The Result is written in the register r4 - -%macro KVZ_SATD_8X8_STRIDE 0 - - ;Calculate differences of the 8 rows into - ;registers m0-m7 - vpmovzxbw m0, r0 - vpmovzxbw m7, r2 - vpsubw m0, m7 - - vpmovzxbw m1, r0+r1 - vpmovzxbw m7, r2+r3 - vpsubw m1, m7 - - ;Set r0 and r2 2 rows forward - lea r0, r0+r1*2 - lea r2, r2+r3*2 - - vpmovzxbw m2, r0 - vpmovzxbw m7, r2 - vpsubw m2, m7 - - vpmovzxbw m3, r0+r1 - vpmovzxbw m7, r2+r3 - vpsubw m3, m7 - - lea r0, r0+r1*2 - lea r2, r2+r3*2 - - vpmovzxbw m4, r0 - vpmovzxbw m7, r2 - vpsubw m4, m7 - - vpmovzxbw m5, r0+r1 - vpmovzxbw m7, r2+r3 - vpsubw m5, m7 - - lea r0, r0+r1*2 - lea r2, r2+r3*2 - - vpmovzxbw m6, r0 - vpmovzxbw m7, r2 - vpsubw m6, m7 - - ;32-bit AVX doesn't have registers - ;xmm8-xmm15, use stack instead - - %if ARCH_X86_64 - vpmovzxbw m7, r0+r1 - vpmovzxbw m8, r2+r3 - vpsubw m7, m8 - %else - %define temp0 esp+16*3 - %define temp1 esp+16*2 - %define temp2 esp+16*1 - %define temp3 esp+16*0 - - ;Reserve memory for 4 x 128 bits. - sub esp, 16*4 - - vpmovzxbw m7, r2+r3 - vmovdqu temp0, m7 - vpmovzxbw m7, r0+r1 - vpsubw m7, temp0 - - ;Put rows 5-8 to stack - vmovdqu temp0, m4 - vmovdqu temp1, m5 - vmovdqu temp2, m6 - vmovdqu temp3, m7 - %endif - - ;Hadamard transform (FWHT algorithm) - ;Horizontal transform - - %if ARCH_X86_64 - ;Calculate horizontal transform for each row. - ;Transforms of two rows are interleaved in register pairs. - ;(m8 and m9, m10 and m11,...) 
- - SATD_HORIZONTAL_SUB_AND_ADD m0, m1, m8, m9 - SATD_HORIZONTAL_SUB_AND_ADD m2, m3, m10, m11 - SATD_HORIZONTAL_SUB_AND_ADD m4, m5, m12, m13 - SATD_HORIZONTAL_SUB_AND_ADD m6, m7, m14, m15 - - %else - ;Calculate horizontal transforms for the first four rows. - ;Then load the other four into the registers and store - ;ready transforms in the stack. - ;Input registers are m0-m3, results are written in - ;registers m4-m7 (and memory). - - SATD_HORIZONTAL_SUB_AND_ADD m0, m1, m4, m5 - SATD_HORIZONTAL_SUB_AND_ADD m2, m3, m6, m7 - - vmovdqu m3, temp3 - vmovdqu m2, temp2 - vmovdqu m1, temp1 - vmovdqu m0, temp0 - - vmovdqu temp3, m7 - vmovdqu temp2, m6 - vmovdqu temp1, m5 - vmovdqu temp0, m4 - - SATD_HORIZONTAL_SUB_AND_ADD m0, m1, m4, m5 - SATD_HORIZONTAL_SUB_AND_ADD m2, m3, m6, m7 - %endif - - - ;Vertical transform - ;Transform columns of the 8x8 block. - ;First sum the interleaved horizontally - ;transformed values with one horizontal add - ;for each pair of rows. Then calculate - ;with regular packed additions and - ;subtractions. - - %if ARCH_X86_64 - ;Horizontally transformed values are in registers m8-m15 - ;Results are written in m0-m7 - - ;First stage - vphaddw m0, m8, m9 - vphsubw m1, m8, m9 - - vphaddw m2, m10, m11 - vphsubw m3, m10, m11 - - vphaddw m4, m12, m13 - vphsubw m5, m12, m13 - - vphaddw m6, m14, m15 - vphsubw m7, m14, m15 - - ;Second stage - vpaddw m8, m0, m2 - vpaddw m9, m1, m3 - vpsubw m10, m0, m2 - vpsubw m11, m1, m3 - - vpaddw m12, m4, m6 - vpaddw m13, m5, m7 - vpsubw m14, m4, m6 - vpsubw m15, m5, m7 - - ;Third stage - vpaddw m0, m8, m12 - vpaddw m1, m9, m13 - vpaddw m2, m10, m14 - vpaddw m3, m11, m15 - - vpsubw m4, m8, m12 - vpsubw m5, m9, m13 - vpsubw m6, m10, m14 - vpsubw m7, m11, m15 - - %else - ;Transformed values are in registers m4-m7 - ;and in memory(temp0-temp3). Transformed values - ;are written in m4-m7. Also calculate absolute - ;values for them and accumulate into ymm0. 
- - ;First stage - vphaddw m0, m4, m5 - vphsubw m1, m4, m5 - - vphaddw m2, m6, m7 - vphsubw m3, m6, m7 - - ;Second stage - vpaddw m4, m0, m2 - vpaddw m5, m1, m3 - vpsubw m6, m0, m2 - vpsubw m7, m1, m3 - - vmovdqu m3, temp3 - vmovdqu m2, temp2 - vmovdqu m1, temp1 - vmovdqu m0, temp0 - - vmovdqu temp3, m7 - vmovdqu temp2, m6 - vmovdqu temp1, m5 - vmovdqu temp0, m4 - - ;First stage (second half) - vphaddw m4, m0, m1 - vphsubw m5, m0, m1 - - vphaddw m6, m2, m3 - vphsubw m7, m2, m3 - - ;Second stage (second half) - vpaddw m0, m4, m6 - vpaddw m1, m5, m7 - vpsubw m2, m4, m6 - vpsubw m3, m5, m7 - - ;Third stage - vpaddw m4, m0, temp0 - vpaddw m5, m1, temp1 - vpsubw m6, m0, temp0 - vpsubw m7, m1, temp1 - - ;Calculate the absolute values and - ;zero extend 16-bit values to 32-bit - ;values. Then sum the values. - - vpabsw m4, m4 - KVZ_ZERO_EXTEND_WD m4, m4, m1 - vpaddd m4, m1 - - vpabsw m5, m5 - KVZ_ZERO_EXTEND_WD m5, m5, m1 - vpaddd m5, m1 - - vpabsw m6, m6 - KVZ_ZERO_EXTEND_WD m6, m6, m1 - vpaddd m6, m1 - - vpabsw m7, m7 - KVZ_ZERO_EXTEND_WD m7, m7, m1 - vpaddd m7, m1 - - vpaddd m0, m4, m5 - vpaddd m0, m6 - vpaddd m0, m7 - - ;Repeat for the rest - vpaddw m4, m2, temp2 - vpaddw m5, m3, temp3 - vpsubw m6, m2, temp2 - vpsubw m7, m3, temp3 - - vpabsw m4, m4 - KVZ_ZERO_EXTEND_WD m4, m4, m1 - vpaddd m4, m1 - - vpabsw m5, m5 - KVZ_ZERO_EXTEND_WD m5, m5, m1 - vpaddd m5, m1 - - vpabsw m6, m6 - KVZ_ZERO_EXTEND_WD m6, m6, m1 - vpaddd m6, m1 - - vpabsw m7, m7 - KVZ_ZERO_EXTEND_WD m7, m7, m1 - vpaddd m7, m1 - - ;Sum the other half of the packed results to ymm4 - vpaddd m4, m5 - vpaddd m4, m6 - vpaddd m4, m7 - - ;Sum all packed results to ymm0 - vpaddd m0, m4 - - %endif - - %if ARCH_X86_64 - - ;Calculate the absolute values and - ;zero extend 16-bit values to 32-bit - ;values. In other words: extend xmm to - ;corresponding ymm. 
- - vpabsw m0, m0 - KVZ_ZERO_EXTEND_WD m0, m0, m8 - vpaddd m0, m8 - - vpabsw m1, m1 - KVZ_ZERO_EXTEND_WD m1, m1, m8 - vpaddd m1, m8 - - vpabsw m2, m2 - KVZ_ZERO_EXTEND_WD m2, m2, m8 - vpaddd m1, m8 - - vpabsw m3, m3 - KVZ_ZERO_EXTEND_WD m3, m3, m8 - vpaddd m3, m8 - - vpabsw m4, m4 - KVZ_ZERO_EXTEND_WD m4, m4, m8 - vpaddd m4, m8 - - vpabsw m5, m5 - KVZ_ZERO_EXTEND_WD m5, m5, m8 - vpaddd m5, m8 - - vpabsw m6, m6 - KVZ_ZERO_EXTEND_WD m6, m6, m8 - vpaddd m6, m8 - - vpabsw m7, m7 - KVZ_ZERO_EXTEND_WD m7, m7, m8 - vpaddd m7, m8 - - ;Calculate packed sum of transformed values to ymm0 - vpaddd m0, m1 - vpaddd m0, m2 - vpaddd m0, m3 - vpaddd m0, m4 - vpaddd m0, m5 - vpaddd m0, m6 - vpaddd m0, m7 - %endif - - ;Sum the packed values to m032:0 - vphaddd m0, m0 - vphaddd m0, m0 - - ;The result is in the lowest 32 bits in m0 - vmovd r4d, m0 - - ;8x8 Hadamard transform requires - ;adding 2 and dividing by 4 - add r4, 2 - shr r4, 2 - - ;Zero high 128 bits of ymm registers to - ;prevent AVX-SSE transition penalty. - vzeroupper - - %if ARCH_X86_64 == 0 - add esp, 16*4 - %endif - -%endmacro ; KVZ_SATD_8X8_STRIDE - -;KVZ_SATD_4X4 -;Calculates SATD of the 16 consequtive bytes in memory -;r0 address of the first value(current) -;r1 address of the first value(reference) - -cglobal satd_4x4, 2, 2, 6 - - ;Load 8 bytes from memory and zero extend - ;to 16-bit values. Calculate difference. 
- vpmovzxbw m0, r0 - vpmovzxbw m2, r1 - vpsubw m0, m2 - - vpmovzxbw m1, r0+8 - vpmovzxbw m3, r1+8 - vpsubw m1, m3 - - ;Hadamard transform - ;Horizontal phase - ;First stage - vphaddw m4, m0, m1 - vphsubw m5, m0, m1 - ;Second stage - vphaddw m0, m4, m5 - vphsubw m1, m4, m5 - - ;Vertical phase - ;First stage - vphaddw m4, m0, m1 - vphsubw m5, m0, m1 - ;Second stage - vphaddw m0, m4, m5 - vphsubw m1, m4, m5 - - ;Calculate absolute values - vpabsw m0, m0 - vpabsw m1, m1 - - ;Sum the all the transformed values - vpaddw m0, m1 - - vphaddw m0, m0 - vphaddw m0, m0 - vphaddw m0, m0 - - ;Extract the lowest 16 bits of m0 - ;into eax - vpextrw eax, m0, 0 - - ;4x4 Hadamard transform requires - ;Addition of 1 and division by 2 - add eax, 1 - shr eax, 1 - - RET - - - -;KVZ_SATD_8X8 -;Calculates SATD of a 8x8 block inside a frame with stride -;r0 address of the first value(reference) -;r1 address of the first value(current) -;r2 stride - -%if ARCH_X86_64 - cglobal satd_8x8, 4, 5, 16 -%else - cglobal satd_8x8, 4, 5, 8 -%endif - - ;Set arguments - mov r2, r1 - mov r1, 8 - mov r3, 8 - - ;Calculate 8x8 SATD. Result is written - ;in the register r4. - KVZ_SATD_8X8_STRIDE - mov rax, r4 - RET - -;KVZ_SATD_NXN -;Calculates SATD of a NxN block inside a frame with stride -;r0 address of the first value(reference) -;r1 address of the first value(current) - -%macro KVZ_SATD_NXN 1 - - %if ARCH_X86_64 - cglobal satd_%1x%1, 2, 7, 16 - %else - cglobal satd_%1x%1, 2, 7, 8 - %endif - - ;Set arguments - mov r2, r1 - mov r1, %1 - mov r3, %1 - - ;Zero r5 and r6 - xor r5, r5 - xor r6, r6 - - ;Calculate SATDs of each 8x8 sub-blocks - ;and accumulate the results in r6. Repeat yloop - ;N times. Repeat xloop N times. r4 and r5 are counters - ;for the loops. - - .yloop - - ;zero r4 - xor r4, r4 - - .xloop - push r4 - - ;Calculate SATD of the sub-block. Result is - ;written in the register r4. 
- KVZ_SATD_8X8_STRIDE - add r6, r4 - - ;Set r2 and r0 to the next sub-block - ;on the same row - sub r2, 6*%1-8 - sub r0, 6*%1-8 - - pop r4 - add r4, 8 - cmp r4, %1 - jne .xloop - - ;Set r2 and r0 to the first sub-block - ;on the next row(of 8x8 sub-blocks) - add r2, 7*%1 - add r0, 7*%1 - - add r5, 8 - cmp r5, %1 - jne .yloop - - mov rax, r6 - RET - -%endmacro ; KVZ_SATD_NXN - -KVZ_SATD_NXN 16 -KVZ_SATD_NXN 32 -KVZ_SATD_NXN 64
View file
kvazaar-2.2.0.tar.gz/src/strategies/x86_asm/picture-x86-asm-satd.h
Deleted
@@ -1,50 +0,0 @@ -#ifndef _PICTURE_X86_ASM_SATD_H_ -#define _PICTURE_X86_ASM_SATD_H_ -/***************************************************************************** - * This file is part of Kvazaar HEVC encoder. - * - * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or - * other materials provided with the distribution. - * - * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS - ****************************************************************************/ - -/** - * \ingroup Optimization - * \file - * Optimizations for AVX, utilizing ASM implementations. - */ - -#include "global.h" // IWYU pragma: keep - - -unsigned kvz_satd_4x4_avx(const kvz_pixel *org, const kvz_pixel *cur); -unsigned kvz_satd_8x8_avx(const kvz_pixel *org, const kvz_pixel *cur); -unsigned kvz_satd_16x16_avx(const kvz_pixel *org, const kvz_pixel *cur); -unsigned kvz_satd_32x32_avx(const kvz_pixel *org, const kvz_pixel *cur); -unsigned kvz_satd_64x64_avx(const kvz_pixel *org, const kvz_pixel *cur); - -#endif
View file
kvazaar-2.2.0.tar.gz/src/strategies/x86_asm/picture-x86-asm.c
Deleted
@@ -1,132 +0,0 @@ -/***************************************************************************** - * This file is part of Kvazaar HEVC encoder. - * - * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or - * other materials provided with the distribution. - * - * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS - ****************************************************************************/ - -#include "strategies/x86_asm/picture-x86-asm.h" - -#if defined(KVZ_COMPILE_ASM) -#include "kvazaar.h" -#if KVZ_BIT_DEPTH == 8 -#include <stdlib.h> - -#include "strategies/x86_asm/picture-x86-asm-sad.h" -#include "strategies/x86_asm/picture-x86-asm-satd.h" -#include "strategies/sse41/picture-sse41.h" -#include "strategyselector.h" - - -static unsigned kvz_sad_32x32_avx(const uint8_t *data1, const uint8_t *data2) -{ - unsigned sad = 0; - sad += kvz_sad_16x16_avx(data1, data2); - sad += kvz_sad_16x16_avx(data1 + 8 * 32, data2 + 8 * 32); - sad += kvz_sad_16x16_avx(data1 + 16 * 32, data2 + 16 * 32); - sad += kvz_sad_16x16_avx(data1 + 24 * 32, data2 + 24 * 32); - return sad; -} - -static unsigned kvz_sad_64x64_avx(const uint8_t *data1, const uint8_t *data2) -{ - unsigned sad = 0; - sad += kvz_sad_32x32_avx(data1, data2); - sad += kvz_sad_32x32_avx(data1 + 16 * 64, data2 + 16 * 64); - sad += kvz_sad_32x32_avx(data1 + 32 * 64, data2 + 32 * 64); - sad += kvz_sad_32x32_avx(data1 + 48 * 64, data2 + 48 * 64); - return sad; -} - -static unsigned kvz_sad_other_avx(const uint8_t *data1, const uint8_t *data2, - int width, int height, - unsigned stride) -{ - unsigned sad = 0; - - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - sad += abs(data1y * stride + x - data2y * stride + x); - } - } - - return sad; -} - -static unsigned reg_sad_x86_asm(const uint8_t *data1, const uint8_t * data2, - const int width, const int height, - const unsigned stride1, const unsigned stride2) -{ - if (width == height) { - if (width == 8) { - return kvz_sad_8x8_stride_avx(data1, data2, stride1); - } else if (width == 16) { - return kvz_sad_16x16_stride_avx(data1, data2, stride1); - } else if (width == 32) { - return kvz_sad_32x32_stride_avx(data1, data2, stride1); - } else if (width == 64) { - return 
kvz_sad_64x64_stride_avx(data1, data2, stride1); - } - } - - if (width * height >= 16) { - // Call the vectorized general SAD SSE41 function when the block - // is big enough to make it worth it. - return kvz_reg_sad_sse41(data1, data2, width, height, stride1, stride2); - } else { - return kvz_sad_other_avx(data1, data2, width, height, stride1); - } -} - -#endif // KVZ_BIT_DEPTH == 8 -#endif //defined(KVZ_COMPILE_ASM) - -int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth) -{ - bool success = true; -#if defined(KVZ_COMPILE_ASM) -#if KVZ_BIT_DEPTH == 8 - if (bitdepth == 8){ - success &= kvz_strategyselector_register(opaque, "reg_sad", "x86_asm_avx", 30, ®_sad_x86_asm); - - success &= kvz_strategyselector_register(opaque, "sad_4x4", "x86_asm_avx", 30, &kvz_sad_4x4_avx); - success &= kvz_strategyselector_register(opaque, "sad_8x8", "x86_asm_avx", 30, &kvz_sad_8x8_avx); - success &= kvz_strategyselector_register(opaque, "sad_16x16", "x86_asm_avx", 30, &kvz_sad_16x16_avx); - success &= kvz_strategyselector_register(opaque, "sad_32x32", "x86_asm_avx", 30, &kvz_sad_32x32_avx); - success &= kvz_strategyselector_register(opaque, "sad_64x64", "x86_asm_avx", 30, &kvz_sad_64x64_avx); - - success &= kvz_strategyselector_register(opaque, "satd_4x4", "x86_asm_avx", 30, &kvz_satd_4x4_avx); - success &= kvz_strategyselector_register(opaque, "satd_8x8", "x86_asm_avx", 30, &kvz_satd_8x8_avx); - success &= kvz_strategyselector_register(opaque, "satd_16x16", "x86_asm_avx", 30, &kvz_satd_16x16_avx); - success &= kvz_strategyselector_register(opaque, "satd_32x32", "x86_asm_avx", 30, &kvz_satd_32x32_avx); - success &= kvz_strategyselector_register(opaque, "satd_64x64", "x86_asm_avx", 30, &kvz_satd_64x64_avx); - } -#endif // KVZ_BIT_DEPTH == 8 -#endif //!defined(KVZ_COMPILE_ASM) - return success; -}
View file
kvazaar-2.2.0.tar.gz/src/strategies/x86_asm/picture-x86-asm.h
Deleted
@@ -1,46 +0,0 @@ -#ifndef STRATEGIES_PICTURE_X86_ASM_H_ -#define STRATEGIES_PICTURE_X86_ASM_H_ -/***************************************************************************** - * This file is part of Kvazaar HEVC encoder. - * - * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or - * other materials provided with the distribution. - * - * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS - ****************************************************************************/ - -/** - * \ingroup Optimization - * \file - * Optimizations for AVX, utilizing ASM implementations. - */ - -#include "global.h" // IWYU pragma: keep - - -int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth); - -#endif //STRATEGIES_PICTURE_X86_ASM_H_
View file
kvazaar-2.2.0.tar.gz/src/strategies/x86_asm/x86inc.asm
Deleted
@@ -1,1466 +0,0 @@ -;***************************************************************************** -;* x86inc.asm: x264asm abstraction layer -;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project -;* -;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Anton Mitrofanov <BugMaster@narod.ru> -;* Jason Garrett-Glaser <darkshikari@gmail.com> -;* Henrik Gramner <henrik@gramner.com> -;* -;* Permission to use, copy, modify, and/or distribute this software for any -;* purpose with or without fee is hereby granted, provided that the above -;* copyright notice and this permission notice appear in all copies. -;* -;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -;***************************************************************************** - -; This is a header file for the x264ASM assembly language, which uses -; NASM/YASM syntax combined with a large number of macros to provide easy -; abstraction between different calling conventions (x86_32, win64, linux64). -; It also has various other useful features to simplify writing the kind of -; DSP functions that are most often used in x264. - -; Unlike the rest of x264, this file is available under an ISC license, as it -; has significant usefulness outside of x264 and we want it to be available -; to the largest audience possible. Of course, if you modify it for your own -; purposes to add a new feature, we strongly encourage contributing a patch -; as this feature might be useful for others as well. 
Send patches or ideas -; to x264-devel@videolan.org . - -%ifndef private_prefix - %define private_prefix kvz -%endif - -%ifndef public_prefix - %define public_prefix private_prefix -%endif - -%define WIN64 0 -%define UNIX64 0 -%if ARCH_X86_64 - %ifidn __OUTPUT_FORMAT__,win32 - %define WIN64 1 - %elifidn __OUTPUT_FORMAT__,win64 - %define WIN64 1 - %elifidn __OUTPUT_FORMAT__,x64 - %define WIN64 1 - %else - %define UNIX64 1 - %endif -%endif - -%ifdef PREFIX - %define mangle(x) _ %+ x -%else - %define mangle(x) x -%endif - -%macro SECTION_RODATA 0-1 16 - SECTION .rodata align=%1 -%endmacro - -%macro SECTION_TEXT 0-1 16 - SECTION .text align=%1 -%endmacro - -%if WIN64 - %define PIC -%elif ARCH_X86_64 == 0 -; x86_32 doesn't require PIC. -; Some distros prefer shared objects to be PIC, but nothing breaks if -; the code contains a few textrels, so we'll skip that complexity. - %undef PIC -%endif -%ifdef PIC - default rel -%endif - -%macro CPUNOP 1 - %ifdef __YASM_MAJOR__ - CPU %1 - %endif -%endmacro - -; Always use long nops (reduces 0x90 spam in disassembly on x86_32) -CPUNOP amdnop - -; Macros to eliminate most code duplication between x86_32 and x86_64: -; Currently this works only for leaf functions which load all their arguments -; into registers at the start, and make no other use of the stack. Luckily that -; covers most of x264's asm. - -; PROLOGUE: -; %1 = number of arguments. loads them from stack if needed. -; %2 = number of registers used. pushes callee-saved regs if needed. -; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. -; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x, -; MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes), -; and an extra register will be allocated to hold the original stack -; pointer (to not invalidate r0m etc.). To prevent the use of an extra -; register as stack pointer, request a negative stack size. 
-; %4+/%5+ = list of names to define to registers -; PROLOGUE can also be invoked by adding the same options to cglobal - -; e.g. -; cglobal foo, 2,3,0, dst, src, tmp -; declares a function (foo), taking two args (dst and src) and one local variable (tmp) - -; TODO Some functions can use some args directly from the stack. If they're the -; last args then you can just not declare them, but if they're in the middle -; we need more flexible macro. - -; RET: -; Pops anything that was pushed by PROLOGUE, and returns. - -; REP_RET: -; Use this instead of RET if it's a branch target. - -; registers: -; rN and rNq are the native-size register holding function argument N -; rNd, rNw, rNb are dword, word, and byte size -; rNh is the high 8 bits of the word size -; rNm is the original location of arg N (a register or on the stack), dword -; rNmp is native size - -%macro DECLARE_REG 2-3 - %define r%1q %2 - %define r%1d %2d - %define r%1w %2w - %define r%1b %2b - %define r%1h %2h - %if %0 == 2 - %define r%1m %2d - %define r%1mp %2 - %elif ARCH_X86_64 ; memory - %define r%1m rstk + stack_offset + %3 - %define r%1mp qword r %+ %1 %+ m - %else - %define r%1m rstk + stack_offset + %3 - %define r%1mp dword r %+ %1 %+ m - %endif - %define r%1 %2 -%endmacro - -%macro DECLARE_REG_SIZE 3 - %define r%1q r%1 - %define e%1q r%1 - %define r%1d e%1 - %define e%1d e%1 - %define r%1w %1 - %define e%1w %1 - %define r%1h %3 - %define e%1h %3 - %define r%1b %2 - %define e%1b %2 -%if ARCH_X86_64 == 0 - %define r%1 e%1 -%endif -%endmacro - -DECLARE_REG_SIZE ax, al, ah -DECLARE_REG_SIZE bx, bl, bh -DECLARE_REG_SIZE cx, cl, ch -DECLARE_REG_SIZE dx, dl, dh -DECLARE_REG_SIZE si, sil, null -DECLARE_REG_SIZE di, dil, null -DECLARE_REG_SIZE bp, bpl, null - -; t# defines for when per-arch register allocation is more complex than just function arguments - -%macro DECLARE_REG_TMP 1-* - %assign %%i 0 - %rep %0 - CAT_XDEFINE t, %%i, r%1 - %assign %%i %%i+1 - %rotate 1 - %endrep -%endmacro - -%macro 
DECLARE_REG_TMP_SIZE 0-* - %rep %0 - %define t%1q t%1 %+ q - %define t%1d t%1 %+ d - %define t%1w t%1 %+ w - %define t%1h t%1 %+ h - %define t%1b t%1 %+ b - %rotate 1 - %endrep -%endmacro - -DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 - -%if ARCH_X86_64 - %define gprsize 8 -%else - %define gprsize 4 -%endif - -%macro PUSH 1 - push %1 - %ifidn rstk, rsp - %assign stack_offset stack_offset+gprsize - %endif -%endmacro - -%macro POP 1 - pop %1 - %ifidn rstk, rsp - %assign stack_offset stack_offset-gprsize - %endif -%endmacro - -%macro PUSH_IF_USED 1-* - %rep %0 - %if %1 < regs_used - PUSH r%1 - %endif - %rotate 1 - %endrep -%endmacro - -%macro POP_IF_USED 1-* - %rep %0 - %if %1 < regs_used - pop r%1 - %endif - %rotate 1 - %endrep -%endmacro - -%macro LOAD_IF_USED 1-* - %rep %0 - %if %1 < num_args - mov r%1, r %+ %1 %+ mp - %endif - %rotate 1 - %endrep -%endmacro - -%macro SUB 2 - sub %1, %2 - %ifidn %1, rstk - %assign stack_offset stack_offset+(%2) - %endif -%endmacro - -%macro ADD 2 - add %1, %2 - %ifidn %1, rstk - %assign stack_offset stack_offset-(%2) - %endif -%endmacro - -%macro movifnidn 2 - %ifnidn %1, %2 - mov %1, %2 - %endif -%endmacro - -%macro movsxdifnidn 2 - %ifnidn %1, %2 - movsxd %1, %2 - %endif -%endmacro - -%macro ASSERT 1 - %if (%1) == 0 - %error assert failed - %endif -%endmacro - -%macro DEFINE_ARGS 0-* - %ifdef n_arg_names - %assign %%i 0 - %rep n_arg_names - CAT_UNDEF arg_name %+ %%i, q - CAT_UNDEF arg_name %+ %%i, d - CAT_UNDEF arg_name %+ %%i, w - CAT_UNDEF arg_name %+ %%i, h - CAT_UNDEF arg_name %+ %%i, b - CAT_UNDEF arg_name %+ %%i, m - CAT_UNDEF arg_name %+ %%i, mp - CAT_UNDEF arg_name, %%i - %assign %%i %%i+1 - %endrep - %endif - - %xdefine %%stack_offset stack_offset - %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine - %assign %%i 0 - %rep %0 - %xdefine %1q r %+ %%i %+ q - %xdefine %1d r %+ %%i %+ d - %xdefine %1w r %+ %%i %+ w - %xdefine %1h r %+ %%i %+ h - %xdefine %1b r %+ %%i %+ 
b - %xdefine %1m r %+ %%i %+ m - %xdefine %1mp r %+ %%i %+ mp - CAT_XDEFINE arg_name, %%i, %1 - %assign %%i %%i+1 - %rotate 1 - %endrep - %xdefine stack_offset %%stack_offset - %assign n_arg_names %0 -%endmacro - -%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) - %ifnum %1 - %if %1 != 0 - %assign %%stack_alignment ((mmsize + 15) & ~15) - %assign stack_size %1 - %if stack_size < 0 - %assign stack_size -stack_size - %endif - %assign stack_size_padded stack_size - %if WIN64 - %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space - %if mmsize != 8 - %assign xmm_regs_used %2 - %if xmm_regs_used > 8 - %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16 - %endif - %endif - %endif - %if mmsize <= 16 && HAVE_ALIGNED_STACK - %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) - SUB rsp, stack_size_padded - %else - %assign %%reg_num (regs_used - 1) - %xdefine rstk r %+ %%reg_num - ; align stack, and save original stack location directly above - ; it, i.e. in rsp+stack_size_padded, so we can restore the - ; stack in a single instruction (i.e. 
mov rsp, rstk or mov - ; rsp, rsp+stack_size_padded) - mov rstk, rsp - %if %1 < 0 ; need to store rsp on stack - sub rsp, gprsize+stack_size_padded - and rsp, ~(%%stack_alignment-1) - %xdefine rstkm rsp+stack_size_padded - mov rstkm, rstk - %else ; can keep rsp in rstk during whole function - sub rsp, stack_size_padded - and rsp, ~(%%stack_alignment-1) - %xdefine rstkm rstk - %endif - %endif - WIN64_PUSH_XMM - %endif - %endif -%endmacro - -%macro SETUP_STACK_POINTER 1 - %ifnum %1 - %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32) - %if %1 > 0 - %assign regs_used (regs_used + 1) - %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 - %warning "Stack pointer will overwrite register argument" - %endif - %endif - %endif -%endmacro - -%macro DEFINE_ARGS_INTERNAL 3+ - %ifnum %2 - DEFINE_ARGS %3 - %elif %1 == 4 - DEFINE_ARGS %2 - %elif %1 > 4 - DEFINE_ARGS %2, %3 - %endif -%endmacro - -%if WIN64 ; Windows x64 ;================================================= - -DECLARE_REG 0, rcx -DECLARE_REG 1, rdx -DECLARE_REG 2, R8 -DECLARE_REG 3, R9 -DECLARE_REG 4, R10, 40 -DECLARE_REG 5, R11, 48 -DECLARE_REG 6, rax, 56 -DECLARE_REG 7, rdi, 64 -DECLARE_REG 8, rsi, 72 -DECLARE_REG 9, rbx, 80 -DECLARE_REG 10, rbp, 88 -DECLARE_REG 11, R12, 96 -DECLARE_REG 12, R13, 104 -DECLARE_REG 13, R14, 112 -DECLARE_REG 14, R15, 120 - -%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, stack_size, arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 15 - PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 - ALLOC_STACK %4, %3 - %if mmsize != 8 && stack_size == 0 - WIN64_SPILL_XMM %3 - %endif - LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%macro WIN64_PUSH_XMM 0 - ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. 
- %if xmm_regs_used > 6 - movaps rstk + stack_offset + 8, xmm6 - %endif - %if xmm_regs_used > 7 - movaps rstk + stack_offset + 24, xmm7 - %endif - %if xmm_regs_used > 8 - %assign %%i 8 - %rep xmm_regs_used-8 - movaps rsp + (%%i-8)*16 + stack_size + 32, xmm %+ %%i - %assign %%i %%i+1 - %endrep - %endif -%endmacro - -%macro WIN64_SPILL_XMM 1 - %assign xmm_regs_used %1 - ASSERT xmm_regs_used <= 16 - %if xmm_regs_used > 8 - %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32 - SUB rsp, stack_size_padded - %endif - WIN64_PUSH_XMM -%endmacro - -%macro WIN64_RESTORE_XMM_INTERNAL 1 - %assign %%pad_size 0 - %if xmm_regs_used > 8 - %assign %%i xmm_regs_used - %rep xmm_regs_used-8 - %assign %%i %%i-1 - movaps xmm %+ %%i, %1 + (%%i-8)*16 + stack_size + 32 - %endrep - %endif - %if stack_size_padded > 0 - %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0) - mov rsp, rstkm - %else - add %1, stack_size_padded - %assign %%pad_size stack_size_padded - %endif - %endif - %if xmm_regs_used > 7 - movaps xmm7, %1 + stack_offset - %%pad_size + 24 - %endif - %if xmm_regs_used > 6 - movaps xmm6, %1 + stack_offset - %%pad_size + 8 - %endif -%endmacro - -%macro WIN64_RESTORE_XMM 1 - WIN64_RESTORE_XMM_INTERNAL %1 - %assign stack_offset (stack_offset-stack_size_padded) - %assign xmm_regs_used 0 -%endmacro - -%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 - -%macro RET 0 - WIN64_RESTORE_XMM_INTERNAL rsp - POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%elif ARCH_X86_64 ; *nix x64 ;============================================= - -DECLARE_REG 0, rdi -DECLARE_REG 1, rsi -DECLARE_REG 2, rdx -DECLARE_REG 3, rcx -DECLARE_REG 4, R8 -DECLARE_REG 5, R9 -DECLARE_REG 6, rax, 8 -DECLARE_REG 7, R10, 16 -DECLARE_REG 8, R11, 24 -DECLARE_REG 9, rbx, 32 -DECLARE_REG 10, rbp, 40 -DECLARE_REG 11, R12, 48 -DECLARE_REG 12, R13, 56 -DECLARE_REG 13, R14, 64 -DECLARE_REG 14, R15, 72 - 
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, stack_size, arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 15 - PUSH_IF_USED 9, 10, 11, 12, 13, 14 - ALLOC_STACK %4 - LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 - -%macro RET 0 -%if stack_size_padded > 0 -%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 - mov rsp, rstkm -%else - add rsp, stack_size_padded -%endif -%endif - POP_IF_USED 14, 13, 12, 11, 10, 9 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%else ; X86_32 ;============================================================== - -DECLARE_REG 0, eax, 4 -DECLARE_REG 1, ecx, 8 -DECLARE_REG 2, edx, 12 -DECLARE_REG 3, ebx, 16 -DECLARE_REG 4, esi, 20 -DECLARE_REG 5, edi, 24 -DECLARE_REG 6, ebp, 28 -%define rsp esp - -%macro DECLARE_ARG 1-* - %rep %0 - %define r%1m rstk + stack_offset + 4*%1 + 4 - %define r%1mp dword r%1m - %rotate 1 - %endrep -%endmacro - -DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 - -%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, stack_size, arg_names... 
- %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - %if num_args > 7 - %assign num_args 7 - %endif - %if regs_used > 7 - %assign regs_used 7 - %endif - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 7 - PUSH_IF_USED 3, 4, 5, 6 - ALLOC_STACK %4 - LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 - -%macro RET 0 -%if stack_size_padded > 0 -%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 - mov rsp, rstkm -%else - add rsp, stack_size_padded -%endif -%endif - POP_IF_USED 6, 5, 4, 3 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%endif ;====================================================================== - -%if WIN64 == 0 -%macro WIN64_SPILL_XMM 1 -%endmacro -%macro WIN64_RESTORE_XMM 1 -%endmacro -%macro WIN64_PUSH_XMM 0 -%endmacro -%endif - -; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either -; a branch or a branch target. So switch to a 2-byte form of ret in that case. -; We can automatically detect "follows a branch", but not a branch target. -; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) -%macro REP_RET 0 - %if has_epilogue - RET - %else - rep ret - %endif -%endmacro - -%define last_branch_adr $$ -%macro AUTO_REP_RET 0 - %ifndef cpuflags - times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr. 
- %elif notcpuflag(ssse3) - times ((last_branch_adr-$)>>31)+1 rep - %endif - ret -%endmacro - -%macro BRANCH_INSTR 0-* - %rep %0 - %macro %1 1-2 %1 - %2 %1 - %%branch_instr: - %xdefine last_branch_adr %%branch_instr - %endmacro - %rotate 1 - %endrep -%endmacro - -BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp - -%macro TAIL_CALL 2 ; callee, is_nonadjacent - %if has_epilogue - call %1 - RET - %elif %2 - jmp %1 - %endif -%endmacro - -;============================================================================= -; arch-independent part -;============================================================================= - -%assign function_align 16 - -; Begin a function. -; Applies any symbol mangling needed for C linkage, and sets up a define such that -; subsequent uses of the function name automatically refer to the mangled version. -; Appends cpuflags to the function name if cpuflags has been specified. -; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX -; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
-%macro cglobal 1-2+ "" ; name, PROLOGUE args - cglobal_internal 1, %1 %+ SUFFIX, %2 -%endmacro -%macro cvisible 1-2+ "" ; name, PROLOGUE args - cglobal_internal 0, %1 %+ SUFFIX, %2 -%endmacro -%macro cglobal_internal 2-3+ - %if %1 - %xdefine %%FUNCTION_PREFIX private_prefix - %xdefine %%VISIBILITY hidden - %else - %xdefine %%FUNCTION_PREFIX public_prefix - %xdefine %%VISIBILITY - %endif - %ifndef cglobaled_%2 - %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) - %xdefine %2.skip_prologue %2 %+ .skip_prologue - CAT_XDEFINE cglobaled_, %2, 1 - %endif - %xdefine current_function %2 - %ifidn __OUTPUT_FORMAT__,elf - global %2:function %%VISIBILITY - %else - global %2 - %endif - align function_align - %2: - RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer - %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required - %assign stack_offset 0 ; stack pointer offset relative to the return address - %assign stack_size 0 ; amount of stack space that can be freely used inside a function - %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding - %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 - %ifnidn %3, "" - PROLOGUE %3 - %endif -%endmacro - -%macro cextern 1 - %xdefine %1 mangle(private_prefix %+ _ %+ %1) - CAT_XDEFINE cglobaled_, %1, 1 - extern %1 -%endmacro - -; like cextern, but without the prefix -%macro cextern_naked 1 - %xdefine %1 mangle(%1) - CAT_XDEFINE cglobaled_, %1, 1 - extern %1 -%endmacro - -%macro const 1-2+ - %xdefine %1 mangle(private_prefix %+ _ %+ %1) - %ifidn __OUTPUT_FORMAT__,elf - global %1:data hidden - %else - global %1 - %endif - %1: %2 -%endmacro - -; This is needed for ELF, otherwise the GNU linker assumes the stack is -; executable by default. 
-%ifidn __OUTPUT_FORMAT__,elf -SECTION .note.GNU-stack noalloc noexec nowrite progbits -%endif - -; cpuflags - -%assign cpuflags_mmx (1<<0) -%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx -%assign cpuflags_3dnow (1<<2) | cpuflags_mmx -%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow -%assign cpuflags_sse (1<<4) | cpuflags_mmx2 -%assign cpuflags_sse2 (1<<5) | cpuflags_sse -%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 -%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 -%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 -%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 -%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 -%assign cpuflags_avx (1<<11)| cpuflags_sse42 -%assign cpuflags_xop (1<<12)| cpuflags_avx -%assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_avx2 (1<<14)| cpuflags_avx -%assign cpuflags_fma3 (1<<15)| cpuflags_avx - -%assign cpuflags_cache32 (1<<16) -%assign cpuflags_cache64 (1<<17) -%assign cpuflags_slowctz (1<<18) -%assign cpuflags_lzcnt (1<<19) -%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<21) -%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt -%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 - -%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) -%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) - -; Takes up to 2 cpuflags from the above list. -; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. -; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. 
-%macro INIT_CPUFLAGS 0-2 - CPUNOP amdnop - %if %0 >= 1 - %xdefine cpuname %1 - %assign cpuflags cpuflags_%1 - %if %0 >= 2 - %xdefine cpuname %1_%2 - %assign cpuflags cpuflags | cpuflags_%2 - %endif - %xdefine SUFFIX _ %+ cpuname - %if cpuflag(avx) - %assign avx_enabled 1 - %endif - %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) - %define mova movaps - %define movu movups - %define movnta movntps - %endif - %if cpuflag(aligned) - %define movu mova - %elifidn %1, sse3 - %define movu lddqu - %endif - %if ARCH_X86_64 == 0 && notcpuflag(sse2) - CPUNOP basicnop - %endif - %else - %xdefine SUFFIX - %undef cpuname - %undef cpuflags - %endif -%endmacro - -; Merge mmx and sse* -; m# is a simd register of the currently selected size -; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# -; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# -; (All 3 remain in sync through SWAP.) - -%macro CAT_XDEFINE 3 - %xdefine %1%2 %3 -%endmacro - -%macro CAT_UNDEF 2 - %undef %1%2 -%endmacro - -%macro INIT_MMX 0-1+ - %assign avx_enabled 0 - %define RESET_MM_PERMUTATION INIT_MMX %1 - %define mmsize 8 - %define num_mmregs 8 - %define mova movq - %define movu movq - %define movh movd - %define movnta movntq - %assign %%i 0 - %rep 8 - CAT_XDEFINE m, %%i, mm %+ %%i - CAT_XDEFINE nmm, %%i, %%i - %assign %%i %%i+1 - %endrep - %rep 8 - CAT_UNDEF m, %%i - CAT_UNDEF nmm, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -%macro INIT_XMM 0-1+ - %assign avx_enabled 0 - %define RESET_MM_PERMUTATION INIT_XMM %1 - %define mmsize 16 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif - %define mova movdqa - %define movu movdqu - %define movh movq - %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, xmm %+ %%i - CAT_XDEFINE nxmm, %%i, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -%macro INIT_YMM 0-1+ - %assign avx_enabled 1 - %define 
RESET_MM_PERMUTATION INIT_YMM %1 - %define mmsize 32 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif - %define mova movdqa - %define movu movdqu - %undef movh - %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, ymm %+ %%i - CAT_XDEFINE nymm, %%i, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -INIT_XMM - -%macro DECLARE_MMCAST 1 - %define mmmm%1 mm%1 - %define mmxmm%1 mm%1 - %define mmymm%1 mm%1 - %define xmmmm%1 mm%1 - %define xmmxmm%1 xmm%1 - %define xmmymm%1 xmm%1 - %define ymmmm%1 mm%1 - %define ymmxmm%1 xmm%1 - %define ymmymm%1 ymm%1 - %define xm%1 xmm %+ m%1 - %define ym%1 ymm %+ m%1 -%endmacro - -%assign i 0 -%rep 16 - DECLARE_MMCAST i -%assign i i+1 -%endrep - -; I often want to use macros that permute their arguments. e.g. there's no -; efficient way to implement butterfly or transpose or dct without swapping some -; arguments. -; -; I would like to not have to manually keep track of the permutations: -; If I insert a permutation in the middle of a function, it should automatically -; change everything that follows. For more complex macros I may also have multiple -; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. -; -; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that -; permutes its arguments. It's equivalent to exchanging the contents of the -; registers, except that this way you exchange the register names instead, so it -; doesn't cost any cycles. - -%macro PERMUTE 2-* ; takes a list of pairs to swap -%rep %0/2 - %xdefine %%tmp%2 m%2 - %rotate 2 -%endrep -%rep %0/2 - %xdefine m%1 %%tmp%2 - CAT_XDEFINE n, m%1, %1 - %rotate 2 -%endrep -%endmacro - -%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) -%ifnum %1 ; SWAP 0, 1, ... - SWAP_INTERNAL_NUM %1, %2 -%else ; SWAP m0, m1, ... 
- SWAP_INTERNAL_NAME %1, %2 -%endif -%endmacro - -%macro SWAP_INTERNAL_NUM 2-* - %rep %0-1 - %xdefine %%tmp m%1 - %xdefine m%1 m%2 - %xdefine m%2 %%tmp - CAT_XDEFINE n, m%1, %1 - CAT_XDEFINE n, m%2, %2 - %rotate 1 - %endrep -%endmacro - -%macro SWAP_INTERNAL_NAME 2-* - %xdefine %%args n %+ %1 - %rep %0-1 - %xdefine %%args %%args, n %+ %2 - %rotate 1 - %endrep - SWAP_INTERNAL_NUM %%args -%endmacro - -; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later -; calls to that function will automatically load the permutation, so values can -; be returned in mmregs. -%macro SAVE_MM_PERMUTATION 0-1 - %if %0 - %xdefine %%f %1_m - %else - %xdefine %%f current_function %+ _m - %endif - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE %%f, %%i, m %+ %%i - %assign %%i %%i+1 - %endrep -%endmacro - -%macro LOAD_MM_PERMUTATION 1 ; name to load from - %ifdef %1_m0 - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, %1_m %+ %%i - CAT_XDEFINE n, m %+ %%i, %%i - %assign %%i %%i+1 - %endrep - %endif -%endmacro - -; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't -%macro call 1 - call_internal %1, %1 %+ SUFFIX -%endmacro -%macro call_internal 2 - %xdefine %%i %1 - %ifndef cglobaled_%1 - %ifdef cglobaled_%2 - %xdefine %%i %2 - %endif - %endif - call %%i - LOAD_MM_PERMUTATION %%i -%endmacro - -; Substitutions that reduce instruction size but are functionally equivalent -%macro add 2 - %ifnum %2 - %if %2==128 - sub %1, -128 - %else - add %1, %2 - %endif - %else - add %1, %2 - %endif -%endmacro - -%macro sub 2 - %ifnum %2 - %if %2==128 - add %1, -128 - %else - sub %1, %2 - %endif - %else - sub %1, %2 - %endif -%endmacro - -;============================================================================= -; AVX abstraction layer -;============================================================================= - -%assign i 0 -%rep 16 - %if i < 8 - CAT_XDEFINE sizeofmm, i, 8 - %endif - CAT_XDEFINE sizeofxmm, i, 16 - CAT_XDEFINE 
sizeofymm, i, 32 -%assign i i+1 -%endrep -%undef i - -%macro CHECK_AVX_INSTR_EMU 3-* - %xdefine %%opcode %1 - %xdefine %%dst %2 - %rep %0-2 - %ifidn %%dst, %3 - %error non-avx emulation of ``%%opcode'' is not supported - %endif - %rotate 1 - %endrep -%endmacro - -;%1 == instruction -;%2 == 1 if float, 0 if int -;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise -;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not -;%5+: operands -%macro RUN_AVX_INSTR 5-8+ - %ifnum sizeof%6 - %assign %%sizeofreg sizeof%6 - %elifnum sizeof%5 - %assign %%sizeofreg sizeof%5 - %else - %assign %%sizeofreg mmsize - %endif - %assign %%emulate_avx 0 - %if avx_enabled && %%sizeofreg >= 16 - %xdefine %%instr v%1 - %else - %xdefine %%instr %1 - %if %0 >= 7+%3 - %assign %%emulate_avx 1 - %endif - %endif - - %if %%emulate_avx - %xdefine %%src1 %6 - %xdefine %%src2 %7 - %ifnidn %5, %6 - %if %0 >= 8 - CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8 - %else - CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7 - %endif - %if %4 && %3 == 0 - %ifnid %7 - ; 3-operand AVX instructions with a memory arg can only have it in src2, - ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). - ; So, if the instruction is commutative with a memory arg, swap them. - %xdefine %%src1 %7 - %xdefine %%src2 %6 - %endif - %endif - %if %%sizeofreg == 8 - MOVQ %5, %%src1 - %elif %2 - MOVAPS %5, %%src1 - %else - MOVDQA %5, %%src1 - %endif - %endif - %if %0 >= 8 - %1 %5, %%src2, %8 - %else - %1 %5, %%src2 - %endif - %elif %0 >= 8 - %%instr %5, %6, %7, %8 - %elif %0 == 7 - %%instr %5, %6, %7 - %elif %0 == 6 - %%instr %5, %6 - %else - %%instr %5 - %endif -%endmacro - -;%1 == instruction -;%2 == 1 if float, 0 if int -;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise -;%4 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not -%macro AVX_INSTR 1-4 0, 1, 0 - %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4 - %ifidn %2, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1 - %elifidn %3, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2 - %elifidn %4, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3 - %elifidn %5, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4 - %else - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5 - %endif - %endmacro -%endmacro - -; Instructions with both VEX and non-VEX encodings -; Non-destructive instructions are written without parameters -AVX_INSTR addpd, 1, 0, 1 -AVX_INSTR addps, 1, 0, 1 -AVX_INSTR addsd, 1, 0, 1 -AVX_INSTR addss, 1, 0, 1 -AVX_INSTR addsubpd, 1, 0, 0 -AVX_INSTR addsubps, 1, 0, 0 -AVX_INSTR aesdec, 0, 0, 0 -AVX_INSTR aesdeclast, 0, 0, 0 -AVX_INSTR aesenc, 0, 0, 0 -AVX_INSTR aesenclast, 0, 0, 0 -AVX_INSTR aesimc -AVX_INSTR aeskeygenassist -AVX_INSTR andnpd, 1, 0, 0 -AVX_INSTR andnps, 1, 0, 0 -AVX_INSTR andpd, 1, 0, 1 -AVX_INSTR andps, 1, 0, 1 -AVX_INSTR blendpd, 1, 0, 0 -AVX_INSTR blendps, 1, 0, 0 -AVX_INSTR blendvpd, 1, 0, 0 -AVX_INSTR blendvps, 1, 0, 0 -AVX_INSTR cmppd, 1, 1, 0 -AVX_INSTR cmpps, 1, 1, 0 -AVX_INSTR cmpsd, 1, 1, 0 -AVX_INSTR cmpss, 1, 1, 0 -AVX_INSTR comisd -AVX_INSTR comiss -AVX_INSTR cvtdq2pd -AVX_INSTR cvtdq2ps -AVX_INSTR cvtpd2dq -AVX_INSTR cvtpd2ps -AVX_INSTR cvtps2dq -AVX_INSTR cvtps2pd -AVX_INSTR cvtsd2si -AVX_INSTR cvtsd2ss -AVX_INSTR cvtsi2sd -AVX_INSTR cvtsi2ss -AVX_INSTR cvtss2sd -AVX_INSTR cvtss2si -AVX_INSTR cvttpd2dq -AVX_INSTR cvttps2dq -AVX_INSTR cvttsd2si -AVX_INSTR cvttss2si -AVX_INSTR divpd, 1, 0, 0 -AVX_INSTR divps, 1, 0, 0 -AVX_INSTR divsd, 1, 0, 0 -AVX_INSTR divss, 1, 0, 0 -AVX_INSTR dppd, 1, 1, 0 -AVX_INSTR dpps, 1, 1, 0 -AVX_INSTR extractps -AVX_INSTR haddpd, 1, 0, 0 -AVX_INSTR haddps, 1, 0, 0 -AVX_INSTR hsubpd, 1, 0, 0 -AVX_INSTR hsubps, 1, 0, 0 -AVX_INSTR insertps, 1, 1, 0 -AVX_INSTR lddqu -AVX_INSTR ldmxcsr -AVX_INSTR maskmovdqu -AVX_INSTR maxpd, 1, 0, 1 
-AVX_INSTR maxps, 1, 0, 1 -AVX_INSTR maxsd, 1, 0, 1 -AVX_INSTR maxss, 1, 0, 1 -AVX_INSTR minpd, 1, 0, 1 -AVX_INSTR minps, 1, 0, 1 -AVX_INSTR minsd, 1, 0, 1 -AVX_INSTR minss, 1, 0, 1 -AVX_INSTR movapd -AVX_INSTR movaps -AVX_INSTR movd -AVX_INSTR movddup -AVX_INSTR movdqa -AVX_INSTR movdqu -AVX_INSTR movhlps, 1, 0, 0 -AVX_INSTR movhpd, 1, 0, 0 -AVX_INSTR movhps, 1, 0, 0 -AVX_INSTR movlhps, 1, 0, 0 -AVX_INSTR movlpd, 1, 0, 0 -AVX_INSTR movlps, 1, 0, 0 -AVX_INSTR movmskpd -AVX_INSTR movmskps -AVX_INSTR movntdq -AVX_INSTR movntdqa -AVX_INSTR movntpd -AVX_INSTR movntps -AVX_INSTR movq -AVX_INSTR movsd, 1, 0, 0 -AVX_INSTR movshdup -AVX_INSTR movsldup -AVX_INSTR movss, 1, 0, 0 -AVX_INSTR movupd -AVX_INSTR movups -AVX_INSTR mpsadbw, 0, 1, 0 -AVX_INSTR mulpd, 1, 0, 1 -AVX_INSTR mulps, 1, 0, 1 -AVX_INSTR mulsd, 1, 0, 1 -AVX_INSTR mulss, 1, 0, 1 -AVX_INSTR orpd, 1, 0, 1 -AVX_INSTR orps, 1, 0, 1 -AVX_INSTR pabsb -AVX_INSTR pabsd -AVX_INSTR pabsw -AVX_INSTR packsswb, 0, 0, 0 -AVX_INSTR packssdw, 0, 0, 0 -AVX_INSTR packuswb, 0, 0, 0 -AVX_INSTR packusdw, 0, 0, 0 -AVX_INSTR paddb, 0, 0, 1 -AVX_INSTR paddw, 0, 0, 1 -AVX_INSTR paddd, 0, 0, 1 -AVX_INSTR paddq, 0, 0, 1 -AVX_INSTR paddsb, 0, 0, 1 -AVX_INSTR paddsw, 0, 0, 1 -AVX_INSTR paddusb, 0, 0, 1 -AVX_INSTR paddusw, 0, 0, 1 -AVX_INSTR palignr, 0, 1, 0 -AVX_INSTR pand, 0, 0, 1 -AVX_INSTR pandn, 0, 0, 0 -AVX_INSTR pavgb, 0, 0, 1 -AVX_INSTR pavgw, 0, 0, 1 -AVX_INSTR pblendvb, 0, 0, 0 -AVX_INSTR pblendw, 0, 1, 0 -AVX_INSTR pclmulqdq, 0, 1, 0 -AVX_INSTR pcmpestri -AVX_INSTR pcmpestrm -AVX_INSTR pcmpistri -AVX_INSTR pcmpistrm -AVX_INSTR pcmpeqb, 0, 0, 1 -AVX_INSTR pcmpeqw, 0, 0, 1 -AVX_INSTR pcmpeqd, 0, 0, 1 -AVX_INSTR pcmpeqq, 0, 0, 1 -AVX_INSTR pcmpgtb, 0, 0, 0 -AVX_INSTR pcmpgtw, 0, 0, 0 -AVX_INSTR pcmpgtd, 0, 0, 0 -AVX_INSTR pcmpgtq, 0, 0, 0 -AVX_INSTR pextrb -AVX_INSTR pextrd -AVX_INSTR pextrq -AVX_INSTR pextrw -AVX_INSTR phaddw, 0, 0, 0 -AVX_INSTR phaddd, 0, 0, 0 -AVX_INSTR phaddsw, 0, 0, 0 -AVX_INSTR phminposuw -AVX_INSTR phsubw, 
0, 0, 0 -AVX_INSTR phsubd, 0, 0, 0 -AVX_INSTR phsubsw, 0, 0, 0 -AVX_INSTR pinsrb, 0, 1, 0 -AVX_INSTR pinsrd, 0, 1, 0 -AVX_INSTR pinsrq, 0, 1, 0 -AVX_INSTR pinsrw, 0, 1, 0 -AVX_INSTR pmaddwd, 0, 0, 1 -AVX_INSTR pmaddubsw, 0, 0, 0 -AVX_INSTR pmaxsb, 0, 0, 1 -AVX_INSTR pmaxsw, 0, 0, 1 -AVX_INSTR pmaxsd, 0, 0, 1 -AVX_INSTR pmaxub, 0, 0, 1 -AVX_INSTR pmaxuw, 0, 0, 1 -AVX_INSTR pmaxud, 0, 0, 1 -AVX_INSTR pminsb, 0, 0, 1 -AVX_INSTR pminsw, 0, 0, 1 -AVX_INSTR pminsd, 0, 0, 1 -AVX_INSTR pminub, 0, 0, 1 -AVX_INSTR pminuw, 0, 0, 1 -AVX_INSTR pminud, 0, 0, 1 -AVX_INSTR pmovmskb -AVX_INSTR pmovsxbw -AVX_INSTR pmovsxbd -AVX_INSTR pmovsxbq -AVX_INSTR pmovsxwd -AVX_INSTR pmovsxwq -AVX_INSTR pmovsxdq -AVX_INSTR pmovzxbw -AVX_INSTR pmovzxbd -AVX_INSTR pmovzxbq -AVX_INSTR pmovzxwd -AVX_INSTR pmovzxwq -AVX_INSTR pmovzxdq -AVX_INSTR pmuldq, 0, 0, 1 -AVX_INSTR pmulhrsw, 0, 0, 1 -AVX_INSTR pmulhuw, 0, 0, 1 -AVX_INSTR pmulhw, 0, 0, 1 -AVX_INSTR pmullw, 0, 0, 1 -AVX_INSTR pmulld, 0, 0, 1 -AVX_INSTR pmuludq, 0, 0, 1 -AVX_INSTR por, 0, 0, 1 -AVX_INSTR psadbw, 0, 0, 1 -AVX_INSTR pshufb, 0, 0, 0 -AVX_INSTR pshufd -AVX_INSTR pshufhw -AVX_INSTR pshuflw -AVX_INSTR psignb, 0, 0, 0 -AVX_INSTR psignw, 0, 0, 0 -AVX_INSTR psignd, 0, 0, 0 -AVX_INSTR psllw, 0, 0, 0 -AVX_INSTR pslld, 0, 0, 0 -AVX_INSTR psllq, 0, 0, 0 -AVX_INSTR pslldq, 0, 0, 0 -AVX_INSTR psraw, 0, 0, 0 -AVX_INSTR psrad, 0, 0, 0 -AVX_INSTR psrlw, 0, 0, 0 -AVX_INSTR psrld, 0, 0, 0 -AVX_INSTR psrlq, 0, 0, 0 -AVX_INSTR psrldq, 0, 0, 0 -AVX_INSTR psubb, 0, 0, 0 -AVX_INSTR psubw, 0, 0, 0 -AVX_INSTR psubd, 0, 0, 0 -AVX_INSTR psubq, 0, 0, 0 -AVX_INSTR psubsb, 0, 0, 0 -AVX_INSTR psubsw, 0, 0, 0 -AVX_INSTR psubusb, 0, 0, 0 -AVX_INSTR psubusw, 0, 0, 0 -AVX_INSTR ptest -AVX_INSTR punpckhbw, 0, 0, 0 -AVX_INSTR punpckhwd, 0, 0, 0 -AVX_INSTR punpckhdq, 0, 0, 0 -AVX_INSTR punpckhqdq, 0, 0, 0 -AVX_INSTR punpcklbw, 0, 0, 0 -AVX_INSTR punpcklwd, 0, 0, 0 -AVX_INSTR punpckldq, 0, 0, 0 -AVX_INSTR punpcklqdq, 0, 0, 0 -AVX_INSTR pxor, 0, 0, 1 -AVX_INSTR rcpps, 
1, 0, 0 -AVX_INSTR rcpss, 1, 0, 0 -AVX_INSTR roundpd -AVX_INSTR roundps -AVX_INSTR roundsd -AVX_INSTR roundss -AVX_INSTR rsqrtps, 1, 0, 0 -AVX_INSTR rsqrtss, 1, 0, 0 -AVX_INSTR shufpd, 1, 1, 0 -AVX_INSTR shufps, 1, 1, 0 -AVX_INSTR sqrtpd, 1, 0, 0 -AVX_INSTR sqrtps, 1, 0, 0 -AVX_INSTR sqrtsd, 1, 0, 0 -AVX_INSTR sqrtss, 1, 0, 0 -AVX_INSTR stmxcsr -AVX_INSTR subpd, 1, 0, 0 -AVX_INSTR subps, 1, 0, 0 -AVX_INSTR subsd, 1, 0, 0 -AVX_INSTR subss, 1, 0, 0 -AVX_INSTR ucomisd -AVX_INSTR ucomiss -AVX_INSTR unpckhpd, 1, 0, 0 -AVX_INSTR unpckhps, 1, 0, 0 -AVX_INSTR unpcklpd, 1, 0, 0 -AVX_INSTR unpcklps, 1, 0, 0 -AVX_INSTR xorpd, 1, 0, 1 -AVX_INSTR xorps, 1, 0, 1 - -; 3DNow instructions, for sharing code between AVX, SSE and 3DN -AVX_INSTR pfadd, 1, 0, 1 -AVX_INSTR pfsub, 1, 0, 0 -AVX_INSTR pfmul, 1, 0, 1 - -; base-4 constants for shuffles -%assign i 0 -%rep 256 - %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) - %if j < 10 - CAT_XDEFINE q000, j, i - %elif j < 100 - CAT_XDEFINE q00, j, i - %elif j < 1000 - CAT_XDEFINE q0, j, i - %else - CAT_XDEFINE q, j, i - %endif -%assign i i+1 -%endrep -%undef i -%undef j - -%macro FMA_INSTR 3 - %macro %1 4-7 %1, %2, %3 - %if cpuflag(xop) - v%5 %1, %2, %3, %4 - %else - %6 %1, %2, %3 - %7 %1, %4 - %endif - %endmacro -%endmacro - -FMA_INSTR pmacsdd, pmulld, paddd -FMA_INSTR pmacsww, pmullw, paddw -FMA_INSTR pmadcswd, pmaddwd, paddd - -; convert FMA4 to FMA3 if possible -%macro FMA4_INSTR 4 - %macro %1 4-8 %1, %2, %3, %4 - %if cpuflag(fma4) - v%5 %1, %2, %3, %4 - %elifidn %1, %2 - v%6 %1, %4, %3 ; %1 = %1 * %3 + %4 - %elifidn %1, %3 - v%7 %1, %2, %4 ; %1 = %2 * %1 + %4 - %elifidn %1, %4 - v%8 %1, %2, %3 ; %1 = %2 * %3 + %1 - %else - %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported - %endif - %endmacro -%endmacro - -FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd -FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps -FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd -FMA4_INSTR fmaddss, fmadd132ss, 
fmadd213ss, fmadd231ss - -FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd -FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps -FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd -FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps - -FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd -FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps -FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd -FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss - -FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd -FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps -FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd -FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss - -FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd -FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps -FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd -FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss - -; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug -%if ARCH_X86_64 == 0 -%macro vpbroadcastq 2 -%if sizeof%1 == 16 - movddup %1, %2 -%else - vbroadcastsd %1, %2 -%endif -%endmacro -%endif - -%ifidn __OUTPUT_FORMAT__,elf -section .note.GNU-stack noalloc noexec nowrite progbits -%endif -%ifidn __OUTPUT_FORMAT__,elf32 -section .note.GNU-stack noalloc noexec nowrite progbits -%endif -%ifidn __OUTPUT_FORMAT__,elf64 -section .note.GNU-stack noalloc noexec nowrite progbits -%endif
View file
kvazaar-2.2.0.tar.gz/.gitignore -> kvazaar-2.3.0.tar.gz/.gitignore
Changed
@@ -15,6 +15,7 @@ .deps .dirstamp .libs +.vs Makefile Makefile.in /aclocal.m4 @@ -53,5 +54,6 @@ src/kvazaar src/libkvazaar.so.* src/kvazaar.pc +src/version.h tests/kvazaar_tests tests/kvazaar_tests.trs
View file
kvazaar-2.3.0.tar.gz/CMakeLists.txt
Added
@@ -0,0 +1,391 @@ +cmake_minimum_required(VERSION 3.12) + +project(kvazaar +LANGUAGES C CXX +HOMEPAGE_URL https://github.com/ultravideo/kvazaar +DESCRIPTION "An open-source VVC encoder licensed under 3-clause BSD" +VERSION 2.3.0 ) + +option(BUILD_SHARED_LIBS "Build using shared kvazaar library" ON) + +option(BUILD_TESTS "Build tests" ON) + + +include(GNUInstallDirs) #Helps to define correct distro specific install directories + +set(KVAZAAR_INSTALL_LIBDIR "${CMAKE_INSTALL_LIBDIR}" CACHE PATH "kvazaar library install path") +set(KVAZAAR_INSTALL_BINDIR "${CMAKE_INSTALL_BINDIR}" CACHE PATH "kvazaar binary install path") +set(KVAZAAR_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}" CACHE PATH "kvazaar include install path") +set(KVAZAAR_INSTALL_MANDIR "${CMAKE_INSTALL_MANDIR}/man1" CACHE PATH "kvazaar manual page file install path") + +# https://www.kitware.com/cmake-and-the-default-build-type/ +# Set a default build type if none was specified +set(KVZ_DEFAULT_BUILD_TYPE "RelWithDebInfo") + +if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + message(STATUS "No build type specified, setting to '${KVZ_DEFAULT_BUILD_TYPE}'.") + set(CMAKE_BUILD_TYPE "${KVZ_DEFAULT_BUILD_TYPE}" CACHE + STRING "Choose the type of build." 
FORCE) + # Set the possible values of build type for cmake-gui + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS + "Debug" "Release" "MinSizeRel" "RelWithDebInfo") +endif() + + +find_package(Git QUIET) +if(GIT_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git") + # Update submodules as needed + option(GIT_SUBMODULE "Check submodules during build" ON) + if(GIT_SUBMODULE) + message(STATUS "Submodule update") + execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE GIT_SUBMOD_RESULT) + if(NOT GIT_SUBMOD_RESULT EQUAL "0") + message(WARNING "git submodule update --init --recursive failed with ${GIT_SUBMOD_RESULT}, please checkout submodules") + endif() + endif() + # Check git hash and fetch tag + execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE GIT_HEAD_OK + OUTPUT_VARIABLE GIT_HEAD) + if(GIT_HEAD_OK EQUAL "0") + string(SUBSTRING ${GIT_HEAD} 0 30 GIT_TAG_LONG) + execute_process(COMMAND ${GIT_EXECUTABLE} name-rev --tags --name-only ${GIT_TAG_LONG} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE GIT_TAG_OK + OUTPUT_VARIABLE GIT_TAG) + string(SUBSTRING ${GIT_TAG} 0 9 GIT_TAG_STRIP) + + # If tag is not defined, add part of the commit hash to the version + if(GIT_TAG_OK EQUAL "0" AND GIT_TAG_STRIP STREQUAL "undefined") + string(SUBSTRING ${GIT_HEAD} 0 7 GIT_TAG_SHORT) + set(PROJECT_VERSION ${PROJECT_VERSION}-${GIT_TAG_SHORT}) + message(INFO " No tag detected, version changed to ${PROJECT_VERSION}") + endif() + endif() +endif() + +if(NOT EXISTS "${PROJECT_SOURCE_DIR}/greatest/greatest.h") + message(WARNING "The submodule greatest was not loaded, some tests may fail") +endif() + +# Grab <year>-<month>-<day> timestamp for debug purposes +string(TIMESTAMP CMAKE_BUILD_DATE %Y-%m-%d) + +set(KVZ_COMPILER_VERSION "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") + +if(MSVC) + if(MSVC_VERSION LESS 
1800) + set(KVZ_COMPILER_VERSION "VS") + elseif(MSVC_VERSION LESS 1900) + set(KVZ_COMPILER_VERSION "VS2013") + elseif(MSVC_VERSION LESS 1910) + set(KVZ_COMPILER_VERSION "VS2015") + elseif(MSVC_VERSION LESS 1920) + set(KVZ_COMPILER_VERSION "VS2017") + elseif(MSVC_VERSION LESS 1930) + set(KVZ_COMPILER_VERSION "VS2019") + else() + set(KVZ_COMPILER_VERSION "VS2022") + endif() +endif() + +# Set compiler info to print at runtime +set(KVZ_COMPILER_STRING "${KVZ_COMPILER_VERSION}") + +add_definitions(-DCMAKE_BUILD) + +# Apply dynamic info to the config files +configure_file("${PROJECT_SOURCE_DIR}/src/kvazaar.pc.in" "${PROJECT_SOURCE_DIR}/src/kvazaar.pc" @ONLY) +configure_file("${PROJECT_SOURCE_DIR}/src/version.h.in" "${PROJECT_SOURCE_DIR}/src/version.h" @ONLY) + +# Add all sources in src/ base +file(GLOB LIB_SOURCES RELATIVE ${PROJECT_SOURCE_DIR} "src/*.h" "src/*.c") + +# We don't want CLI main in the library +list(REMOVE_ITEM LIB_SOURCES "src/encmain.c" "src/cli.c" "src/cli.h" "src/yuv_io.c" "src/yuv_io.h") + +# Add also all the strategies +file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.c") + +list(APPEND LIB_SOURCES ${LIB_SOURCES_STRATEGIES}) + +# We also need the libmd5 +list(APPEND LIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/extras/libmd5.c) + +add_definitions(-DKVZ_DLL_EXPORTS) + +if(BUILD_SHARED_LIBS) + add_definitions(-DPIC) +endif() + +# For visual studio / windows we also need our own pthread implementation and getopt +if(MSVC) + list(APPEND LIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/extras/getopt.c ${CMAKE_CURRENT_SOURCE_DIR}/src/threadwrapper/src/pthread.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/threadwrapper/src/semaphore.cpp) + add_definitions(-DWIN32_LEAN_AND_MEAN -D_WIN32 -DWIN32 -DWIN64) +endif() + +if(BUILD_SHARED_LIBS) + list( APPEND CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib" "./" "../lib" ) + set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) + add_library(kvazaar SHARED ${LIB_SOURCES}) +else() + add_library(kvazaar 
STATIC ${LIB_SOURCES}) + if(MSVC) # Fix a linking problem with visual studio when the library is the same name as the binary + set_target_properties(kvazaar PROPERTIES OUTPUT_NAME libkvazaar) + endif() + +endif() + +target_include_directories(kvazaar PUBLIC src) +target_include_directories(kvazaar PUBLIC src/extras) +target_include_directories(kvazaar PUBLIC src/strategies) + +file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/avx2/*.c") +file(GLOB LIB_SOURCES_STRATEGIES_SSE41 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse41/*.c") + +set(CLI_SOURCES "src/encmain.c" "src/cli.c" "src/cli.h" "src/yuv_io.c" "src/yuv_io.h") + +# Add the getopt and pthread for visual studio +if(MSVC) + list(APPEND CLI_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/extras/getopt.c ${CMAKE_CURRENT_SOURCE_DIR}/src/threadwrapper/src/pthread.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/threadwrapper/src/semaphore.cpp) +endif() + +add_executable(kvazaar-bin ${CLI_SOURCES}) + +set_target_properties(kvazaar-bin PROPERTIES OUTPUT_NAME kvazaar) +set_target_properties(kvazaar-bin PROPERTIES RUNTIME_OUTPUT_NAME kvazaar) + +target_link_libraries(kvazaar-bin PUBLIC kvazaar) + +if(MSVC) + target_include_directories(kvazaar PUBLIC src/threadwrapper/include) + set_property( SOURCE ${LIB_SOURCES_STRATEGIES_AVX2} APPEND PROPERTY COMPILE_FLAGS "/arch:AVX2" ) +else() + set_target_properties(kvazaar-bin PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/src) + list(APPEND ALLOW_AVX2 "x86_64" "AMD64") + if(${CMAKE_SYSTEM_PROCESSOR} IN_LIST ALLOW_AVX2) + set_property( SOURCE ${LIB_SOURCES_STRATEGIES_AVX2} APPEND PROPERTY COMPILE_FLAGS "-mavx2 -mbmi -mpopcnt -mlzcnt -mbmi2" ) + set_property( SOURCE ${LIB_SOURCES_STRATEGIES_SSE41} APPEND PROPERTY COMPILE_FLAGS "-msse4.1" ) + endif() + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) + target_link_libraries(kvazaar PUBLIC Threads::Threads) + + include(CheckLibraryExists) + + CHECK_LIBRARY_EXISTS(m sin "" 
HAVE_LIB_M) + + if (HAVE_LIB_M) + set(EXTRA_LIBS ${EXTRA_LIBS} m) + endif (HAVE_LIB_M) + + target_link_libraries(kvazaar-bin PUBLIC ${EXTRA_LIBS}) +endif() + +# Source grouping + +# Some basic structuring of the files based on previous visual studio project files +file(GLOB SOURCE_GROUP_BITSTREAM RELATIVE ${PROJECT_SOURCE_DIR} "src/encode_coding_tree.*" "src/encoder_state-bitstream.*" "src/nal.*") +file(GLOB SOURCE_GROUP_CABAC RELATIVE ${PROJECT_SOURCE_DIR} "src/bitstream.*" "src/cabac.*" "src/context.*") +file(GLOB SOURCE_GROUP_COMPRESSION RELATIVE ${PROJECT_SOURCE_DIR} "src/search*" "src/rdo.*" "src/fast_coeff*") +file(GLOB SOURCE_GROUP_CONSTRAINT RELATIVE ${PROJECT_SOURCE_DIR} "src/constraint.*" "src/ml_*") +file(GLOB SOURCE_GROUP_CONTROL RELATIVE ${PROJECT_SOURCE_DIR} "src/cfg.*" "src/encoder.*" "src/encoder_state-c*" "src/encoder_state-g*" "src/encoderstate*" "src/gop.*" "src/input_frame_buffer.*" "src/kvazaar*" "src/rate_control.*" "src/mip_data.h") +file(GLOB SOURCE_GROUP_DATA_STRUCTURES RELATIVE ${PROJECT_SOURCE_DIR} "src/cu.*" "src/image.*" "src/imagelist.*" "src/videoframe.*") +file(GLOB SOURCE_GROUP_EXTRAS RELATIVE ${PROJECT_SOURCE_DIR} "src/extras/*.h" "src/extras/*.c") +file(GLOB_RECURSE SOURCE_GROUP_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.h" "src/strategies/*.c") +file(GLOB SOURCE_GROUP_RECON RELATIVE ${PROJECT_SOURCE_DIR} "src/alf.*" "src/filter.*" "src/inter.*" "src/intra.*" "src/reshape.*" "src/sao.*" "src/scalinglist.*" "src/tables.*" "src/transform.*") +file(GLOB SOURCE_GROUP_THREADING RELATIVE ${PROJECT_SOURCE_DIR} "src/threadqueue.*" "src/threads.*") +file(GLOB_RECURSE SOURCE_GROUP_THREADWRAPPER RELATIVE ${PROJECT_SOURCE_DIR} "src/threadwrapper/*.cpp" "src/threadwrapper/*.h") +file(GLOB SOURCE_GROUP_TOPLEVEL RELATIVE ${PROJECT_SOURCE_DIR} "src/debug.*" "src/global.h" "src/version.h" "src/kvz_math.h" "src/checkpoint.*") + +source_group( "Bitstream" FILES ${SOURCE_GROUP_BITSTREAM}) +source_group( "CABAC" FILES 
${SOURCE_GROUP_CABAC}) +source_group( "Compression" FILES ${SOURCE_GROUP_COMPRESSION}) +source_group( "Constraint" FILES ${SOURCE_GROUP_CONSTRAINT}) +source_group( "Control" FILES ${SOURCE_GROUP_CONTROL}) +source_group( "Data Structures" FILES ${SOURCE_GROUP_DATA_STRUCTURES}) +source_group( "Extras" FILES ${SOURCE_GROUP_EXTRAS}) + +# Handle the strategies directory structure better in visual studio +if(MSVC) + foreach(source IN LISTS SOURCE_GROUP_STRATEGIES) + get_filename_component(source_path "${source}" PATH) + string(REPLACE "src/" "" source_path_msvc "${source_path}") + string(REPLACE "/" "\\" source_path_msvc "${source_path_msvc}") + source_group("Optimization\\${source_path_msvc}" FILES "${source}") + endforeach() +else() + source_group( "Optimization" FILES ${SOURCE_GROUP_STRATEGIES}) +endif() +source_group( "Optimization" FILES "src/strategyselector.c" "src/strategyselector.h") + +source_group( "Reconstruction" FILES ${SOURCE_GROUP_RECON}) +source_group( "Threading" FILES ${SOURCE_GROUP_THREADING}) +source_group( "Threadwrapper" FILES ${SOURCE_GROUP_THREADWRAPPER}) +source_group( "" FILES ${SOURCE_GROUP_TOPLEVEL}) + +# INSTALL + +# ToDo: make configurable + +install(FILES ${PROJECT_SOURCE_DIR}/src/kvazaar.pc DESTINATION ${CMAKE_INSTALL_PREFIX}/share/pkgconfig) +install(TARGETS kvazaar-bin DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) +install(TARGETS kvazaar DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) +if(BUILD_SHARED_LIBS) # Just add the lib to the bin directory for now + if(MSVC) + install(TARGETS kvazaar DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) + endif() +endif() +install(FILES ${PROJECT_SOURCE_DIR}/src/kvazaar.h DESTINATION ${CMAKE_INSTALL_PREFIX}/include) +install(FILES ${PROJECT_SOURCE_DIR}/doc/kvazaar.1 DESTINATION ${CMAKE_INSTALL_PREFIX}/share/man) + +IF(UNIX) +# DIST + +set(GIT_LS_TREE_OK "1") + +# By default grab the list of files in the git repo +if(GIT_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git") + execute_process(COMMAND ${GIT_EXECUTABLE} ls-tree 
--name-only -r HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE GIT_LS_TREE_OK + OUTPUT_VARIABLE GIT_LS_TREE) + if(GIT_LS_TREE_OK EQUAL "0") + string(REGEX REPLACE "\n" ";" GIT_LS_TREE "${GIT_LS_TREE}") + string(REGEX REPLACE "\r" "" GIT_LS_TREE "${GIT_LS_TREE}") + list(APPEND DIST_SOURCES ${GIT_LS_TREE}) + endif() +endif() +if(NOT GIT_LS_TREE_OK EQUAL "0") + file(GLOB_RECURSE DIST_SOURCES RELATIVE ${PROJECT_SOURCE_DIR} "src/*.c" "src/*.h" "tests/*.sh" "tools/*.sh" "tools/*.py" ".github/*.yml" "src/*.in" "placeholder.txt" "CMakeLists.txt" "doc/*" "examples/*" "rdcost-weight-tool/*" "greatest/*.h" "greatest/*.md") + list(APPEND DIST_SOURCES ".clang-format" ".gitignore" ".gitmodules" "tests/tsan_suppressions.txt" ".travis-install.bash" "CREDITS" "Dockerfile" "docs.doxy" ".gitlab-ci.yml" "LICENSE" "LICENSE.EXT.greatest" "README.md") +endif() + +add_custom_target(dist + COMMAND echo \"Writing log to ${PROJECT_SOURCE_DIR}/dist.log\" + && tar -zcvf "${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz" --transform 's,^,${PROJECT_NAME}-${PROJECT_VERSION}/,' -- ${DIST_SOURCES} > dist.log 2>&1 || { echo \"\\0330;31mfailed to pack ${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz, check ${PROJECT_SOURCE_DIR}/dist.log.\\033\m\"$<SEMICOLON> exit 1$<SEMICOLON> } + COMMENT "Make distribution ${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz" + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + BYPRODUCTS ${CMAKE_SOURCE_DIR}/${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz + ) + +# DISTCHECK + +set(TEMP_DISTCHECK_DIR "_distcheck") + +add_custom_target(distcheck + COMMAND echo \"Writing log to ${PROJECT_SOURCE_DIR}/distcheck.log\" + && cd ${PROJECT_SOURCE_DIR} + && mkdir -p ${TEMP_DISTCHECK_DIR} + && cd ${TEMP_DISTCHECK_DIR} + && tar -zxf ${CMAKE_SOURCE_DIR}/${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz > ${PROJECT_SOURCE_DIR}/distcheck.log || { echo \"\\0330;31mfailed to unpack ${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz.\\033\m\"$<SEMICOLON> exit 1$<SEMICOLON> } + && echo \"\\0330;32mFile unpack 
ok\\033m\" + && cd ${PROJECT_NAME}-${PROJECT_VERSION} + && mkdir -p build + && cd build + && cmake -DCMAKE_INSTALL_PREFIX=./ -DBUILD_SHARED_LIBS=OFF -G "Unix Makefiles" .. >> ${PROJECT_SOURCE_DIR}/distcheck.log || { echo \"\\0330;31mcmake failed to configure.\\033m\"$<SEMICOLON> exit 1$<SEMICOLON> } + && echo \"\\0330;32mCMake configure ok\\033m\" + && make -j >> ${PROJECT_SOURCE_DIR}/distcheck.log || { echo \"\\0330;31mmake failed.\\033m\"$<SEMICOLON> exit 1$<SEMICOLON> } + && echo \"\\0330;32mMake ok\\033m\" + # Full tests might be too demanding to run, enable with parameter? + #&& make test || (echo \"\\e0;31mmake test failed.\\033m\" && false) + && tests/kvazaar_tests >> ${PROJECT_SOURCE_DIR}/distcheck.log 2>&1 || { echo \"\\0330;31mtests failed.\\033m\"$<SEMICOLON> exit 1$<SEMICOLON> } + && echo \"\\0330;32mTests ok\\033m\" + && make install >> ${PROJECT_SOURCE_DIR}/distcheck.log || { echo \"\\0330;31mmake install failed.\\033m\"$<SEMICOLON> exit 1$<SEMICOLON> } + && echo \"\\0330;32mInstall ok\\033m\" + && bin/kvzaar --help >> ${PROJECT_SOURCE_DIR}/distcheck.log || { echo \"\\0330;31mkvazaar binary failed to run.\\033m\"$<SEMICOLON> exit 1$<SEMICOLON> } + && echo \"\\0330;32mbin/kvazaar ok\\033m\" + && make clean >> ${PROJECT_SOURCE_DIR}/distcheck.log || { echo \"\\0330;31mmake clean failed.\\033m\"$<SEMICOLON> exit 1$<SEMICOLON> } + && echo \"\\0330;32mmake clean ok\\033m\" + && cd ${PROJECT_SOURCE_DIR} + && rm -rf "${PROJECT_SOURCE_DIR}/${TEMP_DISTCHECK_DIR}" + && echo \"\\0330;32m==============================================================\\033m\" + && echo \"\\0330;32m${PROJECT_NAME}-${PROJECT_VERSION} archives ready for distribution:\\033m\" + && echo \"\\0330;32m${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz\\033m\" + && echo \"\\0330;32m==============================================================\\033m\" + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + DEPENDS ${CMAKE_SOURCE_DIR}/${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz + COMMENT "Checking 
${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz.." + ) +endif() #Unix + +# TESTS +enable_testing() + +if(MSVC OR MINGW OR MSYS) + if(BUILD_SHARED_LIBS) + set(BUILD_TESTS OFF) + message(INFO " Disable test building, fails in MSVC/MINGW/MSYS2 when building shared binaries") + endif() +endif() + +if(EXISTS "${PROJECT_SOURCE_DIR}/greatest/greatest.h" AND BUILD_TESTS) + add_subdirectory( "tests/" ) + add_test( NAME Test_kvazaar COMMAND kvazaar_tests ) +endif() + +if(NOT DEFINED MSVC) + list(APPEND XFAIL "off") + if(DEFINED ENV{XFAIL_TESTS}) + list(APPEND XFAIL $ENV{XFAIL_TESTS}) + endif() + + if(NOT "test_tools.sh" IN_LIST XFAIL) + add_test( NAME test_tools COMMAND ${PROJECT_SOURCE_DIR}/tests/test_tools.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_smp.sh" IN_LIST XFAIL) + add_test( NAME test_smp COMMAND ${PROJECT_SOURCE_DIR}/tests/test_smp.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_pu_depth_constraints.sh" IN_LIST XFAIL) + add_test( NAME test_pu_depth_constraints COMMAND ${PROJECT_SOURCE_DIR}/tests/test_pu_depth_constraints.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_mv_constraint.sh" IN_LIST XFAIL) + add_test( NAME test_mv_constraint COMMAND ${PROJECT_SOURCE_DIR}/tests/test_mv_constraint.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_interlace.sh" IN_LIST XFAIL) + add_test( NAME test_interlace COMMAND ${PROJECT_SOURCE_DIR}/tests/test_interlace.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_gop.sh" IN_LIST XFAIL) + add_test( NAME test_gop COMMAND ${PROJECT_SOURCE_DIR}/tests/test_gop.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_owf_wpp_tiles.sh" IN_LIST XFAIL) + add_test( NAME test_owf_wpp_tiles COMMAND ${PROJECT_SOURCE_DIR}/tests/test_owf_wpp_tiles.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_weird_shapes.sh" IN_LIST XFAIL) + add_test( NAME 
test_weird_shapes COMMAND ${PROJECT_SOURCE_DIR}/tests/test_weird_shapes.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_external_symbols.sh" IN_LIST XFAIL) + add_test( NAME test_external_symbols COMMAND ${PROJECT_SOURCE_DIR}/tests/test_external_symbols.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "util.sh" IN_LIST XFAIL) + add_test( NAME util COMMAND ${PROJECT_SOURCE_DIR}/tests/util.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_invalid_input.sh" IN_LIST XFAIL) + add_test( NAME test_invalid_input COMMAND ${PROJECT_SOURCE_DIR}/tests/test_invalid_input.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_slices.sh" IN_LIST XFAIL) + add_test( NAME test_slices COMMAND ${PROJECT_SOURCE_DIR}/tests/test_slices.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_intra.sh" IN_LIST XFAIL) + add_test( NAME test_intra COMMAND ${PROJECT_SOURCE_DIR}/tests/test_intra.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_rate_control.sh" IN_LIST XFAIL) + add_test( NAME test_rate_control COMMAND ${PROJECT_SOURCE_DIR}/tests/test_rate_control.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() +endif()
View file
kvazaar-2.2.0.tar.gz/Dockerfile -> kvazaar-2.3.0.tar.gz/Dockerfile
Changed
@@ -15,12 +15,12 @@ # # Use Ubuntu 18.04 as a base for now, it's around 88MB -FROM ubuntu:18.04 +FROM ubuntu:20.04 MAINTAINER Marko Viitanen <fador@iki.fi> # List of needed packages to be able to build kvazaar with autotools -ENV REQUIRED_PACKAGES automake autoconf libtool m4 build-essential git yasm pkgconf +ENV REQUIRED_PACKAGES automake autoconf libtool m4 build-essential git pkgconf COPY . kvazaar # Run all the commands in one RUN so we don't have any extra history
View file
kvazaar-2.2.0.tar.gz/Makefile.am -> kvazaar-2.3.0.tar.gz/Makefile.am
Changed
@@ -4,7 +4,7 @@ dist_man1_MANS = doc/kvazaar.1 -dist_doc_DATA = LICENSE LICENSE.EXT.greatest LICENSE.EXT.x264asm CREDITS README.md +dist_doc_DATA = LICENSE LICENSE.EXT.greatest CREDITS README.md EXTRA_DIST = \ build \
View file
kvazaar-2.2.0.tar.gz/README.md -> kvazaar-2.3.0.tar.gz/README.md
Changed
@@ -51,6 +51,8 @@ comment: # "BEGIN KVAZAAR HELP MESSAGE" ``` +Kvazaar v2.3.0 2024-01-17 +Kvazaar license: 3-clause BSD Usage: kvazaar -i <input> --input-res <width>x<height> -o <output> @@ -95,6 +97,8 @@ - md5: 56 bytes --(no-)psnr : Calculate PSNR for frames. enabled --(no-)info : Add encoder info SEI. enabled + --(no-)enable-logging : Enable logging for regular encoder performance, + error messages are always displayed. enabled --crypto <string> : Selective encryption. Crypto support must be enabled at compile-time. Can be 'on' or 'off' or a list of features separated with a '+'. off @@ -422,11 +426,10 @@ improve in the build process. We want to make this as simple as possible. - ### Autotools Depending on the platform, some additional tools are required for compiling Kvazaar with autotools. -For Ubuntu, the required packages are `automake autoconf libtool m4 build-essential yasm`. Yasm is -optional, but some of the optimization will not be compiled in if it's missing. +For Ubuntu, the required packages are `automake autoconf libtool m4 build-essential`. + Run the following commands to compile and install Kvazaar. @@ -437,6 +440,7 @@ sudo ldconfig See `./configure --help` for more options. +**When building shared library with visual studio the tests will fail to link, the main binary will still work** ### Autotools on MinGW It is recommended to use Clang instead of GCC in MinGW environments. GCC also works, but AVX2 optimizations will be disabled because of a known GCC issue from 2012, so performance will suffer badly. Instead of `./configure`, run @@ -445,6 +449,11 @@ to build Kvazaar using Clang. +### CMake +Depending on the platform, some additional tools are required for compiling Kvazaar with CMake. +For Ubuntu, the required packages are `build-essential cmake`. + + ### OS X - Install Homebrew - run ```brew install automake libtool yasm``` @@ -482,7 +491,7 @@ Please cite this paper(https://dl.acm.org/citation.cfm?doid=2964284.2973796) for Kvazaar:
Viitanen, A. Koivula, A. Lemmetti, A. Ylä-Outinen, J. Vanne, and T. D. Hämäläinen, âKvazaar: open-source HEVC/H.265 encoder,â in Proc. ACM Int. Conf. Multimedia, Amsterdam, The Netherlands, Oct. 2016.``` +```M. Viitanen, A. Koivula, A. Lemmetti, A. Ylä-Outinen, J. Vanne, and T. D. Hämäläinen, Kvazaar: open-source HEVC/H.265 encoder, in Proc. ACM Int. Conf. Multimedia, Amsterdam, The Netherlands, Oct. 2016.``` Or in BibTex: @@ -522,7 +531,7 @@ - Main automatic way of testing is with Travis CI. Commits, branches and pull requests are tested automatically. - Uninitialized variables and such are checked with Valgrind. - - Bitstream validity is checked with HM. + - Bitstream validity is checked with VTM. - Compilation is checked on GCC and Clang on Linux, and Clang on OSX. - Windows msys2 and msvc builds are checked automatically on Appveyor. - If your changes change the bitstream, decode with HM to check that
View file
kvazaar-2.2.0.tar.gz/appveyor.yml -> kvazaar-2.3.0.tar.gz/appveyor.yml
Changed
@@ -72,11 +72,6 @@ - MSYSTEM: MINGW32 - MSYSTEM: MINGW64 - install: - - ps: $url = "http://ultravideo.cs.tut.fi/vsyasm.exe" - - ps: $output = "C:\Tools\vsyasm.exe" - - ps: "(New-Object System.Net.WebClient).DownloadFile($url, $output)" - - ps: '$env:Path += ";$output\.."' build: project: .\build\kvazaar_VS2015.sln
View file
kvazaar-2.2.0.tar.gz/build/C_Properties.props -> kvazaar-2.3.0.tar.gz/build/C_Properties.props
Changed
@@ -24,10 +24,6 @@ <SubSystem>Console</SubSystem> <RandomizedBaseAddress>false</RandomizedBaseAddress> </Link> - <YASM> - <Defines>HAVE_ALIGNED_STACK=1</Defines> - <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths)</IncludePaths> - </YASM> </ItemDefinitionGroup> <ItemGroup /> </Project> \ No newline at end of file
View file
kvazaar-2.2.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj -> kvazaar-2.3.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj
Changed
@@ -46,9 +46,6 @@ <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> - <ImportGroup Label="ExtensionSettings"> - <Import Project="..\yasm\vsyasm.props" /> - </ImportGroup> <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <Import Project="..\C_Properties.props" /> </ImportGroup> @@ -77,23 +74,14 @@ <OutDir>$(SolutionDir)$(Platform)-$(Configuration)-libs\</OutDir> </PropertyGroup> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <YASM /> <Lib> <AdditionalLibraryDirectories> </AdditionalLibraryDirectories> <AdditionalDependencies> </AdditionalDependencies> </Lib> - <YASM> - <Defines>ARCH_X86_64=1;%(Defines)</Defines> - <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths> - </YASM> </ItemDefinitionGroup> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <YASM> - <Defines>ARCH_X86_64=0;PREFIX</Defines> - <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths> - </YASM> <Lib> <AdditionalLibraryDirectories> </AdditionalLibraryDirectories> @@ -106,10 +94,6 @@ </ClCompile> </ItemDefinitionGroup> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <YASM> - <Defines>ARCH_X86_64=0;PREFIX</Defines> - <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths> - </YASM> <Lib> <AdditionalLibraryDirectories> </AdditionalLibraryDirectories> @@ -122,10 +106,6 @@ </ClCompile> </ItemDefinitionGroup> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <YASM> - <Defines>ARCH_X86_64=1;%(Defines)</Defines> - <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths> - </YASM> <Lib> <AdditionalLibraryDirectories> 
</AdditionalLibraryDirectories> @@ -239,7 +219,6 @@ <ClCompile Include="..\..\src\strategies\strategies-nal.c" /> <ClCompile Include="..\..\src\strategies\strategies-picture.c" /> <ClCompile Include="..\..\src\strategies\strategies-sao.c" /> - <ClCompile Include="..\..\src\strategies\x86_asm\picture-x86-asm.c" /> <ClCompile Include="..\..\src\threadwrapper\src\pthread.cpp"> <CompileAs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">CompileAsCpp</CompileAs> <CompileAs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">CompileAsCpp</CompileAs> @@ -318,9 +297,6 @@ <ClInclude Include="..\..\src\strategies\strategies-nal.h" /> <ClInclude Include="..\..\src\strategies\strategies-picture.h" /> <ClInclude Include="..\..\src\strategies\strategies-sao.h" /> - <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm-sad.h" /> - <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm-satd.h" /> - <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm.h" /> <ClInclude Include="..\..\src\strategyselector.h" /> <ClInclude Include="..\..\src\tables.h" /> <ClInclude Include="..\..\src\threadqueue.h" /> @@ -330,18 +306,5 @@ <ClInclude Include="..\..\src\transform.h" /> <ClInclude Include="..\..\src\videoframe.h" /> </ItemGroup> - <ItemGroup> - <YASM Include="..\..\src\extras\x86inc.asm"> - <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild> - <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild> - <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild> - <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild> - </YASM> - <YASM Include="..\..\src\strategies\x86_asm\picture-x86-asm-sad.asm" /> - <YASM Include="..\..\src\strategies\x86_asm\picture-x86-asm-satd.asm" /> - </ItemGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> - 
<ImportGroup Label="ExtensionTargets"> - <Import Project="..\yasm\vsyasm.targets" /> - </ImportGroup> -</Project> +</Project> \ No newline at end of file
View file
kvazaar-2.2.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters -> kvazaar-2.3.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
Changed
@@ -34,9 +34,6 @@ <Filter Include="Optimization\strategies\avx2"> <UniqueIdentifier>{4ffb5d27-c5bb-44d5-a935-fa93066a259e}</UniqueIdentifier> </Filter> - <Filter Include="Optimization\strategies\x86_asm"> - <UniqueIdentifier>{d0ce7d00-30c6-4e8a-b96e-51e13cb038ea}</UniqueIdentifier> - </Filter> <Filter Include="CABAC"> <UniqueIdentifier>{c696e039-5ba4-48ab-845d-cfe1a5713525}</UniqueIdentifier> </Filter> @@ -81,9 +78,6 @@ <ClCompile Include="..\..\src\strategies\avx2\picture-avx2.c"> <Filter>Optimization\strategies\avx2</Filter> </ClCompile> - <ClCompile Include="..\..\src\strategies\x86_asm\picture-x86-asm.c"> - <Filter>Optimization\strategies\x86_asm</Filter> - </ClCompile> <ClCompile Include="..\..\src\strategies\avx2\dct-avx2.c"> <Filter>Optimization\strategies\avx2</Filter> </ClCompile> @@ -375,15 +369,6 @@ <ClInclude Include="..\..\src\strategies\strategies-quant.h"> <Filter>Optimization\strategies</Filter> </ClInclude> - <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm.h"> - <Filter>Optimization\strategies\x86_asm</Filter> - </ClInclude> - <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm-sad.h"> - <Filter>Optimization\strategies\x86_asm</Filter> - </ClInclude> - <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm-satd.h"> - <Filter>Optimization\strategies\x86_asm</Filter> - </ClInclude> <ClInclude Include="..\..\src\strategies\sse41\picture-sse41.h"> <Filter>Optimization\strategies\sse41</Filter> </ClInclude> @@ -478,15 +463,4 @@ <Filter>Control</Filter> </ClInclude> </ItemGroup> - <ItemGroup> - <YASM Include="..\..\src\extras\x86inc.asm"> - <Filter>Extras</Filter> - </YASM> - <YASM Include="..\..\src\strategies\x86_asm\picture-x86-asm-sad.asm"> - <Filter>Optimization\strategies\x86_asm</Filter> - </YASM> - <YASM Include="..\..\src\strategies\x86_asm\picture-x86-asm-satd.asm"> - <Filter>Optimization\strategies\x86_asm</Filter> - </YASM> - </ItemGroup> -</Project> +</Project> \ No newline at end of file
View file
kvazaar-2.2.0.tar.gz/configure.ac -> kvazaar-2.3.0.tar.gz/configure.ac
Changed
@@ -23,7 +23,7 @@ # # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html ver_major=7 -ver_minor=2 +ver_minor=3 ver_release=0 # Prevents configure from adding a lot of defines to the CFLAGS @@ -137,6 +137,22 @@ , cygwin*|msys*|mingw*, CFLAGS="$CFLAGS -D__USE_MINGW_ANSI_STDIO=1" + # Fix a bug in mingw gcc where stack doesn't get aligned properly, force all AVX instructions to be unaligned + AS_CASE($CC, *gcc, + AX_CHECK_COMPILE_FLAG(-Wa,-muse-unaligned-vector-move, + CFLAGS="-Wa,-muse-unaligned-vector-move $CFLAGS", + + AC_MSG_CHECKING(if compiler is gcc) + AS_IF($CC --version | grep "gcc" >/dev/null 2>&1, + AS_ECHO("yes") + AC_MSG_ERROR(-Wa,-muse-unaligned-vector-move not supported, required with mingw+gcc to fix alignment bugs, update the used gcc) + , + AS_ECHO("no") + AS_ECHO("Compiler not gcc, -Wa,-muse-unaligned-vector-move not needed") + ) + + ) + ) AS_IF( test "x$BITS" = "x32", ASFLAGS="$ASFLAGS -fwin32 -DPREFIX -DHAVE_ALIGNED_STACK=0" @@ -165,24 +181,9 @@ ) -# YASM checks -AS_IF(test "x$X86" = "xtrue", - AC_CHECK_TOOL(YASM, yasm, no) -) -AS_IF(test "x$YASM" != "xno", have_yasm="yes") - -AC_ARG_ENABLE(asm, AS_HELP_STRING(--disable-asm, disable assembly no), - , enable_asm="yes" -) -AS_IF(test "x$enable_asm" != "xno" -a $have_yasm != "yes", - enable_asm="no" -) - - AM_CONDITIONAL(HAVE_X86, test "x$X86" = "xtrue") AM_CONDITIONAL(HAVE_PPC, test "x$PPC" = "xtrue") AM_CONDITIONAL(HAVE_ARM, test "x$ARM" = "xtrue") -AM_CONDITIONAL(ENABLE_ASM, test "x$enable_asm" = "xyes" -a "x$have_yasm" = "xyes" ) AC_ARG_VAR(ASFLAGS, ASFLAGS to use for assembler) AC_SUBST(ASFLAGS)
View file
kvazaar-2.2.0.tar.gz/doc/kvazaar.1 -> kvazaar-2.3.0.tar.gz/doc/kvazaar.1
Changed
@@ -1,9 +1,11 @@ -.TH KVAZAAR "1" "January 2023" "kvazaar v2.2.0" "User Commands" +.TH KVAZAAR "1" "January 2024" "kvazaar v2.3.0" "User Commands" .SH NAME kvazaar \- open source HEVC encoder .SH SYNOPSIS \fBkvazaar \fR\-i <input> \-\-input\-res <width>x<height> \-o <output> .SH DESCRIPTION + +.SS "Required:" .TP \fB\-i\fR, \fB\-\-input <filename> Input file @@ -85,6 +87,10 @@ \fB\-\-(no\-)info Add encoder info SEI. enabled .TP +\fB\-\-(no\-)enable\-logging +Enable logging for regular encoder performance, +error messages are always displayed. enabled +.TP \fB\-\-crypto <string> Selective encryption. Crypto support must be enabled at compile\-time. Can be 'on' or 'off' or
View file
kvazaar-2.2.0.tar.gz/src/Makefile.am -> kvazaar-2.3.0.tar.gz/src/Makefile.am
Changed
@@ -14,9 +14,6 @@ include_HEADERS = \ kvazaar.h -noinst_HEADERS = \ - extras/x86inc.asm - noinst_LTLIBRARIES = \ libaltivec.la \ libavx2.la \ @@ -154,8 +151,6 @@ strategies/strategies-sao.h \ strategies/strategies-encode.c \ strategies/strategies-encode.h \ - strategies/x86_asm/picture-x86-asm.c \ - strategies/x86_asm/picture-x86-asm.h \ strategyselector.c \ strategyselector.h \ extras/libmd5.c \ @@ -238,27 +233,6 @@ if HAVE_SSE2 libsse2_la_CFLAGS = -msse2 endif - -if ENABLE_ASM -noinst_LTLIBRARIES += libasm.la -libkvazaar_la_LIBADD += libasm.la -libasm_la_SOURCES = \ - strategies/x86_asm/picture-x86-asm-sad.asm \ - strategies/x86_asm/picture-x86-asm-sad.h \ - strategies/x86_asm/picture-x86-asm-satd.asm \ - strategies/x86_asm/picture-x86-asm-satd.h -libkvazaar_la_CFLAGS += -DKVZ_COMPILE_ASM - -strategies/x86_asm/picture-x86-asm-sad.lo: strategies/x86_asm/picture-x86-asm-sad.asm -strategies/x86_asm/picture-x86-asm-satd.lo: strategies/x86_asm/picture-x86-asm-satd.asm -endif #ENABLE_ASM endif #HAVE_X86 -yasm_verbose = $(yasm_verbose_@AM_V@) -yasm_verbose_ = $(yasm_verbose_@AM_DEFAULT_V@) -yasm_verbose_0 = @echo " YASM " $@; - -.asm.lo: - $(yasm_verbose)$(LIBTOOL) --mode=compile --tag=CC $(YASM) -I$(srcdir)/extras $(ASFLAGS) $< -o $@ -prefer-non-pic 1>/dev/null -
View file
kvazaar-2.2.0.tar.gz/src/cfg.c -> kvazaar-2.3.0.tar.gz/src/cfg.c
Changed
@@ -188,6 +188,9 @@ cfg->force_inter = 0; cfg->intra_chroma_search = 0; cfg->fast_bipred = 1; + + cfg->enable_logging_output = 1; + return 1; } @@ -1407,6 +1410,9 @@ else if OPT("fast-bipred") { cfg->fast_bipred = atobool(value); } + else if OPT("enable-logging") { + cfg->enable_logging_output = atobool(value); + } else { return 0; }
View file
kvazaar-2.2.0.tar.gz/src/cli.c -> kvazaar-2.3.0.tar.gz/src/cli.c
Changed
@@ -176,6 +176,8 @@ { "no-intra-chroma-search", no_argument, NULL, 0 }, { "fast-bipred", no_argument, NULL, 0 }, { "no-fast-bipred", no_argument, NULL, 0 }, + { "enable-logging", no_argument, NULL, 0 }, + { "no-enable-logging", no_argument, NULL, 0 }, {0, 0, 0, 0} }; @@ -382,22 +384,28 @@ void print_usage(void) { + print_version(); fprintf(stdout, - "Kvazaar usage: -i and --input-res to set input, -o to set output\n" - " --help for more information\n"); + "usage: -i and --input-res to set input, -o to set output\n" + " --help for more information\n"); } void print_version(void) { fprintf(stdout, -#ifdef CMAKE_BUILD + "kvazaar " VERSION_STRING " " KVZ_COMPILER_STRING " " KVZ_COMPILE_DATE "\n"); +#else + "Kvazaar " VERSION_STRING "\n" "Kvazaar license: 3-clause BSD\n"); +#endif } void print_help(void) { + print_version(); fprintf(stdout, "Usage:\n" "kvazaar -i <input> --input-res <width>x<height> -o <output>\n" @@ -447,6 +455,8 @@ " - md5: 56 bytes\n" " --(no-)psnr : Calculate PSNR for frames. enabled\n" " --(no-)info : Add encoder info SEI. enabled\n" + " --(no-)enable-logging : Enable logging for regular encoder performance,\n" + " error messages are always displayed. enabled\n" " --crypto <string> : Selective encryption. Crypto support must be\n" " enabled at compile-time. Can be 'on' or 'off' or\n" " a list of features separated with a '+'. off\n"
View file
kvazaar-2.2.0.tar.gz/src/encmain.c -> kvazaar-2.3.0.tar.gz/src/encmain.c
Changed
@@ -527,10 +527,12 @@ const encoder_control_t *encoder = enc->control; - fprintf(stderr, "Input: %s, output: %s\n", opts->input, opts->output); - fprintf(stderr, " Video size: %dx%d (input=%dx%d)\n", - encoder->in.width, encoder->in.height, - encoder->in.real_width, encoder->in.real_height); + if(opts->config->enable_logging_output) { + fprintf(stderr, "Input: %s, output: %s\n", opts->input, opts->output); + fprintf(stderr, " Video size: %dx%d (input=%dx%d)\n", + encoder->in.width, encoder->in.height, + encoder->in.real_width, encoder->in.real_height); + } if (opts->seek > 0 && !yuv_io_seek(input, opts->seek, opts->config->width, opts->config->height, opts->config->file_format)) { fprintf(stderr, "Failed to seek %d frames.\n", opts->seek); @@ -687,7 +689,7 @@ // Compute and print stats. double frame_psnr3 = { 0.0, 0.0, 0.0 }; - if (encoder->cfg.calc_psnr && encoder->cfg.source_scan_type == KVZ_INTERLACING_NONE) { + if (encoder->cfg.calc_psnr && encoder->cfg.source_scan_type == KVZ_INTERLACING_NONE && encoder->cfg.enable_logging_output) { // Do not compute PSNR for interlaced frames, because img_rec does not contain // the deinterlaced frame yet. compute_psnr(img_src, img_rec, frame_psnr); @@ -719,8 +721,10 @@ psnr_sum1 += frame_psnr1; psnr_sum2 += frame_psnr2; - print_frame_info(&info_out, frame_psnr, len_out, encoder->cfg.calc_psnr, - calc_avg_qp(qp_sum, frames_done)); + if (opts->config->enable_logging_output) { + print_frame_info(&info_out, frame_psnr, len_out, encoder->cfg.calc_psnr, + calc_avg_qp(qp_sum, frames_done)); + } } api->picture_free(cur_in_img); @@ -735,19 +739,20 @@ // All reconstructed pictures should have been output. 
assert(recon_buffer_size == 0); - - // Print statistics of the coding - fprintf(stderr, " Processed %d frames, %10llu bits", - frames_done, - (long long unsigned int)bitstream_length * 8); - if (encoder->cfg.calc_psnr && frames_done > 0) { - fprintf(stderr, " AVG PSNR Y %2.4f U %2.4f V %2.4f", - psnr_sum0 / frames_done, - psnr_sum1 / frames_done, - psnr_sum2 / frames_done); + if (opts->config->enable_logging_output) { + // Print statistics of the coding + fprintf(stderr, " Processed %d frames, %10llu bits", + frames_done, + (long long unsigned int)bitstream_length * 8); + if (encoder->cfg.calc_psnr && frames_done > 0) { + fprintf(stderr, " AVG PSNR Y %2.4f U %2.4f V %2.4f", + psnr_sum0 / frames_done, + psnr_sum1 / frames_done, + psnr_sum2 / frames_done); + } + fprintf(stderr, "\n"); + fprintf(stderr, " Total CPU time: %.3f s.\n", ((float)(clock() - start_time)) / CLOCKS_PER_SEC); } - fprintf(stderr, "\n"); - fprintf(stderr, " Total CPU time: %.3f s.\n", ((float)(clock() - start_time)) / CLOCKS_PER_SEC); { const double mega = (double)(1 << 20); @@ -774,14 +779,16 @@ encoding_cpu = 100.0; } #endif - fprintf(stderr, " Encoding time: %.3f s.\n", encoding_time); - fprintf(stderr, " Encoding wall time: %.3f s.\n", wall_time); + if (opts->config->enable_logging_output) { + fprintf(stderr, " Encoding time: %.3f s.\n", encoding_time); + fprintf(stderr, " Encoding wall time: %.3f s.\n", wall_time); - fprintf(stderr, " Encoding CPU usage: %.2f%%\n", encoding_cpu); - fprintf(stderr, " FPS: %.2f\n", encoding_fps); + fprintf(stderr, " Encoding CPU usage: %.2f%%\n", encoding_cpu); + fprintf(stderr, " FPS: %.2f\n", encoding_fps); - fprintf(stderr, " Bitrate: %.3f Mbps\n", bitrate_mbps); - fprintf(stderr, " AVG QP: %.1f\n", avg_qp); + fprintf(stderr, " Bitrate: %.3f Mbps\n", bitrate_mbps); + fprintf(stderr, " AVG QP: %.1f\n", avg_qp); + } } pthread_join(input_thread, NULL); }
View file
kvazaar-2.2.0.tar.gz/src/encoder.c -> kvazaar-2.3.0.tar.gz/src/encoder.c
Changed
@@ -216,12 +216,12 @@ // completed. encoder->cfg.owf += 2; - fprintf(stderr, "--owf=auto value set to %d.\n", encoder->cfg.owf); + if (cfg->enable_logging_output) fprintf(stderr, "--owf=auto value set to %d.\n", encoder->cfg.owf); } if (encoder->cfg.threads < 0) { encoder->cfg.threads = MIN(max_threads, get_max_parallelism(encoder)); - fprintf(stderr, "--threads=auto value set to %d.\n", encoder->cfg.threads); + if (cfg->enable_logging_output) fprintf(stderr, "--threads=auto value set to %d.\n", encoder->cfg.threads); } if (encoder->cfg.source_scan_type != KVZ_INTERLACING_NONE) {
View file
kvazaar-2.2.0.tar.gz/src/encoder_state-bitstream.c -> kvazaar-2.3.0.tar.gz/src/encoder_state-bitstream.c
Changed
@@ -179,7 +179,7 @@ uint32_t ref_matrix_id = UINT32_MAX; for (pred_list_idx = list_id; pred_list_idx >= 0; pred_list_idx--) { - const int32_t * const pred_list = (list_id == pred_list_idx) ? + const coeff_t* const pred_list = (list_id == pred_list_idx) ? kvz_scalinglist_get_default(size_id, pred_list_idx) : encoder->scaling_list.scaling_list_coeffsize_idpred_list_idx; @@ -200,7 +200,7 @@ const int32_t coef_num = MIN(MAX_MATRIX_COEF_NUM, kvz_g_scaling_list_sizesize_id); const uint32_t * const scan_cg = (size_id == 0) ? g_sig_last_scan_16x16 : g_sig_last_scan_32x32; int32_t next_coef = 8; - const int32_t * const coef_list = encoder->scaling_list.scaling_list_coeffsize_idlist_id; + const coeff_t* const coef_list = encoder->scaling_list.scaling_list_coeffsize_idlist_id; if (size_id >= SCALING_LIST_16x16) { WRITE_SE(stream, encoder->scaling_list.scaling_list_dcsize_idlist_id - 8, "scaling_list_dc_coef_minus8"); @@ -504,7 +504,9 @@ WRITE_U(stream, 0, 1, "constrained_intra_pred_flag"); WRITE_U(stream, encoder->cfg.trskip_enable, 1, "transform_skip_enabled_flag"); - if (state->frame->max_qp_delta_depth >= 0) { + // Check all the conditions for setting cu_qp_delta_enabled_flag here, since state->frame->max_qp_delta_depth might not be set yet. + if (encoder->cfg.target_bitrate > 0 || encoder->cfg.erp_aqp || encoder->cfg.roi.file_path || + encoder->cfg.set_qp_in_cu || encoder->cfg.vaq || (state->tile->frame->source && state->tile->frame->source->roi.roi_array) ) { // Use separate QP for each LCU when rate control is enabled. WRITE_U(stream, 1, 1, "cu_qp_delta_enabled_flag"); WRITE_UE(stream, state->frame->max_qp_delta_depth, "diff_cu_qp_delta_depth");
View file
kvazaar-2.2.0.tar.gz/src/encoder_state-ctors_dtors.c -> kvazaar-2.3.0.tar.gz/src/encoder_state-ctors_dtors.c
Changed
@@ -68,6 +68,9 @@ state->frame->rc_beta = -1.367; state->frame->icost = 0; + // Reset max_qp_delta_depth here, was causing problems when headers are requested before input is fed in + state->frame->max_qp_delta_depth = 0; + const encoder_control_t * const encoder = state->encoder_control; const int num_lcus = encoder->in.width_in_lcu * encoder->in.height_in_lcu; state->frame->lcu_stats = calloc(num_lcus, sizeof(lcu_stats_t));
View file
kvazaar-2.2.0.tar.gz/src/extras/libmd5.c -> kvazaar-2.3.0.tar.gz/src/extras/libmd5.c
Changed
@@ -27,11 +27,11 @@ #ifndef __BIG_ENDIAN__ # define byteReverse(buf, len) /* Nothing */ #else -void byteReverse(uint32_t *buf, unsigned len); +static void byteReverse(uint32_t *buf, unsigned len); /* * Note: this code is harmless on little-endian machines. */ -void byteReverse(uint32_t *buf, unsigned len) +static void byteReverse(uint32_t *buf, unsigned len) { uint32_t t; do {
View file
kvazaar-2.2.0.tar.gz/src/global.h -> kvazaar-2.3.0.tar.gz/src/global.h
Changed
@@ -47,6 +47,10 @@ #include "config.h" // IWYU pragma: export #endif +#ifdef CMAKE_BUILD +#include "version.h" +#endif + // Include some basics in all files, like assert, primitives and NULL. // If you add anything to this list with export pragma, think long and // and hard if it's actually a good idea to incude it for every c-file. @@ -215,13 +219,12 @@ #define QUOTE(x) #x #define QUOTE_EXPAND(x) QUOTE(x) -// NOTE: When making a release, check to see if incrementing libversion in -// configure.ac is necessary. #ifndef KVZ_VERSION -#define KVZ_VERSION 2.2.0 +#define KVZ_VERSION 2.3.0 #endif #define VERSION_STRING QUOTE_EXPAND(KVZ_VERSION) + //#define VERBOSE 1 #define SAO_ABS_OFFSET_MAX ((1 << (MIN(KVZ_BIT_DEPTH, 10) - 5)) - 1)
View file
kvazaar-2.2.0.tar.gz/src/kvazaar.c -> kvazaar-2.3.0.tar.gz/src/kvazaar.c
Changed
@@ -93,7 +93,7 @@ //Initialize strategies // TODO: Make strategies non-global - if (!kvz_strategyselector_init(cfg->cpuid, KVZ_BIT_DEPTH)) { + if (!kvz_strategyselector_init(cfg->cpuid, KVZ_BIT_DEPTH, cfg->enable_logging_output)) { fprintf(stderr, "Failed to initialize strategies.\n"); goto kvazaar_open_failure; }
View file
kvazaar-2.2.0.tar.gz/src/kvazaar.h -> kvazaar-2.3.0.tar.gz/src/kvazaar.h
Changed
@@ -492,6 +492,8 @@ uint8_t intra_chroma_search; uint8_t fast_bipred; + + uint8_t enable_logging_output; //!< \brief May be used to disable the logging output to stderr. Default: on. } kvz_config; /**
View file
kvazaar-2.2.0.tar.gz/src/kvazaar.pc.in -> kvazaar-2.3.0.tar.gz/src/kvazaar.pc.in
Changed
@@ -1,11 +1,12 @@ -prefix=@prefix@ +prefix=@CMAKE_INSTALL_PREFIX@ exec_prefix=${prefix} -libdir=@libdir@ +libdir=${prefix}/lib incdir=${prefix}/include Name: libkvazaar -Description: Open-source HEVC encoder -Version: @VERSION@ +Description: @CMAKE_PROJECT_DESCRIPTION@ +URL: @CMAKE_PROJECT_HOMEPAGE_URL@ +Version: @PROJECT_VERSION@ Libs: -L${libdir} -lkvazaar Libs.private: @LIBS@ Cflags: -I${incdir}
View file
kvazaar-2.2.0.tar.gz/src/rate_control.c -> kvazaar-2.3.0.tar.gz/src/rate_control.c
Changed
@@ -189,7 +189,7 @@ bits_coded -= state->frame->cur_gop_bits_coded; } - smoothing_window = MAX(MIN_SMOOTHING_WINDOW, smoothing_window - encoder->cfg.gop_len / 2); + smoothing_window = MAX(MIN_SMOOTHING_WINDOW, smoothing_window - MAX(encoder->cfg.gop_len / 2, 1)); double gop_target_bits = -1; while( gop_target_bits < 0 && smoothing_window < 150) { @@ -375,7 +375,7 @@ else { alpha = 0.3; } - return MAX(100, alpha*pow(state->frame->icost * 4 / bits, beta)*bits); + return MIN(MAX(100, alpha*pow(state->frame->icost * 4 / bits, beta)*bits), encoder->cfg.gop_len >= 2 ? 0.85 * state->frame->cur_gop_target_bits : state->frame->cur_gop_target_bits); } if (encoder->cfg.gop_len <= 0) {
View file
kvazaar-2.2.0.tar.gz/src/rdo.c -> kvazaar-2.3.0.tar.gz/src/rdo.c
Changed
@@ -148,19 +148,6 @@ 0.027313232421875, 5.736968994140625, }; - -// This struct is for passing data to kvz_rdoq_sign_hiding -struct sh_rates_t { - // Bit cost of increasing rate by one. - int32_t inc32 * 32; - // Bit cost of decreasing rate by one. - int32_t dec32 * 32; - // Bit cost of going from zero to one. - int32_t sig_coeff_inc32 * 32; - // Coeff minus quantized coeff. - int32_t quant_delta32 * 32; -}; - int kvz_init_rdcost_outfiles(const char *dir_path) { #define RD_SAMPLING_MAX_FN_LENGTH 4095 @@ -532,7 +519,7 @@ const encoder_state_t *const state, const int32_t qp_scaled, const uint32_t *const scan2raster, - const struct sh_rates_t *const sh_rates, + const struct kvz_sh_rates_t *const sh_rates, const int32_t last_pos, const coeff_t *const coeffs, coeff_t *const quant_coeffs) @@ -686,7 +673,7 @@ int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift; - const int32_t *quant_coeff = encoder->scaling_list.quant_coefflog2_tr_size-2scalinglist_typeqp_scaled%6; + const coeff_t *quant_coeff = encoder->scaling_list.quant_coefflog2_tr_size-2scalinglist_typeqp_scaled%6; const double *err_scale = encoder->scaling_list.error_scalelog2_tr_size-2scalinglist_typeqp_scaled%6; double block_uncoded_cost = 0; @@ -695,7 +682,7 @@ double cost_sig 32 * 32 ; double cost_coeff0 32 * 32 ; - struct sh_rates_t sh_rates; + struct kvz_sh_rates_t sh_rates; const uint32_t *scan_cg = g_sig_last_scan_cglog2_block_size - 2scan_mode; const uint32_t cg_size = 16; @@ -744,29 +731,9 @@ //Find last cg and last scanpos int32_t cg_scanpos; - for (cg_scanpos = (cg_num - 1); cg_scanpos >= 0; cg_scanpos--) - { - for (int32_t scanpos_in_cg = (cg_size - 1); scanpos_in_cg >= 0; scanpos_in_cg--) - { - int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; - uint32_t blkpos = scanscanpos; - int32_t q = quant_coeffblkpos; - int32_t level_double = coefblkpos; - level_double = MIN(abs(level_double) * q, MAX_INT - (1 << (q_bits - 1))); - uint32_t max_abs_level = (level_double + (1 << (q_bits - 1))) >> 
q_bits; - - if (max_abs_level > 0) { - last_scanpos = scanpos; - ctx_set = (scanpos > 0 && type == 0) ? 2 : 0; - cg_last_scanpos = cg_scanpos; - sh_rates.sig_coeff_incblkpos = 0; - break; - } - dest_coeffblkpos = 0; - } - if (last_scanpos != -1) break; - } - + kvz_find_last_scanpos(coef, dest_coeff, type, q_bits, quant_coeff, &sh_rates, cg_size, &ctx_set, scan, &cg_last_scanpos, + &last_scanpos, cg_num, &cg_scanpos, width, scan_mode); + if (last_scanpos == -1) { return; }
View file
kvazaar-2.2.0.tar.gz/src/rdo.h -> kvazaar-2.3.0.tar.gz/src/rdo.h
Changed
@@ -45,6 +45,19 @@ #include "search_inter.h" +// This struct is for passing data to kvz_rdoq_sign_hiding +struct kvz_sh_rates_t { + // Bit cost of increasing rate by one. + int32_t inc32 * 32; + // Bit cost of decreasing rate by one. + int32_t dec32 * 32; + // Bit cost of going from zero to one. + int32_t sig_coeff_inc32 * 32; + // Coeff minus quantized coeff. + int32_t quant_delta32 * 32; +}; + + extern const uint32_t kvz_g_go_rice_range5; extern const uint32_t kvz_g_go_rice_prefix_len5;
View file
kvazaar-2.2.0.tar.gz/src/scalinglist.c -> kvazaar-2.3.0.tar.gz/src/scalinglist.c
Changed
@@ -43,7 +43,7 @@ const uint16_t kvz_g_scaling_list_size4 = { 16, 64, 256,1024}; static const uint8_t g_scaling_list_size_x4 = { 4, 8,16,32}; -static const int32_t g_quant_default_4x416 = +static const coeff_t g_quant_default_4x416 = { 16,16,16,16, 16,16,16,16, @@ -51,7 +51,7 @@ 16,16,16,16 }; -static const int32_t g_quant_intra_default_8x864 = +static const coeff_t g_quant_intra_default_8x864 = { 16,16,16,16,17,18,21,24, 16,16,16,16,17,19,22,25, @@ -63,7 +63,7 @@ 24,25,29,36,47,65,88,115 }; -static const int32_t g_quant_inter_default_8x864 = +static const coeff_t g_quant_inter_default_8x864 = { 16,16,16,16,17,18,20,24, 16,16,16,17,18,20,24,25, @@ -75,8 +75,8 @@ 24,25,28,33,41,54,71,91 }; -const int16_t kvz_g_quant_scales6 = { 26214,23302,20560,18396,16384,14564 }; -const int16_t kvz_g_inv_quant_scales6 = { 40,45,51,57,64,72 }; +const coeff_t kvz_g_quant_scales6 = { 26214,23302,20560,18396,16384,14564 }; +const coeff_t kvz_g_inv_quant_scales6 = { 40,45,51,57,64,72 }; /** @@ -91,12 +91,12 @@ for (listId = 0; listId < kvz_g_scaling_list_numsizeId; listId++) { for (qp = 0; qp < 6; qp++) { if (!(sizeId == 3 && listId == 3)) { - scaling_list->quant_coeffsizeIdlistIdqp = (int32_t*)calloc(kvz_g_scaling_list_sizesizeId, sizeof(int32_t)); - scaling_list->de_quant_coeffsizeIdlistIdqp = (int32_t*)calloc(kvz_g_scaling_list_sizesizeId, sizeof(int32_t)); + scaling_list->quant_coeffsizeIdlistIdqp = (coeff_t*)calloc(kvz_g_scaling_list_sizesizeId, sizeof(int32_t)); + scaling_list->de_quant_coeffsizeIdlistIdqp = (coeff_t*)calloc(kvz_g_scaling_list_sizesizeId, sizeof(int32_t)); scaling_list->error_scalesizeIdlistIdqp = (double*)calloc(kvz_g_scaling_list_sizesizeId, sizeof(double)); } } - scaling_list->scaling_list_coeffsizeIdlistId = (int32_t*)calloc(MIN(MAX_MATRIX_COEF_NUM, kvz_g_scaling_list_sizesizeId), sizeof(int32_t)); + scaling_list->scaling_list_coeffsizeIdlistId = (coeff_t*)calloc(MIN(MAX_MATRIX_COEF_NUM, kvz_g_scaling_list_sizesizeId), sizeof(int32_t)); } } // alias, assign 
pointer to an existing array @@ -263,9 +263,9 @@ #undef LINE_BUFSIZE } -const int32_t *kvz_scalinglist_get_default(const uint32_t size_id, const uint32_t list_id) +const coeff_t *kvz_scalinglist_get_default(const uint32_t size_id, const uint32_t list_id) { - const int32_t *list_ptr = g_quant_intra_default_8x8; // Default to "8x8" intra + const coeff_t *list_ptr = g_quant_intra_default_8x8; // Default to "8x8" intra switch(size_id) { case SCALING_LIST_4x4: list_ptr = g_quant_default_4x4; @@ -286,7 +286,7 @@ * \brief get scaling list for decoder * */ -static void scalinglist_process_dec(const int32_t * const coeff, int32_t *dequantcoeff, +static void scalinglist_process_dec(const coeff_t * const coeff, coeff_t *dequantcoeff, int32_t inv_quant_scales, uint32_t height, uint32_t width, uint32_t ratio, int32_t size_num, uint32_t dc, @@ -315,7 +315,7 @@ * \brief get scaling list for encoder * */ -void kvz_scalinglist_process_enc(const int32_t * const coeff, int32_t* quantcoeff, const int32_t quant_scales, +void kvz_scalinglist_process_enc(const coeff_t * const coeff, coeff_t * quantcoeff, const int32_t quant_scales, const uint32_t height, const uint32_t width, const uint32_t ratio, const int32_t size_num, const uint32_t dc, const uint8_t flat) { @@ -354,7 +354,7 @@ int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - bitdepth - log2_tr_size; // Represents scaling through forward transform uint32_t i,max_num_coeff = kvz_g_scaling_list_sizesize; - const int32_t *quantcoeff = scaling_list->quant_coeffsizelistqp; + const coeff_t *quantcoeff = scaling_list->quant_coeffsizelistqp; //This cast is allowed, since error_scale is a malloc'd pointer in kvz_scalinglist_init double *err_scale = (double *) scaling_list->error_scalesizelistqp; @@ -372,15 +372,15 @@ * \brief set scaling lists * */ -void kvz_scalinglist_set(scaling_list_t * const scaling_list, const int32_t * const coeff, uint32_t listId, uint32_t sizeId, uint32_t qp) +void kvz_scalinglist_set(scaling_list_t * const 
scaling_list, const coeff_t* const coeff, uint32_t listId, uint32_t sizeId, uint32_t qp) { const uint32_t width = g_scaling_list_size_xsizeId; const uint32_t height = g_scaling_list_size_xsizeId; const uint32_t ratio = g_scaling_list_size_xsizeId / MIN(8, g_scaling_list_size_xsizeId); const uint32_t dc = scaling_list->scaling_list_dcsizeIdlistId != 0 ? scaling_list->scaling_list_dcsizeIdlistId : 16; //These cast are allowed, since these are pointer's to malloc'd area in kvz_scalinglist_init - int32_t *quantcoeff = (int32_t*) scaling_list->quant_coeffsizeIdlistIdqp; - int32_t *dequantcoeff = (int32_t*) scaling_list->de_quant_coeffsizeIdlistIdqp; + coeff_t*quantcoeff = (coeff_t*) scaling_list->quant_coeffsizeIdlistIdqp; + coeff_t*dequantcoeff = (coeff_t*) scaling_list->de_quant_coeffsizeIdlistIdqp; // Encoder list kvz_scalinglist_process_enc(coeff, quantcoeff, kvz_g_quant_scalesqp<<4, height, width, ratio, @@ -410,7 +410,7 @@ for (size = 0; size < SCALING_LIST_SIZE_NUM; size++) { for (list = 0; list < kvz_g_scaling_list_numsize; list++) { - const int32_t * const list_ptr = scaling_list->use_default_list ? + const coeff_t* const list_ptr = scaling_list->use_default_list ? kvz_scalinglist_get_default(size, list) : scaling_list->scaling_list_coeffsizelist;
View file
kvazaar-2.2.0.tar.gz/src/scalinglist.h -> kvazaar-2.3.0.tar.gz/src/scalinglist.h
Changed
@@ -47,16 +47,16 @@ int8_t enable; int8_t use_default_list; int32_t scaling_list_dc[SCALING_LIST_SIZE_NUM][SCALING_LIST_NUM]; - const int32_t *scaling_list_coeff[SCALING_LIST_SIZE_NUM][SCALING_LIST_NUM]; - const int32_t *quant_coeff[4][6][6]; - const int32_t *de_quant_coeff[SCALING_LIST_SIZE_NUM][SCALING_LIST_NUM][SCALING_LIST_REM_NUM]; + const coeff_t *scaling_list_coeff[SCALING_LIST_SIZE_NUM][SCALING_LIST_NUM]; + const coeff_t *quant_coeff[4][6][6]; + const coeff_t *de_quant_coeff[SCALING_LIST_SIZE_NUM][SCALING_LIST_NUM][SCALING_LIST_REM_NUM]; const double *error_scale[4][6][6]; } scaling_list_t; extern const uint8_t kvz_g_scaling_list_num[4]; extern const uint16_t kvz_g_scaling_list_size[4]; -const int32_t *kvz_scalinglist_get_default(const uint32_t size_id, const uint32_t list_id); +const coeff_t*kvz_scalinglist_get_default(const uint32_t size_id, const uint32_t list_id); void kvz_scalinglist_init(scaling_list_t * const scaling_list); void kvz_scalinglist_destroy(scaling_list_t * const scaling_list);
View file
kvazaar-2.2.0.tar.gz/src/strategies/avx2/quant-avx2.c -> kvazaar-2.3.0.tar.gz/src/strategies/avx2/quant-avx2.c
Changed
@@ -152,7 +152,7 @@ return _mm256_inserti128_si256(v, hi, 1); } -static INLINE void scanord_read_vector_32(const int32_t *__restrict quant_coeff, +static INLINE void scanord_read_vector_32(const coeff_t *__restrict quant_coeff, const uint32_t *__restrict scan, int8_t scan_mode, int32_t subpos, @@ -190,15 +190,14 @@ _mm256_setr_epi32(2, 6, 0, 4, 3, 7, 1, 5), }; - __m128i coeffs4 = { - _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets0)), - _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets1)), - _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets2)), - _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets3)), - }; + coeff_t coeffs16; + memcpy(coeffs, quant_coeff + row_offsets0, sizeof(coeff_t) * 4); + memcpy(coeffs + 4, quant_coeff + row_offsets1, sizeof(coeff_t) * 4); + memcpy(coeffs + 8, quant_coeff + row_offsets2, sizeof(coeff_t) * 4); + memcpy(coeffs + 12, quant_coeff + row_offsets3, sizeof(coeff_t) * 4); - __m256i coeffs_upper = concatenate_2x128i(coeffs0, coeffs1); - __m256i coeffs_lower = concatenate_2x128i(coeffs2, coeffs3); + __m256i coeffs_upper = _mm256_cvtepi16_epi32(_mm_load_si128((__m128i const *)(coeffs))); + __m256i coeffs_lower = _mm256_cvtepi16_epi32(_mm_load_si128((__m128i const*)(coeffs + 8))); __m256i lower_shuffled = _mm256_permutevar8x32_epi32(coeffs_lower, shufmasksscan_mode); @@ -368,7 +367,7 @@ int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6); const uint32_t log2_tr_size = kvz_g_convert_to_bitwidth + 2; const int32_t scalinglist_type = (block_type == CU_INTRA ? 
0 : 3) + (int8_t)("\0\3\1\2"type); - const int32_t *quant_coeff = encoder->scaling_list.quant_coefflog2_tr_size - 2scalinglist_typeqp_scaled % 6; + const coeff_t *quant_coeff = encoder->scaling_list.quant_coefflog2_tr_size - 2scalinglist_typeqp_scaled % 6; const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - log2_tr_size; //!< Represents scaling through forward transform const int32_t q_bits = QUANT_SHIFT + qp_scaled / 6 + transform_shift; const int32_t add = ((state->frame->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9); @@ -393,8 +392,8 @@ v_sign = _mm256_or_si256(v_sign, _mm256_set1_epi16(1)); if (state->encoder_control->scaling_list.enable) { - __m256i v_quant_coeff_lo = _mm256_loadu_si256(((__m256i *)(quant_coeff + n)) + 0); - __m256i v_quant_coeff_hi = _mm256_loadu_si256(((__m256i *)(quant_coeff + n)) + 1); + __m256i v_quant_coeff_lo = _mm256_cvtepi16_epi32(_mm_loadu_si128(((__m128i *)(quant_coeff + n)) + 0)); + __m256i v_quant_coeff_hi = _mm256_cvtepi16_epi32(_mm_loadu_si128(((__m128i *)(quant_coeff + n)) + 1)); low_b = _mm256_permute2x128_si256(v_quant_coeff_lo, v_quant_coeff_hi, @@ -739,7 +738,7 @@ uint32_t log2_tr_size = kvz_g_convert_to_bit width + 2; int32_t scalinglist_type = (block_type == CU_INTRA ? 
0 : 3) + (int8_t)("\0\3\1\2"type); - const int32_t *dequant_coef = encoder->scaling_list.de_quant_coefflog2_tr_size-2scalinglist_typeqp_scaled%6; + const coeff_t *dequant_coef = encoder->scaling_list.de_quant_coefflog2_tr_size-2scalinglist_typeqp_scaled%6; shift += 4; if (shift >qp_scaled / 6) { @@ -863,6 +862,72 @@ return (double)(temp) / 256.0; } +static void find_last_scanpos_avx2(coeff_t* coef, coeff_t* dest_coeff, int8_t type, int32_t q_bits, const coeff_t* quant_coeff, struct kvz_sh_rates_t* sh_rates, const uint32_t cg_size, + uint16_t* ctx_set, const uint32_t* scan, int32_t* cg_last_scanpos, int32_t* last_scanpos, uint32_t cg_num, int32_t* cg_scanpos, int32_t width, int8_t scan_mode) { + + __m256i min_q_bits = _mm256_set1_epi32(MAX_INT - (1 << (q_bits - 1))); + __m256i q_bits_v = _mm256_set1_epi32(1 << (q_bits - 1)); + for (*cg_scanpos = (cg_num - 1); *cg_scanpos >= 0; (*cg_scanpos)--) { + int32_t scan_pos = *cg_scanpos * cg_size; + int32_t block_pos = scanscan_pos; + coeff_t q_array16; + memcpy(q_array, &quant_coeffblock_pos, 4 * sizeof(coeff_t)); + memcpy(q_array + 4, &quant_coeffblock_pos + width, 4 * sizeof(coeff_t)); + memcpy(q_array + 8, &quant_coeffblock_pos + 2 * width, 4 * sizeof(coeff_t)); + memcpy(q_array + 12, &quant_coeffblock_pos + 3 * width, 4 * sizeof(coeff_t)); + + coeff_t coef_array16; + memcpy(coef_array, &coefblock_pos, 4 * sizeof(coeff_t)); + memcpy(coef_array + 4, &coefblock_pos + width, 4 * sizeof(coeff_t)); + memcpy(coef_array + 8, &coefblock_pos + 2 * width, 4 * sizeof(coeff_t)); + memcpy(coef_array + 12, &coefblock_pos + 3 * width, 4 * sizeof(coeff_t)); + + __m256i q = _mm256_loadu_si256((__m256i const*)q_array); + + __m256i level_double = _mm256_loadu_si256((__m256i const*)coef_array); + + __m256i abs_level_double = _mm256_abs_epi16(level_double); + + __m256i levels_mul_q_low = _mm256_mullo_epi16(abs_level_double, q); + __m256i levels_mul_q_high = _mm256_mulhi_epi16(abs_level_double, q); + + __m256i levels_mul_0 = 
_mm256_unpacklo_epi16(levels_mul_q_low, levels_mul_q_high); + __m256i levels_mul_1 = _mm256_unpackhi_epi16(levels_mul_q_low, levels_mul_q_high); + + __m256i min_mask = _mm256_cmpgt_epi32(min_q_bits, levels_mul_0); + levels_mul_0 = _mm256_blendv_epi8(min_q_bits, levels_mul_0, min_mask); + min_mask = _mm256_cmpgt_epi32(min_q_bits, levels_mul_1); + levels_mul_1 = _mm256_blendv_epi8(min_q_bits, levels_mul_1, min_mask); + + __m256i max_abs_level_low = _mm256_add_epi32(levels_mul_0, q_bits_v); + max_abs_level_low = _mm256_srai_epi32(max_abs_level_low, q_bits); + __m256i max_abs_level_high = _mm256_add_epi32(levels_mul_1, q_bits_v); + max_abs_level_high = _mm256_srai_epi32(max_abs_level_high, q_bits); + + memset(&dest_coeffblock_pos, 0, sizeof(coeff_t) * 4); + memset(&dest_coeffblock_pos + width, 0, sizeof(coeff_t) * 4); + memset(&dest_coeffblock_pos + 2 * width, 0, sizeof(coeff_t) * 4); + memset(&dest_coeffblock_pos + 3 * width, 0, sizeof(coeff_t) * 4); + if (!_mm256_testz_si256(max_abs_level_low, max_abs_level_low) || !_mm256_testz_si256(max_abs_level_high, max_abs_level_high)) { + uint32_t max_abs_level16; + _mm256_storeu2_m128i((__m128i*)(max_abs_level + 8), (__m128i*)(max_abs_level), max_abs_level_low); + _mm256_storeu2_m128i((__m128i*)(max_abs_level + 12), (__m128i*)(max_abs_level + 4), max_abs_level_high); + for (int sp = scan_pos + 15; sp >= scan_pos; sp--) { + uint32_t blkpos = kvz_g_sig_last_scanscan_mode1sp - scan_pos; + if (max_abs_levelblkpos > 0) { + *last_scanpos = sp; + *ctx_set = (sp > 0 && type == 0) ? 
2 : 0; + *cg_last_scanpos = *cg_scanpos; + sh_rates->sig_coeff_incscansp = 0; + return; + } + } + } + } + *last_scanpos = -1; +} + + #endif //COMPILE_INTEL_AVX2 && defined X86_64 int kvz_strategy_register_quant_avx2(void* opaque, uint8_t bitdepth) @@ -879,6 +944,7 @@ success &= kvz_strategyselector_register(opaque, "quant", "avx2", 40, &kvz_quant_avx2); success &= kvz_strategyselector_register(opaque, "coeff_abs_sum", "avx2", 0, &coeff_abs_sum_avx2); success &= kvz_strategyselector_register(opaque, "fast_coeff_cost", "avx2", 40, &fast_coeff_cost_avx2); + success &= kvz_strategyselector_register(opaque, "find_last_scanpos", "avx2", 40, &find_last_scanpos_avx2); #endif //COMPILE_INTEL_AVX2 && defined X86_64 return success;
View file
kvazaar-2.2.0.tar.gz/src/strategies/generic/quant-generic.c -> kvazaar-2.3.0.tar.gz/src/strategies/generic/quant-generic.c
Changed
@@ -57,7 +57,7 @@ int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6); const uint32_t log2_tr_size = kvz_g_convert_to_bit[width] + 2; const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]); - const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_size - 2][scalinglist_type][qp_scaled % 6]; + const coeff_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_size - 2][scalinglist_type][qp_scaled % 6]; const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - log2_tr_size; //!< Represents scaling through forward transform const int32_t q_bits = QUANT_SHIFT + qp_scaled / 6 + transform_shift; const int32_t add = ((state->frame->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9); @@ -311,7 +311,7 @@ uint32_t log2_tr_size = kvz_g_convert_to_bit[width] + 2; int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]); - const int32_t *dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_size-2][scalinglist_type][qp_scaled%6]; + const coeff_t *dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_size-2][scalinglist_type][qp_scaled%6]; shift += 4; if (shift >qp_scaled / 6) { @@ -374,6 +374,31 @@ return (double) sum / 256.0; } + + +static void find_last_scanpos_generic(coeff_t* coef, coeff_t* dest_coeff, int8_t type, int32_t q_bits, const coeff_t* quant_coeff, struct kvz_sh_rates_t* sh_rates, const uint32_t cg_size, uint16_t* ctx_set, const uint32_t* scan, int32_t* cg_last_scanpos, int32_t* last_scanpos, uint32_t cg_num, int32_t* cg_scanpos, int32_t width, int8_t scan_mode) { + for (*cg_scanpos = (cg_num - 1); *cg_scanpos >= 0; (*cg_scanpos)--) { + for (int32_t scanpos_in_cg = (cg_size - 1); scanpos_in_cg >= 0; scanpos_in_cg--) { + int32_t scanpos = *cg_scanpos * cg_size + scanpos_in_cg; + uint32_t blkpos = scan[scanpos]; + int32_t q = quant_coeff[blkpos]; + int32_t level_double = coef[blkpos]; + level_double = MIN(abs(level_double) * q, MAX_INT - (1 << (q_bits - 1))); + uint32_t max_abs_level = (level_double + (1 << (q_bits - 1))) >> q_bits; + + if (max_abs_level > 0) { + *last_scanpos = scanpos; + *ctx_set = (scanpos > 0 && type == 0) ? 2 : 0; + *cg_last_scanpos = *cg_scanpos; + sh_rates->sig_coeff_inc[blkpos] = 0; + return; + } + dest_coeff[blkpos] = 0; + } + } +} + + int kvz_strategy_register_quant_generic(void* opaque, uint8_t bitdepth) { bool success = true; @@ -383,6 +408,7 @@ success &= kvz_strategyselector_register(opaque, "dequant", "generic", 0, &kvz_dequant_generic); success &= kvz_strategyselector_register(opaque, "coeff_abs_sum", "generic", 0, &coeff_abs_sum_generic); success &= kvz_strategyselector_register(opaque, "fast_coeff_cost", "generic", 0, &fast_coeff_cost_generic); + success &= kvz_strategyselector_register(opaque, "find_last_scanpos", "generic", 0, &find_last_scanpos_generic); return success; }
View file
kvazaar-2.2.0.tar.gz/src/strategies/strategies-picture.c -> kvazaar-2.3.0.tar.gz/src/strategies/strategies-picture.c
Changed
@@ -37,7 +37,6 @@ #include "strategies/generic/picture-generic.h" #include "strategies/sse2/picture-sse2.h" #include "strategies/sse41/picture-sse41.h" -#include "strategies/x86_asm/picture-x86-asm.h" #include "strategyselector.h" @@ -93,9 +92,6 @@ if (kvz_g_hardware_flags.intel_flags.sse41) { success &= kvz_strategy_register_picture_sse41(opaque, bitdepth); } - if (kvz_g_hardware_flags.intel_flags.avx) { - success &= kvz_strategy_register_picture_x86_asm_avx(opaque, bitdepth); - } if (kvz_g_hardware_flags.intel_flags.avx2) { success &= kvz_strategy_register_picture_avx2(opaque, bitdepth); }
View file
kvazaar-2.2.0.tar.gz/src/strategies/strategies-quant.c -> kvazaar-2.3.0.tar.gz/src/strategies/strategies-quant.c
Changed
@@ -43,6 +43,7 @@ dequant_func *kvz_dequant; coeff_abs_sum_func *kvz_coeff_abs_sum; fast_coeff_cost_func *kvz_fast_coeff_cost; +find_last_scanpos_func* kvz_find_last_scanpos; int kvz_strategy_register_quant(void* opaque, uint8_t bitdepth) {
View file
kvazaar-2.2.0.tar.gz/src/strategies/strategies-quant.h -> kvazaar-2.3.0.tar.gz/src/strategies/strategies-quant.h
Changed
@@ -44,6 +44,7 @@ #include "kvazaar.h" #include "tables.h" +struct kvz_sh_rates_t; // Declare function pointers. typedef void (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width, int32_t height, int8_t type, int8_t scan_idx, int8_t block_type); @@ -60,12 +61,16 @@ typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length); +typedef void (find_last_scanpos_func)(coeff_t* coef, coeff_t* dest_coeff, int8_t type, int32_t q_bits, const coeff_t* quant_coeff, struct kvz_sh_rates_t* sh_rates, const uint32_t cg_size, + uint16_t* ctx_set, const uint32_t* scan, int32_t* cg_last_scanpos, int32_t* last_scanpos, uint32_t cg_num, int32_t* cg_scanpos, int32_t width, int8_t scan_mode); + // Declare function pointers. extern quant_func * kvz_quant; extern quant_residual_func * kvz_quantize_residual; extern dequant_func *kvz_dequant; extern coeff_abs_sum_func *kvz_coeff_abs_sum; extern fast_coeff_cost_func *kvz_fast_coeff_cost; +extern find_last_scanpos_func *kvz_find_last_scanpos; int kvz_strategy_register_quant(void* opaque, uint8_t bitdepth); @@ -76,6 +81,7 @@ {"dequant", (void**) &kvz_dequant}, \ {"coeff_abs_sum", (void**) &kvz_coeff_abs_sum}, \ {"fast_coeff_cost", (void**) &kvz_fast_coeff_cost}, \ + {"find_last_scanpos", (void**) &kvz_find_last_scanpos}, \
View file
kvazaar-2.2.0.tar.gz/src/strategyselector.c -> kvazaar-2.3.0.tar.gz/src/strategyselector.c
Changed
@@ -46,13 +46,13 @@ hardware_flags_t kvz_g_strategies_in_use; hardware_flags_t kvz_g_strategies_available; -static void set_hardware_flags(int32_t cpuid); +static void set_hardware_flags(int32_t cpuid, uint8_t logging); static void* strategyselector_choose_for(const strategy_list_t * const strategies, const char * const strategy_type); //Strategies to include (add new file here) //Returns 1 if successful -int kvz_strategyselector_init(int32_t cpuid, uint8_t bitdepth) { +int kvz_strategyselector_init(int32_t cpuid, uint8_t bitdepth, uint8_t logging) { const strategy_to_select_t *cur_strategy_to_select = strategies_to_select; strategy_list_t strategies; @@ -60,7 +60,7 @@ strategies.count = 0; strategies.strategies = NULL; - set_hardware_flags(cpuid); + set_hardware_flags(cpuid, logging); //Add new register function here if (!kvz_strategy_register_picture(&strategies, bitdepth)) { @@ -118,109 +118,109 @@ //Also check what optimizations are available and what are in use //SIMD optimizations available bool strategies_available = false; - fprintf(stderr, "Available: "); + if (logging) fprintf(stderr, "Available: "); if (kvz_g_strategies_available.intel_flags.avx != 0){ - fprintf(stderr, "avx(%d) ", kvz_g_strategies_available.intel_flags.avx); + if (logging) fprintf(stderr, "avx(%d) ", kvz_g_strategies_available.intel_flags.avx); strategies_available = true; } if (kvz_g_strategies_available.intel_flags.avx2 != 0){ - fprintf(stderr, "avx2(%d) ", kvz_g_strategies_available.intel_flags.avx2); + if (logging) fprintf(stderr, "avx2(%d) ", kvz_g_strategies_available.intel_flags.avx2); strategies_available = true; } if (kvz_g_strategies_available.intel_flags.mmx != 0) { - fprintf(stderr, "mmx(%d) ", kvz_g_strategies_available.intel_flags.mmx); + if (logging) fprintf(stderr, "mmx(%d) ", kvz_g_strategies_available.intel_flags.mmx); strategies_available = true; } if (kvz_g_strategies_available.intel_flags.sse != 0) { - fprintf(stderr, "sse(%d) ", 
kvz_g_strategies_available.intel_flags.sse); + if (logging) fprintf(stderr, "sse(%d) ", kvz_g_strategies_available.intel_flags.sse); strategies_available = true; } if (kvz_g_strategies_available.intel_flags.sse2 != 0) { - fprintf(stderr, "sse2(%d) ", kvz_g_strategies_available.intel_flags.sse2); + if (logging) fprintf(stderr, "sse2(%d) ", kvz_g_strategies_available.intel_flags.sse2); strategies_available = true; } if (kvz_g_strategies_available.intel_flags.sse3 != 0) { - fprintf(stderr, "sse3(%d) ", kvz_g_strategies_available.intel_flags.sse3); + if (logging) fprintf(stderr, "sse3(%d) ", kvz_g_strategies_available.intel_flags.sse3); strategies_available = true; } if (kvz_g_strategies_available.intel_flags.sse41 != 0) { - fprintf(stderr, "sse41(%d) ", kvz_g_strategies_available.intel_flags.sse41); + if (logging) fprintf(stderr, "sse41(%d) ", kvz_g_strategies_available.intel_flags.sse41); strategies_available = true; } if (kvz_g_strategies_available.intel_flags.sse42 != 0) { - fprintf(stderr, "sse42(%d) ", kvz_g_strategies_available.intel_flags.sse42); + if (logging) fprintf(stderr, "sse42(%d) ", kvz_g_strategies_available.intel_flags.sse42); strategies_available = true; } if (kvz_g_strategies_available.intel_flags.ssse3 != 0) { - fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_available.intel_flags.ssse3); + if (logging) fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_available.intel_flags.ssse3); strategies_available = true; } if (kvz_g_strategies_available.arm_flags.neon != 0) { - fprintf(stderr, "neon(%d) ", kvz_g_strategies_available.arm_flags.neon); + if (logging) fprintf(stderr, "neon(%d) ", kvz_g_strategies_available.arm_flags.neon); strategies_available = true; } if (kvz_g_strategies_available.powerpc_flags.altivec != 0) { - fprintf(stderr, "altivec(%d) ", kvz_g_strategies_available.powerpc_flags.altivec); + if (logging) fprintf(stderr, "altivec(%d) ", kvz_g_strategies_available.powerpc_flags.altivec); strategies_available = true; } //If there is no strategies 
available if (!strategies_available){ - fprintf(stderr, "no SIMD optimizations"); + if (logging) fprintf(stderr, "no SIMD optimizations"); } - fprintf(stderr, "\n"); + if (logging) fprintf(stderr, "\n"); //SIMD optimizations in use bool strategies_in_use = false; - fprintf(stderr, "In use: "); + if (logging) fprintf(stderr, "In use: "); if (kvz_g_strategies_in_use.intel_flags.avx != 0){ - fprintf(stderr, "avx(%d) ", kvz_g_strategies_in_use.intel_flags.avx); + if (logging) fprintf(stderr, "avx(%d) ", kvz_g_strategies_in_use.intel_flags.avx); strategies_in_use = true; } if (kvz_g_strategies_in_use.intel_flags.avx2 != 0){ - fprintf(stderr, "avx2(%d) ", kvz_g_strategies_in_use.intel_flags.avx2); + if (logging) fprintf(stderr, "avx2(%d) ", kvz_g_strategies_in_use.intel_flags.avx2); strategies_in_use = true; } if (kvz_g_strategies_in_use.intel_flags.mmx != 0) { - fprintf(stderr, "mmx(%d) ", kvz_g_strategies_in_use.intel_flags.mmx); + if (logging) fprintf(stderr, "mmx(%d) ", kvz_g_strategies_in_use.intel_flags.mmx); strategies_in_use = true; } if (kvz_g_strategies_in_use.intel_flags.sse != 0) { - fprintf(stderr, "sse(%d) ", kvz_g_strategies_in_use.intel_flags.sse); + if (logging) fprintf(stderr, "sse(%d) ", kvz_g_strategies_in_use.intel_flags.sse); strategies_in_use = true; } if (kvz_g_strategies_in_use.intel_flags.sse2 != 0) { - fprintf(stderr, "sse2(%d) ", kvz_g_strategies_in_use.intel_flags.sse2); + if (logging) fprintf(stderr, "sse2(%d) ", kvz_g_strategies_in_use.intel_flags.sse2); strategies_in_use = true; } if (kvz_g_strategies_in_use.intel_flags.sse3 != 0) { - fprintf(stderr, "sse3(%d) ", kvz_g_strategies_in_use.intel_flags.sse3); + if (logging) fprintf(stderr, "sse3(%d) ", kvz_g_strategies_in_use.intel_flags.sse3); strategies_in_use = true; } if (kvz_g_strategies_in_use.intel_flags.sse41 != 0) { - fprintf(stderr, "sse41(%d) ", kvz_g_strategies_in_use.intel_flags.sse41); + if (logging) fprintf(stderr, "sse41(%d) ", kvz_g_strategies_in_use.intel_flags.sse41); 
strategies_in_use = true; } if (kvz_g_strategies_in_use.intel_flags.sse42 != 0) { - fprintf(stderr, "sse42(%d) ", kvz_g_strategies_in_use.intel_flags.sse42); + if (logging) fprintf(stderr, "sse42(%d) ", kvz_g_strategies_in_use.intel_flags.sse42); strategies_in_use = true; } if (kvz_g_strategies_in_use.intel_flags.ssse3 != 0) { - fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_in_use.intel_flags.ssse3); + if (logging) fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_in_use.intel_flags.ssse3); strategies_in_use = true; } if (kvz_g_strategies_in_use.arm_flags.neon != 0) { - fprintf(stderr, "neon(%d) ", kvz_g_strategies_in_use.arm_flags.neon); + if (logging) fprintf(stderr, "neon(%d) ", kvz_g_strategies_in_use.arm_flags.neon); strategies_in_use = true; } if (kvz_g_strategies_in_use.powerpc_flags.altivec != 0) { - fprintf(stderr, "altivec(%d) ", kvz_g_strategies_in_use.powerpc_flags.altivec); + if (logging) fprintf(stderr, "altivec(%d) ", kvz_g_strategies_in_use.powerpc_flags.altivec); strategies_in_use = true; } //If there is no strategies in use if (!strategies_in_use){ - fprintf(stderr, "no SIMD optimizations"); + if (logging) fprintf(stderr, "no SIMD optimizations"); } - fprintf(stderr, "\n"); + if (logging) fprintf(stderr, "\n"); //Free memory free(strategies.strategies); @@ -449,7 +449,7 @@ # endif #endif //COMPILE_POWERPC -static void set_hardware_flags(int32_t cpuid) { +static void set_hardware_flags(int32_t cpuid, uint8_t logging) { FILL(kvz_g_hardware_flags, 0); #if COMPILE_INTEL @@ -535,59 +535,63 @@ } } - fprintf(stderr, "Compiled: INTEL, flags:"); + if (logging) { + fprintf(stderr, "Compiled: INTEL, flags:"); #if COMPILE_INTEL_MMX - fprintf(stderr, " MMX"); + fprintf(stderr, " MMX"); #endif #if COMPILE_INTEL_SSE - fprintf(stderr, " SSE"); + fprintf(stderr, " SSE"); #endif #if COMPILE_INTEL_SSE2 - fprintf(stderr, " SSE2"); + fprintf(stderr, " SSE2"); #endif #if COMPILE_INTEL_SSE3 - fprintf(stderr, " SSE3"); + fprintf(stderr, " SSE3"); #endif #if 
COMPILE_INTEL_SSSE3 - fprintf(stderr, " SSSE3"); + fprintf(stderr, " SSSE3"); #endif #if COMPILE_INTEL_SSE41 - fprintf(stderr, " SSE41"); + fprintf(stderr, " SSE41"); #endif #if COMPILE_INTEL_SSE42 - fprintf(stderr, " SSE42"); + fprintf(stderr, " SSE42"); #endif #if COMPILE_INTEL_AVX - fprintf(stderr, " AVX"); + fprintf(stderr, " AVX"); #endif #if COMPILE_INTEL_AVX2 - fprintf(stderr, " AVX2"); + fprintf(stderr, " AVX2"); #endif - fprintf(stderr, "\nDetected: INTEL, flags:"); - if (kvz_g_hardware_flags.intel_flags.mmx) fprintf(stderr, " MMX"); - if (kvz_g_hardware_flags.intel_flags.sse) fprintf(stderr, " SSE"); - if (kvz_g_hardware_flags.intel_flags.sse2) fprintf(stderr, " SSE2"); - if (kvz_g_hardware_flags.intel_flags.sse3) fprintf(stderr, " SSE3"); - if (kvz_g_hardware_flags.intel_flags.ssse3) fprintf(stderr, " SSSE3"); - if (kvz_g_hardware_flags.intel_flags.sse41) fprintf(stderr, " SSE41"); - if (kvz_g_hardware_flags.intel_flags.sse42) fprintf(stderr, " SSE42"); - if (kvz_g_hardware_flags.intel_flags.avx) fprintf(stderr, " AVX"); - if (kvz_g_hardware_flags.intel_flags.avx2) fprintf(stderr, " AVX2"); - fprintf(stderr, "\n"); + fprintf(stderr, "\nDetected: INTEL, flags:"); + if (kvz_g_hardware_flags.intel_flags.mmx) fprintf(stderr, " MMX"); + if (kvz_g_hardware_flags.intel_flags.sse) fprintf(stderr, " SSE"); + if (kvz_g_hardware_flags.intel_flags.sse2) fprintf(stderr, " SSE2"); + if (kvz_g_hardware_flags.intel_flags.sse3) fprintf(stderr, " SSE3"); + if (kvz_g_hardware_flags.intel_flags.ssse3) fprintf(stderr, " SSSE3"); + if (kvz_g_hardware_flags.intel_flags.sse41) fprintf(stderr, " SSE41"); + if (kvz_g_hardware_flags.intel_flags.sse42) fprintf(stderr, " SSE42"); + if (kvz_g_hardware_flags.intel_flags.avx) fprintf(stderr, " AVX"); + if (kvz_g_hardware_flags.intel_flags.avx2) fprintf(stderr, " AVX2"); + fprintf(stderr, "\n"); + } #endif //COMPILE_INTEL #if COMPILE_POWERPC if (cpuid) { kvz_g_hardware_flags.powerpc_flags.altivec = altivec_available(); } - - 
fprintf(stderr, "Compiled: PowerPC, flags:"); + + if (logging) { + fprintf(stderr, "Compiled: PowerPC, flags:"); #if COMPILE_POWERPC_ALTIVEC - fprintf(stderr, " AltiVec"); + fprintf(stderr, " AltiVec"); #endif - fprintf(stderr, "\nDetected: PowerPC, flags:"); - if (kvz_g_hardware_flags.powerpc_flags.altivec) fprintf(stderr, " AltiVec"); - fprintf(stderr, "\n"); + fprintf(stderr, "\nDetected: PowerPC, flags:"); + if (kvz_g_hardware_flags.powerpc_flags.altivec) fprintf(stderr, " AltiVec"); + fprintf(stderr, "\n"); + } #endif }
View file
kvazaar-2.2.0.tar.gz/src/strategyselector.h -> kvazaar-2.3.0.tar.gz/src/strategyselector.h
Changed
@@ -95,7 +95,7 @@ extern hardware_flags_t kvz_g_strategies_in_use; extern hardware_flags_t kvz_g_strategies_available; -int kvz_strategyselector_init(int32_t cpuid, uint8_t bitdepth); +int kvz_strategyselector_init(int32_t cpuid, uint8_t bitdepth, uint8_t enable_logging_output); int kvz_strategyselector_register(void *opaque, const char *type, const char *strategy_name, int priority, void *fptr);
View file
kvazaar-2.2.0.tar.gz/src/threads.h -> kvazaar-2.3.0.tar.gz/src/threads.h
Changed
@@ -42,6 +42,10 @@ #include <pthread.h> +#ifdef __APPLE__ +#include <AvailabilityMacros.h> +#endif + #if defined(__GNUC__) && !defined(__MINGW32__) #include <unistd.h> // IWYU pragma: export #include <time.h> // IWYU pragma: export @@ -84,9 +88,9 @@ #endif //__GNUC__ -#ifdef __APPLE__ -// POSIX semaphores are deprecated on Mac so we use Grand Central Dispatch -// semaphores instead. +#if defined(__APPLE__) && MAC_OS_X_VERSION_MIN_REQUIRED > 1050 && !defined(__ppc__) +// POSIX semaphores are deprecated on Mac so we use Grand Central Dispatch semaphores instead. +// However GCD is supported only on 10.6+, and is not supported on any ppc, including 10.6 Rosetta. #include <dispatch/dispatch.h> typedef dispatch_semaphore_t kvz_sem_t; @@ -113,7 +117,7 @@ } #else -// Use POSIX semaphores. +// Use POSIX semaphores. This is also a fallback for old Darwin. #include <semaphore.h> typedef sem_t kvz_sem_t;
View file
kvazaar-2.2.0.tar.gz/src/transform.c -> kvazaar-2.3.0.tar.gz/src/transform.c
Changed
@@ -340,7 +340,6 @@ const kvz_pixel *ref = NULL; // Pointers to current location in arrays with quantized coefficients. coeff_t *coeff = NULL; - switch (color) { case COLOR_Y: pred = &lcu->rec.yoffset;
View file
kvazaar-2.3.0.tar.gz/src/version.h.in
Added
@@ -0,0 +1,39 @@ +#pragma once +/***************************************************************************** + * This file is part of kvazaar HEVC encoder. + * + * Copyright (c) 2022, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ****************************************************************************/ + +#ifndef KVZ_VERSION +#define KVZ_VERSION @PROJECT_VERSION@ +#endif +#define KVZ_COMPILER_STRING "@KVZ_COMPILER_STRING@" +#define KVZ_COMPILE_DATE "@CMAKE_BUILD_DATE@" +#define VERSION_STRING QUOTE_EXPAND(KVZ_VERSION)
View file
kvazaar-2.3.0.tar.gz/tests/CMakeLists.txt
Added
@@ -0,0 +1,43 @@ +file( GLOB TEST_SOURCES "*.c" ) + +# ToDo: fix the tests +list(REMOVE_ITEM TEST_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/inter_recon_bipred_tests.c") + +add_executable(kvazaar_tests ${TEST_SOURCES} ) + +target_include_directories(kvazaar_tests PUBLIC ${PROJECT_SOURCE_DIR}) +target_include_directories(kvazaar_tests PUBLIC ${PROJECT_SOURCE_DIR}/src) +target_include_directories(kvazaar_tests PUBLIC ${PROJECT_SOURCE_DIR}/src/extras) + +add_definitions(-DKVZ_DLL_EXPORTS) + +if(BUILD_SHARED_LIBS) + add_definitions(-DPIC) +endif() + +if(MSVC) + target_include_directories(kvazaar_tests PUBLIC ../src/threadwrapper/include) + + set_property( SOURCE ${TEST_SOURCES} APPEND PROPERTY COMPILE_FLAGS "/arch:AVX2" ) + add_definitions(-DWIN32_LEAN_AND_MEAN -D_WIN32 -DWIN32 -DWIN64) +else() + list(APPEND ALLOW_AVX2 "x86_64" "AMD64") + if(${CMAKE_SYSTEM_PROCESSOR} IN_LIST ALLOW_AVX2) + set_property( SOURCE ${TEST_SOURCES} APPEND PROPERTY COMPILE_FLAGS "-mavx2 -mbmi -mpopcnt -mlzcnt -mbmi2" ) + endif() + find_package(Threads REQUIRED) + target_link_libraries(kvazaar_tests PUBLIC Threads::Threads) + + include(CheckLibraryExists) + + CHECK_LIBRARY_EXISTS(m sin "" HAVE_LIB_M) + + if (HAVE_LIB_M) + set(EXTRA_LIBS ${EXTRA_LIBS} m) + endif (HAVE_LIB_M) + + target_link_libraries(kvazaar_tests PUBLIC ${EXTRA_LIBS}) +endif() + +target_link_libraries(kvazaar_tests PUBLIC kvazaar) +
View file
kvazaar-2.2.0.tar.gz/tests/test_strategies.c -> kvazaar-2.3.0.tar.gz/tests/test_strategies.c
Changed
@@ -45,7 +45,7 @@ strategies.strategies = NULL; // Init strategyselector because it sets hardware flags. - kvz_strategyselector_init(1, KVZ_BIT_DEPTH); + kvz_strategyselector_init(1, KVZ_BIT_DEPTH, 1); // Collect all strategies to be tested. if (!kvz_strategy_register_picture(&strategies, KVZ_BIT_DEPTH)) {
Locations
Projects
Search
Status Monitor
Help
Open Build Service
OBS Manuals
API Documentation
OBS Portal
Reporting a Bug
Contact
Mailing List
Forums
Chat (IRC)
Twitter
Open Build Service (OBS)
is an
openSUSE project
.