Projects
Essentials
kvazaar
Sign Up
Log In
Username
Password
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
Expand all
Collapse all
Changes of Revision 19
View file
kvazaar.changes
Changed
@@ -1,4 +1,16 @@ ------------------------------------------------------------------- +Wed Jan 17 18:39:21 UTC 2024 - Luigi Baldoni <aloisio@gmx.com> + +- Update to version 2.3.0 + Too many changes to list, see + https://github.com/ultravideo/kvazaar/compare/v2.2.0...v2.3.0 +- Drop kvazaar.memset.patch (no longer necessary with gcc11) +- Add kvazaar-fix_libm_underlinking.patch, + kvazaar-add_soversion.patch, kvazaar-fix_install_libdir.patch + and kvazaar-fix_install_mandir.patch +- Use gcc11 on Leap + +------------------------------------------------------------------- Wed Jan 4 11:29:30 UTC 2023 - Luigi Baldoni <aloisio@gmx.com> - Update to version 2.2.0
View file
kvazaar.spec
Changed
@@ -1,7 +1,7 @@ # # spec file for package kvazaar # -# Copyright (c) 2023 Packman Team <packman@links2linux.de> +# Copyright (c) 2024 Packman Team <packman@links2linux.de> # Copyright (c) 2017 SUSE LINUX GmbH, Nuernberg, Germany. # # All modifications and additions to the file contributed by third parties @@ -16,24 +16,33 @@ # Please submit bugfixes or comments via https://bugs.links2linux.org/ # - %define libname libkvazaar %define libmver 7 Name: kvazaar -Version: 2.2.0 -Release: 0 +Version: 2.3.0 +Release: 0.pm.0 Summary: HEVC encoder License: BSD-3-Clause Group: Productivity/Multimedia/Video/Editors and Convertors URL: http://ultravideo.cs.tut.fi/#encoder Source0: https://github.com/ultravideo/%{name}/archive/v%{version}.tar.gz#/%{name}-%{version}.tar.gz -Patch0: kvazaar.memset.patch -BuildRequires: automake -BuildRequires: findutils +# PATCH-FIX-OPENSUSE kvazaar-fix_libm_underlinking.patch +Patch1: kvazaar-fix_libm_underlinking.patch +# PATCH-FIX-UPSTREAM kvazaar-add_soversion.patch +Patch2: kvazaar-add_soversion.patch +# PATCH-FIX-OPENSUSE kvazaar-fix_install_libdir.patch +Patch3: kvazaar-fix_install_libdir.patch +# PATCH-FIX-OPENSUSE kvazaar-fix_install_mandir.patch +Patch4: kvazaar-fix_install_mandir.patch +BuildRequires: cmake >= 3.12 BuildRequires: gcc >= 4.4 -BuildRequires: gcc-c++ -BuildRequires: libtool BuildRequires: pkgconfig +%if 0%{?suse_version} > 1500 +BuildRequires: gcc-c++ +%else +BuildRequires: gcc11 +BuildRequires: gcc11-c++ +%endif Requires: %{libname}%{libmver} = %{version} %ifnarch %{arm} BuildRequires: yasm @@ -58,21 +67,20 @@ Header files for the %{libname} library %prep -%setup -q -%patch0 -p1 +%autosetup -p1 %build -autoreconf -fvi -%configure \ - --disable-static \ - --disable-silent-rules \ - --docdir=%{_defaultdocdir}/%{name} -make %{?_smp_mflags} +export CC=gcc +export CXX=g++ +test -x "$(type -p gcc-11)" && export CC=gcc-11 +test -x "$(type -p g++-11)" && export CXX=g++-11 + +%cmake \ + -DCMAKE_SKIP_INSTALL_RPATH=ON +%cmake_build 
%install -%make_install -find %{buildroot} -type f -name "*.la" -delete -print -rm %{buildroot}%{_defaultdocdir}/%{name}/LICENSE* +%cmake_install %post -n %{libname}%{libmver} -p /sbin/ldconfig %postun -n %{libname}%{libmver} -p /sbin/ldconfig
View file
kvazaar-add_soversion.patch
Added
@@ -0,0 +1,21 @@ +From 621a2bba8f12c9fed07c266e590bc05dea2861b2 Mon Sep 17 00:00:00 2001 +From: Joose Sainio <joose.sainio@tuni.fi> +Date: Thu, 18 Jan 2024 09:14:35 +0200 +Subject: PATCH CMake versions .so file + +--- + CMakeLists.txt | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 278939d9..1f459c44 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -165,6 +165,7 @@ if(MSVC) + set_property( SOURCE ${LIB_SOURCES_STRATEGIES_AVX2} APPEND PROPERTY COMPILE_FLAGS "/arch:AVX2" ) + else() + set_target_properties(kvazaar-bin PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/src) ++ set_target_properties(kvazaar PROPERTIES SOVERSION "7" VERSION "7.3.0") + list(APPEND ALLOW_AVX2 "x86_64" "AMD64") + if(${CMAKE_SYSTEM_PROCESSOR} IN_LIST ALLOW_AVX2) + set_property( SOURCE ${LIB_SOURCES_STRATEGIES_AVX2} APPEND PROPERTY COMPILE_FLAGS "-mavx2 -mbmi -mpopcnt -mlzcnt -mbmi2" )
View file
kvazaar-fix_install_libdir.patch
Added
@@ -0,0 +1,37 @@ +Index: kvazaar-2.3.0/CMakeLists.txt +=================================================================== +--- kvazaar-2.3.0.orig/CMakeLists.txt ++++ kvazaar-2.3.0/CMakeLists.txt +@@ -128,7 +128,7 @@ if(MSVC) + endif() + + if(BUILD_SHARED_LIBS) +- list( APPEND CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib" "./" "../lib" ) ++ list( APPEND CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_FULL_LIBDIR}" "./" "../lib" ) + set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) + add_library(kvazaar SHARED ${LIB_SOURCES}) + else() +@@ -233,9 +233,9 @@ source_group( "" FILES ${SOURCE_GROUP_TO + + # ToDo: make configurable + +-install(FILES ${PROJECT_SOURCE_DIR}/src/kvazaar.pc DESTINATION ${CMAKE_INSTALL_PREFIX}/share/pkgconfig) ++install(FILES ${PROJECT_SOURCE_DIR}/src/kvazaar.pc DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/pkgconfig) + install(TARGETS kvazaar-bin DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) +-install(TARGETS kvazaar DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) ++install(TARGETS kvazaar DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}) + if(BUILD_SHARED_LIBS) # Just add the lib to the bin directory for now + if(MSVC) + install(TARGETS kvazaar DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) +Index: kvazaar-2.3.0/src/kvazaar.pc.in +=================================================================== +--- kvazaar-2.3.0.orig/src/kvazaar.pc.in ++++ kvazaar-2.3.0/src/kvazaar.pc.in +@@ -1,6 +1,6 @@ + prefix=@CMAKE_INSTALL_PREFIX@ + exec_prefix=${prefix} +-libdir=${prefix}/lib ++libdir=@CMAKE_INSTALL_FULL_LIBDIR@ + incdir=${prefix}/include + + Name: libkvazaar
View file
kvazaar-fix_install_mandir.patch
Added
@@ -0,0 +1,13 @@ +Index: kvazaar-2.3.0/CMakeLists.txt +=================================================================== +--- kvazaar-2.3.0.orig/CMakeLists.txt ++++ kvazaar-2.3.0/CMakeLists.txt +@@ -242,7 +242,7 @@ if(BUILD_SHARED_LIBS) # Just add the lib + endif() + endif() + install(FILES ${PROJECT_SOURCE_DIR}/src/kvazaar.h DESTINATION ${CMAKE_INSTALL_PREFIX}/include) +-install(FILES ${PROJECT_SOURCE_DIR}/doc/kvazaar.1 DESTINATION ${CMAKE_INSTALL_PREFIX}/share/man) ++install(FILES ${PROJECT_SOURCE_DIR}/doc/kvazaar.1 DESTINATION ${CMAKE_INSTALL_PREFIX}/share/man/man1) + + IF(UNIX) + # DIST
View file
kvazaar-fix_libm_underlinking.patch
Added
@@ -0,0 +1,12 @@ +Index: kvazaar-2.3.0/CMakeLists.txt +=================================================================== +--- kvazaar-2.3.0.orig/CMakeLists.txt ++++ kvazaar-2.3.0/CMakeLists.txt +@@ -182,6 +182,7 @@ else() + set(EXTRA_LIBS ${EXTRA_LIBS} m) + endif (HAVE_LIB_M) + ++ target_link_libraries(kvazaar PUBLIC ${EXTRA_LIBS}) + target_link_libraries(kvazaar-bin PUBLIC ${EXTRA_LIBS}) + endif() +
View file
kvazaar.memset.patch
Deleted
@@ -1,22 +0,0 @@ -Index: kvazaar-1.2.0/src/rdo.c -=================================================================== ---- kvazaar-1.2.0.orig/src/rdo.c -+++ kvazaar-1.2.0/src/rdo.c -@@ -593,6 +593,7 @@ void kvz_rdoq(encoder_state_t * const st - - uint32_t cg_num = width * height >> 4; - -+#if 0 - // Explicitly tell the only possible numbers of elements to be zeroed. - // Hope the compiler is able to utilize this information. - switch (cg_num) { -@@ -602,6 +603,9 @@ void kvz_rdoq(encoder_state_t * const st - case 64: FILL_ARRAY(sig_coeffgroup_flag, 0, 64); break; - default: assert(0 && "There should be 1, 4, 16 or 64 coefficient groups"); - } -+#else -+ memset(&sig_coeffgroup_flag, 0, sizeof(sig_coeffgroup_flag)); -+#endif - - cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.cu_sig_coeff_group_modeltype); - cabac_ctx_t *baseCtx = (type == 0) ? &(cabac->ctx.cu_sig_model_luma0) : &(cabac->ctx.cu_sig_model_chroma0);
View file
kvazaar-2.2.0.tar.gz/LICENSE.EXT.x264asm
Deleted
@@ -1,2 +0,0 @@ -Kvazaar uses x264asm abstraction layer -library (included in src/x86/x86inc.asm) -licensed under ISC license.
View file
kvazaar-2.2.0.tar.gz/build/yasm
Deleted
-(directory)
View file
kvazaar-2.2.0.tar.gz/build/yasm/vsyasm.props
Deleted
@@ -1,31 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> - <PropertyGroup - Condition="'$(YASMBeforeTargets)' == '' and '$(YASMAfterTargets)' == '' and '$(ConfigurationType)' != 'Makefile'"> - <YASMBeforeTargets>Midl</YASMBeforeTargets> - <YASMAfterTargets>CustomBuild</YASMAfterTargets> - </PropertyGroup> - <PropertyGroup> - <YASMDependsOn - Condition="'$(ConfigurationType)' != 'Makefile'">_SelectedFiles;$(YASMDependsOn)</YASMDependsOn> - </PropertyGroup> - <!-- Object format name for vsyasm must be in lower case. --> - <PropertyGroup Condition="'$(Platform)' == 'Win32'"> - <YASMFormat>win32</YASMFormat> - </PropertyGroup> - <PropertyGroup Condition="'$(Platform)' == 'x64'"> - <YASMFormat>win64</YASMFormat> - </PropertyGroup> - <ItemDefinitionGroup> - <YASM> - <Debug>False</Debug> - <ObjectFile>$(IntDir)</ObjectFile> - <PreProc>0</PreProc> - <Parser>0</Parser> - <CommandLineTemplate>vsyasm.exe -Xvc -f $(YASMFormat) AllOptions AdditionalOptions Inputs</CommandLineTemplate> - <Outputs>%(ObjectFile)</Outputs> - <ExecutionDescription>Assembling %(Filename)%(Extension)</ExecutionDescription> - <ShowOnlyRuleProperties>false</ShowOnlyRuleProperties> - </YASM> - </ItemDefinitionGroup> -</Project>
View file
kvazaar-2.2.0.tar.gz/build/yasm/vsyasm.targets
Deleted
@@ -1,109 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> - <ItemGroup> - <PropertyPageSchema - Include="$(MSBuildThisFileDirectory)$(MSBuildThisFileName).xml" /> - <AvailableItemName - Include="YASM"> - <Targets>_YASM</Targets> - </AvailableItemName> - </ItemGroup> - <UsingTask - TaskName="YASM" - TaskFactory="XamlTaskFactory" - AssemblyName="Microsoft.Build.Tasks.v4.0"> - <Task>$(MSBuildThisFileDirectory)$(MSBuildThisFileName).xml</Task> - </UsingTask> - <Target - Name="_YASM" - BeforeTargets="$(YASMBeforeTargets)" - AfterTargets="$(YASMAfterTargets)" - Condition="'@(YASM)' != ''" - DependsOnTargets="$(YASMDependsOn);ComputeYASMOutput" - Outputs="@(YASM->'$(ProjectDir)$(IntDir)%(Filename).obj')" - Inputs="@(YASM);%(YASM.AdditionalDependencies);$(MSBuildProjectFile)"> - <ItemGroup - Condition="'@(SelectedFiles)' != ''"> - <YASM - Remove="@(YASM)" - Condition="'%(Identity)' != '@(SelectedFiles)'" /> - </ItemGroup> - <ItemGroup> - <YASM_tlog - Include="%(YASM.ObjectFile)" - Condition="'%(YASM.ObjectFile)' != '' and '%(YASM.ExcludedFromBuild)' != 'true'"> - <Source>@(YASM->'%(FullPath)', '|')</Source> - </YASM_tlog> - </ItemGroup> - <Message - Importance="High" - Text="%(YASM.ExecutionDescription)" /> - <WriteLinesToFile - Condition="'@(YASM_tlog)' != '' and '%(YASM_tlog.ExcludedFromBuild)' != 'true'" - File="$(TLogLocation)$(ProjectName).write.1.tlog" - Lines="^%(YASM_tlog.Source);@(YASM->'$(ProjectDir)$(IntDir)%(Filename).obj')" - Encoding="Unicode" /> - <YASM - Condition="'@(YASM)' != '' and '%(YASM.ExcludedFromBuild)' != 'true'" - CommandLineTemplate="%(YASM.CommandLineTemplate)" - Debug="%(YASM.Debug)" - PreIncludeFile="%(YASM.PreIncludeFile)" - IncludePaths="%(YASM.IncludePaths)" - Defines="%(YASM.Defines)" - UnDefines="%(YASM.UnDefines)" - ObjectFile="%(YASM.ObjectFile)" - ListFile="%(YASM.ListFile)" - MapFile="%(YASM.MapFile)" - ErrorFile="%(YASM.ErrorFile)" - 
SymbolPrefix="%(YASM.SymbolPrefix)" - SymbolSuffix="%(YASM.SymbolSuffix)" - PreProc="%(YASM.PreProc)" - Parser="%(YASM.Parser)" - AdditionalOptions="%(YASM.AdditionalOptions)" - Inputs="@(YASM)" /> - </Target> - <PropertyGroup> - <ComputeLinkInputsTargets> - $(ComputeLinkInputsTargets); - ComputeYASMOutput; - </ComputeLinkInputsTargets> - <ComputeLibInputsTargets> - $(ComputeLibInputsTargets); - ComputeYASMOutput; - </ComputeLibInputsTargets> - </PropertyGroup> - <Target - Name="ComputeYASMOutput" - Condition="'@(YASM)' != ''"> - <ItemGroup> - <YASMDirsToMake - Condition="'@(YASM)' != '' and '%(YASM.ExcludedFromBuild)' != 'true' and !HasTrailingSlash('%(YASM.ObjectFile)')" - Include="%(YASM.ObjectFile)" /> - <Link - Include="%(YASMDirsToMake.Identity)" - Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" /> - <Lib - Include="%(YASMDirsToMake.Identity)" - Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" /> - <ImpLib - Include="%(YASMDirsToMake.Identity)" - Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" /> - </ItemGroup> - <ItemGroup> - <YASMDirsToMake - Condition="'@(YASM)' != '' and '%(YASM.ExcludedFromBuild)' != 'true' and HasTrailingSlash('%(YASM.ObjectFile)')" - Include="@(YASM->'%(ObjectFile)%(Filename).obj')" /> - <Link - Include="%(YASMDirsToMake.Identity)" - Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" /> - <Lib - Include="%(YASMDirsToMake.Identity)" - Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" /> - <ImpLib - Include="%(YASMDirsToMake.Identity)" - Condition="'%(Extension)'=='.obj' or '%(Extension)'=='.res' or '%(Extension)'=='.rsc' or '%(Extension)'=='.lib'" /> - </ItemGroup> - <MakeDir - 
Directories="@(YASMDirsToMake->'%(RootDir)%(Directory)')" /> - </Target> -</Project> \ No newline at end of file
View file
kvazaar-2.2.0.tar.gz/build/yasm/vsyasm.xml
Deleted
@@ -1,283 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<ProjectSchemaDefinitions xmlns="clr-namespace:Microsoft.Build.Framework.XamlTypes;assembly=Microsoft.Build.Framework" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" xmlns:sys="clr-namespace:System;assembly=mscorlib" xmlns:transformCallback="Microsoft.Cpp.Dev10.ConvertPropertyCallback"> - <Rule - Name="YASM" - PageTemplate="tool" - DisplayName="Yasm Assembler" - Order="200"> - - <Rule.DataSource> - <DataSource - Persistence="ProjectFile" - ItemType="YASM" /> - </Rule.DataSource> - - <Rule.Categories> - - <Category - Name="General"> - <Category.DisplayName> - <sys:String>General</sys:String> - </Category.DisplayName> - </Category> - - <Category - Name="Symbols"> - <Category.DisplayName> - <sys:String>Symbols</sys:String> - </Category.DisplayName> - </Category> - - <Category - Name="Files"> - <Category.DisplayName> - <sys:String>Files</sys:String> - </Category.DisplayName> - </Category> - - <Category - Name="Command Line" - Subtype="CommandLine"> - <Category.DisplayName> - <sys:String>Command Line</sys:String> - </Category.DisplayName> - </Category> - - </Rule.Categories> - - <StringListProperty - Name="Inputs" - Category="Command Line" - IsRequired="true" - Switch=" "> - <StringListProperty.DataSource> - <DataSource - Persistence="ProjectFile" - ItemType="YASM" - SourceType="Item" /> - </StringListProperty.DataSource> - </StringListProperty> - - <BoolProperty - Name="Debug" - Subcategory="Configuration" - HelpContext="0" - DisplayName="Debug Information" - Description="Generate debugging information" - Switch="-g cv8" /> - - <StringListProperty - Name="IncludePaths" - Subcategory="Configuration" - HelpContext="0" - DisplayName="Include Paths" - Description="Set the paths for any additional include files" - Switch="-i "value"" /> - - <StringListProperty - Name="Defines" - Category="Symbols" - Subcategory="Pre-Defined Symbols" - HelpContext="0" - DisplayName="Defined Symbols" - Description="Specify 
pre-defined symbols ('symbol' or 'symbol = value') " - Switch="-d "value"" /> - - <StringListProperty - Name="UnDefines" - Category="Symbols" - Subcategory="Pre-Defined Symbols" - HelpContext="0" - DisplayName="Remove Symbols" - Description="Remove pre-defined symbols " - Switch="-u "value"" /> - - <StringProperty - Name="ObjectFile" - Subcategory="Output" - HelpContext="0" - DisplayName="Object File Name" - Description="Select the output file name" - Switch="-o "value"" /> - - <StringProperty - Name="ListFile" - Category="Files" - Subcategory="Output" - HelpContext="0" - DisplayName="List File Name" - Description="Select an output listing by setting its file name" - Switch="-l "value"" /> - - <StringProperty - Name="PreIncludeFile" - Category="Files" - Subcategory="Configuration" - HelpContext="0" - DisplayName="Pre Include File" - Description="Select a pre-included file by setting its name" - Switch="-P "value"" /> - - <StringProperty - Name="MapFile" - Category="Files" - Subcategory="Output" - HelpContext="0" - DisplayName="Map File Name" - Description="Select a map output by setting its file name" - Switch="--mapdir= "value"" /> - - <StringProperty - Name="ErrorFile" - Category="Files" - Subcategory="Output" - HelpContext="0" - DisplayName="Error File Name" - Description="Send error/warning messages to a file by setting its name" - Switch="-E "value"" /> - - <StringProperty - Name="SymbolPrefix" - Category="Symbols" - Subcategory="Symbols" - HelpContext="0" - DisplayName="External Symbol Prefix" - Description="Prepend symbol to all external symbols" - Switch="--prefix "value"" /> - - <StringProperty - Name="SymbolSuffix" - Category="Symbols" - Subcategory="Symbols" - HelpContext="0" - DisplayName="External Symbol Suffix" - Description="Append symbol to all external symbols" - Switch="--suffix "value"" /> - - <EnumProperty - Name="PreProc" - Subcategory="Configuration" - HelpContext="0" - DisplayName="Pre-Processor" - Description="Select the pre-processor 
('nasm' or 'raw')"> - <EnumValue - Name="0" - DisplayName="Nasm " - Switch="-rnasm" /> - <EnumValue - Name="1" - DisplayName="Raw" - Switch="-rraw" /> - </EnumProperty> - - <EnumProperty - Name="Parser" - Subcategory="Configuration" - HelpContext="0" - DisplayName="Parser" - Description="Select the parser for Intel ('nasm') or AT&T ( 'gas') syntax"> - <EnumValue - Name="0" - DisplayName="Nasm" - Switch="-pnasm" /> - <EnumValue - Name="1" - DisplayName="Gas" - Switch="-pgas" /> - </EnumProperty> - - <StringProperty - Name="CommandLineTemplate" - DisplayName="Command Line" - Visible="False" - IncludeInCommandLine="False" /> - - <DynamicEnumProperty - Name="YASMBeforeTargets" - Category="General" - EnumProvider="Targets" - IncludeInCommandLine="False"> - <DynamicEnumProperty.DisplayName> - <sys:String>Execute Before</sys:String> - </DynamicEnumProperty.DisplayName> - <DynamicEnumProperty.Description> - <sys:String>Specifies the targets for the build customization to run before.</sys:String> - </DynamicEnumProperty.Description> - <DynamicEnumProperty.ProviderSettings> - <NameValuePair - Name="Exclude" - Value="^YASMBeforeTargets|^Compute" /> - </DynamicEnumProperty.ProviderSettings> - <DynamicEnumProperty.DataSource> - <DataSource - Persistence="ProjectFile" - HasConfigurationCondition="true" /> - </DynamicEnumProperty.DataSource> - </DynamicEnumProperty> - - <DynamicEnumProperty - Name="YASMAfterTargets" - Category="General" - EnumProvider="Targets" - IncludeInCommandLine="False"> - <DynamicEnumProperty.DisplayName> - <sys:String>Execute After</sys:String> - </DynamicEnumProperty.DisplayName> - <DynamicEnumProperty.Description> - <sys:String>Specifies the targets for the build customization to run after.</sys:String> - </DynamicEnumProperty.Description> - <DynamicEnumProperty.ProviderSettings> - <NameValuePair - Name="Exclude" - Value="^YASMAfterTargets|^Compute" /> - </DynamicEnumProperty.ProviderSettings> - <DynamicEnumProperty.DataSource> - <DataSource - 
Persistence="ProjectFile" - ItemType="" - HasConfigurationCondition="true" /> - </DynamicEnumProperty.DataSource> - </DynamicEnumProperty> - - <StringListProperty - Name="Outputs" - DisplayName="Outputs" - Visible="False" - IncludeInCommandLine="False" /> - - <StringProperty - Name="ExecutionDescription" - DisplayName="Execution Description" - Visible="False" - IncludeInCommandLine="False" /> - - <StringListProperty - Name="AdditionalDependencies" - DisplayName="Additional Dependencies" - IncludeInCommandLine="False" - Visible="true" /> - - <StringProperty - Subtype="AdditionalOptions" - Name="AdditionalOptions" - Category="Command Line"> - <StringProperty.DisplayName> - <sys:String>Additional Options</sys:String> - </StringProperty.DisplayName> - <StringProperty.Description> - <sys:String>Additional Options</sys:String> - </StringProperty.Description> - </StringProperty> - </Rule> - - <ItemType - Name="YASM" - DisplayName="Yasm Assembler" /> - <FileExtension - Name="*.asm" - ContentType="YASM" /> - <ContentType - Name="YASM" - DisplayName="Yasm Assembler" - ItemType="YASM" /> -</ProjectSchemaDefinitions> \ No newline at end of file
View file
kvazaar-2.2.0.tar.gz/src/extras/x86inc.asm
Deleted
@@ -1,1456 +0,0 @@ -;***************************************************************************** -;* x86inc.asm: x264asm abstraction layer -;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project -;* -;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Anton Mitrofanov <BugMaster@narod.ru> -;* Jason Garrett-Glaser <darkshikari@gmail.com> -;* Henrik Gramner <henrik@gramner.com> -;* -;* Permission to use, copy, modify, and/or distribute this software for any -;* purpose with or without fee is hereby granted, provided that the above -;* copyright notice and this permission notice appear in all copies. -;* -;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -;***************************************************************************** - -; This is a header file for the x264ASM assembly language, which uses -; NASM/YASM syntax combined with a large number of macros to provide easy -; abstraction between different calling conventions (x86_32, win64, linux64). -; It also has various other useful features to simplify writing the kind of -; DSP functions that are most often used in x264. - -; Unlike the rest of x264, this file is available under an ISC license, as it -; has significant usefulness outside of x264 and we want it to be available -; to the largest audience possible. Of course, if you modify it for your own -; purposes to add a new feature, we strongly encourage contributing a patch -; as this feature might be useful for others as well. 
Send patches or ideas -; to x264-devel@videolan.org . - -%ifndef private_prefix - %define private_prefix kvz -%endif - -%ifndef public_prefix - %define public_prefix private_prefix -%endif - -%define WIN64 0 -%define UNIX64 0 -%if ARCH_X86_64 - %ifidn __OUTPUT_FORMAT__,win32 - %define WIN64 1 - %elifidn __OUTPUT_FORMAT__,win64 - %define WIN64 1 - %elifidn __OUTPUT_FORMAT__,x64 - %define WIN64 1 - %else - %define UNIX64 1 - %endif -%endif - -%ifdef PREFIX - %define mangle(x) _ %+ x -%else - %define mangle(x) x -%endif - -%macro SECTION_RODATA 0-1 16 - SECTION .rodata align=%1 -%endmacro - -%macro SECTION_TEXT 0-1 16 - SECTION .text align=%1 -%endmacro - -%if WIN64 - %define PIC -%elif ARCH_X86_64 == 0 -; x86_32 doesn't require PIC. -; Some distros prefer shared objects to be PIC, but nothing breaks if -; the code contains a few textrels, so we'll skip that complexity. - %undef PIC -%endif -%ifdef PIC - default rel -%endif - -%macro CPUNOP 1 - %ifdef __YASM_MAJOR__ - CPU %1 - %endif -%endmacro - -; Always use long nops (reduces 0x90 spam in disassembly on x86_32) -CPUNOP amdnop - -; Macros to eliminate most code duplication between x86_32 and x86_64: -; Currently this works only for leaf functions which load all their arguments -; into registers at the start, and make no other use of the stack. Luckily that -; covers most of x264's asm. - -; PROLOGUE: -; %1 = number of arguments. loads them from stack if needed. -; %2 = number of registers used. pushes callee-saved regs if needed. -; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. -; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x, -; MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes), -; and an extra register will be allocated to hold the original stack -; pointer (to not invalidate r0m etc.). To prevent the use of an extra -; register as stack pointer, request a negative stack size. 
-; %4+/%5+ = list of names to define to registers -; PROLOGUE can also be invoked by adding the same options to cglobal - -; e.g. -; cglobal foo, 2,3,0, dst, src, tmp -; declares a function (foo), taking two args (dst and src) and one local variable (tmp) - -; TODO Some functions can use some args directly from the stack. If they're the -; last args then you can just not declare them, but if they're in the middle -; we need more flexible macro. - -; RET: -; Pops anything that was pushed by PROLOGUE, and returns. - -; REP_RET: -; Use this instead of RET if it's a branch target. - -; registers: -; rN and rNq are the native-size register holding function argument N -; rNd, rNw, rNb are dword, word, and byte size -; rNh is the high 8 bits of the word size -; rNm is the original location of arg N (a register or on the stack), dword -; rNmp is native size - -%macro DECLARE_REG 2-3 - %define r%1q %2 - %define r%1d %2d - %define r%1w %2w - %define r%1b %2b - %define r%1h %2h - %if %0 == 2 - %define r%1m %2d - %define r%1mp %2 - %elif ARCH_X86_64 ; memory - %define r%1m rstk + stack_offset + %3 - %define r%1mp qword r %+ %1 %+ m - %else - %define r%1m rstk + stack_offset + %3 - %define r%1mp dword r %+ %1 %+ m - %endif - %define r%1 %2 -%endmacro - -%macro DECLARE_REG_SIZE 3 - %define r%1q r%1 - %define e%1q r%1 - %define r%1d e%1 - %define e%1d e%1 - %define r%1w %1 - %define e%1w %1 - %define r%1h %3 - %define e%1h %3 - %define r%1b %2 - %define e%1b %2 -%if ARCH_X86_64 == 0 - %define r%1 e%1 -%endif -%endmacro - -DECLARE_REG_SIZE ax, al, ah -DECLARE_REG_SIZE bx, bl, bh -DECLARE_REG_SIZE cx, cl, ch -DECLARE_REG_SIZE dx, dl, dh -DECLARE_REG_SIZE si, sil, null -DECLARE_REG_SIZE di, dil, null -DECLARE_REG_SIZE bp, bpl, null - -; t# defines for when per-arch register allocation is more complex than just function arguments - -%macro DECLARE_REG_TMP 1-* - %assign %%i 0 - %rep %0 - CAT_XDEFINE t, %%i, r%1 - %assign %%i %%i+1 - %rotate 1 - %endrep -%endmacro - -%macro 
DECLARE_REG_TMP_SIZE 0-* - %rep %0 - %define t%1q t%1 %+ q - %define t%1d t%1 %+ d - %define t%1w t%1 %+ w - %define t%1h t%1 %+ h - %define t%1b t%1 %+ b - %rotate 1 - %endrep -%endmacro - -DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 - -%if ARCH_X86_64 - %define gprsize 8 -%else - %define gprsize 4 -%endif - -%macro PUSH 1 - push %1 - %ifidn rstk, rsp - %assign stack_offset stack_offset+gprsize - %endif -%endmacro - -%macro POP 1 - pop %1 - %ifidn rstk, rsp - %assign stack_offset stack_offset-gprsize - %endif -%endmacro - -%macro PUSH_IF_USED 1-* - %rep %0 - %if %1 < regs_used - PUSH r%1 - %endif - %rotate 1 - %endrep -%endmacro - -%macro POP_IF_USED 1-* - %rep %0 - %if %1 < regs_used - pop r%1 - %endif - %rotate 1 - %endrep -%endmacro - -%macro LOAD_IF_USED 1-* - %rep %0 - %if %1 < num_args - mov r%1, r %+ %1 %+ mp - %endif - %rotate 1 - %endrep -%endmacro - -%macro SUB 2 - sub %1, %2 - %ifidn %1, rstk - %assign stack_offset stack_offset+(%2) - %endif -%endmacro - -%macro ADD 2 - add %1, %2 - %ifidn %1, rstk - %assign stack_offset stack_offset-(%2) - %endif -%endmacro - -%macro movifnidn 2 - %ifnidn %1, %2 - mov %1, %2 - %endif -%endmacro - -%macro movsxdifnidn 2 - %ifnidn %1, %2 - movsxd %1, %2 - %endif -%endmacro - -%macro ASSERT 1 - %if (%1) == 0 - %error assert failed - %endif -%endmacro - -%macro DEFINE_ARGS 0-* - %ifdef n_arg_names - %assign %%i 0 - %rep n_arg_names - CAT_UNDEF arg_name %+ %%i, q - CAT_UNDEF arg_name %+ %%i, d - CAT_UNDEF arg_name %+ %%i, w - CAT_UNDEF arg_name %+ %%i, h - CAT_UNDEF arg_name %+ %%i, b - CAT_UNDEF arg_name %+ %%i, m - CAT_UNDEF arg_name %+ %%i, mp - CAT_UNDEF arg_name, %%i - %assign %%i %%i+1 - %endrep - %endif - - %xdefine %%stack_offset stack_offset - %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine - %assign %%i 0 - %rep %0 - %xdefine %1q r %+ %%i %+ q - %xdefine %1d r %+ %%i %+ d - %xdefine %1w r %+ %%i %+ w - %xdefine %1h r %+ %%i %+ h - %xdefine %1b r %+ %%i %+ 
b - %xdefine %1m r %+ %%i %+ m - %xdefine %1mp r %+ %%i %+ mp - CAT_XDEFINE arg_name, %%i, %1 - %assign %%i %%i+1 - %rotate 1 - %endrep - %xdefine stack_offset %%stack_offset - %assign n_arg_names %0 -%endmacro - -%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) - %ifnum %1 - %if %1 != 0 - %assign %%stack_alignment ((mmsize + 15) & ~15) - %assign stack_size %1 - %if stack_size < 0 - %assign stack_size -stack_size - %endif - %assign stack_size_padded stack_size - %if WIN64 - %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space - %if mmsize != 8 - %assign xmm_regs_used %2 - %if xmm_regs_used > 8 - %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16 - %endif - %endif - %endif - %if mmsize <= 16 && HAVE_ALIGNED_STACK - %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) - SUB rsp, stack_size_padded - %else - %assign %%reg_num (regs_used - 1) - %xdefine rstk r %+ %%reg_num - ; align stack, and save original stack location directly above - ; it, i.e. in rsp+stack_size_padded, so we can restore the - ; stack in a single instruction (i.e. 
mov rsp, rstk or mov - ; rsp, rsp+stack_size_padded) - mov rstk, rsp - %if %1 < 0 ; need to store rsp on stack - sub rsp, gprsize+stack_size_padded - and rsp, ~(%%stack_alignment-1) - %xdefine rstkm rsp+stack_size_padded - mov rstkm, rstk - %else ; can keep rsp in rstk during whole function - sub rsp, stack_size_padded - and rsp, ~(%%stack_alignment-1) - %xdefine rstkm rstk - %endif - %endif - WIN64_PUSH_XMM - %endif - %endif -%endmacro - -%macro SETUP_STACK_POINTER 1 - %ifnum %1 - %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32) - %if %1 > 0 - %assign regs_used (regs_used + 1) - %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 - %warning "Stack pointer will overwrite register argument" - %endif - %endif - %endif -%endmacro - -%macro DEFINE_ARGS_INTERNAL 3+ - %ifnum %2 - DEFINE_ARGS %3 - %elif %1 == 4 - DEFINE_ARGS %2 - %elif %1 > 4 - DEFINE_ARGS %2, %3 - %endif -%endmacro - -%if WIN64 ; Windows x64 ;================================================= - -DECLARE_REG 0, rcx -DECLARE_REG 1, rdx -DECLARE_REG 2, R8 -DECLARE_REG 3, R9 -DECLARE_REG 4, R10, 40 -DECLARE_REG 5, R11, 48 -DECLARE_REG 6, rax, 56 -DECLARE_REG 7, rdi, 64 -DECLARE_REG 8, rsi, 72 -DECLARE_REG 9, rbx, 80 -DECLARE_REG 10, rbp, 88 -DECLARE_REG 11, R12, 96 -DECLARE_REG 12, R13, 104 -DECLARE_REG 13, R14, 112 -DECLARE_REG 14, R15, 120 - -%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, stack_size, arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 15 - PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 - ALLOC_STACK %4, %3 - %if mmsize != 8 && stack_size == 0 - WIN64_SPILL_XMM %3 - %endif - LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%macro WIN64_PUSH_XMM 0 - ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. 
- %if xmm_regs_used > 6 - movaps rstk + stack_offset + 8, xmm6 - %endif - %if xmm_regs_used > 7 - movaps rstk + stack_offset + 24, xmm7 - %endif - %if xmm_regs_used > 8 - %assign %%i 8 - %rep xmm_regs_used-8 - movaps rsp + (%%i-8)*16 + stack_size + 32, xmm %+ %%i - %assign %%i %%i+1 - %endrep - %endif -%endmacro - -%macro WIN64_SPILL_XMM 1 - %assign xmm_regs_used %1 - ASSERT xmm_regs_used <= 16 - %if xmm_regs_used > 8 - %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32 - SUB rsp, stack_size_padded - %endif - WIN64_PUSH_XMM -%endmacro - -%macro WIN64_RESTORE_XMM_INTERNAL 1 - %assign %%pad_size 0 - %if xmm_regs_used > 8 - %assign %%i xmm_regs_used - %rep xmm_regs_used-8 - %assign %%i %%i-1 - movaps xmm %+ %%i, %1 + (%%i-8)*16 + stack_size + 32 - %endrep - %endif - %if stack_size_padded > 0 - %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0) - mov rsp, rstkm - %else - add %1, stack_size_padded - %assign %%pad_size stack_size_padded - %endif - %endif - %if xmm_regs_used > 7 - movaps xmm7, %1 + stack_offset - %%pad_size + 24 - %endif - %if xmm_regs_used > 6 - movaps xmm6, %1 + stack_offset - %%pad_size + 8 - %endif -%endmacro - -%macro WIN64_RESTORE_XMM 1 - WIN64_RESTORE_XMM_INTERNAL %1 - %assign stack_offset (stack_offset-stack_size_padded) - %assign xmm_regs_used 0 -%endmacro - -%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 - -%macro RET 0 - WIN64_RESTORE_XMM_INTERNAL rsp - POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%elif ARCH_X86_64 ; *nix x64 ;============================================= - -DECLARE_REG 0, rdi -DECLARE_REG 1, rsi -DECLARE_REG 2, rdx -DECLARE_REG 3, rcx -DECLARE_REG 4, R8 -DECLARE_REG 5, R9 -DECLARE_REG 6, rax, 8 -DECLARE_REG 7, R10, 16 -DECLARE_REG 8, R11, 24 -DECLARE_REG 9, rbx, 32 -DECLARE_REG 10, rbp, 40 -DECLARE_REG 11, R12, 48 -DECLARE_REG 12, R13, 56 -DECLARE_REG 13, R14, 64 -DECLARE_REG 14, R15, 72 - 
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, stack_size, arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 15 - PUSH_IF_USED 9, 10, 11, 12, 13, 14 - ALLOC_STACK %4 - LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 - -%macro RET 0 -%if stack_size_padded > 0 -%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 - mov rsp, rstkm -%else - add rsp, stack_size_padded -%endif -%endif - POP_IF_USED 14, 13, 12, 11, 10, 9 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%else ; X86_32 ;============================================================== - -DECLARE_REG 0, eax, 4 -DECLARE_REG 1, ecx, 8 -DECLARE_REG 2, edx, 12 -DECLARE_REG 3, ebx, 16 -DECLARE_REG 4, esi, 20 -DECLARE_REG 5, edi, 24 -DECLARE_REG 6, ebp, 28 -%define rsp esp - -%macro DECLARE_ARG 1-* - %rep %0 - %define r%1m rstk + stack_offset + 4*%1 + 4 - %define r%1mp dword r%1m - %rotate 1 - %endrep -%endmacro - -DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 - -%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, stack_size, arg_names... 
- %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - %if num_args > 7 - %assign num_args 7 - %endif - %if regs_used > 7 - %assign regs_used 7 - %endif - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 7 - PUSH_IF_USED 3, 4, 5, 6 - ALLOC_STACK %4 - LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 - -%macro RET 0 -%if stack_size_padded > 0 -%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 - mov rsp, rstkm -%else - add rsp, stack_size_padded -%endif -%endif - POP_IF_USED 6, 5, 4, 3 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%endif ;====================================================================== - -%if WIN64 == 0 -%macro WIN64_SPILL_XMM 1 -%endmacro -%macro WIN64_RESTORE_XMM 1 -%endmacro -%macro WIN64_PUSH_XMM 0 -%endmacro -%endif - -; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either -; a branch or a branch target. So switch to a 2-byte form of ret in that case. -; We can automatically detect "follows a branch", but not a branch target. -; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) -%macro REP_RET 0 - %if has_epilogue - RET - %else - rep ret - %endif -%endmacro - -%define last_branch_adr $$ -%macro AUTO_REP_RET 0 - %ifndef cpuflags - times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr. 
- %elif notcpuflag(ssse3) - times ((last_branch_adr-$)>>31)+1 rep - %endif - ret -%endmacro - -%macro BRANCH_INSTR 0-* - %rep %0 - %macro %1 1-2 %1 - %2 %1 - %%branch_instr: - %xdefine last_branch_adr %%branch_instr - %endmacro - %rotate 1 - %endrep -%endmacro - -BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp - -%macro TAIL_CALL 2 ; callee, is_nonadjacent - %if has_epilogue - call %1 - RET - %elif %2 - jmp %1 - %endif -%endmacro - -;============================================================================= -; arch-independent part -;============================================================================= - -%assign function_align 16 - -; Begin a function. -; Applies any symbol mangling needed for C linkage, and sets up a define such that -; subsequent uses of the function name automatically refer to the mangled version. -; Appends cpuflags to the function name if cpuflags has been specified. -; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX -; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
-%macro cglobal 1-2+ "" ; name, PROLOGUE args - cglobal_internal 1, %1 %+ SUFFIX, %2 -%endmacro -%macro cvisible 1-2+ "" ; name, PROLOGUE args - cglobal_internal 0, %1 %+ SUFFIX, %2 -%endmacro -%macro cglobal_internal 2-3+ - %if %1 - %xdefine %%FUNCTION_PREFIX private_prefix - %xdefine %%VISIBILITY hidden - %else - %xdefine %%FUNCTION_PREFIX public_prefix - %xdefine %%VISIBILITY - %endif - %ifndef cglobaled_%2 - %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) - %xdefine %2.skip_prologue %2 %+ .skip_prologue - CAT_XDEFINE cglobaled_, %2, 1 - %endif - %xdefine current_function %2 - %ifidn __OUTPUT_FORMAT__,elf - global %2:function %%VISIBILITY - %else - global %2 - %endif - align function_align - %2: - RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer - %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required - %assign stack_offset 0 ; stack pointer offset relative to the return address - %assign stack_size 0 ; amount of stack space that can be freely used inside a function - %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding - %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 - %ifnidn %3, "" - PROLOGUE %3 - %endif -%endmacro - -%macro cextern 1 - %xdefine %1 mangle(private_prefix %+ _ %+ %1) - CAT_XDEFINE cglobaled_, %1, 1 - extern %1 -%endmacro - -; like cextern, but without the prefix -%macro cextern_naked 1 - %xdefine %1 mangle(%1) - CAT_XDEFINE cglobaled_, %1, 1 - extern %1 -%endmacro - -%macro const 1-2+ - %xdefine %1 mangle(private_prefix %+ _ %+ %1) - %ifidn __OUTPUT_FORMAT__,elf - global %1:data hidden - %else - global %1 - %endif - %1: %2 -%endmacro - -; This is needed for ELF, otherwise the GNU linker assumes the stack is -; executable by default. 
-%ifidn __OUTPUT_FORMAT__,elf -SECTION .note.GNU-stack noalloc noexec nowrite progbits -%endif - -; cpuflags - -%assign cpuflags_mmx (1<<0) -%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx -%assign cpuflags_3dnow (1<<2) | cpuflags_mmx -%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow -%assign cpuflags_sse (1<<4) | cpuflags_mmx2 -%assign cpuflags_sse2 (1<<5) | cpuflags_sse -%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 -%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 -%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 -%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 -%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 -%assign cpuflags_avx (1<<11)| cpuflags_sse42 -%assign cpuflags_xop (1<<12)| cpuflags_avx -%assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_avx2 (1<<14)| cpuflags_avx -%assign cpuflags_fma3 (1<<15)| cpuflags_avx - -%assign cpuflags_cache32 (1<<16) -%assign cpuflags_cache64 (1<<17) -%assign cpuflags_slowctz (1<<18) -%assign cpuflags_lzcnt (1<<19) -%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<21) -%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt -%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 - -%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) -%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) - -; Takes up to 2 cpuflags from the above list. -; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. -; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. 
-%macro INIT_CPUFLAGS 0-2 - CPUNOP amdnop - %if %0 >= 1 - %xdefine cpuname %1 - %assign cpuflags cpuflags_%1 - %if %0 >= 2 - %xdefine cpuname %1_%2 - %assign cpuflags cpuflags | cpuflags_%2 - %endif - %xdefine SUFFIX _ %+ cpuname - %if cpuflag(avx) - %assign avx_enabled 1 - %endif - %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) - %define mova movaps - %define movu movups - %define movnta movntps - %endif - %if cpuflag(aligned) - %define movu mova - %elifidn %1, sse3 - %define movu lddqu - %endif - %if ARCH_X86_64 == 0 && notcpuflag(sse2) - CPUNOP basicnop - %endif - %else - %xdefine SUFFIX - %undef cpuname - %undef cpuflags - %endif -%endmacro - -; Merge mmx and sse* -; m# is a simd register of the currently selected size -; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# -; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# -; (All 3 remain in sync through SWAP.) - -%macro CAT_XDEFINE 3 - %xdefine %1%2 %3 -%endmacro - -%macro CAT_UNDEF 2 - %undef %1%2 -%endmacro - -%macro INIT_MMX 0-1+ - %assign avx_enabled 0 - %define RESET_MM_PERMUTATION INIT_MMX %1 - %define mmsize 8 - %define num_mmregs 8 - %define mova movq - %define movu movq - %define movh movd - %define movnta movntq - %assign %%i 0 - %rep 8 - CAT_XDEFINE m, %%i, mm %+ %%i - CAT_XDEFINE nmm, %%i, %%i - %assign %%i %%i+1 - %endrep - %rep 8 - CAT_UNDEF m, %%i - CAT_UNDEF nmm, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -%macro INIT_XMM 0-1+ - %assign avx_enabled 0 - %define RESET_MM_PERMUTATION INIT_XMM %1 - %define mmsize 16 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif - %define mova movdqa - %define movu movdqu - %define movh movq - %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, xmm %+ %%i - CAT_XDEFINE nxmm, %%i, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -%macro INIT_YMM 0-1+ - %assign avx_enabled 1 - %define 
RESET_MM_PERMUTATION INIT_YMM %1 - %define mmsize 32 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif - %define mova movdqa - %define movu movdqu - %undef movh - %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, ymm %+ %%i - CAT_XDEFINE nymm, %%i, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -INIT_XMM - -%macro DECLARE_MMCAST 1 - %define mmmm%1 mm%1 - %define mmxmm%1 mm%1 - %define mmymm%1 mm%1 - %define xmmmm%1 mm%1 - %define xmmxmm%1 xmm%1 - %define xmmymm%1 xmm%1 - %define ymmmm%1 mm%1 - %define ymmxmm%1 xmm%1 - %define ymmymm%1 ymm%1 - %define xm%1 xmm %+ m%1 - %define ym%1 ymm %+ m%1 -%endmacro - -%assign i 0 -%rep 16 - DECLARE_MMCAST i -%assign i i+1 -%endrep - -; I often want to use macros that permute their arguments. e.g. there's no -; efficient way to implement butterfly or transpose or dct without swapping some -; arguments. -; -; I would like to not have to manually keep track of the permutations: -; If I insert a permutation in the middle of a function, it should automatically -; change everything that follows. For more complex macros I may also have multiple -; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. -; -; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that -; permutes its arguments. It's equivalent to exchanging the contents of the -; registers, except that this way you exchange the register names instead, so it -; doesn't cost any cycles. - -%macro PERMUTE 2-* ; takes a list of pairs to swap -%rep %0/2 - %xdefine %%tmp%2 m%2 - %rotate 2 -%endrep -%rep %0/2 - %xdefine m%1 %%tmp%2 - CAT_XDEFINE n, m%1, %1 - %rotate 2 -%endrep -%endmacro - -%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) -%ifnum %1 ; SWAP 0, 1, ... - SWAP_INTERNAL_NUM %1, %2 -%else ; SWAP m0, m1, ... 
- SWAP_INTERNAL_NAME %1, %2 -%endif -%endmacro - -%macro SWAP_INTERNAL_NUM 2-* - %rep %0-1 - %xdefine %%tmp m%1 - %xdefine m%1 m%2 - %xdefine m%2 %%tmp - CAT_XDEFINE n, m%1, %1 - CAT_XDEFINE n, m%2, %2 - %rotate 1 - %endrep -%endmacro - -%macro SWAP_INTERNAL_NAME 2-* - %xdefine %%args n %+ %1 - %rep %0-1 - %xdefine %%args %%args, n %+ %2 - %rotate 1 - %endrep - SWAP_INTERNAL_NUM %%args -%endmacro - -; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later -; calls to that function will automatically load the permutation, so values can -; be returned in mmregs. -%macro SAVE_MM_PERMUTATION 0-1 - %if %0 - %xdefine %%f %1_m - %else - %xdefine %%f current_function %+ _m - %endif - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE %%f, %%i, m %+ %%i - %assign %%i %%i+1 - %endrep -%endmacro - -%macro LOAD_MM_PERMUTATION 1 ; name to load from - %ifdef %1_m0 - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, %1_m %+ %%i - CAT_XDEFINE n, m %+ %%i, %%i - %assign %%i %%i+1 - %endrep - %endif -%endmacro - -; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't -%macro call 1 - call_internal %1, %1 %+ SUFFIX -%endmacro -%macro call_internal 2 - %xdefine %%i %1 - %ifndef cglobaled_%1 - %ifdef cglobaled_%2 - %xdefine %%i %2 - %endif - %endif - call %%i - LOAD_MM_PERMUTATION %%i -%endmacro - -; Substitutions that reduce instruction size but are functionally equivalent -%macro add 2 - %ifnum %2 - %if %2==128 - sub %1, -128 - %else - add %1, %2 - %endif - %else - add %1, %2 - %endif -%endmacro - -%macro sub 2 - %ifnum %2 - %if %2==128 - add %1, -128 - %else - sub %1, %2 - %endif - %else - sub %1, %2 - %endif -%endmacro - -;============================================================================= -; AVX abstraction layer -;============================================================================= - -%assign i 0 -%rep 16 - %if i < 8 - CAT_XDEFINE sizeofmm, i, 8 - %endif - CAT_XDEFINE sizeofxmm, i, 16 - CAT_XDEFINE 
sizeofymm, i, 32 -%assign i i+1 -%endrep -%undef i - -%macro CHECK_AVX_INSTR_EMU 3-* - %xdefine %%opcode %1 - %xdefine %%dst %2 - %rep %0-2 - %ifidn %%dst, %3 - %error non-avx emulation of ``%%opcode'' is not supported - %endif - %rotate 1 - %endrep -%endmacro - -;%1 == instruction -;%2 == 1 if float, 0 if int -;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise -;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not -;%5+: operands -%macro RUN_AVX_INSTR 5-8+ - %ifnum sizeof%6 - %assign %%sizeofreg sizeof%6 - %elifnum sizeof%5 - %assign %%sizeofreg sizeof%5 - %else - %assign %%sizeofreg mmsize - %endif - %assign %%emulate_avx 0 - %if avx_enabled && %%sizeofreg >= 16 - %xdefine %%instr v%1 - %else - %xdefine %%instr %1 - %if %0 >= 7+%3 - %assign %%emulate_avx 1 - %endif - %endif - - %if %%emulate_avx - %xdefine %%src1 %6 - %xdefine %%src2 %7 - %ifnidn %5, %6 - %if %0 >= 8 - CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8 - %else - CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7 - %endif - %if %4 && %3 == 0 - %ifnid %7 - ; 3-operand AVX instructions with a memory arg can only have it in src2, - ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). - ; So, if the instruction is commutative with a memory arg, swap them. - %xdefine %%src1 %7 - %xdefine %%src2 %6 - %endif - %endif - %if %%sizeofreg == 8 - MOVQ %5, %%src1 - %elif %2 - MOVAPS %5, %%src1 - %else - MOVDQA %5, %%src1 - %endif - %endif - %if %0 >= 8 - %1 %5, %%src2, %8 - %else - %1 %5, %%src2 - %endif - %elif %0 >= 8 - %%instr %5, %6, %7, %8 - %elif %0 == 7 - %%instr %5, %6, %7 - %elif %0 == 6 - %%instr %5, %6 - %else - %%instr %5 - %endif -%endmacro - -;%1 == instruction -;%2 == 1 if float, 0 if int -;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise -;%4 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not -%macro AVX_INSTR 1-4 0, 1, 0 - %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4 - %ifidn %2, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1 - %elifidn %3, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2 - %elifidn %4, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3 - %elifidn %5, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4 - %else - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5 - %endif - %endmacro -%endmacro - -; Instructions with both VEX and non-VEX encodings -; Non-destructive instructions are written without parameters -AVX_INSTR addpd, 1, 0, 1 -AVX_INSTR addps, 1, 0, 1 -AVX_INSTR addsd, 1, 0, 1 -AVX_INSTR addss, 1, 0, 1 -AVX_INSTR addsubpd, 1, 0, 0 -AVX_INSTR addsubps, 1, 0, 0 -AVX_INSTR aesdec, 0, 0, 0 -AVX_INSTR aesdeclast, 0, 0, 0 -AVX_INSTR aesenc, 0, 0, 0 -AVX_INSTR aesenclast, 0, 0, 0 -AVX_INSTR aesimc -AVX_INSTR aeskeygenassist -AVX_INSTR andnpd, 1, 0, 0 -AVX_INSTR andnps, 1, 0, 0 -AVX_INSTR andpd, 1, 0, 1 -AVX_INSTR andps, 1, 0, 1 -AVX_INSTR blendpd, 1, 0, 0 -AVX_INSTR blendps, 1, 0, 0 -AVX_INSTR blendvpd, 1, 0, 0 -AVX_INSTR blendvps, 1, 0, 0 -AVX_INSTR cmppd, 1, 1, 0 -AVX_INSTR cmpps, 1, 1, 0 -AVX_INSTR cmpsd, 1, 1, 0 -AVX_INSTR cmpss, 1, 1, 0 -AVX_INSTR comisd -AVX_INSTR comiss -AVX_INSTR cvtdq2pd -AVX_INSTR cvtdq2ps -AVX_INSTR cvtpd2dq -AVX_INSTR cvtpd2ps -AVX_INSTR cvtps2dq -AVX_INSTR cvtps2pd -AVX_INSTR cvtsd2si -AVX_INSTR cvtsd2ss -AVX_INSTR cvtsi2sd -AVX_INSTR cvtsi2ss -AVX_INSTR cvtss2sd -AVX_INSTR cvtss2si -AVX_INSTR cvttpd2dq -AVX_INSTR cvttps2dq -AVX_INSTR cvttsd2si -AVX_INSTR cvttss2si -AVX_INSTR divpd, 1, 0, 0 -AVX_INSTR divps, 1, 0, 0 -AVX_INSTR divsd, 1, 0, 0 -AVX_INSTR divss, 1, 0, 0 -AVX_INSTR dppd, 1, 1, 0 -AVX_INSTR dpps, 1, 1, 0 -AVX_INSTR extractps -AVX_INSTR haddpd, 1, 0, 0 -AVX_INSTR haddps, 1, 0, 0 -AVX_INSTR hsubpd, 1, 0, 0 -AVX_INSTR hsubps, 1, 0, 0 -AVX_INSTR insertps, 1, 1, 0 -AVX_INSTR lddqu -AVX_INSTR ldmxcsr -AVX_INSTR maskmovdqu -AVX_INSTR maxpd, 1, 0, 1 
-AVX_INSTR maxps, 1, 0, 1 -AVX_INSTR maxsd, 1, 0, 1 -AVX_INSTR maxss, 1, 0, 1 -AVX_INSTR minpd, 1, 0, 1 -AVX_INSTR minps, 1, 0, 1 -AVX_INSTR minsd, 1, 0, 1 -AVX_INSTR minss, 1, 0, 1 -AVX_INSTR movapd -AVX_INSTR movaps -AVX_INSTR movd -AVX_INSTR movddup -AVX_INSTR movdqa -AVX_INSTR movdqu -AVX_INSTR movhlps, 1, 0, 0 -AVX_INSTR movhpd, 1, 0, 0 -AVX_INSTR movhps, 1, 0, 0 -AVX_INSTR movlhps, 1, 0, 0 -AVX_INSTR movlpd, 1, 0, 0 -AVX_INSTR movlps, 1, 0, 0 -AVX_INSTR movmskpd -AVX_INSTR movmskps -AVX_INSTR movntdq -AVX_INSTR movntdqa -AVX_INSTR movntpd -AVX_INSTR movntps -AVX_INSTR movq -AVX_INSTR movsd, 1, 0, 0 -AVX_INSTR movshdup -AVX_INSTR movsldup -AVX_INSTR movss, 1, 0, 0 -AVX_INSTR movupd -AVX_INSTR movups -AVX_INSTR mpsadbw, 0, 1, 0 -AVX_INSTR mulpd, 1, 0, 1 -AVX_INSTR mulps, 1, 0, 1 -AVX_INSTR mulsd, 1, 0, 1 -AVX_INSTR mulss, 1, 0, 1 -AVX_INSTR orpd, 1, 0, 1 -AVX_INSTR orps, 1, 0, 1 -AVX_INSTR pabsb -AVX_INSTR pabsd -AVX_INSTR pabsw -AVX_INSTR packsswb, 0, 0, 0 -AVX_INSTR packssdw, 0, 0, 0 -AVX_INSTR packuswb, 0, 0, 0 -AVX_INSTR packusdw, 0, 0, 0 -AVX_INSTR paddb, 0, 0, 1 -AVX_INSTR paddw, 0, 0, 1 -AVX_INSTR paddd, 0, 0, 1 -AVX_INSTR paddq, 0, 0, 1 -AVX_INSTR paddsb, 0, 0, 1 -AVX_INSTR paddsw, 0, 0, 1 -AVX_INSTR paddusb, 0, 0, 1 -AVX_INSTR paddusw, 0, 0, 1 -AVX_INSTR palignr, 0, 1, 0 -AVX_INSTR pand, 0, 0, 1 -AVX_INSTR pandn, 0, 0, 0 -AVX_INSTR pavgb, 0, 0, 1 -AVX_INSTR pavgw, 0, 0, 1 -AVX_INSTR pblendvb, 0, 0, 0 -AVX_INSTR pblendw, 0, 1, 0 -AVX_INSTR pclmulqdq, 0, 1, 0 -AVX_INSTR pcmpestri -AVX_INSTR pcmpestrm -AVX_INSTR pcmpistri -AVX_INSTR pcmpistrm -AVX_INSTR pcmpeqb, 0, 0, 1 -AVX_INSTR pcmpeqw, 0, 0, 1 -AVX_INSTR pcmpeqd, 0, 0, 1 -AVX_INSTR pcmpeqq, 0, 0, 1 -AVX_INSTR pcmpgtb, 0, 0, 0 -AVX_INSTR pcmpgtw, 0, 0, 0 -AVX_INSTR pcmpgtd, 0, 0, 0 -AVX_INSTR pcmpgtq, 0, 0, 0 -AVX_INSTR pextrb -AVX_INSTR pextrd -AVX_INSTR pextrq -AVX_INSTR pextrw -AVX_INSTR phaddw, 0, 0, 0 -AVX_INSTR phaddd, 0, 0, 0 -AVX_INSTR phaddsw, 0, 0, 0 -AVX_INSTR phminposuw -AVX_INSTR phsubw, 
0, 0, 0 -AVX_INSTR phsubd, 0, 0, 0 -AVX_INSTR phsubsw, 0, 0, 0 -AVX_INSTR pinsrb, 0, 1, 0 -AVX_INSTR pinsrd, 0, 1, 0 -AVX_INSTR pinsrq, 0, 1, 0 -AVX_INSTR pinsrw, 0, 1, 0 -AVX_INSTR pmaddwd, 0, 0, 1 -AVX_INSTR pmaddubsw, 0, 0, 0 -AVX_INSTR pmaxsb, 0, 0, 1 -AVX_INSTR pmaxsw, 0, 0, 1 -AVX_INSTR pmaxsd, 0, 0, 1 -AVX_INSTR pmaxub, 0, 0, 1 -AVX_INSTR pmaxuw, 0, 0, 1 -AVX_INSTR pmaxud, 0, 0, 1 -AVX_INSTR pminsb, 0, 0, 1 -AVX_INSTR pminsw, 0, 0, 1 -AVX_INSTR pminsd, 0, 0, 1 -AVX_INSTR pminub, 0, 0, 1 -AVX_INSTR pminuw, 0, 0, 1 -AVX_INSTR pminud, 0, 0, 1 -AVX_INSTR pmovmskb -AVX_INSTR pmovsxbw -AVX_INSTR pmovsxbd -AVX_INSTR pmovsxbq -AVX_INSTR pmovsxwd -AVX_INSTR pmovsxwq -AVX_INSTR pmovsxdq -AVX_INSTR pmovzxbw -AVX_INSTR pmovzxbd -AVX_INSTR pmovzxbq -AVX_INSTR pmovzxwd -AVX_INSTR pmovzxwq -AVX_INSTR pmovzxdq -AVX_INSTR pmuldq, 0, 0, 1 -AVX_INSTR pmulhrsw, 0, 0, 1 -AVX_INSTR pmulhuw, 0, 0, 1 -AVX_INSTR pmulhw, 0, 0, 1 -AVX_INSTR pmullw, 0, 0, 1 -AVX_INSTR pmulld, 0, 0, 1 -AVX_INSTR pmuludq, 0, 0, 1 -AVX_INSTR por, 0, 0, 1 -AVX_INSTR psadbw, 0, 0, 1 -AVX_INSTR pshufb, 0, 0, 0 -AVX_INSTR pshufd -AVX_INSTR pshufhw -AVX_INSTR pshuflw -AVX_INSTR psignb, 0, 0, 0 -AVX_INSTR psignw, 0, 0, 0 -AVX_INSTR psignd, 0, 0, 0 -AVX_INSTR psllw, 0, 0, 0 -AVX_INSTR pslld, 0, 0, 0 -AVX_INSTR psllq, 0, 0, 0 -AVX_INSTR pslldq, 0, 0, 0 -AVX_INSTR psraw, 0, 0, 0 -AVX_INSTR psrad, 0, 0, 0 -AVX_INSTR psrlw, 0, 0, 0 -AVX_INSTR psrld, 0, 0, 0 -AVX_INSTR psrlq, 0, 0, 0 -AVX_INSTR psrldq, 0, 0, 0 -AVX_INSTR psubb, 0, 0, 0 -AVX_INSTR psubw, 0, 0, 0 -AVX_INSTR psubd, 0, 0, 0 -AVX_INSTR psubq, 0, 0, 0 -AVX_INSTR psubsb, 0, 0, 0 -AVX_INSTR psubsw, 0, 0, 0 -AVX_INSTR psubusb, 0, 0, 0 -AVX_INSTR psubusw, 0, 0, 0 -AVX_INSTR ptest -AVX_INSTR punpckhbw, 0, 0, 0 -AVX_INSTR punpckhwd, 0, 0, 0 -AVX_INSTR punpckhdq, 0, 0, 0 -AVX_INSTR punpckhqdq, 0, 0, 0 -AVX_INSTR punpcklbw, 0, 0, 0 -AVX_INSTR punpcklwd, 0, 0, 0 -AVX_INSTR punpckldq, 0, 0, 0 -AVX_INSTR punpcklqdq, 0, 0, 0 -AVX_INSTR pxor, 0, 0, 1 -AVX_INSTR rcpps, 
1, 0, 0 -AVX_INSTR rcpss, 1, 0, 0 -AVX_INSTR roundpd -AVX_INSTR roundps -AVX_INSTR roundsd -AVX_INSTR roundss -AVX_INSTR rsqrtps, 1, 0, 0 -AVX_INSTR rsqrtss, 1, 0, 0 -AVX_INSTR shufpd, 1, 1, 0 -AVX_INSTR shufps, 1, 1, 0 -AVX_INSTR sqrtpd, 1, 0, 0 -AVX_INSTR sqrtps, 1, 0, 0 -AVX_INSTR sqrtsd, 1, 0, 0 -AVX_INSTR sqrtss, 1, 0, 0 -AVX_INSTR stmxcsr -AVX_INSTR subpd, 1, 0, 0 -AVX_INSTR subps, 1, 0, 0 -AVX_INSTR subsd, 1, 0, 0 -AVX_INSTR subss, 1, 0, 0 -AVX_INSTR ucomisd -AVX_INSTR ucomiss -AVX_INSTR unpckhpd, 1, 0, 0 -AVX_INSTR unpckhps, 1, 0, 0 -AVX_INSTR unpcklpd, 1, 0, 0 -AVX_INSTR unpcklps, 1, 0, 0 -AVX_INSTR xorpd, 1, 0, 1 -AVX_INSTR xorps, 1, 0, 1 - -; 3DNow instructions, for sharing code between AVX, SSE and 3DN -AVX_INSTR pfadd, 1, 0, 1 -AVX_INSTR pfsub, 1, 0, 0 -AVX_INSTR pfmul, 1, 0, 1 - -; base-4 constants for shuffles -%assign i 0 -%rep 256 - %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) - %if j < 10 - CAT_XDEFINE q000, j, i - %elif j < 100 - CAT_XDEFINE q00, j, i - %elif j < 1000 - CAT_XDEFINE q0, j, i - %else - CAT_XDEFINE q, j, i - %endif -%assign i i+1 -%endrep -%undef i -%undef j - -%macro FMA_INSTR 3 - %macro %1 4-7 %1, %2, %3 - %if cpuflag(xop) - v%5 %1, %2, %3, %4 - %else - %6 %1, %2, %3 - %7 %1, %4 - %endif - %endmacro -%endmacro - -FMA_INSTR pmacsdd, pmulld, paddd -FMA_INSTR pmacsww, pmullw, paddw -FMA_INSTR pmadcswd, pmaddwd, paddd - -; convert FMA4 to FMA3 if possible -%macro FMA4_INSTR 4 - %macro %1 4-8 %1, %2, %3, %4 - %if cpuflag(fma4) - v%5 %1, %2, %3, %4 - %elifidn %1, %2 - v%6 %1, %4, %3 ; %1 = %1 * %3 + %4 - %elifidn %1, %3 - v%7 %1, %2, %4 ; %1 = %2 * %1 + %4 - %elifidn %1, %4 - v%8 %1, %2, %3 ; %1 = %2 * %3 + %1 - %else - %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported - %endif - %endmacro -%endmacro - -FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd -FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps -FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd -FMA4_INSTR fmaddss, fmadd132ss, 
fmadd213ss, fmadd231ss - -FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd -FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps -FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd -FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps - -FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd -FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps -FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd -FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss - -FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd -FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps -FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd -FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss - -FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd -FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps -FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd -FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss - -; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug -%if ARCH_X86_64 == 0 -%macro vpbroadcastq 2 -%if sizeof%1 == 16 - movddup %1, %2 -%else - vbroadcastsd %1, %2 -%endif -%endmacro -%endif
View file
kvazaar-2.2.0.tar.gz/src/strategies/x86_asm
Deleted
-(directory)
View file
kvazaar-2.2.0.tar.gz/src/strategies/x86_asm/picture-x86-asm-sad.asm
Deleted
@@ -1,385 +0,0 @@ -;/***************************************************************************** -; * This file is part of Kvazaar HEVC encoder. -; * -; * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors -; * All rights reserved. -; * -; * Redistribution and use in source and binary forms, with or without modification, -; * are permitted provided that the following conditions are met: -; * -; * * Redistributions of source code must retain the above copyright notice, this -; * list of conditions and the following disclaimer. -; * -; * * Redistributions in binary form must reproduce the above copyright notice, this -; * list of conditions and the following disclaimer in the documentation and/or -; * other materials provided with the distribution. -; * -; * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its -; * contributors may be used to endorse or promote products derived from -; * this software without specific prior written permission. -; * -; * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -; * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -; * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-; * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; ****************************************************************************/ - -%include "x86inc.asm" - -;cglobal and RET macros are from the x86.inc -;they push and pop the necessary registers to -;stack depending on the operating system - -;Usage: cglobal name, %1, %2, %3 -;1%: Number of arguments -;2%: Number of registers used -;3%: Number of xmm registers used. -;More info in x86inc.asm - -SECTION .text - -;Set x86inc.asm macros to use avx and xmm registers -INIT_XMM avx - -;KVZ_SAD_4X4 -;Calculates SAD of the 16 consequtive bytes in memory -;r0 address of the first value(current frame) -;r1 address of the first value(reference frame) - -cglobal sad_4x4, 2, 2, 2 - - ;Load 16 bytes of both frames - vmovdqu m0, r0 - vmovdqu m1, r1 - - ;Calculate SAD. The results are written in - ;m015:0 and m079:64 - vpsadbw m0, m1 - - ;Sum the results - vmovhlps m1, m0 - vpaddw m0, m1 - - ;Write the result to eax - vmovd eax, m0 - - RET - - -;KVZ_SAD_4X4_STRIDE -;Calculates SAD of a 4x4 block inside a frame with stride -;r0 address of the first value(current) -;r1 address of the first value(reference) -;r2 stride - -cglobal sad_4x4_stride, 3, 3, 2 - - ;Load 4 times 4 bytes of both frames - vpinsrd m0, r0, 0 - add r0, r2 - vpinsrd m0, r0, 1 - vpinsrd m0, r0+r2, 2 - vpinsrd m0, r0+r2*2, 3 - - vpinsrd m1, r1, 0 - add r1, r2 - vpinsrd m1, r1, 1 - vpinsrd m1, r1+r2, 2 - vpinsrd m1, r1+r2*2, 3 - - vpsadbw m0, m1 - - vmovhlps m1, m0 - vpaddw m0, m1 - - vmovd eax, m0 - - RET - - -;KVZ_SAD_8X8 -;Calculates SAD of the 64 consequtive bytes in memory -;r0 address of the first value(current) -;r1 address of the first value(reference) - -cglobal sad_8x8, 2, 2, 5 - - ;Load the first half of both frames - vmovdqu m0, r0 - vmovdqu m2, r0+16 - - vmovdqu m1, r1 - vmovdqu m3, r1+16 - - ;Calculate SADs for both - vpsadbw m0, m1 - vpsadbw m2, m3 - - ;Sum - vpaddw m0, m2 - - ;Repeat for the latter half - vmovdqu m1, 
r0+16*2 - vmovdqu m3, r0+16*3 - - vmovdqu m2, r1+16*2 - vmovdqu m4, r1+16*3 - - vpsadbw m1, m2 - vpsadbw m3, m4 - - vpaddw m1, m3 - - ;Sum all the SADs - vpaddw m0, m1 - - vmovhlps m1, m0 - vpaddw m0, m1 - - vmovd eax, m0 - - RET - - -;KVZ_SAD_8X8_STRIDE -;Calculates SAD of a 8x8 block inside a frame with stride -;r0 address of the first value(current) -;r1 address of the first value(reference) -;r2 stride - -cglobal sad_8x8_stride, 3, 3, 5 - - ;Zero m0 register - vpxor m0, m0 - - ;Load the first half to m1 and m3 registers(cur) - ;Current frame - ;Load to the high 64 bits of xmm - vmovhpd m1, r0 - add r0, r2 - ;Load to the low 64 bits - vmovlpd m1, r0 - - vmovhpd m3, r0+r2 - vmovlpd m3, r0+r2*2 - ;lea calculates the address to r0, - ;but doesn't load anything from - ;the memory. Equivalent for - ;two add r0, r2 instructions. - lea r0, r0+r2*2 - add r0, r2 - - ;Reference frame - vmovhpd m2, r1 - add r1, r2 - vmovlpd m2, r1 - - vmovhpd m4, r1+r2 - vmovlpd m4, r1+r2*2 - lea r1, r1+r2*2 - add r1, r2 - - vpsadbw m1, m2 - vpsadbw m3, m4 - - vpaddw m0, m1 - vpaddw m0, m3 - - ;Repeat for the other half - vmovhpd m1, r0 - add r0, r2 - vmovlpd m1, r0 - - vmovhpd m3, r0+r2 - vmovlpd m3, r0+r2*2 - lea r0, r0+r2*2 - add r0, r2 - - vmovhpd m2, r1 - add r1, r2 - vmovlpd m2, r1 - - vmovhpd m4, r1+r2 - vmovlpd m4, r1+r2*2 - lea r1, r1+r2*2 - add r1, r2 - - vpsadbw m1, m2 - vpsadbw m3, m4 - - vpaddw m0, m1 - vpaddw m0, m3 - - vmovhlps m1, m0 - vpaddw m0, m1 - - vmovd eax, m0 - - RET - - -;KVZ_SAD_16X16 -;Calculates SAD of the 256 consequtive bytes in memory -;r0 address of the first value(current) -;r1 address of the first value(reference) - -cglobal sad_16x16, 2, 2, 5 - - ;Zero m4 - vpxor m4, m4 - - %assign i 0 - - ;Repeat 8 times. 
- %rep 8 - - ;Load the next to rows of the current frame - vmovdqu m0, r0 + 16 * i - vmovdqu m2, r0 + 16 * (i + 1) - - ;Load the next to rows of the reference frame - vmovdqu m1, r1 + 16 * i - vmovdqu m3, r1 + 16 * (i + 1) - - vpsadbw m0, m1 - vpsadbw m2, m3 - - ;Accumulate SADs to m4 - vpaddw m4, m0 - vpaddw m4, m2 - - %assign i i+2 - - %endrep - - ;Calculate the final sum - vmovhlps m0, m4 - vpaddw m4, m0 - - vmovd eax, m4 - - RET - - -;KVZ_SAD_16X16_STRIDE -;Calculates SAD of a 16x16 block inside a frame with stride -;r0 address of the first value(current) -;r1 address of the first value(reference) -;r2 stride - -cglobal sad_16x16_stride, 3, 3, 5 - - vpxor m4, m4 - - %rep 8 - - ; Load the next 2 rows from rec_buf to m0 and m2 - vmovdqu m0, r0 - vmovdqu m2, r0 + r2 - lea r0, r0 + r2*2 - - ; Load the next 2 rows from ref_buf to m1 and m3 - vmovdqu m1, r1 - vmovdqu m3, r1 + r2 - lea r1, r1 + r2*2 - - vpsadbw m0, m1 - vpsadbw m2, m3 - - vpaddw m4, m0 - vpaddw m4, m2 - - %endrep - - vmovhlps m0, m4 - vpaddw m4, m0 - - vmovd eax, m4 - - RET - - -;KVZ_SAD_32x32_STRIDE -;Calculates SAD of a 32x32 block inside a frame with stride -;r0 address of the first value(current) -;r1 address of the first value(reference) -;r2 stride -cglobal sad_32x32_stride, 3, 3, 5 - vpxor m4, m4 - - ; Handle 2 lines per iteration - %rep 16 - vmovdqu m0, r0 - vmovdqu m1, r0 + 16 - vmovdqu m2, r0 + r2 - vmovdqu m3, r0 + r2 + 16 - lea r0, r0 + 2 * r2 - - vpsadbw m0, r1 - vpsadbw m1, r1 + 16 - vpsadbw m2, r1 + r2 - vpsadbw m3, r1 + r2 + 16 - lea r1, r1 + 2 * r2 - - vpaddd m4, m0 - vpaddd m4, m1 - vpaddd m4, m2 - vpaddd m4, m3 - %endrep - - vmovhlps m0, m4 - vpaddd m4, m0 - - vmovd eax, m4 - - RET - - -;KVZ_SAD_64x64_STRIDE -;Calculates SAD of a 64x64 block inside a frame with stride -;r0 address of the first value(current) -;r1 address of the first value(reference) -;r2 stride -cglobal sad_64x64_stride, 3, 4, 5 - vpxor m4, m4 ; sum accumulation register - mov r3, 4 ; number of iterations in the 
loop - -Process16Lines: - ; Intel optimization manual says to not unroll beyond 500 instructions. - ; Didn't seem to have much of an affect on Ivy Bridge or Haswell, but - ; smaller is better, when speed is the same, right? - %rep 16 - vmovdqu m0, r0 - vmovdqu m1, r0 + 1*16 - vmovdqu m2, r0 + 2*16 - vmovdqu m3, r0 + 3*16 - - vpsadbw m0, r1 - vpsadbw m1, r1 + 1*16 - vpsadbw m2, r1 + 2*16 - vpsadbw m3, r1 + 3*16 - - lea r0, r0 + r2 - lea r1, r1 + r2 - - vpaddd m4, m0 - vpaddd m4, m1 - vpaddd m4, m2 - vpaddd m4, m3 - %endrep - - dec r3 - jnz Process16Lines - - vmovhlps m0, m4 - vpaddd m4, m0 - - vmovd eax, m4 - - RET
View file
kvazaar-2.2.0.tar.gz/src/strategies/x86_asm/picture-x86-asm-sad.h
Deleted
@@ -1,56 +0,0 @@ -#ifndef _PICTURE_X86_ASM_SAD_H_ -#define _PICTURE_X86_ASM_SAD_H_ -/***************************************************************************** - * This file is part of Kvazaar HEVC encoder. - * - * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or - * other materials provided with the distribution. - * - * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS - ****************************************************************************/ - -/** - * \ingroup Optimization - * \file - * Optimizations for AVX, utilizing ASM implementations. - */ - -#include "global.h" // IWYU pragma: keep -#include "kvazaar.h" - -#if KVZ_BIT_DEPTH == 8 -unsigned kvz_sad_4x4_avx(const uint8_t*, const uint8_t*); -unsigned kvz_sad_8x8_avx(const uint8_t*, const uint8_t*); -unsigned kvz_sad_16x16_avx(const uint8_t*, const uint8_t*); - -unsigned kvz_sad_4x4_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); -unsigned kvz_sad_8x8_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); -unsigned kvz_sad_16x16_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); -unsigned kvz_sad_32x32_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); -unsigned kvz_sad_64x64_stride_avx(const uint8_t *data1, const uint8_t *data2, unsigned stride); -#endif // KVZ_BIT_DEPTH == 8 - -#endif
View file
kvazaar-2.2.0.tar.gz/src/strategies/x86_asm/picture-x86-asm-satd.asm
Deleted
@@ -1,575 +0,0 @@ -;/***************************************************************************** -; * This file is part of Kvazaar HEVC encoder. -; * -; * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors -; * All rights reserved. -; * -; * Redistribution and use in source and binary forms, with or without modification, -; * are permitted provided that the following conditions are met: -; * -; * * Redistributions of source code must retain the above copyright notice, this -; * list of conditions and the following disclaimer. -; * -; * * Redistributions in binary form must reproduce the above copyright notice, this -; * list of conditions and the following disclaimer in the documentation and/or -; * other materials provided with the distribution. -; * -; * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its -; * contributors may be used to endorse or promote products derived from -; * this software without specific prior written permission. -; * -; * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -; * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -; * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -; * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -; * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -; * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -; * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-; * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -; ****************************************************************************/ - -%include "x86inc.asm" - -;cglobal and RET macros are from the x86.inc -;they push and pop the necessary registers to -;stack depending on the operating system - -;Usage: cglobal name, %1, %2, %3 -;1%: Number of arguments -;2%: Number of registers used -;3%: Number of xmm registers used. -;More info in x86inc.asm - -SECTION .text - -;Set x86inc.asm macros to use avx and xmm registers -INIT_XMM avx - -;KVZ_ZERO_EXTEND_WD -;zero extend all packed words in xmm to dwords in 2 xmm registers -;%1 source register -;%2 lower destination register -;%3 higher destination register - -%macro KVZ_ZERO_EXTEND_WD 3 - - ;Zero extend high 64 bits - vmovhlps %3, %1 - vpmovzxwd %3, %3 - ;Zero extend low 64 bits - vpmovzxwd %2, %1 - -%endmacro ; KVZ_ZERO_EXTEND_WD - -; Use nondestructive horizontal add and sub to calculate both at the same time. -; TODO: It would probably be possible to do this with 3 registers (destructive vphsubw). -; args: -; 1, 2: input registers -; 3, 4: output registers - -%macro SATD_HORIZONTAL_SUB_AND_ADD 4 - - ; TODO: It might be possible to do this with 3 registers? 
- - ;First stage - vphaddw %3, %1, %2 - vphsubw %4, %1, %2 - - ;Second stage - vphaddw %1, %3, %4 - vphsubw %2, %3, %4 - - ;Third stage - vphaddw %3, %1, %2 - vphsubw %4, %1, %2 - -%endmacro ; SATD_HORIZONTAL_SUB_AND_ADD - -;KVZ_SATD_8X8_STRIDE -;Calculates SATD of a 8x8 block inside a frame with stride -;r0 address of the first value(reference) -;r1 address of the first value(current) -;r2 stride -; -;The Result is written in the register r4 - -%macro KVZ_SATD_8X8_STRIDE 0 - - ;Calculate differences of the 8 rows into - ;registers m0-m7 - vpmovzxbw m0, r0 - vpmovzxbw m7, r2 - vpsubw m0, m7 - - vpmovzxbw m1, r0+r1 - vpmovzxbw m7, r2+r3 - vpsubw m1, m7 - - ;Set r0 and r2 2 rows forward - lea r0, r0+r1*2 - lea r2, r2+r3*2 - - vpmovzxbw m2, r0 - vpmovzxbw m7, r2 - vpsubw m2, m7 - - vpmovzxbw m3, r0+r1 - vpmovzxbw m7, r2+r3 - vpsubw m3, m7 - - lea r0, r0+r1*2 - lea r2, r2+r3*2 - - vpmovzxbw m4, r0 - vpmovzxbw m7, r2 - vpsubw m4, m7 - - vpmovzxbw m5, r0+r1 - vpmovzxbw m7, r2+r3 - vpsubw m5, m7 - - lea r0, r0+r1*2 - lea r2, r2+r3*2 - - vpmovzxbw m6, r0 - vpmovzxbw m7, r2 - vpsubw m6, m7 - - ;32-bit AVX doesn't have registers - ;xmm8-xmm15, use stack instead - - %if ARCH_X86_64 - vpmovzxbw m7, r0+r1 - vpmovzxbw m8, r2+r3 - vpsubw m7, m8 - %else - %define temp0 esp+16*3 - %define temp1 esp+16*2 - %define temp2 esp+16*1 - %define temp3 esp+16*0 - - ;Reserve memory for 4 x 128 bits. - sub esp, 16*4 - - vpmovzxbw m7, r2+r3 - vmovdqu temp0, m7 - vpmovzxbw m7, r0+r1 - vpsubw m7, temp0 - - ;Put rows 5-8 to stack - vmovdqu temp0, m4 - vmovdqu temp1, m5 - vmovdqu temp2, m6 - vmovdqu temp3, m7 - %endif - - ;Hadamard transform (FWHT algorithm) - ;Horizontal transform - - %if ARCH_X86_64 - ;Calculate horizontal transform for each row. - ;Transforms of two rows are interleaved in register pairs. - ;(m8 and m9, m10 and m11,...) 
- - SATD_HORIZONTAL_SUB_AND_ADD m0, m1, m8, m9 - SATD_HORIZONTAL_SUB_AND_ADD m2, m3, m10, m11 - SATD_HORIZONTAL_SUB_AND_ADD m4, m5, m12, m13 - SATD_HORIZONTAL_SUB_AND_ADD m6, m7, m14, m15 - - %else - ;Calculate horizontal transforms for the first four rows. - ;Then load the other four into the registers and store - ;ready transforms in the stack. - ;Input registers are m0-m3, results are written in - ;registers m4-m7 (and memory). - - SATD_HORIZONTAL_SUB_AND_ADD m0, m1, m4, m5 - SATD_HORIZONTAL_SUB_AND_ADD m2, m3, m6, m7 - - vmovdqu m3, temp3 - vmovdqu m2, temp2 - vmovdqu m1, temp1 - vmovdqu m0, temp0 - - vmovdqu temp3, m7 - vmovdqu temp2, m6 - vmovdqu temp1, m5 - vmovdqu temp0, m4 - - SATD_HORIZONTAL_SUB_AND_ADD m0, m1, m4, m5 - SATD_HORIZONTAL_SUB_AND_ADD m2, m3, m6, m7 - %endif - - - ;Vertical transform - ;Transform columns of the 8x8 block. - ;First sum the interleaved horizontally - ;transformed values with one horizontal add - ;for each pair of rows. Then calculate - ;with regular packed additions and - ;subtractions. - - %if ARCH_X86_64 - ;Horizontally transformed values are in registers m8-m15 - ;Results are written in m0-m7 - - ;First stage - vphaddw m0, m8, m9 - vphsubw m1, m8, m9 - - vphaddw m2, m10, m11 - vphsubw m3, m10, m11 - - vphaddw m4, m12, m13 - vphsubw m5, m12, m13 - - vphaddw m6, m14, m15 - vphsubw m7, m14, m15 - - ;Second stage - vpaddw m8, m0, m2 - vpaddw m9, m1, m3 - vpsubw m10, m0, m2 - vpsubw m11, m1, m3 - - vpaddw m12, m4, m6 - vpaddw m13, m5, m7 - vpsubw m14, m4, m6 - vpsubw m15, m5, m7 - - ;Third stage - vpaddw m0, m8, m12 - vpaddw m1, m9, m13 - vpaddw m2, m10, m14 - vpaddw m3, m11, m15 - - vpsubw m4, m8, m12 - vpsubw m5, m9, m13 - vpsubw m6, m10, m14 - vpsubw m7, m11, m15 - - %else - ;Transformed values are in registers m4-m7 - ;and in memory(temp0-temp3). Transformed values - ;are written in m4-m7. Also calculate absolute - ;values for them and accumulate into ymm0. 
- - ;First stage - vphaddw m0, m4, m5 - vphsubw m1, m4, m5 - - vphaddw m2, m6, m7 - vphsubw m3, m6, m7 - - ;Second stage - vpaddw m4, m0, m2 - vpaddw m5, m1, m3 - vpsubw m6, m0, m2 - vpsubw m7, m1, m3 - - vmovdqu m3, temp3 - vmovdqu m2, temp2 - vmovdqu m1, temp1 - vmovdqu m0, temp0 - - vmovdqu temp3, m7 - vmovdqu temp2, m6 - vmovdqu temp1, m5 - vmovdqu temp0, m4 - - ;First stage (second half) - vphaddw m4, m0, m1 - vphsubw m5, m0, m1 - - vphaddw m6, m2, m3 - vphsubw m7, m2, m3 - - ;Second stage (second half) - vpaddw m0, m4, m6 - vpaddw m1, m5, m7 - vpsubw m2, m4, m6 - vpsubw m3, m5, m7 - - ;Third stage - vpaddw m4, m0, temp0 - vpaddw m5, m1, temp1 - vpsubw m6, m0, temp0 - vpsubw m7, m1, temp1 - - ;Calculate the absolute values and - ;zero extend 16-bit values to 32-bit - ;values. Then sum the values. - - vpabsw m4, m4 - KVZ_ZERO_EXTEND_WD m4, m4, m1 - vpaddd m4, m1 - - vpabsw m5, m5 - KVZ_ZERO_EXTEND_WD m5, m5, m1 - vpaddd m5, m1 - - vpabsw m6, m6 - KVZ_ZERO_EXTEND_WD m6, m6, m1 - vpaddd m6, m1 - - vpabsw m7, m7 - KVZ_ZERO_EXTEND_WD m7, m7, m1 - vpaddd m7, m1 - - vpaddd m0, m4, m5 - vpaddd m0, m6 - vpaddd m0, m7 - - ;Repeat for the rest - vpaddw m4, m2, temp2 - vpaddw m5, m3, temp3 - vpsubw m6, m2, temp2 - vpsubw m7, m3, temp3 - - vpabsw m4, m4 - KVZ_ZERO_EXTEND_WD m4, m4, m1 - vpaddd m4, m1 - - vpabsw m5, m5 - KVZ_ZERO_EXTEND_WD m5, m5, m1 - vpaddd m5, m1 - - vpabsw m6, m6 - KVZ_ZERO_EXTEND_WD m6, m6, m1 - vpaddd m6, m1 - - vpabsw m7, m7 - KVZ_ZERO_EXTEND_WD m7, m7, m1 - vpaddd m7, m1 - - ;Sum the other half of the packed results to ymm4 - vpaddd m4, m5 - vpaddd m4, m6 - vpaddd m4, m7 - - ;Sum all packed results to ymm0 - vpaddd m0, m4 - - %endif - - %if ARCH_X86_64 - - ;Calculate the absolute values and - ;zero extend 16-bit values to 32-bit - ;values. In other words: extend xmm to - ;corresponding ymm. 
- - vpabsw m0, m0 - KVZ_ZERO_EXTEND_WD m0, m0, m8 - vpaddd m0, m8 - - vpabsw m1, m1 - KVZ_ZERO_EXTEND_WD m1, m1, m8 - vpaddd m1, m8 - - vpabsw m2, m2 - KVZ_ZERO_EXTEND_WD m2, m2, m8 - vpaddd m1, m8 - - vpabsw m3, m3 - KVZ_ZERO_EXTEND_WD m3, m3, m8 - vpaddd m3, m8 - - vpabsw m4, m4 - KVZ_ZERO_EXTEND_WD m4, m4, m8 - vpaddd m4, m8 - - vpabsw m5, m5 - KVZ_ZERO_EXTEND_WD m5, m5, m8 - vpaddd m5, m8 - - vpabsw m6, m6 - KVZ_ZERO_EXTEND_WD m6, m6, m8 - vpaddd m6, m8 - - vpabsw m7, m7 - KVZ_ZERO_EXTEND_WD m7, m7, m8 - vpaddd m7, m8 - - ;Calculate packed sum of transformed values to ymm0 - vpaddd m0, m1 - vpaddd m0, m2 - vpaddd m0, m3 - vpaddd m0, m4 - vpaddd m0, m5 - vpaddd m0, m6 - vpaddd m0, m7 - %endif - - ;Sum the packed values to m032:0 - vphaddd m0, m0 - vphaddd m0, m0 - - ;The result is in the lowest 32 bits in m0 - vmovd r4d, m0 - - ;8x8 Hadamard transform requires - ;adding 2 and dividing by 4 - add r4, 2 - shr r4, 2 - - ;Zero high 128 bits of ymm registers to - ;prevent AVX-SSE transition penalty. - vzeroupper - - %if ARCH_X86_64 == 0 - add esp, 16*4 - %endif - -%endmacro ; KVZ_SATD_8X8_STRIDE - -;KVZ_SATD_4X4 -;Calculates SATD of the 16 consequtive bytes in memory -;r0 address of the first value(current) -;r1 address of the first value(reference) - -cglobal satd_4x4, 2, 2, 6 - - ;Load 8 bytes from memory and zero extend - ;to 16-bit values. Calculate difference. 
- vpmovzxbw m0, r0 - vpmovzxbw m2, r1 - vpsubw m0, m2 - - vpmovzxbw m1, r0+8 - vpmovzxbw m3, r1+8 - vpsubw m1, m3 - - ;Hadamard transform - ;Horizontal phase - ;First stage - vphaddw m4, m0, m1 - vphsubw m5, m0, m1 - ;Second stage - vphaddw m0, m4, m5 - vphsubw m1, m4, m5 - - ;Vertical phase - ;First stage - vphaddw m4, m0, m1 - vphsubw m5, m0, m1 - ;Second stage - vphaddw m0, m4, m5 - vphsubw m1, m4, m5 - - ;Calculate absolute values - vpabsw m0, m0 - vpabsw m1, m1 - - ;Sum the all the transformed values - vpaddw m0, m1 - - vphaddw m0, m0 - vphaddw m0, m0 - vphaddw m0, m0 - - ;Extract the lowest 16 bits of m0 - ;into eax - vpextrw eax, m0, 0 - - ;4x4 Hadamard transform requires - ;Addition of 1 and division by 2 - add eax, 1 - shr eax, 1 - - RET - - - -;KVZ_SATD_8X8 -;Calculates SATD of a 8x8 block inside a frame with stride -;r0 address of the first value(reference) -;r1 address of the first value(current) -;r2 stride - -%if ARCH_X86_64 - cglobal satd_8x8, 4, 5, 16 -%else - cglobal satd_8x8, 4, 5, 8 -%endif - - ;Set arguments - mov r2, r1 - mov r1, 8 - mov r3, 8 - - ;Calculate 8x8 SATD. Result is written - ;in the register r4. - KVZ_SATD_8X8_STRIDE - mov rax, r4 - RET - -;KVZ_SATD_NXN -;Calculates SATD of a NxN block inside a frame with stride -;r0 address of the first value(reference) -;r1 address of the first value(current) - -%macro KVZ_SATD_NXN 1 - - %if ARCH_X86_64 - cglobal satd_%1x%1, 2, 7, 16 - %else - cglobal satd_%1x%1, 2, 7, 8 - %endif - - ;Set arguments - mov r2, r1 - mov r1, %1 - mov r3, %1 - - ;Zero r5 and r6 - xor r5, r5 - xor r6, r6 - - ;Calculate SATDs of each 8x8 sub-blocks - ;and accumulate the results in r6. Repeat yloop - ;N times. Repeat xloop N times. r4 and r5 are counters - ;for the loops. - - .yloop - - ;zero r4 - xor r4, r4 - - .xloop - push r4 - - ;Calculate SATD of the sub-block. Result is - ;written in the register r4. 
- KVZ_SATD_8X8_STRIDE - add r6, r4 - - ;Set r2 and r0 to the next sub-block - ;on the same row - sub r2, 6*%1-8 - sub r0, 6*%1-8 - - pop r4 - add r4, 8 - cmp r4, %1 - jne .xloop - - ;Set r2 and r0 to the first sub-block - ;on the next row(of 8x8 sub-blocks) - add r2, 7*%1 - add r0, 7*%1 - - add r5, 8 - cmp r5, %1 - jne .yloop - - mov rax, r6 - RET - -%endmacro ; KVZ_SATD_NXN - -KVZ_SATD_NXN 16 -KVZ_SATD_NXN 32 -KVZ_SATD_NXN 64
View file
kvazaar-2.2.0.tar.gz/src/strategies/x86_asm/picture-x86-asm-satd.h
Deleted
@@ -1,50 +0,0 @@ -#ifndef _PICTURE_X86_ASM_SATD_H_ -#define _PICTURE_X86_ASM_SATD_H_ -/***************************************************************************** - * This file is part of Kvazaar HEVC encoder. - * - * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or - * other materials provided with the distribution. - * - * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS - ****************************************************************************/ - -/** - * \ingroup Optimization - * \file - * Optimizations for AVX, utilizing ASM implementations. - */ - -#include "global.h" // IWYU pragma: keep - - -unsigned kvz_satd_4x4_avx(const kvz_pixel *org, const kvz_pixel *cur); -unsigned kvz_satd_8x8_avx(const kvz_pixel *org, const kvz_pixel *cur); -unsigned kvz_satd_16x16_avx(const kvz_pixel *org, const kvz_pixel *cur); -unsigned kvz_satd_32x32_avx(const kvz_pixel *org, const kvz_pixel *cur); -unsigned kvz_satd_64x64_avx(const kvz_pixel *org, const kvz_pixel *cur); - -#endif
View file
kvazaar-2.2.0.tar.gz/src/strategies/x86_asm/picture-x86-asm.c
Deleted
@@ -1,132 +0,0 @@ -/***************************************************************************** - * This file is part of Kvazaar HEVC encoder. - * - * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or - * other materials provided with the distribution. - * - * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS - ****************************************************************************/ - -#include "strategies/x86_asm/picture-x86-asm.h" - -#if defined(KVZ_COMPILE_ASM) -#include "kvazaar.h" -#if KVZ_BIT_DEPTH == 8 -#include <stdlib.h> - -#include "strategies/x86_asm/picture-x86-asm-sad.h" -#include "strategies/x86_asm/picture-x86-asm-satd.h" -#include "strategies/sse41/picture-sse41.h" -#include "strategyselector.h" - - -static unsigned kvz_sad_32x32_avx(const uint8_t *data1, const uint8_t *data2) -{ - unsigned sad = 0; - sad += kvz_sad_16x16_avx(data1, data2); - sad += kvz_sad_16x16_avx(data1 + 8 * 32, data2 + 8 * 32); - sad += kvz_sad_16x16_avx(data1 + 16 * 32, data2 + 16 * 32); - sad += kvz_sad_16x16_avx(data1 + 24 * 32, data2 + 24 * 32); - return sad; -} - -static unsigned kvz_sad_64x64_avx(const uint8_t *data1, const uint8_t *data2) -{ - unsigned sad = 0; - sad += kvz_sad_32x32_avx(data1, data2); - sad += kvz_sad_32x32_avx(data1 + 16 * 64, data2 + 16 * 64); - sad += kvz_sad_32x32_avx(data1 + 32 * 64, data2 + 32 * 64); - sad += kvz_sad_32x32_avx(data1 + 48 * 64, data2 + 48 * 64); - return sad; -} - -static unsigned kvz_sad_other_avx(const uint8_t *data1, const uint8_t *data2, - int width, int height, - unsigned stride) -{ - unsigned sad = 0; - - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - sad += abs(data1y * stride + x - data2y * stride + x); - } - } - - return sad; -} - -static unsigned reg_sad_x86_asm(const uint8_t *data1, const uint8_t * data2, - const int width, const int height, - const unsigned stride1, const unsigned stride2) -{ - if (width == height) { - if (width == 8) { - return kvz_sad_8x8_stride_avx(data1, data2, stride1); - } else if (width == 16) { - return kvz_sad_16x16_stride_avx(data1, data2, stride1); - } else if (width == 32) { - return kvz_sad_32x32_stride_avx(data1, data2, stride1); - } else if (width == 64) { - return 
kvz_sad_64x64_stride_avx(data1, data2, stride1); - } - } - - if (width * height >= 16) { - // Call the vectorized general SAD SSE41 function when the block - // is big enough to make it worth it. - return kvz_reg_sad_sse41(data1, data2, width, height, stride1, stride2); - } else { - return kvz_sad_other_avx(data1, data2, width, height, stride1); - } -} - -#endif // KVZ_BIT_DEPTH == 8 -#endif //defined(KVZ_COMPILE_ASM) - -int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth) -{ - bool success = true; -#if defined(KVZ_COMPILE_ASM) -#if KVZ_BIT_DEPTH == 8 - if (bitdepth == 8){ - success &= kvz_strategyselector_register(opaque, "reg_sad", "x86_asm_avx", 30, ®_sad_x86_asm); - - success &= kvz_strategyselector_register(opaque, "sad_4x4", "x86_asm_avx", 30, &kvz_sad_4x4_avx); - success &= kvz_strategyselector_register(opaque, "sad_8x8", "x86_asm_avx", 30, &kvz_sad_8x8_avx); - success &= kvz_strategyselector_register(opaque, "sad_16x16", "x86_asm_avx", 30, &kvz_sad_16x16_avx); - success &= kvz_strategyselector_register(opaque, "sad_32x32", "x86_asm_avx", 30, &kvz_sad_32x32_avx); - success &= kvz_strategyselector_register(opaque, "sad_64x64", "x86_asm_avx", 30, &kvz_sad_64x64_avx); - - success &= kvz_strategyselector_register(opaque, "satd_4x4", "x86_asm_avx", 30, &kvz_satd_4x4_avx); - success &= kvz_strategyselector_register(opaque, "satd_8x8", "x86_asm_avx", 30, &kvz_satd_8x8_avx); - success &= kvz_strategyselector_register(opaque, "satd_16x16", "x86_asm_avx", 30, &kvz_satd_16x16_avx); - success &= kvz_strategyselector_register(opaque, "satd_32x32", "x86_asm_avx", 30, &kvz_satd_32x32_avx); - success &= kvz_strategyselector_register(opaque, "satd_64x64", "x86_asm_avx", 30, &kvz_satd_64x64_avx); - } -#endif // KVZ_BIT_DEPTH == 8 -#endif //!defined(KVZ_COMPILE_ASM) - return success; -}
View file
kvazaar-2.2.0.tar.gz/src/strategies/x86_asm/picture-x86-asm.h
Deleted
@@ -1,46 +0,0 @@ -#ifndef STRATEGIES_PICTURE_X86_ASM_H_ -#define STRATEGIES_PICTURE_X86_ASM_H_ -/***************************************************************************** - * This file is part of Kvazaar HEVC encoder. - * - * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or - * other materials provided with the distribution. - * - * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR - * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS - ****************************************************************************/ - -/** - * \ingroup Optimization - * \file - * Optimizations for AVX, utilizing ASM implementations. - */ - -#include "global.h" // IWYU pragma: keep - - -int kvz_strategy_register_picture_x86_asm_avx(void* opaque, uint8_t bitdepth); - -#endif //STRATEGIES_PICTURE_X86_ASM_H_
View file
kvazaar-2.2.0.tar.gz/src/strategies/x86_asm/x86inc.asm
Deleted
@@ -1,1466 +0,0 @@ -;***************************************************************************** -;* x86inc.asm: x264asm abstraction layer -;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project -;* -;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* Anton Mitrofanov <BugMaster@narod.ru> -;* Jason Garrett-Glaser <darkshikari@gmail.com> -;* Henrik Gramner <henrik@gramner.com> -;* -;* Permission to use, copy, modify, and/or distribute this software for any -;* purpose with or without fee is hereby granted, provided that the above -;* copyright notice and this permission notice appear in all copies. -;* -;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -;***************************************************************************** - -; This is a header file for the x264ASM assembly language, which uses -; NASM/YASM syntax combined with a large number of macros to provide easy -; abstraction between different calling conventions (x86_32, win64, linux64). -; It also has various other useful features to simplify writing the kind of -; DSP functions that are most often used in x264. - -; Unlike the rest of x264, this file is available under an ISC license, as it -; has significant usefulness outside of x264 and we want it to be available -; to the largest audience possible. Of course, if you modify it for your own -; purposes to add a new feature, we strongly encourage contributing a patch -; as this feature might be useful for others as well. 
Send patches or ideas -; to x264-devel@videolan.org . - -%ifndef private_prefix - %define private_prefix kvz -%endif - -%ifndef public_prefix - %define public_prefix private_prefix -%endif - -%define WIN64 0 -%define UNIX64 0 -%if ARCH_X86_64 - %ifidn __OUTPUT_FORMAT__,win32 - %define WIN64 1 - %elifidn __OUTPUT_FORMAT__,win64 - %define WIN64 1 - %elifidn __OUTPUT_FORMAT__,x64 - %define WIN64 1 - %else - %define UNIX64 1 - %endif -%endif - -%ifdef PREFIX - %define mangle(x) _ %+ x -%else - %define mangle(x) x -%endif - -%macro SECTION_RODATA 0-1 16 - SECTION .rodata align=%1 -%endmacro - -%macro SECTION_TEXT 0-1 16 - SECTION .text align=%1 -%endmacro - -%if WIN64 - %define PIC -%elif ARCH_X86_64 == 0 -; x86_32 doesn't require PIC. -; Some distros prefer shared objects to be PIC, but nothing breaks if -; the code contains a few textrels, so we'll skip that complexity. - %undef PIC -%endif -%ifdef PIC - default rel -%endif - -%macro CPUNOP 1 - %ifdef __YASM_MAJOR__ - CPU %1 - %endif -%endmacro - -; Always use long nops (reduces 0x90 spam in disassembly on x86_32) -CPUNOP amdnop - -; Macros to eliminate most code duplication between x86_32 and x86_64: -; Currently this works only for leaf functions which load all their arguments -; into registers at the start, and make no other use of the stack. Luckily that -; covers most of x264's asm. - -; PROLOGUE: -; %1 = number of arguments. loads them from stack if needed. -; %2 = number of registers used. pushes callee-saved regs if needed. -; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. -; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x, -; MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes), -; and an extra register will be allocated to hold the original stack -; pointer (to not invalidate r0m etc.). To prevent the use of an extra -; register as stack pointer, request a negative stack size. 
-; %4+/%5+ = list of names to define to registers -; PROLOGUE can also be invoked by adding the same options to cglobal - -; e.g. -; cglobal foo, 2,3,0, dst, src, tmp -; declares a function (foo), taking two args (dst and src) and one local variable (tmp) - -; TODO Some functions can use some args directly from the stack. If they're the -; last args then you can just not declare them, but if they're in the middle -; we need more flexible macro. - -; RET: -; Pops anything that was pushed by PROLOGUE, and returns. - -; REP_RET: -; Use this instead of RET if it's a branch target. - -; registers: -; rN and rNq are the native-size register holding function argument N -; rNd, rNw, rNb are dword, word, and byte size -; rNh is the high 8 bits of the word size -; rNm is the original location of arg N (a register or on the stack), dword -; rNmp is native size - -%macro DECLARE_REG 2-3 - %define r%1q %2 - %define r%1d %2d - %define r%1w %2w - %define r%1b %2b - %define r%1h %2h - %if %0 == 2 - %define r%1m %2d - %define r%1mp %2 - %elif ARCH_X86_64 ; memory - %define r%1m rstk + stack_offset + %3 - %define r%1mp qword r %+ %1 %+ m - %else - %define r%1m rstk + stack_offset + %3 - %define r%1mp dword r %+ %1 %+ m - %endif - %define r%1 %2 -%endmacro - -%macro DECLARE_REG_SIZE 3 - %define r%1q r%1 - %define e%1q r%1 - %define r%1d e%1 - %define e%1d e%1 - %define r%1w %1 - %define e%1w %1 - %define r%1h %3 - %define e%1h %3 - %define r%1b %2 - %define e%1b %2 -%if ARCH_X86_64 == 0 - %define r%1 e%1 -%endif -%endmacro - -DECLARE_REG_SIZE ax, al, ah -DECLARE_REG_SIZE bx, bl, bh -DECLARE_REG_SIZE cx, cl, ch -DECLARE_REG_SIZE dx, dl, dh -DECLARE_REG_SIZE si, sil, null -DECLARE_REG_SIZE di, dil, null -DECLARE_REG_SIZE bp, bpl, null - -; t# defines for when per-arch register allocation is more complex than just function arguments - -%macro DECLARE_REG_TMP 1-* - %assign %%i 0 - %rep %0 - CAT_XDEFINE t, %%i, r%1 - %assign %%i %%i+1 - %rotate 1 - %endrep -%endmacro - -%macro 
DECLARE_REG_TMP_SIZE 0-* - %rep %0 - %define t%1q t%1 %+ q - %define t%1d t%1 %+ d - %define t%1w t%1 %+ w - %define t%1h t%1 %+ h - %define t%1b t%1 %+ b - %rotate 1 - %endrep -%endmacro - -DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 - -%if ARCH_X86_64 - %define gprsize 8 -%else - %define gprsize 4 -%endif - -%macro PUSH 1 - push %1 - %ifidn rstk, rsp - %assign stack_offset stack_offset+gprsize - %endif -%endmacro - -%macro POP 1 - pop %1 - %ifidn rstk, rsp - %assign stack_offset stack_offset-gprsize - %endif -%endmacro - -%macro PUSH_IF_USED 1-* - %rep %0 - %if %1 < regs_used - PUSH r%1 - %endif - %rotate 1 - %endrep -%endmacro - -%macro POP_IF_USED 1-* - %rep %0 - %if %1 < regs_used - pop r%1 - %endif - %rotate 1 - %endrep -%endmacro - -%macro LOAD_IF_USED 1-* - %rep %0 - %if %1 < num_args - mov r%1, r %+ %1 %+ mp - %endif - %rotate 1 - %endrep -%endmacro - -%macro SUB 2 - sub %1, %2 - %ifidn %1, rstk - %assign stack_offset stack_offset+(%2) - %endif -%endmacro - -%macro ADD 2 - add %1, %2 - %ifidn %1, rstk - %assign stack_offset stack_offset-(%2) - %endif -%endmacro - -%macro movifnidn 2 - %ifnidn %1, %2 - mov %1, %2 - %endif -%endmacro - -%macro movsxdifnidn 2 - %ifnidn %1, %2 - movsxd %1, %2 - %endif -%endmacro - -%macro ASSERT 1 - %if (%1) == 0 - %error assert failed - %endif -%endmacro - -%macro DEFINE_ARGS 0-* - %ifdef n_arg_names - %assign %%i 0 - %rep n_arg_names - CAT_UNDEF arg_name %+ %%i, q - CAT_UNDEF arg_name %+ %%i, d - CAT_UNDEF arg_name %+ %%i, w - CAT_UNDEF arg_name %+ %%i, h - CAT_UNDEF arg_name %+ %%i, b - CAT_UNDEF arg_name %+ %%i, m - CAT_UNDEF arg_name %+ %%i, mp - CAT_UNDEF arg_name, %%i - %assign %%i %%i+1 - %endrep - %endif - - %xdefine %%stack_offset stack_offset - %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine - %assign %%i 0 - %rep %0 - %xdefine %1q r %+ %%i %+ q - %xdefine %1d r %+ %%i %+ d - %xdefine %1w r %+ %%i %+ w - %xdefine %1h r %+ %%i %+ h - %xdefine %1b r %+ %%i %+ 
b - %xdefine %1m r %+ %%i %+ m - %xdefine %1mp r %+ %%i %+ mp - CAT_XDEFINE arg_name, %%i, %1 - %assign %%i %%i+1 - %rotate 1 - %endrep - %xdefine stack_offset %%stack_offset - %assign n_arg_names %0 -%endmacro - -%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) - %ifnum %1 - %if %1 != 0 - %assign %%stack_alignment ((mmsize + 15) & ~15) - %assign stack_size %1 - %if stack_size < 0 - %assign stack_size -stack_size - %endif - %assign stack_size_padded stack_size - %if WIN64 - %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space - %if mmsize != 8 - %assign xmm_regs_used %2 - %if xmm_regs_used > 8 - %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16 - %endif - %endif - %endif - %if mmsize <= 16 && HAVE_ALIGNED_STACK - %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) - SUB rsp, stack_size_padded - %else - %assign %%reg_num (regs_used - 1) - %xdefine rstk r %+ %%reg_num - ; align stack, and save original stack location directly above - ; it, i.e. in rsp+stack_size_padded, so we can restore the - ; stack in a single instruction (i.e. 
mov rsp, rstk or mov - ; rsp, rsp+stack_size_padded) - mov rstk, rsp - %if %1 < 0 ; need to store rsp on stack - sub rsp, gprsize+stack_size_padded - and rsp, ~(%%stack_alignment-1) - %xdefine rstkm rsp+stack_size_padded - mov rstkm, rstk - %else ; can keep rsp in rstk during whole function - sub rsp, stack_size_padded - and rsp, ~(%%stack_alignment-1) - %xdefine rstkm rstk - %endif - %endif - WIN64_PUSH_XMM - %endif - %endif -%endmacro - -%macro SETUP_STACK_POINTER 1 - %ifnum %1 - %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32) - %if %1 > 0 - %assign regs_used (regs_used + 1) - %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 - %warning "Stack pointer will overwrite register argument" - %endif - %endif - %endif -%endmacro - -%macro DEFINE_ARGS_INTERNAL 3+ - %ifnum %2 - DEFINE_ARGS %3 - %elif %1 == 4 - DEFINE_ARGS %2 - %elif %1 > 4 - DEFINE_ARGS %2, %3 - %endif -%endmacro - -%if WIN64 ; Windows x64 ;================================================= - -DECLARE_REG 0, rcx -DECLARE_REG 1, rdx -DECLARE_REG 2, R8 -DECLARE_REG 3, R9 -DECLARE_REG 4, R10, 40 -DECLARE_REG 5, R11, 48 -DECLARE_REG 6, rax, 56 -DECLARE_REG 7, rdi, 64 -DECLARE_REG 8, rsi, 72 -DECLARE_REG 9, rbx, 80 -DECLARE_REG 10, rbp, 88 -DECLARE_REG 11, R12, 96 -DECLARE_REG 12, R13, 104 -DECLARE_REG 13, R14, 112 -DECLARE_REG 14, R15, 120 - -%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, stack_size, arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 15 - PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 - ALLOC_STACK %4, %3 - %if mmsize != 8 && stack_size == 0 - WIN64_SPILL_XMM %3 - %endif - LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%macro WIN64_PUSH_XMM 0 - ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. 
- %if xmm_regs_used > 6 - movaps rstk + stack_offset + 8, xmm6 - %endif - %if xmm_regs_used > 7 - movaps rstk + stack_offset + 24, xmm7 - %endif - %if xmm_regs_used > 8 - %assign %%i 8 - %rep xmm_regs_used-8 - movaps rsp + (%%i-8)*16 + stack_size + 32, xmm %+ %%i - %assign %%i %%i+1 - %endrep - %endif -%endmacro - -%macro WIN64_SPILL_XMM 1 - %assign xmm_regs_used %1 - ASSERT xmm_regs_used <= 16 - %if xmm_regs_used > 8 - %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32 - SUB rsp, stack_size_padded - %endif - WIN64_PUSH_XMM -%endmacro - -%macro WIN64_RESTORE_XMM_INTERNAL 1 - %assign %%pad_size 0 - %if xmm_regs_used > 8 - %assign %%i xmm_regs_used - %rep xmm_regs_used-8 - %assign %%i %%i-1 - movaps xmm %+ %%i, %1 + (%%i-8)*16 + stack_size + 32 - %endrep - %endif - %if stack_size_padded > 0 - %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0) - mov rsp, rstkm - %else - add %1, stack_size_padded - %assign %%pad_size stack_size_padded - %endif - %endif - %if xmm_regs_used > 7 - movaps xmm7, %1 + stack_offset - %%pad_size + 24 - %endif - %if xmm_regs_used > 6 - movaps xmm6, %1 + stack_offset - %%pad_size + 8 - %endif -%endmacro - -%macro WIN64_RESTORE_XMM 1 - WIN64_RESTORE_XMM_INTERNAL %1 - %assign stack_offset (stack_offset-stack_size_padded) - %assign xmm_regs_used 0 -%endmacro - -%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 - -%macro RET 0 - WIN64_RESTORE_XMM_INTERNAL rsp - POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%elif ARCH_X86_64 ; *nix x64 ;============================================= - -DECLARE_REG 0, rdi -DECLARE_REG 1, rsi -DECLARE_REG 2, rdx -DECLARE_REG 3, rcx -DECLARE_REG 4, R8 -DECLARE_REG 5, R9 -DECLARE_REG 6, rax, 8 -DECLARE_REG 7, R10, 16 -DECLARE_REG 8, R11, 24 -DECLARE_REG 9, rbx, 32 -DECLARE_REG 10, rbp, 40 -DECLARE_REG 11, R12, 48 -DECLARE_REG 12, R13, 56 -DECLARE_REG 13, R14, 64 -DECLARE_REG 14, R15, 72 - 
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, stack_size, arg_names... - %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 15 - PUSH_IF_USED 9, 10, 11, 12, 13, 14 - ALLOC_STACK %4 - LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 - -%macro RET 0 -%if stack_size_padded > 0 -%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 - mov rsp, rstkm -%else - add rsp, stack_size_padded -%endif -%endif - POP_IF_USED 14, 13, 12, 11, 10, 9 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%else ; X86_32 ;============================================================== - -DECLARE_REG 0, eax, 4 -DECLARE_REG 1, ecx, 8 -DECLARE_REG 2, edx, 12 -DECLARE_REG 3, ebx, 16 -DECLARE_REG 4, esi, 20 -DECLARE_REG 5, edi, 24 -DECLARE_REG 6, ebp, 28 -%define rsp esp - -%macro DECLARE_ARG 1-* - %rep %0 - %define r%1m rstk + stack_offset + 4*%1 + 4 - %define r%1mp dword r%1m - %rotate 1 - %endrep -%endmacro - -DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 - -%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, stack_size, arg_names... 
- %assign num_args %1 - %assign regs_used %2 - ASSERT regs_used >= num_args - %if num_args > 7 - %assign num_args 7 - %endif - %if regs_used > 7 - %assign regs_used 7 - %endif - SETUP_STACK_POINTER %4 - ASSERT regs_used <= 7 - PUSH_IF_USED 3, 4, 5, 6 - ALLOC_STACK %4 - LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 - DEFINE_ARGS_INTERNAL %0, %4, %5 -%endmacro - -%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 - -%macro RET 0 -%if stack_size_padded > 0 -%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 - mov rsp, rstkm -%else - add rsp, stack_size_padded -%endif -%endif - POP_IF_USED 6, 5, 4, 3 -%if mmsize == 32 - vzeroupper -%endif - AUTO_REP_RET -%endmacro - -%endif ;====================================================================== - -%if WIN64 == 0 -%macro WIN64_SPILL_XMM 1 -%endmacro -%macro WIN64_RESTORE_XMM 1 -%endmacro -%macro WIN64_PUSH_XMM 0 -%endmacro -%endif - -; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either -; a branch or a branch target. So switch to a 2-byte form of ret in that case. -; We can automatically detect "follows a branch", but not a branch target. -; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) -%macro REP_RET 0 - %if has_epilogue - RET - %else - rep ret - %endif -%endmacro - -%define last_branch_adr $$ -%macro AUTO_REP_RET 0 - %ifndef cpuflags - times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr. 
- %elif notcpuflag(ssse3) - times ((last_branch_adr-$)>>31)+1 rep - %endif - ret -%endmacro - -%macro BRANCH_INSTR 0-* - %rep %0 - %macro %1 1-2 %1 - %2 %1 - %%branch_instr: - %xdefine last_branch_adr %%branch_instr - %endmacro - %rotate 1 - %endrep -%endmacro - -BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp - -%macro TAIL_CALL 2 ; callee, is_nonadjacent - %if has_epilogue - call %1 - RET - %elif %2 - jmp %1 - %endif -%endmacro - -;============================================================================= -; arch-independent part -;============================================================================= - -%assign function_align 16 - -; Begin a function. -; Applies any symbol mangling needed for C linkage, and sets up a define such that -; subsequent uses of the function name automatically refer to the mangled version. -; Appends cpuflags to the function name if cpuflags has been specified. -; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX -; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
-%macro cglobal 1-2+ "" ; name, PROLOGUE args - cglobal_internal 1, %1 %+ SUFFIX, %2 -%endmacro -%macro cvisible 1-2+ "" ; name, PROLOGUE args - cglobal_internal 0, %1 %+ SUFFIX, %2 -%endmacro -%macro cglobal_internal 2-3+ - %if %1 - %xdefine %%FUNCTION_PREFIX private_prefix - %xdefine %%VISIBILITY hidden - %else - %xdefine %%FUNCTION_PREFIX public_prefix - %xdefine %%VISIBILITY - %endif - %ifndef cglobaled_%2 - %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) - %xdefine %2.skip_prologue %2 %+ .skip_prologue - CAT_XDEFINE cglobaled_, %2, 1 - %endif - %xdefine current_function %2 - %ifidn __OUTPUT_FORMAT__,elf - global %2:function %%VISIBILITY - %else - global %2 - %endif - align function_align - %2: - RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer - %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required - %assign stack_offset 0 ; stack pointer offset relative to the return address - %assign stack_size 0 ; amount of stack space that can be freely used inside a function - %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding - %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 - %ifnidn %3, "" - PROLOGUE %3 - %endif -%endmacro - -%macro cextern 1 - %xdefine %1 mangle(private_prefix %+ _ %+ %1) - CAT_XDEFINE cglobaled_, %1, 1 - extern %1 -%endmacro - -; like cextern, but without the prefix -%macro cextern_naked 1 - %xdefine %1 mangle(%1) - CAT_XDEFINE cglobaled_, %1, 1 - extern %1 -%endmacro - -%macro const 1-2+ - %xdefine %1 mangle(private_prefix %+ _ %+ %1) - %ifidn __OUTPUT_FORMAT__,elf - global %1:data hidden - %else - global %1 - %endif - %1: %2 -%endmacro - -; This is needed for ELF, otherwise the GNU linker assumes the stack is -; executable by default. 
-%ifidn __OUTPUT_FORMAT__,elf -SECTION .note.GNU-stack noalloc noexec nowrite progbits -%endif - -; cpuflags - -%assign cpuflags_mmx (1<<0) -%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx -%assign cpuflags_3dnow (1<<2) | cpuflags_mmx -%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow -%assign cpuflags_sse (1<<4) | cpuflags_mmx2 -%assign cpuflags_sse2 (1<<5) | cpuflags_sse -%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 -%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 -%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 -%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 -%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 -%assign cpuflags_avx (1<<11)| cpuflags_sse42 -%assign cpuflags_xop (1<<12)| cpuflags_avx -%assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_avx2 (1<<14)| cpuflags_avx -%assign cpuflags_fma3 (1<<15)| cpuflags_avx - -%assign cpuflags_cache32 (1<<16) -%assign cpuflags_cache64 (1<<17) -%assign cpuflags_slowctz (1<<18) -%assign cpuflags_lzcnt (1<<19) -%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<21) -%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt -%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 - -%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) -%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) - -; Takes up to 2 cpuflags from the above list. -; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. -; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. 
-%macro INIT_CPUFLAGS 0-2 - CPUNOP amdnop - %if %0 >= 1 - %xdefine cpuname %1 - %assign cpuflags cpuflags_%1 - %if %0 >= 2 - %xdefine cpuname %1_%2 - %assign cpuflags cpuflags | cpuflags_%2 - %endif - %xdefine SUFFIX _ %+ cpuname - %if cpuflag(avx) - %assign avx_enabled 1 - %endif - %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) - %define mova movaps - %define movu movups - %define movnta movntps - %endif - %if cpuflag(aligned) - %define movu mova - %elifidn %1, sse3 - %define movu lddqu - %endif - %if ARCH_X86_64 == 0 && notcpuflag(sse2) - CPUNOP basicnop - %endif - %else - %xdefine SUFFIX - %undef cpuname - %undef cpuflags - %endif -%endmacro - -; Merge mmx and sse* -; m# is a simd register of the currently selected size -; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# -; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# -; (All 3 remain in sync through SWAP.) - -%macro CAT_XDEFINE 3 - %xdefine %1%2 %3 -%endmacro - -%macro CAT_UNDEF 2 - %undef %1%2 -%endmacro - -%macro INIT_MMX 0-1+ - %assign avx_enabled 0 - %define RESET_MM_PERMUTATION INIT_MMX %1 - %define mmsize 8 - %define num_mmregs 8 - %define mova movq - %define movu movq - %define movh movd - %define movnta movntq - %assign %%i 0 - %rep 8 - CAT_XDEFINE m, %%i, mm %+ %%i - CAT_XDEFINE nmm, %%i, %%i - %assign %%i %%i+1 - %endrep - %rep 8 - CAT_UNDEF m, %%i - CAT_UNDEF nmm, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -%macro INIT_XMM 0-1+ - %assign avx_enabled 0 - %define RESET_MM_PERMUTATION INIT_XMM %1 - %define mmsize 16 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif - %define mova movdqa - %define movu movdqu - %define movh movq - %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, xmm %+ %%i - CAT_XDEFINE nxmm, %%i, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -%macro INIT_YMM 0-1+ - %assign avx_enabled 1 - %define 
RESET_MM_PERMUTATION INIT_YMM %1 - %define mmsize 32 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif - %define mova movdqa - %define movu movdqu - %undef movh - %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, ymm %+ %%i - CAT_XDEFINE nymm, %%i, %%i - %assign %%i %%i+1 - %endrep - INIT_CPUFLAGS %1 -%endmacro - -INIT_XMM - -%macro DECLARE_MMCAST 1 - %define mmmm%1 mm%1 - %define mmxmm%1 mm%1 - %define mmymm%1 mm%1 - %define xmmmm%1 mm%1 - %define xmmxmm%1 xmm%1 - %define xmmymm%1 xmm%1 - %define ymmmm%1 mm%1 - %define ymmxmm%1 xmm%1 - %define ymmymm%1 ymm%1 - %define xm%1 xmm %+ m%1 - %define ym%1 ymm %+ m%1 -%endmacro - -%assign i 0 -%rep 16 - DECLARE_MMCAST i -%assign i i+1 -%endrep - -; I often want to use macros that permute their arguments. e.g. there's no -; efficient way to implement butterfly or transpose or dct without swapping some -; arguments. -; -; I would like to not have to manually keep track of the permutations: -; If I insert a permutation in the middle of a function, it should automatically -; change everything that follows. For more complex macros I may also have multiple -; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. -; -; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that -; permutes its arguments. It's equivalent to exchanging the contents of the -; registers, except that this way you exchange the register names instead, so it -; doesn't cost any cycles. - -%macro PERMUTE 2-* ; takes a list of pairs to swap -%rep %0/2 - %xdefine %%tmp%2 m%2 - %rotate 2 -%endrep -%rep %0/2 - %xdefine m%1 %%tmp%2 - CAT_XDEFINE n, m%1, %1 - %rotate 2 -%endrep -%endmacro - -%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) -%ifnum %1 ; SWAP 0, 1, ... - SWAP_INTERNAL_NUM %1, %2 -%else ; SWAP m0, m1, ... 
- SWAP_INTERNAL_NAME %1, %2 -%endif -%endmacro - -%macro SWAP_INTERNAL_NUM 2-* - %rep %0-1 - %xdefine %%tmp m%1 - %xdefine m%1 m%2 - %xdefine m%2 %%tmp - CAT_XDEFINE n, m%1, %1 - CAT_XDEFINE n, m%2, %2 - %rotate 1 - %endrep -%endmacro - -%macro SWAP_INTERNAL_NAME 2-* - %xdefine %%args n %+ %1 - %rep %0-1 - %xdefine %%args %%args, n %+ %2 - %rotate 1 - %endrep - SWAP_INTERNAL_NUM %%args -%endmacro - -; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later -; calls to that function will automatically load the permutation, so values can -; be returned in mmregs. -%macro SAVE_MM_PERMUTATION 0-1 - %if %0 - %xdefine %%f %1_m - %else - %xdefine %%f current_function %+ _m - %endif - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE %%f, %%i, m %+ %%i - %assign %%i %%i+1 - %endrep -%endmacro - -%macro LOAD_MM_PERMUTATION 1 ; name to load from - %ifdef %1_m0 - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, %1_m %+ %%i - CAT_XDEFINE n, m %+ %%i, %%i - %assign %%i %%i+1 - %endrep - %endif -%endmacro - -; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't -%macro call 1 - call_internal %1, %1 %+ SUFFIX -%endmacro -%macro call_internal 2 - %xdefine %%i %1 - %ifndef cglobaled_%1 - %ifdef cglobaled_%2 - %xdefine %%i %2 - %endif - %endif - call %%i - LOAD_MM_PERMUTATION %%i -%endmacro - -; Substitutions that reduce instruction size but are functionally equivalent -%macro add 2 - %ifnum %2 - %if %2==128 - sub %1, -128 - %else - add %1, %2 - %endif - %else - add %1, %2 - %endif -%endmacro - -%macro sub 2 - %ifnum %2 - %if %2==128 - add %1, -128 - %else - sub %1, %2 - %endif - %else - sub %1, %2 - %endif -%endmacro - -;============================================================================= -; AVX abstraction layer -;============================================================================= - -%assign i 0 -%rep 16 - %if i < 8 - CAT_XDEFINE sizeofmm, i, 8 - %endif - CAT_XDEFINE sizeofxmm, i, 16 - CAT_XDEFINE 
sizeofymm, i, 32 -%assign i i+1 -%endrep -%undef i - -%macro CHECK_AVX_INSTR_EMU 3-* - %xdefine %%opcode %1 - %xdefine %%dst %2 - %rep %0-2 - %ifidn %%dst, %3 - %error non-avx emulation of ``%%opcode'' is not supported - %endif - %rotate 1 - %endrep -%endmacro - -;%1 == instruction -;%2 == 1 if float, 0 if int -;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise -;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not -;%5+: operands -%macro RUN_AVX_INSTR 5-8+ - %ifnum sizeof%6 - %assign %%sizeofreg sizeof%6 - %elifnum sizeof%5 - %assign %%sizeofreg sizeof%5 - %else - %assign %%sizeofreg mmsize - %endif - %assign %%emulate_avx 0 - %if avx_enabled && %%sizeofreg >= 16 - %xdefine %%instr v%1 - %else - %xdefine %%instr %1 - %if %0 >= 7+%3 - %assign %%emulate_avx 1 - %endif - %endif - - %if %%emulate_avx - %xdefine %%src1 %6 - %xdefine %%src2 %7 - %ifnidn %5, %6 - %if %0 >= 8 - CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8 - %else - CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7 - %endif - %if %4 && %3 == 0 - %ifnid %7 - ; 3-operand AVX instructions with a memory arg can only have it in src2, - ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). - ; So, if the instruction is commutative with a memory arg, swap them. - %xdefine %%src1 %7 - %xdefine %%src2 %6 - %endif - %endif - %if %%sizeofreg == 8 - MOVQ %5, %%src1 - %elif %2 - MOVAPS %5, %%src1 - %else - MOVDQA %5, %%src1 - %endif - %endif - %if %0 >= 8 - %1 %5, %%src2, %8 - %else - %1 %5, %%src2 - %endif - %elif %0 >= 8 - %%instr %5, %6, %7, %8 - %elif %0 == 7 - %%instr %5, %6, %7 - %elif %0 == 6 - %%instr %5, %6 - %else - %%instr %5 - %endif -%endmacro - -;%1 == instruction -;%2 == 1 if float, 0 if int -;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise -;%4 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not -%macro AVX_INSTR 1-4 0, 1, 0 - %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4 - %ifidn %2, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1 - %elifidn %3, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2 - %elifidn %4, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3 - %elifidn %5, fnord - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4 - %else - RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5 - %endif - %endmacro -%endmacro - -; Instructions with both VEX and non-VEX encodings -; Non-destructive instructions are written without parameters -AVX_INSTR addpd, 1, 0, 1 -AVX_INSTR addps, 1, 0, 1 -AVX_INSTR addsd, 1, 0, 1 -AVX_INSTR addss, 1, 0, 1 -AVX_INSTR addsubpd, 1, 0, 0 -AVX_INSTR addsubps, 1, 0, 0 -AVX_INSTR aesdec, 0, 0, 0 -AVX_INSTR aesdeclast, 0, 0, 0 -AVX_INSTR aesenc, 0, 0, 0 -AVX_INSTR aesenclast, 0, 0, 0 -AVX_INSTR aesimc -AVX_INSTR aeskeygenassist -AVX_INSTR andnpd, 1, 0, 0 -AVX_INSTR andnps, 1, 0, 0 -AVX_INSTR andpd, 1, 0, 1 -AVX_INSTR andps, 1, 0, 1 -AVX_INSTR blendpd, 1, 0, 0 -AVX_INSTR blendps, 1, 0, 0 -AVX_INSTR blendvpd, 1, 0, 0 -AVX_INSTR blendvps, 1, 0, 0 -AVX_INSTR cmppd, 1, 1, 0 -AVX_INSTR cmpps, 1, 1, 0 -AVX_INSTR cmpsd, 1, 1, 0 -AVX_INSTR cmpss, 1, 1, 0 -AVX_INSTR comisd -AVX_INSTR comiss -AVX_INSTR cvtdq2pd -AVX_INSTR cvtdq2ps -AVX_INSTR cvtpd2dq -AVX_INSTR cvtpd2ps -AVX_INSTR cvtps2dq -AVX_INSTR cvtps2pd -AVX_INSTR cvtsd2si -AVX_INSTR cvtsd2ss -AVX_INSTR cvtsi2sd -AVX_INSTR cvtsi2ss -AVX_INSTR cvtss2sd -AVX_INSTR cvtss2si -AVX_INSTR cvttpd2dq -AVX_INSTR cvttps2dq -AVX_INSTR cvttsd2si -AVX_INSTR cvttss2si -AVX_INSTR divpd, 1, 0, 0 -AVX_INSTR divps, 1, 0, 0 -AVX_INSTR divsd, 1, 0, 0 -AVX_INSTR divss, 1, 0, 0 -AVX_INSTR dppd, 1, 1, 0 -AVX_INSTR dpps, 1, 1, 0 -AVX_INSTR extractps -AVX_INSTR haddpd, 1, 0, 0 -AVX_INSTR haddps, 1, 0, 0 -AVX_INSTR hsubpd, 1, 0, 0 -AVX_INSTR hsubps, 1, 0, 0 -AVX_INSTR insertps, 1, 1, 0 -AVX_INSTR lddqu -AVX_INSTR ldmxcsr -AVX_INSTR maskmovdqu -AVX_INSTR maxpd, 1, 0, 1 
-AVX_INSTR maxps, 1, 0, 1 -AVX_INSTR maxsd, 1, 0, 1 -AVX_INSTR maxss, 1, 0, 1 -AVX_INSTR minpd, 1, 0, 1 -AVX_INSTR minps, 1, 0, 1 -AVX_INSTR minsd, 1, 0, 1 -AVX_INSTR minss, 1, 0, 1 -AVX_INSTR movapd -AVX_INSTR movaps -AVX_INSTR movd -AVX_INSTR movddup -AVX_INSTR movdqa -AVX_INSTR movdqu -AVX_INSTR movhlps, 1, 0, 0 -AVX_INSTR movhpd, 1, 0, 0 -AVX_INSTR movhps, 1, 0, 0 -AVX_INSTR movlhps, 1, 0, 0 -AVX_INSTR movlpd, 1, 0, 0 -AVX_INSTR movlps, 1, 0, 0 -AVX_INSTR movmskpd -AVX_INSTR movmskps -AVX_INSTR movntdq -AVX_INSTR movntdqa -AVX_INSTR movntpd -AVX_INSTR movntps -AVX_INSTR movq -AVX_INSTR movsd, 1, 0, 0 -AVX_INSTR movshdup -AVX_INSTR movsldup -AVX_INSTR movss, 1, 0, 0 -AVX_INSTR movupd -AVX_INSTR movups -AVX_INSTR mpsadbw, 0, 1, 0 -AVX_INSTR mulpd, 1, 0, 1 -AVX_INSTR mulps, 1, 0, 1 -AVX_INSTR mulsd, 1, 0, 1 -AVX_INSTR mulss, 1, 0, 1 -AVX_INSTR orpd, 1, 0, 1 -AVX_INSTR orps, 1, 0, 1 -AVX_INSTR pabsb -AVX_INSTR pabsd -AVX_INSTR pabsw -AVX_INSTR packsswb, 0, 0, 0 -AVX_INSTR packssdw, 0, 0, 0 -AVX_INSTR packuswb, 0, 0, 0 -AVX_INSTR packusdw, 0, 0, 0 -AVX_INSTR paddb, 0, 0, 1 -AVX_INSTR paddw, 0, 0, 1 -AVX_INSTR paddd, 0, 0, 1 -AVX_INSTR paddq, 0, 0, 1 -AVX_INSTR paddsb, 0, 0, 1 -AVX_INSTR paddsw, 0, 0, 1 -AVX_INSTR paddusb, 0, 0, 1 -AVX_INSTR paddusw, 0, 0, 1 -AVX_INSTR palignr, 0, 1, 0 -AVX_INSTR pand, 0, 0, 1 -AVX_INSTR pandn, 0, 0, 0 -AVX_INSTR pavgb, 0, 0, 1 -AVX_INSTR pavgw, 0, 0, 1 -AVX_INSTR pblendvb, 0, 0, 0 -AVX_INSTR pblendw, 0, 1, 0 -AVX_INSTR pclmulqdq, 0, 1, 0 -AVX_INSTR pcmpestri -AVX_INSTR pcmpestrm -AVX_INSTR pcmpistri -AVX_INSTR pcmpistrm -AVX_INSTR pcmpeqb, 0, 0, 1 -AVX_INSTR pcmpeqw, 0, 0, 1 -AVX_INSTR pcmpeqd, 0, 0, 1 -AVX_INSTR pcmpeqq, 0, 0, 1 -AVX_INSTR pcmpgtb, 0, 0, 0 -AVX_INSTR pcmpgtw, 0, 0, 0 -AVX_INSTR pcmpgtd, 0, 0, 0 -AVX_INSTR pcmpgtq, 0, 0, 0 -AVX_INSTR pextrb -AVX_INSTR pextrd -AVX_INSTR pextrq -AVX_INSTR pextrw -AVX_INSTR phaddw, 0, 0, 0 -AVX_INSTR phaddd, 0, 0, 0 -AVX_INSTR phaddsw, 0, 0, 0 -AVX_INSTR phminposuw -AVX_INSTR phsubw, 
0, 0, 0 -AVX_INSTR phsubd, 0, 0, 0 -AVX_INSTR phsubsw, 0, 0, 0 -AVX_INSTR pinsrb, 0, 1, 0 -AVX_INSTR pinsrd, 0, 1, 0 -AVX_INSTR pinsrq, 0, 1, 0 -AVX_INSTR pinsrw, 0, 1, 0 -AVX_INSTR pmaddwd, 0, 0, 1 -AVX_INSTR pmaddubsw, 0, 0, 0 -AVX_INSTR pmaxsb, 0, 0, 1 -AVX_INSTR pmaxsw, 0, 0, 1 -AVX_INSTR pmaxsd, 0, 0, 1 -AVX_INSTR pmaxub, 0, 0, 1 -AVX_INSTR pmaxuw, 0, 0, 1 -AVX_INSTR pmaxud, 0, 0, 1 -AVX_INSTR pminsb, 0, 0, 1 -AVX_INSTR pminsw, 0, 0, 1 -AVX_INSTR pminsd, 0, 0, 1 -AVX_INSTR pminub, 0, 0, 1 -AVX_INSTR pminuw, 0, 0, 1 -AVX_INSTR pminud, 0, 0, 1 -AVX_INSTR pmovmskb -AVX_INSTR pmovsxbw -AVX_INSTR pmovsxbd -AVX_INSTR pmovsxbq -AVX_INSTR pmovsxwd -AVX_INSTR pmovsxwq -AVX_INSTR pmovsxdq -AVX_INSTR pmovzxbw -AVX_INSTR pmovzxbd -AVX_INSTR pmovzxbq -AVX_INSTR pmovzxwd -AVX_INSTR pmovzxwq -AVX_INSTR pmovzxdq -AVX_INSTR pmuldq, 0, 0, 1 -AVX_INSTR pmulhrsw, 0, 0, 1 -AVX_INSTR pmulhuw, 0, 0, 1 -AVX_INSTR pmulhw, 0, 0, 1 -AVX_INSTR pmullw, 0, 0, 1 -AVX_INSTR pmulld, 0, 0, 1 -AVX_INSTR pmuludq, 0, 0, 1 -AVX_INSTR por, 0, 0, 1 -AVX_INSTR psadbw, 0, 0, 1 -AVX_INSTR pshufb, 0, 0, 0 -AVX_INSTR pshufd -AVX_INSTR pshufhw -AVX_INSTR pshuflw -AVX_INSTR psignb, 0, 0, 0 -AVX_INSTR psignw, 0, 0, 0 -AVX_INSTR psignd, 0, 0, 0 -AVX_INSTR psllw, 0, 0, 0 -AVX_INSTR pslld, 0, 0, 0 -AVX_INSTR psllq, 0, 0, 0 -AVX_INSTR pslldq, 0, 0, 0 -AVX_INSTR psraw, 0, 0, 0 -AVX_INSTR psrad, 0, 0, 0 -AVX_INSTR psrlw, 0, 0, 0 -AVX_INSTR psrld, 0, 0, 0 -AVX_INSTR psrlq, 0, 0, 0 -AVX_INSTR psrldq, 0, 0, 0 -AVX_INSTR psubb, 0, 0, 0 -AVX_INSTR psubw, 0, 0, 0 -AVX_INSTR psubd, 0, 0, 0 -AVX_INSTR psubq, 0, 0, 0 -AVX_INSTR psubsb, 0, 0, 0 -AVX_INSTR psubsw, 0, 0, 0 -AVX_INSTR psubusb, 0, 0, 0 -AVX_INSTR psubusw, 0, 0, 0 -AVX_INSTR ptest -AVX_INSTR punpckhbw, 0, 0, 0 -AVX_INSTR punpckhwd, 0, 0, 0 -AVX_INSTR punpckhdq, 0, 0, 0 -AVX_INSTR punpckhqdq, 0, 0, 0 -AVX_INSTR punpcklbw, 0, 0, 0 -AVX_INSTR punpcklwd, 0, 0, 0 -AVX_INSTR punpckldq, 0, 0, 0 -AVX_INSTR punpcklqdq, 0, 0, 0 -AVX_INSTR pxor, 0, 0, 1 -AVX_INSTR rcpps, 
1, 0, 0 -AVX_INSTR rcpss, 1, 0, 0 -AVX_INSTR roundpd -AVX_INSTR roundps -AVX_INSTR roundsd -AVX_INSTR roundss -AVX_INSTR rsqrtps, 1, 0, 0 -AVX_INSTR rsqrtss, 1, 0, 0 -AVX_INSTR shufpd, 1, 1, 0 -AVX_INSTR shufps, 1, 1, 0 -AVX_INSTR sqrtpd, 1, 0, 0 -AVX_INSTR sqrtps, 1, 0, 0 -AVX_INSTR sqrtsd, 1, 0, 0 -AVX_INSTR sqrtss, 1, 0, 0 -AVX_INSTR stmxcsr -AVX_INSTR subpd, 1, 0, 0 -AVX_INSTR subps, 1, 0, 0 -AVX_INSTR subsd, 1, 0, 0 -AVX_INSTR subss, 1, 0, 0 -AVX_INSTR ucomisd -AVX_INSTR ucomiss -AVX_INSTR unpckhpd, 1, 0, 0 -AVX_INSTR unpckhps, 1, 0, 0 -AVX_INSTR unpcklpd, 1, 0, 0 -AVX_INSTR unpcklps, 1, 0, 0 -AVX_INSTR xorpd, 1, 0, 1 -AVX_INSTR xorps, 1, 0, 1 - -; 3DNow instructions, for sharing code between AVX, SSE and 3DN -AVX_INSTR pfadd, 1, 0, 1 -AVX_INSTR pfsub, 1, 0, 0 -AVX_INSTR pfmul, 1, 0, 1 - -; base-4 constants for shuffles -%assign i 0 -%rep 256 - %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) - %if j < 10 - CAT_XDEFINE q000, j, i - %elif j < 100 - CAT_XDEFINE q00, j, i - %elif j < 1000 - CAT_XDEFINE q0, j, i - %else - CAT_XDEFINE q, j, i - %endif -%assign i i+1 -%endrep -%undef i -%undef j - -%macro FMA_INSTR 3 - %macro %1 4-7 %1, %2, %3 - %if cpuflag(xop) - v%5 %1, %2, %3, %4 - %else - %6 %1, %2, %3 - %7 %1, %4 - %endif - %endmacro -%endmacro - -FMA_INSTR pmacsdd, pmulld, paddd -FMA_INSTR pmacsww, pmullw, paddw -FMA_INSTR pmadcswd, pmaddwd, paddd - -; convert FMA4 to FMA3 if possible -%macro FMA4_INSTR 4 - %macro %1 4-8 %1, %2, %3, %4 - %if cpuflag(fma4) - v%5 %1, %2, %3, %4 - %elifidn %1, %2 - v%6 %1, %4, %3 ; %1 = %1 * %3 + %4 - %elifidn %1, %3 - v%7 %1, %2, %4 ; %1 = %2 * %1 + %4 - %elifidn %1, %4 - v%8 %1, %2, %3 ; %1 = %2 * %3 + %1 - %else - %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported - %endif - %endmacro -%endmacro - -FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd -FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps -FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd -FMA4_INSTR fmaddss, fmadd132ss, 
fmadd213ss, fmadd231ss - -FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd -FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps -FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd -FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps - -FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd -FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps -FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd -FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss - -FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd -FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps -FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd -FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss - -FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd -FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps -FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd -FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss - -; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug -%if ARCH_X86_64 == 0 -%macro vpbroadcastq 2 -%if sizeof%1 == 16 - movddup %1, %2 -%else - vbroadcastsd %1, %2 -%endif -%endmacro -%endif - -%ifidn __OUTPUT_FORMAT__,elf -section .note.GNU-stack noalloc noexec nowrite progbits -%endif -%ifidn __OUTPUT_FORMAT__,elf32 -section .note.GNU-stack noalloc noexec nowrite progbits -%endif -%ifidn __OUTPUT_FORMAT__,elf64 -section .note.GNU-stack noalloc noexec nowrite progbits -%endif
View file
kvazaar-2.2.0.tar.gz/.gitignore -> kvazaar-2.3.0.tar.gz/.gitignore
Changed
@@ -15,6 +15,7 @@ .deps .dirstamp .libs +.vs Makefile Makefile.in /aclocal.m4 @@ -53,5 +54,6 @@ src/kvazaar src/libkvazaar.so.* src/kvazaar.pc +src/version.h tests/kvazaar_tests tests/kvazaar_tests.trs
View file
kvazaar-2.3.0.tar.gz/CMakeLists.txt
Added
@@ -0,0 +1,391 @@ +cmake_minimum_required(VERSION 3.12) + +project(kvazaar +LANGUAGES C CXX +HOMEPAGE_URL https://github.com/ultravideo/kvazaar +DESCRIPTION "An open-source VVC encoder licensed under 3-clause BSD" +VERSION 2.3.0 ) + +option(BUILD_SHARED_LIBS "Build using shared kvazaar library" ON) + +option(BUILD_TESTS "Build tests" ON) + + +include(GNUInstallDirs) #Helps to define correct distro specific install directories + +set(KVAZAAR_INSTALL_LIBDIR "${CMAKE_INSTALL_LIBDIR}" CACHE PATH "kvazaar library install path") +set(KVAZAAR_INSTALL_BINDIR "${CMAKE_INSTALL_BINDIR}" CACHE PATH "kvazaar binary install path") +set(KVAZAAR_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}" CACHE PATH "kvazaar include install path") +set(KVAZAAR_INSTALL_MANDIR "${CMAKE_INSTALL_MANDIR}/man1" CACHE PATH "kvazaar manual page file install path") + +# https://www.kitware.com/cmake-and-the-default-build-type/ +# Set a default build type if none was specified +set(KVZ_DEFAULT_BUILD_TYPE "RelWithDebInfo") + +if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + message(STATUS "No build type specified, setting to '${KVZ_DEFAULT_BUILD_TYPE}'.") + set(CMAKE_BUILD_TYPE "${KVZ_DEFAULT_BUILD_TYPE}" CACHE + STRING "Choose the type of build." 
FORCE) + # Set the possible values of build type for cmake-gui + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS + "Debug" "Release" "MinSizeRel" "RelWithDebInfo") +endif() + + +find_package(Git QUIET) +if(GIT_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git") + # Update submodules as needed + option(GIT_SUBMODULE "Check submodules during build" ON) + if(GIT_SUBMODULE) + message(STATUS "Submodule update") + execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --recursive + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE GIT_SUBMOD_RESULT) + if(NOT GIT_SUBMOD_RESULT EQUAL "0") + message(WARNING "git submodule update --init --recursive failed with ${GIT_SUBMOD_RESULT}, please checkout submodules") + endif() + endif() + # Check git hash and fetch tag + execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE GIT_HEAD_OK + OUTPUT_VARIABLE GIT_HEAD) + if(GIT_HEAD_OK EQUAL "0") + string(SUBSTRING ${GIT_HEAD} 0 30 GIT_TAG_LONG) + execute_process(COMMAND ${GIT_EXECUTABLE} name-rev --tags --name-only ${GIT_TAG_LONG} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE GIT_TAG_OK + OUTPUT_VARIABLE GIT_TAG) + string(SUBSTRING ${GIT_TAG} 0 9 GIT_TAG_STRIP) + + # If tag is not defined, add part of the commit hash to the version + if(GIT_TAG_OK EQUAL "0" AND GIT_TAG_STRIP STREQUAL "undefined") + string(SUBSTRING ${GIT_HEAD} 0 7 GIT_TAG_SHORT) + set(PROJECT_VERSION ${PROJECT_VERSION}-${GIT_TAG_SHORT}) + message(INFO " No tag detected, version changed to ${PROJECT_VERSION}") + endif() + endif() +endif() + +if(NOT EXISTS "${PROJECT_SOURCE_DIR}/greatest/greatest.h") + message(WARNING "The submodule greatest was not loaded, some tests may fail") +endif() + +# Grab <year>-<month>-<day> timestamp for debug purposes +string(TIMESTAMP CMAKE_BUILD_DATE %Y-%m-%d) + +set(KVZ_COMPILER_VERSION "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") + +if(MSVC) + if(MSVC_VERSION LESS 
1800) + set(KVZ_COMPILER_VERSION "VS") + elseif(MSVC_VERSION LESS 1900) + set(KVZ_COMPILER_VERSION "VS2013") + elseif(MSVC_VERSION LESS 1910) + set(KVZ_COMPILER_VERSION "VS2015") + elseif(MSVC_VERSION LESS 1920) + set(KVZ_COMPILER_VERSION "VS2017") + elseif(MSVC_VERSION LESS 1930) + set(KVZ_COMPILER_VERSION "VS2019") + else() + set(KVZ_COMPILER_VERSION "VS2022") + endif() +endif() + +# Set compiler info to print at runtime +set(KVZ_COMPILER_STRING "${KVZ_COMPILER_VERSION}") + +add_definitions(-DCMAKE_BUILD) + +# Apply dynamic info to the config files +configure_file("${PROJECT_SOURCE_DIR}/src/kvazaar.pc.in" "${PROJECT_SOURCE_DIR}/src/kvazaar.pc" @ONLY) +configure_file("${PROJECT_SOURCE_DIR}/src/version.h.in" "${PROJECT_SOURCE_DIR}/src/version.h" @ONLY) + +# Add all sources in src/ base +file(GLOB LIB_SOURCES RELATIVE ${PROJECT_SOURCE_DIR} "src/*.h" "src/*.c") + +# We don't want CLI main in the library +list(REMOVE_ITEM LIB_SOURCES "src/encmain.c" "src/cli.c" "src/cli.h" "src/yuv_io.c" "src/yuv_io.h") + +# Add also all the strategies +file(GLOB_RECURSE LIB_SOURCES_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.c") + +list(APPEND LIB_SOURCES ${LIB_SOURCES_STRATEGIES}) + +# We also need the libmd5 +list(APPEND LIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/extras/libmd5.c) + +add_definitions(-DKVZ_DLL_EXPORTS) + +if(BUILD_SHARED_LIBS) + add_definitions(-DPIC) +endif() + +# For visual studio / windows we also need our own pthread implementation and getopt +if(MSVC) + list(APPEND LIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/extras/getopt.c ${CMAKE_CURRENT_SOURCE_DIR}/src/threadwrapper/src/pthread.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/threadwrapper/src/semaphore.cpp) + add_definitions(-DWIN32_LEAN_AND_MEAN -D_WIN32 -DWIN32 -DWIN64) +endif() + +if(BUILD_SHARED_LIBS) + list( APPEND CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib" "./" "../lib" ) + set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) + add_library(kvazaar SHARED ${LIB_SOURCES}) +else() + add_library(kvazaar 
STATIC ${LIB_SOURCES}) + if(MSVC) # Fix a linking problem with visual studio when the library is the same name as the binary + set_target_properties(kvazaar PROPERTIES OUTPUT_NAME libkvazaar) + endif() + +endif() + +target_include_directories(kvazaar PUBLIC src) +target_include_directories(kvazaar PUBLIC src/extras) +target_include_directories(kvazaar PUBLIC src/strategies) + +file(GLOB LIB_SOURCES_STRATEGIES_AVX2 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/avx2/*.c") +file(GLOB LIB_SOURCES_STRATEGIES_SSE41 RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/sse41/*.c") + +set(CLI_SOURCES "src/encmain.c" "src/cli.c" "src/cli.h" "src/yuv_io.c" "src/yuv_io.h") + +# Add the getopt and pthread for visual studio +if(MSVC) + list(APPEND CLI_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/extras/getopt.c ${CMAKE_CURRENT_SOURCE_DIR}/src/threadwrapper/src/pthread.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/threadwrapper/src/semaphore.cpp) +endif() + +add_executable(kvazaar-bin ${CLI_SOURCES}) + +set_target_properties(kvazaar-bin PROPERTIES OUTPUT_NAME kvazaar) +set_target_properties(kvazaar-bin PROPERTIES RUNTIME_OUTPUT_NAME kvazaar) + +target_link_libraries(kvazaar-bin PUBLIC kvazaar) + +if(MSVC) + target_include_directories(kvazaar PUBLIC src/threadwrapper/include) + set_property( SOURCE ${LIB_SOURCES_STRATEGIES_AVX2} APPEND PROPERTY COMPILE_FLAGS "/arch:AVX2" ) +else() + set_target_properties(kvazaar-bin PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/src) + list(APPEND ALLOW_AVX2 "x86_64" "AMD64") + if(${CMAKE_SYSTEM_PROCESSOR} IN_LIST ALLOW_AVX2) + set_property( SOURCE ${LIB_SOURCES_STRATEGIES_AVX2} APPEND PROPERTY COMPILE_FLAGS "-mavx2 -mbmi -mpopcnt -mlzcnt -mbmi2" ) + set_property( SOURCE ${LIB_SOURCES_STRATEGIES_SSE41} APPEND PROPERTY COMPILE_FLAGS "-msse4.1" ) + endif() + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) + target_link_libraries(kvazaar PUBLIC Threads::Threads) + + include(CheckLibraryExists) + + CHECK_LIBRARY_EXISTS(m sin "" 
HAVE_LIB_M) + + if (HAVE_LIB_M) + set(EXTRA_LIBS ${EXTRA_LIBS} m) + endif (HAVE_LIB_M) + + target_link_libraries(kvazaar-bin PUBLIC ${EXTRA_LIBS}) +endif() + +# Source grouping + +# Some basic structuring of the files based on previous visual studio project files +file(GLOB SOURCE_GROUP_BITSTREAM RELATIVE ${PROJECT_SOURCE_DIR} "src/encode_coding_tree.*" "src/encoder_state-bitstream.*" "src/nal.*") +file(GLOB SOURCE_GROUP_CABAC RELATIVE ${PROJECT_SOURCE_DIR} "src/bitstream.*" "src/cabac.*" "src/context.*") +file(GLOB SOURCE_GROUP_COMPRESSION RELATIVE ${PROJECT_SOURCE_DIR} "src/search*" "src/rdo.*" "src/fast_coeff*") +file(GLOB SOURCE_GROUP_CONSTRAINT RELATIVE ${PROJECT_SOURCE_DIR} "src/constraint.*" "src/ml_*") +file(GLOB SOURCE_GROUP_CONTROL RELATIVE ${PROJECT_SOURCE_DIR} "src/cfg.*" "src/encoder.*" "src/encoder_state-c*" "src/encoder_state-g*" "src/encoderstate*" "src/gop.*" "src/input_frame_buffer.*" "src/kvazaar*" "src/rate_control.*" "src/mip_data.h") +file(GLOB SOURCE_GROUP_DATA_STRUCTURES RELATIVE ${PROJECT_SOURCE_DIR} "src/cu.*" "src/image.*" "src/imagelist.*" "src/videoframe.*") +file(GLOB SOURCE_GROUP_EXTRAS RELATIVE ${PROJECT_SOURCE_DIR} "src/extras/*.h" "src/extras/*.c") +file(GLOB_RECURSE SOURCE_GROUP_STRATEGIES RELATIVE ${PROJECT_SOURCE_DIR} "src/strategies/*.h" "src/strategies/*.c") +file(GLOB SOURCE_GROUP_RECON RELATIVE ${PROJECT_SOURCE_DIR} "src/alf.*" "src/filter.*" "src/inter.*" "src/intra.*" "src/reshape.*" "src/sao.*" "src/scalinglist.*" "src/tables.*" "src/transform.*") +file(GLOB SOURCE_GROUP_THREADING RELATIVE ${PROJECT_SOURCE_DIR} "src/threadqueue.*" "src/threads.*") +file(GLOB_RECURSE SOURCE_GROUP_THREADWRAPPER RELATIVE ${PROJECT_SOURCE_DIR} "src/threadwrapper/*.cpp" "src/threadwrapper/*.h") +file(GLOB SOURCE_GROUP_TOPLEVEL RELATIVE ${PROJECT_SOURCE_DIR} "src/debug.*" "src/global.h" "src/version.h" "src/kvz_math.h" "src/checkpoint.*") + +source_group( "Bitstream" FILES ${SOURCE_GROUP_BITSTREAM}) +source_group( "CABAC" FILES 
${SOURCE_GROUP_CABAC}) +source_group( "Compression" FILES ${SOURCE_GROUP_COMPRESSION}) +source_group( "Constraint" FILES ${SOURCE_GROUP_CONSTRAINT}) +source_group( "Control" FILES ${SOURCE_GROUP_CONTROL}) +source_group( "Data Structures" FILES ${SOURCE_GROUP_DATA_STRUCTURES}) +source_group( "Extras" FILES ${SOURCE_GROUP_EXTRAS}) + +# Handle the strategies directory structure better in visual studio +if(MSVC) + foreach(source IN LISTS SOURCE_GROUP_STRATEGIES) + get_filename_component(source_path "${source}" PATH) + string(REPLACE "src/" "" source_path_msvc "${source_path}") + string(REPLACE "/" "\\" source_path_msvc "${source_path_msvc}") + source_group("Optimization\\${source_path_msvc}" FILES "${source}") + endforeach() +else() + source_group( "Optimization" FILES ${SOURCE_GROUP_STRATEGIES}) +endif() +source_group( "Optimization" FILES "src/strategyselector.c" "src/strategyselector.h") + +source_group( "Reconstruction" FILES ${SOURCE_GROUP_RECON}) +source_group( "Threading" FILES ${SOURCE_GROUP_THREADING}) +source_group( "Threadwrapper" FILES ${SOURCE_GROUP_THREADWRAPPER}) +source_group( "" FILES ${SOURCE_GROUP_TOPLEVEL}) + +# INSTALL + +# ToDo: make configurable + +install(FILES ${PROJECT_SOURCE_DIR}/src/kvazaar.pc DESTINATION ${CMAKE_INSTALL_PREFIX}/share/pkgconfig) +install(TARGETS kvazaar-bin DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) +install(TARGETS kvazaar DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) +if(BUILD_SHARED_LIBS) # Just add the lib to the bin directory for now + if(MSVC) + install(TARGETS kvazaar DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) + endif() +endif() +install(FILES ${PROJECT_SOURCE_DIR}/src/kvazaar.h DESTINATION ${CMAKE_INSTALL_PREFIX}/include) +install(FILES ${PROJECT_SOURCE_DIR}/doc/kvazaar.1 DESTINATION ${CMAKE_INSTALL_PREFIX}/share/man) + +IF(UNIX) +# DIST + +set(GIT_LS_TREE_OK "1") + +# By default grab the list of files in the git repo +if(GIT_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git") + execute_process(COMMAND ${GIT_EXECUTABLE} ls-tree 
--name-only -r HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE GIT_LS_TREE_OK + OUTPUT_VARIABLE GIT_LS_TREE) + if(GIT_LS_TREE_OK EQUAL "0") + string(REGEX REPLACE "\n" ";" GIT_LS_TREE "${GIT_LS_TREE}") + string(REGEX REPLACE "\r" "" GIT_LS_TREE "${GIT_LS_TREE}") + list(APPEND DIST_SOURCES ${GIT_LS_TREE}) + endif() +endif() +if(NOT GIT_LS_TREE_OK EQUAL "0") + file(GLOB_RECURSE DIST_SOURCES RELATIVE ${PROJECT_SOURCE_DIR} "src/*.c" "src/*.h" "tests/*.sh" "tools/*.sh" "tools/*.py" ".github/*.yml" "src/*.in" "placeholder.txt" "CMakeLists.txt" "doc/*" "examples/*" "rdcost-weight-tool/*" "greatest/*.h" "greatest/*.md") + list(APPEND DIST_SOURCES ".clang-format" ".gitignore" ".gitmodules" "tests/tsan_suppressions.txt" ".travis-install.bash" "CREDITS" "Dockerfile" "docs.doxy" ".gitlab-ci.yml" "LICENSE" "LICENSE.EXT.greatest" "README.md") +endif() + +add_custom_target(dist + COMMAND echo \"Writing log to ${PROJECT_SOURCE_DIR}/dist.log\" + && tar -zcvf "${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz" --transform 's,^,${PROJECT_NAME}-${PROJECT_VERSION}/,' -- ${DIST_SOURCES} > dist.log 2>&1 || { echo \"\\0330;31mfailed to pack ${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz, check ${PROJECT_SOURCE_DIR}/dist.log.\\033\m\"$<SEMICOLON> exit 1$<SEMICOLON> } + COMMENT "Make distribution ${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz" + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + BYPRODUCTS ${CMAKE_SOURCE_DIR}/${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz + ) + +# DISTCHECK + +set(TEMP_DISTCHECK_DIR "_distcheck") + +add_custom_target(distcheck + COMMAND echo \"Writing log to ${PROJECT_SOURCE_DIR}/distcheck.log\" + && cd ${PROJECT_SOURCE_DIR} + && mkdir -p ${TEMP_DISTCHECK_DIR} + && cd ${TEMP_DISTCHECK_DIR} + && tar -zxf ${CMAKE_SOURCE_DIR}/${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz > ${PROJECT_SOURCE_DIR}/distcheck.log || { echo \"\\0330;31mfailed to unpack ${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz.\\033\m\"$<SEMICOLON> exit 1$<SEMICOLON> } + && echo \"\\0330;32mFile unpack 
ok\\033m\" + && cd ${PROJECT_NAME}-${PROJECT_VERSION} + && mkdir -p build + && cd build + && cmake -DCMAKE_INSTALL_PREFIX=./ -DBUILD_SHARED_LIBS=OFF -G "Unix Makefiles" .. >> ${PROJECT_SOURCE_DIR}/distcheck.log || { echo \"\\0330;31mcmake failed to configure.\\033m\"$<SEMICOLON> exit 1$<SEMICOLON> } + && echo \"\\0330;32mCMake configure ok\\033m\" + && make -j >> ${PROJECT_SOURCE_DIR}/distcheck.log || { echo \"\\0330;31mmake failed.\\033m\"$<SEMICOLON> exit 1$<SEMICOLON> } + && echo \"\\0330;32mMake ok\\033m\" + # Full tests might be too demanding to run, enable with parameter? + #&& make test || (echo \"\\e0;31mmake test failed.\\033m\" && false) + && tests/kvazaar_tests >> ${PROJECT_SOURCE_DIR}/distcheck.log 2>&1 || { echo \"\\0330;31mtests failed.\\033m\"$<SEMICOLON> exit 1$<SEMICOLON> } + && echo \"\\0330;32mTests ok\\033m\" + && make install >> ${PROJECT_SOURCE_DIR}/distcheck.log || { echo \"\\0330;31mmake install failed.\\033m\"$<SEMICOLON> exit 1$<SEMICOLON> } + && echo \"\\0330;32mInstall ok\\033m\" + && bin/kvzaar --help >> ${PROJECT_SOURCE_DIR}/distcheck.log || { echo \"\\0330;31mkvazaar binary failed to run.\\033m\"$<SEMICOLON> exit 1$<SEMICOLON> } + && echo \"\\0330;32mbin/kvazaar ok\\033m\" + && make clean >> ${PROJECT_SOURCE_DIR}/distcheck.log || { echo \"\\0330;31mmake clean failed.\\033m\"$<SEMICOLON> exit 1$<SEMICOLON> } + && echo \"\\0330;32mmake clean ok\\033m\" + && cd ${PROJECT_SOURCE_DIR} + && rm -rf "${PROJECT_SOURCE_DIR}/${TEMP_DISTCHECK_DIR}" + && echo \"\\0330;32m==============================================================\\033m\" + && echo \"\\0330;32m${PROJECT_NAME}-${PROJECT_VERSION} archives ready for distribution:\\033m\" + && echo \"\\0330;32m${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz\\033m\" + && echo \"\\0330;32m==============================================================\\033m\" + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + DEPENDS ${CMAKE_SOURCE_DIR}/${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz + COMMENT "Checking 
${PROJECT_NAME}-${PROJECT_VERSION}.tar.gz.." + ) +endif() #Unix + +# TESTS +enable_testing() + +if(MSVC OR MINGW OR MSYS) + if(BUILD_SHARED_LIBS) + set(BUILD_TESTS OFF) + message(INFO " Disable test building, fails in MSVC/MINGW/MSYS2 when building shared binaries") + endif() +endif() + +if(EXISTS "${PROJECT_SOURCE_DIR}/greatest/greatest.h" AND BUILD_TESTS) + add_subdirectory( "tests/" ) + add_test( NAME Test_kvazaar COMMAND kvazaar_tests ) +endif() + +if(NOT DEFINED MSVC) + list(APPEND XFAIL "off") + if(DEFINED ENV{XFAIL_TESTS}) + list(APPEND XFAIL $ENV{XFAIL_TESTS}) + endif() + + if(NOT "test_tools.sh" IN_LIST XFAIL) + add_test( NAME test_tools COMMAND ${PROJECT_SOURCE_DIR}/tests/test_tools.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_smp.sh" IN_LIST XFAIL) + add_test( NAME test_smp COMMAND ${PROJECT_SOURCE_DIR}/tests/test_smp.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_pu_depth_constraints.sh" IN_LIST XFAIL) + add_test( NAME test_pu_depth_constraints COMMAND ${PROJECT_SOURCE_DIR}/tests/test_pu_depth_constraints.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_mv_constraint.sh" IN_LIST XFAIL) + add_test( NAME test_mv_constraint COMMAND ${PROJECT_SOURCE_DIR}/tests/test_mv_constraint.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_interlace.sh" IN_LIST XFAIL) + add_test( NAME test_interlace COMMAND ${PROJECT_SOURCE_DIR}/tests/test_interlace.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_gop.sh" IN_LIST XFAIL) + add_test( NAME test_gop COMMAND ${PROJECT_SOURCE_DIR}/tests/test_gop.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_owf_wpp_tiles.sh" IN_LIST XFAIL) + add_test( NAME test_owf_wpp_tiles COMMAND ${PROJECT_SOURCE_DIR}/tests/test_owf_wpp_tiles.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_weird_shapes.sh" IN_LIST XFAIL) + add_test( NAME 
test_weird_shapes COMMAND ${PROJECT_SOURCE_DIR}/tests/test_weird_shapes.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_external_symbols.sh" IN_LIST XFAIL) + add_test( NAME test_external_symbols COMMAND ${PROJECT_SOURCE_DIR}/tests/test_external_symbols.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "util.sh" IN_LIST XFAIL) + add_test( NAME util COMMAND ${PROJECT_SOURCE_DIR}/tests/util.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_invalid_input.sh" IN_LIST XFAIL) + add_test( NAME test_invalid_input COMMAND ${PROJECT_SOURCE_DIR}/tests/test_invalid_input.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_slices.sh" IN_LIST XFAIL) + add_test( NAME test_slices COMMAND ${PROJECT_SOURCE_DIR}/tests/test_slices.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_intra.sh" IN_LIST XFAIL) + add_test( NAME test_intra COMMAND ${PROJECT_SOURCE_DIR}/tests/test_intra.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() + + if(NOT "test_rate_control.sh" IN_LIST XFAIL) + add_test( NAME test_rate_control COMMAND ${PROJECT_SOURCE_DIR}/tests/test_rate_control.sh WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests) + endif() +endif()
View file
kvazaar-2.2.0.tar.gz/Dockerfile -> kvazaar-2.3.0.tar.gz/Dockerfile
Changed
@@ -15,12 +15,12 @@ # # Use Ubuntu 18.04 as a base for now, it's around 88MB -FROM ubuntu:18.04 +FROM ubuntu:20.04 MAINTAINER Marko Viitanen <fador@iki.fi> # List of needed packages to be able to build kvazaar with autotools -ENV REQUIRED_PACKAGES automake autoconf libtool m4 build-essential git yasm pkgconf +ENV REQUIRED_PACKAGES automake autoconf libtool m4 build-essential git pkgconf COPY . kvazaar # Run all the commands in one RUN so we don't have any extra history
View file
kvazaar-2.2.0.tar.gz/Makefile.am -> kvazaar-2.3.0.tar.gz/Makefile.am
Changed
@@ -4,7 +4,7 @@ dist_man1_MANS = doc/kvazaar.1 -dist_doc_DATA = LICENSE LICENSE.EXT.greatest LICENSE.EXT.x264asm CREDITS README.md +dist_doc_DATA = LICENSE LICENSE.EXT.greatest CREDITS README.md EXTRA_DIST = \ build \
View file
kvazaar-2.2.0.tar.gz/README.md -> kvazaar-2.3.0.tar.gz/README.md
Changed
@@ -51,6 +51,8 @@ comment: # "BEGIN KVAZAAR HELP MESSAGE" ``` +Kvazaar v2.3.0 2024-01-17 +Kvazaar license: 3-clause BSD Usage: kvazaar -i <input> --input-res <width>x<height> -o <output> @@ -95,6 +97,8 @@ - md5: 56 bytes --(no-)psnr : Calculate PSNR for frames. enabled --(no-)info : Add encoder info SEI. enabled + --(no-)enable-logging : Enable logging for regular encoder performance, + error messages are always displayed. enabled --crypto <string> : Selective encryption. Crypto support must be enabled at compile-time. Can be 'on' or 'off' or a list of features separated with a '+'. off @@ -422,11 +426,10 @@ improve in the build process. We want to make this as simple as possible. - ### Autotools Depending on the platform, some additional tools are required for compiling Kvazaar with autotools. -For Ubuntu, the required packages are `automake autoconf libtool m4 build-essential yasm`. Yasm is -optional, but some of the optimization will not be compiled in if it's missing. +For Ubuntu, the required packages are `automake autoconf libtool m4 build-essential`. + Run the following commands to compile and install Kvazaar. @@ -437,6 +440,7 @@ sudo ldconfig See `./configure --help` for more options. +**When building shared library with visual studio the tests will fail to link, the main binary will still work** ### Autotools on MinGW It is recommended to use Clang instead of GCC in MinGW environments. GCC also works, but AVX2 optimizations will be disabled because of a known GCC issue from 2012, so performance will suffer badly. Instead of `./configure`, run @@ -445,6 +449,11 @@ to build Kvazaar using Clang. +### CMake +Depending on the platform, some additional tools are required for compiling Kvazaar with CMake. +For Ubuntu, the required packages are `build-essential cmake`. + + ### OS X - Install Homebrew - run ```brew install automake libtool yasm``` @@ -482,7 +491,7 @@ Please cite this paper(https://dl.acm.org/citation.cfm?doid=2964284.2973796) for Kvazaar:
Viitanen, A. Koivula, A. Lemmetti, A. Ylä-Outinen, J. Vanne, and T. D. Hämäläinen, âKvazaar: open-source HEVC/H.265 encoder,â in Proc. ACM Int. Conf. Multimedia, Amsterdam, The Netherlands, Oct. 2016.``` +```M. Viitanen, A. Koivula, A. Lemmetti, A. Ylä-Outinen, J. Vanne, and T. D. Hämäläinen, Kvazaar: open-source HEVC/H.265 encoder, in Proc. ACM Int. Conf. Multimedia, Amsterdam, The Netherlands, Oct. 2016.``` Or in BibTex: @@ -522,7 +531,7 @@ - Main automatic way of testing is with Travis CI. Commits, branches and pull requests are tested automatically. - Uninitialized variables and such are checked with Valgrind. - - Bitstream validity is checked with HM. + - Bitstream validity is checked with VTM. - Compilation is checked on GCC and Clang on Linux, and Clang on OSX. - Windows msys2 and msvc builds are checked automatically on Appveyor. - If your changes change the bitstream, decode with HM to check that
View file
kvazaar-2.2.0.tar.gz/appveyor.yml -> kvazaar-2.3.0.tar.gz/appveyor.yml
Changed
@@ -72,11 +72,6 @@ - MSYSTEM: MINGW32 - MSYSTEM: MINGW64 - install: - - ps: $url = "http://ultravideo.cs.tut.fi/vsyasm.exe" - - ps: $output = "C:\Tools\vsyasm.exe" - - ps: "(New-Object System.Net.WebClient).DownloadFile($url, $output)" - - ps: '$env:Path += ";$output\.."' build: project: .\build\kvazaar_VS2015.sln
View file
kvazaar-2.2.0.tar.gz/build/C_Properties.props -> kvazaar-2.3.0.tar.gz/build/C_Properties.props
Changed
@@ -24,10 +24,6 @@ <SubSystem>Console</SubSystem> <RandomizedBaseAddress>false</RandomizedBaseAddress> </Link> - <YASM> - <Defines>HAVE_ALIGNED_STACK=1</Defines> - <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths)</IncludePaths> - </YASM> </ItemDefinitionGroup> <ItemGroup /> </Project> \ No newline at end of file
View file
kvazaar-2.2.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj -> kvazaar-2.3.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj
Changed
@@ -46,9 +46,6 @@ <PlatformToolset>v140</PlatformToolset> </PropertyGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> - <ImportGroup Label="ExtensionSettings"> - <Import Project="..\yasm\vsyasm.props" /> - </ImportGroup> <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <Import Project="..\C_Properties.props" /> </ImportGroup> @@ -77,23 +74,14 @@ <OutDir>$(SolutionDir)$(Platform)-$(Configuration)-libs\</OutDir> </PropertyGroup> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> - <YASM /> <Lib> <AdditionalLibraryDirectories> </AdditionalLibraryDirectories> <AdditionalDependencies> </AdditionalDependencies> </Lib> - <YASM> - <Defines>ARCH_X86_64=1;%(Defines)</Defines> - <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths> - </YASM> </ItemDefinitionGroup> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> - <YASM> - <Defines>ARCH_X86_64=0;PREFIX</Defines> - <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths> - </YASM> <Lib> <AdditionalLibraryDirectories> </AdditionalLibraryDirectories> @@ -106,10 +94,6 @@ </ClCompile> </ItemDefinitionGroup> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> - <YASM> - <Defines>ARCH_X86_64=0;PREFIX</Defines> - <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths> - </YASM> <Lib> <AdditionalLibraryDirectories> </AdditionalLibraryDirectories> @@ -122,10 +106,6 @@ </ClCompile> </ItemDefinitionGroup> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> - <YASM> - <Defines>ARCH_X86_64=1;%(Defines)</Defines> - <IncludePaths>$(SolutionDir)..\src\extras;%(IncludePaths);$(SolutionDir)..\src\strategies\x86-asm;</IncludePaths> - </YASM> <Lib> <AdditionalLibraryDirectories> 
</AdditionalLibraryDirectories> @@ -239,7 +219,6 @@ <ClCompile Include="..\..\src\strategies\strategies-nal.c" /> <ClCompile Include="..\..\src\strategies\strategies-picture.c" /> <ClCompile Include="..\..\src\strategies\strategies-sao.c" /> - <ClCompile Include="..\..\src\strategies\x86_asm\picture-x86-asm.c" /> <ClCompile Include="..\..\src\threadwrapper\src\pthread.cpp"> <CompileAs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">CompileAsCpp</CompileAs> <CompileAs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">CompileAsCpp</CompileAs> @@ -318,9 +297,6 @@ <ClInclude Include="..\..\src\strategies\strategies-nal.h" /> <ClInclude Include="..\..\src\strategies\strategies-picture.h" /> <ClInclude Include="..\..\src\strategies\strategies-sao.h" /> - <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm-sad.h" /> - <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm-satd.h" /> - <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm.h" /> <ClInclude Include="..\..\src\strategyselector.h" /> <ClInclude Include="..\..\src\tables.h" /> <ClInclude Include="..\..\src\threadqueue.h" /> @@ -330,18 +306,5 @@ <ClInclude Include="..\..\src\transform.h" /> <ClInclude Include="..\..\src\videoframe.h" /> </ItemGroup> - <ItemGroup> - <YASM Include="..\..\src\extras\x86inc.asm"> - <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild> - <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild> - <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild> - <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild> - </YASM> - <YASM Include="..\..\src\strategies\x86_asm\picture-x86-asm-sad.asm" /> - <YASM Include="..\..\src\strategies\x86_asm\picture-x86-asm-satd.asm" /> - </ItemGroup> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> - 
<ImportGroup Label="ExtensionTargets"> - <Import Project="..\yasm\vsyasm.targets" /> - </ImportGroup> -</Project> +</Project> \ No newline at end of file
View file
kvazaar-2.2.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters -> kvazaar-2.3.0.tar.gz/build/kvazaar_lib/kvazaar_lib.vcxproj.filters
Changed
@@ -34,9 +34,6 @@ <Filter Include="Optimization\strategies\avx2"> <UniqueIdentifier>{4ffb5d27-c5bb-44d5-a935-fa93066a259e}</UniqueIdentifier> </Filter> - <Filter Include="Optimization\strategies\x86_asm"> - <UniqueIdentifier>{d0ce7d00-30c6-4e8a-b96e-51e13cb038ea}</UniqueIdentifier> - </Filter> <Filter Include="CABAC"> <UniqueIdentifier>{c696e039-5ba4-48ab-845d-cfe1a5713525}</UniqueIdentifier> </Filter> @@ -81,9 +78,6 @@ <ClCompile Include="..\..\src\strategies\avx2\picture-avx2.c"> <Filter>Optimization\strategies\avx2</Filter> </ClCompile> - <ClCompile Include="..\..\src\strategies\x86_asm\picture-x86-asm.c"> - <Filter>Optimization\strategies\x86_asm</Filter> - </ClCompile> <ClCompile Include="..\..\src\strategies\avx2\dct-avx2.c"> <Filter>Optimization\strategies\avx2</Filter> </ClCompile> @@ -375,15 +369,6 @@ <ClInclude Include="..\..\src\strategies\strategies-quant.h"> <Filter>Optimization\strategies</Filter> </ClInclude> - <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm.h"> - <Filter>Optimization\strategies\x86_asm</Filter> - </ClInclude> - <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm-sad.h"> - <Filter>Optimization\strategies\x86_asm</Filter> - </ClInclude> - <ClInclude Include="..\..\src\strategies\x86_asm\picture-x86-asm-satd.h"> - <Filter>Optimization\strategies\x86_asm</Filter> - </ClInclude> <ClInclude Include="..\..\src\strategies\sse41\picture-sse41.h"> <Filter>Optimization\strategies\sse41</Filter> </ClInclude> @@ -478,15 +463,4 @@ <Filter>Control</Filter> </ClInclude> </ItemGroup> - <ItemGroup> - <YASM Include="..\..\src\extras\x86inc.asm"> - <Filter>Extras</Filter> - </YASM> - <YASM Include="..\..\src\strategies\x86_asm\picture-x86-asm-sad.asm"> - <Filter>Optimization\strategies\x86_asm</Filter> - </YASM> - <YASM Include="..\..\src\strategies\x86_asm\picture-x86-asm-satd.asm"> - <Filter>Optimization\strategies\x86_asm</Filter> - </YASM> - </ItemGroup> -</Project> +</Project> \ No newline at end of file
View file
kvazaar-2.2.0.tar.gz/configure.ac -> kvazaar-2.3.0.tar.gz/configure.ac
Changed
@@ -23,7 +23,7 @@ # # Here is a somewhat sane guide to lib versioning: http://apr.apache.org/versioning.html ver_major=7 -ver_minor=2 +ver_minor=3 ver_release=0 # Prevents configure from adding a lot of defines to the CFLAGS @@ -137,6 +137,22 @@ , cygwin*|msys*|mingw*, CFLAGS="$CFLAGS -D__USE_MINGW_ANSI_STDIO=1" + # Fix a bug in mingw gcc where stack doesn't get aligned properly, force all AVX instructions to be unaligned + AS_CASE($CC, *gcc, + AX_CHECK_COMPILE_FLAG(-Wa,-muse-unaligned-vector-move, + CFLAGS="-Wa,-muse-unaligned-vector-move $CFLAGS", + + AC_MSG_CHECKING(if compiler is gcc) + AS_IF($CC --version | grep "gcc" >/dev/null 2>&1, + AS_ECHO("yes") + AC_MSG_ERROR(-Wa,-muse-unaligned-vector-move not supported, required with mingw+gcc to fix alignment bugs, update the used gcc) + , + AS_ECHO("no") + AS_ECHO("Compiler not gcc, -Wa,-muse-unaligned-vector-move not needed") + ) + + ) + ) AS_IF( test "x$BITS" = "x32", ASFLAGS="$ASFLAGS -fwin32 -DPREFIX -DHAVE_ALIGNED_STACK=0" @@ -165,24 +181,9 @@ ) -# YASM checks -AS_IF(test "x$X86" = "xtrue", - AC_CHECK_TOOL(YASM, yasm, no) -) -AS_IF(test "x$YASM" != "xno", have_yasm="yes") - -AC_ARG_ENABLE(asm, AS_HELP_STRING(--disable-asm, disable assembly no), - , enable_asm="yes" -) -AS_IF(test "x$enable_asm" != "xno" -a $have_yasm != "yes", - enable_asm="no" -) - - AM_CONDITIONAL(HAVE_X86, test "x$X86" = "xtrue") AM_CONDITIONAL(HAVE_PPC, test "x$PPC" = "xtrue") AM_CONDITIONAL(HAVE_ARM, test "x$ARM" = "xtrue") -AM_CONDITIONAL(ENABLE_ASM, test "x$enable_asm" = "xyes" -a "x$have_yasm" = "xyes" ) AC_ARG_VAR(ASFLAGS, ASFLAGS to use for assembler) AC_SUBST(ASFLAGS)
View file
kvazaar-2.2.0.tar.gz/doc/kvazaar.1 -> kvazaar-2.3.0.tar.gz/doc/kvazaar.1
Changed
@@ -1,9 +1,11 @@ -.TH KVAZAAR "1" "January 2023" "kvazaar v2.2.0" "User Commands" +.TH KVAZAAR "1" "January 2024" "kvazaar v2.3.0" "User Commands" .SH NAME kvazaar \- open source HEVC encoder .SH SYNOPSIS \fBkvazaar \fR\-i <input> \-\-input\-res <width>x<height> \-o <output> .SH DESCRIPTION + +.SS "Required:" .TP \fB\-i\fR, \fB\-\-input <filename> Input file @@ -85,6 +87,10 @@ \fB\-\-(no\-)info Add encoder info SEI. enabled .TP +\fB\-\-(no\-)enable\-logging +Enable logging for regular encoder performance, +error messages are always displayed. enabled +.TP \fB\-\-crypto <string> Selective encryption. Crypto support must be enabled at compile\-time. Can be 'on' or 'off' or
View file
kvazaar-2.2.0.tar.gz/src/Makefile.am -> kvazaar-2.3.0.tar.gz/src/Makefile.am
Changed
@@ -14,9 +14,6 @@ include_HEADERS = \ kvazaar.h -noinst_HEADERS = \ - extras/x86inc.asm - noinst_LTLIBRARIES = \ libaltivec.la \ libavx2.la \ @@ -154,8 +151,6 @@ strategies/strategies-sao.h \ strategies/strategies-encode.c \ strategies/strategies-encode.h \ - strategies/x86_asm/picture-x86-asm.c \ - strategies/x86_asm/picture-x86-asm.h \ strategyselector.c \ strategyselector.h \ extras/libmd5.c \ @@ -238,27 +233,6 @@ if HAVE_SSE2 libsse2_la_CFLAGS = -msse2 endif - -if ENABLE_ASM -noinst_LTLIBRARIES += libasm.la -libkvazaar_la_LIBADD += libasm.la -libasm_la_SOURCES = \ - strategies/x86_asm/picture-x86-asm-sad.asm \ - strategies/x86_asm/picture-x86-asm-sad.h \ - strategies/x86_asm/picture-x86-asm-satd.asm \ - strategies/x86_asm/picture-x86-asm-satd.h -libkvazaar_la_CFLAGS += -DKVZ_COMPILE_ASM - -strategies/x86_asm/picture-x86-asm-sad.lo: strategies/x86_asm/picture-x86-asm-sad.asm -strategies/x86_asm/picture-x86-asm-satd.lo: strategies/x86_asm/picture-x86-asm-satd.asm -endif #ENABLE_ASM endif #HAVE_X86 -yasm_verbose = $(yasm_verbose_@AM_V@) -yasm_verbose_ = $(yasm_verbose_@AM_DEFAULT_V@) -yasm_verbose_0 = @echo " YASM " $@; - -.asm.lo: - $(yasm_verbose)$(LIBTOOL) --mode=compile --tag=CC $(YASM) -I$(srcdir)/extras $(ASFLAGS) $< -o $@ -prefer-non-pic 1>/dev/null -
View file
kvazaar-2.2.0.tar.gz/src/cfg.c -> kvazaar-2.3.0.tar.gz/src/cfg.c
Changed
@@ -188,6 +188,9 @@ cfg->force_inter = 0; cfg->intra_chroma_search = 0; cfg->fast_bipred = 1; + + cfg->enable_logging_output = 1; + return 1; } @@ -1407,6 +1410,9 @@ else if OPT("fast-bipred") { cfg->fast_bipred = atobool(value); } + else if OPT("enable-logging") { + cfg->enable_logging_output = atobool(value); + } else { return 0; }
View file
kvazaar-2.2.0.tar.gz/src/cli.c -> kvazaar-2.3.0.tar.gz/src/cli.c
Changed
@@ -176,6 +176,8 @@ { "no-intra-chroma-search", no_argument, NULL, 0 }, { "fast-bipred", no_argument, NULL, 0 }, { "no-fast-bipred", no_argument, NULL, 0 }, + { "enable-logging", no_argument, NULL, 0 }, + { "no-enable-logging", no_argument, NULL, 0 }, {0, 0, 0, 0} }; @@ -382,22 +384,28 @@ void print_usage(void) { + print_version(); fprintf(stdout, - "Kvazaar usage: -i and --input-res to set input, -o to set output\n" - " --help for more information\n"); + "usage: -i and --input-res to set input, -o to set output\n" + " --help for more information\n"); } void print_version(void) { fprintf(stdout, -#ifdef CMAKE_BUILD + "kvazaar " VERSION_STRING " " KVZ_COMPILER_STRING " " KVZ_COMPILE_DATE "\n"); +#else + "Kvazaar " VERSION_STRING "\n" "Kvazaar license: 3-clause BSD\n"); +#endif } void print_help(void) { + print_version(); fprintf(stdout, "Usage:\n" "kvazaar -i <input> --input-res <width>x<height> -o <output>\n" @@ -447,6 +455,8 @@ " - md5: 56 bytes\n" " --(no-)psnr : Calculate PSNR for frames. enabled\n" " --(no-)info : Add encoder info SEI. enabled\n" + " --(no-)enable-logging : Enable logging for regular encoder performance,\n" + " error messages are always displayed. enabled\n" " --crypto <string> : Selective encryption. Crypto support must be\n" " enabled at compile-time. Can be 'on' or 'off' or\n" " a list of features separated with a '+'. off\n"
View file
kvazaar-2.2.0.tar.gz/src/encmain.c -> kvazaar-2.3.0.tar.gz/src/encmain.c
Changed
@@ -527,10 +527,12 @@ const encoder_control_t *encoder = enc->control; - fprintf(stderr, "Input: %s, output: %s\n", opts->input, opts->output); - fprintf(stderr, " Video size: %dx%d (input=%dx%d)\n", - encoder->in.width, encoder->in.height, - encoder->in.real_width, encoder->in.real_height); + if(opts->config->enable_logging_output) { + fprintf(stderr, "Input: %s, output: %s\n", opts->input, opts->output); + fprintf(stderr, " Video size: %dx%d (input=%dx%d)\n", + encoder->in.width, encoder->in.height, + encoder->in.real_width, encoder->in.real_height); + } if (opts->seek > 0 && !yuv_io_seek(input, opts->seek, opts->config->width, opts->config->height, opts->config->file_format)) { fprintf(stderr, "Failed to seek %d frames.\n", opts->seek); @@ -687,7 +689,7 @@ // Compute and print stats. double frame_psnr3 = { 0.0, 0.0, 0.0 }; - if (encoder->cfg.calc_psnr && encoder->cfg.source_scan_type == KVZ_INTERLACING_NONE) { + if (encoder->cfg.calc_psnr && encoder->cfg.source_scan_type == KVZ_INTERLACING_NONE && encoder->cfg.enable_logging_output) { // Do not compute PSNR for interlaced frames, because img_rec does not contain // the deinterlaced frame yet. compute_psnr(img_src, img_rec, frame_psnr); @@ -719,8 +721,10 @@ psnr_sum1 += frame_psnr1; psnr_sum2 += frame_psnr2; - print_frame_info(&info_out, frame_psnr, len_out, encoder->cfg.calc_psnr, - calc_avg_qp(qp_sum, frames_done)); + if (opts->config->enable_logging_output) { + print_frame_info(&info_out, frame_psnr, len_out, encoder->cfg.calc_psnr, + calc_avg_qp(qp_sum, frames_done)); + } } api->picture_free(cur_in_img); @@ -735,19 +739,20 @@ // All reconstructed pictures should have been output. 
assert(recon_buffer_size == 0); - - // Print statistics of the coding - fprintf(stderr, " Processed %d frames, %10llu bits", - frames_done, - (long long unsigned int)bitstream_length * 8); - if (encoder->cfg.calc_psnr && frames_done > 0) { - fprintf(stderr, " AVG PSNR Y %2.4f U %2.4f V %2.4f", - psnr_sum0 / frames_done, - psnr_sum1 / frames_done, - psnr_sum2 / frames_done); + if (opts->config->enable_logging_output) { + // Print statistics of the coding + fprintf(stderr, " Processed %d frames, %10llu bits", + frames_done, + (long long unsigned int)bitstream_length * 8); + if (encoder->cfg.calc_psnr && frames_done > 0) { + fprintf(stderr, " AVG PSNR Y %2.4f U %2.4f V %2.4f", + psnr_sum0 / frames_done, + psnr_sum1 / frames_done, + psnr_sum2 / frames_done); + } + fprintf(stderr, "\n"); + fprintf(stderr, " Total CPU time: %.3f s.\n", ((float)(clock() - start_time)) / CLOCKS_PER_SEC); } - fprintf(stderr, "\n"); - fprintf(stderr, " Total CPU time: %.3f s.\n", ((float)(clock() - start_time)) / CLOCKS_PER_SEC); { const double mega = (double)(1 << 20); @@ -774,14 +779,16 @@ encoding_cpu = 100.0; } #endif - fprintf(stderr, " Encoding time: %.3f s.\n", encoding_time); - fprintf(stderr, " Encoding wall time: %.3f s.\n", wall_time); + if (opts->config->enable_logging_output) { + fprintf(stderr, " Encoding time: %.3f s.\n", encoding_time); + fprintf(stderr, " Encoding wall time: %.3f s.\n", wall_time); - fprintf(stderr, " Encoding CPU usage: %.2f%%\n", encoding_cpu); - fprintf(stderr, " FPS: %.2f\n", encoding_fps); + fprintf(stderr, " Encoding CPU usage: %.2f%%\n", encoding_cpu); + fprintf(stderr, " FPS: %.2f\n", encoding_fps); - fprintf(stderr, " Bitrate: %.3f Mbps\n", bitrate_mbps); - fprintf(stderr, " AVG QP: %.1f\n", avg_qp); + fprintf(stderr, " Bitrate: %.3f Mbps\n", bitrate_mbps); + fprintf(stderr, " AVG QP: %.1f\n", avg_qp); + } } pthread_join(input_thread, NULL); }
View file
kvazaar-2.2.0.tar.gz/src/encoder.c -> kvazaar-2.3.0.tar.gz/src/encoder.c
Changed
@@ -216,12 +216,12 @@ // completed. encoder->cfg.owf += 2; - fprintf(stderr, "--owf=auto value set to %d.\n", encoder->cfg.owf); + if (cfg->enable_logging_output) fprintf(stderr, "--owf=auto value set to %d.\n", encoder->cfg.owf); } if (encoder->cfg.threads < 0) { encoder->cfg.threads = MIN(max_threads, get_max_parallelism(encoder)); - fprintf(stderr, "--threads=auto value set to %d.\n", encoder->cfg.threads); + if (cfg->enable_logging_output) fprintf(stderr, "--threads=auto value set to %d.\n", encoder->cfg.threads); } if (encoder->cfg.source_scan_type != KVZ_INTERLACING_NONE) {
View file
kvazaar-2.2.0.tar.gz/src/encoder_state-bitstream.c -> kvazaar-2.3.0.tar.gz/src/encoder_state-bitstream.c
Changed
@@ -179,7 +179,7 @@ uint32_t ref_matrix_id = UINT32_MAX; for (pred_list_idx = list_id; pred_list_idx >= 0; pred_list_idx--) { - const int32_t * const pred_list = (list_id == pred_list_idx) ? + const coeff_t* const pred_list = (list_id == pred_list_idx) ? kvz_scalinglist_get_default(size_id, pred_list_idx) : encoder->scaling_list.scaling_list_coeffsize_idpred_list_idx; @@ -200,7 +200,7 @@ const int32_t coef_num = MIN(MAX_MATRIX_COEF_NUM, kvz_g_scaling_list_sizesize_id); const uint32_t * const scan_cg = (size_id == 0) ? g_sig_last_scan_16x16 : g_sig_last_scan_32x32; int32_t next_coef = 8; - const int32_t * const coef_list = encoder->scaling_list.scaling_list_coeffsize_idlist_id; + const coeff_t* const coef_list = encoder->scaling_list.scaling_list_coeffsize_idlist_id; if (size_id >= SCALING_LIST_16x16) { WRITE_SE(stream, encoder->scaling_list.scaling_list_dcsize_idlist_id - 8, "scaling_list_dc_coef_minus8"); @@ -504,7 +504,9 @@ WRITE_U(stream, 0, 1, "constrained_intra_pred_flag"); WRITE_U(stream, encoder->cfg.trskip_enable, 1, "transform_skip_enabled_flag"); - if (state->frame->max_qp_delta_depth >= 0) { + // Check all the conditions for setting cu_qp_delta_enabled_flag here, since state->frame->max_qp_delta_depth might not be set yet. + if (encoder->cfg.target_bitrate > 0 || encoder->cfg.erp_aqp || encoder->cfg.roi.file_path || + encoder->cfg.set_qp_in_cu || encoder->cfg.vaq || (state->tile->frame->source && state->tile->frame->source->roi.roi_array) ) { // Use separate QP for each LCU when rate control is enabled. WRITE_U(stream, 1, 1, "cu_qp_delta_enabled_flag"); WRITE_UE(stream, state->frame->max_qp_delta_depth, "diff_cu_qp_delta_depth");
View file
kvazaar-2.2.0.tar.gz/src/encoder_state-ctors_dtors.c -> kvazaar-2.3.0.tar.gz/src/encoder_state-ctors_dtors.c
Changed
@@ -68,6 +68,9 @@ state->frame->rc_beta = -1.367; state->frame->icost = 0; + // Reset max_qp_delta_depth here, was causing problems when headers are requested before input is fed in + state->frame->max_qp_delta_depth = 0; + const encoder_control_t * const encoder = state->encoder_control; const int num_lcus = encoder->in.width_in_lcu * encoder->in.height_in_lcu; state->frame->lcu_stats = calloc(num_lcus, sizeof(lcu_stats_t));
View file
kvazaar-2.2.0.tar.gz/src/extras/libmd5.c -> kvazaar-2.3.0.tar.gz/src/extras/libmd5.c
Changed
@@ -27,11 +27,11 @@ #ifndef __BIG_ENDIAN__ # define byteReverse(buf, len) /* Nothing */ #else -void byteReverse(uint32_t *buf, unsigned len); +static void byteReverse(uint32_t *buf, unsigned len); /* * Note: this code is harmless on little-endian machines. */ -void byteReverse(uint32_t *buf, unsigned len) +static void byteReverse(uint32_t *buf, unsigned len) { uint32_t t; do {
View file
kvazaar-2.2.0.tar.gz/src/global.h -> kvazaar-2.3.0.tar.gz/src/global.h
Changed
@@ -47,6 +47,10 @@ #include "config.h" // IWYU pragma: export #endif +#ifdef CMAKE_BUILD +#include "version.h" +#endif + // Include some basics in all files, like assert, primitives and NULL. // If you add anything to this list with export pragma, think long and // and hard if it's actually a good idea to incude it for every c-file. @@ -215,13 +219,12 @@ #define QUOTE(x) #x #define QUOTE_EXPAND(x) QUOTE(x) -// NOTE: When making a release, check to see if incrementing libversion in -// configure.ac is necessary. #ifndef KVZ_VERSION -#define KVZ_VERSION 2.2.0 +#define KVZ_VERSION 2.3.0 #endif #define VERSION_STRING QUOTE_EXPAND(KVZ_VERSION) + //#define VERBOSE 1 #define SAO_ABS_OFFSET_MAX ((1 << (MIN(KVZ_BIT_DEPTH, 10) - 5)) - 1)
View file
kvazaar-2.2.0.tar.gz/src/kvazaar.c -> kvazaar-2.3.0.tar.gz/src/kvazaar.c
Changed
@@ -93,7 +93,7 @@ //Initialize strategies // TODO: Make strategies non-global - if (!kvz_strategyselector_init(cfg->cpuid, KVZ_BIT_DEPTH)) { + if (!kvz_strategyselector_init(cfg->cpuid, KVZ_BIT_DEPTH, cfg->enable_logging_output)) { fprintf(stderr, "Failed to initialize strategies.\n"); goto kvazaar_open_failure; }
View file
kvazaar-2.2.0.tar.gz/src/kvazaar.h -> kvazaar-2.3.0.tar.gz/src/kvazaar.h
Changed
@@ -492,6 +492,8 @@ uint8_t intra_chroma_search; uint8_t fast_bipred; + + uint8_t enable_logging_output; //!< \brief May be used to disable the logging output to stderr. Default: on. } kvz_config; /**
View file
kvazaar-2.2.0.tar.gz/src/kvazaar.pc.in -> kvazaar-2.3.0.tar.gz/src/kvazaar.pc.in
Changed
@@ -1,11 +1,12 @@ -prefix=@prefix@ +prefix=@CMAKE_INSTALL_PREFIX@ exec_prefix=${prefix} -libdir=@libdir@ +libdir=${prefix}/lib incdir=${prefix}/include Name: libkvazaar -Description: Open-source HEVC encoder -Version: @VERSION@ +Description: @CMAKE_PROJECT_DESCRIPTION@ +URL: @CMAKE_PROJECT_HOMEPAGE_URL@ +Version: @PROJECT_VERSION@ Libs: -L${libdir} -lkvazaar Libs.private: @LIBS@ Cflags: -I${incdir}
View file
kvazaar-2.2.0.tar.gz/src/rate_control.c -> kvazaar-2.3.0.tar.gz/src/rate_control.c
Changed
@@ -189,7 +189,7 @@ bits_coded -= state->frame->cur_gop_bits_coded; } - smoothing_window = MAX(MIN_SMOOTHING_WINDOW, smoothing_window - encoder->cfg.gop_len / 2); + smoothing_window = MAX(MIN_SMOOTHING_WINDOW, smoothing_window - MAX(encoder->cfg.gop_len / 2, 1)); double gop_target_bits = -1; while( gop_target_bits < 0 && smoothing_window < 150) { @@ -375,7 +375,7 @@ else { alpha = 0.3; } - return MAX(100, alpha*pow(state->frame->icost * 4 / bits, beta)*bits); + return MIN(MAX(100, alpha*pow(state->frame->icost * 4 / bits, beta)*bits), encoder->cfg.gop_len >= 2 ? 0.85 * state->frame->cur_gop_target_bits : state->frame->cur_gop_target_bits); } if (encoder->cfg.gop_len <= 0) {
View file
kvazaar-2.2.0.tar.gz/src/rdo.c -> kvazaar-2.3.0.tar.gz/src/rdo.c
Changed
@@ -148,19 +148,6 @@ 0.027313232421875, 5.736968994140625, }; - -// This struct is for passing data to kvz_rdoq_sign_hiding -struct sh_rates_t { - // Bit cost of increasing rate by one. - int32_t inc32 * 32; - // Bit cost of decreasing rate by one. - int32_t dec32 * 32; - // Bit cost of going from zero to one. - int32_t sig_coeff_inc32 * 32; - // Coeff minus quantized coeff. - int32_t quant_delta32 * 32; -}; - int kvz_init_rdcost_outfiles(const char *dir_path) { #define RD_SAMPLING_MAX_FN_LENGTH 4095 @@ -532,7 +519,7 @@ const encoder_state_t *const state, const int32_t qp_scaled, const uint32_t *const scan2raster, - const struct sh_rates_t *const sh_rates, + const struct kvz_sh_rates_t *const sh_rates, const int32_t last_pos, const coeff_t *const coeffs, coeff_t *const quant_coeffs) @@ -686,7 +673,7 @@ int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift; - const int32_t *quant_coeff = encoder->scaling_list.quant_coefflog2_tr_size-2scalinglist_typeqp_scaled%6; + const coeff_t *quant_coeff = encoder->scaling_list.quant_coefflog2_tr_size-2scalinglist_typeqp_scaled%6; const double *err_scale = encoder->scaling_list.error_scalelog2_tr_size-2scalinglist_typeqp_scaled%6; double block_uncoded_cost = 0; @@ -695,7 +682,7 @@ double cost_sig 32 * 32 ; double cost_coeff0 32 * 32 ; - struct sh_rates_t sh_rates; + struct kvz_sh_rates_t sh_rates; const uint32_t *scan_cg = g_sig_last_scan_cglog2_block_size - 2scan_mode; const uint32_t cg_size = 16; @@ -744,29 +731,9 @@ //Find last cg and last scanpos int32_t cg_scanpos; - for (cg_scanpos = (cg_num - 1); cg_scanpos >= 0; cg_scanpos--) - { - for (int32_t scanpos_in_cg = (cg_size - 1); scanpos_in_cg >= 0; scanpos_in_cg--) - { - int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg; - uint32_t blkpos = scanscanpos; - int32_t q = quant_coeffblkpos; - int32_t level_double = coefblkpos; - level_double = MIN(abs(level_double) * q, MAX_INT - (1 << (q_bits - 1))); - uint32_t max_abs_level = (level_double + (1 << (q_bits - 1))) >> 
q_bits; - - if (max_abs_level > 0) { - last_scanpos = scanpos; - ctx_set = (scanpos > 0 && type == 0) ? 2 : 0; - cg_last_scanpos = cg_scanpos; - sh_rates.sig_coeff_incblkpos = 0; - break; - } - dest_coeffblkpos = 0; - } - if (last_scanpos != -1) break; - } - + kvz_find_last_scanpos(coef, dest_coeff, type, q_bits, quant_coeff, &sh_rates, cg_size, &ctx_set, scan, &cg_last_scanpos, + &last_scanpos, cg_num, &cg_scanpos, width, scan_mode); + if (last_scanpos == -1) { return; }
View file
kvazaar-2.2.0.tar.gz/src/rdo.h -> kvazaar-2.3.0.tar.gz/src/rdo.h
Changed
@@ -45,6 +45,19 @@ #include "search_inter.h" +// This struct is for passing data to kvz_rdoq_sign_hiding +struct kvz_sh_rates_t { + // Bit cost of increasing rate by one. + int32_t inc32 * 32; + // Bit cost of decreasing rate by one. + int32_t dec32 * 32; + // Bit cost of going from zero to one. + int32_t sig_coeff_inc32 * 32; + // Coeff minus quantized coeff. + int32_t quant_delta32 * 32; +}; + + extern const uint32_t kvz_g_go_rice_range5; extern const uint32_t kvz_g_go_rice_prefix_len5;
View file
kvazaar-2.2.0.tar.gz/src/scalinglist.c -> kvazaar-2.3.0.tar.gz/src/scalinglist.c
Changed
@@ -43,7 +43,7 @@ const uint16_t kvz_g_scaling_list_size4 = { 16, 64, 256,1024}; static const uint8_t g_scaling_list_size_x4 = { 4, 8,16,32}; -static const int32_t g_quant_default_4x416 = +static const coeff_t g_quant_default_4x416 = { 16,16,16,16, 16,16,16,16, @@ -51,7 +51,7 @@ 16,16,16,16 }; -static const int32_t g_quant_intra_default_8x864 = +static const coeff_t g_quant_intra_default_8x864 = { 16,16,16,16,17,18,21,24, 16,16,16,16,17,19,22,25, @@ -63,7 +63,7 @@ 24,25,29,36,47,65,88,115 }; -static const int32_t g_quant_inter_default_8x864 = +static const coeff_t g_quant_inter_default_8x864 = { 16,16,16,16,17,18,20,24, 16,16,16,17,18,20,24,25, @@ -75,8 +75,8 @@ 24,25,28,33,41,54,71,91 }; -const int16_t kvz_g_quant_scales6 = { 26214,23302,20560,18396,16384,14564 }; -const int16_t kvz_g_inv_quant_scales6 = { 40,45,51,57,64,72 }; +const coeff_t kvz_g_quant_scales6 = { 26214,23302,20560,18396,16384,14564 }; +const coeff_t kvz_g_inv_quant_scales6 = { 40,45,51,57,64,72 }; /** @@ -91,12 +91,12 @@ for (listId = 0; listId < kvz_g_scaling_list_numsizeId; listId++) { for (qp = 0; qp < 6; qp++) { if (!(sizeId == 3 && listId == 3)) { - scaling_list->quant_coeffsizeIdlistIdqp = (int32_t*)calloc(kvz_g_scaling_list_sizesizeId, sizeof(int32_t)); - scaling_list->de_quant_coeffsizeIdlistIdqp = (int32_t*)calloc(kvz_g_scaling_list_sizesizeId, sizeof(int32_t)); + scaling_list->quant_coeffsizeIdlistIdqp = (coeff_t*)calloc(kvz_g_scaling_list_sizesizeId, sizeof(int32_t)); + scaling_list->de_quant_coeffsizeIdlistIdqp = (coeff_t*)calloc(kvz_g_scaling_list_sizesizeId, sizeof(int32_t)); scaling_list->error_scalesizeIdlistIdqp = (double*)calloc(kvz_g_scaling_list_sizesizeId, sizeof(double)); } } - scaling_list->scaling_list_coeffsizeIdlistId = (int32_t*)calloc(MIN(MAX_MATRIX_COEF_NUM, kvz_g_scaling_list_sizesizeId), sizeof(int32_t)); + scaling_list->scaling_list_coeffsizeIdlistId = (coeff_t*)calloc(MIN(MAX_MATRIX_COEF_NUM, kvz_g_scaling_list_sizesizeId), sizeof(int32_t)); } } // alias, assign 
pointer to an existing array @@ -263,9 +263,9 @@ #undef LINE_BUFSIZE } -const int32_t *kvz_scalinglist_get_default(const uint32_t size_id, const uint32_t list_id) +const coeff_t *kvz_scalinglist_get_default(const uint32_t size_id, const uint32_t list_id) { - const int32_t *list_ptr = g_quant_intra_default_8x8; // Default to "8x8" intra + const coeff_t *list_ptr = g_quant_intra_default_8x8; // Default to "8x8" intra switch(size_id) { case SCALING_LIST_4x4: list_ptr = g_quant_default_4x4; @@ -286,7 +286,7 @@ * \brief get scaling list for decoder * */ -static void scalinglist_process_dec(const int32_t * const coeff, int32_t *dequantcoeff, +static void scalinglist_process_dec(const coeff_t * const coeff, coeff_t *dequantcoeff, int32_t inv_quant_scales, uint32_t height, uint32_t width, uint32_t ratio, int32_t size_num, uint32_t dc, @@ -315,7 +315,7 @@ * \brief get scaling list for encoder * */ -void kvz_scalinglist_process_enc(const int32_t * const coeff, int32_t* quantcoeff, const int32_t quant_scales, +void kvz_scalinglist_process_enc(const coeff_t * const coeff, coeff_t * quantcoeff, const int32_t quant_scales, const uint32_t height, const uint32_t width, const uint32_t ratio, const int32_t size_num, const uint32_t dc, const uint8_t flat) { @@ -354,7 +354,7 @@ int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - bitdepth - log2_tr_size; // Represents scaling through forward transform uint32_t i,max_num_coeff = kvz_g_scaling_list_sizesize; - const int32_t *quantcoeff = scaling_list->quant_coeffsizelistqp; + const coeff_t *quantcoeff = scaling_list->quant_coeffsizelistqp; //This cast is allowed, since error_scale is a malloc'd pointer in kvz_scalinglist_init double *err_scale = (double *) scaling_list->error_scalesizelistqp; @@ -372,15 +372,15 @@ * \brief set scaling lists * */ -void kvz_scalinglist_set(scaling_list_t * const scaling_list, const int32_t * const coeff, uint32_t listId, uint32_t sizeId, uint32_t qp) +void kvz_scalinglist_set(scaling_list_t * const 
scaling_list, const coeff_t* const coeff, uint32_t listId, uint32_t sizeId, uint32_t qp) { const uint32_t width = g_scaling_list_size_xsizeId; const uint32_t height = g_scaling_list_size_xsizeId; const uint32_t ratio = g_scaling_list_size_xsizeId / MIN(8, g_scaling_list_size_xsizeId); const uint32_t dc = scaling_list->scaling_list_dcsizeIdlistId != 0 ? scaling_list->scaling_list_dcsizeIdlistId : 16; //These cast are allowed, since these are pointer's to malloc'd area in kvz_scalinglist_init - int32_t *quantcoeff = (int32_t*) scaling_list->quant_coeffsizeIdlistIdqp; - int32_t *dequantcoeff = (int32_t*) scaling_list->de_quant_coeffsizeIdlistIdqp; + coeff_t*quantcoeff = (coeff_t*) scaling_list->quant_coeffsizeIdlistIdqp; + coeff_t*dequantcoeff = (coeff_t*) scaling_list->de_quant_coeffsizeIdlistIdqp; // Encoder list kvz_scalinglist_process_enc(coeff, quantcoeff, kvz_g_quant_scalesqp<<4, height, width, ratio, @@ -410,7 +410,7 @@ for (size = 0; size < SCALING_LIST_SIZE_NUM; size++) { for (list = 0; list < kvz_g_scaling_list_numsize; list++) { - const int32_t * const list_ptr = scaling_list->use_default_list ? + const coeff_t* const list_ptr = scaling_list->use_default_list ? kvz_scalinglist_get_default(size, list) : scaling_list->scaling_list_coeffsizelist;
View file
kvazaar-2.2.0.tar.gz/src/scalinglist.h -> kvazaar-2.3.0.tar.gz/src/scalinglist.h
Changed
@@ -47,16 +47,16 @@ int8_t enable; int8_t use_default_list; int32_t scaling_list_dc[SCALING_LIST_SIZE_NUM][SCALING_LIST_NUM]; - const int32_t *scaling_list_coeff[SCALING_LIST_SIZE_NUM][SCALING_LIST_NUM]; - const int32_t *quant_coeff[4][6][6]; - const int32_t *de_quant_coeff[SCALING_LIST_SIZE_NUM][SCALING_LIST_NUM][SCALING_LIST_REM_NUM]; + const coeff_t *scaling_list_coeff[SCALING_LIST_SIZE_NUM][SCALING_LIST_NUM]; + const coeff_t *quant_coeff[4][6][6]; + const coeff_t *de_quant_coeff[SCALING_LIST_SIZE_NUM][SCALING_LIST_NUM][SCALING_LIST_REM_NUM]; const double *error_scale[4][6][6]; } scaling_list_t; extern const uint8_t kvz_g_scaling_list_num[4]; extern const uint16_t kvz_g_scaling_list_size[4]; -const int32_t *kvz_scalinglist_get_default(const uint32_t size_id, const uint32_t list_id); +const coeff_t*kvz_scalinglist_get_default(const uint32_t size_id, const uint32_t list_id); void kvz_scalinglist_init(scaling_list_t * const scaling_list); void kvz_scalinglist_destroy(scaling_list_t * const scaling_list);
View file
kvazaar-2.2.0.tar.gz/src/strategies/avx2/quant-avx2.c -> kvazaar-2.3.0.tar.gz/src/strategies/avx2/quant-avx2.c
Changed
@@ -152,7 +152,7 @@ return _mm256_inserti128_si256(v, hi, 1); } -static INLINE void scanord_read_vector_32(const int32_t *__restrict quant_coeff, +static INLINE void scanord_read_vector_32(const coeff_t *__restrict quant_coeff, const uint32_t *__restrict scan, int8_t scan_mode, int32_t subpos, @@ -190,15 +190,14 @@ _mm256_setr_epi32(2, 6, 0, 4, 3, 7, 1, 5), }; - __m128i coeffs4 = { - _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets0)), - _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets1)), - _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets2)), - _mm_loadu_si128((__m128i *)(quant_coeff + row_offsets3)), - }; + coeff_t coeffs16; + memcpy(coeffs, quant_coeff + row_offsets0, sizeof(coeff_t) * 4); + memcpy(coeffs + 4, quant_coeff + row_offsets1, sizeof(coeff_t) * 4); + memcpy(coeffs + 8, quant_coeff + row_offsets2, sizeof(coeff_t) * 4); + memcpy(coeffs + 12, quant_coeff + row_offsets3, sizeof(coeff_t) * 4); - __m256i coeffs_upper = concatenate_2x128i(coeffs0, coeffs1); - __m256i coeffs_lower = concatenate_2x128i(coeffs2, coeffs3); + __m256i coeffs_upper = _mm256_cvtepi16_epi32(_mm_load_si128((__m128i const *)(coeffs))); + __m256i coeffs_lower = _mm256_cvtepi16_epi32(_mm_load_si128((__m128i const*)(coeffs + 8))); __m256i lower_shuffled = _mm256_permutevar8x32_epi32(coeffs_lower, shufmasksscan_mode); @@ -368,7 +367,7 @@ int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6); const uint32_t log2_tr_size = kvz_g_convert_to_bitwidth + 2; const int32_t scalinglist_type = (block_type == CU_INTRA ? 
0 : 3) + (int8_t)("\0\3\1\2"type); - const int32_t *quant_coeff = encoder->scaling_list.quant_coefflog2_tr_size - 2scalinglist_typeqp_scaled % 6; + const coeff_t *quant_coeff = encoder->scaling_list.quant_coefflog2_tr_size - 2scalinglist_typeqp_scaled % 6; const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - log2_tr_size; //!< Represents scaling through forward transform const int32_t q_bits = QUANT_SHIFT + qp_scaled / 6 + transform_shift; const int32_t add = ((state->frame->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9); @@ -393,8 +392,8 @@ v_sign = _mm256_or_si256(v_sign, _mm256_set1_epi16(1)); if (state->encoder_control->scaling_list.enable) { - __m256i v_quant_coeff_lo = _mm256_loadu_si256(((__m256i *)(quant_coeff + n)) + 0); - __m256i v_quant_coeff_hi = _mm256_loadu_si256(((__m256i *)(quant_coeff + n)) + 1); + __m256i v_quant_coeff_lo = _mm256_cvtepi16_epi32(_mm_loadu_si128(((__m128i *)(quant_coeff + n)) + 0)); + __m256i v_quant_coeff_hi = _mm256_cvtepi16_epi32(_mm_loadu_si128(((__m128i *)(quant_coeff + n)) + 1)); low_b = _mm256_permute2x128_si256(v_quant_coeff_lo, v_quant_coeff_hi, @@ -739,7 +738,7 @@ uint32_t log2_tr_size = kvz_g_convert_to_bit width + 2; int32_t scalinglist_type = (block_type == CU_INTRA ? 
0 : 3) + (int8_t)("\0\3\1\2"type); - const int32_t *dequant_coef = encoder->scaling_list.de_quant_coefflog2_tr_size-2scalinglist_typeqp_scaled%6; + const coeff_t *dequant_coef = encoder->scaling_list.de_quant_coefflog2_tr_size-2scalinglist_typeqp_scaled%6; shift += 4; if (shift >qp_scaled / 6) { @@ -863,6 +862,72 @@ return (double)(temp) / 256.0; } +static void find_last_scanpos_avx2(coeff_t* coef, coeff_t* dest_coeff, int8_t type, int32_t q_bits, const coeff_t* quant_coeff, struct kvz_sh_rates_t* sh_rates, const uint32_t cg_size, + uint16_t* ctx_set, const uint32_t* scan, int32_t* cg_last_scanpos, int32_t* last_scanpos, uint32_t cg_num, int32_t* cg_scanpos, int32_t width, int8_t scan_mode) { + + __m256i min_q_bits = _mm256_set1_epi32(MAX_INT - (1 << (q_bits - 1))); + __m256i q_bits_v = _mm256_set1_epi32(1 << (q_bits - 1)); + for (*cg_scanpos = (cg_num - 1); *cg_scanpos >= 0; (*cg_scanpos)--) { + int32_t scan_pos = *cg_scanpos * cg_size; + int32_t block_pos = scanscan_pos; + coeff_t q_array16; + memcpy(q_array, &quant_coeffblock_pos, 4 * sizeof(coeff_t)); + memcpy(q_array + 4, &quant_coeffblock_pos + width, 4 * sizeof(coeff_t)); + memcpy(q_array + 8, &quant_coeffblock_pos + 2 * width, 4 * sizeof(coeff_t)); + memcpy(q_array + 12, &quant_coeffblock_pos + 3 * width, 4 * sizeof(coeff_t)); + + coeff_t coef_array16; + memcpy(coef_array, &coefblock_pos, 4 * sizeof(coeff_t)); + memcpy(coef_array + 4, &coefblock_pos + width, 4 * sizeof(coeff_t)); + memcpy(coef_array + 8, &coefblock_pos + 2 * width, 4 * sizeof(coeff_t)); + memcpy(coef_array + 12, &coefblock_pos + 3 * width, 4 * sizeof(coeff_t)); + + __m256i q = _mm256_loadu_si256((__m256i const*)q_array); + + __m256i level_double = _mm256_loadu_si256((__m256i const*)coef_array); + + __m256i abs_level_double = _mm256_abs_epi16(level_double); + + __m256i levels_mul_q_low = _mm256_mullo_epi16(abs_level_double, q); + __m256i levels_mul_q_high = _mm256_mulhi_epi16(abs_level_double, q); + + __m256i levels_mul_0 = 
_mm256_unpacklo_epi16(levels_mul_q_low, levels_mul_q_high); + __m256i levels_mul_1 = _mm256_unpackhi_epi16(levels_mul_q_low, levels_mul_q_high); + + __m256i min_mask = _mm256_cmpgt_epi32(min_q_bits, levels_mul_0); + levels_mul_0 = _mm256_blendv_epi8(min_q_bits, levels_mul_0, min_mask); + min_mask = _mm256_cmpgt_epi32(min_q_bits, levels_mul_1); + levels_mul_1 = _mm256_blendv_epi8(min_q_bits, levels_mul_1, min_mask); + + __m256i max_abs_level_low = _mm256_add_epi32(levels_mul_0, q_bits_v); + max_abs_level_low = _mm256_srai_epi32(max_abs_level_low, q_bits); + __m256i max_abs_level_high = _mm256_add_epi32(levels_mul_1, q_bits_v); + max_abs_level_high = _mm256_srai_epi32(max_abs_level_high, q_bits); + + memset(&dest_coeffblock_pos, 0, sizeof(coeff_t) * 4); + memset(&dest_coeffblock_pos + width, 0, sizeof(coeff_t) * 4); + memset(&dest_coeffblock_pos + 2 * width, 0, sizeof(coeff_t) * 4); + memset(&dest_coeffblock_pos + 3 * width, 0, sizeof(coeff_t) * 4); + if (!_mm256_testz_si256(max_abs_level_low, max_abs_level_low) || !_mm256_testz_si256(max_abs_level_high, max_abs_level_high)) { + uint32_t max_abs_level16; + _mm256_storeu2_m128i((__m128i*)(max_abs_level + 8), (__m128i*)(max_abs_level), max_abs_level_low); + _mm256_storeu2_m128i((__m128i*)(max_abs_level + 12), (__m128i*)(max_abs_level + 4), max_abs_level_high); + for (int sp = scan_pos + 15; sp >= scan_pos; sp--) { + uint32_t blkpos = kvz_g_sig_last_scanscan_mode1sp - scan_pos; + if (max_abs_levelblkpos > 0) { + *last_scanpos = sp; + *ctx_set = (sp > 0 && type == 0) ? 
2 : 0; + *cg_last_scanpos = *cg_scanpos; + sh_rates->sig_coeff_incscansp = 0; + return; + } + } + } + } + *last_scanpos = -1; +} + + #endif //COMPILE_INTEL_AVX2 && defined X86_64 int kvz_strategy_register_quant_avx2(void* opaque, uint8_t bitdepth) @@ -879,6 +944,7 @@ success &= kvz_strategyselector_register(opaque, "quant", "avx2", 40, &kvz_quant_avx2); success &= kvz_strategyselector_register(opaque, "coeff_abs_sum", "avx2", 0, &coeff_abs_sum_avx2); success &= kvz_strategyselector_register(opaque, "fast_coeff_cost", "avx2", 40, &fast_coeff_cost_avx2); + success &= kvz_strategyselector_register(opaque, "find_last_scanpos", "avx2", 40, &find_last_scanpos_avx2); #endif //COMPILE_INTEL_AVX2 && defined X86_64 return success;
View file
kvazaar-2.2.0.tar.gz/src/strategies/generic/quant-generic.c -> kvazaar-2.3.0.tar.gz/src/strategies/generic/quant-generic.c
Changed
@@ -57,7 +57,7 @@ int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6); const uint32_t log2_tr_size = kvz_g_convert_to_bit[width] + 2; const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]); - const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_size - 2][scalinglist_type][qp_scaled % 6]; + const coeff_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_size - 2][scalinglist_type][qp_scaled % 6]; const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - log2_tr_size; //!< Represents scaling through forward transform const int32_t q_bits = QUANT_SHIFT + qp_scaled / 6 + transform_shift; const int32_t add = ((state->frame->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9); @@ -311,7 +311,7 @@ uint32_t log2_tr_size = kvz_g_convert_to_bit[width] + 2; int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]); - const int32_t *dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_size-2][scalinglist_type][qp_scaled%6]; + const coeff_t *dequant_coef = encoder->scaling_list.de_quant_coeff[log2_tr_size-2][scalinglist_type][qp_scaled%6]; shift += 4; if (shift >qp_scaled / 6) { @@ -374,6 +374,31 @@ return (double) sum / 256.0; } + + +static void find_last_scanpos_generic(coeff_t* coef, coeff_t* dest_coeff, int8_t type, int32_t q_bits, const coeff_t* quant_coeff, struct kvz_sh_rates_t* sh_rates, const uint32_t cg_size, uint16_t* ctx_set, const uint32_t* scan, int32_t* cg_last_scanpos, int32_t* last_scanpos, uint32_t cg_num, int32_t* cg_scanpos, int32_t width, int8_t scan_mode) { + for (*cg_scanpos = (cg_num - 1); *cg_scanpos >= 0; (*cg_scanpos)--) { + for (int32_t scanpos_in_cg = (cg_size - 1); scanpos_in_cg >= 0; scanpos_in_cg--) { + int32_t scanpos = *cg_scanpos * cg_size + scanpos_in_cg; + uint32_t blkpos = scan[scanpos]; + int32_t q = quant_coeff[blkpos]; + int32_t level_double = coef[blkpos]; + level_double = MIN(abs(level_double) * q, MAX_INT - (1 << (q_bits - 1))); + uint32_t max_abs_level = (level_double + (1 << (q_bits - 1))) >> q_bits; + + if (max_abs_level > 0) { + *last_scanpos = scanpos; + *ctx_set = (scanpos > 0 && type == 0) ? 2 : 0; + *cg_last_scanpos = *cg_scanpos; + sh_rates->sig_coeff_inc[blkpos] = 0; + return; + } + dest_coeff[blkpos] = 0; + } + } +} + + int kvz_strategy_register_quant_generic(void* opaque, uint8_t bitdepth) { bool success = true; @@ -383,6 +408,7 @@ success &= kvz_strategyselector_register(opaque, "dequant", "generic", 0, &kvz_dequant_generic); success &= kvz_strategyselector_register(opaque, "coeff_abs_sum", "generic", 0, &coeff_abs_sum_generic); success &= kvz_strategyselector_register(opaque, "fast_coeff_cost", "generic", 0, &fast_coeff_cost_generic); + success &= kvz_strategyselector_register(opaque, "find_last_scanpos", "generic", 0, &find_last_scanpos_generic); return success; }
View file
kvazaar-2.2.0.tar.gz/src/strategies/strategies-picture.c -> kvazaar-2.3.0.tar.gz/src/strategies/strategies-picture.c
Changed
@@ -37,7 +37,6 @@ #include "strategies/generic/picture-generic.h" #include "strategies/sse2/picture-sse2.h" #include "strategies/sse41/picture-sse41.h" -#include "strategies/x86_asm/picture-x86-asm.h" #include "strategyselector.h" @@ -93,9 +92,6 @@ if (kvz_g_hardware_flags.intel_flags.sse41) { success &= kvz_strategy_register_picture_sse41(opaque, bitdepth); } - if (kvz_g_hardware_flags.intel_flags.avx) { - success &= kvz_strategy_register_picture_x86_asm_avx(opaque, bitdepth); - } if (kvz_g_hardware_flags.intel_flags.avx2) { success &= kvz_strategy_register_picture_avx2(opaque, bitdepth); }
View file
kvazaar-2.2.0.tar.gz/src/strategies/strategies-quant.c -> kvazaar-2.3.0.tar.gz/src/strategies/strategies-quant.c
Changed
@@ -43,6 +43,7 @@ dequant_func *kvz_dequant; coeff_abs_sum_func *kvz_coeff_abs_sum; fast_coeff_cost_func *kvz_fast_coeff_cost; +find_last_scanpos_func* kvz_find_last_scanpos; int kvz_strategy_register_quant(void* opaque, uint8_t bitdepth) {
View file
kvazaar-2.2.0.tar.gz/src/strategies/strategies-quant.h -> kvazaar-2.3.0.tar.gz/src/strategies/strategies-quant.h
Changed
@@ -44,6 +44,7 @@ #include "kvazaar.h" #include "tables.h" +struct kvz_sh_rates_t; // Declare function pointers. typedef void (quant_func)(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width, int32_t height, int8_t type, int8_t scan_idx, int8_t block_type); @@ -60,12 +61,16 @@ typedef uint32_t (coeff_abs_sum_func)(const coeff_t *coeffs, size_t length); +typedef void (find_last_scanpos_func)(coeff_t* coef, coeff_t* dest_coeff, int8_t type, int32_t q_bits, const coeff_t* quant_coeff, struct kvz_sh_rates_t* sh_rates, const uint32_t cg_size, + uint16_t* ctx_set, const uint32_t* scan, int32_t* cg_last_scanpos, int32_t* last_scanpos, uint32_t cg_num, int32_t* cg_scanpos, int32_t width, int8_t scan_mode); + // Declare function pointers. extern quant_func * kvz_quant; extern quant_residual_func * kvz_quantize_residual; extern dequant_func *kvz_dequant; extern coeff_abs_sum_func *kvz_coeff_abs_sum; extern fast_coeff_cost_func *kvz_fast_coeff_cost; +extern find_last_scanpos_func *kvz_find_last_scanpos; int kvz_strategy_register_quant(void* opaque, uint8_t bitdepth); @@ -76,6 +81,7 @@ {"dequant", (void**) &kvz_dequant}, \ {"coeff_abs_sum", (void**) &kvz_coeff_abs_sum}, \ {"fast_coeff_cost", (void**) &kvz_fast_coeff_cost}, \ + {"find_last_scanpos", (void**) &kvz_find_last_scanpos}, \
View file
kvazaar-2.2.0.tar.gz/src/strategyselector.c -> kvazaar-2.3.0.tar.gz/src/strategyselector.c
Changed
@@ -46,13 +46,13 @@ hardware_flags_t kvz_g_strategies_in_use; hardware_flags_t kvz_g_strategies_available; -static void set_hardware_flags(int32_t cpuid); +static void set_hardware_flags(int32_t cpuid, uint8_t logging); static void* strategyselector_choose_for(const strategy_list_t * const strategies, const char * const strategy_type); //Strategies to include (add new file here) //Returns 1 if successful -int kvz_strategyselector_init(int32_t cpuid, uint8_t bitdepth) { +int kvz_strategyselector_init(int32_t cpuid, uint8_t bitdepth, uint8_t logging) { const strategy_to_select_t *cur_strategy_to_select = strategies_to_select; strategy_list_t strategies; @@ -60,7 +60,7 @@ strategies.count = 0; strategies.strategies = NULL; - set_hardware_flags(cpuid); + set_hardware_flags(cpuid, logging); //Add new register function here if (!kvz_strategy_register_picture(&strategies, bitdepth)) { @@ -118,109 +118,109 @@ //Also check what optimizations are available and what are in use //SIMD optimizations available bool strategies_available = false; - fprintf(stderr, "Available: "); + if (logging) fprintf(stderr, "Available: "); if (kvz_g_strategies_available.intel_flags.avx != 0){ - fprintf(stderr, "avx(%d) ", kvz_g_strategies_available.intel_flags.avx); + if (logging) fprintf(stderr, "avx(%d) ", kvz_g_strategies_available.intel_flags.avx); strategies_available = true; } if (kvz_g_strategies_available.intel_flags.avx2 != 0){ - fprintf(stderr, "avx2(%d) ", kvz_g_strategies_available.intel_flags.avx2); + if (logging) fprintf(stderr, "avx2(%d) ", kvz_g_strategies_available.intel_flags.avx2); strategies_available = true; } if (kvz_g_strategies_available.intel_flags.mmx != 0) { - fprintf(stderr, "mmx(%d) ", kvz_g_strategies_available.intel_flags.mmx); + if (logging) fprintf(stderr, "mmx(%d) ", kvz_g_strategies_available.intel_flags.mmx); strategies_available = true; } if (kvz_g_strategies_available.intel_flags.sse != 0) { - fprintf(stderr, "sse(%d) ", 
kvz_g_strategies_available.intel_flags.sse); + if (logging) fprintf(stderr, "sse(%d) ", kvz_g_strategies_available.intel_flags.sse); strategies_available = true; } if (kvz_g_strategies_available.intel_flags.sse2 != 0) { - fprintf(stderr, "sse2(%d) ", kvz_g_strategies_available.intel_flags.sse2); + if (logging) fprintf(stderr, "sse2(%d) ", kvz_g_strategies_available.intel_flags.sse2); strategies_available = true; } if (kvz_g_strategies_available.intel_flags.sse3 != 0) { - fprintf(stderr, "sse3(%d) ", kvz_g_strategies_available.intel_flags.sse3); + if (logging) fprintf(stderr, "sse3(%d) ", kvz_g_strategies_available.intel_flags.sse3); strategies_available = true; } if (kvz_g_strategies_available.intel_flags.sse41 != 0) { - fprintf(stderr, "sse41(%d) ", kvz_g_strategies_available.intel_flags.sse41); + if (logging) fprintf(stderr, "sse41(%d) ", kvz_g_strategies_available.intel_flags.sse41); strategies_available = true; } if (kvz_g_strategies_available.intel_flags.sse42 != 0) { - fprintf(stderr, "sse42(%d) ", kvz_g_strategies_available.intel_flags.sse42); + if (logging) fprintf(stderr, "sse42(%d) ", kvz_g_strategies_available.intel_flags.sse42); strategies_available = true; } if (kvz_g_strategies_available.intel_flags.ssse3 != 0) { - fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_available.intel_flags.ssse3); + if (logging) fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_available.intel_flags.ssse3); strategies_available = true; } if (kvz_g_strategies_available.arm_flags.neon != 0) { - fprintf(stderr, "neon(%d) ", kvz_g_strategies_available.arm_flags.neon); + if (logging) fprintf(stderr, "neon(%d) ", kvz_g_strategies_available.arm_flags.neon); strategies_available = true; } if (kvz_g_strategies_available.powerpc_flags.altivec != 0) { - fprintf(stderr, "altivec(%d) ", kvz_g_strategies_available.powerpc_flags.altivec); + if (logging) fprintf(stderr, "altivec(%d) ", kvz_g_strategies_available.powerpc_flags.altivec); strategies_available = true; } //If there is no strategies 
available if (!strategies_available){ - fprintf(stderr, "no SIMD optimizations"); + if (logging) fprintf(stderr, "no SIMD optimizations"); } - fprintf(stderr, "\n"); + if (logging) fprintf(stderr, "\n"); //SIMD optimizations in use bool strategies_in_use = false; - fprintf(stderr, "In use: "); + if (logging) fprintf(stderr, "In use: "); if (kvz_g_strategies_in_use.intel_flags.avx != 0){ - fprintf(stderr, "avx(%d) ", kvz_g_strategies_in_use.intel_flags.avx); + if (logging) fprintf(stderr, "avx(%d) ", kvz_g_strategies_in_use.intel_flags.avx); strategies_in_use = true; } if (kvz_g_strategies_in_use.intel_flags.avx2 != 0){ - fprintf(stderr, "avx2(%d) ", kvz_g_strategies_in_use.intel_flags.avx2); + if (logging) fprintf(stderr, "avx2(%d) ", kvz_g_strategies_in_use.intel_flags.avx2); strategies_in_use = true; } if (kvz_g_strategies_in_use.intel_flags.mmx != 0) { - fprintf(stderr, "mmx(%d) ", kvz_g_strategies_in_use.intel_flags.mmx); + if (logging) fprintf(stderr, "mmx(%d) ", kvz_g_strategies_in_use.intel_flags.mmx); strategies_in_use = true; } if (kvz_g_strategies_in_use.intel_flags.sse != 0) { - fprintf(stderr, "sse(%d) ", kvz_g_strategies_in_use.intel_flags.sse); + if (logging) fprintf(stderr, "sse(%d) ", kvz_g_strategies_in_use.intel_flags.sse); strategies_in_use = true; } if (kvz_g_strategies_in_use.intel_flags.sse2 != 0) { - fprintf(stderr, "sse2(%d) ", kvz_g_strategies_in_use.intel_flags.sse2); + if (logging) fprintf(stderr, "sse2(%d) ", kvz_g_strategies_in_use.intel_flags.sse2); strategies_in_use = true; } if (kvz_g_strategies_in_use.intel_flags.sse3 != 0) { - fprintf(stderr, "sse3(%d) ", kvz_g_strategies_in_use.intel_flags.sse3); + if (logging) fprintf(stderr, "sse3(%d) ", kvz_g_strategies_in_use.intel_flags.sse3); strategies_in_use = true; } if (kvz_g_strategies_in_use.intel_flags.sse41 != 0) { - fprintf(stderr, "sse41(%d) ", kvz_g_strategies_in_use.intel_flags.sse41); + if (logging) fprintf(stderr, "sse41(%d) ", kvz_g_strategies_in_use.intel_flags.sse41); 
strategies_in_use = true; } if (kvz_g_strategies_in_use.intel_flags.sse42 != 0) { - fprintf(stderr, "sse42(%d) ", kvz_g_strategies_in_use.intel_flags.sse42); + if (logging) fprintf(stderr, "sse42(%d) ", kvz_g_strategies_in_use.intel_flags.sse42); strategies_in_use = true; } if (kvz_g_strategies_in_use.intel_flags.ssse3 != 0) { - fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_in_use.intel_flags.ssse3); + if (logging) fprintf(stderr, "ssse3(%d) ", kvz_g_strategies_in_use.intel_flags.ssse3); strategies_in_use = true; } if (kvz_g_strategies_in_use.arm_flags.neon != 0) { - fprintf(stderr, "neon(%d) ", kvz_g_strategies_in_use.arm_flags.neon); + if (logging) fprintf(stderr, "neon(%d) ", kvz_g_strategies_in_use.arm_flags.neon); strategies_in_use = true; } if (kvz_g_strategies_in_use.powerpc_flags.altivec != 0) { - fprintf(stderr, "altivec(%d) ", kvz_g_strategies_in_use.powerpc_flags.altivec); + if (logging) fprintf(stderr, "altivec(%d) ", kvz_g_strategies_in_use.powerpc_flags.altivec); strategies_in_use = true; } //If there is no strategies in use if (!strategies_in_use){ - fprintf(stderr, "no SIMD optimizations"); + if (logging) fprintf(stderr, "no SIMD optimizations"); } - fprintf(stderr, "\n"); + if (logging) fprintf(stderr, "\n"); //Free memory free(strategies.strategies); @@ -449,7 +449,7 @@ # endif #endif //COMPILE_POWERPC -static void set_hardware_flags(int32_t cpuid) { +static void set_hardware_flags(int32_t cpuid, uint8_t logging) { FILL(kvz_g_hardware_flags, 0); #if COMPILE_INTEL @@ -535,59 +535,63 @@ } } - fprintf(stderr, "Compiled: INTEL, flags:"); + if (logging) { + fprintf(stderr, "Compiled: INTEL, flags:"); #if COMPILE_INTEL_MMX - fprintf(stderr, " MMX"); + fprintf(stderr, " MMX"); #endif #if COMPILE_INTEL_SSE - fprintf(stderr, " SSE"); + fprintf(stderr, " SSE"); #endif #if COMPILE_INTEL_SSE2 - fprintf(stderr, " SSE2"); + fprintf(stderr, " SSE2"); #endif #if COMPILE_INTEL_SSE3 - fprintf(stderr, " SSE3"); + fprintf(stderr, " SSE3"); #endif #if 
COMPILE_INTEL_SSSE3 - fprintf(stderr, " SSSE3"); + fprintf(stderr, " SSSE3"); #endif #if COMPILE_INTEL_SSE41 - fprintf(stderr, " SSE41"); + fprintf(stderr, " SSE41"); #endif #if COMPILE_INTEL_SSE42 - fprintf(stderr, " SSE42"); + fprintf(stderr, " SSE42"); #endif #if COMPILE_INTEL_AVX - fprintf(stderr, " AVX"); + fprintf(stderr, " AVX"); #endif #if COMPILE_INTEL_AVX2 - fprintf(stderr, " AVX2"); + fprintf(stderr, " AVX2"); #endif - fprintf(stderr, "\nDetected: INTEL, flags:"); - if (kvz_g_hardware_flags.intel_flags.mmx) fprintf(stderr, " MMX"); - if (kvz_g_hardware_flags.intel_flags.sse) fprintf(stderr, " SSE"); - if (kvz_g_hardware_flags.intel_flags.sse2) fprintf(stderr, " SSE2"); - if (kvz_g_hardware_flags.intel_flags.sse3) fprintf(stderr, " SSE3"); - if (kvz_g_hardware_flags.intel_flags.ssse3) fprintf(stderr, " SSSE3"); - if (kvz_g_hardware_flags.intel_flags.sse41) fprintf(stderr, " SSE41"); - if (kvz_g_hardware_flags.intel_flags.sse42) fprintf(stderr, " SSE42"); - if (kvz_g_hardware_flags.intel_flags.avx) fprintf(stderr, " AVX"); - if (kvz_g_hardware_flags.intel_flags.avx2) fprintf(stderr, " AVX2"); - fprintf(stderr, "\n"); + fprintf(stderr, "\nDetected: INTEL, flags:"); + if (kvz_g_hardware_flags.intel_flags.mmx) fprintf(stderr, " MMX"); + if (kvz_g_hardware_flags.intel_flags.sse) fprintf(stderr, " SSE"); + if (kvz_g_hardware_flags.intel_flags.sse2) fprintf(stderr, " SSE2"); + if (kvz_g_hardware_flags.intel_flags.sse3) fprintf(stderr, " SSE3"); + if (kvz_g_hardware_flags.intel_flags.ssse3) fprintf(stderr, " SSSE3"); + if (kvz_g_hardware_flags.intel_flags.sse41) fprintf(stderr, " SSE41"); + if (kvz_g_hardware_flags.intel_flags.sse42) fprintf(stderr, " SSE42"); + if (kvz_g_hardware_flags.intel_flags.avx) fprintf(stderr, " AVX"); + if (kvz_g_hardware_flags.intel_flags.avx2) fprintf(stderr, " AVX2"); + fprintf(stderr, "\n"); + } #endif //COMPILE_INTEL #if COMPILE_POWERPC if (cpuid) { kvz_g_hardware_flags.powerpc_flags.altivec = altivec_available(); } - - 
fprintf(stderr, "Compiled: PowerPC, flags:"); + + if (logging) { + fprintf(stderr, "Compiled: PowerPC, flags:"); #if COMPILE_POWERPC_ALTIVEC - fprintf(stderr, " AltiVec"); + fprintf(stderr, " AltiVec"); #endif - fprintf(stderr, "\nDetected: PowerPC, flags:"); - if (kvz_g_hardware_flags.powerpc_flags.altivec) fprintf(stderr, " AltiVec"); - fprintf(stderr, "\n"); + fprintf(stderr, "\nDetected: PowerPC, flags:"); + if (kvz_g_hardware_flags.powerpc_flags.altivec) fprintf(stderr, " AltiVec"); + fprintf(stderr, "\n"); + } #endif }
View file
kvazaar-2.2.0.tar.gz/src/strategyselector.h -> kvazaar-2.3.0.tar.gz/src/strategyselector.h
Changed
@@ -95,7 +95,7 @@ extern hardware_flags_t kvz_g_strategies_in_use; extern hardware_flags_t kvz_g_strategies_available; -int kvz_strategyselector_init(int32_t cpuid, uint8_t bitdepth); +int kvz_strategyselector_init(int32_t cpuid, uint8_t bitdepth, uint8_t enable_logging_output); int kvz_strategyselector_register(void *opaque, const char *type, const char *strategy_name, int priority, void *fptr);
View file
kvazaar-2.2.0.tar.gz/src/threads.h -> kvazaar-2.3.0.tar.gz/src/threads.h
Changed
@@ -42,6 +42,10 @@ #include <pthread.h> +#ifdef __APPLE__ +#include <AvailabilityMacros.h> +#endif + #if defined(__GNUC__) && !defined(__MINGW32__) #include <unistd.h> // IWYU pragma: export #include <time.h> // IWYU pragma: export @@ -84,9 +88,9 @@ #endif //__GNUC__ -#ifdef __APPLE__ -// POSIX semaphores are deprecated on Mac so we use Grand Central Dispatch -// semaphores instead. +#if defined(__APPLE__) && MAC_OS_X_VERSION_MIN_REQUIRED > 1050 && !defined(__ppc__) +// POSIX semaphores are deprecated on Mac so we use Grand Central Dispatch semaphores instead. +// However GCD is supported only on 10.6+, and is not supported on any ppc, including 10.6 Rosetta. #include <dispatch/dispatch.h> typedef dispatch_semaphore_t kvz_sem_t; @@ -113,7 +117,7 @@ } #else -// Use POSIX semaphores. +// Use POSIX semaphores. This is also a fallback for old Darwin. #include <semaphore.h> typedef sem_t kvz_sem_t;
View file
kvazaar-2.2.0.tar.gz/src/transform.c -> kvazaar-2.3.0.tar.gz/src/transform.c
Changed
@@ -340,7 +340,6 @@ const kvz_pixel *ref = NULL; // Pointers to current location in arrays with quantized coefficients. coeff_t *coeff = NULL; - switch (color) { case COLOR_Y: pred = &lcu->rec.yoffset;
View file
kvazaar-2.3.0.tar.gz/src/version.h.in
Added
@@ -0,0 +1,39 @@ +#pragma once +/***************************************************************************** + * This file is part of kvazaar HEVC encoder. + * + * Copyright (c) 2022, Tampere University, ITU/ISO/IEC, project contributors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * + * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ****************************************************************************/ + +#ifndef KVZ_VERSION +#define KVZ_VERSION @PROJECT_VERSION@ +#endif +#define KVZ_COMPILER_STRING "@KVZ_COMPILER_STRING@" +#define KVZ_COMPILE_DATE "@CMAKE_BUILD_DATE@" +#define VERSION_STRING QUOTE_EXPAND(KVZ_VERSION)
View file
kvazaar-2.3.0.tar.gz/tests/CMakeLists.txt
Added
@@ -0,0 +1,43 @@ +file( GLOB TEST_SOURCES "*.c" ) + +# ToDo: fix the tests +list(REMOVE_ITEM TEST_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/inter_recon_bipred_tests.c") + +add_executable(kvazaar_tests ${TEST_SOURCES} ) + +target_include_directories(kvazaar_tests PUBLIC ${PROJECT_SOURCE_DIR}) +target_include_directories(kvazaar_tests PUBLIC ${PROJECT_SOURCE_DIR}/src) +target_include_directories(kvazaar_tests PUBLIC ${PROJECT_SOURCE_DIR}/src/extras) + +add_definitions(-DKVZ_DLL_EXPORTS) + +if(BUILD_SHARED_LIBS) + add_definitions(-DPIC) +endif() + +if(MSVC) + target_include_directories(kvazaar_tests PUBLIC ../src/threadwrapper/include) + + set_property( SOURCE ${TEST_SOURCES} APPEND PROPERTY COMPILE_FLAGS "/arch:AVX2" ) + add_definitions(-DWIN32_LEAN_AND_MEAN -D_WIN32 -DWIN32 -DWIN64) +else() + list(APPEND ALLOW_AVX2 "x86_64" "AMD64") + if(${CMAKE_SYSTEM_PROCESSOR} IN_LIST ALLOW_AVX2) + set_property( SOURCE ${TEST_SOURCES} APPEND PROPERTY COMPILE_FLAGS "-mavx2 -mbmi -mpopcnt -mlzcnt -mbmi2" ) + endif() + find_package(Threads REQUIRED) + target_link_libraries(kvazaar_tests PUBLIC Threads::Threads) + + include(CheckLibraryExists) + + CHECK_LIBRARY_EXISTS(m sin "" HAVE_LIB_M) + + if (HAVE_LIB_M) + set(EXTRA_LIBS ${EXTRA_LIBS} m) + endif (HAVE_LIB_M) + + target_link_libraries(kvazaar_tests PUBLIC ${EXTRA_LIBS}) +endif() + +target_link_libraries(kvazaar_tests PUBLIC kvazaar) +
View file
kvazaar-2.2.0.tar.gz/tests/test_strategies.c -> kvazaar-2.3.0.tar.gz/tests/test_strategies.c
Changed
@@ -45,7 +45,7 @@ strategies.strategies = NULL; // Init strategyselector because it sets hardware flags. - kvz_strategyselector_init(1, KVZ_BIT_DEPTH); + kvz_strategyselector_init(1, KVZ_BIT_DEPTH, 1); // Collect all strategies to be tested. if (!kvz_strategy_register_picture(&strategies, KVZ_BIT_DEPTH)) {
Locations
Projects
Search
Status Monitor
Help
Open Build Service
OBS Manuals
API Documentation
OBS Portal
Reporting a Bug
Contact
Mailing List
Forums
Chat (IRC)
Twitter
Open Build Service (OBS)
is an
openSUSE project
.