diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 501f20cbfeb3d1d0208d4618953f57067a9fea02..698cc8105f4bafa9f7855516003046e88ba43ab8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -87,7 +87,7 @@ build:gcc9:opt:python3: variables: BINARY_TAG: x86_64-centos7-gcc9-opt BUILDDIR: build-opt-python3 - HEPTOOLS_VERSION: 97python3 + HEPTOOLS_VERSION: 97apython3 script: - ci-utils/build artifacts: @@ -158,6 +158,7 @@ test:gcc8:opt: - ${TESTS_REPORT} when: always expire_in: 1 week + retry: 2 test:gcc9:opt: stage: test @@ -175,6 +176,7 @@ test:gcc9:opt: - ${TESTS_REPORT} when: always expire_in: 1 week + retry: 2 test:gcc9:dbg: stage: test @@ -192,6 +194,7 @@ test:gcc9:dbg: - ${TESTS_REPORT} when: always expire_in: 1 week + retry: 2 test:gcc9:opt:python3: stage: test @@ -209,6 +212,7 @@ test:gcc9:opt:python3: - ${TESTS_REPORT} when: always expire_in: 1 week + retry: 2 test_public_headers_build: stage: test diff --git a/CHANGELOG.md b/CHANGELOG.md index 09f40d702564197afe9ee4f82fd75d5290b182f4..3a29fc6a70f0b0d268333ccccdf15a13190f0cf7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,51 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +## [v33r2][] - 2020-06-19 +This is a bugfix release with just a couple of minor backward compatible improvements. + +We also updated the LCG baseline version to [LCG 97a](http://lcginfo.cern.ch/release/97a/) (ROOT 6.20/06). + +This is the last release with the legacy CMake configuration. From the next release (v34r0) +we will use a modern CMake configuration (see gaudi/Gaudi!922 and gaudi/Gaudi!986 for details). + +### Added +- Improve event context extension management (gaudi/Gaudi!1080) +- Introducing Set-semantics (gaudi/Gaudi!1066) +- Make gitlab use cpp syntax highlighting for icpp files (gaudi/Gaudi!1057) + +### Fixed +- Fixes for ROOT master (gaudi/Gaudi!1071) +- Use fmtlib instead of `boost::format` (where possible) (gaudi/Gaudi!1046, gaudi/Gaudi#112) +- Conditions alg test, with stall debug info (gaudi/Gaudi!1092) +- Make sure the TES is cleaned by `EventLoopMgr` before finalize (gaudi/Gaudi!1082, LHCBPS-1850) +- Fix race condition between tests (gaudi/Gaudi!1074) +- Fix propagation of `StopIteration` exception (gaudi/Gaudi!1093, gaudi/Gaudi#129) +- Update HiveDataBroker to use C++20 ranges if available. (gaudi/Gaudi!1089) +- Fixes for C++20 (gaudi/Gaudi!1085) +- Fix thread-safety problem in GaudiHandle (gaudi/Gaudi!1084) +- Misc. 
sanitiser improvements (gaudi/Gaudi!1081) +- Make `GaudiHandleArray.__str__` consistent with `GaudiHandle.__str__` (gaudi/Gaudi!1070) +- Fix trivial typo in message (gaudi/Gaudi!1087) +- `GaudiConfig2`: fix unpickling of derived Configurables (gaudi/Gaudi!1076, gaudi/Gaudi#124) +- Remove data flow dumps or make them configurable (gaudi/Gaudi!1075) +- Change `StatusCode::orThrow` so that the return value can be ignored (gaudi/Gaudi!1073, gaudi/Gaudi#118) +- Fix bugs in `ThreadPoolSvc` (gaudi/Gaudi!1068) +- Sequence semantics update (gaudi/Gaudi!1069) +- Remove now unused `ATLAS_GAUDI_V21` and `HAVE_GAUDI_PLUGINSVC` definitions (gaudi/Gaudi!1062) +- Few fixes to `GaudiConfig2` (gaudi/Gaudi!1061) +- Fix compatibility with C++ GSL 3 (gaudi/Gaudi!1063, gaudi/Gaudi#121) +- Consolidate and optimize scheduling of blocking tasks (gaudi/Gaudi!1051) +- Do not run public headers build test on generated headers (gaudi/Gaudi!974) +- Add array `operator<<` into `SerializeSTL.h` (gaudi/Gaudi!1049) +- Dereference instead of `operator*` in `get_from_handle` to enable `Gaudi::Ranges` in `MergingTransformer`s (gaudi/Gaudi!1047) +- Fixes for `GetHostBinaryTag` (gaudi/Gaudi!1058) +- Remove unused capture of `this` in `MultiTransformerFilter` (gaudi/Gaudi!1059) +- Fix deprecated copy constructor warning in `StreamBuffer::ContainedLink` (gaudi/Gaudi!1060) +- `AlgExecStateSvc`: require minimum state size to be 1 (gaudi/Gaudi!1055) +- `IncidentSvc`: improve error reporting (gaudi/Gaudi!1054) +- Remove mention of deprecated `tbb::recursive_mutex` (gaudi/Gaudi!1053) + ## [v33r1][] - 2020-03-23 This release contains multiple fixes and some ABI changes, in particular: - `StatusCode` values not checked now produce a compile time warning. @@ -482,7 +527,8 @@ Details about old versions of the project can be found in the [GaudiRelease/doc](GaudiRelease/doc). 
-[Unreleased]: https://gitlab.cern.ch/gaudi/Gaudi/compare/v33r1...master +[Unreleased]: https://gitlab.cern.ch/gaudi/Gaudi/compare/v33r2...master +[v33r2]: https://gitlab.cern.ch/gaudi/Gaudi/compare/v33r1...v33r2 [v33r1]: https://gitlab.cern.ch/gaudi/Gaudi/compare/v33r0...v33r1 [v33r0]: https://gitlab.cern.ch/gaudi/Gaudi/compare/v32r2...v33r0 [v32r2]: https://gitlab.cern.ch/gaudi/Gaudi/compare/v32r1...v32r2 diff --git a/CMakeLists.txt b/CMakeLists.txt index fae498e907482f37b7016ddd84b50a5fcb0bd359..44c07eadef7f36d3dd790d0a7759401717192b59 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,7 @@ endif() include(${CMAKE_SOURCE_DIR}/cmake/externals.cmake) # Declare project name and version -gaudi_project(Gaudi v33r1) +gaudi_project(Gaudi v33r2) # Special hack to make sure downstream projects see fmt::fmt # This is going to be much cleaner with gaudi/Gaudi!986 (CMake modernization) diff --git a/GaudiKernel/CMakeLists.txt b/GaudiKernel/CMakeLists.txt index 828ae6495f9880ca63a220c1a5a071a99819734f..2095c12e03b62181f2bbcf0a63c26b76f84a148d 100644 --- a/GaudiKernel/CMakeLists.txt +++ b/GaudiKernel/CMakeLists.txt @@ -33,12 +33,14 @@ if(VectorClass_INCLUDE_DIR AND NOT VectorClass_VERSION) set(VectorClass_VERSION 0.0) file(STRINGS ${VectorClass_INCLUDE_DIR}/instrset.h _vectorclass_guard REGEX "define +INSTRSET_H +[0-9]+") list(GET _vectorclass_guard 0 _vectorclass_guard) - if(_vectorclass_guard MATCHES "INSTRSET_H +([0-9]+)") + if(_vectorclass_guard MATCHES "INSTRSET_H +([0-9][0-9][0-9][0-9][0-9])") + string(REGEX REPLACE "([0-9]+)([0-9][0-9])([0-9][0-9])" "\\1.\\2.\\3" VectorClass_VERSION "${CMAKE_MATCH_1}") + elseif(_vectorclass_guard MATCHES "INSTRSET_H +([0-9][0-9][0-9])") string(REGEX REPLACE "([0-9]+)([0-9][0-9])" "\\1.\\2" VectorClass_VERSION "${CMAKE_MATCH_1}") endif() set(VectorClass_VERSION "${VectorClass_VERSION}" CACHE INTERNAL "") endif() -if(NOT VectorClass_INCLUDE_DIR OR VectorClass_VERSION VERSION_LESS 1.25) +if(NOT VectorClass_INCLUDE_DIR OR VectorClass_VERSION VERSION_LESS 2.01.02) if(VectorClass_INCLUDE_DIR) message(STATUS "Found VectorClass instrset_detect ${VectorClass_VERSION} at ${VectorClass_INCLUDE_DIR}") endif() diff --git a/GaudiKernel/src/Util/instructionsetLevel.cpp b/GaudiKernel/src/Util/instructionsetLevel.cpp index 0a228ab870f8aff6b66b56a81eec6d649a74d238..e7123a5ec0326e62c520120b535a2672212dd381 100644 --- a/GaudiKernel/src/Util/instructionsetLevel.cpp +++ b/GaudiKernel/src/Util/instructionsetLevel.cpp @@ -28,7 +28,7 @@ int main( int argc, char* argv[] ) { const std::vector<std::string> sets{// "80386", "sse", "sse2", "sse3", // Note: This is reported by Linux as 'pni' - "ssse3", "sse4_1", "sse4_2", "avx", "avx2", "avx512f", "avx512vl", "avx512bw"}; + "ssse3", "sse4_1", "sse4_2", "avx", "avx2", "avx512f", "avx512vl"}; if ( argc == 1 ) { const std::size_t level = System::instructionsetLevel() - 1; diff --git a/GaudiKernel/src/contrib/instrset.h b/GaudiKernel/src/contrib/instrset.h index ea4a39d84b0e4d68e946503d64ac25fe9b734926..8fb762ed4645c9b00b4c080e927867af04cc0a1e 100644 --- a/GaudiKernel/src/contrib/instrset.h +++ b/GaudiKernel/src/contrib/instrset.h @@ -1,34 +1,59 @@ /**************************** instrset.h ********************************** * Author: Agner Fog * Date created: 2012-05-30 - * Last modified: 2016-11-25 - * Version: 1.25 - * Project: vector classes + * Last modified: 2020-04-01 + * Version: 2.01.02 + * Project: vector class library * Description: - * Header file for various compiler-specific tasks and other common tasks to - * vector class 
library: - * > selects the supported instruction set - * > defines integer types - * > defines compiler version macros - * > undefines certain macros that prevent function overloading - * > defines template class to represent compile-time integer constant - * > defines template for compile-time error messages + * Header file for various compiler-specific tasks as well as common + * macros and templates. This file contains: * - * (c) Copyright 2012-2016 GNU General Public License www.gnu.org/licenses + * > Selection of the supported instruction set + * > Defines compiler version macros + * > Undefines certain macros that prevent function overloading + * > Helper functions that depend on instruction set, compiler, or platform + * > Common templates for permute, blend, etc. + * + * For instructions, see vcl_manual.pdf + * + * (c) Copyright 2012-2020 Agner Fog. + * Apache License version 2.0 or later. ******************************************************************************/ #ifndef INSTRSET_H -#define INSTRSET_H 125 +#define INSTRSET_H 20102 -// Detect 64 bit mode +// Allow the use of floating point permute instructions on integer vectors. +// Some CPU's have an extra latency of 1 or 2 clock cycles for this, but +// it may still be faster than alternative implementations: +#define ALLOW_FP_PERMUTE true + +// Macro to indicate 64 bit mode #if ( defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ) ) && !defined( __x86_64__ ) # define __x86_64__ 1 // There are many different macros for this, decide on only one #endif -// Find instruction set from compiler macros if INSTRSET not defined +// The following values of INSTRSET are currently defined: +// 2: SSE2 +// 3: SSE3 +// 4: SSSE3 +// 5: SSE4.1 +// 6: SSE4.2 +// 7: AVX +// 8: AVX2 +// 9: AVX512F +// 10: AVX512BW/DQ/VL +// In the future, INSTRSET = 11 may include AVX512VBMI and AVX512VBMI2, but this +// decision cannot be made before the market situation for CPUs with these +// instruction sets is known (these future instruction set extensions are already +// used in some VCL functions and tested with an emulator) + +// Find instruction set from compiler macros if INSTRSET is not defined. 
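+// Illustration (not part of the upstream VCL header): code that includes this
+// header can branch on the selected INSTRSET level at compile time. A minimal
+// sketch; the helper name simd_level_name is hypothetical:
+//
+//   #include "instrset.h"
+//   static inline const char* simd_level_name() {
+//   #if INSTRSET >= 10
+//     return "AVX512BW/DQ/VL";
+//   #elif INSTRSET >= 8
+//     return "AVX2";
+//   #elif INSTRSET >= 2
+//     return "SSE2";
+//   #else
+//     return "80386";
+//   #endif
+//   }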
// Note: Most of these macros are not defined in Microsoft compilers #ifndef INSTRSET -# if defined( __AVX512F__ ) || defined( __AVX512__ ) +# if defined( __AVX512VL__ ) && defined( __AVX512BW__ ) && defined( __AVX512DQ__ ) +# define INSTRSET 10 +# elif defined( __AVX512F__ ) || defined( __AVX512__ ) # define INSTRSET 9 # elif defined( __AVX2__ ) # define INSTRSET 8 @@ -60,7 +85,7 @@ // sets are specified on the compiler command line, such as: // xopintrin.h, fma4intrin.h # else -# include <immintrin.h> // MS version of immintrin.h covers AVX, AVX2 and FMA3 +# include <immintrin.h> // MS/Intel version of immintrin.h covers AVX and later # endif // __GNUC__ #elif INSTRSET == 7 # include <immintrin.h> // AVX @@ -80,10 +105,12 @@ #if INSTRSET >= 8 && !defined( __FMA__ ) // Assume that all processors that have AVX2 also have FMA3 -# if defined( __GNUC__ ) && !defined( __INTEL_COMPILER ) && !defined( __clang__ ) -// Prevent error message in g++ when using FMA intrinsics with avx2: -# pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher" -# else +# if defined( __GNUC__ ) && !defined( __INTEL_COMPILER ) +// Prevent error message in g++ and Clang when using FMA intrinsics with avx2: +# if !defined( DISABLE_WARNING_AVX2_WITHOUT_FMA ) +# pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher" +# endif +# elif !defined( __clang__ ) # define __FMA__ 1 # endif #endif @@ -109,52 +136,14 @@ # include <fma4intrin.h> // must have both x86intrin.h and fma4intrin.h, don't know why #endif // __FMA4__ -// Define integer types with known size -#if defined( __GNUC__ ) || defined( __clang__ ) || ( defined( _MSC_VER ) && _MSC_VER >= 1600 ) -// Compilers supporting C99 or C++0x have stdint.h defining these integer types -# include <stdint.h> -#elif defined( _MSC_VER ) -// Older Microsoft compilers have their own definitions -typedef signed __int8 int8_t; -typedef unsigned __int8 uint8_t; -typedef signed __int16 int16_t; -typedef unsigned __int16 uint16_t; -typedef signed __int32 int32_t; -typedef unsigned __int32 uint32_t; -typedef signed __int64 int64_t; -typedef unsigned __int64 uint64_t; -# ifndef _INTPTR_T_DEFINED -# define _INTPTR_T_DEFINED -# ifdef __x86_64__ -typedef int64_t intptr_t; -# else -typedef int32_t intptr_t; -# endif -# endif -#else -// This works with most compilers -typedef signed char int8_t; -typedef unsigned char uint8_t; -typedef signed short int int16_t; -typedef unsigned short int uint16_t; -typedef signed int int32_t; -typedef unsigned int uint32_t; -typedef long long int64_t; -typedef unsigned long long uint64_t; -# ifdef __x86_64__ -typedef int64_t intptr_t; -# else -typedef int32_t intptr_t; -# endif -#endif - +#include <stdint.h> // Define integer types with known size #include <stdlib.h> // define abs(int) #ifdef _MSC_VER // Microsoft compiler or compatible Intel compiler # include <intrin.h> // define _BitScanReverse(int), __cpuid(int[4],int), _xgetbv(int) #endif // _MSC_VER -// functions in instrset_detect.cpp +// functions in instrset_detect.cpp: #ifdef VCL_NAMESPACE namespace VCL_NAMESPACE { #endif @@ -163,10 +152,15 @@ namespace VCL_NAMESPACE { bool hasFMA4( void ); // true if FMA4 instructions supported bool hasXOP( void ); // true if XOP instructions supported bool hasAVX512ER( void ); // true if AVX512ER instructions supported + bool hasAVX512VBMI( void ); // true if AVX512VBMI instructions supported + bool hasAVX512VBMI2( void ); // true if AVX512VBMI2 instructions supported #ifdef VCL_NAMESPACE } 
#endif +// functions in physical_processors.cpp: +int physicalProcessors( int* logical_processors = 0 ); + // GCC version #if defined( __GNUC__ ) && !defined( GCC_VERSION ) && !defined( __clang__ ) # define GCC_VERSION ( (__GNUC__)*10000 + (__GNUC_MINOR__)*100 + ( __GNUC_PATCHLEVEL__ ) ) @@ -191,9 +185,216 @@ namespace VCL_NAMESPACE { # endif #endif +/* Intel compiler problem: +The Intel compiler currently cannot compile version 2.00 of VCL. It seems to have +a problem with constexpr function returns not being constant enough. +*/ +#if defined( __INTEL_COMPILER ) && __INTEL_COMPILER < 9999 +# error The Intel compiler version 19.00 cannot compile VCL version 2. Use Version 1.xx of VCL instead +#endif + +/* Clang problem: +The Clang compiler treats the intrinsic vector types __m128, __m128i, and __m128d as identical. +See the bug report at https://bugs.llvm.org/show_bug.cgi?id=17164 +Additional problem: The version number is not consistent across platforms. The Apple build has +different version numbers. We have to rely on __apple_build_version__ on the Mac platform: +http://llvm.org/bugs/show_bug.cgi?id=12643 +We have to make switches here when - hopefully - the error some day has been fixed. +We need different version checks with and without __apple_build_version__ +*/ +#if ( defined( __clang__ ) || defined( __apple_build_version__ ) ) && !defined( __INTEL_COMPILER ) +# define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY +#endif + +#if defined( GCC_VERSION ) && GCC_VERSION < 99999 && !defined( __clang__ ) +# define ZEXT_MISSING // Gcc 7.4.0 does not have _mm256_zextsi128_si256 and similar functions +#endif + #ifdef VCL_NAMESPACE namespace VCL_NAMESPACE { #endif + + // Constant for indicating don't care in permute and blend functions. + // V_DC is -256 in Vector class library version 1.xx + // V_DC can be any value less than -1 in Vector class library version 2.00 + constexpr int V_DC = -256; + + /***************************************************************************** + * + * Helper functions that depend on instruction set, compiler, or platform + * + *****************************************************************************/ + + // Define interface to cpuid instruction. + // input: functionnumber = leaf (eax), ecxleaf = subleaf(ecx) + // output: output[0] = eax, output[1] = ebx, output[2] = ecx, output[3] = edx + static inline void cpuid( int output[4], int functionnumber, int ecxleaf = 0 ) { +#if defined( __GNUC__ ) || defined( __clang__ ) // use inline assembly, Gnu/AT&T syntax + int a, b, c, d; + __asm( "cpuid" : "=a"( a ), "=b"( b ), "=c"( c ), "=d"( d ) : "a"( functionnumber ), "c"( ecxleaf ) : ); + output[0] = a; + output[1] = b; + output[2] = c; + output[3] = d; + +#elif defined( _MSC_VER ) // Microsoft compiler, intrin.h included + __cpuidex( output, functionnumber, ecxleaf ); // intrinsic function for CPUID + +#else // unknown platform. try inline assembly with masm/intel syntax + __asm { + mov eax, functionnumber + mov ecx, ecxleaf + cpuid; + mov esi, output + mov[esi], eax + mov[esi + 4], ebx + mov[esi + 8], ecx + mov[esi + 12], edx + } +#endif + } + +// Define popcount function. Gives sum of bits +#if INSTRSET >= 6 // SSE4.2 + // popcnt instruction is not officially part of the SSE4.2 instruction set, + // but available in all known processors with SSE4.2 + static inline uint32_t vml_popcnt( uint32_t a ) { + return (uint32_t)_mm_popcnt_u32( a ); // Intel intrinsic. 
Supported by gcc and clang + } +# ifdef __x86_64__ + static inline int64_t vml_popcnt( uint64_t a ) { + return _mm_popcnt_u64( a ); // Intel intrinsic. + } +# else // 32 bit mode + static inline int64_t vml_popcnt( uint64_t a ) { + return _mm_popcnt_u32( uint32_t( a >> 32 ) ) + _mm_popcnt_u32( uint32_t( a ) ); + } +# endif +#else // no SSE4.2 +static inline uint32_t vml_popcnt( uint32_t a ) { + // popcnt instruction not available + uint32_t b = a - ( ( a >> 1 ) & 0x55555555 ); + uint32_t c = ( b & 0x33333333 ) + ( ( b >> 2 ) & 0x33333333 ); + uint32_t d = ( c + ( c >> 4 ) ) & 0x0F0F0F0F; + uint32_t e = d * 0x01010101; + return e >> 24; +} + +static inline int32_t vml_popcnt( uint64_t a ) { + return vml_popcnt( uint32_t( a >> 32 ) ) + vml_popcnt( uint32_t( a ) ); +} + +#endif + +// Define bit-scan-forward function. Gives index to lowest set bit +#if defined( __GNUC__ ) || defined( __clang__ ) + // gcc and Clang have no bit_scan_forward intrinsic +# if defined( __clang__ ) // fix clang bug + // Clang uses a k register as parameter a when inlined from horizontal_find_first + __attribute__( ( noinline ) ) +# endif + static uint32_t + bit_scan_forward( uint32_t a ) { + uint32_t r; + __asm( "bsfl %1, %0" : "=r"( r ) : "r"( a ) : ); + return r; + } + static inline uint32_t bit_scan_forward( uint64_t a ) { + uint32_t lo = uint32_t( a ); + if ( lo ) return bit_scan_forward( lo ); + uint32_t hi = uint32_t( a >> 32 ); + return bit_scan_forward( hi ) + 32; + } + +#else // other compilers +static inline uint32_t bit_scan_forward( uint32_t a ) { + unsigned long r; + _BitScanForward( &r, a ); // defined in intrin.h for MS and Intel compilers + return r; +} +# ifdef __x86_64__ +static inline uint32_t bit_scan_forward( uint64_t a ) { + unsigned long r; + _BitScanForward64( &r, a ); // defined in intrin.h for MS and Intel compilers + return (uint32_t)r; +} +# else +static inline uint32_t bit_scan_forward( uint64_t a ) { + uint32_t lo = uint32_t( a ); + if ( lo ) return bit_scan_forward( lo ); + uint32_t hi = uint32_t( a >> 32 ); + return bit_scan_forward( hi ) + 32; +} +# endif +#endif + +// Define bit-scan-reverse function. 
Gives index to highest set bit = floor(log2(a)) +#if defined( __GNUC__ ) || defined( __clang__ ) + static inline uint32_t bit_scan_reverse( uint32_t a ) __attribute__( ( pure ) ); + static inline uint32_t bit_scan_reverse( uint32_t a ) { + uint32_t r; + __asm( "bsrl %1, %0" : "=r"( r ) : "r"( a ) : ); + return r; + } +# ifdef __x86_64__ + static inline uint32_t bit_scan_reverse( uint64_t a ) { + uint64_t r; + __asm( "bsrq %1, %0" : "=r"( r ) : "r"( a ) : ); + return r; + } +# else // 32 bit mode + static inline uint32_t bit_scan_reverse( uint64_t a ) { + uint64_t ahi = a >> 32; + if ( ahi == 0 ) + return bit_scan_reverse( uint32_t( a ) ); + else + return bit_scan_reverse( uint32_t( ahi ) ) + 32; + } +# endif +#else +static inline uint32_t bit_scan_reverse( uint32_t a ) { + unsigned long r; + _BitScanReverse( &r, a ); // defined in intrin.h for MS and Intel compilers + return r; +} +# ifdef __x86_64__ +static inline uint32_t bit_scan_reverse( uint64_t a ) { + unsigned long r; + _BitScanReverse64( &r, a ); // defined in intrin.h for MS and Intel compilers + return r; +} +# else // 32 bit mode +static inline uint32_t bit_scan_reverse( uint64_t a ) { + uint64_t ahi = a >> 32; + if ( ahi == 0 ) + return bit_scan_reverse( uint32_t( a ) ); + else + return bit_scan_reverse( uint32_t( ahi ) ) + 32; +} +# endif +#endif + + // Same function, for compile-time constants + constexpr int bit_scan_reverse_const( uint64_t const n ) { + if ( n == 0 ) return -1; + uint64_t a = n, b = 0, j = 64, k = 0; + do { + j >>= 1; + k = (uint64_t)1 << j; + if ( a >= k ) { + a >>= j; + b += j; + } + } while ( j > 0 ); + return int( b ); + } + + /***************************************************************************** + * + * Common templates + * + *****************************************************************************/ + // Template class to represent compile-time integer constant template <int32_t n> class Const_int_t {}; // represent compile-time signed integer constant @@ -202,17 +403,1028 @@ namespace VCL_NAMESPACE { #define const_int( n ) ( Const_int_t<n>() ) // n must be compile-time integer constant #define const_uint( n ) ( Const_uint_t<n>() ) // n must be compile-time unsigned integer constant - // Template for compile-time error messages - template <bool> - class Static_error_check { - public: - Static_error_check(){}; - }; - template <> - class Static_error_check<false> { // generate compile-time error if false - private: - Static_error_check(){}; + // template for producing quiet NAN + template <class VTYPE> + static inline VTYPE nan_vec( uint32_t payload = 0x100 ) { + if constexpr ( ( VTYPE::elementtype() & 1 ) != 0 ) { // double + union { + uint64_t q; + double f; + } ud; + // n is left justified to avoid loss of NAN payload when converting to float + ud.q = 0x7FF8000000000000 | uint64_t( payload ) << 29; + return VTYPE( ud.f ); + } + // float will be converted to double if necessary + union { + uint32_t i; + float f; + } uf; + uf.i = 0x7FC00000 | ( payload & 0x003FFFFF ); + return VTYPE( uf.f ); + } + + // Test if a parameter is a compile-time constant + /* Unfortunately, this works only for macro parameters, not for inline function parameters. + I hope that some solution will appear in the future, but for now it appears to be + impossible to check if a function parameter is a compile-time constant. 
+ This would be useful in operator / and in function pow: + #if defined(__GNUC__) || defined (__clang__) + #define is_constant(a) __builtin_constant_p(a) + #else + #define is_constant(a) false + #endif + */ + + /***************************************************************************** + * + * Helper functions for permute and blend functions + * + ****************************************************************************** + Rules for constexpr functions: + + > All variable declarations must include initialization + + > Do not put variable declarations inside a for-clause, e.g. avoid: for (int i=0; .. + Instead, you have to declare the loop counter before the for-loop. + + > Do not make constexpr functions that return vector types. This requires type + punning with a union, which is not allowed in constexpr functions under C++17. + It may be possible under C++20 + + *****************************************************************************/ + + // Define type for Encapsulated array to use as return type: + template <typename T, int N> + struct EList { + T a[N]; }; + + // get_inttype: get an integer of a size that matches the element size + // of vector class V with the value -1 + template <typename V> + constexpr auto get_inttype() { + constexpr int elementsize = sizeof( V ) / V::size(); // size of vector elements + + if constexpr ( elementsize >= 8 ) { + return -int64_t( 1 ); + } else if constexpr ( elementsize >= 4 ) { + return int32_t( -1 ); + } else if constexpr ( elementsize >= 2 ) { + return int16_t( -1 ); + } else { + return int8_t( -1 ); + } + } + + // zero_mask: return a compact bit mask for zeroing using AVX512 mask. + // Parameter a is a reference to a constexpr int array of permutation indexes + template <int N> + constexpr auto zero_mask( int const ( &a )[N] ) { + uint64_t mask = 0; + int i = 0; + + for ( i = 0; i < N; i++ ) { + if ( a[i] >= 0 ) mask |= uint64_t( 1 ) << i; + } + if constexpr ( N <= 8 ) + return uint8_t( mask ); + else if constexpr ( N <= 16 ) + return uint16_t( mask ); + else if constexpr ( N <= 32 ) + return uint32_t( mask ); + else + return mask; + } + + // zero_mask_broad: return a broad byte mask for zeroing. + // Parameter A is a reference to a constexpr int array of permutation indexes + template <typename V> + constexpr auto zero_mask_broad( int const ( &A )[V::size()] ) { + constexpr int N = V::size(); // number of vector elements + typedef decltype( get_inttype<V>() ) Etype; // element type + EList<Etype, N> u = {{0}}; // list for return + int i = 0; + for ( i = 0; i < N; i++ ) { u.a[i] = A[i] >= 0 ? get_inttype<V>() : 0; } + return u; // return encapsulated array + } + + // make_bit_mask: return a compact mask of bits from a list of N indexes: + // B contains options indicating how to gather the mask + // bit 0-7 in B indicates which bit in each index to collect + // bit 8 = 0x100: set 1 in the lower half of the bit mask if the indicated bit is 1. + // bit 8 = 0 : set 1 in the lower half of the bit mask if the indicated bit is 0. + // bit 9 = 0x200: set 1 in the upper half of the bit mask if the indicated bit is 1. + // bit 9 = 0 : set 1 in the upper half of the bit mask if the indicated bit is 0. 
+ // bit 10 = 0x400: set 1 in the bit mask if the corresponding index is -1 or V_DC + // Parameter a is a reference to a constexpr int array of permutation indexes + template <int N, int B> + constexpr uint64_t make_bit_mask( int const ( &a )[N] ) { + uint64_t r = 0; // return value + uint8_t j = uint8_t( B & 0xFF ); // index to selected bit + uint64_t s = 0; // bit number i in r + uint64_t f = 0; // 1 if bit not flipped + int i = 0; + for ( i = 0; i < N; i++ ) { + int ix = a[i]; + if ( ix < 0 ) { // -1 or V_DC + s = ( B >> 10 ) & 1; + } else { + s = ( (uint32_t)ix >> j ) & 1; // extract selected bit + if ( i < N / 2 ) { + f = ( B >> 8 ) & 1; // lower half + } else { + f = ( B >> 9 ) & 1; // upper half + } + s ^= f ^ 1; // flip bit if needed + } + r |= uint64_t( s ) << i; // set bit in return value + } + return r; + } + + // make_broad_mask: Convert a bit mask m to a broad mask + // The return value will be a broad boolean mask with elementsize matching vector class V + template <typename V> + constexpr auto make_broad_mask( uint64_t const m ) { + constexpr int N = V::size(); // number of vector elements + typedef decltype( get_inttype<V>() ) Etype; // element type + EList<Etype, N> u = {{0}}; // list for returning + int i = 0; + for ( i = 0; i < N; i++ ) { u.a[i] = ( ( m >> i ) & 1 ) != 0 ? get_inttype<V>() : 0; } + return u; // return encapsulated array + } + + // perm_mask_broad: return a mask for permutation by a vector register index. + // Parameter A is a reference to a constexpr int array of permutation indexes + template <typename V> + constexpr auto perm_mask_broad( int const ( &A )[V::size()] ) { + constexpr int N = V::size(); // number of vector elements + typedef decltype( get_inttype<V>() ) Etype; // vector element type + EList<Etype, N> u = {{0}}; // list for returning + int i = 0; + for ( i = 0; i < N; i++ ) { u.a[i] = Etype( A[i] ); } + return u; // return encapsulated array + } + + // perm_flags: returns information about how a permute can be implemented. + // The return value is composed of these flag bits: + const int perm_zeroing = 1; // needs zeroing + const int perm_perm = 2; // permutation needed + const int perm_allzero = 4; // all is zero or don't care + const int perm_largeblock = 8; // fits permute with a larger block size (e.g permute Vec2q instead of Vec4i) + const int perm_addz = 0x10; // additional zeroing needed after permute with larger block size or shift + const int perm_addz2 = 0x20; // additional zeroing needed after perm_zext, perm_compress, or perm_expand + const int perm_cross_lane = 0x40; // permutation crossing 128-bit lanes + const int perm_same_pattern = 0x80; // same permute pattern in all 128-bit lanes + const int perm_punpckh = 0x100; // permutation pattern fits punpckh instruction + const int perm_punpckl = 0x200; // permutation pattern fits punpckl instruction + const int perm_rotate = + 0x400; // permutation pattern fits rotation within lanes. 4 bit count returned in bit perm_rot_count + const int perm_shright = + 0x1000; // permutation pattern fits shift right within lanes. 4 bit count returned in bit perm_rot_count + const int perm_shleft = + 0x2000; // permutation pattern fits shift left within lanes. negative count returned in bit perm_rot_count + const int perm_rotate_big = + 0x4000; // permutation pattern fits rotation across lanes. 6 bit count returned in bit perm_rot_count + const int perm_broadcast = 0x8000; // permutation pattern fits broadcast of a single element. 
+ const int perm_zext = 0x10000; // permutation pattern fits zero extension + const int perm_compress = 0x20000; // permutation pattern fits vpcompress instruction + const int perm_expand = 0x40000; // permutation pattern fits vpexpand instruction + const int perm_outofrange = 0x10000000; // index out of range + const int perm_rot_count = 32; // rotate or shift count is in bits perm_rot_count to perm_rot_count+3 + const int perm_ipattern = + 40; // pattern for pshufd is in bit perm_ipattern to perm_ipattern + 7 if perm_same_pattern and elementsize >= 4 + + template <typename V> + constexpr uint64_t perm_flags( int const ( &a )[V::size()] ) { + // a is a reference to a constexpr array of permutation indexes + // V is a vector class + constexpr int N = V::size(); // number of elements + uint64_t r = perm_largeblock | perm_same_pattern | perm_allzero; // return value + uint32_t i = 0; // loop counter + int j = 0; // loop counter + int ix = 0; // index number i + const uint32_t nlanes = sizeof( V ) / 16; // number of 128-bit lanes + const uint32_t lanesize = N / nlanes; // elements per lane + const uint32_t elementsize = sizeof( V ) / N; // size of each vector element + uint32_t lane = 0; // current lane + uint32_t rot = 999; // rotate left count + int32_t broadc = 999; // index to broadcasted element + uint32_t patfail = 0; // remember certain patterns that do not fit + uint32_t addz2 = 0; // remember certain patterns need extra zeroing + int32_t compresslasti = -1; // last index in perm_compress fit + int32_t compresslastp = -1; // last position in perm_compress fit + int32_t expandlasti = -1; // last index in perm_expand fit + int32_t expandlastp = -1; // last position in perm_expand fit + + int lanepattern[lanesize] = {0}; // pattern in each lane + + for ( i = 0; i < N; i++ ) { // loop through indexes + ix = a[i]; // current index + // meaning of ix: -1 = set to zero, V_DC = don't care, non-negative value = permute. + if ( ix == -1 ) { + r |= perm_zeroing; // zeroing requested + } else if ( ix != V_DC && uint32_t( ix ) >= N ) { + r |= perm_outofrange; // index out of range + } + if ( ix >= 0 ) { + r &= ~perm_allzero; // not all zero + if ( ix != (int)i ) r |= perm_perm; // needs permutation + if ( broadc == 999 ) + broadc = ix; // remember broadcast index + else if ( broadc != ix ) + broadc = 1000; // does not fit broadcast + } + // check if pattern fits a larger block size: + // even indexes must be even, odd indexes must fit the preceding even index + 1 + if ( ( i & 1 ) == 0 ) { // even index + if ( ix >= 0 && ( ix & 1 ) ) r &= ~perm_largeblock; // not even. does not fit larger block size + int iy = a[i + 1]; // next odd index + if ( iy >= 0 && ( iy & 1 ) == 0 ) r &= ~perm_largeblock; // not odd. 
does not fit larger block size + if ( ix >= 0 && iy >= 0 && iy != ix + 1 ) r &= ~perm_largeblock; // does not fit preceding index + 1 + if ( ix == -1 && iy >= 0 ) r |= perm_addz; // needs additional zeroing at current block size + if ( iy == -1 && ix >= 0 ) r |= perm_addz; // needs additional zeroing at current block size + } + lane = i / lanesize; // current lane + if ( lane == 0 ) { // first lane, or no pattern yet + lanepattern[i] = ix; // save pattern + } + // check if crossing lanes + if ( ix >= 0 ) { + uint32_t lanei = (uint32_t)ix / lanesize; // source lane + if ( lanei != lane ) r |= perm_cross_lane; // crossing lane + } + // check if same pattern in all lanes + if ( lane != 0 && ix >= 0 ) { // not first lane + int j1 = i - int( lane * lanesize ); // index into lanepattern + int jx = ix - int( lane * lanesize ); // pattern within lane + if ( jx < 0 || jx >= (int)lanesize ) r &= ~perm_same_pattern; // source is in another lane + if ( lanepattern[j1] < 0 ) { + lanepattern[j1] = jx; // pattern not known from previous lane + } else { + if ( lanepattern[j1] != jx ) r &= ~perm_same_pattern; // not same pattern + } + } + if ( ix >= 0 ) { + // check if pattern fits zero extension (perm_zext) + if ( uint32_t( ix * 2 ) != i ) { + patfail |= 1; // does not fit zero extension + } + // check if pattern fits compress (perm_compress) + if ( ix > compresslasti && ix - compresslasti >= (int)i - compresslastp ) { + if ( (int)i - compresslastp > 1 ) addz2 |= 2; // perm_compress may need additional zeroing + compresslasti = ix; + compresslastp = i; + } else { + patfail |= 2; // does not fit perm_compress + } + // check if pattern fits expand (perm_expand) + if ( ix > expandlasti && ix - expandlasti <= (int)i - expandlastp ) { + if ( ix - expandlasti > 1 ) addz2 |= 4; // perm_expand may need additional zeroing + expandlasti = ix; + expandlastp = i; + } else { + patfail |= 4; // does not fit perm_expand + } + } else if ( ix == -1 ) { + if ( ( i & 1 ) == 0 ) addz2 |= 1; // zero extension needs additional zeroing + } + } + if ( !( r & perm_perm ) ) return r; // more checks are superfluous + + if ( !( r & perm_largeblock ) ) r &= ~perm_addz; // remove irrelevant flag + if ( r & perm_cross_lane ) r &= ~perm_same_pattern; // remove irrelevant flag + if ( ( patfail & 1 ) == 0 ) { + r |= perm_zext; // fits zero extension + if ( ( addz2 & 1 ) != 0 ) r |= perm_addz2; + } else if ( ( patfail & 2 ) == 0 ) { + r |= perm_compress; // fits compression + if ( ( addz2 & 2 ) != 0 ) { // check if additional zeroing needed + for ( j = 0; j < compresslastp; j++ ) { + if ( a[j] == -1 ) r |= perm_addz2; + } + } + } else if ( ( patfail & 4 ) == 0 ) { + r |= perm_expand; // fits expansion + if ( ( addz2 & 4 ) != 0 ) { // check if additional zeroing needed + for ( j = 0; j < expandlastp; j++ ) { + if ( a[j] == -1 ) r |= perm_addz2; + } + } + } + + if ( r & perm_same_pattern ) { + // same pattern in all lanes. 
check if it fits specific patterns + bool fit = true; + // fit shift or rotate + for ( i = 0; i < lanesize; i++ ) { + if ( lanepattern[i] >= 0 ) { + uint32_t rot1 = uint32_t( lanepattern[i] + lanesize - i ) % lanesize; + if ( rot == 999 ) { + rot = rot1; + } else { // check if fit + if ( rot != rot1 ) fit = false; + } + } + } + rot &= lanesize - 1; // prevent out of range values + if ( fit ) { // fits rotate, and possibly shift + uint64_t rot2 = ( rot * elementsize ) & 0xF; // rotate right count in bytes + r |= rot2 << perm_rot_count; // put shift/rotate count in output bits perm_rot_count to perm_rot_count+3 +#if INSTRSET >= 4 // SSSE3 + r |= perm_rotate; // allow palignr +#endif + // fit shift left + fit = true; + for ( i = 0; i < lanesize - rot; i++ ) { // check if first rot elements are zero or don't care + if ( lanepattern[i] >= 0 ) fit = false; + } + if ( fit ) { + r |= perm_shleft; + for ( ; i < lanesize; i++ ) + if ( lanepattern[i] == -1 ) r |= perm_addz; // additional zeroing needed + } + // fit shift right + fit = true; + for ( i = lanesize - (uint32_t)rot; i < lanesize; + i++ ) { // check if last (lanesize-rot) elements are zero or don't care + if ( lanepattern[i] >= 0 ) fit = false; + } + if ( fit ) { + r |= perm_shright; + for ( i = 0; i < lanesize - rot; i++ ) { + if ( lanepattern[i] == -1 ) r |= perm_addz; // additional zeroing needed + } + } + } + // fit punpckhi + fit = true; + uint32_t j2 = lanesize / 2; + for ( i = 0; i < lanesize; i++ ) { + if ( lanepattern[i] >= 0 && lanepattern[i] != (int)j2 ) fit = false; + if ( ( i & 1 ) != 0 ) j2++; + } + if ( fit ) r |= perm_punpckh; + // fit punpcklo + fit = true; + j2 = 0; + for ( i = 0; i < lanesize; i++ ) { + if ( lanepattern[i] >= 0 && lanepattern[i] != (int)j2 ) fit = false; + if ( ( i & 1 ) != 0 ) j2++; + } + if ( fit ) r |= perm_punpckl; + // fit pshufd + if ( elementsize >= 4 ) { + uint64_t p = 0; + for ( i = 0; i < lanesize; i++ ) { + if ( lanesize == 4 ) { + p |= ( lanepattern[i] & 3 ) << 2 * i; + } else { // lanesize = 2 + p |= ( ( lanepattern[i] & 1 ) * 10 + 4 ) << 4 * i; + } + } + r |= p << perm_ipattern; + } + } +#if INSTRSET >= 7 + else { // not same pattern in all lanes + if constexpr ( nlanes > 1 ) { // Try if it fits big rotate + for ( i = 0; i < N; i++ ) { + ix = a[i]; + if ( ix >= 0 ) { + uint32_t rot2 = ( ix + N - i ) % N; // rotate count + if ( rot == 999 ) { + rot = rot2; // save rotate count + } else if ( rot != rot2 ) { + rot = 1000; + break; // does not fit big rotate + } + } + } + if ( rot < N ) { // fits big rotate + r |= perm_rotate_big | (uint64_t)rot << perm_rot_count; + } + } + } +#endif + if ( broadc < 999 && ( r & ( perm_rotate | perm_shright | perm_shleft | perm_rotate_big ) ) == 0 ) { + r |= perm_broadcast | (uint64_t)broadc << perm_rot_count; // fits broadcast + } + return r; + } + + // compress_mask: returns a bit mask to use for compression instruction. + // It is presupposed that perm_flags indicates perm_compress. 
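+  // Illustration (not part of the upstream VCL sources), assuming a 4-element
+  // vector: for the permute pattern {1, 2, -1, -1}, perm_flags reports
+  // perm_compress, and compress_mask yields the source-element mask 0b0110:
+  //
+  //   constexpr int pattern[4] = {1, 2, -1, -1};
+  //   constexpr uint64_t m = compress_mask( pattern ); // m == 0b0110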
+ // Additional zeroing is needed if perm_flags indicates perm_addz2 + template <int N> + constexpr uint64_t compress_mask( int const ( &a )[N] ) { + // a is a reference to a constexpr array of permutation indexes + int ix = 0, lasti = -1, lastp = -1; + uint64_t m = 0; + int i = 0; + int j = 1; // loop counters + for ( i = 0; i < N; i++ ) { + ix = a[i]; // permutation index + if ( ix >= 0 ) { + m |= (uint64_t)1 << ix; // mask for compression source + for ( j = 1; j < i - lastp; j++ ) { + m |= (uint64_t)1 << ( lasti + j ); // dummy filling source + } + lastp = i; + lasti = ix; + } + } + return m; + } + + // expand_mask: returns a bit mask to use for expansion instruction. + // It is presupposed that perm_flags indicates perm_expand. + // Additional zeroing is needed if perm_flags indicates perm_addz2 + template <int N> + constexpr uint64_t expand_mask( int const ( &a )[N] ) { + // a is a reference to a constexpr array of permutation indexes + int ix = 0, lasti = -1, lastp = -1; + uint64_t m = 0; + int i = 0; + int j = 1; + for ( i = 0; i < N; i++ ) { + ix = a[i]; // permutation index + if ( ix >= 0 ) { + m |= (uint64_t)1 << i; // mask for expansion destination + for ( j = 1; j < ix - lasti; j++ ) { + m |= (uint64_t)1 << ( lastp + j ); // dummy filling destination + } + lastp = i; + lasti = ix; + } + } + return m; + } + + // perm16_flags: returns information about how to permute a vector of 16-bit integers + // Note: It is presupposed that perm_flags reports perm_same_pattern + // The return value is composed of these bits: + // 1: data from low 64 bits to low 64 bits. pattern in bit 32-39 + // 2: data from high 64 bits to high 64 bits. pattern in bit 40-47 + // 4: data from high 64 bits to low 64 bits. pattern in bit 48-55 + // 8: data from low 64 bits to high 64 bits. 
pattern in bit 56-63 + template <typename V> + constexpr uint64_t perm16_flags( int const ( &a )[V::size()] ) { + // a is a reference to a constexpr array of permutation indexes + // V is a vector class + constexpr int N = V::size(); // number of elements + + uint64_t retval = 0; // return value + uint32_t pat[4] = {0, 0, 0, 0}; // permute patterns + uint32_t i = 0; // loop counter + int ix = 0; // index number i + const uint32_t lanesize = 8; // elements per lane + uint32_t lane = 0; // current lane + int lanepattern[lanesize] = {0}; // pattern in each lane + + for ( i = 0; i < N; i++ ) { + ix = a[i]; + lane = i / lanesize; // current lane + if ( lane == 0 ) { + lanepattern[i] = ix; // save pattern + } else if ( ix >= 0 ) { // not first lane + uint32_t j = i - lane * lanesize; // index into lanepattern + int jx = ix - lane * lanesize; // pattern within lane + if ( lanepattern[j] < 0 ) { + lanepattern[j] = jx; // pattern not known from previous lane + } + } + } + // four patterns: low2low, high2high, high2low, low2high + for ( i = 0; i < 4; i++ ) { + // loop through low pattern + if ( lanepattern[i] >= 0 ) { + if ( lanepattern[i] < 4 ) { // low2low + retval |= 1; + pat[0] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i ); + } else { // high2low + retval |= 4; + pat[2] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i ); + } + } + // loop through high pattern + if ( lanepattern[i + 4] >= 0 ) { + if ( lanepattern[i + 4] < 4 ) { // low2high + retval |= 8; + pat[3] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i ); + } else { // high2high + retval |= 2; + pat[1] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i ); + } + } + } + // join return data + for ( i = 0; i < 4; i++ ) { retval |= (uint64_t)pat[i] << ( 32 + i * 8 ); } + return retval; + } + + // pshufb_mask: return a broad byte mask for permutation within lanes + // for use with the pshufb instruction (_mm..._shuffle_epi8). + // The pshufb instruction provides fast permutation and zeroing, + // allowing different patterns in each lane but no crossing of lane boundaries + template <typename V, int oppos = 0> + constexpr auto pshufb_mask( int const ( &A )[V::size()] ) { + // Parameter a is a reference to a constexpr array of permutation indexes + // V is a vector class + // oppos = 1 for data from the opposite 128-bit lane in 256-bit vectors + constexpr uint32_t N = V::size(); // number of vector elements + constexpr uint32_t elementsize = sizeof( V ) / N; // size of each vector element + constexpr uint32_t nlanes = sizeof( V ) / 16; // number of 128 bit lanes in vector + constexpr uint32_t elements_per_lane = N / nlanes; // number of vector elements per lane + + EList<int8_t, sizeof( V )> u = {{0}}; // list for returning + + uint32_t i = 0; // loop counters + uint32_t j = 0; + int m = 0; + int k = 0; + uint32_t lane = 0; + + for ( lane = 0; lane < nlanes; lane++ ) { // loop through lanes + for ( i = 0; i < elements_per_lane; i++ ) { // loop through elements in lane + // permutation index for element within lane + int8_t p = -1; + int ix = A[m]; + if ( ix >= 0 ) { + ix ^= oppos * elements_per_lane; // flip bit if opposite lane + } + ix -= int( lane * elements_per_lane ); // index relative to lane + if ( ix >= 0 && ix < (int)elements_per_lane ) { // index points to desired lane + p = ix * elementsize; + } + for ( j = 0; j < elementsize; j++ ) { // loop through bytes in element + u.a[k++] = p < 0 ? 
-1 : p + j; // store byte permutation index + } + m++; + } + } + return u; // return encapsulated array + } + + // largeblock_perm: return indexes for replacing a permute or blend with + // a certain block size by a permute or blend with the double block size. + // Note: it is presupposed that perm_flags() indicates perm_largeblock + // It is required that additional zeroing is added if perm_flags() indicates perm_addz + template <int N> + constexpr EList<int, N / 2> largeblock_perm( int const ( &a )[N] ) { + // Parameter a is a reference to a constexpr array of permutation indexes + EList<int, N / 2> list = {{0}}; // result indexes + int ix = 0; // even index + int iy = 0; // odd index + int iz = 0; // combined index + bool fit_addz = false; // additional zeroing needed at the lower block level + int i = 0; // loop counter + + // check if additional zeroing is needed at current block size + for ( i = 0; i < N; i += 2 ) { + ix = a[i]; // even index + iy = a[i + 1]; // odd index + if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz = true; } + } + + // loop through indexes + for ( i = 0; i < N; i += 2 ) { + ix = a[i]; // even index + iy = a[i + 1]; // odd index + if ( ix >= 0 ) { + iz = ix / 2; // half index + } else if ( iy >= 0 ) { + iz = iy / 2; + } else { + iz = ix | iy; // -1 or V_DC. -1 takes precedence + if ( fit_addz ) iz = V_DC; // V_DC, because result will be zeroed later + } + list.a[i / 2] = iz; // save to list + } + return list; + } + + // blend_flags: returns information about how a blend function can be implemented + // The return value is composed of these flag bits: + const int blend_zeroing = 1; // needs zeroing + const int blend_allzero = 2; // all is zero or don't care + const int blend_largeblock = 4; // fits blend with a larger block size (e.g permute Vec2q instead of Vec4i) + const int blend_addz = 8; // additional zeroing needed after blend with larger block size or shift + const int blend_a = 0x10; // has data from a + const int blend_b = 0x20; // has data from b + const int blend_perma = 0x40; // permutation of a needed + const int blend_permb = 0x80; // permutation of b needed + const int blend_cross_lane = 0x100; // permutation crossing 128-bit lanes + const int blend_same_pattern = 0x200; // same permute/blend pattern in all 128-bit lanes + const int blend_punpckhab = 0x1000; // pattern fits punpckh(a,b) + const int blend_punpckhba = 0x2000; // pattern fits punpckh(b,a) + const int blend_punpcklab = 0x4000; // pattern fits punpckl(a,b) + const int blend_punpcklba = 0x8000; // pattern fits punpckl(b,a) + const int blend_rotateab = 0x10000; // pattern fits palignr(a,b) + const int blend_rotateba = 0x20000; // pattern fits palignr(b,a) + const int blend_shufab = 0x40000; // pattern fits shufps/shufpd(a,b) + const int blend_shufba = 0x80000; // pattern fits shufps/shufpd(b,a) + const int blend_rotate_big = 0x100000; // pattern fits rotation across lanes. 
count returned in bits blend_rotpattern + const int blend_outofrange = 0x10000000; // index out of range + const int blend_shufpattern = 32; // pattern for shufps/shufpd is in bit blend_shufpattern to blend_shufpattern + 7 + const int blend_rotpattern = 40; // pattern for palignr is in bit blend_rotpattern to blend_rotpattern + 7 + + template <typename V> + constexpr uint64_t blend_flags( int const ( &a )[V::size()] ) { + // a is a reference to a constexpr array of permutation indexes + // V is a vector class + constexpr int N = V::size(); // number of elements + uint64_t r = blend_largeblock | blend_same_pattern | blend_allzero; // return value + uint32_t iu = 0; // loop counter + int32_t ii = 0; // loop counter + int ix = 0; // index number i + const uint32_t nlanes = sizeof( V ) / 16; // number of 128-bit lanes + const uint32_t lanesize = N / nlanes; // elements per lane + uint32_t lane = 0; // current lane + uint32_t rot = 999; // rotate left count + int lanepattern[lanesize] = {0}; // pattern in each lane + if ( lanesize == 2 && N <= 8 ) { + r |= blend_shufab | blend_shufba; // check if it fits shufpd + } + + for ( ii = 0; ii < N; ii++ ) { // loop through indexes + ix = a[ii]; // index + if ( ix < 0 ) { + if ( ix == -1 ) + r |= blend_zeroing; // set to zero + else if ( ix != V_DC ) { + r = blend_outofrange; + break; // illegal index + } + } else { // ix >= 0 + r &= ~blend_allzero; + if ( ix < N ) { + r |= blend_a; // data from a + if ( ix != ii ) r |= blend_perma; // permutation of a + } else if ( ix < 2 * N ) { + r |= blend_b; // data from b + if ( ix != ii + N ) r |= blend_permb; // permutation of b + } else { + r = blend_outofrange; + break; // illegal index + } + } + // check if pattern fits a larger block size: + // even indexes must be even, odd indexes must fit the preceding even index + 1 + if ( ( ii & 1 ) == 0 ) { // even index + if ( ix >= 0 && ( ix & 1 ) ) r &= ~blend_largeblock; // not even. does not fit larger block size + int iy = a[ii + 1]; // next odd index + if ( iy >= 0 && ( iy & 1 ) == 0 ) r &= ~blend_largeblock; // not odd. 
does not fit larger block size + if ( ix >= 0 && iy >= 0 && iy != ix + 1 ) r &= ~blend_largeblock; // does not fit preceding index + 1 + if ( ix == -1 && iy >= 0 ) r |= blend_addz; // needs additional zeroing at current block size + if ( iy == -1 && ix >= 0 ) r |= blend_addz; // needs additional zeroing at current block size + } + lane = (uint32_t)ii / lanesize; // current lane + if ( lane == 0 ) { // first lane, or no pattern yet + lanepattern[ii] = ix; // save pattern + } + // check if crossing lanes + if ( ix >= 0 ) { + uint32_t lanei = uint32_t( ix & ~N ) / lanesize; // source lane + if ( lanei != lane ) { + r |= blend_cross_lane; // crossing lane + } + if ( lanesize == 2 ) { // check if it fits pshufd + if ( lanei != lane ) r &= ~( blend_shufab | blend_shufba ); + if ( ( ( ( ix & N ) != 0 ) ^ ii ) & 1 ) + r &= ~blend_shufab; + else + r &= ~blend_shufba; + } + } + // check if same pattern in all lanes + if ( lane != 0 && ix >= 0 ) { // not first lane + int j = ii - int( lane * lanesize ); // index into lanepattern + int jx = ix - int( lane * lanesize ); // pattern within lane + if ( jx < 0 || ( jx & ~N ) >= (int)lanesize ) r &= ~blend_same_pattern; // source is in another lane + if ( lanepattern[j] < 0 ) { + lanepattern[j] = jx; // pattern not known from previous lane + } else { + if ( lanepattern[j] != jx ) r &= ~blend_same_pattern; // not same pattern + } + } + } + if ( !( r & blend_largeblock ) ) r &= ~blend_addz; // remove irrelevant flag + if ( r & blend_cross_lane ) r &= ~blend_same_pattern; // remove irrelevant flag + if ( !( r & ( blend_perma | blend_permb ) ) ) { + return r; // no permutation. more checks are superfluous + } + if ( r & blend_same_pattern ) { + // same pattern in all lanes. check if it fits unpack patterns + r |= blend_punpckhab | blend_punpckhba | blend_punpcklab | blend_punpcklba; + for ( iu = 0; iu < lanesize; iu++ ) { // loop through lanepattern + ix = lanepattern[iu]; + if ( ix >= 0 ) { + if ( (uint32_t)ix != iu / 2 + ( iu & 1 ) * N ) r &= ~blend_punpcklab; + if ( (uint32_t)ix != iu / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &= ~blend_punpcklba; + if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( iu & 1 ) * N ) r &= ~blend_punpckhab; + if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &= ~blend_punpckhba; + } + } +#if INSTRSET >= 4 // SSSE3. 
check if it fits palignr + for ( iu = 0; iu < lanesize; iu++ ) { + ix = lanepattern[iu]; + if ( ix >= 0 ) { + uint32_t t = ix & ~N; + if ( ix & N ) t += lanesize; + uint32_t tb = ( t + 2 * lanesize - iu ) % ( lanesize * 2 ); + if ( rot == 999 ) { + rot = tb; + } else { // check if fit + if ( rot != tb ) rot = 1000; + } + } + } + if ( rot < 999 ) { // fits palignr + if ( rot < lanesize ) { + r |= blend_rotateba; + } else { + r |= blend_rotateab; + } + const uint32_t elementsize = sizeof( V ) / N; + r |= uint64_t( ( rot & ( lanesize - 1 ) ) * elementsize ) << blend_rotpattern; + } +#endif + if ( lanesize == 4 ) { + // check if it fits shufps + r |= blend_shufab | blend_shufba; + for ( ii = 0; ii < 2; ii++ ) { + ix = lanepattern[ii]; + if ( ix >= 0 ) { + if ( ix & N ) + r &= ~blend_shufab; + else + r &= ~blend_shufba; + } + } + for ( ; ii < 4; ii++ ) { + ix = lanepattern[ii]; + if ( ix >= 0 ) { + if ( ix & N ) + r &= ~blend_shufba; + else + r &= ~blend_shufab; + } + } + if ( r & ( blend_shufab | blend_shufba ) ) { // fits shufps/shufpd + uint8_t shufpattern = 0; // get pattern + for ( iu = 0; iu < lanesize; iu++ ) { shufpattern |= ( lanepattern[iu] & 3 ) << iu * 2; } + r |= (uint64_t)shufpattern << blend_shufpattern; // return pattern + } + } + } else if ( nlanes > 1 ) { // not same pattern in all lanes + rot = 999; // check if it fits big rotate + for ( ii = 0; ii < N; ii++ ) { + ix = a[ii]; + if ( ix >= 0 ) { + uint32_t rot2 = ( ix + 2 * N - ii ) % ( 2 * N ); // rotate count + if ( rot == 999 ) { + rot = rot2; // save rotate count + } else if ( rot != rot2 ) { + rot = 1000; + break; // does not fit big rotate + } + } + } + if ( rot < 2 * N ) { // fits big rotate + r |= blend_rotate_big | (uint64_t)rot << blend_rotpattern; + } + } + if ( lanesize == 2 && ( r & ( blend_shufab | blend_shufba ) ) ) { // fits shufpd. Get pattern + for ( ii = 0; ii < N; ii++ ) { r |= uint64_t( a[ii] & 1 ) << ( blend_shufpattern + ii ); } + } + return r; + } + + // blend_perm_indexes: return an Indexlist for implementing a blend function as + // two permutations. N = vector size. + // dozero = 0: let unused elements be don't care. The two permutation results must be blended + // dozero = 1: zero unused elements in each permutation. The two permutation results can be OR'ed + // dozero = 2: indexes that are -1 or V_DC are preserved + template <int N, int dozero> + constexpr EList<int, 2 * N> blend_perm_indexes( int const ( &a )[N] ) { + // a is a reference to a constexpr array of permutation indexes + EList<int, 2 * N> list = {{0}}; // list to return + int u = dozero ? -1 : V_DC; // value to use for unused entries + int j = 0; + + for ( j = 0; j < N; j++ ) { // loop through indexes + int ix = a[j]; // current index + if ( ix < 0 ) { // zero or don't care + if ( dozero == 2 ) { + // list.a[j] = list.a[j + N] = ix; // fails in gcc in complicated cases + list.a[j] = ix; + list.a[j + N] = ix; + } else { + // list.a[j] = list.a[j + N] = u; + list.a[j] = u; + list.a[j + N] = u; + } + } else if ( ix < N ) { // value from a + list.a[j] = ix; + list.a[j + N] = u; + } else { + list.a[j] = u; // value from b + list.a[j + N] = ix - N; + } + } + return list; + } +
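+  // Illustration (not part of the upstream VCL sources), assuming 4-element
+  // vectors: blending a and b with the pattern {0, 5, -1, 6} and dozero = 1
+  // produces one zero-padded permutation index list per source, so the two
+  // permutation results can be OR'ed afterwards:
+  //
+  //   constexpr int pattern[4] = {0, 5, -1, 6};
+  //   constexpr EList<int, 8> idx = blend_perm_indexes<4, 1>( pattern );
+  //   // idx.a == {0, -1, -1, -1, -1, 1, -1, 2}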
+ // largeblock_indexes: return indexes for replacing a permute or blend with a + // certain block size by a permute or blend with the double block size. + // Note: it is presupposed that perm_flags or blend_flags indicates _largeblock + // It is required that additional zeroing is added if perm_flags or blend_flags + // indicates _addz + template <int N> + constexpr EList<int, N / 2> largeblock_indexes( int const ( &a )[N] ) { + // Parameter a is a reference to a constexpr array of N permutation indexes + EList<int, N / 2> list = {{0}}; // list to return + + bool fit_addz = false; // additional zeroing needed at the lower block level + int ix = 0; // even index + int iy = 0; // odd index + int iz = 0; // combined index + int i = 0; // loop counter + + for ( i = 0; i < N; i += 2 ) { + ix = a[i]; // even index + iy = a[i + 1]; // odd index + if ( ix >= 0 ) { + iz = ix / 2; // half index + } else if ( iy >= 0 ) { + iz = iy / 2; // half index + } else + iz = ix | iy; // -1 or V_DC. -1 takes precedence + list.a[i / 2] = iz; // save to list + // check if additional zeroing is needed at current block size + if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz = true; } + } + // replace -1 by V_DC if fit_addz + if ( fit_addz ) { + for ( i = 0; i < N / 2; i++ ) { + if ( list.a[i] < 0 ) list.a[i] = V_DC; + } + } + return list; + } + + /**************************************************************************************** + * + * Vector blend helper function templates + * + * These templates are for emulating a blend with a vector size that is not supported by + * the instruction set, using multiple blends or permutations of half the vector size + * + ****************************************************************************************/ + + // Make dummy blend function templates to avoid error messages when the blend functions are not yet defined + template <typename dummy> + void blend2() {} + template <typename dummy> + void blend4() {} + template <typename dummy> + void blend8() {} + template <typename dummy> + void blend16() {} + template <typename dummy> + void blend32() {} + + // blend_half_indexes: return an Indexlist for emulating a blend function as + // blends or permutations from multiple sources + // dozero = 0: let unused elements be don't care. Multiple permutation results must be blended + // dozero = 1: zero unused elements in each permutation. Multiple permutation results can be OR'ed + // dozero = 2: indexes that are -1 or V_DC are preserved + // src1, src2: sources to blend in a partial implementation + template <int N, int dozero, int src1, int src2> + constexpr EList<int, N> blend_half_indexes( int const ( &a )[N] ) { + // a is a reference to a constexpr array of permutation indexes + EList<int, N> list = {{0}}; // list to return + int u = dozero ? -1 : V_DC; // value to use for unused entries + int j = 0; // loop counter + + for ( j = 0; j < N; j++ ) { // loop through indexes + int ix = a[j]; // current index + if ( ix < 0 ) { // zero or don't care + list.a[j] = ( dozero == 2 ) ? 
ix : u; + } else { + int src = ix / N; // source + if ( src == src1 ) { + list.a[j] = ix & ( N - 1 ); + } else if ( src == src2 ) { + list.a[j] = ( ix & ( N - 1 ) ) + N; + } else + list.a[j] = u; + } + } + return list; + } + + // selectblend: select one of four sources for blending + template <typename W, int s> + static inline auto selectblend( W const a, W const b ) { + if constexpr ( s == 0 ) + return a.get_low(); + else if constexpr ( s == 1 ) + return a.get_high(); + else if constexpr ( s == 2 ) + return b.get_low(); + else + return b.get_high(); + } + + // blend_half: Emulate a blend with a vector size that is not supported + // by multiple blends with half the vector size. + // blend_half is called twice, to give the low and high half of the result + // Parameters: W: type of full-size vector + // i0...: indexes for low or high half + // a, b: full size input vectors + // return value: half-size vector for lower or upper part + template <typename W, int... i0> + auto blend_half( W const& a, W const& b ) { + typedef decltype( a.get_low() ) V; // type for half-size vector + constexpr int N = V::size(); // size of half-size vector + static_assert( sizeof...( i0 ) == N, "wrong number of indexes in blend_half" ); + constexpr int ind[N] = {i0...}; // array of indexes + + // lambda to find which of the four possible sources are used + // return: EList<int, 5> containing a list of up to 4 sources. The last element is the number of sources used + auto listsources = []( int const n, int const( &ind )[N] ) constexpr { + bool source_used[4] = {false, false, false, false}; // list of sources used + int i = 0; + for ( i = 0; i < n; i++ ) { + int ix = ind[i]; // index + if ( ix >= 0 ) { + int src = ix / n; // source used + source_used[src & 3] = true; + } + } + // return a list of sources used. The last element is the number of sources used + EList<int, 5> sources = {{0}}; + int nsrc = 0; // number of sources + for ( i = 0; i < 4; i++ ) { + if ( source_used[i] ) { sources.a[nsrc++] = i; } + } + sources.a[4] = nsrc; + return sources; + }; + // list of sources used + constexpr EList<int, 5> sources = listsources( N, ind ); + constexpr int nsrc = sources.a[4]; // number of sources used + + if constexpr ( nsrc == 0 ) { // no sources + return V( 0 ); + } + // get indexes for the first one or two sources + constexpr int uindex = ( nsrc > 2 ) ? 
+
+  // selectblend: select one of four sources for blending
+  template <typename W, int s>
+  static inline auto selectblend( W const a, W const b ) {
+    if constexpr ( s == 0 )
+      return a.get_low();
+    else if constexpr ( s == 1 )
+      return a.get_high();
+    else if constexpr ( s == 2 )
+      return b.get_low();
+    else
+      return b.get_high();
+  }
+
+  // blend_half: Emulate a blend with a vector size that is not supported
+  // by multiple blends with half the vector size.
+  // blend_half is called twice, to give the low and high half of the result
+  // Parameters: W: type of full-size vector
+  // i0...: indexes for low or high half
+  // a, b: full size input vectors
+  // return value: half-size vector for lower or upper part
+  template <typename W, int... i0>
+  auto blend_half( W const& a, W const& b ) {
+    typedef decltype( a.get_low() ) V; // type for half-size vector
+    constexpr int N = V::size();       // size of half-size vector
+    static_assert( sizeof...( i0 ) == N, "wrong number of indexes in blend_half" );
+    constexpr int ind[N] = {i0...}; // array of indexes
+
+    // lambda to find which of the four possible sources are used
+    // return: EList<int, 5> containing a list of up to 4 sources. The last element is the number of sources used
+    auto listsources = []( int const n, int const( &ind )[N] ) constexpr {
+      bool source_used[4] = {false, false, false, false}; // list of sources used
+      int  i              = 0;
+      for ( i = 0; i < n; i++ ) {
+        int ix = ind[i]; // index
+        if ( ix >= 0 ) {
+          int src              = ix / n; // source used
+          source_used[src & 3] = true;
+        }
+      }
+      // return a list of sources used. The last element is the number of sources used
+      EList<int, 5> sources = {{0}};
+      int           nsrc    = 0; // number of sources
+      for ( i = 0; i < 4; i++ ) {
+        if ( source_used[i] ) { sources.a[nsrc++] = i; }
+      }
+      sources.a[4] = nsrc;
+      return sources;
+    };
+    // list of sources used
+    constexpr EList<int, 5> sources = listsources( N, ind );
+    constexpr int           nsrc    = sources.a[4]; // number of sources used
+
+    if constexpr ( nsrc == 0 ) { // no sources
+      return V( 0 );
+    }
+    // get indexes for the first one or two sources
+    constexpr int uindex = ( nsrc > 2 ) ? 1 : 2; // unused elements set to zero if two blends are combined
+    constexpr EList<int, N> L = blend_half_indexes<N, uindex, sources.a[0], sources.a[1]>( ind );
+    V x0;
+    V src0 = selectblend<W, sources.a[0]>( a, b ); // first source
+    V src1 = selectblend<W, sources.a[1]>( a, b ); // second source
+    if constexpr ( N == 2 ) {
+      x0 = blend2<L.a[0], L.a[1]>( src0, src1 );
+    } else if constexpr ( N == 4 ) {
+      x0 = blend4<L.a[0], L.a[1], L.a[2], L.a[3]>( src0, src1 );
+    } else if constexpr ( N == 8 ) {
+      x0 = blend8<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7]>( src0, src1 );
+    } else if constexpr ( N == 16 ) {
+      x0 = blend16<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
+                   L.a[12], L.a[13], L.a[14], L.a[15]>( src0, src1 );
+    } else if constexpr ( N == 32 ) {
+      x0 = blend32<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
+                   L.a[12], L.a[13], L.a[14], L.a[15], L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22],
+                   L.a[23], L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31]>( src0, src1 );
+    }
+    if constexpr ( nsrc > 2 ) { // get last one or two sources
+      constexpr EList<int, N> M = blend_half_indexes<N, 1, sources.a[2], sources.a[3]>( ind );
+      V x1;
+      V src2 = selectblend<W, sources.a[2]>( a, b ); // third source
+      V src3 = selectblend<W, sources.a[3]>( a, b ); // fourth source
+      if constexpr ( N == 2 ) {
+        x1 = blend2<M.a[0], M.a[1]>( src2, src3 );
+      } else if constexpr ( N == 4 ) {
+        x1 = blend4<M.a[0], M.a[1], M.a[2], M.a[3]>( src2, src3 );
+      } else if constexpr ( N == 8 ) {
+        x1 = blend8<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7]>( src2, src3 );
+      } else if constexpr ( N == 16 ) {
+        x1 = blend16<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
+                     M.a[12], M.a[13], M.a[14], M.a[15]>( src2, src3 );
+      } else if constexpr ( N == 32 ) {
+        x1 = blend32<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
+                     M.a[12], M.a[13], M.a[14], M.a[15], M.a[16], M.a[17], M.a[18], M.a[19], M.a[20], M.a[21], M.a[22],
+                     M.a[23], M.a[24], M.a[25], M.a[26], M.a[27], M.a[28], M.a[29], M.a[30], M.a[31]>( src2, src3 );
+      }
+      x0 |= x1; // combine result of two blends. Unused elements are zero
+    }
+    return x0;
+  }
+
 #ifdef VCL_NAMESPACE
 }
 #endif
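The blend_half machinery above is what the library-level blend functions fall back on when the requested vector width has no native support. For orientation, a usage sketch (an illustration only; it assumes the Vec8i class from the VCL vector headers, with get_low()/get_high() and a two-halves constructor, and interleave_low is a hypothetical helper name): indexes 0..7 select elements of a, indexes 8..15 select elements of b, and each half of the result comes from one blend_half call.

    // Sketch: interleave the low halves of a and b, giving {a0,b0,a1,b1,a2,b2,a3,b3}.
    Vec8i interleave_low( Vec8i const a, Vec8i const b ) {
      return Vec8i( blend_half<Vec8i, 0, 8, 1, 9>( a, b ),    // low half of result
                    blend_half<Vec8i, 2, 10, 3, 11>( a, b ) ); // high half of result
    }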
diff --git a/GaudiKernel/src/contrib/instrset_detect.cpp b/GaudiKernel/src/contrib/instrset_detect.cpp
index 9fb617bd6d4834d5afa00e75d5fcd6f4a18a1614..e1e750cbeb7b78096571d8d3fb3965d8cf889868 100644
--- a/GaudiKernel/src/contrib/instrset_detect.cpp
+++ b/GaudiKernel/src/contrib/instrset_detect.cpp
@@ -1,14 +1,15 @@
 /************************** instrset_detect.cpp ****************************
-* Author:        Agner Fog
-* Date created:  2012-05-30
-* Last modified: 2017-05-02
-* Version:       1.28
-* Project:       vector classes
-* Description:
-* Functions for checking which instruction sets are supported.
-*
-* (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses
-\*****************************************************************************/
+ * Author:        Agner Fog
+ * Date created:  2012-05-30
+ * Last modified: 2019-08-01
+ * Version:       2.00.00
+ * Project:       vector class library
+ * Description:
+ * Functions for checking which instruction sets are supported.
+ *
+ * (c) Copyright 2012-2019 Agner Fog.
+ * Apache License version 2.0 or later.
+ ******************************************************************************/
 
 #include "instrset.h"
 
@@ -16,56 +17,21 @@
 namespace VCL_NAMESPACE {
 #endif
 
-  // Define interface to cpuid instruction.
-  // input:  eax = functionnumber, ecx = 0
-  // output: eax = output[0], ebx = output[1], ecx = output[2], edx = output[3]
-  static inline void cpuid( int output[4], int functionnumber ) {
-#if defined( __GNUC__ ) || defined( __clang__ ) // use inline assembly, Gnu/AT&T syntax
-
-    int a, b, c, d;
-    __asm( "cpuid" : "=a"( a ), "=b"( b ), "=c"( c ), "=d"( d ) : "a"( functionnumber ), "c"( 0 ) : );
-    output[0] = a;
-    output[1] = b;
-    output[2] = c;
-    output[3] = d;
-
-#elif defined( _MSC_VER ) || defined( __INTEL_COMPILER ) // Microsoft or Intel compiler, intrin.h included
-
-    __cpuidex( output, functionnumber, 0 ); // intrinsic function for CPUID
-
-#else // unknown platform. try inline assembly with masm/intel syntax
-
-    __asm {
-        mov eax, functionnumber
-        xor ecx, ecx
-        cpuid;
-        mov esi, output
-        mov [esi], eax
-        mov [esi+4], ebx
-        mov [esi+8], ecx
-        mov [esi+12], edx
-    }
-
-#endif
-  }
-
   // Define interface to xgetbv instruction
-  static inline int64_t xgetbv( int ctr ) {
+  static inline uint64_t xgetbv( int ctr ) {
 #if ( defined( _MSC_FULL_VER ) && _MSC_FULL_VER >= 160040000 ) || \
-    ( defined( __INTEL_COMPILER ) && __INTEL_COMPILER >= 1200 ) // Microsoft or Intel compiler supporting _xgetbv
-    // intrinsic
+    ( defined( __INTEL_COMPILER ) && __INTEL_COMPILER >= 1200 )
+    // Microsoft or Intel compiler supporting _xgetbv intrinsic
 
-    return _xgetbv( ctr ); // intrinsic function for XGETBV
+    return uint64_t( _xgetbv( ctr ) ); // intrinsic function for XGETBV
 
-#elif defined( __GNUC__ ) // use inline assembly, Gnu/AT&T syntax
+#elif defined( __GNUC__ ) || defined( __clang__ ) // use inline assembly, Gnu/AT&T syntax
 
     uint32_t a, d;
     __asm( "xgetbv" : "=a"( a ), "=d"( d ) : "c"( ctr ) : );
     return a | ( uint64_t( d ) << 32 );
 
-#else // #elif defined (_WIN32)   // other compiler. try inline assembly with masm/intel/MS
-      // syntax
-
+#else // #elif defined (_WIN32)   // other compiler. try inline assembly with masm/intel/MS syntax
     uint32_t a, d;
     __asm {
         mov ecx, ctr
@@ -83,7 +49,7 @@
 
   /* find supported instruction set
     return value:
    0           = 80386 instruction set
-   1  or above = SSE (XMM) supported by CPU (not testing for O.S. support)
+   1  or above = SSE (XMM) supported by CPU (not testing for OS support)
    2  or above = SSE2
    3  or above = SSE3
    4  or above = Supplementary SSE3 (SSSE3)
@@ -92,8 +58,7 @@
    7  or above = AVX supported by CPU and operating system
    8  or above = AVX2
    9  or above = AVX512F
-   10 or above = AVX512VL
-   11 or above = AVX512BW, AVX512DQ
+   10 or above = AVX512VL, AVX512BW, AVX512DQ
   */
   int instrset_detect( void ) {
@@ -134,11 +99,10 @@
     cpuid( abcd, 0xD );                            // call cpuid leaf 0xD for feature flags
     if ( ( abcd[0] & 0x60 ) != 0x60 ) return iset; // no AVX512
     iset = 9;
-    cpuid( abcd, 7 );                                  // call cpuid leaf 7 for feature flags
-    if ( ( abcd[1] & ( 1 << 31 ) ) == 0 ) return iset; // no AVX512VL
-    iset = 10;
+    cpuid( abcd, 7 );                                  // call cpuid leaf 7 for feature flags
+    if ( ( abcd[1] & ( 1 << 31 ) ) == 0 ) return iset; // no AVX512VL
     if ( ( abcd[1] & 0x40020000 ) != 0x40020000 ) return iset; // no AVX512BW, AVX512DQ
-    iset = 11;
+    iset = 10;
     return iset;
   }
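Since AVX512VL, AVX512BW and AVX512DQ are now folded into a single level 10, callers can dispatch on one threshold. A minimal usage sketch (the kernel names are hypothetical placeholders, not part of this patch):

    // Sketch: select an implementation once at startup, based on the detected level.
    void do_work_generic(); // hypothetical kernels
    void do_work_avx2();
    void do_work_avx512();

    void ( *do_work )() = do_work_generic;

    void select_kernel() {
      const int level = instrset_detect();
      if ( level >= 10 )
        do_work = do_work_avx512; // level 10 implies AVX512VL, AVX512BW and AVX512DQ
      else if ( level >= 8 )
        do_work = do_work_avx2;
    }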
@@ -182,6 +146,22 @@
     return ( ( abcd[1] & ( 1 << 27 ) ) != 0 ); // ebx bit 27 indicates AVX512ER
   }
 
+  // detect if CPU supports the AVX512VBMI instruction set
+  bool hasAVX512VBMI( void ) {
+    if ( instrset_detect() < 10 ) return false; // must have AVX512BW
+    int abcd[4];                                // cpuid results
+    cpuid( abcd, 7 );                           // call cpuid function 7
+    return ( ( abcd[2] & ( 1 << 1 ) ) != 0 );   // ecx bit 1 indicates AVX512VBMI
+  }
+
+  // detect if CPU supports the AVX512VBMI2 instruction set
+  bool hasAVX512VBMI2( void ) {
+    if ( instrset_detect() < 10 ) return false; // must have AVX512BW
+    int abcd[4];                                // cpuid results
+    cpuid( abcd, 7 );                           // call cpuid function 7
+    return ( ( abcd[2] & ( 1 << 6 ) ) != 0 );   // ecx bit 6 indicates AVX512VBMI2
+  }
+
 #ifdef VCL_NAMESPACE
 }
 #endif
diff --git a/ci-utils/env_setup.sh b/ci-utils/env_setup.sh
index 464e57cd1f5401e53b630ef8fa8257b57de0092b..fb4d1eadbe8cbeef037233c38b1bf0cd8a19a034 100644
--- a/ci-utils/env_setup.sh
+++ b/ci-utils/env_setup.sh
@@ -23,7 +23,7 @@
 
 # initial required environment
 # - get Python, nosetests and xenv from LHCb installation
-source /cvmfs/lhcb.cern.ch/lib/var/lib/LbEnv/prod/${LCG_hostos}/bin/activate
+source /cvmfs/lhcb.cern.ch/lib/var/lib/LbEnv/stable/${LCG_hostos}/bin/activate
 # - add CMake and Ninja-build
 export PATH=${LCG_contrib}/CMake/latest/Linux-x86_64/bin:${LCG_contrib}/ninja/1.7.2.gcc0ea.kitware.dyndep-1/x86_64-slc6:${PATH}
 # - use LHCb compiler wrappers
diff --git a/toolchain.cmake b/toolchain.cmake
index 3cf3c082dc00788265c6da964b5808771c204fa8..0eb85e701549219c06d8a9c6b04a281b0145500e 100644
--- a/toolchain.cmake
+++ b/toolchain.cmake
@@ -9,7 +9,7 @@
 # or submit itself to any jurisdiction.
 #####################################################################################
 # Special wrapper to load the declared version of the heptools toolchain.
-set(heptools_version 97)
+set(heptools_version 97a)
 
 cmake_minimum_required(VERSION 3.6)
 
 if(NOT "$ENV{HEPTOOLS_VERSION}" STREQUAL "")
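Returning to the detection functions added in instrset_detect.cpp above: hasAVX512VBMI and hasAVX512VBMI2 follow one fixed pattern (gate on instrset_detect() >= 10, then test a single CPUID leaf-7 feature bit), so further detectors can be written the same way. For example, a sketch for AVX512_VNNI (not part of this patch; per Intel's documentation the feature is reported in CPUID leaf 7, ECX bit 11):

    // Sketch: a hypothetical detector written in the same style as hasAVX512VBMI.
    bool hasAVX512VNNI( void ) {
      if ( instrset_detect() < 10 ) return false; // must have AVX512BW
      int abcd[4];                                // cpuid results
      cpuid( abcd, 7 );                           // call cpuid function 7
      return ( ( abcd[2] & ( 1 << 11 ) ) != 0 );  // ecx bit 11 indicates AVX512_VNNI
    }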