diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 501f20cbfeb3d1d0208d4618953f57067a9fea02..698cc8105f4bafa9f7855516003046e88ba43ab8 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -87,7 +87,7 @@ build:gcc9:opt:python3:
   variables:
     BINARY_TAG: x86_64-centos7-gcc9-opt
     BUILDDIR: build-opt-python3
-    HEPTOOLS_VERSION: 97python3
+    HEPTOOLS_VERSION: 97apython3
   script:
     - ci-utils/build
   artifacts:
@@ -158,6 +158,7 @@ test:gcc8:opt:
       - ${TESTS_REPORT}
     when: always
     expire_in: 1 week
+  retry: 2
 
 test:gcc9:opt:
   stage: test
@@ -175,6 +176,7 @@ test:gcc9:opt:
       - ${TESTS_REPORT}
     when: always
     expire_in: 1 week
+  retry: 2
 
 test:gcc9:dbg:
   stage: test
@@ -192,6 +194,7 @@ test:gcc9:dbg:
       - ${TESTS_REPORT}
     when: always
     expire_in: 1 week
+  retry: 2
 
 test:gcc9:opt:python3:
   stage: test
@@ -209,6 +212,7 @@ test:gcc9:opt:python3:
       - ${TESTS_REPORT}
     when: always
     expire_in: 1 week
+  retry: 2
 
 test_public_headers_build:
   stage: test
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 09f40d702564197afe9ee4f82fd75d5290b182f4..3a29fc6a70f0b0d268333ccccdf15a13190f0cf7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,51 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+## [v33r2][] - 2020-06-19
+This is a bugfix release with just a couple of minor backward-compatible improvements.
+
+We also updated the LCG baseline version to [LCG 97a](http://lcginfo.cern.ch/release/97a/) (ROOT 6.20/06).
+
+This is the last release with the legacy CMake configuration. From the next release (v34r0)
+we will use a modern CMake configuration (see gaudi/Gaudi!922 and gaudi/Gaudi!986 for details).
+
+### Added
+- Improve event context extension management (gaudi/Gaudi!1080)
+- Introducing Set-semantics (gaudi/Gaudi!1066)
+- Make gitlab use cpp syntax highlighting for icpp files (gaudi/Gaudi!1057)
+
+### Fixed
+- Fixes for ROOT master (gaudi/Gaudi!1071)
+- Use fmtlib instead of `boost::format` (where possible) (gaudi/Gaudi!1046, gaudi/Gaudi#112)
+- Conditions alg test, with stall debug info (gaudi/Gaudi!1092)
+- Make sure the TES is cleaned by `EventLoopMgr` before finalize (gaudi/Gaudi!1082, LHCBPS-1850)
+- Fix race condition between tests (gaudi/Gaudi!1074)
+- Fix propagation of `StopIteration` exception (gaudi/Gaudi!1093, gaudi/Gaudi#129)
+- Update HiveDataBroker to use C++20 ranges if available. (gaudi/Gaudi!1089)
+- Fixes for C++20 (gaudi/Gaudi!1085)
+- Fix thread-safety problem in GaudiHandle (gaudi/Gaudi!1084)
+- Misc. sanitiser improvements (gaudi/Gaudi!1081)
+- Make `GaudiHandleArray.__str__` consistent with `GaudiHandle.__str__` (gaudi/Gaudi!1070)
+- Fix trivial typo in message (gaudi/Gaudi!1087)
+- `GaudiConfig2`: fix unpickling of derived Configurables (gaudi/Gaudi!1076, gaudi/Gaudi#124)
+- Remove data flow dumps or make them configurable (gaudi/Gaudi!1075)
+- Change `StatusCode::orThrow` so that the return value can be ignored (gaudi/Gaudi!1073, gaudi/Gaudi#118)
+- Fix bugs in `ThreadPoolSvc` (gaudi/Gaudi!1068)
+- Sequence semantics update (gaudi/Gaudi!1069)
+- Remove now unused `ATLAS_GAUDI_V21` and `HAVE_GAUDI_PLUGINSVC` definitions (gaudi/Gaudi!1062)
+- Few fixes to `GaudiConfig2` (gaudi/Gaudi!1061)
+- Fix compatibility with C++ GSL 3 (gaudi/Gaudi!1063, gaudi/Gaudi#121)
+- Consolidate and optimize scheduling of blocking tasks (gaudi/Gaudi!1051)
+- Do not run public headers build test on generated headers (gaudi/Gaudi!974)
+- Add array `operator<<` into `SerializeSTL.h` (gaudi/Gaudi!1049)
+- Dereference instead of `operator*` in `get_from_handle` to enable `Gaudi::Ranges` in `MergingTransformer`s (gaudi/Gaudi!1047)
+- Fixes for `GetHostBinaryTag` (gaudi/Gaudi!1058)
+- Remove unused capture of `this` in `MultiTransformerFilter` (gaudi/Gaudi!1059)
+- Fix deprecated copy constructor warning in `StreamBuffer::ContainedLink` (gaudi/Gaudi!1060)
+- `AlgExecStateSvc`: require minimum state size to be 1 (gaudi/Gaudi!1055)
+- `IncidentSvc`: improve error reporting (gaudi/Gaudi!1054)
+- Remove mention of deprecated `tbb::recursive_mutex` (gaudi/Gaudi!1053)
+
 ## [v33r1][] - 2020-03-23
 This release contains multiple fixes and some ABI changes, in particular:
 - `StatusCode` values not checked now produce a compile time warning.
@@ -482,7 +527,8 @@ Details about old versions of the project can be found in the
 [GaudiRelease/doc](GaudiRelease/doc).
 
 
-[Unreleased]: https://gitlab.cern.ch/gaudi/Gaudi/compare/v33r1...master
+[Unreleased]: https://gitlab.cern.ch/gaudi/Gaudi/compare/v33r2...master
+[v33r2]: https://gitlab.cern.ch/gaudi/Gaudi/compare/v33r1...v33r2
 [v33r1]: https://gitlab.cern.ch/gaudi/Gaudi/compare/v33r0...v33r1
 [v33r0]: https://gitlab.cern.ch/gaudi/Gaudi/compare/v32r2...v33r0
 [v32r2]: https://gitlab.cern.ch/gaudi/Gaudi/compare/v32r1...v32r2
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fae498e907482f37b7016ddd84b50a5fcb0bd359..44c07eadef7f36d3dd790d0a7759401717192b59 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,7 +37,7 @@ endif()
 include(${CMAKE_SOURCE_DIR}/cmake/externals.cmake)
 
 # Declare project name and version
-gaudi_project(Gaudi v33r1)
+gaudi_project(Gaudi v33r2)
 
 # Special hack to make sure downstream projects see fmt::fmt
 # This is going to be much cleaner with gaudi/Gaudi!986 (CMake modernization)
diff --git a/GaudiKernel/CMakeLists.txt b/GaudiKernel/CMakeLists.txt
index 828ae6495f9880ca63a220c1a5a071a99819734f..2095c12e03b62181f2bbcf0a63c26b76f84a148d 100644
--- a/GaudiKernel/CMakeLists.txt
+++ b/GaudiKernel/CMakeLists.txt
@@ -33,12 +33,14 @@ if(VectorClass_INCLUDE_DIR AND NOT VectorClass_VERSION)
   set(VectorClass_VERSION 0.0)
   file(STRINGS ${VectorClass_INCLUDE_DIR}/instrset.h _vectorclass_guard REGEX "define +INSTRSET_H +[0-9]+")
   list(GET _vectorclass_guard 0 _vectorclass_guard)
-  if(_vectorclass_guard MATCHES "INSTRSET_H +([0-9]+)")
+  if(_vectorclass_guard MATCHES "INSTRSET_H +([0-9][0-9][0-9][0-9][0-9])")
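+    # VCL 2.x uses a five-digit guard, e.g. INSTRSET_H 20102 maps to version 2.01.02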
+    string(REGEX REPLACE "([0-9]+)([0-9][0-9])([0-9][0-9])" "\\1.\\2.\\3" VectorClass_VERSION "${CMAKE_MATCH_1}")
+  elseif(_vectorclass_guard MATCHES "INSTRSET_H +([0-9][0-9][0-9])")
     string(REGEX REPLACE "([0-9]+)([0-9][0-9])" "\\1.\\2" VectorClass_VERSION "${CMAKE_MATCH_1}")
   endif()
   set(VectorClass_VERSION "${VectorClass_VERSION}" CACHE INTERNAL "")
 endif()
-if(NOT VectorClass_INCLUDE_DIR OR VectorClass_VERSION VERSION_LESS 1.25)
+if(NOT VectorClass_INCLUDE_DIR OR VectorClass_VERSION VERSION_LESS 2.01.02)
   if(VectorClass_INCLUDE_DIR)
     message(STATUS "Found VectorClass instrset_detect ${VectorClass_VERSION} at ${VectorClass_INCLUDE_DIR}")
   endif()
diff --git a/GaudiKernel/src/Util/instructionsetLevel.cpp b/GaudiKernel/src/Util/instructionsetLevel.cpp
index 0a228ab870f8aff6b66b56a81eec6d649a74d238..e7123a5ec0326e62c520120b535a2672212dd381 100644
--- a/GaudiKernel/src/Util/instructionsetLevel.cpp
+++ b/GaudiKernel/src/Util/instructionsetLevel.cpp
@@ -28,7 +28,7 @@ int main( int argc, char* argv[] ) {
   const std::vector<std::string> sets{// "80386",
                                       "sse",   "sse2",
                                       "sse3", // Note: This is reported by Linux as 'pni'
-                                      "ssse3", "sse4_1", "sse4_2", "avx", "avx2", "avx512f", "avx512vl", "avx512bw"};
+                                      "ssse3", "sse4_1", "sse4_2", "avx", "avx2", "avx512f", "avx512vl"};
 
   if ( argc == 1 ) {
     const std::size_t level = System::instructionsetLevel() - 1;
diff --git a/GaudiKernel/src/contrib/instrset.h b/GaudiKernel/src/contrib/instrset.h
index ea4a39d84b0e4d68e946503d64ac25fe9b734926..8fb762ed4645c9b00b4c080e927867af04cc0a1e 100644
--- a/GaudiKernel/src/contrib/instrset.h
+++ b/GaudiKernel/src/contrib/instrset.h
@@ -1,34 +1,59 @@
 /****************************  instrset.h   **********************************
  * Author:        Agner Fog
  * Date created:  2012-05-30
- * Last modified: 2016-11-25
- * Version:       1.25
- * Project:       vector classes
+ * Last modified: 2020-04-01
+ * Version:       2.01.02
+ * Project:       vector class library
  * Description:
- * Header file for various compiler-specific tasks and other common tasks to
- * vector class library:
- * > selects the supported instruction set
- * > defines integer types
- * > defines compiler version macros
- * > undefines certain macros that prevent function overloading
- * > defines template class to represent compile-time integer constant
- * > defines template for compile-time error messages
+ * Header file for various compiler-specific tasks as well as common
+ * macros and templates. This file contains:
  *
- * (c) Copyright 2012-2016 GNU General Public License www.gnu.org/licenses
+ * > Selection of the supported instruction set
+ * > Defines compiler version macros
+ * > Undefines certain macros that prevent function overloading
+ * > Helper functions that depend on instruction set, compiler, or platform
+ * > Common templates for permute, blend, etc.
+ *
+ * For instructions, see vcl_manual.pdf
+ *
+ * (c) Copyright 2012-2020 Agner Fog.
+ * Apache License version 2.0 or later.
  ******************************************************************************/
 
 #ifndef INSTRSET_H
-#define INSTRSET_H 125
+#define INSTRSET_H 20102
 
-// Detect 64 bit mode
+// Allow the use of floating point permute instructions on integer vectors.
+// Some CPUs have an extra latency of 1 or 2 clock cycles for this, but
+// it may still be faster than alternative implementations:
+#define ALLOW_FP_PERMUTE true
+
+// Macro to indicate 64 bit mode
 #if ( defined( _M_AMD64 ) || defined( _M_X64 ) || defined( __amd64 ) ) && !defined( __x86_64__ )
 #  define __x86_64__ 1 // There are many different macros for this, decide on only one
 #endif
 
-// Find instruction set from compiler macros if INSTRSET not defined
+// The following values of INSTRSET are currently defined:
+// 2:  SSE2
+// 3:  SSE3
+// 4:  SSSE3
+// 5:  SSE4.1
+// 6:  SSE4.2
+// 7:  AVX
+// 8:  AVX2
+// 9:  AVX512F
+// 10: AVX512BW/DQ/VL
+// In the future, INSTRSET = 11 may include AVX512VBMI and AVX512VBMI2, but this
+// decision cannot be made before the market situation for CPUs with these
+// instruction sets is known (these future instruction set extensions are already
+// used in some VCL functions and tested with an emulator)
+
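+// Illustrative use (not part of the upstream header): client code typically
+// branches on the detected level at compile time, e.g.
+//   #if INSTRSET >= 8      // AVX2 or better
+//     // 256-bit integer vectors are available
+//   #elif INSTRSET >= 2    // at least SSE2
+//     // fall back to 128-bit vectors
+//   #endif
+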
+// Find instruction set from compiler macros if INSTRSET is not defined.
 // Note: Most of these macros are not defined in Microsoft compilers
 #ifndef INSTRSET
-#  if defined( __AVX512F__ ) || defined( __AVX512__ )
+#  if defined( __AVX512VL__ ) && defined( __AVX512BW__ ) && defined( __AVX512DQ__ )
+#    define INSTRSET 10
+#  elif defined( __AVX512F__ ) || defined( __AVX512__ )
 #    define INSTRSET 9
 #  elif defined( __AVX2__ )
 #    define INSTRSET 8
@@ -60,7 +85,7 @@
                            // sets are specified on the compiler command line, such as:
                            // xopintrin.h, fma4intrin.h
 #  else
-#    include <immintrin.h> // MS version of immintrin.h covers AVX, AVX2 and FMA3
+#    include <immintrin.h> // MS/Intel version of immintrin.h covers AVX and later
 #  endif                   // __GNUC__
 #elif INSTRSET == 7
 #  include <immintrin.h> // AVX
@@ -80,10 +105,12 @@
 
 #if INSTRSET >= 8 && !defined( __FMA__ )
 // Assume that all processors that have AVX2 also have FMA3
-#  if defined( __GNUC__ ) && !defined( __INTEL_COMPILER ) && !defined( __clang__ )
-// Prevent error message in g++ when using FMA intrinsics with avx2:
-#    pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher"
-#  else
+#  if defined( __GNUC__ ) && !defined( __INTEL_COMPILER )
+// Prevent error message in g++ and Clang when using FMA intrinsics with avx2:
+#    if !defined( DISABLE_WARNING_AVX2_WITHOUT_FMA )
+#      pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher"
+#    endif
+#  elif !defined( __clang__ )
 #    define __FMA__ 1
 #  endif
 #endif
@@ -109,52 +136,14 @@
 #  include <fma4intrin.h> // must have both x86intrin.h and fma4intrin.h, don't know why
 #endif                    // __FMA4__
 
-// Define integer types with known size
-#if defined( __GNUC__ ) || defined( __clang__ ) || ( defined( _MSC_VER ) && _MSC_VER >= 1600 )
-// Compilers supporting C99 or C++0x have stdint.h defining these integer types
-#  include <stdint.h>
-#elif defined( _MSC_VER )
-// Older Microsoft compilers have their own definitions
-typedef signed __int8    int8_t;
-typedef unsigned __int8  uint8_t;
-typedef signed __int16   int16_t;
-typedef unsigned __int16 uint16_t;
-typedef signed __int32   int32_t;
-typedef unsigned __int32 uint32_t;
-typedef signed __int64   int64_t;
-typedef unsigned __int64 uint64_t;
-#  ifndef _INTPTR_T_DEFINED
-#    define _INTPTR_T_DEFINED
-#    ifdef __x86_64__
-typedef int64_t          intptr_t;
-#    else
-typedef int32_t intptr_t;
-#    endif
-#  endif
-#else
-// This works with most compilers
-typedef signed char        int8_t;
-typedef unsigned char      uint8_t;
-typedef signed short int   int16_t;
-typedef unsigned short int uint16_t;
-typedef signed int         int32_t;
-typedef unsigned int       uint32_t;
-typedef long long          int64_t;
-typedef unsigned long long uint64_t;
-#  ifdef __x86_64__
-typedef int64_t            intptr_t;
-#  else
-typedef int32_t intptr_t;
-#  endif
-#endif
-
+#include <stdint.h> // Define integer types with known size
 #include <stdlib.h> // define abs(int)
 
 #ifdef _MSC_VER       // Microsoft compiler or compatible Intel compiler
 #  include <intrin.h> // define _BitScanReverse(int), __cpuid(int[4],int), _xgetbv(int)
 #endif                // _MSC_VER
 
-// functions in instrset_detect.cpp
+// functions in instrset_detect.cpp:
 #ifdef VCL_NAMESPACE
 namespace VCL_NAMESPACE {
 #endif
@@ -163,10 +152,15 @@ namespace VCL_NAMESPACE {
   bool hasFMA4( void );         // true if FMA4 instructions supported
   bool hasXOP( void );          // true if XOP  instructions supported
   bool hasAVX512ER( void );     // true if AVX512ER instructions supported
+  bool hasAVX512VBMI( void );   // true if AVX512VBMI instructions supported
+  bool hasAVX512VBMI2( void );  // true if AVX512VBMI2 instructions supported
 #ifdef VCL_NAMESPACE
 }
 #endif
 
+// functions in physical_processors.cpp:
+int physicalProcessors( int* logical_processors = 0 );
+
 // GCC version
 #if defined( __GNUC__ ) && !defined( GCC_VERSION ) && !defined( __clang__ )
 #  define GCC_VERSION ( (__GNUC__)*10000 + (__GNUC_MINOR__)*100 + ( __GNUC_PATCHLEVEL__ ) )
@@ -191,9 +185,216 @@ namespace VCL_NAMESPACE {
 #  endif
 #endif
 
+/* Intel compiler problem:
+The Intel compiler currently cannot compile version 2.00 of VCL. It seems to have
+a problem with constexpr function returns not being constant enough.
+*/
+#if defined( __INTEL_COMPILER ) && __INTEL_COMPILER < 9999
+#  error The Intel compiler version 19.00 cannot compile VCL version 2. Use Version 1.xx of VCL instead
+#endif
+
+/* Clang problem:
+The Clang compiler treats the intrinsic vector types __m128, __m128i, and __m128d as identical.
+See the bug report at https://bugs.llvm.org/show_bug.cgi?id=17164
+Additional problem: The version number is not consistent across platforms. The Apple build has
+different version numbers. We have to rely on __apple_build_version__ on the Mac platform:
+http://llvm.org/bugs/show_bug.cgi?id=12643
+We will have to adjust the switches here when - hopefully - the error has been fixed some day.
+We need different version checks with and without __apple_build_version__
+*/
+#if ( defined( __clang__ ) || defined( __apple_build_version__ ) ) && !defined( __INTEL_COMPILER )
+#  define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
+#endif
+
+#if defined( GCC_VERSION ) && GCC_VERSION < 99999 && !defined( __clang__ )
+#  define ZEXT_MISSING // Gcc 7.4.0 does not have _mm256_zextsi128_si256 and similar functions
+#endif
+
 #ifdef VCL_NAMESPACE
 namespace VCL_NAMESPACE {
 #endif
+
+  // Constant for indicating don't care in permute and blend functions.
+  // V_DC is -256 in Vector class library version 1.xx
+  // V_DC can be any value less than -1 in Vector class library version 2.00
+  constexpr int V_DC = -256;
+
+  /*****************************************************************************
+   *
+   *    Helper functions that depend on instruction set, compiler, or platform
+   *
+   *****************************************************************************/
+
+  // Define interface to cpuid instruction.
+  // input:  functionnumber = leaf (eax), ecxleaf = subleaf(ecx)
+  // output: output[0] = eax, output[1] = ebx, output[2] = ecx, output[3] = edx
+  static inline void cpuid( int output[4], int functionnumber, int ecxleaf = 0 ) {
+#if defined( __GNUC__ ) || defined( __clang__ ) // use inline assembly, Gnu/AT&T syntax
+    int a, b, c, d;
+    __asm( "cpuid" : "=a"( a ), "=b"( b ), "=c"( c ), "=d"( d ) : "a"( functionnumber ), "c"( ecxleaf ) : );
+    output[0] = a;
+    output[1] = b;
+    output[2] = c;
+    output[3] = d;
+
+#elif defined( _MSC_VER ) // Microsoft compiler, intrin.h included
+  __cpuidex( output, functionnumber, ecxleaf ); // intrinsic function for CPUID
+
+#else // unknown platform. try inline assembly with masm/intel syntax
+  __asm {
+        mov eax, functionnumber
+        mov ecx, ecxleaf
+        cpuid;
+        mov esi, output
+        mov[esi], eax
+        mov[esi + 4], ebx
+        mov[esi + 8], ecx
+        mov[esi + 12], edx
+  }
+#endif
+  }
+
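+  // Illustrative usage sketch (not part of the upstream header): leaf 0 returns
+  // the highest standard leaf in eax and the 12-byte vendor string in ebx, edx, ecx:
+  //   int abcd[4];
+  //   cpuid( abcd, 0 );
+  //   char vendor[13] = {0};
+  //   memcpy( vendor, &abcd[1], 4 );     // ebx, e.g. "Genu"
+  //   memcpy( vendor + 4, &abcd[3], 4 ); // edx, e.g. "ineI"
+  //   memcpy( vendor + 8, &abcd[2], 4 ); // ecx, e.g. "ntel"
+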
+// Define popcount function. Gives sum of bits
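+// e.g. vml_popcnt( 0xF0F0F0F0u ) returns 16 with either implementation below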
+#if INSTRSET >= 6 // SSE4.2
+  // popcnt instruction is not officially part of the SSE4.2 instruction set,
+  // but available in all known processors with SSE4.2
+  static inline uint32_t vml_popcnt( uint32_t a ) {
+    return (uint32_t)_mm_popcnt_u32( a ); // Intel intrinsic. Supported by gcc and clang
+  }
+#  ifdef __x86_64__
+  static inline int64_t vml_popcnt( uint64_t a ) {
+    return _mm_popcnt_u64( a ); // Intel intrinsic.
+  }
+#  else // 32 bit mode
+  static inline int64_t vml_popcnt( uint64_t a ) {
+    return _mm_popcnt_u32( uint32_t( a >> 32 ) ) + _mm_popcnt_u32( uint32_t( a ) );
+  }
+#  endif
+#else // no SSE4.2
+static inline uint32_t vml_popcnt( uint32_t a ) {
+  // popcnt instruction not available
+  uint32_t b = a - ( ( a >> 1 ) & 0x55555555 );
+  uint32_t c = ( b & 0x33333333 ) + ( ( b >> 2 ) & 0x33333333 );
+  uint32_t d = ( c + ( c >> 4 ) ) & 0x0F0F0F0F;
+  uint32_t e = d * 0x01010101;
+  return e >> 24;
+}
+
+static inline int32_t vml_popcnt( uint64_t a ) {
+  return vml_popcnt( uint32_t( a >> 32 ) ) + vml_popcnt( uint32_t( a ) );
+}
+
+#endif
+
+// Define bit-scan-forward function. Gives index to lowest set bit
+#if defined( __GNUC__ ) || defined( __clang__ )
+  // gcc and Clang have no bit_scan_forward intrinsic
+#  if defined( __clang__ ) // fix clang bug
+                           // Clang uses a k register as parameter a when inlined from horizontal_find_first
+  __attribute__( ( noinline ) )
+#  endif
+  static uint32_t
+  bit_scan_forward( uint32_t a ) {
+    uint32_t r;
+    __asm( "bsfl %1, %0" : "=r"( r ) : "r"( a ) : );
+    return r;
+  }
+  static inline uint32_t bit_scan_forward( uint64_t a ) {
+    uint32_t lo = uint32_t( a );
+    if ( lo ) return bit_scan_forward( lo );
+    uint32_t hi = uint32_t( a >> 32 );
+    return bit_scan_forward( hi ) + 32;
+  }
+
+#else // other compilers
+static inline uint32_t bit_scan_forward( uint32_t a ) {
+  unsigned long r;
+  _BitScanForward( &r, a ); // defined in intrin.h for MS and Intel compilers
+  return r;
+}
+#  ifdef __x86_64__
+static inline uint32_t bit_scan_forward( uint64_t a ) {
+  unsigned long r;
+  _BitScanForward64( &r, a ); // defined in intrin.h for MS and Intel compilers
+  return (uint32_t)r;
+}
+#  else
+static inline uint32_t bit_scan_forward( uint64_t a ) {
+  uint32_t lo = uint32_t( a );
+  if ( lo ) return bit_scan_forward( lo );
+  uint32_t hi = uint32_t( a >> 32 );
+  return bit_scan_forward( hi ) + 32;
+}
+#  endif
+#endif
+
+// Define bit-scan-reverse function. Gives index to highest set bit = floor(log2(a))
+#if defined( __GNUC__ ) || defined( __clang__ )
+  static inline uint32_t bit_scan_reverse( uint32_t a ) __attribute__( ( pure ) );
+  static inline uint32_t bit_scan_reverse( uint32_t a ) {
+    uint32_t r;
+    __asm( "bsrl %1, %0" : "=r"( r ) : "r"( a ) : );
+    return r;
+  }
+#  ifdef __x86_64__
+  static inline uint32_t bit_scan_reverse( uint64_t a ) {
+    uint64_t r;
+    __asm( "bsrq %1, %0" : "=r"( r ) : "r"( a ) : );
+    return r;
+  }
+#  else // 32 bit mode
+  static inline uint32_t bit_scan_reverse( uint64_t a ) {
+    uint64_t ahi = a >> 32;
+    if ( ahi == 0 )
+      return bit_scan_reverse( uint32_t( a ) );
+    else
+      return bit_scan_reverse( uint32_t( ahi ) ) + 32;
+  }
+#  endif
+#else
+static inline uint32_t bit_scan_reverse( uint32_t a ) {
+  unsigned long r;
+  _BitScanReverse( &r, a ); // defined in intrin.h for MS and Intel compilers
+  return r;
+}
+#  ifdef __x86_64__
+static inline uint32_t bit_scan_reverse( uint64_t a ) {
+  unsigned long r;
+  _BitScanReverse64( &r, a ); // defined in intrin.h for MS and Intel compilers
+  return r;
+}
+#  else // 32 bit mode
+static inline uint32_t bit_scan_reverse( uint64_t a ) {
+  uint64_t ahi = a >> 32;
+  if ( ahi == 0 )
+    return bit_scan_reverse( uint32_t( a ) );
+  else
+    return bit_scan_reverse( uint32_t( ahi ) ) + 32;
+}
+#  endif
+#endif
+
+  // Same function, for compile-time constants
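+  // e.g. bit_scan_reverse_const( 100 ) == 6; unlike the run-time versions above,
+  // a zero argument is handled and returns -1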
+  constexpr int bit_scan_reverse_const( uint64_t const n ) {
+    if ( n == 0 ) return -1;
+    uint64_t a = n, b = 0, j = 64, k = 0;
+    do {
+      j >>= 1;
+      k = (uint64_t)1 << j;
+      if ( a >= k ) {
+        a >>= j;
+        b += j;
+      }
+    } while ( j > 0 );
+    return int( b );
+  }
+
+  /*****************************************************************************
+   *
+   *    Common templates
+   *
+   *****************************************************************************/
+
   // Template class to represent compile-time integer constant
   template <int32_t n>
   class Const_int_t {}; // represent compile-time signed integer constant
@@ -202,17 +403,1028 @@ namespace VCL_NAMESPACE {
 #define const_int( n ) ( Const_int_t<n>() )   // n must be compile-time integer constant
 #define const_uint( n ) ( Const_uint_t<n>() ) // n must be compile-time unsigned integer constant
 
-  // Template for compile-time error messages
-  template <bool>
-  class Static_error_check {
-  public:
-    Static_error_check(){};
-  };
-  template <>
-  class Static_error_check<false> { // generate compile-time error if false
-  private:
-    Static_error_check(){};
+  // template for producing quiet NAN
+  template <class VTYPE>
+  static inline VTYPE nan_vec( uint32_t payload = 0x100 ) {
+    if constexpr ( ( VTYPE::elementtype() & 1 ) != 0 ) { // double
+      union {
+        uint64_t q;
+        double   f;
+      } ud;
+      // n is left justified to avoid loss of NAN payload when converting to float
+      ud.q = 0x7FF8000000000000 | uint64_t( payload ) << 29;
+      return VTYPE( ud.f );
+    }
+    // float will be converted to double if necessary
+    union {
+      uint32_t i;
+      float    f;
+    } uf;
+    uf.i = 0x7FC00000 | ( payload & 0x003FFFFF );
+    return VTYPE( uf.f );
+  }
+
+  // Test if a parameter is a compile-time constant
+  /* Unfortunately, this works only for macro parameters, not for inline function parameters.
+     I hope that some solution will appear in the future, but for now it appears to be
+     impossible to check if a function parameter is a compile-time constant.
+     This would be useful in operator / and in function pow:
+     #if defined(__GNUC__) || defined (__clang__)
+     #define is_constant(a) __builtin_constant_p(a)
+     #else
+     #define is_constant(a) false
+     #endif
+  */
+
+  /*****************************************************************************
+  *
+  *    Helper functions for permute and blend functions
+  *
+  ******************************************************************************
+  Rules for constexpr functions:
+
+  > All variable declarations must include initialization
+
+  > Do not put variable declarations inside a for-clause, e.g. avoid: for (int i=0; ..
+    Instead, you have to declare the loop counter before the for-loop.
+
+  > Do not make constexpr functions that return vector types. This requires type
+    punning with a union, which is not allowed in constexpr functions under C++17.
+    It may be possible under C++20
+
+  *****************************************************************************/
+
+  // Define type for Encapsulated array to use as return type:
+  template <typename T, int N>
+  struct EList {
+    T a[N];
   };
+
+  // get_inttype: get an integer of a size that matches the element size
+  // of vector class V with the value -1
+  template <typename V>
+  constexpr auto get_inttype() {
+    constexpr int elementsize = sizeof( V ) / V::size(); // size of vector elements
+
+    if constexpr ( elementsize >= 8 ) {
+      return -int64_t( 1 );
+    } else if constexpr ( elementsize >= 4 ) {
+      return int32_t( -1 );
+    } else if constexpr ( elementsize >= 2 ) {
+      return int16_t( -1 );
+    } else {
+      return int8_t( -1 );
+    }
+  }
+
+  // zero_mask: return a compact bit mask for zeroing using an AVX512 mask.
+  // Parameter a is a reference to a constexpr int array of permutation indexes
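+  // e.g. indexes {0, -1, 2, 3} give the mask 0b1101 (bit i set where a[i] >= 0)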
+  template <int N>
+  constexpr auto zero_mask( int const ( &a )[N] ) {
+    uint64_t mask = 0;
+    int      i    = 0;
+
+    for ( i = 0; i < N; i++ ) {
+      if ( a[i] >= 0 ) mask |= uint64_t( 1 ) << i;
+    }
+    if constexpr ( N <= 8 )
+      return uint8_t( mask );
+    else if constexpr ( N <= 16 )
+      return uint16_t( mask );
+    else if constexpr ( N <= 32 )
+      return uint32_t( mask );
+    else
+      return mask;
+  }
+
+  // zero_mask_broad: return a broad byte mask for zeroing.
+  // Parameter a is a reference to a constexpr int array of permutation indexes
+  template <typename V>
+  constexpr auto zero_mask_broad( int const ( &A )[V::size()] ) {
+    constexpr int                        N = V::size(); // number of vector elements
+    typedef decltype( get_inttype<V>() ) Etype;         // element type
+    EList<Etype, N>                      u = {{0}};     // list for return
+    int                                  i = 0;
+    for ( i = 0; i < N; i++ ) { u.a[i] = A[i] >= 0 ? get_inttype<V>() : 0; }
+    return u; // return encapsulated array
+  }
+
+  // make_bit_mask: return a compact mask of bits from a list of N indexes:
+  // B contains options indicating how to gather the mask
+  // bit 0-7 in B indicates which bit in each index to collect
+  // bit 8 = 0x100:  set 1 in the lower half of the bit mask if the indicated bit is 1.
+  // bit 8 = 0    :  set 1 in the lower half of the bit mask if the indicated bit is 0.
+  // bit 9 = 0x200:  set 1 in the upper half of the bit mask if the indicated bit is 1.
+  // bit 9 = 0    :  set 1 in the upper half of the bit mask if the indicated bit is 0.
+  // bit 10 = 0x400: set 1 in the bit mask if the corresponding index is -1 or V_DC
+  // Parameter a is a reference to a constexpr int array of permutation indexes
+  template <int N, int B>
+  constexpr uint64_t make_bit_mask( int const ( &a )[N] ) {
+    uint64_t r = 0;                   // return value
+    uint8_t  j = uint8_t( B & 0xFF ); // index to selected bit
+    uint64_t s = 0;                   // bit number i in r
+    uint64_t f = 0;                   // 1 if bit not flipped
+    int      i = 0;
+    for ( i = 0; i < N; i++ ) {
+      int ix = a[i];
+      if ( ix < 0 ) { // -1 or V_DC
+        s = ( B >> 10 ) & 1;
+      } else {
+        s = ( (uint32_t)ix >> j ) & 1; // extract selected bit
+        if ( i < N / 2 ) {
+          f = ( B >> 8 ) & 1; // lower half
+        } else {
+          f = ( B >> 9 ) & 1; // upper half
+        }
+        s ^= f ^ 1; // flip bit if needed
+      }
+      r |= uint64_t( s ) << i; // set bit in return value
+    }
+    return r;
+  }
+
+  // make_broad_mask: Convert a bit mask m to a broad mask
+  // The return value will be a broad boolean mask with elementsize matching vector class V
+  template <typename V>
+  constexpr auto make_broad_mask( uint64_t const m ) {
+    constexpr int                        N = V::size(); // number of vector elements
+    typedef decltype( get_inttype<V>() ) Etype;         // element type
+    EList<Etype, N>                      u = {{0}};     // list for returning
+    int                                  i = 0;
+    for ( i = 0; i < N; i++ ) { u.a[i] = ( ( m >> i ) & 1 ) != 0 ? get_inttype<V>() : 0; }
+    return u; // return encapsulated array
+  }
+
+  // perm_mask_broad: return a mask for permutation by a vector register index.
+  // Parameter A is a reference to a constexpr int array of permutation indexes
+  template <typename V>
+  constexpr auto perm_mask_broad( int const ( &A )[V::size()] ) {
+    constexpr int                        N = V::size(); // number of vector elements
+    typedef decltype( get_inttype<V>() ) Etype;         // vector element type
+    EList<Etype, N>                      u = {{0}};     // list for returning
+    int                                  i = 0;
+    for ( i = 0; i < N; i++ ) { u.a[i] = Etype( A[i] ); }
+    return u; // return encapsulated array
+  }
+
+  // perm_flags: returns information about how a permute can be implemented.
+  // The return value is composed of these flag bits:
+  const int perm_zeroing      = 1;     // needs zeroing
+  const int perm_perm         = 2;     // permutation needed
+  const int perm_allzero      = 4;     // all is zero or don't care
+  const int perm_largeblock   = 8;     // fits permute with a larger block size (e.g. permute Vec2q instead of Vec4i)
+  const int perm_addz         = 0x10;  // additional zeroing needed after permute with larger block size or shift
+  const int perm_addz2        = 0x20;  // additional zeroing needed after perm_zext, perm_compress, or perm_expand
+  const int perm_cross_lane   = 0x40;  // permutation crossing 128-bit lanes
+  const int perm_same_pattern = 0x80;  // same permute pattern in all 128-bit lanes
+  const int perm_punpckh      = 0x100; // permutation pattern fits punpckh instruction
+  const int perm_punpckl      = 0x200; // permutation pattern fits punpckl instruction
+  const int perm_rotate =
+      0x400; // permutation pattern fits rotation within lanes. 4 bit count returned in bit perm_rot_count
+  const int perm_shright =
+      0x1000; // permutation pattern fits shift right within lanes. 4 bit count returned in bit perm_rot_count
+  const int perm_shleft =
+      0x2000; // permutation pattern fits shift left within lanes. negative count returned in bit perm_rot_count
+  const int perm_rotate_big =
+      0x4000; // permutation pattern fits rotation across lanes. 6 bit count returned in bit perm_rot_count
+  const int perm_broadcast  = 0x8000;     // permutation pattern fits broadcast of a single element.
+  const int perm_zext       = 0x10000;    // permutation pattern fits zero extension
+  const int perm_compress   = 0x20000;    // permutation pattern fits vpcompress instruction
+  const int perm_expand     = 0x40000;    // permutation pattern fits vpexpand instruction
+  const int perm_outofrange = 0x10000000; // index out of range
+  const int perm_rot_count  = 32;         // rotate or shift count is in bits perm_rot_count to perm_rot_count+3
+  const int perm_ipattern =
+      40; // pattern for pshufd is in bit perm_ipattern to perm_ipattern + 7 if perm_same_pattern and elementsize >= 4
+
+  template <typename V>
+  constexpr uint64_t perm_flags( int const ( &a )[V::size()] ) {
+    // a is a reference to a constexpr array of permutation indexes
+    // V is a vector class
+    constexpr int  N             = V::size();                                          // number of elements
+    uint64_t       r             = perm_largeblock | perm_same_pattern | perm_allzero; // return value
+    uint32_t       i             = 0;                                                  // loop counter
+    int            j             = 0;                                                  // loop counter
+    int            ix            = 0;                                                  // index number i
+    const uint32_t nlanes        = sizeof( V ) / 16;                                   // number of 128-bit lanes
+    const uint32_t lanesize      = N / nlanes;                                         // elements per lane
+    const uint32_t elementsize   = sizeof( V ) / N;                                    // size of each vector element
+    uint32_t       lane          = 0;                                                  // current lane
+    uint32_t       rot           = 999;                                                // rotate left count
+    int32_t        broadc        = 999;                                                // index to broadcasted element
+    uint32_t       patfail       = 0;  // remember certain patterns that do not fit
+    uint32_t       addz2         = 0;  // remember certain patterns need extra zeroing
+    int32_t        compresslasti = -1; // last index in perm_compress fit
+    int32_t        compresslastp = -1; // last position in perm_compress fit
+    int32_t        expandlasti   = -1; // last index in perm_expand fit
+    int32_t        expandlastp   = -1; // last position in perm_expand fit
+
+    int lanepattern[lanesize] = {0}; // pattern in each lane
+
+    for ( i = 0; i < N; i++ ) { // loop through indexes
+      ix = a[i];                // current index
+      // meaning of ix: -1 = set to zero, V_DC = don't care, non-negative value = permute.
+      if ( ix == -1 ) {
+        r |= perm_zeroing; // zeroing requested
+      } else if ( ix != V_DC && uint32_t( ix ) >= N ) {
+        r |= perm_outofrange; // index out of range
+      }
+      if ( ix >= 0 ) {
+        r &= ~perm_allzero;                 // not all zero
+        if ( ix != (int)i ) r |= perm_perm; // needs permutation
+        if ( broadc == 999 )
+          broadc = ix; // remember broadcast index
+        else if ( broadc != ix )
+          broadc = 1000; // does not fit broadcast
+      }
+      // check if pattern fits a larger block size:
+      // even indexes must be even, odd indexes must fit the preceding even index + 1
+      if ( ( i & 1 ) == 0 ) {                                            // even index
+        if ( ix >= 0 && ( ix & 1 ) ) r &= ~perm_largeblock;              // not even. does not fit larger block size
+        int iy = a[i + 1];                                               // next odd index
+        if ( iy >= 0 && ( iy & 1 ) == 0 ) r &= ~perm_largeblock;         // not odd. does not fit larger block size
+        if ( ix >= 0 && iy >= 0 && iy != ix + 1 ) r &= ~perm_largeblock; // does not fit preceding index + 1
+        if ( ix == -1 && iy >= 0 ) r |= perm_addz; // needs additional zeroing at current block size
+        if ( iy == -1 && ix >= 0 ) r |= perm_addz; // needs additional zeroing at current block size
+      }
+      lane = i / lanesize;   // current lane
+      if ( lane == 0 ) {     // first lane, or no pattern yet
+        lanepattern[i] = ix; // save pattern
+      }
+      // check if crossing lanes
+      if ( ix >= 0 ) {
+        uint32_t lanei = (uint32_t)ix / lanesize;  // source lane
+        if ( lanei != lane ) r |= perm_cross_lane; // crossing lane
+      }
+      // check if same pattern in all lanes
+      if ( lane != 0 && ix >= 0 ) {                                   // not first lane
+        int j1 = i - int( lane * lanesize );                          // index into lanepattern
+        int jx = ix - int( lane * lanesize );                         // pattern within lane
+        if ( jx < 0 || jx >= (int)lanesize ) r &= ~perm_same_pattern; // source is in another lane
+        if ( lanepattern[j1] < 0 ) {
+          lanepattern[j1] = jx; // pattern not known from previous lane
+        } else {
+          if ( lanepattern[j1] != jx ) r &= ~perm_same_pattern; // not same pattern
+        }
+      }
+      if ( ix >= 0 ) {
+        // check if pattern fits zero extension (perm_zext)
+        if ( uint32_t( ix * 2 ) != i ) {
+          patfail |= 1; // does not fit zero extension
+        }
+        // check if pattern fits compress (perm_compress)
+        if ( ix > compresslasti && ix - compresslasti >= (int)i - compresslastp ) {
+          if ( (int)i - compresslastp > 1 ) addz2 |= 2; // perm_compress may need additional zeroing
+          compresslasti = ix;
+          compresslastp = i;
+        } else {
+          patfail |= 2; // does not fit perm_compress
+        }
+        // check if pattern fits expand (perm_expand)
+        if ( ix > expandlasti && ix - expandlasti <= (int)i - expandlastp ) {
+          if ( ix - expandlasti > 1 ) addz2 |= 4; // perm_expand may need additional zeroing
+          expandlasti = ix;
+          expandlastp = i;
+        } else {
+          patfail |= 4; // does not fit perm_expand
+        }
+      } else if ( ix == -1 ) {
+        if ( ( i & 1 ) == 0 ) addz2 |= 1; // zero extension needs additional zeroing
+      }
+    }
+    if ( !( r & perm_perm ) ) return r; // more checks are superfluous
+
+    if ( !( r & perm_largeblock ) ) r &= ~perm_addz;    // remove irrelevant flag
+    if ( r & perm_cross_lane ) r &= ~perm_same_pattern; // remove irrelevant flag
+    if ( ( patfail & 1 ) == 0 ) {
+      r |= perm_zext; // fits zero extension
+      if ( ( addz2 & 1 ) != 0 ) r |= perm_addz2;
+    } else if ( ( patfail & 2 ) == 0 ) {
+      r |= perm_compress;         // fits compression
+      if ( ( addz2 & 2 ) != 0 ) { // check if additional zeroing needed
+        for ( j = 0; j < compresslastp; j++ ) {
+          if ( a[j] == -1 ) r |= perm_addz2;
+        }
+      }
+    } else if ( ( patfail & 4 ) == 0 ) {
+      r |= perm_expand;           // fits expansion
+      if ( ( addz2 & 4 ) != 0 ) { // check if additional zeroing needed
+        for ( j = 0; j < expandlastp; j++ ) {
+          if ( a[j] == -1 ) r |= perm_addz2;
+        }
+      }
+    }
+
+    if ( r & perm_same_pattern ) {
+      // same pattern in all lanes. check if it fits specific patterns
+      bool fit = true;
+      // fit shift or rotate
+      for ( i = 0; i < lanesize; i++ ) {
+        if ( lanepattern[i] >= 0 ) {
+          uint32_t rot1 = uint32_t( lanepattern[i] + lanesize - i ) % lanesize;
+          if ( rot == 999 ) {
+            rot = rot1;
+          } else { // check if fit
+            if ( rot != rot1 ) fit = false;
+          }
+        }
+      }
+      rot &= lanesize - 1;                           // prevent out of range values
+      if ( fit ) {                                   // fits rotate, and possibly shift
+        uint64_t rot2 = ( rot * elementsize ) & 0xF; // rotate right count in bytes
+        r |= rot2 << perm_rot_count;                 // put shift/rotate count in output bit 16-19
+#if INSTRSET >= 4                                    // SSSE3
+        r |= perm_rotate;                            // allow palignr
+#endif
+        // fit shift left
+        fit = true;
+        for ( i = 0; i < lanesize - rot; i++ ) { // check if first rot elements are zero or don't care
+          if ( lanepattern[i] >= 0 ) fit = false;
+        }
+        if ( fit ) {
+          r |= perm_shleft;
+          for ( ; i < lanesize; i++ )
+            if ( lanepattern[i] == -1 ) r |= perm_addz; // additional zeroing needed
+        }
+        // fit shift right
+        fit = true;
+        for ( i = lanesize - (uint32_t)rot; i < lanesize;
+              i++ ) { // check if last (lanesize-rot) elements are zero or don't care
+          if ( lanepattern[i] >= 0 ) fit = false;
+        }
+        if ( fit ) {
+          r |= perm_shright;
+          for ( i = 0; i < lanesize - rot; i++ ) {
+            if ( lanepattern[i] == -1 ) r |= perm_addz; // additional zeroing needed
+          }
+        }
+      }
+      // fit punpckhi
+      fit         = true;
+      uint32_t j2 = lanesize / 2;
+      for ( i = 0; i < lanesize; i++ ) {
+        if ( lanepattern[i] >= 0 && lanepattern[i] != (int)j2 ) fit = false;
+        if ( ( i & 1 ) != 0 ) j2++;
+      }
+      if ( fit ) r |= perm_punpckh;
+      // fit punpcklo
+      fit = true;
+      j2  = 0;
+      for ( i = 0; i < lanesize; i++ ) {
+        if ( lanepattern[i] >= 0 && lanepattern[i] != (int)j2 ) fit = false;
+        if ( ( i & 1 ) != 0 ) j2++;
+      }
+      if ( fit ) r |= perm_punpckl;
+      // fit pshufd
+      if ( elementsize >= 4 ) {
+        uint64_t p = 0;
+        for ( i = 0; i < lanesize; i++ ) {
+          if ( lanesize == 4 ) {
+            p |= ( lanepattern[i] & 3 ) << 2 * i;
+          } else { // lanesize = 2
+            p |= ( ( lanepattern[i] & 1 ) * 10 + 4 ) << 4 * i;
+          }
+        }
+        r |= p << perm_ipattern;
+      }
+    }
+#if INSTRSET >= 7
+    else {                          // not same pattern in all lanes
+      if constexpr ( nlanes > 1 ) { // Try if it fits big rotate
+        for ( i = 0; i < N; i++ ) {
+          ix = a[i];
+          if ( ix >= 0 ) {
+            uint32_t rot2 = ( ix + N - i ) % N; // rotate count
+            if ( rot == 999 ) {
+              rot = rot2; // save rotate count
+            } else if ( rot != rot2 ) {
+              rot = 1000;
+              break; // does not fit big rotate
+            }
+          }
+        }
+        if ( rot < N ) { // fits big rotate
+          r |= perm_rotate_big | (uint64_t)rot << perm_rot_count;
+        }
+      }
+    }
+#endif
+    if ( broadc < 999 && ( r & ( perm_rotate | perm_shright | perm_shleft | perm_rotate_big ) ) == 0 ) {
+      r |= perm_broadcast | (uint64_t)broadc << perm_rot_count; // fits broadcast
+    }
+    return r;
+  }
+
+  // compress_mask: returns a bit mask to use for compression instruction.
+  // It is presupposed that perm_flags indicates perm_compress.
+  // Additional zeroing is needed if perm_flags indicates perm_addz2
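+  // e.g. indexes {1, 3, -1, -1} give the source mask 0b1010 (elements 1 and 3 kept)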
+  template <int N>
+  constexpr uint64_t compress_mask( int const ( &a )[N] ) {
+    // a is a reference to a constexpr array of permutation indexes
+    int      ix = 0, lasti = -1, lastp = -1;
+    uint64_t m = 0;
+    int      i = 0;
+    int      j = 1; // loop counters
+    for ( i = 0; i < N; i++ ) {
+      ix = a[i]; // permutation index
+      if ( ix >= 0 ) {
+        m |= (uint64_t)1 << ix; // mask for compression source
+        for ( j = 1; j < i - lastp; j++ ) {
+          m |= (uint64_t)1 << ( lasti + j ); // dummy filling source
+        }
+        lastp = i;
+        lasti = ix;
+      }
+    }
+    return m;
+  }
+
+  // expand_mask: returns a bit mask to use for expansion instruction.
+  // It is presupposed that perm_flags indicates perm_expand.
+  // Additional zeroing is needed if perm_flags indicates perm_addz2
+  template <int N>
+  constexpr uint64_t expand_mask( int const ( &a )[N] ) {
+    // a is a reference to a constexpr array of permutation indexes
+    int      ix = 0, lasti = -1, lastp = -1;
+    uint64_t m = 0;
+    int      i = 0;
+    int      j = 1;
+    for ( i = 0; i < N; i++ ) {
+      ix = a[i]; // permutation index
+      if ( ix >= 0 ) {
+        m |= (uint64_t)1 << i; // mask for expansion destination
+        for ( j = 1; j < ix - lasti; j++ ) {
+          m |= (uint64_t)1 << ( lastp + j ); // dummy filling destination
+        }
+        lastp = i;
+        lasti = ix;
+      }
+    }
+    return m;
+  }
+
+  // perm16_flags: returns information about how to permute a vector of 16-bit integers
+  // Note: It is presupposed that perm_flags reports perm_same_pattern
+  // The return value is composed of these bits:
+  // 1:  data from low  64 bits to low  64 bits. pattern in bit 32-39
+  // 2:  data from high 64 bits to high 64 bits. pattern in bit 40-47
+  // 4:  data from high 64 bits to low  64 bits. pattern in bit 48-55
+  // 8:  data from low  64 bits to high 64 bits. pattern in bit 56-63
+  template <typename V>
+  constexpr uint64_t perm16_flags( int const ( &a )[V::size()] ) {
+    // a is a reference to a constexpr array of permutation indexes
+    // V is a vector class
+    constexpr int N = V::size(); // number of elements
+
+    uint64_t       retval                = 0;            // return value
+    uint32_t       pat[4]                = {0, 0, 0, 0}; // permute patterns
+    uint32_t       i                     = 0;            // loop counter
+    int            ix                    = 0;            // index number i
+    const uint32_t lanesize              = 8;            // elements per lane
+    uint32_t       lane                  = 0;            // current lane
+    int            lanepattern[lanesize] = {0};          // pattern in each lane
+
+    for ( i = 0; i < N; i++ ) {
+      ix   = a[i];
+      lane = i / lanesize; // current lane
+      if ( lane == 0 ) {
+        lanepattern[i] = ix;                // save pattern
+      } else if ( ix >= 0 ) {               // not first lane
+        uint32_t j  = i - lane * lanesize;  // index into lanepattern
+        int      jx = ix - lane * lanesize; // pattern within lane
+        if ( lanepattern[j] < 0 ) {
+          lanepattern[j] = jx; // pattern not known from previous lane
+        }
+      }
+    }
+    // four patterns: low2low, high2high, high2low, low2high
+    for ( i = 0; i < 4; i++ ) {
+      // loop through low pattern
+      if ( lanepattern[i] >= 0 ) {
+        if ( lanepattern[i] < 4 ) { // low2low
+          retval |= 1;
+          pat[0] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
+        } else { // high2low
+          retval |= 4;
+          pat[2] |= uint32_t( lanepattern[i] & 3 ) << ( 2 * i );
+        }
+      }
+      // loop through high pattern
+      if ( lanepattern[i + 4] >= 0 ) {
+        if ( lanepattern[i + 4] < 4 ) { // low2high
+          retval |= 8;
+          pat[3] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
+        } else { // high2high
+          retval |= 2;
+          pat[1] |= uint32_t( lanepattern[i + 4] & 3 ) << ( 2 * i );
+        }
+      }
+    }
+    // join return data
+    for ( i = 0; i < 4; i++ ) { retval |= (uint64_t)pat[i] << ( 32 + i * 8 ); }
+    return retval;
+  }
+
+  // pshufb_mask: return a broad byte mask for permutation within lanes
+  // for use with the pshufb instruction (_mm..._shuffle_epi8).
+  // The pshufb instruction provides fast permutation and zeroing,
+  // allowing different patterns in each lane but no crossing of lane boundaries
+  template <typename V, int oppos = 0>
+  constexpr auto pshufb_mask( int const ( &A )[V::size()] ) {
+    // Parameter a is a reference to a constexpr array of permutation indexes
+    // V is a vector class
+    // oppos = 1 for data from the opposite 128-bit lane in 256-bit vectors
+    constexpr uint32_t N                 = V::size();        // number of vector elements
+    constexpr uint32_t elementsize       = sizeof( V ) / N;  // size of each vector element
+    constexpr uint32_t nlanes            = sizeof( V ) / 16; // number of 128 bit lanes in vector
+    constexpr uint32_t elements_per_lane = N / nlanes;       // number of vector elements per lane
+
+    EList<int8_t, sizeof( V )> u = {{0}}; // list for returning
+
+    uint32_t i    = 0; // loop counters
+    uint32_t j    = 0;
+    int      m    = 0;
+    int      k    = 0;
+    uint32_t lane = 0;
+
+    for ( lane = 0; lane < nlanes; lane++ ) {     // loop through lanes
+      for ( i = 0; i < elements_per_lane; i++ ) { // loop through elements in lane
+        // permutation index for element within lane
+        int8_t p  = -1;
+        int    ix = A[m];
+        if ( ix >= 0 ) {
+          ix ^= oppos * elements_per_lane; // flip bit if opposite lane
+        }
+        ix -= int( lane * elements_per_lane );          // index relative to lane
+        if ( ix >= 0 && ix < (int)elements_per_lane ) { // index points to desired lane
+          p = ix * elementsize;
+        }
+        for ( j = 0; j < elementsize; j++ ) { // loop through bytes in element
+          u.a[k++] = p < 0 ? -1 : p + j;      // store byte permutation index
+        }
+        m++;
+      }
+    }
+    return u; // return encapsulated array
+  }
+
+  // largeblock_perm: return indexes for replacing a permute or blend with
+  // a certain block size by a permute or blend with the double block size.
+  // Note: it is presupposed that perm_flags() indicates perm_largeblock
+  // It is required that additional zeroing is added if perm_flags() indicates perm_addz
+  template <int N>
+  constexpr EList<int, N / 2> largeblock_perm( int const ( &a )[N] ) {
+    // Parameter a is a reference to a constexpr array of permutation indexes
+    EList<int, N / 2> list     = {{0}}; // result indexes
+    int               ix       = 0;     // even index
+    int               iy       = 0;     // odd index
+    int               iz       = 0;     // combined index
+    bool              fit_addz = false; // additional zeroing needed at the lower block level
+    int               i        = 0;     // loop counter
+
+    // check if additional zeroing is needed at current block size
+    for ( i = 0; i < N; i += 2 ) {
+      ix = a[i];     // even index
+      iy = a[i + 1]; // odd index
+      if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz = true; }
+    }
+
+    // loop through indexes
+    for ( i = 0; i < N; i += 2 ) {
+      ix = a[i];     // even index
+      iy = a[i + 1]; // odd index
+      if ( ix >= 0 ) {
+        iz = ix / 2; // half index
+      } else if ( iy >= 0 ) {
+        iz = iy / 2;
+      } else {
+        iz = ix | iy;              // -1 or V_DC. -1 takes precedence
+        if ( fit_addz ) iz = V_DC; // V_DC, because result will be zeroed later
+      }
+      list.a[i / 2] = iz; // save to list
+    }
+    return list;
+  }
+
+  // blend_flags: returns information about how a blend function can be implemented
+  // The return value is composed of these flag bits:
+  const int blend_zeroing      = 1;       // needs zeroing
+  const int blend_allzero      = 2;       // all is zero or don't care
+  const int blend_largeblock   = 4;       // fits blend with a larger block size (e.g. permute Vec2q instead of Vec4i)
+  const int blend_addz         = 8;       // additional zeroing needed after blend with larger block size or shift
+  const int blend_a            = 0x10;    // has data from a
+  const int blend_b            = 0x20;    // has data from b
+  const int blend_perma        = 0x40;    // permutation of a needed
+  const int blend_permb        = 0x80;    // permutation of b needed
+  const int blend_cross_lane   = 0x100;   // permutation crossing 128-bit lanes
+  const int blend_same_pattern = 0x200;   // same permute/blend pattern in all 128-bit lanes
+  const int blend_punpckhab    = 0x1000;  // pattern fits punpckh(a,b)
+  const int blend_punpckhba    = 0x2000;  // pattern fits punpckh(b,a)
+  const int blend_punpcklab    = 0x4000;  // pattern fits punpckl(a,b)
+  const int blend_punpcklba    = 0x8000;  // pattern fits punpckl(b,a)
+  const int blend_rotateab     = 0x10000; // pattern fits palignr(a,b)
+  const int blend_rotateba     = 0x20000; // pattern fits palignr(b,a)
+  const int blend_shufab       = 0x40000; // pattern fits shufps/shufpd(a,b)
+  const int blend_shufba       = 0x80000; // pattern fits shufps/shufpd(b,a)
+  const int blend_rotate_big  = 0x100000; // pattern fits rotation across lanes. count returned in bits blend_rotpattern
+  const int blend_outofrange  = 0x10000000; // index out of range
+  const int blend_shufpattern = 32; // pattern for shufps/shufpd is in bit blend_shufpattern to blend_shufpattern + 7
+  const int blend_rotpattern  = 40; // pattern for palignr is in bit blend_rotpattern to blend_rotpattern + 7
+
+  template <typename V>
+  constexpr uint64_t blend_flags( int const ( &a )[V::size()] ) {
+    // a is a reference to a constexpr array of permutation indexes
+    // V is a vector class
+    constexpr int  N                     = V::size();                                             // number of elements
+    uint64_t       r                     = blend_largeblock | blend_same_pattern | blend_allzero; // return value
+    uint32_t       iu                    = 0;                                                     // loop counter
+    int32_t        ii                    = 0;                                                     // loop counter
+    int            ix                    = 0;                                                     // index number i
+    const uint32_t nlanes                = sizeof( V ) / 16; // number of 128-bit lanes
+    const uint32_t lanesize              = N / nlanes;       // elements per lane
+    uint32_t       lane                  = 0;                // current lane
+    uint32_t       rot                   = 999;              // rotate left count
+    int            lanepattern[lanesize] = {0};              // pattern in each lane
+    if ( lanesize == 2 && N <= 8 ) {
+      r |= blend_shufab | blend_shufba; // check if it fits shufpd
+    }
+
+    for ( ii = 0; ii < N; ii++ ) { // loop through indexes
+      ix = a[ii];                  // index
+      if ( ix < 0 ) {
+        if ( ix == -1 )
+          r |= blend_zeroing; // set to zero
+        else if ( ix != V_DC ) {
+          r = blend_outofrange;
+          break; // illegal index
+        }
+      } else { // ix >= 0
+        r &= ~blend_allzero;
+        if ( ix < N ) {
+          r |= blend_a;                     // data from a
+          if ( ix != ii ) r |= blend_perma; // permutation of a
+        } else if ( ix < 2 * N ) {
+          r |= blend_b;                         // data from b
+          if ( ix != ii + N ) r |= blend_permb; // permutation of b
+        } else {
+          r = blend_outofrange;
+          break; // illegal index
+        }
+      }
+      // check if pattern fits a larger block size:
+      // even indexes must be even, odd indexes must fit the preceding even index + 1
+      if ( ( ii & 1 ) == 0 ) {                                            // even index
+        if ( ix >= 0 && ( ix & 1 ) ) r &= ~blend_largeblock;              // not even. does not fit larger block size
+        int iy = a[ii + 1];                                               // next odd index
+        if ( iy >= 0 && ( iy & 1 ) == 0 ) r &= ~blend_largeblock;         // not odd. does not fit larger block size
+        if ( ix >= 0 && iy >= 0 && iy != ix + 1 ) r &= ~blend_largeblock; // does not fit preceding index + 1
+        if ( ix == -1 && iy >= 0 ) r |= blend_addz; // needs additional zeroing at current block size
+        if ( iy == -1 && ix >= 0 ) r |= blend_addz; // needs additional zeroing at current block size
+      }
+      lane = (uint32_t)ii / lanesize; // current lane
+      if ( lane == 0 ) {              // first lane, or no pattern yet
+        lanepattern[ii] = ix;         // save pattern
+      }
+      // check if crossing lanes
+      if ( ix >= 0 ) {
+        uint32_t lanei = uint32_t( ix & ~N ) / lanesize; // source lane
+        if ( lanei != lane ) {
+          r |= blend_cross_lane; // crossing lane
+        }
+        if ( lanesize == 2 ) { // check if it fits pshufd
+          if ( lanei != lane ) r &= ~( blend_shufab | blend_shufba );
+          if ( ( ( ( ix & N ) != 0 ) ^ ii ) & 1 )
+            r &= ~blend_shufab;
+          else
+            r &= ~blend_shufba;
+        }
+      }
+      // check if same pattern in all lanes
+      if ( lane != 0 && ix >= 0 ) {                                             // not first lane
+        int j  = ii - int( lane * lanesize );                                   // index into lanepattern
+        int jx = ix - int( lane * lanesize );                                   // pattern within lane
+        if ( jx < 0 || ( jx & ~N ) >= (int)lanesize ) r &= ~blend_same_pattern; // source is in another lane
+        if ( lanepattern[j] < 0 ) {
+          lanepattern[j] = jx; // pattern not known from previous lane
+        } else {
+          if ( lanepattern[j] != jx ) r &= ~blend_same_pattern; // not same pattern
+        }
+      }
+    }
+    if ( !( r & blend_largeblock ) ) r &= ~blend_addz;    // remove irrelevant flag
+    if ( r & blend_cross_lane ) r &= ~blend_same_pattern; // remove irrelevant flag
+    if ( !( r & ( blend_perma | blend_permb ) ) ) {
+      return r; // no permutation. more checks are superfluous
+    }
+    if ( r & blend_same_pattern ) {
+      // same pattern in all lanes. check if it fits unpack patterns
+      r |= blend_punpckhab | blend_punpckhba | blend_punpcklab | blend_punpcklba;
+      for ( iu = 0; iu < lanesize; iu++ ) { // loop through lanepattern
+        ix = lanepattern[iu];
+        if ( ix >= 0 ) {
+          if ( (uint32_t)ix != iu / 2 + ( iu & 1 ) * N ) r &= ~blend_punpcklab;
+          if ( (uint32_t)ix != iu / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &= ~blend_punpcklba;
+          if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( iu & 1 ) * N ) r &= ~blend_punpckhab;
+          if ( (uint32_t)ix != ( iu + lanesize ) / 2 + ( ( iu & 1 ) ^ 1 ) * N ) r &= ~blend_punpckhba;
+        }
+      }
+#if INSTRSET >= 4 // SSSE3. check if it fits palignr
+      for ( iu = 0; iu < lanesize; iu++ ) {
+        ix = lanepattern[iu];
+        if ( ix >= 0 ) {
+          uint32_t t = ix & ~N;
+          if ( ix & N ) t += lanesize;
+          uint32_t tb = ( t + 2 * lanesize - iu ) % ( lanesize * 2 );
+          if ( rot == 999 ) {
+            rot = tb;
+          } else { // check if fit
+            if ( rot != tb ) rot = 1000;
+          }
+        }
+      }
+      if ( rot < 999 ) { // fits palignr
+        if ( rot < lanesize ) {
+          r |= blend_rotateba;
+        } else {
+          r |= blend_rotateab;
+        }
+        const uint32_t elementsize = sizeof( V ) / N;
+        r |= uint64_t( ( rot & ( lanesize - 1 ) ) * elementsize ) << blend_rotpattern;
+      }
+#endif
+      if ( lanesize == 4 ) {
+        // check if it fits shufps
+        r |= blend_shufab | blend_shufba;
+        for ( ii = 0; ii < 2; ii++ ) {
+          ix = lanepattern[ii];
+          if ( ix >= 0 ) {
+            if ( ix & N )
+              r &= ~blend_shufab;
+            else
+              r &= ~blend_shufba;
+          }
+        }
+        for ( ; ii < 4; ii++ ) {
+          ix = lanepattern[ii];
+          if ( ix >= 0 ) {
+            if ( ix & N )
+              r &= ~blend_shufba;
+            else
+              r &= ~blend_shufab;
+          }
+        }
+        if ( r & ( blend_shufab | blend_shufba ) ) { // fits shufps/shufpd
+          uint8_t shufpattern = 0;                   // get pattern
+          for ( iu = 0; iu < lanesize; iu++ ) { shufpattern |= ( lanepattern[iu] & 3 ) << iu * 2; }
+          r |= (uint64_t)shufpattern << blend_shufpattern; // return pattern
+        }
+      }
+    } else if ( nlanes > 1 ) { // not same pattern in all lanes
+      rot = 999;               // check if it fits big rotate
+      for ( ii = 0; ii < N; ii++ ) {
+        ix = a[ii];
+        if ( ix >= 0 ) {
+          uint32_t rot2 = ( ix + 2 * N - ii ) % ( 2 * N ); // rotate count
+          if ( rot == 999 ) {
+            rot = rot2; // save rotate count
+          } else if ( rot != rot2 ) {
+            rot = 1000;
+            break; // does not fit big rotate
+          }
+        }
+      }
+      if ( rot < 2 * N ) { // fits big rotate
+        r |= blend_rotate_big | (uint64_t)rot << blend_rotpattern;
+      }
+    }
+    if ( lanesize == 2 && ( r & ( blend_shufab | blend_shufba ) ) ) { // fits shufpd. Get pattern
+      for ( ii = 0; ii < N; ii++ ) { r |= uint64_t( a[ii] & 1 ) << ( blend_shufpattern + ii ); }
+    }
+    return r;
+  }
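+
+  // Editorial usage sketch (added with this import for illustration; not upstream VCL
+  // code). A caller would typically evaluate the flag word at compile time and branch
+  // on it, along these lines (Vec4f and the index array are assumptions taken from the
+  // rest of the library; the exact blend_flags signature is the one declared earlier
+  // in this header):
+  //   constexpr int      indexs[4] = {0, 5, 2, 7};
+  //   constexpr uint64_t flags     = blend_flags<Vec4f>( indexs );
+  //   if constexpr      ( flags & blend_allzero )    { /* return a zero vector       */ }
+  //   else if constexpr ( flags & blend_largeblock ) { /* retry with double blocks   */ }
+  //   else if constexpr ( flags & blend_cross_lane ) { /* need a lane-crossing op    */ }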
+
+  // blend_perm_indexes: return an Indexlist for implementing a blend function as
+  // two permutations. N = vector size.
+  // dozero = 0: let unused elements be don't care. The two permutation results must be blended
+  // dozero = 1: zero unused elements in each permutation. The two permutation results can be OR'ed
+  // dozero = 2: indexes that are -1 or V_DC are preserved
+  template <int N, int dozero>
+  constexpr EList<int, 2 * N> blend_perm_indexes( int const ( &a )[N] ) {
+    // a is a reference to a constexpr array of permutation indexes
+    EList<int, 2 * N> list = {{0}};              // list to return
+    int               u    = dozero ? -1 : V_DC; // value to use for unused entries
+    int               j    = 0;
+
+    for ( j = 0; j < N; j++ ) { // loop through indexes
+      int ix = a[j];            // current index
+      if ( ix < 0 ) {           // zero or don't care
+        if ( dozero == 2 ) {
+          // list.a[j] = list.a[j + N] = ix;  // fails in gcc in complicated cases
+          list.a[j]     = ix;
+          list.a[j + N] = ix;
+        } else {
+          // list.a[j] = list.a[j + N] = u;
+          list.a[j]     = u;
+          list.a[j + N] = u;
+        }
+      } else if ( ix < N ) { // value from a
+        list.a[j]     = ix;
+        list.a[j + N] = u;
+      } else {
+        list.a[j]     = u; // value from b
+        list.a[j + N] = ix - N;
+      }
+    }
+    return list;
+  }
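+
+  // Editorial example (illustration only, not upstream VCL code): with N = 4,
+  // dozero = 1 and blend indexes {0, 5, -1, 7}, the list returned above is
+  //   {0, -1, -1, -1,   -1, 1, -1, 3}
+  // The first four entries permute a with unused positions zeroed, the last four
+  // permute b likewise, and OR'ing the two permutation results gives the blend.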
+
+  // largeblock_indexes: return indexes for replacing a permute or blend with a
+  // certain block size by a permute or blend with the double block size.
+  // Note: it is presupposed that perm_flags or blend_flags indicates _largeblock
+  // It is required that additional zeroing is added if perm_flags or blend_flags
+  // indicates _addz
+  template <int N>
+  constexpr EList<int, N / 2> largeblock_indexes( int const ( &a )[N] ) {
+    // Parameter a is a reference to a constexpr array of N permutation indexes
+    EList<int, N / 2> list = {{0}}; // list to return
+
+    bool fit_addz = false; // additional zeroing needed at the lower block level
+    int  ix       = 0;     // even index
+    int  iy       = 0;     // odd index
+    int  iz       = 0;     // combined index
+    int  i        = 0;     // loop counter
+
+    for ( i = 0; i < N; i += 2 ) {
+      ix = a[i];     // even index
+      iy = a[i + 1]; // odd index
+      if ( ix >= 0 ) {
+        iz = ix / 2; // half index
+      } else if ( iy >= 0 ) {
+        iz = iy / 2; // half index
+      } else
+        iz = ix | iy;     // -1 or V_DC. -1 takes precedence
+      list.a[i / 2] = iz; // save to list
+      // check if additional zeroing is needed at current block size
+      if ( ( ix == -1 && iy >= 0 ) || ( iy == -1 && ix >= 0 ) ) { fit_addz = true; }
+    }
+    // replace -1 by V_DC if fit_addz
+    if ( fit_addz ) {
+      for ( i = 0; i < N / 2; i++ ) {
+        if ( list.a[i] < 0 ) list.a[i] = V_DC;
+      }
+    }
+    return list;
+  }
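+
+  // Editorial example (illustration only, not upstream VCL code): with N = 4 and
+  // indexes {2, 3, 0, 1}, largeblock_indexes returns {1, 0}, i.e. the same data
+  // movement expressed with blocks of twice the size. With {-1, 3, 0, 1} the result
+  // is again {1, 0}, but the caller must add the extra zeroing of element 0 that
+  // the corresponding _addz flag indicates.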
+
+  /****************************************************************************************
+   *
+   *          Vector blend helper function templates
+   *
+   * These templates are for emulating a blend with a vector size that is not supported by
+   * the instruction set, using multiple blends or permutations of half the vector size
+   *
+   ****************************************************************************************/
+
+  // Make dummy blend function templates to avoid error messages when the blend functions are not yet defined
+  template <typename dummy>
+  void blend2() {}
+  template <typename dummy>
+  void blend4() {}
+  template <typename dummy>
+  void blend8() {}
+  template <typename dummy>
+  void blend16() {}
+  template <typename dummy>
+  void blend32() {}
+
+  // blend_half_indexes: return an Indexlist for emulating a blend function as
+  // blends or permutations from multiple sources
+  // dozero = 0: let unused elements be don't care. Multiple permutation results must be blended
+  // dozero = 1: zero unused elements in each permutation. Multiple permutation results can be OR'ed
+  // dozero = 2: indexes that are -1 or V_DC are preserved
+  // src1, src2: sources to blend in a partial implementation
+  template <int N, int dozero, int src1, int src2>
+  constexpr EList<int, N> blend_half_indexes( int const ( &a )[N] ) {
+    // a is a reference to a constexpr array of permutation indexes
+    EList<int, N> list = {{0}};              // list to return
+    int           u    = dozero ? -1 : V_DC; // value to use for unused entries
+    int           j    = 0;                  // loop counter
+
+    for ( j = 0; j < N; j++ ) { // loop through indexes
+      int ix = a[j];            // current index
+      if ( ix < 0 ) {           // zero or don't care
+        list.a[j] = ( dozero == 2 ) ? ix : u;
+      } else {
+        int src = ix / N; // source
+        if ( src == src1 ) {
+          list.a[j] = ix & ( N - 1 );
+        } else if ( src == src2 ) {
+          list.a[j] = ( ix & ( N - 1 ) ) + N;
+        } else
+          list.a[j] = u;
+      }
+    }
+    return list;
+  }
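+
+  // Editorial example (illustration only, not upstream VCL code): with N = 4,
+  // dozero = 1, src1 = 0 (low half of a) and src2 = 2 (low half of b), the index
+  // list {1, 9, 4, 14} becomes {1, 5, -1, -1}: element 0 is a.low[1], element 1 is
+  // b.low[1] (offset by N because it is the second operand of the half-size blend),
+  // and the elements taken from the other two sources are zeroed so that the
+  // partial results can later be OR'ed together.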
+
+  // selectblend: select one of four sources for blending
+  template <typename W, int s>
+  static inline auto selectblend( W const a, W const b ) {
+    if constexpr ( s == 0 )
+      return a.get_low();
+    else if constexpr ( s == 1 )
+      return a.get_high();
+    else if constexpr ( s == 2 )
+      return b.get_low();
+    else
+      return b.get_high();
+  }
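+
+  // Editorial illustration (not upstream VCL code): for a full-size vector type W,
+  // the template parameter s selects one of the four half-size inputs, e.g.
+  //   selectblend<W, 0>( a, b )  ->  a.get_low()
+  //   selectblend<W, 3>( a, b )  ->  b.get_high()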
+
+  // blend_half: Emulate a blend with a vector size that is not supported
+  // by multiple blends with half the vector size.
+  // blend_half is called twice, to give the low and high half of the result
+  // Parameters: W: type of full-size vector
+  // i0...: indexes for low or high half
+  // a, b: full size input vectors
+  // return value: half-size vector for lower or upper part
+  template <typename W, int... i0>
+  auto blend_half( W const& a, W const& b ) {
+    typedef decltype( a.get_low() ) V;             // type for half-size vector
+    constexpr int                   N = V::size(); // size of half-size vector
+    static_assert( sizeof...( i0 ) == N, "wrong number of indexes in blend_half" );
+    constexpr int ind[N] = {i0...}; // array of indexes
+
+    // lambda to find which of the four possible sources are used
+    // return: EList<int, 5> containing a list of up to 4 sources. The last element is the number of sources used
+    auto listsources = []( int const n, int const( &ind )[N] ) constexpr {
+      bool source_used[4] = {false, false, false, false}; // list of sources used
+      int  i              = 0;
+      for ( i = 0; i < n; i++ ) {
+        int ix = ind[i]; // index
+        if ( ix >= 0 ) {
+          int src              = ix / n; // source used
+          source_used[src & 3] = true;
+        }
+      }
+      // return a list of sources used. The last element is the number of sources used
+      EList<int, 5> sources = {{0}};
+      int           nsrc    = 0; // number of sources
+      for ( i = 0; i < 4; i++ ) {
+        if ( source_used[i] ) { sources.a[nsrc++] = i; }
+      }
+      sources.a[4] = nsrc;
+      return sources;
+    };
+    // list of sources used
+    constexpr EList<int, 5> sources = listsources( N, ind );
+    constexpr int           nsrc    = sources.a[4]; // number of sources used
+
+    if constexpr ( nsrc == 0 ) { // no sources
+      return V( 0 );
+    }
+    // get indexes for the first one or two sources
+    constexpr int           uindex = ( nsrc > 2 ) ? 1 : 2; // unused elements set to zero if two blends are combined
+    constexpr EList<int, N> L      = blend_half_indexes<N, uindex, sources.a[0], sources.a[1]>( ind );
+    V                       x0;
+    V                       src0 = selectblend<W, sources.a[0]>( a, b ); // first source
+    V                       src1 = selectblend<W, sources.a[1]>( a, b ); // second source
+    if constexpr ( N == 2 ) {
+      x0 = blend2<L.a[0], L.a[1]>( src0, src1 );
+    } else if constexpr ( N == 4 ) {
+      x0 = blend4<L.a[0], L.a[1], L.a[2], L.a[3]>( src0, src1 );
+    } else if constexpr ( N == 8 ) {
+      x0 = blend8<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7]>( src0, src1 );
+    } else if constexpr ( N == 16 ) {
+      x0 = blend16<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
+                   L.a[12], L.a[13], L.a[14], L.a[15]>( src0, src1 );
+    } else if constexpr ( N == 32 ) {
+      x0 = blend32<L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], L.a[8], L.a[9], L.a[10], L.a[11],
+                   L.a[12], L.a[13], L.a[14], L.a[15], L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22],
+                   L.a[23], L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31]>( src0, src1 );
+    }
+    if constexpr ( nsrc > 2 ) { // get last one or two sources
+      constexpr EList<int, N> M = blend_half_indexes<N, 1, sources.a[2], sources.a[3]>( ind );
+      V                       x1;
+      V                       src2 = selectblend<W, sources.a[2]>( a, b ); // third source
+      V                       src3 = selectblend<W, sources.a[3]>( a, b ); // fourth source
+      if constexpr ( N == 2 ) {
+        x1 = blend2<M.a[0], M.a[1]>( src2, src3 );
+      } else if constexpr ( N == 4 ) {
+        x1 = blend4<M.a[0], M.a[1], M.a[2], M.a[3]>( src2, src3 );
+      } else if constexpr ( N == 8 ) {
+        x1 = blend8<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7]>( src2, src3 );
+      } else if constexpr ( N == 16 ) {
+        x1 = blend16<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
+                     M.a[12], M.a[13], M.a[14], M.a[15]>( src2, src3 );
+      } else if constexpr ( N == 32 ) {
+        x1 = blend32<M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7], M.a[8], M.a[9], M.a[10], M.a[11],
+                     M.a[12], M.a[13], M.a[14], M.a[15], M.a[16], M.a[17], M.a[18], M.a[19], M.a[20], M.a[21], M.a[22],
+                     M.a[23], M.a[24], M.a[25], M.a[26], M.a[27], M.a[28], M.a[29], M.a[30], M.a[31]>( src2, src3 );
+      }
+      x0 |= x1; // combine result of two blends. Unused elements are zero
+    }
+    return x0;
+  }
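+
+  // Editorial usage sketch (illustration only, not upstream VCL code): a blend of a
+  // vector size that the instruction set cannot handle directly, e.g. an 8-element
+  // blend on hardware with native 4-element vectors, would be assembled from two
+  // blend_half calls (Vec8f and its two-halves constructor are assumed from the
+  // rest of the library; the index values are arbitrary):
+  //   Vec8f blend8_emulated( Vec8f const a, Vec8f const b ) {
+  //     return Vec8f( blend_half<Vec8f, 0, 9, 2, 11>( a, b ),    // lower half
+  //                   blend_half<Vec8f, 4, 13, 6, 15>( a, b ) ); // upper half
+  //   }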
+
 #ifdef VCL_NAMESPACE
 }
 #endif
diff --git a/GaudiKernel/src/contrib/instrset_detect.cpp b/GaudiKernel/src/contrib/instrset_detect.cpp
index 9fb617bd6d4834d5afa00e75d5fcd6f4a18a1614..e1e750cbeb7b78096571d8d3fb3965d8cf889868 100644
--- a/GaudiKernel/src/contrib/instrset_detect.cpp
+++ b/GaudiKernel/src/contrib/instrset_detect.cpp
@@ -1,14 +1,15 @@
 /**************************  instrset_detect.cpp   ****************************
-* Author:        Agner Fog
-* Date created:  2012-05-30
-* Last modified: 2017-05-02
-* Version:       1.28
-* Project:       vector classes
-* Description:
-* Functions for checking which instruction sets are supported.
-*
-* (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses
-\*****************************************************************************/
+ * Author:        Agner Fog
+ * Date created:  2012-05-30
+ * Last modified: 2019-08-01
+ * Version:       2.00.00
+ * Project:       vector class library
+ * Description:
+ * Functions for checking which instruction sets are supported.
+ *
+ * (c) Copyright 2012-2019 Agner Fog.
+ * Apache License version 2.0 or later.
+ ******************************************************************************/
 
 #include "instrset.h"
 
@@ -16,56 +17,21 @@
 namespace VCL_NAMESPACE {
 #endif
 
-  // Define interface to cpuid instruction.
-  // input:  eax = functionnumber, ecx = 0
-  // output: eax = output[0], ebx = output[1], ecx = output[2], edx = output[3]
-  static inline void cpuid( int output[4], int functionnumber ) {
-#if defined( __GNUC__ ) || defined( __clang__ ) // use inline assembly, Gnu/AT&T syntax
-
-    int a, b, c, d;
-    __asm( "cpuid" : "=a"( a ), "=b"( b ), "=c"( c ), "=d"( d ) : "a"( functionnumber ), "c"( 0 ) : );
-    output[0] = a;
-    output[1] = b;
-    output[2] = c;
-    output[3] = d;
-
-#elif defined( _MSC_VER ) || defined( __INTEL_COMPILER ) // Microsoft or Intel compiler, intrin.h included
-
-  __cpuidex( output, functionnumber, 0 ); // intrinsic function for CPUID
-
-#else // unknown platform. try inline assembly with masm/intel syntax
-
-  __asm {
-        mov eax, functionnumber
-        xor ecx, ecx
-        cpuid;
-        mov esi, output
-        mov [esi],    eax
-        mov [esi+4],  ebx
-        mov [esi+8],  ecx
-        mov [esi+12], edx
-  }
-
-#endif
-  }
-
   // Define interface to xgetbv instruction
-  static inline int64_t xgetbv( int ctr ) {
+  static inline uint64_t xgetbv( int ctr ) {
 #if ( defined( _MSC_FULL_VER ) && _MSC_FULL_VER >= 160040000 ) ||                                                      \
-    ( defined( __INTEL_COMPILER ) && __INTEL_COMPILER >= 1200 ) // Microsoft or Intel compiler supporting _xgetbv
-                                                                // intrinsic
+    ( defined( __INTEL_COMPILER ) && __INTEL_COMPILER >= 1200 )
+    // Microsoft or Intel compiler supporting _xgetbv intrinsic
 
-    return _xgetbv( ctr ); // intrinsic function for XGETBV
+    return uint64_t( _xgetbv( ctr ) ); // intrinsic function for XGETBV
 
-#elif defined( __GNUC__ ) // use inline assembly, Gnu/AT&T syntax
+#elif defined( __GNUC__ ) || defined( __clang__ ) // use inline assembly, Gnu/AT&T syntax
 
   uint32_t a, d;
   __asm( "xgetbv" : "=a"( a ), "=d"( d ) : "c"( ctr ) : );
   return a | ( uint64_t( d ) << 32 );
 
-#else // #elif defined (_WIN32)                           // other compiler. try inline assembly with masm/intel/MS
-  // syntax
-
+#else // #elif defined (_WIN32)                      // other compiler. try inline assembly with masm/intel/MS syntax
   uint32_t a, d;
   __asm {
         mov ecx, ctr
@@ -83,7 +49,7 @@ namespace VCL_NAMESPACE {
   /* find supported instruction set
       return value:
       0           = 80386 instruction set
-      1  or above = SSE (XMM) supported by CPU (not testing for O.S. support)
+      1  or above = SSE (XMM) supported by CPU (not testing for OS support)
       2  or above = SSE2
       3  or above = SSE3
       4  or above = Supplementary SSE3 (SSSE3)
@@ -92,8 +58,7 @@ namespace VCL_NAMESPACE {
       7  or above = AVX supported by CPU and operating system
       8  or above = AVX2
       9  or above = AVX512F
-      10 or above = AVX512VL
-      11 or above = AVX512BW, AVX512DQ
+     10  or above = AVX512VL, AVX512BW, AVX512DQ
   */
   int instrset_detect( void ) {
 
@@ -134,11 +99,10 @@ namespace VCL_NAMESPACE {
     cpuid( abcd, 0xD );                                // call cpuid leaf 0xD for feature flags
     if ( ( abcd[0] & 0x60 ) != 0x60 ) return iset;     // no AVX512
     iset = 9;
-    cpuid( abcd, 7 );                                  // call cpuid leaf 7 for feature flags
-    if ( ( abcd[1] & ( 1 << 31 ) ) == 0 ) return iset; // no AVX512VL
-    iset = 10;
+    cpuid( abcd, 7 );                                          // call cpuid leaf 7 for feature flags
+    if ( ( abcd[1] & ( 1 << 31 ) ) == 0 ) return iset;         // no AVX512VL
     if ( ( abcd[1] & 0x40020000 ) != 0x40020000 ) return iset; // no AVX512BW, AVX512DQ
-    iset = 11;
+    iset = 10;
     return iset;
   }
 
@@ -182,6 +146,22 @@ namespace VCL_NAMESPACE {
     return ( ( abcd[1] & ( 1 << 27 ) ) != 0 ); // ebx bit 27 indicates AVX512ER
   }
 
+  // detect if CPU supports the AVX512VBMI instruction set
+  bool hasAVX512VBMI( void ) {
+    if ( instrset_detect() < 10 ) return false; // must have AVX512BW
+    int abcd[4];                                // cpuid results
+    cpuid( abcd, 7 );                           // call cpuid function 7
+    return ( ( abcd[2] & ( 1 << 1 ) ) != 0 );   // ecx bit 1 indicates AVX512VBMI
+  }
+
+  // detect if CPU supports the AVX512VBMI2 instruction set
+  bool hasAVX512VBMI2( void ) {
+    if ( instrset_detect() < 10 ) return false; // must have AVX512BW
+    int abcd[4];                                // cpuid results
+    cpuid( abcd, 7 );                           // call cpuid function 7
+    return ( ( abcd[2] & ( 1 << 6 ) ) != 0 );   // ecx bit 6 indicates AVX512VBMI2
+  }
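+
+  // Editorial usage sketch (illustration only, not upstream VCL code): callers
+  // typically dispatch on the detected level and, for the extra AVX512 extensions,
+  // on the dedicated helpers, e.g.
+  //   if ( hasAVX512VBMI2() )           { /* dispatch to a VBMI2-specialised kernel */ }
+  //   else if ( instrset_detect() >= 8 ) { /* fall back to an AVX2 kernel */ }
+  // hasAVX512VBMI() and hasAVX512VBMI2() already require level >= 10 internally,
+  // so no separate level check is needed before calling them.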
+
 #ifdef VCL_NAMESPACE
 }
 #endif
diff --git a/ci-utils/env_setup.sh b/ci-utils/env_setup.sh
index 464e57cd1f5401e53b630ef8fa8257b57de0092b..fb4d1eadbe8cbeef037233c38b1bf0cd8a19a034 100644
--- a/ci-utils/env_setup.sh
+++ b/ci-utils/env_setup.sh
@@ -23,7 +23,7 @@
 
 # initial required environment
 # - get Python, nosetests and xenv from LHCb installation
-source /cvmfs/lhcb.cern.ch/lib/var/lib/LbEnv/prod/${LCG_hostos}/bin/activate
+source /cvmfs/lhcb.cern.ch/lib/var/lib/LbEnv/stable/${LCG_hostos}/bin/activate
 # - add CMake and Ninja-build
 export PATH=${LCG_contrib}/CMake/latest/Linux-x86_64/bin:${LCG_contrib}/ninja/1.7.2.gcc0ea.kitware.dyndep-1/x86_64-slc6:${PATH}
 # - use LHCb compiler wrappers
diff --git a/toolchain.cmake b/toolchain.cmake
index 3cf3c082dc00788265c6da964b5808771c204fa8..0eb85e701549219c06d8a9c6b04a281b0145500e 100644
--- a/toolchain.cmake
+++ b/toolchain.cmake
@@ -9,7 +9,7 @@
 # or submit itself to any jurisdiction.                                             #
 #####################################################################################
 # Special wrapper to load the declared version of the heptools toolchain.
-set(heptools_version 97)
+set(heptools_version 97a)
 
 cmake_minimum_required(VERSION 3.6)
 if(NOT "$ENV{HEPTOOLS_VERSION}" STREQUAL "")