vectorclass
diff --git a/instrset.h b/instrset.h
+15 -11
diff --git a/instrset_detect.cpp b/instrset_detect.cpp
+1 -1
diff --git a/vectorclass.h b/vectorclass.h
+2 -2
diff --git a/vectorf128.h b/vectorf128.h
+43 -23
diff --git a/vectorf256.h b/vectorf256.h
+48 -8
diff --git a/vectorf512.h b/vectorf512.h
+50 -13
@@ -1,8 +1,8 @@
 /****************************  instrset.h   **********************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2019-08-30
-* Version:       2.00.01
+* Last modified: 2019-10-31
+* Version:       2.00.02
 * Project:       vector class library
 * Description:
 * Header file for various compiler-specific tasks as well as common
@@ -153,6 +153,10 @@ namespace VCL_NAMESPACE {
 }
 #endif
 
+// functions in physical_processors.cpp:
+int physicalProcessors(int * logical_processors = 0);
+
+
 // GCC version
 #if defined(__GNUC__) && !defined (GCC_VERSION) && !defined (__clang__)
 #define GCC_VERSION  ((__GNUC__) * 10000 + (__GNUC_MINOR__) * 100 + (__GNUC_PATCHLEVEL__))
@@ -220,30 +224,30 @@ constexpr int V_DC = -256;
 *****************************************************************************/
 
 // Define interface to cpuid instruction.
-// input:  eax = functionnumber, ecx = 0
+// input:  functionnumber = leaf (eax), ecxleaf = subleaf (ecx)
 // output: output[0] = eax, output[1] = ebx, output[2] = ecx, output[3] = edx
-static inline void cpuid(int output[4], int functionnumber) {
+static inline void cpuid(int output[4], int functionnumber, int ecxleaf = 0) { // ecxleaf goes into ecx; the default 0 matches the previous hard-coded ecx = 0
 #if defined(__GNUC__) || defined(__clang__)           // use inline assembly, Gnu/AT&T syntax
     int a, b, c, d;
-    __asm("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(functionnumber), "c"(0) : );
+    __asm("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(functionnumber), "c"(ecxleaf) : ); // in: eax = functionnumber, ecx = ecxleaf; out: eax..edx in a..d
     output[0] = a;
     output[1] = b;
     output[2] = c;
     output[3] = d;
 
 #elif defined (_MSC_VER)                              // Microsoft compiler, intrin.h included
-    __cpuidex(output, functionnumber, 0);             // intrinsic function for CPUID
+    __cpuidex(output, functionnumber, ecxleaf);       // intrinsic function for CPUID, subleaf passed as third argument
 
 #else                                                 // unknown platform. try inline assembly with masm/intel syntax
     __asm {
         mov eax, functionnumber
-        xor ecx, ecx
+        mov ecx, ecxleaf
         cpuid;
         mov esi, output
-            mov[esi], eax
-            mov[esi + 4], ebx
-            mov[esi + 8], ecx
-            mov[esi + 12], edx
+        mov[esi], eax
+        mov[esi + 4], ebx
+        mov[esi + 8], ecx
+        mov[esi + 12], edx
     }
 #endif
 }
 
@@ -25,7 +25,7 @@ static inline uint64_t xgetbv (int ctr) {
 
     return uint64_t(_xgetbv(ctr));                    // intrinsic function for XGETBV
 
-#elif defined(__GNUC__)                               // use inline assembly, Gnu/AT&T syntax
+#elif defined(__GNUC__) ||  defined (__clang__)       // use inline assembly, Gnu/AT&T syntax
 
    uint32_t a, d;
    __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : );
 
@@ -1,8 +1,8 @@
 /****************************  vectorclass.h   ********************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2019-09-14
-* Version:       2.00.01
+* Last modified: 2019-10-27
+* Version:       2.00.02
 * Project:       vector class library
 * Home:          https://github.com/vectorclass
 * Description:
 
@@ -1,8 +1,8 @@
 /****************************  vectorf128.h   *******************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2019-09-14
-* Version:       2.00.01
+* Last modified: 2019-10-27
+* Version:       2.00.02
 * Project:       vector class library
 * Description:
 * Header file defining 128-bit floating point vector classes
@@ -970,22 +970,31 @@ static inline Vec4fb is_inf(Vec4f const a) {
 #endif
 }
 
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
 #if INSTRSET >= 10
 static inline Vec4fb is_nan(Vec4f const a) {
+    // class mask 0x81 selects the QNaN and SNaN classes; assume that compiler does not optimize this away with -ffinite-math-only:
     return Vec4fb(_mm_fpclass_ps_mask(a, 0x81));
 }
+//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) 
+//__attribute__((optimize("-fno-unsafe-math-optimizations")))
+//static inline Vec4fb is_nan(Vec4f const a) {
+//    return a != a; // not safe with -ffinite-math-only compiler option
+//}
+#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)
+static inline Vec4fb is_nan(Vec4f const a) { // Gnu/Clang version: inline assembly, presumably so that -ffinite-math-only cannot fold the NAN test away
+    __m128 aa = a; // NOTE(review): vcmpps is VEX-encoded (AVX) but this branch does not test INSTRSET >= 7 - confirm it is never built for SSE-only targets
+    __m128i unordered; // receives one all-ones / all-zeros 32-bit element per float
+    __asm volatile("vcmpps $3,  %1, %1, %0" : "=x" (unordered) :  "x" (aa) ); // predicate 3 = unordered; aa is compared with itself, so an element is true only if it is NAN
+    return Vec4fb(unordered);
+}
 #else
-// Function is_nan: gives true for elements that are +NAN or -NAN
-// false for finite numbers and +/-INF
-// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
-//__attribute__ ((optimize("-fno-unsafe-math-optimizations")));
-#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
-__attribute__((optimize("-fno-unsafe-math-optimizations")))
-#elif defined(__clang__)
-__attribute__((optnone))
-#endif
 static inline Vec4fb is_nan(Vec4f const a) {
-    return a != a; // not safe with -ffinite-math-only compiler option
+    // other compilers: use the compare intrinsic; assume that compiler does not optimize this away with -ffinite-math-only:
+    return _mm_cmp_ps(a, a, 3); // compare unordered (predicate 3): a against itself is unordered only for NAN
+    // return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option
 }
 #endif
 
@@ -1934,25 +1943,36 @@ static inline Vec2db is_inf(Vec2d const a) {
 #endif
 }
 
+
+// Function is_nan: gives true for elements that are +NAN or -NAN
+// false for finite numbers and +/-INF
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
 #if INSTRSET >= 10
 static inline Vec2db is_nan(Vec2d const a) {
-    return _mm_fpclass_pd_mask(a, 0x81);
+    // class mask 0x81 selects the QNaN and SNaN classes; assume that compiler does not optimize this away with -ffinite-math-only:
+    return Vec2db(_mm_fpclass_pd_mask(a, 0x81));
+}
+//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) 
+//__attribute__((optimize("-fno-unsafe-math-optimizations")))
+//static inline Vec2db is_nan(Vec2d const a) {
+//    return a != a; // not safe with -ffinite-math-only compiler option
+//}
+#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)
+static inline Vec2db is_nan(Vec2d const a) { // Gnu/Clang version: inline assembly, presumably so that -ffinite-math-only cannot fold the NAN test away
+    __m128d aa = a; // NOTE(review): vcmppd is VEX-encoded (AVX) but this branch does not test INSTRSET >= 7 - confirm it is never built for SSE-only targets
+    __m128i unordered; // receives one all-ones / all-zeros 64-bit element per double
+    __asm volatile("vcmppd $3,  %1, %1, %0" : "=x" (unordered) :  "x" (aa) ); // predicate 3 = unordered; aa is compared with itself, so an element is true only if it is NAN
+    return Vec2db(unordered);
 }
 #else
-// Function is_nan: gives true for elements that are +NAN or -NAN
-// false for finite numbers and +/-INF
-// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
-//__attribute__ ((optimize("-fno-unsafe-math-optimizations")));
-#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
-__attribute__((optimize("-fno-unsafe-math-optimizations")))
-#elif defined(__clang__)
-__attribute__((optnone))
-#endif
 static inline Vec2db is_nan(Vec2d const a) {
-    return a != a;   // not safe with -ffinite-math-only compiler option
+    // other compilers: use the compare intrinsic; assume that compiler does not optimize this away with -ffinite-math-only:
+    return _mm_cmp_pd(a, a, 3); // compare unordered (predicate 3): a against itself is unordered only for NAN
+    // return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option
 }
 #endif
 
+
 // Function is_subnormal: gives true for elements that are subnormal (denormal)
 // false for finite numbers, zero, NAN and INF
 static inline Vec2db is_subnormal(Vec2d const a) {
 
@@ -1,8 +1,8 @@
 /****************************  vectorf256.h   *******************************
 * Author:        Agner Fog
 * Date created:  2012-05-30
-* Last modified: 2019-08-01
-* Version:       2.00.00
+* Last modified: 2019-10-27
+* Version:       2.00.02
 * Project:       vector class library
 * Description:
 * Header file defining 256-bit floating point vector classes
@@ -52,11 +52,14 @@ namespace VCL_NAMESPACE {
 // Generate a constant vector of 8 integers stored in memory
 template <uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7>
 inline __m256 constant8f() {
+    /* previous union-based implementation, disabled and kept for reference:
     const union {
         uint32_t i[8];
         __m256   ymm;
     } u = {{i0,i1,i2,i3,i4,i5,i6,i7}};
     return u.ymm;
+    */
+    return _mm256_castsi256_ps(_mm256_setr_epi32(i0,i1,i2,i3,i4,i5,i6,i7)); // setr: i0 is the lowest element; the cast reinterprets the integer bits as floats without conversion
 }
 
 
@@ -1067,13 +1070,31 @@ static inline Vec8fb is_inf(Vec8f const a) {
 // Function is_nan: gives true for elements that are +NAN or -NAN
 // false for finite numbers and +/-INF
 // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+#if INSTRSET >= 10
 static inline Vec8fb is_nan(Vec8f const a) {
-#if INSTRSET >= 10  // compact boolean vectors
+    // class mask 0x81 selects the QNaN and SNaN classes; assume that compiler does not optimize this away with -ffinite-math-only:
     return _mm256_fpclass_ps_mask (a, 0x81);
+}
+//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) 
+//__attribute__((optimize("-fno-unsafe-math-optimizations")))
+//static inline Vec8fb is_nan(Vec8f const a) {
+//    return a != a; // not safe with -ffinite-math-only compiler option
+//}
+#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)
+static inline Vec8fb is_nan(Vec8f const a) { // Gnu/Clang version: inline assembly, presumably so that -ffinite-math-only cannot fold the NAN test away
+    __m256 aa = a; // plain register copy of the vector, used as the asm input operand
+    __m256 unordered; // receives one all-ones / all-zeros 32-bit element per float
+    __asm volatile("vcmpps $3, %1, %1, %0" : "=v" (unordered) :  "v" (aa) ); // predicate 3 = unordered; aa is compared with itself, so an element is true only if it is NAN
+    return Vec8fb(unordered);
+}
 #else
-    return a != a;  // not safe with -ffinite-math-only compiler option
-#endif
+static inline Vec8fb is_nan(Vec8f const a) {
+    // other compilers: use the compare intrinsic; assume that compiler does not optimize this away with -ffinite-math-only:
+    return _mm256_cmp_ps(a, a, 3); // compare unordered (predicate 3): a against itself is unordered only for NAN
+    // return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option
 }
+#endif
+
 
 // Function is_subnormal: gives true for elements that are denormal (subnormal)
 // false for finite numbers, zero, NAN and INF
@@ -1873,13 +1894,32 @@ static inline Vec4db is_inf(Vec4d const a) {
 
 // Function is_nan: gives true for elements that are +NAN or -NAN
 // false for finite numbers and +/-INF
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+#if INSTRSET >= 10
 static inline Vec4db is_nan(Vec4d const a) {
-#if INSTRSET >= 10  // compact boolean vectors
+    // class mask 0x81 selects the QNaN and SNaN classes; assume that compiler does not optimize this away with -ffinite-math-only:
     return _mm256_fpclass_pd_mask (a, 0x81);
+}
+//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) 
+//__attribute__((optimize("-fno-unsafe-math-optimizations")))
+//static inline Vec4db is_nan(Vec4d const a) {
+//    return a != a; // not safe with -ffinite-math-only compiler option
+//}
+#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)
+static inline Vec4db is_nan(Vec4d const a) { // Gnu/Clang version: inline assembly, presumably so that -ffinite-math-only cannot fold the NAN test away
+    __m256d aa = a; // plain register copy of the vector, used as the asm input operand
+    __m256d unordered; // receives one all-ones / all-zeros 64-bit element per double
+    __asm volatile("vcmppd $3, %1, %1, %0" : "=v" (unordered) :  "v" (aa) ); // predicate 3 = unordered; aa is compared with itself, so an element is true only if it is NAN
+    return Vec4db(unordered);
+}
 #else
-    return a != a;  // not safe with -ffinite-math-only compiler option
-#endif
+static inline Vec4db is_nan(Vec4d const a) {
+    // other compilers: use the compare intrinsic; assume that compiler does not optimize this away with -ffinite-math-only:
+    return _mm256_cmp_pd(a, a, 3); // compare unordered (predicate 3): a against itself is unordered only for NAN
+    // return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option
 }
+#endif
+
 
 // Function is_subnormal: gives true for elements that are denormal (subnormal)
 // false for finite numbers, zero, NAN and INF
 
@@ -1,8 +1,8 @@
 /****************************  vectorf512.h   *******************************
 * Author:        Agner Fog
 * Date created:  2014-07-23
-* Last modified: 2019-08-01
-* Version:       2.00.00
+* Last modified: 2019-10-27
+* Version:       2.00.02
 * Project:       vector class library
 * Description:
 * Header file defining 512-bit floating point vector classes
@@ -68,16 +68,16 @@ inline Vec8b::Vec8b(Vec4db const x0, Vec4db const x1) {
     mm = to_bits(x0) | (to_bits(x1) << 4);
 }
 
-Vec8ib Vec16b::get_low() const {
+inline Vec8ib Vec16b::get_low() const { // inline: out-of-class definition in a header; returns the low 8 mask bits as a Vec8ib
     return Vec8ib().load_bits(uint8_t(mm));
 }
-Vec8ib Vec16b::get_high() const {
+inline Vec8ib Vec16b::get_high() const { // inline: out-of-class definition in a header; returns mask bits 8-15 as a Vec8ib
     return Vec8ib().load_bits(uint8_t((uint16_t)mm >> 8u));
 }
-Vec4qb Vec8b::get_low() const {
+inline Vec4qb Vec8b::get_low() const { // inline: out-of-class definition in a header; returns the low 4 mask bits as a Vec4qb
     return Vec4qb().load_bits(mm & 0xF);
 }
-Vec4qb Vec8b::get_high() const {
+inline Vec4qb Vec8b::get_high() const { // inline: out-of-class definition in a header; returns the mask bits above bit 3 as a Vec4qb
     return Vec4qb().load_bits(mm >> 4u);
 }
 
@@ -466,18 +466,36 @@ static inline Vec16fb is_inf(Vec16f const a) {
     Vec16i t2 = t1 << 1;                // shift out sign bit
     return Vec16fb(t2 == 0xFF000000);   // exponent is all 1s, fraction is 0
 #endif
-}
+} 
 
 // Function is_nan: gives true for elements that are +NAN or -NAN
 // false for finite numbers and +/-INF
 // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+#if INSTRSET >= 10
 static inline Vec16fb is_nan(Vec16f const a) {
-#if INSTRSET >= 10  // __AVX512DQ__
+    // class mask 0x81 selects the QNaN and SNaN classes; assume that compiler does not optimize this away with -ffinite-math-only:
     return _mm512_fpclass_ps_mask(a, 0x81);
+}
+//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) 
+//__attribute__((optimize("-fno-unsafe-math-optimizations")))
+//static inline Vec16fb is_nan(Vec16f const a) {
+//    return a != a; // not safe with -ffinite-math-only compiler option
+//}
+#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)
+static inline Vec16fb is_nan(Vec16f const a) { // Gnu/Clang version: inline assembly, presumably so that -ffinite-math-only cannot fold the NAN test away
+    __m512 aa = a; // plain register copy of the vector, used as the asm input operand
+    __mmask16 unordered; // compact result: one mask bit per float element
+    __asm volatile("vcmpps $3, %1, %1, %0" : "=Yk" (unordered) :  "v" (aa) ); // predicate 3 = unordered, result written to a mask register; a bit is set only if the element is NAN
+    return Vec16fb(unordered);
+}
 #else
-    return a != a;  // not safe with -ffinite-math-only compiler option
-#endif
+static inline Vec16fb is_nan(Vec16f const a) {
+    // other compilers: use the mask compare intrinsic; assume that compiler does not optimize this away with -ffinite-math-only:
+    return Vec16fb().load_bits(_mm512_cmp_ps_mask(a, a, 3)); // compare unordered (predicate 3): a against itself is unordered only for NAN
+    // return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option
 }
+#endif
+
 
 // Function is_subnormal: gives true for elements that are denormal (subnormal)
 // false for finite numbers, zero, NAN and INF
@@ -1088,13 +1106,32 @@ static inline Vec8db is_inf(Vec8d const a) {
 
 // Function is_nan: gives true for elements that are +NAN or -NAN
 // false for finite numbers and +/-INF
+// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
+#if INSTRSET >= 10
 static inline Vec8db is_nan(Vec8d const a) {
-#if INSTRSET >= 10  // __AVX512DQ__
+    // class mask 0x81 selects the QNaN and SNaN classes; assume that compiler does not optimize this away with -ffinite-math-only:
     return _mm512_fpclass_pd_mask(a, 0x81);
+}
+//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) 
+//__attribute__((optimize("-fno-unsafe-math-optimizations")))
+//static inline Vec8db is_nan(Vec8d const a) {
+//    return a != a; // not safe with -ffinite-math-only compiler option
+//}
+#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)
+static inline Vec8db is_nan(Vec8d const a) { // Gnu/Clang version: inline assembly, presumably so that -ffinite-math-only cannot fold the NAN test away
+    __m512d aa = a; // plain register copy of the vector, used as the asm input operand
+    __mmask16 unordered; // NOTE(review): 16-bit mask type for an 8-element compare; presumably deliberate for targets without 8-bit mask moves - confirm
+    __asm volatile("vcmppd $3, %1, %1, %0" : "=Yk" (unordered) :  "v" (aa) ); // predicate 3 = unordered, result written to a mask register; a bit is set only if the element is NAN
+    return Vec8db(unordered);
+}
 #else
-    return a != a;  // not safe with -ffinite-math-only compiler option
-#endif
+static inline Vec8db is_nan(Vec8d const a) {
+    // other compilers: use the mask compare intrinsic; assume that compiler does not optimize this away with -ffinite-math-only:
+    return Vec8db().load_bits(_mm512_cmp_pd_mask(a, a, 3)); // compare unordered (predicate 3): a against itself is unordered only for NAN
+    // return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option
 }
+#endif
+
 
 // Function is_subnormal: gives true for elements that are denormal (subnormal)
 // false for finite numbers, zero, NAN and INF