Skip to content

Commit 42de0e6

Browse files
authored
version 2.00.02
1 parent 1a0ccf5 commit 42de0e6

10 files changed

+180
-68
lines changed

‎instrset.h

+15-11
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
/**************************** instrset.h **********************************
22
* Author: Agner Fog
33
* Date created: 2012-05-30
4-
* Last modified: 2019-08-30
5-
* Version: 2.00.01
4+
* Last modified: 2019-10-31
5+
* Version: 2.00.02
66
* Project: vector class library
77
* Description:
88
* Header file for various compiler-specific tasks as well as common
@@ -153,6 +153,10 @@ namespace VCL_NAMESPACE {
153153
}
154154
#endif
155155

156+
// functions in physical_processors.cpp:
157+
int physicalProcessors(int * logical_processors = 0);
158+
159+
156160
// GCC version
157161
#if defined(__GNUC__) && !defined (GCC_VERSION) && !defined (__clang__)
158162
#define GCC_VERSION ((__GNUC__) * 10000 + (__GNUC_MINOR__) * 100 + (__GNUC_PATCHLEVEL__))
@@ -220,30 +224,30 @@ constexpr int V_DC = -256;
220224
*****************************************************************************/
221225

222226
// Define interface to cpuid instruction.
223-
// input: eax = functionnumber, ecx = 0
227+
// input: functionnumber = CPUID leaf (eax), ecxleaf = CPUID subleaf (ecx)
224228
// output: output[0] = eax, output[1] = ebx, output[2] = ecx, output[3] = edx
225-
static inline void cpuid(int output[4], int functionnumber) {
229+
static inline void cpuid(int output[4], int functionnumber, int ecxleaf = 0) {
226230
#if defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax
227231
int a, b, c, d;
228-
__asm("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(functionnumber), "c"(0) : );
232+
__asm("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(functionnumber), "c"(ecxleaf) : );
229233
output[0] = a;
230234
output[1] = b;
231235
output[2] = c;
232236
output[3] = d;
233237

234238
#elif defined (_MSC_VER) // Microsoft compiler, intrin.h included
235-
__cpuidex(output, functionnumber, 0); // intrinsic function for CPUID
239+
__cpuidex(output, functionnumber, ecxleaf); // intrinsic function for CPUID
236240

237241
#else // unknown platform. try inline assembly with masm/intel syntax
238242
__asm {
239243
mov eax, functionnumber
240-
xor ecx, ecx
244+
mov ecx, ecxleaf
241245
cpuid;
242246
mov esi, output
243-
mov[esi], eax
244-
mov[esi + 4], ebx
245-
mov[esi + 8], ecx
246-
mov[esi + 12], edx
247+
mov[esi], eax
248+
mov[esi + 4], ebx
249+
mov[esi + 8], ecx
250+
mov[esi + 12], edx
247251
}
248252
#endif
249253
}

‎instrset_detect.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ static inline uint64_t xgetbv (int ctr) {
2525

2626
return uint64_t(_xgetbv(ctr)); // intrinsic function for XGETBV
2727

28-
#elif defined(__GNUC__) // use inline assembly, Gnu/AT&T syntax
28+
#elif defined(__GNUC__) || defined (__clang__) // use inline assembly, Gnu/AT&T syntax
2929

3030
uint32_t a, d;
3131
__asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : );

‎vectorclass.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
/**************************** vectorclass.h ********************************
22
* Author: Agner Fog
33
* Date created: 2012-05-30
4-
* Last modified: 2019-09-14
5-
* Version: 2.00.01
4+
* Last modified: 2019-10-27
5+
* Version: 2.00.02
66
* Project: vector class library
77
* Home: https://github.com/vectorclass
88
* Description:

‎vectorf128.h

+43-23
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
/**************************** vectorf128.h *******************************
22
* Author: Agner Fog
33
* Date created: 2012-05-30
4-
* Last modified: 2019-09-14
5-
* Version: 2.00.01
4+
* Last modified: 2019-10-27
5+
* Version: 2.00.02
66
* Project: vector class library
77
* Description:
88
* Header file defining 128-bit floating point vector classes
@@ -970,22 +970,31 @@ static inline Vec4fb is_inf(Vec4f const a) {
970970
#endif
971971
}
972972

973+
// Function is_nan: gives true for elements that are +NAN or -NAN
974+
// false for finite numbers and +/-INF
975+
// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
973976
#if INSTRSET >= 10
974977
static inline Vec4fb is_nan(Vec4f const a) {
978+
// 0x81 selects the QNaN (bit 0) and SNaN (bit 7) categories.
// assume that the compiler does not optimize this away with -ffinite-math-only:
975979
return Vec4fb(_mm_fpclass_ps_mask(a, 0x81));
976980
}
981+
//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
982+
//__attribute__((optimize("-fno-unsafe-math-optimizations")))
983+
//static inline Vec4fb is_nan(Vec4f const a) {
984+
// return a != a; // not safe with -ffinite-math-only compiler option
985+
//}
986+
#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)
987+
static inline Vec4fb is_nan(Vec4f const a) {
988+
__m128 aa = a;
989+
__m128i unordered;
990+
__asm volatile("vcmpps $3, %1, %1, %0" : "=x" (unordered) : "x" (aa) );
991+
return Vec4fb(unordered);
992+
}
977993
#else
978-
// Function is_nan: gives true for elements that are +NAN or -NAN
979-
// false for finite numbers and +/-INF
980-
// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
981-
//__attribute__ ((optimize("-fno-unsafe-math-optimizations")));
982-
#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
983-
__attribute__((optimize("-fno-unsafe-math-optimizations")))
984-
#elif defined(__clang__)
985-
__attribute__((optnone))
986-
#endif
987994
static inline Vec4fb is_nan(Vec4f const a) {
988-
return a != a; // not safe with -ffinite-math-only compiler option
995+
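// assume that the compiler does not optimize this away with -ffinite-math-only.
// NOTE(review): _mm_cmp_ps is an AVX intrinsic (predicate 3 = unordered); confirm this
// fallback is never built for targets without AVX, where _mm_cmpunord_ps would be needed: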
996+
return _mm_cmp_ps(a, a, 3); // compare unordered
997+
// return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option
989998
}
990999
#endif
9911000

@@ -1934,25 +1943,36 @@ static inline Vec2db is_inf(Vec2d const a) {
19341943
#endif
19351944
}
19361945

1946+
1947+
// Function is_nan: gives true for elements that are +NAN or -NAN
1948+
// false for finite numbers and +/-INF
1949+
// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
19371950
#if INSTRSET >= 10
19381951
static inline Vec2db is_nan(Vec2d const a) {
1939-
return _mm_fpclass_pd_mask(a, 0x81);
1952+
// assume that compiler does not optimize this away with -ffinite-math-only:
1953+
return Vec2db(_mm_fpclass_pd_mask(a, 0x81));
1954+
}
1955+
//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
1956+
//__attribute__((optimize("-fno-unsafe-math-optimizations")))
1957+
//static inline Vec2db is_nan(Vec2d const a) {
1958+
// return a != a; // not safe with -ffinite-math-only compiler option
1959+
//}
1960+
#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)
1961+
static inline Vec2db is_nan(Vec2d const a) {
1962+
__m128d aa = a;
1963+
__m128i unordered;
1964+
__asm volatile("vcmppd $3, %1, %1, %0" : "=x" (unordered) : "x" (aa) );
1965+
return Vec2db(unordered);
19401966
}
19411967
#else
1942-
// Function is_nan: gives true for elements that are +NAN or -NAN
1943-
// false for finite numbers and +/-INF
1944-
// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
1945-
//__attribute__ ((optimize("-fno-unsafe-math-optimizations")));
1946-
#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
1947-
__attribute__((optimize("-fno-unsafe-math-optimizations")))
1948-
#elif defined(__clang__)
1949-
__attribute__((optnone))
1950-
#endif
19511968
static inline Vec2db is_nan(Vec2d const a) {
1952-
return a != a; // not safe with -ffinite-math-only compiler option
1969+
// assume that compiler does not optimize this away with -ffinite-math-only:
1970+
return _mm_cmp_pd(a, a, 3); // compare unordered
1971+
// return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option
19531972
}
19541973
#endif
19551974

1975+
19561976
// Function is_subnormal: gives true for elements that are subnormal (denormal)
19571977
// false for finite numbers, zero, NAN and INF
19581978
static inline Vec2db is_subnormal(Vec2d const a) {

‎vectorf256.h

+48-8
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
/**************************** vectorf256.h *******************************
22
* Author: Agner Fog
33
* Date created: 2012-05-30
4-
* Last modified: 2019-08-01
5-
* Version: 2.00.00
4+
* Last modified: 2019-10-27
5+
* Version: 2.00.02
66
* Project: vector class library
77
* Description:
88
* Header file defining 256-bit floating point vector classes
@@ -52,11 +52,14 @@ namespace VCL_NAMESPACE {
5252
// Generate a constant vector of 8 integers stored in memory
5353
template <uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7>
5454
inline __m256 constant8f() {
55+
/*
5556
const union {
5657
uint32_t i[8];
5758
__m256 ymm;
5859
} u = {{i0,i1,i2,i3,i4,i5,i6,i7}};
5960
return u.ymm;
61+
*/
62+
return _mm256_castsi256_ps(_mm256_setr_epi32(i0,i1,i2,i3,i4,i5,i6,i7));
6063
}
6164

6265

@@ -1067,13 +1070,31 @@ static inline Vec8fb is_inf(Vec8f const a) {
10671070
// Function is_nan: gives true for elements that are +NAN or -NAN
10681071
// false for finite numbers and +/-INF
10691072
// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
1073+
#if INSTRSET >= 10
10701074
static inline Vec8fb is_nan(Vec8f const a) {
1071-
#if INSTRSET >= 10 // compact boolean vectors
1075+
// assume that compiler does not optimize this away with -ffinite-math-only:
10721076
return _mm256_fpclass_ps_mask (a, 0x81);
1077+
}
1078+
//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
1079+
//__attribute__((optimize("-fno-unsafe-math-optimizations")))
1080+
//static inline Vec8fb is_nan(Vec8f const a) {
1081+
// return a != a; // not safe with -ffinite-math-only compiler option
1082+
//}
1083+
#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)
1084+
static inline Vec8fb is_nan(Vec8f const a) {
1085+
__m256 aa = a;
1086+
__m256 unordered;
1087+
__asm volatile("vcmpps $3, %1, %1, %0" : "=v" (unordered) : "v" (aa) );
1088+
return Vec8fb(unordered);
1089+
}
10731090
#else
1074-
return a != a; // not safe with -ffinite-math-only compiler option
1075-
#endif
1091+
static inline Vec8fb is_nan(Vec8f const a) {
1092+
// assume that compiler does not optimize this away with -ffinite-math-only:
1093+
return _mm256_cmp_ps(a, a, 3); // compare unordered
1094+
// return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option
10761095
}
1096+
#endif
1097+
10771098

10781099
// Function is_subnormal: gives true for elements that are denormal (subnormal)
10791100
// false for finite numbers, zero, NAN and INF
@@ -1873,13 +1894,32 @@ static inline Vec4db is_inf(Vec4d const a) {
18731894

18741895
// Function is_nan: gives true for elements that are +NAN or -NAN
18751896
// false for finite numbers and +/-INF
1897+
// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
1898+
#if INSTRSET >= 10
18761899
static inline Vec4db is_nan(Vec4d const a) {
1877-
#if INSTRSET >= 10 // compact boolean vectors
1900+
// assume that compiler does not optimize this away with -ffinite-math-only:
18781901
return _mm256_fpclass_pd_mask (a, 0x81);
1902+
}
1903+
//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
1904+
//__attribute__((optimize("-fno-unsafe-math-optimizations")))
1905+
//static inline Vec4db is_nan(Vec4d const a) {
1906+
// return a != a; // not safe with -ffinite-math-only compiler option
1907+
//}
1908+
#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)
1909+
static inline Vec4db is_nan(Vec4d const a) {
1910+
__m256d aa = a;
1911+
__m256d unordered;
1912+
__asm volatile("vcmppd $3, %1, %1, %0" : "=v" (unordered) : "v" (aa) );
1913+
return Vec4db(unordered);
1914+
}
18791915
#else
1880-
return a != a; // not safe with -ffinite-math-only compiler option
1881-
#endif
1916+
static inline Vec4db is_nan(Vec4d const a) {
1917+
// assume that compiler does not optimize this away with -ffinite-math-only:
1918+
return _mm256_cmp_pd(a, a, 3); // compare unordered
1919+
// return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option
18821920
}
1921+
#endif
1922+
18831923

18841924
// Function is_subnormal: gives true for elements that are denormal (subnormal)
18851925
// false for finite numbers, zero, NAN and INF

‎vectorf512.h

+50-13
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
/**************************** vectorf512.h *******************************
22
* Author: Agner Fog
33
* Date created: 2014-07-23
4-
* Last modified: 2019-08-01
5-
* Version: 2.00.00
4+
* Last modified: 2019-10-27
5+
* Version: 2.00.02
66
* Project: vector class library
77
* Description:
88
* Header file defining 512-bit floating point vector classes
@@ -68,16 +68,16 @@ inline Vec8b::Vec8b(Vec4db const x0, Vec4db const x1) {
6868
mm = to_bits(x0) | (to_bits(x1) << 4);
6969
}
7070

71-
Vec8ib Vec16b::get_low() const {
71+
inline Vec8ib Vec16b::get_low() const {
7272
return Vec8ib().load_bits(uint8_t(mm));
7373
}
74-
Vec8ib Vec16b::get_high() const {
74+
inline Vec8ib Vec16b::get_high() const {
7575
return Vec8ib().load_bits(uint8_t((uint16_t)mm >> 8u));
7676
}
77-
Vec4qb Vec8b::get_low() const {
77+
inline Vec4qb Vec8b::get_low() const {
7878
return Vec4qb().load_bits(mm & 0xF);
7979
}
80-
Vec4qb Vec8b::get_high() const {
80+
inline Vec4qb Vec8b::get_high() const {
8181
return Vec4qb().load_bits(mm >> 4u);
8282
}
8383

@@ -466,18 +466,36 @@ static inline Vec16fb is_inf(Vec16f const a) {
466466
Vec16i t2 = t1 << 1; // shift out sign bit
467467
return Vec16fb(t2 == 0xFF000000); // exponent is all 1s, fraction is 0
468468
#endif
469-
}
469+
}
470470

471471
// Function is_nan: gives true for elements that are +NAN or -NAN
472472
// false for finite numbers and +/-INF
473473
// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
474+
#if INSTRSET >= 10
474475
static inline Vec16fb is_nan(Vec16f const a) {
475-
#if INSTRSET >= 10 // __AVX512DQ__
476+
// assume that compiler does not optimize this away with -ffinite-math-only:
476477
return _mm512_fpclass_ps_mask(a, 0x81);
478+
}
479+
//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
480+
//__attribute__((optimize("-fno-unsafe-math-optimizations")))
481+
//static inline Vec16fb is_nan(Vec16f const a) {
482+
// return a != a; // not safe with -ffinite-math-only compiler option
483+
//}
484+
#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)
485+
static inline Vec16fb is_nan(Vec16f const a) {
486+
__m512 aa = a;
487+
__mmask16 unordered;
488+
__asm volatile("vcmpps $3, %1, %1, %0" : "=Yk" (unordered) : "v" (aa) );
489+
return Vec16fb(unordered);
490+
}
477491
#else
478-
return a != a; // not safe with -ffinite-math-only compiler option
479-
#endif
492+
static inline Vec16fb is_nan(Vec16f const a) {
493+
// assume that compiler does not optimize this away with -ffinite-math-only:
494+
return Vec16fb().load_bits(_mm512_cmp_ps_mask(a, a, 3)); // compare unordered
495+
// return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option
480496
}
497+
#endif
498+
481499

482500
// Function is_subnormal: gives true for elements that are denormal (subnormal)
483501
// false for finite numbers, zero, NAN and INF
@@ -1088,13 +1106,32 @@ static inline Vec8db is_inf(Vec8d const a) {
10881106

10891107
// Function is_nan: gives true for elements that are +NAN or -NAN
10901108
// false for finite numbers and +/-INF
1109+
// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
1110+
#if INSTRSET >= 10
10911111
static inline Vec8db is_nan(Vec8d const a) {
1092-
#if INSTRSET >= 10 // __AVX512DQ__
1112+
// assume that compiler does not optimize this away with -ffinite-math-only:
10931113
return _mm512_fpclass_pd_mask(a, 0x81);
1114+
}
1115+
//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
1116+
//__attribute__((optimize("-fno-unsafe-math-optimizations")))
1117+
//static inline Vec8db is_nan(Vec8d const a) {
1118+
// return a != a; // not safe with -ffinite-math-only compiler option
1119+
//}
1120+
#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)
1121+
static inline Vec8db is_nan(Vec8d const a) {
1122+
__m512d aa = a;
1123+
__mmask16 unordered;
1124+
__asm volatile("vcmppd $3, %1, %1, %0" : "=Yk" (unordered) : "v" (aa) );
1125+
return Vec8db(unordered);
1126+
}
10941127
#else
1095-
return a != a; // not safe with -ffinite-math-only compiler option
1096-
#endif
1128+
static inline Vec8db is_nan(Vec8d const a) {
1129+
// assume that compiler does not optimize this away with -ffinite-math-only:
1130+
return Vec8db().load_bits(_mm512_cmp_pd_mask(a, a, 3)); // compare unordered
1131+
// return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option
10971132
}
1133+
#endif
1134+
10981135

10991136
// Function is_subnormal: gives true for elements that are denormal (subnormal)
11001137
// false for finite numbers, zero, NAN and INF

0 commit comments

Comments
 (0)