Skip to content

Commit fe6c450

Browse files
authored
Add files via upload
Version 2.02.01
1 parent 08959eb commit fe6c450

13 files changed

+156
-87
lines changed

‎changelog.txt

+2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
Change log for Vector class library
22
-----------------------------------
3+
2023-06-03 version 2.02.01
4+
* minor bug fixes and updates
35

46
2022-07-20 version 2.02.00
57
* support half precision floating point vectors

‎dispatch_example2.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/************************* dispatch_example2.cpp ***************************
22
Author: Agner Fog
33
Date created: 2012-05-30
4-
Last modified: 2020-02-25
4+
Last modified: 2023-06-03
55
Version: 2.02.00
66
Project: vector class library
77
Description: Example of automatic CPU dispatching.
@@ -49,7 +49,7 @@ clang++ -O2 -m64 -msse2 -std=c++17 dispatch_example2.cpp instrset_detect.cpp d7.
4949
# Run the program
5050
./test.exe
5151
52-
(c) Copyright 2012-2022 Agner Fog.
52+
(c) Copyright 2012-2023 Agner Fog.
5353
Apache License version 2.0 or later.
5454
******************************************************************************/
5555

@@ -168,7 +168,7 @@ float myfunc_dispatch(float const f[]) {
168168
// Choose which version of the entry function we want to point to:
169169
if (iset >= 10) myfunc_pointer = &Ns_AVX512::myfunc; // AVX512 version
170170
else if (iset >= 8) myfunc_pointer = &Ns_AVX2::myfunc; // AVX2 version
171-
else if (iset >= 5) myfunc_pointer = &Ns_AVX::myfunc; // AVX version
171+
else if (iset >= 7) myfunc_pointer = &Ns_AVX::myfunc; // AVX version
172172
else if (iset >= 2) myfunc_pointer = &Ns_SSE2::myfunc; // SSE2 version
173173
else {
174174
// Error: lowest instruction set not supported.

‎instrset.h

+4-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
/**************************** instrset.h **********************************
22
* Author: Agner Fog
33
* Date created: 2012-05-30
4-
* Last modified: 2022-07-26
5-
* Version: 2.02.00
4+
* Last modified: 2023-06-03
5+
* Version: 2.02.01
66
* Project: vector class library
77
* Description:
88
* Header file for various compiler-specific tasks as well as common
@@ -16,7 +16,7 @@
1616
*
1717
* For instructions, see vcl_manual.pdf
1818
*
19-
* (c) Copyright 2012-2022 Agner Fog.
19+
* (c) Copyright 2012-2023 Agner Fog.
2020
* Apache License version 2.0 or later.
2121
******************************************************************************/
2222

@@ -110,6 +110,7 @@
110110
#endif
111111

112112
#include <stdint.h> // Define integer types with known size
113+
#include <limits.h> // Define INT_MAX
113114
#include <stdlib.h> // define abs(int)
114115

115116

‎vectorf128.h

+11-5
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
/**************************** vectorf128.h *******************************
22
* Author: Agner Fog
33
* Date created: 2012-05-30
4-
* Last modified: 2022-07-20
5-
* Version: 2.02.00
4+
* Last modified: 2023-06-03
5+
* Version: 2.02.01
66
* Project: vector class library
77
* Description:
88
* Header file defining 128-bit floating point vector classes
@@ -18,7 +18,7 @@
1818
* Each vector object is represented internally in the CPU as a 128-bit register.
1919
* This header file defines operators and functions for these vectors.
2020
*
21-
* (c) Copyright 2012-2022 Agner Fog.
21+
* (c) Copyright 2012-2023 Agner Fog.
2222
* Apache License version 2.0 or later.
2323
*****************************************************************************/
2424

@@ -2801,7 +2801,10 @@ static inline Vec4f lookup(Vec4i const index, float const * table) {
28012801
}
28022802
// n > 8. Limit index
28032803
Vec4ui index1;
2804-
if constexpr ((n & (n - 1)) == 0) {
2804+
if constexpr (n == INT_MAX) {
2805+
index1 = index;
2806+
}
2807+
else if constexpr ((n & (n - 1)) == 0) {
28052808
// n is a power of 2, make index modulo n
28062809
index1 = Vec4ui(index) & (n - 1);
28072810
}
@@ -2853,7 +2856,10 @@ static inline Vec2d lookup(Vec2q const index, double const * table) {
28532856
#endif
28542857
// Limit index
28552858
Vec2uq index1;
2856-
if constexpr ((n & (n - 1)) == 0) {
2859+
if constexpr (n == INT_MAX) {
2860+
index1 = index;
2861+
}
2862+
else if constexpr ((n & (n - 1)) == 0) {
28572863
// n is a power of 2, make index modulo n
28582864
index1 = Vec2uq(index) & (n - 1);
28592865
}

‎vectorf256.h

+11-5
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
/**************************** vectorf256.h *******************************
22
* Author: Agner Fog
33
* Date created: 2012-05-30
4-
* Last modified: 2022-07-20
5-
* Version: 2.02.00
4+
* Last modified: 2023-06-03
5+
* Version: 2.02.01
66
* Project: vector class library
77
* Description:
88
* Header file defining 256-bit floating point vector classes
@@ -18,7 +18,7 @@
1818
* Each vector object is represented internally in the CPU as a 256-bit register.
1919
* This header file defines operators and functions for these vectors.
2020
*
21-
* (c) Copyright 2012-2022 Agner Fog.
21+
* (c) Copyright 2012-2023 Agner Fog.
2222
* Apache License version 2.0 or later.
2323
*****************************************************************************/
2424

@@ -2843,7 +2843,10 @@ static inline Vec8f lookup(Vec8i const index, float const * table) {
28432843
#endif
28442844
// Limit index
28452845
Vec8ui index1;
2846-
if constexpr ((n & (n-1)) == 0) {
2846+
if constexpr (n == INT_MAX) {
2847+
index1 = index;
2848+
}
2849+
else if constexpr ((n & (n-1)) == 0) {
28472850
// n is a power of 2, make index modulo n
28482851
index1 = Vec8ui(index) & (n-1);
28492852
}
@@ -2907,7 +2910,10 @@ static inline Vec4d lookup(Vec4q const index, double const * table) {
29072910
#endif
29082911
// Limit index
29092912
Vec4uq index1;
2910-
if constexpr ((n & (n-1)) == 0) {
2913+
if constexpr (n == INT_MAX) {
2914+
index1 = index;
2915+
}
2916+
else if constexpr ((n & (n-1)) == 0) {
29112917
// n is a power of 2, make index modulo n
29122918
index1 = Vec4uq(index) & Vec4uq(n-1);
29132919
}

‎vectorf256e.h

+11-5
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
/**************************** vectorf256e.h *******************************
22
* Author: Agner Fog
33
* Date created: 2012-05-30
4-
* Last modified: 2022-07-20
5-
* Version: 2.02.00
4+
* Last modified: 2023-06-03
5+
* Version: 2.02.01
66
* Project: vector class library
77
* Description:
88
* Header file defining 256-bit floating point vector classes
@@ -19,7 +19,7 @@
1919
* Each vector object is represented internally in the CPU as two 128-bit registers.
2020
* This header file defines operators and functions for these vectors.
2121
*
22-
* (c) Copyright 2012-2022 Agner Fog.
22+
* (c) Copyright 2012-2023 Agner Fog.
2323
* Apache License version 2.0 or later.
2424
*****************************************************************************/
2525

@@ -1827,7 +1827,10 @@ static inline Vec8f lookup(Vec8i const index, float const * table) {
18271827
}
18281828
// Limit index
18291829
Vec8ui index1;
1830-
if constexpr ((n & (n-1)) == 0) {
1830+
if constexpr (n == INT_MAX) {
1831+
index1 = index;
1832+
}
1833+
else if constexpr ((n & (n-1)) == 0) {
18311834
// n is a power of 2, make index modulo n
18321835
index1 = Vec8ui(index) & (n-1);
18331836
}
@@ -1856,7 +1859,10 @@ static inline Vec4d lookup(Vec4q const index, double const * table) {
18561859
}
18571860
// Limit index
18581861
Vec8ui index1;
1859-
if constexpr ((n & (n-1)) == 0) {
1862+
if constexpr (n == INT_MAX) {
1863+
index1 = Vec8ui(index);
1864+
}
1865+
else if constexpr ((n & (n-1)) == 0) {
18601866
// n is a power of 2, make index modulo n
18611867
index1 = Vec8ui(index) & Vec8ui(n-1, 0, n-1, 0, n-1, 0, n-1, 0);
18621868
}

‎vectorfp16.h

+21-21
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
/**************************** vectorfp16.h *******************************
22
* Author: Agner Fog
33
* Date created: 2022-05-03
4-
* Last modified: 2022-07-20
5-
* Version: 2.02.00
4+
* Last modified: 2023-06-03
5+
* Version: 2.02.01
66
* Project: vector class library
77
* Description:
88
* Header file defining half precision floating point vector classes
@@ -23,7 +23,7 @@
2323
* g++ version 12.1 with binutils version 2.34
2424
* Intel c++ compiler version 2022.0
2525
*
26-
* (c) Copyright 2012-2022 Agner Fog.
26+
* (c) Copyright 2012-2023 Agner Fog.
2727
* Apache License version 2.0 or later.
2828
*****************************************************************************/
2929

@@ -687,24 +687,24 @@ static inline Vec8h change_sign(Vec8h const a) {
687687

688688
// conversions Vec8h <-> Vec4f
689689
// extend precision: Vec8h -> Vec4f. upper half ignored
690-
Vec4f convert8h_4f (Vec8h h) {
690+
static inline Vec4f convert8h_4f (Vec8h h) {
691691
return _mm_cvtph_ps(_mm_castph_si128(h));
692692
}
693693

694694
// reduce precision: Vec4f -> Vec8h. upper half zero
695-
Vec8h convert4f_8h (Vec4f f) {
695+
static inline Vec8h convert4f_8h (Vec4f f) {
696696
return _mm_castsi128_ph(_mm_cvtps_ph(f, 0));
697697
}
698698

699699
#if MAX_VECTOR_SIZE >= 256
700700
// conversions Vec8h <-> Vec8f
701701
// extend precision: Vec8h -> Vec8f
702-
Vec8f to_float (Vec8h h) {
702+
static inline Vec8f to_float (Vec8h h) {
703703
return _mm256_cvtph_ps(_mm_castph_si128(h));
704704
}
705705

706706
// reduce precision: Vec8f -> Vec8h
707-
Vec8h to_float16 (Vec8f f) {
707+
static inline Vec8h to_float16 (Vec8f f) {
708708
return _mm_castsi128_ph(_mm256_cvtps_ph(f, 0));
709709
}
710710
#endif
@@ -1308,7 +1308,7 @@ inline Vec16h pow<uint32_t>(Vec16h const x0, uint32_t const n) {
13081308

13091309
// implement as function pow(vector, const_int)
13101310
template <int n>
1311-
static inline Vec16h pow(Vec16h const a, Const_int_t<n>) {
1311+
Vec16h pow(Vec16h const a, Const_int_t<n>) {
13121312
return pow_n<Vec16h, n>(a);
13131313
}
13141314

@@ -1422,7 +1422,7 @@ static inline Vec16h exp2(Vec16s const n) {
14221422
// Each index i0 - i15 is 1 for changing sign on the corresponding element, 0 for no change
14231423
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
14241424
int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
1425-
static inline Vec16h change_sign(Vec16h const a) {
1425+
Vec16h change_sign(Vec16h const a) {
14261426
if constexpr ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15) == 0) return a;
14271427
__m256i mask = constant8ui<
14281428
(i0 ? 0x8000 : 0) | (i1 ? 0x80000000 : 0),
@@ -1443,12 +1443,12 @@ static inline Vec16h change_sign(Vec16h const a) {
14431443
*****************************************************************************/
14441444
#if MAX_VECTOR_SIZE >= 512
14451445
// extend precision: Vec16h -> Vec16f
1446-
Vec16f to_float (Vec16h h) {
1446+
static inline Vec16f to_float (Vec16h h) {
14471447
return _mm512_cvtph_ps(_mm256_castph_si256(h));
14481448
}
14491449

14501450
// reduce precision: Vec16f -> Vec16h
1451-
Vec16h to_float16 (Vec16f f) {
1451+
static inline Vec16h to_float16 (Vec16f f) {
14521452
return _mm256_castsi256_ph(_mm512_cvtps_ph(f, 0));
14531453
}
14541454
#endif
@@ -1496,7 +1496,7 @@ static inline Vec16h extend_z(Vec8h a) {
14961496
// permute vector Vec16h
14971497
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
14981498
int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
1499-
static inline Vec16h permute16(Vec16h const a) {
1499+
Vec16h permute16(Vec16h const a) {
15001500
return _mm256_castsi256_ph (
15011501
permute16<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15> (
15021502
Vec16s(_mm256_castph_si256(a))));
@@ -1512,7 +1512,7 @@ static inline Vec16h permute16(Vec16h const a) {
15121512
// permute and blend Vec16h
15131513
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
15141514
int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
1515-
static inline Vec16h blend16(Vec16h const a, Vec16h const b) {
1515+
Vec16h blend16(Vec16h const a, Vec16h const b) {
15161516
return _mm256_castsi256_ph (
15171517
blend16<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15> (
15181518
Vec16s(_mm256_castph_si256(a)), Vec16s(_mm256_castph_si256(b))));
@@ -1535,7 +1535,7 @@ static inline Vec16h lookup16 (Vec16s const index, Vec16h const table) {
15351535
}
15361536

15371537
template <int n>
1538-
static inline Vec16h lookup(Vec16s const index, void const * table) {
1538+
Vec16h lookup(Vec16s const index, void const * table) {
15391539
return _mm256_castsi256_ph(lookup<n>(index, (void const *)(table)));
15401540
}
15411541

@@ -2063,7 +2063,7 @@ inline Vec32h pow<uint32_t>(Vec32h const x0, uint32_t const n) {
20632063

20642064
// implement as function pow(vector, const_int)
20652065
template <int n>
2066-
static inline Vec32h pow(Vec32h const a, Const_int_t<n>) {
2066+
Vec32h pow(Vec32h const a, Const_int_t<n>) {
20672067
return pow_n<Vec32h, n>(a);
20682068
}
20692069

@@ -2178,7 +2178,7 @@ template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
21782178
int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15,
21792179
int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
21802180
int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 >
2181-
static inline Vec32h change_sign(Vec32h const a) {
2181+
Vec32h change_sign(Vec32h const a) {
21822182
if constexpr ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7 | i8 | i9 | i10 | i11 | i12 | i13 | i14 | i15 |
21832183
i16 | i17 | i18 | i19 | i20 | i21 | i22 | i23 | i24 | i25 | i26 | i27 | i28 | i29 | i30 | i31)
21842184
== 0) return a;
@@ -2247,7 +2247,7 @@ template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
22472247
int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15,
22482248
int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
22492249
int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 >
2250-
static inline Vec32h permute32(Vec32h const a) {
2250+
Vec32h permute32(Vec32h const a) {
22512251
return _mm512_castsi512_ph (
22522252
permute32<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15,
22532253
i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31 > (
@@ -2266,7 +2266,7 @@ template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
22662266
int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15,
22672267
int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23,
22682268
int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 >
2269-
static inline Vec32h blend32(Vec32h const a, Vec32h const b) {
2269+
Vec32h blend32(Vec32h const a, Vec32h const b) {
22702270
return _mm512_castsi512_ph (
22712271
blend32<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15,
22722272
i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31 > (
@@ -2307,7 +2307,7 @@ static inline Vec32h lookup(Vec32s const index, void const * table) {
23072307

23082308
// pow(2,n)
23092309
template <typename V>
2310-
static inline V vh_pow2n (V const n) {
2310+
V vh_pow2n (V const n) {
23112311
typedef decltype(roundi(n)) VI; // corresponding integer vector type
23122312
const _Float16 pow2_10 = 1024.; // 2^10
23132313
const _Float16 bias = 15.; // bias in exponent
@@ -2355,7 +2355,7 @@ inline Vec32h infinite_vech<Vec32h>() {
23552355
// BA: 0 for exp, 1 for 0.5*exp, 2 for pow(2,x), 10 for pow(10,x)
23562356

23572357
template<typename VTYPE, int M1, int BA>
2358-
static inline VTYPE exp_h(VTYPE const initial_x) {
2358+
VTYPE exp_h(VTYPE const initial_x) {
23592359

23602360
// Taylor coefficients
23612361
const _Float16 P0expf = 1.f/2.f;
@@ -2444,7 +2444,7 @@ static inline Vec32us unsigned_int_type(Vec32h) { return 0; }
24442444
// xx = input x (radians)
24452445
// cosret = return pointer (only if SC = 3)
24462446
template<typename VTYPE, int SC>
2447-
static inline VTYPE sincos_h(VTYPE * cosret, VTYPE const xx) {
2447+
VTYPE sincos_h(VTYPE * cosret, VTYPE const xx) {
24482448

24492449
// define constants
24502450
const _Float16 dp1h = 1.57031250f; // pi/2 with lower bits of mantissa removed

0 commit comments

Comments
 (0)