#define Mln2 0.6931471805f /* natural logarithm of 2 */
#define B 1065353216.0f    /* 127 * 2^23, i.e. the bit pattern of 1.0f read as an integer */
/* NOTE(review): the kernels below also use the macros A and C. The listing places
 * their definitions at lines 70 and 72 of this header, which are not part of this
 * excerpt, so they are deliberately not redefined here. */

#ifndef INCLUDED_volk_32f_expfast_32f_a_H
#define INCLUDED_volk_32f_expfast_32f_a_H

#if LV_HAVE_AVX && LV_HAVE_FMA

#include <immintrin.h>

/**
 * Fast exponential over a float vector, aligned AVX + FMA variant.
 *
 * Eight floats at a time are mapped through a * x + b (one fused multiply-add),
 * converted to 32-bit integers, and those integer bits are then reinterpreted
 * as floats. NOTE(review): this looks like a Schraudolph-style approximation of
 * exp(x); its accuracy depends on A and C, which are defined outside this excerpt.
 * Elements left over after the last full group of eight are computed with expf().
 *
 * @param bVector    output buffer; written with aligned 256-bit stores, so it
 *                   must be 32-byte aligned
 * @param aVector    input buffer; read with aligned 256-bit loads, so it must
 *                   be 32-byte aligned
 * @param num_points number of floats to process
 */
static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal;
    /* Named expBits rather than exp so it cannot be confused with libm's exp(). */
    __m256i expBits;
    const __m256 a = _mm256_set1_ps(A / Mln2);
    const __m256 b = _mm256_set1_ps(B - C);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        expBits = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
        bVal = _mm256_castsi256_ps(expBits);

        _mm256_store_ps(bPtr, bVal);
        /* Advance to the next group of eight; without this every iteration
         * would reprocess the first vector and the tail loop would start at
         * element 0. */
        aPtr += 8;
        bPtr += 8;
    }

    /* Scalar tail for the remaining num_points % 8 elements. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */

/* NOTE(review): the guard line for this kernel is missing from the listing;
 * LV_HAVE_AVX is assumed by analogy with the sibling kernels - confirm. */
#ifdef LV_HAVE_AVX

#include <immintrin.h>

/**
 * Fast exponential over a float vector, aligned AVX variant (no FMA).
 *
 * Same scheme as the AVX + FMA kernel, but a * x + b is evaluated with a
 * separate multiply and add. Elements left over after the last full group of
 * eight are computed with expf().
 *
 * @param bVector    output buffer; written with aligned 256-bit stores, so it
 *                   must be 32-byte aligned
 * @param aVector    input buffer; read with aligned 256-bit loads, so it must
 *                   be 32-byte aligned
 * @param num_points number of floats to process
 */
static inline void
volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal;
    __m256i expBits;
    const __m256 a = _mm256_set1_ps(A / Mln2);
    const __m256 b = _mm256_set1_ps(B - C);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        expBits = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
        bVal = _mm256_castsi256_ps(expBits);

        _mm256_store_ps(bPtr, bVal);
        /* Step both cursors past the eight elements just handled. */
        aPtr += 8;
        bPtr += 8;
    }

    /* Scalar tail for the remaining num_points % 8 elements. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/**
 * Fast exponential over a float vector, aligned SSE4.1 variant.
 *
 * Four floats at a time are mapped through a * x + b, converted to 32-bit
 * integers, and the integer bits are reinterpreted as floats. Elements left
 * over after the last full group of four are computed with expf().
 *
 * @param bVector    output buffer; written with aligned 128-bit stores, so it
 *                   must be 16-byte aligned
 * @param aVector    input buffer; read with aligned 128-bit loads, so it must
 *                   be 16-byte aligned
 * @param num_points number of floats to process
 */
static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector,
                                                 const float* aVector,
                                                 unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal;
    __m128i expBits;
    const __m128 a = _mm_set1_ps(A / Mln2);
    const __m128 b = _mm_set1_ps(B - C);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        expBits = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
        bVal = _mm_castsi128_ps(expBits);

        _mm_store_ps(bPtr, bVal);
        /* Step both cursors past the four elements just handled. */
        aPtr += 4;
        bPtr += 4;
    }

    /* Scalar tail for the remaining num_points % 4 elements. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_expfast_32f_a_H */
#ifndef INCLUDED_volk_32f_expfast_32f_u_H
#define INCLUDED_volk_32f_expfast_32f_u_H

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

/**
 * Fast exponential over a float vector, unaligned AVX + FMA variant.
 *
 * Eight floats at a time are mapped through a * x + b (one fused multiply-add),
 * converted to 32-bit integers, and those integer bits are then reinterpreted
 * as floats. NOTE(review): this looks like a Schraudolph-style approximation of
 * exp(x); its accuracy depends on the macros A and C, which are defined outside
 * this excerpt. Elements left over after the last full group of eight are
 * computed with expf().
 *
 * @param bVector    output buffer (unaligned stores are used)
 * @param aVector    input buffer (unaligned loads are used)
 * @param num_points number of floats to process
 */
static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal;
    /* Named expBits rather than exp so it cannot be confused with libm's exp(). */
    __m256i expBits;
    const __m256 a = _mm256_set1_ps(A / Mln2);
    const __m256 b = _mm256_set1_ps(B - C);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        expBits = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
        bVal = _mm256_castsi256_ps(expBits);

        _mm256_storeu_ps(bPtr, bVal);
        /* Advance to the next group of eight; without this every iteration
         * would reprocess the first vector and the tail loop would start at
         * element 0. */
        aPtr += 8;
        bPtr += 8;
    }

    /* Scalar tail for the remaining num_points % 8 elements. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */

/* NOTE(review): the guard line for this kernel is missing from the listing;
 * LV_HAVE_AVX is assumed by analogy with the sibling kernels - confirm. */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/**
 * Fast exponential over a float vector, unaligned AVX variant (no FMA).
 *
 * Same scheme as the AVX + FMA kernel, but a * x + b is evaluated with a
 * separate multiply and add. Elements left over after the last full group of
 * eight are computed with expf().
 *
 * @param bVector    output buffer (unaligned stores are used)
 * @param aVector    input buffer (unaligned loads are used)
 * @param num_points number of floats to process
 */
static inline void
volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal;
    __m256i expBits;
    const __m256 a = _mm256_set1_ps(A / Mln2);
    const __m256 b = _mm256_set1_ps(B - C);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        expBits = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
        bVal = _mm256_castsi256_ps(expBits);

        _mm256_storeu_ps(bPtr, bVal);
        /* Step both cursors past the eight elements just handled. */
        aPtr += 8;
        bPtr += 8;
    }

    /* Scalar tail for the remaining num_points % 8 elements. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for unaligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/**
 * Fast exponential over a float vector, unaligned SSE4.1 variant.
 *
 * Four floats at a time are mapped through a * x + b, converted to 32-bit
 * integers, and the integer bits are reinterpreted as floats. Elements left
 * over after the last full group of four are computed with expf().
 *
 * @param bVector    output buffer (unaligned stores are used)
 * @param aVector    input buffer (unaligned loads are used)
 * @param num_points number of floats to process
 */
static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector,
                                                 const float* aVector,
                                                 unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal;
    __m128i expBits;
    const __m128 a = _mm_set1_ps(A / Mln2);
    const __m128 b = _mm_set1_ps(B - C);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        expBits = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
        bVal = _mm_castsi128_ps(expBits);

        _mm_storeu_ps(bPtr, bVal);
        /* Step both cursors past the four elements just handled. */
        aPtr += 4;
        bPtr += 4;
    }

    /* Scalar tail for the remaining num_points % 4 elements. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#ifdef LV_HAVE_GENERIC

/**
 * Portable reference kernel: writes expf(aVector[i]) to bVector[i] for every
 * i in [0, num_points). Unlike the SIMD kernels it calls expf() for every
 * element.
 *
 * @param bVector    output buffer, at least num_points floats
 * @param aVector    input buffer, at least num_points floats
 * @param num_points number of floats to process
 */
static inline void volk_32f_expfast_32f_generic(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32f_expfast_32f_u_H */
static void volk_32f_expfast_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:229
static void volk_32f_expfast_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:119
#define B
Definition: volk_32f_expfast_32f.h:71
#define C
Definition: volk_32f_expfast_32f.h:72
#define Mln2
Definition: volk_32f_expfast_32f.h:69
static void volk_32f_expfast_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:300
#define A
Definition: volk_32f_expfast_32f.h:70