77 #ifndef INCLUDED_volk_32f_acos_32f_a_H 78 #define INCLUDED_volk_32f_acos_32f_a_H 80 #if LV_HAVE_AVX2 && LV_HAVE_FMA 81 #include <immintrin.h> 84 volk_32f_acos_32f_a_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
86 float* bPtr = bVector;
87 const float* aPtr = aVector;
89 unsigned int number = 0;
90 unsigned int eighthPoints = num_points / 8;
93 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
94 __m256 fzeroes, fones, ftwos, ffours, condition;
96 pi = _mm256_set1_ps(3.14159265358979323846);
97 pio2 = _mm256_set1_ps(3.14159265358979323846/2);
98 fzeroes = _mm256_setzero_ps();
99 fones = _mm256_set1_ps(1.0);
100 ftwos = _mm256_set1_ps(2.0);
101 ffours = _mm256_set1_ps(4.0);
103 for(;number < eighthPoints; number++){
104 aVal = _mm256_load_ps(aPtr);
106 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
108 condition = _mm256_cmp_ps(z, fzeroes,1);
109 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
111 condition = _mm256_cmp_ps(z, fones,1);
112 x = _mm256_add_ps(x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
114 for(i = 0; i < 2; i++)
115 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x,fones)));
116 x = _mm256_div_ps(fones, x);
119 y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
121 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
122 condition = _mm256_cmp_ps(z, fones,14);
124 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
126 condition = _mm256_cmp_ps(aVal, fzeroes,1);
127 arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
128 condition = _mm256_cmp_ps(d, fzeroes,1);
129 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
131 _mm256_store_ps(bPtr, arccosine);
136 number = eighthPoints * 8;
137 for(;number < num_points; number++){
138 *bPtr++ = acos(*aPtr++);
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Arccosine of every float in aVector, written to bVector
 *        (AVX, aligned load/store).
 *
 * Full groups of eight floats go through the vector path; the remaining
 * num_points % 8 elements are computed with acosf().
 *
 * \param bVector    Output buffer, written with _mm256_store_ps
 *                   (presumably 32-byte aligned -- confirm with callers).
 * \param aVector    Input buffer, read with _mm256_load_ps
 *                   (presumably 32-byte aligned -- confirm with callers).
 * \param num_points Number of floats to process.
 *
 * NOTE(review): the listing this block was recovered from had dropped the
 * feature guard, the signature, the braces, `int i, j;`, the seed
 * assignments of d / z / x / y / arccosine, the ACOS_TERMS loop header, the
 * pointer increments and the closing #endif. The signature comes from the
 * listing's cross-reference; the `LV_HAVE_AVX` guard and the other statements
 * are inferred -- confirm against the upstream header.
 */
static inline void
volk_32f_acos_32f_a_avx(float* bVector,
                        const float* aVector,
                        unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* Keep the raw input: its sign selects the +pi correction at the end. */
        d = aVal;
        /* aVal <- sqrt((1 + a) * (1 - a)) / a */
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))),
                             aVal);
        z = aVal;
        /* Where z < 0, subtract 2*z, i.e. z <- -z. */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        x = z;
        /* Where z < 1, x <- 1/z; elsewhere x stays z. */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two rounds of x <- x + sqrt(1 + x*x), then invert. */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);

        /* Horner evaluation of sum_j (-1)^j / (2j + 1) * x^(2j). */
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        /* Where z > 1, y <- pi/2 - y (computed as y + (pi/2 - 2*y)). */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));

        arccosine = y;
        /* Negate where the quotient was negative ... */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(
            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        /* ... and add pi where the original input was negative. */
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_store_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    /* Scalar tail; acosf keeps the computation in single precision, matching
     * the SSE4.1 and generic kernels. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for aligned */
209 #ifdef LV_HAVE_SSE4_1 210 #include <smmintrin.h> 213 volk_32f_acos_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
215 float* bPtr = bVector;
216 const float* aPtr = aVector;
218 unsigned int number = 0;
219 unsigned int quarterPoints = num_points / 4;
222 __m128 aVal, d, pi, pio2, x, y, z, arccosine;
223 __m128 fzeroes, fones, ftwos, ffours, condition;
225 pi = _mm_set1_ps(3.14159265358979323846);
226 pio2 = _mm_set1_ps(3.14159265358979323846/2);
227 fzeroes = _mm_setzero_ps();
228 fones = _mm_set1_ps(1.0);
229 ftwos = _mm_set1_ps(2.0);
230 ffours = _mm_set1_ps(4.0);
232 for(;number < quarterPoints; number++){
233 aVal = _mm_load_ps(aPtr);
235 aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
237 condition = _mm_cmplt_ps(z, fzeroes);
238 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
240 condition = _mm_cmplt_ps(z, fones);
241 x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
243 for(i = 0; i < 2; i++)
244 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
245 x = _mm_div_ps(fones, x);
248 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
250 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
251 condition = _mm_cmpgt_ps(z, fones);
253 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
255 condition = _mm_cmplt_ps(aVal, fzeroes);
256 arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
257 condition = _mm_cmplt_ps(d, fzeroes);
258 arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
260 _mm_store_ps(bPtr, arccosine);
265 number = quarterPoints * 4;
266 for(;number < num_points; number++){
267 *bPtr++ = acosf(*aPtr++);
276 #ifndef INCLUDED_volk_32f_acos_32f_u_H 277 #define INCLUDED_volk_32f_acos_32f_u_H 279 #if LV_HAVE_AVX2 && LV_HAVE_FMA 280 #include <immintrin.h> 283 volk_32f_acos_32f_u_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
285 float* bPtr = bVector;
286 const float* aPtr = aVector;
288 unsigned int number = 0;
289 unsigned int eighthPoints = num_points / 8;
292 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
293 __m256 fzeroes, fones, ftwos, ffours, condition;
295 pi = _mm256_set1_ps(3.14159265358979323846);
296 pio2 = _mm256_set1_ps(3.14159265358979323846/2);
297 fzeroes = _mm256_setzero_ps();
298 fones = _mm256_set1_ps(1.0);
299 ftwos = _mm256_set1_ps(2.0);
300 ffours = _mm256_set1_ps(4.0);
302 for(;number < eighthPoints; number++){
303 aVal = _mm256_loadu_ps(aPtr);
305 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
307 condition = _mm256_cmp_ps(z, fzeroes,1);
308 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
310 condition = _mm256_cmp_ps(z, fones,1);
311 x = _mm256_add_ps(x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
313 for(i = 0; i < 2; i++)
314 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x,fones)));
315 x = _mm256_div_ps(fones, x);
318 y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
320 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
321 condition = _mm256_cmp_ps(z, fones,14);
323 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
325 condition = _mm256_cmp_ps(aVal, fzeroes,1);
326 arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
327 condition = _mm256_cmp_ps(d, fzeroes,1);
328 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
330 _mm256_storeu_ps(bPtr, arccosine);
335 number = eighthPoints * 8;
336 for(;number < num_points; number++){
337 *bPtr++ = acos(*aPtr++);
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Arccosine of every float in aVector, written to bVector
 *        (AVX, unaligned load/store).
 *
 * Full groups of eight floats go through the vector path; the remaining
 * num_points % 8 elements are computed with acosf().
 *
 * \param bVector    Output buffer, written with _mm256_storeu_ps.
 * \param aVector    Input buffer, read with _mm256_loadu_ps.
 * \param num_points Number of floats to process.
 *
 * NOTE(review): the listing this block was recovered from had dropped the
 * feature guard, the signature, the braces, `int i, j;`, the seed
 * assignments of d / z / x / y / arccosine, the ACOS_TERMS loop header, the
 * pointer increments and the closing #endif. The signature comes from the
 * listing's cross-reference; the `LV_HAVE_AVX` guard and the other statements
 * are inferred -- confirm against the upstream header.
 */
static inline void
volk_32f_acos_32f_u_avx(float* bVector,
                        const float* aVector,
                        unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* Keep the raw input: its sign selects the +pi correction at the end. */
        d = aVal;
        /* aVal <- sqrt((1 + a) * (1 - a)) / a */
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))),
                             aVal);
        z = aVal;
        /* Where z < 0, subtract 2*z, i.e. z <- -z. */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        x = z;
        /* Where z < 1, x <- 1/z; elsewhere x stays z. */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two rounds of x <- x + sqrt(1 + x*x), then invert. */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);

        /* Horner evaluation of sum_j (-1)^j / (2j + 1) * x^(2j). */
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        /* Where z > 1, y <- pi/2 - y (computed as y + (pi/2 - 2*y)). */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));

        arccosine = y;
        /* Negate where the quotient was negative ... */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(
            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        /* ... and add pi where the original input was negative. */
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_storeu_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    /* Scalar tail; acosf keeps the computation in single precision, matching
     * the SSE4.1 and generic kernels. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for unaligned */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*!
 * \brief Arccosine of every float in aVector, written to bVector
 *        (SSE4.1, unaligned load/store).
 *
 * Full groups of four floats go through the vector path; the remaining
 * num_points % 4 elements are computed with acosf().
 *
 * \param bVector    Output buffer, written with _mm_storeu_ps.
 * \param aVector    Input buffer, read with _mm_loadu_ps.
 * \param num_points Number of floats to process.
 *
 * NOTE(review): the listing this block was recovered from had dropped the
 * `static inline void` line, the braces, `int i, j;`, the seed assignments of
 * d / z / x / y / arccosine, the ACOS_TERMS loop header, the pointer
 * increments and the closing #endif. They are restored here from the data
 * flow of the surviving statements -- confirm against the upstream header.
 */
static inline void
volk_32f_acos_32f_u_sse4_1(float* bVector,
                           const float* aVector,
                           unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, d, pi, pio2, x, y, z, arccosine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm_set1_ps(3.14159265358979323846);
    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        /* Keep the raw input: its sign selects the +pi correction at the end. */
        d = aVal;
        /* aVal <- sqrt((1 + a) * (1 - a)) / a */
        aVal = _mm_div_ps(
            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
            aVal);
        z = aVal;
        /* Where z < 0, subtract 2*z, i.e. z <- -z. */
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        x = z;
        /* Where z < 1, x <- 1/z; elsewhere x stays z. */
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        /* Two rounds of x <- x + sqrt(1 + x*x), then invert. */
        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);

        /* Horner evaluation of sum_j (-1)^j / (2j + 1) * x^(2j). */
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        /* Where z > 1, y <- pi/2 - y (computed as y + (pi/2 - 2*y)). */
        condition = _mm_cmpgt_ps(z, fones);
        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));

        arccosine = y;
        /* Negate where the quotient was negative ... */
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arccosine = _mm_sub_ps(arccosine,
                               _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
        /* ... and add pi where the original input was negative. */
        condition = _mm_cmplt_ps(d, fzeroes);
        arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));

        _mm_storeu_ps(bPtr, arccosine);
        aPtr += 4;
        bPtr += 4;
    }

    /* Scalar tail for the last num_points % 4 elements. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */
473 #ifdef LV_HAVE_GENERIC 478 float* bPtr = bVector;
479 const float* aPtr = aVector;
480 unsigned int number = 0;
482 for(number = 0; number < num_points; number++){
483 *bPtr++ = acosf(*aPtr++);
static void volk_32f_acos_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:348
for i
Definition: volk_config_fixed.tmpl.h:25
static void volk_32f_acos_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:149
#define ACOS_TERMS
Definition: volk_32f_acos_32f.h:75
static void volk_32f_acos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:476