#ifndef INCLUDED_volk_32f_asin_32f_a_H
#define INCLUDED_volk_32f_asin_32f_a_H

/*
 * Aligned arcsine kernels: bVector[k] = asin(aVector[k]) for k < num_points.
 *
 * Every SIMD kernel below follows the same steps per vector of lanes:
 *   1. t = a / sqrt((1 + a) * (1 - a)), the tangent of the angle whose sine
 *      is a, so the remaining work is an arctangent of t.
 *   2. z = |t|; x = z where z >= 1, otherwise x = 1 / z, so x >= 1.
 *   3. Two steps x -> x + sqrt(1 + x * x) followed by x -> 1 / x. If
 *      x = cot(u) then x + sqrt(1 + x * x) = cot(u / 2), so after the
 *      inversion x = tan(u / 4).
 *   4. The alternating odd power series with ASIN_TERMS terms, evaluated with
 *      Horner's rule in x * x, then multiplied by x and by 4.
 *   5. Lanes with z > 1 are mapped to pi/2 - y; lanes with t < 0 are negated.
 * Points left over after the last full vector are handled with asinf().
 *
 * ASIN_TERMS is defined earlier in this header; pow() and asinf() come from
 * <math.h>, which is expected to be included above this point.
 *
 * NOTE(review): this section was rebuilt from a listing that had lost the
 * `static inline void` lines, the declaration of i and j, the seed
 * assignments (z = aVal, x = z, y = fzeroes, arcsine = y), the series loop
 * header, the pointer increments, all braces and all #endif lines. They are
 * restored here from the data flow and the visible signatures; confirm
 * against the upstream header.
 */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*
 * AVX2 + FMA kernel, eight floats per iteration.
 * Uses _mm256_load_ps / _mm256_store_ps, i.e. the aligned load and store
 * intrinsics, on aVector and bVector.
 */
static inline void
volk_32f_asin_32f_a_avx2_fma(float* bVector,
                             const float* aVector,
                             unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* t = a / sqrt((1 + a) * (1 - a)) */
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        /* z = |t|: negative lanes have twice their value subtracted */
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* x = z where z >= 1, otherwise z + (1/z - z) = 1/z */
        x = z;
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* two halving steps, then invert */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        /* Horner: y = sum over j of (-1)^j / (2j + 1) * (x * x)^j */
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* multiply by x (odd series) and by 4 (undo the two halvings) */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        /* lanes with z > 1: y + (pi/2 - 2y) = pi/2 - y */
        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        /* lanes with t < 0: y - 2y = -y */
        arcsine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_store_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    /* leftover points; asinf keeps the tail in single precision like the
     * SSE4.1 kernel does */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */


/* NOTE(review): the guard line was missing from the listing; LV_HAVE_AVX is
 * inferred from the kernel name -- confirm. */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * AVX kernel without FMA, eight floats per iteration.
 * Identical to the AVX2 + FMA kernel except that each fused multiply-add is
 * written as a separate multiply and add/subtract.
 */
static inline void
volk_32f_asin_32f_a_avx(float* bVector,
                        const float* aVector,
                        unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* t = a / sqrt((1 + a) * (1 - a)) */
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        /* z = |t| */
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* x = z where z >= 1, otherwise 1/z */
        x = z;
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* two halving steps, then invert */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        /* Horner: y = sum over j of (-1)^j / (2j + 1) * (x * x)^j */
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* multiply by x (odd series) and by 4 (undo the two halvings) */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        /* lanes with z > 1: y + (pi/2 - 2y) = pi/2 - y */
        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        /* lanes with t < 0: y - 2y = -y */
        arcsine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_store_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    /* leftover points, in single precision */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for aligned */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*
 * SSE4.1 kernel, four floats per iteration.
 * Uses _mm_load_ps / _mm_store_ps, i.e. the aligned load and store
 * intrinsics, on aVector and bVector.
 */
static inline void
volk_32f_asin_32f_a_sse4_1(float* bVector,
                           const float* aVector,
                           unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arcsine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        /* t = a / sqrt((1 + a) * (1 - a)) */
        aVal = _mm_div_ps(
            aVal,
            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
        /* z = |t| */
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        /* x = z where z >= 1, otherwise 1/z */
        x = z;
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        /* two halving steps, then invert */
        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);
        /* Horner: y = sum over j of (-1)^j / (2j + 1) * (x * x)^j */
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* multiply by x (odd series) and by 4 (undo the two halvings) */
        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        /* lanes with z > 1: y + (pi/2 - 2y) = pi/2 - y */
        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        /* lanes with t < 0: y - 2y = -y */
        arcsine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arcsine =
            _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));

        _mm_store_ps(bPtr, arcsine);
        aPtr += 4;
        bPtr += 4;
    }

    /* leftover points */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_asin_32f_a_H */
#ifndef INCLUDED_volk_32f_asin_32f_u_H
#define INCLUDED_volk_32f_asin_32f_u_H

/*
 * Unaligned arcsine kernels: bVector[k] = asin(aVector[k]) for k < num_points.
 *
 * Every SIMD kernel below follows the same steps per vector of lanes:
 *   1. t = a / sqrt((1 + a) * (1 - a)), the tangent of the angle whose sine
 *      is a, so the remaining work is an arctangent of t.
 *   2. z = |t|; x = z where z >= 1, otherwise x = 1 / z, so x >= 1.
 *   3. Two steps x -> x + sqrt(1 + x * x) followed by x -> 1 / x. If
 *      x = cot(u) then x + sqrt(1 + x * x) = cot(u / 2), so after the
 *      inversion x = tan(u / 4).
 *   4. The alternating odd power series with ASIN_TERMS terms, evaluated with
 *      Horner's rule in x * x, then multiplied by x and by 4.
 *   5. Lanes with z > 1 are mapped to pi/2 - y; lanes with t < 0 are negated.
 * Points left over after the last full vector are handled with asinf().
 *
 * ASIN_TERMS is defined earlier in this header; pow() and asinf() come from
 * <math.h>, which is expected to be included above this point.
 *
 * NOTE(review): this section was rebuilt from a listing that had lost the
 * `static inline void` lines, the declaration of i and j, the seed
 * assignments (z = aVal, x = z, y = fzeroes, arcsine = y), the series loop
 * header, the pointer increments, all braces and all #endif lines. They are
 * restored here from the data flow and the visible signatures; confirm
 * against the upstream header.
 */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*
 * AVX2 + FMA kernel, eight floats per iteration.
 * Uses _mm256_loadu_ps / _mm256_storeu_ps, the unaligned load and store
 * intrinsics.
 */
static inline void
volk_32f_asin_32f_u_avx2_fma(float* bVector,
                             const float* aVector,
                             unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* t = a / sqrt((1 + a) * (1 - a)) */
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        /* z = |t|: negative lanes have twice their value subtracted */
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* x = z where z >= 1, otherwise z + (1/z - z) = 1/z */
        x = z;
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* two halving steps, then invert */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        /* Horner: y = sum over j of (-1)^j / (2j + 1) * (x * x)^j */
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* multiply by x (odd series) and by 4 (undo the two halvings) */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        /* lanes with z > 1: y + (pi/2 - 2y) = pi/2 - y */
        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        /* lanes with t < 0: y - 2y = -y */
        arcsine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_storeu_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    /* leftover points; asinf keeps the tail in single precision like the
     * SSE4.1 and generic kernels do */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */


/* NOTE(review): the guard line was missing from the listing; LV_HAVE_AVX is
 * inferred from the kernel name -- confirm. */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * AVX kernel without FMA, eight floats per iteration.
 * Identical to the AVX2 + FMA kernel except that each fused multiply-add is
 * written as a separate multiply and add/subtract.
 */
static inline void
volk_32f_asin_32f_u_avx(float* bVector,
                        const float* aVector,
                        unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* t = a / sqrt((1 + a) * (1 - a)) */
        aVal = _mm256_div_ps(aVal,
                             _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))));
        /* z = |t| */
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        /* x = z where z >= 1, otherwise 1/z */
        x = z;
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* two halving steps, then invert */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        /* Horner: y = sum over j of (-1)^j / (2j + 1) * (x * x)^j */
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* multiply by x (odd series) and by 4 (undo the two halvings) */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        /* lanes with z > 1: y + (pi/2 - 2y) = pi/2 - y */
        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        /* lanes with t < 0: y - 2y = -y */
        arcsine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine,
                                _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_storeu_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    /* leftover points, in single precision */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for unaligned */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*
 * SSE4.1 kernel, four floats per iteration.
 * Uses _mm_loadu_ps / _mm_storeu_ps, the unaligned load and store intrinsics.
 */
static inline void
volk_32f_asin_32f_u_sse4_1(float* bVector,
                           const float* aVector,
                           unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arcsine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        /* t = a / sqrt((1 + a) * (1 - a)) */
        aVal = _mm_div_ps(
            aVal,
            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
        /* z = |t| */
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        /* x = z where z >= 1, otherwise 1/z */
        x = z;
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        /* two halving steps, then invert */
        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);
        /* Horner: y = sum over j of (-1)^j / (2j + 1) * (x * x)^j */
        y = fzeroes;
        for (j = ASIN_TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* multiply by x (odd series) and by 4 (undo the two halvings) */
        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        /* lanes with z > 1: y + (pi/2 - 2y) = pi/2 - y */
        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        /* lanes with t < 0: y - 2y = -y */
        arcsine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arcsine =
            _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));

        _mm_storeu_ps(bPtr, arcsine);
        aPtr += 4;
        bPtr += 4;
    }

    /* leftover points */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */


#ifdef LV_HAVE_GENERIC

/*
 * Portable kernel: calls asinf() on each of the num_points inputs in order
 * and writes the results to bVector. Does nothing when num_points is 0.
 */
static inline void
volk_32f_asin_32f_u_generic(float* bVector,
                            const float* aVector,
                            unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32f_asin_32f_u_H */
/*
 * Cross-reference index carried over from the generated documentation
 * (symbol, then where it is defined):
 *
 *   static void volk_32f_asin_32f_u_generic(float *bVector, const float *aVector, unsigned int num_points)
 *       Definition: volk_32f_asin_32f.h:463
 *   #define ASIN_TERMS
 *       Definition: volk_32f_asin_32f.h:75
 *   for i
 *       Definition: volk_config_fixed.tmpl.h:25
 *   static void volk_32f_asin_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
 *       Definition: volk_32f_asin_32f.h:339
 *   static void volk_32f_asin_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
 *       Definition: volk_32f_asin_32f.h:147
 */