/*
 * volk_32f_cos_32f_a_avx2_fma
 *
 * Writes an approximation of cos(aVector[k]) into bVector[k] for every
 * k < num_points. Eight floats are processed per iteration with AVX2 + FMA
 * intrinsics; the remaining (num_points % 8) elements go through cosf().
 *
 *   bVector    - output array
 *   aVector    - input array (radians, as passed to cosf in the tail loop)
 *   num_points - number of elements to process
 *
 * The vector loop uses the aligned _mm256_load_ps / _mm256_store_ps, so both
 * pointers must be 32-byte aligned.
 *
 * NOTE(review): the leading integers on each line look like line numbers
 * carried over from a source listing, and gaps in that numbering suggest lines
 * were dropped (return type, braces, the declarations of i, sine, cosine,
 * condition1 and condition3, and the closing #endif) -- confirm against the
 * original header before compiling.
 */
76 #ifndef INCLUDED_volk_32f_cos_32f_a_H 77 #define INCLUDED_volk_32f_cos_32f_a_H 79 #if LV_HAVE_AVX2 && LV_HAVE_FMA 80 #include <immintrin.h> 83 volk_32f_cos_32f_a_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
85 float* bPtr = bVector;
86 const float* aPtr = aVector;
88 unsigned int number = 0;
89 unsigned int eighthPoints = num_points / 8;
92 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
94 __m256i q, ones, twos, fours;
/* m4pi is numerically 4/pi; pio4A + pio4B + pio4C together approximate pi/4,
 * split into three parts of decreasing magnitude. */
96 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
97 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
98 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
99 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
100 ffours = _mm256_set1_ps(4.0);
101 ftwos = _mm256_set1_ps(2.0);
102 fones = _mm256_set1_ps(1.0);
103 fzeroes = _mm256_setzero_ps();
104 __m256i zeroes = _mm256_set1_epi32(0);
105 ones = _mm256_set1_epi32(1);
106 __m256i allones = _mm256_set1_epi32(0xffffffff);
107 twos = _mm256_set1_epi32(2);
108 fours = _mm256_set1_epi32(4);
/* Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400. */
110 cp1 = _mm256_set1_ps(1.0);
111 cp2 = _mm256_set1_ps(0.08333333333333333);
112 cp3 = _mm256_set1_ps(0.002777777777777778);
113 cp4 = _mm256_set1_ps(4.96031746031746e-05);
114 cp5 = _mm256_set1_ps(5.511463844797178e-07);
118 for(;number < eighthPoints; number++){
120 aVal = _mm256_load_ps(aPtr);
/* s = |aVal|: subtract 2*aVal only in lanes where aVal < 0
 * (comparison predicate 1 is less-than, _CMP_LT_OS). */
122 s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
/* q = floor(s * 4/pi): index of the pi/4-wide interval containing s. */
124 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
/* r = q rounded up to the next even integer, as float. */
126 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
/* Range reduction: s -= r * pi/4, one part of the split constant at a time. */
128 s = _mm256_fnmadd_ps(r,pio4A,s);
129 s = _mm256_fnmadd_ps(r,pio4B,s);
130 s = _mm256_fnmadd_ps(r,pio4C,s);
/* Shrink the argument by 8 and square it; the three doubling steps below undo
 * the division. */
132 s = _mm256_div_ps(s, _mm256_set1_ps(8.0));
133 s = _mm256_mul_ps(s, s);
/* Horner form of ((((s*cp5 - cp4)*s + cp3)*s - cp2)*s + cp1)*s, which matches
 * the series of 2*(1 - cos(t)) written in terms of s = t*t. */
135 s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
/* Each s = s*(4 - s) step doubles the angle of 2*(1 - cos); after three steps
 * and the halving, s = 1 - cos(reduced argument). */
137 for(i = 0; i < 3; i++)
138 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
139 s = _mm256_div_ps(s, ftwos);
/* sine = sqrt((2 - s)*s) is non-negative; cosine = 1 - s. */
141 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
142 cosine = _mm256_sub_ps(fones, s);
/* condition1: all-ones mask in lanes where ((q + 1) & 2) != 0. */
145 condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
146 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
/* condition3: all-ones mask in lanes where ((q + 2) & 4) != 0. */
149 condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
150 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
/* Where condition1 holds, replace cosine by sine; where condition3 holds,
 * negate the result (cosine - 2*cosine). */
152 cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
153 cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
154 _mm256_store_ps(bPtr, cosine);
/* NOTE(review): no advance of aPtr/bPtr is visible inside the vector loop in
 * this listing (numbers 155-158 are absent); the tail loop below only makes
 * sense if both were advanced by 8 per iteration -- confirm. */
159 number = eighthPoints * 8;
160 for(;number < num_points; number++){
/* cosf keeps the tail in single precision, consistent with the SSE4.1 and
 * generic kernels in this file. */
161 *bPtr++ = cosf(*aPtr++);
/*
 * volk_32f_cos_32f_a_avx2
 *
 * Writes an approximation of cos(aVector[k]) into bVector[k] for every
 * k < num_points. Eight floats are processed per iteration with AVX2
 * intrinsics (separate multiply and subtract/add, no FMA); the remaining
 * (num_points % 8) elements go through cosf().
 *
 *   bVector    - output array
 *   aVector    - input array (radians, as passed to cosf in the tail loop)
 *   num_points - number of elements to process
 *
 * The vector loop uses the aligned _mm256_load_ps / _mm256_store_ps, so both
 * pointers must be 32-byte aligned.
 *
 * NOTE(review): the leading integers on each line look like line numbers
 * carried over from a source listing, and gaps in that numbering suggest lines
 * were dropped (the feature guard, return type, braces, the declarations of
 * i, sine, cosine, condition1 and condition3, and the closing #endif) --
 * confirm against the original header before compiling.
 */
168 #include <immintrin.h> 171 volk_32f_cos_32f_a_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
173 float* bPtr = bVector;
174 const float* aPtr = aVector;
176 unsigned int number = 0;
177 unsigned int eighthPoints = num_points / 8;
180 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
182 __m256i q, ones, twos, fours;
/* m4pi is numerically 4/pi; pio4A + pio4B + pio4C together approximate pi/4,
 * split into three parts of decreasing magnitude. */
184 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
185 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
186 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
187 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
188 ffours = _mm256_set1_ps(4.0);
189 ftwos = _mm256_set1_ps(2.0);
190 fones = _mm256_set1_ps(1.0);
191 fzeroes = _mm256_setzero_ps();
192 __m256i zeroes = _mm256_set1_epi32(0);
193 ones = _mm256_set1_epi32(1);
194 __m256i allones = _mm256_set1_epi32(0xffffffff);
195 twos = _mm256_set1_epi32(2);
196 fours = _mm256_set1_epi32(4);
/* Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400. */
198 cp1 = _mm256_set1_ps(1.0);
199 cp2 = _mm256_set1_ps(0.08333333333333333);
200 cp3 = _mm256_set1_ps(0.002777777777777778);
201 cp4 = _mm256_set1_ps(4.96031746031746e-05);
202 cp5 = _mm256_set1_ps(5.511463844797178e-07);
206 for(;number < eighthPoints; number++){
208 aVal = _mm256_load_ps(aPtr);
/* s = |aVal|: subtract 2*aVal only in lanes where aVal < 0
 * (comparison predicate 1 is less-than, _CMP_LT_OS). */
210 s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
/* q = floor(s * 4/pi): index of the pi/4-wide interval containing s. */
212 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
/* r = q rounded up to the next even integer, as float. */
214 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
/* Range reduction: s -= r * pi/4, one part of the split constant at a time. */
216 s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4A));
217 s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4B));
218 s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4C));
/* Shrink the argument by 8 and square it; the three doubling steps below undo
 * the division. */
220 s = _mm256_div_ps(s, _mm256_set1_ps(8.0));
221 s = _mm256_mul_ps(s, s);
/* Horner form of ((((s*cp5 - cp4)*s + cp3)*s - cp2)*s + cp1)*s, which matches
 * the series of 2*(1 - cos(t)) written in terms of s = t*t. */
223 s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
/* Each s = s*(4 - s) step doubles the angle of 2*(1 - cos); after three steps
 * and the halving, s = 1 - cos(reduced argument). */
225 for(i = 0; i < 3; i++)
226 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
227 s = _mm256_div_ps(s, ftwos);
/* sine = sqrt((2 - s)*s) is non-negative; cosine = 1 - s. */
229 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
230 cosine = _mm256_sub_ps(fones, s);
/* condition1: all-ones mask in lanes where ((q + 1) & 2) != 0. */
233 condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
234 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
/* condition3: all-ones mask in lanes where ((q + 2) & 4) != 0. */
237 condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
238 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
/* Where condition1 holds, replace cosine by sine; where condition3 holds,
 * negate the result (cosine - 2*cosine). */
240 cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
241 cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
242 _mm256_store_ps(bPtr, cosine);
/* NOTE(review): no advance of aPtr/bPtr is visible inside the vector loop in
 * this listing (numbers 243-246 are absent); the tail loop below only makes
 * sense if both were advanced by 8 per iteration -- confirm. */
247 number = eighthPoints * 8;
248 for(;number < num_points; number++){
/* cosf keeps the tail in single precision, consistent with the SSE4.1 and
 * generic kernels in this file. */
249 *bPtr++ = cosf(*aPtr++);
/*
 * volk_32f_cos_32f_a_sse4_1
 *
 * Writes an approximation of cos(aVector[k]) into bVector[k] for every
 * k < num_points. Four floats are processed per iteration with SSE intrinsics
 * (_mm_floor_ps comes from <smmintrin.h>); the remaining (num_points % 4)
 * elements go through cosf().
 *
 *   bVector    - output array
 *   aVector    - input array (radians, as passed to cosf in the tail loop)
 *   num_points - number of elements to process
 *
 * The vector loop uses the aligned _mm_load_ps / _mm_store_ps, so both
 * pointers must be 16-byte aligned.
 *
 * NOTE(review): the leading integers on each line look like line numbers
 * carried over from a source listing, and gaps in that numbering suggest lines
 * were dropped (return type, braces, the declarations of i, sine, cosine,
 * condition1 and condition3, and the closing #endif) -- confirm against the
 * original header before compiling.
 */
255 #ifdef LV_HAVE_SSE4_1 256 #include <smmintrin.h> 259 volk_32f_cos_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
261 float* bPtr = bVector;
262 const float* aPtr = aVector;
264 unsigned int number = 0;
265 unsigned int quarterPoints = num_points / 4;
268 __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
270 __m128i q, ones, twos, fours;
/* m4pi is numerically 4/pi; pio4A + pio4B + pio4C together approximate pi/4,
 * split into three parts of decreasing magnitude. */
272 m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
273 pio4A = _mm_set1_ps(0.7853981554508209228515625);
274 pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
275 pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
276 ffours = _mm_set1_ps(4.0);
277 ftwos = _mm_set1_ps(2.0);
278 fones = _mm_set1_ps(1.0);
279 fzeroes = _mm_setzero_ps();
280 __m128i zeroes = _mm_set1_epi32(0);
281 ones = _mm_set1_epi32(1);
282 __m128i allones = _mm_set1_epi32(0xffffffff);
283 twos = _mm_set1_epi32(2);
284 fours = _mm_set1_epi32(4);
/* Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400. */
286 cp1 = _mm_set1_ps(1.0);
287 cp2 = _mm_set1_ps(0.08333333333333333);
288 cp3 = _mm_set1_ps(0.002777777777777778);
289 cp4 = _mm_set1_ps(4.96031746031746e-05);
290 cp5 = _mm_set1_ps(5.511463844797178e-07);
294 for(;number < quarterPoints; number++){
296 aVal = _mm_load_ps(aPtr);
/* s = |aVal|: subtract 2*aVal only in lanes where aVal < 0. */
298 s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
/* q = floor(s * 4/pi): index of the pi/4-wide interval containing s. */
300 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
/* r = q rounded up to the next even integer, as float. */
302 r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
/* Range reduction: s -= r * pi/4, one part of the split constant at a time. */
304 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
305 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
306 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
/* Shrink the argument by 8 and square it; the three doubling steps below undo
 * the division. */
308 s = _mm_div_ps(s, _mm_set1_ps(8.0));
309 s = _mm_mul_ps(s, s);
/* Horner form of ((((s*cp5 - cp4)*s + cp3)*s - cp2)*s + cp1)*s, which matches
 * the series of 2*(1 - cos(t)) written in terms of s = t*t. */
311 s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
/* Each s = s*(4 - s) step doubles the angle of 2*(1 - cos); after three steps
 * and the halving, s = 1 - cos(reduced argument). */
313 for(i = 0; i < 3; i++)
314 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
315 s = _mm_div_ps(s, ftwos);
/* sine = sqrt((2 - s)*s) is non-negative; cosine = 1 - s. */
317 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
318 cosine = _mm_sub_ps(fones, s);
/* condition1: all-ones mask in lanes where ((q + 1) & 2) != 0. */
321 condition1.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
322 condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
/* condition3: all-ones mask in lanes where ((q + 2) & 4) != 0. */
325 condition3.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
326 condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
/* Where condition1 holds, replace cosine by sine; where condition3 holds,
 * negate the result (cosine - 2*cosine). */
328 cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
329 cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
330 _mm_store_ps(bPtr, cosine);
/* NOTE(review): no advance of aPtr/bPtr is visible inside the vector loop in
 * this listing (numbers 331-334 are absent); the tail loop below only makes
 * sense if both were advanced by 4 per iteration -- confirm. */
335 number = quarterPoints * 4;
336 for(;number < num_points; number++){
337 *bPtr++ = cosf(*aPtr++);
/*
 * volk_32f_cos_32f_u_avx2_fma
 *
 * Writes an approximation of cos(aVector[k]) into bVector[k] for every
 * k < num_points. Eight floats are processed per iteration with AVX2 + FMA
 * intrinsics; the remaining (num_points % 8) elements go through cosf().
 *
 *   bVector    - output array
 *   aVector    - input array (radians, as passed to cosf in the tail loop)
 *   num_points - number of elements to process
 *
 * The vector loop uses the unaligned _mm256_loadu_ps / _mm256_storeu_ps, so
 * the pointers need no particular alignment.
 *
 * NOTE(review): the leading integers on each line look like line numbers
 * carried over from a source listing, and gaps in that numbering suggest lines
 * were dropped (return type, braces, the declarations of i, sine, cosine,
 * condition1 and condition3, and the closing #endif) -- confirm against the
 * original header before compiling.
 */
347 #ifndef INCLUDED_volk_32f_cos_32f_u_H 348 #define INCLUDED_volk_32f_cos_32f_u_H 350 #if LV_HAVE_AVX2 && LV_HAVE_FMA 351 #include <immintrin.h> 354 volk_32f_cos_32f_u_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
356 float* bPtr = bVector;
357 const float* aPtr = aVector;
359 unsigned int number = 0;
360 unsigned int eighthPoints = num_points / 8;
363 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
365 __m256i q, ones, twos, fours;
/* m4pi is numerically 4/pi; pio4A + pio4B + pio4C together approximate pi/4,
 * split into three parts of decreasing magnitude. */
367 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
368 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
369 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
370 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
371 ffours = _mm256_set1_ps(4.0);
372 ftwos = _mm256_set1_ps(2.0);
373 fones = _mm256_set1_ps(1.0);
374 fzeroes = _mm256_setzero_ps();
375 __m256i zeroes = _mm256_set1_epi32(0);
376 ones = _mm256_set1_epi32(1);
377 __m256i allones = _mm256_set1_epi32(0xffffffff);
378 twos = _mm256_set1_epi32(2);
379 fours = _mm256_set1_epi32(4);
/* Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400. */
381 cp1 = _mm256_set1_ps(1.0);
382 cp2 = _mm256_set1_ps(0.08333333333333333);
383 cp3 = _mm256_set1_ps(0.002777777777777778);
384 cp4 = _mm256_set1_ps(4.96031746031746e-05);
385 cp5 = _mm256_set1_ps(5.511463844797178e-07);
389 for(;number < eighthPoints; number++){
391 aVal = _mm256_loadu_ps(aPtr);
/* s = |aVal|: subtract 2*aVal only in lanes where aVal < 0
 * (comparison predicate 1 is less-than, _CMP_LT_OS). */
393 s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
/* q = floor(s * 4/pi): index of the pi/4-wide interval containing s. */
395 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
/* r = q rounded up to the next even integer, as float. */
397 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
/* Range reduction: s -= r * pi/4, one part of the split constant at a time. */
399 s = _mm256_fnmadd_ps(r,pio4A,s);
400 s = _mm256_fnmadd_ps(r,pio4B,s);
401 s = _mm256_fnmadd_ps(r,pio4C,s);
/* Shrink the argument by 8 and square it; the three doubling steps below undo
 * the division. */
403 s = _mm256_div_ps(s, _mm256_set1_ps(8.0));
404 s = _mm256_mul_ps(s, s);
/* Horner form of ((((s*cp5 - cp4)*s + cp3)*s - cp2)*s + cp1)*s, which matches
 * the series of 2*(1 - cos(t)) written in terms of s = t*t. */
406 s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
/* Each s = s*(4 - s) step doubles the angle of 2*(1 - cos); after three steps
 * and the halving, s = 1 - cos(reduced argument). */
408 for(i = 0; i < 3; i++)
409 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
410 s = _mm256_div_ps(s, ftwos);
/* sine = sqrt((2 - s)*s) is non-negative; cosine = 1 - s. */
412 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
413 cosine = _mm256_sub_ps(fones, s);
/* condition1: all-ones mask in lanes where ((q + 1) & 2) != 0. */
416 condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
417 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
/* condition3: all-ones mask in lanes where ((q + 2) & 4) != 0. */
420 condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
421 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
/* Where condition1 holds, replace cosine by sine; where condition3 holds,
 * negate the result (cosine - 2*cosine). */
423 cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
424 cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
425 _mm256_storeu_ps(bPtr, cosine);
/* NOTE(review): no advance of aPtr/bPtr is visible inside the vector loop in
 * this listing (numbers 426-429 are absent); the tail loop below only makes
 * sense if both were advanced by 8 per iteration -- confirm. */
430 number = eighthPoints * 8;
431 for(;number < num_points; number++){
/* cosf keeps the tail in single precision, consistent with the SSE4.1 and
 * generic kernels in this file. */
432 *bPtr++ = cosf(*aPtr++);
/*
 * volk_32f_cos_32f_u_avx2
 *
 * Writes an approximation of cos(aVector[k]) into bVector[k] for every
 * k < num_points. Eight floats are processed per iteration with AVX2
 * intrinsics (separate multiply and subtract/add, no FMA); the remaining
 * (num_points % 8) elements go through cosf().
 *
 *   bVector    - output array
 *   aVector    - input array (radians, as passed to cosf in the tail loop)
 *   num_points - number of elements to process
 *
 * The vector loop uses the unaligned _mm256_loadu_ps / _mm256_storeu_ps, so
 * the pointers need no particular alignment.
 *
 * NOTE(review): the leading integers on each line look like line numbers
 * carried over from a source listing, and gaps in that numbering suggest lines
 * were dropped (the feature guard, return type, braces, the declarations of
 * i, sine, cosine, condition1 and condition3, and the closing #endif) --
 * confirm against the original header before compiling.
 */
439 #include <immintrin.h> 442 volk_32f_cos_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
444 float* bPtr = bVector;
445 const float* aPtr = aVector;
447 unsigned int number = 0;
448 unsigned int eighthPoints = num_points / 8;
451 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
453 __m256i q, ones, twos, fours;
/* m4pi is numerically 4/pi; pio4A + pio4B + pio4C together approximate pi/4,
 * split into three parts of decreasing magnitude. */
455 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
456 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
457 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
458 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
459 ffours = _mm256_set1_ps(4.0);
460 ftwos = _mm256_set1_ps(2.0);
461 fones = _mm256_set1_ps(1.0);
462 fzeroes = _mm256_setzero_ps();
463 __m256i zeroes = _mm256_set1_epi32(0);
464 ones = _mm256_set1_epi32(1);
465 __m256i allones = _mm256_set1_epi32(0xffffffff);
466 twos = _mm256_set1_epi32(2);
467 fours = _mm256_set1_epi32(4);
/* Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400. */
469 cp1 = _mm256_set1_ps(1.0);
470 cp2 = _mm256_set1_ps(0.08333333333333333);
471 cp3 = _mm256_set1_ps(0.002777777777777778);
472 cp4 = _mm256_set1_ps(4.96031746031746e-05);
473 cp5 = _mm256_set1_ps(5.511463844797178e-07);
477 for(;number < eighthPoints; number++){
479 aVal = _mm256_loadu_ps(aPtr);
/* s = |aVal|: subtract 2*aVal only in lanes where aVal < 0
 * (comparison predicate 1 is less-than, _CMP_LT_OS). */
481 s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
/* q = floor(s * 4/pi): index of the pi/4-wide interval containing s. */
483 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
/* r = q rounded up to the next even integer, as float. */
485 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
/* Range reduction: s -= r * pi/4, one part of the split constant at a time. */
487 s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4A));
488 s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4B));
489 s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4C));
/* Shrink the argument by 8 and square it; the three doubling steps below undo
 * the division. */
491 s = _mm256_div_ps(s, _mm256_set1_ps(8.0));
492 s = _mm256_mul_ps(s, s);
/* Horner form of ((((s*cp5 - cp4)*s + cp3)*s - cp2)*s + cp1)*s, which matches
 * the series of 2*(1 - cos(t)) written in terms of s = t*t. */
494 s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
/* Each s = s*(4 - s) step doubles the angle of 2*(1 - cos); after three steps
 * and the halving, s = 1 - cos(reduced argument). */
496 for(i = 0; i < 3; i++)
497 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
498 s = _mm256_div_ps(s, ftwos);
/* sine = sqrt((2 - s)*s) is non-negative; cosine = 1 - s. */
500 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
501 cosine = _mm256_sub_ps(fones, s);
/* condition1: all-ones mask in lanes where ((q + 1) & 2) != 0. */
504 condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
505 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
/* condition3: all-ones mask in lanes where ((q + 2) & 4) != 0. */
508 condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
509 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
/* Where condition1 holds, replace cosine by sine; where condition3 holds,
 * negate the result (cosine - 2*cosine). */
511 cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
512 cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
513 _mm256_storeu_ps(bPtr, cosine);
/* NOTE(review): no advance of aPtr/bPtr is visible inside the vector loop in
 * this listing (numbers 514-517 are absent); the tail loop below only makes
 * sense if both were advanced by 8 per iteration -- confirm. */
518 number = eighthPoints * 8;
519 for(;number < num_points; number++){
/* cosf keeps the tail in single precision, consistent with the SSE4.1 and
 * generic kernels in this file. */
520 *bPtr++ = cosf(*aPtr++);
/*
 * volk_32f_cos_32f_u_sse4_1
 *
 * Writes an approximation of cos(aVector[k]) into bVector[k] for every
 * k < num_points. Four floats are processed per iteration with SSE intrinsics
 * (_mm_floor_ps comes from <smmintrin.h>); the remaining (num_points % 4)
 * elements go through cosf().
 *
 *   bVector    - output array
 *   aVector    - input array (radians, as passed to cosf in the tail loop)
 *   num_points - number of elements to process
 *
 * The vector loop uses the unaligned _mm_loadu_ps / _mm_storeu_ps, so the
 * pointers need no particular alignment.
 *
 * NOTE(review): this variant splits pi/4 into two parts only (pio4A, pio4B)
 * and uses shorter m4pi/cp2..cp5 literals than volk_32f_cos_32f_a_sse4_1, so
 * its results may differ slightly from the aligned kernel -- confirm whether
 * that is intended.
 *
 * NOTE(review): the leading integers on each line look like line numbers
 * carried over from a source listing, and gaps in that numbering suggest lines
 * were dropped (return type, braces, the declaration of i, and the closing
 * #endif) -- confirm against the original header before compiling.
 */
526 #ifdef LV_HAVE_SSE4_1 527 #include <smmintrin.h> 530 volk_32f_cos_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
532 float* bPtr = bVector;
533 const float* aPtr = aVector;
535 unsigned int number = 0;
536 unsigned int quarterPoints = num_points / 4;
539 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
540 __m128 sine, cosine, condition1, condition3;
541 __m128i q, r, ones, twos, fours;
/* m4pi is approximately 4/pi; pio4A + pio4B together approximate pi/4. */
543 m4pi = _mm_set1_ps(1.273239545);
544 pio4A = _mm_set1_ps(0.78515625);
545 pio4B = _mm_set1_ps(0.241876e-3);
546 ffours = _mm_set1_ps(4.0);
547 ftwos = _mm_set1_ps(2.0);
548 fones = _mm_set1_ps(1.0);
549 fzeroes = _mm_setzero_ps();
550 ones = _mm_set1_epi32(1);
551 twos = _mm_set1_epi32(2);
552 fours = _mm_set1_epi32(4);
/* Polynomial coefficients, approximately 1, 1/12, 1/360, 1/20160, 1/1814400. */
554 cp1 = _mm_set1_ps(1.0);
555 cp2 = _mm_set1_ps(0.83333333e-1);
556 cp3 = _mm_set1_ps(0.2777778e-2);
557 cp4 = _mm_set1_ps(0.49603e-4);
558 cp5 = _mm_set1_ps(0.551e-6);
560 for(;number < quarterPoints; number++){
561 aVal = _mm_loadu_ps(aPtr);
/* s = |aVal|: subtract 2*aVal only in lanes where aVal < 0. */
562 s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
/* q = floor(s * 4/pi); r = q rounded up to the next even integer (kept as
 * integers here and converted to float at each use). */
563 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
564 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
/* Range reduction: s -= r * pi/4, one part of the split constant at a time. */
566 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
567 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
/* Shrink the argument by 8 and square it; the three doubling steps below undo
 * the division. */
569 s = _mm_div_ps(s, _mm_set1_ps(8.0));
570 s = _mm_mul_ps(s, s);
/* Horner form of ((((s*cp5 - cp4)*s + cp3)*s - cp2)*s + cp1)*s, which matches
 * the series of 2*(1 - cos(t)) written in terms of s = t*t. */
572 s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
/* Each s = s*(4 - s) step doubles the angle of 2*(1 - cos); after three steps
 * and the halving, s = 1 - cos(reduced argument). */
574 for(i = 0; i < 3; i++){
575 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
577 s = _mm_div_ps(s, ftwos);
/* sine = sqrt((2 - s)*s) is non-negative; cosine = 1 - s. */
579 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
580 cosine = _mm_sub_ps(fones, s);
/* condition1 / condition3: all-ones float masks in lanes where
 * ((q + 1) & 2) != 0 and ((q + 2) & 4) != 0 respectively. */
582 condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
584 condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
/* Where condition1 holds, replace cosine by sine; where condition3 holds,
 * negate the result (cosine - 2*cosine). */
586 cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
587 cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
588 _mm_storeu_ps(bPtr, cosine);
/* NOTE(review): no advance of aPtr/bPtr is visible inside the vector loop in
 * this listing (numbers 589-592 are absent); the tail loop below only makes
 * sense if both were advanced by 4 per iteration -- confirm. */
593 number = quarterPoints * 4;
594 for(;number < num_points; number++){
595 *bPtr++ = cosf(*aPtr++);
/*
 * Scalar cosine approximation, one element per loop iteration.
 *
 * NOTE(review): the function signature is missing from this listing. The
 * cross-reference text at the end of the file places
 * volk_32f_cos_32f_generic_fast(float *bVector, const float *aVector,
 * unsigned int num_points) at line 610, which is presumably this body --
 * confirm. The leading integers look like listing line numbers; the gaps show
 * that the range reduction, the loop body, both branch bodies, the store and
 * the closing braces are not visible here, and N is not defined in the
 * visible text.
 */
602 #ifdef LV_HAVE_GENERIC 612 float* bPtr = bVector;
613 const float* aPtr = aVector;
/* m4pi is numerically 4/pi; pio4A + pio4B + pio4C together approximate pi/4,
 * split into three parts of decreasing magnitude. */
615 float m4pi = 1.273239544735162542821171882678754627704620361328125;
616 float pio4A = 0.7853981554508209228515625;
617 float pio4B = 0.794662735614792836713604629039764404296875e-8;
618 float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
622 for(number = 0; number < num_points; number++){
/* Work on |x|; q is s * 4/pi truncated to an integer. */
623 float s = fabs(*aPtr);
624 int q = (int)(s * m4pi);
/* Horner-form polynomial in s with coefficients 1, 1/12, 1/360, 1/20160,
 * 1/1814400 and alternating signs. */
632 s = ((((s/1814400. - 1.0/20160.0)*s + 1.0/360.0)*s - 1.0/12.0)*s + 1.0)*s;
/* Loop of N iterations; its body is not visible in this listing. */
635 for(i=0; i < N; ++
i) {
/* sqrt((2 - s)*s) is non-negative. */
640 float sine = sqrt((2.0-s)*s);
/* The two tests below depend on bit 1 of (q + 1) and bit 2 of (q + 2); what
 * each branch does is not visible in this listing. */
643 if (((q+1) & 2) != 0) {
648 if (((q+2) & 4) != 0) {
/*
 * Reference scalar kernel: writes cosf(aVector[k]) to bVector[k] for every
 * k < num_points, walking both arrays with post-incremented pointers.
 *
 * NOTE(review): the function signature is missing from this listing. The
 * cross-reference text at the end of the file places
 * volk_32f_cos_32f_generic(float *bVector, const float *aVector,
 * unsigned int num_points) at line 663, which is presumably this body --
 * confirm. The leading integers look like listing line numbers; the closing
 * braces and #endif are not visible here.
 */
660 #ifdef LV_HAVE_GENERIC 665 float* bPtr = bVector;
666 const float* aPtr = aVector;
667 unsigned int number = 0;
669 for(; number < num_points; number++){
670 *bPtr++ = cosf(*aPtr++);
static void volk_32f_cos_32f_generic_fast(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:610
float f[8]
Definition: volk_common.h:108
static void volk_32f_cos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:663
__m256i int_vec
Definition: volk_common.h:113
for i
Definition: volk_config_fixed.tmpl.h:25
Definition: volk_common.h:104
__m256 float_vec
Definition: volk_common.h:112
float f[4]
Definition: volk_common.h:91
Definition: volk_common.h:87