76 #ifndef INCLUDED_volk_32f_cos_32f_a_H
77 #define INCLUDED_volk_32f_cos_32f_a_H
79 #if LV_HAVE_AVX2 && LV_HAVE_FMA
80 #include <immintrin.h>
/*
 * AVX2 + FMA cosine kernel for aligned buffers (uses _mm256_load_ps / _mm256_store_ps).
 * Reads num_points floats through aVector and writes the results through bVector:
 * full groups of eight go through the vector loop, the remainder through scalar cos().
 * NOTE(review): this excerpt is missing lines of the original definition (return type,
 * braces, some declarations and statement heads); comments cover visible lines only.
 */
83 volk_32f_cos_32f_a_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
85 float* bPtr = bVector;
86 const float* aPtr = aVector;
88 unsigned int number = 0;
/* Number of complete 8-float groups handled by the vector loop. */
89 unsigned int eighthPoints = num_points / 8;
/* NOTE(review): the declaration list below continues on a line not visible here. */
92 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
95 __m256i q, ones, twos, fours;
/* m4pi is approximately 4/pi; pio4A + pio4B + pio4C add up to approximately pi/4. */
97 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
98 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
99 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
100 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
101 ffours = _mm256_set1_ps(4.0);
102 ftwos = _mm256_set1_ps(2.0);
103 fones = _mm256_set1_ps(1.0);
104 fzeroes = _mm256_setzero_ps();
105 __m256i zeroes = _mm256_set1_epi32(0);
106 ones = _mm256_set1_epi32(1);
/* All bits set; XOR with this inverts a comparison mask. */
107 __m256i allones = _mm256_set1_epi32(0xffffffff);
108 twos = _mm256_set1_epi32(2);
109 fours = _mm256_set1_epi32(4);
/* Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400. */
111 cp1 = _mm256_set1_ps(1.0);
112 cp2 = _mm256_set1_ps(0.08333333333333333);
113 cp3 = _mm256_set1_ps(0.002777777777777778);
114 cp4 = _mm256_set1_ps(4.96031746031746e-05);
115 cp5 = _mm256_set1_ps(5.511463844797178e-07);
119 for (; number < eighthPoints; number++) {
121 aVal = _mm256_load_ps(aPtr);
/* s = |aVal|: subtract 2 * aVal in the lanes where aVal < 0. */
123 s = _mm256_sub_ps(aVal,
124 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
125 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
/* q = floor(s * m4pi) as a 32-bit integer. */
127 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
/* r = q + (q & 1) converted to float, i.e. q rounded up to an even value. */
129 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
/* s -= r * pio4A, then pio4B, then pio4C (fused negative multiply-add). */
131 s = _mm256_fnmadd_ps(r, pio4A, s);
132 s = _mm256_fnmadd_ps(r, pio4B, s);
133 s = _mm256_fnmadd_ps(r, pio4C, s);
/* NOTE(review): tail of a statement whose head is not visible; it involves the constant 8.0. */
137 _mm256_set1_ps(8.0));
138 s = _mm256_mul_ps(s, s);
/* NOTE(review): fragment of a nested FMA polynomial in s; visible part is ((s*cp5 - cp4)*s + cp3), then s and cp2 feed an outer call not visible here. */
143 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
/* Three rounds of s = s * (4 - s). NOTE(review): the declaration of i is not visible here. */
148 for (
i = 0;
i < 3;
i++)
149 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
150 s = _mm256_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s); cosine = 1 - s. */
152 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
153 cosine = _mm256_sub_ps(fones, s);
/* Mask of lanes where ((q + 1) & 2) == 0. NOTE(review): the assignment target is on a line not visible here (presumably condition1.int_vec). */
157 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
/* Invert the mask held in condition1. */
158 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
/* condition3: lanes where ((q + 2) & 4) != 0 (compare equal to zero, then invert). */
161 condition3.int_vec = _mm256_cmpeq_epi32(
162 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
163 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
/* Where condition1 is set, add (sine - cosine) so those lanes take the sine value. */
165 cosine = _mm256_add_ps(
166 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
/* Where condition3 is set, subtract 2 * cosine, which flips the sign of those lanes. */
167 cosine = _mm256_sub_ps(cosine,
168 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0
f)),
169 condition3.float_vec));
170 _mm256_store_ps(bPtr, cosine);
/* Scalar tail for elements past the last full group of eight. */
/* NOTE(review): calls double-precision cos(); the SSE4.1 variants in this file use cosf(). */
175 number = eighthPoints * 8;
176 for (; number < num_points; number++) {
177 *bPtr++ = cos(*aPtr++);
184 #include <immintrin.h>
/*
 * AVX2 cosine kernel (no FMA) for aligned buffers (uses _mm256_load_ps / _mm256_store_ps).
 * Reads num_points floats through aVector and writes the results through bVector:
 * full groups of eight go through the vector loop, the remainder through scalar cos().
 * NOTE(review): this excerpt is missing lines of the original definition (return type,
 * braces, some declarations and statement heads); comments cover visible lines only.
 */
187 volk_32f_cos_32f_a_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
189 float* bPtr = bVector;
190 const float* aPtr = aVector;
192 unsigned int number = 0;
/* Number of complete 8-float groups handled by the vector loop. */
193 unsigned int eighthPoints = num_points / 8;
/* NOTE(review): the declaration list below continues on a line not visible here. */
196 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
199 __m256i q, ones, twos, fours;
/* m4pi is approximately 4/pi; pio4A + pio4B + pio4C add up to approximately pi/4. */
201 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
202 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
203 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
204 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
205 ffours = _mm256_set1_ps(4.0);
206 ftwos = _mm256_set1_ps(2.0);
207 fones = _mm256_set1_ps(1.0);
208 fzeroes = _mm256_setzero_ps();
209 __m256i zeroes = _mm256_set1_epi32(0);
210 ones = _mm256_set1_epi32(1);
/* All bits set; XOR with this inverts a comparison mask. */
211 __m256i allones = _mm256_set1_epi32(0xffffffff);
212 twos = _mm256_set1_epi32(2);
213 fours = _mm256_set1_epi32(4);
/* Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400. */
215 cp1 = _mm256_set1_ps(1.0);
216 cp2 = _mm256_set1_ps(0.08333333333333333);
217 cp3 = _mm256_set1_ps(0.002777777777777778);
218 cp4 = _mm256_set1_ps(4.96031746031746e-05);
219 cp5 = _mm256_set1_ps(5.511463844797178e-07);
223 for (; number < eighthPoints; number++) {
225 aVal = _mm256_load_ps(aPtr);
/* s = |aVal|: subtract 2 * aVal in the lanes where aVal < 0. */
227 s = _mm256_sub_ps(aVal,
228 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
229 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
/* q = floor(s * m4pi) as a 32-bit integer. */
231 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
/* r = q + (q & 1) converted to float, i.e. q rounded up to an even value. */
233 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
/* s -= r * pio4A, then pio4B, then pio4C, each as a separate multiply and subtract. */
235 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
236 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
237 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
/* NOTE(review): tail of a statement whose head is not visible; it involves the constant 8.0. */
241 _mm256_set1_ps(8.0));
242 s = _mm256_mul_ps(s, s);
/* NOTE(review): fragment of the polynomial evaluation; visible part multiplies (s*cp5 - cp4) by an operand on a line not visible here. */
250 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
/* Three rounds of s = s * (4 - s). NOTE(review): the declaration of i is not visible here. */
259 for (
i = 0;
i < 3;
i++)
260 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
261 s = _mm256_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s); cosine = 1 - s. */
263 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
264 cosine = _mm256_sub_ps(fones, s);
/* Mask of lanes where ((q + 1) & 2) == 0. NOTE(review): the assignment target is on a line not visible here (presumably condition1.int_vec). */
268 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
/* Invert the mask held in condition1. */
269 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
/* condition3: lanes where ((q + 2) & 4) != 0 (compare equal to zero, then invert). */
272 condition3.int_vec = _mm256_cmpeq_epi32(
273 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
274 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
/* Where condition1 is set, add (sine - cosine) so those lanes take the sine value. */
276 cosine = _mm256_add_ps(
277 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
/* Where condition3 is set, subtract 2 * cosine, which flips the sign of those lanes. */
278 cosine = _mm256_sub_ps(cosine,
279 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0
f)),
280 condition3.float_vec));
281 _mm256_store_ps(bPtr, cosine);
/* Scalar tail for elements past the last full group of eight. */
/* NOTE(review): calls double-precision cos(); the SSE4.1 variants in this file use cosf(). */
286 number = eighthPoints * 8;
287 for (; number < num_points; number++) {
288 *bPtr++ = cos(*aPtr++);
294 #ifdef LV_HAVE_SSE4_1
295 #include <smmintrin.h>
/*
 * SSE4.1 cosine kernel for aligned buffers (uses _mm_load_ps / _mm_store_ps).
 * Reads num_points floats through aVector and writes the results through bVector:
 * full groups of four go through the vector loop, the remainder through scalar cosf().
 * NOTE(review): this excerpt is missing lines of the original definition (return type,
 * braces, some declarations and statement heads); comments cover visible lines only.
 */
298 volk_32f_cos_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
300 float* bPtr = bVector;
301 const float* aPtr = aVector;
303 unsigned int number = 0;
/* Number of complete 4-float groups handled by the vector loop. */
304 unsigned int quarterPoints = num_points / 4;
/* NOTE(review): the declaration list below continues on a line not visible here. */
307 __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
310 __m128i q, ones, twos, fours;
/* m4pi is approximately 4/pi; pio4A + pio4B + pio4C add up to approximately pi/4. */
312 m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
313 pio4A = _mm_set1_ps(0.7853981554508209228515625);
314 pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
315 pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
316 ffours = _mm_set1_ps(4.0);
317 ftwos = _mm_set1_ps(2.0);
318 fones = _mm_set1_ps(1.0);
319 fzeroes = _mm_setzero_ps();
320 __m128i zeroes = _mm_set1_epi32(0);
321 ones = _mm_set1_epi32(1);
/* All bits set; XOR with this inverts a comparison mask. */
322 __m128i allones = _mm_set1_epi32(0xffffffff);
323 twos = _mm_set1_epi32(2);
324 fours = _mm_set1_epi32(4);
/* Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400. */
326 cp1 = _mm_set1_ps(1.0);
327 cp2 = _mm_set1_ps(0.08333333333333333);
328 cp3 = _mm_set1_ps(0.002777777777777778);
329 cp4 = _mm_set1_ps(4.96031746031746e-05);
330 cp5 = _mm_set1_ps(5.511463844797178e-07);
334 for (; number < quarterPoints; number++) {
336 aVal = _mm_load_ps(aPtr);
/* NOTE(review): tail of a statement whose head is not visible; the visible operand is 2 * aVal masked to the lanes where aVal < 0. */
339 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
/* q = floor(s * m4pi) as a 32-bit integer. */
341 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
/* r = q + (q & 1) converted to float, i.e. q rounded up to an even value. */
343 r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
/* s -= r * pio4A, then pio4B, then pio4C. */
345 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
346 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
347 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
/* NOTE(review): tail of a statement whose head is not visible; operands are s and the constant 8.0. */
350 s, _mm_set1_ps(8.0));
351 s = _mm_mul_ps(s, s);
/* NOTE(review): fragment of the polynomial evaluation; visible part is (s*cp5 - cp4) * s added to an operand on a line not visible here. */
358 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
/* Three rounds of s = s * (4 - s). NOTE(review): the declaration of i is not visible here. */
366 for (
i = 0;
i < 3;
i++)
367 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
368 s = _mm_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s); cosine = 1 - s. */
370 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
371 cosine = _mm_sub_ps(fones, s);
/* Mask of lanes where ((q + 1) & 2) == 0. NOTE(review): the assignment target is on a line not visible here (presumably condition1.int_vec). */
375 _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
/* Invert the mask held in condition1. */
376 condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
/* Mask of lanes where ((q + 2) & 4) == 0. NOTE(review): the assignment target is on a line not visible here (presumably condition3.int_vec). */
380 _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
/* Invert the mask held in condition3. */
381 condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
/* Where condition1 is set, add (sine - cosine) so those lanes take the sine value. */
383 cosine = _mm_add_ps(cosine,
384 _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
/* NOTE(review): tail of a statement whose head is not visible; the visible operand is 2 * cosine masked by condition3. */
387 _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0
f)), condition3.float_vec));
388 _mm_store_ps(bPtr, cosine);
/* Scalar tail for elements past the last full group of four. */
393 number = quarterPoints * 4;
394 for (; number < num_points; number++) {
395 *bPtr++ = cosf(*aPtr++);
404 #ifndef INCLUDED_volk_32f_cos_32f_u_H
405 #define INCLUDED_volk_32f_cos_32f_u_H
407 #if LV_HAVE_AVX2 && LV_HAVE_FMA
408 #include <immintrin.h>
/*
 * AVX2 + FMA cosine kernel using unaligned loads/stores (_mm256_loadu_ps / _mm256_storeu_ps).
 * Reads num_points floats through aVector and writes the results through bVector:
 * full groups of eight go through the vector loop, the remainder through scalar cos().
 * NOTE(review): this excerpt is missing lines of the original definition (return type,
 * braces, some declarations and statement heads); comments cover visible lines only.
 */
411 volk_32f_cos_32f_u_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
413 float* bPtr = bVector;
414 const float* aPtr = aVector;
416 unsigned int number = 0;
/* Number of complete 8-float groups handled by the vector loop. */
417 unsigned int eighthPoints = num_points / 8;
/* NOTE(review): the declaration list below continues on a line not visible here. */
420 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
423 __m256i q, ones, twos, fours;
/* m4pi is approximately 4/pi; pio4A + pio4B + pio4C add up to approximately pi/4. */
425 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
426 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
427 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
428 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
429 ffours = _mm256_set1_ps(4.0);
430 ftwos = _mm256_set1_ps(2.0);
431 fones = _mm256_set1_ps(1.0);
432 fzeroes = _mm256_setzero_ps();
433 __m256i zeroes = _mm256_set1_epi32(0);
434 ones = _mm256_set1_epi32(1);
/* All bits set; XOR with this inverts a comparison mask. */
435 __m256i allones = _mm256_set1_epi32(0xffffffff);
436 twos = _mm256_set1_epi32(2);
437 fours = _mm256_set1_epi32(4);
/* Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400. */
439 cp1 = _mm256_set1_ps(1.0);
440 cp2 = _mm256_set1_ps(0.08333333333333333);
441 cp3 = _mm256_set1_ps(0.002777777777777778);
442 cp4 = _mm256_set1_ps(4.96031746031746e-05);
443 cp5 = _mm256_set1_ps(5.511463844797178e-07);
447 for (; number < eighthPoints; number++) {
449 aVal = _mm256_loadu_ps(aPtr);
/* s = |aVal|: subtract 2 * aVal in the lanes where aVal < 0. */
451 s = _mm256_sub_ps(aVal,
452 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
453 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
/* q = floor(s * m4pi) as a 32-bit integer. */
455 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
/* r = q + (q & 1) converted to float, i.e. q rounded up to an even value. */
457 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
/* s -= r * pio4A, then pio4B, then pio4C (fused negative multiply-add). */
459 s = _mm256_fnmadd_ps(r, pio4A, s);
460 s = _mm256_fnmadd_ps(r, pio4B, s);
461 s = _mm256_fnmadd_ps(r, pio4C, s);
/* NOTE(review): tail of a statement whose head is not visible; it involves the constant 8.0. */
465 _mm256_set1_ps(8.0));
466 s = _mm256_mul_ps(s, s);
/* NOTE(review): fragment of a nested FMA polynomial in s; visible part is ((s*cp5 - cp4)*s + cp3), then s and cp2 feed an outer call not visible here. */
471 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
/* Three rounds of s = s * (4 - s). NOTE(review): the declaration of i is not visible here. */
476 for (
i = 0;
i < 3;
i++)
477 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
478 s = _mm256_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s); cosine = 1 - s. */
480 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
481 cosine = _mm256_sub_ps(fones, s);
/* Mask of lanes where ((q + 1) & 2) == 0. NOTE(review): the assignment target is on a line not visible here (presumably condition1.int_vec). */
485 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
/* Invert the mask held in condition1. */
486 condition1.
int_vec = _mm256_xor_si256(allones, condition1.
int_vec);
/* condition3: lanes where ((q + 2) & 4) != 0 (compare equal to zero, then invert). */
489 condition3.
int_vec = _mm256_cmpeq_epi32(
490 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
491 condition3.
int_vec = _mm256_xor_si256(allones, condition3.
int_vec);
/* Where condition1 is set, add (sine - cosine) so those lanes take the sine value. */
493 cosine = _mm256_add_ps(
494 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.
float_vec));
/* NOTE(review): the line completing this statement (the mask operand) is not visible here; the visible part subtracts a masked 2 * cosine from cosine. */
495 cosine = _mm256_sub_ps(cosine,
496 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0
f)),
498 _mm256_storeu_ps(bPtr, cosine);
/* Scalar tail for elements past the last full group of eight. */
/* NOTE(review): calls double-precision cos(); the SSE4.1 variants in this file use cosf(). */
503 number = eighthPoints * 8;
504 for (; number < num_points; number++) {
505 *bPtr++ = cos(*aPtr++);
512 #include <immintrin.h>
/*
 * AVX2 cosine kernel (no FMA) using unaligned loads/stores (_mm256_loadu_ps / _mm256_storeu_ps).
 * Reads num_points floats through aVector and writes the results through bVector:
 * full groups of eight go through the vector loop, the remainder through scalar cos().
 * NOTE(review): this excerpt is missing lines of the original definition (return type,
 * braces, some declarations and statement heads); comments cover visible lines only.
 */
515 volk_32f_cos_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
517 float* bPtr = bVector;
518 const float* aPtr = aVector;
520 unsigned int number = 0;
/* Number of complete 8-float groups handled by the vector loop. */
521 unsigned int eighthPoints = num_points / 8;
/* NOTE(review): the declaration list below continues on a line not visible here. */
524 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
527 __m256i q, ones, twos, fours;
/* m4pi is approximately 4/pi; pio4A + pio4B + pio4C add up to approximately pi/4. */
529 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
530 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
531 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
532 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
533 ffours = _mm256_set1_ps(4.0);
534 ftwos = _mm256_set1_ps(2.0);
535 fones = _mm256_set1_ps(1.0);
536 fzeroes = _mm256_setzero_ps();
537 __m256i zeroes = _mm256_set1_epi32(0);
538 ones = _mm256_set1_epi32(1);
/* All bits set; XOR with this inverts a comparison mask. */
539 __m256i allones = _mm256_set1_epi32(0xffffffff);
540 twos = _mm256_set1_epi32(2);
541 fours = _mm256_set1_epi32(4);
/* Polynomial coefficients: 1, 1/12, 1/360, 1/20160, 1/1814400. */
543 cp1 = _mm256_set1_ps(1.0);
544 cp2 = _mm256_set1_ps(0.08333333333333333);
545 cp3 = _mm256_set1_ps(0.002777777777777778);
546 cp4 = _mm256_set1_ps(4.96031746031746e-05);
547 cp5 = _mm256_set1_ps(5.511463844797178e-07);
551 for (; number < eighthPoints; number++) {
553 aVal = _mm256_loadu_ps(aPtr);
/* s = |aVal|: subtract 2 * aVal in the lanes where aVal < 0. */
555 s = _mm256_sub_ps(aVal,
556 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
557 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
/* q = floor(s * m4pi) as a 32-bit integer. */
559 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
/* r = q + (q & 1) converted to float, i.e. q rounded up to an even value. */
561 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
/* s -= r * pio4A, then pio4B, then pio4C, each as a separate multiply and subtract. */
563 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
564 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
565 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
/* NOTE(review): tail of a statement whose head is not visible; it involves the constant 8.0. */
569 _mm256_set1_ps(8.0));
570 s = _mm256_mul_ps(s, s);
/* NOTE(review): fragment of the polynomial evaluation; visible part multiplies (s*cp5 - cp4) by an operand on a line not visible here. */
578 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
/* Three rounds of s = s * (4 - s). NOTE(review): the declaration of i is not visible here. */
587 for (
i = 0;
i < 3;
i++)
588 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
589 s = _mm256_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s); cosine = 1 - s. */
591 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
592 cosine = _mm256_sub_ps(fones, s);
/* Mask of lanes where ((q + 1) & 2) == 0. NOTE(review): the assignment target is on a line not visible here (presumably condition1.int_vec). */
596 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
/* Invert the mask held in condition1. */
597 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
/* condition3: lanes where ((q + 2) & 4) != 0 (compare equal to zero, then invert). */
600 condition3.int_vec = _mm256_cmpeq_epi32(
601 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
602 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
/* Where condition1 is set, add (sine - cosine) so those lanes take the sine value. */
604 cosine = _mm256_add_ps(
605 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
/* Where condition3 is set, subtract 2 * cosine, which flips the sign of those lanes. */
606 cosine = _mm256_sub_ps(cosine,
607 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0
f)),
608 condition3.float_vec));
609 _mm256_storeu_ps(bPtr, cosine);
/* Scalar tail for elements past the last full group of eight. */
/* NOTE(review): calls double-precision cos(); the SSE4.1 variants in this file use cosf(). */
614 number = eighthPoints * 8;
615 for (; number < num_points; number++) {
616 *bPtr++ = cos(*aPtr++);
622 #ifdef LV_HAVE_SSE4_1
623 #include <smmintrin.h>
/*
 * SSE4.1 cosine kernel using unaligned loads/stores (_mm_loadu_ps / _mm_storeu_ps).
 * Reads num_points floats through aVector and writes the results through bVector:
 * full groups of four go through the vector loop, the remainder through scalar cosf().
 * NOTE(review): unlike the aligned SSE4.1 variant in this file, this one uses shorter
 * constants, a two-part pi/4 split, and float-typed condition masks.
 * NOTE(review): this excerpt is missing lines of the original definition (return type,
 * braces, some declarations and statement heads); comments cover visible lines only.
 */
626 volk_32f_cos_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
628 float* bPtr = bVector;
629 const float* aPtr = aVector;
631 unsigned int number = 0;
/* Number of complete 4-float groups handled by the vector loop. */
632 unsigned int quarterPoints = num_points / 4;
/* NOTE(review): the declaration list below continues on a line not visible here. */
635 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
637 __m128 sine, cosine, condition1, condition3;
638 __m128i q, r, ones, twos, fours;
/* m4pi is approximately 4/pi; pio4A + pio4B add up to approximately pi/4. */
640 m4pi = _mm_set1_ps(1.273239545);
641 pio4A = _mm_set1_ps(0.78515625);
642 pio4B = _mm_set1_ps(0.241876e-3);
643 ffours = _mm_set1_ps(4.0);
644 ftwos = _mm_set1_ps(2.0);
645 fones = _mm_set1_ps(1.0);
646 fzeroes = _mm_setzero_ps();
647 ones = _mm_set1_epi32(1);
648 twos = _mm_set1_epi32(2);
649 fours = _mm_set1_epi32(4);
/* Polynomial coefficients, approximately 1, 1/12, 1/360, 1/20160, 1/1814400. */
651 cp1 = _mm_set1_ps(1.0);
652 cp2 = _mm_set1_ps(0.83333333e-1);
653 cp3 = _mm_set1_ps(0.2777778e-2);
654 cp4 = _mm_set1_ps(0.49603e-4);
655 cp5 = _mm_set1_ps(0.551e-6);
657 for (; number < quarterPoints; number++) {
658 aVal = _mm_loadu_ps(aPtr);
/* NOTE(review): tail of a statement whose head is not visible; the visible operand is 2 * aVal masked to the lanes where aVal < 0. */
660 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
/* q = floor(s * m4pi) as a 32-bit integer. */
661 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
/* r = q + (q & 1), kept as an integer vector here and converted to float at each use. */
662 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
/* s -= r * pio4A, then pio4B. */
664 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
665 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
/* NOTE(review): tail of a statement whose head is not visible; operands are s and the constant 8.0. */
668 s, _mm_set1_ps(8.0));
669 s = _mm_mul_ps(s, s);
/* NOTE(review): fragment of the polynomial evaluation; visible part is (s*cp5 - cp4) * s added to an operand on a line not visible here. */
676 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
/* Three rounds of s = s * (4 - s). NOTE(review): the declaration of i and this loop's closing brace are not visible here. */
684 for (
i = 0;
i < 3;
i++) {
685 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
687 s = _mm_div_ps(s, ftwos);
/* sine = sqrt((2 - s) * s); cosine = 1 - s. */
689 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
690 cosine = _mm_sub_ps(fones, s);
/* condition1: lanes where ((q + 1) & 2) != 0, tested after converting the masked integer to float. */
692 condition1 = _mm_cmpneq_ps(
693 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
/* condition3: lanes where ((q + 2) & 4) != 0, tested the same way. */
695 condition3 = _mm_cmpneq_ps(
696 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
/* Where condition1 is set, add (sine - cosine) so those lanes take the sine value. */
698 cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
/* NOTE(review): tail of a statement whose head is not visible; visible operands are cosine and 2 * cosine masked by condition3. */
700 cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0
f)), condition3));
701 _mm_storeu_ps(bPtr, cosine);
/* Scalar tail for elements past the last full group of four. */
706 number = quarterPoints * 4;
707 for (; number < num_points; number++) {
708 *bPtr++ = cosf(*aPtr++);
715 #ifdef LV_HAVE_GENERIC
/*
 * Scalar polynomial cosine kernel, following the same visible steps as the SIMD variants.
 * NOTE(review): the function name, return type, first parameter, braces and several
 * statements are not visible in this excerpt; comments cover visible lines only.
 */
723 const float* aVector,
724 unsigned int num_points)
726 float* bPtr = bVector;
727 const float* aPtr = aVector;
/* m4pi is approximately 4/pi; pio4A + pio4B + pio4C add up to approximately pi/4. */
729 float m4pi = 1.273239544735162542821171882678754627704620361328125;
730 float pio4A = 0.7853981554508209228515625;
731 float pio4B = 0.794662735614792836713604629039764404296875e-8;
732 float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
/* NOTE(review): the declaration of number is not visible here. */
736 for (number = 0; number < num_points; number++) {
/* s = |input|; q = (int)(s * m4pi), truncated by the cast. */
737 float s = fabs(*aPtr);
738 int q = (int)(s * m4pi);
/* NOTE(review): first line of a polynomial in s with coefficients 1/1814400, -1/20160, 1/360, -1/12; the continuation is not visible here. */
746 s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s +
/* NOTE(review): loop over N iterations; N, i and the loop body are not visible here. */
751 for (
i = 0;
i < N; ++
i) {
/* sine = sqrt((2 - s) * s); cosine = 1 - s. */
756 float sine = sqrt((2.0 - s) * s);
757 float cosine = 1 - s;
/* NOTE(review): the bodies of the two conditionals below are not visible here. */
759 if (((q + 1) & 2) != 0) {
764 if (((q + 2) & 4) != 0) {
776 #ifdef LV_HAVE_GENERIC
/*
 * Scalar reference loop: writes cosf() of each of num_points inputs read through
 * aVector to the matching position behind bVector.
 * NOTE(review): the function signature and braces are not visible in this excerpt.
 */
781 float* bPtr = bVector;
782 const float* aPtr = aVector;
783 unsigned int number = 0;
785 for (; number < num_points; number++) {
786 *bPtr++ = cosf(*aPtr++);
794 #include <arm_neon.h>
/*
 * NEON kernel body: loads four floats at a time with vld1q_f32, stores a result
 * vector with vst1q_f32, and finishes the remaining elements with scalar cosf().
 * NOTE(review): the function signature, the declarations of a_vec / b_vec, the
 * computation of b_vec and the pointer advances are not visible in this excerpt.
 */
800 unsigned int number = 0;
/* Number of complete 4-float groups handled by the vector loop. */
801 unsigned int quarter_points = num_points / 4;
802 float* bVectorPtr = bVector;
803 const float* aVectorPtr = aVector;
808 for (number = 0; number < quarter_points; number++) {
809 a_vec = vld1q_f32(aVectorPtr);
813 vst1q_f32(bVectorPtr, b_vec);
/* Scalar tail for elements past the last full group of four. */
820 for (number = quarter_points * 4; number < num_points; number++) {
821 *bVectorPtr++ = cosf(*aVectorPtr++);