#ifndef INCLUDED_volk_32f_sin_32f_a_H
#define INCLUDED_volk_32f_sin_32f_a_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief Computes bVector[k] = sin(aVector[k]) with AVX2 + FMA, aligned access.
 *
 * Eight floats are processed per iteration with aligned loads/stores; the
 * remaining (num_points % 8) elements are handled with sinf().
 *
 * \param bVector    Output buffer, written for num_points elements.
 * \param aVector    Input buffer, read for num_points elements.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  unsigned int eighthPoints = num_points / 8;
  unsigned int i = 0;

  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5;
  __m256 feights, ffours, ftwos, fones, fzeroes;
  __m256 sine, cosine, condition1, condition2;
  __m256i q, r, ones, twos, fours;

  m4pi = _mm256_set1_ps(1.273239545);  /* ~ 4/pi */
  pio4A = _mm256_set1_ps(0.78515625);  /* pio4A + pio4B ~ pi/4 */
  pio4B = _mm256_set1_ps(0.241876e-3);
  feights = _mm256_set1_ps(8.0);
  ffours = _mm256_set1_ps(4.0);
  ftwos = _mm256_set1_ps(2.0);
  fones = _mm256_set1_ps(1.0);
  fzeroes = _mm256_setzero_ps();
  ones = _mm256_set1_epi32(1);
  twos = _mm256_set1_epi32(2);
  fours = _mm256_set1_epi32(4);

  /* Coefficients of the alternating series y - y^2/12 + y^3/360 - ... */
  cp1 = _mm256_set1_ps(1.0);
  cp2 = _mm256_set1_ps(0.83333333e-1);
  cp3 = _mm256_set1_ps(0.2777778e-2);
  cp4 = _mm256_set1_ps(0.49603e-4);
  cp5 = _mm256_set1_ps(0.551e-6);

  for(; number < eighthPoints; number++) {
    aVal = _mm256_load_ps(aPtr);
    /* s = |aVal|: subtract 2*aVal in the lanes where aVal < 0. */
    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
    /* q = floor(s * 4/pi); r = q rounded up to the next even integer. */
    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
    r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

    /* Reduced argument: s -= r * pi/4, applied in two steps. */
    s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
    s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);

    s = _mm256_div_ps(s, feights);
    s = _mm256_mul_ps(s, s);
    /* Evaluate the series in Horner form on the squared, scaled argument. */
    s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);

    /* Three applications of s <- s * (4 - s) undo the division by 8. */
    for(i = 0; i < 3; i++) {
      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
    }
    s = _mm256_div_ps(s, ftwos);

    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
    cosine = _mm256_sub_ps(fones, s);

    /* condition1: lanes where the cosine of the reduced argument is the result. */
    condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
    /* condition2: lanes whose sign must be flipped, i.e. exactly one of
     * "(q & 4) != 0" and "aVal < 0" holds. The masks are combined with a
     * bitwise XOR: a float NEQ compare would treat two all-ones masks
     * (NaN bit patterns) as different and flip the sign when both hold. */
    condition2 = _mm256_xor_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS));

    sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
    sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, ftwos), condition2));
    _mm256_store_ps(bPtr, sine);
    aPtr += 8;
    bPtr += 8;
  }

  /* Scalar tail for the elements that do not fill a whole vector. */
  number = eighthPoints * 8;
  for(; number < num_points; number++) {
    *bPtr++ = sinf(*aPtr++);
  }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
 * \brief Computes bVector[k] = sin(aVector[k]) with AVX2 (no FMA), aligned access.
 *
 * Eight floats are processed per iteration with aligned loads/stores; the
 * remaining (num_points % 8) elements are handled with sinf().
 *
 * \param bVector    Output buffer, written for num_points elements.
 * \param aVector    Input buffer, read for num_points elements.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  unsigned int eighthPoints = num_points / 8;
  unsigned int i = 0;

  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5;
  __m256 feights, ffours, ftwos, fones, fzeroes;
  __m256 sine, cosine, condition1, condition2;
  __m256i q, r, ones, twos, fours;

  m4pi = _mm256_set1_ps(1.273239545);  /* ~ 4/pi */
  pio4A = _mm256_set1_ps(0.78515625);  /* pio4A + pio4B ~ pi/4 */
  pio4B = _mm256_set1_ps(0.241876e-3);
  feights = _mm256_set1_ps(8.0);
  ffours = _mm256_set1_ps(4.0);
  ftwos = _mm256_set1_ps(2.0);
  fones = _mm256_set1_ps(1.0);
  fzeroes = _mm256_setzero_ps();
  ones = _mm256_set1_epi32(1);
  twos = _mm256_set1_epi32(2);
  fours = _mm256_set1_epi32(4);

  /* Coefficients of the alternating series y - y^2/12 + y^3/360 - ... */
  cp1 = _mm256_set1_ps(1.0);
  cp2 = _mm256_set1_ps(0.83333333e-1);
  cp3 = _mm256_set1_ps(0.2777778e-2);
  cp4 = _mm256_set1_ps(0.49603e-4);
  cp5 = _mm256_set1_ps(0.551e-6);

  for(; number < eighthPoints; number++) {
    aVal = _mm256_load_ps(aPtr);
    /* s = |aVal|: subtract 2*aVal in the lanes where aVal < 0. */
    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
    /* q = floor(s * 4/pi); r = q rounded up to the next even integer. */
    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
    r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

    /* Reduced argument: s -= r * pi/4, applied in two steps. */
    s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
    s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));

    s = _mm256_div_ps(s, feights);
    s = _mm256_mul_ps(s, s);
    /* Evaluate the series in Horner form on the squared, scaled argument. */
    s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);

    /* Three applications of s <- s * (4 - s) undo the division by 8. */
    for(i = 0; i < 3; i++) {
      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
    }
    s = _mm256_div_ps(s, ftwos);

    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
    cosine = _mm256_sub_ps(fones, s);

    /* condition1: lanes where the cosine of the reduced argument is the result. */
    condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes, _CMP_NEQ_UQ);
    /* condition2: sign flip when exactly one of "(q & 4) != 0" and
     * "aVal < 0" holds; bitwise XOR because all-ones masks are NaN patterns
     * and a float NEQ compare reports NaN != NaN as true. */
    condition2 = _mm256_xor_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ), _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS));

    sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
    sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, ftwos), condition2));
    _mm256_store_ps(bPtr, sine);
    aPtr += 8;
    bPtr += 8;
  }

  /* Scalar tail for the elements that do not fill a whole vector. */
  number = eighthPoints * 8;
  for(; number < num_points; number++) {
    *bPtr++ = sinf(*aPtr++);
  }
}

#endif /* LV_HAVE_AVX2 for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*!
 * \brief Computes bVector[k] = sin(aVector[k]) with SSE4.1, aligned access.
 *
 * Four floats are processed per iteration with aligned loads/stores; the
 * remaining (num_points % 4) elements are handled with sinf().
 *
 * \param bVector    Output buffer, written for num_points elements.
 * \param aVector    Input buffer, read for num_points elements.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  unsigned int quarterPoints = num_points / 4;
  unsigned int i = 0;

  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5;
  __m128 feights, ffours, ftwos, fones, fzeroes;
  __m128 sine, cosine, condition1, condition2;
  __m128i q, r, ones, twos, fours;

  m4pi = _mm_set1_ps(1.273239545);  /* ~ 4/pi */
  pio4A = _mm_set1_ps(0.78515625);  /* pio4A + pio4B ~ pi/4 */
  pio4B = _mm_set1_ps(0.241876e-3);
  feights = _mm_set1_ps(8.0);
  ffours = _mm_set1_ps(4.0);
  ftwos = _mm_set1_ps(2.0);
  fones = _mm_set1_ps(1.0);
  fzeroes = _mm_setzero_ps();
  ones = _mm_set1_epi32(1);
  twos = _mm_set1_epi32(2);
  fours = _mm_set1_epi32(4);

  /* Coefficients of the alternating series y - y^2/12 + y^3/360 - ... */
  cp1 = _mm_set1_ps(1.0);
  cp2 = _mm_set1_ps(0.83333333e-1);
  cp3 = _mm_set1_ps(0.2777778e-2);
  cp4 = _mm_set1_ps(0.49603e-4);
  cp5 = _mm_set1_ps(0.551e-6);

  for(; number < quarterPoints; number++) {
    aVal = _mm_load_ps(aPtr);
    /* s = |aVal|: subtract 2*aVal in the lanes where aVal < 0. */
    s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
    /* q = floor(s * 4/pi); r = q rounded up to the next even integer. */
    q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
    r = _mm_add_epi32(q, _mm_and_si128(q, ones));

    /* Reduced argument: s -= r * pi/4, applied in two steps. */
    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));

    s = _mm_div_ps(s, feights);
    s = _mm_mul_ps(s, s);
    /* Evaluate the series in Horner form on the squared, scaled argument. */
    s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);

    /* Three applications of s <- s * (4 - s) undo the division by 8. */
    for(i = 0; i < 3; i++) {
      s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
    }
    s = _mm_div_ps(s, ftwos);

    sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
    cosine = _mm_sub_ps(fones, s);

    /* condition1: lanes where the cosine of the reduced argument is the result. */
    condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
    /* condition2: sign flip when exactly one of "(q & 4) != 0" and
     * "aVal < 0" holds; bitwise XOR because all-ones masks are NaN patterns
     * and a float NEQ compare reports NaN != NaN as true. */
    condition2 = _mm_xor_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));

    sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
    sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, ftwos), condition2));
    _mm_store_ps(bPtr, sine);
    aPtr += 4;
    bPtr += 4;
  }

  /* Scalar tail for the elements that do not fill a whole vector. */
  number = quarterPoints * 4;
  for(; number < num_points; number++) {
    *bPtr++ = sinf(*aPtr++);
  }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_sin_32f_a_H */
311 #ifndef INCLUDED_volk_32f_sin_32f_u_H 312 #define INCLUDED_volk_32f_sin_32f_u_H 314 #if LV_HAVE_AVX2 && LV_HAVE_FMA 315 #include <immintrin.h> 318 volk_32f_sin_32f_u_avx2_fma(
float* bVector,
const float* aVector,
unsigned int num_points)
320 float* bPtr = bVector;
321 const float* aPtr = aVector;
323 unsigned int number = 0;
324 unsigned int eighthPoints = num_points / 8;
327 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
328 __m256 sine, cosine, condition1, condition2;
329 __m256i q, r, ones, twos, fours;
331 m4pi = _mm256_set1_ps(1.273239545);
332 pio4A = _mm256_set1_ps(0.78515625);
333 pio4B = _mm256_set1_ps(0.241876e-3);
334 ffours = _mm256_set1_ps(4.0);
335 ftwos = _mm256_set1_ps(2.0);
336 fones = _mm256_set1_ps(1.0);
337 fzeroes = _mm256_setzero_ps();
338 ones = _mm256_set1_epi32(1);
339 twos = _mm256_set1_epi32(2);
340 fours = _mm256_set1_epi32(4);
342 cp1 = _mm256_set1_ps(1.0);
343 cp2 = _mm256_set1_ps(0.83333333e-1);
344 cp3 = _mm256_set1_ps(0.2777778e-2);
345 cp4 = _mm256_set1_ps(0.49603e-4);
346 cp5 = _mm256_set1_ps(0.551e-6);
348 for(;number < eighthPoints; number++) {
349 aVal = _mm256_loadu_ps(aPtr);
350 s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
351 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
352 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
354 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
355 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
357 s = _mm256_div_ps(s, _mm256_set1_ps(8.0));
358 s = _mm256_mul_ps(s, s);
360 s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
362 for(i = 0; i < 3; i++) {
363 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
365 s = _mm256_div_ps(s, ftwos);
367 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
368 cosine = _mm256_sub_ps(fones, s);
370 condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes,4);
371 condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes,4), _mm256_cmp_ps(aVal, fzeroes,1),4);
375 sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
376 sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
377 _mm256_storeu_ps(bPtr, sine);
382 number = eighthPoints * 8;
383 for(;number < num_points; number++) {
384 *bPtr++ = sin(*aPtr++);
391 #include <immintrin.h> 394 volk_32f_sin_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
396 float* bPtr = bVector;
397 const float* aPtr = aVector;
399 unsigned int number = 0;
400 unsigned int eighthPoints = num_points / 8;
403 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
404 __m256 sine, cosine, condition1, condition2;
405 __m256i q, r, ones, twos, fours;
407 m4pi = _mm256_set1_ps(1.273239545);
408 pio4A = _mm256_set1_ps(0.78515625);
409 pio4B = _mm256_set1_ps(0.241876e-3);
410 ffours = _mm256_set1_ps(4.0);
411 ftwos = _mm256_set1_ps(2.0);
412 fones = _mm256_set1_ps(1.0);
413 fzeroes = _mm256_setzero_ps();
414 ones = _mm256_set1_epi32(1);
415 twos = _mm256_set1_epi32(2);
416 fours = _mm256_set1_epi32(4);
418 cp1 = _mm256_set1_ps(1.0);
419 cp2 = _mm256_set1_ps(0.83333333e-1);
420 cp3 = _mm256_set1_ps(0.2777778e-2);
421 cp4 = _mm256_set1_ps(0.49603e-4);
422 cp5 = _mm256_set1_ps(0.551e-6);
424 for(;number < eighthPoints; number++) {
425 aVal = _mm256_loadu_ps(aPtr);
426 s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
427 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
428 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
430 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
431 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
433 s = _mm256_div_ps(s, _mm256_set1_ps(8.0));
434 s = _mm256_mul_ps(s, s);
436 s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
438 for(i = 0; i < 3; i++) {
439 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
441 s = _mm256_div_ps(s, ftwos);
443 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
444 cosine = _mm256_sub_ps(fones, s);
446 condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes,4);
447 condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes,4), _mm256_cmp_ps(aVal, fzeroes,1),4);
451 sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
452 sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
453 _mm256_storeu_ps(bPtr, sine);
458 number = eighthPoints * 8;
459 for(;number < num_points; number++) {
460 *bPtr++ = sin(*aPtr++);
467 #ifdef LV_HAVE_SSE4_1 468 #include <smmintrin.h> 471 volk_32f_sin_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
473 float* bPtr = bVector;
474 const float* aPtr = aVector;
476 unsigned int number = 0;
477 unsigned int quarterPoints = num_points / 4;
480 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
481 __m128 sine, cosine, condition1, condition2;
482 __m128i q, r, ones, twos, fours;
484 m4pi = _mm_set1_ps(1.273239545);
485 pio4A = _mm_set1_ps(0.78515625);
486 pio4B = _mm_set1_ps(0.241876e-3);
487 ffours = _mm_set1_ps(4.0);
488 ftwos = _mm_set1_ps(2.0);
489 fones = _mm_set1_ps(1.0);
490 fzeroes = _mm_setzero_ps();
491 ones = _mm_set1_epi32(1);
492 twos = _mm_set1_epi32(2);
493 fours = _mm_set1_epi32(4);
495 cp1 = _mm_set1_ps(1.0);
496 cp2 = _mm_set1_ps(0.83333333e-1);
497 cp3 = _mm_set1_ps(0.2777778e-2);
498 cp4 = _mm_set1_ps(0.49603e-4);
499 cp5 = _mm_set1_ps(0.551e-6);
501 for(;number < quarterPoints; number++) {
502 aVal = _mm_loadu_ps(aPtr);
503 s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
504 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
505 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
507 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
508 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
510 s = _mm_div_ps(s, _mm_set1_ps(8.0));
511 s = _mm_mul_ps(s, s);
513 s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
515 for(i = 0; i < 3; i++) {
516 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
518 s = _mm_div_ps(s, ftwos);
520 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
521 cosine = _mm_sub_ps(fones, s);
523 condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
524 condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
526 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
527 sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
528 _mm_storeu_ps(bPtr, sine);
533 number = quarterPoints * 4;
534 for(;number < num_points; number++){
535 *bPtr++ = sinf(*aPtr++);
542 #ifdef LV_HAVE_GENERIC 547 float* bPtr = bVector;
548 const float* aPtr = aVector;
549 unsigned int number = 0;
551 for(number = 0; number < num_points; number++) {
552 *bPtr++ = sinf(*aPtr++);
static void volk_32f_sin_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:545
for i
Definition: volk_config_fixed.tmpl.h:25