#ifndef INCLUDED_volk_32f_tan_32f_a_H
#define INCLUDED_volk_32f_tan_32f_a_H

/*
 * Aligned tangent kernels: bVector[n] = tan(aVector[n]).
 *
 * These variants use aligned SIMD loads/stores, so both buffers must be
 * aligned to the vector width (32 bytes for the AVX2 kernels, 16 bytes for
 * the SSE4.1 kernel).
 *
 * All SIMD variants share the same scheme, applied lane-wise:
 *   1. s = |x|; the sign of x is kept as a lane mask.
 *   2. q = floor(s * 4/pi) selects the octant; r is q rounded up to the next
 *      even integer.
 *   3. s -= r * pi/4, where pi/4 is split into pio4A + pio4B and removed in
 *      two steps (presumably to limit rounding error in the reduction).
 *   4. With t = s / 8, the polynomial
 *        t^2 * (cp1 - cp2*t^2 + cp3*t^4 - cp4*t^6 + cp5*t^8)
 *      gives the leading series terms of 2 * (1 - cos t).
 *   5. Each application of u = u * (4 - u) doubles the angle; after three
 *      rounds and a final halving, s = 1 - cos(reduced angle).
 *   6. |sin| = sqrt((2 - s) * s) and cos = 1 - s.
 *   7. Octant fix-ups: swap sine/cosine, then flip their signs as needed.
 *   8. tan = sine / cosine.
 *
 * Points that do not fill a whole vector are handled with tanf().
 */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief Tangent of each input point (AVX2 + FMA, aligned buffers).
 *
 * \param bVector    Output buffer, num_points floats, 32-byte aligned.
 * \param aVector    Input buffer (radians), num_points floats, 32-byte aligned.
 * \param num_points Number of points to process.
 */
static inline void
volk_32f_tan_32f_a_avx2_fma(float* bVector,
                            const float* aVector,
                            unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;
  unsigned int i = 0;

  __m256 aVal, s, isNegative, temp;
  __m256 sine, cosine, tangent, condition1, condition2, condition3;
  __m256i q, r;

  /* Range-reduction constants: 4/pi and pi/4 split into two parts. */
  const __m256 m4pi = _mm256_set1_ps(1.273239545);
  const __m256 pio4A = _mm256_set1_ps(0.78515625);
  const __m256 pio4B = _mm256_set1_ps(0.241876e-3);
  const __m256 feights = _mm256_set1_ps(8.0);
  const __m256 ffours = _mm256_set1_ps(4.0);
  const __m256 ftwos = _mm256_set1_ps(2.0);
  const __m256 fones = _mm256_set1_ps(1.0);
  const __m256 fzeroes = _mm256_setzero_ps();
  const __m256i ones = _mm256_set1_epi32(1);
  const __m256i twos = _mm256_set1_epi32(2);
  const __m256i fours = _mm256_set1_epi32(4);

  /* Series coefficients of 2 * (1 - cos t) in powers of t^2. */
  const __m256 cp1 = _mm256_set1_ps(1.0);
  const __m256 cp2 = _mm256_set1_ps(0.83333333e-1);
  const __m256 cp3 = _mm256_set1_ps(0.2777778e-2);
  const __m256 cp4 = _mm256_set1_ps(0.49603e-4);
  const __m256 cp5 = _mm256_set1_ps(0.551e-6);

  for (; number < eighthPoints; number++) {
    aVal = _mm256_load_ps(aPtr);

    /* s = |aVal|: subtract 2 * aVal from the negative lanes only. */
    isNegative = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), isNegative));
    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
    r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

    /* s -= r * pi/4, in two fused steps. */
    s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
    s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);

    s = _mm256_div_ps(s, feights);
    s = _mm256_mul_ps(s, s);

    /* Evaluate the series in Horner form. */
    s = _mm256_mul_ps(
        _mm256_fmadd_ps(
            _mm256_fmsub_ps(
                _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
            s,
            cp1),
        s);

    /* Three angle doublings undo the division by 8. */
    for (i = 0; i < 3; i++) {
      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
    }
    s = _mm256_div_ps(s, ftwos);

    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
    cosine = _mm256_sub_ps(fones, s);

    condition1 = _mm256_cmp_ps(
        _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
        fzeroes,
        _CMP_NEQ_UQ);
    /* Sine flips when exactly one of "(q & 4) set" / "input negative" holds.
     * The masks are combined bitwise: comparing them as floats would treat
     * two all-ones masks (NaN bit patterns) as "not equal". */
    condition2 = _mm256_xor_ps(
        _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
        isNegative);
    condition3 = _mm256_cmp_ps(
        _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
        fzeroes,
        _CMP_NEQ_UQ);

    /* Swap sine and cosine where condition1 holds. */
    temp = cosine;
    cosine =
        _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
    sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
    /* Negate by subtracting twice the value in the selected lanes. */
    sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, ftwos), condition2));
    cosine =
        _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, ftwos), condition3));

    tangent = _mm256_div_ps(sine, cosine);
    _mm256_store_ps(bPtr, tangent);

    aPtr += 8;
    bPtr += 8;
  }

  /* Scalar tail for the points that do not fill a whole vector. */
  number = eighthPoints * 8;
  for (; number < num_points; number++) {
    *bPtr++ = tanf(*aPtr++);
  }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
 * \brief Tangent of each input point (AVX2 without FMA, aligned buffers).
 *
 * \param bVector    Output buffer, num_points floats, 32-byte aligned.
 * \param aVector    Input buffer (radians), num_points floats, 32-byte aligned.
 * \param num_points Number of points to process.
 */
static inline void
volk_32f_tan_32f_a_avx2(float* bVector,
                        const float* aVector,
                        unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;
  unsigned int i = 0;

  __m256 aVal, s, isNegative, temp;
  __m256 sine, cosine, tangent, condition1, condition2, condition3;
  __m256i q, r;

  /* Range-reduction constants: 4/pi and pi/4 split into two parts. */
  const __m256 m4pi = _mm256_set1_ps(1.273239545);
  const __m256 pio4A = _mm256_set1_ps(0.78515625);
  const __m256 pio4B = _mm256_set1_ps(0.241876e-3);
  const __m256 feights = _mm256_set1_ps(8.0);
  const __m256 ffours = _mm256_set1_ps(4.0);
  const __m256 ftwos = _mm256_set1_ps(2.0);
  const __m256 fones = _mm256_set1_ps(1.0);
  const __m256 fzeroes = _mm256_setzero_ps();
  const __m256i ones = _mm256_set1_epi32(1);
  const __m256i twos = _mm256_set1_epi32(2);
  const __m256i fours = _mm256_set1_epi32(4);

  /* Series coefficients of 2 * (1 - cos t) in powers of t^2. */
  const __m256 cp1 = _mm256_set1_ps(1.0);
  const __m256 cp2 = _mm256_set1_ps(0.83333333e-1);
  const __m256 cp3 = _mm256_set1_ps(0.2777778e-2);
  const __m256 cp4 = _mm256_set1_ps(0.49603e-4);
  const __m256 cp5 = _mm256_set1_ps(0.551e-6);

  for (; number < eighthPoints; number++) {
    aVal = _mm256_load_ps(aPtr);

    /* s = |aVal|: subtract 2 * aVal from the negative lanes only. */
    isNegative = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), isNegative));
    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
    r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

    /* s -= r * pi/4, in two steps. */
    s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
    s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));

    s = _mm256_div_ps(s, feights);
    s = _mm256_mul_ps(s, s);

    /* Evaluate the series in Horner form. */
    s = _mm256_mul_ps(
        _mm256_add_ps(
            _mm256_mul_ps(
                _mm256_sub_ps(
                    _mm256_mul_ps(
                        _mm256_add_ps(
                            _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s),
                            cp3),
                        s),
                    cp2),
                s),
            cp1),
        s);

    /* Three angle doublings undo the division by 8. */
    for (i = 0; i < 3; i++) {
      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
    }
    s = _mm256_div_ps(s, ftwos);

    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
    cosine = _mm256_sub_ps(fones, s);

    condition1 = _mm256_cmp_ps(
        _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
        fzeroes,
        _CMP_NEQ_UQ);
    /* Sine flips when exactly one of "(q & 4) set" / "input negative" holds.
     * The masks are combined bitwise: comparing them as floats would treat
     * two all-ones masks (NaN bit patterns) as "not equal". */
    condition2 = _mm256_xor_ps(
        _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
        isNegative);
    condition3 = _mm256_cmp_ps(
        _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
        fzeroes,
        _CMP_NEQ_UQ);

    /* Swap sine and cosine where condition1 holds. */
    temp = cosine;
    cosine =
        _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
    sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
    /* Negate by subtracting twice the value in the selected lanes. */
    sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, ftwos), condition2));
    cosine =
        _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, ftwos), condition3));

    tangent = _mm256_div_ps(sine, cosine);
    _mm256_store_ps(bPtr, tangent);

    aPtr += 8;
    bPtr += 8;
  }

  /* Scalar tail for the points that do not fill a whole vector. */
  number = eighthPoints * 8;
  for (; number < num_points; number++) {
    *bPtr++ = tanf(*aPtr++);
  }
}

#endif /* LV_HAVE_AVX2 for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*!
 * \brief Tangent of each input point (SSE4.1, aligned buffers).
 *
 * \param bVector    Output buffer, num_points floats, 16-byte aligned.
 * \param aVector    Input buffer (radians), num_points floats, 16-byte aligned.
 * \param num_points Number of points to process.
 */
static inline void
volk_32f_tan_32f_a_sse4_1(float* bVector,
                          const float* aVector,
                          unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;
  unsigned int i = 0;

  __m128 aVal, s, isNegative, temp;
  __m128 sine, cosine, tangent, condition1, condition2, condition3;
  __m128i q, r;

  /* Range-reduction constants: 4/pi and pi/4 split into two parts. */
  const __m128 m4pi = _mm_set1_ps(1.273239545);
  const __m128 pio4A = _mm_set1_ps(0.78515625);
  const __m128 pio4B = _mm_set1_ps(0.241876e-3);
  const __m128 feights = _mm_set1_ps(8.0);
  const __m128 ffours = _mm_set1_ps(4.0);
  const __m128 ftwos = _mm_set1_ps(2.0);
  const __m128 fones = _mm_set1_ps(1.0);
  const __m128 fzeroes = _mm_setzero_ps();
  const __m128i ones = _mm_set1_epi32(1);
  const __m128i twos = _mm_set1_epi32(2);
  const __m128i fours = _mm_set1_epi32(4);

  /* Series coefficients of 2 * (1 - cos t) in powers of t^2. */
  const __m128 cp1 = _mm_set1_ps(1.0);
  const __m128 cp2 = _mm_set1_ps(0.83333333e-1);
  const __m128 cp3 = _mm_set1_ps(0.2777778e-2);
  const __m128 cp4 = _mm_set1_ps(0.49603e-4);
  const __m128 cp5 = _mm_set1_ps(0.551e-6);

  for (; number < quarterPoints; number++) {
    aVal = _mm_load_ps(aPtr);

    /* s = |aVal|: subtract 2 * aVal from the negative lanes only. */
    isNegative = _mm_cmplt_ps(aVal, fzeroes);
    s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), isNegative));
    q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
    r = _mm_add_epi32(q, _mm_and_si128(q, ones));

    /* s -= r * pi/4, in two steps. */
    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));

    s = _mm_div_ps(s, feights);
    s = _mm_mul_ps(s, s);

    /* Evaluate the series in Horner form. */
    s = _mm_mul_ps(
        _mm_add_ps(
            _mm_mul_ps(
                _mm_sub_ps(
                    _mm_mul_ps(
                        _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
                                   cp3),
                        s),
                    cp2),
                s),
            cp1),
        s);

    /* Three angle doublings undo the division by 8. */
    for (i = 0; i < 3; i++) {
      s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
    }
    s = _mm_div_ps(s, ftwos);

    sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
    cosine = _mm_sub_ps(fones, s);

    condition1 = _mm_cmpneq_ps(
        _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
    /* Sine flips when exactly one of "(q & 4) set" / "input negative" holds.
     * The masks are combined bitwise: comparing them as floats would treat
     * two all-ones masks (NaN bit patterns) as "not equal". */
    condition2 = _mm_xor_ps(
        _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), isNegative);
    condition3 = _mm_cmpneq_ps(
        _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);

    /* Swap sine and cosine where condition1 holds. */
    temp = cosine;
    cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
    sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
    /* Negate by subtracting twice the value in the selected lanes. */
    sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, ftwos), condition2));
    cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, ftwos), condition3));

    tangent = _mm_div_ps(sine, cosine);
    _mm_store_ps(bPtr, tangent);

    aPtr += 4;
    bPtr += 4;
  }

  /* Scalar tail for the points that do not fill a whole vector. */
  number = quarterPoints * 4;
  for (; number < num_points; number++) {
    *bPtr++ = tanf(*aPtr++);
  }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_tan_32f_a_H */
#ifndef INCLUDED_volk_32f_tan_32f_u_H
#define INCLUDED_volk_32f_tan_32f_u_H

/*
 * Unaligned tangent kernels and the generic fallback:
 * bVector[n] = tan(aVector[n]).
 *
 * The SIMD variants here use unaligned loads/stores, so the buffers need no
 * particular alignment. They share the same lane-wise scheme:
 *   1. s = |x|; the sign of x is kept as a lane mask.
 *   2. q = floor(s * 4/pi) selects the octant; r is q rounded up to the next
 *      even integer.
 *   3. s -= r * pi/4, where pi/4 is split into pio4A + pio4B and removed in
 *      two steps (presumably to limit rounding error in the reduction).
 *   4. With t = s / 8, the polynomial
 *        t^2 * (cp1 - cp2*t^2 + cp3*t^4 - cp4*t^6 + cp5*t^8)
 *      gives the leading series terms of 2 * (1 - cos t).
 *   5. Each application of u = u * (4 - u) doubles the angle; after three
 *      rounds and a final halving, s = 1 - cos(reduced angle).
 *   6. |sin| = sqrt((2 - s) * s) and cos = 1 - s.
 *   7. Octant fix-ups: swap sine/cosine, then flip their signs as needed.
 *   8. tan = sine / cosine.
 *
 * Points that do not fill a whole vector are handled with tanf().
 */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief Tangent of each input point (AVX2 + FMA, unaligned buffers).
 *
 * \param bVector    Output buffer, num_points floats.
 * \param aVector    Input buffer (radians), num_points floats.
 * \param num_points Number of points to process.
 */
static inline void
volk_32f_tan_32f_u_avx2_fma(float* bVector,
                            const float* aVector,
                            unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;
  unsigned int i = 0;

  __m256 aVal, s, isNegative, temp;
  __m256 sine, cosine, tangent, condition1, condition2, condition3;
  __m256i q, r;

  /* Range-reduction constants: 4/pi and pi/4 split into two parts. */
  const __m256 m4pi = _mm256_set1_ps(1.273239545);
  const __m256 pio4A = _mm256_set1_ps(0.78515625);
  const __m256 pio4B = _mm256_set1_ps(0.241876e-3);
  const __m256 feights = _mm256_set1_ps(8.0);
  const __m256 ffours = _mm256_set1_ps(4.0);
  const __m256 ftwos = _mm256_set1_ps(2.0);
  const __m256 fones = _mm256_set1_ps(1.0);
  const __m256 fzeroes = _mm256_setzero_ps();
  const __m256i ones = _mm256_set1_epi32(1);
  const __m256i twos = _mm256_set1_epi32(2);
  const __m256i fours = _mm256_set1_epi32(4);

  /* Series coefficients of 2 * (1 - cos t) in powers of t^2. */
  const __m256 cp1 = _mm256_set1_ps(1.0);
  const __m256 cp2 = _mm256_set1_ps(0.83333333e-1);
  const __m256 cp3 = _mm256_set1_ps(0.2777778e-2);
  const __m256 cp4 = _mm256_set1_ps(0.49603e-4);
  const __m256 cp5 = _mm256_set1_ps(0.551e-6);

  for (; number < eighthPoints; number++) {
    aVal = _mm256_loadu_ps(aPtr);

    /* s = |aVal|: subtract 2 * aVal from the negative lanes only. */
    isNegative = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), isNegative));
    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
    r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

    /* s -= r * pi/4, in two fused steps. */
    s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
    s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);

    s = _mm256_div_ps(s, feights);
    s = _mm256_mul_ps(s, s);

    /* Evaluate the series in Horner form. */
    s = _mm256_mul_ps(
        _mm256_fmadd_ps(
            _mm256_fmsub_ps(
                _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
            s,
            cp1),
        s);

    /* Three angle doublings undo the division by 8. */
    for (i = 0; i < 3; i++) {
      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
    }
    s = _mm256_div_ps(s, ftwos);

    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
    cosine = _mm256_sub_ps(fones, s);

    condition1 = _mm256_cmp_ps(
        _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
        fzeroes,
        _CMP_NEQ_UQ);
    /* Sine flips when exactly one of "(q & 4) set" / "input negative" holds.
     * The masks are combined bitwise: comparing them as floats would treat
     * two all-ones masks (NaN bit patterns) as "not equal". */
    condition2 = _mm256_xor_ps(
        _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
        isNegative);
    condition3 = _mm256_cmp_ps(
        _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
        fzeroes,
        _CMP_NEQ_UQ);

    /* Swap sine and cosine where condition1 holds. */
    temp = cosine;
    cosine =
        _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
    sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
    /* Negate by subtracting twice the value in the selected lanes. */
    sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, ftwos), condition2));
    cosine =
        _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, ftwos), condition3));

    tangent = _mm256_div_ps(sine, cosine);
    _mm256_storeu_ps(bPtr, tangent);

    aPtr += 8;
    bPtr += 8;
  }

  /* Scalar tail for the points that do not fill a whole vector. */
  number = eighthPoints * 8;
  for (; number < num_points; number++) {
    *bPtr++ = tanf(*aPtr++);
  }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*!
 * \brief Tangent of each input point (AVX2 without FMA, unaligned buffers).
 *
 * \param bVector    Output buffer, num_points floats.
 * \param aVector    Input buffer (radians), num_points floats.
 * \param num_points Number of points to process.
 */
static inline void
volk_32f_tan_32f_u_avx2(float* bVector,
                        const float* aVector,
                        unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;
  unsigned int i = 0;

  __m256 aVal, s, isNegative, temp;
  __m256 sine, cosine, tangent, condition1, condition2, condition3;
  __m256i q, r;

  /* Range-reduction constants: 4/pi and pi/4 split into two parts. */
  const __m256 m4pi = _mm256_set1_ps(1.273239545);
  const __m256 pio4A = _mm256_set1_ps(0.78515625);
  const __m256 pio4B = _mm256_set1_ps(0.241876e-3);
  const __m256 feights = _mm256_set1_ps(8.0);
  const __m256 ffours = _mm256_set1_ps(4.0);
  const __m256 ftwos = _mm256_set1_ps(2.0);
  const __m256 fones = _mm256_set1_ps(1.0);
  const __m256 fzeroes = _mm256_setzero_ps();
  const __m256i ones = _mm256_set1_epi32(1);
  const __m256i twos = _mm256_set1_epi32(2);
  const __m256i fours = _mm256_set1_epi32(4);

  /* Series coefficients of 2 * (1 - cos t) in powers of t^2. */
  const __m256 cp1 = _mm256_set1_ps(1.0);
  const __m256 cp2 = _mm256_set1_ps(0.83333333e-1);
  const __m256 cp3 = _mm256_set1_ps(0.2777778e-2);
  const __m256 cp4 = _mm256_set1_ps(0.49603e-4);
  const __m256 cp5 = _mm256_set1_ps(0.551e-6);

  for (; number < eighthPoints; number++) {
    aVal = _mm256_loadu_ps(aPtr);

    /* s = |aVal|: subtract 2 * aVal from the negative lanes only. */
    isNegative = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
    s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), isNegative));
    q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
    r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));

    /* s -= r * pi/4, in two steps. */
    s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
    s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));

    s = _mm256_div_ps(s, feights);
    s = _mm256_mul_ps(s, s);

    /* Evaluate the series in Horner form. */
    s = _mm256_mul_ps(
        _mm256_add_ps(
            _mm256_mul_ps(
                _mm256_sub_ps(
                    _mm256_mul_ps(
                        _mm256_add_ps(
                            _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s),
                            cp3),
                        s),
                    cp2),
                s),
            cp1),
        s);

    /* Three angle doublings undo the division by 8. */
    for (i = 0; i < 3; i++) {
      s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
    }
    s = _mm256_div_ps(s, ftwos);

    sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
    cosine = _mm256_sub_ps(fones, s);

    condition1 = _mm256_cmp_ps(
        _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
        fzeroes,
        _CMP_NEQ_UQ);
    /* Sine flips when exactly one of "(q & 4) set" / "input negative" holds.
     * The masks are combined bitwise: comparing them as floats would treat
     * two all-ones masks (NaN bit patterns) as "not equal". */
    condition2 = _mm256_xor_ps(
        _mm256_cmp_ps(
            _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
        isNegative);
    condition3 = _mm256_cmp_ps(
        _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
        fzeroes,
        _CMP_NEQ_UQ);

    /* Swap sine and cosine where condition1 holds. */
    temp = cosine;
    cosine =
        _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
    sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
    /* Negate by subtracting twice the value in the selected lanes. */
    sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, ftwos), condition2));
    cosine =
        _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, ftwos), condition3));

    tangent = _mm256_div_ps(sine, cosine);
    _mm256_storeu_ps(bPtr, tangent);

    aPtr += 8;
    bPtr += 8;
  }

  /* Scalar tail for the points that do not fill a whole vector. */
  number = eighthPoints * 8;
  for (; number < num_points; number++) {
    *bPtr++ = tanf(*aPtr++);
  }
}

#endif /* LV_HAVE_AVX2 for unaligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*!
 * \brief Tangent of each input point (SSE4.1, unaligned buffers).
 *
 * \param bVector    Output buffer, num_points floats.
 * \param aVector    Input buffer (radians), num_points floats.
 * \param num_points Number of points to process.
 */
static inline void
volk_32f_tan_32f_u_sse4_1(float* bVector,
                          const float* aVector,
                          unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;
  unsigned int i = 0;

  __m128 aVal, s, isNegative, temp;
  __m128 sine, cosine, tangent, condition1, condition2, condition3;
  __m128i q, r;

  /* Range-reduction constants: 4/pi and pi/4 split into two parts. */
  const __m128 m4pi = _mm_set1_ps(1.273239545);
  const __m128 pio4A = _mm_set1_ps(0.78515625);
  const __m128 pio4B = _mm_set1_ps(0.241876e-3);
  const __m128 feights = _mm_set1_ps(8.0);
  const __m128 ffours = _mm_set1_ps(4.0);
  const __m128 ftwos = _mm_set1_ps(2.0);
  const __m128 fones = _mm_set1_ps(1.0);
  const __m128 fzeroes = _mm_setzero_ps();
  const __m128i ones = _mm_set1_epi32(1);
  const __m128i twos = _mm_set1_epi32(2);
  const __m128i fours = _mm_set1_epi32(4);

  /* Series coefficients of 2 * (1 - cos t) in powers of t^2. */
  const __m128 cp1 = _mm_set1_ps(1.0);
  const __m128 cp2 = _mm_set1_ps(0.83333333e-1);
  const __m128 cp3 = _mm_set1_ps(0.2777778e-2);
  const __m128 cp4 = _mm_set1_ps(0.49603e-4);
  const __m128 cp5 = _mm_set1_ps(0.551e-6);

  for (; number < quarterPoints; number++) {
    aVal = _mm_loadu_ps(aPtr);

    /* s = |aVal|: subtract 2 * aVal from the negative lanes only. */
    isNegative = _mm_cmplt_ps(aVal, fzeroes);
    s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), isNegative));
    q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
    r = _mm_add_epi32(q, _mm_and_si128(q, ones));

    /* s -= r * pi/4, in two steps. */
    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));

    s = _mm_div_ps(s, feights);
    s = _mm_mul_ps(s, s);

    /* Evaluate the series in Horner form. */
    s = _mm_mul_ps(
        _mm_add_ps(
            _mm_mul_ps(
                _mm_sub_ps(
                    _mm_mul_ps(
                        _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
                                   cp3),
                        s),
                    cp2),
                s),
            cp1),
        s);

    /* Three angle doublings undo the division by 8. */
    for (i = 0; i < 3; i++) {
      s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
    }
    s = _mm_div_ps(s, ftwos);

    sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
    cosine = _mm_sub_ps(fones, s);

    condition1 = _mm_cmpneq_ps(
        _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
    /* Sine flips when exactly one of "(q & 4) set" / "input negative" holds.
     * The masks are combined bitwise: comparing them as floats would treat
     * two all-ones masks (NaN bit patterns) as "not equal". */
    condition2 = _mm_xor_ps(
        _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), isNegative);
    condition3 = _mm_cmpneq_ps(
        _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);

    /* Swap sine and cosine where condition1 holds. */
    temp = cosine;
    cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
    sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
    /* Negate by subtracting twice the value in the selected lanes. */
    sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, ftwos), condition2));
    cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, ftwos), condition3));

    tangent = _mm_div_ps(sine, cosine);
    _mm_storeu_ps(bPtr, tangent);

    aPtr += 4;
    bPtr += 4;
  }

  /* Scalar tail for the points that do not fill a whole vector. */
  number = quarterPoints * 4;
  for (; number < num_points; number++) {
    *bPtr++ = tanf(*aPtr++);
  }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#ifdef LV_HAVE_GENERIC

/*!
 * \brief Tangent of each input point (portable scalar fallback using tanf()).
 *
 * \param bVector    Output buffer, num_points floats.
 * \param aVector    Input buffer (radians), num_points floats.
 * \param num_points Number of points to process.
 */
static inline void
volk_32f_tan_32f_generic(float* bVector,
                         const float* aVector,
                         unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;
  unsigned int number = 0;

  for (; number < num_points; number++) {
    *bPtr++ = tanf(*aPtr++);
  }
}

#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32f_tan_32f_u_H */
/*
 * Doxygen cross-reference notes (not part of the source code):
 *   - i: Definition: volk_config_fixed.tmpl.h:25
 *   - static void volk_32f_tan_32f_generic(float *bVector, const float *aVector,
 *     unsigned int num_points): Definition: volk_32f_tan_32f.h:571
 */