Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_tan_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
74 #include <stdio.h>
75 #include <math.h>
76 #include <inttypes.h>
77 
78 #ifndef INCLUDED_volk_32f_tan_32f_a_H
79 #define INCLUDED_volk_32f_tan_32f_a_H
80 
81 #if LV_HAVE_AVX2 && LV_HAVE_FMA
82 #include <immintrin.h>
83 
84 static inline void
85 volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector,
86  unsigned int num_points)
87 {
88  float* bPtr = bVector;
89  const float* aPtr = aVector;
90 
91  unsigned int number = 0;
92  unsigned int eighthPoints = num_points / 8;
93  unsigned int i = 0;
94 
95  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
96  __m256 sine, cosine, tangent, condition1, condition2, condition3;
97  __m256i q, r, ones, twos, fours;
98 
99  m4pi = _mm256_set1_ps(1.273239545);
100  pio4A = _mm256_set1_ps(0.78515625);
101  pio4B = _mm256_set1_ps(0.241876e-3);
102  ffours = _mm256_set1_ps(4.0);
103  ftwos = _mm256_set1_ps(2.0);
104  fones = _mm256_set1_ps(1.0);
105  fzeroes = _mm256_setzero_ps();
106  ones = _mm256_set1_epi32(1);
107  twos = _mm256_set1_epi32(2);
108  fours = _mm256_set1_epi32(4);
109 
110  cp1 = _mm256_set1_ps(1.0);
111  cp2 = _mm256_set1_ps(0.83333333e-1);
112  cp3 = _mm256_set1_ps(0.2777778e-2);
113  cp4 = _mm256_set1_ps(0.49603e-4);
114  cp5 = _mm256_set1_ps(0.551e-6);
115 
116  for(;number < eighthPoints; number++){
117  aVal = _mm256_load_ps(aPtr);
118  s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
119  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
120  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
121 
122  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
123  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
124 
125  s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
126  s = _mm256_mul_ps(s, s);
127  // Evaluate Taylor series
128  s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
129 
130  for(i = 0; i < 3; i++){
131  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
132  }
133  s = _mm256_div_ps(s, ftwos);
134 
135  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
136  cosine = _mm256_sub_ps(fones, s);
137 
138  condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes,4);
139  condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes,4), _mm256_cmp_ps(aVal, fzeroes,1),4);
140  condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes,4);
141 
142  __m256 temp = cosine;
143  cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
144  sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
145  sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
146  cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
147  tangent = _mm256_div_ps(sine, cosine);
148  _mm256_store_ps(bPtr, tangent);
149  aPtr += 8;
150  bPtr += 8;
151  }
152 
153  number = eighthPoints * 8;
154  for(;number < num_points; number++){
155  *bPtr++ = tan(*aPtr++);
156  }
157 }
158 
159 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
160 
161 #ifdef LV_HAVE_AVX2
162 #include <immintrin.h>
163 
164 static inline void
165 volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector,
166  unsigned int num_points)
167 {
168  float* bPtr = bVector;
169  const float* aPtr = aVector;
170 
171  unsigned int number = 0;
172  unsigned int eighthPoints = num_points / 8;
173  unsigned int i = 0;
174 
175  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
176  __m256 sine, cosine, tangent, condition1, condition2, condition3;
177  __m256i q, r, ones, twos, fours;
178 
179  m4pi = _mm256_set1_ps(1.273239545);
180  pio4A = _mm256_set1_ps(0.78515625);
181  pio4B = _mm256_set1_ps(0.241876e-3);
182  ffours = _mm256_set1_ps(4.0);
183  ftwos = _mm256_set1_ps(2.0);
184  fones = _mm256_set1_ps(1.0);
185  fzeroes = _mm256_setzero_ps();
186  ones = _mm256_set1_epi32(1);
187  twos = _mm256_set1_epi32(2);
188  fours = _mm256_set1_epi32(4);
189 
190  cp1 = _mm256_set1_ps(1.0);
191  cp2 = _mm256_set1_ps(0.83333333e-1);
192  cp3 = _mm256_set1_ps(0.2777778e-2);
193  cp4 = _mm256_set1_ps(0.49603e-4);
194  cp5 = _mm256_set1_ps(0.551e-6);
195 
196  for(;number < eighthPoints; number++){
197  aVal = _mm256_load_ps(aPtr);
198  s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
199  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
200  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
201 
202  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
203  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
204 
205  s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
206  s = _mm256_mul_ps(s, s);
207  // Evaluate Taylor series
208  s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
209 
210  for(i = 0; i < 3; i++){
211  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
212  }
213  s = _mm256_div_ps(s, ftwos);
214 
215  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
216  cosine = _mm256_sub_ps(fones, s);
217 
218  condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes,4);
219  condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes,4), _mm256_cmp_ps(aVal, fzeroes,1),4);
220  condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes,4);
221 
222  __m256 temp = cosine;
223  cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
224  sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
225  sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
226  cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
227  tangent = _mm256_div_ps(sine, cosine);
228  _mm256_store_ps(bPtr, tangent);
229  aPtr += 8;
230  bPtr += 8;
231  }
232 
233  number = eighthPoints * 8;
234  for(;number < num_points; number++){
235  *bPtr++ = tan(*aPtr++);
236  }
237 }
238 
239 #endif /* LV_HAVE_AVX2 for aligned */
240 
241 #ifdef LV_HAVE_SSE4_1
242 #include <smmintrin.h>
243 
244 static inline void
245 volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector,
246  unsigned int num_points)
247 {
248  float* bPtr = bVector;
249  const float* aPtr = aVector;
250 
251  unsigned int number = 0;
252  unsigned int quarterPoints = num_points / 4;
253  unsigned int i = 0;
254 
255  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
256  __m128 sine, cosine, tangent, condition1, condition2, condition3;
257  __m128i q, r, ones, twos, fours;
258 
259  m4pi = _mm_set1_ps(1.273239545);
260  pio4A = _mm_set1_ps(0.78515625);
261  pio4B = _mm_set1_ps(0.241876e-3);
262  ffours = _mm_set1_ps(4.0);
263  ftwos = _mm_set1_ps(2.0);
264  fones = _mm_set1_ps(1.0);
265  fzeroes = _mm_setzero_ps();
266  ones = _mm_set1_epi32(1);
267  twos = _mm_set1_epi32(2);
268  fours = _mm_set1_epi32(4);
269 
270  cp1 = _mm_set1_ps(1.0);
271  cp2 = _mm_set1_ps(0.83333333e-1);
272  cp3 = _mm_set1_ps(0.2777778e-2);
273  cp4 = _mm_set1_ps(0.49603e-4);
274  cp5 = _mm_set1_ps(0.551e-6);
275 
276  for(;number < quarterPoints; number++){
277  aVal = _mm_load_ps(aPtr);
278  s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
279  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
280  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
281 
282  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
283  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
284 
285  s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
286  s = _mm_mul_ps(s, s);
287  // Evaluate Taylor series
288  s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
289 
290  for(i = 0; i < 3; i++){
291  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
292  }
293  s = _mm_div_ps(s, ftwos);
294 
295  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
296  cosine = _mm_sub_ps(fones, s);
297 
298  condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
299  condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
300  condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
301 
302  __m128 temp = cosine;
303  cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
304  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
305  sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
306  cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
307  tangent = _mm_div_ps(sine, cosine);
308  _mm_store_ps(bPtr, tangent);
309  aPtr += 4;
310  bPtr += 4;
311  }
312 
313  number = quarterPoints * 4;
314  for(;number < num_points; number++){
315  *bPtr++ = tanf(*aPtr++);
316  }
317 }
318 
319 #endif /* LV_HAVE_SSE4_1 for aligned */
320 
321 
322 #endif /* INCLUDED_volk_32f_tan_32f_a_H */
323 
324 #ifndef INCLUDED_volk_32f_tan_32f_u_H
325 #define INCLUDED_volk_32f_tan_32f_u_H
326 
327 #if LV_HAVE_AVX2 && LV_HAVE_FMA
328 #include <immintrin.h>
329 
330 static inline void
331 volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector,
332  unsigned int num_points)
333 {
334  float* bPtr = bVector;
335  const float* aPtr = aVector;
336 
337  unsigned int number = 0;
338  unsigned int eighthPoints = num_points / 8;
339  unsigned int i = 0;
340 
341  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
342  __m256 sine, cosine, tangent, condition1, condition2, condition3;
343  __m256i q, r, ones, twos, fours;
344 
345  m4pi = _mm256_set1_ps(1.273239545);
346  pio4A = _mm256_set1_ps(0.78515625);
347  pio4B = _mm256_set1_ps(0.241876e-3);
348  ffours = _mm256_set1_ps(4.0);
349  ftwos = _mm256_set1_ps(2.0);
350  fones = _mm256_set1_ps(1.0);
351  fzeroes = _mm256_setzero_ps();
352  ones = _mm256_set1_epi32(1);
353  twos = _mm256_set1_epi32(2);
354  fours = _mm256_set1_epi32(4);
355 
356  cp1 = _mm256_set1_ps(1.0);
357  cp2 = _mm256_set1_ps(0.83333333e-1);
358  cp3 = _mm256_set1_ps(0.2777778e-2);
359  cp4 = _mm256_set1_ps(0.49603e-4);
360  cp5 = _mm256_set1_ps(0.551e-6);
361 
362  for(;number < eighthPoints; number++){
363  aVal = _mm256_loadu_ps(aPtr);
364  s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
365  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
366  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
367 
368  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
369  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
370 
371  s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
372  s = _mm256_mul_ps(s, s);
373  // Evaluate Taylor series
374  s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
375 
376  for(i = 0; i < 3; i++){
377  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
378  }
379  s = _mm256_div_ps(s, ftwos);
380 
381  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
382  cosine = _mm256_sub_ps(fones, s);
383 
384  condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes,4);
385  condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes,4), _mm256_cmp_ps(aVal, fzeroes,1),4);
386  condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes,4);
387 
388  __m256 temp = cosine;
389  cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
390  sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
391  sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
392  cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
393  tangent = _mm256_div_ps(sine, cosine);
394  _mm256_storeu_ps(bPtr, tangent);
395  aPtr += 8;
396  bPtr += 8;
397  }
398 
399  number = eighthPoints * 8;
400  for(;number < num_points; number++){
401  *bPtr++ = tan(*aPtr++);
402  }
403 }
404 
405 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
406 
407 #ifdef LV_HAVE_AVX2
408 #include <immintrin.h>
409 
410 static inline void
411 volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector,
412  unsigned int num_points)
413 {
414  float* bPtr = bVector;
415  const float* aPtr = aVector;
416 
417  unsigned int number = 0;
418  unsigned int eighthPoints = num_points / 8;
419  unsigned int i = 0;
420 
421  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
422  __m256 sine, cosine, tangent, condition1, condition2, condition3;
423  __m256i q, r, ones, twos, fours;
424 
425  m4pi = _mm256_set1_ps(1.273239545);
426  pio4A = _mm256_set1_ps(0.78515625);
427  pio4B = _mm256_set1_ps(0.241876e-3);
428  ffours = _mm256_set1_ps(4.0);
429  ftwos = _mm256_set1_ps(2.0);
430  fones = _mm256_set1_ps(1.0);
431  fzeroes = _mm256_setzero_ps();
432  ones = _mm256_set1_epi32(1);
433  twos = _mm256_set1_epi32(2);
434  fours = _mm256_set1_epi32(4);
435 
436  cp1 = _mm256_set1_ps(1.0);
437  cp2 = _mm256_set1_ps(0.83333333e-1);
438  cp3 = _mm256_set1_ps(0.2777778e-2);
439  cp4 = _mm256_set1_ps(0.49603e-4);
440  cp5 = _mm256_set1_ps(0.551e-6);
441 
442  for(;number < eighthPoints; number++){
443  aVal = _mm256_loadu_ps(aPtr);
444  s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
445  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
446  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
447 
448  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
449  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
450 
451  s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
452  s = _mm256_mul_ps(s, s);
453  // Evaluate Taylor series
454  s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
455 
456  for(i = 0; i < 3; i++){
457  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
458  }
459  s = _mm256_div_ps(s, ftwos);
460 
461  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
462  cosine = _mm256_sub_ps(fones, s);
463 
464  condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes,4);
465  condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes,4), _mm256_cmp_ps(aVal, fzeroes,1),4);
466  condition3 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)), fzeroes,4);
467 
468  __m256 temp = cosine;
469  cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
470  sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
471  sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
472  cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
473  tangent = _mm256_div_ps(sine, cosine);
474  _mm256_storeu_ps(bPtr, tangent);
475  aPtr += 8;
476  bPtr += 8;
477  }
478 
479  number = eighthPoints * 8;
480  for(;number < num_points; number++){
481  *bPtr++ = tan(*aPtr++);
482  }
483 }
484 
485 #endif /* LV_HAVE_AVX2 for unaligned */
486 
487 
488 #ifdef LV_HAVE_SSE4_1
489 #include <smmintrin.h>
490 
491 static inline void
492 volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
493 {
494  float* bPtr = bVector;
495  const float* aPtr = aVector;
496 
497  unsigned int number = 0;
498  unsigned int quarterPoints = num_points / 4;
499  unsigned int i = 0;
500 
501  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
502  __m128 sine, cosine, tangent, condition1, condition2, condition3;
503  __m128i q, r, ones, twos, fours;
504 
505  m4pi = _mm_set1_ps(1.273239545);
506  pio4A = _mm_set1_ps(0.78515625);
507  pio4B = _mm_set1_ps(0.241876e-3);
508  ffours = _mm_set1_ps(4.0);
509  ftwos = _mm_set1_ps(2.0);
510  fones = _mm_set1_ps(1.0);
511  fzeroes = _mm_setzero_ps();
512  ones = _mm_set1_epi32(1);
513  twos = _mm_set1_epi32(2);
514  fours = _mm_set1_epi32(4);
515 
516  cp1 = _mm_set1_ps(1.0);
517  cp2 = _mm_set1_ps(0.83333333e-1);
518  cp3 = _mm_set1_ps(0.2777778e-2);
519  cp4 = _mm_set1_ps(0.49603e-4);
520  cp5 = _mm_set1_ps(0.551e-6);
521 
522  for(;number < quarterPoints; number++){
523  aVal = _mm_loadu_ps(aPtr);
524  s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
525  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
526  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
527 
528  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
529  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
530 
531  s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
532  s = _mm_mul_ps(s, s);
533  // Evaluate Taylor series
534  s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
535 
536  for(i = 0; i < 3; i++){
537  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
538  }
539  s = _mm_div_ps(s, ftwos);
540 
541  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
542  cosine = _mm_sub_ps(fones, s);
543 
544  condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
545  condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
546  condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
547 
548  __m128 temp = cosine;
549  cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
550  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
551  sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
552  cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
553  tangent = _mm_div_ps(sine, cosine);
554  _mm_storeu_ps(bPtr, tangent);
555  aPtr += 4;
556  bPtr += 4;
557  }
558 
559  number = quarterPoints * 4;
560  for(;number < num_points; number++){
561  *bPtr++ = tanf(*aPtr++);
562  }
563 }
564 
565 #endif /* LV_HAVE_SSE4_1 for unaligned */
566 
567 
568 #ifdef LV_HAVE_GENERIC
569 
570 static inline void
571 volk_32f_tan_32f_generic(float* bVector, const float* aVector,
572  unsigned int num_points)
573 {
574  float* bPtr = bVector;
575  const float* aPtr = aVector;
576  unsigned int number = 0;
577 
578  for(; number < num_points; number++){
579  *bPtr++ = tanf(*aPtr++);
580  }
581 }
582 #endif /* LV_HAVE_GENERIC */
583 
584 
585 #endif /* INCLUDED_volk_32f_tan_32f_u_H */
for i
Definition: volk_config_fixed.tmpl.h:25
static void volk_32f_tan_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tan_32f.h:571