Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_sin_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
72 #include <stdio.h>
73 #include <math.h>
74 #include <inttypes.h>
75 
76 #ifndef INCLUDED_volk_32f_sin_32f_a_H
77 #define INCLUDED_volk_32f_sin_32f_a_H
78 
79 
80 #if LV_HAVE_AVX2 && LV_HAVE_FMA
81 #include <immintrin.h>
82 
83 static inline void
84 volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
85 {
86  float* bPtr = bVector;
87  const float* aPtr = aVector;
88 
89  unsigned int number = 0;
90  unsigned int eighthPoints = num_points / 8;
91  unsigned int i = 0;
92 
93  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
94  __m256 sine, cosine, condition1, condition2;
95  __m256i q, r, ones, twos, fours;
96 
97  m4pi = _mm256_set1_ps(1.273239545);
98  pio4A = _mm256_set1_ps(0.78515625);
99  pio4B = _mm256_set1_ps(0.241876e-3);
100  ffours = _mm256_set1_ps(4.0);
101  ftwos = _mm256_set1_ps(2.0);
102  fones = _mm256_set1_ps(1.0);
103  fzeroes = _mm256_setzero_ps();
104  ones = _mm256_set1_epi32(1);
105  twos = _mm256_set1_epi32(2);
106  fours = _mm256_set1_epi32(4);
107 
108  cp1 = _mm256_set1_ps(1.0);
109  cp2 = _mm256_set1_ps(0.83333333e-1);
110  cp3 = _mm256_set1_ps(0.2777778e-2);
111  cp4 = _mm256_set1_ps(0.49603e-4);
112  cp5 = _mm256_set1_ps(0.551e-6);
113 
114  for(;number < eighthPoints; number++) {
115  aVal = _mm256_load_ps(aPtr);
116  s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
117  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
118  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
119 
120  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
121  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
122 
123  s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
124  s = _mm256_mul_ps(s, s);
125  // Evaluate Taylor series
126  s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
127 
128  for(i = 0; i < 3; i++) {
129  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
130  }
131  s = _mm256_div_ps(s, ftwos);
132 
133  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
134  cosine = _mm256_sub_ps(fones, s);
135 
136  condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes,4);
137  condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes,4), _mm256_cmp_ps(aVal, fzeroes,1),4);
138  // Need this condition only for cos
139  //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
140 
141  sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
142  sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
143  _mm256_store_ps(bPtr, sine);
144  aPtr += 8;
145  bPtr += 8;
146  }
147 
148  number = eighthPoints * 8;
149  for(;number < num_points; number++) {
150  *bPtr++ = sin(*aPtr++);
151  }
152 }
153 
154 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
155 
156 #ifdef LV_HAVE_AVX2
157 #include <immintrin.h>
158 
159 static inline void
160 volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
161 {
162  float* bPtr = bVector;
163  const float* aPtr = aVector;
164 
165  unsigned int number = 0;
166  unsigned int eighthPoints = num_points / 8;
167  unsigned int i = 0;
168 
169  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
170  __m256 sine, cosine, condition1, condition2;
171  __m256i q, r, ones, twos, fours;
172 
173  m4pi = _mm256_set1_ps(1.273239545);
174  pio4A = _mm256_set1_ps(0.78515625);
175  pio4B = _mm256_set1_ps(0.241876e-3);
176  ffours = _mm256_set1_ps(4.0);
177  ftwos = _mm256_set1_ps(2.0);
178  fones = _mm256_set1_ps(1.0);
179  fzeroes = _mm256_setzero_ps();
180  ones = _mm256_set1_epi32(1);
181  twos = _mm256_set1_epi32(2);
182  fours = _mm256_set1_epi32(4);
183 
184  cp1 = _mm256_set1_ps(1.0);
185  cp2 = _mm256_set1_ps(0.83333333e-1);
186  cp3 = _mm256_set1_ps(0.2777778e-2);
187  cp4 = _mm256_set1_ps(0.49603e-4);
188  cp5 = _mm256_set1_ps(0.551e-6);
189 
190  for(;number < eighthPoints; number++) {
191  aVal = _mm256_load_ps(aPtr);
192  s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
193  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
194  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
195 
196  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
197  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
198 
199  s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
200  s = _mm256_mul_ps(s, s);
201  // Evaluate Taylor series
202  s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
203 
204  for(i = 0; i < 3; i++) {
205  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
206  }
207  s = _mm256_div_ps(s, ftwos);
208 
209  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
210  cosine = _mm256_sub_ps(fones, s);
211 
212  condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes,4);
213  condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes,4), _mm256_cmp_ps(aVal, fzeroes,1),4);
214  // Need this condition only for cos
215  //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
216 
217  sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
218  sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
219  _mm256_store_ps(bPtr, sine);
220  aPtr += 8;
221  bPtr += 8;
222  }
223 
224  number = eighthPoints * 8;
225  for(;number < num_points; number++) {
226  *bPtr++ = sin(*aPtr++);
227  }
228 }
229 
230 #endif /* LV_HAVE_AVX2 for aligned */
231 
232 #ifdef LV_HAVE_SSE4_1
233 #include <smmintrin.h>
234 
235 static inline void
236 volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
237 {
238  float* bPtr = bVector;
239  const float* aPtr = aVector;
240 
241  unsigned int number = 0;
242  unsigned int quarterPoints = num_points / 4;
243  unsigned int i = 0;
244 
245  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
246  __m128 sine, cosine, condition1, condition2;
247  __m128i q, r, ones, twos, fours;
248 
249  m4pi = _mm_set1_ps(1.273239545);
250  pio4A = _mm_set1_ps(0.78515625);
251  pio4B = _mm_set1_ps(0.241876e-3);
252  ffours = _mm_set1_ps(4.0);
253  ftwos = _mm_set1_ps(2.0);
254  fones = _mm_set1_ps(1.0);
255  fzeroes = _mm_setzero_ps();
256  ones = _mm_set1_epi32(1);
257  twos = _mm_set1_epi32(2);
258  fours = _mm_set1_epi32(4);
259 
260  cp1 = _mm_set1_ps(1.0);
261  cp2 = _mm_set1_ps(0.83333333e-1);
262  cp3 = _mm_set1_ps(0.2777778e-2);
263  cp4 = _mm_set1_ps(0.49603e-4);
264  cp5 = _mm_set1_ps(0.551e-6);
265 
266  for(;number < quarterPoints; number++) {
267  aVal = _mm_load_ps(aPtr);
268  s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
269  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
270  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
271 
272  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
273  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
274 
275  s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
276  s = _mm_mul_ps(s, s);
277  // Evaluate Taylor series
278  s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
279 
280  for(i = 0; i < 3; i++) {
281  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
282  }
283  s = _mm_div_ps(s, ftwos);
284 
285  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
286  cosine = _mm_sub_ps(fones, s);
287 
288  condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
289  condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
290  // Need this condition only for cos
291  //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
292 
293  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
294  sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
295  _mm_store_ps(bPtr, sine);
296  aPtr += 4;
297  bPtr += 4;
298  }
299 
300  number = quarterPoints * 4;
301  for(;number < num_points; number++) {
302  *bPtr++ = sinf(*aPtr++);
303  }
304 }
305 
306 #endif /* LV_HAVE_SSE4_1 for aligned */
307 
308 
309 #endif /* INCLUDED_volk_32f_sin_32f_a_H */
310 
311 #ifndef INCLUDED_volk_32f_sin_32f_u_H
312 #define INCLUDED_volk_32f_sin_32f_u_H
313 
314 #if LV_HAVE_AVX2 && LV_HAVE_FMA
315 #include <immintrin.h>
316 
317 static inline void
318 volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
319 {
320  float* bPtr = bVector;
321  const float* aPtr = aVector;
322 
323  unsigned int number = 0;
324  unsigned int eighthPoints = num_points / 8;
325  unsigned int i = 0;
326 
327  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
328  __m256 sine, cosine, condition1, condition2;
329  __m256i q, r, ones, twos, fours;
330 
331  m4pi = _mm256_set1_ps(1.273239545);
332  pio4A = _mm256_set1_ps(0.78515625);
333  pio4B = _mm256_set1_ps(0.241876e-3);
334  ffours = _mm256_set1_ps(4.0);
335  ftwos = _mm256_set1_ps(2.0);
336  fones = _mm256_set1_ps(1.0);
337  fzeroes = _mm256_setzero_ps();
338  ones = _mm256_set1_epi32(1);
339  twos = _mm256_set1_epi32(2);
340  fours = _mm256_set1_epi32(4);
341 
342  cp1 = _mm256_set1_ps(1.0);
343  cp2 = _mm256_set1_ps(0.83333333e-1);
344  cp3 = _mm256_set1_ps(0.2777778e-2);
345  cp4 = _mm256_set1_ps(0.49603e-4);
346  cp5 = _mm256_set1_ps(0.551e-6);
347 
348  for(;number < eighthPoints; number++) {
349  aVal = _mm256_loadu_ps(aPtr);
350  s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
351  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
352  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
353 
354  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
355  s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
356 
357  s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
358  s = _mm256_mul_ps(s, s);
359  // Evaluate Taylor series
360  s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
361 
362  for(i = 0; i < 3; i++) {
363  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
364  }
365  s = _mm256_div_ps(s, ftwos);
366 
367  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
368  cosine = _mm256_sub_ps(fones, s);
369 
370  condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes,4);
371  condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes,4), _mm256_cmp_ps(aVal, fzeroes,1),4);
372  // Need this condition only for cos
373  //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
374 
375  sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
376  sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
377  _mm256_storeu_ps(bPtr, sine);
378  aPtr += 8;
379  bPtr += 8;
380  }
381 
382  number = eighthPoints * 8;
383  for(;number < num_points; number++) {
384  *bPtr++ = sin(*aPtr++);
385  }
386 }
387 
388 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
389 
390 #ifdef LV_HAVE_AVX2
391 #include <immintrin.h>
392 
393 static inline void
394 volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
395 {
396  float* bPtr = bVector;
397  const float* aPtr = aVector;
398 
399  unsigned int number = 0;
400  unsigned int eighthPoints = num_points / 8;
401  unsigned int i = 0;
402 
403  __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
404  __m256 sine, cosine, condition1, condition2;
405  __m256i q, r, ones, twos, fours;
406 
407  m4pi = _mm256_set1_ps(1.273239545);
408  pio4A = _mm256_set1_ps(0.78515625);
409  pio4B = _mm256_set1_ps(0.241876e-3);
410  ffours = _mm256_set1_ps(4.0);
411  ftwos = _mm256_set1_ps(2.0);
412  fones = _mm256_set1_ps(1.0);
413  fzeroes = _mm256_setzero_ps();
414  ones = _mm256_set1_epi32(1);
415  twos = _mm256_set1_epi32(2);
416  fours = _mm256_set1_epi32(4);
417 
418  cp1 = _mm256_set1_ps(1.0);
419  cp2 = _mm256_set1_ps(0.83333333e-1);
420  cp3 = _mm256_set1_ps(0.2777778e-2);
421  cp4 = _mm256_set1_ps(0.49603e-4);
422  cp5 = _mm256_set1_ps(0.551e-6);
423 
424  for(;number < eighthPoints; number++) {
425  aVal = _mm256_loadu_ps(aPtr);
426  s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
427  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
428  r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
429 
430  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
431  s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
432 
433  s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
434  s = _mm256_mul_ps(s, s);
435  // Evaluate Taylor series
436  s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
437 
438  for(i = 0; i < 3; i++) {
439  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
440  }
441  s = _mm256_div_ps(s, ftwos);
442 
443  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
444  cosine = _mm256_sub_ps(fones, s);
445 
446  condition1 = _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)), fzeroes,4);
447  condition2 = _mm256_cmp_ps(_mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes,4), _mm256_cmp_ps(aVal, fzeroes,1),4);
448  // Need this condition only for cos
449  //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
450 
451  sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
452  sine = _mm256_sub_ps(sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
453  _mm256_storeu_ps(bPtr, sine);
454  aPtr += 8;
455  bPtr += 8;
456  }
457 
458  number = eighthPoints * 8;
459  for(;number < num_points; number++) {
460  *bPtr++ = sin(*aPtr++);
461  }
462 }
463 
464 #endif /* LV_HAVE_AVX2 for unaligned */
465 
466 
467 #ifdef LV_HAVE_SSE4_1
468 #include <smmintrin.h>
469 
470 static inline void
471 volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
472 {
473  float* bPtr = bVector;
474  const float* aPtr = aVector;
475 
476  unsigned int number = 0;
477  unsigned int quarterPoints = num_points / 4;
478  unsigned int i = 0;
479 
480  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
481  __m128 sine, cosine, condition1, condition2;
482  __m128i q, r, ones, twos, fours;
483 
484  m4pi = _mm_set1_ps(1.273239545);
485  pio4A = _mm_set1_ps(0.78515625);
486  pio4B = _mm_set1_ps(0.241876e-3);
487  ffours = _mm_set1_ps(4.0);
488  ftwos = _mm_set1_ps(2.0);
489  fones = _mm_set1_ps(1.0);
490  fzeroes = _mm_setzero_ps();
491  ones = _mm_set1_epi32(1);
492  twos = _mm_set1_epi32(2);
493  fours = _mm_set1_epi32(4);
494 
495  cp1 = _mm_set1_ps(1.0);
496  cp2 = _mm_set1_ps(0.83333333e-1);
497  cp3 = _mm_set1_ps(0.2777778e-2);
498  cp4 = _mm_set1_ps(0.49603e-4);
499  cp5 = _mm_set1_ps(0.551e-6);
500 
501  for(;number < quarterPoints; number++) {
502  aVal = _mm_loadu_ps(aPtr);
503  s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
504  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
505  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
506 
507  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
508  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
509 
510  s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
511  s = _mm_mul_ps(s, s);
512  // Evaluate Taylor series
513  s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
514 
515  for(i = 0; i < 3; i++) {
516  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
517  }
518  s = _mm_div_ps(s, ftwos);
519 
520  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
521  cosine = _mm_sub_ps(fones, s);
522 
523  condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
524  condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
525 
526  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
527  sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
528  _mm_storeu_ps(bPtr, sine);
529  aPtr += 4;
530  bPtr += 4;
531  }
532 
533  number = quarterPoints * 4;
534  for(;number < num_points; number++){
535  *bPtr++ = sinf(*aPtr++);
536  }
537 }
538 
539 #endif /* LV_HAVE_SSE4_1 for unaligned */
540 
541 
542 #ifdef LV_HAVE_GENERIC
543 
544 static inline void
545 volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
546 {
547  float* bPtr = bVector;
548  const float* aPtr = aVector;
549  unsigned int number = 0;
550 
551  for(number = 0; number < num_points; number++) {
552  *bPtr++ = sinf(*aPtr++);
553  }
554 
555 }
556 
557 #endif /* LV_HAVE_GENERIC */
558 
559 #endif /* INCLUDED_volk_32f_sin_32f_u_H */
static void volk_32f_sin_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:545
for i
Definition: volk_config_fixed.tmpl.h:25