Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_cos_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
72 #include <stdio.h>
73 #include <math.h>
74 #include <inttypes.h>
75 
76 #ifndef INCLUDED_volk_32f_cos_32f_a_H
77 #define INCLUDED_volk_32f_cos_32f_a_H
78 
79 #if LV_HAVE_AVX2 && LV_HAVE_FMA
80 #include <immintrin.h>
81 
82 static inline void
83  volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
84 {
85  float* bPtr = bVector;
86  const float* aPtr = aVector;
87 
88  unsigned int number = 0;
89  unsigned int eighthPoints = num_points / 8;
90  unsigned int i = 0;
91 
92  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
93  __m256 sine, cosine;
94  __m256i q, ones, twos, fours;
95 
96  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
97  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
98  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
99  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
100  ffours = _mm256_set1_ps(4.0);
101  ftwos = _mm256_set1_ps(2.0);
102  fones = _mm256_set1_ps(1.0);
103  fzeroes = _mm256_setzero_ps();
104  __m256i zeroes = _mm256_set1_epi32(0);
105  ones = _mm256_set1_epi32(1);
106  __m256i allones = _mm256_set1_epi32(0xffffffff);
107  twos = _mm256_set1_epi32(2);
108  fours = _mm256_set1_epi32(4);
109 
110  cp1 = _mm256_set1_ps(1.0);
111  cp2 = _mm256_set1_ps(0.08333333333333333);
112  cp3 = _mm256_set1_ps(0.002777777777777778);
113  cp4 = _mm256_set1_ps(4.96031746031746e-05);
114  cp5 = _mm256_set1_ps(5.511463844797178e-07);
115  union bit256 condition1;
116  union bit256 condition3;
117 
118  for(;number < eighthPoints; number++){
119 
120  aVal = _mm256_load_ps(aPtr);
121  // s = fabs(aVal)
122  s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
123  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
124  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
125  // r = q + q&1, q indicates quadrant, r gives
126  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
127 
128  s = _mm256_fnmadd_ps(r,pio4A,s);
129  s = _mm256_fnmadd_ps(r,pio4B,s);
130  s = _mm256_fnmadd_ps(r,pio4C,s);
131 
132  s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
133  s = _mm256_mul_ps(s, s);
134  // Evaluate Taylor series
135  s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
136 
137  for(i = 0; i < 3; i++)
138  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
139  s = _mm256_div_ps(s, ftwos);
140 
141  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
142  cosine = _mm256_sub_ps(fones, s);
143 
144  // if(((q+1)&2) != 0) { cosine=sine;}
145  condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
146  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
147 
148  // if(((q+2)&4) != 0) { cosine = -cosine;}
149  condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
150  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
151 
152  cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
153  cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
154  _mm256_store_ps(bPtr, cosine);
155  aPtr += 8;
156  bPtr += 8;
157  }
158 
159  number = eighthPoints * 8;
160  for(;number < num_points; number++){
161  *bPtr++ = cos(*aPtr++);
162  }
163 }
164 
165 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
166 
167 #ifdef LV_HAVE_AVX2
168 #include <immintrin.h>
169 
170 static inline void
171  volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
172 {
173  float* bPtr = bVector;
174  const float* aPtr = aVector;
175 
176  unsigned int number = 0;
177  unsigned int eighthPoints = num_points / 8;
178  unsigned int i = 0;
179 
180  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
181  __m256 sine, cosine;
182  __m256i q, ones, twos, fours;
183 
184  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
185  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
186  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
187  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
188  ffours = _mm256_set1_ps(4.0);
189  ftwos = _mm256_set1_ps(2.0);
190  fones = _mm256_set1_ps(1.0);
191  fzeroes = _mm256_setzero_ps();
192  __m256i zeroes = _mm256_set1_epi32(0);
193  ones = _mm256_set1_epi32(1);
194  __m256i allones = _mm256_set1_epi32(0xffffffff);
195  twos = _mm256_set1_epi32(2);
196  fours = _mm256_set1_epi32(4);
197 
198  cp1 = _mm256_set1_ps(1.0);
199  cp2 = _mm256_set1_ps(0.08333333333333333);
200  cp3 = _mm256_set1_ps(0.002777777777777778);
201  cp4 = _mm256_set1_ps(4.96031746031746e-05);
202  cp5 = _mm256_set1_ps(5.511463844797178e-07);
203  union bit256 condition1;
204  union bit256 condition3;
205 
206  for(;number < eighthPoints; number++){
207 
208  aVal = _mm256_load_ps(aPtr);
209  // s = fabs(aVal)
210  s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
211  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
212  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
213  // r = q + q&1, q indicates quadrant, r gives
214  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
215 
216  s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4A));
217  s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4B));
218  s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4C));
219 
220  s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
221  s = _mm256_mul_ps(s, s);
222  // Evaluate Taylor series
223  s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
224 
225  for(i = 0; i < 3; i++)
226  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
227  s = _mm256_div_ps(s, ftwos);
228 
229  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
230  cosine = _mm256_sub_ps(fones, s);
231 
232  // if(((q+1)&2) != 0) { cosine=sine;}
233  condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
234  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
235 
236  // if(((q+2)&4) != 0) { cosine = -cosine;}
237  condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
238  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
239 
240  cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
241  cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
242  _mm256_store_ps(bPtr, cosine);
243  aPtr += 8;
244  bPtr += 8;
245  }
246 
247  number = eighthPoints * 8;
248  for(;number < num_points; number++){
249  *bPtr++ = cos(*aPtr++);
250  }
251 }
252 
253 #endif /* LV_HAVE_AVX2 for aligned */
254 
255 #ifdef LV_HAVE_SSE4_1
256 #include <smmintrin.h>
257 
258 static inline void
259  volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
260 {
261  float* bPtr = bVector;
262  const float* aPtr = aVector;
263 
264  unsigned int number = 0;
265  unsigned int quarterPoints = num_points / 4;
266  unsigned int i = 0;
267 
268  __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
269  __m128 sine, cosine;
270  __m128i q, ones, twos, fours;
271 
272  m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
273  pio4A = _mm_set1_ps(0.7853981554508209228515625);
274  pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
275  pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
276  ffours = _mm_set1_ps(4.0);
277  ftwos = _mm_set1_ps(2.0);
278  fones = _mm_set1_ps(1.0);
279  fzeroes = _mm_setzero_ps();
280  __m128i zeroes = _mm_set1_epi32(0);
281  ones = _mm_set1_epi32(1);
282  __m128i allones = _mm_set1_epi32(0xffffffff);
283  twos = _mm_set1_epi32(2);
284  fours = _mm_set1_epi32(4);
285 
286  cp1 = _mm_set1_ps(1.0);
287  cp2 = _mm_set1_ps(0.08333333333333333);
288  cp3 = _mm_set1_ps(0.002777777777777778);
289  cp4 = _mm_set1_ps(4.96031746031746e-05);
290  cp5 = _mm_set1_ps(5.511463844797178e-07);
291  union bit128 condition1;
292  union bit128 condition3;
293 
294  for(;number < quarterPoints; number++){
295 
296  aVal = _mm_load_ps(aPtr);
297  // s = fabs(aVal)
298  s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
299  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
300  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
301  // r = q + q&1, q indicates quadrant, r gives
302  r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
303 
304  s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
305  s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
306  s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
307 
308  s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
309  s = _mm_mul_ps(s, s);
310  // Evaluate Taylor series
311  s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
312 
313  for(i = 0; i < 3; i++)
314  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
315  s = _mm_div_ps(s, ftwos);
316 
317  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
318  cosine = _mm_sub_ps(fones, s);
319 
320  // if(((q+1)&2) != 0) { cosine=sine;}
321  condition1.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
322  condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
323 
324  // if(((q+2)&4) != 0) { cosine = -cosine;}
325  condition3.int_vec = _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
326  condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
327 
328  cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
329  cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
330  _mm_store_ps(bPtr, cosine);
331  aPtr += 4;
332  bPtr += 4;
333  }
334 
335  number = quarterPoints * 4;
336  for(;number < num_points; number++){
337  *bPtr++ = cosf(*aPtr++);
338  }
339 }
340 
341 #endif /* LV_HAVE_SSE4_1 for aligned */
342 
343 #endif /* INCLUDED_volk_32f_cos_32f_a_H */
344 
345 
346 
347 #ifndef INCLUDED_volk_32f_cos_32f_u_H
348 #define INCLUDED_volk_32f_cos_32f_u_H
349 
350 #if LV_HAVE_AVX2 && LV_HAVE_FMA
351 #include <immintrin.h>
352 
353 static inline void
354  volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
355 {
356  float* bPtr = bVector;
357  const float* aPtr = aVector;
358 
359  unsigned int number = 0;
360  unsigned int eighthPoints = num_points / 8;
361  unsigned int i = 0;
362 
363  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
364  __m256 sine, cosine;
365  __m256i q, ones, twos, fours;
366 
367  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
368  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
369  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
370  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
371  ffours = _mm256_set1_ps(4.0);
372  ftwos = _mm256_set1_ps(2.0);
373  fones = _mm256_set1_ps(1.0);
374  fzeroes = _mm256_setzero_ps();
375  __m256i zeroes = _mm256_set1_epi32(0);
376  ones = _mm256_set1_epi32(1);
377  __m256i allones = _mm256_set1_epi32(0xffffffff);
378  twos = _mm256_set1_epi32(2);
379  fours = _mm256_set1_epi32(4);
380 
381  cp1 = _mm256_set1_ps(1.0);
382  cp2 = _mm256_set1_ps(0.08333333333333333);
383  cp3 = _mm256_set1_ps(0.002777777777777778);
384  cp4 = _mm256_set1_ps(4.96031746031746e-05);
385  cp5 = _mm256_set1_ps(5.511463844797178e-07);
386  union bit256 condition1;
387  union bit256 condition3;
388 
389  for(;number < eighthPoints; number++){
390 
391  aVal = _mm256_loadu_ps(aPtr);
392  // s = fabs(aVal)
393  s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
394  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
395  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
396  // r = q + q&1, q indicates quadrant, r gives
397  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
398 
399  s = _mm256_fnmadd_ps(r,pio4A,s);
400  s = _mm256_fnmadd_ps(r,pio4B,s);
401  s = _mm256_fnmadd_ps(r,pio4C,s);
402 
403  s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
404  s = _mm256_mul_ps(s, s);
405  // Evaluate Taylor series
406  s = _mm256_mul_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(_mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2), s, cp1), s);
407 
408  for(i = 0; i < 3; i++)
409  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
410  s = _mm256_div_ps(s, ftwos);
411 
412  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
413  cosine = _mm256_sub_ps(fones, s);
414 
415  // if(((q+1)&2) != 0) { cosine=sine;}
416  condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
417  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
418 
419  // if(((q+2)&4) != 0) { cosine = -cosine;}
420  condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
421  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
422 
423  cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
424  cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
425  _mm256_storeu_ps(bPtr, cosine);
426  aPtr += 8;
427  bPtr += 8;
428  }
429 
430  number = eighthPoints * 8;
431  for(;number < num_points; number++){
432  *bPtr++ = cos(*aPtr++);
433  }
434 }
435 
436 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
437 
438 #ifdef LV_HAVE_AVX2
439 #include <immintrin.h>
440 
441 static inline void
442  volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
443 {
444  float* bPtr = bVector;
445  const float* aPtr = aVector;
446 
447  unsigned int number = 0;
448  unsigned int eighthPoints = num_points / 8;
449  unsigned int i = 0;
450 
451  __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
452  __m256 sine, cosine;
453  __m256i q, ones, twos, fours;
454 
455  m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
456  pio4A = _mm256_set1_ps(0.7853981554508209228515625);
457  pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
458  pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
459  ffours = _mm256_set1_ps(4.0);
460  ftwos = _mm256_set1_ps(2.0);
461  fones = _mm256_set1_ps(1.0);
462  fzeroes = _mm256_setzero_ps();
463  __m256i zeroes = _mm256_set1_epi32(0);
464  ones = _mm256_set1_epi32(1);
465  __m256i allones = _mm256_set1_epi32(0xffffffff);
466  twos = _mm256_set1_epi32(2);
467  fours = _mm256_set1_epi32(4);
468 
469  cp1 = _mm256_set1_ps(1.0);
470  cp2 = _mm256_set1_ps(0.08333333333333333);
471  cp3 = _mm256_set1_ps(0.002777777777777778);
472  cp4 = _mm256_set1_ps(4.96031746031746e-05);
473  cp5 = _mm256_set1_ps(5.511463844797178e-07);
474  union bit256 condition1;
475  union bit256 condition3;
476 
477  for(;number < eighthPoints; number++){
478 
479  aVal = _mm256_loadu_ps(aPtr);
480  // s = fabs(aVal)
481  s = _mm256_sub_ps(aVal, _mm256_and_ps(_mm256_mul_ps(aVal, ftwos), _mm256_cmp_ps(aVal, fzeroes,1)));
482  // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
483  q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
484  // r = q + q&1, q indicates quadrant, r gives
485  r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
486 
487  s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4A));
488  s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4B));
489  s = _mm256_sub_ps(s, _mm256_mul_ps(r,pio4C));
490 
491  s = _mm256_div_ps(s, _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
492  s = _mm256_mul_ps(s, s);
493  // Evaluate Taylor series
494  s = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
495 
496  for(i = 0; i < 3; i++)
497  s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
498  s = _mm256_div_ps(s, ftwos);
499 
500  sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
501  cosine = _mm256_sub_ps(fones, s);
502 
503  // if(((q+1)&2) != 0) { cosine=sine;}
504  condition1.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
505  condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
506 
507  // if(((q+2)&4) != 0) { cosine = -cosine;}
508  condition3.int_vec = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
509  condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
510 
511  cosine = _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
512  cosine = _mm256_sub_ps(cosine, _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3.float_vec));
513  _mm256_storeu_ps(bPtr, cosine);
514  aPtr += 8;
515  bPtr += 8;
516  }
517 
518  number = eighthPoints * 8;
519  for(;number < num_points; number++){
520  *bPtr++ = cos(*aPtr++);
521  }
522 }
523 
524 #endif /* LV_HAVE_AVX2 for unaligned */
525 
526 #ifdef LV_HAVE_SSE4_1
527 #include <smmintrin.h>
528 
529 static inline void
530 volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
531 {
532  float* bPtr = bVector;
533  const float* aPtr = aVector;
534 
535  unsigned int number = 0;
536  unsigned int quarterPoints = num_points / 4;
537  unsigned int i = 0;
538 
539  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
540  __m128 sine, cosine, condition1, condition3;
541  __m128i q, r, ones, twos, fours;
542 
543  m4pi = _mm_set1_ps(1.273239545);
544  pio4A = _mm_set1_ps(0.78515625);
545  pio4B = _mm_set1_ps(0.241876e-3);
546  ffours = _mm_set1_ps(4.0);
547  ftwos = _mm_set1_ps(2.0);
548  fones = _mm_set1_ps(1.0);
549  fzeroes = _mm_setzero_ps();
550  ones = _mm_set1_epi32(1);
551  twos = _mm_set1_epi32(2);
552  fours = _mm_set1_epi32(4);
553 
554  cp1 = _mm_set1_ps(1.0);
555  cp2 = _mm_set1_ps(0.83333333e-1);
556  cp3 = _mm_set1_ps(0.2777778e-2);
557  cp4 = _mm_set1_ps(0.49603e-4);
558  cp5 = _mm_set1_ps(0.551e-6);
559 
560  for(;number < quarterPoints; number++){
561  aVal = _mm_loadu_ps(aPtr);
562  s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
563  q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
564  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
565 
566  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
567  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
568 
569  s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
570  s = _mm_mul_ps(s, s);
571  // Evaluate Taylor series
572  s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
573 
574  for(i = 0; i < 3; i++){
575  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
576  }
577  s = _mm_div_ps(s, ftwos);
578 
579  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
580  cosine = _mm_sub_ps(fones, s);
581 
582  condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
583 
584  condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
585 
586  cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
587  cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
588  _mm_storeu_ps(bPtr, cosine);
589  aPtr += 4;
590  bPtr += 4;
591  }
592 
593  number = quarterPoints * 4;
594  for(;number < num_points; number++){
595  *bPtr++ = cosf(*aPtr++);
596  }
597 }
598 
599 #endif /* LV_HAVE_SSE4_1 for unaligned */
600 
601 
602 #ifdef LV_HAVE_GENERIC
603 
604 /*
605  * For derivation see
606  * Shibata, Naoki, "Efficient evaluation methods of elementary functions
607  * suitable for SIMD computation," in Springer-Verlag 2010
608  */
609 static inline void
610 volk_32f_cos_32f_generic_fast(float* bVector, const float* aVector, unsigned int num_points)
611 {
612  float* bPtr = bVector;
613  const float* aPtr = aVector;
614 
615  float m4pi = 1.273239544735162542821171882678754627704620361328125;
616  float pio4A = 0.7853981554508209228515625;
617  float pio4B = 0.794662735614792836713604629039764404296875e-8;
618  float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
619  int N = 3; // order of argument reduction
620 
621  unsigned int number;
622  for(number = 0; number < num_points; number++){
623  float s = fabs(*aPtr);
624  int q = (int)(s * m4pi);
625  int r = q + (q&1);
626  s -= r * pio4A;
627  s -= r * pio4B;
628  s -= r * pio4C;
629 
630  s = s * 0.125; // 2^-N (<--3)
631  s = s*s;
632  s = ((((s/1814400. - 1.0/20160.0)*s + 1.0/360.0)*s - 1.0/12.0)*s + 1.0)*s;
633 
634  int i;
635  for(i=0; i < N; ++i) {
636  s = (4.0-s)*s;
637  }
638  s = s/2.0;
639 
640  float sine = sqrt((2.0-s)*s);
641  float cosine = 1-s;
642 
643  if (((q+1) & 2) != 0) {
644  s = cosine;
645  cosine = sine;
646  sine = s;
647  }
648  if (((q+2) & 4) != 0) {
649  cosine = -cosine;
650  }
651  *bPtr = cosine;
652  bPtr++;
653  aPtr++;
654  }
655 }
656 
657 #endif /* LV_HAVE_GENERIC */
658 
659 
660 #ifdef LV_HAVE_GENERIC
661 
662 static inline void
663 volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
664 {
665  float* bPtr = bVector;
666  const float* aPtr = aVector;
667  unsigned int number = 0;
668 
669  for(; number < num_points; number++){
670  *bPtr++ = cosf(*aPtr++);
671  }
672 }
673 
674 #endif /* LV_HAVE_GENERIC */
675 
676 #endif /* INCLUDED_volk_32f_cos_32f_u_H */
static void volk_32f_cos_32f_generic_fast(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:610
float f[8]
Definition: volk_common.h:108
static void volk_32f_cos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:663
__m256i int_vec
Definition: volk_common.h:113
for i
Definition: volk_config_fixed.tmpl.h:25
Definition: volk_common.h:104
__m256 float_vec
Definition: volk_common.h:112
float f[4]
Definition: volk_common.h:91
Definition: volk_common.h:87