Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_acos_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
70 #include <stdio.h>
71 #include <math.h>
72 #include <inttypes.h>
73 
74 /* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
75 #define ACOS_TERMS 2
76 
77 #ifndef INCLUDED_volk_32f_acos_32f_a_H
78 #define INCLUDED_volk_32f_acos_32f_a_H
79 
80 #if LV_HAVE_AVX2 && LV_HAVE_FMA
81 #include <immintrin.h>
82 
83 static inline void
84 volk_32f_acos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
85 {
86  float* bPtr = bVector;
87  const float* aPtr = aVector;
88 
89  unsigned int number = 0;
90  unsigned int eighthPoints = num_points / 8;
91  int i, j;
92 
93  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
94  __m256 fzeroes, fones, ftwos, ffours, condition;
95 
96  pi = _mm256_set1_ps(3.14159265358979323846);
97  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
98  fzeroes = _mm256_setzero_ps();
99  fones = _mm256_set1_ps(1.0);
100  ftwos = _mm256_set1_ps(2.0);
101  ffours = _mm256_set1_ps(4.0);
102 
103  for(;number < eighthPoints; number++){
104  aVal = _mm256_load_ps(aPtr);
105  d = aVal;
106  aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
107  z = aVal;
108  condition = _mm256_cmp_ps(z, fzeroes,1);
109  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
110  x = z;
111  condition = _mm256_cmp_ps(z, fones,1);
112  x = _mm256_add_ps(x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
113 
114  for(i = 0; i < 2; i++)
115  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x,fones)));
116  x = _mm256_div_ps(fones, x);
117  y = fzeroes;
118  for(j = ACOS_TERMS - 1; j >=0 ; j--)
119  y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
120 
121  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
122  condition = _mm256_cmp_ps(z, fones,14);
123 
124  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
125  arccosine = y;
126  condition = _mm256_cmp_ps(aVal, fzeroes,1);
127  arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
128  condition = _mm256_cmp_ps(d, fzeroes,1);
129  arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
130 
131  _mm256_store_ps(bPtr, arccosine);
132  aPtr += 8;
133  bPtr += 8;
134  }
135 
136  number = eighthPoints * 8;
137  for(;number < num_points; number++){
138  *bPtr++ = acos(*aPtr++);
139  }
140 }
141 
142 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
143 
144 
145 #ifdef LV_HAVE_AVX
146 #include <immintrin.h>
147 
148 static inline void
149 volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
150 {
151  float* bPtr = bVector;
152  const float* aPtr = aVector;
153 
154  unsigned int number = 0;
155  unsigned int eighthPoints = num_points / 8;
156  int i, j;
157 
158  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
159  __m256 fzeroes, fones, ftwos, ffours, condition;
160 
161  pi = _mm256_set1_ps(3.14159265358979323846);
162  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
163  fzeroes = _mm256_setzero_ps();
164  fones = _mm256_set1_ps(1.0);
165  ftwos = _mm256_set1_ps(2.0);
166  ffours = _mm256_set1_ps(4.0);
167 
168  for(;number < eighthPoints; number++){
169  aVal = _mm256_load_ps(aPtr);
170  d = aVal;
171  aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
172  z = aVal;
173  condition = _mm256_cmp_ps(z, fzeroes,1);
174  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
175  x = z;
176  condition = _mm256_cmp_ps(z, fones,1);
177  x = _mm256_add_ps(x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
178 
179  for(i = 0; i < 2; i++)
180  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
181  x = _mm256_div_ps(fones, x);
182  y = fzeroes;
183  for(j = ACOS_TERMS - 1; j >=0 ; j--)
184  y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
185 
186  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
187  condition = _mm256_cmp_ps(z, fones,14);
188 
189  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
190  arccosine = y;
191  condition = _mm256_cmp_ps(aVal, fzeroes,1);
192  arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
193  condition = _mm256_cmp_ps(d, fzeroes,1);
194  arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
195 
196  _mm256_store_ps(bPtr, arccosine);
197  aPtr += 8;
198  bPtr += 8;
199  }
200 
201  number = eighthPoints * 8;
202  for(;number < num_points; number++){
203  *bPtr++ = acos(*aPtr++);
204  }
205 }
206 
207 #endif /* LV_HAVE_AVX2 for aligned */
208 
209 #ifdef LV_HAVE_SSE4_1
210 #include <smmintrin.h>
211 
212 static inline void
213 volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
214 {
215  float* bPtr = bVector;
216  const float* aPtr = aVector;
217 
218  unsigned int number = 0;
219  unsigned int quarterPoints = num_points / 4;
220  int i, j;
221 
222  __m128 aVal, d, pi, pio2, x, y, z, arccosine;
223  __m128 fzeroes, fones, ftwos, ffours, condition;
224 
225  pi = _mm_set1_ps(3.14159265358979323846);
226  pio2 = _mm_set1_ps(3.14159265358979323846/2);
227  fzeroes = _mm_setzero_ps();
228  fones = _mm_set1_ps(1.0);
229  ftwos = _mm_set1_ps(2.0);
230  ffours = _mm_set1_ps(4.0);
231 
232  for(;number < quarterPoints; number++){
233  aVal = _mm_load_ps(aPtr);
234  d = aVal;
235  aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
236  z = aVal;
237  condition = _mm_cmplt_ps(z, fzeroes);
238  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
239  x = z;
240  condition = _mm_cmplt_ps(z, fones);
241  x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
242 
243  for(i = 0; i < 2; i++)
244  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
245  x = _mm_div_ps(fones, x);
246  y = fzeroes;
247  for(j = ACOS_TERMS - 1; j >=0 ; j--)
248  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
249 
250  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
251  condition = _mm_cmpgt_ps(z, fones);
252 
253  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
254  arccosine = y;
255  condition = _mm_cmplt_ps(aVal, fzeroes);
256  arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
257  condition = _mm_cmplt_ps(d, fzeroes);
258  arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
259 
260  _mm_store_ps(bPtr, arccosine);
261  aPtr += 4;
262  bPtr += 4;
263  }
264 
265  number = quarterPoints * 4;
266  for(;number < num_points; number++){
267  *bPtr++ = acosf(*aPtr++);
268  }
269 }
270 
271 #endif /* LV_HAVE_SSE4_1 for aligned */
272 
273 #endif /* INCLUDED_volk_32f_acos_32f_a_H */
274 
275 
276 #ifndef INCLUDED_volk_32f_acos_32f_u_H
277 #define INCLUDED_volk_32f_acos_32f_u_H
278 
279 #if LV_HAVE_AVX2 && LV_HAVE_FMA
280 #include <immintrin.h>
281 
282 static inline void
283 volk_32f_acos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
284 {
285  float* bPtr = bVector;
286  const float* aPtr = aVector;
287 
288  unsigned int number = 0;
289  unsigned int eighthPoints = num_points / 8;
290  int i, j;
291 
292  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
293  __m256 fzeroes, fones, ftwos, ffours, condition;
294 
295  pi = _mm256_set1_ps(3.14159265358979323846);
296  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
297  fzeroes = _mm256_setzero_ps();
298  fones = _mm256_set1_ps(1.0);
299  ftwos = _mm256_set1_ps(2.0);
300  ffours = _mm256_set1_ps(4.0);
301 
302  for(;number < eighthPoints; number++){
303  aVal = _mm256_loadu_ps(aPtr);
304  d = aVal;
305  aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
306  z = aVal;
307  condition = _mm256_cmp_ps(z, fzeroes,1);
308  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
309  x = z;
310  condition = _mm256_cmp_ps(z, fones,1);
311  x = _mm256_add_ps(x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
312 
313  for(i = 0; i < 2; i++)
314  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x,fones)));
315  x = _mm256_div_ps(fones, x);
316  y = fzeroes;
317  for(j = ACOS_TERMS - 1; j >=0 ; j--)
318  y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
319 
320  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
321  condition = _mm256_cmp_ps(z, fones,14);
322 
323  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
324  arccosine = y;
325  condition = _mm256_cmp_ps(aVal, fzeroes,1);
326  arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
327  condition = _mm256_cmp_ps(d, fzeroes,1);
328  arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
329 
330  _mm256_storeu_ps(bPtr, arccosine);
331  aPtr += 8;
332  bPtr += 8;
333  }
334 
335  number = eighthPoints * 8;
336  for(;number < num_points; number++){
337  *bPtr++ = acos(*aPtr++);
338  }
339 }
340 
341 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
342 
343 
344 #ifdef LV_HAVE_AVX
345 #include <immintrin.h>
346 
347 static inline void
348 volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
349 {
350  float* bPtr = bVector;
351  const float* aPtr = aVector;
352 
353  unsigned int number = 0;
354  unsigned int eighthPoints = num_points / 8;
355  int i, j;
356 
357  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
358  __m256 fzeroes, fones, ftwos, ffours, condition;
359 
360  pi = _mm256_set1_ps(3.14159265358979323846);
361  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
362  fzeroes = _mm256_setzero_ps();
363  fones = _mm256_set1_ps(1.0);
364  ftwos = _mm256_set1_ps(2.0);
365  ffours = _mm256_set1_ps(4.0);
366 
367  for(;number < eighthPoints; number++){
368  aVal = _mm256_loadu_ps(aPtr);
369  d = aVal;
370  aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))), aVal);
371  z = aVal;
372  condition = _mm256_cmp_ps(z, fzeroes,1);
373  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
374  x = z;
375  condition = _mm256_cmp_ps(z, fones,1);
376  x = _mm256_add_ps(x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
377 
378  for(i = 0; i < 2; i++)
379  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
380  x = _mm256_div_ps(fones, x);
381  y = fzeroes;
382  for(j = ACOS_TERMS - 1; j >=0 ; j--)
383  y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
384 
385  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
386  condition = _mm256_cmp_ps(z, fones,14);
387 
388  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
389  arccosine = y;
390  condition = _mm256_cmp_ps(aVal, fzeroes,1);
391  arccosine = _mm256_sub_ps(arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
392  condition = _mm256_cmp_ps(d, fzeroes,1);
393  arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
394 
395  _mm256_storeu_ps(bPtr, arccosine);
396  aPtr += 8;
397  bPtr += 8;
398  }
399 
400  number = eighthPoints * 8;
401  for(;number < num_points; number++){
402  *bPtr++ = acos(*aPtr++);
403  }
404 }
405 
406 #endif /* LV_HAVE_AVX2 for unaligned */
407 
408 #ifdef LV_HAVE_SSE4_1
409 #include <smmintrin.h>
410 
411 static inline void
412 volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
413 {
414  float* bPtr = bVector;
415  const float* aPtr = aVector;
416 
417  unsigned int number = 0;
418  unsigned int quarterPoints = num_points / 4;
419  int i, j;
420 
421  __m128 aVal, d, pi, pio2, x, y, z, arccosine;
422  __m128 fzeroes, fones, ftwos, ffours, condition;
423 
424  pi = _mm_set1_ps(3.14159265358979323846);
425  pio2 = _mm_set1_ps(3.14159265358979323846/2);
426  fzeroes = _mm_setzero_ps();
427  fones = _mm_set1_ps(1.0);
428  ftwos = _mm_set1_ps(2.0);
429  ffours = _mm_set1_ps(4.0);
430 
431  for(;number < quarterPoints; number++){
432  aVal = _mm_loadu_ps(aPtr);
433  d = aVal;
434  aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))), aVal);
435  z = aVal;
436  condition = _mm_cmplt_ps(z, fzeroes);
437  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
438  x = z;
439  condition = _mm_cmplt_ps(z, fones);
440  x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
441 
442  for(i = 0; i < 2; i++)
443  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
444  x = _mm_div_ps(fones, x);
445  y = fzeroes;
446 
447  for(j = ACOS_TERMS - 1; j >=0 ; j--)
448  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
449 
450  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
451  condition = _mm_cmpgt_ps(z, fones);
452 
453  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
454  arccosine = y;
455  condition = _mm_cmplt_ps(aVal, fzeroes);
456  arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
457  condition = _mm_cmplt_ps(d, fzeroes);
458  arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
459 
460  _mm_storeu_ps(bPtr, arccosine);
461  aPtr += 4;
462  bPtr += 4;
463  }
464 
465  number = quarterPoints * 4;
466  for(;number < num_points; number++){
467  *bPtr++ = acosf(*aPtr++);
468  }
469 }
470 
471 #endif /* LV_HAVE_SSE4_1 for aligned */
472 
473 #ifdef LV_HAVE_GENERIC
474 
475 static inline void
476 volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
477 {
478  float* bPtr = bVector;
479  const float* aPtr = aVector;
480  unsigned int number = 0;
481 
482  for(number = 0; number < num_points; number++){
483  *bPtr++ = acosf(*aPtr++);
484  }
485 
486 }
487 #endif /* LV_HAVE_GENERIC */
488 
489 #endif /* INCLUDED_volk_32f_acos_32f_u_H */
static void volk_32f_acos_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:348
for i
Definition: volk_config_fixed.tmpl.h:25
static void volk_32f_acos_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:149
#define ACOS_TERMS
Definition: volk_32f_acos_32f.h:75
static void volk_32f_acos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:476