Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_atan_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
70 #include <stdio.h>
71 #include <math.h>
72 #include <inttypes.h>
73 
74 /* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
75 #define TERMS 2
76 
77 #ifndef INCLUDED_volk_32f_atan_32f_a_H
78 #define INCLUDED_volk_32f_atan_32f_a_H
79 
80 #if LV_HAVE_AVX2 && LV_HAVE_FMA
81 #include <immintrin.h>
82 
83 static inline void
84 volk_32f_atan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
85 {
86  float* bPtr = bVector;
87  const float* aPtr = aVector;
88 
89  unsigned int number = 0;
90  unsigned int eighthPoints = num_points / 8;
91  int i, j;
92 
93  __m256 aVal, pio2, x, y, z, arctangent;
94  __m256 fzeroes, fones, ftwos, ffours, condition;
95 
96  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
97  fzeroes = _mm256_setzero_ps();
98  fones = _mm256_set1_ps(1.0);
99  ftwos = _mm256_set1_ps(2.0);
100  ffours = _mm256_set1_ps(4.0);
101 
102  for(;number < eighthPoints; number++){
103  aVal = _mm256_load_ps(aPtr);
104  z = aVal;
105  condition = _mm256_cmp_ps(z, fzeroes,1);
106  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
107  x = z;
108  condition = _mm256_cmp_ps(z, fones,1);
109  x = _mm256_add_ps(x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
110 
111  for(i = 0; i < 2; i++){
112  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
113  }
114  x = _mm256_div_ps(fones, x);
115  y = fzeroes;
116  for(j = TERMS - 1; j >=0 ; j--){
117  y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
118  }
119 
120  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
121  condition = _mm256_cmp_ps(z, fones,14);
122 
123  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
124  arctangent = y;
125  condition = _mm256_cmp_ps(aVal, fzeroes,1);
126  arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
127 
128  _mm256_store_ps(bPtr, arctangent);
129  aPtr += 8;
130  bPtr += 8;
131  }
132 
133  number = eighthPoints * 8;
134  for(;number < num_points; number++){
135  *bPtr++ = atan(*aPtr++);
136  }
137 }
138 
139 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
140 
141 
142 #ifdef LV_HAVE_AVX
143 #include <immintrin.h>
144 
145 static inline void
146 volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
147 {
148  float* bPtr = bVector;
149  const float* aPtr = aVector;
150 
151  unsigned int number = 0;
152  unsigned int eighthPoints = num_points / 8;
153  int i, j;
154 
155  __m256 aVal, pio2, x, y, z, arctangent;
156  __m256 fzeroes, fones, ftwos, ffours, condition;
157 
158  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
159  fzeroes = _mm256_setzero_ps();
160  fones = _mm256_set1_ps(1.0);
161  ftwos = _mm256_set1_ps(2.0);
162  ffours = _mm256_set1_ps(4.0);
163 
164  for(;number < eighthPoints; number++){
165  aVal = _mm256_load_ps(aPtr);
166  z = aVal;
167  condition = _mm256_cmp_ps(z, fzeroes,1);
168  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
169  x = z;
170  condition = _mm256_cmp_ps(z, fones,1);
171  x = _mm256_add_ps(x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
172 
173  for(i = 0; i < 2; i++){
174  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
175  }
176  x = _mm256_div_ps(fones, x);
177  y = fzeroes;
178  for(j = TERMS - 1; j >=0 ; j--){
179  y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
180  }
181 
182  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
183  condition = _mm256_cmp_ps(z, fones,14);
184 
185  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
186  arctangent = y;
187  condition = _mm256_cmp_ps(aVal, fzeroes,1);
188  arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
189 
190  _mm256_store_ps(bPtr, arctangent);
191  aPtr += 8;
192  bPtr += 8;
193  }
194 
195  number = eighthPoints * 8;
196  for(;number < num_points; number++){
197  *bPtr++ = atan(*aPtr++);
198  }
199 }
200 
201 #endif /* LV_HAVE_AVX for aligned */
202 
203 #ifdef LV_HAVE_SSE4_1
204 #include <smmintrin.h>
205 
206 static inline void
207 volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
208 {
209  float* bPtr = bVector;
210  const float* aPtr = aVector;
211 
212  unsigned int number = 0;
213  unsigned int quarterPoints = num_points / 4;
214  int i, j;
215 
216  __m128 aVal, pio2, x, y, z, arctangent;
217  __m128 fzeroes, fones, ftwos, ffours, condition;
218 
219  pio2 = _mm_set1_ps(3.14159265358979323846/2);
220  fzeroes = _mm_setzero_ps();
221  fones = _mm_set1_ps(1.0);
222  ftwos = _mm_set1_ps(2.0);
223  ffours = _mm_set1_ps(4.0);
224 
225  for(;number < quarterPoints; number++){
226  aVal = _mm_load_ps(aPtr);
227  z = aVal;
228  condition = _mm_cmplt_ps(z, fzeroes);
229  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
230  x = z;
231  condition = _mm_cmplt_ps(z, fones);
232  x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
233 
234  for(i = 0; i < 2; i++){
235  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
236  }
237  x = _mm_div_ps(fones, x);
238  y = fzeroes;
239  for(j = TERMS - 1; j >=0 ; j--){
240  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
241  }
242 
243  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
244  condition = _mm_cmpgt_ps(z, fones);
245 
246  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
247  arctangent = y;
248  condition = _mm_cmplt_ps(aVal, fzeroes);
249  arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
250 
251  _mm_store_ps(bPtr, arctangent);
252  aPtr += 4;
253  bPtr += 4;
254  }
255 
256  number = quarterPoints * 4;
257  for(;number < num_points; number++){
258  *bPtr++ = atanf(*aPtr++);
259  }
260 }
261 
262 #endif /* LV_HAVE_SSE4_1 for aligned */
263 
264 #endif /* INCLUDED_volk_32f_atan_32f_a_H */
265 
266 #ifndef INCLUDED_volk_32f_atan_32f_u_H
267 #define INCLUDED_volk_32f_atan_32f_u_H
268 
269 #if LV_HAVE_AVX2 && LV_HAVE_FMA
270 #include <immintrin.h>
271 
272 static inline void
273 volk_32f_atan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
274 {
275  float* bPtr = bVector;
276  const float* aPtr = aVector;
277 
278  unsigned int number = 0;
279  unsigned int eighthPoints = num_points / 8;
280  int i, j;
281 
282  __m256 aVal, pio2, x, y, z, arctangent;
283  __m256 fzeroes, fones, ftwos, ffours, condition;
284 
285  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
286  fzeroes = _mm256_setzero_ps();
287  fones = _mm256_set1_ps(1.0);
288  ftwos = _mm256_set1_ps(2.0);
289  ffours = _mm256_set1_ps(4.0);
290 
291  for(;number < eighthPoints; number++){
292  aVal = _mm256_loadu_ps(aPtr);
293  z = aVal;
294  condition = _mm256_cmp_ps(z, fzeroes,1);
295  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
296  x = z;
297  condition = _mm256_cmp_ps(z, fones,1);
298  x = _mm256_add_ps(x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
299 
300  for(i = 0; i < 2; i++){
301  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x,x,fones)));
302  }
303  x = _mm256_div_ps(fones, x);
304  y = fzeroes;
305  for(j = TERMS - 1; j >=0 ; j--){
306  y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
307  }
308 
309  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
310  condition = _mm256_cmp_ps(z, fones,14);
311 
312  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y,ftwos,pio2), condition));
313  arctangent = y;
314  condition = _mm256_cmp_ps(aVal, fzeroes,1);
315  arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
316 
317  _mm256_storeu_ps(bPtr, arctangent);
318  aPtr += 8;
319  bPtr += 8;
320  }
321 
322  number = eighthPoints * 8;
323  for(;number < num_points; number++){
324  *bPtr++ = atan(*aPtr++);
325  }
326 }
327 
328 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
329 
330 
331 #ifdef LV_HAVE_AVX
332 #include <immintrin.h>
333 
334 static inline void
335 volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
336 {
337  float* bPtr = bVector;
338  const float* aPtr = aVector;
339 
340  unsigned int number = 0;
341  unsigned int eighthPoints = num_points / 8;
342  int i, j;
343 
344  __m256 aVal, pio2, x, y, z, arctangent;
345  __m256 fzeroes, fones, ftwos, ffours, condition;
346 
347  pio2 = _mm256_set1_ps(3.14159265358979323846/2);
348  fzeroes = _mm256_setzero_ps();
349  fones = _mm256_set1_ps(1.0);
350  ftwos = _mm256_set1_ps(2.0);
351  ffours = _mm256_set1_ps(4.0);
352 
353  for(;number < eighthPoints; number++){
354  aVal = _mm256_loadu_ps(aPtr);
355  z = aVal;
356  condition = _mm256_cmp_ps(z, fzeroes,1);
357  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
358  x = z;
359  condition = _mm256_cmp_ps(z, fones,1);
360  x = _mm256_add_ps(x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
361 
362  for(i = 0; i < 2; i++){
363  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
364  }
365  x = _mm256_div_ps(fones, x);
366  y = fzeroes;
367  for(j = TERMS - 1; j >=0 ; j--){
368  y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1,j)/(2*j+1)));
369  }
370 
371  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
372  condition = _mm256_cmp_ps(z, fones,14);
373 
374  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
375  arctangent = y;
376  condition = _mm256_cmp_ps(aVal, fzeroes,1);
377  arctangent = _mm256_sub_ps(arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
378 
379  _mm256_storeu_ps(bPtr, arctangent);
380  aPtr += 8;
381  bPtr += 8;
382  }
383 
384  number = eighthPoints * 8;
385  for(;number < num_points; number++){
386  *bPtr++ = atan(*aPtr++);
387  }
388 }
389 
390 #endif /* LV_HAVE_AVX for unaligned */
391 
392 #ifdef LV_HAVE_SSE4_1
393 #include <smmintrin.h>
394 
395 static inline void
396 volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
397 {
398  float* bPtr = bVector;
399  const float* aPtr = aVector;
400 
401  unsigned int number = 0;
402  unsigned int quarterPoints = num_points / 4;
403  int i, j;
404 
405  __m128 aVal, pio2, x, y, z, arctangent;
406  __m128 fzeroes, fones, ftwos, ffours, condition;
407 
408  pio2 = _mm_set1_ps(3.14159265358979323846/2);
409  fzeroes = _mm_setzero_ps();
410  fones = _mm_set1_ps(1.0);
411  ftwos = _mm_set1_ps(2.0);
412  ffours = _mm_set1_ps(4.0);
413 
414  for(;number < quarterPoints; number++){
415  aVal = _mm_loadu_ps(aPtr);
416  z = aVal;
417  condition = _mm_cmplt_ps(z, fzeroes);
418  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
419  x = z;
420  condition = _mm_cmplt_ps(z, fones);
421  x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
422 
423  for(i = 0; i < 2; i++)
424  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
425  x = _mm_div_ps(fones, x);
426  y = fzeroes;
427  for(j = TERMS - 1; j >= 0; j--)
428  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
429 
430  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
431  condition = _mm_cmpgt_ps(z, fones);
432 
433  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
434  arctangent = y;
435  condition = _mm_cmplt_ps(aVal, fzeroes);
436  arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
437 
438  _mm_storeu_ps(bPtr, arctangent);
439  aPtr += 4;
440  bPtr += 4;
441  }
442 
443  number = quarterPoints * 4;
444  for(;number < num_points; number++){
445  *bPtr++ = atanf(*aPtr++);
446  }
447 }
448 
449 #endif /* LV_HAVE_SSE4_1 for unaligned */
450 
451 #ifdef LV_HAVE_GENERIC
452 
453 static inline void
454 volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
455 {
456  float* bPtr = bVector;
457  const float* aPtr = aVector;
458  unsigned int number = 0;
459 
460  for(number = 0; number < num_points; number++){
461  *bPtr++ = atanf(*aPtr++);
462  }
463 }
464 #endif /* LV_HAVE_GENERIC */
465 
466 #endif /* INCLUDED_volk_32f_atan_32f_u_H */
static void volk_32f_atan_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_atan_32f.h:335
static void volk_32f_atan_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_atan_32f.h:146
for i
Definition: volk_config_fixed.tmpl.h:25
#define TERMS
Definition: volk_32f_atan_32f.h:75
static void volk_32f_atan_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_atan_32f.h:454