Vector Optimized Library of Kernels 2.0
Architecture-tuned implementations of math kernels
volk_32f_asin_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#include <stdio.h>
#include <math.h>
#include <inttypes.h>

/* Number of terms of the Taylor series to evaluate; increase this for more accuracy. */
#define ASIN_TERMS 2
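/*
 * Informal summary (added note, not part of the original sources): the SIMD
 * protokernels below first rewrite asin(a) as an arctangent, using
 * tan(asin(a)) = a / sqrt((1+a)(1-a)), then approximate the arctangent with an
 * argument reduction followed by a truncated alternating Taylor series
 * evaluated in Horner form. ASIN_TERMS is the number of series terms kept, so
 * raising it trades speed for accuracy.
 */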

#ifndef INCLUDED_volk_32f_asin_32f_a_H
#define INCLUDED_volk_32f_asin_32f_a_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_asin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for(; number < eighthPoints; number++){
        aVal = _mm256_load_ps(aPtr);
        /* Reduce asin(a) to an arctangent: tan(asin(a)) = a / sqrt((1+a)(1-a)) */
        aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
        z = aVal;
        /* Take the absolute value; the sign is restored at the end */
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        x = z;
        /* For z < 1, work with the reciprocal (complementary angle) */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Argument reduction so the truncated series stays accurate */
        for(i = 0; i < 2; i++){
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        /* Horner evaluation of the truncated arctangent series */
        for(j = ASIN_TERMS - 1; j >= 0; j--){
            y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* Multiply by 4 to undo the two reduction steps */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        /* Where the reciprocal was not taken (z > 1), convert from the complementary angle */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arcsine = y;
        /* Restore the sign of the input */
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_store_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    /* Handle the remaining (tail) points with the scalar libm routine */
    number = eighthPoints * 8;
    for(; number < num_points; number++){
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

/* Same algorithm as the AVX2+FMA kernel above, without fused multiply-add */
static inline void
volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for(; number < eighthPoints; number++){
        aVal = _mm256_load_ps(aPtr);
        aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        x = z;
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for(i = 0; i < 2; i++){
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for(j = ASIN_TERMS - 1; j >= 0; j--){
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arcsine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_store_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for(; number < num_points; number++){
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arcsine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for(; number < quarterPoints; number++){
        aVal = _mm_load_ps(aPtr);
        aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        x = z;
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for(i = 0; i < 2; i++){
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        for(j = ASIN_TERMS - 1; j >= 0; j--){
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arcsine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));

        _mm_store_ps(bPtr, arcsine);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for(; number < num_points; number++){
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_asin_32f_a_H */
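/*
 * Added note: the unaligned (_u) SIMD protokernels below mirror the aligned
 * (_a) variants above, differing only in their use of unaligned loads and
 * stores (_mm256_loadu_ps/_mm256_storeu_ps, _mm_loadu_ps/_mm_storeu_ps), so
 * they accept buffers without 32- or 16-byte alignment at some performance
 * cost. A plain C generic fallback is also provided at the end.
 */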

#ifndef INCLUDED_volk_32f_asin_32f_u_H
#define INCLUDED_volk_32f_asin_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/* Unaligned version of volk_32f_asin_32f_a_avx2_fma */
static inline void
volk_32f_asin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for(; number < eighthPoints; number++){
        aVal = _mm256_loadu_ps(aPtr);
        aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        x = z;
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for(i = 0; i < 2; i++){
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for(j = ASIN_TERMS - 1; j >= 0; j--){
            y = _mm256_fmadd_ps(y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arcsine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_storeu_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for(; number < num_points; number++){
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

/* Unaligned version of volk_32f_asin_32f_a_avx */
static inline void
volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arcsine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for(; number < eighthPoints; number++){
        aVal = _mm256_loadu_ps(aPtr);
        aVal = _mm256_div_ps(aVal, _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal), _mm256_sub_ps(fones, aVal))));
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        x = z;
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(x, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for(i = 0; i < 2; i++){
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for(j = ASIN_TERMS - 1; j >= 0; j--){
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arcsine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arcsine = _mm256_sub_ps(arcsine, _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));

        _mm256_storeu_ps(bPtr, arcsine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for(; number < num_points; number++){
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for unaligned */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arcsine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for(; number < quarterPoints; number++){
        aVal = _mm_loadu_ps(aPtr);
        aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        x = z;
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for(i = 0; i < 2; i++){
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        for(j = ASIN_TERMS - 1; j >= 0; j--){
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arcsine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));

        _mm_storeu_ps(bPtr, arcsine);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for(; number < num_points; number++){
        *bPtr++ = asinf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_asin_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for(number = 0; number < num_points; number++){
        *bPtr++ = asinf(*aPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32f_asin_32f_u_H */
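
For reference, a minimal usage sketch of the generic protokernel defined above. This is illustrative only: the input values and length are arbitrary, LV_HAVE_GENERIC is defined by hand here (normally the VOLK build system defines it), the include path is assumed, and a real application would usually call the alignment-aware dispatcher that VOLK generates for this kernel rather than a protokernel directly.

#define LV_HAVE_GENERIC 1
#include "volk_32f_asin_32f.h" /* this header */
#include <stdio.h>

int main(void)
{
    float in[4] = { -1.0f, -0.5f, 0.0f, 0.5f };
    float out[4];

    /* Element-wise arcsine of the four input samples */
    volk_32f_asin_32f_u_generic(out, in, 4);

    for (unsigned int n = 0; n < 4; n++) {
        printf("asin(%f) = %f\n", in[n], out[n]);
    }
    return 0;
}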