volk_32f_x2_pow_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
#define INCLUDED_volk_32f_x2_pow_32f_a_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define POW_POLY_DEGREE 3
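
/*
 * This kernel computes c[i] = pow(a[i], b[i]) as exp(b[i] * ln(a[i])).
 * ln(a) is obtained by extracting the IEEE-754 exponent field of a and
 * correcting it with a degree-POW_POLY_DEGREE polynomial in the mantissa;
 * exp() uses Cody-Waite style range reduction and a degree-5 polynomial.
 * Raising POW_POLY_DEGREE (supported values are 3 through 6) trades speed
 * for accuracy in the logarithm stage.
 */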

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
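
/*
 * The POLYn macros expand to a Horner-scheme polynomial evaluation built
 * from fused multiply-adds, e.g. POLY2_AVX2_FMA(x, c0, c1, c2) computes
 * c0 + x * (c1 + x * c2) as fmadd(fmadd(c2, x, c1), x, c0).
 */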

static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
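    // exp_hi/exp_lo clamp b * ln(a) so that the 2^n scaling at the end of
    // the exp() stage stays within the finite range of a single float.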
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
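        // The biased exponent of each float sits in bits 30..23: mask with
        // 0x7f800000, shift right by 23 and subtract the bias (127) to get
        // floor(log2(a)). OR-ing the 23 stored mantissa bits back onto 1.0f
        // then yields the mantissa as a float in [1.0, 2.0) for the
        // polynomial correction below. (Inputs must be positive and finite.)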
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a))
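        // exp() by range reduction: with n = floor(bVal * log2(e) + 0.5),
        // write bVal = n * ln(2) + r. n * ln(2) is subtracted in two steps
        // (exp_C1 + exp_C2 sum to ln(2) with extra precision, Cody-Waite
        // style), e^r is approximated as 1 + r + r^2 * P(r) with a degree-5
        // polynomial, and 2^n is assembled directly in the exponent field by
        // adding the bias 0x7f and shifting left by 23.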
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a))
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        // First compute the logarithm
        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm_load_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a))
        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);

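        // Same degree-5 polynomial as in the AVX2 kernels, factored
        // differently but algebraically equivalent: the result is still
        // 1 + r + r^2 * P(r), evaluated without fused multiply-adds.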
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_store_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */

#ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
#define INCLUDED_volk_32f_x2_pow_32f_u_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define POW_POLY_DEGREE 3

#ifdef LV_HAVE_GENERIC

static inline void volk_32f_x2_pow_32f_generic(float* cVector,
                                               const float* bVector,
                                               const float* aVector,
                                               unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
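
/*
 * A minimal usage sketch. It assumes the usual VOLK public API from
 * <volk/volk.h> (the volk_32f_x2_pow_32f() dispatcher plus the
 * volk_get_alignment()/volk_malloc()/volk_free() helpers); adjust to taste:
 *
 *   unsigned int N = 1024;
 *   size_t alignment = volk_get_alignment();
 *   float* bases = (float*)volk_malloc(sizeof(float) * N, alignment);
 *   float* exponents = (float*)volk_malloc(sizeof(float) * N, alignment);
 *   float* out = (float*)volk_malloc(sizeof(float) * N, alignment);
 *
 *   for (unsigned int i = 0; i < N; i++) {
 *       bases[i] = 1.0f + i / (float)N; // keep bases strictly positive
 *       exponents[i] = 2.0f;
 *   }
 *
 *   // out[i] = pow(bases[i], exponents[i]); note the argument order:
 *   // output first, then the exponent vector, then the base vector.
 *   volk_32f_x2_pow_32f(out, exponents, bases, N);
 *
 *   volk_free(bases);
 *   volk_free(exponents);
 *   volk_free(out);
 */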


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        // First compute the logarithm
        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm_loadu_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a))
        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);

        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_storeu_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a))
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a))
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 for unaligned */

#endif /* INCLUDED_volk_32f_x2_pow_32f_u_H */