Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_x2_pow_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
72 #define INCLUDED_volk_32f_x2_pow_32f_a_H
73 
74 #include <stdio.h>
75 #include <stdlib.h>
76 #include <inttypes.h>
77 #include <math.h>
78 
79 #define POW_POLY_DEGREE 3
80 
81 #if LV_HAVE_AVX2 && LV_HAVE_FMA
82 #include <immintrin.h>
83 
84 #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
85 #define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
86 #define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
87 #define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
88 #define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
89 #define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
90 
91 static inline void
92 volk_32f_x2_pow_32f_a_avx2_fma(float* cVector, const float* bVector,
93  const float* aVector, unsigned int num_points)
94 {
95  float* cPtr = cVector;
96  const float* bPtr = bVector;
97  const float* aPtr = aVector;
98 
99  unsigned int number = 0;
100  const unsigned int eighthPoints = num_points / 8;
101 
102  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
103  __m256 tmp, fx, mask, pow2n, z, y;
104  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
105  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
106  __m256i bias, exp, emm0, pi32_0x7f;
107 
108  one = _mm256_set1_ps(1.0);
109  exp_hi = _mm256_set1_ps(88.3762626647949);
110  exp_lo = _mm256_set1_ps(-88.3762626647949);
111  ln2 = _mm256_set1_ps(0.6931471805);
112  log2EF = _mm256_set1_ps(1.44269504088896341);
113  half = _mm256_set1_ps(0.5);
114  exp_C1 = _mm256_set1_ps(0.693359375);
115  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
116  pi32_0x7f = _mm256_set1_epi32(0x7f);
117 
118  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
119  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
120  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
121  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
122  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
123  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
124 
125  for(;number < eighthPoints; number++){
126  // First compute the logarithm
127  aVal = _mm256_load_ps(aPtr);
128  bias = _mm256_set1_epi32(127);
129  leadingOne = _mm256_set1_ps(1.0f);
130  exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
131  logarithm = _mm256_cvtepi32_ps(exp);
132 
133  frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
134 
135 #if POW_POLY_DEGREE == 6
136  mantissa = POLY5_AVX2_FMA( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
137 #elif POW_POLY_DEGREE == 5
138  mantissa = POLY4_AVX2_FMA( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
139 #elif POW_POLY_DEGREE == 4
140  mantissa = POLY3_AVX2_FMA( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
141 #elif POW_POLY_DEGREE == 3
142  mantissa = POLY2_AVX2_FMA( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
143 #else
144 #error
145 #endif
146 
147  logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
148  logarithm = _mm256_mul_ps(logarithm, ln2);
149 
150  // Now calculate b*lna
151  bVal = _mm256_load_ps(bPtr);
152  bVal = _mm256_mul_ps(bVal, logarithm);
153 
154  // Now compute exp(b*lna)
155  tmp = _mm256_setzero_ps();
156 
157  bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
158 
159  fx = _mm256_fmadd_ps(bVal, log2EF, half);
160 
161  emm0 = _mm256_cvttps_epi32(fx);
162  tmp = _mm256_cvtepi32_ps(emm0);
163 
164  mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, 14), one);
165  fx = _mm256_sub_ps(tmp, mask);
166 
167  tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
168  bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
169  z = _mm256_mul_ps(bVal, bVal);
170 
171  y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
172  y = _mm256_fmadd_ps(y, bVal, exp_p2);
173  y = _mm256_fmadd_ps(y, bVal, exp_p3);
174  y = _mm256_fmadd_ps(y, bVal, exp_p4);
175  y = _mm256_fmadd_ps(y, bVal, exp_p5);
176  y = _mm256_fmadd_ps(y, z, bVal);
177  y = _mm256_add_ps(y, one);
178 
179  emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
180 
181  pow2n = _mm256_castsi256_ps(emm0);
182  cVal = _mm256_mul_ps(y, pow2n);
183 
184  _mm256_store_ps(cPtr, cVal);
185 
186  aPtr += 8;
187  bPtr += 8;
188  cPtr += 8;
189  }
190 
191  number = eighthPoints * 8;
192  for(;number < num_points; number++){
193  *cPtr++ = pow(*aPtr++, *bPtr++);
194  }
195 }
196 
197 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
198 
199 #ifdef LV_HAVE_AVX2
200 #include <immintrin.h>
201 
202 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
203 #define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
204 #define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
205 #define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
206 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
207 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
208 
209 static inline void
210 volk_32f_x2_pow_32f_a_avx2(float* cVector, const float* bVector,
211  const float* aVector, unsigned int num_points)
212 {
213  float* cPtr = cVector;
214  const float* bPtr = bVector;
215  const float* aPtr = aVector;
216 
217  unsigned int number = 0;
218  const unsigned int eighthPoints = num_points / 8;
219 
220  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
221  __m256 tmp, fx, mask, pow2n, z, y;
222  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
223  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
224  __m256i bias, exp, emm0, pi32_0x7f;
225 
226  one = _mm256_set1_ps(1.0);
227  exp_hi = _mm256_set1_ps(88.3762626647949);
228  exp_lo = _mm256_set1_ps(-88.3762626647949);
229  ln2 = _mm256_set1_ps(0.6931471805);
230  log2EF = _mm256_set1_ps(1.44269504088896341);
231  half = _mm256_set1_ps(0.5);
232  exp_C1 = _mm256_set1_ps(0.693359375);
233  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
234  pi32_0x7f = _mm256_set1_epi32(0x7f);
235 
236  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
237  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
238  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
239  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
240  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
241  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
242 
243  for(;number < eighthPoints; number++){
244  // First compute the logarithm
245  aVal = _mm256_load_ps(aPtr);
246  bias = _mm256_set1_epi32(127);
247  leadingOne = _mm256_set1_ps(1.0f);
248  exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
249  logarithm = _mm256_cvtepi32_ps(exp);
250 
251  frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
252 
253 #if POW_POLY_DEGREE == 6
254  mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
255 #elif POW_POLY_DEGREE == 5
256  mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
257 #elif POW_POLY_DEGREE == 4
258  mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
259 #elif POW_POLY_DEGREE == 3
260  mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
261 #else
262 #error
263 #endif
264 
265  logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
266  logarithm = _mm256_mul_ps(logarithm, ln2);
267 
268  // Now calculate b*lna
269  bVal = _mm256_load_ps(bPtr);
270  bVal = _mm256_mul_ps(bVal, logarithm);
271 
272  // Now compute exp(b*lna)
273  tmp = _mm256_setzero_ps();
274 
275  bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
276 
277  fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
278 
279  emm0 = _mm256_cvttps_epi32(fx);
280  tmp = _mm256_cvtepi32_ps(emm0);
281 
282  mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, 14), one);
283  fx = _mm256_sub_ps(tmp, mask);
284 
285  tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
286  bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
287  z = _mm256_mul_ps(bVal, bVal);
288 
289  y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
290  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
291  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
292  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
293  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
294  y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
295  y = _mm256_add_ps(y, one);
296 
297  emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
298 
299  pow2n = _mm256_castsi256_ps(emm0);
300  cVal = _mm256_mul_ps(y, pow2n);
301 
302  _mm256_store_ps(cPtr, cVal);
303 
304  aPtr += 8;
305  bPtr += 8;
306  cPtr += 8;
307  }
308 
309  number = eighthPoints * 8;
310  for(;number < num_points; number++){
311  *cPtr++ = pow(*aPtr++, *bPtr++);
312  }
313 }
314 
315 #endif /* LV_HAVE_AVX2 for aligned */
316 
317 
318 #ifdef LV_HAVE_SSE4_1
319 #include <smmintrin.h>
320 
321 #define POLY0(x, c0) _mm_set1_ps(c0)
322 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
323 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
324 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
325 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
326 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
327 
/* Computes c[i] = a[i] ** b[i] in single precision via exp(b * ln(a)),
 * 4 floats per iteration (SSE, aligned loads/stores).
 *
 * ln(a): the integer part of log2(a) comes from the IEEE-754 exponent field;
 * a POW_POLY_DEGREE polynomial on the mantissa supplies the fraction.
 * exp(): Cephes-style range reduction exp(x) = 2^n * exp(r), |r| <= ln(2)/2.
 *
 * cVector/bVector/aVector must be 16-byte aligned.
 * NOTE(review): the bit-level log assumes a > 0 and finite — other inputs
 * produce meaningless results; confirm callers guarantee this.
 */
static inline void
volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* bVector,
                             const float* aVector, unsigned int num_points)
{
  float* cPtr = cVector;
  const float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
  __m128 tmp, fx, mask, pow2n, z, y;
  __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
  __m128i bias, exp, emm0, pi32_0x7f;

  one = _mm_set1_ps(1.0);
  exp_hi = _mm_set1_ps(88.3762626647949);    /* expf() overflow clamp */
  exp_lo = _mm_set1_ps(-88.3762626647949);   /* expf() underflow clamp */
  ln2 = _mm_set1_ps(0.6931471805);
  log2EF = _mm_set1_ps(1.44269504088896341); /* log2(e) */
  half = _mm_set1_ps(0.5);
  exp_C1 = _mm_set1_ps(0.693359375);         /* ln(2) split, high part (Cephes) */
  exp_C2 = _mm_set1_ps(-2.12194440e-4);      /* ln(2) split, low part */
  pi32_0x7f = _mm_set1_epi32(0x7f);

  /* expf() polynomial coefficients (Cephes) */
  exp_p0 = _mm_set1_ps(1.9875691500e-4);
  exp_p1 = _mm_set1_ps(1.3981999507e-3);
  exp_p2 = _mm_set1_ps(8.3334519073e-3);
  exp_p3 = _mm_set1_ps(4.1665795894e-2);
  exp_p4 = _mm_set1_ps(1.6666665459e-1);
  exp_p5 = _mm_set1_ps(5.0000001201e-1);

  for(;number < quarterPoints; number++){
    /* ---- ln(a) = ln2 * (exponent + poly(mantissa) * (mantissa - 1)) */
    aVal = _mm_load_ps(aPtr);
    bias = _mm_set1_epi32(127);   /* IEEE-754 exponent bias */
    leadingOne = _mm_set1_ps(1.0f);
    exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
    logarithm = _mm_cvtepi32_ps(exp);

    /* mantissa in [1, 2): raw fraction bits OR'ed with 1.0f */
    frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
    mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
    mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
    mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
    mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
    logarithm = _mm_mul_ps(logarithm, ln2);


    /* ---- b * ln(a) */
    bVal = _mm_load_ps(bPtr);
    bVal = _mm_mul_ps(bVal, logarithm);

    /* ---- exp(b * ln(a)), clamped to the representable range */
    tmp = _mm_setzero_ps();

    bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

    /* n = floor(x * log2(e) + 0.5), via truncate-then-correct */
    fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

    emm0 = _mm_cvttps_epi32(fx);
    tmp = _mm_cvtepi32_ps(emm0);

    /* where truncation rounded up, subtract 1 */
    mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
    fx = _mm_sub_ps(tmp, mask);

    /* r = x - n*ln(2), using the two-constant split for extra precision */
    tmp = _mm_mul_ps(fx, exp_C1);
    z = _mm_mul_ps(fx, exp_C2);
    bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
    z = _mm_mul_ps(bVal, bVal);

    /* polynomial approximation of exp(r); factored form equivalent to Horner */
    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
    y = _mm_add_ps(y, one);

    /* 2^n by writing the float exponent field directly */
    emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

    pow2n = _mm_castsi128_ps(emm0);
    cVal = _mm_mul_ps(y, pow2n);

    _mm_store_ps(cPtr, cVal);

    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  /* scalar tail for the remaining 0-3 elements */
  number = quarterPoints * 4;
  for(;number < num_points; number++){
    *cPtr++ = powf(*aPtr++, *bPtr++);
  }
}
433 
434 #endif /* LV_HAVE_SSE4_1 for aligned */
435 
436 #endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */
437 
438 #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
439 #define INCLUDED_volk_32f_x2_pow_32f_u_H
440 
441 #include <stdio.h>
442 #include <stdlib.h>
443 #include <inttypes.h>
444 #include <math.h>
445 
446 #define POW_POLY_DEGREE 3
447 
448 #ifdef LV_HAVE_GENERIC
449 
450 static inline void
451 volk_32f_x2_pow_32f_generic(float* cVector, const float* bVector,
452  const float* aVector, unsigned int num_points)
453 {
454  float* cPtr = cVector;
455  const float* bPtr = bVector;
456  const float* aPtr = aVector;
457  unsigned int number = 0;
458 
459  for(number = 0; number < num_points; number++){
460  *cPtr++ = powf(*aPtr++, *bPtr++);
461  }
462 }
463 #endif /* LV_HAVE_GENERIC */
464 
465 
466 #ifdef LV_HAVE_SSE4_1
467 #include <smmintrin.h>
468 
469 #define POLY0(x, c0) _mm_set1_ps(c0)
470 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
471 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
472 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
473 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
474 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
475 
476 static inline void
477 volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* bVector,
478  const float* aVector, unsigned int num_points)
479 {
480  float* cPtr = cVector;
481  const float* bPtr = bVector;
482  const float* aPtr = aVector;
483 
484  unsigned int number = 0;
485  const unsigned int quarterPoints = num_points / 4;
486 
487  __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
488  __m128 tmp, fx, mask, pow2n, z, y;
489  __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
490  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
491  __m128i bias, exp, emm0, pi32_0x7f;
492 
493  one = _mm_set1_ps(1.0);
494  exp_hi = _mm_set1_ps(88.3762626647949);
495  exp_lo = _mm_set1_ps(-88.3762626647949);
496  ln2 = _mm_set1_ps(0.6931471805);
497  log2EF = _mm_set1_ps(1.44269504088896341);
498  half = _mm_set1_ps(0.5);
499  exp_C1 = _mm_set1_ps(0.693359375);
500  exp_C2 = _mm_set1_ps(-2.12194440e-4);
501  pi32_0x7f = _mm_set1_epi32(0x7f);
502 
503  exp_p0 = _mm_set1_ps(1.9875691500e-4);
504  exp_p1 = _mm_set1_ps(1.3981999507e-3);
505  exp_p2 = _mm_set1_ps(8.3334519073e-3);
506  exp_p3 = _mm_set1_ps(4.1665795894e-2);
507  exp_p4 = _mm_set1_ps(1.6666665459e-1);
508  exp_p5 = _mm_set1_ps(5.0000001201e-1);
509 
510  for(;number < quarterPoints; number++){
511  // First compute the logarithm
512  aVal = _mm_loadu_ps(aPtr);
513  bias = _mm_set1_epi32(127);
514  leadingOne = _mm_set1_ps(1.0f);
515  exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
516  logarithm = _mm_cvtepi32_ps(exp);
517 
518  frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
519 
520 #if POW_POLY_DEGREE == 6
521  mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
522 #elif POW_POLY_DEGREE == 5
523  mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
524 #elif POW_POLY_DEGREE == 4
525  mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
526 #elif POW_POLY_DEGREE == 3
527  mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
528 #else
529 #error
530 #endif
531 
532  logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
533  logarithm = _mm_mul_ps(logarithm, ln2);
534 
535 
536  // Now calculate b*lna
537  bVal = _mm_loadu_ps(bPtr);
538  bVal = _mm_mul_ps(bVal, logarithm);
539 
540  // Now compute exp(b*lna)
541  tmp = _mm_setzero_ps();
542 
543  bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
544 
545  fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
546 
547  emm0 = _mm_cvttps_epi32(fx);
548  tmp = _mm_cvtepi32_ps(emm0);
549 
550  mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
551  fx = _mm_sub_ps(tmp, mask);
552 
553  tmp = _mm_mul_ps(fx, exp_C1);
554  z = _mm_mul_ps(fx, exp_C2);
555  bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
556  z = _mm_mul_ps(bVal, bVal);
557 
558  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
559  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
560  y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
561  y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
562  y = _mm_add_ps(y, one);
563 
564  emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
565 
566  pow2n = _mm_castsi128_ps(emm0);
567  cVal = _mm_mul_ps(y, pow2n);
568 
569  _mm_storeu_ps(cPtr, cVal);
570 
571  aPtr += 4;
572  bPtr += 4;
573  cPtr += 4;
574  }
575 
576  number = quarterPoints * 4;
577  for(;number < num_points; number++){
578  *cPtr++ = powf(*aPtr++, *bPtr++);
579  }
580 }
581 
582 #endif /* LV_HAVE_SSE4_1 for unaligned */
583 
584 #if LV_HAVE_AVX2 && LV_HAVE_FMA
585 #include <immintrin.h>
586 
587 #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
588 #define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
589 #define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
590 #define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
591 #define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
592 #define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
593 
594 static inline void
595 volk_32f_x2_pow_32f_u_avx2_fma(float* cVector, const float* bVector,
596  const float* aVector, unsigned int num_points)
597 {
598  float* cPtr = cVector;
599  const float* bPtr = bVector;
600  const float* aPtr = aVector;
601 
602  unsigned int number = 0;
603  const unsigned int eighthPoints = num_points / 8;
604 
605  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
606  __m256 tmp, fx, mask, pow2n, z, y;
607  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
608  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
609  __m256i bias, exp, emm0, pi32_0x7f;
610 
611  one = _mm256_set1_ps(1.0);
612  exp_hi = _mm256_set1_ps(88.3762626647949);
613  exp_lo = _mm256_set1_ps(-88.3762626647949);
614  ln2 = _mm256_set1_ps(0.6931471805);
615  log2EF = _mm256_set1_ps(1.44269504088896341);
616  half = _mm256_set1_ps(0.5);
617  exp_C1 = _mm256_set1_ps(0.693359375);
618  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
619  pi32_0x7f = _mm256_set1_epi32(0x7f);
620 
621  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
622  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
623  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
624  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
625  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
626  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
627 
628  for(;number < eighthPoints; number++){
629  // First compute the logarithm
630  aVal = _mm256_loadu_ps(aPtr);
631  bias = _mm256_set1_epi32(127);
632  leadingOne = _mm256_set1_ps(1.0f);
633  exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
634  logarithm = _mm256_cvtepi32_ps(exp);
635 
636  frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
637 
638 #if POW_POLY_DEGREE == 6
639  mantissa = POLY5_AVX2_FMA( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
640 #elif POW_POLY_DEGREE == 5
641  mantissa = POLY4_AVX2_FMA( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
642 #elif POW_POLY_DEGREE == 4
643  mantissa = POLY3_AVX2_FMA( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
644 #elif POW_POLY_DEGREE == 3
645  mantissa = POLY2_AVX2_FMA( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
646 #else
647 #error
648 #endif
649 
650  logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
651  logarithm = _mm256_mul_ps(logarithm, ln2);
652 
653 
654  // Now calculate b*lna
655  bVal = _mm256_loadu_ps(bPtr);
656  bVal = _mm256_mul_ps(bVal, logarithm);
657 
658  // Now compute exp(b*lna)
659  tmp = _mm256_setzero_ps();
660 
661  bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
662 
663  fx = _mm256_fmadd_ps(bVal, log2EF, half);
664 
665  emm0 = _mm256_cvttps_epi32(fx);
666  tmp = _mm256_cvtepi32_ps(emm0);
667 
668  mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, 14), one);
669  fx = _mm256_sub_ps(tmp, mask);
670 
671  tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
672  bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
673  z = _mm256_mul_ps(bVal, bVal);
674 
675  y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
676  y = _mm256_fmadd_ps(y, bVal, exp_p2);
677  y = _mm256_fmadd_ps(y, bVal, exp_p3);
678  y = _mm256_fmadd_ps(y, bVal, exp_p4);
679  y = _mm256_fmadd_ps(y, bVal, exp_p5);
680  y = _mm256_fmadd_ps(y, z, bVal);
681  y = _mm256_add_ps(y, one);
682 
683  emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
684 
685  pow2n = _mm256_castsi256_ps(emm0);
686  cVal = _mm256_mul_ps(y, pow2n);
687 
688  _mm256_storeu_ps(cPtr, cVal);
689 
690  aPtr += 8;
691  bPtr += 8;
692  cPtr += 8;
693  }
694 
695  number = eighthPoints * 8;
696  for(;number < num_points; number++){
697  *cPtr++ = pow(*aPtr++, *bPtr++);
698  }
699 }
700 
701 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
702 
703 #ifdef LV_HAVE_AVX2
704 #include <immintrin.h>
705 
706 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
707 #define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
708 #define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
709 #define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
710 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
711 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
712 
713 static inline void
714 volk_32f_x2_pow_32f_u_avx2(float* cVector, const float* bVector,
715  const float* aVector, unsigned int num_points)
716 {
717  float* cPtr = cVector;
718  const float* bPtr = bVector;
719  const float* aPtr = aVector;
720 
721  unsigned int number = 0;
722  const unsigned int eighthPoints = num_points / 8;
723 
724  __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
725  __m256 tmp, fx, mask, pow2n, z, y;
726  __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
727  __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
728  __m256i bias, exp, emm0, pi32_0x7f;
729 
730  one = _mm256_set1_ps(1.0);
731  exp_hi = _mm256_set1_ps(88.3762626647949);
732  exp_lo = _mm256_set1_ps(-88.3762626647949);
733  ln2 = _mm256_set1_ps(0.6931471805);
734  log2EF = _mm256_set1_ps(1.44269504088896341);
735  half = _mm256_set1_ps(0.5);
736  exp_C1 = _mm256_set1_ps(0.693359375);
737  exp_C2 = _mm256_set1_ps(-2.12194440e-4);
738  pi32_0x7f = _mm256_set1_epi32(0x7f);
739 
740  exp_p0 = _mm256_set1_ps(1.9875691500e-4);
741  exp_p1 = _mm256_set1_ps(1.3981999507e-3);
742  exp_p2 = _mm256_set1_ps(8.3334519073e-3);
743  exp_p3 = _mm256_set1_ps(4.1665795894e-2);
744  exp_p4 = _mm256_set1_ps(1.6666665459e-1);
745  exp_p5 = _mm256_set1_ps(5.0000001201e-1);
746 
747  for(;number < eighthPoints; number++){
748  // First compute the logarithm
749  aVal = _mm256_loadu_ps(aPtr);
750  bias = _mm256_set1_epi32(127);
751  leadingOne = _mm256_set1_ps(1.0f);
752  exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
753  logarithm = _mm256_cvtepi32_ps(exp);
754 
755  frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
756 
757 #if POW_POLY_DEGREE == 6
758  mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
759 #elif POW_POLY_DEGREE == 5
760  mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
761 #elif POW_POLY_DEGREE == 4
762  mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
763 #elif POW_POLY_DEGREE == 3
764  mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
765 #else
766 #error
767 #endif
768 
769  logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
770  logarithm = _mm256_mul_ps(logarithm, ln2);
771 
772  // Now calculate b*lna
773  bVal = _mm256_loadu_ps(bPtr);
774  bVal = _mm256_mul_ps(bVal, logarithm);
775 
776  // Now compute exp(b*lna)
777  tmp = _mm256_setzero_ps();
778 
779  bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
780 
781  fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
782 
783  emm0 = _mm256_cvttps_epi32(fx);
784  tmp = _mm256_cvtepi32_ps(emm0);
785 
786  mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, 14), one);
787  fx = _mm256_sub_ps(tmp, mask);
788 
789  tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
790  bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
791  z = _mm256_mul_ps(bVal, bVal);
792 
793  y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
794  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
795  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
796  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
797  y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
798  y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
799  y = _mm256_add_ps(y, one);
800 
801  emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
802 
803  pow2n = _mm256_castsi256_ps(emm0);
804  cVal = _mm256_mul_ps(y, pow2n);
805 
806  _mm256_storeu_ps(cPtr, cVal);
807 
808  aPtr += 8;
809  bPtr += 8;
810  cPtr += 8;
811  }
812 
813  number = eighthPoints * 8;
814  for(;number < num_points; number++){
815  *cPtr++ = pow(*aPtr++, *bPtr++);
816  }
817 }
818 
819 #endif /* LV_HAVE_AVX2 for unaligned */
820 
821 #endif /* INCLUDED_volk_32f_x2_pow_32f_u_H */
static inline void volk_32f_x2_pow_32f_generic(float *cVector, const float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_x2_pow_32f.h:451