71 #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H 72 #define INCLUDED_volk_32f_x2_pow_32f_a_H 79 #define POW_POLY_DEGREE 3 81 #if LV_HAVE_AVX2 && LV_HAVE_FMA 82 #include <immintrin.h> 84 #define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0) 85 #define POLY1_AVX2_FMA(x, c0, c1) _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0)) 86 #define POLY2_AVX2_FMA(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0)) 87 #define POLY3_AVX2_FMA(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0)) 88 #define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0)) 89 #define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0)) 92 volk_32f_x2_pow_32f_a_avx2_fma(
float* cVector,
const float* bVector,
93 const float* aVector,
unsigned int num_points)
95 float* cPtr = cVector;
96 const float* bPtr = bVector;
97 const float* aPtr = aVector;
99 unsigned int number = 0;
100 const unsigned int eighthPoints = num_points / 8;
102 __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
103 __m256 tmp, fx, mask, pow2n, z, y;
104 __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
105 __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
106 __m256i bias, exp, emm0, pi32_0x7f;
108 one = _mm256_set1_ps(1.0);
109 exp_hi = _mm256_set1_ps(88.3762626647949);
110 exp_lo = _mm256_set1_ps(-88.3762626647949);
111 ln2 = _mm256_set1_ps(0.6931471805);
112 log2EF = _mm256_set1_ps(1.44269504088896341);
113 half = _mm256_set1_ps(0.5);
114 exp_C1 = _mm256_set1_ps(0.693359375);
115 exp_C2 = _mm256_set1_ps(-2.12194440e-4);
116 pi32_0x7f = _mm256_set1_epi32(0x7f);
118 exp_p0 = _mm256_set1_ps(1.9875691500e-4);
119 exp_p1 = _mm256_set1_ps(1.3981999507e-3);
120 exp_p2 = _mm256_set1_ps(8.3334519073e-3);
121 exp_p3 = _mm256_set1_ps(4.1665795894e-2);
122 exp_p4 = _mm256_set1_ps(1.6666665459e-1);
123 exp_p5 = _mm256_set1_ps(5.0000001201e-1);
125 for(;number < eighthPoints; number++){
127 aVal = _mm256_load_ps(aPtr);
128 bias = _mm256_set1_epi32(127);
129 leadingOne = _mm256_set1_ps(1.0f);
130 exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
131 logarithm = _mm256_cvtepi32_ps(exp);
133 frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
135 #if POW_POLY_DEGREE == 6 136 mantissa = POLY5_AVX2_FMA( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
137 #elif POW_POLY_DEGREE == 5 138 mantissa = POLY4_AVX2_FMA( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
139 #elif POW_POLY_DEGREE == 4 140 mantissa = POLY3_AVX2_FMA( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
141 #elif POW_POLY_DEGREE == 3 142 mantissa = POLY2_AVX2_FMA( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
147 logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
148 logarithm = _mm256_mul_ps(logarithm, ln2);
151 bVal = _mm256_load_ps(bPtr);
152 bVal = _mm256_mul_ps(bVal, logarithm);
155 tmp = _mm256_setzero_ps();
157 bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
159 fx = _mm256_fmadd_ps(bVal, log2EF, half);
161 emm0 = _mm256_cvttps_epi32(fx);
162 tmp = _mm256_cvtepi32_ps(emm0);
164 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, 14), one);
165 fx = _mm256_sub_ps(tmp, mask);
167 tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
168 bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
169 z = _mm256_mul_ps(bVal, bVal);
171 y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
172 y = _mm256_fmadd_ps(y, bVal, exp_p2);
173 y = _mm256_fmadd_ps(y, bVal, exp_p3);
174 y = _mm256_fmadd_ps(y, bVal, exp_p4);
175 y = _mm256_fmadd_ps(y, bVal, exp_p5);
176 y = _mm256_fmadd_ps(y, z, bVal);
177 y = _mm256_add_ps(y, one);
179 emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
181 pow2n = _mm256_castsi256_ps(emm0);
182 cVal = _mm256_mul_ps(y, pow2n);
184 _mm256_store_ps(cPtr, cVal);
191 number = eighthPoints * 8;
192 for(;number < num_points; number++){
193 *cPtr++ = pow(*aPtr++, *bPtr++);
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/* Horner evaluation of a degree-N polynomial (mul/add, no FMA):
 * POLYn(x, c0..cn) = c0 + x*(c1 + x*(... + x*cn)) */
#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

/*!
 * \brief Raises each point of aVector to the power in bVector: c[i] = a[i]^b[i].
 *
 * AVX2 (no FMA) variant of the aligned kernel; same exp(b*log(a)) scheme as
 * the FMA version but with separate multiply/add intrinsics.
 *
 * \param cVector    output vector (32-byte aligned)
 * \param bVector    exponents (32-byte aligned)
 * \param aVector    bases (32-byte aligned)
 * \param num_points number of float values to process
 */
static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        /* log(a): biased exponent from the float bit pattern */
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f, -3.3241990f, 2.5988452f,
                              -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f, -2.52074962577807006663f,
                              1.48116647521213171641f, -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f, -1.75647175389045657003f,
                              0.688243882994381274313f, -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f, -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5 or 6"
#endif

        logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)),
                                  logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        /* b * log(a) */
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        /* exp(x): clamp, split off n = floor(x/ln2 + 0.5), poly on remainder */
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);

        emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        /* powf, not pow: single-precision tail, consistent with the SSE variants */
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 */
318 #ifdef LV_HAVE_SSE4_1 319 #include <smmintrin.h> 321 #define POLY0(x, c0) _mm_set1_ps(c0) 322 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) 323 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) 324 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) 325 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) 326 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) 329 volk_32f_x2_pow_32f_a_sse4_1(
float* cVector,
const float* bVector,
330 const float* aVector,
unsigned int num_points)
332 float* cPtr = cVector;
333 const float* bPtr = bVector;
334 const float* aPtr = aVector;
336 unsigned int number = 0;
337 const unsigned int quarterPoints = num_points / 4;
339 __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
340 __m128 tmp, fx, mask, pow2n, z, y;
341 __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
342 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
343 __m128i bias, exp, emm0, pi32_0x7f;
345 one = _mm_set1_ps(1.0);
346 exp_hi = _mm_set1_ps(88.3762626647949);
347 exp_lo = _mm_set1_ps(-88.3762626647949);
348 ln2 = _mm_set1_ps(0.6931471805);
349 log2EF = _mm_set1_ps(1.44269504088896341);
350 half = _mm_set1_ps(0.5);
351 exp_C1 = _mm_set1_ps(0.693359375);
352 exp_C2 = _mm_set1_ps(-2.12194440e-4);
353 pi32_0x7f = _mm_set1_epi32(0x7f);
355 exp_p0 = _mm_set1_ps(1.9875691500e-4);
356 exp_p1 = _mm_set1_ps(1.3981999507e-3);
357 exp_p2 = _mm_set1_ps(8.3334519073e-3);
358 exp_p3 = _mm_set1_ps(4.1665795894e-2);
359 exp_p4 = _mm_set1_ps(1.6666665459e-1);
360 exp_p5 = _mm_set1_ps(5.0000001201e-1);
362 for(;number < quarterPoints; number++){
364 aVal = _mm_load_ps(aPtr);
365 bias = _mm_set1_epi32(127);
366 leadingOne = _mm_set1_ps(1.0f);
367 exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
368 logarithm = _mm_cvtepi32_ps(exp);
370 frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
372 #if POW_POLY_DEGREE == 6 373 mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
374 #elif POW_POLY_DEGREE == 5 375 mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
376 #elif POW_POLY_DEGREE == 4 377 mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
378 #elif POW_POLY_DEGREE == 3 379 mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
384 logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
385 logarithm = _mm_mul_ps(logarithm, ln2);
389 bVal = _mm_load_ps(bPtr);
390 bVal = _mm_mul_ps(bVal, logarithm);
393 tmp = _mm_setzero_ps();
395 bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
397 fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
399 emm0 = _mm_cvttps_epi32(fx);
400 tmp = _mm_cvtepi32_ps(emm0);
402 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
403 fx = _mm_sub_ps(tmp, mask);
405 tmp = _mm_mul_ps(fx, exp_C1);
406 z = _mm_mul_ps(fx, exp_C2);
407 bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
408 z = _mm_mul_ps(bVal, bVal);
410 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
411 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
412 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
413 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
414 y = _mm_add_ps(y, one);
416 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
418 pow2n = _mm_castsi128_ps(emm0);
419 cVal = _mm_mul_ps(y, pow2n);
421 _mm_store_ps(cPtr, cVal);
428 number = quarterPoints * 4;
429 for(;number < num_points; number++){
430 *cPtr++ = powf(*aPtr++, *bPtr++);
438 #ifndef INCLUDED_volk_32f_x2_pow_32f_u_H 439 #define INCLUDED_volk_32f_x2_pow_32f_u_H 443 #include <inttypes.h> 446 #define POW_POLY_DEGREE 3 448 #ifdef LV_HAVE_GENERIC 452 const float* aVector,
unsigned int num_points)
454 float* cPtr = cVector;
455 const float* bPtr = bVector;
456 const float* aPtr = aVector;
457 unsigned int number = 0;
459 for(number = 0; number < num_points; number++){
460 *cPtr++ = powf(*aPtr++, *bPtr++);
466 #ifdef LV_HAVE_SSE4_1 467 #include <smmintrin.h> 469 #define POLY0(x, c0) _mm_set1_ps(c0) 470 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) 471 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) 472 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) 473 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) 474 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) 477 volk_32f_x2_pow_32f_u_sse4_1(
float* cVector,
const float* bVector,
478 const float* aVector,
unsigned int num_points)
480 float* cPtr = cVector;
481 const float* bPtr = bVector;
482 const float* aPtr = aVector;
484 unsigned int number = 0;
485 const unsigned int quarterPoints = num_points / 4;
487 __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
488 __m128 tmp, fx, mask, pow2n, z, y;
489 __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
490 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
491 __m128i bias, exp, emm0, pi32_0x7f;
493 one = _mm_set1_ps(1.0);
494 exp_hi = _mm_set1_ps(88.3762626647949);
495 exp_lo = _mm_set1_ps(-88.3762626647949);
496 ln2 = _mm_set1_ps(0.6931471805);
497 log2EF = _mm_set1_ps(1.44269504088896341);
498 half = _mm_set1_ps(0.5);
499 exp_C1 = _mm_set1_ps(0.693359375);
500 exp_C2 = _mm_set1_ps(-2.12194440e-4);
501 pi32_0x7f = _mm_set1_epi32(0x7f);
503 exp_p0 = _mm_set1_ps(1.9875691500e-4);
504 exp_p1 = _mm_set1_ps(1.3981999507e-3);
505 exp_p2 = _mm_set1_ps(8.3334519073e-3);
506 exp_p3 = _mm_set1_ps(4.1665795894e-2);
507 exp_p4 = _mm_set1_ps(1.6666665459e-1);
508 exp_p5 = _mm_set1_ps(5.0000001201e-1);
510 for(;number < quarterPoints; number++){
512 aVal = _mm_loadu_ps(aPtr);
513 bias = _mm_set1_epi32(127);
514 leadingOne = _mm_set1_ps(1.0f);
515 exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
516 logarithm = _mm_cvtepi32_ps(exp);
518 frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
520 #if POW_POLY_DEGREE == 6 521 mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
522 #elif POW_POLY_DEGREE == 5 523 mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
524 #elif POW_POLY_DEGREE == 4 525 mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
526 #elif POW_POLY_DEGREE == 3 527 mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
532 logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
533 logarithm = _mm_mul_ps(logarithm, ln2);
537 bVal = _mm_loadu_ps(bPtr);
538 bVal = _mm_mul_ps(bVal, logarithm);
541 tmp = _mm_setzero_ps();
543 bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
545 fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
547 emm0 = _mm_cvttps_epi32(fx);
548 tmp = _mm_cvtepi32_ps(emm0);
550 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
551 fx = _mm_sub_ps(tmp, mask);
553 tmp = _mm_mul_ps(fx, exp_C1);
554 z = _mm_mul_ps(fx, exp_C2);
555 bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
556 z = _mm_mul_ps(bVal, bVal);
558 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
559 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
560 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
561 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
562 y = _mm_add_ps(y, one);
564 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
566 pow2n = _mm_castsi128_ps(emm0);
567 cVal = _mm_mul_ps(y, pow2n);
569 _mm_storeu_ps(cPtr, cVal);
576 number = quarterPoints * 4;
577 for(;number < num_points; number++){
578 *cPtr++ = powf(*aPtr++, *bPtr++);
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/* Horner evaluation of a degree-N polynomial using FMA:
 * POLYn(x, c0..cn) = c0 + x*(c1 + x*(... + x*cn)) */
#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

/*!
 * \brief Raises each point of aVector to the power in bVector: c[i] = a[i]^b[i].
 *
 * AVX2 + FMA unaligned variant — same math as the aligned kernel but with
 * unaligned loads/stores, so the buffers need no particular alignment.
 *
 * \param cVector    output vector
 * \param bVector    exponents
 * \param aVector    bases
 * \param num_points number of float values to process
 */
static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        /* log(a): biased exponent from the float bit pattern */
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f, -3.3241990f, 2.5988452f,
                                  -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f, -2.52074962577807006663f,
                                  1.48116647521213171641f, -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f, -1.75647175389045657003f,
                                  0.688243882994381274313f, -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f, -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5 or 6"
#endif

        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        /* b * log(a) */
        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        /* exp(x): clamp, range-reduce by n = floor(x/ln2 + 0.5) */
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

        emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        /* powf, not pow: single-precision tail, consistent with the SSE variants */
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
704 #include <immintrin.h> 706 #define POLY0_AVX2(x, c0) _mm256_set1_ps(c0) 707 #define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0)) 708 #define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0)) 709 #define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0)) 710 #define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0)) 711 #define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0)) 714 volk_32f_x2_pow_32f_u_avx2(
float* cVector,
const float* bVector,
715 const float* aVector,
unsigned int num_points)
717 float* cPtr = cVector;
718 const float* bPtr = bVector;
719 const float* aPtr = aVector;
721 unsigned int number = 0;
722 const unsigned int eighthPoints = num_points / 8;
724 __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
725 __m256 tmp, fx, mask, pow2n, z, y;
726 __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
727 __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
728 __m256i bias, exp, emm0, pi32_0x7f;
730 one = _mm256_set1_ps(1.0);
731 exp_hi = _mm256_set1_ps(88.3762626647949);
732 exp_lo = _mm256_set1_ps(-88.3762626647949);
733 ln2 = _mm256_set1_ps(0.6931471805);
734 log2EF = _mm256_set1_ps(1.44269504088896341);
735 half = _mm256_set1_ps(0.5);
736 exp_C1 = _mm256_set1_ps(0.693359375);
737 exp_C2 = _mm256_set1_ps(-2.12194440e-4);
738 pi32_0x7f = _mm256_set1_epi32(0x7f);
740 exp_p0 = _mm256_set1_ps(1.9875691500e-4);
741 exp_p1 = _mm256_set1_ps(1.3981999507e-3);
742 exp_p2 = _mm256_set1_ps(8.3334519073e-3);
743 exp_p3 = _mm256_set1_ps(4.1665795894e-2);
744 exp_p4 = _mm256_set1_ps(1.6666665459e-1);
745 exp_p5 = _mm256_set1_ps(5.0000001201e-1);
747 for(;number < eighthPoints; number++){
749 aVal = _mm256_loadu_ps(aPtr);
750 bias = _mm256_set1_epi32(127);
751 leadingOne = _mm256_set1_ps(1.0f);
752 exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
753 logarithm = _mm256_cvtepi32_ps(exp);
755 frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
757 #if POW_POLY_DEGREE == 6 758 mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
759 #elif POW_POLY_DEGREE == 5 760 mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
761 #elif POW_POLY_DEGREE == 4 762 mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
763 #elif POW_POLY_DEGREE == 3 764 mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
769 logarithm = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
770 logarithm = _mm256_mul_ps(logarithm, ln2);
773 bVal = _mm256_loadu_ps(bPtr);
774 bVal = _mm256_mul_ps(bVal, logarithm);
777 tmp = _mm256_setzero_ps();
779 bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
781 fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
783 emm0 = _mm256_cvttps_epi32(fx);
784 tmp = _mm256_cvtepi32_ps(emm0);
786 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, 14), one);
787 fx = _mm256_sub_ps(tmp, mask);
789 tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
790 bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
791 z = _mm256_mul_ps(bVal, bVal);
793 y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
794 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
795 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
796 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
797 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
798 y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
799 y = _mm256_add_ps(y, one);
801 emm0 = _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
803 pow2n = _mm256_castsi256_ps(emm0);
804 cVal = _mm256_mul_ps(y, pow2n);
806 _mm256_storeu_ps(cPtr, cVal);
813 number = eighthPoints * 8;
814 for(;number < num_points; number++){
815 *cPtr++ = pow(*aPtr++, *bPtr++);
/* Doxygen cross-reference residue (not source code):
 * static void volk_32f_x2_pow_32f_generic(float *cVector, const float *bVector,
 *                                         const float *aVector, unsigned int num_points)
 * Definition: volk_32f_x2_pow_32f.h:451
 */