Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_log2_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
89 #ifndef INCLUDED_volk_32f_log2_32f_a_H
90 #define INCLUDED_volk_32f_log2_32f_a_H
91 
92 #include <stdio.h>
93 #include <stdlib.h>
94 #include <inttypes.h>
95 #include <math.h>
96 
97 #define LOG_POLY_DEGREE 6
98 
99 #if LV_HAVE_AVX2 && LV_HAVE_FMA
100 #include <immintrin.h>
101 
/* POLYn_FMAAVX2(x, c0, ..., cn) expands to a Horner-rule evaluation of the
 * degree-n polynomial c0 + c1*x + ... + cn*x^n using AVX2 fused multiply-adds. */
#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

/*!
 * \brief Computes log2 of each float in \p aVector (aligned AVX2+FMA kernel).
 *
 * \param bVector    output buffer of \p num_points floats (aligned for _mm256_store_ps)
 * \param aVector    input buffer of \p num_points floats (aligned for _mm256_load_ps)
 * \param num_points number of elements to process
 *
 * log2(x) is computed as exponent + log2(mantissa): the biased IEEE-754
 * exponent is extracted with integer ops, and the mantissa (normalized into
 * [1,2) by OR-ing in the implicit leading one) goes through a polynomial
 * approximation selected by LOG_POLY_DEGREE.  The vector loop applies the bit
 * manipulation unconditionally, so zero/negative/NaN inputs receive no special
 * treatment here.  The tail (num_points % 8) falls back to scalar log2f.
 */
static inline void
volk_32f_log2_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;  // count of full 8-wide vectors

  __m256 aVal, bVal, mantissa, frac, leadingOne;
  __m256i bias, exp;

  for(;number < eighthPoints; number++){

    aVal = _mm256_load_ps(aPtr);
    bias = _mm256_set1_epi32(127);      // IEEE-754 single-precision exponent bias
    leadingOne = _mm256_set1_ps(1.0f);
    // Isolate exponent bits 30:23, shift them down to integer position, debias.
    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
    bVal = _mm256_cvtepi32_ps(exp);     // integer part of the result

    // Now to extract mantissa: OR the 23 mantissa bits with 1.0f so frac lies in [1,2)
    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
    mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
    mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
    mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
    mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    // result = exponent + poly(frac) * (frac - 1), fused into a single FMA
    bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
    _mm256_store_ps(bPtr, bVal);

    aPtr += 8;
    bPtr += 8;
  }

  // Scalar tail for the remaining num_points % 8 elements.
  number = eighthPoints * 8;
  for(;number < num_points; number++){
    *bPtr++ = log2f(*aPtr++);
  }
}
156 
157 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
158 
159 #ifdef LV_HAVE_AVX2
160 #include <immintrin.h>
161 
/* POLYn_AVX2(x, c0, ..., cn) evaluates the degree-n polynomial
 * c0 + c1*x + ... + cn*x^n via Horner's rule with separate multiply/add
 * (for AVX2 targets without FMA). */
#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

/*!
 * \brief Computes log2 of each float in \p aVector (aligned AVX2 kernel, no FMA).
 *
 * \param bVector    output buffer of \p num_points floats (aligned for _mm256_store_ps)
 * \param aVector    input buffer of \p num_points floats (aligned for _mm256_load_ps)
 * \param num_points number of elements to process
 *
 * Same exponent + polynomial(mantissa) decomposition as the FMA variant, with
 * mul/add pairs in place of fused multiply-adds.  Zero/negative/NaN inputs are
 * not special-cased in the vector loop; the tail uses scalar log2f.
 */
static inline void
volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;  // count of full 8-wide vectors

  __m256 aVal, bVal, mantissa, frac, leadingOne;
  __m256i bias, exp;

  for(;number < eighthPoints; number++){

    aVal = _mm256_load_ps(aPtr);
    bias = _mm256_set1_epi32(127);      // IEEE-754 single-precision exponent bias
    leadingOne = _mm256_set1_ps(1.0f);
    // Isolate exponent bits 30:23, shift them down, debias.
    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
    bVal = _mm256_cvtepi32_ps(exp);     // integer part of the result

    // Now to extract mantissa: OR the 23 mantissa bits with 1.0f so frac lies in [1,2)
    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
    mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
    mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
    mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
    mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    // result = exponent + poly(frac) * (frac - 1)
    bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
    _mm256_store_ps(bPtr, bVal);

    aPtr += 8;
    bPtr += 8;
  }

  // Scalar tail for the remaining num_points % 8 elements.
  number = eighthPoints * 8;
  for(;number < num_points; number++){
    *bPtr++ = log2f(*aPtr++);
  }
}
216 
217 #endif /* LV_HAVE_AVX2 for aligned */
218 
219 #ifdef LV_HAVE_GENERIC
220 
221 static inline void
222 volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
223 {
224  float* bPtr = bVector;
225  const float* aPtr = aVector;
226  unsigned int number = 0;
227 
228  for(number = 0; number < num_points; number++) {
229  *bPtr++ = log2f(*aPtr++);
230  }
231 
232 }
233 #endif /* LV_HAVE_GENERIC */
234 
235 #ifdef LV_HAVE_SSE4_1
236 #include <smmintrin.h>
237 
/* POLYn(x, c0, ..., cn) evaluates the degree-n polynomial
 * c0 + c1*x + ... + cn*x^n via Horner's rule with SSE multiply/add. */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/*!
 * \brief Computes log2 of each float in \p aVector (aligned SSE4.1 kernel, 4 floats per iteration).
 *
 * \param bVector    output buffer of \p num_points floats (aligned for _mm_store_ps)
 * \param aVector    input buffer of \p num_points floats (aligned for _mm_load_ps)
 * \param num_points number of elements to process
 *
 * Same exponent + polynomial(mantissa) scheme as the AVX2 kernels, on 128-bit
 * vectors.  Zero/negative/NaN inputs are not special-cased in the vector loop;
 * the tail (num_points % 4) falls back to scalar log2f.
 */
static inline void
volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;  // count of full 4-wide vectors

  __m128 aVal, bVal, mantissa, frac, leadingOne;
  __m128i bias, exp;

  for(;number < quarterPoints; number++){

    aVal = _mm_load_ps(aPtr);
    bias = _mm_set1_epi32(127);         // IEEE-754 single-precision exponent bias
    leadingOne = _mm_set1_ps(1.0f);
    // Isolate exponent bits 30:23, shift them down, debias.
    exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
    bVal = _mm_cvtepi32_ps(exp);        // integer part of the result

    // Now to extract mantissa: OR the 23 mantissa bits with 1.0f so frac lies in [1,2)
    frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
    mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
    mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
    mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
    mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    // result = exponent + poly(frac) * (frac - 1)
    bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
    _mm_store_ps(bPtr, bVal);

    aPtr += 4;
    bPtr += 4;
  }

  // Scalar tail for the remaining num_points % 4 elements.
  number = quarterPoints * 4;
  for(;number < num_points; number++){
    *bPtr++ = log2f(*aPtr++);
  }
}
292 
293 #endif /* LV_HAVE_SSE4_1 for aligned */
294 
295 #ifdef LV_HAVE_NEON
296 #include <arm_neon.h>
297 
/* these macros allow us to embed logs in other kernels */

/* Declares the constants VLOG2Q_NEON_F32 expands to use: the implicit
 * leading-one bit of the significand (0x000800000 == 0x00800000, i.e. bit 23),
 * the coefficients of a degree-6 polynomial fit of log2 over the mantissa
 * range, and the IEEE-754 exponent/significand masks plus exponent bias.
 * Must be expanded in the same scope, before any use of VLOG2Q_NEON_F32. */
#define VLOG2Q_NEON_PREAMBLE() \
  int32x4_t one = vdupq_n_s32(0x000800000); \
  /* minimax polynomial */ \
  float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
  float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
  float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
  float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
  float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
  float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
  float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
  int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
  int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
  int32x4_t exp_bias = vdupq_n_s32(127);


/* Stores into log2_approx (float32x4_t) an approximation of log2 of the four
 * floats whose raw IEEE-754 bit patterns are held in aval (int32x4_t).
 * Declares local temporaries, so it can be expanded at most once per scope,
 * and requires VLOG2Q_NEON_PREAMBLE() earlier in the same scope. */
#define VLOG2Q_NEON_F32(log2_approx, aval) \
  int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
  int32x4_t significand_i = vandq_s32(aval, sig_mask); \
  exponent_i = vshrq_n_s32(exponent_i, 23); \
  \
  /* extract the exponent and significand \
  we can treat this as fixed point to save ~9% on the \
  conversion + float add */ \
  significand_i = vorrq_s32(one, significand_i); \
  float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23); \
  /* debias the exponent and convert to float */ \
  exponent_i = vsubq_s32(exponent_i, exp_bias); \
  float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
  \
  /* put the significand through a polynomial fit of log2(x) [1,2] \
  add the result to the exponent */ \
  log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \
  float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \
  log2_approx = vaddq_f32(log2_approx, tmp1); \
  float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
  tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \
  log2_approx = vaddq_f32(log2_approx, tmp1); \
  \
  float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \
  tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \
  log2_approx = vaddq_f32(log2_approx, tmp1); \
  float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \
  tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \
  log2_approx = vaddq_f32(log2_approx, tmp1); \
  float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \
  tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \
  log2_approx = vaddq_f32(log2_approx, tmp1); \
  float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \
  tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \
  log2_approx = vaddq_f32(log2_approx, tmp1);
349 
350 static inline void
351 volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
352 {
353  float* bPtr = bVector;
354  const float* aPtr = aVector;
355  unsigned int number;
356  const unsigned int quarterPoints = num_points / 4;
357 
358  int32x4_t aval;
359  float32x4_t log2_approx;
360 
362  // lms
363  //p0 = vdupq_n_f32(-1.649132280361871);
364  //p1 = vdupq_n_f32(1.995047138579499);
365  //p2 = vdupq_n_f32(-0.336914839219728);
366 
367  // keep in mind a single precision float is represented as
368  // (-1)^sign * 2^exp * 1.significand, so the log2 is
369  // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
370  for(number = 0; number < quarterPoints; ++number){
371  // load float in to an int register without conversion
372  aval = vld1q_s32((int*)aPtr);
373 
374  VLOG2Q_NEON_F32(log2_approx, aval)
375 
376  vst1q_f32(bPtr, log2_approx);
377 
378  aPtr += 4;
379  bPtr += 4;
380  }
381 
382  for(number = quarterPoints * 4; number < num_points; number++){
383  *bPtr++ = log2f(*aPtr++);
384  }
385 }
386 
387 #endif /* LV_HAVE_NEON */
388 
389 
390 #endif /* INCLUDED_volk_32f_log2_32f_a_H */
391 
392 #ifndef INCLUDED_volk_32f_log2_32f_u_H
393 #define INCLUDED_volk_32f_log2_32f_u_H
394 
395 
396 #ifdef LV_HAVE_GENERIC
397 
398 static inline void
399 volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
400 {
401  float* bPtr = bVector;
402  const float* aPtr = aVector;
403  unsigned int number = 0;
404 
405  for(number = 0; number < num_points; number++){
406  *bPtr++ = log2f(*aPtr++);
407  }
408 }
409 
410 #endif /* LV_HAVE_GENERIC */
411 
412 
413 #ifdef LV_HAVE_SSE4_1
414 #include <smmintrin.h>
415 
/* Identical redefinition of the POLY helpers from the aligned section above
 * (benign: C permits duplicate macro definitions with identical token lists).
 * POLYn(x, c0..cn) is a Horner-rule evaluation of a degree-n polynomial. */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/*!
 * \brief Computes log2 of each float in \p aVector (unaligned SSE4.1 kernel).
 *
 * \param bVector    output buffer of \p num_points floats (no alignment required)
 * \param aVector    input buffer of \p num_points floats (no alignment required)
 * \param num_points number of elements to process
 *
 * Identical algorithm to volk_32f_log2_32f_a_sse4_1 but with unaligned
 * loads/stores (_mm_loadu_ps / _mm_storeu_ps).
 */
static inline void
volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;  // count of full 4-wide vectors

  __m128 aVal, bVal, mantissa, frac, leadingOne;
  __m128i bias, exp;

  for(;number < quarterPoints; number++){

    aVal = _mm_loadu_ps(aPtr);
    bias = _mm_set1_epi32(127);         // IEEE-754 single-precision exponent bias
    leadingOne = _mm_set1_ps(1.0f);
    // Isolate exponent bits 30:23, shift them down, debias.
    exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
    bVal = _mm_cvtepi32_ps(exp);        // integer part of the result

    // Now to extract mantissa: OR the 23 mantissa bits with 1.0f so frac lies in [1,2)
    frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
    mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
    mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
    mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
    mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    // result = exponent + poly(frac) * (frac - 1)
    bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
    _mm_storeu_ps(bPtr, bVal);

    aPtr += 4;
    bPtr += 4;
  }

  // Scalar tail for the remaining num_points % 4 elements.
  number = quarterPoints * 4;
  for(;number < num_points; number++){
    *bPtr++ = log2f(*aPtr++);
  }
}
470 
471 #endif /* LV_HAVE_SSE4_1 for unaligned */
472 
473 #if LV_HAVE_AVX2 && LV_HAVE_FMA
474 #include <immintrin.h>
475 
/* Identical redefinition of the FMA Horner-rule helpers from the aligned
 * section above (benign: duplicate macro definitions with identical token
 * lists are permitted in C). */
#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

/*!
 * \brief Computes log2 of each float in \p aVector (unaligned AVX2+FMA kernel).
 *
 * \param bVector    output buffer of \p num_points floats (no alignment required)
 * \param aVector    input buffer of \p num_points floats (no alignment required)
 * \param num_points number of elements to process
 *
 * Identical algorithm to volk_32f_log2_32f_a_avx2_fma but with unaligned
 * loads/stores (_mm256_loadu_ps / _mm256_storeu_ps).
 */
static inline void
volk_32f_log2_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;  // count of full 8-wide vectors

  __m256 aVal, bVal, mantissa, frac, leadingOne;
  __m256i bias, exp;

  for(;number < eighthPoints; number++){

    aVal = _mm256_loadu_ps(aPtr);
    bias = _mm256_set1_epi32(127);      // IEEE-754 single-precision exponent bias
    leadingOne = _mm256_set1_ps(1.0f);
    // Isolate exponent bits 30:23, shift them down, debias.
    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
    bVal = _mm256_cvtepi32_ps(exp);     // integer part of the result

    // Now to extract mantissa: OR the 23 mantissa bits with 1.0f so frac lies in [1,2)
    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
    mantissa = POLY5_FMAAVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
    mantissa = POLY4_FMAAVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
    mantissa = POLY3_FMAAVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
    mantissa = POLY2_FMAAVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    // result = exponent + poly(frac) * (frac - 1), fused into a single FMA
    bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
    _mm256_storeu_ps(bPtr, bVal);

    aPtr += 8;
    bPtr += 8;
  }

  // Scalar tail for the remaining num_points % 8 elements.
  number = eighthPoints * 8;
  for(;number < num_points; number++){
    *bPtr++ = log2f(*aPtr++);
  }
}
530 
531 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
532 
533 #ifdef LV_HAVE_AVX2
534 #include <immintrin.h>
535 
/* Identical redefinition of the AVX2 mul/add Horner-rule helpers from the
 * aligned section above (benign duplicate macro definitions). */
#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

/*!
 * \brief Computes log2 of each float in \p aVector (unaligned AVX2 kernel, no FMA).
 *
 * \param bVector    output buffer of \p num_points floats (no alignment required)
 * \param aVector    input buffer of \p num_points floats (no alignment required)
 * \param num_points number of elements to process
 *
 * Identical algorithm to volk_32f_log2_32f_a_avx2 but with unaligned
 * loads/stores (_mm256_loadu_ps / _mm256_storeu_ps).
 */
static inline void
volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
  float* bPtr = bVector;
  const float* aPtr = aVector;

  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;  // count of full 8-wide vectors

  __m256 aVal, bVal, mantissa, frac, leadingOne;
  __m256i bias, exp;

  for(;number < eighthPoints; number++){

    aVal = _mm256_loadu_ps(aPtr);
    bias = _mm256_set1_epi32(127);      // IEEE-754 single-precision exponent bias
    leadingOne = _mm256_set1_ps(1.0f);
    // Isolate exponent bits 30:23, shift them down, debias.
    exp = _mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal), _mm256_set1_epi32(0x7f800000)), 23), bias);
    bVal = _mm256_cvtepi32_ps(exp);     // integer part of the result

    // Now to extract mantissa: OR the 23 mantissa bits with 1.0f so frac lies in [1,2)
    frac = _mm256_or_ps(leadingOne, _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
    mantissa = POLY5_AVX2( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
    mantissa = POLY4_AVX2( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
    mantissa = POLY3_AVX2( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
    mantissa = POLY2_AVX2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

    // result = exponent + poly(frac) * (frac - 1)
    bVal = _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
    _mm256_storeu_ps(bPtr, bVal);

    aPtr += 8;
    bPtr += 8;
  }

  // Scalar tail for the remaining num_points % 8 elements.
  number = eighthPoints * 8;
  for(;number < num_points; number++){
    *bPtr++ = log2f(*aPtr++);
  }
}
590 
591 #endif /* LV_HAVE_AVX2 for unaligned */
592 
593 
594 #endif /* INCLUDED_volk_32f_log2_32f_u_H */
static void volk_32f_log2_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:222
static void volk_32f_log2_32f_u_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:399
#define VLOG2Q_NEON_F32(log2_approx, aval)
Definition: volk_32f_log2_32f.h:314
static void volk_32f_log2_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_log2_32f.h:351
#define VLOG2Q_NEON_PREAMBLE()
Definition: volk_32f_log2_32f.h:299