/*
 * Listing fragment of the unaligned SSE3 complex-division kernel.
 * The leading numbers appear to be line numbers of the original header; lines
 * without an entry are missing from this listing.
 * NOTE(review): the function name, the setup of the a/b/c pointers, their
 * increments and the closing braces are not visible here -- confirm against
 * the full header before relying on these notes.
 */
71 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H 72 #define INCLUDED_volk_32fc_x2_divide_32fc_u_H 79 #include <pmmintrin.h> 84 const lv_32fc_t* denumeratorVector,
unsigned int num_points)
92 unsigned int number = 0;
/* Bound of the vector loop: num_points / 4 using integer division. */
93 const unsigned int quarterPoints = num_points / 4;
95 __m128 num01, num23, den01, den23, norm, result;
100 for(; number < quarterPoints; number++){
/* Unaligned loads of four floats each from the numerator (a) and denominator (b). */
101 num01 = _mm_loadu_ps((
float*) a);
102 den01 = _mm_loadu_ps((
float*) b);
/*
 * Second pair of unaligned loads.
 * NOTE(review): a and b are presumably advanced between the two load pairs,
 * and num01/num23 presumably rewritten as numerator * conj(denominator), on
 * lines that are missing from this listing -- TODO confirm.
 */
107 num23 = _mm_loadu_ps((
float*) a);
108 den23 = _mm_loadu_ps((
float*) b);
/*
 * Interleaving norm with itself duplicates every element:
 * unpacklo -> {n0,n0,n1,n1}, unpackhi -> {n2,n2,n3,n3}, so each value lines up
 * with both float slots of one complex number.
 * NOTE(review): the assignment of norm is not visible; presumably it holds the
 * squared magnitudes of the four denominators -- TODO confirm.
 */
114 den01 = _mm_unpacklo_ps(norm,norm);
115 den23 = _mm_unpackhi_ps(norm,norm);
/* Element-wise divide, then unaligned store of four floats to c. */
117 result = _mm_div_ps(num01, den01);
118 _mm_storeu_ps((
float*) c, result);
/* Same divide/store for the second register pair. */
120 result = _mm_div_ps(num23, den23);
121 _mm_storeu_ps((
float*) c, result);
/* Loop over the remaining points up to num_points; its body is not visible in this listing. */
126 for(;number < num_points; number++){
/*
 * Listing fragment of the unaligned AVX complex-division kernel.
 * The visible arithmetic divides mul_conj by the squared magnitude of the
 * denominator, which matches a / b = a * conj(b) / |b|^2.
 * NOTE(review): the function name, the a/b/c pointer setup and increments, and
 * the assignment of mul_conj are missing from this listing -- TODO confirm.
 */
135 #include <immintrin.h> 140 const lv_32fc_t* denumeratorVector,
unsigned int num_points)
148 unsigned int number = 0;
/* Bound of the vector loop: a 256-bit register holds eight floats, i.e. four complex values. */
149 const unsigned int quarterPoints = num_points / 4;
151 __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
156 for(; number < quarterPoints; number++){
/* Unaligned loads of eight floats each from the numerator and the denominator. */
157 num = _mm256_loadu_ps((
float*) a);
158 denum = _mm256_loadu_ps((
float*) b);
/* Square every float of the denominator. */
160 sq = _mm256_mul_ps(denum, denum);
/* Horizontal add sums adjacent pairs; per 128-bit lane this gives {m0,m1,m0,m1}. */
161 mag_sq_un = _mm256_hadd_ps(sq,sq);
/* 0xd8 selects elements 0,2,1,3 within each lane, giving {m0,m0,m1,m1}. */
162 mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8);
/* NOTE(review): mul_conj is presumably num * conj(denum), computed on a line not shown. */
164 div = _mm256_div_ps(mul_conj,mag_sq);
/* Unaligned store of eight floats to c. */
166 _mm256_storeu_ps((
float*) c, div);
/* Index of the first point not covered by the vector loop. */
173 number = quarterPoints * 4;
/* Scalar tail: one complex division per remaining point, advancing all three pointers. */
175 for(; number < num_points; number++){
176 *c++ = (*a++) / (*b++);
/*
 * Listing fragment of the generic (plain C) complex-division kernel, compiled
 * only when LV_HAVE_GENERIC is defined.
 * NOTE(review): the function name and the setup of cPtr/aPtr/bPtr are missing
 * from this listing -- TODO confirm against the full header.
 */
183 #ifdef LV_HAVE_GENERIC 187 const lv_32fc_t* bVector,
unsigned int num_points)
192 unsigned int number = 0;
/* One complex division per point; each pointer advances by one element per iteration. */
194 for(number = 0; number < num_points; number++){
195 *cPtr++ = (*aPtr++) / (*bPtr++);
/*
 * Listing fragment of the aligned SSE3 complex-division kernel.
 * It uses _mm_load_ps/_mm_store_ps, which require 16-byte aligned addresses.
 * NOTE(review): the function name, the a/b/c pointer setup and increments, and
 * the assignments of norm and of the conjugate products are missing from this
 * listing -- TODO confirm against the full header.
 */
205 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H 206 #define INCLUDED_volk_32fc_x2_divide_32fc_a_H 208 #include <inttypes.h> 214 #include <pmmintrin.h> 219 const lv_32fc_t* denumeratorVector,
unsigned int num_points)
227 unsigned int number = 0;
/* Bound of the vector loop: num_points / 4 using integer division. */
228 const unsigned int quarterPoints = num_points / 4;
230 __m128 num01, num23, den01, den23, norm, result;
235 for(; number < quarterPoints; number++){
/* Aligned loads of four floats each from the numerator (a) and denominator (b). */
236 num01 = _mm_load_ps((
float*) a);
237 den01 = _mm_load_ps((
float*) b);
/* Second pair of aligned loads; presumably a and b were advanced on lines not shown. */
242 num23 = _mm_load_ps((
float*) a);
243 den23 = _mm_load_ps((
float*) b);
/*
 * Duplicate every element of norm: unpacklo -> {n0,n0,n1,n1},
 * unpackhi -> {n2,n2,n3,n3}.
 * NOTE(review): norm presumably holds the squared magnitudes of the four
 * denominators; its assignment is not visible -- TODO confirm.
 */
250 den01 = _mm_unpacklo_ps(norm,norm);
251 den23 = _mm_unpackhi_ps(norm,norm);
/* Element-wise divide, then aligned store of four floats to c. */
253 result = _mm_div_ps(num01, den01);
254 _mm_store_ps((
float*) c, result);
/* Same divide/store for the second register pair. */
256 result = _mm_div_ps(num23, den23);
257 _mm_store_ps((
float*) c, result);
/* Loop over the remaining points up to num_points; its body is not visible in this listing. */
262 for(;number < num_points; number++){
/*
 * Listing fragment of the aligned AVX complex-division kernel.
 * It uses _mm256_load_ps/_mm256_store_ps, which require 32-byte aligned
 * addresses.
 * NOTE(review): the function name, the a/b/c pointer setup and increments, and
 * the assignment of mul_conj are missing from this listing -- TODO confirm.
 */
270 #include <immintrin.h> 275 const lv_32fc_t* denumeratorVector,
unsigned int num_points)
283 unsigned int number = 0;
/* Bound of the vector loop: a 256-bit register holds eight floats, i.e. four complex values. */
284 const unsigned int quarterPoints = num_points / 4;
286 __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
291 for(; number < quarterPoints; number++){
/* Aligned loads of eight floats each from the numerator and the denominator. */
292 num = _mm256_load_ps((
float*) a);
293 denum = _mm256_load_ps((
float*) b);
/* Square every float of the denominator. */
295 sq = _mm256_mul_ps(denum, denum);
/* Horizontal add sums adjacent pairs; per 128-bit lane this gives {m0,m1,m0,m1}. */
296 mag_sq_un = _mm256_hadd_ps(sq,sq);
/* 0xd8 selects elements 0,2,1,3 within each lane, giving {m0,m0,m1,m1}. */
297 mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8);
/* NOTE(review): mul_conj is presumably num * conj(denum), computed on a line not shown. */
299 div = _mm256_div_ps(mul_conj,mag_sq);
/* Aligned store of eight floats to c. */
301 _mm256_store_ps((
float*) c, div);
/* Index of the first point not covered by the vector loop. */
308 number = quarterPoints * 4;
/* Scalar tail: one complex division per remaining point, advancing all three pointers. */
310 for(; number < num_points; number++){
311 *c++ = (*a++) / (*b++);
/*
 * Listing fragment of the ARM NEON complex-division kernel.
 * The visible arithmetic computes c = a * conj(b) / |b|^2 on four complex
 * values at a time, using a refined reciprocal estimate instead of a divide.
 * NOTE(review): the function name, the setup of aPtr/bPtr/cPtr and their
 * increments inside the vector loop are missing from this listing -- TODO
 * confirm against the full header.
 */
319 #include <arm_neon.h> 323 const lv_32fc_t* bVector,
unsigned int num_points)
329 float32x4x2_t aVal, bVal, cVal;
330 float32x4_t bAbs, bAbsInv;
/* Bound of the vector loop: four complex values per iteration. */
332 const unsigned int quarterPoints = num_points / 4;
333 unsigned int number = 0;
334 for(; number < quarterPoints; number++){
/* De-interleaving loads: val[0] receives the even floats (real parts), val[1] the odd floats (imaginary parts). */
335 aVal = vld2q_f32((
const float*)(aPtr));
336 bVal = vld2q_f32((
const float*)(bPtr));
/* bAbs = br*br + bi*bi, i.e. the squared magnitude of b despite the name. */
342 bAbs = vmulq_f32( bVal.val[0], bVal.val[0]);
343 bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
/*
 * Reciprocal estimate of bAbs refined by two Newton-Raphson steps
 * (vrecpsq_f32(x, d) yields 2 - x*d). The result approximates 1/bAbs, so
 * outputs may differ slightly from an exact division.
 */
345 bAbsInv = vrecpeq_f32(bAbs);
346 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
347 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
/* Real part: (ar*br + ai*bi) * bAbsInv. */
349 cVal.val[0] = vmulq_f32( aVal.val[0], bVal.val[0]);
350 cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
351 cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
/* Imaginary part: (ai*br - ar*bi) * bAbsInv. */
353 cVal.val[1] = vmulq_f32( aVal.val[1], bVal.val[0]);
354 cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
355 cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
/* Interleaving store: writes the results back as re,im pairs. */
357 vst2q_f32((
float*)(cPtr), cVal);
/* Scalar tail for the points left after the vector loop, using C complex division. */
361 for(number = quarterPoints * 4; number < num_points; number++){
362 *cPtr++ = (*aPtr++) / (*bPtr++);
/*
 * Listing fragment of the generic (plain C) complex-division kernel in the
 * aligned section of the header, compiled only when LV_HAVE_GENERIC is defined.
 * NOTE(review): the function name and the setup of cPtr/aPtr/bPtr are missing
 * from this listing -- TODO confirm against the full header.
 */
368 #ifdef LV_HAVE_GENERIC 372 const lv_32fc_t* bVector,
unsigned int num_points)
377 unsigned int number = 0;
/* One complex division per point; each pointer advances by one element per iteration. */
379 for(number = 0; number < num_points; number++){
380 *cPtr++ = (*aPtr++) / (*bPtr++);
static void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:274
static __m256 _mm256_complexconjugatemul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:51
static __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse3_intrinsics.h:53
static void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:218
static void volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:371
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:45
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:39
float complex lv_32fc_t
Definition: volk_complex.h:61
static void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:83
static void volk_32fc_x2_divide_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:322
static void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:139
static void volk_32fc_x2_divide_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:186