#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H

#include <inttypes.h>
#include <volk/volk_complex.h>

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector,
                                                         const lv_32fc_t* aVector,
                                                         const lv_32fc_t* bVector,
                                                         unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < quarterPoints; number++) {
        const __m256 x = _mm256_loadu_ps((float*)a); // ar,ai,br,bi, ...
        const __m256 y = _mm256_loadu_ps((float*)b); // cr,ci,dr,di, ...

        const __m256 yl = _mm256_moveldup_ps(y); // cr,cr,dr,dr, ...
        const __m256 yh = _mm256_movehdup_ps(y); // ci,ci,di,di, ...

        const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // swap re/im: ai,ar,bi,br, ...
        const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh);    // ai*ci, ar*ci, bi*di, br*di, ...

        // ar*cr - ai*ci, ai*cr + ar*ci, br*dr - bi*di, bi*dr + br*di, ...
        const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2);

        _mm256_storeu_ps((float*)c, z); // store the results back into the C container

        a += 4;
        b += 4;
        c += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;
    __m256 x, y, z;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)a); // ar,ai,br,bi, ...
        y = _mm256_loadu_ps((float*)b); // cr,ci,dr,di, ...
        z = _mm256_complexmul_ps(x, y); // complex multiply (volk_avx_intrinsics.h)
        _mm256_storeu_ps((float*)c, z); // store the results back into the C container

        a += 4;
        b += 4;
        c += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_AVX */
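/*
 * The AVX kernels in this file rely on the _mm256_complexmul_ps() helper from
 * <volk/volk_avx_intrinsics.h>. The block below is only an illustrative sketch of such a
 * helper, reconstructed from the AVX2/FMA kernel above with the fused multiply-add replaced
 * by a separate multiply and addsub; it is kept under #if 0 so it never conflicts with the
 * real definition.
 */
#if 0 /* illustrative sketch, not compiled */
static inline __m256 _mm256_complexmul_ps_sketch(__m256 x, __m256 y)
{
    const __m256 yl = _mm256_moveldup_ps(y);      // cr,cr,dr,dr, ...
    const __m256 yh = _mm256_movehdup_ps(y);      // ci,ci,di,di, ...
    const __m256 tmp1 = _mm256_mul_ps(x, yl);     // ar*cr, ai*cr, br*dr, bi*dr, ...
    const __m256 xs = _mm256_permute_ps(x, 0xB1); // ai,ar,bi,br, ...
    const __m256 tmp2 = _mm256_mul_ps(xs, yh);    // ai*ci, ar*ci, bi*di, br*di, ...
    return _mm256_addsub_ps(tmp1, tmp2);          // ar*cr-ai*ci, ai*cr+ar*ci, ...
}
#endif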
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;
    __m128 x, y, z;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((float*)a); // ar,ai,br,bi
        y = _mm_loadu_ps((float*)b); // cr,ci,dr,di
        z = _mm_complexmul_ps(x, y); // complex multiply (volk_sse3_intrinsics.h)
        _mm_storeu_ps((float*)c, z); // store the results back into the C container

        a += 2;
        b += 2;
        c += 2;
    }

    if ((num_points % 2) != 0) {
        *c = (*a) * (*b);
    }
}
#endif /* LV_HAVE_SSE3 */
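/*
 * Analogous sketch for the _mm_complexmul_ps() helper from <volk/volk_sse3_intrinsics.h>
 * used by the SSE3 kernels; again only illustrative and kept out of the build.
 */
#if 0 /* illustrative sketch, not compiled */
static inline __m128 _mm_complexmul_ps_sketch(__m128 x, __m128 y)
{
    const __m128 yl = _mm_moveldup_ps(y);         // cr,cr,dr,dr
    const __m128 yh = _mm_movehdup_ps(y);         // ci,ci,di,di
    const __m128 tmp1 = _mm_mul_ps(x, yl);        // ar*cr, ai*cr, br*dr, bi*dr
    const __m128 xs = _mm_shuffle_ps(x, x, 0xB1); // ai,ar,bi,br
    const __m128 tmp2 = _mm_mul_ps(xs, yh);       // ai*ci, ar*ci, bi*di, br*di
    return _mm_addsub_ps(tmp1, tmp2);             // ar*cr-ai*ci, ai*cr+ar*ci, ...
}
#endif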
#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector, unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
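/*
 * Usage sketch: these kernels are normally reached through the generated
 * volk_32fc_x2_multiply_32fc() dispatcher declared in <volk/volk.h>, which picks the best
 * implementation for the running machine. The buffer handling below assumes the standard
 * VOLK helpers volk_get_alignment(), volk_malloc(), volk_free() and lv_cmake(); the
 * example_multiply() name itself is only illustrative.
 */
#if 0 /* illustrative usage, not compiled as part of this header */
#include <volk/volk.h>

static void example_multiply(void)
{
    const unsigned int N = 1024;
    const size_t alignment = volk_get_alignment();
    lv_32fc_t* a = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
    lv_32fc_t* b = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
    lv_32fc_t* c = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
    unsigned int i;

    for (i = 0; i < N; i++) {
        a[i] = lv_cmake(1.0f, 0.5f);
        b[i] = lv_cmake(0.5f, -1.0f);
    }

    volk_32fc_x2_multiply_32fc(c, a, b, N); // c[i] = a[i] * b[i]

    volk_free(a);
    volk_free(b);
    volk_free(c);
}
#endif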
#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
#define INCLUDED_volk_32fc_x2_multiply_32fc_a_H

#include <inttypes.h>
#include <volk/volk_complex.h>
#include <volk/volk_common.h>

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector,
                                                         const lv_32fc_t* aVector,
                                                         const lv_32fc_t* bVector,
                                                         unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < quarterPoints; number++) {
        const __m256 x = _mm256_load_ps((float*)a); // ar,ai,br,bi, ...
        const __m256 y = _mm256_load_ps((float*)b); // cr,ci,dr,di, ...

        const __m256 yl = _mm256_moveldup_ps(y); // cr,cr,dr,dr, ...
        const __m256 yh = _mm256_movehdup_ps(y); // ci,ci,di,di, ...

        const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // swap re/im: ai,ar,bi,br, ...
        const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh);    // ai*ci, ar*ci, bi*di, br*di, ...

        // ar*cr - ai*ci, ai*cr + ar*ci, br*dr - bi*di, bi*dr + br*di, ...
        const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2);

        _mm256_store_ps((float*)c, z); // store the results back into the C container

        a += 4;
        b += 4;
        c += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;
    __m256 x, y, z;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)a); // ar,ai,br,bi, ...
        y = _mm256_load_ps((float*)b); // cr,ci,dr,di, ...
        z = _mm256_complexmul_ps(x, y); // complex multiply (volk_avx_intrinsics.h)
        _mm256_store_ps((float*)c, z); // store the results back into the C container

        a += 4;
        b += 4;
        c += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;
    __m128 x, y, z;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;

    for (; number < halfPoints; number++) {
        x = _mm_load_ps((float*)a); // ar,ai,br,bi
        y = _mm_load_ps((float*)b); // cr,ci,dr,di
        z = _mm_complexmul_ps(x, y); // complex multiply (volk_sse3_intrinsics.h)
        _mm_store_ps((float*)c, z); // store the results back into the C container

        a += 2;
        b += 2;
        c += 2;
    }

    if ((num_points % 2) != 0) {
        *c = (*a) * (*b);
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                                        const lv_32fc_t* bVector, unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                                   const lv_32fc_t* bVector, unsigned int num_points)
{
    lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
    lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
    unsigned int quarter_points = num_points / 4;
    float32x4x2_t a_val, b_val, c_val;
    float32x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // deinterleave: val[0] holds reals, val[1] holds imags
        b_val = vld2q_f32((float*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        // multiply the real*real and imag*imag to get the real result
        tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); // ar*br
        tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); // ai*bi

        // multiply the cross terms to get the imaginary result
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); // ar*bi
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); // ai*br

        // combine and store the results
        c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]); // ar*br - ai*bi
        c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]); // ar*bi + ai*br
        vst2q_f32((float*)cVector, c_val);

        a_ptr += 4;
        b_ptr += 4;
        cVector += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *cVector++ = (*a_ptr++) * (*b_ptr++);
    }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEON

static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                                            const lv_32fc_t* bVector, unsigned int num_points)
{
    lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
    lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
    unsigned int quarter_points = num_points / 4;
    float32x4x2_t a_val, b_val;
    float32x4x2_t tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)a_ptr); // deinterleave into real/imag lanes
        b_val = vld2q_f32((float*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        // first multiply
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); // ai*br
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); // ar*br

        // fused multiply-accumulate / multiply-subtract for the remaining terms
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); // + ar*bi
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); // - ai*bi

        // store
        vst2q_f32((float*)cVector, tmp_imag);

        a_ptr += 4;
        b_ptr += 4;
        cVector += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *cVector++ = (*a_ptr++) * (*b_ptr++);
    }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV7
extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
                                                  const lv_32fc_t* aVector,
                                                  const lv_32fc_t* bVector,
                                                  unsigned int num_points);

static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    unsigned int num_points)
{
    volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_NEONV7 */

#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */