Vector Optimized Library of Kernels 2.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_multiply_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

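/*!
 * \page volk_32fc_x2_multiply_32fc
 *
 * \b Overview
 *
 * Multiplies two complex vectors element-by-element and stores the results
 * in a third vector. (The prototype and example below are a minimal sketch
 * in the usual VOLK doc-comment style; they assume the generated dispatcher
 * volk_32fc_x2_multiply_32fc and the volk_malloc/volk_free allocators.)
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points)
 * \endcode
 *
 * \b Inputs
 * \li aVector: The first input vector of complex floats.
 * \li bVector: The second input vector of complex floats.
 * \li num_points: The number of complex values in each input vector.
 *
 * \b Outputs
 * \li cVector: The output vector of complex products.
 *
 * \b Example
 * \code
 *   unsigned int N = 4;
 *   unsigned int alignment = volk_get_alignment();
 *   lv_32fc_t* a = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
 *   lv_32fc_t* b = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
 *   lv_32fc_t* c = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
 *
 *   unsigned int ii;
 *   for(ii = 0; ii < N; ++ii){
 *     a[ii] = lv_cmake((float)ii, (float)ii); // ii + j*ii
 *     b[ii] = lv_cmake(0.f, 1.f);             // rotate each point by 90 degrees
 *   }
 *
 *   volk_32fc_x2_multiply_32fc(c, a, b, N);
 *
 *   for(ii = 0; ii < N; ++ii){
 *     printf("c[%u] = %+.1f %+.1fj\n", ii, lv_creal(c[ii]), lv_cimag(c[ii]));
 *   }
 *
 *   volk_free(a);
 *   volk_free(b);
 *   volk_free(c);
 * \endcode
 */
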
#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
#include <float.h>

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                      const lv_32fc_t* bVector, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;
  const lv_32fc_t* b = bVector;

  for(; number < quarterPoints; number++){

    const __m256 x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
    const __m256 y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

    const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
    const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di

    const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br

    const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

    const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

    _mm256_storeu_ps((float*)c, z); // Store the results back into the C container

    a += 4;
    b += 4;
    c += 4;
  }

  _mm256_zeroupper();

  number = quarterPoints * 4;
  for(; number < num_points; number++){
    *c++ = (*a++) * (*b++);
  }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
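
/*
 * A scalar sketch of the identity the AVX2/FMA kernel above vectorizes
 * (complex_mul_ref is a hypothetical helper, not part of the VOLK API).
 * _mm256_fmaddsub_ps(x, yl, tmp2) subtracts tmp2 in even lanes and adds it
 * in odd lanes, so with x = (ar, ai, ...), yl = (br, br, ...) and
 * tmp2 = (ai*bi, ar*bi, ...) each lane pair receives exactly:
 *
 *   static inline lv_32fc_t complex_mul_ref(lv_32fc_t a, lv_32fc_t b)
 *   {
 *     const float ar = lv_creal(a), ai = lv_cimag(a);
 *     const float br = lv_creal(b), bi = lv_cimag(b);
 *     return lv_cmake(ar * br - ai * bi,  // even lane: multiply-subtract
 *                     ar * bi + ai * br); // odd lane:  multiply-add
 *   }
 */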


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                 const lv_32fc_t* bVector, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  __m256 x, y, z;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;
  const lv_32fc_t* b = bVector;

  for(; number < quarterPoints; number++){
    x = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
    y = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
    z = _mm256_complexmul_ps(x, y);
    _mm256_storeu_ps((float*) c, z); // Store the results back into the C container

    a += 4;
    b += 4;
    c += 4;
  }

  number = quarterPoints * 4;

  for(; number < num_points; number++){
    *c++ = (*a++) * (*b++);
  }
}
#endif /* LV_HAVE_AVX */
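
/*
 * _mm256_complexmul_ps() comes from <volk/volk_avx_intrinsics.h>. A minimal
 * sketch of what it computes, assuming the same moveldup/movehdup/addsub
 * pattern as the FMA kernel above (the canonical definition lives in that
 * header):
 *
 *   static inline __m256 complexmul256_sketch(__m256 x, __m256 y)
 *   {
 *     const __m256 yl = _mm256_moveldup_ps(y);         // cr,cr,dr,dr,...
 *     const __m256 yh = _mm256_movehdup_ps(y);         // ci,ci,di,di,...
 *     const __m256 tmp1 = _mm256_mul_ps(x, yl);        // ar*cr,ai*cr,...
 *     const __m256 xs = _mm256_shuffle_ps(x, x, 0xB1); // ai,ar,bi,br,...
 *     const __m256 tmp2 = _mm256_mul_ps(xs, yh);       // ai*ci,ar*ci,...
 *     return _mm256_addsub_ps(tmp1, tmp2);             // sub even lanes, add odd lanes
 *   }
 */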


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

static inline void
volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                  const lv_32fc_t* bVector, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int halfPoints = num_points / 2;

  __m128 x, y, z;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;
  const lv_32fc_t* b = bVector;

  for(; number < halfPoints; number++){
    x = _mm_loadu_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
    y = _mm_loadu_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
    z = _mm_complexmul_ps(x, y);
    _mm_storeu_ps((float*) c, z); // Store the results back into the C container

    a += 2;
    b += 2;
    c += 2;
  }

  if((num_points % 2) != 0){
    *c = (*a) * (*b);
  }
}
#endif /* LV_HAVE_SSE3 */
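
/*
 * _mm_complexmul_ps() comes from <volk/volk_sse3_intrinsics.h> and is the
 * two-complex-per-register analogue of _mm256_complexmul_ps. A minimal
 * sketch under the same assumption:
 *
 *   static inline __m128 complexmul128_sketch(__m128 x, __m128 y)
 *   {
 *     const __m128 yl = _mm_moveldup_ps(y);         // cr,cr,dr,dr
 *     const __m128 yh = _mm_movehdup_ps(y);         // ci,ci,di,di
 *     const __m128 tmp1 = _mm_mul_ps(x, yl);        // ar*cr,ai*cr,br*dr,bi*dr
 *     const __m128 xs = _mm_shuffle_ps(x, x, 0xB1); // ai,ar,bi,br
 *     const __m128 tmp2 = _mm_mul_ps(xs, yh);       // ai*ci,ar*ci,bi*di,br*di
 *     return _mm_addsub_ps(tmp1, tmp2);             // sub even lanes, add odd lanes
 *   }
 */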


#ifdef LV_HAVE_GENERIC

static inline void
volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                   const lv_32fc_t* bVector, unsigned int num_points)
{
  lv_32fc_t* cPtr = cVector;
  const lv_32fc_t* aPtr = aVector;
  const lv_32fc_t* bPtr = bVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
#endif /* LV_HAVE_GENERIC */
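
/*
 * The generic kernel is the portable fallback and doubles as the reference
 * the SIMD variants must match. A minimal direct-call check (a sketch
 * assuming <assert.h>; the expected value is (1+2j)*(3+4j) = -5+10j):
 *
 *   lv_32fc_t a = lv_cmake(1.f, 2.f);
 *   lv_32fc_t b = lv_cmake(3.f, 4.f);
 *   lv_32fc_t c;
 *   volk_32fc_x2_multiply_32fc_generic(&c, &a, &b, 1);
 *   assert(lv_creal(c) == -5.f && lv_cimag(c) == 10.f);
 */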


#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
#define INCLUDED_volk_32fc_x2_multiply_32fc_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
#include <float.h>

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
static inline void
volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                      const lv_32fc_t* bVector, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;
  const lv_32fc_t* b = bVector;

  for(; number < quarterPoints; number++){

    const __m256 x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
    const __m256 y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

    const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
    const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di

    const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br

    const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

    const __m256 z = _mm256_fmaddsub_ps(x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

    _mm256_store_ps((float*)c, z); // Store the results back into the C container

    a += 4;
    b += 4;
    c += 4;
  }

  _mm256_zeroupper();

  number = quarterPoints * 4;
  for(; number < num_points; number++){
    *c++ = (*a++) * (*b++);
  }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                 const lv_32fc_t* bVector, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  __m256 x, y, z;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;
  const lv_32fc_t* b = bVector;

  for(; number < quarterPoints; number++){
    x = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
    y = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
    z = _mm256_complexmul_ps(x, y);
    _mm256_store_ps((float*) c, z); // Store the results back into the C container

    a += 4;
    b += 4;
    c += 4;
  }

  number = quarterPoints * 4;

  for(; number < num_points; number++){
    *c++ = (*a++) * (*b++);
  }
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

static inline void
volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                  const lv_32fc_t* bVector, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int halfPoints = num_points / 2;

  __m128 x, y, z;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;
  const lv_32fc_t* b = bVector;

  for(; number < halfPoints; number++){
    x = _mm_load_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
    y = _mm_load_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
    z = _mm_complexmul_ps(x, y);
    _mm_store_ps((float*) c, z); // Store the results back into the C container

    a += 2;
    b += 2;
    c += 2;
  }

  if((num_points % 2) != 0){
    *c = (*a) * (*b);
  }
}
#endif /* LV_HAVE_SSE3 */


#ifdef LV_HAVE_GENERIC

static inline void
volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                     const lv_32fc_t* bVector, unsigned int num_points)
{
  lv_32fc_t* cPtr = cVector;
  const lv_32fc_t* aPtr = aVector;
  const lv_32fc_t* bPtr = bVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                const lv_32fc_t* bVector, unsigned int num_points)
{
  lv_32fc_t* a_ptr = (lv_32fc_t*) aVector;
  lv_32fc_t* b_ptr = (lv_32fc_t*) bVector;
  unsigned int quarter_points = num_points / 4;
  float32x4x2_t a_val, b_val, c_val;
  float32x4x2_t tmp_real, tmp_imag;
  unsigned int number = 0;

  for(number = 0; number < quarter_points; ++number) {
    a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
    b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
    __VOLK_PREFETCH(a_ptr+4);
    __VOLK_PREFETCH(b_ptr+4);

    // multiply the real*real and imag*imag to get real result
    // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
    tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
    // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
    tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

    // Multiply cross terms to get the imaginary result
    // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
    // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

    // combine and store the results
    c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
    c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
    vst2q_f32((float*)cVector, c_val);

    a_ptr += 4;
    b_ptr += 4;
    cVector += 4;
  }

  for(number = quarter_points*4; number < num_points; number++){
    *cVector++ = (*a_ptr++) * (*b_ptr++);
  }
}
#endif /* LV_HAVE_NEON */
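
/*
 * Lane view of the NEON kernel above: vld2q_f32 deinterleaves four complex
 * values into planar form (val[0] = reals, val[1] = imaginaries), so the
 * complex product reduces to four real vector multiplies plus one subtract
 * and one add per lane k:
 *
 *   c_re[k] = a_re[k]*b_re[k] - a_im[k]*b_im[k];
 *   c_im[k] = a_re[k]*b_im[k] + a_im[k]*b_re[k];
 *
 * vst2q_f32 then re-interleaves the real/imaginary planes on the store.
 */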


#ifdef LV_HAVE_NEON

static inline void
volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                         const lv_32fc_t* bVector, unsigned int num_points)
{
  lv_32fc_t* a_ptr = (lv_32fc_t*) aVector;
  lv_32fc_t* b_ptr = (lv_32fc_t*) bVector;
  unsigned int quarter_points = num_points / 4;
  float32x4x2_t a_val, b_val;
  float32x4x2_t tmp_imag;
  unsigned int number = 0;

  for(number = 0; number < quarter_points; ++number) {
    a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
    b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
    __VOLK_PREFETCH(a_ptr+4);
    __VOLK_PREFETCH(b_ptr+4);

    // do the first multiply
    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);

    // use multiply accumulate/subtract to get result
    tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
    tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);

    // store
    vst2q_f32((float*)cVector, tmp_imag);
    // increment pointers
    a_ptr += 4;
    b_ptr += 4;
    cVector += 4;
  }

  for(number = quarter_points*4; number < num_points; number++){
    *cVector++ = (*a_ptr++) * (*b_ptr++);
  }
}
#endif /* LV_HAVE_NEON */
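
/*
 * The _neon_opttests variant reaches the same per-lane results as the plain
 * NEON kernel while folding two of the four multiplies into fused
 * multiply-accumulate/subtract instructions:
 *
 *   c_im[k]  = a_im[k]*b_re[k]; // vmulq_f32
 *   c_re[k]  = a_re[k]*b_re[k]; // vmulq_f32
 *   c_im[k] += a_re[k]*b_im[k]; // vmlaq_f32
 *   c_re[k] -= a_im[k]*b_im[k]; // vmlsq_f32
 */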


#ifdef LV_HAVE_NEONV7

extern void
volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                     const lv_32fc_t* bVector, unsigned int num_points);
#endif /* LV_HAVE_NEONV7 */


#ifdef LV_HAVE_ORC

extern void
volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                      const lv_32fc_t* bVector, unsigned int num_points);

static inline void
volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                 const lv_32fc_t* bVector, unsigned int num_points)
{
  volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
}

#endif /* LV_HAVE_ORC */

#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */