Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32fc_s32fc_multiply_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
76 #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
77 #define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
78 
79 #include <inttypes.h>
80 #include <stdio.h>
81 #include <volk/volk_complex.h>
82 #include <float.h>
83 
84 #if LV_HAVE_AVX && LV_HAVE_FMA
85 #include <immintrin.h>
86 
87 static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
88  unsigned int number = 0;
89  unsigned int i = 0;
90  const unsigned int quarterPoints = num_points / 4;
91  unsigned int isodd = num_points & 3;
92  __m256 x, yl, yh, z, tmp1, tmp2;
93  lv_32fc_t* c = cVector;
94  const lv_32fc_t* a = aVector;
95 
96  // Set up constant scalar vector
97  yl = _mm256_set1_ps(lv_creal(scalar));
98  yh = _mm256_set1_ps(lv_cimag(scalar));
99 
100  for(;number < quarterPoints; number++){
101  x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
102 
103  tmp1 = x;
104 
105  x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
106 
107  tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
108 
109  z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
110 
111  _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
112 
113  a += 4;
114  c += 4;
115  }
116 
117  for(i = num_points-isodd; i < num_points; i++) {
118  *c++ = (*a++) * scalar;
119  }
120 
121 }
122 #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
123 
124 #ifdef LV_HAVE_AVX
125 #include <immintrin.h>
126 
127 static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
128  unsigned int number = 0;
129  unsigned int i = 0;
130  const unsigned int quarterPoints = num_points / 4;
131  unsigned int isodd = num_points & 3;
132  __m256 x, yl, yh, z, tmp1, tmp2;
133  lv_32fc_t* c = cVector;
134  const lv_32fc_t* a = aVector;
135 
136  // Set up constant scalar vector
137  yl = _mm256_set1_ps(lv_creal(scalar));
138  yh = _mm256_set1_ps(lv_cimag(scalar));
139 
140  for(;number < quarterPoints; number++){
141  x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
142 
143  tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
144 
145  x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
146 
147  tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
148 
149  z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
150 
151  _mm256_storeu_ps((float*)c,z); // Store the results back into the C container
152 
153  a += 4;
154  c += 4;
155  }
156 
157  for(i = num_points-isodd; i < num_points; i++) {
158  *c++ = (*a++) * scalar;
159  }
160 
161 }
162 #endif /* LV_HAVE_AVX */
163 
164 #ifdef LV_HAVE_SSE3
165 #include <pmmintrin.h>
166 
167 static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
168  unsigned int number = 0;
169  const unsigned int halfPoints = num_points / 2;
170 
171  __m128 x, yl, yh, z, tmp1, tmp2;
172  lv_32fc_t* c = cVector;
173  const lv_32fc_t* a = aVector;
174 
175  // Set up constant scalar vector
176  yl = _mm_set_ps1(lv_creal(scalar));
177  yh = _mm_set_ps1(lv_cimag(scalar));
178 
179  for(;number < halfPoints; number++){
180 
181  x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
182 
183  tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
184 
185  x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
186 
187  tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
188 
189  z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
190 
191  _mm_storeu_ps((float*)c,z); // Store the results back into the C container
192 
193  a += 2;
194  c += 2;
195  }
196 
197  if((num_points % 2) != 0) {
198  *c = (*a) * scalar;
199  }
200 }
201 #endif /* LV_HAVE_SSE3 */
202 
203 #ifdef LV_HAVE_GENERIC
204 
205 static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
206  lv_32fc_t* cPtr = cVector;
207  const lv_32fc_t* aPtr = aVector;
208  unsigned int number = num_points;
209 
210  // unwrap loop
211  while (number >= 8){
212  *cPtr++ = (*aPtr++) * scalar;
213  *cPtr++ = (*aPtr++) * scalar;
214  *cPtr++ = (*aPtr++) * scalar;
215  *cPtr++ = (*aPtr++) * scalar;
216  *cPtr++ = (*aPtr++) * scalar;
217  *cPtr++ = (*aPtr++) * scalar;
218  *cPtr++ = (*aPtr++) * scalar;
219  *cPtr++ = (*aPtr++) * scalar;
220  number -= 8;
221  }
222 
223  // clean up any remaining
224  while (number-- > 0)
225  *cPtr++ = *aPtr++ * scalar;
226 }
227 #endif /* LV_HAVE_GENERIC */
228 
229 
230 #endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H */
231 #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
232 #define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
233 
234 #include <inttypes.h>
235 #include <stdio.h>
236 #include <volk/volk_complex.h>
237 #include <float.h>
238 
239 #if LV_HAVE_AVX && LV_HAVE_FMA
240 #include <immintrin.h>
241 
242 static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
243  unsigned int number = 0;
244  unsigned int i = 0;
245  const unsigned int quarterPoints = num_points / 4;
246  unsigned int isodd = num_points & 3;
247  __m256 x, yl, yh, z, tmp1, tmp2;
248  lv_32fc_t* c = cVector;
249  const lv_32fc_t* a = aVector;
250 
251  // Set up constant scalar vector
252  yl = _mm256_set1_ps(lv_creal(scalar));
253  yh = _mm256_set1_ps(lv_cimag(scalar));
254 
255  for(;number < quarterPoints; number++){
256  x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
257 
258  tmp1 = x;
259 
260  x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
261 
262  tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
263 
264  z = _mm256_fmaddsub_ps(tmp1, yl,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
265 
266  _mm256_store_ps((float*)c,z); // Store the results back into the C container
267 
268  a += 4;
269  c += 4;
270  }
271 
272  for(i = num_points-isodd; i < num_points; i++) {
273  *c++ = (*a++) * scalar;
274  }
275 
276 }
277 #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
278 
279 
280 #ifdef LV_HAVE_AVX
281 #include <immintrin.h>
282 
283 static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
284  unsigned int number = 0;
285  unsigned int i = 0;
286  const unsigned int quarterPoints = num_points / 4;
287  unsigned int isodd = num_points & 3;
288  __m256 x, yl, yh, z, tmp1, tmp2;
289  lv_32fc_t* c = cVector;
290  const lv_32fc_t* a = aVector;
291 
292  // Set up constant scalar vector
293  yl = _mm256_set1_ps(lv_creal(scalar));
294  yh = _mm256_set1_ps(lv_cimag(scalar));
295 
296  for(;number < quarterPoints; number++){
297  x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
298 
299  tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
300 
301  x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
302 
303  tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
304 
305  z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
306 
307  _mm256_store_ps((float*)c,z); // Store the results back into the C container
308 
309  a += 4;
310  c += 4;
311  }
312 
313  for(i = num_points-isodd; i < num_points; i++) {
314  *c++ = (*a++) * scalar;
315  }
316 
317 }
318 #endif /* LV_HAVE_AVX */
319 
320 #ifdef LV_HAVE_SSE3
321 #include <pmmintrin.h>
322 
323 static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
324  unsigned int number = 0;
325  const unsigned int halfPoints = num_points / 2;
326 
327  __m128 x, yl, yh, z, tmp1, tmp2;
328  lv_32fc_t* c = cVector;
329  const lv_32fc_t* a = aVector;
330 
331  // Set up constant scalar vector
332  yl = _mm_set_ps1(lv_creal(scalar));
333  yh = _mm_set_ps1(lv_cimag(scalar));
334 
335  for(;number < halfPoints; number++){
336 
337  x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
338 
339  tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
340 
341  x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
342 
343  tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
344 
345  z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
346 
347  _mm_store_ps((float*)c,z); // Store the results back into the C container
348 
349  a += 2;
350  c += 2;
351  }
352 
353  if((num_points % 2) != 0) {
354  *c = (*a) * scalar;
355  }
356 }
357 #endif /* LV_HAVE_SSE3 */
358 
359 #ifdef LV_HAVE_NEON
360 #include <arm_neon.h>
361 
static inline void
volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points)
{
  /* NEON kernel: 4 complex floats per iteration using deinterleaving
     ld2/st2 so val[0] holds the real parts and val[1] the imaginary parts. */
  const unsigned int quarter_points = num_points / 4;
  const lv_32fc_t* in = aVector;
  lv_32fc_t* out = cVector;
  unsigned int n;

  /* s.val[0] = real(scalar) in every lane, s.val[1] = imag(scalar).
     The scalar is reinterpreted as two adjacent floats (re, im). */
  float32x4x2_t s;
  s.val[0] = vld1q_dup_f32((const float*)&scalar);
  s.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);

  for (n = 0; n < quarter_points; n++) {
    float32x4x2_t x = vld2q_f32((float*)in); /* x.val[0]=reals, x.val[1]=imags */
    float32x4x2_t prod;
    prod.val[0] = vmulq_f32(x.val[0], s.val[0]);              /* ar*sr        */
    prod.val[1] = vmulq_f32(x.val[1], s.val[0]);              /* ai*sr        */
    prod.val[0] = vmlsq_f32(prod.val[0], x.val[1], s.val[1]); /* ... - ai*si  */
    prod.val[1] = vmlaq_f32(prod.val[1], x.val[0], s.val[1]); /* ... + ar*si  */
    vst2q_f32((float*)out, prod); /* re-interleave on store */
    in += 4;
    out += 4;
  }

  /* Scalar tail for the remaining 0-3 points. */
  for (n = quarter_points * 4; n < num_points; n++) {
    *out++ = (*in++) * scalar;
  }
}
390 #endif /* LV_HAVE_NEON */
391 
392 #ifdef LV_HAVE_GENERIC
393 
394 static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
395  lv_32fc_t* cPtr = cVector;
396  const lv_32fc_t* aPtr = aVector;
397  unsigned int number = num_points;
398 
399  // unwrap loop
400  while (number >= 8){
401  *cPtr++ = (*aPtr++) * scalar;
402  *cPtr++ = (*aPtr++) * scalar;
403  *cPtr++ = (*aPtr++) * scalar;
404  *cPtr++ = (*aPtr++) * scalar;
405  *cPtr++ = (*aPtr++) * scalar;
406  *cPtr++ = (*aPtr++) * scalar;
407  *cPtr++ = (*aPtr++) * scalar;
408  *cPtr++ = (*aPtr++) * scalar;
409  number -= 8;
410  }
411 
412  // clean up any remaining
413  while (number-- > 0)
414  *cPtr++ = *aPtr++ * scalar;
415 }
416 #endif /* LV_HAVE_GENERIC */
417 
418 #endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H */
static void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:323
static void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:394
static void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:205
static void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:167
for i
Definition: volk_config_fixed.tmpl.h:25
float complex lv_32fc_t
Definition: volk_complex.h:61
static void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:283
static void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:362
#define lv_creal(x)
Definition: volk_complex.h:83
static void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:127
#define lv_cimag(x)
Definition: volk_complex.h:85