Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_multiply_conjugate_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
72 #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
73 
74 #include <inttypes.h>
75 #include <stdio.h>
76 #include <volk/volk_complex.h>
77 #include <float.h>
78 
79 #ifdef LV_HAVE_AVX
80 #include <immintrin.h>
82 
83 static inline void
85  const lv_32fc_t* bVector, unsigned int num_points)
86 {
87  unsigned int number = 0;
88  const unsigned int quarterPoints = num_points / 4;
89 
90  __m256 x, y, z;
91  lv_32fc_t* c = cVector;
92  const lv_32fc_t* a = aVector;
93  const lv_32fc_t* b = bVector;
94 
95  for(; number < quarterPoints; number++){
96  x = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
97  y = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
99  _mm256_storeu_ps((float*) c, z); // Store the results back into the C container
100 
101  a += 4;
102  b += 4;
103  c += 4;
104  }
105 
106  number = quarterPoints * 4;
107 
108  for(; number < num_points; number++){
109  *c++ = (*a++) * lv_conj(*b++);
110  }
111 }
112 #endif /* LV_HAVE_AVX */
113 
114 
115 #ifdef LV_HAVE_SSE3
116 #include <pmmintrin.h>
118 
119 static inline void
121  const lv_32fc_t* bVector, unsigned int num_points)
122 {
123  unsigned int number = 0;
124  const unsigned int halfPoints = num_points / 2;
125 
126  __m128 x, y, z;
127  lv_32fc_t* c = cVector;
128  const lv_32fc_t* a = aVector;
129  const lv_32fc_t* b = bVector;
130 
131  for(; number < halfPoints; number++){
132  x = _mm_loadu_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
133  y = _mm_loadu_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
134  z = _mm_complexconjugatemul_ps(x, y);
135  _mm_storeu_ps((float*) c, z); // Store the results back into the C container
136 
137  a += 2;
138  b += 2;
139  c += 2;
140  }
141 
142  if((num_points % 2) != 0){
143  *c = (*a) * lv_conj(*b);
144  }
145 }
146 #endif /* LV_HAVE_SSE3 */
147 
148 
149 #ifdef LV_HAVE_GENERIC
150 
151 static inline void
153  const lv_32fc_t* bVector, unsigned int num_points)
154 {
155  lv_32fc_t* cPtr = cVector;
156  const lv_32fc_t* aPtr = aVector;
157  const lv_32fc_t* bPtr= bVector;
158  unsigned int number = 0;
159 
160  for(number = 0; number < num_points; number++){
161  *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
162  }
163 }
164 #endif /* LV_HAVE_GENERIC */
165 
166 
167 
168 #endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
169 #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
170 #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
171 
172 #include <inttypes.h>
173 #include <stdio.h>
174 #include <volk/volk_complex.h>
175 #include <float.h>
176 
177 #ifdef LV_HAVE_AVX
178 #include <immintrin.h>
180 
181 static inline void
183  const lv_32fc_t* bVector, unsigned int num_points)
184 {
185  unsigned int number = 0;
186  const unsigned int quarterPoints = num_points / 4;
187 
188  __m256 x, y, z;
189  lv_32fc_t* c = cVector;
190  const lv_32fc_t* a = aVector;
191  const lv_32fc_t* b = bVector;
192 
193  for(; number < quarterPoints; number++){
194  x = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
195  y = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
197  _mm256_store_ps((float*) c, z); // Store the results back into the C container
198 
199  a += 4;
200  b += 4;
201  c += 4;
202  }
203 
204  number = quarterPoints * 4;
205 
206  for(; number < num_points; number++){
207  *c++ = (*a++) * lv_conj(*b++);
208  }
209 }
210 #endif /* LV_HAVE_AVX */
211 
212 
213 #ifdef LV_HAVE_SSE3
214 #include <pmmintrin.h>
216 
217 static inline void
219  const lv_32fc_t* bVector, unsigned int num_points)
220 {
221  unsigned int number = 0;
222  const unsigned int halfPoints = num_points / 2;
223 
224  __m128 x, y, z;
225  lv_32fc_t* c = cVector;
226  const lv_32fc_t* a = aVector;
227  const lv_32fc_t* b = bVector;
228 
229  for(; number < halfPoints; number++){
230  x = _mm_load_ps((float*) a); // Load the ar + ai, br + bi as ar,ai,br,bi
231  y = _mm_load_ps((float*) b); // Load the cr + ci, dr + di as cr,ci,dr,di
232  z = _mm_complexconjugatemul_ps(x, y);
233  _mm_store_ps((float*) c, z); // Store the results back into the C container
234 
235  a += 2;
236  b += 2;
237  c += 2;
238  }
239 
240  if((num_points % 2) != 0){
241  *c = (*a) * lv_conj(*b);
242  }
243 }
244 #endif /* LV_HAVE_SSE3 */
245 
246 
247 #ifdef LV_HAVE_NEON
248 #include <arm_neon.h>
249 
250 static inline void
252  const lv_32fc_t* bVector, unsigned int num_points)
253 {
254  lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
255  lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
256  unsigned int quarter_points = num_points / 4;
257  float32x4x2_t a_val, b_val, c_val;
258  float32x4x2_t tmp_real, tmp_imag;
259  unsigned int number = 0;
260 
261  for(number = 0; number < quarter_points; ++number) {
262  a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
263  b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
264  b_val.val[1] = vnegq_f32(b_val.val[1]);
265  __VOLK_PREFETCH(a_ptr+4);
266  __VOLK_PREFETCH(b_ptr+4);
267 
268  // multiply the real*real and imag*imag to get real result
269  // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
270  tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
271  // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
272  tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
273 
274  // Multiply cross terms to get the imaginary result
275  // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
276  tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
277  // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
278  tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
279 
280  // store the results
281  c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
282  c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
283  vst2q_f32((float*)cVector, c_val);
284 
285  a_ptr += 4;
286  b_ptr += 4;
287  cVector += 4;
288  }
289 
290  for(number = quarter_points*4; number < num_points; number++){
291  *cVector++ = (*a_ptr++) * conj(*b_ptr++);
292  }
293 }
294 #endif /* LV_HAVE_NEON */
295 
296 
297 #ifdef LV_HAVE_GENERIC
298 
299 static inline void
301  const lv_32fc_t* bVector, unsigned int num_points)
302 {
303  lv_32fc_t* cPtr = cVector;
304  const lv_32fc_t* aPtr = aVector;
305  const lv_32fc_t* bPtr= bVector;
306  unsigned int number = 0;
307 
308  for(number = 0; number < num_points; number++){
309  *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
310  }
311 }
312 #endif /* LV_HAVE_GENERIC */
313 
314 
315 #endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */
static void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_conjugate_32fc.h:152
static __m256 _mm256_complexconjugatemul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:51
#define lv_conj(x)
Definition: volk_complex.h:87
static void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_conjugate_32fc.h:218
static void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_conjugate_32fc.h:251
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:45
static void volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_conjugate_32fc.h:182
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:39
float complex lv_32fc_t
Definition: volk_complex.h:61
static void volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_conjugate_32fc.h:84
static void volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_conjugate_32fc.h:300
static void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_multiply_conjugate_32fc.h:120