Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_32fc_x2_divide_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
72 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
73 #define INCLUDED_volk_32fc_x2_divide_32fc_u_H
74 
75 #include <float.h>
76 #include <inttypes.h>
77 #include <volk/volk_complex.h>
78 
79 #ifdef LV_HAVE_SSE3
80 #include <pmmintrin.h>
82 
83 static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
84  const lv_32fc_t* numeratorVector,
85  const lv_32fc_t* denumeratorVector,
86  unsigned int num_points)
87 {
88  /*
89  * we'll do the "classical"
90  * a a b*
91  * --- = -------
92  * b |b|^2
93  * */
94  unsigned int number = 0;
95  const unsigned int quarterPoints = num_points / 4;
96 
97  __m128 num01, num23, den01, den23, norm, result;
98  lv_32fc_t* c = cVector;
99  const lv_32fc_t* a = numeratorVector;
100  const lv_32fc_t* b = denumeratorVector;
101 
102  for (; number < quarterPoints; number++) {
103  num01 = _mm_loadu_ps((float*)a); // first pair
104  den01 = _mm_loadu_ps((float*)b); // first pair
105  num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
106  a += 2;
107  b += 2;
108 
109  num23 = _mm_loadu_ps((float*)a); // second pair
110  den23 = _mm_loadu_ps((float*)b); // second pair
111  num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
112  a += 2;
113  b += 2;
114 
115  norm = _mm_magnitudesquared_ps_sse3(den01, den23);
116  den01 = _mm_unpacklo_ps(norm, norm);
117  den23 = _mm_unpackhi_ps(norm, norm);
118 
119  result = _mm_div_ps(num01, den01);
120  _mm_storeu_ps((float*)c, result); // Store the results back into the C container
121  c += 2;
122  result = _mm_div_ps(num23, den23);
123  _mm_storeu_ps((float*)c, result); // Store the results back into the C container
124  c += 2;
125  }
126 
127  number *= 4;
128  for (; number < num_points; number++) {
129  *c = (*a) / (*b);
130  a++;
131  b++;
132  c++;
133  }
134 }
135 #endif /* LV_HAVE_SSE3 */
136 
137 
138 #ifdef LV_HAVE_AVX
139 #include <immintrin.h>
141 
142 static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
143  const lv_32fc_t* numeratorVector,
144  const lv_32fc_t* denumeratorVector,
145  unsigned int num_points)
146 {
147  /*
148  * we'll do the "classical"
149  * a a b*
150  * --- = -------
151  * b |b|^2
152  * */
153  unsigned int number = 0;
154  const unsigned int quarterPoints = num_points / 4;
155 
156  __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
157  lv_32fc_t* c = cVector;
158  const lv_32fc_t* a = numeratorVector;
159  const lv_32fc_t* b = denumeratorVector;
160 
161  for (; number < quarterPoints; number++) {
162  num = _mm256_loadu_ps(
163  (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
164  denum = _mm256_loadu_ps(
165  (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
166  mul_conj = _mm256_complexconjugatemul_ps(num, denum);
167  sq = _mm256_mul_ps(denum, denum); // Square the values
168  mag_sq_un = _mm256_hadd_ps(
169  sq, sq); // obtain the actual squared magnitude, although out of order
170  mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
171  // best guide I found on using these functions:
172  // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
173  div = _mm256_div_ps(mul_conj, mag_sq);
174 
175  _mm256_storeu_ps((float*)c, div); // Store the results back into the C container
176 
177  a += 4;
178  b += 4;
179  c += 4;
180  }
181 
182  number = quarterPoints * 4;
183 
184  for (; number < num_points; number++) {
185  *c++ = (*a++) / (*b++);
186  }
187 }
188 #endif /* LV_HAVE_AVX */
189 
190 
191 #ifdef LV_HAVE_GENERIC
192 
193 static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
194  const lv_32fc_t* aVector,
195  const lv_32fc_t* bVector,
196  unsigned int num_points)
197 {
198  lv_32fc_t* cPtr = cVector;
199  const lv_32fc_t* aPtr = aVector;
200  const lv_32fc_t* bPtr = bVector;
201  unsigned int number = 0;
202 
203  for (number = 0; number < num_points; number++) {
204  *cPtr++ = (*aPtr++) / (*bPtr++);
205  }
206 }
207 #endif /* LV_HAVE_GENERIC */
208 
209 
210 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
211 
212 
213 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
214 #define INCLUDED_volk_32fc_x2_divide_32fc_a_H
215 
216 #include <float.h>
217 #include <inttypes.h>
218 #include <stdio.h>
219 #include <volk/volk_complex.h>
220 
221 #ifdef LV_HAVE_SSE3
222 #include <pmmintrin.h>
224 
225 static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
226  const lv_32fc_t* numeratorVector,
227  const lv_32fc_t* denumeratorVector,
228  unsigned int num_points)
229 {
230  /*
231  * we'll do the "classical"
232  * a a b*
233  * --- = -------
234  * b |b|^2
235  * */
236  unsigned int number = 0;
237  const unsigned int quarterPoints = num_points / 4;
238 
239  __m128 num01, num23, den01, den23, norm, result;
240  lv_32fc_t* c = cVector;
241  const lv_32fc_t* a = numeratorVector;
242  const lv_32fc_t* b = denumeratorVector;
243 
244  for (; number < quarterPoints; number++) {
245  num01 = _mm_load_ps((float*)a); // first pair
246  den01 = _mm_load_ps((float*)b); // first pair
247  num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
248  a += 2;
249  b += 2;
250 
251  num23 = _mm_load_ps((float*)a); // second pair
252  den23 = _mm_load_ps((float*)b); // second pair
253  num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
254  a += 2;
255  b += 2;
256 
257  norm = _mm_magnitudesquared_ps_sse3(den01, den23);
258 
259  den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice
260  den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice
261 
262  result = _mm_div_ps(num01, den01);
263  _mm_store_ps((float*)c, result); // Store the results back into the C container
264  c += 2;
265  result = _mm_div_ps(num23, den23);
266  _mm_store_ps((float*)c, result); // Store the results back into the C container
267  c += 2;
268  }
269 
270  number *= 4;
271  for (; number < num_points; number++) {
272  *c = (*a) / (*b);
273  a++;
274  b++;
275  c++;
276  }
277 }
278 #endif /* LV_HAVE_SSE */
279 
280 #ifdef LV_HAVE_AVX
281 #include <immintrin.h>
283 
284 static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
285  const lv_32fc_t* numeratorVector,
286  const lv_32fc_t* denumeratorVector,
287  unsigned int num_points)
288 {
289  /*
290  * we'll do the "classical"
291  * a a b*
292  * --- = -------
293  * b |b|^2
294  * */
295  unsigned int number = 0;
296  const unsigned int quarterPoints = num_points / 4;
297 
298  __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
299  lv_32fc_t* c = cVector;
300  const lv_32fc_t* a = numeratorVector;
301  const lv_32fc_t* b = denumeratorVector;
302 
303  for (; number < quarterPoints; number++) {
304  num =
305  _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
306  denum =
307  _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
308  mul_conj = _mm256_complexconjugatemul_ps(num, denum);
309  sq = _mm256_mul_ps(denum, denum); // Square the values
310  mag_sq_un = _mm256_hadd_ps(
311  sq, sq); // obtain the actual squared magnitude, although out of order
312  mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
313  // best guide I found on using these functions:
314  // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
315  div = _mm256_div_ps(mul_conj, mag_sq);
316 
317  _mm256_store_ps((float*)c, div); // Store the results back into the C container
318 
319  a += 4;
320  b += 4;
321  c += 4;
322  }
323 
324  number = quarterPoints * 4;
325 
326  for (; number < num_points; number++) {
327  *c++ = (*a++) / (*b++);
328  }
329 }
330 #endif /* LV_HAVE_AVX */
331 
332 #ifdef LV_HAVE_NEON
333 #include <arm_neon.h>
334 
335 static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
336  const lv_32fc_t* aVector,
337  const lv_32fc_t* bVector,
338  unsigned int num_points)
339 {
340  lv_32fc_t* cPtr = cVector;
341  const lv_32fc_t* aPtr = aVector;
342  const lv_32fc_t* bPtr = bVector;
343 
344  float32x4x2_t aVal, bVal, cVal;
345  float32x4_t bAbs, bAbsInv;
346 
347  const unsigned int quarterPoints = num_points / 4;
348  unsigned int number = 0;
349  for (; number < quarterPoints; number++) {
350  aVal = vld2q_f32((const float*)(aPtr));
351  bVal = vld2q_f32((const float*)(bPtr));
352  aPtr += 4;
353  bPtr += 4;
354  __VOLK_PREFETCH(aPtr + 4);
355  __VOLK_PREFETCH(bPtr + 4);
356 
357  bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
358  bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
359 
360  bAbsInv = vrecpeq_f32(bAbs);
361  bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
362  bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
363 
364  cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
365  cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
366  cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
367 
368  cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
369  cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
370  cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
371 
372  vst2q_f32((float*)(cPtr), cVal);
373  cPtr += 4;
374  }
375 
376  for (number = quarterPoints * 4; number < num_points; number++) {
377  *cPtr++ = (*aPtr++) / (*bPtr++);
378  }
379 }
380 #endif /* LV_HAVE_NEON */
381 
382 
383 #ifdef LV_HAVE_GENERIC
384 
385 static inline void volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector,
386  const lv_32fc_t* aVector,
387  const lv_32fc_t* bVector,
388  unsigned int num_points)
389 {
390  lv_32fc_t* cPtr = cVector;
391  const lv_32fc_t* aPtr = aVector;
392  const lv_32fc_t* bPtr = bVector;
393  unsigned int number = 0;
394 
395  for (number = 0; number < num_points; number++) {
396  *cPtr++ = (*aPtr++) / (*bPtr++);
397  }
398 }
399 #endif /* LV_HAVE_GENERIC */
400 
401 
402 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */
_mm256_complexconjugatemul_ps
static __m256 _mm256_complexconjugatemul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:50
volk_32fc_x2_divide_32fc_a_avx
static void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:284
volk_sse3_intrinsics.h
volk_32fc_x2_divide_32fc_u_sse3
static void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:83
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
volk_32fc_x2_divide_32fc_generic
static void volk_32fc_x2_divide_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:193
volk_32fc_x2_divide_32fc_a_generic
static void volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:385
volk_32fc_x2_divide_32fc_u_avx
static void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:142
lv_32fc_t
float complex lv_32fc_t
Definition: volk_complex.h:70
volk_32fc_x2_divide_32fc_a_sse3
static void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:225
volk_complex.h
_mm_magnitudesquared_ps_sse3
static __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse3_intrinsics.h:51
volk_avx_intrinsics.h
_mm_complexconjugatemul_ps
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:44
volk_32fc_x2_divide_32fc_neon
static void volk_32fc_x2_divide_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:335