Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_divide_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
72 #define INCLUDED_volk_32fc_x2_divide_32fc_u_H
73 
74 #include <inttypes.h>
75 #include <volk/volk_complex.h>
76 #include <float.h>
77 
78 #ifdef LV_HAVE_SSE3
79 #include <pmmintrin.h>
81 
82 static inline void
83 volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
84  const lv_32fc_t* denumeratorVector, unsigned int num_points)
85 {
86  /*
87  * we'll do the "classical"
88  * a a b*
89  * --- = -------
90  * b |b|^2
91  * */
92  unsigned int number = 0;
93  const unsigned int quarterPoints = num_points / 4;
94 
95  __m128 num01, num23, den01, den23, norm, result;
96  lv_32fc_t* c = cVector;
97  const lv_32fc_t* a = numeratorVector;
98  const lv_32fc_t* b = denumeratorVector;
99 
100  for(; number < quarterPoints; number++){
101  num01 = _mm_loadu_ps((float*) a); // first pair
102  den01 = _mm_loadu_ps((float*) b); // first pair
103  num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
104  a += 2;
105  b += 2;
106 
107  num23 = _mm_loadu_ps((float*) a); // second pair
108  den23 = _mm_loadu_ps((float*) b); // second pair
109  num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
110  a += 2;
111  b += 2;
112 
113  norm = _mm_magnitudesquared_ps_sse3(den01, den23);
114  den01 = _mm_unpacklo_ps(norm,norm);
115  den23 = _mm_unpackhi_ps(norm,norm);
116 
117  result = _mm_div_ps(num01, den01);
118  _mm_storeu_ps((float*) c, result); // Store the results back into the C container
119  c += 2;
120  result = _mm_div_ps(num23, den23);
121  _mm_storeu_ps((float*) c, result); // Store the results back into the C container
122  c += 2;
123  }
124 
125  number *= 4;
126  for(;number < num_points; number++){
127  *c = (*a) / (*b);
128  a++; b++; c++;
129  }
130 }
131 #endif /* LV_HAVE_SSE3 */
132 
133 
134 #ifdef LV_HAVE_AVX
135 #include <immintrin.h>
137 
138 static inline void
139 volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
140  const lv_32fc_t* denumeratorVector, unsigned int num_points)
141 {
142  /*
143  * we'll do the "classical"
144  * a a b*
145  * --- = -------
146  * b |b|^2
147  * */
148  unsigned int number = 0;
149  const unsigned int quarterPoints = num_points / 4;
150 
151  __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
152  lv_32fc_t* c = cVector;
153  const lv_32fc_t* a = numeratorVector;
154  const lv_32fc_t* b = denumeratorVector;
155 
156  for(; number < quarterPoints; number++){
157  num = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
158  denum = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
159  mul_conj = _mm256_complexconjugatemul_ps(num, denum);
160  sq = _mm256_mul_ps(denum, denum); // Square the values
161  mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order
162  mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
163  // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
164  div = _mm256_div_ps(mul_conj,mag_sq);
165 
166  _mm256_storeu_ps((float*) c, div); // Store the results back into the C container
167 
168  a += 4;
169  b += 4;
170  c += 4;
171  }
172 
173  number = quarterPoints * 4;
174 
175  for(; number < num_points; number++){
176  *c++ = (*a++) / (*b++);
177  }
178 
179 }
180 #endif /* LV_HAVE_AVX */
181 
182 
183 #ifdef LV_HAVE_GENERIC
184 
185 static inline void
187  const lv_32fc_t* bVector, unsigned int num_points)
188 {
189  lv_32fc_t* cPtr = cVector;
190  const lv_32fc_t* aPtr = aVector;
191  const lv_32fc_t* bPtr= bVector;
192  unsigned int number = 0;
193 
194  for(number = 0; number < num_points; number++){
195  *cPtr++ = (*aPtr++) / (*bPtr++);
196  }
197 }
198 #endif /* LV_HAVE_GENERIC */
199 
200 
201 
202 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
203 
204 
205 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
206 #define INCLUDED_volk_32fc_x2_divide_32fc_a_H
207 
208 #include <inttypes.h>
209 #include <stdio.h>
210 #include <volk/volk_complex.h>
211 #include <float.h>
212 
213 #ifdef LV_HAVE_SSE3
214 #include <pmmintrin.h>
216 
217 static inline void
218 volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
219  const lv_32fc_t* denumeratorVector, unsigned int num_points)
220 {
221  /*
222  * we'll do the "classical"
223  * a a b*
224  * --- = -------
225  * b |b|^2
226  * */
227  unsigned int number = 0;
228  const unsigned int quarterPoints = num_points / 4;
229 
230  __m128 num01, num23, den01, den23, norm, result;
231  lv_32fc_t* c = cVector;
232  const lv_32fc_t* a = numeratorVector;
233  const lv_32fc_t* b = denumeratorVector;
234 
235  for(; number < quarterPoints; number++){
236  num01 = _mm_load_ps((float*) a); // first pair
237  den01 = _mm_load_ps((float*) b); // first pair
238  num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
239  a += 2;
240  b += 2;
241 
242  num23 = _mm_load_ps((float*) a); // second pair
243  den23 = _mm_load_ps((float*) b); // second pair
244  num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
245  a += 2;
246  b += 2;
247 
248  norm = _mm_magnitudesquared_ps_sse3(den01, den23);
249 
250  den01 = _mm_unpacklo_ps(norm,norm); // select the lower floats twice
251  den23 = _mm_unpackhi_ps(norm,norm); // select the upper floats twice
252 
253  result = _mm_div_ps(num01, den01);
254  _mm_store_ps((float*) c, result); // Store the results back into the C container
255  c += 2;
256  result = _mm_div_ps(num23, den23);
257  _mm_store_ps((float*) c, result); // Store the results back into the C container
258  c += 2;
259  }
260 
261  number *= 4;
262  for(;number < num_points; number++){
263  *c = (*a) / (*b);
264  a++; b++; c++;
265  }
266 }
267 #endif /* LV_HAVE_SSE */
268 
269 #ifdef LV_HAVE_AVX
270 #include <immintrin.h>
272 
273 static inline void
274 volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
275  const lv_32fc_t* denumeratorVector, unsigned int num_points)
276 {
277  /*
278  * we'll do the "classical"
279  * a a b*
280  * --- = -------
281  * b |b|^2
282  * */
283  unsigned int number = 0;
284  const unsigned int quarterPoints = num_points / 4;
285 
286  __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
287  lv_32fc_t* c = cVector;
288  const lv_32fc_t* a = numeratorVector;
289  const lv_32fc_t* b = denumeratorVector;
290 
291  for(; number < quarterPoints; number++){
292  num = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
293  denum = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
294  mul_conj = _mm256_complexconjugatemul_ps(num, denum);
295  sq = _mm256_mul_ps(denum, denum); // Square the values
296  mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order
297  mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
298  // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
299  div = _mm256_div_ps(mul_conj,mag_sq);
300 
301  _mm256_store_ps((float*) c, div); // Store the results back into the C container
302 
303  a += 4;
304  b += 4;
305  c += 4;
306  }
307 
308  number = quarterPoints * 4;
309 
310  for(; number < num_points; number++){
311  *c++ = (*a++) / (*b++);
312  }
313 
314 
315 }
316 #endif /* LV_HAVE_AVX */
317 
318 #ifdef LV_HAVE_NEON
319 #include <arm_neon.h>
320 
321 static inline void
323  const lv_32fc_t* bVector, unsigned int num_points)
324 {
325  lv_32fc_t* cPtr = cVector;
326  const lv_32fc_t* aPtr = aVector;
327  const lv_32fc_t* bPtr = bVector;
328 
329  float32x4x2_t aVal, bVal, cVal;
330  float32x4_t bAbs, bAbsInv;
331 
332  const unsigned int quarterPoints = num_points / 4;
333  unsigned int number = 0;
334  for(; number < quarterPoints; number++){
335  aVal = vld2q_f32((const float*)(aPtr));
336  bVal = vld2q_f32((const float*)(bPtr));
337  aPtr += 4;
338  bPtr += 4;
339  __VOLK_PREFETCH(aPtr+4);
340  __VOLK_PREFETCH(bPtr+4);
341 
342  bAbs = vmulq_f32( bVal.val[0], bVal.val[0]);
343  bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
344 
345  bAbsInv = vrecpeq_f32(bAbs);
346  bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
347  bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
348 
349  cVal.val[0] = vmulq_f32( aVal.val[0], bVal.val[0]);
350  cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
351  cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
352 
353  cVal.val[1] = vmulq_f32( aVal.val[1], bVal.val[0]);
354  cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
355  cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
356 
357  vst2q_f32((float*)(cPtr), cVal);
358  cPtr += 4;
359  }
360 
361  for(number = quarterPoints * 4; number < num_points; number++){
362  *cPtr++ = (*aPtr++) / (*bPtr++);
363  }
364 }
365 #endif /* LV_HAVE_NEON */
366 
367 
368 #ifdef LV_HAVE_GENERIC
369 
370 static inline void
372  const lv_32fc_t* bVector, unsigned int num_points)
373 {
374  lv_32fc_t* cPtr = cVector;
375  const lv_32fc_t* aPtr = aVector;
376  const lv_32fc_t* bPtr= bVector;
377  unsigned int number = 0;
378 
379  for(number = 0; number < num_points; number++){
380  *cPtr++ = (*aPtr++) / (*bPtr++);
381  }
382 }
383 #endif /* LV_HAVE_GENERIC */
384 
385 
386 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */
static void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:274
static __m256 _mm256_complexconjugatemul_ps(__m256 x, __m256 y)
Definition: volk_avx_intrinsics.h:51
static __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse3_intrinsics.h:53
static void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:218
static void volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:371
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:45
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:39
float complex lv_32fc_t
Definition: volk_complex.h:61
static void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:83
static void volk_32fc_x2_divide_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:322
static void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:139
static void volk_32fc_x2_divide_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:186