Vector Optimized Library of Kernels 2.0
Architecture-tuned implementations of math kernels
volk_32fc_s32fc_x2_rotator_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H


#include <volk/volk_complex.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define ROTATOR_RELOAD 512
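/* The kernels below renormalize the running phase after every ROTATOR_RELOAD
 * inner-loop iterations, so that accumulated floating-point rounding error
 * cannot let the phase magnitude drift away from 1.0. */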


#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
    unsigned int i = 0;
    int j = 0;
    for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {
            *outVector++ = *inVector++ * (*phase);
            (*phase) *= phase_inc;
        }
#ifdef __cplusplus
        (*phase) /= std::abs((*phase));
#else
        //(*phase) /= cabsf((*phase));
        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
#endif
    }
    for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) {
        *outVector++ = *inVector++ * (*phase);
        (*phase) *= phase_inc;
    }

}

#endif /* LV_HAVE_GENERIC */
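/*
 * A minimal usage sketch (not part of this header): it assumes the usual VOLK
 * dispatcher volk_32fc_s32fc_x2_rotator_32fc() from <volk/volk.h> together
 * with volk_malloc()/volk_free()/volk_get_alignment() and lv_cmake(), and
 * rotates N samples by 0.1 rad per sample.
 *
 *   unsigned int N = 1024;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* in  = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
 *   lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
 *   for(unsigned int k = 0; k < N; ++k) in[k] = lv_cmake(1.f, 0.f);
 *   lv_32fc_t phase_inc = lv_cmake(cosf(0.1f), sinf(0.1f)); // unit-magnitude step
 *   lv_32fc_t phase = lv_cmake(1.f, 0.f);                   // starting phase
 *   volk_32fc_s32fc_x2_rotator_32fc(out, in, phase_inc, &phase, N);
 *   volk_free(in);
 *   volk_free(out);
 */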


#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
{
    lv_32fc_t* outputVectorPtr = outVector;
    const lv_32fc_t* inputVectorPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phasePtr[4] = {(*phase), (*phase), (*phase), (*phase)};
    float32x4x2_t input_vec;
    float32x4x2_t output_vec;

    unsigned int i = 0, j = 0;
    const unsigned int quarter_points = num_points / 4;

    for(i = 0; i < 4; ++i) {
        phasePtr[i] *= incr;
        incr *= (phase_inc);
    }

    // Notice that incr has been incremented in the previous loop
    const lv_32fc_t incrPtr[4] = {incr, incr, incr, incr};
    const float32x4x2_t incr_vec = vld2q_f32((float*) incrPtr);
    float32x4x2_t phase_vec = vld2q_f32((float*) phasePtr);

    for(i = 0; i < (unsigned int)(quarter_points/ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; j++) {
            input_vec = vld2q_f32((float*) inputVectorPtr);
            // Prefetch the next input; this speeds things up
            __VOLK_PREFETCH(inputVectorPtr+4);
            // Rotate
            output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
            // Increase phase
            phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
            // Store output
            vst2q_f32((float*)outputVectorPtr, output_vec);

            outputVectorPtr += 4;
            inputVectorPtr += 4;
        }
        // Normalize the phase so its magnitude doesn't grow because of
        // floating point rounding error
        const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
        const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
        // Multiply complex with real
        phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
        phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
    }

    for(i = 0; i < quarter_points % ROTATOR_RELOAD; i++) {
        input_vec = vld2q_f32((float*) inputVectorPtr);
        // Prefetch the next input; this speeds things up
        __VOLK_PREFETCH(inputVectorPtr+4);
        // Rotate
        output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
        // Increase phase
        phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
        // Store output
        vst2q_f32((float*)outputVectorPtr, output_vec);

        outputVectorPtr += 4;
        inputVectorPtr += 4;
    }
    // i != 0 means the loop above ran at least once
    if (i) {
        // Normalize the phase so its magnitude doesn't grow because of
        // floating point rounding error
        const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
        const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
        // Multiply complex with real
        phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
        phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
    }
    // Store current phase
    vst2q_f32((float*)phasePtr, phase_vec);

    // Deal with the rest
    for(i = 0; i < num_points % 4; i++) {
        *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
        phasePtr[0] *= (phase_inc);
    }

    // Keep the phase continuous for the next call to this function
    (*phase) = phasePtr[0];
}

#endif /* LV_HAVE_NEON */
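/*
 * For reference: vld2q_f32() de-interleaves four complex samples into split
 * real/imaginary registers, so _vmultiply_complexq_f32() can form the usual
 * complex product per lane,
 *   (a + jb)(c + jd) = (a*c - b*d) + j(a*d + b*c),
 * and the per-block normalization simply scales both registers by
 * 1/sqrt(re*re + im*im) so the rotator phase keeps unit magnitude.
 */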


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
    printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
    printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    const unsigned int halfPoints = num_points / 2;


    for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm_load_ps((float*)aPtr);

            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_store_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }
    for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
        aVal = _mm_load_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);

        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_store_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    if (i) {
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points%2; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];

}

#endif /* LV_HAVE_SSE4_1 for aligned */
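/*
 * The SSE pattern above is the standard shuffle-based complex multiply:
 * moveldup/movehdup broadcast the real and imaginary parts of the phase, the
 * 0xB1 shuffle swaps re/im within each complex pair, and addsub combines the
 * partial products into (a*c - b*d) + j(a*d + b*c). The hadd/shuffle/sqrt
 * sequence after each ROTATOR_RELOAD block places |phase| next to each
 * complex pair so the division renormalizes both phases to unit magnitude.
 */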


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
    printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
    printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    const unsigned int halfPoints = num_points / 2;


    for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm_loadu_ps((float*)aPtr);

            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_storeu_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }
    for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
        aVal = _mm_loadu_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);

        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_storeu_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    if (i) {
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points%2; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];

}

#endif /* LV_HAVE_SSE4_1 */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
    printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
    printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2]));
    printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3]));
    printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
    const unsigned int fourthPoints = num_points / 4;


    for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm256_load_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = _mm256_mul_ps(aVal, yl);
            tmp1p = _mm256_mul_ps(phase_Val, ylp);

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_addsub_ps(tmp1, tmp2);
            phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = _mm256_mul_ps(aVal, yl);

        tmp1p = _mm256_mul_ps(phase_Val, ylp);

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_addsub_ps(tmp1, tmp2);
        phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points%4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];

}

#endif /* LV_HAVE_AVX for aligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
    printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
    printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2]));
    printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3]));
    printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
    const unsigned int fourthPoints = num_points / 4;


    for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm256_loadu_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = _mm256_mul_ps(aVal, yl);
            tmp1p = _mm256_mul_ps(phase_Val, ylp);

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_addsub_ps(tmp1, tmp2);
            phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = _mm256_mul_ps(aVal, yl);

        tmp1p = _mm256_mul_ps(phase_Val, ylp);

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_addsub_ps(tmp1, tmp2);
        phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points%4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];

}

#endif /* LV_HAVE_AVX */
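/*
 * The AVX kernels are the 256-bit analogue of the SSE4.1 code: they carry four
 * rotator phases (phase_inc^0 .. phase_inc^3), advance all of them by
 * phase_inc^4 each iteration, and the _a_ and _u_ variants differ only in
 * whether aligned or unaligned loads and stores are used for the data.
 */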

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    __VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_load_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
    const unsigned int fourthPoints = num_points / 4;

    for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm256_load_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = aVal;
            tmp1p = phase_Val;

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
            phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }

    _mm256_store_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points%4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];

}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
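/*
 * The FMA variants fold the first multiply into the add/sub step:
 * _mm256_fmaddsub_ps(tmp1, yl, tmp2) computes tmp1*yl alternately minus and
 * plus tmp2 in one instruction, replacing the separate _mm256_mul_ps and
 * _mm256_addsub_ps pair used in the plain AVX kernels. The rotation math is
 * otherwise identical.
 */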

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};

    unsigned int i, j = 0;

    for(i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr),
                            lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
    const unsigned int fourthPoints = num_points / 4;

    for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
        for(j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm256_loadu_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = aVal;
            tmp1p = phase_Val;

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
            phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    for(i = 0; i < num_points%4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];

}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA */

#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */