Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_16ic_s32f_magnitude_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
55 #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
56 #define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
57 
58 #include <volk/volk_common.h>
59 #include <inttypes.h>
60 #include <stdio.h>
61 #include <math.h>
62 
63 #ifdef LV_HAVE_AVX2
64 #include <immintrin.h>
65 
66 static inline void
67 volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector, const lv_16sc_t* complexVector,
68  const float scalar, unsigned int num_points)
69 {
70  unsigned int number = 0;
71  const unsigned int eighthPoints = num_points / 8;
72 
73  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
74  float* magnitudeVectorPtr = magnitudeVector;
75 
76  __m256 invScalar = _mm256_set1_ps(1.0/scalar);
77 
78  __m256 cplxValue1, cplxValue2, result;
79  __m256i int1, int2;
80  __m128i short1, short2;
81  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
82 
83  for(;number < eighthPoints; number++){
84 
85  int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
86  complexVectorPtr += 16;
87  short1 = _mm256_extracti128_si256(int1,0);
88  short2 = _mm256_extracti128_si256(int1,1);
89 
90  int1 = _mm256_cvtepi16_epi32(short1);
91  int2 = _mm256_cvtepi16_epi32(short2);
92  cplxValue1 = _mm256_cvtepi32_ps(int1);
93  cplxValue2 = _mm256_cvtepi32_ps(int2);
94 
95  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
96  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
97 
98  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
99  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
100 
101  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
102  result = _mm256_permutevar8x32_ps(result, idx);
103 
104  result = _mm256_sqrt_ps(result); // Square root the values
105 
106  _mm256_store_ps(magnitudeVectorPtr, result);
107 
108  magnitudeVectorPtr += 8;
109  }
110 
111  number = eighthPoints * 8;
112  magnitudeVectorPtr = &magnitudeVector[number];
113  complexVectorPtr = (const int16_t*)&complexVector[number];
114  for(; number < num_points; number++){
115  float val1Real = (float)(*complexVectorPtr++) / scalar;
116  float val1Imag = (float)(*complexVectorPtr++) / scalar;
117  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
118  }
119 }
120 #endif /* LV_HAVE_AVX2 */
121 
122 
123 #ifdef LV_HAVE_SSE3
124 #include <pmmintrin.h>
125 
126 static inline void
127 volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector, const lv_16sc_t* complexVector,
128  const float scalar, unsigned int num_points)
129 {
130  unsigned int number = 0;
131  const unsigned int quarterPoints = num_points / 4;
132 
133  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
134  float* magnitudeVectorPtr = magnitudeVector;
135 
136  __m128 invScalar = _mm_set_ps1(1.0/scalar);
137 
138  __m128 cplxValue1, cplxValue2, result;
139 
140  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
141 
142  for(;number < quarterPoints; number++){
143 
144  inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
145  inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
146  inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
147  inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
148 
149  inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
150  inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
151  inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
152  inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
153 
154  cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
155  cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
156 
157  complexVectorPtr += 8;
158 
159  cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
160  cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
161 
162  cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
163  cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
164 
165  result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
166 
167  result = _mm_sqrt_ps(result); // Square root the values
168 
169  _mm_store_ps(magnitudeVectorPtr, result);
170 
171  magnitudeVectorPtr += 4;
172  }
173 
174  number = quarterPoints * 4;
175  magnitudeVectorPtr = &magnitudeVector[number];
176  complexVectorPtr = (const int16_t*)&complexVector[number];
177  for(; number < num_points; number++){
178  float val1Real = (float)(*complexVectorPtr++) / scalar;
179  float val1Imag = (float)(*complexVectorPtr++) / scalar;
180  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
181  }
182 }
183 #endif /* LV_HAVE_SSE3 */
184 
185 #ifdef LV_HAVE_SSE
186 #include <xmmintrin.h>
187 
188 static inline void
189 volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector, const lv_16sc_t* complexVector,
190  const float scalar, unsigned int num_points)
191 {
192  unsigned int number = 0;
193  const unsigned int quarterPoints = num_points / 4;
194 
195  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
196  float* magnitudeVectorPtr = magnitudeVector;
197 
198  const float iScalar = 1.0 / scalar;
199  __m128 invScalar = _mm_set_ps1(iScalar);
200 
201  __m128 cplxValue1, cplxValue2, result, re, im;
202 
203  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
204 
205  for(;number < quarterPoints; number++){
206  inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
207  inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
208  inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
209  inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
210 
211  inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
212  inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
213  inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
214  inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
215 
216  cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
217  cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
218 
219  re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
220  im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
221 
222  complexVectorPtr += 8;
223 
224  cplxValue1 = _mm_mul_ps(re, invScalar);
225  cplxValue2 = _mm_mul_ps(im, invScalar);
226 
227  cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
228  cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
229 
230  result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
231 
232  result = _mm_sqrt_ps(result); // Square root the values
233 
234  _mm_store_ps(magnitudeVectorPtr, result);
235 
236  magnitudeVectorPtr += 4;
237  }
238 
239  number = quarterPoints * 4;
240  magnitudeVectorPtr = &magnitudeVector[number];
241  complexVectorPtr = (const int16_t*)&complexVector[number];
242  for(; number < num_points; number++){
243  float val1Real = (float)(*complexVectorPtr++) * iScalar;
244  float val1Imag = (float)(*complexVectorPtr++) * iScalar;
245  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
246  }
247 }
248 
249 
250 #endif /* LV_HAVE_SSE */
251 
252 #ifdef LV_HAVE_GENERIC
253 
254 static inline void
255 volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector, const lv_16sc_t* complexVector,
256  const float scalar, unsigned int num_points)
257 {
258  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
259  float* magnitudeVectorPtr = magnitudeVector;
260  unsigned int number = 0;
261  const float invScalar = 1.0 / scalar;
262  for(number = 0; number < num_points; number++){
263  float real = ( (float) (*complexVectorPtr++)) * invScalar;
264  float imag = ( (float) (*complexVectorPtr++)) * invScalar;
265  *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
266  }
267 }
268 #endif /* LV_HAVE_GENERIC */
269 
#ifdef LV_HAVE_ORC_DISABLED

/* ORC implementation; only declared here, defined outside this header. */
extern void
volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector,
                                        const lv_16sc_t* complexVector,
                                        const float scalar,
                                        unsigned int num_points);

/*
 * Thin wrapper that forwards all four arguments, unchanged, to the ORC
 * implementation declared above.
 */
static inline void
volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector,
                                   const lv_16sc_t* complexVector,
                                   const float scalar,
                                   unsigned int num_points)
{
  volk_16ic_s32f_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, scalar, num_points);
}
#endif /* LV_HAVE_ORC_DISABLED */
283 
284 
285 #endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a_H */
286 
287 #ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
288 #define INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
289 
290 #include <volk/volk_common.h>
291 #include <inttypes.h>
292 #include <stdio.h>
293 #include <math.h>
294 
295 #ifdef LV_HAVE_AVX2
296 #include <immintrin.h>
297 
298 static inline void
299 volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector, const lv_16sc_t* complexVector,
300  const float scalar, unsigned int num_points)
301 {
302  unsigned int number = 0;
303  const unsigned int eighthPoints = num_points / 8;
304 
305  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
306  float* magnitudeVectorPtr = magnitudeVector;
307 
308  __m256 invScalar = _mm256_set1_ps(1.0/scalar);
309 
310  __m256 cplxValue1, cplxValue2, result;
311  __m256i int1, int2;
312  __m128i short1, short2;
313  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
314 
315  for(;number < eighthPoints; number++){
316 
317  int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
318  complexVectorPtr += 16;
319  short1 = _mm256_extracti128_si256(int1,0);
320  short2 = _mm256_extracti128_si256(int1,1);
321 
322  int1 = _mm256_cvtepi16_epi32(short1);
323  int2 = _mm256_cvtepi16_epi32(short2);
324  cplxValue1 = _mm256_cvtepi32_ps(int1);
325  cplxValue2 = _mm256_cvtepi32_ps(int2);
326 
327  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
328  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
329 
330  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
331  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
332 
333  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
334  result = _mm256_permutevar8x32_ps(result, idx);
335 
336  result = _mm256_sqrt_ps(result); // Square root the values
337 
338  _mm256_storeu_ps(magnitudeVectorPtr, result);
339 
340  magnitudeVectorPtr += 8;
341  }
342 
343  number = eighthPoints * 8;
344  magnitudeVectorPtr = &magnitudeVector[number];
345  complexVectorPtr = (const int16_t*)&complexVector[number];
346  for(; number < num_points; number++){
347  float val1Real = (float)(*complexVectorPtr++) / scalar;
348  float val1Imag = (float)(*complexVectorPtr++) / scalar;
349  *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
350  }
351 }
352 #endif /* LV_HAVE_AVX2 */
353 
354 #endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_u_H */
355 
short complex lv_16sc_t
Definition: volk_complex.h:58
static void volk_16ic_s32f_magnitude_32f_a_sse(float *magnitudeVector, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_magnitude_32f.h:189
static void volk_16ic_s32f_magnitude_32f_a_sse3(float *magnitudeVector, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_magnitude_32f.h:127
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_16ic_s32f_magnitude_32f_generic(float *magnitudeVector, const lv_16sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_16ic_s32f_magnitude_32f.h:255