Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_16ic_magnitude_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
54 #ifndef INCLUDED_volk_16ic_magnitude_16i_a_H
55 #define INCLUDED_volk_16ic_magnitude_16i_a_H
56 
57 #include <volk/volk_common.h>
58 #include <inttypes.h>
59 #include <stdio.h>
60 #include <math.h>
61 
62 #ifdef LV_HAVE_AVX2
63 #include <immintrin.h>
64 
65 static inline void
66 volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
67 {
68  unsigned int number = 0;
69  const unsigned int eighthPoints = num_points / 8;
70 
71  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
72  int16_t* magnitudeVectorPtr = magnitudeVector;
73 
74  __m256 vScalar = _mm256_set1_ps(32768.0);
75  __m256 invScalar = _mm256_set1_ps(1.0/32768.0);
76  __m256i int1, int2;
77  __m128i short1, short2;
78  __m256 cplxValue1, cplxValue2, result;
79  __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
80 
81  for(;number < eighthPoints; number++){
82 
83  int1 = _mm256_load_si256((__m256i*)complexVectorPtr);
84  complexVectorPtr += 16;
85  short1 = _mm256_extracti128_si256(int1,0);
86  short2 = _mm256_extracti128_si256(int1,1);
87 
88  int1 = _mm256_cvtepi16_epi32(short1);
89  int2 = _mm256_cvtepi16_epi32(short2);
90  cplxValue1 = _mm256_cvtepi32_ps(int1);
91  cplxValue2 = _mm256_cvtepi32_ps(int2);
92 
93  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
94  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
95 
96  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
97  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
98 
99  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
100 
101  result = _mm256_sqrt_ps(result); // Square root the values
102 
103  result = _mm256_mul_ps(result, vScalar); // Scale the results
104 
105  int1 = _mm256_cvtps_epi32(result);
106  int1 = _mm256_packs_epi32(int1, int1);
107  int1 = _mm256_permutevar8x32_epi32(int1, idx); //permute to compensate for shuffling in hadd and packs
108  short1 = _mm256_extracti128_si256(int1, 0);
109  _mm_store_si128((__m128i*)magnitudeVectorPtr,short1);
110  magnitudeVectorPtr += 8;
111  }
112 
113  number = eighthPoints * 8;
114  magnitudeVectorPtr = &magnitudeVector[number];
115  complexVectorPtr = (const int16_t*)&complexVector[number];
116  for(; number < num_points; number++){
117  const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
118  const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
119  const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
120  *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
121  }
122 }
123 #endif /* LV_HAVE_AVX2 */
124 
125 #ifdef LV_HAVE_SSE3
126 #include <pmmintrin.h>
127 
128 static inline void
129 volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
130 {
131  unsigned int number = 0;
132  const unsigned int quarterPoints = num_points / 4;
133 
134  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
135  int16_t* magnitudeVectorPtr = magnitudeVector;
136 
137  __m128 vScalar = _mm_set_ps1(32768.0);
138  __m128 invScalar = _mm_set_ps1(1.0/32768.0);
139 
140  __m128 cplxValue1, cplxValue2, result;
141 
142  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
143  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
144 
145  for(;number < quarterPoints; number++){
146 
147  inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
148  inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
149  inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
150  inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
151 
152  inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
153  inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
154  inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
155  inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
156 
157  cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
158  cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
159 
160  complexVectorPtr += 8;
161 
162  cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
163  cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
164 
165  cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
166  cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
167 
168  result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
169 
170  result = _mm_sqrt_ps(result); // Square root the values
171 
172  result = _mm_mul_ps(result, vScalar); // Scale the results
173 
174  _mm_store_ps(outputFloatBuffer, result);
175  *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
176  *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
177  *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
178  *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
179  }
180 
181  number = quarterPoints * 4;
182  magnitudeVectorPtr = &magnitudeVector[number];
183  complexVectorPtr = (const int16_t*)&complexVector[number];
184  for(; number < num_points; number++){
185  const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
186  const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
187  const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
188  *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
189  }
190 }
191 #endif /* LV_HAVE_SSE3 */
192 
193 #ifdef LV_HAVE_SSE
194 #include <xmmintrin.h>
195 
196 static inline void
197 volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
198 {
199  unsigned int number = 0;
200  const unsigned int quarterPoints = num_points / 4;
201 
202  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
203  int16_t* magnitudeVectorPtr = magnitudeVector;
204 
205  __m128 vScalar = _mm_set_ps1(32768.0);
206  __m128 invScalar = _mm_set_ps1(1.0/32768.0);
207 
208  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
209 
210  __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4];
211  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
212 
213  for(;number < quarterPoints; number++){
214 
215  inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
216  inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
217  inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
218  inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
219 
220  cplxValue1 = _mm_load_ps(inputFloatBuffer);
221  complexVectorPtr += 4;
222 
223  inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
224  inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
225  inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
226  inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
227 
228  cplxValue2 = _mm_load_ps(inputFloatBuffer);
229  complexVectorPtr += 4;
230 
231  cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
232  cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
233 
234  // Arrange in i1i2i3i4 format
235  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
236  // Arrange in q1q2q3q4 format
237  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
238 
239  iValue = _mm_mul_ps(iValue, iValue); // Square the I values
240  qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
241 
242  result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
243 
244  result = _mm_sqrt_ps(result); // Square root the values
245 
246  result = _mm_mul_ps(result, vScalar); // Scale the results
247 
248  _mm_store_ps(outputFloatBuffer, result);
249  *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
250  *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
251  *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
252  *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
253  }
254 
255  number = quarterPoints * 4;
256  magnitudeVectorPtr = &magnitudeVector[number];
257  complexVectorPtr = (const int16_t*)&complexVector[number];
258  for(; number < num_points; number++){
259  const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
260  const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
261  const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
262  *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
263  }
264 }
265 #endif /* LV_HAVE_SSE */
266 
267 #ifdef LV_HAVE_GENERIC
268 
269 static inline void
270 volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
271 {
272  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
273  int16_t* magnitudeVectorPtr = magnitudeVector;
274  unsigned int number = 0;
275  const float scalar = 32768.0;
276  for(number = 0; number < num_points; number++){
277  float real = ((float)(*complexVectorPtr++)) / scalar;
278  float imag = ((float)(*complexVectorPtr++)) / scalar;
279  *magnitudeVectorPtr++ = (int16_t)rintf(sqrtf((real*real) + (imag*imag)) * scalar);
280  }
281 }
282 #endif /* LV_HAVE_GENERIC */
283 
284 #ifdef LV_HAVE_ORC_DISABLED
285 extern void
286 volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
287 
288 static inline void
289 volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
290 {
291  volk_16ic_magnitude_16i_a_orc_impl(magnitudeVector, complexVector, 32768.0, num_points);
292 }
293 #endif /* LV_HAVE_ORC_DISABLED */
294 
295 
296 #endif /* INCLUDED_volk_16ic_magnitude_16i_a_H */
297 
298 
299 #ifndef INCLUDED_volk_16ic_magnitude_16i_u_H
300 #define INCLUDED_volk_16ic_magnitude_16i_u_H
301 
302 #include <volk/volk_common.h>
303 #include <inttypes.h>
304 #include <stdio.h>
305 #include <math.h>
306 
307 #ifdef LV_HAVE_AVX2
308 #include <immintrin.h>
309 
310 static inline void
311 volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points)
312 {
313  unsigned int number = 0;
314  const unsigned int eighthPoints = num_points / 8;
315 
316  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
317  int16_t* magnitudeVectorPtr = magnitudeVector;
318 
319  __m256 vScalar = _mm256_set1_ps(32768.0);
320  __m256 invScalar = _mm256_set1_ps(1.0/32768.0);
321  __m256i int1, int2;
322  __m128i short1, short2;
323  __m256 cplxValue1, cplxValue2, result;
324  __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
325 
326  for(;number < eighthPoints; number++){
327 
328  int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
329  complexVectorPtr += 16;
330  short1 = _mm256_extracti128_si256(int1,0);
331  short2 = _mm256_extracti128_si256(int1,1);
332 
333  int1 = _mm256_cvtepi16_epi32(short1);
334  int2 = _mm256_cvtepi16_epi32(short2);
335  cplxValue1 = _mm256_cvtepi32_ps(int1);
336  cplxValue2 = _mm256_cvtepi32_ps(int2);
337 
338  cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
339  cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
340 
341  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
342  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
343 
344  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
345 
346  result = _mm256_sqrt_ps(result); // Square root the values
347 
348  result = _mm256_mul_ps(result, vScalar); // Scale the results
349 
350  int1 = _mm256_cvtps_epi32(result);
351  int1 = _mm256_packs_epi32(int1, int1);
352  int1 = _mm256_permutevar8x32_epi32(int1, idx); //permute to compensate for shuffling in hadd and packs
353  short1 = _mm256_extracti128_si256(int1, 0);
354  _mm_storeu_si128((__m128i*)magnitudeVectorPtr,short1);
355  magnitudeVectorPtr += 8;
356  }
357 
358  number = eighthPoints * 8;
359  magnitudeVectorPtr = &magnitudeVector[number];
360  complexVectorPtr = (const int16_t*)&complexVector[number];
361  for(; number < num_points; number++){
362  const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
363  const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
364  const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
365  *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
366  }
367 }
368 #endif /* LV_HAVE_AVX2 */
369 
370 #endif /* INCLUDED_volk_16ic_magnitude_16i_u_H */
short complex lv_16sc_t
Definition: volk_complex.h:58
static float rintf(float x)
Definition: config.h:31
static void volk_16ic_magnitude_16i_a_sse(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:197
static void volk_16ic_magnitude_16i_a_sse3(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:129
static void volk_16ic_magnitude_16i_generic(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:270
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33