Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32fc_s32f_magnitude_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
73 #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
74 #define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
75 
76 #include <volk/volk_common.h>
77 #include <inttypes.h>
78 #include <stdio.h>
79 #include <math.h>
80 
81 #ifdef LV_HAVE_AVX2
82 #include <immintrin.h>
83 
84 static inline void
85 volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
86  const float scalar, unsigned int num_points)
87 {
88  unsigned int number = 0;
89  const unsigned int eighthPoints = num_points / 8;
90 
91  const float* complexVectorPtr = (const float*)complexVector;
92  int16_t* magnitudeVectorPtr = magnitudeVector;
93 
94  __m256 vScalar = _mm256_set1_ps(scalar);
95  __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
96  __m256 cplxValue1, cplxValue2, result;
97  __m256i resultInt;
98  __m128i resultShort;
99 
100  for(;number < eighthPoints; number++){
101  cplxValue1 = _mm256_load_ps(complexVectorPtr);
102  complexVectorPtr += 8;
103 
104  cplxValue2 = _mm256_load_ps(complexVectorPtr);
105  complexVectorPtr += 8;
106 
107  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
108  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
109 
110  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
111 
112  result = _mm256_sqrt_ps(result);
113 
114  result = _mm256_mul_ps(result, vScalar);
115 
116  resultInt = _mm256_cvtps_epi32(result);
117  resultInt = _mm256_packs_epi32(resultInt, resultInt);
118  resultInt = _mm256_permutevar8x32_epi32(resultInt, idx); //permute to compensate for shuffling in hadd and packs
119  resultShort = _mm256_extracti128_si256(resultInt,0);
120  _mm_store_si128((__m128i*)magnitudeVectorPtr,resultShort);
121  magnitudeVectorPtr += 8;
122  }
123 
124  number = eighthPoints * 8;
125  magnitudeVectorPtr = &magnitudeVector[number];
126  for(; number < num_points; number++){
127  float val1Real = *complexVectorPtr++;
128  float val1Imag = *complexVectorPtr++;
129  *magnitudeVectorPtr++ = (int16_t)rintf(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
130  }
131 }
132 #endif /* LV_HAVE_AVX2 */
133 
134 #ifdef LV_HAVE_SSE3
135 #include <pmmintrin.h>
136 
137 static inline void
138 volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
139  const float scalar, unsigned int num_points)
140 {
141  unsigned int number = 0;
142  const unsigned int quarterPoints = num_points / 4;
143 
144  const float* complexVectorPtr = (const float*)complexVector;
145  int16_t* magnitudeVectorPtr = magnitudeVector;
146 
147  __m128 vScalar = _mm_set_ps1(scalar);
148 
149  __m128 cplxValue1, cplxValue2, result;
150 
151  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
152 
153  for(;number < quarterPoints; number++){
154  cplxValue1 = _mm_load_ps(complexVectorPtr);
155  complexVectorPtr += 4;
156 
157  cplxValue2 = _mm_load_ps(complexVectorPtr);
158  complexVectorPtr += 4;
159 
160  cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
161  cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
162 
163  result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
164 
165  result = _mm_sqrt_ps(result);
166 
167  result = _mm_mul_ps(result, vScalar);
168 
169  _mm_store_ps(floatBuffer, result);
170  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
171  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
172  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
173  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
174  }
175 
176  number = quarterPoints * 4;
177  magnitudeVectorPtr = &magnitudeVector[number];
178  for(; number < num_points; number++){
179  float val1Real = *complexVectorPtr++;
180  float val1Imag = *complexVectorPtr++;
181  *magnitudeVectorPtr++ = (int16_t)rintf(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
182  }
183 }
184 #endif /* LV_HAVE_SSE3 */
185 
186 
187 #ifdef LV_HAVE_SSE
188 #include <xmmintrin.h>
189 
190 static inline void
191 volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
192  const float scalar, unsigned int num_points)
193 {
194  unsigned int number = 0;
195  const unsigned int quarterPoints = num_points / 4;
196 
197  const float* complexVectorPtr = (const float*)complexVector;
198  int16_t* magnitudeVectorPtr = magnitudeVector;
199 
200  __m128 vScalar = _mm_set_ps1(scalar);
201 
202  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
203 
204  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
205 
206  for(;number < quarterPoints; number++){
207  cplxValue1 = _mm_load_ps(complexVectorPtr);
208  complexVectorPtr += 4;
209 
210  cplxValue2 = _mm_load_ps(complexVectorPtr);
211  complexVectorPtr += 4;
212 
213  // Arrange in i1i2i3i4 format
214  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
215  // Arrange in q1q2q3q4 format
216  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
217 
218  iValue = _mm_mul_ps(iValue, iValue); // Square the I values
219  qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
220 
221  result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
222 
223  result = _mm_sqrt_ps(result);
224 
225  result = _mm_mul_ps(result, vScalar);
226 
227  _mm_store_ps(floatBuffer, result);
228  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
229  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
230  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
231  *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
232  }
233 
234  number = quarterPoints * 4;
235  magnitudeVectorPtr = &magnitudeVector[number];
236  for(; number < num_points; number++){
237  float val1Real = *complexVectorPtr++;
238  float val1Imag = *complexVectorPtr++;
239  *magnitudeVectorPtr++ = (int16_t)rintf(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
240  }
241 }
242 #endif /* LV_HAVE_SSE */
243 
244 #ifdef LV_HAVE_GENERIC
245 
246 static inline void
247 volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
248  const float scalar, unsigned int num_points)
249 {
250  const float* complexVectorPtr = (float*)complexVector;
251  int16_t* magnitudeVectorPtr = magnitudeVector;
252  unsigned int number = 0;
253  for(number = 0; number < num_points; number++){
254  const float real = *complexVectorPtr++;
255  const float imag = *complexVectorPtr++;
256  *magnitudeVectorPtr++ = (int16_t)rintf(sqrtf((real*real) + (imag*imag)) * scalar);
257  }
258 }
259 #endif /* LV_HAVE_GENERIC */
260 
261 
262 #endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a_H */
263 
264 #ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
265 #define INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
266 
267 #include <volk/volk_common.h>
268 #include <inttypes.h>
269 #include <stdio.h>
270 #include <math.h>
271 
272 #ifdef LV_HAVE_AVX2
273 #include <immintrin.h>
274 
275 static inline void
276 volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector, const lv_32fc_t* complexVector,
277  const float scalar, unsigned int num_points)
278 {
279  unsigned int number = 0;
280  const unsigned int eighthPoints = num_points / 8;
281 
282  const float* complexVectorPtr = (const float*)complexVector;
283  int16_t* magnitudeVectorPtr = magnitudeVector;
284 
285  __m256 vScalar = _mm256_set1_ps(scalar);
286  __m256i idx = _mm256_set_epi32(0,0,0,0,5,1,4,0);
287  __m256 cplxValue1, cplxValue2, result;
288  __m256i resultInt;
289  __m128i resultShort;
290 
291  for(;number < eighthPoints; number++){
292  cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
293  complexVectorPtr += 8;
294 
295  cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
296  complexVectorPtr += 8;
297 
298  cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
299  cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
300 
301  result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
302 
303  result = _mm256_sqrt_ps(result);
304 
305  result = _mm256_mul_ps(result, vScalar);
306 
307  resultInt = _mm256_cvtps_epi32(result);
308  resultInt = _mm256_packs_epi32(resultInt, resultInt);
309  resultInt = _mm256_permutevar8x32_epi32(resultInt, idx); //permute to compensate for shuffling in hadd and packs
310  resultShort = _mm256_extracti128_si256(resultInt,0);
311  _mm_storeu_si128((__m128i*)magnitudeVectorPtr,resultShort);
312  magnitudeVectorPtr += 8;
313  }
314 
315  number = eighthPoints * 8;
316  magnitudeVectorPtr = &magnitudeVector[number];
317  for(; number < num_points; number++){
318  float val1Real = *complexVectorPtr++;
319  float val1Imag = *complexVectorPtr++;
320  *magnitudeVectorPtr++ = (int16_t)rintf(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
321  }
322 }
323 #endif /* LV_HAVE_AVX2 */
324 
325 #endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_u_H */
static float rintf(float x)
Definition: config.h:31
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t *magnitudeVector, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_magnitude_16i.h:138
float complex lv_32fc_t
Definition: volk_complex.h:61
static void volk_32fc_s32f_magnitude_16i_generic(int16_t *magnitudeVector, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_magnitude_16i.h:247
static void volk_32fc_s32f_magnitude_16i_a_sse(int16_t *magnitudeVector, const lv_32fc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_32fc_s32f_magnitude_16i.h:191