Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_index_max_16u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef INCLUDED_volk_32f_index_max_16u_a_H
72 #define INCLUDED_volk_32f_index_max_16u_a_H
73 
74 #include <volk/volk_common.h>
75 #include <volk/volk_common.h>
76 #include <inttypes.h>
77 #include <limits.h>
78 #include <stdio.h>
79 
80 #ifdef LV_HAVE_AVX
81 #include <immintrin.h>
82 
83 static inline void
84 volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0,
85  uint32_t num_points)
86 {
87  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
88 
89  uint32_t number = 0;
90  const uint32_t eighthPoints = num_points / 8;
91 
92  float* inputPtr = (float*)src0;
93 
94  __m256 indexIncrementValues = _mm256_set1_ps(8);
95  __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
96 
97  float max = src0[0];
98  float index = 0;
99  __m256 maxValues = _mm256_set1_ps(max);
100  __m256 maxValuesIndex = _mm256_setzero_ps();
101  __m256 compareResults;
102  __m256 currentValues;
103 
104  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
105  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
106 
107  for(;number < eighthPoints; number++){
108 
109  currentValues = _mm256_load_ps(inputPtr); inputPtr += 8;
110  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
111 
112  compareResults = _mm256_cmp_ps(maxValues, currentValues,14);
113 
114  maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
115  maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
116  }
117 
118  // Calculate the largest value from the remaining 4 points
119  _mm256_store_ps(maxValuesBuffer, maxValues);
120  _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
121 
122  for(number = 0; number < 8; number++){
123  if(maxValuesBuffer[number] > max){
124  index = maxIndexesBuffer[number];
125  max = maxValuesBuffer[number];
126  }
127  }
128 
129  number = eighthPoints * 8;
130  for(;number < num_points; number++){
131  if(src0[number] > max){
132  index = number;
133  max = src0[number];
134  }
135  }
136  target[0] = (uint16_t)index;
137 }
138 
139 #endif /*LV_HAVE_AVX*/
140 
141 #ifdef LV_HAVE_SSE4_1
142 #include <smmintrin.h>
143 
144 static inline void
145 volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0,
146  uint32_t num_points)
147 {
148  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
149 
150  uint32_t number = 0;
151  const uint32_t quarterPoints = num_points / 4;
152 
153  float* inputPtr = (float*)src0;
154 
155  __m128 indexIncrementValues = _mm_set1_ps(4);
156  __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
157 
158  float max = src0[0];
159  float index = 0;
160  __m128 maxValues = _mm_set1_ps(max);
161  __m128 maxValuesIndex = _mm_setzero_ps();
162  __m128 compareResults;
163  __m128 currentValues;
164 
165  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
166  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
167 
168  for(;number < quarterPoints; number++){
169 
170  currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
171  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
172 
173  compareResults = _mm_cmpgt_ps(maxValues, currentValues);
174 
175  maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
176  maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
177  }
178 
179  // Calculate the largest value from the remaining 4 points
180  _mm_store_ps(maxValuesBuffer, maxValues);
181  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
182 
183  for(number = 0; number < 4; number++){
184  if(maxValuesBuffer[number] > max){
185  index = maxIndexesBuffer[number];
186  max = maxValuesBuffer[number];
187  }
188  }
189 
190  number = quarterPoints * 4;
191  for(;number < num_points; number++){
192  if(src0[number] > max){
193  index = number;
194  max = src0[number];
195  }
196  }
197  target[0] = (uint16_t)index;
198 }
199 
200 #endif /*LV_HAVE_SSE4_1*/
201 
202 
203 #ifdef LV_HAVE_SSE
204 
205 #include <xmmintrin.h>
206 
207 static inline void
208 volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0,
209  uint32_t num_points)
210 {
211  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
212 
213  uint32_t number = 0;
214  const uint32_t quarterPoints = num_points / 4;
215 
216  float* inputPtr = (float*)src0;
217 
218  __m128 indexIncrementValues = _mm_set1_ps(4);
219  __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
220 
221  float max = src0[0];
222  float index = 0;
223  __m128 maxValues = _mm_set1_ps(max);
224  __m128 maxValuesIndex = _mm_setzero_ps();
225  __m128 compareResults;
226  __m128 currentValues;
227 
228  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
229  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
230 
231  for(;number < quarterPoints; number++){
232 
233  currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
234  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
235 
236  compareResults = _mm_cmpgt_ps(maxValues, currentValues);
237 
238  maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
239 
240  maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
241  }
242 
243  // Calculate the largest value from the remaining 4 points
244  _mm_store_ps(maxValuesBuffer, maxValues);
245  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
246 
247  for(number = 0; number < 4; number++){
248  if(maxValuesBuffer[number] > max){
249  index = maxIndexesBuffer[number];
250  max = maxValuesBuffer[number];
251  }
252  }
253 
254  number = quarterPoints * 4;
255  for(;number < num_points; number++){
256  if(src0[number] > max){
257  index = number;
258  max = src0[number];
259  }
260  }
261  target[0] = (uint16_t)index;
262 }
263 
264 #endif /*LV_HAVE_SSE*/
265 
266 
267 #ifdef LV_HAVE_GENERIC
268 
/*
 * Writes to target[0] the index of the largest value among the first
 * num_points floats of src0 (portable scalar kernel).
 *
 * target     - receives the index of the maximum, as a uint16_t.
 * src0       - input samples.
 * num_points - number of samples; clamped to USHRT_MAX so that every index
 *              fits in the uint16_t result.
 *
 * The comparison is a strict '>', so when the maximum occurs several times
 * the lowest index is reported. With num_points == 0 the result is index 0
 * and src0 is not read.
 */
static inline void
volk_32f_index_max_16u_generic(uint16_t* target, const float* src0,
                               uint32_t num_points)
{
  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

  if(num_points == 0){
    /* Nothing to scan: report index 0 without dereferencing src0. */
    target[0] = 0;
    return;
  }

  float max = src0[0];
  uint16_t index = 0;

  uint32_t i = 1;

  for(; i < num_points; ++i) {
    if(src0[i] > max) {
      /* i < USHRT_MAX here because of the clamp above. */
      index = (uint16_t)i;
      max = src0[i];
    }
  }
  target[0] = index;
}
288 
289 #endif /*LV_HAVE_GENERIC*/
290 
291 
292 #endif /*INCLUDED_volk_32f_index_max_16u_a_H*/
293 
294 
295 
296 #ifndef INCLUDED_volk_32f_index_max_16u_u_H
297 #define INCLUDED_volk_32f_index_max_16u_u_H
298 
299 #include <volk/volk_common.h>
300 #include <volk/volk_common.h>
301 #include <inttypes.h>
302 #include <limits.h>
303 #include <stdio.h>
304 
305 #ifdef LV_HAVE_AVX
306 #include <immintrin.h>
307 
308 static inline void
309 volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0,
310  uint32_t num_points)
311 {
312  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
313 
314  uint32_t number = 0;
315  const uint32_t eighthPoints = num_points / 8;
316 
317  float* inputPtr = (float*)src0;
318 
319  __m256 indexIncrementValues = _mm256_set1_ps(8);
320  __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
321 
322  float max = src0[0];
323  float index = 0;
324  __m256 maxValues = _mm256_set1_ps(max);
325  __m256 maxValuesIndex = _mm256_setzero_ps();
326  __m256 compareResults;
327  __m256 currentValues;
328 
329  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
330  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
331 
332  for(;number < eighthPoints; number++){
333 
334  currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8;
335  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
336 
337  compareResults = _mm256_cmp_ps(maxValues, currentValues,14);
338 
339  maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
340  maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
341  }
342 
343  // Calculate the largest value from the remaining 4 points
344  _mm256_storeu_ps(maxValuesBuffer, maxValues);
345  _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
346 
347  for(number = 0; number < 8; number++){
348  if(maxValuesBuffer[number] > max){
349  index = maxIndexesBuffer[number];
350  max = maxValuesBuffer[number];
351  }
352  }
353 
354  number = eighthPoints * 8;
355  for(;number < num_points; number++){
356  if(src0[number] > max){
357  index = number;
358  max = src0[number];
359  }
360  }
361  target[0] = (uint16_t)index;
362 }
363 
364 #endif /*LV_HAVE_AVX*/
365 
366 #endif /*INCLUDED_volk_32f_index_max_16u_u_H*/
static void volk_32f_index_max_16u_generic(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:270
static void volk_32f_index_max_16u_u_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:309
static void volk_32f_index_max_16u_a_sse(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:208
for i
Definition: volk_config_fixed.tmpl.h:25
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32f_index_max_16u_a_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:84