Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_index_max_32u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
64 #ifndef INCLUDED_volk_32f_index_max_32u_a_H
65 #define INCLUDED_volk_32f_index_max_32u_a_H
66 
67 #include <volk/volk_common.h>
68 #include <volk/volk_common.h>
69 #include <inttypes.h>
70 #include <stdio.h>
71 
72 #ifdef LV_HAVE_SSE4_1
73 #include<smmintrin.h>
74 
75 static inline void
76 volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
77 {
78  if(num_points > 0){
79  uint32_t number = 0;
80  const uint32_t quarterPoints = num_points / 4;
81 
82  float* inputPtr = (float*)src0;
83 
84  __m128 indexIncrementValues = _mm_set1_ps(4);
85  __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
86 
87  float max = src0[0];
88  float index = 0;
89  __m128 maxValues = _mm_set1_ps(max);
90  __m128 maxValuesIndex = _mm_setzero_ps();
91  __m128 compareResults;
92  __m128 currentValues;
93 
94  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
95  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
96 
97  for(;number < quarterPoints; number++){
98 
99  currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
100  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
101 
102  compareResults = _mm_cmpgt_ps(maxValues, currentValues);
103 
104  maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
105  maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
106  }
107 
108  // Calculate the largest value from the remaining 4 points
109  _mm_store_ps(maxValuesBuffer, maxValues);
110  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
111 
112  for(number = 0; number < 4; number++){
113  if(maxValuesBuffer[number] > max){
114  index = maxIndexesBuffer[number];
115  max = maxValuesBuffer[number];
116  }
117  }
118 
119  number = quarterPoints * 4;
120  for(;number < num_points; number++){
121  if(src0[number] > max){
122  index = number;
123  max = src0[number];
124  }
125  }
126  target[0] = (uint32_t)index;
127  }
128 }
129 
130 #endif /*LV_HAVE_SSE4_1*/
131 
132 
133 #ifdef LV_HAVE_SSE
134 
135 #include<xmmintrin.h>
136 
137 static inline void
138 volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
139 {
140  if(num_points > 0){
141  uint32_t number = 0;
142  const uint32_t quarterPoints = num_points / 4;
143 
144  float* inputPtr = (float*)src0;
145 
146  __m128 indexIncrementValues = _mm_set1_ps(4);
147  __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
148 
149  float max = src0[0];
150  float index = 0;
151  __m128 maxValues = _mm_set1_ps(max);
152  __m128 maxValuesIndex = _mm_setzero_ps();
153  __m128 compareResults;
154  __m128 currentValues;
155 
156  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
157  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
158 
159  for(;number < quarterPoints; number++){
160 
161  currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
162  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
163 
164  compareResults = _mm_cmpgt_ps(maxValues, currentValues);
165 
166  maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
167 
168  maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
169  }
170 
171  // Calculate the largest value from the remaining 4 points
172  _mm_store_ps(maxValuesBuffer, maxValues);
173  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
174 
175  for(number = 0; number < 4; number++){
176  if(maxValuesBuffer[number] > max){
177  index = maxIndexesBuffer[number];
178  max = maxValuesBuffer[number];
179  }
180  }
181 
182  number = quarterPoints * 4;
183  for(;number < num_points; number++){
184  if(src0[number] > max){
185  index = number;
186  max = src0[number];
187  }
188  }
189  target[0] = (uint32_t)index;
190  }
191 }
192 
193 #endif /*LV_HAVE_SSE*/
194 
195 
196 #ifdef LV_HAVE_AVX
197 #include <immintrin.h>
198 
199 static inline void volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
200 {
201  if(num_points > 0)
202  {
203  uint32_t number = 0;
204  const uint32_t quarterPoints = num_points / 8;
205 
206  float* inputPtr = (float*)src0;
207 
208  __m256 indexIncrementValues = _mm256_set1_ps(8);
209  __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
210 
211  float max = src0[0];
212  float index = 0;
213  __m256 maxValues = _mm256_set1_ps(max);
214  __m256 maxValuesIndex = _mm256_setzero_ps();
215  __m256 compareResults;
216  __m256 currentValues;
217 
218  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
219  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
220 
221  for(;number < quarterPoints; number++)
222  {
223  currentValues = _mm256_load_ps(inputPtr); inputPtr += 8;
224  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
225  compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e);
226  maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
227  maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
228  }
229 
230  // Calculate the largest value from the remaining 8 points
231  _mm256_store_ps(maxValuesBuffer, maxValues);
232  _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
233 
234  for(number = 0; number < 8; number++)
235  {
236  if(maxValuesBuffer[number] > max)
237  {
238  index = maxIndexesBuffer[number];
239  max = maxValuesBuffer[number];
240  }
241  }
242 
243  number = quarterPoints * 8;
244  for(;number < num_points; number++)
245  {
246  if(src0[number] > max)
247  {
248  index = number;
249  max = src0[number];
250  }
251  }
252  target[0] = (uint32_t)index;
253  }
254 }
255 
256 #endif /*LV_HAVE_AVX*/
257 
258 
259 #ifdef LV_HAVE_NEON
260 #include <arm_neon.h>
261 
/*
 * Finds the index of the largest value in src0 (ARM NEON).
 *
 * target     - receives the index of the maximum in target[0]; it is left
 *              untouched when num_points is 0.
 * src0       - input samples, read four at a time with vld1q_f32.
 * num_points - number of floats in src0.
 *
 * When the maximum occurs more than once, the lowest index is reported.
 * NaN samples never compare greater than the running maximum and are
 * therefore never selected.
 *
 * Indexes are tracked in unsigned 32-bit lanes, so no precision is lost to a
 * float round-trip.
 */
static inline void volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
{
    if (num_points > 0) {
        uint32_t number = 0;
        const uint32_t quarterPoints = num_points / 4;

        const float* inputPtr = src0;

        const uint32_t firstIndexes[4] = { 0, 1, 2, 3 };
        const uint32x4_t indexIncrementValues = vdupq_n_u32(4);
        /* Indexes of the four samples about to be loaded. */
        uint32x4_t currentIndexes = vld1q_u32(firstIndexes);

        float max = src0[0];
        uint32_t index = 0;
        float32x4_t maxValues = vdupq_n_f32(max);
        uint32x4_t maxValuesIndex = vmovq_n_u32(0);
        uint32x4_t takeCurrent;
        float32x4_t currentValues;

        __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
        __VOLK_ATTR_ALIGNED(16) uint32_t maxIndexesBuffer[4];

        for (; number < quarterPoints; number++) {
            currentValues = vld1q_f32(inputPtr);
            inputPtr += 4;

            /* Strictly greater: an equal value must not displace the earlier index. */
            takeCurrent = vcgtq_f32(currentValues, maxValues);

            /* Bitwise select: lanes where the mask is set take the new value. */
            maxValuesIndex = vbslq_u32(takeCurrent, currentIndexes, maxValuesIndex);
            maxValues = vbslq_f32(takeCurrent, currentValues, maxValues);

            currentIndexes = vaddq_u32(currentIndexes, indexIncrementValues);
        }

        /* Reduce the four per-lane maxima to a single maximum and index. */
        vst1q_f32(maxValuesBuffer, maxValues);
        vst1q_u32(maxIndexesBuffer, maxValuesIndex);

        for (number = 0; number < 4; number++) {
            if (maxValuesBuffer[number] > max) {
                max = maxValuesBuffer[number];
                index = maxIndexesBuffer[number];
            } else if (maxValuesBuffer[number] == max && maxIndexesBuffer[number] < index) {
                /* Same maximum seen in several lanes: keep the earliest one. */
                index = maxIndexesBuffer[number];
            }
        }

        /* Scalar pass over the (num_points % 4) samples the vector loop skipped. */
        for (number = quarterPoints * 4; number < num_points; number++) {
            if (src0[number] > max) {
                index = number;
                max = src0[number];
            }
        }
        target[0] = index;
    }
}
319 
320 #endif /*LV_HAVE_NEON*/
321 
322 
323 #ifdef LV_HAVE_GENERIC
324 
/*
 * Finds the index of the largest value in src0 (portable C reference).
 *
 * target     - receives the index of the maximum in target[0]; it is left
 *              untouched when num_points is 0.
 * src0       - input samples.
 * num_points - number of floats in src0.
 *
 * The comparison is a strict '>', so when the maximum occurs more than once
 * the lowest index is reported.
 */
static inline void
volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
{
    if (num_points == 0) {
        return; /* nothing to search; the caller's target stays as it was */
    }

    float max = src0[0];
    uint32_t index = 0;

    for (uint32_t i = 1; i < num_points; ++i) {
        if (src0[i] > max) {
            max = src0[i];
            index = i;
        }
    }
    target[0] = index;
}
343 
344 #endif /*LV_HAVE_GENERIC*/
345 
346 
347 #endif /*INCLUDED_volk_32f_index_max_32u_a_H*/
348 
349 
350 #ifndef INCLUDED_volk_32f_index_max_32u_u_H
351 #define INCLUDED_volk_32f_index_max_32u_u_H
352 
353 #include <volk/volk_common.h>
354 #include <volk/volk_common.h>
355 #include <inttypes.h>
356 #include <stdio.h>
357 
358 
359 #ifdef LV_HAVE_AVX
360 #include <immintrin.h>
361 
362 static inline void volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
363 {
364  if(num_points > 0)
365  {
366  uint32_t number = 0;
367  const uint32_t quarterPoints = num_points / 8;
368 
369  float* inputPtr = (float*)src0;
370 
371  __m256 indexIncrementValues = _mm256_set1_ps(8);
372  __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
373 
374  float max = src0[0];
375  float index = 0;
376  __m256 maxValues = _mm256_set1_ps(max);
377  __m256 maxValuesIndex = _mm256_setzero_ps();
378  __m256 compareResults;
379  __m256 currentValues;
380 
381  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
382  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
383 
384  for(;number < quarterPoints; number++)
385  {
386  currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8;
387  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
388  compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e);
389  maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
390  maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
391  }
392 
393  // Calculate the largest value from the remaining 8 points
394  _mm256_store_ps(maxValuesBuffer, maxValues);
395  _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
396 
397  for(number = 0; number < 8; number++)
398  {
399  if(maxValuesBuffer[number] > max)
400  {
401  index = maxIndexesBuffer[number];
402  max = maxValuesBuffer[number];
403  }
404  }
405 
406  number = quarterPoints * 8;
407  for(;number < num_points; number++)
408  {
409  if(src0[number] > max)
410  {
411  index = number;
412  max = src0[number];
413  }
414  }
415  target[0] = (uint32_t)index;
416  }
417 }
418 
419 #endif /*LV_HAVE_AVX*/
420 
421 
422 #ifdef LV_HAVE_SSE4_1
423 #include<smmintrin.h>
424 
425 static inline void volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
426 {
427  if(num_points > 0)
428  {
429  uint32_t number = 0;
430  const uint32_t quarterPoints = num_points / 4;
431 
432  float* inputPtr = (float*)src0;
433 
434  __m128 indexIncrementValues = _mm_set1_ps(4);
435  __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
436 
437  float max = src0[0];
438  float index = 0;
439  __m128 maxValues = _mm_set1_ps(max);
440  __m128 maxValuesIndex = _mm_setzero_ps();
441  __m128 compareResults;
442  __m128 currentValues;
443 
444  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
445  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
446 
447  for(;number < quarterPoints; number++)
448  {
449  currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
450  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
451  compareResults = _mm_cmpgt_ps(maxValues, currentValues);
452  maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
453  maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
454  }
455 
456  // Calculate the largest value from the remaining 4 points
457  _mm_store_ps(maxValuesBuffer, maxValues);
458  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
459 
460  for(number = 0; number < 4; number++)
461  {
462  if(maxValuesBuffer[number] > max)
463  {
464  index = maxIndexesBuffer[number];
465  max = maxValuesBuffer[number];
466  }
467  }
468 
469  number = quarterPoints * 4;
470  for(;number < num_points; number++)
471  {
472  if(src0[number] > max)
473  {
474  index = number;
475  max = src0[number];
476  }
477  }
478  target[0] = (uint32_t)index;
479  }
480 }
481 
482 #endif /*LV_HAVE_SSE4_1*/
483 
484 #ifdef LV_HAVE_SSE
485 #include<xmmintrin.h>
486 
487 static inline void volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
488 {
489  if(num_points > 0)
490  {
491  uint32_t number = 0;
492  const uint32_t quarterPoints = num_points / 4;
493 
494  float* inputPtr = (float*)src0;
495 
496  __m128 indexIncrementValues = _mm_set1_ps(4);
497  __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
498 
499  float max = src0[0];
500  float index = 0;
501  __m128 maxValues = _mm_set1_ps(max);
502  __m128 maxValuesIndex = _mm_setzero_ps();
503  __m128 compareResults;
504  __m128 currentValues;
505 
506  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
507  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
508 
509  for(;number < quarterPoints; number++)
510  {
511  currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
512  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
513  compareResults = _mm_cmpgt_ps(maxValues, currentValues);
514  maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
515  maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
516  }
517 
518  // Calculate the largest value from the remaining 4 points
519  _mm_store_ps(maxValuesBuffer, maxValues);
520  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
521 
522  for(number = 0; number < 4; number++)
523  {
524  if(maxValuesBuffer[number] > max)
525  {
526  index = maxIndexesBuffer[number];
527  max = maxValuesBuffer[number];
528  }
529  }
530 
531  number = quarterPoints * 4;
532  for(;number < num_points; number++)
533  {
534  if(src0[number] > max)
535  {
536  index = number;
537  max = src0[number];
538  }
539  }
540  target[0] = (uint32_t)index;
541  }
542 }
543 
544 #endif /*LV_HAVE_SSE*/
545 
546 #endif /*INCLUDED_volk_32f_index_max_32u_u_H*/
static void volk_32f_index_max_32u_a_sse(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:138
static void volk_32f_index_max_32u_neon(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:262
static void volk_32f_index_max_32u_a_avx(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:199
for i
Definition: volk_config_fixed.tmpl.h:25
static void volk_32f_index_max_32u_generic(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:326
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32f_index_max_32u_u_sse(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:487
static void volk_32f_index_max_32u_u_avx(uint32_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_32u.h:362