Vector Optimized Library of Kernels  2.5.0
Architecture-tuned implementations of math kernels
volk_32f_index_min_32u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2021 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * VOLK is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * VOLK is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
65 #ifndef INCLUDED_volk_32f_index_min_32u_a_H
66 #define INCLUDED_volk_32f_index_min_32u_a_H
67 
68 #include <inttypes.h>
69 #include <stdio.h>
70 #include <volk/volk_common.h>
71 
72 #ifdef LV_HAVE_SSE4_1
73 #include <smmintrin.h>
74 
75 static inline void volk_32f_index_min_32u_a_sse4_1(uint32_t* target,
76  const float* source,
77  uint32_t num_points)
78 {
79  const uint32_t quarterPoints = num_points / 4;
80 
81  float* inputPtr = (float*)source;
82 
83  __m128 indexIncrementValues = _mm_set1_ps(4);
84  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
85 
86  float min = source[0];
87  float index = 0;
88  __m128 minValues = _mm_set1_ps(min);
89  __m128 minValuesIndex = _mm_setzero_ps();
90  __m128 compareResults;
91  __m128 currentValues;
92 
93  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
94  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
95 
96  for (uint32_t number = 0; number < quarterPoints; number++) {
97 
98  currentValues = _mm_load_ps(inputPtr);
99  inputPtr += 4;
100  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
101 
102  compareResults = _mm_cmplt_ps(currentValues, minValues);
103 
104  minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
105  minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
106  }
107 
108  // Calculate the smallest value from the remaining 4 points
109  _mm_store_ps(minValuesBuffer, minValues);
110  _mm_store_ps(minIndexesBuffer, minValuesIndex);
111 
112  for (uint32_t number = 0; number < 4; number++) {
113  if (minValuesBuffer[number] < min) {
114  index = minIndexesBuffer[number];
115  min = minValuesBuffer[number];
116  } else if (minValuesBuffer[number] == min) {
117  if (index > minIndexesBuffer[number])
118  index = minIndexesBuffer[number];
119  }
120  }
121 
122  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
123  if (source[number] < min) {
124  index = number;
125  min = source[number];
126  }
127  }
128  target[0] = (uint32_t)index;
129 }
130 
131 #endif /*LV_HAVE_SSE4_1*/
132 
133 
134 #ifdef LV_HAVE_SSE
135 
136 #include <xmmintrin.h>
137 
138 static inline void
139 volk_32f_index_min_32u_a_sse(uint32_t* target, const float* source, uint32_t num_points)
140 {
141  const uint32_t quarterPoints = num_points / 4;
142 
143  float* inputPtr = (float*)source;
144 
145  __m128 indexIncrementValues = _mm_set1_ps(4);
146  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
147 
148  float min = source[0];
149  float index = 0;
150  __m128 minValues = _mm_set1_ps(min);
151  __m128 minValuesIndex = _mm_setzero_ps();
152  __m128 compareResults;
153  __m128 currentValues;
154 
155  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
156  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
157 
158  for (uint32_t number = 0; number < quarterPoints; number++) {
159 
160  currentValues = _mm_load_ps(inputPtr);
161  inputPtr += 4;
162  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
163 
164  compareResults = _mm_cmplt_ps(currentValues, minValues);
165 
166  minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
167  _mm_andnot_ps(compareResults, minValuesIndex));
168 
169  minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
170  _mm_andnot_ps(compareResults, minValues));
171  }
172 
173  // Calculate the smallest value from the remaining 4 points
174  _mm_store_ps(minValuesBuffer, minValues);
175  _mm_store_ps(minIndexesBuffer, minValuesIndex);
176 
177  for (uint32_t number = 0; number < 4; number++) {
178  if (minValuesBuffer[number] < min) {
179  index = minIndexesBuffer[number];
180  min = minValuesBuffer[number];
181  } else if (minValuesBuffer[number] == min) {
182  if (index > minIndexesBuffer[number])
183  index = minIndexesBuffer[number];
184  }
185  }
186 
187  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
188  if (source[number] < min) {
189  index = number;
190  min = source[number];
191  }
192  }
193  target[0] = (uint32_t)index;
194 }
195 
196 #endif /*LV_HAVE_SSE*/
197 
198 
199 #ifdef LV_HAVE_AVX
200 #include <immintrin.h>
201 
202 static inline void
203 volk_32f_index_min_32u_a_avx(uint32_t* target, const float* source, uint32_t num_points)
204 {
205  const uint32_t quarterPoints = num_points / 8;
206 
207  float* inputPtr = (float*)source;
208 
209  __m256 indexIncrementValues = _mm256_set1_ps(8);
210  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
211 
212  float min = source[0];
213  float index = 0;
214  __m256 minValues = _mm256_set1_ps(min);
215  __m256 minValuesIndex = _mm256_setzero_ps();
216  __m256 compareResults;
217  __m256 currentValues;
218 
219  __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
220  __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
221 
222  for (uint32_t number = 0; number < quarterPoints; number++) {
223  currentValues = _mm256_load_ps(inputPtr);
224  inputPtr += 8;
225  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
226  compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
227  minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
228  minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
229  }
230 
231  // Calculate the smallest value from the remaining 8 points
232  _mm256_store_ps(minValuesBuffer, minValues);
233  _mm256_store_ps(minIndexesBuffer, minValuesIndex);
234 
235  for (uint32_t number = 0; number < 8; number++) {
236  if (minValuesBuffer[number] < min) {
237  index = minIndexesBuffer[number];
238  min = minValuesBuffer[number];
239  } else if (minValuesBuffer[number] == min) {
240  if (index > minIndexesBuffer[number])
241  index = minIndexesBuffer[number];
242  }
243  }
244 
245  for (uint32_t number = quarterPoints * 8; number < num_points; number++) {
246  if (source[number] < min) {
247  index = number;
248  min = source[number];
249  }
250  }
251  target[0] = (uint32_t)index;
252 }
253 
254 #endif /*LV_HAVE_AVX*/
255 
256 
257 #ifdef LV_HAVE_NEON
258 #include <arm_neon.h>
259 
260 static inline void
261 volk_32f_index_min_32u_neon(uint32_t* target, const float* source, uint32_t num_points)
262 {
263  const uint32_t quarterPoints = num_points / 4;
264 
265  float* inputPtr = (float*)source;
266  float32x4_t indexIncrementValues = vdupq_n_f32(4);
268  float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
269  float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
270 
271  float min = source[0];
272  float index = 0;
273  float32x4_t minValues = vdupq_n_f32(min);
274  uint32x4_t minValuesIndex = vmovq_n_u32(0);
275  uint32x4_t compareResults;
276  uint32x4_t currentIndexes_u;
277  float32x4_t currentValues;
278 
279  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
280  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
281 
282  for (uint32_t number = 0; number < quarterPoints; number++) {
283  currentValues = vld1q_f32(inputPtr);
284  inputPtr += 4;
285  currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
286  currentIndexes_u = vcvtq_u32_f32(currentIndexes);
287  compareResults = vcgeq_f32(currentValues, minValues);
288  minValuesIndex = vorrq_u32(vandq_u32(compareResults, minValuesIndex),
289  vbicq_u32(currentIndexes_u, compareResults));
290  minValues = vminq_f32(currentValues, minValues);
291  }
292 
293  // Calculate the smallest value from the remaining 4 points
294  vst1q_f32(minValuesBuffer, minValues);
295  vst1q_f32(minIndexesBuffer, vcvtq_f32_u32(minValuesIndex));
296  for (uint32_t number = 0; number < 4; number++) {
297  if (minValuesBuffer[number] < min) {
298  index = minIndexesBuffer[number];
299  min = minValuesBuffer[number];
300  } else if (minValues[number] == min) {
301  if (index > minIndexesBuffer[number])
302  index = minIndexesBuffer[number];
303  }
304  }
305 
306  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
307  if (source[number] < min) {
308  index = number;
309  min = source[number];
310  }
311  }
312  target[0] = (uint32_t)index;
313 }
314 
315 #endif /*LV_HAVE_NEON*/
316 
317 
318 #ifdef LV_HAVE_GENERIC
319 
/*!
 * \brief Finds the index of the smallest float in a buffer (portable C).
 *
 * Scans the input once using a strict less-than comparison, so when the
 * minimum value occurs more than once the lowest index is reported.
 *
 * \param target     Output; target[0] receives the index of the minimum.
 * \param source     Input floats.
 * \param num_points Number of floats in source. For zero points target[0] is
 *                   set to 0 and source is never dereferenced.
 */
static inline void
volk_32f_index_min_32u_generic(uint32_t* target, const float* source, uint32_t num_points)
{
    /* Nothing to scan: do not touch source[0]. */
    if (num_points == 0) {
        target[0] = 0;
        return;
    }

    float min = source[0];
    uint32_t index = 0;

    for (uint32_t i = 1; i < num_points; ++i) {
        if (source[i] < min) {
            index = i;
            min = source[i];
        }
    }
    target[0] = index;
}
334 
335 #endif /*LV_HAVE_GENERIC*/
336 
337 
338 #endif /*INCLUDED_volk_32f_index_min_32u_a_H*/
339 
340 
341 #ifndef INCLUDED_volk_32f_index_min_32u_u_H
342 #define INCLUDED_volk_32f_index_min_32u_u_H
343 
344 #include <inttypes.h>
345 #include <stdio.h>
346 #include <volk/volk_common.h>
347 
348 
349 #ifdef LV_HAVE_AVX
350 #include <immintrin.h>
351 
352 static inline void
353 volk_32f_index_min_32u_u_avx(uint32_t* target, const float* source, uint32_t num_points)
354 {
355  const uint32_t quarterPoints = num_points / 8;
356 
357  float* inputPtr = (float*)source;
358 
359  __m256 indexIncrementValues = _mm256_set1_ps(8);
360  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
361 
362  float min = source[0];
363  float index = 0;
364  __m256 minValues = _mm256_set1_ps(min);
365  __m256 minValuesIndex = _mm256_setzero_ps();
366  __m256 compareResults;
367  __m256 currentValues;
368 
369  __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
370  __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
371 
372  for (uint32_t number = 0; number < quarterPoints; number++) {
373  currentValues = _mm256_loadu_ps(inputPtr);
374  inputPtr += 8;
375  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
376  compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
377  minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
378  minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
379  }
380 
381  // Calculate the smalles value from the remaining 8 points
382  _mm256_store_ps(minValuesBuffer, minValues);
383  _mm256_store_ps(minIndexesBuffer, minValuesIndex);
384 
385  for (uint32_t number = 0; number < 8; number++) {
386  if (minValuesBuffer[number] < min) {
387  index = minIndexesBuffer[number];
388  min = minValuesBuffer[number];
389  } else if (minValuesBuffer[number] == min) {
390  if (index > minIndexesBuffer[number])
391  index = minIndexesBuffer[number];
392  }
393  }
394 
395  for (uint32_t number = quarterPoints * 8; number < num_points; number++) {
396  if (source[number] < min) {
397  index = number;
398  min = source[number];
399  }
400  }
401  target[0] = (uint32_t)index;
402 }
403 
404 #endif /*LV_HAVE_AVX*/
405 
406 
407 #ifdef LV_HAVE_SSE4_1
408 #include <smmintrin.h>
409 
410 static inline void volk_32f_index_min_32u_u_sse4_1(uint32_t* target,
411  const float* source,
412  uint32_t num_points)
413 {
414  const uint32_t quarterPoints = num_points / 4;
415 
416  float* inputPtr = (float*)source;
417 
418  __m128 indexIncrementValues = _mm_set1_ps(4);
419  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
420 
421  float min = source[0];
422  float index = 0;
423  __m128 minValues = _mm_set1_ps(min);
424  __m128 minValuesIndex = _mm_setzero_ps();
425  __m128 compareResults;
426  __m128 currentValues;
427 
428  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
429  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
430 
431  for (uint32_t number = 0; number < quarterPoints; number++) {
432  currentValues = _mm_loadu_ps(inputPtr);
433  inputPtr += 4;
434  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
435  compareResults = _mm_cmplt_ps(currentValues, minValues);
436  minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
437  minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
438  }
439 
440  // Calculate the smallest value from the remaining 4 points
441  _mm_store_ps(minValuesBuffer, minValues);
442  _mm_store_ps(minIndexesBuffer, minValuesIndex);
443 
444  for (uint32_t number = 0; number < 4; number++) {
445  if (minValuesBuffer[number] < min) {
446  index = minIndexesBuffer[number];
447  min = minValuesBuffer[number];
448  } else if (minValuesBuffer[number] == min) {
449  if (index > minIndexesBuffer[number])
450  index = minIndexesBuffer[number];
451  }
452  }
453 
454  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
455  if (source[number] < min) {
456  index = number;
457  min = source[number];
458  }
459  }
460  target[0] = (uint32_t)index;
461 }
462 
463 #endif /*LV_HAVE_SSE4_1*/
464 
465 #ifdef LV_HAVE_SSE
466 #include <xmmintrin.h>
467 
468 static inline void
469 volk_32f_index_min_32u_u_sse(uint32_t* target, const float* source, uint32_t num_points)
470 {
471  const uint32_t quarterPoints = num_points / 4;
472 
473  float* inputPtr = (float*)source;
474 
475  __m128 indexIncrementValues = _mm_set1_ps(4);
476  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
477 
478  float min = source[0];
479  float index = 0;
480  __m128 minValues = _mm_set1_ps(min);
481  __m128 minValuesIndex = _mm_setzero_ps();
482  __m128 compareResults;
483  __m128 currentValues;
484 
485  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
486  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
487 
488  for (uint32_t number = 0; number < quarterPoints; number++) {
489  currentValues = _mm_loadu_ps(inputPtr);
490  inputPtr += 4;
491  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
492  compareResults = _mm_cmplt_ps(currentValues, minValues);
493  minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
494  _mm_andnot_ps(compareResults, minValuesIndex));
495  minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
496  _mm_andnot_ps(compareResults, minValues));
497  }
498 
499  // Calculate the smallest value from the remaining 4 points
500  _mm_store_ps(minValuesBuffer, minValues);
501  _mm_store_ps(minIndexesBuffer, minValuesIndex);
502 
503  for (uint32_t number = 0; number < 4; number++) {
504  if (minValuesBuffer[number] < min) {
505  index = minIndexesBuffer[number];
506  min = minValuesBuffer[number];
507  } else if (minValuesBuffer[number] == min) {
508  if (index > minIndexesBuffer[number])
509  index = minIndexesBuffer[number];
510  }
511  }
512 
513  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
514  if (source[number] < min) {
515  index = number;
516  min = source[number];
517  }
518  }
519  target[0] = (uint32_t)index;
520 }
521 
522 #endif /*LV_HAVE_SSE*/
523 
524 #endif /*INCLUDED_volk_32f_index_min_32u_u_H*/
static void volk_32f_index_min_32u_neon(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:261
static void volk_32f_index_min_32u_a_sse(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:139
static void volk_32f_index_min_32u_u_sse(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:469
static void volk_32f_index_min_32u_generic(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:321
static void volk_32f_index_min_32u_a_avx(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:203
static void volk_32f_index_min_32u_u_avx(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:353
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
for i
Definition: volk_config_fixed.tmpl.h:25