Vector Optimized Library of Kernels  2.5.1
Architecture-tuned implementations of math kernels
volk_32f_index_min_16u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2021 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * VOLK is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * VOLK is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef INCLUDED_volk_32f_index_min_16u_a_H
72 #define INCLUDED_volk_32f_index_min_16u_a_H
73 
74 #include <inttypes.h>
75 #include <limits.h>
76 #include <stdio.h>
77 #include <volk/volk_common.h>
78 
79 #ifdef LV_HAVE_AVX
80 #include <immintrin.h>
81 
82 static inline void
83 volk_32f_index_min_16u_a_avx(uint16_t* target, const float* source, uint32_t num_points)
84 {
85  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
86  const uint32_t eighthPoints = num_points / 8;
87 
88  float* inputPtr = (float*)source;
89 
90  __m256 indexIncrementValues = _mm256_set1_ps(8);
91  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
92 
93  float min = source[0];
94  float index = 0;
95  __m256 minValues = _mm256_set1_ps(min);
96  __m256 minValuesIndex = _mm256_setzero_ps();
97  __m256 compareResults;
98  __m256 currentValues;
99 
100  __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
101  __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
102 
103  for (uint32_t number = 0; number < eighthPoints; number++) {
104 
105  currentValues = _mm256_load_ps(inputPtr);
106  inputPtr += 8;
107  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
108 
109  compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
110 
111  minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
112  minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
113  }
114 
115  // Calculate the smallest value from the remaining 4 points
116  _mm256_store_ps(minValuesBuffer, minValues);
117  _mm256_store_ps(minIndexesBuffer, minValuesIndex);
118 
119  for (uint32_t number = 0; number < 8; number++) {
120  if (minValuesBuffer[number] < min) {
121  index = minIndexesBuffer[number];
122  min = minValuesBuffer[number];
123  } else if (minValuesBuffer[number] == min) {
124  if (index > minIndexesBuffer[number])
125  index = minIndexesBuffer[number];
126  }
127  }
128 
129  for (uint32_t number = eighthPoints * 8; number < num_points; number++) {
130  if (source[number] < min) {
131  index = number;
132  min = source[number];
133  }
134  }
135  target[0] = (uint16_t)index;
136 }
137 
138 #endif /*LV_HAVE_AVX*/
139 
140 #ifdef LV_HAVE_SSE4_1
141 #include <smmintrin.h>
142 
143 static inline void volk_32f_index_min_16u_a_sse4_1(uint16_t* target,
144  const float* source,
145  uint32_t num_points)
146 {
147  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
148  const uint32_t quarterPoints = num_points / 4;
149 
150  float* inputPtr = (float*)source;
151 
152  __m128 indexIncrementValues = _mm_set1_ps(4);
153  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
154 
155  float min = source[0];
156  float index = 0;
157  __m128 minValues = _mm_set1_ps(min);
158  __m128 minValuesIndex = _mm_setzero_ps();
159  __m128 compareResults;
160  __m128 currentValues;
161 
162  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
163  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
164 
165  for (uint32_t number = 0; number < quarterPoints; number++) {
166 
167  currentValues = _mm_load_ps(inputPtr);
168  inputPtr += 4;
169  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
170 
171  compareResults = _mm_cmplt_ps(currentValues, minValues);
172 
173  minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
174  minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
175  }
176 
177  // Calculate the smallest value from the remaining 4 points
178  _mm_store_ps(minValuesBuffer, minValues);
179  _mm_store_ps(minIndexesBuffer, minValuesIndex);
180 
181  for (uint32_t number = 0; number < 4; number++) {
182  if (minValuesBuffer[number] < min) {
183  index = minIndexesBuffer[number];
184  min = minValuesBuffer[number];
185  } else if (minValuesBuffer[number] == min) {
186  if (index > minIndexesBuffer[number])
187  index = minIndexesBuffer[number];
188  }
189  }
190 
191  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
192  if (source[number] < min) {
193  index = number;
194  min = source[number];
195  }
196  }
197  target[0] = (uint16_t)index;
198 }
199 
200 #endif /*LV_HAVE_SSE4_1*/
201 
202 
203 #ifdef LV_HAVE_SSE
204 
205 #include <xmmintrin.h>
206 
207 static inline void
208 volk_32f_index_min_16u_a_sse(uint16_t* target, const float* source, uint32_t num_points)
209 {
210  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
211  const uint32_t quarterPoints = num_points / 4;
212 
213  float* inputPtr = (float*)source;
214 
215  __m128 indexIncrementValues = _mm_set1_ps(4);
216  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
217 
218  float min = source[0];
219  float index = 0;
220  __m128 minValues = _mm_set1_ps(min);
221  __m128 minValuesIndex = _mm_setzero_ps();
222  __m128 compareResults;
223  __m128 currentValues;
224 
225  __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
226  __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
227 
228  for (uint32_t number = 0; number < quarterPoints; number++) {
229 
230  currentValues = _mm_load_ps(inputPtr);
231  inputPtr += 4;
232  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
233 
234  compareResults = _mm_cmplt_ps(currentValues, minValues);
235 
236  minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
237  _mm_andnot_ps(compareResults, minValuesIndex));
238  minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
239  _mm_andnot_ps(compareResults, minValues));
240  }
241 
242  // Calculate the smallest value from the remaining 4 points
243  _mm_store_ps(minValuesBuffer, minValues);
244  _mm_store_ps(minIndexesBuffer, minValuesIndex);
245 
246  for (uint32_t number = 0; number < 4; number++) {
247  if (minValuesBuffer[number] < min) {
248  index = minIndexesBuffer[number];
249  min = minValuesBuffer[number];
250  } else if (minValuesBuffer[number] == min) {
251  if (index > minIndexesBuffer[number])
252  index = minIndexesBuffer[number];
253  }
254  }
255 
256  for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
257  if (source[number] < min) {
258  index = number;
259  min = source[number];
260  }
261  }
262  target[0] = (uint16_t)index;
263 }
264 
265 #endif /*LV_HAVE_SSE*/
266 
267 
268 #ifdef LV_HAVE_GENERIC
269 
/*!
 * \brief Finds the index of the smallest float in \p source (portable C).
 *
 * \param target     Output: target[0] receives the index of the minimum value.
 * \param source     Input buffer of floats.
 * \param num_points Number of floats in \p source. Values above USHRT_MAX are
 *                   clamped, so only the first USHRT_MAX points are searched
 *                   and the result always fits in a uint16_t.
 *
 * When several points share the minimum value, the lowest index is reported
 * (the comparison is a strict less-than). For an empty input
 * (num_points == 0) \p source is not read and index 0 is written to target[0].
 */
static inline void
volk_32f_index_min_16u_generic(uint16_t* target, const float* source, uint32_t num_points)
{
    // Nothing to search: do not touch source[0], which may not exist.
    if (num_points == 0) {
        target[0] = 0;
        return;
    }

    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    float min = source[0];
    uint16_t index = 0;

    for (uint32_t i = 1; i < num_points; ++i) {
        if (source[i] < min) {
            // i < USHRT_MAX after clamping, so the narrowing is lossless.
            index = (uint16_t)i;
            min = source[i];
        }
    }
    target[0] = index;
}
286 
287 #endif /*LV_HAVE_GENERIC*/
288 
289 
290 #endif /*INCLUDED_volk_32f_index_min_16u_a_H*/
291 
292 
293 #ifndef INCLUDED_volk_32f_index_min_16u_u_H
294 #define INCLUDED_volk_32f_index_min_16u_u_H
295 
296 #include <inttypes.h>
297 #include <limits.h>
298 #include <stdio.h>
299 #include <volk/volk_common.h>
300 
301 #ifdef LV_HAVE_AVX
302 #include <immintrin.h>
303 
304 static inline void
305 volk_32f_index_min_16u_u_avx(uint16_t* target, const float* source, uint32_t num_points)
306 {
307  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
308  const uint32_t eighthPoints = num_points / 8;
309 
310  float* inputPtr = (float*)source;
311 
312  __m256 indexIncrementValues = _mm256_set1_ps(8);
313  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
314 
315  float min = source[0];
316  float index = 0;
317  __m256 minValues = _mm256_set1_ps(min);
318  __m256 minValuesIndex = _mm256_setzero_ps();
319  __m256 compareResults;
320  __m256 currentValues;
321 
322  __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
323  __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
324 
325  for (uint32_t number = 0; number < eighthPoints; number++) {
326 
327  currentValues = _mm256_loadu_ps(inputPtr);
328  inputPtr += 8;
329  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
330 
331  compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
332 
333  minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
334  minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
335  }
336 
337  // Calculate the smallest value from the remaining 4 points
338  _mm256_storeu_ps(minValuesBuffer, minValues);
339  _mm256_storeu_ps(minIndexesBuffer, minValuesIndex);
340 
341  for (uint32_t number = 0; number < 8; number++) {
342  if (minValuesBuffer[number] < min) {
343  index = minIndexesBuffer[number];
344  min = minValuesBuffer[number];
345  } else if (minValuesBuffer[number] == min) {
346  if (index > minIndexesBuffer[number])
347  index = minIndexesBuffer[number];
348  }
349  }
350 
351  for (uint32_t number = eighthPoints * 8; number < num_points; number++) {
352  if (source[number] < min) {
353  index = number;
354  min = source[number];
355  }
356  }
357  target[0] = (uint16_t)index;
358 }
359 
360 #endif /*LV_HAVE_AVX*/
361 
362 #endif /*INCLUDED_volk_32f_index_min_16u_u_H*/
static void volk_32f_index_min_16u_a_avx(uint16_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_16u.h:83
static void volk_32f_index_min_16u_generic(uint16_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_16u.h:271
static void volk_32f_index_min_16u_a_sse(uint16_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_16u.h:208
static void volk_32f_index_min_16u_u_avx(uint16_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_16u.h:305
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
for i
Definition: volk_config_fixed.tmpl.h:25