volk_32fc_index_max_16u.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

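/*
 * volk_32fc_index_max_16u: writes to target[0] the index of the complex
 * float (lv_32fc_t) sample in src0 with the largest magnitude squared
 * (re*re + im*im). Only the first USHRT_MAX points are examined, so the
 * returned index always fits in a uint16_t.
 */
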
#ifndef INCLUDED_volk_32fc_index_max_16u_a_H
#define INCLUDED_volk_32fc_index_max_16u_a_H

#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
#include <limits.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

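/*
 * AVX2 kernel (aligned loads). Each main-loop iteration squares 8 complex
 * samples, horizontally adds real^2 + imag^2 into per-lane magnitudes,
 * keeps a running per-lane maximum in xmm3 and the matching sample indices
 * in xmm9 via compare masks, then reduces the 8 lanes at the end.
 */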
static inline void
volk_32fc_index_max_16u_a_avx2(uint16_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
  // Branchless version, if we think it'll make a difference
  //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));

  const uint32_t num_bytes = num_points*8;

  union bit256 holderf;
  union bit256 holderi;
  float sq_dist = 0.0;

  union bit256 xmm5, xmm4;
  __m256 xmm1, xmm2, xmm3;
  __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;

  xmm5.int_vec = xmmfive = _mm256_setzero_si256();
  xmm4.int_vec = xmmfour = _mm256_setzero_si256();
  holderf.int_vec = holder0 = _mm256_setzero_si256();
  holderi.int_vec = holder1 = _mm256_setzero_si256();

  int bound = num_bytes >> 6;
  int leftovers0 = (num_bytes >> 5) & 1;
  int leftovers1 = (num_bytes >> 4) & 1;
  int i = 0;

  xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  xmm9 = _mm256_setzero_si256(); //=xmm8
  xmm10 = _mm256_set1_epi32(8);
  xmm3 = _mm256_setzero_ps();

  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
  for(; i < bound; ++i) {
    xmm1 = _mm256_load_ps((float*)src0);
    xmm2 = _mm256_load_ps((float*)&src0[4]);

    src0 += 8;

    xmm1 = _mm256_mul_ps(xmm1, xmm1);
    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    xmm1 = _mm256_hadd_ps(xmm1, xmm2);
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, 1);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, 0);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  xmm10 = _mm256_set1_epi32(4);
  for(; i < leftovers0; ++i) {
    xmm1 = _mm256_load_ps((float*)src0);

    src0 += 4;

    xmm1 = _mm256_mul_ps(xmm1, xmm1);

    xmm1 = _mm256_hadd_ps(xmm1, xmm1);
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, 1);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, 0);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }

  idx = _mm256_set_epi32(1,0,1,0,1,0,1,0);
  xmm10 = _mm256_set1_epi32(2);
  for(i = 0; i < leftovers1; ++i) {
    xmm2 = _mm256_load_ps((float*)src0);

    xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
    xmm8 = bit256_p(&xmm1)->int_vec;

    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    src0 += 2;

    xmm1 = _mm256_hadd_ps(xmm2, xmm2);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3,1);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3,0);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }

  /*
  idx = _mm256_setzero_si256();
  for(i = 0; i < leftovers2; ++i) {
    //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);

    sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

    //xmm = _mm_load1_ps(&sq_dist);//insert?
    xmm2 = _mm256_set1_ps(sq_dist);
    //xmm2 = _mm256_insertf128_ps(xmm2, xmm, 0);

    xmm1 = xmm3;

    xmm3 = _mm256_max_ps(xmm3, xmm2);//only lowest 32bit value
    xmm3 = _mm256_permutevar8x32_ps(xmm3, idx);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, 1);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, 0);

    xmm8 = _mm256_permutevar8x32_epi32(xmm8, idx);

    xmm11 = _mm256_and_si256(xmm8, xmm4.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm5.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);
  }*/

  _mm256_store_ps((float*)&(holderf.f), xmm3);
  _mm256_store_si256(&(holderi.int_vec), xmm9);

  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
  target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
  sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
  target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
  sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
  target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
  sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
  target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
  sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;

}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>

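/*
 * SSE3 kernel (aligned loads). Same index-tracking scheme as the AVX2
 * kernel above, but over 4 magnitudes per iteration; the last two loops
 * mop up a possible pair of samples and a possible single sample.
 */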
static inline void
volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
  // Branchless version, if we think it'll make a difference
  //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));

  const uint32_t num_bytes = num_points*8;

  union bit128 holderf;
  union bit128 holderi;
  float sq_dist = 0.0;

  union bit128 xmm5, xmm4;
  __m128 xmm1, xmm2, xmm3;
  __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;

  xmm5.int_vec = xmmfive = _mm_setzero_si128();
  xmm4.int_vec = xmmfour = _mm_setzero_si128();
  holderf.int_vec = holder0 = _mm_setzero_si128();
  holderi.int_vec = holder1 = _mm_setzero_si128();

  int bound = num_bytes >> 5;
  int leftovers0 = (num_bytes >> 4) & 1;
  int leftovers1 = (num_bytes >> 3) & 1;
  int i = 0;

  xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
  xmm9 = _mm_setzero_si128();
  xmm10 = _mm_set_epi32(4, 4, 4, 4);
  xmm3 = _mm_setzero_ps();
  //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);

  for(; i < bound; ++i) {
    xmm1 = _mm_load_ps((float*)src0);
    xmm2 = _mm_load_ps((float*)&src0[2]);

    src0 += 4;

    xmm1 = _mm_mul_ps(xmm1, xmm1);
    xmm2 = _mm_mul_ps(xmm2, xmm2);

    xmm1 = _mm_hadd_ps(xmm1, xmm2);

    xmm3 = _mm_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);

    xmm8 = _mm_add_epi32(xmm8, xmm10);

    //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
    //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
  }


  for(i = 0; i < leftovers0; ++i) {
    xmm2 = _mm_load_ps((float*)src0);

    xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
    xmm8 = bit128_p(&xmm1)->int_vec;

    xmm2 = _mm_mul_ps(xmm2, xmm2);

    src0 += 2;

    xmm1 = _mm_hadd_ps(xmm2, xmm2);

    xmm3 = _mm_max_ps(xmm1, xmm3);

    xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);

    xmm8 = _mm_add_epi32(xmm8, xmm10);
    //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
  }

  for(i = 0; i < leftovers1; ++i) {
    //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);

    sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

    xmm2 = _mm_load1_ps(&sq_dist);

    xmm1 = xmm3;

    xmm3 = _mm_max_ss(xmm3, xmm2);

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

    xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);
  }

  //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
  //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);

  _mm_store_ps((float*)&(holderf.f), xmm3);
  _mm_store_si128(&(holderi.int_vec), xmm9);

  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;

  /*
  float placeholder = 0.0;
  uint32_t temp0, temp1;
  uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
  uint32_t l0 = g0 ^ 1;

  uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
  uint32_t l1 = g1 ^ 1;

  temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
  temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
  sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
  placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];

  g0 = (sq_dist > placeholder);
  l0 = g0 ^ 1;
  target[0] = g0 * temp0 + l0 * temp1;
  */
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_GENERIC
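/* Plain C fallback: scan all points, tracking the largest magnitude squared. */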
static inline void
volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0,
                                uint32_t num_points)
{
  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

  const uint32_t num_bytes = num_points*8;

  float sq_dist = 0.0;
  float max = 0.0;
  uint16_t index = 0;

  uint32_t i = 0;

  for(; i < num_bytes >> 3; ++i) {
    sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);

    index = sq_dist > max ? i : index;
    max = sq_dist > max ? sq_dist : max;
  }
  target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/


#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/

#ifndef INCLUDED_volk_32fc_index_max_16u_u_H
#define INCLUDED_volk_32fc_index_max_16u_u_H

#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
#include <limits.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

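/* Unaligned-load variant of the AVX2 kernel defined in the aligned section above. */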
static inline void
volk_32fc_index_max_16u_u_avx2(uint16_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
  // Branchless version, if we think it'll make a difference
  //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));

  const uint32_t num_bytes = num_points*8;

  union bit256 holderf;
  union bit256 holderi;
  float sq_dist = 0.0;

  union bit256 xmm5, xmm4;
  __m256 xmm1, xmm2, xmm3;
  __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;

  xmm5.int_vec = xmmfive = _mm256_setzero_si256();
  xmm4.int_vec = xmmfour = _mm256_setzero_si256();
  holderf.int_vec = holder0 = _mm256_setzero_si256();
  holderi.int_vec = holder1 = _mm256_setzero_si256();

  int bound = num_bytes >> 6;
  int leftovers0 = (num_bytes >> 5) & 1;
  int leftovers1 = (num_bytes >> 4) & 1;
  int i = 0;

  xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  xmm9 = _mm256_setzero_si256(); //=xmm8
  xmm10 = _mm256_set1_epi32(8);
  xmm3 = _mm256_setzero_ps();

  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
  for(; i < bound; ++i) {
    xmm1 = _mm256_loadu_ps((float*)src0);
    xmm2 = _mm256_loadu_ps((float*)&src0[4]);

    src0 += 8;

    xmm1 = _mm256_mul_ps(xmm1, xmm1);
    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    xmm1 = _mm256_hadd_ps(xmm1, xmm2);
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, 1);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, 0);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  xmm10 = _mm256_set1_epi32(4);
  for(; i < leftovers0; ++i) {
    xmm1 = _mm256_loadu_ps((float*)src0);

    src0 += 4;

    xmm1 = _mm256_mul_ps(xmm1, xmm1);

    xmm1 = _mm256_hadd_ps(xmm1, xmm1);
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, 1);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, 0);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }

  idx = _mm256_set_epi32(1,0,1,0,1,0,1,0);
  xmm10 = _mm256_set1_epi32(2);
  for(i = 0; i < leftovers1; ++i) {
    xmm2 = _mm256_loadu_ps((float*)src0);

    xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
    xmm8 = bit256_p(&xmm1)->int_vec;

    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    src0 += 2;

    xmm1 = _mm256_hadd_ps(xmm2, xmm2);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3,1);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3,0);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }

  _mm256_storeu_ps((float*)&(holderf.f), xmm3);
  _mm256_storeu_si256(&(holderi.int_vec), xmm9);

  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
  target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
  sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
  target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
  sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
  target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
  sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
  target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
  sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;

}

#endif /*LV_HAVE_AVX2*/

#endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/
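
A minimal usage sketch, not part of this header: it assumes the generated volk_32fc_index_max_16u dispatcher and VOLK's standard volk_get_alignment/volk_malloc/volk_free helpers and lv_cmake macro, and is illustrative only.

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    const uint32_t N = 1024;
    size_t alignment = volk_get_alignment();

    /* Aligned buffers let the dispatcher pick the aligned (_a_) kernels. */
    lv_32fc_t* in = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
    uint16_t* max_index = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);

    for (uint32_t n = 0; n < N; ++n) {
        in[n] = lv_cmake((float)n, (float)-n); /* magnitude grows with n */
    }

    volk_32fc_index_max_16u(max_index, in, N); /* dispatches to the fastest kernel */
    printf("strongest sample at index %u\n", (unsigned)*max_index); /* expect 1023 */

    volk_free(in);
    volk_free(max_index);
    return 0;
}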