#ifndef INCLUDED_volk_32fc_index_max_32u_a_H
#define INCLUDED_volk_32fc_index_max_32u_a_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>
#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32fc_index_max_32u_a_avx2(uint32_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  const uint32_t num_bytes = num_points * 8;
  union bit256 holderf;
  union bit256 holderi;
  float sq_dist = 0.0;

  union bit256 xmm5, xmm4;
  __m256 xmm1, xmm2, xmm3;
  __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;

  xmm5.int_vec = xmmfive = _mm256_setzero_si256();
  xmm4.int_vec = xmmfour = _mm256_setzero_si256();
  holderf.int_vec = holder0 = _mm256_setzero_si256();
  holderi.int_vec = holder1 = _mm256_setzero_si256();
  int bound = num_bytes >> 6;
  int leftovers1 = (num_bytes >> 4) & 1;
  int i = 0;
  xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  xmm9 = _mm256_setzero_si256();
  xmm10 = _mm256_set1_epi32(8);
  xmm3 = _mm256_setzero_ps();
  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
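
  /*
   * Main loop: each iteration consumes eight complex samples. The squared
   * magnitude |z|^2 = re^2 + im^2 is formed with a multiply and a horizontal
   * add (the permute undoes the hadd lane interleaving), and the per-lane
   * running maxima live in xmm3. xmm8 carries the candidate indices for the
   * current iteration and xmm9 the indices of the winners so far; the compare
   * masks below select, lane by lane, either the fresh index (where a new
   * maximum was set) or the previously stored one.
   */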
  for(; i < bound; ++i) {
    xmm1 = _mm256_load_ps((float*)src0);
    xmm2 = _mm256_load_ps((float*)&src0[4]);

    src0 += 8;

    xmm1 = _mm256_mul_ps(xmm1, xmm1);
    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    xmm1 = _mm256_hadd_ps(xmm1, xmm2);
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
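
  /*
   * Tail handling reuses the same compare/select bookkeeping on the samples
   * that did not fill a complete eight-sample block, first in groups of four
   * and then in pairs (note the index stride dropping from 4 to 2).
   */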
  xmm10 = _mm256_set1_epi32(4);
  for(i = 0; i < leftovers1; ++i) {
    xmm1 = _mm256_load_ps((float*)src0);

    xmm1 = _mm256_mul_ps(xmm1, xmm1);

    src0 += 4;

    xmm1 = _mm256_hadd_ps(xmm1, xmm1);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
  xmm10 = _mm256_set1_epi32(2);
  for(i = 0; i < leftovers1; ++i) {
    xmm2 = _mm256_load_ps((float*)src0);

    xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
    xmm8 = bit256_p(&xmm1)->int_vec;

    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    src0 += 2;

    xmm1 = _mm256_hadd_ps(xmm2, xmm2);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
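
  /* Reduce the eight per-lane maxima and their indices to a single winner. */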
  _mm256_store_ps((float*)&(holderf.f), xmm3);
  _mm256_store_si256(&(holderi.int_vec), xmm9);

  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
  target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
  sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
  target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
  sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
  target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
  sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
  target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
  sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
}

#endif /*LV_HAVE_AVX2*/
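
/*
 * The SSE3 variant applies the same scheme at half width: four squared
 * magnitudes per iteration with 128-bit index bookkeeping.
 */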
#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>

static inline void
volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  const uint32_t num_bytes = num_points * 8;

  union bit128 holderf;
  union bit128 holderi;
  float sq_dist = 0.0;

  union bit128 xmm5, xmm4;
  __m128 xmm1, xmm2, xmm3;
  __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
  xmm5.int_vec = xmmfive = _mm_setzero_si128();
  xmm4.int_vec = xmmfour = _mm_setzero_si128();
  holderf.int_vec = holder0 = _mm_setzero_si128();
  holderi.int_vec = holder1 = _mm_setzero_si128();
  int bound = num_bytes >> 5;
  int leftovers0 = (num_bytes >> 4) & 1;
  int leftovers1 = (num_bytes >> 3) & 1;
  int i = 0;
  xmm8 = _mm_set_epi32(3, 2, 1, 0);
  xmm9 = _mm_setzero_si128();
  xmm10 = _mm_set_epi32(4, 4, 4, 4);
  xmm3 = _mm_setzero_ps();
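
  /* Each iteration consumes four complex samples; index tracking follows the
     same mask-and-select scheme as the AVX2 loop above. */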
  for(; i < bound; ++i) {
    xmm1 = _mm_load_ps((float*)src0);
    xmm2 = _mm_load_ps((float*)&src0[2]);

    src0 += 4;

    xmm1 = _mm_mul_ps(xmm1, xmm1);
    xmm2 = _mm_mul_ps(xmm2, xmm2);

    xmm1 = _mm_hadd_ps(xmm1, xmm2);

    xmm3 = _mm_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);

    xmm8 = _mm_add_epi32(xmm8, xmm10);
  }
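
  /* Two remaining samples: duplicate the two low candidate indices into the
     high lanes (movelh through the bit128 union) so the four-lane
     compare/select machinery still applies. */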
  for(i = 0; i < leftovers0; ++i) {
    xmm2 = _mm_load_ps((float*)src0);

    xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
    xmm8 = bit128_p(&xmm1)->int_vec;

    xmm2 = _mm_mul_ps(xmm2, xmm2);

    src0 += 2;

    xmm1 = _mm_hadd_ps(xmm2, xmm2);

    xmm3 = _mm_max_ps(xmm1, xmm3);

    xmm10 = _mm_set_epi32(2, 2, 2, 2);

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);

    xmm8 = _mm_add_epi32(xmm8, xmm10);
  }
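
  /* One remaining sample: compare its squared magnitude against the current
     best in lane 0 only (_mm_max_ss), with the candidate index broadcast
     across xmm8. */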
  for(i = 0; i < leftovers1; ++i) {
    sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

    xmm2 = _mm_load1_ps(&sq_dist);

    xmm1 = xmm3;

    xmm3 = _mm_max_ss(xmm3, xmm2);

    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

    xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

    xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
    xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

    xmm9 = _mm_add_epi32(xmm11, xmm12);
  }
  _mm_store_ps((float*)&(holderf.f), xmm3);
  _mm_store_si128(&(holderi.int_vec), xmm9);

  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_GENERIC
static inline void
volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0,
                                uint32_t num_points)
{
  const uint32_t num_bytes = num_points * 8;

  float sq_dist = 0.0;
  float max = 0.0;
  uint32_t index = 0;

  uint32_t i = 0;

  for(; i < num_bytes >> 3; ++i) {
    sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);

    index = sq_dist > max ? i : index;
    max = sq_dist > max ? sq_dist : max;
  }
  target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/
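
/*
 * A minimal usage sketch (illustrative buffer names; in application code one
 * would normally call the generated volk_32fc_index_max_32u() dispatcher
 * rather than a specific protokernel):
 *
 *   uint32_t max_index = 0;
 *   lv_32fc_t* samples =
 *       (lv_32fc_t*)volk_malloc(num_points * sizeof(lv_32fc_t),
 *                               volk_get_alignment());
 *   // ... fill samples ...
 *   volk_32fc_index_max_32u_generic(&max_index, samples, num_points);
 *   volk_free(samples);
 */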
#ifndef INCLUDED_volk_32fc_index_max_32u_u_H
#define INCLUDED_volk_32fc_index_max_32u_u_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>
#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32fc_index_max_32u_u_avx2(uint32_t* target, lv_32fc_t* src0,
                               uint32_t num_points)
{
  const uint32_t num_bytes = num_points * 8;
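
  /* Identical to volk_32fc_index_max_32u_a_avx2 above, except that loads and
     stores are unaligned (_mm256_loadu_ps / _mm256_storeu_ps). */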
  union bit256 holderf;
  union bit256 holderi;
  float sq_dist = 0.0;

  union bit256 xmm5, xmm4;
  __m256 xmm1, xmm2, xmm3;
  __m256i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;

  xmm5.int_vec = xmmfive = _mm256_setzero_si256();
  xmm4.int_vec = xmmfour = _mm256_setzero_si256();
  holderf.int_vec = holder0 = _mm256_setzero_si256();
  holderi.int_vec = holder1 = _mm256_setzero_si256();
  int bound = num_bytes >> 6;
  int leftovers1 = (num_bytes >> 4) & 1;
  int i = 0;
  xmm8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  xmm9 = _mm256_setzero_si256();
  xmm10 = _mm256_set1_epi32(8);
  xmm3 = _mm256_setzero_ps();
  __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
  for(; i < bound; ++i) {
    xmm1 = _mm256_loadu_ps((float*)src0);
    xmm2 = _mm256_loadu_ps((float*)&src0[4]);

    src0 += 8;

    xmm1 = _mm256_mul_ps(xmm1, xmm1);
    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    xmm1 = _mm256_hadd_ps(xmm1, xmm2);
    xmm1 = _mm256_permutevar8x32_ps(xmm1, idx);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  xmm10 = _mm256_set1_epi32(4);
  for(i = 0; i < leftovers1; ++i) {
    xmm1 = _mm256_loadu_ps((float*)src0);

    xmm1 = _mm256_mul_ps(xmm1, xmm1);

    src0 += 4;

    xmm1 = _mm256_hadd_ps(xmm1, xmm1);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  idx = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
  xmm10 = _mm256_set1_epi32(2);
  for(i = 0; i < leftovers1; ++i) {
    xmm2 = _mm256_loadu_ps((float*)src0);

    xmm1 = _mm256_permutevar8x32_ps(bit256_p(&xmm8)->float_vec, idx);
    xmm8 = bit256_p(&xmm1)->int_vec;

    xmm2 = _mm256_mul_ps(xmm2, xmm2);

    src0 += 2;

    xmm1 = _mm256_hadd_ps(xmm2, xmm2);

    xmm3 = _mm256_max_ps(xmm1, xmm3);

    xmm4.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_LT_OS);
    xmm5.float_vec = _mm256_cmp_ps(xmm1, xmm3, _CMP_EQ_OQ);

    xmm11 = _mm256_and_si256(xmm8, xmm5.int_vec);
    xmm12 = _mm256_and_si256(xmm9, xmm4.int_vec);

    xmm9 = _mm256_add_epi32(xmm11, xmm12);

    xmm8 = _mm256_add_epi32(xmm8, xmm10);
  }
  _mm256_storeu_ps((float*)&(holderf.f), xmm3);
  _mm256_storeu_si256(&(holderi.int_vec), xmm9);

  target[0] = holderi.i[0];
  sq_dist = holderf.f[0];
  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
  target[0] = (holderf.f[4] > sq_dist) ? holderi.i[4] : target[0];
  sq_dist = (holderf.f[4] > sq_dist) ? holderf.f[4] : sq_dist;
  target[0] = (holderf.f[5] > sq_dist) ? holderi.i[5] : target[0];
  sq_dist = (holderf.f[5] > sq_dist) ? holderf.f[5] : sq_dist;
  target[0] = (holderf.f[6] > sq_dist) ? holderi.i[6] : target[0];
  sq_dist = (holderf.f[6] > sq_dist) ? holderf.f[6] : sq_dist;
  target[0] = (holderf.f[7] > sq_dist) ? holderi.i[7] : target[0];
  sq_dist = (holderf.f[7] > sq_dist) ? holderf.f[7] : sq_dist;
}

#endif /*LV_HAVE_AVX2*/

#endif /*INCLUDED_volk_32fc_index_max_32u_u_H*/