57 #ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H 58 #define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H 61 unsigned char t[64/8];
62 unsigned int w[64/32];
63 unsigned short s[64/16];
64 unsigned char c[64/8];
/*
 * Subtract the smallest of the 64 path metrics from every metric so the
 * unsigned 8-bit accumulators never saturate.  Only fires once metrics have
 * grown past `threshold` (the caller passes RENORMALIZE_THRESHOLD = 210).
 *
 * NOTE(review): reconstructed from an extraction-garbled fragment; the
 * visible lines were `min=X[0]`, the min-scan loop, and the subtract loop
 * (original lines 78/80/83), with the signature confirmed by the Doxygen
 * index (h:73).  The `X[0] > threshold` guard is inferred from the unused
 * `threshold` parameter and the matching `>210` guards in the SIMD paths --
 * TODO confirm against the upstream header.
 *
 * X: in/out array of 64 unsigned-char path metrics.
 * threshold: renormalize only when X[0] exceeds this value.
 */
static inline void renormalize(unsigned char* X, unsigned char threshold)
{
    int NUMSTATES = 64;
    int i;

    if (X[0] > threshold) {
        unsigned char min = X[0];
        for (i = 0; i < NUMSTATES; i++)
            if (min > X[i])
                min = X[i];
        for (i = 0; i < NUMSTATES; i++)
            X[i] -= min;
    }
}
91 BFLY(
int i,
int s,
unsigned char * syms,
unsigned char *Y,
92 unsigned char *X,
decision_t * d,
unsigned char* Branchtab)
94 int j, decision0, decision1;
95 unsigned char metric,m0,m1,m2,m3;
100 int PRECISIONSHIFT = 2;
104 metric += (Branchtab[i+j*NUMSTATES/2] ^ syms[s*RATE+j])>>METRICSHIFT;
105 metric=metric>>PRECISIONSHIFT;
107 unsigned char max = ((RATE*((256 -1)>>METRICSHIFT))>>PRECISIONSHIFT);
110 m1 = X[i+NUMSTATES/2] + (max - metric);
111 m2 = X[
i] + (max - metric);
112 m3 = X[i+NUMSTATES/2] + metric;
114 decision0 = (
signed int)(m0-m1) > 0;
115 decision1 = (
signed int)(m2-m3) > 0;
117 Y[2*
i] = decision0 ? m1 : m0;
118 Y[2*i+1] = decision1 ? m3 : m2;
120 d->
w[i/(
sizeof(
unsigned int)*8/2)+s*(
sizeof(
decision_t)/
sizeof(
unsigned int))] |=
121 (decision0|decision1<<1) << ((2*
i)&(
sizeof(
unsigned int)*8-1));
/*
 * volk_8u_x4_conv_k7_r2_8u_avx2: AVX2 path of the K=7 rate-1/2 Viterbi
 * add-compare-select kernel, processing two decoded bits per loop iteration
 * (X -> Y, then Y -> X) over 64 state metrics held in two 256-bit registers.
 *
 * NOTE(review): recovered from a Doxygen source listing.  The original file's
 * line numbers (127, 131, ...) are fused into the code text and many interior
 * lines are missing (symbol loads into a75/a78/a81/a84, the stores to dec, Y
 * and X, and the loop/function closers), so this block does not compile as
 * shown.  Comments only are added; every code token is byte-identical.
 */
127 #include <immintrin.h> 131 volk_8u_x4_conv_k7_r2_8u_avx2(
unsigned char* Y,
unsigned char* X,
132 unsigned char* syms,
unsigned char* dec,
133 unsigned int framebits,
unsigned int excess,
134 unsigned char* Branchtab)
/* Main loop: one iteration consumes one pair of input bits (two symbols per
   bit at rate 1/2). */
137 for(i9 = 0; i9 < ((framebits + excess)>>1); i9++) {
138 unsigned char a75, a81;
141 unsigned char *a80, *b6;
142 int *a110, *a91, *a93;
143 __m256i *a112, *a71, *a72, *a77, *a83, *a95;
145 __m256i a76, a78, a79, a82, a84, a85, a88, a89
146 , a90, d10, d9, m23, m24, m25
148 , s23, s24, s25, t13, t14, t15;
149 a71 = ((__m256i *) X);
153 s22 = _mm256_permute2x128_si256(s18,s19,0x20);
154 s19 = _mm256_permute2x128_si256(s18,s19,0x31);
/* Branch metrics: XOR the (splatted) received symbols against the branch
   table, average the two distances, shift right by 2 and mask to 0..63;
   t15 = 63 - t14 is the complementary metric. */
159 a76 = _mm256_set1_epi8(a75);
160 a77 = ((__m256i *) Branchtab);
162 a79 = _mm256_xor_si256(a76, a78);
165 a82 = _mm256_set1_epi8(a81);
168 a85 = _mm256_xor_si256(a82, a84);
169 t13 = _mm256_avg_epu8(a79,a85);
170 a86 = ((__m256i ) t13);
171 a87 = _mm256_srli_epi16(a86, 2);
172 a88 = ((__m256i ) a87);
173 t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
174 t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
/* Add-compare-select: saturating adds of path + branch metrics; min keeps
   the survivor of each pair, cmpeq marks which candidate won. */
175 m23 = _mm256_adds_epu8(s18, t14);
176 m24 = _mm256_adds_epu8(s19, t15);
177 m25 = _mm256_adds_epu8(s18, t15);
178 m26 = _mm256_adds_epu8(s19, t14);
179 a89 = _mm256_min_epu8(m24, m23);
180 d9 = _mm256_cmpeq_epi8(a89, m24);
181 a90 = _mm256_min_epu8(m26, m25);
182 d10 = _mm256_cmpeq_epi8(a90, m26);
/* Interleave the survivor masks and compress them to 32-bit decision words
   (s20/s21) for the traceback buffer dec. */
183 s22 = _mm256_unpacklo_epi8(d9,d10);
184 s23 = _mm256_unpackhi_epi8(d9,d10);
185 s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
190 s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
/* Interleave the surviving metrics into trellis order for the Y buffer. */
193 s22 = _mm256_unpacklo_epi8(a89, a90);
194 s23 = _mm256_unpackhi_epi8(a89, a90);
195 a95 = ((__m256i *) Y);
196 s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
198 s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
/* Renormalize Y when metrics exceed 210: horizontal min over all 64 bytes,
   splat it across a register, and subtract it from every state metric. */
201 if ((((
unsigned char *) Y)[0]>210)) {
203 m5 = ((__m256i *) Y)[0];
204 m5 = _mm256_min_epu8(m5, ((__m256i *) Y)[1]);
206 m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
207 m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 32)), ((__m256i ) m7)));
208 m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 16)), ((__m256i ) m7)));
209 m7 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m7, 8)), ((__m256i ) m7)));
210 m7 = _mm256_unpacklo_epi8(m7, m7);
211 m7 = _mm256_shufflelo_epi16(m7, 0);
212 m6 = _mm256_unpacklo_epi64(m7, m7);
213 m6 = _mm256_permute2x128_si256(m6, m6, 0);
214 ((__m256i *) Y)[0] = _mm256_subs_epu8(((__m256i *) Y)[0], m6);
215 ((__m256i *) Y)[1] = _mm256_subs_epu8(((__m256i *) Y)[1], m6);
/* Second butterfly of the unrolled pair: identical structure, reading the
   just-written metrics from Y and producing the next metrics back into X. */
217 unsigned char a188, a194;
220 unsigned char *a187, *a193;
221 int *a204, *a206, *a223, *b16;
222 __m256i *a184, *a185, *a190, *a196, *a208, *a225;
224 __m256i a189, a191, a192, a195, a197, a198, a201
225 , a202, a203, d17, d18, m39, m40, m41
227 , s51, t25, t26, t27;
228 a184 = ((__m256i *) Y);
232 s50 = _mm256_permute2x128_si256(s46,s47,0x20);
233 s47 = _mm256_permute2x128_si256(s46,s47,0x31);
237 a189 = _mm256_set1_epi8(a188);
238 a190 = ((__m256i *) Branchtab);
240 a192 = _mm256_xor_si256(a189, a191);
243 a195 = _mm256_set1_epi8(a194);
246 a198 = _mm256_xor_si256(a195, a197);
247 t25 = _mm256_avg_epu8(a192,a198);
248 a199 = ((__m256i ) t25);
249 a200 = _mm256_srli_epi16(a199, 2);
250 a201 = ((__m256i ) a200);
251 t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
252 t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
253 m39 = _mm256_adds_epu8(s46, t26);
254 m40 = _mm256_adds_epu8(s47, t27);
255 m41 = _mm256_adds_epu8(s46, t27);
256 m42 = _mm256_adds_epu8(s47, t26);
257 a202 = _mm256_min_epu8(m40, m39);
258 d17 = _mm256_cmpeq_epi8(a202, m40);
259 a203 = _mm256_min_epu8(m42, m41);
260 d18 = _mm256_cmpeq_epi8(a203, m42);
261 s24 = _mm256_unpacklo_epi8(d17,d18);
262 s25 = _mm256_unpackhi_epi8(d17,d18);
263 s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
264 a204 = ((
int *) dec);
269 s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
272 s50 = _mm256_unpacklo_epi8(a202, a203);
273 s51 = _mm256_unpackhi_epi8(a202, a203);
274 s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
275 s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
276 a208 = ((__m256i *) X);
/* Renormalize X, mirroring the Y renormalization above. */
281 if ((((
unsigned char *) X)[0]>210)) {
283 m12 = ((__m256i *) X)[0];
284 m12 = _mm256_min_epu8(m12, ((__m256i *) X)[1]);
286 m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
287 m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 32)), ((__m256i ) m14)));
288 m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 16)), ((__m256i ) m14)));
289 m14 = ((__m256i ) _mm256_min_epu8(((__m256i ) _mm256_srli_epi64(m14, 8)), ((__m256i ) m14)));
290 m14 = _mm256_unpacklo_epi8(m14, m14);
291 m14 = _mm256_shufflelo_epi16(m14, 0);
292 m13 = _mm256_unpacklo_epi64(m14, m14);
293 m13 = _mm256_permute2x128_si256(m13, m13, 0);
294 ((__m256i *) X)[0] = _mm256_subs_epu8(((__m256i *) X)[0], m13);
295 ((__m256i *) X)[1] = _mm256_subs_epu8(((__m256i *) X)[1], m13);
/* Odd tail: when framebits+excess is odd, the final bit is processed with
   the scalar BFLY butterfly instead of the vector path. */
302 for(j=0; j < (framebits + excess) % 2; ++j) {
305 BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (
decision_t *)dec, Branchtab);
/*
 * volk_8u_x4_conv_k7_r2_8u_spiral (per the Doxygen index, defined at original
 * line 326): SSE path of the same K=7 rate-1/2 Viterbi add-compare-select
 * kernel, machine-generated (Spiral-style) code working on four 128-bit
 * registers per 64-entry metric buffer, two decoded bits per loop iteration.
 *
 * NOTE(review): recovered from a Doxygen source listing -- original line
 * numbers are fused into the text and many interior lines are missing
 * (symbol/metric loads, the stores to dec/Y/X, closing parentheses of the
 * _mm_set_epi8 calls, loop closers), so this does not compile as shown.
 * Comments only are added; all code tokens are byte-identical.
 */
319 #include <pmmintrin.h> 320 #include <emmintrin.h> 321 #include <xmmintrin.h> 322 #include <mmintrin.h> 327 unsigned char* syms,
unsigned char* dec,
328 unsigned int framebits,
unsigned int excess,
329 unsigned char* Branchtab)
/* Main loop: each iteration handles two decoded bits (X -> Y, then Y -> X). */
332 for(i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
333 unsigned char a75, a81;
335 short int s20, s21, s26, s27;
336 unsigned char *a74, *a80, *b6;
337 short int *a110, *a111, *a91, *a93, *a94;
338 __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83
339 , *a95, *a96, *a97, *a98, *a99;
340 __m128i a105, a106, a86, a87;
341 __m128i a100, a101, a103, a104, a107, a108, a109
342 , a76, a78, a79, a82, a84, a85, a88, a89
343 , a90, d10, d11, d12, d9, m23, m24, m25
344 , m26, m27, m28, m29, m30, s18, s19, s22
345 , s23, s24, s25, s28, s29, t13, t14, t15
/* Branch metrics: XOR the (splatted) received symbols against the branch
   table, average, shift right by 2 and mask to 0..63; t15 = 63 - t14. */
347 a71 = ((__m128i *) X);
354 a76 = _mm_set1_epi8(a75);
355 a77 = ((__m128i *) Branchtab);
357 a79 = _mm_xor_si128(a76, a78);
361 a82 = _mm_set1_epi8(a81);
364 a85 = _mm_xor_si128(a82, a84);
365 t13 = _mm_avg_epu8(a79,a85);
366 a86 = ((__m128i ) t13);
367 a87 = _mm_srli_epi16(a86, 2);
368 a88 = ((__m128i ) a87);
369 t14 = _mm_and_si128(a88, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
370 , 63, 63, 63, 63, 63, 63, 63, 63
372 t15 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
373 , 63, 63, 63, 63, 63, 63, 63, 63
/* Add-compare-select; d9/d10 mark which candidate survived each pair. */
375 m23 = _mm_adds_epu8(s18, t14);
376 m24 = _mm_adds_epu8(s19, t15);
377 m25 = _mm_adds_epu8(s18, t15);
378 m26 = _mm_adds_epu8(s19, t14);
379 a89 = _mm_min_epu8(m24, m23);
380 d9 = _mm_cmpeq_epi8(a89, m24);
381 a90 = _mm_min_epu8(m26, m25);
382 d10 = _mm_cmpeq_epi8(a90, m26);
/* Compress decision masks to 16-bit words destined for dec; interleave the
   surviving metrics into trellis order for Y. */
383 s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9,d10));
384 a91 = ((
short int *) dec);
388 s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9,d10));
391 s22 = _mm_unpacklo_epi8(a89, a90);
392 s23 = _mm_unpackhi_epi8(a89, a90);
393 a95 = ((__m128i *) Y);
/* Same branch-metric + ACS sequence for the second half of the 64 states. */
403 a101 = _mm_xor_si128(a76, a100);
406 a104 = _mm_xor_si128(a82, a103);
407 t16 = _mm_avg_epu8(a101,a104);
408 a105 = ((__m128i ) t16);
409 a106 = _mm_srli_epi16(a105, 2);
410 a107 = ((__m128i ) a106);
411 t17 = _mm_and_si128(a107, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
412 , 63, 63, 63, 63, 63, 63, 63, 63
414 t18 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
415 , 63, 63, 63, 63, 63, 63, 63, 63
417 m27 = _mm_adds_epu8(s24, t17);
418 m28 = _mm_adds_epu8(s25, t18);
419 m29 = _mm_adds_epu8(s24, t18);
420 m30 = _mm_adds_epu8(s25, t17);
421 a108 = _mm_min_epu8(m28, m27);
422 d11 = _mm_cmpeq_epi8(a108, m28);
423 a109 = _mm_min_epu8(m30, m29);
424 d12 = _mm_cmpeq_epi8(a109, m30);
425 s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11,d12));
428 s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11,d12));
431 s28 = _mm_unpacklo_epi8(a108, a109);
432 s29 = _mm_unpackhi_epi8(a108, a109);
/* Renormalize Y above 210: horizontal min of all 64 state bytes (4 x 128),
   splat it, subtract from every metric to prevent unsigned saturation. */
437 if ((((
unsigned char *) Y)[0]>210)) {
439 m5 = ((__m128i *) Y)[0];
440 m5 = _mm_min_epu8(m5, ((__m128i *) Y)[1]);
441 m5 = _mm_min_epu8(m5, ((__m128i *) Y)[2]);
442 m5 = _mm_min_epu8(m5, ((__m128i *) Y)[3]);
444 m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
445 m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 32)), ((__m128i ) m7)));
446 m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 16)), ((__m128i ) m7)));
447 m7 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m7, 8)), ((__m128i ) m7)));
448 m7 = _mm_unpacklo_epi8(m7, m7);
449 m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
450 m6 = _mm_unpacklo_epi64(m7, m7);
451 ((__m128i *) Y)[0] = _mm_subs_epu8(((__m128i *) Y)[0], m6);
452 ((__m128i *) Y)[1] = _mm_subs_epu8(((__m128i *) Y)[1], m6);
453 ((__m128i *) Y)[2] = _mm_subs_epu8(((__m128i *) Y)[2], m6);
454 ((__m128i *) Y)[3] = _mm_subs_epu8(((__m128i *) Y)[3], m6);
/* Second butterfly of the unrolled pair: identical structure with fresh
   temporaries, reading metrics from Y and writing the next ones into X. */
456 unsigned char a188, a194;
458 short int s48, s49, s54, s55;
459 unsigned char *a187, *a193, *b15;
460 short int *a204, *a206, *a207, *a223, *a224, *b16;
461 __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210
462 , *a211, *a212, *a215, *a225, *a226;
463 __m128i a199, a200, a218, a219;
464 __m128i a189, a191, a192, a195, a197, a198, a201
465 , a202, a203, a213, a214, a216, a217, a220, a221
466 , a222, d17, d18, d19, d20, m39, m40, m41
467 , m42, m43, m44, m45, m46, s46, s47, s50
468 , s51, s52, s53, s56, s57, t25, t26, t27
470 a184 = ((__m128i *) Y);
478 a189 = _mm_set1_epi8(a188);
479 a190 = ((__m128i *) Branchtab);
481 a192 = _mm_xor_si128(a189, a191);
484 a195 = _mm_set1_epi8(a194);
487 a198 = _mm_xor_si128(a195, a197);
488 t25 = _mm_avg_epu8(a192,a198);
489 a199 = ((__m128i ) t25);
490 a200 = _mm_srli_epi16(a199, 2);
491 a201 = ((__m128i ) a200);
492 t26 = _mm_and_si128(a201, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
493 , 63, 63, 63, 63, 63, 63, 63, 63
495 t27 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
496 , 63, 63, 63, 63, 63, 63, 63, 63
498 m39 = _mm_adds_epu8(s46, t26);
499 m40 = _mm_adds_epu8(s47, t27);
500 m41 = _mm_adds_epu8(s46, t27);
501 m42 = _mm_adds_epu8(s47, t26);
502 a202 = _mm_min_epu8(m40, m39);
503 d17 = _mm_cmpeq_epi8(a202, m40);
504 a203 = _mm_min_epu8(m42, m41);
505 d18 = _mm_cmpeq_epi8(a203, m42);
506 s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17,d18));
507 a204 = ((
short int *) dec);
512 s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17,d18));
515 s50 = _mm_unpacklo_epi8(a202, a203);
516 s51 = _mm_unpackhi_epi8(a202, a203);
517 a208 = ((__m128i *) X);
527 a214 = _mm_xor_si128(a189, a213);
530 a217 = _mm_xor_si128(a195, a216);
531 t28 = _mm_avg_epu8(a214,a217);
532 a218 = ((__m128i ) t28);
533 a219 = _mm_srli_epi16(a218, 2);
534 a220 = ((__m128i ) a219);
535 t29 = _mm_and_si128(a220, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63
536 , 63, 63, 63, 63, 63, 63, 63, 63
538 t30 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63
539 , 63, 63, 63, 63, 63, 63, 63, 63
541 m43 = _mm_adds_epu8(s52, t29);
542 m44 = _mm_adds_epu8(s53, t30);
543 m45 = _mm_adds_epu8(s52, t30);
544 m46 = _mm_adds_epu8(s53, t29);
545 a221 = _mm_min_epu8(m44, m43);
546 d19 = _mm_cmpeq_epi8(a221, m44);
547 a222 = _mm_min_epu8(m46, m45);
548 d20 = _mm_cmpeq_epi8(a222, m46);
549 s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19,d20));
552 s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19,d20));
555 s56 = _mm_unpacklo_epi8(a221, a222);
556 s57 = _mm_unpackhi_epi8(a221, a222);
/* Renormalize X, mirroring the Y renormalization above. */
561 if ((((
unsigned char *) X)[0]>210)) {
563 m12 = ((__m128i *) X)[0];
564 m12 = _mm_min_epu8(m12, ((__m128i *) X)[1]);
565 m12 = _mm_min_epu8(m12, ((__m128i *) X)[2]);
566 m12 = _mm_min_epu8(m12, ((__m128i *) X)[3]);
568 m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
569 m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 32)), ((__m128i ) m14)));
570 m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 16)), ((__m128i ) m14)));
571 m14 = ((__m128i ) _mm_min_epu8(((__m128i ) _mm_srli_epi64(m14, 8)), ((__m128i ) m14)));
572 m14 = _mm_unpacklo_epi8(m14, m14);
573 m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
574 m13 = _mm_unpacklo_epi64(m14, m14);
575 ((__m128i *) X)[0] = _mm_subs_epu8(((__m128i *) X)[0], m13);
576 ((__m128i *) X)[1] = _mm_subs_epu8(((__m128i *) X)[1], m13);
577 ((__m128i *) X)[2] = _mm_subs_epu8(((__m128i *) X)[2], m13);
578 ((__m128i *) X)[3] = _mm_subs_epu8(((__m128i *) X)[3], m13);
/* Odd trailing bit handled by the scalar BFLY butterfly. */
591 for(j=0; j < (framebits + excess) % 2; ++j) {
594 BFLY(i, (((framebits+excess) >> 1) << 1) + j , syms, Y, X, (
decision_t *)dec, Branchtab);
/*
 * volk_8u_x4_conv_k7_r2_8u_generic (per the Doxygen index, defined at
 * original line 616): portable scalar fallback.  For each of the
 * framebits+excess input bits it runs the BFLY butterfly over the state
 * pairs and renormalizes the metrics past RENORMALIZE_THRESHOLD.
 *
 * NOTE(review): the extraction dropped the first signature line (the Y and X
 * parameters) and most of the loop body; code text is kept exactly as
 * recovered, comments only added.
 */
617 unsigned char* syms,
unsigned char* dec,
618 unsigned int framebits,
unsigned int excess,
619 unsigned char* Branchtab)
/* Total number of decoded bit steps. */
621 int nbits = framebits + excess;
623 int RENORMALIZE_THRESHOLD = 210;
/* Outer loop over bits; inner loop runs one butterfly per state pair. */
626 for (s=0;s<nbits;s++){
628 for(i=0;i<NUMSTATES/2;i++){
/* Presumably the X/Y metric buffers are swapped here via tmp (ping-pong
   between bit steps) -- TODO confirm against the upstream header. */
637 Y = (
unsigned char*)tmp;
static void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char *Y, unsigned char *X, unsigned char *syms, unsigned char *dec, unsigned int framebits, unsigned int excess, unsigned char *Branchtab)
Definition: volk_8u_x4_conv_k7_r2_8u.h:326
static void renormalize(unsigned char *X, unsigned char threshold)
Definition: volk_8u_x4_conv_k7_r2_8u.h:73
static void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char *Y, unsigned char *X, unsigned char *syms, unsigned char *dec, unsigned int framebits, unsigned int excess, unsigned char *Branchtab)
Definition: volk_8u_x4_conv_k7_r2_8u.h:616
for i
Definition: volk_config_fixed.tmpl.h:25
Definition: volk_8u_x4_conv_k7_r2_8u.h:60
static void BFLY(int i, int s, unsigned char *syms, unsigned char *Y, unsigned char *X, decision_t *d, unsigned char *Branchtab)
Definition: volk_8u_x4_conv_k7_r2_8u.h:91
unsigned int w[64/32]
Definition: volk_8u_x4_conv_k7_r2_8u.h:62