/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H

/* Survivor decisions for one trellis step: one bit per state (64 states for
 * this k=7, rate-1/2 code), viewable as bytes, shorts, or 32-bit words. */
typedef union {
    unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/];
    unsigned int w[64 /*NUMSTATES*/ / 32];
    unsigned short s[64 /*NUMSTATES*/ / 16];
    unsigned char c[64 /*NUMSTATES*/ / 8];
#ifdef _MSC_VER
} decision_t;
#else
} decision_t __attribute__((aligned(16)));
#endif


/* Subtract the smallest of the 64 path metrics from all of them so the
 * unsigned 8-bit metrics cannot overflow. The commented-out check shows the
 * original intent of only renormalizing once a metric exceeds `threshold`;
 * as written, the parameter is unused. */
static inline void
renormalize(unsigned char* X, unsigned char threshold)
{
    int NUMSTATES = 64;
    int i;

    unsigned char min = X[0];
    // if(min > threshold) {
    for (i = 0; i < NUMSTATES; i++)
        if (min > X[i])
            min = X[i];
    for (i = 0; i < NUMSTATES; i++)
        X[i] -= min;
    // }
}


// helper BFLY (add-compare-select butterfly) for the GENERIC version
static inline void
BFLY(int i, int s, unsigned char* syms, unsigned char* Y,
     unsigned char* X, decision_t* d, unsigned char* Branchtab)
{
    int j, decision0, decision1;
    unsigned char metric, m0, m1, m2, m3;

    int NUMSTATES = 64;
    int RATE = 2;
    int METRICSHIFT = 1;
    int PRECISIONSHIFT = 2;

    metric = 0;
    for (j = 0; j < RATE; j++)
        metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT;
    metric = metric >> PRECISIONSHIFT;

    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);

    m0 = X[i] + metric;
    m1 = X[i + NUMSTATES / 2] + (max - metric);
    m2 = X[i] + (max - metric);
    m3 = X[i + NUMSTATES / 2] + metric;

    decision0 = (signed int)(m0 - m1) > 0;
    decision1 = (signed int)(m2 - m3) > 0;

    Y[2 * i] = decision0 ? m1 : m0;
    Y[2 * i + 1] = decision1 ? m3 : m2;

    d->w[i / (sizeof(unsigned int) * 8 / 2) + s * (sizeof(decision_t) / sizeof(unsigned int))] |=
        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}
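
/* The packed decision bit for new state k (0..63) at trellis step s can be
 * read back by mirroring the expression above. This is a minimal sketch of
 * the bit addressing only; traceback itself is done by the caller, outside
 * this kernel:
 *
 *   unsigned int* w = (unsigned int*)dec;
 *   int bit = (w[k / 32 + s * (sizeof(decision_t) / sizeof(unsigned int))]
 *              >> (k & 31)) & 1;
 */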


#if LV_HAVE_AVX2

#include <immintrin.h>
#include <stdio.h>

static inline void
volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y, unsigned char* X,
                              unsigned char* syms, unsigned char* dec,
                              unsigned int framebits, unsigned int excess,
                              unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        int a73, a92;
        int s20, s21;
        unsigned char *a80, *b6;
        int *a110, *a91, *a93;
        __m256i *a112, *a71, *a72, *a77, *a83, *a95;
        __m256i a86, a87;
        __m256i a76, a78, a79, a82, a84, a85, a88, a89,
            a90, d10, d9, m23, m24, m25,
            m26, s18, s19, s22,
            s23, s24, s25, t13, t14, t15;
        a71 = ((__m256i*)X);
        s18 = *(a71);
        a72 = (a71 + 1);
        s19 = *(a72);
        s22 = _mm256_permute2x128_si256(s18, s19, 0x20);
        s19 = _mm256_permute2x128_si256(s18, s19, 0x31);
        s18 = s22;
        a73 = (4 * i9);
        b6 = (syms + a73);
        a75 = *(b6);
        a76 = _mm256_set1_epi8(a75);
        a77 = ((__m256i*)Branchtab);
        a78 = *(a77);
        a79 = _mm256_xor_si256(a76, a78);
        a80 = (b6 + 1);
        a81 = *(a80);
        a82 = _mm256_set1_epi8(a81);
        a83 = (a77 + 1);
        a84 = *(a83);
        a85 = _mm256_xor_si256(a82, a84);
        t13 = _mm256_avg_epu8(a79, a85);
        a86 = ((__m256i)t13);
        a87 = _mm256_srli_epi16(a86, 2);
        a88 = ((__m256i)a87);
        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
        m23 = _mm256_adds_epu8(s18, t14);
        m24 = _mm256_adds_epu8(s19, t15);
        m25 = _mm256_adds_epu8(s18, t15);
        m26 = _mm256_adds_epu8(s19, t14);
        a89 = _mm256_min_epu8(m24, m23);
        d9 = _mm256_cmpeq_epi8(a89, m24);
        a90 = _mm256_min_epu8(m26, m25);
        d10 = _mm256_cmpeq_epi8(a90, m26);
        s22 = _mm256_unpacklo_epi8(d9, d10);
        s23 = _mm256_unpackhi_epi8(d9, d10);
        s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
        a91 = ((int*)dec);
        a92 = (4 * i9);
        a93 = (a91 + a92);
        *(a93) = s20;
        s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
        a110 = (a93 + 1);
        *(a110) = s21;
        s22 = _mm256_unpacklo_epi8(a89, a90);
        s23 = _mm256_unpackhi_epi8(a89, a90);
        a95 = ((__m256i*)Y);
        s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
        *(a95) = s24;
        s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
        a112 = (a95 + 1);
        *(a112) = s23;
        if ((((unsigned char*)Y)[0] > 210)) {
            __m256i m5, m6;
            m5 = ((__m256i*)Y)[0];
            m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
            __m256i m7;
            m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)), ((__m256i)m7)));
            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)), ((__m256i)m7)));
            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)), ((__m256i)m7)));
            m7 = _mm256_unpacklo_epi8(m7, m7);
            m7 = _mm256_shufflelo_epi16(m7, 0);
            m6 = _mm256_unpacklo_epi64(m7, m7);
            m6 = _mm256_permute2x128_si256(m6, m6, 0); // copy lower half of m6 to upper half, since above ops operate on 128 bit lanes
            ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
            ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);
        }
        unsigned char a188, a194;
        int a205;
        int s48, s54;
        unsigned char *a187, *a193;
        int *a204, *a206, *a223, *b16;
        __m256i *a184, *a185, *a190, *a196, *a208, *a225;
        __m256i a199, a200;
        __m256i a189, a191, a192, a195, a197, a198, a201,
            a202, a203, d17, d18, m39, m40, m41,
            m42, s46, s47, s50,
            s51, t25, t26, t27;
        a184 = ((__m256i*)Y);
        s46 = *(a184);
        a185 = (a184 + 1);
        s47 = *(a185);
        s50 = _mm256_permute2x128_si256(s46, s47, 0x20);
        s47 = _mm256_permute2x128_si256(s46, s47, 0x31);
        s46 = s50;
        a187 = (b6 + 2);
        a188 = *(a187);
        a189 = _mm256_set1_epi8(a188);
        a190 = ((__m256i*)Branchtab);
        a191 = *(a190);
        a192 = _mm256_xor_si256(a189, a191);
        a193 = (b6 + 3);
        a194 = *(a193);
        a195 = _mm256_set1_epi8(a194);
        a196 = (a190 + 1);
        a197 = *(a196);
        a198 = _mm256_xor_si256(a195, a197);
        t25 = _mm256_avg_epu8(a192, a198);
        a199 = ((__m256i)t25);
        a200 = _mm256_srli_epi16(a199, 2);
        a201 = ((__m256i)a200);
        t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
        t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
        m39 = _mm256_adds_epu8(s46, t26);
        m40 = _mm256_adds_epu8(s47, t27);
        m41 = _mm256_adds_epu8(s46, t27);
        m42 = _mm256_adds_epu8(s47, t26);
        a202 = _mm256_min_epu8(m40, m39);
        d17 = _mm256_cmpeq_epi8(a202, m40);
        a203 = _mm256_min_epu8(m42, m41);
        d18 = _mm256_cmpeq_epi8(a203, m42);
        s24 = _mm256_unpacklo_epi8(d17, d18);
        s25 = _mm256_unpackhi_epi8(d17, d18);
        s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
        a204 = ((int*)dec);
        a205 = (4 * i9);
        b16 = (a204 + a205);
        a206 = (b16 + 2);
        *(a206) = s48;
        s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
        a223 = (b16 + 3);
        *(a223) = s54;
        s50 = _mm256_unpacklo_epi8(a202, a203);
        s51 = _mm256_unpackhi_epi8(a202, a203);
        s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
        s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
        a208 = ((__m256i*)X);
        *(a208) = s25;
        a225 = (a208 + 1);
        *(a225) = s51;

        if ((((unsigned char*)X)[0] > 210)) {
            __m256i m12, m13;
            m12 = ((__m256i*)X)[0];
            m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]);
            __m256i m14;
            m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)), ((__m256i)m14)));
            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)), ((__m256i)m14)));
            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)), ((__m256i)m14)));
            m14 = _mm256_unpacklo_epi8(m14, m14);
            m14 = _mm256_shufflelo_epi16(m14, 0);
            m13 = _mm256_unpacklo_epi64(m14, m14);
            m13 = _mm256_permute2x128_si256(m13, m13, 0);
            ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13);
            ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13);
        }
    }

    renormalize(X, 210);

    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i, (((framebits + excess) >> 1) << 1) + j, syms, Y, X, (decision_t*)dec, Branchtab);
        }

        renormalize(Y, 210);
    }
    /*skip*/
}

#endif /*LV_HAVE_AVX2*/


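/* The SSE3 path below performs the same butterfly pass as the AVX2 path,
 * spread across four 128-bit registers (4 x 16 = 64 state metrics) per step
 * instead of two 256-bit registers. In both SIMD paths the branch metric is
 * formed as roughly ((Branchtab0 ^ sym0) + (Branchtab1 ^ sym1)) / 8, kept in
 * the range 0..63 by the avg / shift-right-2 / mask-with-63 sequence, which
 * matches the scaling used in the scalar BFLY above. */
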
#if LV_HAVE_SSE3

#include <pmmintrin.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#include <mmintrin.h>
#include <stdio.h>

static inline void
volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y, unsigned char* X,
                                unsigned char* syms, unsigned char* dec,
                                unsigned int framebits, unsigned int excess,
                                unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        int a73, a92;
        short int s20, s21, s26, s27;
        unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83,
            *a95, *a96, *a97, *a98, *a99;
        __m128i a105, a106, a86, a87;
        __m128i a100, a101, a103, a104, a107, a108, a109,
            a76, a78, a79, a82, a84, a85, a88, a89,
            a90, d10, d11, d12, d9, m23, m24, m25,
            m26, m27, m28, m29, m30, s18, s19, s22,
            s23, s24, s25, s28, s29, t13, t14, t15,
            t16, t17, t18;
        a71 = ((__m128i*)X);
        s18 = *(a71);
        a72 = (a71 + 2);
        s19 = *(a72);
        a73 = (4 * i9);
        a74 = (syms + a73);
        a75 = *(a74);
        a76 = _mm_set1_epi8(a75);
        a77 = ((__m128i*)Branchtab);
        a78 = *(a77);
        a79 = _mm_xor_si128(a76, a78);
        b6 = (a73 + syms);
        a80 = (b6 + 1);
        a81 = *(a80);
        a82 = _mm_set1_epi8(a81);
        a83 = (a77 + 2);
        a84 = *(a83);
        a85 = _mm_xor_si128(a82, a84);
        t13 = _mm_avg_epu8(a79, a85);
        a86 = ((__m128i)t13);
        a87 = _mm_srli_epi16(a86, 2);
        a88 = ((__m128i)a87);
        t14 = _mm_and_si128(a88, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t15 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        a91 = ((short int*)dec);
        a92 = (8 * i9);
        a93 = (a91 + a92);
        *(a93) = s20;
        s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        a94 = (a93 + 1);
        *(a94) = s21;
        s22 = _mm_unpacklo_epi8(a89, a90);
        s23 = _mm_unpackhi_epi8(a89, a90);
        a95 = ((__m128i*)Y);
        *(a95) = s22;
        a96 = (a95 + 1);
        *(a96) = s23;
        a97 = (a71 + 1);
        s24 = *(a97);
        a98 = (a71 + 3);
        s25 = *(a98);
        a99 = (a77 + 1);
        a100 = *(a99);
        a101 = _mm_xor_si128(a76, a100);
        a102 = (a77 + 3);
        a103 = *(a102);
        a104 = _mm_xor_si128(a82, a103);
        t16 = _mm_avg_epu8(a101, a104);
        a105 = ((__m128i)t16);
        a106 = _mm_srli_epi16(a105, 2);
        a107 = ((__m128i)a106);
        t17 = _mm_and_si128(a107, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t18 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        a110 = (a93 + 2);
        *(a110) = s26;
        s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        a111 = (a93 + 3);
        *(a111) = s27;
        s28 = _mm_unpacklo_epi8(a108, a109);
        s29 = _mm_unpackhi_epi8(a108, a109);
        a112 = (a95 + 2);
        *(a112) = s28;
        a113 = (a95 + 3);
        *(a113) = s29;
        if ((((unsigned char*)Y)[0] > 210)) {
            __m128i m5, m6;
            m5 = ((__m128i*)Y)[0];
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
            __m128i m7;
            m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
            m7 = _mm_unpacklo_epi8(m7, m7);
            m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
            m6 = _mm_unpacklo_epi64(m7, m7);
            ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
            ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
            ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
            ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
        }
        unsigned char a188, a194;
        int a186, a205;
        short int s48, s49, s54, s55;
        unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210,
            *a211, *a212, *a215, *a225, *a226;
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201,
            a202, a203, a213, a214, a216, a217, a220, a221,
            a222, d17, d18, d19, d20, m39, m40, m41,
            m42, m43, m44, m45, m46, s46, s47, s50,
            s51, s52, s53, s56, s57, t25, t26, t27,
            t28, t29, t30;
        a184 = ((__m128i*)Y);
        s46 = *(a184);
        a185 = (a184 + 2);
        s47 = *(a185);
        a186 = (4 * i9);
        b15 = (a186 + syms);
        a187 = (b15 + 2);
        a188 = *(a187);
        a189 = _mm_set1_epi8(a188);
        a190 = ((__m128i*)Branchtab);
        a191 = *(a190);
        a192 = _mm_xor_si128(a189, a191);
        a193 = (b15 + 3);
        a194 = *(a193);
        a195 = _mm_set1_epi8(a194);
        a196 = (a190 + 2);
        a197 = *(a196);
        a198 = _mm_xor_si128(a195, a197);
        t25 = _mm_avg_epu8(a192, a198);
        a199 = ((__m128i)t25);
        a200 = _mm_srli_epi16(a199, 2);
        a201 = ((__m128i)a200);
        t26 = _mm_and_si128(a201, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t27 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), t26);
        m39 = _mm_adds_epu8(s46, t26);
        m40 = _mm_adds_epu8(s47, t27);
        m41 = _mm_adds_epu8(s46, t27);
        m42 = _mm_adds_epu8(s47, t26);
        a202 = _mm_min_epu8(m40, m39);
        d17 = _mm_cmpeq_epi8(a202, m40);
        a203 = _mm_min_epu8(m42, m41);
        d18 = _mm_cmpeq_epi8(a203, m42);
        s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
        a204 = ((short int*)dec);
        a205 = (8 * i9);
        b16 = (a204 + a205);
        a206 = (b16 + 4);
        *(a206) = s48;
        s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
        a207 = (b16 + 5);
        *(a207) = s49;
        s50 = _mm_unpacklo_epi8(a202, a203);
        s51 = _mm_unpackhi_epi8(a202, a203);
        a208 = ((__m128i*)X);
        *(a208) = s50;
        a209 = (a208 + 1);
        *(a209) = s51;
        a210 = (a184 + 1);
        s52 = *(a210);
        a211 = (a184 + 3);
        s53 = *(a211);
        a212 = (a190 + 1);
        a213 = *(a212);
        a214 = _mm_xor_si128(a189, a213);
        a215 = (a190 + 3);
        a216 = *(a215);
        a217 = _mm_xor_si128(a195, a216);
        t28 = _mm_avg_epu8(a214, a217);
        a218 = ((__m128i)t28);
        a219 = _mm_srli_epi16(a218, 2);
        a220 = ((__m128i)a219);
        t29 = _mm_and_si128(a220, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t30 = _mm_subs_epu8(_mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), t29);
        m43 = _mm_adds_epu8(s52, t29);
        m44 = _mm_adds_epu8(s53, t30);
        m45 = _mm_adds_epu8(s52, t30);
        m46 = _mm_adds_epu8(s53, t29);
        a221 = _mm_min_epu8(m44, m43);
        d19 = _mm_cmpeq_epi8(a221, m44);
        a222 = _mm_min_epu8(m46, m45);
        d20 = _mm_cmpeq_epi8(a222, m46);
        s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
        a223 = (b16 + 6);
        *(a223) = s54;
        s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
        a224 = (b16 + 7);
        *(a224) = s55;
        s56 = _mm_unpacklo_epi8(a221, a222);
        s57 = _mm_unpackhi_epi8(a221, a222);
        a225 = (a208 + 2);
        *(a225) = s56;
        a226 = (a208 + 3);
        *(a226) = s57;
        if ((((unsigned char*)X)[0] > 210)) {
            __m128i m12, m13;
            m12 = ((__m128i*)X)[0];
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
            __m128i m14;
            m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), ((__m128i)m14)));
            m14 = _mm_unpacklo_epi8(m14, m14);
            m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
            m13 = _mm_unpacklo_epi64(m14, m14);
            ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
            ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
            ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
            ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
        }
    }

    renormalize(X, 210);

    /*int ch;
    for(ch = 0; ch < 64; ch++) {
        printf("%d,", X[ch]);
    }
    printf("\n");*/

    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i, (((framebits + excess) >> 1) << 1) + j, syms, Y, X, (decision_t*)dec, Branchtab);
        }

        renormalize(Y, 210);

        /*printf("\n");
        for(ch = 0; ch < 64; ch++) {
            printf("%d,", Y[ch]);
        }
        printf("\n");*/
    }
    /*skip*/
}

#endif /*LV_HAVE_SSE3*/


#if LV_HAVE_GENERIC

static inline void
volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y, unsigned char* X,
                                 unsigned char* syms, unsigned char* dec,
                                 unsigned int framebits, unsigned int excess,
                                 unsigned char* Branchtab)
{
    int nbits = framebits + excess;
    int NUMSTATES = 64;
    int RENORMALIZE_THRESHOLD = 210;

    int s, i;
    for (s = 0; s < nbits; s++) {
        void* tmp;
        for (i = 0; i < NUMSTATES / 2; i++) {
            BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
        }

        renormalize(Y, RENORMALIZE_THRESHOLD);

        // swap X and Y so the metrics just written become the input of the next step
        tmp = (void*)X;
        X = Y;
        Y = (unsigned char*)tmp;
    }
}

#endif /* LV_HAVE_GENERIC */

#endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/
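
/*
 * Minimal usage sketch for the generic path. Illustrative only: the sizes
 * below are inferred from the indexing in this file (64 states, rate 1/2,
 * one packed decision bit per state per trellis step), not asserted by the
 * header itself. Building Branchtab from the code polynomials and running
 * the traceback over `dec` are left to the caller.
 *
 *   unsigned char X[64];          // path metrics in, one byte per state
 *   unsigned char Y[64];          // path metrics out / scratch
 *   unsigned char Branchtab[64];  // RATE * NUMSTATES/2 expected-symbol bytes
 *   // syms: 2 * (framebits + excess) soft symbols, one byte each
 *   // dec:  at least (framebits + excess) * sizeof(decision_t) bytes, zeroed
 *   //       beforehand, since BFLY ORs decision bits into it
 *
 *   volk_8u_x4_conv_k7_r2_8u_generic(Y, X, syms, dec, framebits, excess, Branchtab);
 */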