volk_8u_x2_encodeframepolar_8u.h
/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

/*
 * for documentation see 'volk_8u_x3_encodepolar_8u_x2.h'
 */

#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#include <string.h>

static inline unsigned int
log2_of_power_of_2(unsigned int val){
  // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
  static const unsigned int b[] = {0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0,
                                   0xFF00FF00, 0xFFFF0000};

  unsigned int res = (val & b[0]) != 0;
  res |= ((val & b[4]) != 0) << 4;
  res |= ((val & b[3]) != 0) << 3;
  res |= ((val & b[2]) != 0) << 2;
  res |= ((val & b[1]) != 0) << 1;
  return res;
}
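
/* Example (editor's note, illustrative only): log2_of_power_of_2() assumes that
 * exactly one bit of 'val' is set, e.g. log2_of_power_of_2(8) == 3 and
 * log2_of_power_of_2(1024) == 10. For values that are not a power of two the
 * result is not meaningful, so frame_size is expected to be a power of two.
 */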

static inline void
encodepolar_single_stage(unsigned char* frame_ptr, const unsigned char* temp_ptr,
                         const unsigned int num_branches, const unsigned int frame_half)
{
  unsigned int branch, bit;
  for(branch = 0; branch < num_branches; ++branch){
    for(bit = 0; bit < frame_half; ++bit){
      *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
      *(frame_ptr + frame_half) = *(temp_ptr + 1);
      ++frame_ptr;
      temp_ptr += 2;
    }
    frame_ptr += frame_half;
  }
}
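
/* Worked example (editor's note, not part of the original kernel): with
 * num_branches = 1 and frame_half = 2, a branch temp = {t0, t1, t2, t3} is
 * encoded as frame = {t0 ^ t1, t2 ^ t3, t1, t3}: the XOR of each pair goes to
 * the lower half of the branch, the untouched second element to the upper half.
 */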

static inline void
volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, unsigned char* temp,
                                       unsigned int frame_size)
{
  unsigned int stage = log2_of_power_of_2(frame_size);
  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;

  while(stage){
    // encode stage
    encodepolar_single_stage(frame, temp, num_branches, frame_half);
    memcpy(temp, frame, sizeof(unsigned char) * frame_size);

    // update all the parameters.
    num_branches = num_branches << 1;
    frame_half = frame_half >> 1;
    --stage;
  }
}
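
/* Usage sketch (editor's note; see 'volk_8u_x3_encodepolar_8u_x2.h' for the
 * authoritative documentation). 'temp' holds the frame_size input elements,
 * one per byte, 'frame' receives the encoded polar codeword, and frame_size
 * must be a power of two. Note that 'temp' is used as scratch space and is
 * overwritten.
 *
 *   unsigned char frame[1024];
 *   unsigned char temp[1024];
 *   // ... fill temp with the (frozen and information) bits ...
 *   volk_8u_x2_encodeframepolar_8u_generic(frame, temp, 1024);
 */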

#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

static inline void
volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame, unsigned char* temp,
                                       unsigned int frame_size)
{
  const unsigned int po2 = log2_of_power_of_2(frame_size);

  unsigned int stage = po2;
  unsigned char* frame_ptr = frame;
  unsigned char* temp_ptr = temp;

  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;
  unsigned int branch;
  unsigned int bit;

  // prepare constants
  const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  // get some SIMD registers to play with.
  __m128i r_frame0, r_temp0, shifted;

  {
    __m128i r_frame1, r_temp1;
    const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
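
    /* Editor's note: each 16-byte chunk below is handled like this.
     * _mm_srli_si128 shifts the register right by one byte; after masking with
     * mask_stage1 and XOR-ing, the even byte positions hold
     * temp[2k] ^ temp[2k + 1] while the odd positions still hold temp[2k + 1].
     * shuffle_separate then de-interleaves even and odd positions so the XOR
     * results can be written to the lower half of the branch and the plain
     * temp[2k + 1] values to the upper half, exactly as in
     * encodepolar_single_stage().
     */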

    while(stage > 4){
      frame_ptr = frame;
      temp_ptr = temp;

      // At stage 5 a branch has 32 elements, so higher stages are even larger.
      for(branch = 0; branch < num_branches; ++branch){
        for(bit = 0; bit < frame_half; bit += 16){
          r_temp0 = _mm_loadu_si128((__m128i *) temp_ptr);
          temp_ptr += 16;
          r_temp1 = _mm_loadu_si128((__m128i *) temp_ptr);
          temp_ptr += 16;

          shifted = _mm_srli_si128(r_temp0, 1);
          shifted = _mm_and_si128(shifted, mask_stage1);
          r_temp0 = _mm_xor_si128(shifted, r_temp0);
          r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

          shifted = _mm_srli_si128(r_temp1, 1);
          shifted = _mm_and_si128(shifted, mask_stage1);
          r_temp1 = _mm_xor_si128(shifted, r_temp1);
          r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);

          r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
          _mm_storeu_si128((__m128i*) frame_ptr, r_frame0);

          r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
          _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame1);
          frame_ptr += 16;
        }

        frame_ptr += frame_half;
      }
      memcpy(temp, frame, sizeof(unsigned char) * frame_size);

      num_branches = num_branches << 1;
      frame_half = frame_half >> 1;
      stage--;
    }
  }

  // This last part requires frames of at least 16 bytes.
  // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!

  // reset pointers to correct positions.
  frame_ptr = frame;
  temp_ptr = temp;

  // load first chunk.
  // Tests show a 1-2% gain compared to loading a new chunk and using it right after.
  r_temp0 = _mm_loadu_si128((__m128i*) temp_ptr);
  temp_ptr += 16;

  const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
  const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
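
  /* Editor's note: the remaining stages 4..1 all fit inside one 16-byte
   * register. shuffle_stage4 applies the 16-element bit-reversal permutation
   * once, after which each stage k is a shift by 2^(k-1) bytes, a mask that
   * selects the lower half of every 2^k-byte group, and an XOR -- the same
   * butterfly as above, just performed entirely in-register.
   */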

  for(branch = 1; branch < num_branches; ++branch){
    // shuffle once for bit-reversal.
    r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

    shifted = _mm_srli_si128(r_temp0, 8);
    shifted = _mm_and_si128(shifted, mask_stage4);
    r_frame0 = _mm_xor_si128(shifted, r_temp0);

    // start loading next chunk.
    r_temp0 = _mm_loadu_si128((__m128i*) temp_ptr);
    temp_ptr += 16;

    shifted = _mm_srli_si128(r_frame0, 4);
    shifted = _mm_and_si128(shifted, mask_stage3);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    shifted = _mm_srli_si128(r_frame0, 2);
    shifted = _mm_and_si128(shifted, mask_stage2);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    shifted = _mm_srli_si128(r_frame0, 1);
    shifted = _mm_and_si128(shifted, mask_stage1);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    // store result of chunk.
    _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
    frame_ptr += 16;
  }
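  /* Editor's note: the loop above runs num_branches - 1 times and always
   * prefetches the next chunk, so the last 16-byte chunk is handled below
   * without issuing another load past the end of 'temp'.
   */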
  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

  shifted = _mm_srli_si128(r_temp0, 8);
  shifted = _mm_and_si128(shifted, mask_stage4);
  r_frame0 = _mm_xor_si128(shifted, r_temp0);

  // unlike the loop body, no further chunk is loaded here; this is the last one.

  shifted = _mm_srli_si128(r_frame0, 4);
  shifted = _mm_and_si128(shifted, mask_stage3);
  r_frame0 = _mm_xor_si128(shifted, r_frame0);

  shifted = _mm_srli_si128(r_frame0, 2);
  shifted = _mm_and_si128(shifted, mask_stage2);
  r_frame0 = _mm_xor_si128(shifted, r_frame0);

  shifted = _mm_srli_si128(r_frame0, 1);
  shifted = _mm_and_si128(shifted, mask_stage1);
  r_frame0 = _mm_xor_si128(shifted, r_frame0);

  // store result of chunk.
  _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
}

#endif /* LV_HAVE_SSSE3 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame, unsigned char* temp,
                                      unsigned int frame_size)
{
  const unsigned int po2 = log2_of_power_of_2(frame_size);

  unsigned int stage = po2;
  unsigned char* frame_ptr = frame;
  unsigned char* temp_ptr = temp;

  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;
  unsigned int branch;
  unsigned int bit;

  // prepare constants
  const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                              0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
  // get some SIMD registers to play with.
  __m256i r_frame0, r_temp0, shifted;
  __m128i r_temp2, r_frame2, shifted2;
  {
    __m256i r_frame1, r_temp1;
    __m128i r_frame3, r_temp3;
    const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                                                      0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
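
    /* Editor's note on the AVX2 variant: _mm256_srli_si256 and the 64-bit
     * unpack intrinsics operate independently on each 128-bit lane, hence the
     * _mm256_permute4x64_epi64 fix-up before the stores. In the last pass of
     * the while loop (stage 5) a branch half is only 16 bytes, so that case is
     * handled with the 128-bit instructions guarded by the if() below.
     */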

    while(stage > 4){
      frame_ptr = frame;
      temp_ptr = temp;

      // At stage 5 a branch has 32 elements, so higher stages are even larger.
      for(branch = 0; branch < num_branches; ++branch){
        for(bit = 0; bit < frame_half; bit += 32){
          if((frame_half - bit) < 32) // only 16 bytes of this branch half remain, not 32
          {
            r_temp2 = _mm_loadu_si128((__m128i *) temp_ptr);
            temp_ptr += 16;
            r_temp3 = _mm_loadu_si128((__m128i *) temp_ptr);
            temp_ptr += 16;

            shifted2 = _mm_srli_si128(r_temp2, 1);
            shifted2 = _mm_and_si128(shifted2, mask_stage0);
            r_temp2 = _mm_xor_si128(shifted2, r_temp2);
            r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

            shifted2 = _mm_srli_si128(r_temp3, 1);
            shifted2 = _mm_and_si128(shifted2, mask_stage0);
            r_temp3 = _mm_xor_si128(shifted2, r_temp3);
            r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);

            r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
            _mm_storeu_si128((__m128i*) frame_ptr, r_frame2);

            r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
            _mm_storeu_si128((__m128i*) (frame_ptr + frame_half), r_frame3);
            frame_ptr += 16;
            break;
          }
          r_temp0 = _mm256_loadu_si256((__m256i *) temp_ptr);
          temp_ptr += 32;
          r_temp1 = _mm256_loadu_si256((__m256i *) temp_ptr);
          temp_ptr += 32;

          shifted = _mm256_srli_si256(r_temp0, 1); // shifts within each 128-bit lane
          shifted = _mm256_and_si256(shifted, mask_stage1);
          r_temp0 = _mm256_xor_si256(shifted, r_temp0);
          r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

          shifted = _mm256_srli_si256(r_temp1, 1);
          shifted = _mm256_and_si256(shifted, mask_stage1);
          r_temp1 = _mm256_xor_si256(shifted, r_temp1);
          r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

          r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
          r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
          r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
          r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

          _mm256_storeu_si256((__m256i*) frame_ptr, r_frame0);

          _mm256_storeu_si256((__m256i*) (frame_ptr + frame_half), r_frame1);
          frame_ptr += 32;
        }

        frame_ptr += frame_half;
      }
      memcpy(temp, frame, sizeof(unsigned char) * frame_size);

      num_branches = num_branches << 1;
      frame_half = frame_half >> 1;
      stage--;
    }
  }

  // This last part requires frames of at least 32 bytes.
  // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!

  // reset pointers to correct positions.
  frame_ptr = frame;
  temp_ptr = temp;

  // load first chunk.
  // Tests show a 1-2% gain compared to loading a new chunk and using it right after.
  r_temp0 = _mm256_loadu_si256((__m256i*) temp_ptr);
  temp_ptr += 32;
  const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
                                                  0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
  const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                              0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                                              0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                                              0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

  for(branch = 0; branch < num_branches/2; ++branch){
    // shuffle once for bit-reversal.
    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

    shifted = _mm256_srli_si256(r_temp0, 8); // shifts within each 128-bit lane
    shifted = _mm256_and_si256(shifted, mask_stage4);
    r_frame0 = _mm256_xor_si256(shifted, r_temp0);

    // start loading next chunk. Note that on the final iteration this load
    // points one 32-byte chunk past the frame_size bytes of 'temp'.
    r_temp0 = _mm256_loadu_si256((__m256i*) temp_ptr);
    temp_ptr += 32;

    shifted = _mm256_srli_si256(r_frame0, 4);
    shifted = _mm256_and_si256(shifted, mask_stage3);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    shifted = _mm256_srli_si256(r_frame0, 2);
    shifted = _mm256_and_si256(shifted, mask_stage2);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    shifted = _mm256_srli_si256(r_frame0, 1);
    shifted = _mm256_and_si256(shifted, mask_stage1);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    // store result of chunk.
    _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
    frame_ptr += 32;
  }
}
#endif /* LV_HAVE_AVX2 */

#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */

#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_

#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

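/* Editor's note: the *_a_* kernels below mirror the unaligned *_u_* kernels
 * above line for line; the only difference is that they use the aligned
 * _mm_load/_mm_store and _mm256_load/_mm256_store intrinsics, so 'frame' and
 * 'temp' must be suitably aligned (e.g. obtained from volk_malloc with
 * volk_get_alignment()).
 */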
static inline void
volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame, unsigned char* temp,
                                       unsigned int frame_size)
{
  const unsigned int po2 = log2_of_power_of_2(frame_size);

  unsigned int stage = po2;
  unsigned char* frame_ptr = frame;
  unsigned char* temp_ptr = temp;

  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;
  unsigned int branch;
  unsigned int bit;

  // prepare constants
  const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  // get some SIMD registers to play with.
  __m128i r_frame0, r_temp0, shifted;

  {
    __m128i r_frame1, r_temp1;
    const __m128i shuffle_separate = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

    while(stage > 4){
      frame_ptr = frame;
      temp_ptr = temp;

      // At stage 5 a branch has 32 elements, so higher stages are even larger.
      for(branch = 0; branch < num_branches; ++branch){
        for(bit = 0; bit < frame_half; bit += 16){
          r_temp0 = _mm_load_si128((__m128i *) temp_ptr);
          temp_ptr += 16;
          r_temp1 = _mm_load_si128((__m128i *) temp_ptr);
          temp_ptr += 16;

          shifted = _mm_srli_si128(r_temp0, 1);
          shifted = _mm_and_si128(shifted, mask_stage1);
          r_temp0 = _mm_xor_si128(shifted, r_temp0);
          r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

          shifted = _mm_srli_si128(r_temp1, 1);
          shifted = _mm_and_si128(shifted, mask_stage1);
          r_temp1 = _mm_xor_si128(shifted, r_temp1);
          r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);

          r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
          _mm_store_si128((__m128i*) frame_ptr, r_frame0);

          r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
          _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame1);
          frame_ptr += 16;
        }

        frame_ptr += frame_half;
      }
      memcpy(temp, frame, sizeof(unsigned char) * frame_size);

      num_branches = num_branches << 1;
      frame_half = frame_half >> 1;
      stage--;
    }
  }

  // This last part requires frames of at least 16 bytes.
  // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!

  // reset pointers to correct positions.
  frame_ptr = frame;
  temp_ptr = temp;

  // load first chunk.
  // Tests show a 1-2% gain compared to loading a new chunk and using it right after.
  r_temp0 = _mm_load_si128((__m128i*) temp_ptr);
  temp_ptr += 16;

  const __m128i shuffle_stage4 = _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
  const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

  for(branch = 1; branch < num_branches; ++branch){
    // shuffle once for bit-reversal.
    r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

    shifted = _mm_srli_si128(r_temp0, 8);
    shifted = _mm_and_si128(shifted, mask_stage4);
    r_frame0 = _mm_xor_si128(shifted, r_temp0);

    // start loading next chunk.
    r_temp0 = _mm_load_si128((__m128i*) temp_ptr);
    temp_ptr += 16;

    shifted = _mm_srli_si128(r_frame0, 4);
    shifted = _mm_and_si128(shifted, mask_stage3);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    shifted = _mm_srli_si128(r_frame0, 2);
    shifted = _mm_and_si128(shifted, mask_stage2);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    shifted = _mm_srli_si128(r_frame0, 1);
    shifted = _mm_and_si128(shifted, mask_stage1);
    r_frame0 = _mm_xor_si128(shifted, r_frame0);

    // store result of chunk.
    _mm_store_si128((__m128i*)frame_ptr, r_frame0);
    frame_ptr += 16;
  }
  // shuffle once for bit-reversal.
  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

  shifted = _mm_srli_si128(r_temp0, 8);
  shifted = _mm_and_si128(shifted, mask_stage4);
  r_frame0 = _mm_xor_si128(shifted, r_temp0);

  // unlike the loop body, no further chunk is loaded here; this is the last one.
  shifted = _mm_srli_si128(r_frame0, 4);
  shifted = _mm_and_si128(shifted, mask_stage3);
  r_frame0 = _mm_xor_si128(shifted, r_frame0);

  shifted = _mm_srli_si128(r_frame0, 2);
  shifted = _mm_and_si128(shifted, mask_stage2);
  r_frame0 = _mm_xor_si128(shifted, r_frame0);

  shifted = _mm_srli_si128(r_frame0, 1);
  shifted = _mm_and_si128(shifted, mask_stage1);
  r_frame0 = _mm_xor_si128(shifted, r_frame0);

  // store result of chunk.
  _mm_store_si128((__m128i*)frame_ptr, r_frame0);
}
#endif /* LV_HAVE_SSSE3 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame, unsigned char* temp,
                                      unsigned int frame_size)
{
  const unsigned int po2 = log2_of_power_of_2(frame_size);

  unsigned int stage = po2;
  unsigned char* frame_ptr = frame;
  unsigned char* temp_ptr = temp;

  unsigned int frame_half = frame_size >> 1;
  unsigned int num_branches = 1;
  unsigned int branch;
  unsigned int bit;

  // prepare constants
  const __m256i mask_stage1 = _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                              0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

  const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
  // get some SIMD registers to play with.
  __m256i r_frame0, r_temp0, shifted;
  __m128i r_temp2, r_frame2, shifted2;
  {
    __m256i r_frame1, r_temp1;
    __m128i r_frame3, r_temp3;
    const __m256i shuffle_separate = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                                                      0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    const __m128i shuffle_separate128 = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

    while(stage > 4){
      frame_ptr = frame;
      temp_ptr = temp;

      // At stage 5 a branch has 32 elements, so higher stages are even larger.
      for(branch = 0; branch < num_branches; ++branch){
        for(bit = 0; bit < frame_half; bit += 32){
          if((frame_half - bit) < 32) // only 16 bytes of this branch half remain, not 32
          {
            r_temp2 = _mm_load_si128((__m128i *) temp_ptr);
            temp_ptr += 16;
            r_temp3 = _mm_load_si128((__m128i *) temp_ptr);
            temp_ptr += 16;

            shifted2 = _mm_srli_si128(r_temp2, 1);
            shifted2 = _mm_and_si128(shifted2, mask_stage0);
            r_temp2 = _mm_xor_si128(shifted2, r_temp2);
            r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

            shifted2 = _mm_srli_si128(r_temp3, 1);
            shifted2 = _mm_and_si128(shifted2, mask_stage0);
            r_temp3 = _mm_xor_si128(shifted2, r_temp3);
            r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);

            r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
            _mm_store_si128((__m128i*) frame_ptr, r_frame2);

            r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
            _mm_store_si128((__m128i*) (frame_ptr + frame_half), r_frame3);
            frame_ptr += 16;
            break;
          }
          r_temp0 = _mm256_load_si256((__m256i *) temp_ptr);
          temp_ptr += 32;
          r_temp1 = _mm256_load_si256((__m256i *) temp_ptr);
          temp_ptr += 32;

          shifted = _mm256_srli_si256(r_temp0, 1); // shifts within each 128-bit lane
          shifted = _mm256_and_si256(shifted, mask_stage1);
          r_temp0 = _mm256_xor_si256(shifted, r_temp0);
          r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

          shifted = _mm256_srli_si256(r_temp1, 1);
          shifted = _mm256_and_si256(shifted, mask_stage1);
          r_temp1 = _mm256_xor_si256(shifted, r_temp1);
          r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

          r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
          r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
          r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
          r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

          _mm256_store_si256((__m256i*) frame_ptr, r_frame0);

          _mm256_store_si256((__m256i*) (frame_ptr + frame_half), r_frame1);
          frame_ptr += 32;
        }

        frame_ptr += frame_half;
      }
      memcpy(temp, frame, sizeof(unsigned char) * frame_size);

      num_branches = num_branches << 1;
      frame_half = frame_half >> 1;
      stage--;
    }
  }

  // This last part requires frames of at least 32 bytes.
  // Smaller frames are useless for SIMD optimization anyway. Just choose GENERIC!

  // reset pointers to correct positions.
  frame_ptr = frame;
  temp_ptr = temp;

  // load first chunk.
  // Tests show a 1-2% gain compared to loading a new chunk and using it right after.
  r_temp0 = _mm256_load_si256((__m256i*) temp_ptr);
  temp_ptr += 32;
  const __m256i shuffle_stage4 = _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
                                                  0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
  const __m256i mask_stage4 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                              0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m256i mask_stage3 = _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                                              0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
  const __m256i mask_stage2 = _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                                              0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

  for(branch = 0; branch < num_branches/2; ++branch){
    // shuffle once for bit-reversal.
    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

    shifted = _mm256_srli_si256(r_temp0, 8); // shifts within each 128-bit lane
    shifted = _mm256_and_si256(shifted, mask_stage4);
    r_frame0 = _mm256_xor_si256(shifted, r_temp0);

    // start loading next chunk (see the note in the unaligned AVX2 kernel).
    r_temp0 = _mm256_load_si256((__m256i*) temp_ptr);
    temp_ptr += 32;

    shifted = _mm256_srli_si256(r_frame0, 4);
    shifted = _mm256_and_si256(shifted, mask_stage3);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    shifted = _mm256_srli_si256(r_frame0, 2);
    shifted = _mm256_and_si256(shifted, mask_stage2);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    shifted = _mm256_srli_si256(r_frame0, 1);
    shifted = _mm256_and_si256(shifted, mask_stage1);
    r_frame0 = _mm256_xor_si256(shifted, r_frame0);

    // store result of chunk.
    _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
    frame_ptr += 32;
  }
}
#endif /* LV_HAVE_AVX2 */


#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */