Vector Optimized Library of Kernels (VOLK) 2.0
Architecture-tuned implementations of math kernels
volk_32f_8u_polarbutterfly_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
72 #define VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
73 #include <math.h>
75 
/*
 * Min-sum approximation of the polar-code f-function (odd/upper node):
 * result carries the product of both input signs and the smaller magnitude,
 * i.e. sign(la) * sign(lb) * min(|la|, |lb|).
 */
static inline float
llr_odd(const float la, const float lb)
{
  const float mag_a = fabsf(la);
  const float mag_b = fabsf(lb);
  const float min_mag = (mag_a < mag_b) ? mag_a : mag_b;
  return copysignf(1.0f, la) * copysignf(1.0f, lb) * min_mag;
}
83 
/*
 * Apply the min-sum (odd) update for every stage from depth-1 down to
 * min_stage. LLRs are stored stage-by-stage in one consecutive buffer of
 * frame_size floats per stage; each stage combines 2 * stage_size values
 * from the next-higher stage into stage_size values at offset 'row'.
 */
static inline void
llr_odd_stages(float* llrs, int min_stage, const int depth, const int frame_size, const int row)
{
  int cur_stage;
  for (cur_stage = depth - 1; cur_stage >= min_stage; cur_stage--) {
    const int cur_size = 0x01 << cur_stage;  /* outputs produced at this stage */
    float* dst = llrs + cur_stage * frame_size + row;
    const float* src = dst + frame_size;     /* inputs live one stage higher */
    int k;
    for (k = 0; k < cur_size; k++) {
      dst[k] = llr_odd(src[2 * k], src[2 * k + 1]);
    }
  }
}
105 
/*
 * Polar-code g-function (even/lower node): combine two LLRs using the
 * previously decided bit f. f == 0 selects lb + la; any non-zero f
 * selects lb - la.
 */
static inline float
llr_even(const float la, const float lb, const unsigned char f)
{
  if (f) {
    return lb - la;
  }
  return lb + la;
}
116 
/*
 * Gather the bits at odd positions of u (u[1], u[3], ...) into u_even.
 * These form the 'even' branch input for the recursive butterfly call.
 */
static inline void
even_u_values(unsigned char* u_even, const unsigned char* u, const int u_num)
{
  int idx;
  for (idx = 1; idx < u_num; idx += 2) {
    *u_even++ = u[idx];
  }
}
127 
/*
 * XOR each adjacent bit pair of u (u[0]^u[1], u[2]^u[3], ...) into u_xor.
 * These form the 'odd' branch input (partial re-encoding) for the
 * recursive butterfly call.
 */
static inline void
odd_xor_even_values(unsigned char* u_xor, const unsigned char* u, const int u_num)
{
  int base;
  for (base = 0; base + 1 < u_num; base += 2) {
    *u_xor++ = u[base] ^ u[base + 1];
  }
}
137 
/*
 * Return how many butterfly stages above 'row' can be processed in one
 * sweep. The row can ascend one more stage as long as it lies in the
 * upper half of the current stage's span — equivalently, while the
 * corresponding low bit of 'row' is clear. Stage frame_exp - 1 holds the
 * received channel values and is never counted.
 */
static inline int
calculate_max_stage_depth_for_row(const int frame_exp, const int row)
{
  int depth = 0;
  while (depth < (frame_exp - 1) && ((row >> depth) & 0x01) == 0) {
    depth++;
  }
  return depth;
}
154 
155 #ifdef LV_HAVE_GENERIC
156 
/*
 * Generic implementation: calculate one LLR of the polar decoder
 * butterfly graph (natural bit order).
 *
 * LLRs are stored stage-by-stage in one consecutive array of
 * (frame_exp + 1) * frame_size floats; the last stage holds the received
 * channel LLRs. For rows in the lower half of a stage span, the partner
 * bit was already decided and a single g-function (llr_even) update
 * suffices. For upper-half rows, the function recurses into the next
 * stage for both branch rows, re-encoding already-decided bits into the
 * scratch area at u + frame_size, then combines with the f-function
 * (llr_odd).
 *
 * llrs      : LLR buffer, result written at llrs[row] of the current stage
 * u         : decided bits so far; u + frame_size is used as scratch
 * frame_exp : log2 of the frame size
 * stage     : current decoder stage
 * u_num     : number of bits decided so far
 * row       : butterfly-graph row to compute
 */
static inline void
volk_32f_8u_polarbutterfly_32f_generic(float* llrs, unsigned char* u,
                                       const int frame_exp,
                                       const int stage, const int u_num, const int row)
{
  const int frame_size = 0x01 << frame_exp;
  const int next_stage = stage + 1;

  const int half_stage_size = 0x01 << stage;
  const int stage_size = half_stage_size << 1;

  /* Fix: was 'const bool', but this header includes neither <stdbool.h>
   * nor any C++ guard — plain 'int' keeps it valid C with identical
   * behavior (comparison yields 0 or 1). */
  const int is_upper_stage_half = row % stage_size < half_stage_size;

  float* next_llrs = llrs + frame_size; /* LLRs are stored in a consecutive array. */
  float* call_row_llr = llrs + row;

  const int section = row - (row % stage_size);
  const int jump_size = ((row % half_stage_size) << 1) % stage_size;

  const int next_upper_row = section + jump_size;
  const int next_lower_row = next_upper_row + 1;

  const float* upper_right_llr_ptr = next_llrs + next_upper_row;
  const float* lower_right_llr_ptr = next_llrs + next_lower_row;

  if(!is_upper_stage_half){
    /* Lower half: combine with g-function using the already-decided bit. */
    const int u_pos = u_num >> stage;
    const unsigned char f = u[u_pos - 1];
    *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
    return;
  }

  if(frame_exp > next_stage){
    /* Upper half and not yet at the channel stage: recurse for both
     * branch rows, first re-encoding the decided bits for each branch. */
    unsigned char* u_half = u + frame_size;
    odd_xor_even_values(u_half, u, u_num);
    volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row);

    even_u_values(u_half, u, u_num);
    volk_32f_8u_polarbutterfly_32f_generic(next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row);
  }

  /* Combine the two next-stage results with the f-function. */
  *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
}
201 
202 #endif /* LV_HAVE_GENERIC */
203 
204 
205 #ifdef LV_HAVE_AVX
206 #include <immintrin.h>
208 
/*
 * AVX implementation: calculate one LLR of the polar decoder butterfly
 * graph. Odd rows need only a single g-function update and return early;
 * shallow rows (max usable stage depth < 3) fall back to the generic
 * implementation because the per-stage vectors would be shorter than one
 * AVX register. Otherwise the reachable stages are processed with 8-wide
 * vector updates, and the final stages (< 3) are finished scalar via
 * llr_odd_stages.
 */
static inline void
volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs, unsigned char* u,
 const int frame_exp,
 const int stage, const int u_num, const int row)
{
 const int frame_size = 0x01 << frame_exp;
 if(row % 2){ // for odd rows just do the only necessary calculation and return.
 const float* next_llrs = llrs + frame_size + row;
 *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
 return;
 }

 const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
 if(max_stage_depth < 3){ // vectorized version needs larger vectors.
 volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
 return;
 }

 int loop_stage = max_stage_depth;
 int stage_size = 0x01 << loop_stage;

 float* src_llr_ptr;
 float* dst_llr_ptr;

 __m256 src0, src1, dst;

 if(row){ // not necessary for ZERO row. == first bit to be decoded.
 // first do bit combination for all stages
 // effectively encode some decoded bits again.
 // u + frame_size and u + 2*frame_size are scratch areas supplied by the caller.
 unsigned char* u_target = u + frame_size;
 unsigned char* u_temp = u + 2* frame_size;
 memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);

 // SSSE3 encoder needs at least 16 elements; smaller frames go scalar.
 if(stage_size > 15){
 _mm256_zeroupper();
 volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
 }
 else{
 volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
 }

 src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
 dst_llr_ptr = llrs + max_stage_depth * frame_size + row;

 __m128i fbits;

 int p;
 // g-function stage: fold the re-encoded bits into 8 LLR updates per pass.
 for(p = 0; p < stage_size; p += 8){
 _mm256_zeroupper();
 // NOTE(review): loads 16 bytes but advances u_target by only 8 —
 // presumably the helper consumes just the low 8 f-bits; verify
 // against _mm256_polar_fsign_add_llrs.
 fbits = _mm_loadu_si128((__m128i*) u_target);
 u_target += 8;

 src0 = _mm256_loadu_ps(src_llr_ptr);
 src1 = _mm256_loadu_ps(src_llr_ptr + 8);
 src_llr_ptr += 16;

 dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits);

 _mm256_storeu_ps(dst_llr_ptr, dst);
 dst_llr_ptr += 8;
 }

 --loop_stage;
 stage_size >>= 1;
 }

 const int min_stage = stage > 2 ? stage : 2;

 _mm256_zeroall(); // Important to clear cache!

 int el;
 // f-function (min-sum) stages, 8 outputs per iteration, down to stage 3.
 while(min_stage < loop_stage){
 dst_llr_ptr = llrs + loop_stage * frame_size + row;
 src_llr_ptr = dst_llr_ptr + frame_size;
 for(el = 0; el < stage_size; el += 8){
 src0 = _mm256_loadu_ps(src_llr_ptr);
 src_llr_ptr += 8;
 src1 = _mm256_loadu_ps(src_llr_ptr);
 src_llr_ptr += 8;

 dst = _mm256_polar_minsum_llrs(src0, src1);

 _mm256_storeu_ps(dst_llr_ptr, dst);
 dst_llr_ptr += 8;
 }

 --loop_stage;
 stage_size >>= 1;

 }

 // for stages < 3 vectors are too small!.
 llr_odd_stages(llrs, stage, loop_stage + 1,frame_size, row);
}
303 
304 #endif /* LV_HAVE_AVX */
305 
306 #ifdef LV_HAVE_AVX2
307 #include <immintrin.h>
309 
/*
 * AVX2 implementation: identical structure to the AVX version, differing
 * only in the g-function helper (_mm256_polar_fsign_add_llrs_avx2). Odd
 * rows take a single scalar g-function update; shallow rows fall back to
 * the generic implementation; remaining stages are processed 8-wide and
 * finished scalar via llr_odd_stages.
 */
static inline void
volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs, unsigned char* u,
 const int frame_exp,
 const int stage, const int u_num, const int row)
{
 const int frame_size = 0x01 << frame_exp;
 if(row % 2){ // for odd rows just do the only necessary calculation and return.
 const float* next_llrs = llrs + frame_size + row;
 *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
 return;
 }

 const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
 if(max_stage_depth < 3){ // vectorized version needs larger vectors.
 volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
 return;
 }

 int loop_stage = max_stage_depth;
 int stage_size = 0x01 << loop_stage;

 float* src_llr_ptr;
 float* dst_llr_ptr;

 __m256 src0, src1, dst;

 if(row){ // not necessary for ZERO row. == first bit to be decoded.
 // first do bit combination for all stages
 // effectively encode some decoded bits again.
 // u + frame_size and u + 2*frame_size are scratch areas supplied by the caller.
 unsigned char* u_target = u + frame_size;
 unsigned char* u_temp = u + 2* frame_size;
 memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);

 // SSSE3 encoder needs at least 16 elements; smaller frames go scalar.
 if(stage_size > 15){
 _mm256_zeroupper();
 volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
 }
 else{
 volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
 }

 src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
 dst_llr_ptr = llrs + max_stage_depth * frame_size + row;

 __m128i fbits;

 int p;
 // g-function stage: fold the re-encoded bits into 8 LLR updates per pass.
 for(p = 0; p < stage_size; p += 8){
 _mm256_zeroupper();
 // NOTE(review): loads 16 bytes but advances u_target by only 8 —
 // presumably the helper consumes just the low 8 f-bits; verify
 // against _mm256_polar_fsign_add_llrs_avx2.
 fbits = _mm_loadu_si128((__m128i*) u_target);
 u_target += 8;

 src0 = _mm256_loadu_ps(src_llr_ptr);
 src1 = _mm256_loadu_ps(src_llr_ptr + 8);
 src_llr_ptr += 16;

 dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits);

 _mm256_storeu_ps(dst_llr_ptr, dst);
 dst_llr_ptr += 8;
 }

 --loop_stage;
 stage_size >>= 1;
 }

 const int min_stage = stage > 2 ? stage : 2;

 _mm256_zeroall(); // Important to clear cache!

 int el;
 // f-function (min-sum) stages, 8 outputs per iteration, down to stage 3.
 while(min_stage < loop_stage){
 dst_llr_ptr = llrs + loop_stage * frame_size + row;
 src_llr_ptr = dst_llr_ptr + frame_size;
 for(el = 0; el < stage_size; el += 8){
 src0 = _mm256_loadu_ps(src_llr_ptr);
 src_llr_ptr += 8;
 src1 = _mm256_loadu_ps(src_llr_ptr);
 src_llr_ptr += 8;

 dst = _mm256_polar_minsum_llrs(src0, src1);

 _mm256_storeu_ps(dst_llr_ptr, dst);
 dst_llr_ptr += 8;
 }

 --loop_stage;
 stage_size >>= 1;

 }

 // for stages < 3 vectors are too small!.
 llr_odd_stages(llrs, stage, loop_stage + 1,frame_size, row);
}
404 
405 #endif /* LV_HAVE_AVX2 */
406 
407 #endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ */
static void volk_8u_x2_encodeframepolar_8u_generic(unsigned char *frame, unsigned char *temp, unsigned int frame_size)
Definition: volk_8u_x2_encodeframepolar_8u.h:62
static void volk_32f_8u_polarbutterfly_32f_generic(float *llrs, unsigned char *u, const int frame_exp, const int stage, const int u_num, const int row)
Definition: volk_32f_8u_polarbutterfly_32f.h:158
static void llr_odd_stages(float *llrs, int min_stage, const int depth, const int frame_size, const int row)
Definition: volk_32f_8u_polarbutterfly_32f.h:85
static void volk_32f_8u_polarbutterfly_32f_u_avx(float *llrs, unsigned char *u, const int frame_exp, const int stage, const int u_num, const int row)
Definition: volk_32f_8u_polarbutterfly_32f.h:210
static void odd_xor_even_values(unsigned char *u_xor, const unsigned char *u, const int u_num)
Definition: volk_32f_8u_polarbutterfly_32f.h:129
static float llr_even(const float la, const float lb, const unsigned char f)
Definition: volk_32f_8u_polarbutterfly_32f.h:107
static __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
Definition: volk_avx_intrinsics.h:101
static void even_u_values(unsigned char *u_even, const unsigned char *u, const int u_num)
Definition: volk_32f_8u_polarbutterfly_32f.h:118
for i
Definition: volk_config_fixed.tmpl.h:25
static int calculate_max_stage_depth_for_row(const int frame_exp, const int row)
Definition: volk_32f_8u_polarbutterfly_32f.h:139
static float llr_odd(const float la, const float lb)
Definition: volk_32f_8u_polarbutterfly_32f.h:77
static __m256 _mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
Definition: volk_avx2_intrinsics.h:51
static void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char *frame, unsigned char *temp, unsigned int frame_size)
Definition: volk_8u_x2_encodeframepolar_8u.h:85
static __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
Definition: volk_avx_intrinsics.h:115