Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_16ic_x2_multiply_16ic.h
/* -*- c++ -*- */
/*
 * Copyright 2016 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_16ic_x2_multiply_16ic_H
#define INCLUDED_volk_16ic_x2_multiply_16ic_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_GENERIC

static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        result[n] = in_a[n] * in_b[n];
    }
}

#endif /*LV_HAVE_GENERIC*/
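
/*
 * Usage sketch (illustrative only; assumes the volk_16ic_x2_multiply_16ic
 * dispatcher generated by the VOLK build, plus volk_malloc()/volk_free(),
 * volk_get_alignment() and lv_cmake() from <volk/volk.h>). Each output sample
 * is the element-wise complex product computed in 16-bit integer arithmetic:
 * out[n] = (ar*br - ai*bi) + j(ar*bi + ai*br).
 *
 *   unsigned int num_points = 4;
 *   lv_16sc_t* a = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), volk_get_alignment());
 *   lv_16sc_t* b = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), volk_get_alignment());
 *   lv_16sc_t* c = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), volk_get_alignment());
 *   for (unsigned int n = 0; n < num_points; n++) {
 *       a[n] = lv_cmake((int16_t)(n + 1), (int16_t)2); // 1+2j, 2+2j, ...
 *       b[n] = lv_cmake((int16_t)3, (int16_t)n);       // 3+0j, 3+1j, ...
 *   }
 *   volk_16ic_x2_multiply_16ic(c, a, b, num_points);   // c[0] = 3+6j, c[1] = 4+8j, ...
 *   volk_free(a); volk_free(b); volk_free(c);
 */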


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
        result;

    mask_imag = _mm_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    mask_real = _mm_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    for (number = 0; number < sse_iters; number++) {
        a = _mm_load_si128(
            (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
        b = _mm_load_si128((__m128i*)_in_b);
        c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

        c_sr = _mm_srli_si128(c, 2); // shift c right by 2 bytes, shifting in zeros,
                                     // so each imag product lines up under the real
                                     // product of the same sample
        real = _mm_subs_epi16(c, c_sr);
        real = _mm_and_si128(real,
                             mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i

        b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
        a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

        imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
        imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

        imag = _mm_adds_epi16(imag1, imag2);
        imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...

        result = _mm_or_si128(real, imag);

        _mm_store_si128((__m128i*)_out, result);

        _in_a += 4;
        _in_b += 4;
        _out += 4;
    }

    for (number = sse_iters * 4; number < num_points; ++number) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_SSE2 */
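
/*
 * Worked example of the mask/shift trick above (illustrative numbers, not from
 * the original source). Take one input pair a = 3+2j, b = 1+4j stored as
 * (real, imag) 16-bit lanes:
 *
 *   c     = mullo(a, b)       -> lanes (3*1, 2*4) = (3, 8)
 *   c_sr  = srli(c, 2 bytes)  -> the imag product 8 drops into the real lane
 *   real  = subs(c, c_sr)     -> real lane holds 3 - 8 = -5  (= ar*br - ai*bi)
 *   b_sl  = slli(b, 2 bytes)  -> b's real part 1 moves into the imag lane
 *   imag1 = mullo(a, b_sl)    -> imag lane holds 2*1 = 2     (= ai*br)
 *   imag2 = mullo(b, a_sl)    -> imag lane holds 4*3 = 12    (= ar*bi)
 *   imag  = adds(imag1, imag2)-> imag lane holds 14          (= ar*bi + ai*br)
 *
 * Masking keeps only the real lanes of `real` and the imag lanes of `imag`,
 * and the final OR interleaves them back into (-5, 14), i.e.
 * (3+2j)*(1+4j) = -5+14j.
 */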


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
        result;

    mask_imag = _mm_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    mask_real = _mm_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    for (number = 0; number < sse_iters; number++) {
        a = _mm_loadu_si128(
            (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
        b = _mm_loadu_si128((__m128i*)_in_b);
        c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

        c_sr = _mm_srli_si128(c, 2); // shift c right by 2 bytes, shifting in zeros,
                                     // so each imag product lines up under the real
                                     // product of the same sample
        real = _mm_subs_epi16(c, c_sr);
        real = _mm_and_si128(real,
                             mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i

        b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
        a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

        imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
        imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

        imag = _mm_adds_epi16(imag1, imag2);
        imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...

        result = _mm_or_si128(real, imag);

        _mm_storeu_si128((__m128i*)_out, result);

        _in_a += 4;
        _in_b += 4;
        _out += 4;
    }

    for (number = sse_iters * 4; number < num_points; ++number) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    for (; number < avx2_points; number++) {
        a = _mm256_loadu_si256(
            (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
        b = _mm256_loadu_si256(
            (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
        c = _mm256_mullo_epi16(a, b);

        c_sr = _mm256_srli_si256(c, 2); // shift c right by 2 bytes within each
                                        // 128-bit lane, shifting in zeros
        real = _mm256_subs_epi16(c, c_sr);
        real = _mm256_and_si256(
            real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i

        b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
        a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....

        imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
        imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

        imag = _mm256_adds_epi16(imag1, imag2);
        imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...

        result = _mm256_or_si256(real, imag);

        _mm256_storeu_si256((__m256i*)_out, result);

        _in_a += 8;
        _in_b += 8;
        _out += 8;
    }

    number = avx2_points * 8;
    for (; number < num_points; number++) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_AVX2 */
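
/*
 * Note: the _u_ kernels use unaligned loads/stores, while the _a_ kernels
 * require buffers aligned to volk_get_alignment() bytes. A minimal sketch of
 * obtaining such buffers (illustrative only; assumes volk_malloc()/volk_free()
 * and volk_get_alignment() from <volk/volk.h>):
 *
 *   size_t alignment = volk_get_alignment();
 *   lv_16sc_t* aligned_buf =
 *       (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
 *   // ... fill aligned_buf, then call an _a_ kernel or the dispatcher ...
 *   volk_free(aligned_buf);
 */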


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    for (; number < avx2_points; number++) {
        a = _mm256_load_si256(
            (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
        b = _mm256_load_si256(
            (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
        c = _mm256_mullo_epi16(a, b);

        c_sr = _mm256_srli_si256(c, 2); // shift c right by 2 bytes within each
                                        // 128-bit lane, shifting in zeros
        real = _mm256_subs_epi16(c, c_sr);
        real = _mm256_and_si256(
            real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i

        b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
        a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....

        imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
        imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

        imag = _mm256_adds_epi16(imag1, imag2);
        imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...

        result = _mm256_or_si256(real, imag);

        _mm256_store_si256((__m256i*)_out, result);

        _in_a += 8;
        _in_b += 8;
        _out += 8;
    }

    number = avx2_points * 8;
    for (; number < num_points; number++) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out,
                                                   const lv_16sc_t* in_a,
                                                   const lv_16sc_t* in_b,
                                                   unsigned int num_points)
{
    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    unsigned int quarter_points = num_points / 4;
    int16x4x2_t a_val, b_val, c_val;
    int16x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        // multiply the real*real and imag*imag to get real result
        // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
        tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
        // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
        tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);

        // Multiply cross terms to get the imaginary result
        // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
        tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
        // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
        tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

        // store the results
        c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
        vst2_s16((int16_t*)out, c_val);

        a_ptr += 4;
        b_ptr += 4;
        out += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *out++ = (*a_ptr++) * (*b_ptr++);
    }
}
#endif /* LV_HAVE_NEON */
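
/*
 * What vld2_s16/vst2_s16 buy us, shown in scalar form (illustrative only): the
 * deinterleaving loads split the (real, imag) sample stream into separate real
 * and imag vectors, so the complex product reduces to lane-wise multiplies plus
 * one subtract and one add, with no in-register shuffling:
 *
 *   for (n = 0; n < 4; n++) {
 *       out_r[n] = a_r[n] * b_r[n] - a_i[n] * b_i[n];
 *       out_i[n] = a_r[n] * b_i[n] + a_i[n] * b_r[n];
 *   }
 */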

#endif /*INCLUDED_volk_16ic_x2_multiply_16ic_H*/