Vector Optimized Library of Kernels 2.0
Architecture-tuned implementations of math kernels
volk_16ic_x2_multiply_16ic.h
/* -*- c++ -*- */
/*
 * Copyright 2016 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_16ic_x2_multiply_16ic_H
#define INCLUDED_volk_16ic_x2_multiply_16ic_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>

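/*
 * Orientation note (added commentary, not part of the upstream header): each
 * function below is one implementation of the same kernel, and the VOLK build
 * normally generates a public dispatcher that selects the best one at runtime.
 * Its expected prototype mirrors the variants in this file:
 *
 *   void volk_16ic_x2_multiply_16ic(lv_16sc_t* result,
 *                                   const lv_16sc_t* in_a,
 *                                   const lv_16sc_t* in_b,
 *                                   unsigned int num_points);
 *
 * Every output element is the complex product in_a[n] * in_b[n], where each
 * sample is a complex number with 16-bit integer real and imaginary parts.
 */
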
#ifdef LV_HAVE_GENERIC

static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
{
    unsigned int n;
    for (n = 0; n < num_points; n++)
        {
            result[n] = in_a[n] * in_b[n];
        }
}

#endif /*LV_HAVE_GENERIC*/

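/*
 * Minimal usage sketch for the generic kernel (added illustration; assumes a
 * compiler where lv_16sc_t from volk_complex.h behaves as a complex type and
 * the lv_cmake/lv_creal/lv_cimag helpers are available, as VOLK defines them):
 *
 *   lv_16sc_t a[2] = { lv_cmake(3, 4), lv_cmake(1, -2) };
 *   lv_16sc_t b[2] = { lv_cmake(2, 1), lv_cmake(5, 0) };
 *   lv_16sc_t r[2];
 *
 *   volk_16ic_x2_multiply_16ic_generic(r, a, b, 2);
 *   // r[0] = (3*2 - 4*1) + j(3*1 + 4*2) = 2 + j11
 *   // r[1] = (1*5 + 2*0) + j(1*0 - 2*5) = 5 - j10
 */
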
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result;

    mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); // keeps the odd (imaginary) 16-bit lanes
    mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); // keeps the even (real) 16-bit lanes

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    for (number = 0; number < sse_iters; number++)
        {
            a = _mm_load_si128((__m128i*)_in_a); // load four complex samples (2-byte real, 2-byte imag each) into one 128-bit register
            b = _mm_load_si128((__m128i*)_in_b);
            c = _mm_mullo_epi16(a, b); // element-wise products: a0.r*b0.r, a0.i*b0.i, a1.r*b1.r, ...

            c_sr = _mm_srli_si128(c, 2); // shift right by one 16-bit lane (zero fill) so each i*i product lines up with its r*r product
            real = _mm_subs_epi16(c, c_sr); // saturating subtract: even lanes now hold a.r*b.r - a.i*b.i
            real = _mm_and_si128(real, mask_real); // zero the odd lanes

            b_sl = _mm_slli_si128(b, 2); // shift left by one 16-bit lane: 0, b0.r, b0.i, b1.r, ...
            a_sl = _mm_slli_si128(a, 2); // shift left by one 16-bit lane: 0, a0.r, a0.i, a1.r, ...

            imag1 = _mm_mullo_epi16(a, b_sl); // odd lanes hold a.i*b.r
            imag2 = _mm_mullo_epi16(b, a_sl); // odd lanes hold b.i*a.r

            imag = _mm_adds_epi16(imag1, imag2); // saturating add: odd lanes now hold a.i*b.r + b.i*a.r
            imag = _mm_and_si128(imag, mask_imag); // zero the even lanes

            result = _mm_or_si128(real, imag); // interleave real and imaginary parts again

            _mm_store_si128((__m128i*)_out, result);

            _in_a += 4;
            _in_b += 4;
            _out += 4;
        }

    for (number = sse_iters * 4; number < num_points; ++number)
        {
            *_out++ = (*_in_a++) * (*_in_b++);
        }
}
#endif /* LV_HAVE_SSE2 */

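/*
 * Scalar sketch of the shift/mask trick used by the SSE2 kernels (added
 * illustration only; `lanes` of a[] and b[] model one 128-bit register as
 * eight int16 values in memory order r0,i0,r1,i1,..., and sat16() is a
 * hypothetical saturating helper standing in for _mm_subs_epi16/_mm_adds_epi16):
 *
 *   int16_t c[8], c_sr[8], out[8];
 *   int k;
 *   for (k = 0; k < 8; k++) c[k] = (int16_t)(a[k] * b[k]);      // _mm_mullo_epi16(a, b)
 *   for (k = 0; k < 8; k++) c_sr[k] = (k < 7) ? c[k + 1] : 0;   // _mm_srli_si128(c, 2)
 *   for (k = 0; k < 8; k += 2)                                  // even lanes: real parts
 *       out[k] = sat16(c[k] - c_sr[k]);                         //   r*r - i*i
 *   for (k = 1; k < 8; k += 2)                                  // odd lanes: imaginary parts
 *       out[k] = sat16((int16_t)(a[k] * b[k - 1]) + (int16_t)(b[k] * a[k - 1])); // a.i*b.r + b.i*a.r
 */
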
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/* Same arithmetic as the aligned SSE2 version above, but with unaligned loads and stores. */
static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result;

    mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); // keeps the odd (imaginary) 16-bit lanes
    mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); // keeps the even (real) 16-bit lanes

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    for (number = 0; number < sse_iters; number++)
        {
            a = _mm_loadu_si128((__m128i*)_in_a); // unaligned load of four complex samples (2-byte real, 2-byte imag each)
            b = _mm_loadu_si128((__m128i*)_in_b);
            c = _mm_mullo_epi16(a, b); // element-wise products: a0.r*b0.r, a0.i*b0.i, a1.r*b1.r, ...

            c_sr = _mm_srli_si128(c, 2); // shift right by one 16-bit lane (zero fill) so each i*i product lines up with its r*r product
            real = _mm_subs_epi16(c, c_sr); // saturating subtract: even lanes now hold a.r*b.r - a.i*b.i
            real = _mm_and_si128(real, mask_real); // zero the odd lanes

            b_sl = _mm_slli_si128(b, 2); // shift left by one 16-bit lane: 0, b0.r, b0.i, b1.r, ...
            a_sl = _mm_slli_si128(a, 2); // shift left by one 16-bit lane: 0, a0.r, a0.i, a1.r, ...

            imag1 = _mm_mullo_epi16(a, b_sl); // odd lanes hold a.i*b.r
            imag2 = _mm_mullo_epi16(b, a_sl); // odd lanes hold b.i*a.r

            imag = _mm_adds_epi16(imag1, imag2); // saturating add: odd lanes now hold a.i*b.r + b.i*a.r
            imag = _mm_and_si128(imag, mask_imag); // zero the even lanes

            result = _mm_or_si128(real, imag); // interleave real and imaginary parts again

            _mm_storeu_si128((__m128i*)_out, result);

            _in_a += 4;
            _in_b += 4;
            _out += 4;
        }

    for (number = sse_iters * 4; number < num_points; ++number)
        {
            *_out++ = (*_in_a++) * (*_in_b++);
        }
}
#endif /* LV_HAVE_SSE2 */

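/*
 * The _a_ variants require buffers aligned to the machine's SIMD alignment,
 * while the _u_ variants accept arbitrary pointers. A minimal alignment check
 * (added illustration; it uses only volk_get_alignment() from the VOLK
 * runtime, and the buffer names are placeholders):
 *
 *   #include <stdint.h>
 *
 *   const size_t al = volk_get_alignment();
 *   const int aligned = ((uintptr_t)in_a % al == 0) &&
 *                       ((uintptr_t)in_b % al == 0) &&
 *                       ((uintptr_t)out  % al == 0);
 *   // aligned ? volk_16ic_x2_multiply_16ic_a_sse2(...) : volk_16ic_x2_multiply_16ic_u_sse2(...)
 */
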
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); // keeps the odd (imaginary) 16-bit lanes
    const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); // keeps the even (real) 16-bit lanes

    for (; number < avx2_points; number++)
        {
            a = _mm256_loadu_si256((__m256i*)_in_a); // unaligned load of eight complex samples (2-byte real, 2-byte imag each)
            b = _mm256_loadu_si256((__m256i*)_in_b);
            c = _mm256_mullo_epi16(a, b); // element-wise products: a0.r*b0.r, a0.i*b0.i, a1.r*b1.r, ...

            // The 256-bit byte shifts operate on each 128-bit half independently; the lanes
            // corrupted at the half boundary are exactly the ones the masks discard below.
            c_sr = _mm256_srli_si256(c, 2); // shift right by one 16-bit lane so each i*i product lines up with its r*r product
            real = _mm256_subs_epi16(c, c_sr); // saturating subtract: even lanes now hold a.r*b.r - a.i*b.i
            real = _mm256_and_si256(real, mask_real); // zero the odd lanes

            b_sl = _mm256_slli_si256(b, 2); // shift left by one 16-bit lane: 0, b0.r, b0.i, b1.r, ...
            a_sl = _mm256_slli_si256(a, 2); // shift left by one 16-bit lane: 0, a0.r, a0.i, a1.r, ...

            imag1 = _mm256_mullo_epi16(a, b_sl); // odd lanes hold a.i*b.r
            imag2 = _mm256_mullo_epi16(b, a_sl); // odd lanes hold b.i*a.r

            imag = _mm256_adds_epi16(imag1, imag2); // saturating add: odd lanes now hold a.i*b.r + b.i*a.r
            imag = _mm256_and_si256(imag, mask_imag); // zero the even lanes

            result = _mm256_or_si256(real, imag); // interleave real and imaginary parts again

            _mm256_storeu_si256((__m256i*)_out, result);

            _in_a += 8;
            _in_b += 8;
            _out += 8;
        }
    _mm256_zeroupper(); // avoid AVX/SSE transition penalties in subsequent SSE code

    number = avx2_points * 8;
    for (; number < num_points; number++)
        {
            *_out++ = (*_in_a++) * (*_in_b++);
        }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/* Same arithmetic as the unaligned AVX2 version above, but with aligned loads and stores. */
static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); // keeps the odd (imaginary) 16-bit lanes
    const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); // keeps the even (real) 16-bit lanes

    for (; number < avx2_points; number++)
        {
            a = _mm256_load_si256((__m256i*)_in_a); // aligned load of eight complex samples (2-byte real, 2-byte imag each)
            b = _mm256_load_si256((__m256i*)_in_b);
            c = _mm256_mullo_epi16(a, b); // element-wise products: a0.r*b0.r, a0.i*b0.i, a1.r*b1.r, ...

            c_sr = _mm256_srli_si256(c, 2); // shift right by one 16-bit lane (per 128-bit half) so each i*i product lines up with its r*r product
            real = _mm256_subs_epi16(c, c_sr); // saturating subtract: even lanes now hold a.r*b.r - a.i*b.i
            real = _mm256_and_si256(real, mask_real); // zero the odd lanes

            b_sl = _mm256_slli_si256(b, 2); // shift left by one 16-bit lane: 0, b0.r, b0.i, b1.r, ...
            a_sl = _mm256_slli_si256(a, 2); // shift left by one 16-bit lane: 0, a0.r, a0.i, a1.r, ...

            imag1 = _mm256_mullo_epi16(a, b_sl); // odd lanes hold a.i*b.r
            imag2 = _mm256_mullo_epi16(b, a_sl); // odd lanes hold b.i*a.r

            imag = _mm256_adds_epi16(imag1, imag2); // saturating add: odd lanes now hold a.i*b.r + b.i*a.r
            imag = _mm256_and_si256(imag, mask_imag); // zero the even lanes

            result = _mm256_or_si256(real, imag); // interleave real and imaginary parts again

            _mm256_store_si256((__m256i*)_out, result);

            _in_a += 8;
            _in_b += 8;
            _out += 8;
        }
    _mm256_zeroupper(); // avoid AVX/SSE transition penalties in subsequent SSE code

    number = avx2_points * 8;
    for (; number < num_points; number++)
        {
            *_out++ = (*_in_a++) * (*_in_b++);
        }
}
#endif /* LV_HAVE_AVX2 */

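/*
 * For benchmarking or regression-testing one implementation in isolation, the
 * generated VOLK API also exposes a "manual" entry point selected by name
 * (sketch under that assumption; the exact implementation-name strings are
 * defined by the VOLK build, the ones below mirror the suffixes in this file):
 *
 *   volk_16ic_x2_multiply_16ic_manual(result, in_a, in_b, num_points, "generic");
 *   volk_16ic_x2_multiply_16ic_manual(result, in_a, in_b, num_points, "u_avx2");
 */
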
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
{
    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    unsigned int quarter_points = num_points / 4;
    int16x4x2_t a_val, b_val, c_val;
    int16x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number)
        {
            a_val = vld2_s16((int16_t*)a_ptr); // deinterleaving load: a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
            b_val = vld2_s16((int16_t*)b_ptr); // deinterleaving load: b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
            __VOLK_PREFETCH(a_ptr + 4);
            __VOLK_PREFETCH(b_ptr + 4);

            // multiply real*real and imag*imag to get the real result
            // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
            tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
            // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
            tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);

            // multiply the cross terms to get the imaginary result
            // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
            tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
            // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
            tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

            // combine and store the results with an interleaving store
            c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
            c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
            vst2_s16((int16_t*)out, c_val);

            a_ptr += 4;
            b_ptr += 4;
            out += 4;
        }

    for (number = quarter_points * 4; number < num_points; number++)
        {
            *out++ = (*a_ptr++) * (*b_ptr++);
        }
}
#endif /* LV_HAVE_NEON */

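/*
 * End-to-end usage sketch of the runtime dispatcher (added illustration;
 * assumes linking against libvolk and including <volk/volk.h>, which provides
 * volk_malloc(), volk_free(), volk_get_alignment() and the dispatcher):
 *
 *   #include <volk/volk.h>
 *   #include <stdio.h>
 *
 *   int main(void)
 *   {
 *       const unsigned int n = 1024;
 *       const size_t al = volk_get_alignment();
 *       lv_16sc_t* a = (lv_16sc_t*)volk_malloc(n * sizeof(lv_16sc_t), al);
 *       lv_16sc_t* b = (lv_16sc_t*)volk_malloc(n * sizeof(lv_16sc_t), al);
 *       lv_16sc_t* r = (lv_16sc_t*)volk_malloc(n * sizeof(lv_16sc_t), al);
 *       unsigned int i;
 *
 *       for (i = 0; i < n; i++)
 *           {
 *               a[i] = lv_cmake((short)i, (short)1);
 *               b[i] = lv_cmake((short)2, (short)0);
 *           }
 *
 *       volk_16ic_x2_multiply_16ic(r, a, b, n); // aligned buffers, so the aligned fast path can be used
 *       printf("r[3] = %d + j%d\n", (int)lv_creal(r[3]), (int)lv_cimag(r[3])); // expect 6 + j2
 *
 *       volk_free(a);
 *       volk_free(b);
 *       volk_free(r);
 *       return 0;
 *   }
 */
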
#endif /*INCLUDED_volk_16ic_x2_multiply_16ic_H*/