#ifndef INCLUDED_volk_16ic_x2_multiply_16ic_H
#define INCLUDED_volk_16ic_x2_multiply_16ic_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_GENERIC

static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    unsigned int n;
    for (n = 0; n < num_points; n++)
    {
        result[n] = in_a[n] * in_b[n];
    }
}

#endif /* LV_HAVE_GENERIC */
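/*
 * Minimal usage sketch (an illustration added here, not part of the original
 * kernel): lv_cmake(), lv_creal() and lv_cimag() are the complex helpers
 * provided by volk_complex.h.
 *
 * \code
 * lv_16sc_t a[4], b[4], prod[4];
 * unsigned int i;
 * for (i = 0; i < 4; i++) {
 *     a[i] = lv_cmake((int16_t)(i + 1), (int16_t)1); // (1+1j), (2+1j), ...
 *     b[i] = lv_cmake((int16_t)2, (int16_t)(-1));    // (2-1j) everywhere
 * }
 * volk_16ic_x2_multiply_16ic_generic(prod, a, b, 4);
 * // prod[0] == (1+1j)*(2-1j) == (3+1j)
 * \endcode
 */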
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result;

    mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); // keep odd 16-bit lanes (imaginary parts)
    mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); // keep even 16-bit lanes (real parts)

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    for (number = 0; number < sse_iters; number++)
    {
        a = _mm_load_si128((__m128i*)_in_a); // 4 complex values: a0.r, a0.i, a1.r, a1.i, ...
        b = _mm_load_si128((__m128i*)_in_b);
        c = _mm_mullo_epi16(a, b); // lane-wise products: a0.r*b0.r, a0.i*b0.i, a1.r*b1.r, ...

        c_sr = _mm_srli_si128(c, 2); // shift right one 16-bit lane, shifting in zeros
        real = _mm_subs_epi16(c, c_sr); // even lanes: a.r*b.r - a.i*b.i (saturated)
        real = _mm_and_si128(real, mask_real); // keep only the real-part lanes

        b_sl = _mm_slli_si128(b, 2); // shift left one lane: 0, b0.r, b0.i, b1.r, ...
        a_sl = _mm_slli_si128(a, 2);

        imag1 = _mm_mullo_epi16(a, b_sl); // odd lanes: a.i*b.r
        imag2 = _mm_mullo_epi16(b, a_sl); // odd lanes: b.i*a.r

        imag = _mm_adds_epi16(imag1, imag2); // odd lanes: a.i*b.r + a.r*b.i (saturated)
        imag = _mm_and_si128(imag, mask_imag); // keep only the imaginary-part lanes

        result = _mm_or_si128(real, imag); // merge into interleaved complex results

        _mm_store_si128((__m128i*)_out, result); // aligned store of 4 complex products

        _in_a += 4;
        _in_b += 4;
        _out += 4;
    }
    for (number = sse_iters * 4; number < num_points; ++number)
    {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_SSE2 */
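/*
 * Worked illustration of the shift-and-mask trick above (added for clarity;
 * the numbers are arbitrary). Take a0 = 3+2j and b0 = 1+4j in the lowest
 * two 16-bit lanes:
 *
 *   c     = mullo(a, b)        -> lane0 = 3*1 = 3,  lane1 = 2*4 = 8
 *   c_sr  = srli(c, 2 bytes)   -> lane0 = 8
 *   real  = subs(c, c_sr)      -> lane0 = 3 - 8 = -5             (real part)
 *   imag1 = mullo(a, b_sl)     -> lane1 = a0.i*b0.r = 2*1 = 2
 *   imag2 = mullo(b, a_sl)     -> lane1 = b0.i*a0.r = 4*3 = 12
 *   imag  = adds(imag1, imag2) -> lane1 = 2 + 12 = 14       (imaginary part)
 *
 * Masking and OR-ing leaves -5 in lane 0 and 14 in lane 1, i.e.
 * (3+2j)*(1+4j) = -5+14j, as expected.
 */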
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result;

    mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); // keep odd 16-bit lanes (imaginary parts)
    mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); // keep even 16-bit lanes (real parts)

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    for (number = 0; number < sse_iters; number++)
    {
        // same shift-and-mask scheme as the aligned kernel above,
        // but with unaligned memory accesses
        a = _mm_loadu_si128((__m128i*)_in_a);
        b = _mm_loadu_si128((__m128i*)_in_b);
        c = _mm_mullo_epi16(a, b);

        c_sr = _mm_srli_si128(c, 2);
        real = _mm_subs_epi16(c, c_sr);
        real = _mm_and_si128(real, mask_real);

        b_sl = _mm_slli_si128(b, 2);
        a_sl = _mm_slli_si128(a, 2);

        imag1 = _mm_mullo_epi16(a, b_sl);
        imag2 = _mm_mullo_epi16(b, a_sl);

        imag = _mm_adds_epi16(imag1, imag2);
        imag = _mm_and_si128(imag, mask_imag);

        result = _mm_or_si128(real, imag);

        _mm_storeu_si128((__m128i*)_out, result); // unaligned store

        _in_a += 4;
        _in_b += 4;
        _out += 4;
    }
    for (number = sse_iters * 4; number < num_points; ++number)
    {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_SSE2 */
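/*
 * Note on _a vs. _u variants: the _a_sse2 kernel uses aligned loads/stores
 * and requires 16-byte-aligned buffers, while the _u_sse2 kernel just above
 * accepts any alignment. A hedged allocation sketch using VOLK's standard
 * helpers (volk_malloc(), volk_get_alignment(), volk_free()):
 *
 * \code
 * lv_16sc_t* buf = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t),
 *                                          volk_get_alignment());
 * // ... fill buf, call the _a kernels on it ...
 * volk_free(buf);
 * \endcode
 */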
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    for (; number < avx2_points; number++)
    {
        a = _mm256_loadu_si256((__m256i*)_in_a); // unaligned load of 8 complex values
        b = _mm256_loadu_si256((__m256i*)_in_b);
        c = _mm256_mullo_epi16(a, b);

        c_sr = _mm256_srli_si256(c, 2); // shifts each 128-bit lane independently
        real = _mm256_subs_epi16(c, c_sr);
        real = _mm256_and_si256(real, mask_real);

        b_sl = _mm256_slli_si256(b, 2);
        a_sl = _mm256_slli_si256(a, 2);

        imag1 = _mm256_mullo_epi16(a, b_sl);
        imag2 = _mm256_mullo_epi16(b, a_sl);

        imag = _mm256_adds_epi16(imag1, imag2);
        imag = _mm256_and_si256(imag, mask_imag);

        result = _mm256_or_si256(real, imag);

        _mm256_storeu_si256((__m256i*)_out, result); // unaligned store

        _in_a += 8;
        _in_b += 8;
        _out += 8;
    }
    number = avx2_points * 8;
    for (; number < num_points; number++)
    {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_AVX2 */
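/*
 * Note (added for clarity): _mm256_srli_si256() and _mm256_slli_si256() shift
 * each 128-bit lane of the 256-bit register independently. That is sufficient
 * here because every complex value, and hence every (real product, imaginary
 * product) pair, sits inside a single 128-bit lane: the one-lane shift still
 * lines each a.i*b.i product up under its a.r*b.r partner, exactly as in the
 * SSE2 kernels.
 */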
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    for (; number < avx2_points; number++)
    {
        // same scheme as the unaligned AVX2 kernel above,
        // but with aligned memory accesses
        a = _mm256_load_si256((__m256i*)_in_a);
        b = _mm256_load_si256((__m256i*)_in_b);
        c = _mm256_mullo_epi16(a, b);

        c_sr = _mm256_srli_si256(c, 2);
        real = _mm256_subs_epi16(c, c_sr);
        real = _mm256_and_si256(real, mask_real);

        b_sl = _mm256_slli_si256(b, 2);
        a_sl = _mm256_slli_si256(a, 2);

        imag1 = _mm256_mullo_epi16(a, b_sl);
        imag2 = _mm256_mullo_epi16(b, a_sl);

        imag = _mm256_adds_epi16(imag1, imag2);
        imag = _mm256_and_si256(imag, mask_imag);

        result = _mm256_or_si256(real, imag);

        _mm256_store_si256((__m256i*)_out, result); // aligned store

        _in_a += 8;
        _in_b += 8;
        _out += 8;
    }
    number = avx2_points * 8;
    for (; number < num_points; number++)
    {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out,
                                                   const lv_16sc_t* in_a,
                                                   const lv_16sc_t* in_b,
                                                   unsigned int num_points)
{
    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    unsigned int quarter_points = num_points / 4;
    int16x4x2_t a_val, b_val, c_val;
    int16x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number)
    {
        a_val = vld2_s16((int16_t*)a_ptr); // de-interleave: val[0] = reals, val[1] = imags
        b_val = vld2_s16((int16_t*)b_ptr);
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);
        tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); // a.r*b.r
        tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]); // a.i*b.i
        tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]); // a.r*b.i
        tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); // a.i*b.r
        c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]); // real parts
        c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]); // imaginary parts
        vst2_s16((int16_t*)out, c_val); // re-interleave and store 4 complex products
        a_ptr += 4;
        b_ptr += 4;
        out += 4;
    }
    for (number = quarter_points * 4; number < num_points; number++)
    {
        *out++ = (*a_ptr++) * (*b_ptr++);
    }
}
#endif /* LV_HAVE_NEON */

#endif /* INCLUDED_volk_16ic_x2_multiply_16ic_H */
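/*
 * Calling sketch (illustration only, not part of this header): applications
 * normally invoke the VOLK dispatcher, which selects the best implementation
 * for the running machine, rather than calling an architecture-specific
 * kernel directly:
 *
 * \code
 * #include <volk/volk.h>
 *
 * volk_16ic_x2_multiply_16ic(out, in_a, in_b, num_points);
 * \endcode
 */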