#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H

#include <inttypes.h>
#include <volk/volk_complex.h>

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector,
                                                           const lv_32fc_t* aVector,
                                                           const lv_32fc_t scalar,
                                                           unsigned int num_points){
  unsigned int number = 0;
  unsigned int i = 0;
  const unsigned int quarterPoints = num_points / 4;
  unsigned int isodd = num_points & 3;
  __m256 x, yl, yh, z, tmp1, tmp2;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;

  // Splat the real and imaginary parts of the scalar across the vector lanes
  yl = _mm256_set1_ps(lv_creal(scalar));
  yh = _mm256_set1_ps(lv_cimag(scalar));

  for(; number < quarterPoints; number++){
    x = _mm256_loadu_ps((float*)a);         // Load ar,ai,br,bi,...
    tmp1 = x;
    x = _mm256_shuffle_ps(x, x, 0xB1);      // Re-arrange x to ai,ar,bi,br,...
    tmp2 = _mm256_mul_ps(x, yh);            // ai*si, ar*si, bi*si, br*si, ...
    z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); // ar*sr-ai*si, ai*sr+ar*si, ...
    _mm256_storeu_ps((float*)c, z);         // Store the results back into the C container
    a += 4;
    c += 4;
  }

  for(i = num_points - isodd; i < num_points; i++){
    *c++ = (*a++) * scalar;
  }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
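
/*
 * Implementation note (summary of the arithmetic shared by the SIMD kernels in
 * this file): for an input sample a = ar + j*ai and the scalar s = sr + j*si,
 *
 *   a * s = (ar*sr - ai*si) + j*(ai*sr + ar*si)
 *
 * Each x86 kernel realizes this with one multiply against the splatted real
 * part (yl), a shuffle that swaps the real/imaginary lanes, one multiply
 * against the splatted imaginary part (yh), and a final addsub (or fused
 * fmaddsub), which subtracts in the even lanes and adds in the odd lanes.
 */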
#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t scalar,
                                                       unsigned int num_points){
  unsigned int number = 0;
  unsigned int i = 0;
  const unsigned int quarterPoints = num_points / 4;
  unsigned int isodd = num_points & 3;
  __m256 x, yl, yh, z, tmp1, tmp2;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;

  // Splat the real and imaginary parts of the scalar across the vector lanes
  yl = _mm256_set1_ps(lv_creal(scalar));
  yh = _mm256_set1_ps(lv_cimag(scalar));

  for(; number < quarterPoints; number++){
    x = _mm256_loadu_ps((float*)a);    // Load ar,ai,br,bi,...
    tmp1 = _mm256_mul_ps(x, yl);       // ar*sr, ai*sr, br*sr, bi*sr, ...
    x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to ai,ar,bi,br,...
    tmp2 = _mm256_mul_ps(x, yh);       // ai*si, ar*si, bi*si, br*si, ...
    z = _mm256_addsub_ps(tmp1, tmp2);  // ar*sr-ai*si, ai*sr+ar*si, ...
    _mm256_storeu_ps((float*)c, z);    // Store the results back into the C container
    a += 4;
    c += 4;
  }

  for(i = num_points - isodd; i < num_points; i++){
    *c++ = (*a++) * scalar;
  }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
  unsigned int number = 0;
  const unsigned int halfPoints = num_points / 2;
  __m128 x, yl, yh, z, tmp1, tmp2;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;

  // Splat the real and imaginary parts of the scalar across the vector lanes
  yl = _mm_set_ps1(lv_creal(scalar));
  yh = _mm_set_ps1(lv_cimag(scalar));

  for(; number < halfPoints; number++){
    x = _mm_loadu_ps((float*)a);    // Load ar,ai,br,bi
    tmp1 = _mm_mul_ps(x, yl);       // ar*sr, ai*sr, br*sr, bi*sr
    x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to ai,ar,bi,br
    tmp2 = _mm_mul_ps(x, yh);       // ai*si, ar*si, bi*si, br*si
    z = _mm_addsub_ps(tmp1, tmp2);  // ar*sr-ai*si, ai*sr+ar*si, ...
    _mm_storeu_ps((float*)c, z);    // Store the results back into the C container
    a += 2;
    c += 2;
  }

  if((num_points % 2) != 0){
    *c = (*a) * scalar;
  }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
  lv_32fc_t* cPtr = cVector;
  const lv_32fc_t* aPtr = aVector;
  unsigned int number = num_points;

  // Unrolled loop: eight points per iteration
  while(number >= 8){
    *cPtr++ = (*aPtr++) * scalar;
    *cPtr++ = (*aPtr++) * scalar;
    *cPtr++ = (*aPtr++) * scalar;
    *cPtr++ = (*aPtr++) * scalar;
    *cPtr++ = (*aPtr++) * scalar;
    *cPtr++ = (*aPtr++) * scalar;
    *cPtr++ = (*aPtr++) * scalar;
    *cPtr++ = (*aPtr++) * scalar;
    number -= 8;
  }

  // Clean up any remaining points
  while(number-- > 0){
    *cPtr++ = *aPtr++ * scalar;
  }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H */
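
/*
 * Example usage (an illustrative sketch; in practice callers go through the
 * generated VOLK dispatcher, which selects one of the kernels in this file at
 * runtime, rather than calling a _u/_a implementation directly). The vector
 * length and scalar value below are arbitrary:
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 1000;
 *   lv_32fc_t* in  = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment());
 *   lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment());
 *   lv_32fc_t scalar = lv_cmake(0.5f, 2.0f);
 *
 *   // ... fill in[] with samples ...
 *   volk_32fc_s32fc_multiply_32fc(out, in, scalar, N);
 *
 *   volk_free(in);
 *   volk_free(out);
 */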
#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H

#include <inttypes.h>
#include <volk/volk_complex.h>

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector,
                                                           const lv_32fc_t* aVector,
                                                           const lv_32fc_t scalar,
                                                           unsigned int num_points){
  unsigned int number = 0;
  unsigned int i = 0;
  const unsigned int quarterPoints = num_points / 4;
  unsigned int isodd = num_points & 3;
  __m256 x, yl, yh, z, tmp1, tmp2;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;

  // Splat the real and imaginary parts of the scalar across the vector lanes
  yl = _mm256_set1_ps(lv_creal(scalar));
  yh = _mm256_set1_ps(lv_cimag(scalar));

  for(; number < quarterPoints; number++){
    x = _mm256_load_ps((float*)a);          // Aligned load of ar,ai,br,bi,...
    tmp1 = x;
    x = _mm256_shuffle_ps(x, x, 0xB1);      // Re-arrange x to ai,ar,bi,br,...
    tmp2 = _mm256_mul_ps(x, yh);            // ai*si, ar*si, bi*si, br*si, ...
    z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); // ar*sr-ai*si, ai*sr+ar*si, ...
    _mm256_store_ps((float*)c, z);          // Aligned store of the results
    a += 4;
    c += 4;
  }

  for(i = num_points - isodd; i < num_points; i++){
    *c++ = (*a++) * scalar;
  }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t scalar,
                                                       unsigned int num_points){
  unsigned int number = 0;
  unsigned int i = 0;
  const unsigned int quarterPoints = num_points / 4;
  unsigned int isodd = num_points & 3;
  __m256 x, yl, yh, z, tmp1, tmp2;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;

  // Splat the real and imaginary parts of the scalar across the vector lanes
  yl = _mm256_set1_ps(lv_creal(scalar));
  yh = _mm256_set1_ps(lv_cimag(scalar));

  for(; number < quarterPoints; number++){
    x = _mm256_load_ps((float*)a);     // Aligned load of ar,ai,br,bi,...
    tmp1 = _mm256_mul_ps(x, yl);       // ar*sr, ai*sr, br*sr, bi*sr, ...
    x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to ai,ar,bi,br,...
    tmp2 = _mm256_mul_ps(x, yh);       // ai*si, ar*si, bi*si, br*si, ...
    z = _mm256_addsub_ps(tmp1, tmp2);  // ar*sr-ai*si, ai*sr+ar*si, ...
    _mm256_store_ps((float*)c, z);     // Aligned store of the results
    a += 4;
    c += 4;
  }

  for(i = num_points - isodd; i < num_points; i++){
    *c++ = (*a++) * scalar;
  }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
  unsigned int number = 0;
  const unsigned int halfPoints = num_points / 2;
  __m128 x, yl, yh, z, tmp1, tmp2;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;

  // Splat the real and imaginary parts of the scalar across the vector lanes
  yl = _mm_set_ps1(lv_creal(scalar));
  yh = _mm_set_ps1(lv_cimag(scalar));

  for(; number < halfPoints; number++){
    x = _mm_load_ps((float*)a);     // Aligned load of ar,ai,br,bi
    tmp1 = _mm_mul_ps(x, yl);       // ar*sr, ai*sr, br*sr, bi*sr
    x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to ai,ar,bi,br
    tmp2 = _mm_mul_ps(x, yh);       // ai*si, ar*si, bi*si, br*si
    z = _mm_addsub_ps(tmp1, tmp2);  // ar*sr-ai*si, ai*sr+ar*si, ...
    _mm_store_ps((float*)c, z);     // Aligned store of the results
    a += 2;
    c += 2;
  }

  if((num_points % 2) != 0){
    *c = (*a) * scalar;
  }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
  lv_32fc_t* cPtr = cVector;
  const lv_32fc_t* aPtr = aVector;
  unsigned int number = num_points;
  unsigned int quarter_points = num_points / 4;

  float32x4x2_t a_val, scalar_val;
  float32x4x2_t tmp_imag;

  // Duplicate the scalar's real part into val[0] and its imaginary part into val[1]
  scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
  scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);

  for(number = 0; number < quarter_points; ++number){
    a_val = vld2q_f32((float*)aPtr); // Deinterleaved load: val[0] = real parts, val[1] = imaginary parts
    tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]); // ai*sr
    tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]); // ar*sr

    tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]); // ai*sr + ar*si
    tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]); // ar*sr - ai*si

    vst2q_f32((float*)cPtr, tmp_imag); // Interleaved store of the results
    aPtr += 4;
    cPtr += 4;
  }

  for(number = quarter_points * 4; number < num_points; number++){
    *cPtr++ = *aPtr++ * scalar;
  }
}
#endif /* LV_HAVE_NEON */
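
/*
 * Per lane, the deinterleaved NEON loop above computes the same result as the
 * x86 kernels, with val[0] holding the real planes and val[1] the imaginary
 * planes:
 *
 *   re_out[k] = re_in[k]*sr - im_in[k]*si;   // vmulq_f32 + vmlsq_f32
 *   im_out[k] = im_in[k]*sr + re_in[k]*si;   // vmulq_f32 + vmlaq_f32
 */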
#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
  lv_32fc_t* cPtr = cVector;
  const lv_32fc_t* aPtr = aVector;
  unsigned int number = num_points;

  // Unrolled loop: eight points per iteration
  while(number >= 8){
    *cPtr++ = (*aPtr++) * scalar;
    *cPtr++ = (*aPtr++) * scalar;
    *cPtr++ = (*aPtr++) * scalar;
    *cPtr++ = (*aPtr++) * scalar;
    *cPtr++ = (*aPtr++) * scalar;
    *cPtr++ = (*aPtr++) * scalar;
    *cPtr++ = (*aPtr++) * scalar;
    *cPtr++ = (*aPtr++) * scalar;
    number -= 8;
  }

  // Clean up any remaining points
  while(number-- > 0){
    *cPtr++ = *aPtr++ * scalar;
  }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H */
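
/*
 * Note: the _a kernels above assume buffers aligned to volk_get_alignment()
 * (32 bytes when the AVX paths are built). The generated dispatcher
 * volk_32fc_s32fc_multiply_32fc() is expected to pick between the aligned and
 * unaligned implementations based on the pointers it is handed and the SIMD
 * extensions detected on the host, so user code normally does not choose a
 * kernel by hand.
 */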