/*
 * Include guard plus the AVX kernel fragment: each vector iteration multiplies
 * 16 floats read through aPtr (eight interleaved re/im pairs) by eight floats
 * read through bPtr, then a scalar loop finishes the remaining points.
 *
 * NOTE(review): this text is a numbered listing with gaps (the embedded
 * numbers skip, e.g. 62 to 66, 69 to 73, 81 to 84). The opening line of the
 * signature, the cPtr/aPtr declarations, any pointer advances inside the
 * vector loop and the closing braces are not visible here. Confirm them
 * against the complete header before changing any statement below.
 */
55 #ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H 56 #define INCLUDED_volk_32fc_32f_multiply_32fc_a_H 62 #include <immintrin.h> 66 const float* bVector,
unsigned int num_points)
68 unsigned int number = 0;
/* Number of full 8-point groups handled by the vector loop. */
69 const unsigned int eighthPoints = num_points / 8;
73 const float* bPtr= bVector;
75 __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
/*
 * _mm256_set_epi32 lists elements from highest to lowest, so the control
 * word reads 0,0,1,1 in the low 128-bit lane and 2,2,3,3 in the high lane:
 * every selected float appears twice in a row.
 */
77 __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
79 for(;number < eighthPoints; number++){
/*
 * Two aligned 256-bit loads through aPtr (_mm256_load_ps requires a
 * 32-byte aligned address).
 * NOTE(review): both loads name aPtr and no advance is visible between
 * them; presumably the elided lines move aPtr forward. Verify.
 */
81 aVal1 = _mm256_load_ps((
float *)aPtr);
84 aVal2 = _mm256_load_ps((
float *)aPtr);
/* Eight real multipliers b0..b7. */
87 bVal = _mm256_load_ps(bPtr);
/* 0x00 copies the low 128 bits (b0..b3) into both lanes; 0x11 copies the high 128 bits (b4..b7). */
90 bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00);
91 bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11);
/* Duplicate each multiplier: bVal1 = b0 b0 b1 b1 | b2 b2 b3 b3, bVal2 = b4 b4 b5 b5 | b6 b6 b7 b7. */
93 bVal1 = _mm256_permutevar_ps(bVal1, permute_mask);
94 bVal2 = _mm256_permutevar_ps(bVal2, permute_mask);
/* Element-wise products: each duplicated multiplier scales two adjacent floats. */
96 cVal1 = _mm256_mul_ps(aVal1, bVal1);
97 cVal2 = _mm256_mul_ps(aVal2, bVal2);
/*
 * Aligned 256-bit stores through cPtr.
 * NOTE(review): as with the loads, no cPtr advance is visible between the
 * two stores in this excerpt. Verify against the full file.
 */
99 _mm256_store_ps((
float*)cPtr,cVal1);
102 _mm256_store_ps((
float*)cPtr,cVal2);
/* Scalar tail: process the num_points % 8 points the vector loop did not cover. */
106 number = eighthPoints * 8;
107 for(;number < num_points; ++number){
108 *cPtr++ = (*aPtr++) * (*bPtr++);
/*
 * SSE kernel fragment: each vector iteration multiplies eight floats read
 * through aPtr (four interleaved re/im pairs) by four floats read through
 * bPtr, then a scalar loop finishes the remaining points.
 *
 * NOTE(review): the embedded listing numbers skip (e.g. 115 to 119, 131 to
 * 134, 157 to 164), so the opening line of the signature, the cPtr/aPtr
 * declarations, pointer advances and closing braces are not visible here.
 */
115 #include <xmmintrin.h> 119 const float* bVector,
unsigned int num_points)
121 unsigned int number = 0;
/* Number of full 4-point groups handled by the vector loop. */
122 const unsigned int quarterPoints = num_points / 4;
126 const float* bPtr= bVector;
128 __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
129 for(;number < quarterPoints; number++){
/*
 * Two aligned 128-bit loads through aPtr (_mm_load_ps requires a 16-byte
 * aligned address).
 * NOTE(review): no aPtr advance is visible between the two loads;
 * presumably it sits on the elided lines. Verify.
 */
131 aVal1 = _mm_load_ps((
const float*)aPtr);
134 aVal2 = _mm_load_ps((
const float*)aPtr);
/* Four real multipliers b0..b3. */
137 bVal = _mm_load_ps(bPtr);
/* Duplicate each multiplier: bVal1 = b0 b0 b1 b1, bVal2 = b2 b2 b3 b3. */
140 bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0));
141 bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2));
/* First product vector, stored through cPtr. */
143 cVal = _mm_mul_ps(aVal1, bVal1);
145 _mm_store_ps((
float*)cPtr,cVal);
/* Second product vector, reusing cVal as scratch. */
148 cVal = _mm_mul_ps(aVal2, bVal2);
150 _mm_store_ps((
float*)cPtr,cVal);
/* Scalar tail: process the num_points % 4 points the vector loop did not cover. */
155 number = quarterPoints * 4;
156 for(;number < num_points; number++){
/*
 * NOTE(review): bPtr is dereferenced without an increment on this line,
 * unlike the other tail loops in this file. The listing jumps from 157 to
 * 164, so an advance may follow on an elided line. If it does not, every
 * tail point is scaled by the same multiplier. Verify against the full file.
 */
157 *cPtr++ = (*aPtr++) * (*bPtr);
/*
 * Portable fallback fragment, compiled when LV_HAVE_GENERIC is defined:
 * one multiply per point with no SIMD intrinsics.
 *
 * NOTE(review): the embedded listing numbers skip (164 to 168, 168 to 172),
 * so the opening line of the signature, the cPtr/aPtr declarations and the
 * closing braces are not visible in this excerpt.
 */
164 #ifdef LV_HAVE_GENERIC 168 const float* bVector,
unsigned int num_points)
172 const float* bPtr= bVector;
173 unsigned int number = 0;
/* Each iteration writes one product and advances all three pointers by one element. */
175 for(number = 0; number < num_points; number++){
176 *cPtr++ = (*aPtr++) * (*bPtr++);
/*
 * NEON kernel fragment: each vector iteration handles four points using
 * de-interleaving loads and re-interleaving stores, then a scalar loop
 * finishes the remaining points.
 *
 * NOTE(review): the embedded listing numbers skip (183 to 187, 187 to 191,
 * 204 to 210), so the opening line of the signature, the cPtr/aPtr
 * declarations, any pointer advances inside the vector loop and the closing
 * braces are not visible in this excerpt.
 */
183 #include <arm_neon.h> 187 const float* bVector,
unsigned int num_points)
191 const float* bPtr= bVector;
192 unsigned int number = 0;
/* Number of full 4-point groups handled by the vector loop. */
193 unsigned int quarter_points = num_points / 4;
195 float32x4x2_t inputVector, outputVector;
196 float32x4_t tapsVector;
197 for(number = 0; number < quarter_points; number++){
/* vld2q_f32 reads eight floats and splits them: even-indexed into val[0], odd-indexed into val[1]. */
198 inputVector = vld2q_f32((
float*)aPtr);
/* Four real multipliers. */
199 tapsVector = vld1q_f32(bPtr);
/* Scale both de-interleaved halves by the same four multipliers. */
201 outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
202 outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);
/* vst2q_f32 re-interleaves val[0]/val[1] while storing eight floats through cPtr. */
204 vst2q_f32((
float*)cPtr, outputVector);
/* Scalar tail: process the num_points % 4 points the vector loop did not cover. */
210 for(number = quarter_points * 4; number < num_points; number++){
211 *cPtr++ = (*aPtr++) * (*bPtr++);
/*
 * ORC fragment: the tail of a prototype (terminated by ';') followed by the
 * tail of a definition whose only visible statement forwards all four
 * arguments to volk_32fc_32f_multiply_32fc_a_orc_impl.
 *
 * NOTE(review): the lines carrying the declared names, the leading
 * parameters and the braces are missing from this listing (numbers skip
 * 221 to 225 to 227). The prototype presumably declares the _orc_impl
 * routine called below. Confirm against the complete header.
 */
221 const float* bVector,
unsigned int num_points);
225 const float* bVector,
unsigned int num_points)
/* Pure pass-through: no work is done here besides the call. */
227 volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
static void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:118
static void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:167
static void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:65
float complex lv_32fc_t
Definition: volk_complex.h:61
static void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:186