73 #ifndef INCLUDED_volk_32fc_32f_add_32fc_u_H 74 #define INCLUDED_volk_32fc_32f_add_32fc_u_H 76 #ifdef LV_HAVE_GENERIC 80 const float* bVector,
unsigned int num_points)
84 const float* bPtr= bVector;
85 unsigned int number = 0;
87 for(number = 0; number < num_points; number++){
88 *cPtr++ = (*aPtr++) + (*bPtr++);
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Adds a real float vector to a complex float vector using AVX with
 *        unaligned loads and stores.
 *
 * For every i in [0, num_points): cVector[i] = aVector[i] + bVector[i].
 * Eight elements are handled per vector iteration; the remaining
 * (num_points % 8) elements are handled by a scalar loop.
 *
 * \param cVector    Output buffer; receives num_points complex samples.
 * \param aVector    Complex input buffer with at least num_points samples.
 * \param bVector    Real input buffer with at least num_points floats.
 * \param num_points Number of elements to process; 0 is a no-op.
 */
static inline void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector,
                                                const lv_32fc_t* aVector,
                                                const float* bVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal1, aVal2, bVal, cVal1, cVal2;
    __m256 cpx_b1, cpx_b2;
    __m256 tmp1, tmp2;
    const __m256 zero = _mm256_setzero_ps();

    for (; number < eighthPoints; number++) {
        /* 8 complex samples = 16 floats = two 256-bit registers. */
        aVal1 = _mm256_loadu_ps((const float*)aPtr);
        aVal2 = _mm256_loadu_ps((const float*)(aPtr + 4));
        bVal = _mm256_loadu_ps(bPtr);

        /* Interleave each real value with 0 so it lines up with (re, im)
         * pairs. The unpack intrinsics work per 128-bit lane:
         *   cpx_b1 = b0,0,b1,0 | b4,0,b5,0
         *   cpx_b2 = b2,0,b3,0 | b6,0,b7,0 */
        cpx_b1 = _mm256_unpacklo_ps(bVal, zero);
        cpx_b2 = _mm256_unpackhi_ps(bVal, zero);

        /* Regroup the lanes into element order:
         *   tmp1 = b0,0,b1,0,b2,0,b3,0
         *   tmp2 = b4,0,b5,0,b6,0,b7,0 */
        tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
        tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));

        cVal1 = _mm256_add_ps(aVal1, tmp1);
        cVal2 = _mm256_add_ps(aVal2, tmp2);

        _mm256_storeu_ps((float*)cPtr, cVal1);
        _mm256_storeu_ps((float*)(cPtr + 4), cVal2);

        /* Advance to the next group of 8 elements. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the elements that do not fill a full vector pass. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}

#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Adds a real float vector to a complex float vector using AVX with
 *        aligned loads and stores.
 *
 * For every i in [0, num_points): cVector[i] = aVector[i] + bVector[i].
 * Eight elements are handled per vector iteration; the remaining
 * (num_points % 8) elements are handled by a scalar loop.
 *
 * The aligned load/store intrinsics used here require cVector, aVector and
 * bVector to be 32-byte aligned.
 *
 * \param cVector    Output buffer; receives num_points complex samples.
 * \param aVector    Complex input buffer with at least num_points samples.
 * \param bVector    Real input buffer with at least num_points floats.
 * \param num_points Number of elements to process; 0 is a no-op.
 */
static inline void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector,
                                                const lv_32fc_t* aVector,
                                                const float* bVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal1, aVal2, bVal, cVal1, cVal2;
    __m256 cpx_b1, cpx_b2;
    __m256 tmp1, tmp2;
    const __m256 zero = _mm256_setzero_ps();

    for (; number < eighthPoints; number++) {
        /* 8 complex samples = 16 floats = two 256-bit registers. */
        aVal1 = _mm256_load_ps((const float*)aPtr);
        aVal2 = _mm256_load_ps((const float*)(aPtr + 4));
        bVal = _mm256_load_ps(bPtr);

        /* Interleave each real value with 0 so it lines up with (re, im)
         * pairs. The unpack intrinsics work per 128-bit lane:
         *   cpx_b1 = b0,0,b1,0 | b4,0,b5,0
         *   cpx_b2 = b2,0,b3,0 | b6,0,b7,0 */
        cpx_b1 = _mm256_unpacklo_ps(bVal, zero);
        cpx_b2 = _mm256_unpackhi_ps(bVal, zero);

        /* Regroup the lanes into element order:
         *   tmp1 = b0,0,b1,0,b2,0,b3,0
         *   tmp2 = b4,0,b5,0,b6,0,b7,0 */
        tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
        tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));

        cVal1 = _mm256_add_ps(aVal1, tmp1);
        cVal2 = _mm256_add_ps(aVal2, tmp2);

        _mm256_store_ps((float*)cPtr, cVal1);
        _mm256_store_ps((float*)(cPtr + 4), cVal2);

        /* Advance to the next group of 8 elements. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the elements that do not fill a full vector pass. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}

#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Adds a real float vector to a complex float vector using NEON.
 *
 * For every i in [0, num_points): cVector[i] = aVector[i] + bVector[i].
 * Sixteen elements are handled per vector iteration; the remaining
 * (num_points % 16) elements are handled by a scalar loop.
 *
 * \param cVector    Output buffer; receives num_points complex samples.
 * \param aVector    Complex input buffer with at least num_points samples.
 * \param bVector    Real input buffer with at least num_points floats.
 * \param num_points Number of elements to process; 0 is a no-op.
 */
static inline void volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector,
                                               const lv_32fc_t* aVector,
                                               const float* bVector,
                                               unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const float* bPtr = bVector;

    float32x4x4_t aVal0, aVal1;
    float32x4x2_t bVal0, bVal1;

    const unsigned int sixteenthPoints = num_points / 16;
    unsigned int number = 0;
    for (; number < sixteenthPoints; number++) {
        /* vld4q_f32 de-interleaves 16 floats (8 complex samples) with
         * stride 4: val[0] holds the real parts of samples 0,2,4,6 and
         * val[2] holds the real parts of samples 1,3,5,7. */
        aVal0 = vld4q_f32((const float*)aPtr);
        aPtr += 8;
        aVal1 = vld4q_f32((const float*)aPtr);
        aPtr += 8;
        __VOLK_PREFETCH(aPtr + 16); /* hint only; does not affect results */

        /* vld2q_f32 de-interleaves 8 floats with stride 2:
         * val[0] = b0,b2,b4,b6 and val[1] = b1,b3,b5,b7, which matches
         * the real-part layout produced by vld4q_f32 above. */
        bVal0 = vld2q_f32((const float*)bPtr);
        bPtr += 8;
        bVal1 = vld2q_f32((const float*)bPtr);
        bPtr += 8;
        __VOLK_PREFETCH(bPtr + 16); /* hint only; does not affect results */

        /* Only the real-part registers are touched; val[1] and val[3]
         * (imaginary parts) are stored back unchanged. */
        aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]);
        aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]);

        aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]);
        aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]);

        /* vst4q_f32 re-interleaves back to (re, im) sample order. */
        vst4q_f32((float*)(cPtr), aVal0);
        cPtr += 8;
        vst4q_f32((float*)(cPtr), aVal1);
        cPtr += 8;
    }

    /* Scalar tail for the elements that do not fill a full vector pass. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}

#endif /* LV_HAVE_NEON */

#endif /* INCLUDED_volk_32fc_32f_add_32fc_u_H */
/*
 * Cross-references (from the generated source listing):
 *
 *   static void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
 *       Definition: volk_32fc_32f_add_32fc.h:98
 *   static void volk_32fc_32f_add_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
 *       Definition: volk_32fc_32f_add_32fc.h:79
 *   static void volk_32fc_32f_add_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
 *       Definition: volk_32fc_32f_add_32fc.h:194
 *   #define __VOLK_PREFETCH(addr)
 *       Definition: volk_common.h:39
 *   float complex lv_32fc_t
 *       Definition: volk_complex.h:61
 *   static void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
 *       Definition: volk_32fc_32f_add_32fc.h:146
 */