73 #ifndef INCLUDED_volk_32f_x2_add_32f_u_H 74 #define INCLUDED_volk_32f_x2_add_32f_u_H 79 #ifdef LV_HAVE_AVX512F 80 #include <immintrin.h> 83 volk_32f_x2_add_32f_u_avx512f(
float* cVector,
const float* aVector,
84 const float* bVector,
unsigned int num_points)
86 unsigned int number = 0;
87 const unsigned int sixteenthPoints = num_points / 16;
89 float* cPtr = cVector;
90 const float* aPtr = aVector;
91 const float* bPtr= bVector;
93 __m512 aVal, bVal, cVal;
94 for(;number < sixteenthPoints; number++){
96 aVal = _mm512_loadu_ps(aPtr);
97 bVal = _mm512_loadu_ps(bPtr);
99 cVal = _mm512_add_ps(aVal, bVal);
101 _mm512_storeu_ps(cPtr,cVal);
108 number = sixteenthPoints * 16;
110 for(;number < num_points; number++){
111 *cPtr++ = (*aPtr++) + (*bPtr++);
/* NOTE(review): guard macro inferred from the LV_HAVE_<ARCH> pattern used by
 * the other kernels in this file - confirm. */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Element-wise sum of two float vectors using AVX, unaligned access.
 *
 * Writes cVector[i] = aVector[i] + bVector[i] for every i in [0, num_points).
 * Full groups of 8 floats go through 256-bit unaligned loads/stores; the
 * remaining (num_points % 8) elements are handled by a scalar loop.
 *
 * \param cVector    destination buffer, at least num_points floats
 * \param aVector    first addend, at least num_points floats
 * \param bVector    second addend, at least num_points floats
 * \param num_points number of elements to add
 */
static inline void
volk_32f_x2_add_32f_u_avx(float* cVector, const float* aVector,
                          const float* bVector, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;
  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;
  __m256 aVal, bVal, cVal;
  for(; number < eighthPoints; number++){
    aVal = _mm256_loadu_ps(aPtr);
    bVal = _mm256_loadu_ps(bPtr);

    cVal = _mm256_add_ps(aVal, bVal);

    _mm256_storeu_ps(cPtr, cVal);

    /* Advance to the next group of 8 floats so each iteration handles new
     * data and the tail loop resumes at the right element. */
    aPtr += 8;
    bPtr += 8;
    cPtr += 8;
  }

  /* Scalar tail: elements that do not fill a whole 256-bit vector. */
  number = eighthPoints * 8;
  for(; number < num_points; number++){
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
#endif /* LV_HAVE_AVX */
/* NOTE(review): guard macro inferred from the LV_HAVE_<ARCH> pattern used by
 * the other kernels in this file - confirm. */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*!
 * \brief Element-wise sum of two float vectors using SSE, unaligned access.
 *
 * Writes cVector[i] = aVector[i] + bVector[i] for every i in [0, num_points).
 * Full groups of 4 floats go through 128-bit unaligned loads/stores; the
 * remaining (num_points % 4) elements are handled by a scalar loop.
 *
 * \param cVector    destination buffer, at least num_points floats
 * \param aVector    first addend, at least num_points floats
 * \param bVector    second addend, at least num_points floats
 * \param num_points number of elements to add
 */
static inline void
volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector,
                          const float* bVector, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  __m128 aVal, bVal, cVal;
  for(; number < quarterPoints; number++){
    aVal = _mm_loadu_ps(aPtr);
    bVal = _mm_loadu_ps(bPtr);

    cVal = _mm_add_ps(aVal, bVal);

    _mm_storeu_ps(cPtr, cVal);

    /* Advance to the next group of 4 floats so each iteration handles new
     * data and the tail loop resumes at the right element. */
    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail: elements that do not fill a whole 128-bit vector. */
  number = quarterPoints * 4;
  for(; number < num_points; number++){
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Element-wise sum of two float vectors, portable scalar version.
 *
 * Writes cVector[i] = aVector[i] + bVector[i] for every i in [0, num_points).
 *
 * \param cVector    destination buffer, at least num_points floats
 * \param aVector    first addend, at least num_points floats
 * \param bVector    second addend, at least num_points floats
 * \param num_points number of elements to add
 */
static inline void
volk_32f_x2_add_32f_generic(float* cVector, const float* aVector,
                            const float* bVector, unsigned int num_points)
{
  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
#endif /* LV_HAVE_GENERIC */
210 #ifndef INCLUDED_volk_32f_x2_add_32f_a_H 211 #define INCLUDED_volk_32f_x2_add_32f_a_H 213 #include <inttypes.h> 216 #ifdef LV_HAVE_AVX512F 217 #include <immintrin.h> 220 volk_32f_x2_add_32f_a_avx512f(
float* cVector,
const float* aVector,
221 const float* bVector,
unsigned int num_points)
223 unsigned int number = 0;
224 const unsigned int sixteenthPoints = num_points / 16;
226 float* cPtr = cVector;
227 const float* aPtr = aVector;
228 const float* bPtr= bVector;
230 __m512 aVal, bVal, cVal;
231 for(;number < sixteenthPoints; number++){
233 aVal = _mm512_load_ps(aPtr);
234 bVal = _mm512_load_ps(bPtr);
236 cVal = _mm512_add_ps(aVal, bVal);
238 _mm512_store_ps(cPtr,cVal);
245 number = sixteenthPoints * 16;
247 for(;number < num_points; number++){
248 *cPtr++ = (*aPtr++) + (*bPtr++);
/* NOTE(review): guard macro inferred from the LV_HAVE_<ARCH> pattern used by
 * the other kernels in this file - confirm. */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Element-wise sum of two float vectors using AVX, aligned access.
 *
 * Writes cVector[i] = aVector[i] + bVector[i] for every i in [0, num_points).
 * Full groups of 8 floats go through _mm256_load_ps/_mm256_store_ps, which
 * require 32-byte aligned addresses, so all three buffers must be 32-byte
 * aligned. The remaining (num_points % 8) elements use a scalar loop.
 *
 * \param cVector    destination buffer, 32-byte aligned, >= num_points floats
 * \param aVector    first addend, 32-byte aligned, >= num_points floats
 * \param bVector    second addend, 32-byte aligned, >= num_points floats
 * \param num_points number of elements to add
 */
static inline void
volk_32f_x2_add_32f_a_avx(float* cVector, const float* aVector,
                          const float* bVector, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  __m256 aVal, bVal, cVal;
  for(; number < eighthPoints; number++){
    aVal = _mm256_load_ps(aPtr);
    bVal = _mm256_load_ps(bPtr);

    cVal = _mm256_add_ps(aVal, bVal);

    _mm256_store_ps(cPtr, cVal);

    /* Advance to the next group of 8 floats so each iteration handles new
     * data and the tail loop resumes at the right element. */
    aPtr += 8;
    bPtr += 8;
    cPtr += 8;
  }

  /* Scalar tail: elements that do not fill a whole 256-bit vector. */
  number = eighthPoints * 8;
  for(; number < num_points; number++){
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
#endif /* LV_HAVE_AVX */
/* NOTE(review): guard macro inferred from the LV_HAVE_<ARCH> pattern used by
 * the other kernels in this file - confirm. */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*!
 * \brief Element-wise sum of two float vectors using SSE, aligned access.
 *
 * Writes cVector[i] = aVector[i] + bVector[i] for every i in [0, num_points).
 * Full groups of 4 floats go through _mm_load_ps/_mm_store_ps, which require
 * 16-byte aligned addresses, so all three buffers must be 16-byte aligned.
 * The remaining (num_points % 4) elements use a scalar loop.
 *
 * \param cVector    destination buffer, 16-byte aligned, >= num_points floats
 * \param aVector    first addend, 16-byte aligned, >= num_points floats
 * \param bVector    second addend, 16-byte aligned, >= num_points floats
 * \param num_points number of elements to add
 */
static inline void
volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector,
                          const float* bVector, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  __m128 aVal, bVal, cVal;
  for(; number < quarterPoints; number++){
    aVal = _mm_load_ps(aPtr);
    bVal = _mm_load_ps(bPtr);

    cVal = _mm_add_ps(aVal, bVal);

    _mm_store_ps(cPtr, cVal);

    /* Advance to the next group of 4 floats so each iteration handles new
     * data and the tail loop resumes at the right element. */
    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail: elements that do not fill a whole 128-bit vector. */
  number = quarterPoints * 4;
  for(; number < num_points; number++){
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
#endif /* LV_HAVE_SSE */
/* NOTE(review): guard macro inferred from the LV_HAVE_<ARCH> pattern used by
 * the other kernels in this file - confirm. */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Element-wise sum of two float vectors using ARM NEON intrinsics.
 *
 * Writes cVector[i] = aVector[i] + bVector[i] for every i in [0, num_points).
 * Full groups of 4 floats are loaded with vld1q_f32, added with vaddq_f32 and
 * stored with vst1q_f32; the remaining (num_points % 4) elements use a
 * scalar loop.
 *
 * \param cVector    destination buffer, at least num_points floats
 * \param aVector    first addend, at least num_points floats
 * \param bVector    second addend, at least num_points floats
 * \param num_points number of elements to add
 */
static inline void
volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector,
                           const float* bVector, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;
  float32x4_t aVal, bVal, cVal;
  for(number = 0; number < quarterPoints; number++){
    aVal = vld1q_f32(aPtr);
    bVal = vld1q_f32(bPtr);
    /* NOTE(review): the file references __VOLK_PREFETCH elsewhere; if
     * prefetch hints belong in this loop they are not visible here - confirm
     * against the upstream header. They would not change the results. */

    cVal = vaddq_f32(aVal, bVal);

    vst1q_f32(cPtr, cVal);

    /* Advance to the next group of 4 floats so each iteration handles new
     * data and the tail loop resumes at the right element. */
    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail: elements that do not fill a whole 128-bit vector. */
  number = quarterPoints * 4;
  for(; number < num_points; number++){
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV7
/*!
 * \brief Declaration of an externally defined float-vector add kernel.
 *
 * Only the prototype lives here; the definition is provided by another
 * translation unit (presumably hand-written NEON assembly, judging by the
 * name - confirm). Parameter meaning is assumed to match the other add
 * kernels in this file: cVector[i] = aVector[i] + bVector[i].
 *
 * \param cVector    destination buffer
 * \param aVector    first addend
 * \param bVector    second addend
 * \param num_points number of elements to add
 */
extern void volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector,
                                          const float* bVector, unsigned int num_points);
#endif /* LV_HAVE_NEONV7 */
#ifdef LV_HAVE_NEONV7
/*!
 * \brief Declaration of an externally defined float-vector add kernel.
 *
 * Only the prototype lives here; the definition is provided by another
 * translation unit (presumably a software-pipelined NEON assembly variant,
 * judging by the name - confirm). Parameter meaning is assumed to match the
 * other add kernels in this file: cVector[i] = aVector[i] + bVector[i].
 *
 * \param cVector    destination buffer
 * \param aVector    first addend
 * \param bVector    second addend
 * \param num_points number of elements to add
 */
extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector,
                                               const float* bVector, unsigned int num_points);
#endif /* LV_HAVE_NEONV7 */
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Element-wise sum of two float vectors, portable scalar version
 *        exposed under the aligned ("_a_") kernel name.
 *
 * Writes cVector[i] = aVector[i] + bVector[i] for every i in [0, num_points).
 * The loop is plain scalar C, so nothing in it depends on buffer alignment.
 *
 * \param cVector    destination buffer, at least num_points floats
 * \param aVector    first addend, at least num_points floats
 * \param bVector    second addend, at least num_points floats
 * \param num_points number of elements to add
 */
static inline void
volk_32f_x2_add_32f_a_generic(float* cVector, const float* aVector,
                              const float* bVector, unsigned int num_points)
{
  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    *cPtr++ = (*aPtr++) + (*bPtr++);
  }
}
#endif /* LV_HAVE_GENERIC */
/* NOTE(review): guard macro inferred from the LV_HAVE_<ARCH> pattern used by
 * the other kernels in this file - confirm. */
#ifdef LV_HAVE_ORC

/*!
 * \brief Declaration of the externally defined ORC add kernel.
 *
 * NOTE(review): the return type is not visible in the original listing;
 * `void` is assumed because the wrapper below discards any result and the
 * other extern kernels in this file return void - confirm.
 */
extern void
volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector,
                               const float* bVector, unsigned int num_points);

/*!
 * \brief Thin wrapper that forwards its arguments unchanged to
 *        volk_32f_x2_add_32f_a_orc_impl().
 *
 * \param cVector    destination buffer
 * \param aVector    first addend
 * \param bVector    second addend
 * \param num_points number of elements to add
 */
static inline void
volk_32f_x2_add_32f_u_orc(float* cVector, const float* aVector,
                          const float* bVector, unsigned int num_points)
{
  volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
static void volk_32f_x2_add_32f_a_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:376
static void volk_32f_x2_add_32f_u_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:330
static void volk_32f_x2_add_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:194
static void volk_32f_x2_add_32f_u_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:158
static void volk_32f_x2_add_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:259
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:39
static void volk_32f_x2_add_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:122
static void volk_32f_x2_add_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:295