Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_32f_x2_add_32f.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */
22 
74 #ifndef INCLUDED_volk_32f_x2_add_32f_u_H
75 #define INCLUDED_volk_32f_x2_add_32f_u_H
76 
77 #include <inttypes.h>
78 #include <stdio.h>
79 
80 #ifdef LV_HAVE_AVX512F
81 #include <immintrin.h>
82 
/*
 * Element-wise sum of two float vectors:
 *     cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * Unaligned AVX-512F variant: data is moved with _mm512_loadu_ps /
 * _mm512_storeu_ps, which place no alignment requirement on the buffers.
 *
 * cVector     destination, at least num_points floats
 * aVector     first addend, at least num_points floats
 * bVector     second addend, at least num_points floats
 * num_points  number of elements to add
 */
static inline void volk_32f_x2_add_32f_u_avx512f(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    /* Vector body: each 512-bit register carries 16 floats. */
    for (; number < sixteenthPoints; number++) {
        const __m512 aVal = _mm512_loadu_ps(aPtr);
        const __m512 bVal = _mm512_loadu_ps(bPtr);

        /* Store the sums back into the C container */
        _mm512_storeu_ps(cPtr, _mm512_add_ps(aVal, bVal));

        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the num_points % 16 elements the vector loop left over. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
116 
117 #endif /* LV_HAVE_AVX512F */
118 
119 
120 #ifdef LV_HAVE_AVX
121 #include <immintrin.h>
122 
/*
 * Element-wise sum of two float vectors:
 *     cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * Unaligned AVX variant: data is moved with _mm256_loadu_ps /
 * _mm256_storeu_ps, which place no alignment requirement on the buffers.
 *
 * cVector     destination, at least num_points floats
 * aVector     first addend, at least num_points floats
 * bVector     second addend, at least num_points floats
 * num_points  number of elements to add
 */
static inline void volk_32f_x2_add_32f_u_avx(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    /* Vector body: each 256-bit register carries 8 floats. */
    for (; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_loadu_ps(aPtr);
        const __m256 bVal = _mm256_loadu_ps(bPtr);

        /* Store the sums back into the C container */
        _mm256_storeu_ps(cPtr, _mm256_add_ps(aVal, bVal));

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the num_points % 8 elements the vector loop left over. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
154 #endif /* LV_HAVE_AVX */
155 
156 
157 #ifdef LV_HAVE_SSE
158 #include <xmmintrin.h>
159 
/*
 * Element-wise sum of two float vectors:
 *     cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * Unaligned SSE variant: data is moved with _mm_loadu_ps / _mm_storeu_ps,
 * which place no alignment requirement on the buffers.
 *
 * cVector     destination, at least num_points floats
 * aVector     first addend, at least num_points floats
 * bVector     second addend, at least num_points floats
 * num_points  number of elements to add
 */
static inline void volk_32f_x2_add_32f_u_sse(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    /* Vector body: each 128-bit register carries 4 floats. */
    for (; number < quarterPoints; number++) {
        const __m128 aVal = _mm_loadu_ps(aPtr);
        const __m128 bVal = _mm_loadu_ps(bPtr);

        /* Store the sums back into the C container */
        _mm_storeu_ps(cPtr, _mm_add_ps(aVal, bVal));

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the num_points % 4 elements the vector loop left over. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
192 #endif /* LV_HAVE_SSE */
193 
194 
195 #ifdef LV_HAVE_GENERIC
196 
/*
 * Element-wise sum of two float vectors:
 *     cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * Portable scalar variant: plain C, one element per iteration.
 *
 * cVector     destination, at least num_points floats
 * aVector     first addend, at least num_points floats
 * bVector     second addend, at least num_points floats
 * num_points  number of elements to add
 */
static inline void volk_32f_x2_add_32f_generic(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
211 #endif /* LV_HAVE_GENERIC */
212 
213 
214 #endif /* INCLUDED_volk_32f_x2_add_32f_u_H */
215 #ifndef INCLUDED_volk_32f_x2_add_32f_a_H
216 #define INCLUDED_volk_32f_x2_add_32f_a_H
217 
218 #include <inttypes.h>
219 #include <stdio.h>
220 
221 #ifdef LV_HAVE_AVX512F
222 #include <immintrin.h>
223 
/*
 * Element-wise sum of two float vectors:
 *     cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * Aligned AVX-512F variant: data is moved with _mm512_load_ps /
 * _mm512_store_ps, which require 64-byte-aligned addresses, so callers
 * must pass 64-byte-aligned buffers.
 *
 * cVector     destination, at least num_points floats
 * aVector     first addend, at least num_points floats
 * bVector     second addend, at least num_points floats
 * num_points  number of elements to add
 */
static inline void volk_32f_x2_add_32f_a_avx512f(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    /* Vector body: each 512-bit register carries 16 floats. */
    for (; number < sixteenthPoints; number++) {
        const __m512 aVal = _mm512_load_ps(aPtr);
        const __m512 bVal = _mm512_load_ps(bPtr);

        /* Store the sums back into the C container */
        _mm512_store_ps(cPtr, _mm512_add_ps(aVal, bVal));

        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the num_points % 16 elements the vector loop left over. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
257 
258 #endif /* LV_HAVE_AVX512F */
259 
260 
261 #ifdef LV_HAVE_AVX
262 #include <immintrin.h>
263 
/*
 * Element-wise sum of two float vectors:
 *     cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * Aligned AVX variant: data is moved with _mm256_load_ps /
 * _mm256_store_ps, which require 32-byte-aligned addresses, so callers
 * must pass 32-byte-aligned buffers.
 *
 * cVector     destination, at least num_points floats
 * aVector     first addend, at least num_points floats
 * bVector     second addend, at least num_points floats
 * num_points  number of elements to add
 */
static inline void volk_32f_x2_add_32f_a_avx(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    /* Vector body: each 256-bit register carries 8 floats. */
    for (; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_load_ps(aPtr);
        const __m256 bVal = _mm256_load_ps(bPtr);

        /* Store the sums back into the C container */
        _mm256_store_ps(cPtr, _mm256_add_ps(aVal, bVal));

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the num_points % 8 elements the vector loop left over. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
296 #endif /* LV_HAVE_AVX */
297 
298 #ifdef LV_HAVE_SSE
299 #include <xmmintrin.h>
300 
/*
 * Element-wise sum of two float vectors:
 *     cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * Aligned SSE variant: data is moved with _mm_load_ps / _mm_store_ps,
 * which require 16-byte-aligned addresses, so callers must pass
 * 16-byte-aligned buffers.
 *
 * cVector     destination, at least num_points floats
 * aVector     first addend, at least num_points floats
 * bVector     second addend, at least num_points floats
 * num_points  number of elements to add
 */
static inline void volk_32f_x2_add_32f_a_sse(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    /* Vector body: each 128-bit register carries 4 floats. */
    for (; number < quarterPoints; number++) {
        const __m128 aVal = _mm_load_ps(aPtr);
        const __m128 bVal = _mm_load_ps(bPtr);

        /* Store the sums back into the C container */
        _mm_store_ps(cPtr, _mm_add_ps(aVal, bVal));

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the num_points % 4 elements the vector loop left over. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
332 #endif /* LV_HAVE_SSE */
333 
334 
335 #ifdef LV_HAVE_NEON
336 #include <arm_neon.h>
337 
/*
 * Element-wise sum of two float vectors:
 *     cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * NEON variant: four floats per iteration via vld1q_f32 / vaddq_f32 /
 * vst1q_f32, followed by a scalar loop for the remainder.
 *
 * cVector     destination, at least num_points floats
 * aVector     first addend, at least num_points floats
 * bVector     second addend, at least num_points floats
 * num_points  number of elements to add
 */
static inline void volk_32f_x2_add_32f_u_neon(float* cVector,
                                              const float* aVector,
                                              const float* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    float32x4_t aVal, bVal, cVal;

    for (number = 0; number < quarterPoints; number++) {
        // Load in to NEON registers
        aVal = vld1q_f32(aPtr);
        bVal = vld1q_f32(bPtr);
        // __VOLK_PREFETCH is a project macro defined outside this file; it is
        // handed the address of the next group of four inputs.
        __VOLK_PREFETCH(aPtr + 4);
        __VOLK_PREFETCH(bPtr + 4);

        // vector add
        cVal = vaddq_f32(aVal, bVal);
        // Store the results back into the C container
        vst1q_f32(cPtr, cVal);

        aPtr += 4; // q uses quadwords, 4 floats per vadd
        bPtr += 4;
        cPtr += 4;
    }

    // Scalar tail: quarterPoints * 4 equals num_points only when num_points is
    // a multiple of 4; otherwise up to 3 elements remain to be added here.
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
372 
373 #endif /* LV_HAVE_NEON */
374 
375 #ifdef LV_HAVE_NEONV7
/*
 * Declaration only: the definition is not in this file.
 * Same signature as the other add kernels here (destination, two inputs,
 * element count).
 * NOTE(review): the name suggests a hand-written NEON assembly
 * implementation - confirm against the defining source.
 */
extern void volk_32f_x2_add_32f_a_neonasm(float* cVector,
                                          const float* aVector,
                                          const float* bVector,
                                          unsigned int num_points);
380 #endif /* LV_HAVE_NEONV7 */
381 
382 #ifdef LV_HAVE_NEONV7
/*
 * Declaration only: the definition is not in this file.
 * Same signature as the other add kernels here (destination, two inputs,
 * element count).
 * NOTE(review): the name suggests a software-pipelined NEON assembly
 * implementation - confirm against the defining source.
 */
extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points);
387 #endif /* LV_HAVE_NEONV7 */
388 
389 #ifdef LV_HAVE_GENERIC
390 
/*
 * Element-wise sum of two float vectors:
 *     cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * Portable scalar variant: plain C, one element per iteration. The body
 * uses ordinary float accesses only.
 *
 * cVector     destination, at least num_points floats
 * aVector     first addend, at least num_points floats
 * bVector     second addend, at least num_points floats
 * num_points  number of elements to add
 */
static inline void volk_32f_x2_add_32f_a_generic(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}
405 #endif /* LV_HAVE_GENERIC */
406 
407 
408 #ifdef LV_HAVE_ORC
409 
/*
 * Declaration only: the definition is not in this file.
 * NOTE(review): presumably generated from an ORC program - confirm against
 * the build.
 */
extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector,
                                           const float* aVector,
                                           const float* bVector,
                                           unsigned int num_points);

/*
 * ORC entry point for the float vector add kernel: forwards all four
 * arguments unchanged to volk_32f_x2_add_32f_a_orc_impl and does no work
 * of its own.
 *
 * cVector     destination buffer
 * aVector     first addend
 * bVector     second addend
 * num_points  number of elements to add
 */
static inline void volk_32f_x2_add_32f_u_orc(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
422 
423 #endif /* LV_HAVE_ORC */
424 
425 
426 #endif /* INCLUDED_volk_32f_x2_add_32f_a_H */
volk_32f_x2_add_32f_a_sse
static void volk_32f_x2_add_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:301
volk_32f_x2_add_32f_a_generic
static void volk_32f_x2_add_32f_a_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:391
__VOLK_PREFETCH
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
volk_32f_x2_add_32f_u_neon
static void volk_32f_x2_add_32f_u_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:338
volk_32f_x2_add_32f_generic
static void volk_32f_x2_add_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:197
volk_32f_x2_add_32f_u_avx
static void volk_32f_x2_add_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:123
volk_32f_x2_add_32f_u_sse
static void volk_32f_x2_add_32f_u_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:160
volk_32f_x2_add_32f_a_avx
static void volk_32f_x2_add_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:264