Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_x2_add_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
73 #ifndef INCLUDED_volk_32f_x2_add_32f_u_H
74 #define INCLUDED_volk_32f_x2_add_32f_u_H
75 
76 #include <inttypes.h>
77 #include <stdio.h>
78 
79 #ifdef LV_HAVE_AVX512F
80 #include <immintrin.h>
81 
/*
 * Element-wise sum of two float buffers using AVX-512F with unaligned
 * 512-bit loads and stores:
 *
 *   cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * cVector    - destination buffer, receives num_points floats
 * aVector    - first addend buffer
 * bVector    - second addend buffer
 * num_points - number of floats to process; 0 writes nothing
 *
 * Complete groups of 16 floats are summed with one vector add each;
 * the remaining 0-15 floats are summed one at a time.
 */
static inline void
volk_32f_x2_add_32f_u_avx512f(float* cVector, const float* aVector,
                              const float* bVector, unsigned int num_points)
{
  /* Largest multiple of 16 that does not exceed num_points. */
  const unsigned int vectorized = num_points - (num_points % 16);
  unsigned int idx = 0;

  while (idx < vectorized) {
    const __m512 lhs = _mm512_loadu_ps(aVector + idx);
    const __m512 rhs = _mm512_loadu_ps(bVector + idx);

    _mm512_storeu_ps(cVector + idx, _mm512_add_ps(lhs, rhs));
    idx += 16;
  }

  /* Scalar clean-up for whatever did not fill a whole vector. */
  while (idx < num_points) {
    cVector[idx] = aVector[idx] + bVector[idx];
    ++idx;
  }
}
114 
115 #endif /* LV_HAVE_AVX512F */
116 
117 
118 #ifdef LV_HAVE_AVX
119 #include <immintrin.h>
120 
/*
 * Element-wise sum of two float buffers using AVX with unaligned
 * 256-bit loads and stores:
 *
 *   cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * cVector    - destination buffer, receives num_points floats
 * aVector    - first addend buffer
 * bVector    - second addend buffer
 * num_points - number of floats to process; 0 writes nothing
 *
 * Eight floats are handled per vector iteration; the final 0-7 floats
 * fall through to a scalar loop.
 */
static inline void
volk_32f_x2_add_32f_u_avx(float* cVector, const float* aVector,
                          const float* bVector, unsigned int num_points)
{
  /* Number of elements covered by whole 8-float vectors. */
  const unsigned int bulk = (num_points / 8) * 8;
  unsigned int pos;

  for (pos = 0; pos < bulk; pos += 8) {
    const __m256 left = _mm256_loadu_ps(&aVector[pos]);
    const __m256 right = _mm256_loadu_ps(&bVector[pos]);

    _mm256_storeu_ps(&cVector[pos], _mm256_add_ps(left, right));
  }

  /* pos == bulk here; finish the leftovers one float at a time. */
  for (; pos < num_points; pos++) {
    cVector[pos] = aVector[pos] + bVector[pos];
  }
}
151 #endif /* LV_HAVE_AVX */
152 
153 
154 #ifdef LV_HAVE_SSE
155 #include <xmmintrin.h>
156 
/*
 * Element-wise sum of two float buffers using SSE with unaligned
 * 128-bit loads and stores:
 *
 *   cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * cVector    - destination buffer, receives num_points floats
 * aVector    - first addend buffer
 * bVector    - second addend buffer
 * num_points - number of floats to process; 0 writes nothing
 *
 * Four floats are handled per vector iteration; the final 0-3 floats
 * are summed by a scalar loop.
 */
static inline void
volk_32f_x2_add_32f_u_sse(float* cVector, const float* aVector,
                          const float* bVector, unsigned int num_points)
{
  /* Round num_points down to a multiple of 4. */
  const unsigned int vec_end = (num_points / 4) * 4;
  unsigned int i;

  for (i = 0; i < vec_end; i += 4) {
    const __m128 lhs = _mm_loadu_ps(aVector + i);
    const __m128 rhs = _mm_loadu_ps(bVector + i);

    _mm_storeu_ps(cVector + i, _mm_add_ps(lhs, rhs));
  }

  /* Remaining elements that did not fill a full vector. */
  for (; i < num_points; ++i) {
    cVector[i] = aVector[i] + bVector[i];
  }
}
188 #endif /* LV_HAVE_SSE */
189 
190 
191 #ifdef LV_HAVE_GENERIC
192 
/*
 * Portable scalar element-wise sum of two float buffers:
 *
 *   cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * cVector    - destination buffer, receives num_points floats
 * aVector    - first addend buffer
 * bVector    - second addend buffer
 * num_points - number of floats to process; 0 writes nothing
 */
static inline void
volk_32f_x2_add_32f_generic(float* cVector, const float* aVector,
                            const float* bVector, unsigned int num_points)
{
  unsigned int i;

  for (i = 0; i < num_points; ++i) {
    cVector[i] = aVector[i] + bVector[i];
  }
}
206 #endif /* LV_HAVE_GENERIC */
207 
208 
209 #endif /* INCLUDED_volk_32f_x2_add_32f_u_H */
210 #ifndef INCLUDED_volk_32f_x2_add_32f_a_H
211 #define INCLUDED_volk_32f_x2_add_32f_a_H
212 
213 #include <inttypes.h>
214 #include <stdio.h>
215 
216 #ifdef LV_HAVE_AVX512F
217 #include <immintrin.h>
218 
/*
 * Element-wise sum of two float buffers using AVX-512F with aligned
 * 512-bit loads and stores:
 *
 *   cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * cVector    - destination buffer, receives num_points floats
 * aVector    - first addend buffer
 * bVector    - second addend buffer
 * num_points - number of floats to process; 0 writes nothing
 *
 * The vector path uses _mm512_load_ps/_mm512_store_ps, which require
 * 64-byte aligned addresses, so all three buffers must be aligned
 * accordingly whenever num_points >= 16.
 */
static inline void
volk_32f_x2_add_32f_a_avx512f(float* cVector, const float* aVector,
                              const float* bVector, unsigned int num_points)
{
  unsigned int blocks = num_points / 16;   /* whole 16-float vectors */
  unsigned int leftover = num_points % 16; /* scalar remainder */

  float* out = cVector;
  const float* inA = aVector;
  const float* inB = bVector;

  for (; blocks != 0; --blocks) {
    const __m512 lhs = _mm512_load_ps(inA);
    const __m512 rhs = _mm512_load_ps(inB);

    _mm512_store_ps(out, _mm512_add_ps(lhs, rhs));

    inA += 16;
    inB += 16;
    out += 16;
  }

  for (; leftover != 0; --leftover) {
    *out++ = (*inA++) + (*inB++);
  }
}
251 
252 #endif /* LV_HAVE_AVX512F */
253 
254 
255 #ifdef LV_HAVE_AVX
256 #include <immintrin.h>
257 
/*
 * Element-wise sum of two float buffers using AVX with aligned
 * 256-bit loads and stores:
 *
 *   cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * cVector    - destination buffer, receives num_points floats
 * aVector    - first addend buffer
 * bVector    - second addend buffer
 * num_points - number of floats to process; 0 writes nothing
 *
 * The vector path uses _mm256_load_ps/_mm256_store_ps, which require
 * 32-byte aligned addresses, so all three buffers must be aligned
 * accordingly whenever num_points >= 8.
 */
static inline void
volk_32f_x2_add_32f_a_avx(float* cVector, const float* aVector,
                          const float* bVector, unsigned int num_points)
{
  /* Number of elements covered by whole 8-float vectors. */
  const unsigned int bulk = num_points - (num_points % 8);
  unsigned int k = 0;

  while (k < bulk) {
    const __m256 lhs = _mm256_load_ps(aVector + k);
    const __m256 rhs = _mm256_load_ps(bVector + k);

    _mm256_store_ps(cVector + k, _mm256_add_ps(lhs, rhs));
    k += 8;
  }

  /* Scalar pass over the trailing 0-7 elements. */
  while (k < num_points) {
    cVector[k] = aVector[k] + bVector[k];
    k++;
  }
}
289 #endif /* LV_HAVE_AVX */
290 
291 #ifdef LV_HAVE_SSE
292 #include <xmmintrin.h>
293 
/*
 * Element-wise sum of two float buffers using SSE with aligned
 * 128-bit loads and stores:
 *
 *   cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * cVector    - destination buffer, receives num_points floats
 * aVector    - first addend buffer
 * bVector    - second addend buffer
 * num_points - number of floats to process; 0 writes nothing
 *
 * The vector path uses _mm_load_ps/_mm_store_ps, which require
 * 16-byte aligned addresses, so all three buffers must be aligned
 * accordingly whenever num_points >= 4.
 */
static inline void
volk_32f_x2_add_32f_a_sse(float* cVector, const float* aVector,
                          const float* bVector, unsigned int num_points)
{
  /* Round num_points down to a multiple of 4. */
  const unsigned int vec_end = (num_points / 4) * 4;
  unsigned int i;

  for (i = 0; i < vec_end; i += 4) {
    const __m128 lhs = _mm_load_ps(&aVector[i]);
    const __m128 rhs = _mm_load_ps(&bVector[i]);

    _mm_store_ps(&cVector[i], _mm_add_ps(lhs, rhs));
  }

  /* Trailing 0-3 elements are summed without SIMD. */
  for (; i < num_points; ++i) {
    cVector[i] = aVector[i] + bVector[i];
  }
}
323 #endif /* LV_HAVE_SSE */
324 
325 
326 #ifdef LV_HAVE_NEON
327 #include <arm_neon.h>
328 
/*
 * Element-wise sum of two float buffers using NEON intrinsics:
 *
 *   cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * cVector    - destination buffer, receives num_points floats
 * aVector    - first addend buffer
 * bVector    - second addend buffer
 * num_points - number of floats to process; 0 writes nothing
 *
 * Four floats are processed per vector iteration (one float32x4_t per
 * input); the final 0-3 floats are summed by a scalar loop.
 */
static inline void
volk_32f_x2_add_32f_u_neon(float* cVector, const float* aVector,
                           const float* bVector, unsigned int num_points)
{
  /* Round num_points down to a multiple of 4. */
  const unsigned int vec_end = (num_points / 4) * 4;
  unsigned int i;

  for (i = 0; i < vec_end; i += 4) {
    const float32x4_t lhs = vld1q_f32(aVector + i);
    const float32x4_t rhs = vld1q_f32(bVector + i);

    /* Hint at the inputs the next iteration will read. */
    __VOLK_PREFETCH(aVector + i + 4);
    __VOLK_PREFETCH(bVector + i + 4);

    vst1q_f32(cVector + i, vaddq_f32(lhs, rhs));
  }

  /* i == vec_end here, which equals num_points only when num_points
   * is a multiple of 4; otherwise finish the remainder one by one. */
  for (; i < num_points; ++i) {
    cVector[i] = aVector[i] + bVector[i];
  }
}
362 
363 #endif /* LV_HAVE_NEON */
364 
365 #ifdef LV_HAVE_NEONV7
/*
 * Float-add kernel with the same parameter list as the other
 * volk_32f_x2_add_32f implementations; only declared in this header,
 * the definition is provided elsewhere.
 * NOTE(review): the name suggests a hand-written NEON assembly routine
 * expecting aligned buffers -- confirm against the implementation.
 */
extern void
volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector,
                              const float* bVector, unsigned int num_points);
367 #endif /* LV_HAVE_NEONV7 */
368 
369 #ifdef LV_HAVE_NEONV7
/*
 * Float-add kernel with the same parameter list as the other
 * volk_32f_x2_add_32f implementations; only declared in this header,
 * the definition is provided elsewhere.
 * NOTE(review): the name suggests a software-pipelined NEON routine
 * expecting aligned buffers -- confirm against the implementation.
 */
extern void
volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector,
                                   const float* bVector, unsigned int num_points);
371 #endif /* LV_HAVE_NEONV7 */
372 
373 #ifdef LV_HAVE_GENERIC
374 
/*
 * Portable scalar element-wise sum of two float buffers:
 *
 *   cVector[i] = aVector[i] + bVector[i]   for 0 <= i < num_points
 *
 * cVector    - destination buffer, receives num_points floats
 * aVector    - first addend buffer
 * bVector    - second addend buffer
 * num_points - number of floats to process; 0 writes nothing
 *
 * No SIMD intrinsics are used here, so the code itself places no
 * alignment requirement on the buffers.
 */
static inline void
volk_32f_x2_add_32f_a_generic(float* cVector, const float* aVector,
                              const float* bVector, unsigned int num_points)
{
  unsigned int remaining = num_points;
  float* out = cVector;
  const float* inA = aVector;
  const float* inB = bVector;

  while (remaining != 0) {
    *out++ = (*inA++) + (*inB++);
    --remaining;
  }
}
388 #endif /* LV_HAVE_GENERIC */
389 
390 
391 #ifdef LV_HAVE_ORC
392 
/*
 * ORC-backed float-add kernel; only declared in this header, the
 * definition is provided elsewhere.
 */
extern void
volk_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector,
                               const float* bVector, unsigned int num_points);

/*
 * Thin wrapper that forwards its arguments, unchanged and in the same
 * order, to volk_32f_x2_add_32f_a_orc_impl.
 *
 * cVector    - destination buffer
 * aVector    - first addend buffer
 * bVector    - second addend buffer
 * num_points - number of floats to process
 */
static inline void
volk_32f_x2_add_32f_u_orc(float* cVector, const float* aVector,
                          const float* bVector, unsigned int num_points)
{
  volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
402 
403 #endif /* LV_HAVE_ORC */
404 
405 
406 #endif /* INCLUDED_volk_32f_x2_add_32f_a_H */
static void volk_32f_x2_add_32f_a_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:376
static void volk_32f_x2_add_32f_u_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:330
static void volk_32f_x2_add_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:194
static void volk_32f_x2_add_32f_u_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:158
static void volk_32f_x2_add_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:259
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:39
static void volk_32f_x2_add_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:122
static void volk_32f_x2_add_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_add_32f.h:295