Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_x2_max_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef INCLUDED_volk_32f_x2_max_32f_a_H
72 #define INCLUDED_volk_32f_x2_max_32f_a_H
73 
74 #include <inttypes.h>
75 #include <stdio.h>
76 
77 #ifdef LV_HAVE_AVX512F
78 #include <immintrin.h>
79 
/*
 * Writes the element-wise maximum of aVector and bVector into cVector
 * using AVX-512F.
 *
 * Full groups of 16 floats go through _mm512_load_ps / _mm512_max_ps /
 * _mm512_store_ps; the remaining num_points % 16 elements are handled one
 * float at a time with a plain comparison.
 *
 * cVector     destination buffer, receives num_points floats
 * aVector     first input buffer, num_points floats are read
 * bVector     second input buffer, num_points floats are read
 * num_points  number of elements to process
 *
 * NOTE: the aligned load/store intrinsics are used, so all three buffers
 * are expected to meet the alignment those intrinsics require.
 */
static inline void
volk_32f_x2_max_32f_a_avx512f(float* cVector, const float* aVector,
                              const float* bVector, unsigned int num_points)
{
  /* Index of the first element that does not belong to a full 16-wide group. */
  const unsigned int vectorEnd = (num_points / 16) * 16;
  unsigned int idx = 0;

  /* Vector part: 16 floats per iteration. */
  while(idx < vectorEnd){
    const __m512 lhs = _mm512_load_ps(aVector + idx);
    const __m512 rhs = _mm512_load_ps(bVector + idx);

    _mm512_store_ps(cVector + idx, _mm512_max_ps(lhs, rhs));

    idx += 16;
  }

  /* Scalar part: leftover elements past the last full group. */
  while(idx < num_points){
    const float lhs = aVector[idx];
    const float rhs = bVector[idx];

    if(lhs > rhs){
      cVector[idx] = lhs;
    }
    else{
      cVector[idx] = rhs;
    }
    idx++;
  }
}
112 #endif /* LV_HAVE_AVX512F */
113 
114 #ifdef LV_HAVE_SSE
115 #include <xmmintrin.h>
116 
/*
 * Writes the element-wise maximum of aVector and bVector into cVector
 * using SSE.
 *
 * Groups of 4 floats are processed with _mm_load_ps / _mm_max_ps /
 * _mm_store_ps; the trailing num_points % 4 elements fall back to a
 * scalar comparison.
 *
 * cVector     destination buffer, receives num_points floats
 * aVector     first input buffer, num_points floats are read
 * bVector     second input buffer, num_points floats are read
 * num_points  number of elements to process
 *
 * NOTE: the aligned load/store intrinsics are used, so all three buffers
 * are expected to meet the alignment those intrinsics require.
 */
static inline void
volk_32f_x2_max_32f_a_sse(float* cVector, const float* aVector,
                          const float* bVector, unsigned int num_points)
{
  float* out = cVector;
  const float* lhs = aVector;
  const float* rhs = bVector;

  unsigned int groups = num_points / 4;   /* full 4-wide groups */
  unsigned int leftover = num_points % 4; /* elements after the last group */

  for(; groups != 0; --groups){
    const __m128 left = _mm_load_ps(lhs);
    const __m128 right = _mm_load_ps(rhs);

    _mm_store_ps(out, _mm_max_ps(left, right));

    lhs += 4;
    rhs += 4;
    out += 4;
  }

  for(; leftover != 0; --leftover){
    const float left = *lhs++;
    const float right = *rhs++;

    if(left > right){
      *out++ = left;
    }
    else{
      *out++ = right;
    }
  }
}
149 #endif /* LV_HAVE_SSE */
150 
151 #ifdef LV_HAVE_AVX
152 #include <immintrin.h>
153 
/*
 * Writes the element-wise maximum of aVector and bVector into cVector
 * using AVX.
 *
 * Groups of 8 floats are processed with _mm256_load_ps / _mm256_max_ps /
 * _mm256_store_ps; the trailing num_points % 8 elements fall back to a
 * scalar comparison.
 *
 * cVector     destination buffer, receives num_points floats
 * aVector     first input buffer, num_points floats are read
 * bVector     second input buffer, num_points floats are read
 * num_points  number of elements to process
 *
 * NOTE: the aligned load/store intrinsics are used, so all three buffers
 * are expected to meet the alignment those intrinsics require.
 */
static inline void
volk_32f_x2_max_32f_a_avx(float* cVector, const float* aVector,
                          const float* bVector, unsigned int num_points)
{
  /* Index of the first element that does not belong to a full 8-wide group. */
  const unsigned int vectorEnd = (num_points / 8) * 8;
  unsigned int idx;

  /* Vector part: 8 floats per iteration. */
  for(idx = 0; idx < vectorEnd; idx += 8){
    const __m256 lhs = _mm256_load_ps(&aVector[idx]);
    const __m256 rhs = _mm256_load_ps(&bVector[idx]);
    const __m256 larger = _mm256_max_ps(lhs, rhs);

    _mm256_store_ps(&cVector[idx], larger);
  }

  /* Scalar part: idx continues from vectorEnd. */
  for(; idx < num_points; ++idx){
    const float lhs = aVector[idx];
    const float rhs = bVector[idx];

    cVector[idx] = (lhs > rhs) ? lhs : rhs;
  }
}
186 #endif /* LV_HAVE_AVX */
187 
188 #ifdef LV_HAVE_NEON
189 #include <arm_neon.h>
190 
/*
 * Writes the element-wise maximum of aVector and bVector into cVector
 * using NEON.
 *
 * Groups of 4 floats are processed with vld1q_f32 / vmaxq_f32 / vst1q_f32;
 * the trailing num_points % 4 elements fall back to a scalar comparison.
 *
 * cVector     destination buffer, receives num_points floats
 * aVector     first input buffer, num_points floats are read
 * bVector     second input buffer, num_points floats are read
 * num_points  number of elements to process
 */
static inline void
volk_32f_x2_max_32f_neon(float* cVector, const float* aVector,
                         const float* bVector, unsigned int num_points)
{
  /* Index of the first element that does not belong to a full 4-wide group. */
  const unsigned int vectorEnd = (num_points / 4) * 4;
  unsigned int idx = 0;

  /* Vector part: 4 floats per iteration. */
  while(idx < vectorEnd){
    const float32x4_t lhs = vld1q_f32(aVector + idx);
    const float32x4_t rhs = vld1q_f32(bVector + idx);

    vst1q_f32(cVector + idx, vmaxq_f32(lhs, rhs));
    idx += 4;
  }

  /* Scalar part: leftover elements past the last full group. */
  while(idx < num_points){
    const float lhs = aVector[idx];
    const float rhs = bVector[idx];

    if(lhs > rhs){
      cVector[idx] = lhs;
    }
    else{
      cVector[idx] = rhs;
    }
    idx++;
  }
}
218 #endif /* LV_HAVE_NEON */
219 
220 
221 #ifdef LV_HAVE_GENERIC
222 
/*
 * Portable implementation: writes the element-wise maximum of aVector and
 * bVector into cVector, one float at a time.
 *
 * For each index i in [0, num_points) the output is aVector[i] when
 * aVector[i] > bVector[i], and bVector[i] otherwise.
 *
 * cVector     destination buffer, receives num_points floats
 * aVector     first input buffer, num_points floats are read
 * bVector     second input buffer, num_points floats are read
 * num_points  number of elements to process
 */
static inline void
volk_32f_x2_max_32f_generic(float* cVector, const float* aVector,
                            const float* bVector, unsigned int num_points)
{
  unsigned int idx;

  for(idx = 0; idx < num_points; ++idx){
    const float lhs = aVector[idx];
    const float rhs = bVector[idx];

    if(lhs > rhs){
      cVector[idx] = lhs;
    }
    else{
      cVector[idx] = rhs;
    }
  }
}
238 #endif /* LV_HAVE_GENERIC */
239 
240 #ifdef LV_HAVE_ORC
/* ORC implementation; only declared here, the definition lives outside this header. */
extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector,
                                           const float* aVector,
                                           const float* bVector,
                                           unsigned int num_points);

/*
 * ORC entry point for the element-wise maximum kernel.
 *
 * Thin wrapper: every argument is handed through unchanged to
 * volk_32f_x2_max_32f_a_orc_impl, which does the actual work.
 *
 * cVector     destination buffer
 * aVector     first input buffer
 * bVector     second input buffer
 * num_points  number of elements to process
 */
static inline void volk_32f_x2_max_32f_u_orc(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
  volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
251 #endif /* LV_HAVE_ORC */
252 
253 
254 #endif /* INCLUDED_volk_32f_x2_max_32f_a_H */
255 
256 
257 #ifndef INCLUDED_volk_32f_x2_max_32f_u_H
258 #define INCLUDED_volk_32f_x2_max_32f_u_H
259 
260 #include <inttypes.h>
261 #include <stdio.h>
262 
263 #ifdef LV_HAVE_AVX512F
264 #include <immintrin.h>
265 
/*
 * Writes the element-wise maximum of aVector and bVector into cVector
 * using AVX-512F with unaligned memory accesses.
 *
 * Full groups of 16 floats go through _mm512_loadu_ps / _mm512_max_ps /
 * _mm512_storeu_ps; the remaining num_points % 16 elements are handled one
 * float at a time with a plain comparison.
 *
 * cVector     destination buffer, receives num_points floats
 * aVector     first input buffer, num_points floats are read
 * bVector     second input buffer, num_points floats are read
 * num_points  number of elements to process
 */
static inline void
volk_32f_x2_max_32f_u_avx512f(float* cVector, const float* aVector,
                              const float* bVector, unsigned int num_points)
{
  float* out = cVector;
  const float* lhs = aVector;
  const float* rhs = bVector;

  unsigned int groups = num_points / 16;   /* full 16-wide groups */
  unsigned int leftover = num_points % 16; /* elements after the last group */

  for(; groups != 0; --groups){
    const __m512 left = _mm512_loadu_ps(lhs);
    const __m512 right = _mm512_loadu_ps(rhs);

    _mm512_storeu_ps(out, _mm512_max_ps(left, right));

    lhs += 16;
    rhs += 16;
    out += 16;
  }

  for(; leftover != 0; --leftover){
    const float left = *lhs++;
    const float right = *rhs++;

    *out++ = (left > right) ? left : right;
  }
}
298 #endif /* LV_HAVE_AVX512F */
299 
300 #ifdef LV_HAVE_AVX
301 #include <immintrin.h>
302 
/*
 * Writes the element-wise maximum of aVector and bVector into cVector
 * using AVX with unaligned memory accesses.
 *
 * Groups of 8 floats are processed with _mm256_loadu_ps / _mm256_max_ps /
 * _mm256_storeu_ps; the trailing num_points % 8 elements fall back to a
 * scalar comparison.
 *
 * cVector     destination buffer, receives num_points floats
 * aVector     first input buffer, num_points floats are read
 * bVector     second input buffer, num_points floats are read
 * num_points  number of elements to process
 */
static inline void
volk_32f_x2_max_32f_u_avx(float* cVector, const float* aVector,
                          const float* bVector, unsigned int num_points)
{
  /* Index of the first element that does not belong to a full 8-wide group. */
  const unsigned int vectorEnd = (num_points / 8) * 8;
  unsigned int idx;

  /* Vector part: 8 floats per iteration. */
  for(idx = 0; idx < vectorEnd; idx += 8){
    const __m256 lhs = _mm256_loadu_ps(&aVector[idx]);
    const __m256 rhs = _mm256_loadu_ps(&bVector[idx]);
    const __m256 larger = _mm256_max_ps(lhs, rhs);

    _mm256_storeu_ps(&cVector[idx], larger);
  }

  /* Scalar part: idx continues from vectorEnd. */
  for(; idx < num_points; ++idx){
    const float lhs = aVector[idx];
    const float rhs = bVector[idx];

    if(lhs > rhs){
      cVector[idx] = lhs;
    }
    else{
      cVector[idx] = rhs;
    }
  }
}
335 #endif /* LV_HAVE_AVX */
336 
337 #endif /* INCLUDED_volk_32f_x2_max_32f_u_H */
static void volk_32f_x2_max_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_max_32f.h:304
static void volk_32f_x2_max_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_max_32f.h:118
static void volk_32f_x2_max_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_max_32f.h:224
static void volk_32f_x2_max_32f_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_max_32f.h:192
static void volk_32f_x2_max_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_max_32f.h:155