Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_x2_divide_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef INCLUDED_volk_32f_x2_divide_32f_a_H
72 #define INCLUDED_volk_32f_x2_divide_32f_a_H
73 
74 #include <inttypes.h>
75 #include <stdio.h>
76 
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief Element-wise division, cVector[i] = aVector[i] / bVector[i],
 *        using AVX-512F with aligned loads and stores.
 *
 * Full groups of 16 floats are processed with _mm512_load_ps /
 * _mm512_div_ps / _mm512_store_ps; the remaining num_points % 16
 * elements are divided one at a time.
 *
 * The aligned load/store intrinsics are used, so all three buffers must
 * meet their alignment requirement (64 bytes per the Intel intrinsics
 * documentation). No check is made for zero divisors.
 *
 * \param cVector    Output buffer, receives num_points quotients.
 * \param aVector    Dividends, num_points floats.
 * \param bVector    Divisors, num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_x2_divide_32f_a_avx512f(float* cVector, const float* aVector,
                                 const float* bVector, unsigned int num_points)
{
  const unsigned int sixteenthPoints = num_points / 16;
  unsigned int number;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  /* Vector part: 16 floats per iteration. */
  for(number = 0; number < sixteenthPoints; number++){
    const __m512 aVal = _mm512_load_ps(aPtr);
    const __m512 bVal = _mm512_load_ps(bPtr);

    _mm512_store_ps(cPtr, _mm512_div_ps(aVal, bVal));

    aPtr += 16;
    bPtr += 16;
    cPtr += 16;
  }

  /* Scalar tail: elements that did not fill a whole vector. */
  for(number = sixteenthPoints * 16; number < num_points; number++){
    *cPtr++ = (*aPtr++) / (*bPtr++);
  }
}
#endif /* LV_HAVE_AVX512F */
111 
112 
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Element-wise division, cVector[i] = aVector[i] / bVector[i],
 *        using AVX with aligned loads and stores.
 *
 * Full groups of 8 floats are processed with _mm256_load_ps /
 * _mm256_div_ps / _mm256_store_ps; the remaining num_points % 8
 * elements are divided one at a time.
 *
 * The aligned load/store intrinsics are used, so all three buffers must
 * meet their alignment requirement (32 bytes per the Intel intrinsics
 * documentation). No check is made for zero divisors.
 *
 * \param cVector    Output buffer, receives num_points quotients.
 * \param aVector    Dividends, num_points floats.
 * \param bVector    Divisors, num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_x2_divide_32f_a_avx(float* cVector, const float* aVector,
                             const float* bVector, unsigned int num_points)
{
  const unsigned int eighthPoints = num_points / 8;
  unsigned int number;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  /* Vector part: 8 floats per iteration. */
  for(number = 0; number < eighthPoints; number++){
    const __m256 aVal = _mm256_load_ps(aPtr);
    const __m256 bVal = _mm256_load_ps(bPtr);

    _mm256_store_ps(cPtr, _mm256_div_ps(aVal, bVal));

    aPtr += 8;
    bPtr += 8;
    cPtr += 8;
  }

  /* Scalar tail: elements that did not fill a whole vector. */
  for(number = eighthPoints * 8; number < num_points; number++){
    *cPtr++ = (*aPtr++) / (*bPtr++);
  }
}
#endif /* LV_HAVE_AVX */
147 
148 
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*!
 * \brief Element-wise division, cVector[i] = aVector[i] / bVector[i],
 *        using SSE with aligned loads and stores.
 *
 * Full groups of 4 floats are processed with _mm_load_ps / _mm_div_ps /
 * _mm_store_ps; the remaining num_points % 4 elements are divided one
 * at a time.
 *
 * The aligned load/store intrinsics are used, so all three buffers must
 * meet their alignment requirement (16 bytes per the Intel intrinsics
 * documentation). No check is made for zero divisors.
 *
 * \param cVector    Output buffer, receives num_points quotients.
 * \param aVector    Dividends, num_points floats.
 * \param bVector    Divisors, num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_x2_divide_32f_a_sse(float* cVector, const float* aVector,
                             const float* bVector, unsigned int num_points)
{
  const unsigned int quarterPoints = num_points / 4;
  unsigned int number;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  /* Vector part: 4 floats per iteration. */
  for(number = 0; number < quarterPoints; number++){
    const __m128 aVal = _mm_load_ps(aPtr);
    const __m128 bVal = _mm_load_ps(bPtr);

    _mm_store_ps(cPtr, _mm_div_ps(aVal, bVal));

    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail: elements that did not fill a whole vector. */
  for(number = quarterPoints * 4; number < num_points; number++){
    *cPtr++ = (*aPtr++) / (*bPtr++);
  }
}
#endif /* LV_HAVE_SSE */
183 
184 
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Element-wise division, cVector[i] ~= aVector[i] / bVector[i],
 *        using NEON.
 *
 * Full groups of 16 floats are handled per iteration as four 4-lane
 * registers. The quotient is not computed with a divide instruction:
 * each divisor register gets a reciprocal estimate (vrecpeq_f32) that is
 * refined by two vrecpsq_f32 steps and then multiplied by the dividend.
 * Results from this path are therefore not guaranteed to be bit-identical
 * to the scalar quotient used for the tail elements.
 *
 * vld4q_f32 / vst4q_f32 are used for all three buffers, so whatever lane
 * permutation the load applies to a and b is undone by the store to c.
 *
 * \param cVector    Output buffer, receives num_points quotients.
 * \param aVector    Dividends, num_points floats.
 * \param bVector    Divisors, num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_x2_divide_32f_neon(float* cVector, const float* aVector,
                            const float* bVector, unsigned int num_points)
{
  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  float32x4x4_t aVal, bVal, cVal;

  const unsigned int sixteenthPoints = num_points / 16;
  unsigned int number;
  unsigned int reg;

  for(number = 0; number < sixteenthPoints; number++){
    aVal = vld4q_f32(aPtr);
    aPtr += 16;
    bVal = vld4q_f32(bPtr);
    bPtr += 16;

    /* The pointers were already advanced, so this hints at data beyond
     * the next iteration's block. */
    __VOLK_PREFETCH(aPtr+16);
    __VOLK_PREFETCH(bPtr+16);

    /* Same recipe for each of the four registers of the block. */
    for(reg = 0; reg < 4; reg++){
      float32x4_t bInv = vrecpeq_f32(bVal.val[reg]);
      bInv = vmulq_f32(bInv, vrecpsq_f32(bInv, bVal.val[reg]));
      bInv = vmulq_f32(bInv, vrecpsq_f32(bInv, bVal.val[reg]));
      cVal.val[reg] = vmulq_f32(aVal.val[reg], bInv);
    }

    vst4q_f32(cPtr, cVal);
    cPtr += 16;
  }

  /* Scalar tail: elements that did not fill a whole 16-float block. */
  for(number = sixteenthPoints * 16; number < num_points; number++){
    *cPtr++ = (*aPtr++) / (*bPtr++);
  }
}

#endif /* LV_HAVE_NEON */
239 
240 
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Portable element-wise division,
 *        cVector[i] = aVector[i] / bVector[i].
 *
 * Plain scalar loop with no alignment requirement. No check is made for
 * zero divisors. When num_points is 0 nothing is read or written.
 *
 * \param cVector    Output buffer, receives num_points quotients.
 * \param aVector    Dividends, num_points floats.
 * \param bVector    Divisors, num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_x2_divide_32f_generic(float* cVector, const float* aVector,
                               const float* bVector, unsigned int num_points)
{
  unsigned int number;

  for(number = 0; number < num_points; number++){
    cVector[number] = aVector[number] / bVector[number];
  }
}
#endif /* LV_HAVE_GENERIC */
257 
258 
#ifdef LV_HAVE_ORC

/*
 * Defined outside this header; only the prototype is visible here.
 * Presumably the kernel produced from the project's ORC sources - confirm.
 */
extern void
volk_32f_x2_divide_32f_a_orc_impl(float* cVector, const float* aVector,
                                  const float* bVector, unsigned int num_points);

/*!
 * \brief Element-wise division, cVector[i] = aVector[i] / bVector[i],
 *        delegated to the external ORC implementation.
 *
 * All arguments are forwarded unchanged to
 * volk_32f_x2_divide_32f_a_orc_impl().
 *
 * NOTE(review): this wrapper carries the "_u_" name while the function it
 * calls carries "_a_"; whether the implementation tolerates unaligned
 * buffers cannot be determined from this header - confirm.
 *
 * \param cVector    Output buffer, receives num_points quotients.
 * \param aVector    Dividends, num_points floats.
 * \param bVector    Divisors, num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_x2_divide_32f_u_orc(float* cVector, const float* aVector,
                             const float* bVector, unsigned int num_points)
{
  volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
272 
273 
274 
275 #endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */
276 
277 
278 #ifndef INCLUDED_volk_32f_x2_divide_32f_u_H
279 #define INCLUDED_volk_32f_x2_divide_32f_u_H
280 
281 #include <inttypes.h>
282 #include <stdio.h>
283 
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief Element-wise division, cVector[i] = aVector[i] / bVector[i],
 *        using AVX-512F with unaligned loads and stores.
 *
 * Full groups of 16 floats are processed with _mm512_loadu_ps /
 * _mm512_div_ps / _mm512_storeu_ps; the remaining num_points % 16
 * elements are divided one at a time. Because the unaligned intrinsics
 * are used, the buffers need no particular alignment. No check is made
 * for zero divisors.
 *
 * \param cVector    Output buffer, receives num_points quotients.
 * \param aVector    Dividends, num_points floats.
 * \param bVector    Divisors, num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_x2_divide_32f_u_avx512f(float* cVector, const float* aVector,
                                 const float* bVector, unsigned int num_points)
{
  const unsigned int sixteenthPoints = num_points / 16;
  unsigned int number;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  /* Vector part: 16 floats per iteration. */
  for(number = 0; number < sixteenthPoints; number++){
    const __m512 aVal = _mm512_loadu_ps(aPtr);
    const __m512 bVal = _mm512_loadu_ps(bPtr);

    _mm512_storeu_ps(cPtr, _mm512_div_ps(aVal, bVal));

    aPtr += 16;
    bPtr += 16;
    cPtr += 16;
  }

  /* Scalar tail: elements that did not fill a whole vector. */
  for(number = sixteenthPoints * 16; number < num_points; number++){
    *cPtr++ = (*aPtr++) / (*bPtr++);
  }
}
#endif /* LV_HAVE_AVX512F */
318 
319 
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Element-wise division, cVector[i] = aVector[i] / bVector[i],
 *        using AVX with unaligned loads and stores.
 *
 * Full groups of 8 floats are processed with _mm256_loadu_ps /
 * _mm256_div_ps / _mm256_storeu_ps; the remaining num_points % 8
 * elements are divided one at a time. Because the unaligned intrinsics
 * are used, the buffers need no particular alignment. No check is made
 * for zero divisors.
 *
 * \param cVector    Output buffer, receives num_points quotients.
 * \param aVector    Dividends, num_points floats.
 * \param bVector    Divisors, num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_x2_divide_32f_u_avx(float* cVector, const float* aVector,
                             const float* bVector, unsigned int num_points)
{
  const unsigned int eighthPoints = num_points / 8;
  unsigned int number;

  float* cPtr = cVector;
  const float* aPtr = aVector;
  const float* bPtr = bVector;

  /* Vector part: 8 floats per iteration. */
  for(number = 0; number < eighthPoints; number++){
    const __m256 aVal = _mm256_loadu_ps(aPtr);
    const __m256 bVal = _mm256_loadu_ps(bPtr);

    _mm256_storeu_ps(cPtr, _mm256_div_ps(aVal, bVal));

    aPtr += 8;
    bPtr += 8;
    cPtr += 8;
  }

  /* Scalar tail: elements that did not fill a whole vector. */
  for(number = eighthPoints * 8; number < num_points; number++){
    *cPtr++ = (*aPtr++) / (*bPtr++);
  }
}
#endif /* LV_HAVE_AVX */
354 
355 #endif /* INCLUDED_volk_32f_x2_divide_32f_u_H */
static void volk_32f_x2_divide_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_divide_32f.h:117
static void volk_32f_x2_divide_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_divide_32f.h:153
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:39
static void volk_32f_x2_divide_32f_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_divide_32f.h:189
static void volk_32f_x2_divide_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_divide_32f.h:324
static void volk_32f_x2_divide_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_divide_32f.h:244