#ifndef INCLUDED_volk_32f_x2_divide_32f_a_H
#define INCLUDED_volk_32f_x2_divide_32f_a_H

#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief Element-wise division cVector[i] = aVector[i] / bVector[i] (AVX-512F, aligned).
 *
 * Processes 16 floats per iteration with aligned 512-bit loads/stores, then
 * finishes the remaining (num_points % 16) elements with scalar division.
 * No check is made for zero divisors.
 *
 * \param cVector    Output buffer, at least num_points floats. Accessed with
 *                   _mm512_store_ps, so it must meet that intrinsic's
 *                   alignment requirement (64 bytes).
 * \param aVector    Dividends; accessed with _mm512_load_ps (same alignment).
 * \param bVector    Divisors; accessed with _mm512_load_ps (same alignment).
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_x2_divide_32f_a_avx512f(float* cVector,
                                 const float* aVector,
                                 const float* bVector,
                                 unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_load_ps(aPtr);
        bVal = _mm512_load_ps(bPtr);

        cVal = _mm512_div_ps(aVal, bVal);

        _mm512_store_ps(cPtr, cVal);

        /* Advance to the next group of 16 floats; the scalar tail below
         * relies on the pointers sitting just past the vectorised prefix. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX512F */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Element-wise division cVector[i] = aVector[i] / bVector[i] (AVX, aligned).
 *
 * Processes 8 floats per iteration with aligned 256-bit loads/stores, then
 * finishes the remaining (num_points % 8) elements with scalar division.
 * No check is made for zero divisors.
 *
 * \param cVector    Output buffer, at least num_points floats. Accessed with
 *                   _mm256_store_ps, so it must meet that intrinsic's
 *                   alignment requirement (32 bytes).
 * \param aVector    Dividends; accessed with _mm256_load_ps (same alignment).
 * \param bVector    Divisors; accessed with _mm256_load_ps (same alignment).
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_x2_divide_32f_a_avx(float* cVector,
                             const float* aVector,
                             const float* bVector,
                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        bVal = _mm256_load_ps(bPtr);

        cVal = _mm256_div_ps(aVal, bVal);

        _mm256_store_ps(cPtr, cVal);

        /* Advance to the next group of 8 floats. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*!
 * \brief Element-wise division cVector[i] = aVector[i] / bVector[i] (SSE, aligned).
 *
 * Processes 4 floats per iteration with aligned 128-bit loads/stores, then
 * finishes the remaining (num_points % 4) elements with scalar division.
 * No check is made for zero divisors.
 *
 * \param cVector    Output buffer, at least num_points floats. Accessed with
 *                   _mm_store_ps, so it must meet that intrinsic's
 *                   alignment requirement (16 bytes).
 * \param aVector    Dividends; accessed with _mm_load_ps (same alignment).
 * \param bVector    Divisors; accessed with _mm_load_ps (same alignment).
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_x2_divide_32f_a_sse(float* cVector,
                             const float* aVector,
                             const float* bVector,
                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bVal = _mm_load_ps(bPtr);

        cVal = _mm_div_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal);

        /* Advance to the next group of 4 floats. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_SSE */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Element-wise division cVector[i] ~= aVector[i] / bVector[i] (NEON).
 *
 * Processes 16 floats per iteration. The quotient in the vector path is
 * computed as a * (1/b), where 1/b starts from the vrecpeq_f32 estimate and
 * is refined by two vrecpsq_f32 Newton-Raphson steps; results there are an
 * approximation and may differ from a true division in the last bits. The
 * remaining (num_points % 16) elements use exact scalar division.
 *
 * \param cVector    Output buffer, at least num_points floats.
 * \param aVector    Dividends.
 * \param bVector    Divisors.
 * \param num_points Number of elements to process.
 *
 * NOTE(review): this file references __VOLK_PREFETCH; if prefetch hints
 * belonged in this loop they are not visible here and have not been
 * re-added -- confirm against the upstream kernel.
 */
static inline void
volk_32f_x2_divide_32f_neon(float* cVector,
                            const float* aVector,
                            const float* bVector,
                            unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    float32x4x4_t aVal, bVal, bInv, cVal;

    const unsigned int sixteenthPoints = num_points / 16;
    unsigned int number = 0;
    unsigned int lane;

    for (; number < sixteenthPoints; number++) {
        /* vld4q_f32 de-interleaves 16 floats into four registers. Both inputs
         * are de-interleaved the same way and vst4q_f32 re-interleaves the
         * result, so element order in cVector matches the inputs. */
        aVal = vld4q_f32(aPtr);
        aPtr += 16;
        bVal = vld4q_f32(bPtr);
        bPtr += 16;

        for (lane = 0; lane < 4; lane++) {
            /* Reciprocal estimate followed by two refinement steps. */
            bInv.val[lane] = vrecpeq_f32(bVal.val[lane]);
            bInv.val[lane] = vmulq_f32(bInv.val[lane],
                                       vrecpsq_f32(bInv.val[lane], bVal.val[lane]));
            bInv.val[lane] = vmulq_f32(bInv.val[lane],
                                       vrecpsq_f32(bInv.val[lane], bVal.val[lane]));
            cVal.val[lane] = vmulq_f32(aVal.val[lane], bInv.val[lane]);
        }

        vst4q_f32(cPtr, cVal);
        cPtr += 16;
    }

    /* Scalar tail for the elements that do not fill a whole 16-float group. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_GENERIC

/*!
 * \brief Portable element-wise division cVector[i] = aVector[i] / bVector[i].
 *
 * Plain scalar loop with no alignment requirement and no check for zero
 * divisors.
 *
 * \param cVector    Output buffer, at least num_points floats.
 * \param aVector    Dividends.
 * \param bVector    Divisors.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_x2_divide_32f_generic(float* cVector,
                               const float* aVector,
                               const float* bVector,
                               unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_ORC

/*
 * Implemented outside this header; only declared here.
 * NOTE(review): presumably the ORC-generated kernel -- confirm.
 */
extern void
volk_32f_x2_divide_32f_a_orc_impl(float* cVector,
                                  const float* aVector,
                                  const float* bVector,
                                  unsigned int num_points);

/*!
 * \brief Thin wrapper that forwards all arguments to
 *        volk_32f_x2_divide_32f_a_orc_impl().
 *
 * \param cVector    Output buffer, at least num_points floats.
 * \param aVector    Dividends.
 * \param bVector    Divisors.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_x2_divide_32f_u_orc(float* cVector,
                             const float* aVector,
                             const float* bVector,
                             unsigned int num_points)
{
    volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */

#endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */
#ifndef INCLUDED_volk_32f_x2_divide_32f_u_H
#define INCLUDED_volk_32f_x2_divide_32f_u_H

#include <inttypes.h>

#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief Element-wise division cVector[i] = aVector[i] / bVector[i] (AVX-512F, unaligned).
 *
 * Processes 16 floats per iteration with unaligned 512-bit loads/stores
 * (_mm512_loadu_ps / _mm512_storeu_ps), then finishes the remaining
 * (num_points % 16) elements with scalar division. No check is made for
 * zero divisors.
 *
 * \param cVector    Output buffer, at least num_points floats.
 * \param aVector    Dividends.
 * \param bVector    Divisors.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_x2_divide_32f_u_avx512f(float* cVector,
                                 const float* aVector,
                                 const float* bVector,
                                 unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_loadu_ps(aPtr);
        bVal = _mm512_loadu_ps(bPtr);

        cVal = _mm512_div_ps(aVal, bVal);

        _mm512_storeu_ps(cPtr, cVal);

        /* Advance to the next group of 16 floats; the scalar tail below
         * relies on the pointers sitting just past the vectorised prefix. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX512F */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Element-wise division cVector[i] = aVector[i] / bVector[i] (AVX, unaligned).
 *
 * Processes 8 floats per iteration with unaligned 256-bit loads/stores
 * (_mm256_loadu_ps / _mm256_storeu_ps), then finishes the remaining
 * (num_points % 8) elements with scalar division. No check is made for
 * zero divisors.
 *
 * \param cVector    Output buffer, at least num_points floats.
 * \param aVector    Dividends.
 * \param bVector    Divisors.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_x2_divide_32f_u_avx(float* cVector,
                             const float* aVector,
                             const float* bVector,
                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        bVal = _mm256_loadu_ps(bPtr);

        cVal = _mm256_div_ps(aVal, bVal);

        _mm256_storeu_ps(cPtr, cVal);

        /* Advance to the next group of 8 floats. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX */

#endif /* INCLUDED_volk_32f_x2_divide_32f_u_H */
/*
 * Cross-reference captions carried over from the generated source listing.
 * They are not part of the header; kept here for reference only.
 *
 *   static void volk_32f_x2_divide_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
 *     Definition: volk_32f_x2_divide_32f.h:117
 *   static void volk_32f_x2_divide_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
 *     Definition: volk_32f_x2_divide_32f.h:153
 *   #define __VOLK_PREFETCH(addr)
 *     Definition: volk_common.h:39
 *   static void volk_32f_x2_divide_32f_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
 *     Definition: volk_32f_x2_divide_32f.h:189
 *   static void volk_32f_x2_divide_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
 *     Definition: volk_32f_x2_divide_32f.h:324
 *   static void volk_32f_x2_divide_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
 *     Definition: volk_32f_x2_divide_32f.h:244
 */