72 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
73 #define INCLUDED_volk_32fc_x2_divide_32fc_u_H
80 #include <pmmintrin.h>
// Unaligned SSE variant (uses _mm_loadu_ps / _mm_storeu_ps, which have no alignment requirement).
// NOTE(review): the function name, the other parameters and several statements are not
// visible in this excerpt; comments below describe only the lines that are shown.
// num_points bounds both loops below.
86 unsigned int num_points)
// Single counter shared by the vector loop and the leftover loop.
94 unsigned int number = 0;
// Vector-loop trip count; integer division leaves num_points % 4 elements for the leftover loop.
95 const unsigned int quarterPoints = num_points / 4;
97 __m128 num01, num23, den01, den23, norm, result;
102 for (; number < quarterPoints; number++) {
// Load four floats from a (numerator) and four from b (denominator).
103 num01 = _mm_loadu_ps((
float*)a);
104 den01 = _mm_loadu_ps((
float*)b);
// Second group of four floats from each input. Presumably a and b are advanced between
// the two load pairs (not visible here) -- otherwise the same data would be re-read.
109 num23 = _mm_loadu_ps((
float*)a);
110 den23 = _mm_loadu_ps((
float*)b);
// norm is assigned on a line outside this excerpt (presumably the squared magnitudes of the
// denominators -- TODO confirm). den01/den23 are reused as divisors from here on:
// unpacklo(norm, norm) -> {n0, n0, n1, n1}, unpackhi(norm, norm) -> {n2, n2, n3, n3},
// i.e. each norm lane is duplicated across two adjacent float lanes.
116 den01 = _mm_unpacklo_ps(norm, norm);
117 den23 = _mm_unpackhi_ps(norm, norm);
// Lane-wise division and unaligned store of the first four floats of the result.
119 result = _mm_div_ps(num01, den01);
120 _mm_storeu_ps((
float*)c, result);
// Same for the second four floats; presumably c is advanced in between (not visible here).
122 result = _mm_div_ps(num23, den23);
123 _mm_storeu_ps((
float*)c, result);
// Leftover loop up to num_points. NOTE(review): its body, and any rescaling of number from
// "vector iterations" to "elements" before this loop, are not visible in this excerpt.
128 for (; number < num_points; number++) {
139 #include <immintrin.h>
// Unaligned AVX variant (uses _mm256_loadu_ps / _mm256_storeu_ps, no alignment requirement).
// NOTE(review): the function name, the other parameters and several statements (including
// the arguments of the calls that end in an open parenthesis) are not visible in this excerpt.
// num_points bounds both loops below.
145 unsigned int num_points)
153 unsigned int number = 0;
// Vector-loop trip count; each 256-bit load covers eight floats.
154 const unsigned int quarterPoints = num_points / 4;
156 __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
161 for (; number < quarterPoints; number++) {
// Unaligned 256-bit loads of numerator and denominator (source operands not shown).
162 num = _mm256_loadu_ps(
164 denum = _mm256_loadu_ps(
// Lane-wise square of the denominator floats.
167 sq = _mm256_mul_ps(denum, denum);
// Horizontal add of adjacent pairs (operands not shown; presumably sq with itself, giving
// the sum of squares of each adjacent float pair -- TODO confirm).
168 mag_sq_un = _mm256_hadd_ps(
// 0xd8 selects elements 0,2,1,3 within each 128-bit lane, reordering the hadd output.
170 mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8);
// mul_conj is assigned on a line outside this excerpt (presumably numerator times the
// conjugated denominator -- TODO confirm). Lane-wise divide by mag_sq.
173 div = _mm256_div_ps(mul_conj, mag_sq);
// Unaligned store of eight result floats.
175 _mm256_storeu_ps((
float*)c, div);
// Convert the counter from vector iterations to elements already processed.
182 number = quarterPoints * 4;
// Scalar leftover loop: divide element by element, advancing all three pointers.
184 for (; number < num_points; number++) {
185 *c++ = (*a++) / (*b++);
191 #ifdef LV_HAVE_GENERIC
// Portable scalar variant: element-wise division over num_points elements.
// NOTE(review): the function name, the other parameters and the declarations of
// cPtr/aPtr/bPtr are not visible in this excerpt.
196 unsigned int num_points)
// Initialised here and again in the for statement below.
201 unsigned int number = 0;
// Each iteration writes one quotient through cPtr and post-increments all three pointers.
203 for (number = 0; number < num_points; number++) {
204 *cPtr++ = (*aPtr++) / (*bPtr++);
213 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
214 #define INCLUDED_volk_32fc_x2_divide_32fc_a_H
217 #include <inttypes.h>
222 #include <pmmintrin.h>
// Aligned SSE variant: _mm_load_ps / _mm_store_ps require 16-byte-aligned addresses.
// NOTE(review): the function name, the other parameters and several statements are not
// visible in this excerpt; comments below describe only the lines that are shown.
// num_points bounds both loops below.
228 unsigned int num_points)
// Single counter shared by the vector loop and the leftover loop.
236 unsigned int number = 0;
// Vector-loop trip count; integer division leaves num_points % 4 elements for the leftover loop.
237 const unsigned int quarterPoints = num_points / 4;
239 __m128 num01, num23, den01, den23, norm, result;
244 for (; number < quarterPoints; number++) {
// Aligned loads of four floats from a (numerator) and four from b (denominator).
245 num01 = _mm_load_ps((
float*)a);
246 den01 = _mm_load_ps((
float*)b);
// Second group of four floats from each input. Presumably a and b are advanced between
// the two load pairs (not visible here) -- otherwise the same data would be re-read.
251 num23 = _mm_load_ps((
float*)a);
252 den23 = _mm_load_ps((
float*)b);
// norm is assigned on a line outside this excerpt (presumably the squared magnitudes of the
// denominators -- TODO confirm). den01/den23 are reused as divisors from here on:
// unpacklo(norm, norm) -> {n0, n0, n1, n1}, unpackhi(norm, norm) -> {n2, n2, n3, n3}.
259 den01 = _mm_unpacklo_ps(norm, norm);
260 den23 = _mm_unpackhi_ps(norm, norm);
// Lane-wise division and aligned store of the first four floats of the result.
262 result = _mm_div_ps(num01, den01);
263 _mm_store_ps((
float*)c, result);
// Same for the second four floats; presumably c is advanced in between (not visible here).
265 result = _mm_div_ps(num23, den23);
266 _mm_store_ps((
float*)c, result);
// Leftover loop up to num_points. NOTE(review): its body, and any rescaling of number from
// "vector iterations" to "elements" before this loop, are not visible in this excerpt.
271 for (; number < num_points; number++) {
281 #include <immintrin.h>
// Aligned AVX variant: _mm256_load_ps / _mm256_store_ps require 32-byte-aligned addresses.
// NOTE(review): the function name, the other parameters and several statements are not
// visible in this excerpt; comments below describe only the lines that are shown.
// num_points bounds both loops below.
287 unsigned int num_points)
295 unsigned int number = 0;
// Vector-loop trip count; each 256-bit load covers eight floats.
296 const unsigned int quarterPoints = num_points / 4;
298 __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
303 for (; number < quarterPoints; number++) {
// Aligned 256-bit load from a; the assignment target is on the preceding line, which is
// not visible here (presumably num).
305 _mm256_load_ps((
float*)a);
// Aligned 256-bit load from b; target likewise not visible (presumably denum).
307 _mm256_load_ps((
float*)b);
// Lane-wise square of the denominator floats.
309 sq = _mm256_mul_ps(denum, denum);
// Horizontal add of adjacent pairs (operands not shown; presumably sq with itself, giving
// the sum of squares of each adjacent float pair -- TODO confirm).
310 mag_sq_un = _mm256_hadd_ps(
// 0xd8 selects elements 0,2,1,3 within each 128-bit lane, reordering the hadd output.
312 mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8);
// mul_conj is assigned on a line outside this excerpt (presumably numerator times the
// conjugated denominator -- TODO confirm). Lane-wise divide by mag_sq.
315 div = _mm256_div_ps(mul_conj, mag_sq);
// Aligned store of eight result floats.
317 _mm256_store_ps((
float*)c, div);
// Convert the counter from vector iterations to elements already processed.
324 number = quarterPoints * 4;
// Scalar leftover loop: divide element by element, advancing all three pointers.
326 for (; number < num_points; number++) {
327 *c++ = (*a++) / (*b++);
333 #include <arm_neon.h>
// NEON variant: processes eight floats per input per iteration using de-interleaving loads,
// and replaces the division by a multiplication with a refined reciprocal estimate.
// NOTE(review): the function name, the other parameters, the declarations of
// aPtr/bPtr/cPtr and their advancement inside the loop are not visible in this excerpt.
338 unsigned int num_points)
// val[0] / val[1] hold the even- / odd-indexed floats of each de-interleaved load
// (presumably real / imaginary parts of interleaved complex samples -- TODO confirm).
344 float32x4x2_t aVal, bVal, cVal;
345 float32x4_t bAbs, bAbsInv;
// Vector-loop trip count; the leftover loop below handles the remaining num_points % 4.
347 const unsigned int quarterPoints = num_points / 4;
348 unsigned int number = 0;
349 for (; number < quarterPoints; number++) {
// vld2q_f32 reads eight floats and splits them into even-indexed and odd-indexed lanes.
350 aVal = vld2q_f32((
const float*)(aPtr));
351 bVal = vld2q_f32((
const float*)(bPtr));
// bAbs = b0*b0 + b1*b1 per lane (sum of squares of the two denominator components).
357 bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
358 bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
// Approximate 1/bAbs: start from the hardware estimate, then apply two Newton-Raphson
// refinement steps (vrecpsq_f32(x, d) yields 2 - x*d, so x * (2 - x*d) refines x).
// The result is an approximation, not an exactly rounded division.
360 bAbsInv = vrecpeq_f32(bAbs);
361 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
362 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
// First output component: (a0*b0 + a1*b1) * bAbsInv.
364 cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
365 cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
366 cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
// Second output component: (a1*b0 - a0*b1) * bAbsInv (vmlsq_f32 is multiply-subtract).
368 cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
369 cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
370 cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
// vst2q_f32 re-interleaves the two component vectors and writes eight floats.
372 vst2q_f32((
float*)(cPtr), cVal);
// Scalar leftover loop for the elements not covered by the vector loop; uses the plain
// division operator, advancing all three pointers.
376 for (number = quarterPoints * 4; number < num_points; number++) {
377 *cPtr++ = (*aPtr++) / (*bPtr++);
383 #ifdef LV_HAVE_GENERIC
// Portable scalar variant: element-wise division over num_points elements.
// NOTE(review): the function name, the other parameters and the declarations of
// cPtr/aPtr/bPtr are not visible in this excerpt.
388 unsigned int num_points)
// Initialised here and again in the for statement below.
393 unsigned int number = 0;
// Each iteration writes one quotient through cPtr and post-increments all three pointers.
395 for (number = 0; number < num_points; number++) {
396 *cPtr++ = (*aPtr++) / (*bPtr++);