#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_a_H

#include <volk/volk_complex.h>
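/*
 * Summary comment (added for this listing): each kernel below takes a single
 * complex reference value src0[0] and a vector of complex points, and writes
 * the squared Euclidean distance
 *
 *   target[i] = |src0[0] - points[i]|^2
 *             = (Re(src0[0]) - Re(points[i]))^2 + (Im(src0[0]) - Im(points[i]))^2
 *
 * for each of the num_points points.
 */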
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32fc_x2_square_dist_32f_a_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points,
                                    unsigned int num_points)
{
  const unsigned int num_bytes = num_points*8;

  __m128 xmm0, xmm9, xmm10;
  __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

  lv_32fc_t diff;
  float sq_dist;
  int bound = num_bytes >> 6;             /* full 8-point iterations */
  int leftovers0 = (num_bytes >> 5) & 1;  /* one group of 4 points left? */
  int leftovers1 = (num_bytes >> 4) & 1;  /* one group of 2 points left? */
  int leftovers2 = (num_bytes >> 3) & 1;  /* one single point left? */
  int i = 0;
  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
  xmm1 = _mm256_setzero_ps();
  xmm2 = _mm256_load_ps((float*)&points[0]);
  xmm0 = _mm_load_ps((float*)src0);
  xmm0 = _mm_permute_ps(xmm0, 0b01000100);
  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
  xmm3 = _mm256_load_ps((float*)&points[4]);
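  /*
   * Added note: xmm1 now holds src0[0] replicated into all four complex slots
   * of the 256-bit register, while xmm2/xmm3 hold the first eight points. In
   * the loop below _mm256_hadd_ps sums each (re^2, im^2) pair, but it
   * interleaves the results of its two inputs within each 128-bit lane, so the
   * _mm256_permutevar8x32_ps with idx restores point order before the store.
   */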
  for(; i < bound; ++i) {
    xmm4 = _mm256_sub_ps(xmm1, xmm2);
    xmm5 = _mm256_sub_ps(xmm1, xmm3);

    points += 8;

    xmm6 = _mm256_mul_ps(xmm4, xmm4);
    xmm7 = _mm256_mul_ps(xmm5, xmm5);

    xmm2 = _mm256_load_ps((float*)&points[0]);

    xmm4 = _mm256_hadd_ps(xmm6, xmm7);
    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

    xmm3 = _mm256_load_ps((float*)&points[4]);

    _mm256_store_ps(target, xmm4);

    target += 8;
  }
  for(i = 0; i < leftovers0; ++i) {
    xmm2 = _mm256_load_ps((float*)&points[0]);

    xmm4 = _mm256_sub_ps(xmm1, xmm2);

    points += 4;

    xmm6 = _mm256_mul_ps(xmm4, xmm4);

    xmm4 = _mm256_hadd_ps(xmm6, xmm6);
    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

    xmm9 = _mm256_extractf128_ps(xmm4, 1);
    _mm_store_ps(target, xmm9);

    target += 4;
  }
  for(i = 0; i < leftovers1; ++i) {
    xmm9 = _mm_load_ps((float*)&points[0]);

    xmm10 = _mm_sub_ps(xmm0, xmm9);

    points += 2;

    xmm9 = _mm_mul_ps(xmm10, xmm10);

    xmm10 = _mm_hadd_ps(xmm9, xmm9);

    _mm_storeh_pi((__m64*)target, xmm10);

    target += 2;
  }
  for(i = 0; i < leftovers2; ++i) {
    diff = src0[0] - points[0];

    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

    target[0] = sq_dist;
  }
}
#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>

static inline void
volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points,
                                    unsigned int num_points)
{
  const unsigned int num_bytes = num_points*8;

  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

  lv_32fc_t diff;
  float sq_dist;
  int bound = num_bytes >> 5;             /* full 4-point iterations */
  int leftovers0 = (num_bytes >> 4) & 1;  /* one group of 2 points left? */
  int leftovers1 = (num_bytes >> 3) & 1;  /* one single point left? */
  int i = 0;
  xmm1 = _mm_setzero_ps();
  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
  xmm2 = _mm_load_ps((float*)&points[0]);
  xmm1 = _mm_movelh_ps(xmm1, xmm1);
  xmm3 = _mm_load_ps((float*)&points[2]);
  for(; i < bound - 1; ++i) {
    xmm4 = _mm_sub_ps(xmm1, xmm2);
    xmm5 = _mm_sub_ps(xmm1, xmm3);

    points += 4;

    xmm6 = _mm_mul_ps(xmm4, xmm4);
    xmm7 = _mm_mul_ps(xmm5, xmm5);

    xmm2 = _mm_load_ps((float*)&points[0]);

    xmm4 = _mm_hadd_ps(xmm6, xmm7);

    xmm3 = _mm_load_ps((float*)&points[2]);

    _mm_store_ps(target, xmm4);

    target += 4;
  }
  /* Final iteration, peeled out of the loop above so the look-ahead loads of
     the next four points do not run past the end of the points buffer. */
  xmm4 = _mm_sub_ps(xmm1, xmm2);
  xmm5 = _mm_sub_ps(xmm1, xmm3);

  points += 4;

  xmm6 = _mm_mul_ps(xmm4, xmm4);
  xmm7 = _mm_mul_ps(xmm5, xmm5);

  xmm4 = _mm_hadd_ps(xmm6, xmm7);

  _mm_store_ps(target, xmm4);

  target += 4;
  for(i = 0; i < leftovers0; ++i) {
    xmm2 = _mm_load_ps((float*)&points[0]);

    xmm4 = _mm_sub_ps(xmm1, xmm2);

    points += 2;

    xmm6 = _mm_mul_ps(xmm4, xmm4);

    xmm4 = _mm_hadd_ps(xmm6, xmm6);

    _mm_storeh_pi((__m64*)target, xmm4);

    target += 2;
  }
  for(i = 0; i < leftovers1; ++i) {
    diff = src0[0] - points[0];

    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

    target[0] = sq_dist;
  }
}
#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32fc_x2_square_dist_32f_neon(float* target, lv_32fc_t* src0, lv_32fc_t* points,
                                  unsigned int num_points)
{
  const unsigned int quarter_points = num_points / 4;
  unsigned int number;
  float32x4x2_t a_vec, b_vec;
  float32x4x2_t diff_vec;
  float32x4_t tmp, tmp1, dist_sq;

  a_vec.val[0] = vdupq_n_f32( lv_creal(src0[0]) );
  a_vec.val[1] = vdupq_n_f32( lv_cimag(src0[0]) );
  for(number = 0; number < quarter_points; ++number) {
    /* de-interleaved load: val[0] holds real parts, val[1] imaginary parts */
    b_vec = vld2q_f32((float*)points);
    diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
    diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
    tmp  = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
    tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);

    dist_sq = vaddq_f32(tmp, tmp1);
    vst1q_f32(target, dist_sq);

    points += 4;
    target += 4;
  }
  for(number = quarter_points * 4; number < num_points; ++number) {
    lv_32fc_t diff = src0[0] - *points++;
    *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
  }
}
#endif /*LV_HAVE_NEON*/
#ifdef LV_HAVE_GENERIC

static inline void
volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points,
                                     unsigned int num_points)
{
  const unsigned int num_bytes = num_points*8;

  lv_32fc_t diff;
  float sq_dist;
  unsigned int i = 0;

  for(; i < num_bytes >> 3; ++i) {
    diff = src0[0] - points[i];

    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

    target[i] = sq_dist;
  }
}
#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/
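/*
 * Minimal usage sketch (not part of the original header): these kernels are
 * normally reached through the VOLK dispatcher, which selects the best aligned
 * or unaligned implementation at runtime. Buffer sizes and values here are
 * illustrative only.
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int n    = 1024;
 *   lv_32fc_t* points = (lv_32fc_t*)volk_malloc(n * sizeof(lv_32fc_t), volk_get_alignment());
 *   lv_32fc_t* src0   = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t), volk_get_alignment());
 *   float* target     = (float*)volk_malloc(n * sizeof(float), volk_get_alignment());
 *
 *   // ... fill src0[0] and points[0..n-1] ...
 *
 *   volk_32fc_x2_square_dist_32f(target, src0, points, n);
 *
 *   volk_free(points); volk_free(src0); volk_free(target);
 */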
#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_u_H

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32fc_x2_square_dist_32f_u_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points,
                                    unsigned int num_points)
{
  const unsigned int num_bytes = num_points*8;

  __m128 xmm0, xmm9;
  __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

  lv_32fc_t diff;
  float sq_dist;
  int bound = num_bytes >> 6;              /* full 8-point iterations */
  int leftovers0 = (num_bytes >> 5) & 1;   /* one group of 4 points left? */
  int leftovers1 = (num_bytes >> 3) & 0b11; /* 0..3 single points left */
  int i = 0;
  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
  xmm1 = _mm256_setzero_ps();
  xmm2 = _mm256_loadu_ps((float*)&points[0]);
  xmm0 = _mm_loadu_ps((float*)src0);
  xmm0 = _mm_permute_ps(xmm0, 0b01000100);
  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
  xmm3 = _mm256_loadu_ps((float*)&points[4]);
  for(; i < bound; ++i) {
    xmm4 = _mm256_sub_ps(xmm1, xmm2);
    xmm5 = _mm256_sub_ps(xmm1, xmm3);

    points += 8;

    xmm6 = _mm256_mul_ps(xmm4, xmm4);
    xmm7 = _mm256_mul_ps(xmm5, xmm5);

    xmm2 = _mm256_loadu_ps((float*)&points[0]);

    xmm4 = _mm256_hadd_ps(xmm6, xmm7);
    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

    xmm3 = _mm256_loadu_ps((float*)&points[4]);

    _mm256_storeu_ps(target, xmm4);

    target += 8;
  }
  for(i = 0; i < leftovers0; ++i) {
    xmm2 = _mm256_loadu_ps((float*)&points[0]);

    xmm4 = _mm256_sub_ps(xmm1, xmm2);

    points += 4;

    xmm6 = _mm256_mul_ps(xmm4, xmm4);

    xmm4 = _mm256_hadd_ps(xmm6, xmm6);
    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

    xmm9 = _mm256_extractf128_ps(xmm4, 1);
    _mm_storeu_ps(target, xmm9);

    target += 4;
  }
  for(i = 0; i < leftovers1; ++i) {
    diff = src0[0] - points[0];

    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

    target[0] = sq_dist;

    points += 1;
    target += 1;
  }
}
#endif /*LV_HAVE_AVX2*/

#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_u_H*/
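/*
 * Note: the _a_ kernels above assume target and points are aligned to
 * volk_get_alignment() (they use aligned loads/stores), while the _u_ kernels
 * use unaligned loads/stores and accept arbitrarily aligned pointers.
 */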