#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H

#include <volk/volk_complex.h>
#include <string.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
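    /* Computes target[i] = scalar * |src0[0] - points[i]|^2 for each of the
     * num_points complex points (see the generic kernel below for the plain-C
     * definition). As an aligned (_a_) variant, the vector paths assume
     * 32-byte-aligned buffers; note the setup below also preloads the first
     * eight points, so very short inputs only exercise the scalar tails. */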
    const unsigned int num_bytes = num_points * 8;

    __m128 xmm0, xmm9, xmm10, xmm11;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

    lv_32fc_t diff;
    memset(&diff, 0x0, 2 * sizeof(float));

    float sq_dist = 0.0;
    int bound = num_bytes >> 6;
    int leftovers0 = (num_bytes >> 5) & 1;
    int leftovers1 = (num_bytes >> 4) & 1;
    int leftovers2 = (num_bytes >> 3) & 1;
    int i = 0;
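    /* Each complex point occupies 8 bytes, so bound counts full 8-point
     * (64-byte) chunks; the leftover flags then peel off a remaining 4-point,
     * 2-point, and 1-point chunk in turn. */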
    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    xmm1 = _mm256_setzero_ps();
    xmm2 = _mm256_load_ps((float*)&points[0]);
    xmm8 = _mm256_set1_ps(scalar);
    xmm11 = _mm256_extractf128_ps(xmm8, 1); // scalar broadcast for the 2-point tail
    xmm0 = _mm_load_ps((float*)src0);
    xmm0 = _mm_permute_ps(xmm0, 0b01000100); // duplicate src0[0] into both complex slots
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1); // src0[0] now fills all four complex slots
    xmm3 = _mm256_load_ps((float*)&points[4]);
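    /* Main loop: 8 points per iteration. _mm256_hadd_ps sums real^2 + imag^2
     * pairwise but interleaves its two inputs per 128-bit lane, yielding the
     * order (d0 d1 d4 d5 | d2 d3 d6 d7); the permutevar8x32 with idx restores
     * ascending order before scaling and storing. */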
    for (; i < bound; ++i) {
        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);

        points += 8;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

        xmm2 = _mm256_load_ps((float*)&points[0]); // preload next chunk (reads past the last chunk on the final pass)

        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm3 = _mm256_load_ps((float*)&points[4]);

        xmm4 = _mm256_mul_ps(xmm4, xmm8);

        _mm256_store_ps(target, xmm4);

        target += 8;
    }
    for (i = 0; i < leftovers0; ++i) {
        xmm2 = _mm256_load_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm4 = _mm256_mul_ps(xmm4, xmm8);

        xmm9 = _mm256_extractf128_ps(xmm4, 1); // the four ordered results land in the upper lane
        _mm_store_ps(target, xmm9);

        target += 4;
    }
    for (i = 0; i < leftovers1; ++i) {
        xmm9 = _mm_load_ps((float*)&points[0]);

        xmm10 = _mm_sub_ps(xmm0, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        xmm10 = _mm_mul_ps(xmm10, xmm11);

        _mm_storeh_pi((__m64*)target, xmm10); // the two results sit in the high half

        target += 2;
    }
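    /* Any final odd point falls through to plain C, mirroring the generic
     * kernel below. */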
    for (i = 0; i < leftovers2; ++i) {
        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist * scalar;
    }
}

#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float* target,
                                                    lv_32fc_t* src0,
                                                    lv_32fc_t* points,
                                                    float scalar,
                                                    unsigned int num_points)
{
    static const unsigned int work_size = 8;
    unsigned int avx_work_size = num_points / work_size * work_size;
    unsigned int i = 0;

    for (; i < avx_work_size; i += work_size) {
        const float src_real = lv_creal(*src0);
        const float src_imag = lv_cimag(*src0);
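        /* Broadcast src0[0] as interleaved (re, im) pairs so one register
         * matches the memory layout of four complex points. */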
        __m256 source = _mm256_setr_ps(
            src_real, src_imag, src_real, src_imag, src_real, src_imag, src_real, src_imag);
        __m256 points_low = _mm256_load_ps((const float*)points);
        __m256 points_high = _mm256_load_ps((const float*)(points + work_size / 2));
        __m256 difference_low = _mm256_sub_ps(source, points_low);
        __m256 difference_high = _mm256_sub_ps(source, points_high);

        difference_low = _mm256_mul_ps(difference_low, difference_low);
        difference_high = _mm256_mul_ps(difference_high, difference_high);

        __m256 magnitudes_squared = _mm256_hadd_ps(difference_low, difference_high);
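        /* hadd interleaves per 128-bit lane: the lower lane holds results
         * 0,1,4,5 and the upper lane holds 2,3,6,7. Lacking AVX2's cross-lane
         * permute, this variant restores ascending order with two in-lane
         * shuffles plus a blend. */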
        __m128 lower_magnitudes_squared_bottom = _mm256_extractf128_ps(magnitudes_squared, 0);
        __m128 upper_magnitudes_squared_top = _mm256_extractf128_ps(magnitudes_squared, 1);
        __m256 lower_magnitudes_squared = _mm256_castps128_ps256(lower_magnitudes_squared_bottom);
        lower_magnitudes_squared = _mm256_insertf128_ps(
            lower_magnitudes_squared, _mm_permute_ps(lower_magnitudes_squared_bottom, 0x4E), 1);
        __m256 upper_magnitudes_squared = _mm256_castps128_ps256(upper_magnitudes_squared_top);
        upper_magnitudes_squared =
            _mm256_insertf128_ps(upper_magnitudes_squared, upper_magnitudes_squared_top, 1);
        upper_magnitudes_squared_top = _mm_permute_ps(upper_magnitudes_squared_top, 0x4E);
        upper_magnitudes_squared =
            _mm256_insertf128_ps(upper_magnitudes_squared, upper_magnitudes_squared_top, 0);

        __m256 ordered_magnitudes_squared =
            _mm256_blend_ps(lower_magnitudes_squared, upper_magnitudes_squared, 0xCC);
        __m256 scalars = _mm256_set1_ps(scalar);
        __m256 output = _mm256_mul_ps(ordered_magnitudes_squared, scalars);

        _mm256_store_ps(target, output);

        target += work_size;
        points += work_size;
    }
    /* Scalar tail for the remaining 0-7 points, matching the generic kernel. */
    for (; i < num_points; ++i) {
        const lv_32fc_t diff = src0[0] - *points++;
        *target++ = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
    }
}

#endif /*LV_HAVE_AVX*/
#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;

    __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

    lv_32fc_t diff;
    memset(&diff, 0x0, 2 * sizeof(float));

    float sq_dist = 0.0;
    int bound = num_bytes >> 5;            // full 4-point (32-byte) chunks
    int leftovers0 = (num_bytes >> 4) & 1; // one remaining 2-point chunk
    int leftovers1 = (num_bytes >> 3) & 1; // one remaining single point
    int i = 0;
    xmm1 = _mm_setzero_ps();
    xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
    xmm2 = _mm_load_ps((float*)&points[0]);
    xmm8 = _mm_load1_ps(&scalar);
    xmm1 = _mm_movelh_ps(xmm1, xmm1); // duplicate src0[0] into both complex slots
    xmm3 = _mm_load_ps((float*)&points[2]);
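    /* Main loop: 4 points per iteration, with the last iteration peeled off
     * below so it does not preload past the end of points. The next chunk's
     * loads are issued between the arithmetic to hide load latency. */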
    for (; i < bound - 1; ++i) {
        xmm4 = _mm_sub_ps(xmm1, xmm2);
        xmm5 = _mm_sub_ps(xmm1, xmm3);

        points += 4;

        xmm6 = _mm_mul_ps(xmm4, xmm4);
        xmm7 = _mm_mul_ps(xmm5, xmm5);

        xmm2 = _mm_load_ps((float*)&points[0]);

        xmm4 = _mm_hadd_ps(xmm6, xmm7);

        xmm3 = _mm_load_ps((float*)&points[2]);

        xmm4 = _mm_mul_ps(xmm4, xmm8);

        _mm_store_ps(target, xmm4);

        target += 4;
    }
    if (bound > 0) { // guard the peeled iteration so num_points < 4 does not store out of bounds
        xmm4 = _mm_sub_ps(xmm1, xmm2);
        xmm5 = _mm_sub_ps(xmm1, xmm3);

        points += 4;

        xmm6 = _mm_mul_ps(xmm4, xmm4);
        xmm7 = _mm_mul_ps(xmm5, xmm5);

        xmm4 = _mm_hadd_ps(xmm6, xmm7);

        xmm4 = _mm_mul_ps(xmm4, xmm8);

        _mm_store_ps(target, xmm4);

        target += 4;
    }
    for (i = 0; i < leftovers0; ++i) {
        xmm2 = _mm_load_ps((float*)&points[0]);

        xmm4 = _mm_sub_ps(xmm1, xmm2);

        points += 2;

        xmm6 = _mm_mul_ps(xmm4, xmm4);

        xmm4 = _mm_hadd_ps(xmm6, xmm6);

        xmm4 = _mm_mul_ps(xmm4, xmm8);

        _mm_storeh_pi((__m64*)target, xmm4); // the two results sit in the high half

        target += 2;
    }
    for (i = 0; i < leftovers1; ++i) {
        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist * scalar;
    }
}

#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_GENERIC
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target,
                                                      lv_32fc_t* src0,
                                                      lv_32fc_t* points,
                                                      float scalar,
                                                      unsigned int num_points)
{
    lv_32fc_t diff;
    float sq_dist;
    unsigned int i = 0;

    for (; i < num_points; ++i) {
        diff = src0[0] - points[i];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[i] = sq_dist * scalar;
    }
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/
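/* Usage sketch (illustrative, not part of the kernel): computing the scaled
 * squared distance from one symbol to a small hypothetical constellation with
 * the generic implementation. The buffer names and sizes here are made up for
 * the example.
 *
 *   lv_32fc_t symbol = lv_cmake(1.0f, 1.0f);
 *   lv_32fc_t constellation[4] = { lv_cmake(-1.0f, -1.0f), lv_cmake(-1.0f, 1.0f),
 *                                  lv_cmake(1.0f, -1.0f), lv_cmake(1.0f, 1.0f) };
 *   float dists[4];
 *   volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(dists, &symbol,
 *                                                         constellation, 0.5f, 4);
 *   // dists[i] == 0.5f * |symbol - constellation[i]|^2
 */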
#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H

#include <volk/volk_complex.h>
#include <string.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;

    __m128 xmm0, xmm9;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

    lv_32fc_t diff;
    memset(&diff, 0x0, 2 * sizeof(float));

    float sq_dist = 0.0;
    int bound = num_bytes >> 6;
    int leftovers0 = (num_bytes >> 5) & 1;
    int leftovers1 = (num_bytes >> 3) & 0b11; // up to three remaining points
    int i = 0;
    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    xmm1 = _mm256_setzero_ps();
    xmm2 = _mm256_loadu_ps((float*)&points[0]);
    xmm8 = _mm256_set1_ps(scalar);
    xmm0 = _mm_loadu_ps((float*)src0);
    xmm0 = _mm_permute_ps(xmm0, 0b01000100); // duplicate src0[0] into both complex slots
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
    xmm3 = _mm256_loadu_ps((float*)&points[4]);
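    /* Same structure as the aligned AVX2 kernel above, but with unaligned
     * loads/stores and a single scalar tail loop covering up to three
     * remaining points. */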
    for (; i < bound; ++i) {
        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);

        points += 8;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

        xmm2 = _mm256_loadu_ps((float*)&points[0]);

        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm3 = _mm256_loadu_ps((float*)&points[4]);

        xmm4 = _mm256_mul_ps(xmm4, xmm8);

        _mm256_storeu_ps(target, xmm4);

        target += 8;
    }
    for (i = 0; i < leftovers0; ++i) {
        xmm2 = _mm256_loadu_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm4 = _mm256_mul_ps(xmm4, xmm8);

        xmm9 = _mm256_extractf128_ps(xmm4, 1);
        _mm_storeu_ps(target, xmm9);

        target += 4;
    }
    for (i = 0; i < leftovers1; ++i) {
        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist * scalar;

        points += 1;
        target += 1;
    }
}

#endif /*LV_HAVE_AVX2*/

#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H*/