volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

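/*
 * For a single reference point *src0, each kernel in this file writes, for every
 * entry of the points vector, the squared Euclidean distance to that reference
 * scaled by the given scalar:
 *
 *   target[i] = scalar * ((re(src0) - re(points[i]))^2 + (im(src0) - im(points[i]))^2)
 *
 * A minimal usage sketch of the plain-C generic implementation below; the buffer
 * sizes and sample values are illustrative only, and lv_cmake is assumed to come
 * from volk/volk_complex.h alongside lv_creal/lv_cimag:
 *
 *   unsigned int num_points = 16;
 *   lv_32fc_t src0 = lv_cmake(0.5f, 0.5f);
 *   lv_32fc_t points[16];
 *   float target[16];
 *   unsigned int n;
 *   for (n = 0; n < num_points; ++n)
 *     points[n] = lv_cmake((float)n, (float)-n);
 *   volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(target, &src0, points, 2.0f, num_points);
 */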
#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
#include <string.h>


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points,
                                                     float scalar, unsigned int num_points)
{
  const unsigned int num_bytes = num_points*8;
  __m128 xmm0, xmm9, xmm10, xmm11;
  __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

  lv_32fc_t diff;
  memset(&diff, 0x0, 2*sizeof(float));

  float sq_dist = 0.0;
  int bound = num_bytes >> 6;
  int leftovers0 = (num_bytes >> 5) & 1;
  int leftovers1 = (num_bytes >> 4) & 1;
  int leftovers2 = (num_bytes >> 3) & 1;
  int i = 0;

  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
  xmm1 = _mm256_setzero_ps();
  xmm2 = _mm256_load_ps((float*)&points[0]);
  xmm8 = _mm256_set1_ps(scalar);
  xmm11 = _mm256_extractf128_ps(xmm8,1);
  xmm0 = _mm_load_ps((float*)src0);
  xmm0 = _mm_permute_ps(xmm0, 0b01000100);
  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
  xmm3 = _mm256_load_ps((float*)&points[4]);

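  /*
   * Each pass of the main loop below handles eight complex points. After the
   * squared differences are summed pairwise by _mm256_hadd_ps, the eight
   * distances sit in the cross-lane order [d0 d1 d4 d5 | d2 d3 d6 d7], so the
   * permutevar8x32 index (7,6,3,2,5,4,1,0) restores ascending order before the
   * store. The smaller loops that follow mop up blocks of four, two and one
   * remaining points.
   */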
  for(; i < bound; ++i) {
    xmm4 = _mm256_sub_ps(xmm1, xmm2);
    xmm5 = _mm256_sub_ps(xmm1, xmm3);
    points += 8;
    xmm6 = _mm256_mul_ps(xmm4, xmm4);
    xmm7 = _mm256_mul_ps(xmm5, xmm5);

    xmm2 = _mm256_load_ps((float*)&points[0]);

    xmm4 = _mm256_hadd_ps(xmm6, xmm7);
    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

    xmm3 = _mm256_load_ps((float*)&points[4]);

    xmm4 = _mm256_mul_ps(xmm4, xmm8);

    _mm256_store_ps(target, xmm4);

    target += 8;
  }

  for(i = 0; i < leftovers0; ++i) {
    xmm2 = _mm256_load_ps((float*)&points[0]);

    xmm4 = _mm256_sub_ps(xmm1, xmm2);

    points += 4;

    xmm6 = _mm256_mul_ps(xmm4, xmm4);

    xmm4 = _mm256_hadd_ps(xmm6, xmm6);
    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

    xmm4 = _mm256_mul_ps(xmm4, xmm8);

    xmm9 = _mm256_extractf128_ps(xmm4,1);
    _mm_store_ps(target,xmm9);

    target += 4;
  }

  for(i = 0; i < leftovers1; ++i) {
    xmm9 = _mm_load_ps((float*)&points[0]);

    xmm10 = _mm_sub_ps(xmm0, xmm9);

    points += 2;

    xmm9 = _mm_mul_ps(xmm10, xmm10);

    xmm10 = _mm_hadd_ps(xmm9, xmm9);

    xmm10 = _mm_mul_ps(xmm10, xmm11);

    _mm_storeh_pi((__m64*)target, xmm10);

    target += 2;
  }

  for(i = 0; i < leftovers2; ++i) {

    diff = src0[0] - points[0];

    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));

    target[0] = sq_dist;
  }
}

#endif /*LV_HAVE_AVX2*/


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(
    float *target, lv_32fc_t *src0, lv_32fc_t *points,
    float scalar, unsigned int num_points) {
  static const unsigned int work_size = 8;
  unsigned int avx_work_size = num_points / work_size * work_size;
  unsigned int i = 0;

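  /*
   * Each pass below handles eight complex points. _mm256_hadd_ps leaves the
   * eight squared magnitudes in the cross-lane order [d0 d1 d4 d5 | d2 d3 d6 d7];
   * the extract/permute/blend sequence that follows rebuilds [d0..d3 | d4..d7]
   * before scaling and storing. Any remaining points are handled scalar-wise.
   */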
  for (; i < avx_work_size; i += work_size) {
    lv_32fc_t src = *src0;
    float src_real = lv_creal(src);
    float src_imag = lv_cimag(src);
    __m256 source = _mm256_setr_ps(src_real, src_imag, src_real, src_imag,
                                   src_real, src_imag, src_real, src_imag);
    __m256 points_low = _mm256_load_ps((const float *) points);
    __m256 points_high = _mm256_load_ps((const float *) (points + work_size / 2));
    __m256 difference_low = _mm256_sub_ps(source, points_low);
    __m256 difference_high = _mm256_sub_ps(source, points_high);

    difference_low = _mm256_mul_ps(difference_low, difference_low);
    difference_high = _mm256_mul_ps(difference_high, difference_high);

    __m256 magnitudes_squared = _mm256_hadd_ps(difference_low, difference_high);
    __m128 lower_magnitudes_squared_bottom = _mm256_extractf128_ps(magnitudes_squared, 0);
    __m128 upper_magnitudes_squared_top = _mm256_extractf128_ps(magnitudes_squared, 1);
    __m256 lower_magnitudes_squared = _mm256_castps128_ps256(lower_magnitudes_squared_bottom);

    lower_magnitudes_squared = _mm256_insertf128_ps(
        lower_magnitudes_squared, _mm_permute_ps(lower_magnitudes_squared_bottom, 0x4E), 1
    );

    __m256 upper_magnitudes_squared = _mm256_castps128_ps256(upper_magnitudes_squared_top);

    upper_magnitudes_squared = _mm256_insertf128_ps(upper_magnitudes_squared, upper_magnitudes_squared_top, 1);
    upper_magnitudes_squared_top = _mm_permute_ps(upper_magnitudes_squared_top, 0x4E);
    upper_magnitudes_squared = _mm256_insertf128_ps(upper_magnitudes_squared, upper_magnitudes_squared_top, 0);

    __m256 ordered_magnitudes_squared = _mm256_blend_ps(lower_magnitudes_squared, upper_magnitudes_squared, 0xCC);
    __m256 scalars = _mm256_set1_ps(scalar);
    __m256 output = _mm256_mul_ps(ordered_magnitudes_squared, scalars);

    _mm256_store_ps(target, output);
    target += work_size;
    points += work_size;
  }
  for (; i < num_points; ++i) {
    lv_32fc_t diff = src0[0] - *points;

    *target = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
    ++target;
    ++points;
  }
}

#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
#include <pmmintrin.h>

static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points,
                                                     float scalar, unsigned int num_points)
{
  const unsigned int num_bytes = num_points*8;

  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

  lv_32fc_t diff;
  memset(&diff, 0x0, 2*sizeof(float));

  float sq_dist = 0.0;
  int bound = num_bytes >> 5;
  int leftovers0 = (num_bytes >> 4) & 1;
  int leftovers1 = (num_bytes >> 3) & 1;
  int i = 0;

  xmm1 = _mm_setzero_ps();
  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
  xmm2 = _mm_load_ps((float*)&points[0]);
  xmm8 = _mm_load1_ps(&scalar);
  xmm1 = _mm_movelh_ps(xmm1, xmm1);
  xmm3 = _mm_load_ps((float*)&points[2]);

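  /*
   * xmm1 now holds the reference point duplicated into both complex slots, so
   * each pass of the main loop below handles four points (two per register);
   * _mm_hadd_ps collapses the four squared components of each register directly
   * into two distances, already in order. The loop runs bound - 1 times and is
   * followed by one unrolled final pass, then the two- and one-point leftovers.
   */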
  for(; i < bound - 1; ++i) {
    xmm4 = _mm_sub_ps(xmm1, xmm2);
    xmm5 = _mm_sub_ps(xmm1, xmm3);
    points += 4;
    xmm6 = _mm_mul_ps(xmm4, xmm4);
    xmm7 = _mm_mul_ps(xmm5, xmm5);

    xmm2 = _mm_load_ps((float*)&points[0]);

    xmm4 = _mm_hadd_ps(xmm6, xmm7);

    xmm3 = _mm_load_ps((float*)&points[2]);

    xmm4 = _mm_mul_ps(xmm4, xmm8);

    _mm_store_ps(target, xmm4);

    target += 4;
  }

  xmm4 = _mm_sub_ps(xmm1, xmm2);
  xmm5 = _mm_sub_ps(xmm1, xmm3);

  points += 4;
  xmm6 = _mm_mul_ps(xmm4, xmm4);
  xmm7 = _mm_mul_ps(xmm5, xmm5);

  xmm4 = _mm_hadd_ps(xmm6, xmm7);

  xmm4 = _mm_mul_ps(xmm4, xmm8);

  _mm_store_ps(target, xmm4);

  target += 4;

  for(i = 0; i < leftovers0; ++i) {
    xmm2 = _mm_load_ps((float*)&points[0]);

    xmm4 = _mm_sub_ps(xmm1, xmm2);

    points += 2;

    xmm6 = _mm_mul_ps(xmm4, xmm4);

    xmm4 = _mm_hadd_ps(xmm6, xmm6);

    xmm4 = _mm_mul_ps(xmm4, xmm8);

    _mm_storeh_pi((__m64*)target, xmm4);

    target += 2;
  }

  for(i = 0; i < leftovers1; ++i) {

    diff = src0[0] - points[0];

    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));

    target[0] = sq_dist;
  }
}

#endif /*LV_HAVE_SSE3*/


#ifdef LV_HAVE_GENERIC
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points,
                                                      float scalar, unsigned int num_points)
{
  lv_32fc_t diff;
  float sq_dist;
  unsigned int i = 0;

  for(; i < num_points; ++i) {
    diff = src0[0] - points[i];

    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));

    target[i] = sq_dist;
  }
}

#endif /*LV_HAVE_GENERIC*/


#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/

#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
#include <string.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

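/*
 * The unaligned kernel below mirrors the aligned AVX2 implementation above:
 * eight points per main-loop pass, a four-point pass, then a scalar tail of up
 * to three points, using loadu/storeu so the buffers need not be 32-byte aligned.
 */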
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points,
                                                     float scalar, unsigned int num_points)
{
  const unsigned int num_bytes = num_points*8;
  __m128 xmm0, xmm9;
  __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

  lv_32fc_t diff;
  memset(&diff, 0x0, 2*sizeof(float));

  float sq_dist = 0.0;
  int bound = num_bytes >> 6;
  int leftovers0 = (num_bytes >> 5) & 1;
  int leftovers1 = (num_bytes >> 3) & 0b11;
  int i = 0;

  __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
  xmm1 = _mm256_setzero_ps();
  xmm2 = _mm256_loadu_ps((float*)&points[0]);
  xmm8 = _mm256_set1_ps(scalar);
  xmm0 = _mm_loadu_ps((float*)src0);
  xmm0 = _mm_permute_ps(xmm0, 0b01000100);
  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
  xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
  xmm3 = _mm256_loadu_ps((float*)&points[4]);

  for(; i < bound; ++i) {
    xmm4 = _mm256_sub_ps(xmm1, xmm2);
    xmm5 = _mm256_sub_ps(xmm1, xmm3);
    points += 8;
    xmm6 = _mm256_mul_ps(xmm4, xmm4);
    xmm7 = _mm256_mul_ps(xmm5, xmm5);

    xmm2 = _mm256_loadu_ps((float*)&points[0]);

    xmm4 = _mm256_hadd_ps(xmm6, xmm7);
    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

    xmm3 = _mm256_loadu_ps((float*)&points[4]);

    xmm4 = _mm256_mul_ps(xmm4, xmm8);

    _mm256_storeu_ps(target, xmm4);

    target += 8;
  }

  for(i = 0; i < leftovers0; ++i) {
    xmm2 = _mm256_loadu_ps((float*)&points[0]);

    xmm4 = _mm256_sub_ps(xmm1, xmm2);

    points += 4;

    xmm6 = _mm256_mul_ps(xmm4, xmm4);

    xmm4 = _mm256_hadd_ps(xmm6, xmm6);
    xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

    xmm4 = _mm256_mul_ps(xmm4, xmm8);

    xmm9 = _mm256_extractf128_ps(xmm4,1);
    _mm_storeu_ps(target,xmm9);

    target += 4;
  }

  for(i = 0; i < leftovers1; ++i) {

    diff = src0[0] - points[0];
    points += 1;

    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));

    target[0] = sq_dist;
    target += 1;
  }
}

#endif /*LV_HAVE_AVX2*/

#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H*/