Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_square_dist_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
78 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
79 #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
80 
81 #include<inttypes.h>
82 #include<stdio.h>
83 #include<volk/volk_complex.h>
84 
85 #ifdef LV_HAVE_AVX2
86 #include<immintrin.h>
87 
/*!
 * Squared Euclidean distance from the single complex point src0[0] to each of
 * the num_points complex values in 'points', written to 'target':
 *   target[k] = |src0[0] - points[k]|^2
 * Aligned AVX2 path: 8 points per main-loop iteration, then 4-, 2- and
 * 1-point tail passes. Uses aligned loads/stores throughout (this is the
 * _a_ kernel), so 'points' and 'target' must satisfy AVX alignment.
 */
static inline void
volk_32fc_x2_square_dist_32f_a_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points,
 unsigned int num_points)
{
 const unsigned int num_bytes = num_points*8; /* sizeof(lv_32fc_t) == 8 */
 __m128 xmm0, xmm9, xmm10;
 __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

 lv_32fc_t diff;
 float sq_dist;
 int bound = num_bytes >> 6;            /* num_points / 8: full AVX iterations   */
 int leftovers0 = (num_bytes >> 5) & 1; /* one 4-point tail pass if bit set      */
 int leftovers1 = (num_bytes >> 4) & 1; /* one 2-point tail pass if bit set      */
 int leftovers2 = (num_bytes >> 3) & 1; /* one final scalar point if bit set     */
 int i = 0;

 /* hadd interleaves results per 128-bit lane; this permutation puts the four
    per-point sums of each half back into memory order. */
 __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
 xmm1 = _mm256_setzero_ps();
 xmm2 = _mm256_load_ps((float*)&points[0]);
 /* NOTE(review): this loads 16 bytes starting at src0, i.e. src0[0] AND
    src0[1], although only src0[0] is used — assumes src0 has at least 16
    readable, aligned bytes. TODO confirm against callers. */
 xmm0 = _mm_load_ps((float*)src0);
 xmm0 = _mm_permute_ps(xmm0, 0b01000100); /* (re0, im0, re0, im0) */
 xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
 xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1); /* src0[0] broadcast to all 4 complex slots */
 xmm3 = _mm256_load_ps((float*)&points[4]);

 /* Main loop: 8 complex points per iteration. The loads for the next
    iteration are interleaved with the arithmetic of the current one. */
 for(; i < bound; ++i) {
 xmm4 = _mm256_sub_ps(xmm1, xmm2);
 xmm5 = _mm256_sub_ps(xmm1, xmm3);
 points += 8;
 xmm6 = _mm256_mul_ps(xmm4, xmm4); /* (re^2, im^2) per point */
 xmm7 = _mm256_mul_ps(xmm5, xmm5);

 xmm2 = _mm256_load_ps((float*)&points[0]);

 /* pairwise re^2+im^2, then reorder lanes into memory order */
 xmm4 = _mm256_hadd_ps(xmm6, xmm7);
 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

 xmm3 = _mm256_load_ps((float*)&points[4]);

 _mm256_store_ps(target, xmm4);

 target += 8;
 }

 /* Tail: 4 remaining points (at most one pass). */
 for(i = 0; i < leftovers0; ++i) {

 xmm2 = _mm256_load_ps((float*)&points[0]);

 xmm4 = _mm256_sub_ps(xmm1, xmm2);

 points += 4;

 xmm6 = _mm256_mul_ps(xmm4, xmm4);

 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

 /* after the permute both 128-bit halves hold the 4 sums; take the high one */
 xmm9 = _mm256_extractf128_ps(xmm4, 1);
 _mm_store_ps(target,xmm9);

 target += 4;
 }

 /* Tail: 2 remaining points via SSE (at most one pass). */
 for(i = 0; i < leftovers1; ++i) {
 xmm9 = _mm_load_ps((float*)&points[0]);

 xmm10 = _mm_sub_ps(xmm0, xmm9); /* xmm0 still holds (re0, im0, re0, im0) */

 points += 2;

 xmm9 = _mm_mul_ps(xmm10, xmm10);

 xmm10 = _mm_hadd_ps(xmm9, xmm9); /* high half = the two distances */

 _mm_storeh_pi((__m64*)target, xmm10);

 target += 2;
 }

 /* Tail: final odd point, scalar (loop body runs at most once). */
 for(i = 0; i < leftovers2; ++i) {

 diff = src0[0] - points[0];

 sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

 target[0] = sq_dist;
 }
}
176 
177 #endif /*LV_HAVE_AVX2*/
178 
179 #ifdef LV_HAVE_SSE3
180 #include<xmmintrin.h>
181 #include<pmmintrin.h>
182 
183 static inline void
185  unsigned int num_points)
186 {
187  const unsigned int num_bytes = num_points*8;
188 
189  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
190 
191  lv_32fc_t diff;
192  float sq_dist;
193  int bound = num_bytes >> 5;
194  int leftovers0 = (num_bytes >> 4) & 1;
195  int leftovers1 = (num_bytes >> 3) & 1;
196  int i = 0;
197 
198  xmm1 = _mm_setzero_ps();
199  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
200  xmm2 = _mm_load_ps((float*)&points[0]);
201  xmm1 = _mm_movelh_ps(xmm1, xmm1);
202  xmm3 = _mm_load_ps((float*)&points[2]);
203 
204  for(; i < bound - 1; ++i) {
205  xmm4 = _mm_sub_ps(xmm1, xmm2);
206  xmm5 = _mm_sub_ps(xmm1, xmm3);
207  points += 4;
208  xmm6 = _mm_mul_ps(xmm4, xmm4);
209  xmm7 = _mm_mul_ps(xmm5, xmm5);
210 
211  xmm2 = _mm_load_ps((float*)&points[0]);
212 
213  xmm4 = _mm_hadd_ps(xmm6, xmm7);
214 
215  xmm3 = _mm_load_ps((float*)&points[2]);
216 
217  _mm_store_ps(target, xmm4);
218 
219  target += 4;
220  }
221 
222  xmm4 = _mm_sub_ps(xmm1, xmm2);
223  xmm5 = _mm_sub_ps(xmm1, xmm3);
224 
225  points += 4;
226  xmm6 = _mm_mul_ps(xmm4, xmm4);
227  xmm7 = _mm_mul_ps(xmm5, xmm5);
228 
229  xmm4 = _mm_hadd_ps(xmm6, xmm7);
230 
231  _mm_store_ps(target, xmm4);
232 
233  target += 4;
234 
235  for(i = 0; i < leftovers0; ++i) {
236 
237  xmm2 = _mm_load_ps((float*)&points[0]);
238 
239  xmm4 = _mm_sub_ps(xmm1, xmm2);
240 
241  points += 2;
242 
243  xmm6 = _mm_mul_ps(xmm4, xmm4);
244 
245  xmm4 = _mm_hadd_ps(xmm6, xmm6);
246 
247  _mm_storeh_pi((__m64*)target, xmm4);
248 
249  target += 2;
250  }
251 
252  for(i = 0; i < leftovers1; ++i) {
253 
254  diff = src0[0] - points[0];
255 
256  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
257 
258  target[0] = sq_dist;
259  }
260 }
261 
262 #endif /*LV_HAVE_SSE3*/
263 
264 
265 #ifdef LV_HAVE_NEON
266 #include <arm_neon.h>
/*!
 * Squared Euclidean distance from the single complex point src0[0] to each of
 * the num_points complex values in 'points':
 *   target[k] = |src0[0] - points[k]|^2
 * NEON path: de-interleaving loads process four complex points per pass,
 * followed by a scalar cleanup loop for the remainder.
 */
static inline void
volk_32fc_x2_square_dist_32f_neon(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points)
{
  const unsigned int vec_iters = num_points / 4;
  unsigned int k;

  /* Broadcast the reference point's components across full vectors. */
  const float32x4_t ref_re = vdupq_n_f32( lv_creal(src0[0]) );
  const float32x4_t ref_im = vdupq_n_f32( lv_cimag(src0[0]) );

  for(k = 0; k < vec_iters; ++k) {
    /* vld2q splits the interleaved (re,im) stream into two registers. */
    const float32x4x2_t pts = vld2q_f32((float*)points);
    const float32x4_t d_re = vsubq_f32(ref_re, pts.val[0]);
    const float32x4_t d_im = vsubq_f32(ref_im, pts.val[1]);
    const float32x4_t sq_re = vmulq_f32(d_re, d_re);
    const float32x4_t sq_im = vmulq_f32(d_im, d_im);

    vst1q_f32(target, vaddq_f32(sq_re, sq_im));
    points += 4;
    target += 4;
  }

  /* Scalar cleanup for the final num_points % 4 entries. */
  for(k = vec_iters * 4; k < num_points; ++k) {
    const lv_32fc_t diff = src0[0] - *points++;
    *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
  }
}
295 #endif /* LV_HAVE_NEON */
296 
297 
298 #ifdef LV_HAVE_GENERIC
299 static inline void
301  unsigned int num_points)
302 {
303  const unsigned int num_bytes = num_points*8;
304 
305  lv_32fc_t diff;
306  float sq_dist;
307  unsigned int i = 0;
308 
309  for(; i < num_bytes >> 3; ++i) {
310  diff = src0[0] - points[i];
311 
312  sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
313 
314  target[i] = sq_dist;
315  }
316 }
317 
318 #endif /*LV_HAVE_GENERIC*/
319 
320 
321 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/
322 
323 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
324 #define INCLUDED_volk_32fc_x2_square_dist_32f_u_H
325 
326 #include<inttypes.h>
327 #include<stdio.h>
328 #include<volk/volk_complex.h>
329 
330 #ifdef LV_HAVE_AVX2
331 #include<immintrin.h>
332 
/*!
 * Squared Euclidean distance from the single complex point src0[0] to each of
 * the num_points complex values in 'points', written to 'target':
 *   target[k] = |src0[0] - points[k]|^2
 * Unaligned AVX2 path: 8 points per main-loop iteration, a 4-point tail
 * pass, then up to 3 scalar points. Unlike the aligned kernel there is no
 * 2-point SSE tail, so the scalar loop runs up to 3 times.
 */
static inline void
volk_32fc_x2_square_dist_32f_u_avx2(float* target, lv_32fc_t* src0, lv_32fc_t* points,
 unsigned int num_points)
{
 const unsigned int num_bytes = num_points*8; /* sizeof(lv_32fc_t) == 8 */
 __m128 xmm0, xmm9;
 __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

 lv_32fc_t diff;
 float sq_dist;
 int bound = num_bytes >> 6;               /* num_points / 8: full AVX iterations */
 int leftovers0 = (num_bytes >> 5) & 1;    /* one 4-point tail pass if bit set    */
 int leftovers1 = (num_bytes >> 3) & 0b11; /* 0..3 final scalar points            */
 int i = 0;

 /* hadd interleaves results per 128-bit lane; this permutation puts the four
    per-point sums of each half back into memory order. */
 __m256i idx = _mm256_set_epi32(7,6,3,2,5,4,1,0);
 xmm1 = _mm256_setzero_ps();
 xmm2 = _mm256_loadu_ps((float*)&points[0]);
 /* NOTE(review): this loads 16 bytes starting at src0, i.e. src0[0] AND
    src0[1], although only src0[0] is used — assumes src0 has at least 16
    readable bytes. TODO confirm against callers. */
 xmm0 = _mm_loadu_ps((float*)src0);
 xmm0 = _mm_permute_ps(xmm0, 0b01000100); /* (re0, im0, re0, im0) */
 xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
 xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1); /* src0[0] broadcast to all 4 complex slots */
 xmm3 = _mm256_loadu_ps((float*)&points[4]);

 /* Main loop: 8 complex points per iteration; next iteration's loads are
    interleaved with the current arithmetic. */
 for(; i < bound; ++i) {
 xmm4 = _mm256_sub_ps(xmm1, xmm2);
 xmm5 = _mm256_sub_ps(xmm1, xmm3);
 points += 8;
 xmm6 = _mm256_mul_ps(xmm4, xmm4); /* (re^2, im^2) per point */
 xmm7 = _mm256_mul_ps(xmm5, xmm5);

 xmm2 = _mm256_loadu_ps((float*)&points[0]);

 /* pairwise re^2+im^2, then reorder lanes into memory order */
 xmm4 = _mm256_hadd_ps(xmm6, xmm7);
 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

 xmm3 = _mm256_loadu_ps((float*)&points[4]);

 _mm256_storeu_ps(target, xmm4);

 target += 8;
 }

 /* Tail: 4 remaining points (at most one pass). */
 for(i = 0; i < leftovers0; ++i) {

 xmm2 = _mm256_loadu_ps((float*)&points[0]);

 xmm4 = _mm256_sub_ps(xmm1, xmm2);

 points += 4;

 xmm6 = _mm256_mul_ps(xmm4, xmm4);

 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

 /* after the permute both 128-bit halves hold the 4 sums; take the high one */
 xmm9 = _mm256_extractf128_ps(xmm4, 1);
 _mm_storeu_ps(target,xmm9);

 target += 4;
 }

 /* Tail: up to 3 remaining points, scalar. */
 for(i = 0; i < leftovers1; ++i) {

 diff = src0[0] - points[0];
 points += 1;

 sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

 target[0] = sq_dist;
 target += 1;
 }
}
406 
407 #endif /*LV_HAVE_AVX2*/
408 
409 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_u_H*/
static void volk_32fc_x2_square_dist_32f_a_sse3(float *target, lv_32fc_t *src0, lv_32fc_t *points, unsigned int num_points)
Definition: volk_32fc_x2_square_dist_32f.h:184
for i
Definition: volk_config_fixed.tmpl.h:25
float complex lv_32fc_t
Definition: volk_complex.h:61
static void volk_32fc_x2_square_dist_32f_neon(float *target, lv_32fc_t *src0, lv_32fc_t *points, unsigned int num_points)
Definition: volk_32fc_x2_square_dist_32f.h:268
#define lv_creal(x)
Definition: volk_complex.h:83
#define lv_cimag(x)
Definition: volk_complex.h:85
static void volk_32fc_x2_square_dist_32f_generic(float *target, lv_32fc_t *src0, lv_32fc_t *points, unsigned int num_points)
Definition: volk_32fc_x2_square_dist_32f.h:300