Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_neon_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*
24  * This file holds helper routines composed of several NEON intrinsics.
25  * Use them in VOLK kernels instead of copy-pasting the same intrinsic sequences.
26  */
27 
28 #ifndef INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
29 #define INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
30 #ifdef LV_HAVE_NEON
31 #include <arm_neon.h>
32 
33 
/*
 * Squared magnitude of four complex floats held in de-interleaved form.
 *
 * cmplxValue.val[0] is treated as the I (in-phase) lanes and
 * cmplxValue.val[1] as the Q (quadrature) lanes.
 *
 * Returns, lane by lane, I*I + Q*Q.
 */
static inline float32x4_t
_vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
{
    const float32x4_t in_phase = cmplxValue.val[0];
    const float32x4_t quadrature = cmplxValue.val[1];

    /* Square each component separately, then sum the two squares. */
    return vaddq_f32(vmulq_f32(in_phase, in_phase),
                     vmulq_f32(quadrature, quadrature));
}
44 
/*
 * Lane-wise reciprocal square root (1 / sqrt(x)) of a float32x4_t.
 *
 * Starts from the hardware estimate given by vrsqrteq_f32 and applies two
 * refinement steps. In each step vrsqrtsq_f32(x * e, e) produces the
 * correction factor (3 - x * e * e) / 2, which is multiplied back into
 * the current estimate e.
 */
static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
{
    enum { refinement_steps = 2 };

    float32x4_t estimate = vrsqrteq_f32(x);

    for (int step = 0; step < refinement_steps; ++step) {
        const float32x4_t correction =
            vrsqrtsq_f32(vmulq_f32(x, estimate), estimate);
        estimate = vmulq_f32(correction, estimate);
    }

    return estimate;
}
54 
/*
 * Lane-wise complex multiplication of two float32x4x2_t values.
 *
 * Both operands are in de-interleaved form: val[0] carries the four real
 * parts and val[1] the four imaginary parts. For every lane k the result is
 *
 *   real: a_r[k] * b_r[k] - a_i[k] * b_i[k]
 *   imag: a_r[k] * b_i[k] + a_i[k] * b_r[k]
 *
 * returned in the same val[0] = real, val[1] = imaginary layout.
 */
static inline float32x4x2_t
_vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
{
    const float32x4_t a_re = a_val.val[0];
    const float32x4_t a_im = a_val.val[1];
    const float32x4_t b_re = b_val.val[0];
    const float32x4_t b_im = b_val.val[1];

    /* The four partial products of (a_re + j*a_im) * (b_re + j*b_im). */
    const float32x4_t re_re = vmulq_f32(a_re, b_re);
    const float32x4_t im_im = vmulq_f32(a_im, b_im);
    const float32x4_t re_im = vmulq_f32(a_re, b_im);
    const float32x4_t im_re = vmulq_f32(a_im, b_re);

    float32x4x2_t product;
    product.val[0] = vsubq_f32(re_re, im_im); /* real part */
    product.val[1] = vaddq_f32(re_im, im_re); /* imaginary part */
    return product;
}
78 
79 #endif /*LV_HAVE_NEON*/
80 #endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */
static float32x4_t _vinvsqrtq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:46
static float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
Definition: volk_neon_intrinsics.h:57
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:36