Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32fc_32f_multiply_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
55 #ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H
56 #define INCLUDED_volk_32fc_32f_multiply_32fc_a_H
57 
58 #include <inttypes.h>
59 #include <stdio.h>
60 
61 #ifdef LV_HAVE_AVX
62 #include <immintrin.h>
63 
64 static inline void
66  const float* bVector, unsigned int num_points)
67 {
68  unsigned int number = 0;
69  const unsigned int eighthPoints = num_points / 8;
70 
71  lv_32fc_t* cPtr = cVector;
72  const lv_32fc_t* aPtr = aVector;
73  const float* bPtr= bVector;
74 
75  __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
76 
77  __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
78 
79  for(;number < eighthPoints; number++){
80 
81  aVal1 = _mm256_load_ps((float *)aPtr);
82  aPtr += 4;
83 
84  aVal2 = _mm256_load_ps((float *)aPtr);
85  aPtr += 4;
86 
87  bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
88  bPtr += 8;
89 
90  bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3
91  bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7
92 
93  bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3
94  bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7
95 
96  cVal1 = _mm256_mul_ps(aVal1, bVal1);
97  cVal2 = _mm256_mul_ps(aVal2, bVal2);
98 
99  _mm256_store_ps((float*)cPtr,cVal1); // Store the results back into the C container
100  cPtr += 4;
101 
102  _mm256_store_ps((float*)cPtr,cVal2); // Store the results back into the C container
103  cPtr += 4;
104  }
105 
106  number = eighthPoints * 8;
107  for(;number < num_points; ++number){
108  *cPtr++ = (*aPtr++) * (*bPtr++);
109  }
110 }
111 #endif /* LV_HAVE_AVX */
112 
113 
114 #ifdef LV_HAVE_SSE
115 #include <xmmintrin.h>
116 
117 static inline void
119  const float* bVector, unsigned int num_points)
120 {
121  unsigned int number = 0;
122  const unsigned int quarterPoints = num_points / 4;
123 
124  lv_32fc_t* cPtr = cVector;
125  const lv_32fc_t* aPtr = aVector;
126  const float* bPtr= bVector;
127 
128  __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
129  for(;number < quarterPoints; number++){
130 
131  aVal1 = _mm_load_ps((const float*)aPtr);
132  aPtr += 2;
133 
134  aVal2 = _mm_load_ps((const float*)aPtr);
135  aPtr += 2;
136 
137  bVal = _mm_load_ps(bPtr);
138  bPtr += 4;
139 
140  bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0));
141  bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2));
142 
143  cVal = _mm_mul_ps(aVal1, bVal1);
144 
145  _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
146  cPtr += 2;
147 
148  cVal = _mm_mul_ps(aVal2, bVal2);
149 
150  _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
151 
152  cPtr += 2;
153  }
154 
155  number = quarterPoints * 4;
156  for(;number < num_points; number++){
157  *cPtr++ = (*aPtr++) * (*bPtr);
158  bPtr++;
159  }
160 }
161 #endif /* LV_HAVE_SSE */
162 
163 
164 #ifdef LV_HAVE_GENERIC
165 
166 static inline void
168  const float* bVector, unsigned int num_points)
169 {
170  lv_32fc_t* cPtr = cVector;
171  const lv_32fc_t* aPtr = aVector;
172  const float* bPtr= bVector;
173  unsigned int number = 0;
174 
175  for(number = 0; number < num_points; number++){
176  *cPtr++ = (*aPtr++) * (*bPtr++);
177  }
178 }
179 #endif /* LV_HAVE_GENERIC */
180 
181 
182 #ifdef LV_HAVE_NEON
183 #include <arm_neon.h>
184 
/*
 * Multiplies every complex sample of aVector by the matching real sample of
 * bVector and writes the product to cVector, using NEON for the bulk of the
 * work:
 *
 *   cVector[i] = aVector[i] * bVector[i]   for 0 <= i < num_points
 *
 * cVector     output buffer (complex samples)
 * aVector     complex input samples
 * bVector     real input samples (one scale factor per complex sample)
 * num_points  number of samples to process
 *
 * The vector loop handles four samples per iteration; any samples left over
 * after the last full group of four are processed one at a time.
 */
static inline void
volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                 const float* bVector, unsigned int num_points)
{
  lv_32fc_t* cPtr = cVector;
  const lv_32fc_t* aPtr = aVector;
  const float* bPtr = bVector;
  unsigned int number = 0;
  const unsigned int quarter_points = num_points / 4;

  float32x4x2_t inputVector, outputVector;
  float32x4_t tapsVector;

  for(number = 0; number < quarter_points; number++){
    /* vld2q de-interleaves: val[0] receives the even floats and val[1] the
     * odd floats of the four complex samples, so a single vector of four
     * scale factors can be applied to each half directly. */
    inputVector = vld2q_f32((const float*)aPtr);
    tapsVector = vld1q_f32(bPtr);

    outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
    outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);

    /* vst2q re-interleaves the two halves on the way back to memory. */
    vst2q_f32((float*)cPtr, outputVector);
    aPtr += 4;
    bPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail for the num_points % 4 samples the vector loop did not cover. */
  for(number = quarter_points * 4; number < num_points; number++){
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
214 #endif /* LV_HAVE_NEON */
215 
216 
217 #ifdef LV_HAVE_ORC
218 
/* Prototype of the ORC-backed kernel. It is only declared here (extern);
 * its definition is not part of this header. Takes the same four arguments
 * as the other kernels in this file. */
219 extern void
220 volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector,
221  const float* bVector, unsigned int num_points);
222 
/* ORC entry point: forwards all four arguments, unchanged and in the same
 * order, to volk_32fc_32f_multiply_32fc_a_orc_impl and does no work of its
 * own. */
223 static inline void
224 volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
225  const float* bVector, unsigned int num_points)
226 {
227  volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
228 }
229 
230 #endif /* LV_HAVE_ORC */
231 
232 
233 #endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a_H */
static void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:118
static void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:167
static void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:65
float complex lv_32fc_t
Definition: volk_complex.h:61
static void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:186