Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_x2_multiply_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
72 #define INCLUDED_volk_32f_x2_multiply_32f_u_H
73 
74 #include <inttypes.h>
75 #include <stdio.h>
76 
77 #ifdef LV_HAVE_SSE
78 #include <xmmintrin.h>
79 
/*
 * Element-wise product of two float vectors using SSE, unaligned variant.
 *
 * cVector    - output buffer; receives aVector[i] * bVector[i] for each i
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 *
 * Uses the unaligned load/store intrinsics, so the buffers need no
 * particular alignment.
 */
static inline void
volk_32f_x2_multiply_32f_u_sse(float* cVector, const float* aVector,
                               const float* bVector, unsigned int num_points)
{
  /* Largest multiple of 4 that does not exceed num_points. */
  const unsigned int simdCount = (num_points / 4) * 4;
  unsigned int idx = 0;

  /* Main body: four floats per iteration. */
  for(; idx < simdCount; idx += 4){
    const __m128 lhs = _mm_loadu_ps(aVector + idx);
    const __m128 rhs = _mm_loadu_ps(bVector + idx);
    _mm_storeu_ps(cVector + idx, _mm_mul_ps(lhs, rhs));
  }

  /* Scalar tail: the 0..3 elements that do not fill a full register. */
  for(; idx < num_points; idx++){
    cVector[idx] = aVector[idx] * bVector[idx];
  }
}
111 #endif /* LV_HAVE_SSE */
112 
113 #ifdef LV_HAVE_AVX512F
114 #include <immintrin.h>
115 
/*
 * Element-wise product of two float vectors using AVX-512F, unaligned variant.
 *
 * cVector    - output buffer; receives aVector[i] * bVector[i] for each i
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 *
 * Uses the unaligned load/store intrinsics, so the buffers need no
 * particular alignment.
 */
static inline void
volk_32f_x2_multiply_32f_u_avx512f(float* cVector, const float* aVector,
                                   const float* bVector, unsigned int num_points)
{
  /* Largest multiple of 16 that does not exceed num_points. */
  const unsigned int simdCount = (num_points / 16) * 16;
  unsigned int idx = 0;

  /* Main body: sixteen floats per iteration. */
  for(; idx < simdCount; idx += 16){
    const __m512 lhs = _mm512_loadu_ps(aVector + idx);
    const __m512 rhs = _mm512_loadu_ps(bVector + idx);
    _mm512_storeu_ps(cVector + idx, _mm512_mul_ps(lhs, rhs));
  }

  /* Scalar tail: the 0..15 elements that do not fill a full register. */
  for(; idx < num_points; idx++){
    cVector[idx] = aVector[idx] * bVector[idx];
  }
}
147 #endif /* LV_HAVE_AVX512F */
148 
149 #ifdef LV_HAVE_AVX
150 #include <immintrin.h>
151 
/*
 * Element-wise product of two float vectors using AVX, unaligned variant.
 *
 * cVector    - output buffer; receives aVector[i] * bVector[i] for each i
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 *
 * Uses the unaligned load/store intrinsics, so the buffers need no
 * particular alignment.
 */
static inline void
volk_32f_x2_multiply_32f_u_avx(float* cVector, const float* aVector,
                               const float* bVector, unsigned int num_points)
{
  /* Largest multiple of 8 that does not exceed num_points. */
  const unsigned int simdCount = (num_points / 8) * 8;
  unsigned int idx = 0;

  /* Main body: eight floats per iteration. */
  for(; idx < simdCount; idx += 8){
    const __m256 lhs = _mm256_loadu_ps(aVector + idx);
    const __m256 rhs = _mm256_loadu_ps(bVector + idx);
    _mm256_storeu_ps(cVector + idx, _mm256_mul_ps(lhs, rhs));
  }

  /* Scalar tail: the 0..7 elements that do not fill a full register. */
  for(; idx < num_points; idx++){
    cVector[idx] = aVector[idx] * bVector[idx];
  }
}
183 #endif /* LV_HAVE_AVX */
184 
185 
186 #ifdef LV_HAVE_GENERIC
187 
/*
 * Portable element-wise product of two float vectors (no SIMD intrinsics).
 *
 * cVector    - output buffer; receives aVector[i] * bVector[i] for each i
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 */
static inline void
volk_32f_x2_multiply_32f_generic(float* cVector, const float* aVector,
                                 const float* bVector, unsigned int num_points)
{
  unsigned int idx;

  for(idx = 0; idx < num_points; ++idx){
    cVector[idx] = aVector[idx] * bVector[idx];
  }
}
201 #endif /* LV_HAVE_GENERIC */
202 
203 
204 #endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */
205 
206 
207 #ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H
208 #define INCLUDED_volk_32f_x2_multiply_32f_a_H
209 
210 #include <inttypes.h>
211 #include <stdio.h>
212 
213 #ifdef LV_HAVE_SSE
214 #include <xmmintrin.h>
215 
/*
 * Element-wise product of two float vectors using SSE, aligned variant.
 *
 * cVector    - output buffer; receives aVector[i] * bVector[i] for each i
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 *
 * The vector loop uses _mm_load_ps / _mm_store_ps, which require the
 * addresses they touch to be 16-byte aligned; callers must pass buffers
 * aligned accordingly.
 */
static inline void
volk_32f_x2_multiply_32f_a_sse(float* cVector, const float* aVector,
                               const float* bVector, unsigned int num_points)
{
  /* Largest multiple of 4 that does not exceed num_points. */
  const unsigned int simdCount = (num_points / 4) * 4;
  unsigned int idx = 0;

  /* Main body: four floats per iteration. */
  for(; idx < simdCount; idx += 4){
    const __m128 lhs = _mm_load_ps(aVector + idx);
    const __m128 rhs = _mm_load_ps(bVector + idx);
    _mm_store_ps(cVector + idx, _mm_mul_ps(lhs, rhs));
  }

  /* Scalar tail: the 0..3 elements that do not fill a full register. */
  for(; idx < num_points; idx++){
    cVector[idx] = aVector[idx] * bVector[idx];
  }
}
247 #endif /* LV_HAVE_SSE */
248 
249 #ifdef LV_HAVE_AVX512F
250 #include <immintrin.h>
251 
/*
 * Element-wise product of two float vectors using AVX-512F, aligned variant.
 *
 * cVector    - output buffer; receives aVector[i] * bVector[i] for each i
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 *
 * The vector loop uses _mm512_load_ps / _mm512_store_ps, which require the
 * addresses they touch to be 64-byte aligned; callers must pass buffers
 * aligned accordingly.
 */
static inline void
volk_32f_x2_multiply_32f_a_avx512f(float* cVector, const float* aVector,
                                   const float* bVector, unsigned int num_points)
{
  /* Largest multiple of 16 that does not exceed num_points. */
  const unsigned int simdCount = (num_points / 16) * 16;
  unsigned int idx = 0;

  /* Main body: sixteen floats per iteration. */
  for(; idx < simdCount; idx += 16){
    const __m512 lhs = _mm512_load_ps(aVector + idx);
    const __m512 rhs = _mm512_load_ps(bVector + idx);
    _mm512_store_ps(cVector + idx, _mm512_mul_ps(lhs, rhs));
  }

  /* Scalar tail: the 0..15 elements that do not fill a full register. */
  for(; idx < num_points; idx++){
    cVector[idx] = aVector[idx] * bVector[idx];
  }
}
283 #endif /* LV_HAVE_AVX512F */
284 
285 
286 #ifdef LV_HAVE_AVX
287 #include <immintrin.h>
288 
/*
 * Element-wise product of two float vectors using AVX, aligned variant.
 *
 * cVector    - output buffer; receives aVector[i] * bVector[i] for each i
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 *
 * The vector loop uses _mm256_load_ps / _mm256_store_ps, which require the
 * addresses they touch to be 32-byte aligned; callers must pass buffers
 * aligned accordingly.
 */
static inline void
volk_32f_x2_multiply_32f_a_avx(float* cVector, const float* aVector,
                               const float* bVector, unsigned int num_points)
{
  /* Largest multiple of 8 that does not exceed num_points. */
  const unsigned int simdCount = (num_points / 8) * 8;
  unsigned int idx = 0;

  /* Main body: eight floats per iteration. */
  for(; idx < simdCount; idx += 8){
    const __m256 lhs = _mm256_load_ps(aVector + idx);
    const __m256 rhs = _mm256_load_ps(bVector + idx);
    _mm256_store_ps(cVector + idx, _mm256_mul_ps(lhs, rhs));
  }

  /* Scalar tail: the 0..7 elements that do not fill a full register. */
  for(; idx < num_points; idx++){
    cVector[idx] = aVector[idx] * bVector[idx];
  }
}
320 #endif /* LV_HAVE_AVX */
321 
322 
323 #ifdef LV_HAVE_NEON
324 #include <arm_neon.h>
325 
/*
 * Element-wise product of two float vectors using ARM NEON.
 *
 * cVector    - output buffer; receives aVector[i] * bVector[i] for each i
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 */
static inline void
volk_32f_x2_multiply_32f_neon(float* cVector, const float* aVector,
                              const float* bVector, unsigned int num_points)
{
  /* Largest multiple of 4 that does not exceed num_points. */
  const unsigned int simd_count = (num_points / 4) * 4;
  unsigned int idx = 0;

  /* Main body: four floats per iteration. */
  for(; idx < simd_count; idx += 4) {
    const float32x4_t lhs = vld1q_f32(aVector + idx);
    const float32x4_t rhs = vld1q_f32(bVector + idx);
    vst1q_f32(cVector + idx, vmulq_f32(lhs, rhs));
  }

  /* Scalar tail: the 0..3 elements that do not fill a full register. */
  for(; idx < num_points; ++idx) {
    cVector[idx] = aVector[idx] * bVector[idx];
  }
}
346 #endif /* LV_HAVE_NEON */
347 
348 
349 #ifdef LV_HAVE_GENERIC
350 
/*
 * Portable element-wise product of two float vectors (no SIMD intrinsics),
 * exposed under the aligned-kernel name.
 *
 * cVector    - output buffer; receives aVector[i] * bVector[i] for each i
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 *
 * The body performs plain scalar accesses only, so it imposes no alignment
 * requirement of its own.
 */
static inline void
volk_32f_x2_multiply_32f_a_generic(float* cVector, const float* aVector,
                                   const float* bVector, unsigned int num_points)
{
  unsigned int idx;

  for(idx = 0; idx < num_points; ++idx){
    cVector[idx] = aVector[idx] * bVector[idx];
  }
}
364 #endif /* LV_HAVE_GENERIC */
365 
366 
367 #ifdef LV_HAVE_ORC
/*
 * Out-of-line multiply kernel. Only declared here; the definition lives in
 * another translation unit (presumably the ORC-generated code -- confirm
 * against the build).
 */
extern void
volk_32f_x2_multiply_32f_a_orc_impl(float* cVector,
                                    const float* aVector,
                                    const float* bVector,
                                    unsigned int num_points);

/*
 * Thin wrapper that forwards all four arguments, unchanged, to
 * volk_32f_x2_multiply_32f_a_orc_impl.
 *
 * cVector    - output buffer
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process in each buffer
 *
 * NOTE(review): this wrapper carries the "_u_" prefix while it forwards to
 * the "_a_" implementation and sits in the aligned-kernel section of the
 * header -- confirm the naming is intentional before relying on it.
 */
static inline void
volk_32f_x2_multiply_32f_u_orc(float* cVector,
                               const float* aVector,
                               const float* bVector,
                               unsigned int num_points)
{
  volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
378 #endif /* LV_HAVE_ORC */
379 
380 
381 #endif /* INCLUDED_volk_32f_x2_multiply_32f_a_H */
static void volk_32f_x2_multiply_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:153
static void volk_32f_x2_multiply_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:217
static void volk_32f_x2_multiply_32f_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:327
static void volk_32f_x2_multiply_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:189
static void volk_32f_x2_multiply_32f_u_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:81
static void volk_32f_x2_multiply_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:290
static void volk_32f_x2_multiply_32f_a_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_multiply_32f.h:352