Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_32i_x2_and_32i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
81 #ifndef INCLUDED_volk_32i_x2_and_32i_a_H
82 #define INCLUDED_volk_32i_x2_and_32i_a_H
83 
84 #include <inttypes.h>
85 #include <stdio.h>
86 
87 #ifdef LV_HAVE_AVX512F
88 #include <immintrin.h>
89 
/*
 * Element-wise bitwise AND of two int32 buffers (AVX-512F, aligned variant).
 *
 * cVector    - output buffer, receives aVector[i] & bVector[i]
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 *
 * The vector loop uses the aligned 512-bit load/store intrinsics, so all
 * three buffers must satisfy their 64-byte alignment requirement.
 */
static inline void volk_32i_x2_and_32i_a_avx512f(int32_t* cVector,
                                                 const int32_t* aVector,
                                                 const int32_t* bVector,
                                                 unsigned int num_points)
{
    /* Number of elements covered by whole 16-lane registers. */
    const unsigned int vec_points = (num_points / 16) * 16;
    unsigned int i;

    /* Vector body: one 512-bit register (16 x int32) per input per pass. */
    for (i = 0; i < vec_points; i += 16) {
        const __m512i lhs = _mm512_load_si512(aVector + i);
        const __m512i rhs = _mm512_load_si512(bVector + i);

        _mm512_store_si512(cVector + i, _mm512_and_si512(lhs, rhs));
    }

    /* Scalar tail: the up-to-15 elements that do not fill a register. */
    for (i = vec_points; i < num_points; i++) {
        cVector[i] = aVector[i] & bVector[i];
    }
}
122 #endif /* LV_HAVE_AVX512F */
123 
124 #ifdef LV_HAVE_AVX2
125 #include <immintrin.h>
126 
/*
 * Element-wise bitwise AND of two int32 buffers (AVX2, aligned variant).
 *
 * cVector    - output buffer, receives aVector[i] & bVector[i]
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 *
 * The vector loop uses the aligned 256-bit load/store intrinsics, so all
 * three buffers must satisfy their 32-byte alignment requirement.
 */
static inline void volk_32i_x2_and_32i_a_avx2(int32_t* cVector,
                                              const int32_t* aVector,
                                              const int32_t* bVector,
                                              unsigned int num_points)
{
    /* View the buffers as arrays of 256-bit registers (8 x int32 each). */
    const __m256i* lhs = (const __m256i*)aVector;
    const __m256i* rhs = (const __m256i*)bVector;
    __m256i* out = (__m256i*)cVector;

    const unsigned int blocks = num_points / 8;
    unsigned int i;

    for (i = 0; i < blocks; ++i) {
        const __m256i x = _mm256_load_si256(lhs + i);
        const __m256i y = _mm256_load_si256(rhs + i);

        _mm256_store_si256(out + i, _mm256_and_si256(x, y));
    }

    /* Scalar tail: the up-to-7 elements left after the last full register. */
    for (i = blocks * 8; i < num_points; ++i) {
        cVector[i] = aVector[i] & bVector[i];
    }
}
160 #endif /* LV_HAVE_AVX2 */
161 
162 
163 #ifdef LV_HAVE_SSE
164 #include <xmmintrin.h>
165 
/*
 * Element-wise bitwise AND of two int32 buffers (SSE, aligned variant).
 *
 * cVector    - output buffer, receives aVector[i] & bVector[i]
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 *
 * The integer data is pushed through the float-typed SSE load/and/store
 * intrinsics; the AND itself is purely bitwise, so the int32 bit patterns
 * come out unchanged. The aligned load/store intrinsics require all three
 * buffers to be 16-byte aligned.
 */
static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    const float* lhs = (const float*)aVector;
    const float* rhs = (const float*)bVector;
    float* out = (float*)cVector;
    unsigned int blocks;
    unsigned int i;

    /* Vector body: four 32-bit lanes per 128-bit register. */
    for (blocks = num_points / 4; blocks != 0; --blocks) {
        const __m128 x = _mm_load_ps(lhs);
        const __m128 y = _mm_load_ps(rhs);

        _mm_store_ps(out, _mm_and_ps(x, y));

        lhs += 4;
        rhs += 4;
        out += 4;
    }

    /* Scalar tail: start at the largest multiple of four <= num_points. */
    for (i = num_points & ~3u; i < num_points; ++i) {
        cVector[i] = aVector[i] & bVector[i];
    }
}
198 #endif /* LV_HAVE_SSE */
199 
200 
201 #ifdef LV_HAVE_NEON
202 #include <arm_neon.h>
203 
/*
 * Element-wise bitwise AND of two int32 buffers (ARM NEON).
 *
 * cVector    - output buffer, receives aVector[i] & bVector[i]
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 */
static inline void volk_32i_x2_and_32i_neon(int32_t* cVector,
                                            const int32_t* aVector,
                                            const int32_t* bVector,
                                            unsigned int num_points)
{
    /* Number of elements covered by whole 4-lane registers. */
    const unsigned int vec_points = (num_points / 4) * 4;
    unsigned int i = 0;

    /* Vector body: four int32 lanes per 128-bit register. */
    for (; i < vec_points; i += 4) {
        const int32x4_t lhs = vld1q_s32(aVector + i);
        const int32x4_t rhs = vld1q_s32(bVector + i);

        vst1q_s32(cVector + i, vandq_s32(lhs, rhs));
    }

    /* Scalar tail: i already sits at the first unprocessed element. */
    for (; i < num_points; i++) {
        cVector[i] = aVector[i] & bVector[i];
    }
}
231 #endif /* LV_HAVE_NEON */
232 
233 
234 #ifdef LV_HAVE_GENERIC
235 
/*
 * Element-wise bitwise AND of two int32 buffers (portable scalar version).
 *
 * cVector    - output buffer, receives aVector[i] & bVector[i]
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process; 0 writes nothing
 */
static inline void volk_32i_x2_and_32i_generic(int32_t* cVector,
                                               const int32_t* aVector,
                                               const int32_t* bVector,
                                               unsigned int num_points)
{
    unsigned int i;

    for (i = 0; i < num_points; ++i) {
        cVector[i] = aVector[i] & bVector[i];
    }
}
250 #endif /* LV_HAVE_GENERIC */
251 
252 
253 #ifdef LV_HAVE_ORC
/*
 * Declared here, defined outside this header (presumably the ORC-built
 * kernel - confirm against the build).
 */
extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector,
                                           const int32_t* aVector,
                                           const int32_t* bVector,
                                           unsigned int num_points);

/*
 * ORC entry point for the element-wise int32 AND kernel.
 *
 * Forwards all four arguments unchanged to volk_32i_x2_and_32i_a_orc_impl().
 *
 * cVector    - output buffer
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 */
static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points);
}
266 #endif /* LV_HAVE_ORC */
267 
268 
269 #endif /* INCLUDED_volk_32i_x2_and_32i_a_H */
270 
271 
272 #ifndef INCLUDED_volk_32i_x2_and_32i_u_H
273 #define INCLUDED_volk_32i_x2_and_32i_u_H
274 
275 #include <inttypes.h>
276 #include <stdio.h>
277 
278 #ifdef LV_HAVE_AVX512F
279 #include <immintrin.h>
280 
/*
 * Element-wise bitwise AND of two int32 buffers (AVX-512F, unaligned variant).
 *
 * cVector    - output buffer, receives aVector[i] & bVector[i]
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 *
 * Uses the unaligned 512-bit load/store intrinsics, so the buffers need no
 * particular alignment beyond that of int32_t.
 */
static inline void volk_32i_x2_and_32i_u_avx512f(int32_t* cVector,
                                                 const int32_t* aVector,
                                                 const int32_t* bVector,
                                                 unsigned int num_points)
{
    /* Number of elements covered by whole 16-lane registers. */
    const unsigned int vec_points = (num_points / 16) * 16;
    unsigned int i;

    /* Vector body: one 512-bit register (16 x int32) per input per pass. */
    for (i = 0; i < vec_points; i += 16) {
        const __m512i lhs = _mm512_loadu_si512(aVector + i);
        const __m512i rhs = _mm512_loadu_si512(bVector + i);

        _mm512_storeu_si512(cVector + i, _mm512_and_si512(lhs, rhs));
    }

    /* Scalar tail: the up-to-15 elements that do not fill a register. */
    for (i = vec_points; i < num_points; i++) {
        cVector[i] = aVector[i] & bVector[i];
    }
}
313 #endif /* LV_HAVE_AVX512F */
314 
315 #ifdef LV_HAVE_AVX2
316 #include <immintrin.h>
317 
/*
 * Element-wise bitwise AND of two int32 buffers (AVX2, unaligned variant).
 *
 * cVector    - output buffer, receives aVector[i] & bVector[i]
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process
 *
 * Uses the unaligned 256-bit load/store intrinsics, so the buffers need no
 * particular alignment beyond that of int32_t.
 */
static inline void volk_32i_x2_and_32i_u_avx2(int32_t* cVector,
                                              const int32_t* aVector,
                                              const int32_t* bVector,
                                              unsigned int num_points)
{
    /* Number of elements covered by whole 8-lane registers. */
    const unsigned int vec_points = (num_points / 8) * 8;
    unsigned int i;

    /* Vector body: one 256-bit register (8 x int32) per input per pass. */
    for (i = 0; i < vec_points; i += 8) {
        const __m256i lhs = _mm256_loadu_si256((const __m256i*)(aVector + i));
        const __m256i rhs = _mm256_loadu_si256((const __m256i*)(bVector + i));

        _mm256_storeu_si256((__m256i*)(cVector + i), _mm256_and_si256(lhs, rhs));
    }

    /* Scalar tail: the up-to-7 elements that do not fill a register. */
    for (i = vec_points; i < num_points; i++) {
        cVector[i] = aVector[i] & bVector[i];
    }
}
351 #endif /* LV_HAVE_AVX2 */
352 
353 
354 #endif /* INCLUDED_volk_32i_x2_and_32i_u_H */
volk_32i_x2_and_32i_generic
static void volk_32i_x2_and_32i_generic(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_and_32i.h:236
volk_32i_x2_and_32i_neon
static void volk_32i_x2_and_32i_neon(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_and_32i.h:204
volk_32i_x2_and_32i_a_sse
static void volk_32i_x2_and_32i_a_sse(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_and_32i.h:166