Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32i_x2_or_32i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
81 #ifndef INCLUDED_volk_32i_x2_or_32i_a_H
82 #define INCLUDED_volk_32i_x2_or_32i_a_H
83 
84 #include <inttypes.h>
85 #include <stdio.h>
86 
87 #ifdef LV_HAVE_AVX512F
88 #include <immintrin.h>
89 
/*
 * Element-wise bitwise OR of two int32 vectors (AVX-512F, aligned variant).
 *
 * cVector    - output buffer; receives aVector[i] | bVector[i]
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process in each buffer
 *
 * The vector loop uses the aligned 512-bit load/store intrinsics, so all
 * three buffers are expected to be 64-byte aligned.
 */
static inline void
volk_32i_x2_or_32i_a_avx512f(int32_t* cVector, const int32_t* aVector,
                             const int32_t* bVector, unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    unsigned int number;

    /* No casts needed: the intrinsics accept (const) void pointers. */
    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Vector body: sixteen 32-bit lanes per iteration. */
    for (number = 0; number < sixteenthPoints; number++) {
        const __m512i aVal = _mm512_load_si512(aPtr);
        const __m512i bVal = _mm512_load_si512(bPtr);

        _mm512_store_si512(cPtr, _mm512_or_si512(aVal, bVal));

        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the num_points % 16 elements the vector loop skipped. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        cVector[number] = aVector[number] | bVector[number];
    }
}
121 #endif /* LV_HAVE_AVX512F */
122 
123 #ifdef LV_HAVE_AVX2
124 #include <immintrin.h>
125 
/*
 * Element-wise bitwise OR of two int32 vectors (AVX2, aligned variant).
 *
 * cVector    - output buffer; receives aVector[i] | bVector[i]
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process in each buffer
 *
 * The vector loop uses the aligned 256-bit load/store intrinsics, so all
 * three buffers are expected to be 32-byte aligned.
 */
static inline void
volk_32i_x2_or_32i_a_avx2(int32_t* cVector, const int32_t* aVector,
                          const int32_t* bVector, unsigned int num_points)
{
    const unsigned int oneEightPoints = num_points / 8;
    unsigned int number;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Vector body: eight 32-bit lanes per iteration. */
    for (number = 0; number < oneEightPoints; number++) {
        /* Keep the const qualifier on the input pointers when casting. */
        const __m256i aVal = _mm256_load_si256((const __m256i*)aPtr);
        const __m256i bVal = _mm256_load_si256((const __m256i*)bPtr);

        _mm256_store_si256((__m256i*)cPtr, _mm256_or_si256(aVal, bVal));

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the num_points % 8 elements the vector loop skipped. */
    for (number = oneEightPoints * 8; number < num_points; number++) {
        cVector[number] = aVector[number] | bVector[number];
    }
}
157 #endif /* LV_HAVE_AVX2 */
158 
159 
160 #ifdef LV_HAVE_SSE
161 #include <xmmintrin.h>
162 
/*
 * Element-wise bitwise OR of two int32 vectors (SSE, aligned variant).
 *
 * cVector    - output buffer; receives aVector[i] | bVector[i]
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process in each buffer
 *
 * The integer data is moved through the packed-float load/or/store
 * intrinsics; the OR is purely bitwise, so the integer bit patterns are
 * combined unchanged. The aligned load/store forms are used, so all three
 * buffers are expected to be 16-byte aligned.
 */
static inline void
volk_32i_x2_or_32i_a_sse(int32_t* cVector, const int32_t* aVector,
                         const int32_t* bVector, unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;
    unsigned int number;

    /* Reinterpret as float lanes; keep const on the two inputs. */
    float* cPtr = (float*)cVector;
    const float* aPtr = (const float*)aVector;
    const float* bPtr = (const float*)bVector;

    /* Vector body: four 32-bit lanes per iteration. */
    for (number = 0; number < quarterPoints; number++) {
        const __m128 aVal = _mm_load_ps(aPtr);
        const __m128 bVal = _mm_load_ps(bPtr);

        _mm_store_ps(cPtr, _mm_or_ps(aVal, bVal));

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the num_points % 4 elements the vector loop skipped. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        cVector[number] = aVector[number] | bVector[number];
    }
}
193 #endif /* LV_HAVE_SSE */
194 
195 
196 #ifdef LV_HAVE_NEON
197 #include <arm_neon.h>
198 
/*
 * Element-wise bitwise OR of two int32 vectors (ARM NEON).
 *
 * cVector    - output buffer; receives aVector[i] | bVector[i]
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process in each buffer
 */
static inline void
volk_32i_x2_or_32i_neon(int32_t* cVector, const int32_t* aVector,
                        const int32_t* bVector, unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Vector body: four 32-bit lanes per iteration. */
    for (number = 0; number < quarter_points; number++) {
        const int32x4_t a_val = vld1q_s32(aPtr);
        const int32x4_t b_val = vld1q_s32(bPtr);

        vst1q_s32(cPtr, vorrq_s32(a_val, b_val));

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the num_points % 4 elements the vector loop skipped. */
    for (number = quarter_points * 4; number < num_points; number++) {
        cVector[number] = aVector[number] | bVector[number];
    }
}
225 #endif /* LV_HAVE_NEON */
226 
227 
228 #ifdef LV_HAVE_GENERIC
229 
/*
 * Element-wise bitwise OR of two int32 vectors (portable C reference).
 *
 * cVector    - output buffer; receives aVector[i] | bVector[i]
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process in each buffer
 *
 * Writes exactly num_points elements; when num_points is 0 nothing is
 * read or written.
 */
static inline void
volk_32i_x2_or_32i_generic(int32_t* cVector, const int32_t* aVector,
                           const int32_t* bVector, unsigned int num_points)
{
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        cVector[number] = aVector[number] | bVector[number];
    }
}
243 #endif /* LV_HAVE_GENERIC */
244 
245 
246 #ifdef LV_HAVE_ORC
/* ORC implementation of the kernel; defined outside this header. */
extern void
volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector, const int32_t* aVector,
                              const int32_t* bVector, unsigned int num_points);

/*
 * Element-wise bitwise OR of two int32 vectors (ORC variant).
 *
 * cVector    - output buffer
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process in each buffer
 *
 * Thin wrapper: forwards all arguments unchanged to
 * volk_32i_x2_or_32i_a_orc_impl.
 */
static inline void
volk_32i_x2_or_32i_u_orc(int32_t* cVector, const int32_t* aVector,
                         const int32_t* bVector, unsigned int num_points)
{
    volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
}
257 #endif /* LV_HAVE_ORC */
258 
259 
260 #endif /* INCLUDED_volk_32i_x2_or_32i_a_H */
261 
262 
263 #ifndef INCLUDED_volk_32i_x2_or_32i_u_H
264 #define INCLUDED_volk_32i_x2_or_32i_u_H
265 
266 #include <inttypes.h>
267 #include <stdio.h>
268 
269 #ifdef LV_HAVE_AVX512F
270 #include <immintrin.h>
271 
/*
 * Element-wise bitwise OR of two int32 vectors (AVX-512F, unaligned variant).
 *
 * cVector    - output buffer; receives aVector[i] | bVector[i]
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process in each buffer
 *
 * The vector loop uses the unaligned 512-bit load/store intrinsics, so the
 * buffers need no particular alignment.
 */
static inline void
volk_32i_x2_or_32i_u_avx512f(int32_t* cVector, const int32_t* aVector,
                             const int32_t* bVector, unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    unsigned int number;

    /* No casts needed: the intrinsics accept (const) void pointers. */
    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Vector body: sixteen 32-bit lanes per iteration. */
    for (number = 0; number < sixteenthPoints; number++) {
        const __m512i aVal = _mm512_loadu_si512(aPtr);
        const __m512i bVal = _mm512_loadu_si512(bPtr);

        _mm512_storeu_si512(cPtr, _mm512_or_si512(aVal, bVal));

        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the num_points % 16 elements the vector loop skipped. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        cVector[number] = aVector[number] | bVector[number];
    }
}
303 #endif /* LV_HAVE_AVX512F */
304 
305 #ifdef LV_HAVE_AVX2
306 #include <immintrin.h>
307 
/*
 * Element-wise bitwise OR of two int32 vectors (AVX2, unaligned variant).
 *
 * cVector    - output buffer; receives aVector[i] | bVector[i]
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of int32 elements to process in each buffer
 *
 * The vector loop uses the unaligned 256-bit load/store intrinsics, so the
 * buffers need no particular alignment.
 */
static inline void
volk_32i_x2_or_32i_u_avx2(int32_t* cVector, const int32_t* aVector,
                          const int32_t* bVector, unsigned int num_points)
{
    const unsigned int oneEightPoints = num_points / 8;
    unsigned int number;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Vector body: eight 32-bit lanes per iteration. */
    for (number = 0; number < oneEightPoints; number++) {
        /* Keep the const qualifier on the input pointers when casting. */
        const __m256i aVal = _mm256_loadu_si256((const __m256i*)aPtr);
        const __m256i bVal = _mm256_loadu_si256((const __m256i*)bPtr);

        _mm256_storeu_si256((__m256i*)cPtr, _mm256_or_si256(aVal, bVal));

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the num_points % 8 elements the vector loop skipped. */
    for (number = oneEightPoints * 8; number < num_points; number++) {
        cVector[number] = aVector[number] | bVector[number];
    }
}
339 #endif /* LV_HAVE_AVX2 */
340 
341 
342 #endif /* INCLUDED_volk_32i_x2_or_32i_u_H */
static void volk_32i_x2_or_32i_a_sse(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_or_32i.h:164
static void volk_32i_x2_or_32i_generic(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_or_32i.h:231
static void volk_32i_x2_or_32i_neon(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_or_32i.h:200