81 #ifndef INCLUDED_volk_32i_x2_or_32i_a_H
82 #define INCLUDED_volk_32i_x2_or_32i_a_H
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*
 * Element-wise bitwise OR of two int32 vectors using AVX-512F:
 *   cVector[i] = aVector[i] | bVector[i]   for 0 <= i < num_points
 *
 * cVector     output buffer, at least num_points elements
 * aVector     first input buffer
 * bVector     second input buffer
 * num_points  number of int32 elements to process
 *
 * The vector loop uses the aligned load/store intrinsics
 * (_mm512_load_si512 / _mm512_store_si512), so all three buffers must be
 * 64-byte aligned.
 */
static inline void volk_32i_x2_or_32i_a_avx512f(int32_t* cVector,
                                                const int32_t* aVector,
                                                const int32_t* bVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Main loop: 16 int32 lanes (one 512-bit register) per iteration. */
    for (; number < sixteenthPoints; number++) {
        const __m512i aVal = _mm512_load_si512(aPtr);
        const __m512i bVal = _mm512_load_si512(bPtr);
        const __m512i cVal = _mm512_or_si512(aVal, bVal);

        _mm512_store_si512(cPtr, cVal);

        /* Step to the next group of 16; without this every iteration
         * would reprocess the first 16 elements. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail for the num_points % 16 leftover elements. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        cVector[number] = aVector[number] | bVector[number];
    }
}
#endif /* LV_HAVE_AVX512F */
/* NOTE(review): the guard line for this section is not present in the
 * listing; LV_HAVE_AVX2 follows the LV_HAVE_<ARCH> convention used by the
 * other kernels in this file -- confirm against the build configuration. */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * Element-wise bitwise OR of two int32 vectors using AVX2:
 *   cVector[i] = aVector[i] | bVector[i]   for 0 <= i < num_points
 *
 * cVector     output buffer, at least num_points elements
 * aVector     first input buffer
 * bVector     second input buffer
 * num_points  number of int32 elements to process
 *
 * The vector loop uses the aligned load/store intrinsics
 * (_mm256_load_si256 / _mm256_store_si256), so all three buffers must be
 * 32-byte aligned.
 */
static inline void volk_32i_x2_or_32i_a_avx2(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int oneEightPoints = num_points / 8;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Main loop: 8 int32 lanes (one 256-bit register) per iteration. */
    for (; number < oneEightPoints; number++) {
        const __m256i aVal = _mm256_load_si256((const __m256i*)aPtr);
        const __m256i bVal = _mm256_load_si256((const __m256i*)bPtr);
        const __m256i cVal = _mm256_or_si256(aVal, bVal);

        _mm256_store_si256((__m256i*)cPtr, cVal);

        /* Step to the next group of 8 elements. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the num_points % 8 leftover elements. */
    for (number = oneEightPoints * 8; number < num_points; number++) {
        cVector[number] = aVector[number] | bVector[number];
    }
}
#endif /* LV_HAVE_AVX2 */
/* NOTE(review): the guard line and the first line of the function header
 * are not present in the listing. LV_HAVE_SSE and the name
 * volk_32i_x2_or_32i_a_sse follow the naming pattern of the sibling
 * kernels in this file -- confirm against the dispatcher. */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*
 * Element-wise bitwise OR of two int32 vectors using SSE:
 *   cVector[i] = aVector[i] | bVector[i]   for 0 <= i < num_points
 *
 * cVector     output buffer, at least num_points elements
 * aVector     first input buffer
 * bVector     second input buffer
 * num_points  number of int32 elements to process
 *
 * The data is moved through __m128 (float) registers because this code
 * path uses only <xmmintrin.h>; _mm_or_ps is a pure bitwise OR, so the
 * integer bit patterns pass through unchanged. _mm_load_ps/_mm_store_ps
 * require all three buffers to be 16-byte aligned.
 */
static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector,
                                            const int32_t* aVector,
                                            const int32_t* bVector,
                                            unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = (float*)cVector;
    const float* aPtr = (const float*)aVector;
    const float* bPtr = (const float*)bVector;

    /* Main loop: 4 x 32-bit lanes (one 128-bit register) per iteration. */
    for (; number < quarterPoints; number++) {
        const __m128 aVal = _mm_load_ps(aPtr);
        const __m128 bVal = _mm_load_ps(bPtr);
        const __m128 cVal = _mm_or_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal);

        /* Step to the next group of 4 elements. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the num_points % 4 leftover elements. */
    for (number = quarterPoints * 4; number < num_points; number++) {
        cVector[number] = aVector[number] | bVector[number];
    }
}
#endif /* LV_HAVE_SSE */
/* NOTE(review): the guard line and the first line of the function header
 * are not present in the listing. LV_HAVE_NEON and the name
 * volk_32i_x2_or_32i_neon follow the naming pattern of the sibling
 * kernels in this file -- confirm against the dispatcher. */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*
 * Element-wise bitwise OR of two int32 vectors using NEON:
 *   cVector[i] = aVector[i] | bVector[i]   for 0 <= i < num_points
 *
 * cVector     output buffer, at least num_points elements
 * aVector     first input buffer
 * bVector     second input buffer
 * num_points  number of int32 elements to process
 */
static inline void volk_32i_x2_or_32i_neon(int32_t* cVector,
                                           const int32_t* aVector,
                                           const int32_t* bVector,
                                           unsigned int num_points)
{
    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    /* Main loop: 4 int32 lanes (one 128-bit register) per iteration. */
    for (number = 0; number < quarter_points; number++) {
        const int32x4_t a_val = vld1q_s32(aPtr);
        const int32x4_t b_val = vld1q_s32(bPtr);
        const int32x4_t c_val = vorrq_s32(a_val, b_val);

        vst1q_s32(cPtr, c_val);

        /* The scalar tail below continues from these pointers, so they
         * must end up just past the last vectorised element. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the num_points % 4 leftover elements. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) | (*bPtr++);
    }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC

/*
 * Portable element-wise bitwise OR of two int32 vectors:
 *   cVector[i] = aVector[i] | bVector[i]   for 0 <= i < num_points
 *
 * cVector     output buffer, at least num_points elements
 * aVector     first input buffer
 * bVector     second input buffer
 * num_points  number of int32 elements to process; 0 leaves cVector untouched
 *
 * NOTE(review): the first line of the function header is not present in
 * the listing; the name volk_32i_x2_or_32i_generic follows the naming
 * pattern of the sibling kernels -- confirm against the dispatcher.
 */
static inline void volk_32i_x2_or_32i_generic(int32_t* cVector,
                                              const int32_t* aVector,
                                              const int32_t* bVector,
                                              unsigned int num_points)
{
    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) | (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
/* NOTE(review): the guard line for this section is not present in the
 * listing; LV_HAVE_ORC follows the LV_HAVE_<ARCH> convention used by the
 * other kernels in this file -- confirm against the build configuration. */
#ifdef LV_HAVE_ORC

/* Implemented outside this header; only the prototype is visible here. */
extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector,
                                          const int32_t* aVector,
                                          const int32_t* bVector,
                                          unsigned int num_points);

/*
 * Thin wrapper that forwards all four arguments, unchanged, to
 * volk_32i_x2_or_32i_a_orc_impl().
 *
 * cVector     output buffer
 * aVector     first input buffer
 * bVector     second input buffer
 * num_points  number of int32 elements to process
 *
 * NOTE(review): presumably computes cVector[i] = aVector[i] | bVector[i]
 * like the sibling kernels; the implementation is not visible from this
 * header, so verify against the ORC source.
 */
static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector,
                                            const int32_t* aVector,
                                            const int32_t* bVector,
                                            unsigned int num_points)
{
    volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
271 #ifndef INCLUDED_volk_32i_x2_or_32i_u_H
272 #define INCLUDED_volk_32i_x2_or_32i_u_H
274 #include <inttypes.h>
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*
 * Element-wise bitwise OR of two int32 vectors using AVX-512F:
 *   cVector[i] = aVector[i] | bVector[i]   for 0 <= i < num_points
 *
 * cVector     output buffer, at least num_points elements
 * aVector     first input buffer
 * bVector     second input buffer
 * num_points  number of int32 elements to process
 *
 * Uses the unaligned load/store intrinsics (_mm512_loadu_si512 /
 * _mm512_storeu_si512), so the buffers need no particular alignment.
 */
static inline void volk_32i_x2_or_32i_u_avx512f(int32_t* cVector,
                                                const int32_t* aVector,
                                                const int32_t* bVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Main loop: 16 int32 lanes (one 512-bit register) per iteration. */
    for (; number < sixteenthPoints; number++) {
        const __m512i aVal = _mm512_loadu_si512(aPtr);
        const __m512i bVal = _mm512_loadu_si512(bPtr);
        const __m512i cVal = _mm512_or_si512(aVal, bVal);

        _mm512_storeu_si512(cPtr, cVal);

        /* Step to the next group of 16; without this every iteration
         * would reprocess the first 16 elements. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail for the num_points % 16 leftover elements. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        cVector[number] = aVector[number] | bVector[number];
    }
}
#endif /* LV_HAVE_AVX512F */
/* NOTE(review): the guard line for this section is not present in the
 * listing; LV_HAVE_AVX2 follows the LV_HAVE_<ARCH> convention used by the
 * other kernels in this file -- confirm against the build configuration. */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * Element-wise bitwise OR of two int32 vectors using AVX2:
 *   cVector[i] = aVector[i] | bVector[i]   for 0 <= i < num_points
 *
 * cVector     output buffer, at least num_points elements
 * aVector     first input buffer
 * bVector     second input buffer
 * num_points  number of int32 elements to process
 *
 * Uses the unaligned load/store intrinsics (_mm256_loadu_si256 /
 * _mm256_storeu_si256), so the buffers need no particular alignment.
 */
static inline void volk_32i_x2_or_32i_u_avx2(int32_t* cVector,
                                             const int32_t* aVector,
                                             const int32_t* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int oneEightPoints = num_points / 8;

    int32_t* cPtr = cVector;
    const int32_t* aPtr = aVector;
    const int32_t* bPtr = bVector;

    /* Main loop: 8 int32 lanes (one 256-bit register) per iteration. */
    for (; number < oneEightPoints; number++) {
        const __m256i aVal = _mm256_loadu_si256((const __m256i*)aPtr);
        const __m256i bVal = _mm256_loadu_si256((const __m256i*)bPtr);
        const __m256i cVal = _mm256_or_si256(aVal, bVal);

        _mm256_storeu_si256((__m256i*)cPtr, cVal);

        /* Step to the next group of 8 elements. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the num_points % 8 leftover elements. */
    for (number = oneEightPoints * 8; number < num_points; number++) {
        cVector[number] = aVector[number] | bVector[number];
    }
}
#endif /* LV_HAVE_AVX2 */