#ifndef INCLUDED_volk_32f_x2_subtract_32f_a_H
#define INCLUDED_volk_32f_x2_subtract_32f_a_H

#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief Element-wise subtraction cVector[i] = aVector[i] - bVector[i] (AVX-512F).
 *
 * Full groups of 16 floats go through _mm512_load_ps/_mm512_store_ps, which
 * require 64-byte aligned addresses; the remaining num_points % 16 elements
 * are handled by a scalar loop.
 *
 * \param cVector    output buffer, at least num_points floats
 * \param aVector    minuend buffer, at least num_points floats
 * \param bVector    subtrahend buffer, at least num_points floats
 * \param num_points number of elements to process
 */
static inline void
volk_32f_x2_subtract_32f_a_avx512f(float* cVector,
                                   const float* aVector,
                                   const float* bVector,
                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_load_ps(aPtr);
        bVal = _mm512_load_ps(bPtr);

        cVal = _mm512_sub_ps(aVal, bVal);

        _mm512_store_ps(cPtr, cVal);

        /* Step to the next 16-float group; the scalar tail below relies on
         * the pointers ending up just past the vectorised region. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX512F */

/* NOTE(review): guard name follows the file's LV_HAVE_* pattern -- confirm. */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Element-wise subtraction cVector[i] = aVector[i] - bVector[i] (AVX).
 *
 * Full groups of 8 floats go through _mm256_load_ps/_mm256_store_ps, which
 * require 32-byte aligned addresses; the remaining num_points % 8 elements
 * are handled by a scalar loop.
 *
 * \param cVector    output buffer, at least num_points floats
 * \param aVector    minuend buffer, at least num_points floats
 * \param bVector    subtrahend buffer, at least num_points floats
 * \param num_points number of elements to process
 */
static inline void
volk_32f_x2_subtract_32f_a_avx(float* cVector,
                               const float* aVector,
                               const float* bVector,
                               unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        bVal = _mm256_load_ps(bPtr);

        cVal = _mm256_sub_ps(aVal, bVal);

        _mm256_store_ps(cPtr, cVal);

        /* Step to the next 8-float group. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX */

/* NOTE(review): guard name follows the file's LV_HAVE_* pattern -- confirm. */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*!
 * \brief Element-wise subtraction cVector[i] = aVector[i] - bVector[i] (SSE).
 *
 * Full groups of 4 floats go through _mm_load_ps/_mm_store_ps, which require
 * 16-byte aligned addresses; the remaining num_points % 4 elements are
 * handled by a scalar loop.
 *
 * \param cVector    output buffer, at least num_points floats
 * \param aVector    minuend buffer, at least num_points floats
 * \param bVector    subtrahend buffer, at least num_points floats
 * \param num_points number of elements to process
 */
static inline void
volk_32f_x2_subtract_32f_a_sse(float* cVector,
                               const float* aVector,
                               const float* bVector,
                               unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bVal = _mm_load_ps(bPtr);

        cVal = _mm_sub_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal);

        /* Step to the next 4-float group. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_SSE */

#ifdef LV_HAVE_GENERIC

/*!
 * \brief Element-wise subtraction cVector[i] = aVector[i] - bVector[i],
 *        plain scalar loop with no SIMD intrinsics.
 *
 * \param cVector    output buffer, at least num_points floats
 * \param aVector    minuend buffer, at least num_points floats
 * \param bVector    subtrahend buffer, at least num_points floats
 * \param num_points number of elements to process
 */
static inline void
volk_32f_x2_subtract_32f_generic(float* cVector,
                                 const float* aVector,
                                 const float* bVector,
                                 unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */

/* NOTE(review): guard name follows the file's LV_HAVE_* pattern -- confirm. */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Element-wise subtraction cVector[i] = aVector[i] - bVector[i] (NEON).
 *
 * Full groups of 4 floats are processed with vld1q_f32/vsubq_f32/vst1q_f32;
 * the remaining num_points % 4 elements are handled by a scalar loop.
 *
 * \param cVector    output buffer, at least num_points floats
 * \param aVector    minuend buffer, at least num_points floats
 * \param bVector    subtrahend buffer, at least num_points floats
 * \param num_points number of elements to process
 */
static inline void
volk_32f_x2_subtract_32f_neon(float* cVector,
                              const float* aVector,
                              const float* bVector,
                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;
    unsigned int quarter_points = num_points / 4;

    float32x4_t a_vec, b_vec, c_vec;

    for (number = 0; number < quarter_points; number++) {
        a_vec = vld1q_f32(aPtr);
        b_vec = vld1q_f32(bPtr);
        c_vec = vsubq_f32(a_vec, b_vec);
        vst1q_f32(cPtr, c_vec);

        /* Step to the next 4-float group. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_NEON */

/* NOTE(review): guard name follows the file's LV_HAVE_* pattern -- confirm. */
#ifdef LV_HAVE_ORC

/* Implemented outside this header; only its prototype is visible here. */
extern void
volk_32f_x2_subtract_32f_a_orc_impl(float* cVector,
                                    const float* aVector,
                                    const float* bVector,
                                    unsigned int num_points);

/*!
 * \brief Thin wrapper that forwards all four arguments unchanged to
 *        volk_32f_x2_subtract_32f_a_orc_impl().
 *
 * \param cVector    output buffer
 * \param aVector    first input buffer
 * \param bVector    second input buffer
 * \param num_points number of elements to process
 */
static inline void
volk_32f_x2_subtract_32f_u_orc(float* cVector,
                               const float* aVector,
                               const float* bVector,
                               unsigned int num_points)
{
    volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */

#endif /* INCLUDED_volk_32f_x2_subtract_32f_a_H */
#ifndef INCLUDED_volk_32f_x2_subtract_32f_u_H
#define INCLUDED_volk_32f_x2_subtract_32f_u_H

#include <inttypes.h>

#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*!
 * \brief Element-wise subtraction cVector[i] = aVector[i] - bVector[i]
 *        (AVX-512F, unaligned loads/stores).
 *
 * Full groups of 16 floats go through _mm512_loadu_ps/_mm512_storeu_ps; the
 * remaining num_points % 16 elements are handled by a scalar loop.
 *
 * \param cVector    output buffer, at least num_points floats
 * \param aVector    minuend buffer, at least num_points floats
 * \param bVector    subtrahend buffer, at least num_points floats
 * \param num_points number of elements to process
 */
static inline void
volk_32f_x2_subtract_32f_u_avx512f(float* cVector,
                                   const float* aVector,
                                   const float* bVector,
                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_loadu_ps(aPtr);
        bVal = _mm512_loadu_ps(bPtr);

        cVal = _mm512_sub_ps(aVal, bVal);

        _mm512_storeu_ps(cPtr, cVal);

        /* Step to the next 16-float group; the scalar tail below relies on
         * the pointers ending up just past the vectorised region. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX512F */

/* NOTE(review): guard name follows the file's LV_HAVE_* pattern -- confirm. */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Element-wise subtraction cVector[i] = aVector[i] - bVector[i]
 *        (AVX, unaligned loads/stores).
 *
 * Full groups of 8 floats go through _mm256_loadu_ps/_mm256_storeu_ps; the
 * remaining num_points % 8 elements are handled by a scalar loop.
 *
 * \param cVector    output buffer, at least num_points floats
 * \param aVector    minuend buffer, at least num_points floats
 * \param bVector    subtrahend buffer, at least num_points floats
 * \param num_points number of elements to process
 */
static inline void
volk_32f_x2_subtract_32f_u_avx(float* cVector,
                               const float* aVector,
                               const float* bVector,
                               unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        bVal = _mm256_loadu_ps(bPtr);

        cVal = _mm256_sub_ps(aVal, bVal);

        _mm256_storeu_ps(cPtr, cVal);

        /* Step to the next 8-float group. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX */

#endif /* INCLUDED_volk_32f_x2_subtract_32f_u_H */
static void volk_32f_x2_subtract_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:117
static void volk_32f_x2_subtract_32f_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:208
static void volk_32f_x2_subtract_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:189
static void volk_32f_x2_subtract_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:153
static void volk_32f_x2_subtract_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_subtract_32f.h:300