#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0,
                                                      short* target1,
                                                      short* target2,
                                                      short* target3,
                                                      short* src0,
                                                      short* src1,
                                                      short* src2,
                                                      short* src3,
                                                      short* src4,
                                                      unsigned int num_points)
{
    /* Each sample is a 16-bit short, so the buffers span num_points * 2 bytes;
       this byte count drives the vector and scalar loop bounds below. */
    const unsigned int num_bytes = num_points * 2;
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
    __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1,
        *p_src2, *p_src3, *p_src4;

    p_target0 = (__m128i*)target0;
    p_target1 = (__m128i*)target1;
    p_target2 = (__m128i*)target2;
    p_target3 = (__m128i*)target3;

    p_src0 = (__m128i*)src0;
    p_src1 = (__m128i*)src1;
    p_src2 = (__m128i*)src2;
    p_src3 = (__m128i*)src3;
    p_src4 = (__m128i*)src4;
    int i = 0;

    int bound = (num_bytes >> 4);
    int leftovers = (num_bytes >> 1) & 7;
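    /* bound = num_bytes / 16 is the number of full 128-bit (eight-short) SSE
       vectors; leftovers = num_points % 8 is the tail finished by the scalar
       loop below. */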
    for (; i < bound; ++i) {
        xmm0 = _mm_load_si128(p_src0);
        xmm1 = _mm_load_si128(p_src1);
        xmm2 = _mm_load_si128(p_src2);
        xmm3 = _mm_load_si128(p_src3);
        xmm4 = _mm_load_si128(p_src4);

        p_src0 += 1;
        p_src1 += 1;
        /* src0 is the shared addend of all four sums. */
        xmm1 = _mm_add_epi16(xmm0, xmm1);
        xmm2 = _mm_add_epi16(xmm0, xmm2);
        xmm3 = _mm_add_epi16(xmm0, xmm3);
        xmm4 = _mm_add_epi16(xmm0, xmm4);

        p_src2 += 1;
        p_src3 += 1;
        p_src4 += 1;
        _mm_store_si128(p_target0, xmm1);
        _mm_store_si128(p_target1, xmm2);
        _mm_store_si128(p_target2, xmm3);
        _mm_store_si128(p_target3, xmm4);

        p_target0 += 1;
        p_target1 += 1;
        p_target2 += 1;
        p_target3 += 1;
    }
    /* Scalar tail: finish the num_points % 8 samples the vector loop
       did not cover. */
    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
        target0[i] = src0[i] + src1[i];
        target1[i] = src0[i] + src2[i];
        target2[i] = src0[i] + src3[i];
        target3[i] = src0[i] + src4[i];
    }
}
#endif /* LV_HAVE_SSE2 */
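/*
 * Illustrative usage sketch (not part of the original header): every variant
 * computes four sums that share src0, i.e. target0[i] = src0[i] + src1[i],
 * target1[i] = src0[i] + src2[i], target2[i] = src0[i] + src3[i], and
 * target3[i] = src0[i] + src4[i]. Buffer names and sizes below are made up;
 * the _a (aligned) SSE2 variant additionally requires 16-byte-aligned buffers
 * because it uses _mm_load_si128/_mm_store_si128.
 *
 *   short t0[64], t1[64], t2[64], t3[64];
 *   short s0[64], s1[64], s2[64], s3[64], s4[64];
 *   // ... fill s0..s4 ...
 *   volk_16i_x5_add_quad_16i_x4_generic(t0, t1, t2, t3,
 *                                       s0, s1, s2, s3, s4, 64);
 */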
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0,
                                                    short* target1,
                                                    short* target2,
                                                    short* target3,
                                                    short* src0,
                                                    short* src1,
                                                    short* src2,
                                                    short* src3,
                                                    short* src4,
                                                    unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
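    /* An int16x8_t holds eight 16-bit lanes, so the main loop runs
       num_points / 8 times; the remainder is handled in the scalar tail. */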
    unsigned int number = 0;
    int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec;
    int16x8_t target0_vec, target1_vec, target2_vec, target3_vec;
    for (number = 0; number < eighth_points; ++number) {
        src0_vec = vld1q_s16(src0);
        src1_vec = vld1q_s16(src1);
        src2_vec = vld1q_s16(src2);
        src3_vec = vld1q_s16(src3);
        src4_vec = vld1q_s16(src4);
        target0_vec = vaddq_s16(src0_vec, src1_vec);
        target1_vec = vaddq_s16(src0_vec, src2_vec);
        target2_vec = vaddq_s16(src0_vec, src3_vec);
        target3_vec = vaddq_s16(src0_vec, src4_vec);
        vst1q_s16(target0, target0_vec);
        vst1q_s16(target1, target1_vec);
        vst1q_s16(target2, target2_vec);
        vst1q_s16(target3, target3_vec);

        /* Advance every pointer past the eight shorts just processed. */
        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        src4 += 8;
        target0 += 8;
        target1 += 8;
        target2 += 8;
        target3 += 8;
    }
    /* Scalar tail; src0 is advanced only in the last statement because it
       feeds all four sums. */
    for (number = eighth_points * 8; number < num_points; ++number) {
        *target0++ = *src0 + *src1++;
        *target1++ = *src0 + *src2++;
        *target2++ = *src0 + *src3++;
        *target3++ = *src0++ + *src4++;
    }
}
#endif /* LV_HAVE_NEON */
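/*
 * Loop-accounting sketch (illustrative numbers, not from the original header):
 * for num_points = 20, the NEON path runs eighth_points = 20 / 8 = 2 vector
 * iterations covering samples 0..15, and the scalar tail then covers indices
 * 16..19, so every sample is written exactly once.
 */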
#ifdef LV_HAVE_GENERIC

static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0,
                                                       short* target1,
                                                       short* target2,
                                                       short* target3,
                                                       short* src0,
                                                       short* src1,
                                                       short* src2,
                                                       short* src3,
                                                       short* src4,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    /* num_bytes >> 1 recovers num_points, so the loop runs once per sample. */
    int bound = num_bytes >> 1;
    for (i = 0; i < bound; ++i) {
        target0[i] = src0[i] + src1[i];
        target1[i] = src0[i] + src2[i];
        target2[i] = src0[i] + src3[i];
        target3[i] = src0[i] + src4[i];
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H */