volk_16i_x5_add_quad_16i_x4.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */
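/*
 * volk_16i_x5_add_quad_16i_x4: add the common input src0 to each of
 * src1 through src4, writing the four sums to target0 through target3:
 *
 *     target0[n] = src0[n] + src1[n]
 *     target1[n] = src0[n] + src2[n]
 *     target2[n] = src0[n] + src3[n]
 *     target3[n] = src0[n] + src4[n]
 *
 * for n in [0, num_points).
 */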
#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H

#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
#include <xmmintrin.h>

static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0,
                                                      short* target1,
                                                      short* target2,
                                                      short* target3,
                                                      short* src0,
                                                      short* src1,
                                                      short* src2,
                                                      short* src3,
                                                      short* src4,
                                                      unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
    __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2,
        *p_src3, *p_src4;
    p_target0 = (__m128i*)target0;
    p_target1 = (__m128i*)target1;
    p_target2 = (__m128i*)target2;
    p_target3 = (__m128i*)target3;

    p_src0 = (__m128i*)src0;
    p_src1 = (__m128i*)src1;
    p_src2 = (__m128i*)src2;
    p_src3 = (__m128i*)src3;
    p_src4 = (__m128i*)src4;

    int i = 0;

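    /* bound counts the complete 128-bit (8 x int16) chunks; leftovers is
     * num_points % 8, the tail handled by the scalar loop at the end. */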
    int bound = (num_bytes >> 4);
    int leftovers = (num_bytes >> 1) & 7;

    for (; i < bound; ++i) {
        xmm0 = _mm_load_si128(p_src0);
        xmm1 = _mm_load_si128(p_src1);
        xmm2 = _mm_load_si128(p_src2);
        xmm3 = _mm_load_si128(p_src3);
        xmm4 = _mm_load_si128(p_src4);

        p_src0 += 1;
        p_src1 += 1;

        xmm1 = _mm_add_epi16(xmm0, xmm1);
        xmm2 = _mm_add_epi16(xmm0, xmm2);
        xmm3 = _mm_add_epi16(xmm0, xmm3);
        xmm4 = _mm_add_epi16(xmm0, xmm4);

        p_src2 += 1;
        p_src3 += 1;
        p_src4 += 1;

        _mm_store_si128(p_target0, xmm1);
        _mm_store_si128(p_target1, xmm2);
        _mm_store_si128(p_target2, xmm3);
        _mm_store_si128(p_target3, xmm4);

        p_target0 += 1;
        p_target1 += 1;
        p_target2 += 1;
        p_target3 += 1;
    }
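    /* Hand-written inline-assembly version of the loop above, kept
     * disabled for reference. */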
    /*__VOLK_ASM __VOLK_VOLATILE
    (
    ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t"
    "cmp $0, %[bound]\n\t"
    "je .%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t"
    "movaps (%[src0]), %%xmm1\n\t"
    "movaps (%[src1]), %%xmm2\n\t"
    "movaps (%[src2]), %%xmm3\n\t"
    "movaps (%[src3]), %%xmm4\n\t"
    "movaps (%[src4]), %%xmm5\n\t"
    "add $16, %[src0]\n\t"
    "add $16, %[src1]\n\t"
    "add $16, %[src2]\n\t"
    "add $16, %[src3]\n\t"
    "add $16, %[src4]\n\t"
    "paddw %%xmm1, %%xmm2\n\t"
    "paddw %%xmm1, %%xmm3\n\t"
    "paddw %%xmm1, %%xmm4\n\t"
    "paddw %%xmm1, %%xmm5\n\t"
    "add $-1, %[bound]\n\t"
    "movaps %%xmm2, (%[target0])\n\t"
    "movaps %%xmm3, (%[target1])\n\t"
    "movaps %%xmm4, (%[target2])\n\t"
    "movaps %%xmm5, (%[target3])\n\t"
    "add $16, %[target0]\n\t"
    "add $16, %[target1]\n\t"
    "add $16, %[target2]\n\t"
    "add $16, %[target3]\n\t"
    "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t"
    ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t"
    :
    :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2),
    [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1),
    [target2]"r"(target2), [target3]"r"(target3)
    :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
    );
    */

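    /* Scalar tail: finish the remaining num_points % 8 samples. */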
    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
        target0[i] = src0[i] + src1[i];
        target1[i] = src0[i] + src2[i];
        target2[i] = src0[i] + src3[i];
        target3[i] = src0[i] + src4[i];
    }
}
#endif /*LV_HAVE_SSE2*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0,
                                                    short* target1,
                                                    short* target2,
                                                    short* target3,
                                                    short* src0,
                                                    short* src1,
                                                    short* src2,
                                                    short* src3,
                                                    short* src4,
                                                    unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    unsigned int number = 0;

    int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec;
    int16x8_t target0_vec, target1_vec, target2_vec, target3_vec;
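    /* Main loop: each iteration loads eight 16-bit samples from every
     * stream, adds src0 to each of the other four sources, and stores
     * the four results. */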
    for (number = 0; number < eighth_points; ++number) {
        src0_vec = vld1q_s16(src0);
        src1_vec = vld1q_s16(src1);
        src2_vec = vld1q_s16(src2);
        src3_vec = vld1q_s16(src3);
        src4_vec = vld1q_s16(src4);

        target0_vec = vaddq_s16(src0_vec, src1_vec);
        target1_vec = vaddq_s16(src0_vec, src2_vec);
        target2_vec = vaddq_s16(src0_vec, src3_vec);
        target3_vec = vaddq_s16(src0_vec, src4_vec);

        vst1q_s16(target0, target0_vec);
        vst1q_s16(target1, target1_vec);
        vst1q_s16(target2, target2_vec);
        vst1q_s16(target3, target3_vec);
        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        src4 += 8;
        target0 += 8;
        target1 += 8;
        target2 += 8;
        target3 += 8;
    }

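    /* Scalar tail for the remaining num_points % 8 samples. */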
    for (number = eighth_points * 8; number < num_points; ++number) {
        *target0++ = *src0 + *src1++;
        *target1++ = *src0 + *src2++;
        *target2++ = *src0 + *src3++;
        *target3++ = *src0++ + *src4++;
    }
}

#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_GENERIC

static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0,
                                                       short* target1,
                                                       short* target2,
                                                       short* target3,
                                                       short* src0,
                                                       short* src1,
                                                       short* src2,
                                                       short* src3,
                                                       short* src4,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

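    /* num_bytes >> 1 recovers num_points; every sample is summed here. */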
    int bound = num_bytes >> 1;

    for (i = 0; i < bound; ++i) {
        target0[i] = src0[i] + src1[i];
        target1[i] = src0[i] + src2[i];
        target2[i] = src0[i] + src3[i];
        target3[i] = src0[i] + src4[i];
    }
}

#endif /* LV_HAVE_GENERIC */

#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H*/
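A minimal caller sketch, not part of the header above: it exercises the generic kernel stand-alone. Defining LV_HAVE_GENERIC by hand and the <volk/...> include path are assumptions made so the example compiles on its own; in a normal build the VOLK build system supplies the define.

#define LV_HAVE_GENERIC /* normally set by the VOLK build system */
#include <volk/volk_16i_x5_add_quad_16i_x4.h>
#include <stdio.h>

int main(void)
{
    enum { N = 16 }; /* arbitrary illustration size */
    short t0[N], t1[N], t2[N], t3[N];
    short s0[N], s1[N], s2[N], s3[N], s4[N];

    for (int n = 0; n < N; ++n) {
        s0[n] = (short)n; /* shared addend */
        s1[n] = 1;
        s2[n] = 2;
        s3[n] = 3;
        s4[n] = 4;
    }

    volk_16i_x5_add_quad_16i_x4_generic(t0, t1, t2, t3, s0, s1, s2, s3, s4, N);

    /* Each output adds the shared s0 to one other source, so for n = 5:
     * t0[5] = 6, t1[5] = 7, t2[5] = 8, t3[5] = 9. */
    printf("%d %d %d %d\n", t0[5], t1[5], t2[5], t3[5]);
    return 0;
}

The aligned _a_sse2 variant additionally requires 16-byte-aligned buffers; allocating them with VOLK's volk_malloc(size, volk_get_alignment()) and releasing them with volk_free is the usual way to satisfy that.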