Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_8i_convert_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
53 #ifndef INCLUDED_volk_8i_convert_16i_u_H
54 #define INCLUDED_volk_8i_convert_16i_u_H
55 
56 #include <inttypes.h>
57 #include <stdio.h>
58 
59 #ifdef LV_HAVE_AVX2
60 #include <immintrin.h>
61 
62 static inline void
63 volk_8i_convert_16i_u_avx2(int16_t* outputVector, const int8_t* inputVector,
64  unsigned int num_points)
65 {
66  unsigned int number = 0;
67  const unsigned int sixteenthPoints = num_points / 16;
68 
69  const __m128i* inputVectorPtr = (const __m128i*)inputVector;
70  __m256i* outputVectorPtr = (__m256i*)outputVector;
71  __m128i inputVal;
72  __m256i ret;
73 
74  for(;number < sixteenthPoints; number++){
75  inputVal = _mm_loadu_si128(inputVectorPtr);
76  ret = _mm256_cvtepi8_epi16(inputVal);
77  ret = _mm256_slli_epi16(ret, 8); // Multiply by 256
78  _mm256_storeu_si256(outputVectorPtr, ret);
79 
80  outputVectorPtr++;
81  inputVectorPtr++;
82  }
83 
84  number = sixteenthPoints * 16;
85  for(; number < num_points; number++){
86  outputVector[number] = (int16_t)(inputVector[number])*256;
87  }
88 }
89 #endif /* LV_HAVE_AVX2 */
90 
91 
92 #ifdef LV_HAVE_SSE4_1
93 #include <smmintrin.h>
94 
95 static inline void
96 volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector,
97  unsigned int num_points)
98 {
99  unsigned int number = 0;
100  const unsigned int sixteenthPoints = num_points / 16;
101 
102  const __m128i* inputVectorPtr = (const __m128i*)inputVector;
103  __m128i* outputVectorPtr = (__m128i*)outputVector;
104  __m128i inputVal;
105  __m128i ret;
106 
107  for(;number < sixteenthPoints; number++){
108  inputVal = _mm_loadu_si128(inputVectorPtr);
109  ret = _mm_cvtepi8_epi16(inputVal);
110  ret = _mm_slli_epi16(ret, 8); // Multiply by 256
111  _mm_storeu_si128(outputVectorPtr, ret);
112 
113  outputVectorPtr++;
114 
115  inputVal = _mm_srli_si128(inputVal, 8);
116  ret = _mm_cvtepi8_epi16(inputVal);
117  ret = _mm_slli_epi16(ret, 8); // Multiply by 256
118  _mm_storeu_si128(outputVectorPtr, ret);
119 
120  outputVectorPtr++;
121 
122  inputVectorPtr++;
123  }
124 
125  number = sixteenthPoints * 16;
126  for(; number < num_points; number++){
127  outputVector[number] = (int16_t)(inputVector[number])*256;
128  }
129 }
130 #endif /* LV_HAVE_SSE4_1 */
131 
132 
133 #ifdef LV_HAVE_GENERIC
134 
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (portable C implementation).
 *
 * outputVector: receives num_points int16_t results.
 * inputVector:  num_points int8_t samples to convert.
 * num_points:   number of samples to process.
 */
static inline void
volk_8i_convert_16i_generic(int16_t* outputVector, const int8_t* inputVector,
                            unsigned int num_points)
{
  unsigned int idx;

  for(idx = 0; idx < num_points; idx++){
    /* Widen first, then scale: -128..127 maps to -32768..32512. */
    outputVector[idx] = ((int16_t)(inputVector[idx])) * 256;
  }
}
147 #endif /* LV_HAVE_GENERIC */
148 
149 
150 #endif /* INCLUDED_volk_8i_convert_16i_u_H */
151 
152 
153 
154 #ifndef INCLUDED_volk_8i_convert_16i_a_H
155 #define INCLUDED_volk_8i_convert_16i_a_H
156 
157 #include <inttypes.h>
158 #include <stdio.h>
159 
160 #ifdef LV_HAVE_AVX2
161 #include <immintrin.h>
162 
163 static inline void
164 volk_8i_convert_16i_a_avx2(int16_t* outputVector, const int8_t* inputVector,
165  unsigned int num_points)
166 {
167  unsigned int number = 0;
168  const unsigned int sixteenthPoints = num_points / 16;
169 
170  const __m128i* inputVectorPtr = (const __m128i*)inputVector;
171  __m256i* outputVectorPtr = (__m256i*)outputVector;
172  __m128i inputVal;
173  __m256i ret;
174 
175  for(;number < sixteenthPoints; number++){
176  inputVal = _mm_load_si128(inputVectorPtr);
177  ret = _mm256_cvtepi8_epi16(inputVal);
178  ret = _mm256_slli_epi16(ret, 8); // Multiply by 256
179  _mm256_store_si256(outputVectorPtr, ret);
180 
181  outputVectorPtr++;
182  inputVectorPtr++;
183  }
184 
185  number = sixteenthPoints * 16;
186  for(; number < num_points; number++){
187  outputVector[number] = (int16_t)(inputVector[number])*256;
188  }
189 }
190 #endif /* LV_HAVE_AVX2 */
191 
192 
193 #ifdef LV_HAVE_SSE4_1
194 #include <smmintrin.h>
195 
196 static inline void
197 volk_8i_convert_16i_a_sse4_1(int16_t* outputVector, const int8_t* inputVector,
198  unsigned int num_points)
199 {
200  unsigned int number = 0;
201  const unsigned int sixteenthPoints = num_points / 16;
202 
203  const __m128i* inputVectorPtr = (const __m128i*)inputVector;
204  __m128i* outputVectorPtr = (__m128i*)outputVector;
205  __m128i inputVal;
206  __m128i ret;
207 
208  for(;number < sixteenthPoints; number++){
209  inputVal = _mm_load_si128(inputVectorPtr);
210  ret = _mm_cvtepi8_epi16(inputVal);
211  ret = _mm_slli_epi16(ret, 8); // Multiply by 256
212  _mm_store_si128(outputVectorPtr, ret);
213 
214  outputVectorPtr++;
215 
216  inputVal = _mm_srli_si128(inputVal, 8);
217  ret = _mm_cvtepi8_epi16(inputVal);
218  ret = _mm_slli_epi16(ret, 8); // Multiply by 256
219  _mm_store_si128(outputVectorPtr, ret);
220 
221  outputVectorPtr++;
222 
223  inputVectorPtr++;
224  }
225 
226  number = sixteenthPoints * 16;
227  for(; number < num_points; number++){
228  outputVector[number] = (int16_t)(inputVector[number])*256;
229  }
230 }
231 #endif /* LV_HAVE_SSE4_1 */
232 
233 
234 #ifdef LV_HAVE_GENERIC
235 
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (portable C implementation, "aligned" entry point).
 *
 * outputVector: receives num_points int16_t results.
 * inputVector:  num_points int8_t samples to convert.
 * num_points:   number of samples to process.
 *
 * The body uses only scalar accesses, so it places no alignment
 * requirement on either buffer.
 */
static inline void
volk_8i_convert_16i_a_generic(int16_t* outputVector, const int8_t* inputVector,
                              unsigned int num_points)
{
  const int8_t* src = inputVector;
  const int8_t* const srcEnd = inputVector + num_points;
  int16_t* dst = outputVector;

  while(src != srcEnd){
    /* Widen first, then scale: -128..127 maps to -32768..32512. */
    *dst = ((int16_t)(*src)) * 256;
    ++src;
    ++dst;
  }
}
248 #endif /* LV_HAVE_GENERIC */
249 
250 
251 #ifdef LV_HAVE_NEON
252 #include <arm_neon.h>
253 
/*
 * Convert signed 8-bit samples to signed 16-bit samples scaled by 256
 * (ARM NEON).
 *
 * outputVector: receives num_points int16_t results.
 * inputVector:  num_points int8_t samples to convert.
 * num_points:   number of samples to process.
 *
 * Full groups of 8 samples go through the vector path; the remaining
 * num_points % 8 samples are converted in a scalar loop.
 */
static inline void
volk_8i_convert_16i_neon(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points)
{
  const unsigned int blockCount = num_points / 8;
  unsigned int idx;

  for(idx = 0; idx < blockCount; ++idx) {
    /* Load 8 bytes and sign-extend them into eight 16-bit lanes. */
    const int8x8_t narrow = vld1_s8(inputVector + 8 * idx);
    const int16x8_t wide = vmovl_s8(narrow);

    /* Shifting each lane left by 8 is the multiply by 256. */
    vst1q_s16(outputVector + 8 * idx, vshlq_n_s16(wide, 8));
  }

  /* Scalar tail for the final num_points % 8 samples. */
  for(idx = blockCount * 8; idx < num_points; idx++){
    outputVector[idx] = ((int16_t)(inputVector[idx])) * 256;
  }
}
283 #endif /* LV_HAVE_NEON */
284 
285 
286 #ifdef LV_HAVE_ORC
/* Conversion kernel implemented outside this header; only declared here. */
extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector,
                                           const int8_t* inputVector,
                                           unsigned int num_points);

/*
 * ORC entry point for the 8-bit to 16-bit conversion.
 *
 * Forwards all three arguments unchanged to volk_8i_convert_16i_a_orc_impl
 * and does no work of its own.
 *
 * outputVector: destination buffer passed through to the implementation.
 * inputVector:  source buffer passed through to the implementation.
 * num_points:   sample count passed through to the implementation.
 */
static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector,
                                             const int8_t* inputVector,
                                             unsigned int num_points)
{
  volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
}
297 #endif /* LV_HAVE_ORC */
298 
299 
300 
301 #endif /* INCLUDED_volk_8i_convert_16i_a_H */
static void volk_8i_convert_16i_a_generic(int16_t *outputVector, const int8_t *inputVector, unsigned int num_points)
Definition: volk_8i_convert_16i.h:237
static void volk_8i_convert_16i_neon(int16_t *outputVector, const int8_t *inputVector, unsigned int num_points)
Definition: volk_8i_convert_16i.h:255
static void volk_8i_convert_16i_generic(int16_t *outputVector, const int8_t *inputVector, unsigned int num_points)
Definition: volk_8i_convert_16i.h:136