Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_16i_convert_8i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
53 #ifndef INCLUDED_volk_16i_convert_8i_u_H
54 #define INCLUDED_volk_16i_convert_8i_u_H
55 
56 #include <inttypes.h>
57 #include <stdio.h>
58 
59 #ifdef LV_HAVE_AVX2
60 #include <immintrin.h>
61 
/*
 * Converts 16-bit signed samples to 8-bit signed samples by keeping the
 * upper byte of every input value (arithmetic shift right by 8).
 *
 * AVX2 implementation using unaligned loads and stores. 32 samples are
 * converted per vector iteration; the remaining (num_points % 32) samples
 * are converted with scalar code.
 *
 * outputVector  destination buffer, written for indices [0, num_points)
 * inputVector   source buffer, read for indices [0, num_points)
 * num_points    number of samples to convert
 */
static inline void
volk_16i_convert_8i_u_avx2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
{
  const unsigned int thirtysecondPoints = num_points / 32;
  const __m256i* inputPtr = (const __m256i*)inputVector;
  __m256i* outputPtr = (__m256i*)outputVector;
  unsigned int number;

  for(number = 0; number < thirtysecondPoints; number++){
    // Load 32 input values as two vectors of 16 x int16
    __m256i inputVal1 = _mm256_loadu_si256(inputPtr++);
    __m256i inputVal2 = _mm256_loadu_si256(inputPtr++);
    __m256i ret;

    // After the shift every lane fits in int8, so the saturating pack
    // below never actually saturates.
    inputVal1 = _mm256_srai_epi16(inputVal1, 8);
    inputVal2 = _mm256_srai_epi16(inputVal2, 8);

    // The pack works on each 128-bit half independently, which leaves the
    // 64-bit quarters in the order (val1.lo, val2.lo, val1.hi, val2.hi).
    // Selector 0xD8 (quarters 0,2,1,3) restores the sample order.
    ret = _mm256_packs_epi16(inputVal1, inputVal2);
    ret = _mm256_permute4x64_epi64(ret, 0xD8);

    _mm256_storeu_si256(outputPtr++, ret);
  }

  // Scalar tail for the samples that do not fill a whole vector iteration
  for(number = thirtysecondPoints * 32; number < num_points; number++){
    outputVector[number] = (int8_t)(inputVector[number] >> 8);
  }
}
96 #endif /* LV_HAVE_AVX2 */
97 
98 
99 #ifdef LV_HAVE_SSE2
100 #include <emmintrin.h>
101 
/*
 * Converts 16-bit signed samples to 8-bit signed samples by keeping the
 * upper byte of every input value (arithmetic shift right by 8).
 *
 * SSE2 implementation using unaligned loads and stores. 16 samples are
 * converted per vector iteration; the remaining (num_points % 16) samples
 * are converted with scalar code.
 *
 * outputVector  destination buffer, written for indices [0, num_points)
 * inputVector   source buffer, read for indices [0, num_points)
 * num_points    number of samples to convert
 */
static inline void
volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
{
  const unsigned int sixteenthPoints = num_points / 16;
  const __m128i* inputPtr = (const __m128i*)inputVector;
  __m128i* outputPtr = (__m128i*)outputVector;
  unsigned int number;

  for(number = 0; number < sixteenthPoints; number++){
    // Load 16 input values as two vectors of 8 x int16
    __m128i inputVal1 = _mm_loadu_si128(inputPtr++);
    __m128i inputVal2 = _mm_loadu_si128(inputPtr++);

    // After the shift every lane fits in int8, so the saturating pack
    // below never actually saturates.
    inputVal1 = _mm_srai_epi16(inputVal1, 8);
    inputVal2 = _mm_srai_epi16(inputVal2, 8);

    _mm_storeu_si128(outputPtr++, _mm_packs_epi16(inputVal1, inputVal2));
  }

  // Scalar tail for the samples that do not fill a whole vector iteration
  for(number = sixteenthPoints * 16; number < num_points; number++){
    outputVector[number] = (int8_t)(inputVector[number] >> 8);
  }
}
135 #endif /* LV_HAVE_SSE2 */
136 
137 
138 #ifdef LV_HAVE_GENERIC
139 
/*
 * Portable reference implementation: converts 16-bit signed samples to
 * 8-bit signed samples by shifting each input right by 8 bits and
 * narrowing the result to int8_t.
 *
 * outputVector  destination buffer, written for indices [0, num_points)
 * inputVector   source buffer, read for indices [0, num_points)
 * num_points    number of samples to convert
 */
static inline void
volk_16i_convert_8i_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
{
  unsigned int idx;

  for(idx = 0; idx < num_points; idx++){
    outputVector[idx] = (int8_t)(inputVector[idx] >> 8);
  }
}
151 #endif /* LV_HAVE_GENERIC */
152 
153 
154 
155 
156 #endif /* INCLUDED_volk_16i_convert_8i_u_H */
157 #ifndef INCLUDED_volk_16i_convert_8i_a_H
158 #define INCLUDED_volk_16i_convert_8i_a_H
159 
160 #include <inttypes.h>
161 #include <stdio.h>
162 
163 #ifdef LV_HAVE_AVX2
164 #include <immintrin.h>
165 
/*
 * Converts 16-bit signed samples to 8-bit signed samples by keeping the
 * upper byte of every input value (arithmetic shift right by 8).
 *
 * AVX2 implementation using aligned loads and stores: whenever
 * num_points >= 32, both inputVector and outputVector must be 32-byte
 * aligned. 32 samples are converted per vector iteration; the remaining
 * (num_points % 32) samples are converted with scalar code.
 *
 * outputVector  destination buffer, written for indices [0, num_points)
 * inputVector   source buffer, read for indices [0, num_points)
 * num_points    number of samples to convert
 */
static inline void
volk_16i_convert_8i_a_avx2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
{
  const unsigned int thirtysecondPoints = num_points / 32;
  const __m256i* inputPtr = (const __m256i*)inputVector;
  __m256i* outputPtr = (__m256i*)outputVector;
  unsigned int number;

  for(number = 0; number < thirtysecondPoints; number++){
    // Load 32 input values as two vectors of 16 x int16
    __m256i inputVal1 = _mm256_load_si256(inputPtr++);
    __m256i inputVal2 = _mm256_load_si256(inputPtr++);
    __m256i ret;

    // After the shift every lane fits in int8, so the saturating pack
    // below never actually saturates.
    inputVal1 = _mm256_srai_epi16(inputVal1, 8);
    inputVal2 = _mm256_srai_epi16(inputVal2, 8);

    // The pack works on each 128-bit half independently, which leaves the
    // 64-bit quarters in the order (val1.lo, val2.lo, val1.hi, val2.hi).
    // Selector 0xD8 (quarters 0,2,1,3) restores the sample order.
    ret = _mm256_packs_epi16(inputVal1, inputVal2);
    ret = _mm256_permute4x64_epi64(ret, 0xD8);

    _mm256_store_si256(outputPtr++, ret);
  }

  // Scalar tail for the samples that do not fill a whole vector iteration
  for(number = thirtysecondPoints * 32; number < num_points; number++){
    outputVector[number] = (int8_t)(inputVector[number] >> 8);
  }
}
200 #endif /* LV_HAVE_AVX2 */
201 
202 
203 #ifdef LV_HAVE_SSE2
204 #include <emmintrin.h>
205 
/*
 * Converts 16-bit signed samples to 8-bit signed samples by keeping the
 * upper byte of every input value (arithmetic shift right by 8).
 *
 * SSE2 implementation using aligned loads and stores: whenever
 * num_points >= 16, both inputVector and outputVector must be 16-byte
 * aligned. 16 samples are converted per vector iteration; the remaining
 * (num_points % 16) samples are converted with scalar code.
 *
 * outputVector  destination buffer, written for indices [0, num_points)
 * inputVector   source buffer, read for indices [0, num_points)
 * num_points    number of samples to convert
 */
static inline void
volk_16i_convert_8i_a_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
{
  const unsigned int sixteenthPoints = num_points / 16;
  const __m128i* inputPtr = (const __m128i*)inputVector;
  __m128i* outputPtr = (__m128i*)outputVector;
  unsigned int number;

  for(number = 0; number < sixteenthPoints; number++){
    // Load 16 input values as two vectors of 8 x int16
    __m128i inputVal1 = _mm_load_si128(inputPtr++);
    __m128i inputVal2 = _mm_load_si128(inputPtr++);

    // After the shift every lane fits in int8, so the saturating pack
    // below never actually saturates.
    inputVal1 = _mm_srai_epi16(inputVal1, 8);
    inputVal2 = _mm_srai_epi16(inputVal2, 8);

    _mm_store_si128(outputPtr++, _mm_packs_epi16(inputVal1, inputVal2));
  }

  // Scalar tail for the samples that do not fill a whole vector iteration
  for(number = sixteenthPoints * 16; number < num_points; number++){
    outputVector[number] = (int8_t)(inputVector[number] >> 8);
  }
}
239 #endif /* LV_HAVE_SSE2 */
240 
241 
242 #ifdef LV_HAVE_NEON
243 #include <arm_neon.h>
244 
/*
 * Converts 16-bit signed samples to 8-bit signed samples by keeping the
 * upper byte of every input value.
 *
 * NEON implementation: 16 samples are converted per vector iteration
 * using a narrowing shift right by 8; the remaining (num_points % 16)
 * samples are converted with scalar code.
 *
 * outputVector  destination buffer, written for indices [0, num_points)
 * inputVector   source buffer, read for indices [0, num_points)
 * num_points    number of samples to convert
 */
static inline void
volk_16i_convert_8i_neon(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
{
  const unsigned int vectorIterations = num_points / 16;
  const int16_t* src = inputVector;
  int8_t* dst = outputVector;
  unsigned int i;

  for(i = 0; i < vectorIterations; i++){
    // two vectors of 8 x int16 cover 16 consecutive samples
    const int16x8_t firstHalf = vld1q_s16(src);
    const int16x8_t secondHalf = vld1q_s16(src + 8);

    // narrow each half to 8 x int8, then join them into one 16-byte store
    vst1q_s8(dst, vcombine_s8(vshrn_n_s16(firstHalf, 8),
                              vshrn_n_s16(secondHalf, 8)));

    src += 16;
    dst += 16;
  }

  // scalar tail for the samples that do not fill a whole vector iteration
  for(i = vectorIterations * 16; i < num_points; i++){
    outputVector[i] = (int8_t)(inputVector[i] >> 8);
  }
}
277 #endif /* LV_HAVE_NEON */
278 
279 
280 #ifdef LV_HAVE_GENERIC
281 
/*
 * Portable reference implementation (aligned-API entry point): converts
 * 16-bit signed samples to 8-bit signed samples by shifting each input
 * right by 8 bits and narrowing the result to int8_t. The code itself
 * performs only scalar element accesses.
 *
 * outputVector  destination buffer, written for indices [0, num_points)
 * inputVector   source buffer, read for indices [0, num_points)
 * num_points    number of samples to convert
 */
static inline void
volk_16i_convert_8i_a_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points)
{
  unsigned int idx;

  for(idx = 0; idx < num_points; idx++){
    outputVector[idx] = (int8_t)(inputVector[idx] >> 8);
  }
}
293 #endif /* LV_HAVE_GENERIC */
294 
295 #endif /* INCLUDED_volk_16i_convert_8i_a_H */
static void volk_16i_convert_8i_u_sse2(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:103
static void volk_16i_convert_8i_neon(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:246
static void volk_16i_convert_8i_a_generic(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:283
static void volk_16i_convert_8i_generic(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:141
static void volk_16i_convert_8i_a_sse2(int8_t *outputVector, const int16_t *inputVector, unsigned int num_points)
Definition: volk_16i_convert_8i.h:207