Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_expfast_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
65 #include <stdio.h>
66 #include <math.h>
67 #include <inttypes.h>
68 
/*
 * Constants shared by the SIMD kernels in this file. Each vector loop
 * evaluates (A / Mln2) * x + (B - C), converts the result to a 32-bit
 * integer, and reinterprets those integer bits as a float.
 */
/* Natural logarithm of 2. */
69 #define Mln2 0.6931471805f
/* 2^23. */
70 #define A 8388608.0f
/* 127 * 2^23; read as an integer, this is the IEEE-754 single-precision bit pattern of 1.0f. */
71 #define B 1065353216.0f
/* Subtracted from B; presumably an empirically chosen error-correction offset -- TODO confirm origin. */
72 #define C 60801.0f
73 
74 
75 #ifndef INCLUDED_volk_32f_expfast_32f_a_H
76 #define INCLUDED_volk_32f_expfast_32f_a_H
77 
78 #if LV_HAVE_AVX && LV_HAVE_FMA
79 
80 #include <immintrin.h>
81 
82 static inline void
83  volk_32f_expfast_32f_a_avx_fma(float* bVector, const float* aVector, unsigned int num_points)
84 {
85  float* bPtr = bVector;
86  const float* aPtr = aVector;
87 
88  unsigned int number = 0;
89  const unsigned int eighthPoints = num_points / 8;
90 
91  __m256 aVal, bVal, a, b;
92  __m256i exp;
93  a = _mm256_set1_ps(A/Mln2);
94  b = _mm256_set1_ps(B-C);
95 
96  for(;number < eighthPoints; number++){
97  aVal = _mm256_load_ps(aPtr);
98  exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a,aVal, b));
99  bVal = _mm256_castsi256_ps(exp);
100 
101  _mm256_store_ps(bPtr, bVal);
102  aPtr += 8;
103  bPtr += 8;
104  }
105 
106  number = eighthPoints * 8;
107  for(;number < num_points; number++){
108  *bPtr++ = expf(*aPtr++);
109  }
110 }
111 
112 #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
113 
114 #ifdef LV_HAVE_AVX
115 
116 #include <immintrin.h>
117 
118 static inline void
119  volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
120 {
121  float* bPtr = bVector;
122  const float* aPtr = aVector;
123 
124  unsigned int number = 0;
125  const unsigned int eighthPoints = num_points / 8;
126 
127  __m256 aVal, bVal, a, b;
128  __m256i exp;
129  a = _mm256_set1_ps(A/Mln2);
130  b = _mm256_set1_ps(B-C);
131 
132  for(;number < eighthPoints; number++){
133  aVal = _mm256_load_ps(aPtr);
134  exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
135  bVal = _mm256_castsi256_ps(exp);
136 
137  _mm256_store_ps(bPtr, bVal);
138  aPtr += 8;
139  bPtr += 8;
140  }
141 
142  number = eighthPoints * 8;
143  for(;number < num_points; number++){
144  *bPtr++ = expf(*aPtr++);
145  }
146 }
147 
148 #endif /* LV_HAVE_AVX for aligned */
149 
150 #ifdef LV_HAVE_SSE4_1
151 #include <smmintrin.h>
152 
153 static inline void
154 volk_32f_expfast_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
155 {
156  float* bPtr = bVector;
157  const float* aPtr = aVector;
158 
159  unsigned int number = 0;
160  const unsigned int quarterPoints = num_points / 4;
161 
162  __m128 aVal, bVal, a, b;
163  __m128i exp;
164  a = _mm_set1_ps(A/Mln2);
165  b = _mm_set1_ps(B-C);
166 
167  for(;number < quarterPoints; number++){
168  aVal = _mm_load_ps(aPtr);
169  exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
170  bVal = _mm_castsi128_ps(exp);
171 
172  _mm_store_ps(bPtr, bVal);
173  aPtr += 4;
174  bPtr += 4;
175  }
176 
177  number = quarterPoints * 4;
178  for(;number < num_points; number++){
179  *bPtr++ = expf(*aPtr++);
180  }
181 }
182 
183 #endif /* LV_HAVE_SSE4_1 for aligned */
184 
185 #endif /* INCLUDED_volk_32f_expfast_32f_a_H */
186 
187 #ifndef INCLUDED_volk_32f_expfast_32f_u_H
188 #define INCLUDED_volk_32f_expfast_32f_u_H
189 
190 #if LV_HAVE_AVX && LV_HAVE_FMA
191 #include <immintrin.h>
192 
193 static inline void
194 volk_32f_expfast_32f_u_avx_fma(float* bVector, const float* aVector, unsigned int num_points)
195 {
196  float* bPtr = bVector;
197  const float* aPtr = aVector;
198 
199  unsigned int number = 0;
200  const unsigned int eighthPoints = num_points / 8;
201 
202  __m256 aVal, bVal, a, b;
203  __m256i exp;
204  a = _mm256_set1_ps(A/Mln2);
205  b = _mm256_set1_ps(B-C);
206 
207  for(;number < eighthPoints; number++){
208  aVal = _mm256_loadu_ps(aPtr);
209  exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a,aVal, b));
210  bVal = _mm256_castsi256_ps(exp);
211 
212  _mm256_storeu_ps(bPtr, bVal);
213  aPtr += 8;
214  bPtr += 8;
215  }
216 
217  number = eighthPoints * 8;
218  for(;number < num_points; number++){
219  *bPtr++ = expf(*aPtr++);
220  }
221 }
222 
223 #endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */
224 
225 #ifdef LV_HAVE_AVX
226 #include <immintrin.h>
227 
228 static inline void
229 volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
230 {
231  float* bPtr = bVector;
232  const float* aPtr = aVector;
233 
234  unsigned int number = 0;
235  const unsigned int eighthPoints = num_points / 8;
236 
237  __m256 aVal, bVal, a, b;
238  __m256i exp;
239  a = _mm256_set1_ps(A/Mln2);
240  b = _mm256_set1_ps(B-C);
241 
242  for(;number < eighthPoints; number++){
243  aVal = _mm256_loadu_ps(aPtr);
244  exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
245  bVal = _mm256_castsi256_ps(exp);
246 
247  _mm256_storeu_ps(bPtr, bVal);
248  aPtr += 8;
249  bPtr += 8;
250  }
251 
252  number = eighthPoints * 8;
253  for(;number < num_points; number++){
254  *bPtr++ = expf(*aPtr++);
255  }
256 }
257 
258 #endif /* LV_HAVE_AVX for unaligned */
259 
260 
261 #ifdef LV_HAVE_SSE4_1
262 #include <smmintrin.h>
263 
264 static inline void
265 volk_32f_expfast_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
266 {
267  float* bPtr = bVector;
268  const float* aPtr = aVector;
269 
270  unsigned int number = 0;
271  const unsigned int quarterPoints = num_points / 4;
272 
273  __m128 aVal, bVal, a, b;
274  __m128i exp;
275  a = _mm_set1_ps(A/Mln2);
276  b = _mm_set1_ps(B-C);
277 
278  for(;number < quarterPoints; number++){
279  aVal = _mm_loadu_ps(aPtr);
280  exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
281  bVal = _mm_castsi128_ps(exp);
282 
283  _mm_storeu_ps(bPtr, bVal);
284  aPtr += 4;
285  bPtr += 4;
286  }
287 
288  number = quarterPoints * 4;
289  for(;number < num_points; number++){
290  *bPtr++ = expf(*aPtr++);
291  }
292 }
293 
294 #endif /* LV_HAVE_SSE4_1 for unaligned */
295 
296 
297 #ifdef LV_HAVE_GENERIC
298 
299 static inline void
300 volk_32f_expfast_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
301 {
302  float* bPtr = bVector;
303  const float* aPtr = aVector;
304  unsigned int number = 0;
305 
306  for(number = 0; number < num_points; number++){
307  *bPtr++ = expf(*aPtr++);
308  }
309 }
310 #endif /* LV_HAVE_GENERIC */
311 
312 #endif /* INCLUDED_volk_32f_expfast_32f_u_H */
static void volk_32f_expfast_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:229
static void volk_32f_expfast_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:119
#define B
Definition: volk_32f_expfast_32f.h:71
#define C
Definition: volk_32f_expfast_32f.h:72
#define Mln2
Definition: volk_32f_expfast_32f.h:69
static void volk_32f_expfast_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:300
#define A
Definition: volk_32f_expfast_32f.h:70