Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_tanh_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
68 #ifndef INCLUDED_volk_32f_tanh_32f_a_H
69 #define INCLUDED_volk_32f_tanh_32f_a_H
70 
71 #include <inttypes.h>
72 #include <stdio.h>
73 #include <math.h>
74 #include <string.h>
75 
76 #ifdef LV_HAVE_GENERIC
77 
78 static inline void
79 volk_32f_tanh_32f_generic(float* cVector, const float* aVector,
80  unsigned int num_points)
81 {
82  unsigned int number = 0;
83  float* cPtr = cVector;
84  const float* aPtr = aVector;
85  for(; number < num_points; number++) {
86  *cPtr++ = tanhf(*aPtr++);
87  }
88 }
89 
90 #endif /* LV_HAVE_GENERIC */
91 
92 
93 #ifdef LV_HAVE_GENERIC
94 
/*!
 * \brief Approximates tanh() of every input sample with a rational polynomial.
 *
 * Inputs above 4.97 produce exactly 1 and inputs at or below -4.97 produce
 * exactly -1. Every other input x is evaluated as
 *
 *   x * (135135 + x^2 * (17325 + x^2 * (378 + x^2)))
 *   ------------------------------------------------
 *   135135 + x^2 * (62370 + x^2 * (3150 + 28 * x^2))
 *
 * \param cVector    output buffer; num_points floats are written
 * \param aVector    input buffer; num_points floats are read
 * \param num_points number of samples to process
 */
static inline void
volk_32f_tanh_32f_series(float* cVector, const float* aVector,
                         unsigned int num_points)
{
  unsigned int number = 0;
  float* cPtr = cVector;
  const float* aPtr = aVector;
  for(; number < num_points; number++) {
    /* Consume the input sample up front so that the read pointer advances
     * on every path, including the two saturating branches. */
    const float x = *aPtr++;
    if(x > 4.97)
      *cPtr++ = 1;
    else if(x <= -4.97)
      *cPtr++ = -1;
    else {
      const float xx = x * x;
      const float num = x * (135135.0f + xx * (17325.0f + xx * (378.0f + xx)));
      const float den = 135135.0f + xx * (62370.0f + xx * (3150.0f + xx * 28.0f));
      *cPtr++ = num / den;
    }
  }
}
116 
117 #endif /* LV_HAVE_GENERIC */
118 
119 
120 
121 #ifdef LV_HAVE_SSE
122 #include <xmmintrin.h>
123 
/*!
 * \brief Approximates tanh() of every input sample (SSE, aligned buffers).
 *
 * Groups of four samples are evaluated with the rational polynomial
 *
 *   x * (135135 + x^2 * (17325 + x^2 * (378 + x^2)))
 *   ------------------------------------------------
 *   135135 + x^2 * (62370 + x^2 * (3150 + 28 * x^2))
 *
 * The remaining num_points % 4 samples are processed one at a time with the
 * same polynomial, but saturate to 1 above 4.97 and to -1 at or below -4.97.
 *
 * NOTE(review): the vector loop does not apply the +/-4.97 saturation that
 * the scalar tail applies, so out-of-range samples in the vectorised portion
 * receive the raw polynomial ratio. Confirm whether callers rely on this.
 *
 * \param cVector    output buffer; written with _mm_store_ps, so it must be
 *                   16-byte aligned
 * \param aVector    input buffer; read with _mm_load_ps, so it must be
 *                   16-byte aligned
 * \param num_points number of samples to process
 */
static inline void
volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector,
                        unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m128 aVal, cVal, x2, a, b;
  const __m128 const1 = _mm_set_ps1(135135.0f);
  const __m128 const2 = _mm_set_ps1(17325.0f);
  const __m128 const3 = _mm_set_ps1(378.0f);
  const __m128 const4 = _mm_set_ps1(62370.0f);
  const __m128 const5 = _mm_set_ps1(3150.0f);
  const __m128 const6 = _mm_set_ps1(28.0f);

  for(; number < quarterPoints; number++) {
    aVal = _mm_load_ps(aPtr);
    x2 = _mm_mul_ps(aVal, aVal);
    /* Numerator and denominator in Horner form. */
    a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
    b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

    cVal = _mm_div_ps(a, b);

    _mm_store_ps(cPtr, cVal); /* Store the results back into the C container */

    aPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail for the samples that do not fill a whole vector. */
  number = quarterPoints * 4;
  for(; number < num_points; number++) {
    /* Consume the input sample up front so that the read pointer advances
     * on every path, including the two saturating branches. */
    const float x = *aPtr++;
    if(x > 4.97)
      *cPtr++ = 1;
    else if(x <= -4.97)
      *cPtr++ = -1;
    else {
      const float xx = x * x;
      const float num = x * (135135.0f + xx * (17325.0f + xx * (378.0f + xx)));
      const float den = 135135.0f + xx * (62370.0f + xx * (3150.0f + xx * 28.0f));
      *cPtr++ = num / den;
    }
  }
}
172 #endif /* LV_HAVE_SSE */
173 
174 
175 #ifdef LV_HAVE_AVX
176 #include <immintrin.h>
177 
/*!
 * \brief Approximates tanh() of every input sample (AVX, aligned buffers).
 *
 * Groups of eight samples are evaluated with the rational polynomial
 *
 *   x * (135135 + x^2 * (17325 + x^2 * (378 + x^2)))
 *   ------------------------------------------------
 *   135135 + x^2 * (62370 + x^2 * (3150 + 28 * x^2))
 *
 * The remaining num_points % 8 samples are processed one at a time with the
 * same polynomial, but saturate to 1 above 4.97 and to -1 at or below -4.97.
 *
 * NOTE(review): the vector loop does not apply the +/-4.97 saturation that
 * the scalar tail applies, so out-of-range samples in the vectorised portion
 * receive the raw polynomial ratio. Confirm whether callers rely on this.
 *
 * \param cVector    output buffer; written with _mm256_store_ps, so it must
 *                   be 32-byte aligned
 * \param aVector    input buffer; read with _mm256_load_ps, so it must be
 *                   32-byte aligned
 * \param num_points number of samples to process
 */
static inline void
volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector,
                        unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m256 aVal, cVal, x2, a, b;
  const __m256 const1 = _mm256_set1_ps(135135.0f);
  const __m256 const2 = _mm256_set1_ps(17325.0f);
  const __m256 const3 = _mm256_set1_ps(378.0f);
  const __m256 const4 = _mm256_set1_ps(62370.0f);
  const __m256 const5 = _mm256_set1_ps(3150.0f);
  const __m256 const6 = _mm256_set1_ps(28.0f);

  for(; number < eighthPoints; number++) {
    aVal = _mm256_load_ps(aPtr);
    x2 = _mm256_mul_ps(aVal, aVal);
    /* Numerator and denominator in Horner form. */
    a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
    b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));

    cVal = _mm256_div_ps(a, b);

    _mm256_store_ps(cPtr, cVal); /* Store the results back into the C container */

    aPtr += 8;
    cPtr += 8;
  }

  /* Scalar tail for the samples that do not fill a whole vector. */
  number = eighthPoints * 8;
  for(; number < num_points; number++) {
    /* Consume the input sample up front so that the read pointer advances
     * on every path, including the two saturating branches. */
    const float x = *aPtr++;
    if(x > 4.97)
      *cPtr++ = 1;
    else if(x <= -4.97)
      *cPtr++ = -1;
    else {
      const float xx = x * x;
      const float num = x * (135135.0f + xx * (17325.0f + xx * (378.0f + xx)));
      const float den = 135135.0f + xx * (62370.0f + xx * (3150.0f + xx * 28.0f));
      *cPtr++ = num / den;
    }
  }
}
226 #endif /* LV_HAVE_AVX */
227 
228 #if LV_HAVE_AVX && LV_HAVE_FMA
229 #include <immintrin.h>
230 
/*!
 * \brief Approximates tanh() of every input sample (AVX + FMA, aligned buffers).
 *
 * Groups of eight samples are evaluated with the rational polynomial
 *
 *   x * (135135 + x^2 * (17325 + x^2 * (378 + x^2)))
 *   ------------------------------------------------
 *   135135 + x^2 * (62370 + x^2 * (3150 + 28 * x^2))
 *
 * using fused multiply-add for the Horner steps. The remaining
 * num_points % 8 samples are processed one at a time with the same
 * polynomial, but saturate to 1 above 4.97 and to -1 at or below -4.97.
 *
 * NOTE(review): the vector loop does not apply the +/-4.97 saturation that
 * the scalar tail applies, so out-of-range samples in the vectorised portion
 * receive the raw polynomial ratio. Confirm whether callers rely on this.
 *
 * \param cVector    output buffer; written with _mm256_store_ps, so it must
 *                   be 32-byte aligned
 * \param aVector    input buffer; read with _mm256_load_ps, so it must be
 *                   32-byte aligned
 * \param num_points number of samples to process
 */
static inline void
volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector,
                            unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m256 aVal, cVal, x2, a, b;
  const __m256 const1 = _mm256_set1_ps(135135.0f);
  const __m256 const2 = _mm256_set1_ps(17325.0f);
  const __m256 const3 = _mm256_set1_ps(378.0f);
  const __m256 const4 = _mm256_set1_ps(62370.0f);
  const __m256 const5 = _mm256_set1_ps(3150.0f);
  const __m256 const6 = _mm256_set1_ps(28.0f);

  for(; number < eighthPoints; number++) {
    aVal = _mm256_load_ps(aPtr);
    x2 = _mm256_mul_ps(aVal, aVal);
    /* _mm256_fmadd_ps(p, q, r) computes p * q + r, one Horner step per call. */
    a = _mm256_mul_ps(aVal, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
    b = _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);

    cVal = _mm256_div_ps(a, b);

    _mm256_store_ps(cPtr, cVal); /* Store the results back into the C container */

    aPtr += 8;
    cPtr += 8;
  }

  /* Scalar tail for the samples that do not fill a whole vector. */
  number = eighthPoints * 8;
  for(; number < num_points; number++) {
    /* Consume the input sample up front so that the read pointer advances
     * on every path, including the two saturating branches. */
    const float x = *aPtr++;
    if(x > 4.97)
      *cPtr++ = 1;
    else if(x <= -4.97)
      *cPtr++ = -1;
    else {
      const float xx = x * x;
      const float num = x * (135135.0f + xx * (17325.0f + xx * (378.0f + xx)));
      const float den = 135135.0f + xx * (62370.0f + xx * (3150.0f + xx * 28.0f));
      *cPtr++ = num / den;
    }
  }
}
279 #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
280 
281 #endif /* INCLUDED_volk_32f_tanh_32f_a_H */
282 
283 
284 #ifndef INCLUDED_volk_32f_tanh_32f_u_H
285 #define INCLUDED_volk_32f_tanh_32f_u_H
286 
287 #include <inttypes.h>
288 #include <stdio.h>
289 #include <math.h>
290 #include <string.h>
291 
292 
293 #ifdef LV_HAVE_SSE
294 #include <xmmintrin.h>
295 
/*!
 * \brief Approximates tanh() of every input sample (SSE, unaligned buffers).
 *
 * Groups of four samples are evaluated with the rational polynomial
 *
 *   x * (135135 + x^2 * (17325 + x^2 * (378 + x^2)))
 *   ------------------------------------------------
 *   135135 + x^2 * (62370 + x^2 * (3150 + 28 * x^2))
 *
 * The remaining num_points % 4 samples are processed one at a time with the
 * same polynomial, but saturate to 1 above 4.97 and to -1 at or below -4.97.
 *
 * NOTE(review): the vector loop does not apply the +/-4.97 saturation that
 * the scalar tail applies, so out-of-range samples in the vectorised portion
 * receive the raw polynomial ratio. Confirm whether callers rely on this.
 *
 * \param cVector    output buffer; written with the unaligned _mm_storeu_ps
 * \param aVector    input buffer; read with the unaligned _mm_loadu_ps
 * \param num_points number of samples to process
 */
static inline void
volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector,
                        unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m128 aVal, cVal, x2, a, b;
  const __m128 const1 = _mm_set_ps1(135135.0f);
  const __m128 const2 = _mm_set_ps1(17325.0f);
  const __m128 const3 = _mm_set_ps1(378.0f);
  const __m128 const4 = _mm_set_ps1(62370.0f);
  const __m128 const5 = _mm_set_ps1(3150.0f);
  const __m128 const6 = _mm_set_ps1(28.0f);

  for(; number < quarterPoints; number++) {
    aVal = _mm_loadu_ps(aPtr);
    x2 = _mm_mul_ps(aVal, aVal);
    /* Numerator and denominator in Horner form. */
    a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
    b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

    cVal = _mm_div_ps(a, b);

    _mm_storeu_ps(cPtr, cVal); /* Store the results back into the C container */

    aPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail for the samples that do not fill a whole vector. */
  number = quarterPoints * 4;
  for(; number < num_points; number++) {
    /* Consume the input sample up front so that the read pointer advances
     * on every path, including the two saturating branches. */
    const float x = *aPtr++;
    if(x > 4.97)
      *cPtr++ = 1;
    else if(x <= -4.97)
      *cPtr++ = -1;
    else {
      const float xx = x * x;
      const float num = x * (135135.0f + xx * (17325.0f + xx * (378.0f + xx)));
      const float den = 135135.0f + xx * (62370.0f + xx * (3150.0f + xx * 28.0f));
      *cPtr++ = num / den;
    }
  }
}
344 #endif /* LV_HAVE_SSE */
345 
346 
347 #ifdef LV_HAVE_AVX
348 #include <immintrin.h>
349 
/*!
 * \brief Approximates tanh() of every input sample (AVX, unaligned buffers).
 *
 * Groups of eight samples are evaluated with the rational polynomial
 *
 *   x * (135135 + x^2 * (17325 + x^2 * (378 + x^2)))
 *   ------------------------------------------------
 *   135135 + x^2 * (62370 + x^2 * (3150 + 28 * x^2))
 *
 * The remaining num_points % 8 samples are processed one at a time with the
 * same polynomial, but saturate to 1 above 4.97 and to -1 at or below -4.97.
 *
 * NOTE(review): the vector loop does not apply the +/-4.97 saturation that
 * the scalar tail applies, so out-of-range samples in the vectorised portion
 * receive the raw polynomial ratio. Confirm whether callers rely on this.
 *
 * \param cVector    output buffer; written with the unaligned _mm256_storeu_ps
 * \param aVector    input buffer; read with the unaligned _mm256_loadu_ps
 * \param num_points number of samples to process
 */
static inline void
volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector,
                        unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m256 aVal, cVal, x2, a, b;
  const __m256 const1 = _mm256_set1_ps(135135.0f);
  const __m256 const2 = _mm256_set1_ps(17325.0f);
  const __m256 const3 = _mm256_set1_ps(378.0f);
  const __m256 const4 = _mm256_set1_ps(62370.0f);
  const __m256 const5 = _mm256_set1_ps(3150.0f);
  const __m256 const6 = _mm256_set1_ps(28.0f);

  for(; number < eighthPoints; number++) {
    aVal = _mm256_loadu_ps(aPtr);
    x2 = _mm256_mul_ps(aVal, aVal);
    /* Numerator and denominator in Horner form. */
    a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
    b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));

    cVal = _mm256_div_ps(a, b);

    _mm256_storeu_ps(cPtr, cVal); /* Store the results back into the C container */

    aPtr += 8;
    cPtr += 8;
  }

  /* Scalar tail for the samples that do not fill a whole vector. */
  number = eighthPoints * 8;
  for(; number < num_points; number++) {
    /* Consume the input sample up front so that the read pointer advances
     * on every path, including the two saturating branches. */
    const float x = *aPtr++;
    if(x > 4.97)
      *cPtr++ = 1;
    else if(x <= -4.97)
      *cPtr++ = -1;
    else {
      const float xx = x * x;
      const float num = x * (135135.0f + xx * (17325.0f + xx * (378.0f + xx)));
      const float den = 135135.0f + xx * (62370.0f + xx * (3150.0f + xx * 28.0f));
      *cPtr++ = num / den;
    }
  }
}
398 #endif /* LV_HAVE_AVX */
399 
400 #if LV_HAVE_AVX && LV_HAVE_FMA
401 #include <immintrin.h>
402 
/*!
 * \brief Approximates tanh() of every input sample (AVX + FMA, unaligned buffers).
 *
 * Groups of eight samples are evaluated with the rational polynomial
 *
 *   x * (135135 + x^2 * (17325 + x^2 * (378 + x^2)))
 *   ------------------------------------------------
 *   135135 + x^2 * (62370 + x^2 * (3150 + 28 * x^2))
 *
 * using fused multiply-add for the Horner steps. The remaining
 * num_points % 8 samples are processed one at a time with the same
 * polynomial, but saturate to 1 above 4.97 and to -1 at or below -4.97.
 *
 * NOTE(review): the vector loop does not apply the +/-4.97 saturation that
 * the scalar tail applies, so out-of-range samples in the vectorised portion
 * receive the raw polynomial ratio. Confirm whether callers rely on this.
 *
 * \param cVector    output buffer; written with the unaligned _mm256_storeu_ps
 * \param aVector    input buffer; read with the unaligned _mm256_loadu_ps
 * \param num_points number of samples to process
 */
static inline void
volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector,
                            unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m256 aVal, cVal, x2, a, b;
  const __m256 const1 = _mm256_set1_ps(135135.0f);
  const __m256 const2 = _mm256_set1_ps(17325.0f);
  const __m256 const3 = _mm256_set1_ps(378.0f);
  const __m256 const4 = _mm256_set1_ps(62370.0f);
  const __m256 const5 = _mm256_set1_ps(3150.0f);
  const __m256 const6 = _mm256_set1_ps(28.0f);

  for(; number < eighthPoints; number++) {
    aVal = _mm256_loadu_ps(aPtr);
    x2 = _mm256_mul_ps(aVal, aVal);
    /* _mm256_fmadd_ps(p, q, r) computes p * q + r, one Horner step per call. */
    a = _mm256_mul_ps(aVal, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
    b = _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);

    cVal = _mm256_div_ps(a, b);

    _mm256_storeu_ps(cPtr, cVal); /* Store the results back into the C container */

    aPtr += 8;
    cPtr += 8;
  }

  /* Scalar tail for the samples that do not fill a whole vector. */
  number = eighthPoints * 8;
  for(; number < num_points; number++) {
    /* Consume the input sample up front so that the read pointer advances
     * on every path, including the two saturating branches. */
    const float x = *aPtr++;
    if(x > 4.97)
      *cPtr++ = 1;
    else if(x <= -4.97)
      *cPtr++ = -1;
    else {
      const float xx = x * x;
      const float num = x * (135135.0f + xx * (17325.0f + xx * (378.0f + xx)));
      const float den = 135135.0f + xx * (62370.0f + xx * (3150.0f + xx * 28.0f));
      *cPtr++ = num / den;
    }
  }
}
451 #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
452 
453 #endif /* INCLUDED_volk_32f_tanh_32f_u_H */
static void volk_32f_tanh_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:179
static void volk_32f_tanh_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:351
static void volk_32f_tanh_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:79
static void volk_32f_tanh_32f_series(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:96
static void volk_32f_tanh_32f_u_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:297
static void volk_32f_tanh_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:125