68 #ifndef INCLUDED_volk_32f_tanh_32f_a_H
69 #define INCLUDED_volk_32f_tanh_32f_a_H
/*
 * Generic variant, compiled only when LV_HAVE_GENERIC is defined.
 * NOTE(review): the function signature and closing braces are not visible in
 * this excerpt; cVector, aVector and num_points are presumably its parameters.
 */
76 #ifdef LV_HAVE_GENERIC
81 unsigned int number = 0;
/* Walking cursors over the output (cPtr) and input (aPtr) buffers. */
82 float* cPtr = cVector;
83 const float* aPtr = aVector;
/* One tanhf() call per element; both cursors advance by one per iteration. */
84 for (; number < num_points; number++) {
85 *cPtr++ = tanhf(*aPtr++);
/*
 * Second generic variant, compiled only when LV_HAVE_GENERIC is defined.
 * It evaluates a rational polynomial in the input instead of calling tanhf().
 * NOTE(review): the signature, the first `if` arm, the branch bodies and the
 * statement that stores the result are not visible in this excerpt.
 */
92 #ifdef LV_HAVE_GENERIC
97 unsigned int number = 0;
/* Walking cursors over the output (cPtr) and input (aPtr) buffers. */
98 float* cPtr = cVector;
99 const float* aPtr = aVector;
100 for (; number < num_points; number++) {
/*
 * Second arm of an if/else chain; -4.97 is a double literal, so *aPtr is
 * promoted to double for this comparison.
 */
103 else if (*aPtr <= -4.97)
/*
 * With x = *aPtr and x2 = x*x:
 *   a = x * (135135 + 17325*x2 + 378*x2^2 + x2^3)
 *   b =      135135 + 62370*x2 + 3150*x2^2 + 28*x2^3
 * both written in nested (Horner) form. Presumably the output is a / b,
 * approximating tanh(x) -- the division is not visible here; TODO confirm.
 */
106 float x2 = (*aPtr) * (*aPtr);
107 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
108 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
/*
 * SSE variant using aligned loads/stores: four floats per vector iteration,
 * followed by a scalar loop for the remaining num_points % 4 elements.
 * NOTE(review): the function signature, the enclosing preprocessor guard and
 * the closing braces are not visible in this excerpt.
 */
119 #include <xmmintrin.h>
124 unsigned int number = 0;
/* Number of complete 4-float groups. */
125 const unsigned int quarterPoints = num_points / 4;
127 float* cPtr = cVector;
128 const float* aPtr = aVector;
130 __m128 aVal, cVal, x2, a, b;
/* Polynomial coefficients broadcast to all four lanes. */
131 __m128 const1, const2, const3, const4, const5, const6;
132 const1 = _mm_set_ps1(135135.0f);
133 const2 = _mm_set_ps1(17325.0f);
134 const3 = _mm_set_ps1(378.0f);
135 const4 = _mm_set_ps1(62370.0f);
136 const5 = _mm_set_ps1(3150.0f);
137 const6 = _mm_set_ps1(28.0f);
138 for (; number < quarterPoints; number++) {
/*
 * Aligned load (_mm_load_ps expects a 16-byte aligned address), then x2 = x*x.
 * The two partially visible expressions below build the nested polynomials
 * for a and b from const1..const6; their opening lines are not visible here.
 * NOTE(review): no counterpart to the scalar tail's `<= -4.97` branch is
 * visible in this vector loop -- confirm how large |x| is handled.
 */
140 aVal = _mm_load_ps(aPtr);
141 x2 = _mm_mul_ps(aVal, aVal);
147 _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
153 _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
/* Lane-wise quotient a / b, written back with an aligned store. */
155 cVal = _mm_div_ps(a, b);
157 _mm_store_ps(cPtr, cVal);
/*
 * Scalar tail: resume at the first index past the complete 4-float groups.
 * Presumably aPtr/cPtr were advanced inside the vector loop (not visible).
 */
163 number = quarterPoints * 4;
164 for (; number < num_points; number++) {
/* Second arm of an if/else chain; the first arm and bodies are not visible. */
167 else if (*aPtr <= -4.97)
/* Same a and b polynomials as the vector path, evaluated for one element. */
170 float x2 = (*aPtr) * (*aPtr);
171 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
172 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
/*
 * AVX variant using aligned loads/stores: eight floats per vector iteration,
 * followed by a scalar loop for the remaining num_points % 8 elements.
 * NOTE(review): the function signature, the enclosing preprocessor guard and
 * the closing braces are not visible in this excerpt.
 */
182 #include <immintrin.h>
187 unsigned int number = 0;
/* Number of complete 8-float groups. */
188 const unsigned int eighthPoints = num_points / 8;
190 float* cPtr = cVector;
191 const float* aPtr = aVector;
193 __m256 aVal, cVal, x2, a, b;
/* Polynomial coefficients broadcast to all eight lanes. */
194 __m256 const1, const2, const3, const4, const5, const6;
195 const1 = _mm256_set1_ps(135135.0f);
196 const2 = _mm256_set1_ps(17325.0f);
197 const3 = _mm256_set1_ps(378.0f);
198 const4 = _mm256_set1_ps(62370.0f);
199 const5 = _mm256_set1_ps(3150.0f);
200 const6 = _mm256_set1_ps(28.0f);
201 for (; number < eighthPoints; number++) {
/*
 * Aligned load (_mm256_load_ps expects a 32-byte aligned address), then
 * x2 = x*x. The partially visible expressions below build the nested
 * polynomials for a and b; their opening lines are not visible here.
 * NOTE(review): no counterpart to the scalar tail's `<= -4.97` branch is
 * visible in this vector loop -- confirm how large |x| is handled.
 */
203 aVal = _mm256_load_ps(aPtr);
204 x2 = _mm256_mul_ps(aVal, aVal);
211 _mm256_add_ps(const2,
212 _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
220 _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
/* Lane-wise quotient a / b, written back with an aligned store. */
222 cVal = _mm256_div_ps(a, b);
224 _mm256_store_ps(cPtr, cVal);
/*
 * Scalar tail: resume at the first index past the complete 8-float groups.
 * Presumably aPtr/cPtr were advanced inside the vector loop (not visible).
 */
230 number = eighthPoints * 8;
231 for (; number < num_points; number++) {
/* Second arm of an if/else chain; the first arm and bodies are not visible. */
234 else if (*aPtr <= -4.97)
/* Same a and b polynomials as the vector path, evaluated for one element. */
237 float x2 = (*aPtr) * (*aPtr);
238 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
239 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
/*
 * AVX + FMA variant using aligned loads/stores, compiled only when both
 * LV_HAVE_AVX and LV_HAVE_FMA evaluate to non-zero. Eight floats are handled
 * per vector iteration; a scalar loop covers the remaining num_points % 8.
 *
 * cVector    - output buffer, written with _mm256_store_ps (32-byte aligned)
 * aVector    - input buffer, read with _mm256_load_ps (32-byte aligned)
 * num_points - number of floats to process
 *
 * NOTE(review): the return type, braces and the opening lines of the a/b
 * expressions are not visible in this excerpt.
 */
247 #if LV_HAVE_AVX && LV_HAVE_FMA
248 #include <immintrin.h>
251 volk_32f_tanh_32f_a_avx_fma(
float* cVector,
const float* aVector,
unsigned int num_points)
253 unsigned int number = 0;
/* Number of complete 8-float groups. */
254 const unsigned int eighthPoints = num_points / 8;
256 float* cPtr = cVector;
257 const float* aPtr = aVector;
259 __m256 aVal, cVal, x2, a, b;
/* Polynomial coefficients broadcast to all eight lanes. */
260 __m256 const1, const2, const3, const4, const5, const6;
261 const1 = _mm256_set1_ps(135135.0f);
262 const2 = _mm256_set1_ps(17325.0f);
263 const3 = _mm256_set1_ps(378.0f);
264 const4 = _mm256_set1_ps(62370.0f);
265 const5 = _mm256_set1_ps(3150.0f);
266 const6 = _mm256_set1_ps(28.0f);
267 for (; number < eighthPoints; number++) {
/*
 * Aligned load, then x2 = x*x. The two partially visible expressions below
 * evaluate the nested polynomials with fused multiply-adds
 * (_mm256_fmadd_ps(p, q, r) computes p*q + r per lane):
 *   x2 * (x2 * (const3 + x2) + const2) + const1   -- feeds a
 *   x2 * (x2 * (x2 * const6 + const5) + const4) + const1   -- feeds b
 * NOTE(review): no counterpart to the scalar tail's `<= -4.97` branch is
 * visible in this vector loop -- confirm how large |x| is handled.
 */
269 aVal = _mm256_load_ps(aPtr);
270 x2 = _mm256_mul_ps(aVal, aVal);
274 x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
276 x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
/* Lane-wise quotient a / b, written back with an aligned store. */
278 cVal = _mm256_div_ps(a, b);
280 _mm256_store_ps(cPtr, cVal);
/*
 * Scalar tail: resume at the first index past the complete 8-float groups.
 * Presumably aPtr/cPtr were advanced inside the vector loop (not visible).
 */
286 number = eighthPoints * 8;
287 for (; number < num_points; number++) {
/* Second arm of an if/else chain; the first arm and bodies are not visible. */
290 else if (*aPtr <= -4.97)
/* Same a and b polynomials as the vector path, evaluated for one element. */
293 float x2 = (*aPtr) * (*aPtr);
294 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
295 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
306 #ifndef INCLUDED_volk_32f_tanh_32f_u_H
307 #define INCLUDED_volk_32f_tanh_32f_u_H
309 #include <inttypes.h>
/*
 * SSE variant using unaligned loads/stores: four floats per vector iteration,
 * followed by a scalar loop for the remaining num_points % 4 elements.
 * NOTE(review): the function signature, the enclosing preprocessor guard and
 * the closing braces are not visible in this excerpt.
 */
316 #include <xmmintrin.h>
321 unsigned int number = 0;
/* Number of complete 4-float groups. */
322 const unsigned int quarterPoints = num_points / 4;
324 float* cPtr = cVector;
325 const float* aPtr = aVector;
327 __m128 aVal, cVal, x2, a, b;
/* Polynomial coefficients broadcast to all four lanes. */
328 __m128 const1, const2, const3, const4, const5, const6;
329 const1 = _mm_set_ps1(135135.0f);
330 const2 = _mm_set_ps1(17325.0f);
331 const3 = _mm_set_ps1(378.0f);
332 const4 = _mm_set_ps1(62370.0f);
333 const5 = _mm_set_ps1(3150.0f);
334 const6 = _mm_set_ps1(28.0f);
335 for (; number < quarterPoints; number++) {
/*
 * Unaligned load (_mm_loadu_ps has no alignment requirement), then x2 = x*x.
 * The two partially visible expressions below build the nested polynomials
 * for a and b from const1..const6; their opening lines are not visible here.
 * NOTE(review): no counterpart to the scalar tail's `<= -4.97` branch is
 * visible in this vector loop -- confirm how large |x| is handled.
 */
337 aVal = _mm_loadu_ps(aPtr);
338 x2 = _mm_mul_ps(aVal, aVal);
344 _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
350 _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
/* Lane-wise quotient a / b, written back with an unaligned store. */
352 cVal = _mm_div_ps(a, b);
354 _mm_storeu_ps(cPtr, cVal);
/*
 * Scalar tail: resume at the first index past the complete 4-float groups.
 * Presumably aPtr/cPtr were advanced inside the vector loop (not visible).
 */
360 number = quarterPoints * 4;
361 for (; number < num_points; number++) {
/* Second arm of an if/else chain; the first arm and bodies are not visible. */
364 else if (*aPtr <= -4.97)
/* Same a and b polynomials as the vector path, evaluated for one element. */
367 float x2 = (*aPtr) * (*aPtr);
368 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
369 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
/*
 * AVX variant using unaligned loads/stores: eight floats per vector iteration,
 * followed by a scalar loop for the remaining num_points % 8 elements.
 * NOTE(review): the function signature, the enclosing preprocessor guard and
 * the closing braces are not visible in this excerpt.
 */
379 #include <immintrin.h>
384 unsigned int number = 0;
/* Number of complete 8-float groups. */
385 const unsigned int eighthPoints = num_points / 8;
387 float* cPtr = cVector;
388 const float* aPtr = aVector;
390 __m256 aVal, cVal, x2, a, b;
/* Polynomial coefficients broadcast to all eight lanes. */
391 __m256 const1, const2, const3, const4, const5, const6;
392 const1 = _mm256_set1_ps(135135.0f);
393 const2 = _mm256_set1_ps(17325.0f);
394 const3 = _mm256_set1_ps(378.0f);
395 const4 = _mm256_set1_ps(62370.0f);
396 const5 = _mm256_set1_ps(3150.0f);
397 const6 = _mm256_set1_ps(28.0f);
398 for (; number < eighthPoints; number++) {
/*
 * Unaligned load (_mm256_loadu_ps has no alignment requirement), then
 * x2 = x*x. The partially visible expressions below build the nested
 * polynomials for a and b; their opening lines are not visible here.
 * NOTE(review): no counterpart to the scalar tail's `<= -4.97` branch is
 * visible in this vector loop -- confirm how large |x| is handled.
 */
400 aVal = _mm256_loadu_ps(aPtr);
401 x2 = _mm256_mul_ps(aVal, aVal);
408 _mm256_add_ps(const2,
409 _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
417 _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
/* Lane-wise quotient a / b, written back with an unaligned store. */
419 cVal = _mm256_div_ps(a, b);
421 _mm256_storeu_ps(cPtr, cVal);
/*
 * Scalar tail: resume at the first index past the complete 8-float groups.
 * Presumably aPtr/cPtr were advanced inside the vector loop (not visible).
 */
427 number = eighthPoints * 8;
428 for (; number < num_points; number++) {
/* Second arm of an if/else chain; the first arm and bodies are not visible. */
431 else if (*aPtr <= -4.97)
/* Same a and b polynomials as the vector path, evaluated for one element. */
434 float x2 = (*aPtr) * (*aPtr);
435 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
436 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
/*
 * AVX + FMA variant using unaligned loads/stores, compiled only when both
 * LV_HAVE_AVX and LV_HAVE_FMA evaluate to non-zero. Eight floats are handled
 * per vector iteration; a scalar loop covers the remaining num_points % 8.
 *
 * cVector    - output buffer, written with _mm256_storeu_ps
 * aVector    - input buffer, read with _mm256_loadu_ps
 * num_points - number of floats to process
 *
 * NOTE(review): the return type, braces and the opening lines of the a/b
 * expressions are not visible in this excerpt.
 */
444 #if LV_HAVE_AVX && LV_HAVE_FMA
445 #include <immintrin.h>
448 volk_32f_tanh_32f_u_avx_fma(
float* cVector,
const float* aVector,
unsigned int num_points)
450 unsigned int number = 0;
/* Number of complete 8-float groups. */
451 const unsigned int eighthPoints = num_points / 8;
453 float* cPtr = cVector;
454 const float* aPtr = aVector;
456 __m256 aVal, cVal, x2, a, b;
/* Polynomial coefficients broadcast to all eight lanes. */
457 __m256 const1, const2, const3, const4, const5, const6;
458 const1 = _mm256_set1_ps(135135.0f);
459 const2 = _mm256_set1_ps(17325.0f);
460 const3 = _mm256_set1_ps(378.0f);
461 const4 = _mm256_set1_ps(62370.0f);
462 const5 = _mm256_set1_ps(3150.0f);
463 const6 = _mm256_set1_ps(28.0f);
464 for (; number < eighthPoints; number++) {
/*
 * Unaligned load, then x2 = x*x. The two partially visible expressions below
 * evaluate the nested polynomials with fused multiply-adds
 * (_mm256_fmadd_ps(p, q, r) computes p*q + r per lane):
 *   x2 * (x2 * (const3 + x2) + const2) + const1   -- feeds a
 *   x2 * (x2 * (x2 * const6 + const5) + const4) + const1   -- feeds b
 * NOTE(review): no counterpart to the scalar tail's `<= -4.97` branch is
 * visible in this vector loop -- confirm how large |x| is handled.
 */
466 aVal = _mm256_loadu_ps(aPtr);
467 x2 = _mm256_mul_ps(aVal, aVal);
471 x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
473 x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
/* Lane-wise quotient a / b, written back with an unaligned store. */
475 cVal = _mm256_div_ps(a, b);
477 _mm256_storeu_ps(cPtr, cVal);
/*
 * Scalar tail: resume at the first index past the complete 8-float groups.
 * Presumably aPtr/cPtr were advanced inside the vector loop (not visible).
 */
483 number = eighthPoints * 8;
484 for (; number < num_points; number++) {
/* Second arm of an if/else chain; the first arm and bodies are not visible. */
487 else if (*aPtr <= -4.97)
/* Same a and b polynomials as the vector path, evaluated for one element. */
490 float x2 = (*aPtr) * (*aPtr);
491 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
492 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));