68 #ifndef INCLUDED_volk_32f_tanh_32f_a_H 69 #define INCLUDED_volk_32f_tanh_32f_a_H 76 #ifdef LV_HAVE_GENERIC 80 unsigned int num_points)
82 unsigned int number = 0;
83 float* cPtr = cVector;
84 const float* aPtr = aVector;
85 for(; number < num_points; number++) {
86 *cPtr++ = tanhf(*aPtr++);
93 #ifdef LV_HAVE_GENERIC 97 unsigned int num_points)
99 unsigned int number = 0;
100 float* cPtr = cVector;
101 const float* aPtr = aVector;
102 for(; number < num_points; number++) {
105 else if(*aPtr <= -4.97)
108 float x2 = (*aPtr) * (*aPtr);
109 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
110 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
122 #include <xmmintrin.h> 126 unsigned int num_points)
128 unsigned int number = 0;
129 const unsigned int quarterPoints = num_points / 4;
131 float* cPtr = cVector;
132 const float* aPtr = aVector;
134 __m128 aVal, cVal, x2, a, b;
135 __m128 const1, const2, const3, const4, const5, const6;
136 const1 = _mm_set_ps1(135135.0f);
137 const2 = _mm_set_ps1(17325.0f);
138 const3 = _mm_set_ps1(378.0f);
139 const4 = _mm_set_ps1(62370.0f);
140 const5 = _mm_set_ps1(3150.0f);
141 const6 = _mm_set_ps1(28.0f);
142 for(;number < quarterPoints; number++){
144 aVal = _mm_load_ps(aPtr);
145 x2 = _mm_mul_ps(aVal, aVal);
146 a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
147 b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
149 cVal = _mm_div_ps(a, b);
151 _mm_store_ps(cPtr, cVal);
157 number = quarterPoints * 4;
158 for(;number < num_points; number++) {
161 else if(*aPtr <= -4.97)
164 float x2 = (*aPtr) * (*aPtr);
165 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
166 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
176 #include <immintrin.h> 180 unsigned int num_points)
182 unsigned int number = 0;
183 const unsigned int eighthPoints = num_points / 8;
185 float* cPtr = cVector;
186 const float* aPtr = aVector;
188 __m256 aVal, cVal, x2, a, b;
189 __m256 const1, const2, const3, const4, const5, const6;
190 const1 = _mm256_set1_ps(135135.0f);
191 const2 = _mm256_set1_ps(17325.0f);
192 const3 = _mm256_set1_ps(378.0f);
193 const4 = _mm256_set1_ps(62370.0f);
194 const5 = _mm256_set1_ps(3150.0f);
195 const6 = _mm256_set1_ps(28.0f);
196 for(;number < eighthPoints; number++){
198 aVal = _mm256_load_ps(aPtr);
199 x2 = _mm256_mul_ps(aVal, aVal);
200 a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
201 b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
203 cVal = _mm256_div_ps(a, b);
205 _mm256_store_ps(cPtr, cVal);
211 number = eighthPoints * 8;
212 for(;number < num_points; number++) {
215 else if(*aPtr <= -4.97)
218 float x2 = (*aPtr) * (*aPtr);
219 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
220 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
228 #if LV_HAVE_AVX && LV_HAVE_FMA 229 #include <immintrin.h> 232 volk_32f_tanh_32f_a_avx_fma(
float* cVector,
const float* aVector,
233 unsigned int num_points)
235 unsigned int number = 0;
236 const unsigned int eighthPoints = num_points / 8;
238 float* cPtr = cVector;
239 const float* aPtr = aVector;
241 __m256 aVal, cVal, x2, a, b;
242 __m256 const1, const2, const3, const4, const5, const6;
243 const1 = _mm256_set1_ps(135135.0f);
244 const2 = _mm256_set1_ps(17325.0f);
245 const3 = _mm256_set1_ps(378.0f);
246 const4 = _mm256_set1_ps(62370.0f);
247 const5 = _mm256_set1_ps(3150.0f);
248 const6 = _mm256_set1_ps(28.0f);
249 for(;number < eighthPoints; number++){
251 aVal = _mm256_load_ps(aPtr);
252 x2 = _mm256_mul_ps(aVal, aVal);
253 a = _mm256_mul_ps(aVal, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2),const1));
254 b = _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
256 cVal = _mm256_div_ps(a, b);
258 _mm256_store_ps(cPtr, cVal);
264 number = eighthPoints * 8;
265 for(;number < num_points; number++) {
268 else if(*aPtr <= -4.97)
271 float x2 = (*aPtr) * (*aPtr);
272 float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
273 float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
#ifndef INCLUDED_volk_32f_tanh_32f_u_H
#define INCLUDED_volk_32f_tanh_32f_u_H

#include <inttypes.h>

/*
 * Hyperbolic tangent kernels, unaligned variants.
 *
 * Every kernel reads num_points floats from aVector and writes num_points
 * floats to cVector using the rational approximation
 *
 *   tanh(x) ~= x * (135135 + 17325 x^2 + 378 x^4 + x^6)
 *              / (135135 + 62370 x^2 + 3150 x^4 + 28 x^6)
 *
 * The scalar tail loops saturate the result to +1 for x > 4.97 and to -1
 * for x <= -4.97.
 *
 * NOTE(review): the vectorised loop bodies evaluate the rational
 * approximation without any saturation, so inputs with a large magnitude
 * that are processed by a vector iteration are not clamped to +/-1.
 */

#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*
 * SSE kernel, four samples per vector iteration.
 *
 * Uses unaligned loads and stores (_mm_loadu_ps / _mm_storeu_ps). The
 * remaining num_points % 4 samples are handled by a scalar loop that
 * saturates.
 */
static inline void
volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector,
                        unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m128 aVal, cVal, x2, a, b;
  __m128 const1, const2, const3, const4, const5, const6;
  const1 = _mm_set_ps1(135135.0f);
  const2 = _mm_set_ps1(17325.0f);
  const3 = _mm_set_ps1(378.0f);
  const4 = _mm_set_ps1(62370.0f);
  const5 = _mm_set_ps1(3150.0f);
  const6 = _mm_set_ps1(28.0f);
  for(;number < quarterPoints; number++){
    aVal = _mm_loadu_ps(aPtr);
    x2 = _mm_mul_ps(aVal, aVal);
    a = _mm_mul_ps(aVal, _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
    b = _mm_add_ps(const1, _mm_mul_ps(x2, _mm_add_ps(const4, _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

    cVal = _mm_div_ps(a, b);

    _mm_storeu_ps(cPtr, cVal);

    aPtr += 4;
    cPtr += 4;
  }

  /* Scalar tail for the samples that do not fill a whole vector. */
  number = quarterPoints * 4;
  for(;number < num_points; number++) {
    const float x = *aPtr++;
    if(x > 4.97) {
      *cPtr++ = 1.0f;
    }
    else if(x <= -4.97) {
      *cPtr++ = -1.0f;
    }
    else {
      const float xx = x * x;
      const float num = x * (135135.0f + xx * (17325.0f + xx * (378.0f + xx)));
      const float den = 135135.0f + xx * (62370.0f + xx * (3150.0f + xx * 28.0f));
      *cPtr++ = num / den;
    }
  }
}
#endif /* LV_HAVE_SSE */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * AVX kernel, eight samples per vector iteration.
 *
 * Uses unaligned loads and stores (_mm256_loadu_ps / _mm256_storeu_ps).
 * The remaining num_points % 8 samples are handled by a scalar loop that
 * saturates.
 */
static inline void
volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector,
                        unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m256 aVal, cVal, x2, a, b;
  __m256 const1, const2, const3, const4, const5, const6;
  const1 = _mm256_set1_ps(135135.0f);
  const2 = _mm256_set1_ps(17325.0f);
  const3 = _mm256_set1_ps(378.0f);
  const4 = _mm256_set1_ps(62370.0f);
  const5 = _mm256_set1_ps(3150.0f);
  const6 = _mm256_set1_ps(28.0f);
  for(;number < eighthPoints; number++){
    aVal = _mm256_loadu_ps(aPtr);
    x2 = _mm256_mul_ps(aVal, aVal);
    a = _mm256_mul_ps(aVal, _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const2, _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
    b = _mm256_add_ps(const1, _mm256_mul_ps(x2, _mm256_add_ps(const4, _mm256_mul_ps(x2, _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));

    cVal = _mm256_div_ps(a, b);

    _mm256_storeu_ps(cPtr, cVal);

    aPtr += 8;
    cPtr += 8;
  }

  /* Scalar tail for the samples that do not fill a whole vector. */
  number = eighthPoints * 8;
  for(;number < num_points; number++) {
    const float x = *aPtr++;
    if(x > 4.97) {
      *cPtr++ = 1.0f;
    }
    else if(x <= -4.97) {
      *cPtr++ = -1.0f;
    }
    else {
      const float xx = x * x;
      const float num = x * (135135.0f + xx * (17325.0f + xx * (378.0f + xx)));
      const float den = 135135.0f + xx * (62370.0f + xx * (3150.0f + xx * 28.0f));
      *cPtr++ = num / den;
    }
  }
}
#endif /* LV_HAVE_AVX */


#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

/*
 * AVX + FMA kernel, eight samples per vector iteration.
 *
 * Same polynomial as the plain AVX kernel, evaluated with fused
 * multiply-add steps. Uses unaligned loads and stores. The remaining
 * num_points % 8 samples are handled by a scalar loop that saturates.
 */
static inline void
volk_32f_tanh_32f_u_avx_fma(
    float* cVector,
    const float* aVector,
    unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;

  float* cPtr = cVector;
  const float* aPtr = aVector;

  __m256 aVal, cVal, x2, a, b;
  __m256 const1, const2, const3, const4, const5, const6;
  const1 = _mm256_set1_ps(135135.0f);
  const2 = _mm256_set1_ps(17325.0f);
  const3 = _mm256_set1_ps(378.0f);
  const4 = _mm256_set1_ps(62370.0f);
  const5 = _mm256_set1_ps(3150.0f);
  const6 = _mm256_set1_ps(28.0f);
  for(;number < eighthPoints; number++){
    aVal = _mm256_loadu_ps(aPtr);
    x2 = _mm256_mul_ps(aVal, aVal);
    a = _mm256_mul_ps(aVal, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2),const1));
    b = _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);

    cVal = _mm256_div_ps(a, b);

    _mm256_storeu_ps(cPtr, cVal);

    aPtr += 8;
    cPtr += 8;
  }

  /* Scalar tail for the samples that do not fill a whole vector. */
  number = eighthPoints * 8;
  for(;number < num_points; number++) {
    const float x = *aPtr++;
    if(x > 4.97) {
      *cPtr++ = 1.0f;
    }
    else if(x <= -4.97) {
      *cPtr++ = -1.0f;
    }
    else {
      const float xx = x * x;
      const float num = x * (135135.0f + xx * (17325.0f + xx * (378.0f + xx)));
      const float den = 135135.0f + xx * (62370.0f + xx * (3150.0f + xx * 28.0f));
      *cPtr++ = num / den;
    }
  }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */

#endif /* INCLUDED_volk_32f_tanh_32f_u_H */
static void volk_32f_tanh_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:179
static void volk_32f_tanh_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:351
static void volk_32f_tanh_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:79
static void volk_32f_tanh_32f_series(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:96
static void volk_32f_tanh_32f_u_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:297
static void volk_32f_tanh_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_tanh_32f.h:125