Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_s32f_power_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef INCLUDED_volk_32f_s32f_power_32f_a_H
72 #define INCLUDED_volk_32f_s32f_power_32f_a_H
73 
74 #include <inttypes.h>
75 #include <stdio.h>
76 #include <math.h>
77 
78 #ifdef LV_HAVE_SSE4_1
79 #include <tmmintrin.h>
80 
81 #ifdef LV_HAVE_LIB_SIMDMATH
82 #include <simdmath.h>
83 #endif /* LV_HAVE_LIB_SIMDMATH */
84 
85 static inline void
86 volk_32f_s32f_power_32f_a_sse4_1(float* cVector, const float* aVector,
87  const float power, unsigned int num_points)
88 {
89  unsigned int number = 0;
90 
91  float* cPtr = cVector;
92  const float* aPtr = aVector;
93 
94 #ifdef LV_HAVE_LIB_SIMDMATH
95  const unsigned int quarterPoints = num_points / 4;
96  __m128 vPower = _mm_set_ps1(power);
97  __m128 zeroValue = _mm_setzero_ps();
98  __m128 signMask;
99  __m128 negatedValues;
100  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
101  __m128 onesMask = _mm_set_ps1(1);
102 
103  __m128 aVal, cVal;
104  for(;number < quarterPoints; number++){
105 
106  aVal = _mm_load_ps(aPtr);
107  signMask = _mm_cmplt_ps(aVal, zeroValue);
108  negatedValues = _mm_sub_ps(zeroValue, aVal);
109  aVal = _mm_blendv_ps(aVal, negatedValues, signMask);
110 
111  // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
112  cVal = powf4(aVal, vPower); // Takes each input value to the specified power
113 
114  cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
115 
116  _mm_store_ps(cPtr,cVal); // Store the results back into the C container
117 
118  aPtr += 4;
119  cPtr += 4;
120  }
121 
122  number = quarterPoints * 4;
123 #endif /* LV_HAVE_LIB_SIMDMATH */
124 
125  for(;number < num_points; number++){
126  *cPtr++ = powf((*aPtr++), power);
127  }
128 }
129 
130 #endif /* LV_HAVE_SSE4_1 */
131 
132 
133 #ifdef LV_HAVE_SSE
134 #include <xmmintrin.h>
135 
136 #ifdef LV_HAVE_LIB_SIMDMATH
137 #include <simdmath.h>
138 #endif /* LV_HAVE_LIB_SIMDMATH */
139 
140 static inline void
141 volk_32f_s32f_power_32f_a_sse(float* cVector, const float* aVector,
142  const float power, unsigned int num_points)
143 {
144  unsigned int number = 0;
145 
146  float* cPtr = cVector;
147  const float* aPtr = aVector;
148 
149 #ifdef LV_HAVE_LIB_SIMDMATH
150  const unsigned int quarterPoints = num_points / 4;
151  __m128 vPower = _mm_set_ps1(power);
152  __m128 zeroValue = _mm_setzero_ps();
153  __m128 signMask;
154  __m128 negatedValues;
155  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
156  __m128 onesMask = _mm_set_ps1(1);
157 
158  __m128 aVal, cVal;
159  for(;number < quarterPoints; number++){
160 
161  aVal = _mm_load_ps(aPtr);
162  signMask = _mm_cmplt_ps(aVal, zeroValue);
163  negatedValues = _mm_sub_ps(zeroValue, aVal);
164  aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) );
165 
166  // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
167  cVal = powf4(aVal, vPower); // Takes each input value to the specified power
168 
169  cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal);
170 
171  _mm_store_ps(cPtr,cVal); // Store the results back into the C container
172 
173  aPtr += 4;
174  cPtr += 4;
175  }
176 
177  number = quarterPoints * 4;
178 #endif /* LV_HAVE_LIB_SIMDMATH */
179 
180  for(;number < num_points; number++){
181  *cPtr++ = powf((*aPtr++), power);
182  }
183 }
184 
185 #endif /* LV_HAVE_SSE */
186 
187 
188 #ifdef LV_HAVE_GENERIC
189 
190 static inline void
191 volk_32f_s32f_power_32f_generic(float* cVector, const float* aVector,
192  const float power, unsigned int num_points)
193 {
194  float* cPtr = cVector;
195  const float* aPtr = aVector;
196  unsigned int number = 0;
197 
198  for(number = 0; number < num_points; number++){
199  *cPtr++ = powf((*aPtr++), power);
200  }
201 }
202 #endif /* LV_HAVE_GENERIC */
203 
204 
205 #endif /* INCLUDED_volk_32f_s32f_power_32f_a_H */
static void volk_32f_s32f_power_32f_a_sse(float *cVector, const float *aVector, const float power, unsigned int num_points)
Definition: volk_32f_s32f_power_32f.h:141
static void volk_32f_s32f_power_32f_generic(float *cVector, const float *aVector, const float power, unsigned int num_points)
Definition: volk_32f_s32f_power_32f.h:191