Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_32f_s32f_power_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef INCLUDED_volk_32f_s32f_power_32f_a_H
72 #define INCLUDED_volk_32f_s32f_power_32f_a_H
73 
74 #include <inttypes.h>
75 #include <math.h>
76 #include <stdio.h>
77 
78 #ifdef LV_HAVE_SSE4_1
79 #include <tmmintrin.h>
80 
81 #ifdef LV_HAVE_LIB_SIMDMATH
82 #include <simdmath.h>
83 #endif /* LV_HAVE_LIB_SIMDMATH */
84 
85 static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector,
86  const float* aVector,
87  const float power,
88  unsigned int num_points)
89 {
90  unsigned int number = 0;
91 
92  float* cPtr = cVector;
93  const float* aPtr = aVector;
94 
95 #ifdef LV_HAVE_LIB_SIMDMATH
96  const unsigned int quarterPoints = num_points / 4;
97  __m128 vPower = _mm_set_ps1(power);
98  __m128 zeroValue = _mm_setzero_ps();
99  __m128 signMask;
100  __m128 negatedValues;
101  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
102  __m128 onesMask = _mm_set_ps1(1);
103 
104  __m128 aVal, cVal;
105  for (; number < quarterPoints; number++) {
106 
107  aVal = _mm_load_ps(aPtr);
108  signMask = _mm_cmplt_ps(aVal, zeroValue);
109  negatedValues = _mm_sub_ps(zeroValue, aVal);
110  aVal = _mm_blendv_ps(aVal, negatedValues, signMask);
111 
112  // powf4 doesn't support negative values in the base, so we mask them off and then
113  // apply the negative after
114  cVal = powf4(aVal, vPower); // Takes each input value to the specified power
115 
116  cVal = _mm_mul_ps(_mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
117 
118  _mm_store_ps(cPtr, cVal); // Store the results back into the C container
119 
120  aPtr += 4;
121  cPtr += 4;
122  }
123 
124  number = quarterPoints * 4;
125 #endif /* LV_HAVE_LIB_SIMDMATH */
126 
127  for (; number < num_points; number++) {
128  *cPtr++ = powf((*aPtr++), power);
129  }
130 }
131 
132 #endif /* LV_HAVE_SSE4_1 */
133 
134 
135 #ifdef LV_HAVE_SSE
136 #include <xmmintrin.h>
137 
138 #ifdef LV_HAVE_LIB_SIMDMATH
139 #include <simdmath.h>
140 #endif /* LV_HAVE_LIB_SIMDMATH */
141 
142 static inline void volk_32f_s32f_power_32f_a_sse(float* cVector,
143  const float* aVector,
144  const float power,
145  unsigned int num_points)
146 {
147  unsigned int number = 0;
148 
149  float* cPtr = cVector;
150  const float* aPtr = aVector;
151 
152 #ifdef LV_HAVE_LIB_SIMDMATH
153  const unsigned int quarterPoints = num_points / 4;
154  __m128 vPower = _mm_set_ps1(power);
155  __m128 zeroValue = _mm_setzero_ps();
156  __m128 signMask;
157  __m128 negatedValues;
158  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
159  __m128 onesMask = _mm_set_ps1(1);
160 
161  __m128 aVal, cVal;
162  for (; number < quarterPoints; number++) {
163 
164  aVal = _mm_load_ps(aPtr);
165  signMask = _mm_cmplt_ps(aVal, zeroValue);
166  negatedValues = _mm_sub_ps(zeroValue, aVal);
167  aVal =
168  _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues));
169 
170  // powf4 doesn't support negative values in the base, so we mask them off and then
171  // apply the negative after
172  cVal = powf4(aVal, vPower); // Takes each input value to the specified power
173 
174  cVal = _mm_mul_ps(_mm_or_ps(_mm_andnot_ps(signMask, onesMask),
175  _mm_and_ps(signMask, negativeOneToPower)),
176  cVal);
177 
178  _mm_store_ps(cPtr, cVal); // Store the results back into the C container
179 
180  aPtr += 4;
181  cPtr += 4;
182  }
183 
184  number = quarterPoints * 4;
185 #endif /* LV_HAVE_LIB_SIMDMATH */
186 
187  for (; number < num_points; number++) {
188  *cPtr++ = powf((*aPtr++), power);
189  }
190 }
191 
192 #endif /* LV_HAVE_SSE */
193 
194 
195 #ifdef LV_HAVE_GENERIC
196 
197 static inline void volk_32f_s32f_power_32f_generic(float* cVector,
198  const float* aVector,
199  const float power,
200  unsigned int num_points)
201 {
202  float* cPtr = cVector;
203  const float* aPtr = aVector;
204  unsigned int number = 0;
205 
206  for (number = 0; number < num_points; number++) {
207  *cPtr++ = powf((*aPtr++), power);
208  }
209 }
210 #endif /* LV_HAVE_GENERIC */
211 
212 
213 #endif /* INCLUDED_volk_32f_s32f_power_32f_a_H */
volk_32f_s32f_power_32f_generic
static void volk_32f_s32f_power_32f_generic(float *cVector, const float *aVector, const float power, unsigned int num_points)
Definition: volk_32f_s32f_power_32f.h:197
volk_32f_s32f_power_32f_a_sse
static void volk_32f_s32f_power_32f_a_sse(float *cVector, const float *aVector, const float power, unsigned int num_points)
Definition: volk_32f_s32f_power_32f.h:142