/*
 * Aligned-kernel include guard, SSE header include, and the visible part of
 * the body of volk_32f_sqrt_32f_a_sse(float *cVector, const float *aVector,
 * unsigned int num_points): writes the square root of each input float to
 * cVector.
 *
 * NOTE(review): this is an excerpt of a numbered listing. The signature,
 * braces, and the declarations of aVal/cVal are not visible here.
 */
65 #ifndef INCLUDED_volk_32f_sqrt_32f_a_H 66 #define INCLUDED_volk_32f_sqrt_32f_a_H 73 #include <xmmintrin.h> 78 unsigned int number = 0;
/* Count of complete 4-float groups; the remainder goes to the scalar loop. */
79 const unsigned int quarterPoints = num_points / 4;
81 float* cPtr = cVector;
82 const float* aPtr = aVector;
/* Vector loop: one 4-wide load, square root, and store per iteration. */
85 for(;number < quarterPoints; number++) {
/* _mm_load_ps is the aligned load: aPtr must be 16-byte aligned. */
86 aVal = _mm_load_ps(aPtr);
88 cVal = _mm_sqrt_ps(aVal);
/* _mm_store_ps is the aligned store: cPtr must be 16-byte aligned. */
90 _mm_store_ps(cPtr,cVal);
/*
 * NOTE(review): no pointer advance is visible in this excerpt; presumably
 * aPtr and cPtr are stepped by 4 on elided lines -- confirm. The tail loop
 * below relies on them pointing at the first unprocessed element.
 */
/* Scalar tail: handles the num_points % 4 elements left after the vector loop. */
96 number = quarterPoints * 4;
97 for(;number < num_points; number++) {
98 *cPtr++ = sqrtf(*aPtr++);
/*
 * AVX header include and the visible part of the body of
 * volk_32f_sqrt_32f_a_avx(float *cVector, const float *aVector,
 * unsigned int num_points): writes the square root of each input float to
 * cVector, eight floats per vector iteration.
 *
 * NOTE(review): this is an excerpt of a numbered listing. The signature,
 * braces, and the declarations of aVal/cVal are not visible here.
 */
105 #include <immintrin.h> 110 unsigned int number = 0;
/* Count of complete 8-float groups; the remainder goes to the scalar loop. */
111 const unsigned int eighthPoints = num_points / 8;
113 float* cPtr = cVector;
114 const float* aPtr = aVector;
/* Vector loop: one 8-wide load, square root, and store per iteration. */
117 for(;number < eighthPoints; number++) {
/* _mm256_load_ps is the aligned load: aPtr must be 32-byte aligned. */
118 aVal = _mm256_load_ps(aPtr);
120 cVal = _mm256_sqrt_ps(aVal);
/* _mm256_store_ps is the aligned store: cPtr must be 32-byte aligned. */
122 _mm256_store_ps(cPtr,cVal);
/*
 * NOTE(review): no pointer advance is visible in this excerpt; presumably
 * aPtr and cPtr are stepped by 8 on elided lines -- confirm. The tail loop
 * below relies on them pointing at the first unprocessed element.
 */
/* Scalar tail: handles the num_points % 8 elements left after the vector loop. */
128 number = eighthPoints * 8;
129 for(;number < num_points; number++) {
130 *cPtr++ = sqrtf(*aPtr++);
/*
 * NEON header include and the visible part of the body of
 * volk_32f_sqrt_32f_neon(float *cVector, const float *aVector,
 * unsigned int num_points): writes a square-root result for each input float
 * to cVector, four floats per vector iteration.
 *
 * NOTE(review): this is an excerpt of a numbered listing. The signature and
 * braces are not visible here.
 */
138 #include <arm_neon.h> 143 float* cPtr = cVector;
144 const float* aPtr = aVector;
145 unsigned int number = 0;
/* Count of complete 4-float groups; the remainder goes to the scalar loop. */
146 unsigned int quarter_points = num_points / 4;
147 float32x4_t in_vec, out_vec;
149 for(number = 0; number < quarter_points; number++) {
150 in_vec = vld1q_f32(aPtr);
/*
 * sqrt(x) is formed as 1 / (1 / sqrt(x)) from two *estimate* instructions:
 * vrsqrteq_f32 (reciprocal square-root estimate), then vrecpeq_f32
 * (reciprocal estimate). No refinement step (vrsqrtsq_f32 / vrecpsq_f32) is
 * visible.
 * NOTE(review): this path is therefore likely less precise than the sqrtf()
 * tail below and than the SSE/AVX kernels -- confirm the accuracy is
 * acceptable to callers.
 */
152 out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec) );
153 vst1q_f32(cPtr, out_vec);
/*
 * NOTE(review): no pointer advance is visible in this excerpt; presumably
 * aPtr and cPtr are stepped by 4 on elided lines -- confirm.
 */
/* Scalar tail: handles the num_points % 4 elements with the libm sqrtf(). */
158 for(number = quarter_points * 4; number < num_points; number++) {
159 *cPtr++ = sqrtf(*aPtr++);
/*
 * Visible part of the body of volk_32f_sqrt_32f_generic(float *cVector,
 * const float *aVector, unsigned int num_points), compiled only when
 * LV_HAVE_GENERIC is defined. Portable scalar version: one sqrtf() per
 * element, no SIMD intrinsics.
 *
 * NOTE(review): this is an excerpt of a numbered listing. The signature,
 * closing braces, and the matching #endif are not visible here.
 */
166 #ifdef LV_HAVE_GENERIC 171 float* cPtr = cVector;
172 const float* aPtr = aVector;
173 unsigned int number = 0;
/* Walk both buffers in lockstep, writing sqrtf(input) for all num_points elements. */
175 for(number = 0; number < num_points; number++) {
176 *cPtr++ = sqrtf(*aPtr++);
/*
 * Prototype of the ORC implementation that volk_32f_sqrt_32f_u_orc forwards
 * to. The three parameters are passed in the order (cVector, aVector,
 * num_points) at that call site.
 * NOTE(review): the return type and storage class sit on a line that is not
 * visible in this excerpt.
 */
186 volk_32f_sqrt_32f_a_orc_impl(
float *,
const float*,
unsigned int);
/*
 * ORC entry point: forwards its arguments unchanged to
 * volk_32f_sqrt_32f_a_orc_impl and does no work of its own.
 *
 * cVector    - output buffer
 * aVector    - input buffer
 * num_points - element count passed through to the implementation
 *
 * NOTE(review): this wrapper is named "_u_" but calls the "_a_" impl and sits
 * inside the INCLUDED_volk_32f_sqrt_32f_a_H guard -- confirm that is
 * intended. The return type and braces are not visible in this excerpt.
 */
189 volk_32f_sqrt_32f_u_orc(
float* cVector,
const float* aVector,
unsigned int num_points)
191 volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
/*
 * Unaligned-kernel include guard, header includes, and the visible part of
 * the body of volk_32f_sqrt_32f_u_avx(float *cVector, const float *aVector,
 * unsigned int num_points): writes the square root of each input float to
 * cVector, eight floats per vector iteration, with no alignment requirement
 * on either buffer.
 *
 * NOTE(review): this is an excerpt of a numbered listing. The signature,
 * braces, and the declarations of aVal/cVal are not visible here.
 */
198 #ifndef INCLUDED_volk_32f_sqrt_32f_u_H 199 #define INCLUDED_volk_32f_sqrt_32f_u_H 201 #include <inttypes.h> 205 #include <immintrin.h> 210 unsigned int number = 0;
/* Count of complete 8-float groups; the remainder goes to the scalar loop. */
211 const unsigned int eighthPoints = num_points / 8;
213 float* cPtr = cVector;
214 const float* aPtr = aVector;
/* Vector loop: one 8-wide load, square root, and store per iteration. */
217 for(;number < eighthPoints; number++) {
/* _mm256_loadu_ps is the unaligned load: aPtr may have any alignment. */
218 aVal = _mm256_loadu_ps(aPtr);
220 cVal = _mm256_sqrt_ps(aVal);
/* _mm256_storeu_ps is the unaligned store: cPtr may have any alignment. */
222 _mm256_storeu_ps(cPtr,cVal);
/*
 * NOTE(review): no pointer advance is visible in this excerpt; presumably
 * aPtr and cPtr are stepped by 8 on elided lines -- confirm. The tail loop
 * below relies on them pointing at the first unprocessed element.
 */
/* Scalar tail: handles the num_points % 8 elements left after the vector loop. */
228 number = eighthPoints * 8;
229 for(;number < num_points; number++) {
230 *cPtr++ = sqrtf(*aPtr++);
static void volk_32f_sqrt_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:208
static void volk_32f_sqrt_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:76
static void volk_32f_sqrt_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:169
static void volk_32f_sqrt_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:108
static void volk_32f_sqrt_32f_neon(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:141