65 #ifndef INCLUDED_volk_32f_sqrt_32f_a_H
66 #define INCLUDED_volk_32f_sqrt_32f_a_H
73 #include <xmmintrin.h>
/*
 * Body of the SSE square-root kernel: reads floats through aVector and
 * writes their square roots through cVector, for num_points elements.
 *
 * NOTE(review): the enclosing function signature, the declarations of
 * aVal/cVal and the closing braces are not visible in this excerpt.
 */
78 unsigned int number = 0;
// Number of complete groups of four floats handled by the vector loop.
79 const unsigned int quarterPoints = num_points / 4;
81 float* cPtr = cVector;
82 const float* aPtr = aVector;
// Vector loop: four square roots per pass.
85 for (; number < quarterPoints; number++) {
// _mm_load_ps is the aligned load: aPtr must be 16-byte aligned here.
86 aVal = _mm_load_ps(aPtr);
88 cVal = _mm_sqrt_ps(aVal);
// _mm_store_ps is the aligned store: cPtr must be 16-byte aligned here.
90 _mm_store_ps(cPtr, cVal);
// NOTE(review): the per-pass advance of aPtr/cPtr is not visible in this
// excerpt; presumably both move forward by 4 floats -- confirm.
// Scalar tail: resume at the first element not covered by a full group of four.
96 number = quarterPoints * 4;
97 for (; number < num_points; number++) {
98 *cPtr++ = sqrtf(*aPtr++);
105 #include <immintrin.h>
/*
 * Body of the aligned AVX square-root kernel: reads floats through aVector
 * and writes their square roots through cVector, for num_points elements.
 *
 * NOTE(review): the enclosing function signature, the declarations of
 * aVal/cVal and the closing braces are not visible in this excerpt.
 */
110 unsigned int number = 0;
// Number of complete groups of eight floats handled by the vector loop.
111 const unsigned int eighthPoints = num_points / 8;
113 float* cPtr = cVector;
114 const float* aPtr = aVector;
// Vector loop: eight square roots per pass.
117 for (; number < eighthPoints; number++) {
// _mm256_load_ps is the aligned load: aPtr must be 32-byte aligned here.
118 aVal = _mm256_load_ps(aPtr);
120 cVal = _mm256_sqrt_ps(aVal);
// _mm256_store_ps is the aligned store: cPtr must be 32-byte aligned here.
122 _mm256_store_ps(cPtr, cVal);
// NOTE(review): the per-pass advance of aPtr/cPtr is not visible in this
// excerpt; presumably both move forward by 8 floats -- confirm.
// Scalar tail: resume at the first element not covered by a full group of eight.
128 number = eighthPoints * 8;
129 for (; number < num_points; number++) {
130 *cPtr++ = sqrtf(*aPtr++);
138 #include <arm_neon.h>
/*
 * Body of the NEON square-root kernel: reads floats through aVector and
 * writes (approximate) square roots through cVector, for num_points elements.
 *
 * NOTE(review): the enclosing function signature and the closing braces are
 * not visible in this excerpt.
 */
143 float* cPtr = cVector;
144 const float* aPtr = aVector;
145 unsigned int number = 0;
// Number of complete groups of four floats handled by the vector loop.
146 unsigned int quarter_points = num_points / 4;
147 float32x4_t in_vec, out_vec;
// Vector loop: four results per pass.
149 for (number = 0; number < quarter_points; number++) {
150 in_vec = vld1q_f32(aPtr);
// sqrt(x) is formed as 1 / (1 / sqrt(x)) using two *estimate* instructions
// (reciprocal-square-root estimate, then reciprocal estimate). The result is
// an approximation, so lanes written here can differ in precision from the
// sqrtf() results produced by the scalar tail below.
152 out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec));
153 vst1q_f32(cPtr, out_vec);
// NOTE(review): the per-pass advance of aPtr/cPtr is not visible in this
// excerpt; presumably both move forward by 4 floats -- confirm.
// Scalar tail: remaining elements after the last full group of four.
158 for (number = quarter_points * 4; number < num_points; number++) {
159 *cPtr++ = sqrtf(*aPtr++);
166 #ifdef LV_HAVE_GENERIC
/*
 * Body of the portable scalar square-root kernel: for each of num_points
 * elements, writes sqrtf() of the value read through aVector to cVector.
 *
 * NOTE(review): the enclosing function signature and the closing braces are
 * not visible in this excerpt.
 */
171 float* cPtr = cVector;
172 const float* aPtr = aVector;
173 unsigned int number = 0;
// One element per iteration; both cursors advance together.
175 for (number = 0; number < num_points; number++) {
176 *cPtr++ = sqrtf(*aPtr++);
/*
 * Declaration of the externally defined ORC implementation. Its definition
 * is not part of this excerpt; the parameters are unnamed here and are passed
 * (cVector, aVector, num_points) by the wrapper that follows.
 */
185 extern void volk_32f_sqrt_32f_a_orc_impl(
float*,
const float*,
unsigned int);
/*
 * Thin wrapper that forwards its three arguments, unchanged and in the same
 * order, to volk_32f_sqrt_32f_a_orc_impl.
 *
 * cVector    - destination float buffer, passed through as the first argument.
 * aVector    - source float buffer, passed through as the second argument.
 * num_points - element count, passed through as the third argument.
 *
 * NOTE(review): the wrapper is named *_u_orc while the implementation it
 * calls is named *_a_orc_impl -- confirm the naming is intentional.
 * NOTE(review): the return-type line and the braces of this definition are
 * not visible in this excerpt.
 */
188 volk_32f_sqrt_32f_u_orc(
float* cVector,
const float* aVector,
unsigned int num_points)
190 volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
197 #ifndef INCLUDED_volk_32f_sqrt_32f_u_H
198 #define INCLUDED_volk_32f_sqrt_32f_u_H
200 #include <inttypes.h>
204 #include <immintrin.h>
/*
 * Body of the unaligned AVX square-root kernel: reads floats through aVector
 * and writes their square roots through cVector, for num_points elements.
 *
 * NOTE(review): the enclosing function signature, the declarations of
 * aVal/cVal and the closing braces are not visible in this excerpt.
 */
209 unsigned int number = 0;
// Number of complete groups of eight floats handled by the vector loop.
210 const unsigned int eighthPoints = num_points / 8;
212 float* cPtr = cVector;
213 const float* aPtr = aVector;
// Vector loop: eight square roots per pass.
216 for (; number < eighthPoints; number++) {
// Unaligned load: no alignment requirement is placed on aPtr.
217 aVal = _mm256_loadu_ps(aPtr);
219 cVal = _mm256_sqrt_ps(aVal);
// Unaligned store: no alignment requirement is placed on cPtr.
221 _mm256_storeu_ps(cPtr, cVal);
// NOTE(review): the per-pass advance of aPtr/cPtr is not visible in this
// excerpt; presumably both move forward by 8 floats -- confirm.
// Scalar tail: resume at the first element not covered by a full group of eight.
227 number = eighthPoints * 8;
228 for (; number < num_points; number++) {
229 *cPtr++ = sqrtf(*aPtr++);