Vector Optimized Library of Kernels  2.0
Architecture-tuned implementations of math kernels
volk_32f_accumulator_s32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
63 #ifndef INCLUDED_volk_32f_accumulator_s32f_a_H
64 #define INCLUDED_volk_32f_accumulator_s32f_a_H
65 
66 #include <volk/volk_common.h>
67 #include <inttypes.h>
68 
69 #ifdef LV_HAVE_AVX
70 #include <immintrin.h>
71 
72 static inline void
73 volk_32f_accumulator_s32f_a_avx(float* result, const float* inputBuffer, unsigned int num_points)
74 {
75  float returnValue = 0;
76  unsigned int number = 0;
77  const unsigned int eighthPoints = num_points / 8;
78 
79  const float* aPtr = inputBuffer;
80  __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
81 
82  __m256 accumulator = _mm256_setzero_ps();
83  __m256 aVal = _mm256_setzero_ps();
84 
85  for(;number < eighthPoints; number++){
86  aVal = _mm256_load_ps(aPtr);
87  accumulator = _mm256_add_ps(accumulator, aVal);
88  aPtr += 8;
89  }
90 
91  _mm256_store_ps(tempBuffer, accumulator);
92 
93  returnValue = tempBuffer[0];
94  returnValue += tempBuffer[1];
95  returnValue += tempBuffer[2];
96  returnValue += tempBuffer[3];
97  returnValue += tempBuffer[4];
98  returnValue += tempBuffer[5];
99  returnValue += tempBuffer[6];
100  returnValue += tempBuffer[7];
101 
102  number = eighthPoints * 8;
103  for(;number < num_points; number++){
104  returnValue += (*aPtr++);
105  }
106  *result = returnValue;
107 }
108 #endif /* LV_HAVE_AVX */
109 
110 
111 #ifdef LV_HAVE_AVX
112 #include <immintrin.h>
113 
114 static inline void
115 volk_32f_accumulator_s32f_u_avx(float* result, const float* inputBuffer, unsigned int num_points)
116 {
117  float returnValue = 0;
118  unsigned int number = 0;
119  const unsigned int eighthPoints = num_points / 8;
120 
121  const float* aPtr = inputBuffer;
122  __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
123 
124  __m256 accumulator = _mm256_setzero_ps();
125  __m256 aVal = _mm256_setzero_ps();
126 
127  for(;number < eighthPoints; number++){
128  aVal = _mm256_loadu_ps(aPtr);
129  accumulator = _mm256_add_ps(accumulator, aVal);
130  aPtr += 8;
131  }
132 
133  _mm256_store_ps(tempBuffer, accumulator);
134 
135  returnValue = tempBuffer[0];
136  returnValue += tempBuffer[1];
137  returnValue += tempBuffer[2];
138  returnValue += tempBuffer[3];
139  returnValue += tempBuffer[4];
140  returnValue += tempBuffer[5];
141  returnValue += tempBuffer[6];
142  returnValue += tempBuffer[7];
143 
144  number = eighthPoints * 8;
145  for(;number < num_points; number++){
146  returnValue += (*aPtr++);
147  }
148  *result = returnValue;
149 }
150 #endif /* LV_HAVE_AVX */
151 
152 
153 #ifdef LV_HAVE_SSE
154 #include <xmmintrin.h>
155 
156 static inline void
157 volk_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points)
158 {
159  float returnValue = 0;
160  unsigned int number = 0;
161  const unsigned int quarterPoints = num_points / 4;
162 
163  const float* aPtr = inputBuffer;
164  __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
165 
166  __m128 accumulator = _mm_setzero_ps();
167  __m128 aVal = _mm_setzero_ps();
168 
169  for(;number < quarterPoints; number++){
170  aVal = _mm_load_ps(aPtr);
171  accumulator = _mm_add_ps(accumulator, aVal);
172  aPtr += 4;
173  }
174 
175  _mm_store_ps(tempBuffer,accumulator);
176 
177  returnValue = tempBuffer[0];
178  returnValue += tempBuffer[1];
179  returnValue += tempBuffer[2];
180  returnValue += tempBuffer[3];
181 
182  number = quarterPoints * 4;
183  for(;number < num_points; number++){
184  returnValue += (*aPtr++);
185  }
186  *result = returnValue;
187 }
188 #endif /* LV_HAVE_SSE */
189 
190 
191 #ifdef LV_HAVE_SSE
192 #include <xmmintrin.h>
193 
/*!
 * \brief Sums num_points floats from inputBuffer into *result (SSE, unaligned loads).
 *
 * Complete groups of four floats are accumulated lane-wise with 128-bit adds.
 * The four lane totals are then added together in lane order, and the
 * remaining (num_points % 4) floats are added one at a time.
 *
 * \param result      Receives the sum; written once, just before returning.
 * \param inputBuffer Floats to sum. Read with _mm_loadu_ps, so no particular
 *                    alignment is required.
 * \param num_points  Number of floats to read from inputBuffer.
 */
static inline void
volk_32f_accumulator_s32f_u_sse(float* result, const float* inputBuffer, unsigned int num_points)
{
  const unsigned int quarterPoints = num_points / 4;
  const float* aPtr = inputBuffer;
  float tempBuffer[4];
  __m128 accumulator = _mm_setzero_ps();
  unsigned int number;
  float returnValue;

  for(number = 0; number < quarterPoints; number++){
    /* Unaligned load: this is the _u_ kernel, so inputBuffer may sit at any
     * float address. An aligned load (_mm_load_ps) can fault on such input. */
    accumulator = _mm_add_ps(accumulator, _mm_loadu_ps(aPtr));
    aPtr += 4;
  }

  /* Unaligned store, so the scratch array needs no alignment attribute. */
  _mm_storeu_ps(tempBuffer, accumulator);

  returnValue = tempBuffer[0];
  returnValue += tempBuffer[1];
  returnValue += tempBuffer[2];
  returnValue += tempBuffer[3];

  /* Scalar tail: the points that did not fill a whole vector. */
  for(number = quarterPoints * 4; number < num_points; number++){
    returnValue += (*aPtr++);
  }
  *result = returnValue;
}
226 #endif /* LV_HAVE_SSE */
227 
228 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Sums num_points floats from inputBuffer into *result (portable C).
 *
 * The values are added one at a time, in order, starting from 0. When
 * num_points is 0 nothing is read and *result is set to 0.
 *
 * \param result      Receives the sum; written once, just before returning.
 * \param inputBuffer Floats to sum.
 * \param num_points  Number of floats to read from inputBuffer.
 */
static inline void
volk_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points)
{
  float total = 0;
  unsigned int i;

  for(i = 0; i < num_points; i++){
    total += inputBuffer[i];
  }

  *result = total;
}
241 #endif /* LV_HAVE_GENERIC */
242 
243 #endif /* INCLUDED_volk_32f_accumulator_s32f_a_H */
static void volk_32f_accumulator_s32f_u_avx(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:115
static void volk_32f_accumulator_s32f_a_avx(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:73
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:33
static void volk_32f_accumulator_s32f_u_sse(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:195
static void volk_32f_accumulator_s32f_generic(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:230
static void volk_32f_accumulator_s32f_a_sse(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:157