ViennaCL - The Vienna Computing Library  1.2.0
matrix_prod_row_row_row_source.h
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_KERNELS_MATRIX_PROD_ROW_ROW_ROW_SOURCE_HPP_
2 #define VIENNACL_LINALG_KERNELS_MATRIX_PROD_ROW_ROW_ROW_SOURCE_HPP_
3 //Automatically generated file from auxiliary-directory, do not edit manually!
4 namespace viennacl
5 {
6  namespace linalg
7  {
8  namespace kernels
9  {
/**
 * OpenCL C source of the single-precision kernel prod_TT: C = A^T * B^T with
 * A, B and C all stored row-major.
 *
 * Kernel arguments: for each matrix a __global buffer followed by
 * <M>_row_start, <M>_col_start, <M>_row_size, <M>_col_size, <M>_internal_rows
 * and <M>_internal_cols; <M>_internal_cols is used as the row stride when
 * indexing. A_internal_rows, B_internal_rows, C_row_size, C_col_size and
 * C_internal_rows are accepted but never read by the kernel body.
 * bufA and bufB are __local scratch tiles; the kernel indexes each of them as
 * a block_size x block_size array, with block_size = get_local_size(0).
 *
 * The work-item with global id (i, j) accumulates C(i, j) tile by tile over
 * the shared dimension (A_row_size, rounded up to whole tiles). Tile elements
 * outside the matrices are loaded as 0, and the result is stored only when
 * i < A_col_size and j < B_row_size.
 *
 * NOTE(review): get_local_size(0) is used as the tile edge in both
 * dimensions, so the kernel presumably expects a square work-group - confirm
 * against the host launch code.
 * NOTE(review): the enclosing header is marked as generated from the
 * auxiliary directory; mirror any change made here in the generator input.
 */
const char * const matrix_prod_row_row_row_align1_prod_TT =
"// file automatically generated - do not edit!\n"
"// matrix-matrix multiplication C = A^T * B^T\n"
"// matrix layouts: C...row_major, A...row_major, B...row_major\n"
"__kernel void prod_TT(\n"
" __global const float * A,\n"
" unsigned int A_row_start,\n"
" unsigned int A_col_start,\n"
" unsigned int A_row_size,\n"
" unsigned int A_col_size,\n"
" unsigned int A_internal_rows,\n"
" unsigned int A_internal_cols,\n"
" __global const float * B, \n"
" unsigned int B_row_start,\n"
" unsigned int B_col_start,\n"
" unsigned int B_row_size,\n"
" unsigned int B_col_size,\n"
" unsigned int B_internal_rows,\n"
" unsigned int B_internal_cols,\n"
" __global float * C,\n"
" unsigned int C_row_start,\n"
" unsigned int C_col_start,\n"
" unsigned int C_row_size,\n"
" unsigned int C_col_size,\n"
" unsigned int C_internal_rows,\n"
" unsigned int C_internal_cols,\n"
" __local float * bufA,\n"
" __local float * bufB) \n"
"{ \n"
" size_t block_size = get_local_size(0);\n"
" size_t row_block_id = get_group_id(0);\n"
" size_t col_block_id = get_group_id(1);\n"
" size_t row_thread_id = get_local_id(0);\n"
" size_t col_thread_id = get_local_id(1);\n"
// A is read transposed: the tile walks down the rows of A (aStep is a whole
// tile of rows), starting at the column block owned by this work-group.
" size_t aBegin = (row_block_id * block_size + A_col_start) + A_row_start * A_internal_cols;\n"
" size_t aStep = block_size * A_internal_cols;\n"
// B is read transposed as well: the tile walks along the columns of B.
" size_t bBegin = (col_block_id * block_size + B_row_start) * B_internal_cols + B_col_start;\n"
" size_t bStep = block_size;\n"
// Number of tiles along the shared dimension, rounded up.
" size_t block_num = A_row_size / block_size;\n"
" if (block_num * block_size != A_row_size)\n"
" ++block_num;\n"
" float Csub = 0;\n"
" size_t aOffset = row_thread_id + col_thread_id * A_internal_cols;\n"
" size_t bOffset = row_thread_id + col_thread_id * B_internal_cols;\n"
" size_t row_thread_id_times_block_size = row_thread_id * block_size;\n"
" for (size_t block = 0;\n"
" block < block_num;\n"
" ++block)\n"
" {\n"
// Each work-item loads one element per tile; out-of-range elements become 0
// so that partial tiles do not contribute to the sum.
" bufA[row_thread_id_times_block_size + col_thread_id] = (block * block_size + col_thread_id < A_row_size && get_global_id(0) < A_col_size) ? A[aBegin + aOffset] : 0;\n"
" bufB[col_thread_id * block_size + row_thread_id] = ( (block * block_size + row_thread_id < B_col_size) && get_global_id(1) < B_row_size ) ? B[bBegin + bOffset] : 0;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"__local float * bufAptr = bufA + row_thread_id_times_block_size;\n"
"__local float * bufBptr = bufB + col_thread_id * block_size;\n"
// Walk the full tile edge taken from get_local_size(0) instead of a fixed
// 15 steps, so the dot product stays correct for any work-group size.
" for (size_t k = 0; k < block_size; ++k, ++bufAptr, ++bufBptr)\n"
" Csub += (*bufAptr) * (*bufBptr);\n"
// Second barrier: nobody may overwrite the tiles while others still read them.
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" aBegin += aStep;\n"
" bBegin += bStep;\n"
" }\n"
" if (get_global_id(0) < A_col_size && get_global_id(1) < B_row_size)\n"
" C[(get_global_id(0) + C_row_start) * C_internal_cols + get_global_id(1) + C_col_start] = Csub;\n"
"}\n"
; //matrix_prod_row_row_row_align1_prod_TT
88 
/**
 * OpenCL C source of the single-precision kernel prod_AA: C = A * B with
 * A, B and C all stored row-major.
 *
 * Kernel arguments: for each matrix a __global buffer followed by
 * <M>_row_start, <M>_col_start, <M>_row_size, <M>_col_size, <M>_internal_rows
 * and <M>_internal_cols; <M>_internal_cols is used as the row stride when
 * indexing. A_internal_rows, B_internal_rows, C_row_size, C_col_size and
 * C_internal_rows are accepted but never read by the kernel body.
 * bufA and bufB are __local scratch tiles; the kernel indexes each of them as
 * a block_size x block_size array, with block_size = get_local_size(0).
 *
 * The work-item with global id (i, j) accumulates C(i, j) tile by tile over
 * the shared dimension (A_col_size, rounded up to whole tiles). Tile elements
 * outside the matrices are loaded as 0, and the result is stored only when
 * i < A_row_size and j < B_col_size.
 *
 * NOTE(review): get_local_size(0) is used as the tile edge in both
 * dimensions, so the kernel presumably expects a square work-group - confirm
 * against the host launch code.
 * NOTE(review): the enclosing header is marked as generated from the
 * auxiliary directory; mirror any change made here in the generator input.
 */
const char * const matrix_prod_row_row_row_align1_prod_AA =
"// file automatically generated - do not edit!\n"
"// matrix-matrix multiplication C = A * B\n"
"// matrix layouts: C...row_major, A...row_major, B...row_major\n"
"__kernel void prod_AA(\n"
" __global const float * A,\n"
" unsigned int A_row_start,\n"
" unsigned int A_col_start,\n"
" unsigned int A_row_size,\n"
" unsigned int A_col_size,\n"
" unsigned int A_internal_rows,\n"
" unsigned int A_internal_cols,\n"
" __global const float * B, \n"
" unsigned int B_row_start,\n"
" unsigned int B_col_start,\n"
" unsigned int B_row_size,\n"
" unsigned int B_col_size,\n"
" unsigned int B_internal_rows,\n"
" unsigned int B_internal_cols,\n"
" __global float * C,\n"
" unsigned int C_row_start,\n"
" unsigned int C_col_start,\n"
" unsigned int C_row_size,\n"
" unsigned int C_col_size,\n"
" unsigned int C_internal_rows,\n"
" unsigned int C_internal_cols,\n"
" __local float * bufA,\n"
" __local float * bufB) \n"
"{ \n"
" size_t block_size = get_local_size(0);\n"
" size_t row_block_id = get_group_id(0);\n"
" size_t col_block_id = get_group_id(1);\n"
" size_t row_thread_id = get_local_id(0);\n"
" size_t col_thread_id = get_local_id(1);\n"
// A is read as stored: the tile walks along the columns of A, starting at the
// row block owned by this work-group.
" size_t aBegin = (row_block_id * block_size + A_row_start) * A_internal_cols + A_col_start;\n"
" size_t aStep = block_size;\n"
// B is read as stored: the tile walks down the rows of B (bStep is a whole
// tile of rows).
" size_t bBegin = (col_block_id * block_size + B_col_start) + B_row_start * B_internal_cols;\n"
" size_t bStep = block_size * B_internal_cols;\n"
// Number of tiles along the shared dimension, rounded up.
" size_t block_num = A_col_size / block_size;\n"
" if (block_num * block_size != A_col_size)\n"
" ++block_num;\n"
" float Csub = 0;\n"
" size_t aOffset = row_thread_id * A_internal_cols + col_thread_id;\n"
" size_t bOffset = row_thread_id * B_internal_cols + col_thread_id;\n"
" size_t row_thread_id_times_block_size = row_thread_id * block_size;\n"
" for (size_t block = 0;\n"
" block < block_num;\n"
" ++block)\n"
" {\n"
// Each work-item loads one element per tile; out-of-range elements become 0
// so that partial tiles do not contribute to the sum.
" bufA[row_thread_id_times_block_size + col_thread_id] = (block * block_size + col_thread_id < A_col_size && get_global_id(0) < A_row_size) ? A[aBegin + aOffset] : 0;\n"
" bufB[col_thread_id * block_size + row_thread_id] = ( (block * block_size + row_thread_id < B_row_size) && get_global_id(1) < B_col_size ) ? B[bBegin + bOffset] : 0;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"__local float * bufAptr = bufA + row_thread_id_times_block_size;\n"
"__local float * bufBptr = bufB + col_thread_id * block_size;\n"
// Walk the full tile edge taken from get_local_size(0) instead of a fixed
// 15 steps, so the dot product stays correct for any work-group size.
" for (size_t k = 0; k < block_size; ++k, ++bufAptr, ++bufBptr)\n"
" Csub += (*bufAptr) * (*bufBptr);\n"
// Second barrier: nobody may overwrite the tiles while others still read them.
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" aBegin += aStep;\n"
" bBegin += bStep;\n"
" }\n"
" if (get_global_id(0) < A_row_size && get_global_id(1) < B_col_size)\n"
" C[(get_global_id(0) + C_row_start) * C_internal_cols + get_global_id(1) + C_col_start] = Csub;\n"
"}\n"
; //matrix_prod_row_row_row_align1_prod_AA
167 
/**
 * OpenCL C source of the single-precision kernel prod_TA: C = A^T * B with
 * A, B and C all stored row-major.
 *
 * Kernel arguments: for each matrix a __global buffer followed by
 * <M>_row_start, <M>_col_start, <M>_row_size, <M>_col_size, <M>_internal_rows
 * and <M>_internal_cols; <M>_internal_cols is used as the row stride when
 * indexing. A_internal_rows, B_internal_rows, C_row_size, C_col_size and
 * C_internal_rows are accepted but never read by the kernel body.
 * bufA and bufB are __local scratch tiles; the kernel indexes each of them as
 * a block_size x block_size array, with block_size = get_local_size(0).
 *
 * The work-item with global id (i, j) accumulates C(i, j) tile by tile over
 * the shared dimension (A_row_size, rounded up to whole tiles). Tile elements
 * outside the matrices are loaded as 0, and the result is stored only when
 * i < A_col_size and j < B_col_size.
 *
 * NOTE(review): get_local_size(0) is used as the tile edge in both
 * dimensions, so the kernel presumably expects a square work-group - confirm
 * against the host launch code.
 * NOTE(review): the enclosing header is marked as generated from the
 * auxiliary directory; mirror any change made here in the generator input.
 */
const char * const matrix_prod_row_row_row_align1_prod_TA =
"// file automatically generated - do not edit!\n"
"// matrix-matrix multiplication C = A^T * B\n"
"// matrix layouts: C...row_major, A...row_major, B...row_major\n"
"__kernel void prod_TA(\n"
" __global const float * A,\n"
" unsigned int A_row_start,\n"
" unsigned int A_col_start,\n"
" unsigned int A_row_size,\n"
" unsigned int A_col_size,\n"
" unsigned int A_internal_rows,\n"
" unsigned int A_internal_cols,\n"
" __global const float * B, \n"
" unsigned int B_row_start,\n"
" unsigned int B_col_start,\n"
" unsigned int B_row_size,\n"
" unsigned int B_col_size,\n"
" unsigned int B_internal_rows,\n"
" unsigned int B_internal_cols,\n"
" __global float * C,\n"
" unsigned int C_row_start,\n"
" unsigned int C_col_start,\n"
" unsigned int C_row_size,\n"
" unsigned int C_col_size,\n"
" unsigned int C_internal_rows,\n"
" unsigned int C_internal_cols,\n"
" __local float * bufA,\n"
" __local float * bufB) \n"
"{ \n"
" size_t block_size = get_local_size(0);\n"
" size_t row_block_id = get_group_id(0);\n"
" size_t col_block_id = get_group_id(1);\n"
" size_t row_thread_id = get_local_id(0);\n"
" size_t col_thread_id = get_local_id(1);\n"
// A is read transposed: the tile walks down the rows of A (aStep is a whole
// tile of rows), starting at the column block owned by this work-group.
" size_t aBegin = (row_block_id * block_size + A_col_start) + A_row_start * A_internal_cols;\n"
" size_t aStep = block_size * A_internal_cols;\n"
// B is read as stored: the tile walks down the rows of B.
" size_t bBegin = (col_block_id * block_size + B_col_start) + B_row_start * B_internal_cols;\n"
" size_t bStep = block_size * B_internal_cols;\n"
// Number of tiles along the shared dimension, rounded up.
" size_t block_num = A_row_size / block_size;\n"
" if (block_num * block_size != A_row_size)\n"
" ++block_num;\n"
" float Csub = 0;\n"
" size_t aOffset = row_thread_id + col_thread_id * A_internal_cols;\n"
" size_t bOffset = row_thread_id * B_internal_cols + col_thread_id;\n"
" size_t row_thread_id_times_block_size = row_thread_id * block_size;\n"
" for (size_t block = 0;\n"
" block < block_num;\n"
" ++block)\n"
" {\n"
// Each work-item loads one element per tile; out-of-range elements become 0
// so that partial tiles do not contribute to the sum.
" bufA[row_thread_id_times_block_size + col_thread_id] = (block * block_size + col_thread_id < A_row_size && get_global_id(0) < A_col_size) ? A[aBegin + aOffset] : 0;\n"
" bufB[col_thread_id * block_size + row_thread_id] = ( (block * block_size + row_thread_id < B_row_size) && get_global_id(1) < B_col_size ) ? B[bBegin + bOffset] : 0;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"__local float * bufAptr = bufA + row_thread_id_times_block_size;\n"
"__local float * bufBptr = bufB + col_thread_id * block_size;\n"
// Walk the full tile edge taken from get_local_size(0) instead of a fixed
// 15 steps, so the dot product stays correct for any work-group size.
" for (size_t k = 0; k < block_size; ++k, ++bufAptr, ++bufBptr)\n"
" Csub += (*bufAptr) * (*bufBptr);\n"
// Second barrier: nobody may overwrite the tiles while others still read them.
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" aBegin += aStep;\n"
" bBegin += bStep;\n"
" }\n"
" if (get_global_id(0) < A_col_size && get_global_id(1) < B_col_size)\n"
" C[(get_global_id(0) + C_row_start) * C_internal_cols + get_global_id(1) + C_col_start] = Csub;\n"
"}\n"
; //matrix_prod_row_row_row_align1_prod_TA
246 
/**
 * OpenCL C source of the single-precision kernel prod_AT: C = A * B^T with
 * A, B and C all stored row-major.
 *
 * Kernel arguments: for each matrix a __global buffer followed by
 * <M>_row_start, <M>_col_start, <M>_row_size, <M>_col_size, <M>_internal_rows
 * and <M>_internal_cols; <M>_internal_cols is used as the row stride when
 * indexing. A_internal_rows, B_internal_rows, C_row_size, C_col_size and
 * C_internal_rows are accepted but never read by the kernel body.
 * bufA and bufB are __local scratch tiles; the kernel indexes each of them as
 * a block_size x block_size array, with block_size = get_local_size(0).
 *
 * The work-item with global id (i, j) accumulates C(i, j) tile by tile over
 * the shared dimension (A_col_size, rounded up to whole tiles). Tile elements
 * outside the matrices are loaded as 0, and the result is stored only when
 * i < A_row_size and j < B_row_size.
 *
 * NOTE(review): get_local_size(0) is used as the tile edge in both
 * dimensions, so the kernel presumably expects a square work-group - confirm
 * against the host launch code.
 * NOTE(review): the enclosing header is marked as generated from the
 * auxiliary directory; mirror any change made here in the generator input.
 */
const char * const matrix_prod_row_row_row_align1_prod_AT =
"// file automatically generated - do not edit!\n"
"// matrix-matrix multiplication C = A * B^T\n"
"// matrix layouts: C...row_major, A...row_major, B...row_major\n"
"__kernel void prod_AT(\n"
" __global const float * A,\n"
" unsigned int A_row_start,\n"
" unsigned int A_col_start,\n"
" unsigned int A_row_size,\n"
" unsigned int A_col_size,\n"
" unsigned int A_internal_rows,\n"
" unsigned int A_internal_cols,\n"
" __global const float * B, \n"
" unsigned int B_row_start,\n"
" unsigned int B_col_start,\n"
" unsigned int B_row_size,\n"
" unsigned int B_col_size,\n"
" unsigned int B_internal_rows,\n"
" unsigned int B_internal_cols,\n"
" __global float * C,\n"
" unsigned int C_row_start,\n"
" unsigned int C_col_start,\n"
" unsigned int C_row_size,\n"
" unsigned int C_col_size,\n"
" unsigned int C_internal_rows,\n"
" unsigned int C_internal_cols,\n"
" __local float * bufA,\n"
" __local float * bufB) \n"
"{ \n"
" size_t block_size = get_local_size(0);\n"
" size_t row_block_id = get_group_id(0);\n"
" size_t col_block_id = get_group_id(1);\n"
" size_t row_thread_id = get_local_id(0);\n"
" size_t col_thread_id = get_local_id(1);\n"
// A is read as stored: the tile walks along the columns of A, starting at the
// row block owned by this work-group.
" size_t aBegin = (row_block_id * block_size + A_row_start) * A_internal_cols + A_col_start;\n"
" size_t aStep = block_size;\n"
// B is read transposed: the tile walks along the columns of B, starting at
// the row block owned by this work-group.
" size_t bBegin = (col_block_id * block_size + B_row_start) * B_internal_cols + B_col_start;\n"
" size_t bStep = block_size;\n"
// Number of tiles along the shared dimension, rounded up.
" size_t block_num = A_col_size / block_size;\n"
" if (block_num * block_size != A_col_size)\n"
" ++block_num;\n"
" float Csub = 0;\n"
" size_t aOffset = row_thread_id * A_internal_cols + col_thread_id;\n"
" size_t bOffset = row_thread_id + col_thread_id * B_internal_cols;\n"
" size_t row_thread_id_times_block_size = row_thread_id * block_size;\n"
" for (size_t block = 0;\n"
" block < block_num;\n"
" ++block)\n"
" {\n"
// Each work-item loads one element per tile; out-of-range elements become 0
// so that partial tiles do not contribute to the sum.
" bufA[row_thread_id_times_block_size + col_thread_id] = (block * block_size + col_thread_id < A_col_size && get_global_id(0) < A_row_size) ? A[aBegin + aOffset] : 0;\n"
" bufB[col_thread_id * block_size + row_thread_id] = ( (block * block_size + row_thread_id < B_col_size) && get_global_id(1) < B_row_size ) ? B[bBegin + bOffset] : 0;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"__local float * bufAptr = bufA + row_thread_id_times_block_size;\n"
"__local float * bufBptr = bufB + col_thread_id * block_size;\n"
// Walk the full tile edge taken from get_local_size(0) instead of a fixed
// 15 steps, so the dot product stays correct for any work-group size.
" for (size_t k = 0; k < block_size; ++k, ++bufAptr, ++bufBptr)\n"
" Csub += (*bufAptr) * (*bufBptr);\n"
// Second barrier: nobody may overwrite the tiles while others still read them.
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" aBegin += aStep;\n"
" bBegin += bStep;\n"
" }\n"
" if (get_global_id(0) < A_row_size && get_global_id(1) < B_row_size)\n"
" C[(get_global_id(0) + C_row_start) * C_internal_cols + get_global_id(1) + C_col_start] = Csub;\n"
"}\n"
; //matrix_prod_row_row_row_align1_prod_AT
325 
326  } //namespace kernels
327  } //namespace linalg
328 } //namespace viennacl
329 #endif