ViennaCL - The Vienna Computing Library  1.2.0
compressed_matrix_source.h
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_KERNELS_COMPRESSED_MATRIX_SOURCE_HPP_
2 #define VIENNACL_LINALG_KERNELS_COMPRESSED_MATRIX_SOURCE_HPP_
3 //Automatically generated file from auxiliary-directory, do not edit manually!
4 namespace viennacl
5 {
6  namespace linalg
7  {
8  namespace kernels
9  {
10 const char * const compressed_matrix_align8_vec_mul =
11 "__kernel void vec_mul(\n"
12 " __global const unsigned int * row_indices,\n"
13 " __global const uint8 * column_indices, \n"
14 " __global const float8 * elements,\n"
15 " __global const float * vector, \n"
16 " __global float * result,\n"
17 " unsigned int size)\n"
18 "{ \n"
19 " float dot_prod;\n"
20 " unsigned int start, next_stop;\n"
21 " uint8 col_idx;\n"
22 " float8 tmp_vec;\n"
23 " float8 tmp_entries;\n"
24 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
25 " {\n"
26 " dot_prod = 0.0f;\n"
27 " start = row_indices[row] / 8;\n"
28 " next_stop = row_indices[row+1] / 8;\n"
29 " for (unsigned int i = start; i < next_stop; ++i)\n"
30 " {\n"
31 " col_idx = column_indices[i];\n"
32 " tmp_entries = elements[i];\n"
33 " tmp_vec.s0 = vector[col_idx.s0];\n"
34 " tmp_vec.s1 = vector[col_idx.s1];\n"
35 " tmp_vec.s2 = vector[col_idx.s2];\n"
36 " tmp_vec.s3 = vector[col_idx.s3];\n"
37 " tmp_vec.s4 = vector[col_idx.s4];\n"
38 " tmp_vec.s5 = vector[col_idx.s5];\n"
39 " tmp_vec.s6 = vector[col_idx.s6];\n"
40 " tmp_vec.s7 = vector[col_idx.s7];\n"
41 " dot_prod += dot(tmp_entries.lo, tmp_vec.lo);\n"
42 " dot_prod += dot(tmp_entries.hi, tmp_vec.hi);\n"
43 " }\n"
44 " result[row] = dot_prod;\n"
45 " }\n"
46 "}\n"
47 ; //compressed_matrix_align8_vec_mul
48 
50 "void helper_bicgstab_kernel2_parallel_reduction( __local float * tmp_buffer )\n"
51 "{\n"
52 " for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
53 " {\n"
54 " barrier(CLK_LOCAL_MEM_FENCE);\n"
55 " if (get_local_id(0) < stride)\n"
56 " tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
57 " }\n"
58 "}\n"
59 "//////// inner products:\n"
60 "float bicgstab_kernel2_inner_prod(\n"
61 " __global const float * vec1,\n"
62 " __global const float * vec2,\n"
63 " unsigned int size,\n"
64 " __local float * tmp_buffer)\n"
65 "{\n"
66 " float tmp = 0;\n"
67 " unsigned int i_end = ((size - 1) / get_local_size(0) + 1) * get_local_size(0);\n"
68 " for (unsigned int i = get_local_id(0); i < i_end; i += get_local_size(0))\n"
69 " {\n"
70 " if (i < size)\n"
71 " tmp += vec1[i] * vec2[i];\n"
72 " }\n"
73 " tmp_buffer[get_local_id(0)] = tmp;\n"
74 " \n"
75 " helper_bicgstab_kernel2_parallel_reduction(tmp_buffer);\n"
76 " barrier(CLK_LOCAL_MEM_FENCE);\n"
77 " return tmp_buffer[0];\n"
78 "}\n"
79 "__kernel void bicgstab_kernel2(\n"
80 " __global const float * tmp0,\n"
81 " __global const float * tmp1,\n"
82 " __global const float * r0star, \n"
83 " __global const float * s, \n"
84 " __global float * p, \n"
85 " __global float * result,\n"
86 " __global float * residual,\n"
87 " __global const float * alpha,\n"
88 " __global float * ip_rr0star,\n"
89 " __global float * error_estimate,\n"
90 " __local float * tmp_buffer,\n"
91 " unsigned int size) \n"
92 "{ \n"
93 " float omega_local = bicgstab_kernel2_inner_prod(tmp1, s, size, tmp_buffer) / bicgstab_kernel2_inner_prod(tmp1, tmp1, size, tmp_buffer);\n"
94 " float alpha_local = alpha[0];\n"
95 " \n"
96 " //result += alpha * p + omega * s;\n"
97 " for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
98 " result[i] += alpha_local * p[i] + omega_local * s[i];\n"
99 " //residual = s - omega * tmp1;\n"
100 " for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
101 " residual[i] = s[i] - omega_local * tmp1[i];\n"
102 " //new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);\n"
103 " float new_ip_rr0star = bicgstab_kernel2_inner_prod(residual, r0star, size, tmp_buffer);\n"
104 " float beta = (new_ip_rr0star / ip_rr0star[0]) * (alpha_local / omega_local);\n"
105 " \n"
106 " //p = residual + beta * (p - omega*tmp0);\n"
107 " for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
108 " p[i] = residual[i] + beta * (p[i] - omega_local * tmp0[i]);\n"
109 " //compute norm of residual:\n"
110 " float new_error_estimate = bicgstab_kernel2_inner_prod(residual, residual, size, tmp_buffer);\n"
111 " barrier(CLK_GLOBAL_MEM_FENCE);\n"
112 " //update values:\n"
113 " if (get_global_id(0) == 0)\n"
114 " {\n"
115 " error_estimate[0] = new_error_estimate;\n"
116 " ip_rr0star[0] = new_ip_rr0star;\n"
117 " }\n"
118 "}\n"
119 ; //compressed_matrix_align1_bicgstab_kernel2
120 
122 " \n"
123 "// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n"
124 "__kernel void lu_forward(\n"
125 " __global const unsigned int * row_indices,\n"
126 " __global const unsigned int * column_indices, \n"
127 " __global const float * elements,\n"
128 " __local int * buffer, \n"
129 " __local float * vec_entries, //a memory block from vector\n"
130 " __global float * vector,\n"
131 " unsigned int size) \n"
132 "{\n"
133 " int waiting_for; //block index that must be finished before the current thread can start\n"
134 " unsigned int waiting_for_index;\n"
135 " int block_offset;\n"
136 " unsigned int col;\n"
137 " unsigned int row;\n"
138 " unsigned int row_index_end;\n"
139 " \n"
140 " //backward substitution: one thread per row in blocks of get_global_size(0)\n"
141 " for (unsigned int block_num = 0; block_num <= size / get_global_size(0); ++block_num)\n"
142 " {\n"
143 " block_offset = block_num * get_global_size(0);\n"
144 " row = block_offset + get_global_id(0);\n"
145 " buffer[get_global_id(0)] = 0; //set flag to 'undone'\n"
146 " waiting_for = -1;\n"
147 " if (row < size)\n"
148 " {\n"
149 " vec_entries[get_global_id(0)] = vector[row];\n"
150 " waiting_for_index = row_indices[row];\n"
151 " row_index_end = row_indices[row+1];\n"
152 " }\n"
153 " \n"
154 " if (get_global_id(0) == 0)\n"
155 " buffer[get_global_size(0)] = 1;\n"
156 " //try to eliminate all lines in the block. \n"
157 " //in worst case scenarios, in each step only one line can be substituted, thus loop\n"
158 " for (unsigned int k = 0; k<get_global_size(0); ++k)\n"
159 " {\n"
160 " barrier(CLK_LOCAL_MEM_FENCE);\n"
161 " if (row < size) //valid index?\n"
162 " {\n"
163 " if (waiting_for >= 0)\n"
164 " {\n"
165 " if (buffer[waiting_for] == 1)\n"
166 " waiting_for = -1;\n"
167 " }\n"
168 " \n"
169 " if (waiting_for == -1) //substitution not yet done, check whether possible\n"
170 " {\n"
171 " //check whether reduction is possible:\n"
172 " for (unsigned int j = waiting_for_index; j < row_index_end; ++j)\n"
173 " {\n"
174 " col = column_indices[j];\n"
175 " if (col < block_offset) //index valid, but not from current block\n"
176 " vec_entries[get_global_id(0)] -= elements[j] * vector[col];\n"
177 " else if (col < row) //index is from current block\n"
178 " {\n"
179 " if (buffer[col - block_offset] == 0) //entry is not yet calculated\n"
180 " {\n"
181 " waiting_for = col - block_offset;\n"
182 " waiting_for_index = j;\n"
183 " break;\n"
184 " }\n"
185 " else //updated entry is available in shared memory:\n"
186 " vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];\n"
187 " }\n"
188 " }\n"
189 " \n"
190 " if (waiting_for == -1) //this row is done\n"
191 " {\n"
192 " buffer[get_global_id(0)] = 1;\n"
193 " waiting_for = -2; //magic number: thread is finished\n"
194 " }\n"
195 " } \n"
196 " } //row < size\n"
197 " else\n"
198 " buffer[get_global_id(0)] = 1; //work done (because there is no work to be done at all...)\n"
199 " ///////// check whether all threads are done. If yes, exit loop /////////////\n"
200 " \n"
201 " if (buffer[get_global_id(0)] == 0)\n"
202 " buffer[get_global_size(0)] = 0;\n"
203 " barrier(CLK_LOCAL_MEM_FENCE);\n"
204 " \n"
205 " if (buffer[get_global_size(0)] > 0) //all threads break this loop simultaneously\n"
206 " break;\n"
207 " if (get_global_id(0) == 0)\n"
208 " buffer[get_global_size(0)] = 1;\n"
209 " } //for k\n"
210 " \n"
211 " //write to vector:\n"
212 " if (row < size)\n"
213 " vector[row] = vec_entries[get_global_id(0)];\n"
214 " \n"
215 " barrier(CLK_GLOBAL_MEM_FENCE);\n"
216 " } //for block_num\n"
217 "}\n"
218 ; //compressed_matrix_align1_lu_forward
219 
220 const char * const compressed_matrix_align1_jacobi =
221 "__kernel void jacobi(\n"
222 " __global const unsigned int * row_indices,\n"
223 " __global const unsigned int * column_indices,\n"
224 " __global const float * elements,\n"
225 " float weight,\n"
226 " __global const float * old_result,\n"
227 " __global float * new_result,\n"
228 " __global const float * rhs,\n"
229 " unsigned int size)\n"
230 " {\n"
231 " float sum, diag=1;\n"
232 " int col;\n"
233 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
234 " {\n"
235 " sum = 0;\n"
236 " for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++)\n"
237 " {\n"
238 " col = column_indices[j];\n"
239 " if (i == col)\n"
240 " diag = elements[j];\n"
241 " else \n"
242 " sum += elements[j] * old_result[col]; \n"
243 " } \n"
244 " new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; \n"
245 " } \n"
246 " } \n"
247 ; //compressed_matrix_align1_jacobi
248 
250 "// compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format\n"
251 "__kernel void lu_backward(\n"
252 " __global const unsigned int * row_indices,\n"
253 " __global const unsigned int * column_indices, \n"
254 " __global const float * elements,\n"
255 " __local int * buffer, \n"
256 " __local float * vec_entries, //a memory block from vector\n"
257 " __global float * vector,\n"
258 " unsigned int size) \n"
259 "{\n"
260 " int waiting_for; //block index that must be finished before the current thread can start\n"
261 " unsigned int waiting_for_index;\n"
262 " unsigned int block_offset;\n"
263 " unsigned int col;\n"
264 " unsigned int row;\n"
265 " unsigned int row_index_end;\n"
266 " float diagonal_entry = 42;\n"
267 " \n"
268 " //forward substitution: one thread per row in blocks of get_global_size(0)\n"
269 " for (int block_num = size / get_global_size(0); block_num > -1; --block_num)\n"
270 " {\n"
271 " block_offset = block_num * get_global_size(0);\n"
272 " row = block_offset + get_global_id(0);\n"
273 " buffer[get_global_id(0)] = 0; //set flag to 'undone'\n"
274 " waiting_for = -1;\n"
275 " \n"
276 " if (row < size)\n"
277 " {\n"
278 " vec_entries[get_global_id(0)] = vector[row];\n"
279 " waiting_for_index = row_indices[row];\n"
280 " row_index_end = row_indices[row+1];\n"
281 " diagonal_entry = column_indices[waiting_for_index];\n"
282 " }\n"
283 " \n"
284 " if (get_global_id(0) == 0)\n"
285 " buffer[get_global_size(0)] = 1;\n"
286 " //try to eliminate all lines in the block. \n"
287 " //in worst case scenarios, in each step only one line can be substituted, thus loop\n"
288 " for (unsigned int k = 0; k<get_global_size(0); ++k)\n"
289 " {\n"
290 " barrier(CLK_LOCAL_MEM_FENCE);\n"
291 " if (row < size) //valid index?\n"
292 " {\n"
293 " if (waiting_for >= 0)\n"
294 " {\n"
295 " if (buffer[waiting_for] == 1)\n"
296 " waiting_for = -1;\n"
297 " }\n"
298 " \n"
299 " if (waiting_for == -1) //substitution not yet done, check whether possible\n"
300 " {\n"
301 " //check whether reduction is possible:\n"
302 " for (unsigned int j = waiting_for_index; j < row_index_end; ++j)\n"
303 " {\n"
304 " col = column_indices[j];\n"
305 " barrier(CLK_LOCAL_MEM_FENCE);\n"
306 " if (col >= block_offset + get_global_size(0)) //index valid, but not from current block\n"
307 " vec_entries[get_global_id(0)] -= elements[j] * vector[col];\n"
308 " else if (col > row) //index is from current block\n"
309 " {\n"
310 " if (buffer[col - block_offset] == 0) //entry is not yet calculated\n"
311 " {\n"
312 " waiting_for = col - block_offset;\n"
313 " waiting_for_index = j;\n"
314 " break;\n"
315 " }\n"
316 " else //updated entry is available in shared memory:\n"
317 " vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];\n"
318 " }\n"
319 " else if (col == row)\n"
320 " diagonal_entry = elements[j];\n"
321 " }\n"
322 " \n"
323 " if (waiting_for == -1) //this row is done\n"
324 " {\n"
325 " if (row == 0)\n"
326 " vec_entries[get_global_id(0)] /= elements[0];\n"
327 " else\n"
328 " vec_entries[get_global_id(0)] /= diagonal_entry;\n"
329 " buffer[get_global_id(0)] = 1;\n"
330 " waiting_for = -2; //magic number: thread is finished\n"
331 " }\n"
332 " } \n"
333 " } //row < size\n"
334 " else\n"
335 " buffer[get_global_id(0)] = 1; //work done (because there is no work to be done at all...)\n"
336 " \n"
337 " ///////// check whether all threads are done. If yes, exit loop /////////////\n"
338 " if (buffer[get_global_id(0)] == 0)\n"
339 " buffer[get_global_size(0)] = 0;\n"
340 " barrier(CLK_LOCAL_MEM_FENCE);\n"
341 " \n"
342 " if (buffer[get_global_size(0)] > 0) //all threads break the loop simultaneously\n"
343 " break;\n"
344 " if (get_global_id(0) == 0)\n"
345 " buffer[get_global_size(0)] = 1;\n"
346 " } //for k\n"
347 " if (row < size)\n"
348 " vector[row] = vec_entries[get_global_id(0)];\n"
349 " //vector[row] = diagonal_entry;\n"
350 " \n"
351 " //if (row == 0)\n"
352 " //vector[0] = diagonal_entry;\n"
353 " //vector[0] = elements[0];\n"
354 " barrier(CLK_GLOBAL_MEM_FENCE);\n"
355 " } //for block_num\n"
356 "}\n"
357 ; //compressed_matrix_align1_lu_backward
358 
360 "__kernel void row_scaling_2(\n"
361 " __global const unsigned int * row_indices,\n"
362 " __global const unsigned int * column_indices, \n"
363 " __global const float * elements,\n"
364 " __global float * diag_M_inv,\n"
365 " unsigned int size) \n"
366 "{ \n"
367 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
368 " {\n"
369 " float dot_prod = 0.0f;\n"
370 " float temp = 0.0f;\n"
371 " unsigned int row_end = row_indices[row+1];\n"
372 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
373 " {\n"
374 " temp = elements[i];\n"
375 " dot_prod += temp * temp;\n"
376 " }\n"
377 " diag_M_inv[row] = 1.0f / sqrt(dot_prod);\n"
378 " }\n"
379 "}\n"
380 ; //compressed_matrix_align1_row_scaling_2
381 
383 "__kernel void row_scaling_1(\n"
384 " __global const unsigned int * row_indices,\n"
385 " __global const unsigned int * column_indices, \n"
386 " __global const float * elements,\n"
387 " __global float * diag_M_inv,\n"
388 " unsigned int size) \n"
389 "{ \n"
390 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
391 " {\n"
392 " float dot_prod = 0.0f;\n"
393 " unsigned int row_end = row_indices[row+1];\n"
394 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
395 " dot_prod += fabs(elements[i]);\n"
396 " diag_M_inv[row] = 1.0f / dot_prod;\n"
397 " }\n"
398 "}\n"
399 ; //compressed_matrix_align1_row_scaling_1
400 
401 const char * const compressed_matrix_align1_vec_mul =
402 "__kernel void vec_mul(\n"
403 " __global const unsigned int * row_indices,\n"
404 " __global const unsigned int * column_indices, \n"
405 " __global const float * elements,\n"
406 " __global const float * vector, \n"
407 " __global float * result,\n"
408 " unsigned int size) \n"
409 "{ \n"
410 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
411 " {\n"
412 " float dot_prod = 0.0f;\n"
413 " unsigned int row_end = row_indices[row+1];\n"
414 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
415 " dot_prod += elements[i] * vector[column_indices[i]];\n"
416 " result[row] = dot_prod;\n"
417 " }\n"
418 "}\n"
419 ; //compressed_matrix_align1_vec_mul
420 
422 "void helper_bicgstab_kernel1_parallel_reduction( __local float * tmp_buffer )\n"
423 "{\n"
424 " for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
425 " {\n"
426 " barrier(CLK_LOCAL_MEM_FENCE);\n"
427 " if (get_local_id(0) < stride)\n"
428 " tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
429 " }\n"
430 "}\n"
431 "//////// inner products:\n"
432 "float bicgstab_kernel1_inner_prod(\n"
433 " __global const float * vec1,\n"
434 " __global const float * vec2,\n"
435 " unsigned int size,\n"
436 " __local float * tmp_buffer)\n"
437 "{\n"
438 " float tmp = 0;\n"
439 " unsigned int i_end = ((size - 1) / get_local_size(0) + 1) * get_local_size(0);\n"
440 " for (unsigned int i = get_local_id(0); i < i_end; i += get_local_size(0))\n"
441 " {\n"
442 " if (i < size)\n"
443 " tmp += vec1[i] * vec2[i];\n"
444 " }\n"
445 " tmp_buffer[get_local_id(0)] = tmp;\n"
446 " \n"
447 " helper_bicgstab_kernel1_parallel_reduction(tmp_buffer);\n"
448 " barrier(CLK_LOCAL_MEM_FENCE);\n"
449 " return tmp_buffer[0];\n"
450 "}\n"
451 "__kernel void bicgstab_kernel1(\n"
452 " __global const float * tmp0,\n"
453 " __global const float * r0star, \n"
454 " __global const float * residual,\n"
455 " __global float * s,\n"
456 " __global float * alpha,\n"
457 " __global const float * ip_rr0star,\n"
458 " __local float * tmp_buffer,\n"
459 " unsigned int size) \n"
460 "{ \n"
461 " float alpha_local = ip_rr0star[0] / bicgstab_kernel1_inner_prod(tmp0, r0star, size, tmp_buffer);\n"
462 " \n"
463 " for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
464 " s[i] = residual[i] - alpha_local * tmp0[i];\n"
465 " \n"
466 " if (get_global_id(0) == 0)\n"
467 " alpha[0] = alpha_local;\n"
468 "}\n"
469 ; //compressed_matrix_align1_bicgstab_kernel1
470 
472 "__kernel void jacobi_precond(\n"
473 " __global const unsigned int * row_indices,\n"
474 " __global const unsigned int * column_indices, \n"
475 " __global const float * elements,\n"
476 " __global float * diag_M_inv,\n"
477 " unsigned int size) \n"
478 "{ \n"
479 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
480 " {\n"
481 " float diag = 1.0f;\n"
482 " unsigned int row_end = row_indices[row+1];\n"
483 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
484 " {\n"
485 " if (row == column_indices[i])\n"
486 " {\n"
487 " diag = elements[i];\n"
488 " break;\n"
489 " }\n"
490 " }\n"
491 " diag_M_inv[row] = 1.0f / diag;\n"
492 " }\n"
493 "}\n"
494 ; //compressed_matrix_align1_jacobi_precond
495 
496 const char * const compressed_matrix_align4_vec_mul =
497 "__kernel void vec_mul(\n"
498 " __global const unsigned int * row_indices,\n"
499 " __global const uint4 * column_indices, \n"
500 " __global const float4 * elements,\n"
501 " __global const float * vector, \n"
502 " __global float * result,\n"
503 " unsigned int size)\n"
504 "{ \n"
505 " float dot_prod;\n"
506 " unsigned int start, next_stop;\n"
507 " uint4 col_idx;\n"
508 " float4 tmp_vec;\n"
509 " float4 tmp_entries;\n"
510 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
511 " {\n"
512 " dot_prod = 0.0f;\n"
513 " start = row_indices[row] / 4;\n"
514 " next_stop = row_indices[row+1] / 4;\n"
515 " for (unsigned int i = start; i < next_stop; ++i)\n"
516 " {\n"
517 " col_idx = column_indices[i];\n"
518 " tmp_entries = elements[i];\n"
519 " tmp_vec.x = vector[col_idx.x];\n"
520 " tmp_vec.y = vector[col_idx.y];\n"
521 " tmp_vec.z = vector[col_idx.z];\n"
522 " tmp_vec.w = vector[col_idx.w];\n"
523 " dot_prod += dot(tmp_entries, tmp_vec);\n"
524 " }\n"
525 " result[row] = dot_prod;\n"
526 " }\n"
527 "}\n"
528 ; //compressed_matrix_align4_vec_mul
529 
530  } //namespace kernels
531  } //namespace linalg
532 } //namespace viennacl
533 #endif