ViennaCL - The Vienna Computing Library  1.2.0
coordinate_matrix_source.h
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_KERNELS_COORDINATE_MATRIX_SOURCE_HPP_
2 #define VIENNACL_LINALG_KERNELS_COORDINATE_MATRIX_SOURCE_HPP_
3 //Automatically generated file from auxiliary-directory, do not edit manually!
4 namespace viennacl
5 {
6  namespace linalg
7  {
8  namespace kernels
9  {
10 const char * const coordinate_matrix_align1_vec_mul =
11 "//segmented parallel reduction. At present restricted to up to 256 threads\n"
12 "void segmented_parallel_reduction(unsigned int row, \n"
13 " float val, \n"
14 " __local unsigned int * shared_rows, \n"
15 " __local float * inter_results) \n"
16 "{ \n"
17 " //barrier(CLK_LOCAL_MEM_FENCE); \n"
18 " shared_rows[get_local_id(0)] = row; \n"
19 " inter_results[get_local_id(0)] = val; \n"
20 " float left = 0;\n"
21 " \n"
22 " barrier(CLK_LOCAL_MEM_FENCE); \n"
23 " if( get_local_id(0) >= 1 && row == shared_rows[get_local_id(0) - 1] ) { left = inter_results[get_local_id(0) - 1]; } \n"
24 " barrier(CLK_LOCAL_MEM_FENCE); \n"
25 " inter_results[get_local_id(0)] += left; left = 0;\n"
26 " barrier(CLK_LOCAL_MEM_FENCE); \n"
27 " if( get_local_id(0) >= 2 && row == shared_rows[get_local_id(0) - 2] ) { left = inter_results[get_local_id(0) - 2]; } \n"
28 " barrier(CLK_LOCAL_MEM_FENCE); \n"
29 " inter_results[get_local_id(0)] += left; left = 0;\n"
30 " barrier(CLK_LOCAL_MEM_FENCE); \n"
31 " if( get_local_id(0) >= 4 && row == shared_rows[get_local_id(0) - 4] ) { left = inter_results[get_local_id(0) - 4]; } \n"
32 " barrier(CLK_LOCAL_MEM_FENCE); \n"
33 " inter_results[get_local_id(0)] += left; left = 0;\n"
34 " barrier(CLK_LOCAL_MEM_FENCE); \n"
35 " if( get_local_id(0) >= 8 && row == shared_rows[get_local_id(0) - 8] ) { left = inter_results[get_local_id(0) - 8]; } \n"
36 " barrier(CLK_LOCAL_MEM_FENCE); \n"
37 " inter_results[get_local_id(0)] += left; left = 0;\n"
38 " barrier(CLK_LOCAL_MEM_FENCE); \n"
39 " if( get_local_id(0) >= 16 && row == shared_rows[get_local_id(0) - 16] ) { left = inter_results[get_local_id(0) - 16]; } \n"
40 " barrier(CLK_LOCAL_MEM_FENCE); \n"
41 " inter_results[get_local_id(0)] += left; left = 0;\n"
42 " barrier(CLK_LOCAL_MEM_FENCE); \n"
43 " if( get_local_id(0) >= 32 && row == shared_rows[get_local_id(0) - 32] ) { left = inter_results[get_local_id(0) - 32]; } \n"
44 " barrier(CLK_LOCAL_MEM_FENCE); \n"
45 " inter_results[get_local_id(0)] += left; left = 0;\n"
46 " barrier(CLK_LOCAL_MEM_FENCE); \n"
47 " if( get_local_id(0) >= 64 && row == shared_rows[get_local_id(0) - 64] ) { left = inter_results[get_local_id(0) - 64]; } \n"
48 " barrier(CLK_LOCAL_MEM_FENCE); \n"
49 " inter_results[get_local_id(0)] += left; left = 0;\n"
50 " barrier(CLK_LOCAL_MEM_FENCE); \n"
51 " if( get_local_id(0) >= 128 && row == shared_rows[get_local_id(0) - 128] ) { left = inter_results[get_local_id(0) - 128]; } \n"
52 " barrier(CLK_LOCAL_MEM_FENCE); \n"
53 " inter_results[get_local_id(0)] += left; left = 0;\n"
54 " barrier(CLK_LOCAL_MEM_FENCE); \n"
55 " //if( get_local_id(0) >= 256 && row == shared_rows[get_local_id(0) - 256] ) { left = inter_results[get_local_id(0) - 256]; } \n"
56 " //barrier(CLK_LOCAL_MEM_FENCE); \n"
57 " //inter_results[get_local_id(0)] += left; left = 0;\n"
58 " //barrier(CLK_LOCAL_MEM_FENCE); \n"
59 "}\n"
60 "__kernel void vec_mul( \n"
61 " __global const uint2 * coords, //(row_index, column_index) \n"
62 " __global const float * elements, \n"
63 " __global const uint * group_boundaries,\n"
64 " __global const float * vector, \n"
65 " __global float * result, \n"
66 " __local unsigned int * shared_rows, \n"
67 " __local float * inter_results) \n"
68 "{ \n"
69 " uint2 tmp; \n"
70 " float val;\n"
71 " uint last_index = get_local_size(0) - 1;\n"
72 " uint group_start = group_boundaries[get_group_id(0)];\n"
73 " uint group_end = group_boundaries[get_group_id(0) + 1];\n"
74 " uint k_end = 1 + (group_end - group_start - 1) / get_local_size(0); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)\n"
75 " uint local_index = 0;\n"
76 " for (uint k = 0; k < k_end; ++k)\n"
77 " { \n"
78 " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n"
79 " \n"
80 " local_index = group_start + k * get_local_size(0) + get_local_id(0); \n"
81 " \n"
82 " if (local_index < group_end)\n"
83 " {\n"
84 " tmp = coords[local_index]; \n"
85 " val = elements[local_index] * vector[tmp.y]; \n"
86 " }\n"
87 " else\n"
88 " {\n"
89 " tmp.x = 0;\n"
90 " tmp.y = 0;\n"
91 " val = 0;\n"
92 " }\n"
93 " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n"
94 " //check for carry from previous loop run: \n"
95 " if (get_local_id(0) == 0 && k > 0)\n"
96 " { \n"
97 " if (tmp.x == shared_rows[last_index]) \n"
98 " val += inter_results[last_index]; \n"
99 " else \n"
100 " result[shared_rows[last_index]] += inter_results[last_index]; \n"
101 " } \n"
102 " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n"
103 " segmented_parallel_reduction(tmp.x, val, shared_rows, inter_results); //all threads have to enter this function\n"
104 " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n"
105 " if (get_local_id(0) != last_index &&\n"
106 " shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1] &&\n"
107 " inter_results[get_local_id(0)] != 0) \n"
108 " { \n"
109 " result[tmp.x] += inter_results[get_local_id(0)]; \n"
110 " }\n"
111 " \n"
112 " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n"
113 " } //for k\n"
114 " \n"
115 " if (get_local_id(0) == last_index && inter_results[last_index] != 0) \n"
116 " result[tmp.x] += inter_results[last_index]; \n"
117 "}\n"
118 ; //coordinate_matrix_align1_vec_mul
119 
120  } //namespace kernels
121  } //namespace linalg
122 } //namespace viennacl
123 #endif