ViennaCL - The Vienna Computing Library  1.2.0
spai_source.h
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_KERNELS_SPAI_SOURCE_HPP_
2 #define VIENNACL_LINALG_KERNELS_SPAI_SOURCE_HPP_
3 //Automatically generated file from auxiliary-directory, do not edit manually!
4 namespace viennacl
5 {
6  namespace linalg
7  {
8  namespace kernels
9  {
10 const char * const spai_align1_block_qr_assembly =
11 "void assemble_upper_part(__global float * R_q,\n"
12 " unsigned int row_n_q, unsigned int col_n_q, __global float * R_u, \n"
13 " unsigned int row_n_u, unsigned int col_n_u,\n"
14 " unsigned int col_n, unsigned int diff){\n"
15 " for(unsigned int i = 0; i < col_n_q; ++i){\n"
16 " for(unsigned int j = 0; j < diff; ++j){\n"
17 " R_q[ i*row_n_q + j] = R_u[ i*row_n_u + j + col_n ];\n"
18 " }\n"
19 " }\n"
20 " }\n"
21 "void assemble_lower_part(__global float * R_q, unsigned int row_n_q, unsigned int col_n_q, __global float * R_u_u, \n"
22 " unsigned int row_n_u_u, unsigned int col_n_u_u, \n"
23 " unsigned int diff){\n"
24 " for(unsigned int i = 0; i < col_n_u_u; ++i){\n"
25 " for(unsigned int j = 0; j < row_n_u_u; ++j){\n"
26 " R_q[i*row_n_q + j + diff] = R_u_u[i*row_n_u_u + j];\n"
27 " }\n"
28 " } \n"
29 "}\n"
30 "void assemble_qr_block(__global float * R_q, unsigned int row_n_q, unsigned int col_n_q, __global float * R_u, unsigned int row_n_u,\n"
31 " unsigned int col_n_u, __global float * R_u_u, unsigned int row_n_u_u, unsigned int col_n_u_u, unsigned int col_n){\n"
32 " unsigned int diff = row_n_u - col_n;\n"
33 " assemble_upper_part(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff);\n"
34 " if(diff > 0){\n"
35 " assemble_lower_part(R_q, row_n_q, col_n_q, R_u_u, row_n_u_u, col_n_u_u, diff);\n"
36 " }\n"
37 "}\n"
38 "__kernel void block_qr_assembly(\n"
39 " __global unsigned int * matrix_dimensions,\n"
40 " __global float * R_u,\n"
41 " __global unsigned int * block_ind_u,\n"
42 " __global unsigned int * matrix_dimensions_u,\n"
43 " __global float * R_u_u,\n"
44 " __global unsigned int * block_ind_u_u,\n"
45 " __global unsigned int * matrix_dimensions_u_u,\n"
46 " __global float * R_q,\n"
47 " __global unsigned int * block_ind_q,\n"
48 " __global unsigned int * matrix_dimensions_q,\n"
49 " __global unsigned int * g_is_update,\n"
50 " //__local float * local_R_q,\n"
51 " unsigned int block_elems_num) \n"
52 "{ \n"
53 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
54 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
55 " //\n"
56 " assemble_qr_block(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n"
57 " matrix_dimensions_u[2*i + 1], R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1], matrix_dimensions[2*i + 1]);\n"
58 " }\n"
59 " }\n"
60 "}\n"
61 ; //spai_align1_block_qr_assembly
62 
63 const char * const spai_align1_block_qr =
64 "void dot_prod(__local const float* A, unsigned int n, unsigned int beg_ind, float* res){\n"
65 " *res = 0;\n"
66 " for(unsigned int i = beg_ind; i < n; ++i){\n"
67 " *res += A[(beg_ind-1)*n + i]*A[(beg_ind-1)*n + i];\n"
68 " }\n"
69 "}\n"
70 " \n"
71 "void vector_div(__global float* v, unsigned int beg_ind, float b, unsigned int n){\n"
72 " for(unsigned int i = beg_ind; i < n; ++i){\n"
73 " v[i] /= b;\n"
74 " }\n"
75 "}\n"
76 "void copy_vector(__local const float* A, __global float* v, const unsigned int beg_ind, const unsigned int n){\n"
77 " for(unsigned int i = beg_ind; i < n; ++i){\n"
78 " v[i] = A[(beg_ind-1)*n + i];\n"
79 " }\n"
80 "}\n"
81 " \n"
82 " \n"
83 "void householder_vector(__local const float* A, unsigned int j, unsigned int n, __global float* v, __global float* b){\n"
84 " float sg;\n"
85 " dot_prod(A, n, j+1, &sg); \n"
86 " copy_vector(A, v, j+1, n);\n"
87 " float mu;\n"
88 " v[j] = 1.0;\n"
89 " //print_contigious_vector(v, v_start_ind, n);\n"
90 " if(sg == 0){\n"
91 " *b = 0;\n"
92 " }\n"
93 " else{\n"
94 " mu = sqrt(A[j*n + j]*A[ j*n + j] + sg);\n"
95 " if(A[ j*n + j] <= 0){\n"
96 " v[j] = A[ j*n + j] - mu;\n"
97 " }else{\n"
98 " v[j] = -sg/(A[ j*n + j] + mu);\n"
99 " }\n"
100 " *b = 2*(v[j]*v[j])/(sg + v[j]*v[j]);\n"
101 " //*b = (2*v[j]*v[j])/(sg + (v[j])*(v[j]));\n"
102 " vector_div(v, j, v[j], n);\n"
103 " //print_contigious_vector(v, v_start_ind, n);\n"
104 " }\n"
105 "}\n"
106 "void custom_inner_prod(__local const float* A, __global float* v, unsigned int col_ind, unsigned int row_num, unsigned int start_ind, float* res){\n"
107 " for(unsigned int i = start_ind; i < row_num; ++i){\n"
108 " *res += A[col_ind*row_num + i]*v[i]; \n"
109 " }\n"
110 "}\n"
111 "// \n"
112 "void apply_householder_reflection(__local float* A, unsigned int row_n, unsigned int col_n, unsigned int iter_cnt, __global float* v, float b){\n"
113 " float in_prod_res;\n"
114 " for(unsigned int i= iter_cnt + get_local_id(0); i < col_n; i+=get_local_size(0)){\n"
115 " in_prod_res = 0.0;\n"
116 " custom_inner_prod(A, v, i, row_n, iter_cnt, &in_prod_res);\n"
117 " for(unsigned int j = iter_cnt; j < row_n; ++j){\n"
118 " A[ i*row_n + j] -= b*in_prod_res* v[j];\n"
119 " }\n"
120 " }\n"
121 " \n"
122 "}\n"
123 "void store_householder_vector(__local float* A, unsigned int ind, unsigned int n, __global float* v){\n"
124 " for(unsigned int i = ind; i < n; ++i){\n"
125 " A[ (ind-1)*n + i] = v[i];\n"
126 " }\n"
127 "}\n"
128 "void single_qr( __local float* R, __global unsigned int* matrix_dimensions, __global float* b_v, __global float* v, unsigned int matrix_ind){\n"
129 " //matrix_dimensions[0] - number of rows\n"
130 " //matrix_dimensions[1] - number of columns\n"
131 " unsigned int col_n = matrix_dimensions[2*matrix_ind + 1];\n"
132 " unsigned int row_n = matrix_dimensions[2*matrix_ind];\n"
133 " \n"
134 " if((col_n == row_n)&&(row_n == 1)){\n"
135 " b_v[0] = 0.0;\n"
136 " return;\n"
137 " }\n"
138 " for(unsigned int i = 0; i < col_n; ++i){\n"
139 " if(get_local_id(0) == 0){\n"
140 " householder_vector(R, i, row_n, v, b_v + i);\n"
141 " }\n"
142 " barrier(CLK_LOCAL_MEM_FENCE);\n"
143 " apply_householder_reflection(R, row_n, col_n, i, v, b_v[i]);\n"
144 " barrier(CLK_LOCAL_MEM_FENCE);\n"
145 " if(get_local_id(0) == 0){\n"
146 " if(i < matrix_dimensions[2*matrix_ind]){\n"
147 " store_householder_vector(R, i+1, row_n, v);\n"
148 " }\n"
149 " }\n"
150 " }\n"
151 "}\n"
152 "void matrix_from_global_to_local_qr(__global float* g_M, __local float* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){\n"
153 " for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){\n"
154 " for(unsigned int j = 0; j < row_n; ++j){\n"
155 " l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j];\n"
156 " }\n"
157 " }\n"
158 "}\n"
159 "void matrix_from_local_to_global_qr(__global float* g_M, __local float* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){\n"
160 " for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){\n"
161 " for(unsigned int j = 0; j < row_n; ++j){\n"
162 " g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j];\n"
163 " }\n"
164 " }\n"
165 "}\n"
166 "__kernel void block_qr(\n"
167 " __global float* R, \n"
168 " __global unsigned int* matrix_dimensions, \n"
169 " __global float* b_v, \n"
170 " __global float* v, \n"
171 " __global unsigned int* start_matrix_inds, \n"
172 " __global unsigned int* start_bv_inds, \n"
173 " __global unsigned int* start_v_inds,\n"
174 " __global unsigned int * g_is_update, \n"
175 " __local float* local_buff_R,\n"
176 " unsigned int block_elems_num){\n"
177 " for(unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){\n"
178 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
179 " matrix_from_global_to_local_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]);\n"
180 " barrier(CLK_LOCAL_MEM_FENCE);\n"
181 " single_qr(local_buff_R, matrix_dimensions, b_v + start_bv_inds[i], v + start_v_inds[i], i);\n"
182 " barrier(CLK_LOCAL_MEM_FENCE);\n"
183 " matrix_from_local_to_global_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]);\n"
184 " }\n"
185 " }\n"
186 "}\n"
187 ; //spai_align1_block_qr
188 
189 const char * const spai_align1_block_r_assembly =
190 "void assemble_r(__global float * gR, unsigned int row_n_r, unsigned int col_n_r, __global float * R, \n"
191 " unsigned int row_n, unsigned int col_n)\n"
192 "{\n"
193 " for(unsigned int i = 0; i < col_n; ++i){\n"
194 " for(unsigned int j = 0; j < row_n; ++j){\n"
195 " gR[i*row_n_r + j] = R[i*row_n + j ];\n"
196 " }\n"
197 " }\n"
198 "}\n"
199 "void assemble_r_u(__global float * gR,\n"
200 " unsigned int row_n_r, unsigned int col_n_r, __global float * R_u, unsigned int row_n_u, unsigned int col_n_u, \n"
201 " unsigned int col_n)\n"
202 "{\n"
203 " for(unsigned int i = 0; i < col_n_u; ++i){\n"
204 " for(unsigned int j = 0; j < col_n; ++j){\n"
205 " gR[ (i+col_n)*row_n_r + j] = R_u[ i*row_n_u + j];\n"
206 " }\n"
207 " } \n"
208 "}\n"
209 "void assemble_r_u_u(__global float * gR, unsigned int row_n_r, unsigned int col_n_r, __global float * R_u_u, unsigned int row_n_u_u, \n"
210 " unsigned int col_n_u_u, unsigned int col_n)\n"
211 "{\n"
212 " for(unsigned int i = 0; i < col_n_u_u; ++i){\n"
213 " for(unsigned int j = 0; j < row_n_u_u; ++j){\n"
214 " gR[(col_n+i)*row_n_r + j + col_n] = R_u_u[i*row_n_u_u + j];\n"
215 " }\n"
216 " } \n"
217 "}\n"
218 "void assemble_r_block(__global float * gR, unsigned int row_n_r, unsigned int col_n_r, __global float * R, unsigned int row_n, \n"
219 " unsigned int col_n, __global float * R_u, unsigned int row_n_u, unsigned int col_n_u, __global float * R_u_u, \n"
220 " unsigned int row_n_u_u, unsigned int col_n_u_u){\n"
221 " assemble_r(gR, row_n_r, col_n_r, R, row_n, col_n); \n"
222 " assemble_r_u(gR, row_n_r, col_n_r, R_u, row_n_u, col_n_u, col_n);\n"
223 " assemble_r_u_u(gR, row_n_r, col_n_r, R_u_u, row_n_u_u, col_n_u_u, col_n);\n"
224 "}\n"
225 "__kernel void block_r_assembly(\n"
226 " __global float * R,\n"
227 " __global unsigned int * block_ind,\n"
228 " __global unsigned int * matrix_dimensions,\n"
229 " __global float * R_u,\n"
230 " __global unsigned int * block_ind_u,\n"
231 " __global unsigned int * matrix_dimensions_u,\n"
232 " __global float * R_u_u,\n"
233 " __global unsigned int * block_ind_u_u,\n"
234 " __global unsigned int * matrix_dimensions_u_u,\n"
235 " __global float * g_R,\n"
236 " __global unsigned int * block_ind_r,\n"
237 " __global unsigned int * matrix_dimensions_r,\n"
238 " __global unsigned int * g_is_update,\n"
239 " //__local float * local_gR,\n"
240 " unsigned int block_elems_num) \n"
241 "{ \n"
242 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
243 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
244 " \n"
245 " assemble_r_block(g_R + block_ind_r[i], matrix_dimensions_r[2*i], matrix_dimensions_r[2*i + 1], R + block_ind[i], matrix_dimensions[2*i], \n"
246 " matrix_dimensions[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1],\n"
247 " R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1]);\n"
248 " \n"
249 " }\n"
250 " }\n"
251 "}\n"
252 ; //spai_align1_block_r_assembly
253 
254 const char * const spai_align1_assemble_blocks =
255 "float get_element(__global const unsigned int * row_indices,\n"
256 " __global const unsigned int * column_indices,\n"
257 " __global const float * elements,\n"
258 " unsigned int row,\n"
259 " unsigned int col\n"
260 " )\n"
261 "{\n"
262 " unsigned int row_end = row_indices[row+1];\n"
263 " for(unsigned int i = row_indices[row]; i < row_end; ++i){\n"
264 " if(column_indices[i] == col)\n"
265 " return elements[i];\n"
266 " if(column_indices[i] > col)\n"
267 " return 0.0;\n"
268 " }\n"
269 " return 0.0; \n"
270 "}\n"
271 "void block_assembly(__global const unsigned int * row_indices,\n"
272 " __global const unsigned int * column_indices, \n"
273 " __global const float * elements,\n"
274 " __global const unsigned int * matrix_dimensions,\n"
275 " __global const unsigned int * set_I,\n"
276 " __global const unsigned int * set_J, \n"
277 " unsigned int matrix_ind,\n"
278 " __global float * com_A_I_J)\n"
279 "{\n"
280 " unsigned int row_n = matrix_dimensions[2*matrix_ind];\n"
281 " unsigned int col_n = matrix_dimensions[2*matrix_ind + 1];\n"
282 " \n"
283 " for(unsigned int i = 0; i < col_n; ++i){\n"
284 " //start row index\n"
285 " for(unsigned int j = 0; j < row_n; j++){\n"
286 " com_A_I_J[ i*row_n + j] = get_element(row_indices, column_indices, elements, set_I[j], set_J[i]);\n"
287 " }\n"
288 " }\n"
289 " \n"
290 "}\n"
291 "__kernel void assemble_blocks(\n"
292 " __global const unsigned int * row_indices,\n"
293 " __global const unsigned int * column_indices, \n"
294 " __global const float * elements,\n"
295 " __global const unsigned int * set_I,\n"
296 " __global const unsigned int * set_J,\n"
297 " __global const unsigned int * i_ind,\n"
298 " __global const unsigned int * j_ind,\n"
299 " __global const unsigned int * block_ind,\n"
300 " __global const unsigned int * matrix_dimensions,\n"
301 " __global float * com_A_I_J,\n"
302 " __global unsigned int * g_is_update,\n"
303 " unsigned int block_elems_num) \n"
304 "{ \n"
305 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
306 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
307 " \n"
308 " block_assembly(row_indices, column_indices, elements, matrix_dimensions, set_I + i_ind[i], set_J + j_ind[i], i, com_A_I_J + block_ind[i]);\n"
309 " }\n"
310 " }\n"
311 "}\n"
312 ; //spai_align1_assemble_blocks
313 
314 const char * const spai_align1_block_least_squares =
315 "void custom_dot_prod_ls(__global float * A, unsigned int row_n, __global float * v, unsigned int ind, float *res){\n"
316 " *res = 0.0;\n"
317 " for(unsigned int j = ind; j < row_n; ++j){\n"
318 " if(j == ind){\n"
319 " *res += v[ j];\n"
320 " }else{\n"
321 " *res += A[ j + ind*row_n]*v[ j];\n"
322 " }\n"
323 " }\n"
324 " }\n"
325 "void backwardSolve(__global float * R, unsigned int row_n, unsigned int col_n, __global float * y, __global float * x){\n"
326 " for (int i = col_n-1; i >= 0 ; i--) {\n"
327 " x[ i] = y[ i];\n"
328 " for (int j = i+1; j < col_n; ++j) {\n"
329 " x[ i] -= R[ i + j*row_n]*x[ j];\n"
330 " }\n"
331 " x[i] /= R[ i + i*row_n];\n"
332 " }\n"
333 " \n"
334 "}\n"
335 " \n"
336 "void apply_q_trans_vec_ls(__global float * R, unsigned int row_n, unsigned int col_n, __global const float * b_v, __global float * y){\n"
337 " float inn_prod = 0;\n"
338 " for(unsigned int i = 0; i < col_n; ++i){\n"
339 " custom_dot_prod_ls(R, row_n, y, i, &inn_prod);\n"
340 " for(unsigned int j = i; j < row_n; ++j){\n"
341 " if(i == j){\n"
342 " y[ j] -= b_v[ i]*inn_prod;\n"
343 " }\n"
344 " else{\n"
345 " y[j] -= b_v[ i]*inn_prod*R[ j +i*row_n];\n"
346 " }\n"
347 " }\n"
348 " //std::cout<<y<<std::endl;\n"
349 " }\n"
350 " }\n"
351 "void ls(__global float * R, unsigned int row_n, unsigned int col_n, __global float * b_v, __global float * m_v, __global float * y_v){\n"
352 " \n"
353 " apply_q_trans_vec_ls(R, row_n, col_n, b_v, y_v);\n"
354 " //m_new - is m_v now\n"
355 " backwardSolve(R, row_n, col_n, y_v, m_v);\n"
356 "}\n"
357 "__kernel void block_least_squares(\n"
358 " __global float * global_R,\n"
359 " __global unsigned int * block_ind,\n"
360 " __global float * b_v,\n"
361 " __global unsigned int * start_bv_inds,\n"
362 " __global float * m_v,\n"
363 " __global float * y_v,\n"
364 " __global unsigned int * start_y_inds,\n"
365 " __global unsigned int * matrix_dimensions,\n"
366 " __global unsigned int * g_is_update,\n"
367 " //__local float * local_R,\n"
368 " unsigned int block_elems_num) \n"
369 "{ \n"
370 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
371 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
372 " \n"
373 " ls(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v +start_bv_inds[i], m_v + start_bv_inds[i], y_v + start_y_inds[i] );\n"
374 " \n"
375 " }\n"
376 " }\n"
377 "}\n"
378 ; //spai_align1_block_least_squares
379 
380 const char * const spai_align1_block_qr_assembly_1 =
381 "void assemble_upper_part_1(__global float * R_q, unsigned int row_n_q, unsigned int col_n_q, __global float * R_u, \n"
382 " unsigned int row_n_u, unsigned int col_n_u,\n"
383 " unsigned int col_n, unsigned int diff){\n"
384 " for(unsigned int i = 0; i < col_n_q; ++i){\n"
385 " for(unsigned int j = 0; j < diff; ++j){\n"
386 " R_q[ i*row_n_q + j] = R_u[i*row_n_u + j + col_n ];\n"
387 " }\n"
388 " }\n"
389 " }\n"
390 "void assemble_qr_block_1(__global float * R_q, unsigned int row_n_q, unsigned int col_n_q, __global float * R_u, unsigned int row_n_u,\n"
391 " unsigned int col_n_u, unsigned int col_n){\n"
392 " unsigned int diff = row_n_u - col_n;\n"
393 " assemble_upper_part_1(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff);\n"
394 "}\n"
395 "__kernel void block_qr_assembly_1(\n"
396 " __global unsigned int * matrix_dimensions,\n"
397 " __global float * R_u,\n"
398 " __global unsigned int * block_ind_u,\n"
399 " __global unsigned int * matrix_dimensions_u,\n"
400 " __global float * R_q,\n"
401 " __global unsigned int * block_ind_q,\n"
402 " __global unsigned int * matrix_dimensions_q,\n"
403 " __global unsigned int * g_is_update,\n"
404 " //__local float * local_R_q,\n"
405 " unsigned int block_elems_num) \n"
406 "{ \n"
407 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
408 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
409 " assemble_qr_block_1(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n"
410 " matrix_dimensions_u[2*i + 1], matrix_dimensions[2*i + 1]);\n"
411 " }\n"
412 " }\n"
413 "}\n"
414 ; //spai_align1_block_qr_assembly_1
415 
416 const char * const spai_align1_block_q_mult =
417 "void custom_dot_prod(__global float * A, unsigned int row_n, __local float * v, unsigned int ind, float *res){\n"
418 " *res = 0.0;\n"
419 " for(unsigned int j = ind; j < row_n; ++j){\n"
420 " if(j == ind){\n"
421 " *res += v[j];\n"
422 " }else{\n"
423 " *res += A[j + ind*row_n]*v[j];\n"
424 " }\n"
425 " }\n"
426 " }\n"
427 "void apply_q_trans_vec(__global float * R, unsigned int row_n, unsigned int col_n, __global float * b_v, __local float * y){\n"
428 " float inn_prod = 0;\n"
429 " for(unsigned int i = 0; i < col_n; ++i){\n"
430 " custom_dot_prod(R, row_n, y, i, &inn_prod);\n"
431 " for(unsigned int j = i; j < row_n; ++j){\n"
432 " if(i == j){\n"
433 " y[j] -= b_v[ i]*inn_prod;\n"
434 " }\n"
435 " else{\n"
436 " y[j] -= b_v[ i]*inn_prod*R[ j + i*row_n];\n"
437 " }\n"
438 " }\n"
439 " }\n"
440 " }\n"
441 "void q_mult(__global float * R, unsigned int row_n, unsigned int col_n, __global float * b_v, __local float * R_u, unsigned int col_n_u){\n"
442 " for(unsigned int i = get_local_id(0); i < col_n_u; i+= get_local_size(0)){\n"
443 " apply_q_trans_vec(R, row_n, col_n, b_v, R_u + row_n*i);\n"
444 " } \n"
445 "}\n"
446 "void matrix_from_global_to_local(__global float* g_M, __local float* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){\n"
447 " for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){\n"
448 " for(unsigned int j = 0; j < row_n; ++j){\n"
449 " l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j];\n"
450 " }\n"
451 " }\n"
452 "}\n"
453 "void matrix_from_local_to_global(__global float* g_M, __local float* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){\n"
454 " for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){\n"
455 " for(unsigned int j = 0; j < row_n; ++j){\n"
456 " g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j];\n"
457 " }\n"
458 " }\n"
459 "}\n"
460 "__kernel void block_q_mult(__global float * global_R,\n"
461 " __global unsigned int * block_ind,\n"
462 " __global float * global_R_u,\n"
463 " __global unsigned int *block_ind_u,\n"
464 " __global float * b_v,\n"
465 " __global unsigned int * start_bv_inds,\n"
466 " __global unsigned int * matrix_dimensions,\n"
467 " __global unsigned int * matrix_dimensions_u,\n"
468 " __global unsigned int * g_is_update,\n"
469 " __local float * local_R_u,\n"
470 " unsigned int block_elems_num){\n"
471 " for(unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){\n"
472 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && (g_is_update[i] > 0)){\n"
473 " //matrix_from_global_to_local(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]);\n"
474 " matrix_from_global_to_local(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i+ 1], block_ind_u[i]);\n"
475 " barrier(CLK_LOCAL_MEM_FENCE);\n"
476 " q_mult(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v + start_bv_inds[i], local_R_u, \n"
477 " matrix_dimensions_u[2*i + 1]);\n"
478 " barrier(CLK_LOCAL_MEM_FENCE);\n"
479 " matrix_from_local_to_global(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], block_ind_u[i]);\n"
480 " }\n"
481 " }\n"
482 "}\n"
483 ; //spai_align1_block_q_mult
484 
485 const char * const spai_align1_block_bv_assembly =
486 "void assemble_bv(__global float * g_bv_r, __global float * g_bv, unsigned int col_n){\n"
487 " for(unsigned int i = 0; i < col_n; ++i){\n"
488 " g_bv_r[i] = g_bv[ i];\n"
489 " }\n"
490 "}\n"
491 "void assemble_bv_block(__global float * g_bv_r, __global float * g_bv, unsigned int col_n,\n"
492 " __global float * g_bv_u, unsigned int col_n_u)\n"
493 "{\n"
494 " assemble_bv(g_bv_r, g_bv, col_n);\n"
495 " assemble_bv(g_bv_r + col_n, g_bv_u, col_n_u);\n"
496 " \n"
497 "}\n"
498 "__kernel void block_bv_assembly(__global float * g_bv,\n"
499 " __global unsigned int * start_bv_ind,\n"
500 " __global unsigned int * matrix_dimensions,\n"
501 " __global float * g_bv_u,\n"
502 " __global unsigned int * start_bv_u_ind,\n"
503 " __global unsigned int * matrix_dimensions_u,\n"
504 " __global float * g_bv_r,\n"
505 " __global unsigned int * start_bv_r_ind,\n"
506 " __global unsigned int * matrix_dimensions_r,\n"
507 " __global unsigned int * g_is_update,\n"
508 " //__local float * local_gb,\n"
509 " unsigned int block_elems_num)\n"
510 "{ \n"
511 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
512 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
513 " assemble_bv_block(g_bv_r + start_bv_r_ind[i], g_bv + start_bv_ind[i], matrix_dimensions[2*i + 1], g_bv_u + start_bv_u_ind[i], matrix_dimensions_u[2*i + 1]);\n"
514 " }\n"
515 " }\n"
516 "}\n"
517 ; //spai_align1_block_bv_assembly
518 
519  } //namespace kernels
520  } //namespace linalg
521 } //namespace viennacl
522 #endif