ViennaCL - The Vienna Computing Library  1.2.0
matrix_row_source.h
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_KERNELS_MATRIX_ROW_SOURCE_HPP_
2 #define VIENNACL_LINALG_KERNELS_MATRIX_ROW_SOURCE_HPP_
3 //Automatically generated file from auxiliary-directory, do not edit manually!
4 namespace viennacl
5 {
6  namespace linalg
7  {
8  namespace kernels
9  {
// OpenCL source of kernel `rank1_update`.
// For every row < matrix_rows and col < matrix_cols it performs
//   matrix[row * matrix_internal_cols + col] += vector1[row] * vector2[col]
// Rows are distributed over work-items: each work-item starts at
// get_global_id(0) and advances by get_global_size(0).
// matrix_internal_rows is part of the signature but is not read by the body.
// The embedded header comment now says x * y^T, because two distinct vectors
// (vector1, vector2) are combined.
const char * const matrix_row_align1_rank1_update =
"//perform a rank-1 update of the matrix, i.e. A += x * y^T\n"
"__kernel void rank1_update(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector1, \n"
" __global const float * vector2) \n"
"{ \n"
" float tmp;\n"
" unsigned int offset;\n"
" for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" tmp = vector1[row];\n"
" offset = row*matrix_internal_cols;\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" matrix[offset+col] += tmp * vector2[col];\n"
" }\n"
"}\n"
; //matrix_row_align1_rank1_update
32 
// OpenCL source of kernel `inplace_mult`.
// Reads one scalar from the buffer `fac` (*fac) and multiplies vec[i] by it
// for every i < size. Work-items start at get_global_id(0) and advance by
// get_global_size(0).
const char * const matrix_row_align1_inplace_mult =
"__kernel void inplace_mult(\n"
" __global float * vec,\n"
" __global const float * fac, \n"
" unsigned int size) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] *= factor;\n"
"}\n"
; //matrix_row_align1_inplace_mult
44 
// OpenCL source of helper `get_reorder_num` and kernel `fft_radix2_local`.
//
// get_reorder_num(v, bit_size): reverses the 32 bits of v with five
// swap steps (1, 2, 4, 8, 16 bits) and then shifts right by (32 - bit_size),
// i.e. it returns the bit-reversed value of v within bit_size bits.
//
// fft_radix2_local: each work-group handles batches grp_id, grp_id + grp_num, ...
// For one batch it
//   1. copies `size` float2 values from input[batch_id * stride + p] into
//      lcl_input at the bit-reversed position,
//   2. runs bit_size butterfly stages on lcl_input (the kernel's own comment
//      calls this a Cooley-Tukey FFT), with a local barrier after each stage,
//   3. copies lcl_input[0..size) back to the same global positions.
// `sign` scales the twiddle angle (arg = group * sign * pi / ss).
//
// Changes relative to the original kernel text:
//   * The butterfly loop now runs over size/2 pairs. The original bound
//     `tid < size` made the upper half of the iterations operate on
//     lcl_input[size .. 2*size), which is never loaded in step 1 and never
//     written back in step 3.
//   * A local barrier follows the write-back, so that no work-item starts
//     overwriting lcl_input for the next batch while another one is still
//     reading the current results.
// NOTE(review): assumes size == 1 << bit_size - confirm against the host code.
const char * const matrix_row_align1_fft_radix2_local =
"unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) {\n"
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
" v = (v >> 16) | (v << 16);\n"
" v = v >> (32 - bit_size);\n"
" return v;\n"
"}\n"
"__kernel void fft_radix2_local(__global float2* input,\n"
" __local float2* lcl_input,\n"
" unsigned int bit_size,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" unsigned int batch_num,\n"
" float sign) {\n"
" unsigned int grp_id = get_group_id(0);\n"
" unsigned int grp_num = get_num_groups(0);\n"
" unsigned int lcl_sz = get_local_size(0);\n"
" unsigned int lcl_id = get_local_id(0);\n"
" const float NUM_PI = 3.14159265358979323846;\n"
" for(unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) {\n"
" //unsigned int base_offset = stride * batch_id;\n"
" //copy chunk of global memory to local\n"
" for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
" unsigned int v = get_reorder_num(p, bit_size);\n"
" lcl_input[v] = input[batch_id * stride + p];//index\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" //performs Cooley-Tukey FFT on local array\n"
" for(unsigned int s = 0; s < bit_size; s++) {\n"
" unsigned int ss = 1 << s;\n"
" float cs, sn;\n"
" //one butterfly per pair: size/2 pairs per stage\n"
" for(unsigned int tid = lcl_id; tid < (size >> 1); tid += lcl_sz) {\n"
" unsigned int group = (tid & (ss - 1));\n"
" unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
" float2 in1 = lcl_input[pos];\n"
" float2 in2 = lcl_input[pos + ss];\n"
" float arg = group * sign * NUM_PI / ss;\n"
" sn = sincos(arg, &cs);\n"
" float2 ex = (float2)(cs, sn);\n"
" float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
" lcl_input[pos + ss] = in1 - tmp;\n"
" lcl_input[pos] = in1 + tmp;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" \n"
" //copy local array back to global memory\n"
" for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
" input[batch_id * stride + p] = lcl_input[p];//index\n"
" }\n"
" //all results must be read before lcl_input is reloaded for the next batch\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
"}\n"
; //matrix_row_align1_fft_radix2_local
102 
// OpenCL source of kernel `inplace_divide`.
// Reads one scalar from the buffer `fac` (*fac) and divides vec[i] by it for
// every i < size. Work-items start at get_global_id(0) and advance by
// get_global_size(0). No check for a zero divisor is made in the kernel.
const char * const matrix_row_align1_inplace_divide =
"__kernel void inplace_divide(\n"
" __global float * vec,\n"
" __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
" unsigned int size) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] /= factor;\n"
"}\n"
; //matrix_row_align1_inplace_divide
114 
// OpenCL source of kernel `trans_unit_lower_triangular_substitute_inplace`.
// (The declaration line was missing from the listing; the identifier is taken
// from the trailing comment after the terminating ';'.)
// Walks row = 0 .. matrix_rows-1. In each step every work-item reads
// temp = vector[row] and then updates, for elim = row + 1 + global id
// (stride get_global_size(0)):
//   vector[elim] -= temp * matrix[row * matrix_internal_cols + elim]
// No division by a diagonal entry is performed. matrix_cols and
// matrix_internal_rows are not read by the body.
// NOTE(review): barrier() only synchronizes the work-items of one work-group,
// so the step ordering presumably relies on a single-work-group launch -
// confirm against the host code.
const char * const matrix_row_align1_trans_unit_lower_triangular_substitute_inplace =
"\n"
"__kernel void trans_unit_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
" temp = vector[row];\n"
"\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim];\n"
" }\n"
"}\n"
"\n"
"\n"
; //matrix_row_align1_trans_unit_lower_triangular_substitute_inplace
139 
// OpenCL source of kernel `lower_triangular_substitute_inplace`.
// (The declaration line was missing from the listing; the identifier is taken
// from the trailing comment after the terminating ';'.)
// Walks row = 0 .. matrix_rows-1. In each step the work-item with global id 0
// divides vector[row] by the diagonal entry matrix[row + row*matrix_internal_cols];
// after a barrier every work-item reads temp = vector[row] and updates, for
// elim = row + 1 + global id (stride get_global_size(0)):
//   vector[elim] -= temp * matrix[elim * matrix_internal_cols + row]
// matrix_cols and matrix_internal_rows are not read by the body.
// NOTE(review): barrier() only synchronizes the work-items of one work-group,
// so the step ordering presumably relies on a single-work-group launch -
// confirm against the host code.
const char * const matrix_row_align1_lower_triangular_substitute_inplace =
"__kernel void lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" if (get_global_id(0) == 0)\n"
" vector[row] /= matrix[row+row*matrix_internal_cols];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row];\n"
" }\n"
"}\n"
; //matrix_row_align1_lower_triangular_substitute_inplace
162 
// OpenCL source of kernel `inplace_add`.
// Uses a two-dimensional index space: get_global_id(0) selects the row and
// get_global_id(1) the column. A work-item whose ids are below
// A_row_size / A_col_size performs
//   A[(row + A_row_start) * A_internal_cols + (col + A_col_start)]
//     += B[(row + B_row_start) * B_internal_cols + (col + B_col_start)]
// Work-items outside that range do nothing. A_internal_rows, B_row_size,
// B_col_size and B_internal_rows are not read by the body.
const char * const matrix_row_align1_inplace_add =
"__kernel void inplace_add(\n"
" __global float * A,\n"
" unsigned int A_row_start,\n"
" unsigned int A_col_start,\n"
" unsigned int A_row_size,\n"
" unsigned int A_col_size,\n"
" unsigned int A_internal_rows,\n"
" unsigned int A_internal_cols,\n"
" __global const float * B, \n"
" unsigned int B_row_start,\n"
" unsigned int B_col_start,\n"
" unsigned int B_row_size,\n"
" unsigned int B_col_size,\n"
" unsigned int B_internal_rows,\n"
" unsigned int B_internal_cols)\n"
"{ \n"
" if ( get_global_id(0) < A_row_size\n"
" && get_global_id(1) < A_col_size\n"
" )\n"
" A[ (get_global_id(0) + A_row_start) * A_internal_cols\n"
" + (get_global_id(1) + A_col_start)] \n"
" += B[ (get_global_id(0) + B_row_start) * B_internal_cols\n"
" + (get_global_id(1) + B_col_start)];\n"
"}\n"
; //matrix_row_align1_inplace_add
189 
// OpenCL source of kernel `add`.
// Writes result[i] = vec1[i] + vec2[i] for every i < size. Work-items start at
// get_global_id(0) and advance by get_global_size(0).
const char * const matrix_row_align1_add =
"__kernel void add(\n"
" __global const float * vec1,\n"
" __global const float * vec2, \n"
" __global float * result,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" result[i] = vec1[i] + vec2[i];\n"
"}\n"
; //matrix_row_align1_add
201 
// OpenCL source of kernel `sub`.
// Writes result[i] = vec1[i] - vec2[i] for every i < size. Work-items start at
// get_global_id(0) and advance by get_global_size(0).
const char * const matrix_row_align1_sub =
"__kernel void sub(\n"
" __global const float * vec1,\n"
" __global const float * vec2, \n"
" __global float * result,\n"
" unsigned int size)\n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" result[i] = vec1[i] - vec2[i];\n"
"}\n"
; //matrix_row_align1_sub
213 
// OpenCL source of kernel `trans_unit_upper_triangular_substitute_inplace`.
// (The declaration line was missing from the listing; the identifier is taken
// from the trailing comment after the terminating ';'.)
// Walks row = matrix_rows-1 down to 0. In each step every work-item reads
// temp = vector[row] and then updates, for elim = global id .. row-1
// (stride get_global_size(0)):
//   vector[elim] -= temp * matrix[row * matrix_internal_cols + elim]
// No division by a diagonal entry is performed. matrix_cols and
// matrix_internal_rows are not read by the body.
// NOTE(review): barrier() only synchronizes the work-items of one work-group,
// so the step ordering presumably relies on a single-work-group launch -
// confirm against the host code.
const char * const matrix_row_align1_trans_unit_upper_triangular_substitute_inplace =
"//transposed lower triangular matrix\n"
"__kernel void trans_unit_upper_triangular_substitute_inplace(\n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim]; \n"
" } \n"
" \n"
"}\n"
; //matrix_row_align1_trans_unit_upper_triangular_substitute_inplace
236 
// OpenCL source of kernel `scaled_rank1_update`.
// (The declaration line was missing from the listing; the identifier is taken
// from the trailing comment after the terminating ';'.)
// For every row < matrix_rows and col < matrix_cols it performs
//   matrix[row * matrix_internal_cols + col] += val * vector1[row] * vector2[col]
// where `val` is passed by value. Rows are distributed over work-items: each
// starts at get_global_id(0) and advances by get_global_size(0).
// matrix_internal_rows is not read by the body.
const char * const matrix_row_align1_scaled_rank1_update =
"__kernel void scaled_rank1_update(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" float val,\n"
" __global const float * vector1, \n"
" __global const float * vector2) \n"
"{ \n"
" float tmp;\n"
" unsigned int offset;\n"
" for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" tmp = val * vector1[row];\n"
" offset = row * matrix_internal_cols;\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" matrix[offset+col] += tmp * vector2[col];\n"
" }\n"
"}\n"
; //matrix_row_align1_scaled_rank1_update
259 
// OpenCL source of kernel `unit_upper_triangular_substitute_inplace`.
// (The declaration line was missing from the listing; the identifier is taken
// from the trailing comment after the terminating ';'.)
// Walks row = matrix_rows-1 down to 0. In each step every work-item reads
// temp = vector[row] and then updates, for elim = global id .. row-1
// (stride get_global_size(0)):
//   vector[elim] -= temp * matrix[elim * matrix_internal_cols + row]
// No division by a diagonal entry is performed. matrix_cols and
// matrix_internal_rows are not read by the body.
// NOTE(review): barrier() only synchronizes the work-items of one work-group,
// so the step ordering presumably relies on a single-work-group launch -
// confirm against the host code.
const char * const matrix_row_align1_unit_upper_triangular_substitute_inplace =
"__kernel void unit_upper_triangular_substitute_inplace( \n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row]; \n"
" } \n"
" \n"
"}\n"
; //matrix_row_align1_unit_upper_triangular_substitute_inplace
281 
// OpenCL source of kernel `trans_upper_triangular_substitute_inplace`.
// (The declaration line was missing from the listing; the identifier is taken
// from the trailing comment after the terminating ';'.)
// Walks row = matrix_rows-1 down to 0. In each step the work-item with global
// id 0 divides vector[row] by matrix[row*matrix_internal_cols + row]; after a
// barrier every work-item reads temp = vector[row] and updates, for
// elim = global id .. row-1 (stride get_global_size(0)):
//   vector[elim] -= temp * matrix[row * matrix_internal_cols + elim]
// matrix_cols and matrix_internal_rows are not read by the body.
// NOTE(review): barrier() only synchronizes the work-items of one work-group,
// so the step ordering presumably relies on a single-work-group launch -
// confirm against the host code.
const char * const matrix_row_align1_trans_upper_triangular_substitute_inplace =
"//transposed lower triangular matrix\n"
"__kernel void trans_upper_triangular_substitute_inplace(\n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" if (get_global_id(0) == 0) \n"
" vector[row] /= matrix[row*matrix_internal_cols + row]; \n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim]; \n"
" } \n"
" \n"
"}\n"
; //matrix_row_align1_trans_upper_triangular_substitute_inplace
308 
// OpenCL source of kernel `upper_triangular_substitute_inplace`.
// (The declaration line was missing from the listing; the identifier is taken
// from the trailing comment after the terminating ';'.)
// Walks row = matrix_rows-1 down to 0. In each step the work-item with global
// id 0 divides vector[row] by matrix[row*matrix_internal_cols + row]; after a
// barrier every work-item reads temp = vector[row] and updates, for
// elim = global id .. row-1 (stride get_global_size(0)):
//   vector[elim] -= temp * matrix[elim * matrix_internal_cols + row]
// matrix_cols and matrix_internal_rows are not read by the body.
// NOTE(review): barrier() only synchronizes the work-items of one work-group,
// so the step ordering presumably relies on a single-work-group launch -
// confirm against the host code.
const char * const matrix_row_align1_upper_triangular_substitute_inplace =
"__kernel void upper_triangular_substitute_inplace( \n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" if (get_global_id(0) == 0) \n"
" vector[row] /= matrix[row*matrix_internal_cols + row]; \n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row]; \n"
" } \n"
" \n"
"}\n"
; //matrix_row_align1_upper_triangular_substitute_inplace
334 
// OpenCL source of kernel `unit_lower_triangular_substitute_inplace`.
// (The declaration line was missing from the listing; the identifier is taken
// from the trailing comment after the terminating ';'.)
// Walks row = 0 .. matrix_rows-1. In each step every work-item reads
// temp = vector[row] and then updates, for elim = row + 1 + global id
// (stride get_global_size(0)):
//   vector[elim] -= temp * matrix[elim * matrix_internal_cols + row]
// No division by a diagonal entry is performed. matrix_cols and
// matrix_internal_rows are not read by the body.
// NOTE(review): barrier() only synchronizes the work-items of one work-group,
// so the step ordering presumably relies on a single-work-group launch -
// confirm against the host code.
const char * const matrix_row_align1_unit_lower_triangular_substitute_inplace =
"__kernel void unit_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row];\n"
" }\n"
"}\n"
; //matrix_row_align1_unit_lower_triangular_substitute_inplace
354 
// OpenCL source of kernel `fft_radix2`.
// Performs the butterflies of ONE stage `s` (ss = 1 << s) directly on the
// global buffer `input`. For every batch_id < batch_num, work-items iterate
// tid = global id .. half_size-1 (half_size = size >> 1, stride
// get_global_size(0)) and combine the pair
//   input[batch_id * stride + pos]  and  input[batch_id * stride + pos + ss]
// with pos = ((tid >> s) << (s + 1)) + (tid & (ss - 1)), using the twiddle
// factor (cos(arg), sin(arg)), arg = group * sign * pi / ss.
// `bit_size` is part of the signature but is not read by the body.
// NOTE(review): since `s` is a kernel argument, the host presumably launches
// this kernel once per stage - confirm against the caller.
const char * const matrix_row_align1_fft_radix2 =
"__kernel void fft_radix2(__global float2* input,\n"
" unsigned int s,\n"
" unsigned int bit_size,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" unsigned int batch_num,\n"
" float sign) {\n"
" unsigned int ss = 1 << s;\n"
" unsigned int half_size = size >> 1;\n"
" float cs, sn;\n"
" const float NUM_PI = 3.14159265358979323846;\n"
" unsigned int glb_id = get_global_id(0);\n"
" unsigned int glb_sz = get_global_size(0);\n"
" \n"
"// unsigned int base_offset = 0;\n"
" \n"
" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
" for(unsigned int tid = glb_id; tid < half_size; tid += glb_sz) {\n"
" unsigned int group = (tid & (ss - 1));\n"
" unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
" unsigned int offset = batch_id * stride + pos;\n"
" float2 in1 = input[offset];//index\n"
" float2 in2 = input[offset + ss];//index\n"
" float arg = group * sign * NUM_PI / ss;\n"
" sn = sincos(arg, &cs);\n"
" //sn = native_sin(arg);\n"
" //cs = native_cos(arg);\n"
" float2 ex = (float2)(cs, sn);\n"
" float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
" input[offset + ss] = in1 - tmp;//index\n"
" input[offset] = in1 + tmp;//index\n"
" }\n"
"// base_offset += stride;\n"
" }\n"
"}\n"
; //matrix_row_align1_fft_radix2
392 
// OpenCL source of helper `get_reorder_num_2` and kernel `fft_reorder`.
//
// get_reorder_num_2(v, bit_size): reverses the 32 bits of v with five swap
// steps (1, 2, 4, 8, 16 bits) and then shifts right by (32 - bit_size), i.e.
// it returns the bit-reversed value of v within bit_size bits.
//
// fft_reorder: for every batch_id < batch_num, work-items iterate
// i = global id .. size-1 (stride get_global_size(0)), compute the
// bit-reversed index v of i and, only when i < v, swap
//   input[batch_id * stride + i]  <->  input[batch_id * stride + v]
// so each pair is exchanged exactly once.
// Note that batch_num is declared `int` here and is compared against the
// unsigned loop counter batch_id.
const char * const matrix_row_align1_fft_reorder =
"/*\n"
"* Performs reordering of input data in bit-reversal order\n"
"* Probably it's better to do in host side,\n"
"*/\n"
"unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) {\n"
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
" v = (v >> 16) | (v << 16);\n"
" v = v >> (32 - bit_size);\n"
" return v;\n"
"}\n"
"__kernel void fft_reorder(__global float2* input,\n"
" unsigned int bit_size,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" int batch_num) {\n"
" //unsigned int base_offset = 0;\n"
" unsigned int glb_id = get_global_id(0);\n"
" unsigned int glb_sz = get_global_size(0);\n"
" \n"
" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
" for(unsigned int i = glb_id; i < size; i += glb_sz) {\n"
" unsigned int v = get_reorder_num_2(i, bit_size);\n"
" if(i < v) {\n"
" float2 tmp = input[batch_id * stride + i]; // index\n"
" input[batch_id * stride + i] = input[batch_id * stride + v]; //index\n"
" input[batch_id * stride + v] = tmp; //index\n"
" }\n"
" }\n"
" //base_offset += stride;\n"
" }\n"
"}\n"
; //matrix_row_align1_fft_reorder
429 
// OpenCL source of kernel `cpu_inplace_mult`.
// (The declaration line was missing from the listing; the identifier is taken
// from the trailing comment after the terminating ';'.)
// Multiplies vec[i] by `factor` for every i < size. Unlike `inplace_mult`,
// the factor is passed by value rather than read from a buffer. Work-items
// start at get_global_id(0) and advance by get_global_size(0).
const char * const matrix_row_align1_cpu_inplace_mult =
"__kernel void cpu_inplace_mult(\n"
" __global float * vec,\n"
" float factor, \n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] *= factor;\n"
"}\n"
; //matrix_row_align1_cpu_inplace_mult
440 
// OpenCL source of kernel `clear`.
// Sets vec[i] = 0 for every i < size. Work-items start at get_global_id(0)
// and advance by get_global_size(0).
const char * const matrix_row_align1_clear =
"__kernel void clear(\n"
" __global float * vec,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] = 0;\n"
"}\n"
; //matrix_row_align1_clear
450 
// OpenCL source of kernel `trans_lower_triangular_substitute_inplace`.
// (The declaration line was missing from the listing; the identifier is taken
// from the trailing comment after the terminating ';'.)
// Walks row = 0 .. matrix_rows-1. In each step the work-item with global id 0
// divides vector[row] by the diagonal entry matrix[row + row*matrix_internal_cols];
// after a barrier every work-item reads temp = vector[row] and updates, for
// elim = row + 1 + global id (stride get_global_size(0)):
//   vector[elim] -= temp * matrix[row * matrix_internal_cols + elim]
// matrix_cols and matrix_internal_rows are not read by the body.
// NOTE(review): barrier() only synchronizes the work-items of one work-group,
// so the step ordering presumably relies on a single-work-group launch -
// confirm against the host code.
const char * const matrix_row_align1_trans_lower_triangular_substitute_inplace =
"__kernel void trans_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" if (get_global_id(0) == 0)\n"
" vector[row] /= matrix[row+row*matrix_internal_cols];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim];\n"
" }\n"
"}\n"
; //matrix_row_align1_trans_lower_triangular_substitute_inplace
473 
// OpenCL source of kernel `vec_mul`.
// For every row < matrix_rows it computes
//   result[row] = sum over col < matrix_cols of
//                 matrix[row * matrix_internal_cols + col] * vector[col]
// Rows are distributed over work-items: each starts at get_global_id(0) and
// advances by get_global_size(0). matrix_internal_rows is not read by the body.
const char * const matrix_row_align1_vec_mul =
"\n"
"\n"
"\n"
"__kernel void vec_mul(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector, \n"
" __global float * result) \n"
"{ \n"
" for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" float dot_prod = 0.0f;\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" dot_prod += matrix[row*matrix_internal_cols + col] * vector[col];\n"
" result[row] = dot_prod;\n"
" }\n"
"}\n"
"\n"
"\n"
; //matrix_row_align1_vec_mul
498 
// OpenCL source of kernel `lu_factorize`.
// Overwrites `matrix` in place, row by row (i = 1 .. matrix_rows-1). For each
// k < i the work-item with global id 0 divides matrix[i][k] by the diagonal
// entry matrix[k][k]; after a barrier every work-item reads that quotient and
// updates, for j = k + 1 + global id (stride get_global_size(0), j < matrix_rows):
//   matrix[i][j] -= quotient * matrix[k][j]
// where matrix[r][c] stands for matrix[r * matrix_internal_cols + c].
// The code contains no row exchange (no pivoting) and no check for a zero
// diagonal entry. The j loop is bounded by matrix_rows; matrix_cols and
// matrix_internal_rows are not read by the body.
// NOTE(review): barrier() only synchronizes the work-items of one work-group,
// so the step ordering presumably relies on a single-work-group launch -
// confirm against the host code.
const char * const matrix_row_align1_lu_factorize =
"__kernel void lu_factorize(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols) \n"
"{ \n"
" float temp;\n"
" unsigned rowi;\n"
" unsigned rowk;\n"
" for (unsigned int i=1; i<matrix_rows; ++i)\n"
" {\n"
" rowi = i * matrix_internal_cols;\n"
" for (unsigned int k=0; k<i; ++k)\n"
" {\n"
" rowk = k * matrix_internal_cols;\n"
" if (get_global_id(0) == 0)\n"
" matrix[rowi + k] /= matrix[rowk + k];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = matrix[rowi + k];\n"
" \n"
" //parallel subtraction:\n"
" for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0))\n"
" matrix[rowi + j] -= temp * matrix[rowk + j];\n"
" }\n"
" }\n"
"} \n"
; //matrix_row_align1_lu_factorize
528 
// OpenCL source of kernel `trans_vec_mul`.
// Multiplies the transposed matrix with a vector: for every row < matrix_cols
//   result[row] = sum over col < matrix_rows of
//                 matrix[row + col * matrix_internal_cols] * vector[col]
// (`row`/`col` index the transposed matrix, as the embedded comment states).
// Work-items start at get_global_id(0) and advance by get_global_size(0).
// matrix_internal_rows is not read by the body.
const char * const matrix_row_align1_trans_vec_mul =
"__kernel void trans_vec_mul(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector, \n"
" __global float * result) \n"
"{ \n"
" //row and col indicate indices within transposed matrix\n"
" for (unsigned int row = get_global_id(0); row < matrix_cols; row += get_global_size(0))\n"
" {\n"
" float dot_prod2 = 0.0f;\n"
" for (unsigned int col = 0; col < matrix_rows; ++col)\n"
" dot_prod2 += matrix[row + col*matrix_internal_cols] * vector[col];\n"
" result[row] = dot_prod2;\n"
" }\n"
"}\n"
; //matrix_row_align1_trans_vec_mul
549 
// OpenCL source of kernel `fft_direct`.
// Direct (quadratic) Fourier transform, described by its own comment as
// "for reference only". For every batch_id < batch_num and every output index
// k < size (work-items start at get_global_id(0), stride get_global_size(0)):
//   output[batch_id * stride + k] =
//       sum over n < size of input[batch_id * stride + n] * (cos(arg), sin(arg))
// with arg = sign * 2 * pi * k / size * n and the product taken as a complex
// multiplication on float2 (x = real part, y = imaginary part).
// Input and output are separate buffers.
const char * const matrix_row_align1_fft_direct =
"// naive fourier transform (quadratic complexity, use for reference only)\n"
"__kernel void fft_direct(__global float2* input,\n"
" __global float2* output,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" unsigned int batch_num,\n"
" float sign) {\n"
"// unsigned int base_offset = 0;\n"
" const float NUM_PI = 3.14159265358979323846;\n"
" \n"
" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
" for(unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) {\n"
" float2 f = 0.0f;\n"
" for(unsigned int n = 0; n < size; n++) {\n"
" float2 in = input[batch_id * stride + n]; //input index here\n"
" float sn, cs;\n"
" float arg = sign * 2 * NUM_PI * k / size * n;\n"
" sn = sincos(arg, &cs);\n"
" float2 ex = (float2)(cs, sn);\n"
" f = f + (float2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x);\n"
" }\n"
" output[batch_id * stride + k] = f;// output index here\n"
" }\n"
"// base_offset += stride;\n"
" }\n"
"}\n"
; //matrix_row_align1_fft_direct
578 
// OpenCL source of kernel `inplace_sub`.
// Performs vec1[i] -= vec2[i] for every i < size. Work-items start at
// get_global_id(0) and advance by get_global_size(0).
const char * const matrix_row_align1_inplace_sub =
"__kernel void inplace_sub(\n"
" __global float * vec1,\n"
" __global const float * vec2,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec1[i] -= vec2[i];\n"
"}\n"
; //matrix_row_align1_inplace_sub
589 
590  } //namespace kernels
591  } //namespace linalg
592 } //namespace viennacl
593 #endif