ViennaCL - The Vienna Computing Library  1.2.0
matrix_col_source.h
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_KERNELS_MATRIX_COL_SOURCE_HPP_
2 #define VIENNACL_LINALG_KERNELS_MATRIX_COL_SOURCE_HPP_
3 //Automatically generated file from auxiliary-directory, do not edit manually!
4 namespace viennacl
5 {
6  namespace linalg
7  {
8  namespace kernels
9  {
/**
 * OpenCL source of the `rank1_update` kernel.
 *
 * Adds the outer product of two vectors to the matrix:
 *   matrix(row, col) += vector1[row] * vector2[col]
 * Element (row, col) is addressed as row + col * matrix_internal_rows.
 * Rows are distributed over work-items with stride get_global_size(0);
 * each work-item walks every column of its rows.
 * matrix_internal_cols is part of the signature but is not read.
 */
const char * const matrix_col_align1_rank1_update =
"//perform a rank-1 update of the matrix, i.e. A += x * y^T\n"
"__kernel void rank1_update(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector1, \n"
" __global const float * vector2) \n"
"{ \n"
" float tmp;\n"
" for (unsigned int row= get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" tmp = vector1[row];\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" matrix[row + col * matrix_internal_rows] += tmp * vector2[col];\n"
" }\n"
"}\n"
; //matrix_col_align1_rank1_update
30 
/**
 * OpenCL source of the `inplace_mult` kernel.
 *
 * Multiplies the first `size` entries of `vec` by the scalar read from
 * `*fac`. Entries are distributed over work-items with stride
 * get_global_size(0).
 */
const char * const matrix_col_align1_inplace_mult =
"__kernel void inplace_mult(\n"
" __global float * vec,\n"
" __global const float * fac, \n"
" unsigned int size) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] *= factor;\n"
"}\n"
; //matrix_col_align1_inplace_mult
42 
/**
 * OpenCL source of the `fft_radix2_local` kernel and its helper
 * `get_reorder_num`.
 *
 * get_reorder_num(v, bit_size) reverses the 32 bits of v and shifts the
 * result right by (32 - bit_size), i.e. it returns the bit-reversed index
 * within a bit_size-bit range.
 *
 * fft_radix2_local processes batches batch_id = group id, group id +
 * number of groups, ... For each batch it
 *   1. copies input[p * stride + batch_id] into lcl_input at the
 *      bit-reversed position of p,
 *   2. runs bit_size butterfly stages on lcl_input, separated by
 *      local-memory barriers,
 *   3. copies lcl_input[0 .. size-1] back to the same global positions.
 *
 * Changes relative to the generated original:
 *   - every stage has size/2 butterflies, so the butterfly loop is bounded
 *     by half_size; iterating up to `size` touched
 *     lcl_input[size .. 2*size-1], which is never copied back;
 *   - a barrier after the copy-back keeps the next batch from overwriting
 *     lcl_input while other work-items are still reading it;
 *   - NUM_PI carries an `f` suffix so no double-precision literal appears
 *     in a float-only kernel.
 *
 * NOTE(review): assumes size == 1 << bit_size with bit_size >= 1 - confirm
 * against the host-side launcher.
 */
const char * const matrix_col_align1_fft_radix2_local =
"unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) {\n"
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
" v = (v >> 16) | (v << 16);\n"
" v = v >> (32 - bit_size);\n"
" return v;\n"
"}\n"
"__kernel void fft_radix2_local(__global float2* input,\n"
" __local float2* lcl_input,\n"
" unsigned int bit_size,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" unsigned int batch_num,\n"
" float sign) {\n"
" unsigned int grp_id = get_group_id(0);\n"
" unsigned int grp_num = get_num_groups(0);\n"
" unsigned int lcl_sz = get_local_size(0);\n"
" unsigned int lcl_id = get_local_id(0);\n"
" unsigned int half_size = size >> 1;\n"
" const float NUM_PI = 3.14159265358979323846f;\n"
" for(unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) {\n"
" //unsigned int base_offset = stride * batch_id;\n"
" //copy chunk of global memory to local\n"
" for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
" unsigned int v = get_reorder_num(p, bit_size);\n"
" lcl_input[v] = input[p * stride + batch_id];//index\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" //performs Cooley-Tukey FFT on local array\n"
" for(unsigned int s = 0; s < bit_size; s++) {\n"
" unsigned int ss = 1 << s;\n"
" float cs, sn;\n"
" //size/2 butterflies per stage\n"
" for(unsigned int tid = lcl_id; tid < half_size; tid += lcl_sz) {\n"
" unsigned int group = (tid & (ss - 1));\n"
" unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
" float2 in1 = lcl_input[pos];\n"
" float2 in2 = lcl_input[pos + ss];\n"
" float arg = group * sign * NUM_PI / ss;\n"
" sn = sincos(arg, &cs);\n"
" float2 ex = (float2)(cs, sn);\n"
" float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
" lcl_input[pos + ss] = in1 - tmp;\n"
" lcl_input[pos] = in1 + tmp;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" //copy local array back to global memory\n"
" for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
" input[p * stride + batch_id] = lcl_input[p];//index\n"
" }\n"
" //local array is reused by the next batch\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
"}\n"
; //matrix_col_align1_fft_radix2_local
98 
/**
 * OpenCL source of the `inplace_divide` kernel.
 *
 * Divides the first `size` entries of `vec` by the scalar read from
 * `*fac`. Entries are distributed over work-items with stride
 * get_global_size(0).
 */
const char * const matrix_col_align1_inplace_divide =
"__kernel void inplace_divide(\n"
" __global float * vec,\n"
" __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
" unsigned int size) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] /= factor;\n"
"}\n"
; //matrix_col_align1_inplace_divide
110 
/**
 * OpenCL source of the `trans_unit_lower_triangular_substitute_inplace`
 * kernel.
 *
 * Walks row = 0 .. matrix_rows-1. After a barrier, every work-item reads
 * vector[row] and subtracts
 *   vector[row] * matrix[elim * matrix_internal_rows + row]
 * from vector[elim] for its share of elim > row. No division by a diagonal
 * entry is performed.
 *
 * NOTE(review): barrier() only synchronizes one work-group - presumably the
 * host launches a single work-group; confirm against the caller.
 */
const char * const matrix_col_align1_trans_unit_lower_triangular_substitute_inplace =
"\n"
"__kernel void trans_unit_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
" temp = vector[row];\n"
"\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[elim * matrix_internal_rows + row];\n"
" }\n"
"}\n"
"\n"
"\n"
; //matrix_col_align1_trans_unit_lower_triangular_substitute_inplace
135 
/**
 * OpenCL source of the `lower_triangular_substitute_inplace` kernel.
 *
 * Walks row = 0 .. matrix_rows-1. Work-item 0 divides vector[row] by the
 * diagonal entry matrix[row + row * matrix_internal_rows]; after a barrier
 * every work-item subtracts
 *   vector[row] * matrix[row * matrix_internal_rows + elim]
 * from vector[elim] for its share of elim > row.
 *
 * NOTE(review): barrier() only synchronizes one work-group - presumably the
 * host launches a single work-group; confirm against the caller.
 */
const char * const matrix_col_align1_lower_triangular_substitute_inplace =
"__kernel void lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" if (get_global_id(0) == 0)\n"
" vector[row] /= matrix[row+row*matrix_internal_rows];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[row * matrix_internal_rows + elim];\n"
" }\n"
"}\n"
; //matrix_col_align1_lower_triangular_substitute_inplace
158 
/**
 * OpenCL source of the `inplace_add` kernel.
 *
 * Two-dimensional kernel: the work-item with global ids (i, j) adds the
 * entry of B at (i + B_row_start, j + B_col_start) to the entry of A at
 * (i + A_row_start, j + A_col_start), provided i < A_row_size and
 * j < A_col_size. Entry (r, c) is addressed as r + c * X_internal_rows.
 * B_row_size, B_col_size and both *_internal_cols arguments are not read.
 */
const char * const matrix_col_align1_inplace_add =
"__kernel void inplace_add(\n"
" __global float * A,\n"
" unsigned int A_row_start,\n"
" unsigned int A_col_start,\n"
" unsigned int A_row_size,\n"
" unsigned int A_col_size,\n"
" unsigned int A_internal_rows,\n"
" unsigned int A_internal_cols,\n"
" __global const float * B, \n"
" unsigned int B_row_start,\n"
" unsigned int B_col_start,\n"
" unsigned int B_row_size,\n"
" unsigned int B_col_size,\n"
" unsigned int B_internal_rows,\n"
" unsigned int B_internal_cols)\n"
"{ \n"
" if ( get_global_id(0) < A_row_size\n"
" && get_global_id(1) < A_col_size\n"
" )\n"
" A[ (get_global_id(0) + A_row_start)\n"
" + (get_global_id(1) + A_col_start) * A_internal_rows] \n"
" += B[ (get_global_id(0) + B_row_start)\n"
" + (get_global_id(1) + B_col_start) * B_internal_rows];\n"
"}\n"
; //matrix_col_align1_inplace_add
185 
/**
 * OpenCL source of the `add` kernel.
 *
 * Writes result[i] = vec1[i] + vec2[i] for i < size; entries are
 * distributed over work-items with stride get_global_size(0).
 */
const char * const matrix_col_align1_add =
"__kernel void add(\n"
" __global const float * vec1,\n"
" __global const float * vec2, \n"
" __global float * result,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" result[i] = vec1[i] + vec2[i];\n"
"}\n"
; //matrix_col_align1_add
197 
/**
 * OpenCL source of the `sub` kernel.
 *
 * Writes result[i] = vec1[i] - vec2[i] for i < size; entries are
 * distributed over work-items with stride get_global_size(0).
 */
const char * const matrix_col_align1_sub =
"__kernel void sub(\n"
" __global const float * vec1,\n"
" __global const float * vec2, \n"
" __global float * result,\n"
" unsigned int size)\n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" result[i] = vec1[i] - vec2[i];\n"
"}\n"
; //matrix_col_align1_sub
209 
/**
 * OpenCL source of the `trans_unit_upper_triangular_substitute_inplace`
 * kernel.
 *
 * Walks row = matrix_rows-1 down to 0. After a barrier, every work-item
 * reads vector[row] and subtracts
 *   vector[row] * matrix[row + elim * matrix_internal_rows]
 * from vector[elim] for its share of elim < row. No division by a diagonal
 * entry is performed.
 *
 * NOTE(review): barrier() only synchronizes one work-group - presumably the
 * host launches a single work-group; confirm against the caller.
 */
const char * const matrix_col_align1_trans_unit_upper_triangular_substitute_inplace =
"//transposed lower triangular matrix\n"
"__kernel void trans_unit_upper_triangular_substitute_inplace(\n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[row + elim * matrix_internal_rows]; \n"
" } \n"
" \n"
"}\n"
; //matrix_col_align1_trans_unit_upper_triangular_substitute_inplace
233 
/**
 * OpenCL source of the `scaled_rank1_update` kernel.
 *
 * Adds the scaled outer product of two vectors to the matrix:
 *   matrix(row, col) += val * vector1[row] * vector2[col]
 * Element (row, col) is addressed as row + col * matrix_internal_rows.
 * Rows are distributed over work-items with stride get_global_size(0).
 * matrix_internal_cols is part of the signature but is not read.
 */
const char * const matrix_col_align1_scaled_rank1_update =
"__kernel void scaled_rank1_update(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" float val,\n"
" __global const float * vector1, \n"
" __global const float * vector2) \n"
"{ \n"
" float tmp;\n"
" for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" tmp = val * vector1[row];\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" matrix[row + col*matrix_internal_rows] += tmp * vector2[col];\n"
" }\n"
"}\n"
; //matrix_col_align1_scaled_rank1_update
254 
/**
 * OpenCL source of the `unit_upper_triangular_substitute_inplace` kernel.
 *
 * Walks row = matrix_rows-1 down to 0. After a barrier, every work-item
 * reads vector[row] and subtracts
 *   vector[row] * matrix[elim + row * matrix_internal_rows]
 * from vector[elim] for its share of elim < row. No division by a diagonal
 * entry is performed.
 *
 * NOTE(review): barrier() only synchronizes one work-group - presumably the
 * host launches a single work-group; confirm against the caller.
 */
const char * const matrix_col_align1_unit_upper_triangular_substitute_inplace =
"__kernel void unit_upper_triangular_substitute_inplace( \n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[elim + row * matrix_internal_rows]; \n"
" } \n"
" \n"
"}\n"
; //matrix_col_align1_unit_upper_triangular_substitute_inplace
277 
/**
 * OpenCL source of the `trans_upper_triangular_substitute_inplace` kernel.
 *
 * Walks row = matrix_rows-1 down to 0. Work-item 0 divides vector[row] by
 * the diagonal entry matrix[row + row * matrix_internal_rows]; after a
 * barrier every work-item subtracts
 *   vector[row] * matrix[row + elim * matrix_internal_rows]
 * from vector[elim] for its share of elim < row.
 *
 * NOTE(review): barrier() only synchronizes one work-group - presumably the
 * host launches a single work-group; confirm against the caller.
 */
const char * const matrix_col_align1_trans_upper_triangular_substitute_inplace =
"//transposed lower triangular matrix\n"
"__kernel void trans_upper_triangular_substitute_inplace(\n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" if (get_global_id(0) == 0) \n"
" vector[row] /= matrix[row + row*matrix_internal_rows]; \n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[row + elim * matrix_internal_rows]; \n"
" } \n"
" \n"
"}\n"
; //matrix_col_align1_trans_upper_triangular_substitute_inplace
304 
/**
 * OpenCL source of the `upper_triangular_substitute_inplace` kernel.
 *
 * Walks row = matrix_rows-1 down to 0. Work-item 0 divides vector[row] by
 * the diagonal entry matrix[row + row * matrix_internal_rows]; after a
 * barrier every work-item subtracts
 *   vector[row] * matrix[elim + row * matrix_internal_rows]
 * from vector[elim] for its share of elim < row.
 *
 * NOTE(review): barrier() only synchronizes one work-group - presumably the
 * host launches a single work-group; confirm against the caller.
 */
const char * const matrix_col_align1_upper_triangular_substitute_inplace =
"__kernel void upper_triangular_substitute_inplace( \n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" if (get_global_id(0) == 0) \n"
" vector[row] /= matrix[row + row*matrix_internal_rows]; \n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[elim + row * matrix_internal_rows]; \n"
" } \n"
" \n"
"}\n"
; //matrix_col_align1_upper_triangular_substitute_inplace
330 
/**
 * OpenCL source of the `unit_lower_triangular_substitute_inplace` kernel.
 *
 * Walks row = 0 .. matrix_rows-1. After a barrier, every work-item reads
 * vector[row] and subtracts
 *   vector[row] * matrix[row * matrix_internal_rows + elim]
 * from vector[elim] for its share of elim > row. No division by a diagonal
 * entry is performed.
 *
 * NOTE(review): barrier() only synchronizes one work-group - presumably the
 * host launches a single work-group; confirm against the caller.
 */
const char * const matrix_col_align1_unit_lower_triangular_substitute_inplace =
"__kernel void unit_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[row * matrix_internal_rows + elim];\n"
" }\n"
"}\n"
; //matrix_col_align1_unit_lower_triangular_substitute_inplace
350 
/**
 * OpenCL source of the `fft_radix2` kernel.
 *
 * Performs butterfly stage `s` (span ss = 1 << s) directly on global
 * memory. For every batch_id < batch_num the size/2 butterflies are
 * distributed over work-items with stride get_global_size(0); element p of
 * a batch lives at input[p * stride + batch_id]. Each butterfly combines
 * the pair (pos, pos + ss) using the twiddle factor
 * (cos(arg), sin(arg)) with arg = group * sign * pi / ss.
 * bit_size is part of the signature but is not read.
 *
 * NUM_PI carries an `f` suffix so that no double-precision literal appears
 * in this float-only kernel.
 */
const char * const matrix_col_align1_fft_radix2 =
"__kernel void fft_radix2(__global float2* input,\n"
" unsigned int s,\n"
" unsigned int bit_size,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" unsigned int batch_num,\n"
" float sign) {\n"
" unsigned int ss = 1 << s;\n"
" unsigned int half_size = size >> 1;\n"
" float cs, sn;\n"
" const float NUM_PI = 3.14159265358979323846f;\n"
" unsigned int glb_id = get_global_id(0);\n"
" unsigned int glb_sz = get_global_size(0);\n"
" \n"
" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
" for(unsigned int tid = glb_id; tid < half_size; tid += glb_sz) {\n"
" unsigned int group = (tid & (ss - 1));\n"
" unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
" unsigned int offset = pos * stride + batch_id;\n"
" float2 in1 = input[offset];//index\n"
" float2 in2 = input[offset + ss * stride];//index\n"
" float arg = group * sign * NUM_PI / ss;\n"
" sn = sincos(arg, &cs);\n"
" float2 ex = (float2)(cs, sn);\n"
" float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
" input[offset + ss * stride] = in1 - tmp;//index\n"
" input[offset] = in1 + tmp;//index\n"
" }\n"
" }\n"
"}\n"
; //matrix_col_align1_fft_radix2
383 
/**
 * OpenCL source of the `fft_reorder` kernel and its helper
 * `get_reorder_num_2`.
 *
 * get_reorder_num_2(v, bit_size) reverses the 32 bits of v and shifts the
 * result right by (32 - bit_size).
 *
 * fft_reorder permutes each batch in place: for every index i < size with
 * bit-reversed partner v, the entries at i and v are swapped once (only
 * when i < v). Element p of a batch lives at input[p * stride + batch_id].
 *
 * NOTE(review): batch_num is declared `int` here while the other FFT
 * kernels in this file use `unsigned int`; the kernel signature is left
 * untouched.
 */
const char * const matrix_col_align1_fft_reorder =
"/*\n"
"* Performs reordering of input data in bit-reversal order\n"
"* Probably it's better to do in host side,\n"
"*/\n"
"unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) {\n"
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
" v = (v >> 16) | (v << 16);\n"
" v = v >> (32 - bit_size);\n"
" return v;\n"
"}\n"
"__kernel void fft_reorder(__global float2* input,\n"
" unsigned int bit_size,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" int batch_num) {\n"
" unsigned int glb_id = get_global_id(0);\n"
" unsigned int glb_sz = get_global_size(0);\n"
" \n"
" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
" for(unsigned int i = glb_id; i < size; i += glb_sz) {\n"
" unsigned int v = get_reorder_num_2(i, bit_size);\n"
" if(i < v) {\n"
" float2 tmp = input[i * stride + batch_id]; // index\n"
" input[i * stride + batch_id] = input[v * stride + batch_id]; //index\n"
" input[v * stride + batch_id] = tmp; //index\n"
" }\n"
" }\n"
" }\n"
"}\n"
; //matrix_col_align1_fft_reorder
418 
/**
 * OpenCL source of the `cpu_inplace_mult` kernel.
 *
 * Multiplies the first `size` entries of `vec` by `factor`, which is
 * passed by value. Entries are distributed over work-items with stride
 * get_global_size(0).
 */
const char * const matrix_col_align1_cpu_inplace_mult =
"__kernel void cpu_inplace_mult(\n"
" __global float * vec,\n"
" float factor, \n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] *= factor;\n"
"}\n"
; //matrix_col_align1_cpu_inplace_mult
429 
/**
 * OpenCL source of the `clear` kernel.
 *
 * Sets the first `size` entries of `vec` to 0; entries are distributed
 * over work-items with stride get_global_size(0).
 */
const char * const matrix_col_align1_clear =
"__kernel void clear(\n"
" __global float * vec,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] = 0;\n"
"}\n"
; //matrix_col_align1_clear
439 
/**
 * OpenCL source of the `trans_lower_triangular_substitute_inplace` kernel.
 *
 * Walks row = 0 .. matrix_rows-1. Work-item 0 divides vector[row] by the
 * diagonal entry matrix[row + row * matrix_internal_rows]; after a barrier
 * every work-item subtracts
 *   vector[row] * matrix[elim * matrix_internal_rows + row]
 * from vector[elim] for its share of elim > row.
 *
 * NOTE(review): barrier() only synchronizes one work-group - presumably the
 * host launches a single work-group; confirm against the caller.
 */
const char * const matrix_col_align1_trans_lower_triangular_substitute_inplace =
"__kernel void trans_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" if (get_global_id(0) == 0)\n"
" vector[row] /= matrix[row+row*matrix_internal_rows];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[elim * matrix_internal_rows + row];\n"
" }\n"
"}\n"
; //matrix_col_align1_trans_lower_triangular_substitute_inplace
462 
/**
 * OpenCL source of the `vec_mul` kernel.
 *
 * Matrix-vector product: for every row < matrix_rows,
 *   result[row] = sum over col < matrix_cols of
 *                 matrix[row + col * matrix_internal_rows] * vector[col]
 * Rows are distributed over work-items with stride get_global_size(0).
 * matrix_internal_cols is part of the signature but is not read.
 */
const char * const matrix_col_align1_vec_mul =
"__kernel void vec_mul(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector, \n"
" __global float * result) \n"
"{ \n"
" for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" float dot_prod = 0.0f;\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" dot_prod += matrix[row + col*matrix_internal_rows] * vector[col];\n"
" result[row] = dot_prod;\n"
" }\n"
"}\n"
; //matrix_col_align1_vec_mul
482 
/**
 * OpenCL source of the `lu_factorize` kernel.
 *
 * In-place elimination on the matrix, entry (r, c) addressed as
 * r + c * matrix_internal_rows. For every row i >= 1 and every k < i:
 *   - work-item 0 divides entry (i, k) by the diagonal entry (k, k),
 *   - after a barrier, all work-items subtract (i, k) * (k, j) from (i, j)
 *     for their share of the columns j > k.
 * No row exchanges are performed. matrix_internal_cols is not read.
 *
 * NOTE(review): barrier() only synchronizes one work-group - presumably the
 * host launches a single work-group; confirm against the caller.
 */
const char * const matrix_col_align1_lu_factorize =
"__kernel void lu_factorize(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols) \n"
"{ \n"
" float temp;\n"
" for (unsigned int i=1; i<matrix_rows; ++i)\n"
" {\n"
" for (unsigned int k=0; k<i; ++k)\n"
" {\n"
" if (get_global_id(0) == 0)\n"
" matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = matrix[i + k*matrix_internal_rows];\n"
" \n"
" //parallel subtraction:\n"
" for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0))\n"
" matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows];\n"
" }\n"
" }\n"
"} \n"
; //matrix_col_align1_lu_factorize
508 
/**
 * OpenCL source of the `trans_vec_mul` kernel.
 *
 * Product of the transposed matrix with a vector: for every
 * row < matrix_cols,
 *   result[row] = sum over col < matrix_rows of
 *                 matrix[row * matrix_internal_rows + col] * vector[col]
 * (row and col index the transposed matrix). Output entries are
 * distributed over work-items with stride get_global_size(0).
 * matrix_internal_cols is part of the signature but is not read.
 */
const char * const matrix_col_align1_trans_vec_mul =
"__kernel void trans_vec_mul(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector, \n"
" __global float * result) \n"
"{ \n"
" //row and col indicate indices within transposed matrix\n"
" for (unsigned int row = get_global_id(0); row < matrix_cols; row += get_global_size(0))\n"
" {\n"
" float dot_prod2 = 0.0f;\n"
" for (unsigned int col = 0; col < matrix_rows; ++col)\n"
" dot_prod2 += matrix[row * matrix_internal_rows + col] * vector[col];\n"
" result[row] = dot_prod2;\n"
" }\n"
"}\n"
; //matrix_col_align1_trans_vec_mul
529 
/**
 * OpenCL source of the `fft_direct` kernel.
 *
 * Evaluates the transform by direct summation (quadratic in `size`): for
 * every batch_id < batch_num and every output index k < size,
 *   output[k * stride + batch_id] =
 *       sum over n < size of input[n * stride + batch_id] * (cos(arg), sin(arg))
 * with arg = sign * 2 * pi * k / size * n and the product taken as a
 * complex multiplication. Output indices are distributed over work-items
 * with stride get_global_size(0).
 *
 * NUM_PI carries an `f` suffix so that no double-precision literal appears
 * in this float-only kernel.
 */
const char * const matrix_col_align1_fft_direct =
"// Direct FFT computation (quadratic complexity - use for reference only)\n"
"__kernel void fft_direct(__global float2* input,\n"
" __global float2* output,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" unsigned int batch_num,\n"
" float sign) {\n"
" \n"
" const float NUM_PI = 3.14159265358979323846f;\n"
" \n"
" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
" for(unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) {\n"
" float2 f = 0.0f;\n"
" for(unsigned int n = 0; n < size; n++) {\n"
" float2 in = input[n * stride + batch_id]; //input index here\n"
" float sn, cs;\n"
" float arg = sign * 2 * NUM_PI * k / size * n;\n"
" sn = sincos(arg, &cs);\n"
" float2 ex = (float2)(cs, sn);\n"
" f = f + (float2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x);\n"
" }\n"
" output[k * stride + batch_id] = f;// output index here\n"
" }\n"
" }\n"
"}\n"
; //matrix_col_align1_fft_direct
557 
/**
 * OpenCL source of the `inplace_sub` kernel.
 *
 * Computes vec1[i] -= vec2[i] for i < size; entries are distributed over
 * work-items with stride get_global_size(0).
 */
const char * const matrix_col_align1_inplace_sub =
"__kernel void inplace_sub(\n"
" __global float * vec1,\n"
" __global const float * vec2,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec1[i] -= vec2[i];\n"
"}\n"
; //matrix_col_align1_inplace_sub
568 
569  } //namespace kernels
570  } //namespace linalg
571 } //namespace viennacl
572 #endif