ViennaCL - The Vienna Computing Library
1.2.0
Main Page
Namespaces
Data Structures
Files
File List
Globals
viennacl
linalg
kernels
matrix_row_source.h
Go to the documentation of this file.
1
#ifndef VIENNACL_LINALG_KERNELS_MATRIX_ROW_SOURCE_HPP_
2
#define VIENNACL_LINALG_KERNELS_MATRIX_ROW_SOURCE_HPP_
3
//Automatically generated file from auxiliary-directory, do not edit manually!
4
namespace
viennacl
5
{
6
namespace
linalg
7
{
8
namespace
kernels
9
{
/**
 * @brief OpenCL C source of the `rank1_update` kernel.
 *
 * Rows are distributed over work-items (start get_global_id(0), step
 * get_global_size(0)). For each such row and every col < matrix_cols the
 * kernel performs
 *   matrix[row * matrix_internal_cols + col] += vector1[row] * vector2[col].
 * Note that the comment embedded in the kernel text says "x * x^T", while the
 * code takes two separate vectors. matrix_internal_rows is accepted but not
 * read by the kernel body.
 */
const char * const matrix_row_align1_rank1_update =
"//perform a rank-1 update of the matrix, i.e. A += x * x^T\n"
"__kernel void rank1_update(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector1, \n"
" __global const float * vector2) \n"
"{ \n"
" float tmp;\n"
" unsigned int offset;\n"
" for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" tmp = vector1[row];\n"
" offset = row*matrix_internal_cols;\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" matrix[offset+col] += tmp * vector2[col];\n"
" }\n"
"}\n"
; //matrix_row_align1_rank1_update
32
/**
 * @brief OpenCL C source of the `inplace_mult` kernel.
 *
 * Reads the scaling factor once from the buffer `fac` and multiplies
 * vec[i] by it for every i < size handled by the calling work-item
 * (start get_global_id(0), step get_global_size(0)).
 */
const char * const matrix_row_align1_inplace_mult =
"__kernel void inplace_mult(\n"
" __global float * vec,\n"
" __global const float * fac, \n"
" unsigned int size) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] *= factor;\n"
"}\n"
; //matrix_row_align1_inplace_mult
44
/**
 * @brief OpenCL C source of the `fft_radix2_local` kernel and its helper
 *        `get_reorder_num`.
 *
 * `get_reorder_num(v, bit_size)` reverses the 32 bits of v and shifts the
 * result right by (32 - bit_size), i.e. it returns the bit-reversed index
 * within bit_size bits.
 *
 * The kernel lets each work-group process the batches grp_id, grp_id +
 * grp_num, ...: it copies `size` elements of the batch from `input` into the
 * local buffer `lcl_input` at bit-reversed positions, runs bit_size radix-2
 * butterfly stages on the local buffer, and copies the result back to
 * `input[batch_id * stride + p]`.
 *
 * Changes relative to the listed kernel text:
 *  - The butterfly loop is bounded by half_size (= size >> 1) instead of
 *    size. Each butterfly touches the two elements pos and pos + ss, so only
 *    size/2 butterflies exist per stage; indices tid >= size/2 always map to
 *    pos >= size and thus accessed lcl_input outside the `size` elements that
 *    were loaded. This matches the bound used by the `fft_radix2` kernel.
 *  - A local barrier follows the copy-back loop so that no work-item starts
 *    overwriting lcl_input for the next batch while others still read it.
 *  - NUM_PI is initialised from a float literal (f suffix) so the kernel
 *    does not depend on how a device without double support treats an
 *    unsuffixed (double) literal.
 *
 * NOTE(review): the index arithmetic presumes size == 1 << bit_size and
 * bit_size >= 1 (a shift by 32 would occur for bit_size == 0) - confirm
 * against the host-side launcher.
 */
const char * const matrix_row_align1_fft_radix2_local =
"unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) {\n"
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
" v = (v >> 16) | (v << 16);\n"
" v = v >> (32 - bit_size);\n"
" return v;\n"
"}\n"
"__kernel void fft_radix2_local(__global float2* input,\n"
" __local float2* lcl_input,\n"
" unsigned int bit_size,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" unsigned int batch_num,\n"
" float sign) {\n"
" unsigned int grp_id = get_group_id(0);\n"
" unsigned int grp_num = get_num_groups(0);\n"
" unsigned int lcl_sz = get_local_size(0);\n"
" unsigned int lcl_id = get_local_id(0);\n"
" unsigned int half_size = size >> 1;\n"
" const float NUM_PI = 3.14159265358979323846f;\n"
" for(unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) {\n"
" //unsigned int base_offset = stride * batch_id;\n"
" //copy chunk of global memory to local\n"
" for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
" unsigned int v = get_reorder_num(p, bit_size);\n"
" lcl_input[v] = input[batch_id * stride + p];//index\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" //performs Cooley-Tukey FFT on local array\n"
" for(unsigned int s = 0; s < bit_size; s++) {\n"
" unsigned int ss = 1 << s;\n"
" float cs, sn;\n"
" //one butterfly per pair of elements, hence only size/2 of them\n"
" for(unsigned int tid = lcl_id; tid < half_size; tid += lcl_sz) {\n"
" unsigned int group = (tid & (ss - 1));\n"
" unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
" float2 in1 = lcl_input[pos];\n"
" float2 in2 = lcl_input[pos + ss];\n"
" float arg = group * sign * NUM_PI / ss;\n"
" sn = sincos(arg, &cs);\n"
" float2 ex = (float2)(cs, sn);\n"
" float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
" lcl_input[pos + ss] = in1 - tmp;\n"
" lcl_input[pos] = in1 + tmp;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" \n"
" //copy local array back to global memory\n"
" for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
" input[batch_id * stride + p] = lcl_input[p];//index\n"
" }\n"
" //local buffer is reused by the next batch: wait until everybody has copied back\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
"}\n"
; //matrix_row_align1_fft_radix2_local
102
/**
 * @brief OpenCL C source of the `inplace_divide` kernel.
 *
 * Reads the divisor once from the buffer `fac` and divides vec[i] by it for
 * every i < size handled by the calling work-item (start get_global_id(0),
 * step get_global_size(0)). The kernel performs no check for a zero divisor.
 */
const char * const matrix_row_align1_inplace_divide =
"__kernel void inplace_divide(\n"
" __global float * vec,\n"
" __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
" unsigned int size) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] /= factor;\n"
"}\n"
; //matrix_row_align1_inplace_divide
114
/**
 * @brief OpenCL C source of the
 *        `trans_unit_lower_triangular_substitute_inplace` kernel.
 *
 * Walks row = 0 .. matrix_rows-1. In each step, after a barrier, every
 * work-item reads temp = vector[row] and subtracts
 * temp * matrix[row * matrix_internal_cols + elim] from vector[elim] for its
 * share of the indices elim > row. No division by a diagonal entry takes
 * place. matrix_cols and matrix_internal_rows are not read.
 *
 * NOTE(review): barrier() only synchronises the work-items of one
 * work-group while the work split uses global ids, so this presumably has to
 * be enqueued with a single work-group - confirm against the host code.
 */
const char * const matrix_row_align1_trans_unit_lower_triangular_substitute_inplace =
"\n"
"__kernel void trans_unit_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
" temp = vector[row];\n"
"\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim];\n"
" }\n"
"}\n"
"\n"
"\n"
; //matrix_row_align1_trans_unit_lower_triangular_substitute_inplace
139
/**
 * @brief OpenCL C source of the `lower_triangular_substitute_inplace` kernel.
 *
 * Walks row = 0 .. matrix_rows-1. In each step work-item 0 first divides
 * vector[row] by the diagonal entry matrix[row + row * matrix_internal_cols];
 * after a barrier every work-item reads temp = vector[row] and subtracts
 * temp * matrix[elim * matrix_internal_cols + row] from vector[elim] for its
 * share of the indices elim > row. matrix_cols and matrix_internal_rows are
 * not read.
 *
 * NOTE(review): barrier() only synchronises the work-items of one
 * work-group while the work split uses global ids, so this presumably has to
 * be enqueued with a single work-group - confirm against the host code.
 */
const char * const matrix_row_align1_lower_triangular_substitute_inplace =
"__kernel void lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" if (get_global_id(0) == 0)\n"
" vector[row] /= matrix[row+row*matrix_internal_cols];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row];\n"
" }\n"
"}\n"
; //matrix_row_align1_lower_triangular_substitute_inplace
162
/**
 * @brief OpenCL C source of the `inplace_add` kernel (two-dimensional
 *        index space).
 *
 * A work-item with ids (get_global_id(0), get_global_id(1)) = (r, c) acts
 * only if r < A_row_size and c < A_col_size; it then adds
 *   B[(r + B_row_start) * B_internal_cols + (c + B_col_start)]
 * to
 *   A[(r + A_row_start) * A_internal_cols + (c + A_col_start)].
 * There is no striding loop, so entries beyond the launched global range
 * are not touched. B_row_size, B_col_size and both *_internal_rows
 * arguments are accepted but not read.
 */
const char * const matrix_row_align1_inplace_add =
"__kernel void inplace_add(\n"
" __global float * A,\n"
" unsigned int A_row_start,\n"
" unsigned int A_col_start,\n"
" unsigned int A_row_size,\n"
" unsigned int A_col_size,\n"
" unsigned int A_internal_rows,\n"
" unsigned int A_internal_cols,\n"
" __global const float * B, \n"
" unsigned int B_row_start,\n"
" unsigned int B_col_start,\n"
" unsigned int B_row_size,\n"
" unsigned int B_col_size,\n"
" unsigned int B_internal_rows,\n"
" unsigned int B_internal_cols)\n"
"{ \n"
" if ( get_global_id(0) < A_row_size\n"
" && get_global_id(1) < A_col_size\n"
" )\n"
" A[ (get_global_id(0) + A_row_start) * A_internal_cols\n"
" + (get_global_id(1) + A_col_start)] \n"
" += B[ (get_global_id(0) + B_row_start) * B_internal_cols\n"
" + (get_global_id(1) + B_col_start)];\n"
"}\n"
; //matrix_row_align1_inplace_add
189
/**
 * @brief OpenCL C source of the `add` kernel.
 *
 * Writes result[i] = vec1[i] + vec2[i] for every i < size handled by the
 * calling work-item (start get_global_id(0), step get_global_size(0)).
 * The buffers are addressed as flat arrays of `size` floats.
 */
const char * const matrix_row_align1_add =
"__kernel void add(\n"
" __global const float * vec1,\n"
" __global const float * vec2, \n"
" __global float * result,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" result[i] = vec1[i] + vec2[i];\n"
"}\n"
; //matrix_row_align1_add
201
/**
 * @brief OpenCL C source of the `sub` kernel.
 *
 * Writes result[i] = vec1[i] - vec2[i] for every i < size handled by the
 * calling work-item (start get_global_id(0), step get_global_size(0)).
 * The buffers are addressed as flat arrays of `size` floats.
 */
const char * const matrix_row_align1_sub =
"__kernel void sub(\n"
" __global const float * vec1,\n"
" __global const float * vec2, \n"
" __global float * result,\n"
" unsigned int size)\n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" result[i] = vec1[i] - vec2[i];\n"
"}\n"
; //matrix_row_align1_sub
213
/**
 * @brief OpenCL C source of the
 *        `trans_unit_upper_triangular_substitute_inplace` kernel.
 *
 * Walks row = matrix_rows-1 down to 0. In each step, after a barrier, every
 * work-item reads temp = vector[row] and subtracts
 * temp * matrix[row * matrix_internal_cols + elim] from vector[elim] for its
 * share of the indices elim < row. No division by a diagonal entry takes
 * place. matrix_cols and matrix_internal_rows are not read.
 *
 * NOTE(review): barrier() only synchronises the work-items of one
 * work-group while the work split uses global ids, so this presumably has to
 * be enqueued with a single work-group - confirm against the host code.
 */
const char * const matrix_row_align1_trans_unit_upper_triangular_substitute_inplace =
"//transposed lower triangular matrix\n"
"__kernel void trans_unit_upper_triangular_substitute_inplace(\n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim]; \n"
" } \n"
" \n"
"}\n"
; //matrix_row_align1_trans_unit_upper_triangular_substitute_inplace
236
/**
 * @brief OpenCL C source of the `scaled_rank1_update` kernel.
 *
 * Rows are distributed over work-items (start get_global_id(0), step
 * get_global_size(0)). For each such row and every col < matrix_cols the
 * kernel performs
 *   matrix[row * matrix_internal_cols + col] += val * vector1[row] * vector2[col],
 * with the product val * vector1[row] computed once per row.
 * matrix_internal_rows is accepted but not read by the kernel body.
 */
const char * const matrix_row_align1_scaled_rank1_update =
"__kernel void scaled_rank1_update(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" float val,\n"
" __global const float * vector1, \n"
" __global const float * vector2) \n"
"{ \n"
" float tmp;\n"
" unsigned int offset;\n"
" for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" tmp = val * vector1[row];\n"
" offset = row * matrix_internal_cols;\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" matrix[offset+col] += tmp * vector2[col];\n"
" }\n"
"}\n"
; //matrix_row_align1_scaled_rank1_update
259
/**
 * @brief OpenCL C source of the
 *        `unit_upper_triangular_substitute_inplace` kernel.
 *
 * Walks row = matrix_rows-1 down to 0. In each step, after a barrier, every
 * work-item reads temp = vector[row] and subtracts
 * temp * matrix[elim * matrix_internal_cols + row] from vector[elim] for its
 * share of the indices elim < row. No division by a diagonal entry takes
 * place. matrix_cols and matrix_internal_rows are not read.
 *
 * NOTE(review): barrier() only synchronises the work-items of one
 * work-group while the work split uses global ids, so this presumably has to
 * be enqueued with a single work-group - confirm against the host code.
 */
const char * const matrix_row_align1_unit_upper_triangular_substitute_inplace =
"__kernel void unit_upper_triangular_substitute_inplace( \n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row]; \n"
" } \n"
" \n"
"}\n"
; //matrix_row_align1_unit_upper_triangular_substitute_inplace
281
/**
 * @brief OpenCL C source of the
 *        `trans_upper_triangular_substitute_inplace` kernel.
 *
 * Walks row = matrix_rows-1 down to 0. In each step work-item 0 first
 * divides vector[row] by matrix[row * matrix_internal_cols + row]; after a
 * barrier every work-item reads temp = vector[row] and subtracts
 * temp * matrix[row * matrix_internal_cols + elim] from vector[elim] for its
 * share of the indices elim < row. matrix_cols and matrix_internal_rows are
 * not read.
 *
 * NOTE(review): barrier() only synchronises the work-items of one
 * work-group while the work split uses global ids, so this presumably has to
 * be enqueued with a single work-group - confirm against the host code.
 */
const char * const matrix_row_align1_trans_upper_triangular_substitute_inplace =
"//transposed lower triangular matrix\n"
"__kernel void trans_upper_triangular_substitute_inplace(\n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" if (get_global_id(0) == 0) \n"
" vector[row] /= matrix[row*matrix_internal_cols + row]; \n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim]; \n"
" } \n"
" \n"
"}\n"
; //matrix_row_align1_trans_upper_triangular_substitute_inplace
308
/**
 * @brief OpenCL C source of the `upper_triangular_substitute_inplace` kernel.
 *
 * Walks row = matrix_rows-1 down to 0. In each step work-item 0 first
 * divides vector[row] by matrix[row * matrix_internal_cols + row]; after a
 * barrier every work-item reads temp = vector[row] and subtracts
 * temp * matrix[elim * matrix_internal_cols + row] from vector[elim] for its
 * share of the indices elim < row. matrix_cols and matrix_internal_rows are
 * not read.
 *
 * NOTE(review): barrier() only synchronises the work-items of one
 * work-group while the work split uses global ids, so this presumably has to
 * be enqueued with a single work-group - confirm against the host code.
 */
const char * const matrix_row_align1_upper_triangular_substitute_inplace =
"__kernel void upper_triangular_substitute_inplace( \n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" if (get_global_id(0) == 0) \n"
" vector[row] /= matrix[row*matrix_internal_cols + row]; \n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row]; \n"
" } \n"
" \n"
"}\n"
; //matrix_row_align1_upper_triangular_substitute_inplace
334
/**
 * @brief OpenCL C source of the
 *        `unit_lower_triangular_substitute_inplace` kernel.
 *
 * Walks row = 0 .. matrix_rows-1. In each step, after a barrier, every
 * work-item reads temp = vector[row] and subtracts
 * temp * matrix[elim * matrix_internal_cols + row] from vector[elim] for its
 * share of the indices elim > row. No division by a diagonal entry takes
 * place. matrix_cols and matrix_internal_rows are not read.
 *
 * NOTE(review): barrier() only synchronises the work-items of one
 * work-group while the work split uses global ids, so this presumably has to
 * be enqueued with a single work-group - confirm against the host code.
 */
const char * const matrix_row_align1_unit_lower_triangular_substitute_inplace =
"__kernel void unit_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row];\n"
" }\n"
"}\n"
; //matrix_row_align1_unit_lower_triangular_substitute_inplace
354
/**
 * @brief OpenCL C source of the `fft_radix2` kernel (one butterfly stage
 *        on global memory).
 *
 * For the stage index `s` (ss = 1 << s) the kernel loops over all batches
 * and, per batch, over the size/2 butterflies tid (start get_global_id(0),
 * step get_global_size(0)). Each butterfly combines
 * input[batch_id * stride + pos] and input[batch_id * stride + pos + ss]
 * with the twiddle factor (cos(arg), sin(arg)),
 * arg = group * sign * PI / ss, and stores in1 + tmp / in1 - tmp back in
 * place. `bit_size` is accepted but not read by the kernel body.
 *
 * Change relative to the listed kernel text: NUM_PI is initialised from a
 * float literal (f suffix), so the kernel does not depend on how a device
 * without double support treats an unsuffixed (double) literal.
 */
const char * const matrix_row_align1_fft_radix2 =
"__kernel void fft_radix2(__global float2* input,\n"
" unsigned int s,\n"
" unsigned int bit_size,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" unsigned int batch_num,\n"
" float sign) {\n"
" unsigned int ss = 1 << s;\n"
" unsigned int half_size = size >> 1;\n"
" float cs, sn;\n"
" const float NUM_PI = 3.14159265358979323846f;\n"
" unsigned int glb_id = get_global_id(0);\n"
" unsigned int glb_sz = get_global_size(0);\n"
" \n"
"// unsigned int base_offset = 0;\n"
" \n"
" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
" for(unsigned int tid = glb_id; tid < half_size; tid += glb_sz) {\n"
" unsigned int group = (tid & (ss - 1));\n"
" unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
" unsigned int offset = batch_id * stride + pos;\n"
" float2 in1 = input[offset];//index\n"
" float2 in2 = input[offset + ss];//index\n"
" float arg = group * sign * NUM_PI / ss;\n"
" sn = sincos(arg, &cs);\n"
" //sn = native_sin(arg);\n"
" //cs = native_cos(arg);\n"
" float2 ex = (float2)(cs, sn);\n"
" float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
" input[offset + ss] = in1 - tmp;//index\n"
" input[offset] = in1 + tmp;//index\n"
" }\n"
"// base_offset += stride;\n"
" }\n"
"}\n"
; //matrix_row_align1_fft_radix2
392
/**
 * @brief OpenCL C source of the `fft_reorder` kernel and its helper
 *        `get_reorder_num_2`.
 *
 * `get_reorder_num_2(v, bit_size)` reverses the 32 bits of v and shifts the
 * result right by (32 - bit_size), i.e. it returns the bit-reversed index
 * within bit_size bits.
 *
 * The kernel loops over all batches; per batch each work-item visits the
 * indices i < size (start get_global_id(0), step get_global_size(0)) and
 * swaps input[batch_id * stride + i] with input[batch_id * stride + v],
 * v being the bit-reversed i, but only when i < v so that every pair is
 * exchanged exactly once.
 *
 * NOTE(review): batch_num is declared `int` here while the other FFT
 * kernels use `unsigned int`; the comparison with the unsigned loop counter
 * converts it to unsigned. A bit_size of 0 would shift by 32 in the
 * helper - presumably never passed; confirm against the host code.
 */
const char * const matrix_row_align1_fft_reorder =
"/*\n"
"* Performs reordering of input data in bit-reversal order\n"
"* Probably it's better to do in host side,\n"
"*/\n"
"unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) {\n"
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
" v = (v >> 16) | (v << 16);\n"
" v = v >> (32 - bit_size);\n"
" return v;\n"
"}\n"
"__kernel void fft_reorder(__global float2* input,\n"
" unsigned int bit_size,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" int batch_num) {\n"
" //unsigned int base_offset = 0;\n"
" unsigned int glb_id = get_global_id(0);\n"
" unsigned int glb_sz = get_global_size(0);\n"
" \n"
" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
" for(unsigned int i = glb_id; i < size; i += glb_sz) {\n"
" unsigned int v = get_reorder_num_2(i, bit_size);\n"
" if(i < v) {\n"
" float2 tmp = input[batch_id * stride + i]; // index\n"
" input[batch_id * stride + i] = input[batch_id * stride + v]; //index\n"
" input[batch_id * stride + v] = tmp; //index\n"
" }\n"
" }\n"
" //base_offset += stride;\n"
" }\n"
"}\n"
; //matrix_row_align1_fft_reorder
429
/**
 * @brief OpenCL C source of the `cpu_inplace_mult` kernel.
 *
 * Same operation as `inplace_mult`, except that the factor is passed by
 * value as a kernel argument instead of being read from a buffer:
 * vec[i] *= factor for every i < size handled by the calling work-item
 * (start get_global_id(0), step get_global_size(0)).
 */
const char * const matrix_row_align1_cpu_inplace_mult =
"__kernel void cpu_inplace_mult(\n"
" __global float * vec,\n"
" float factor, \n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] *= factor;\n"
"}\n"
; //matrix_row_align1_cpu_inplace_mult
440
/**
 * @brief OpenCL C source of the `clear` kernel.
 *
 * Sets vec[i] = 0 for every i < size handled by the calling work-item
 * (start get_global_id(0), step get_global_size(0)).
 */
const char * const matrix_row_align1_clear =
"__kernel void clear(\n"
" __global float * vec,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] = 0;\n"
"}\n"
; //matrix_row_align1_clear
450
/**
 * @brief OpenCL C source of the
 *        `trans_lower_triangular_substitute_inplace` kernel.
 *
 * Walks row = 0 .. matrix_rows-1. In each step work-item 0 first divides
 * vector[row] by the diagonal entry matrix[row + row * matrix_internal_cols];
 * after a barrier every work-item reads temp = vector[row] and subtracts
 * temp * matrix[row * matrix_internal_cols + elim] from vector[elim] for its
 * share of the indices elim > row. matrix_cols and matrix_internal_rows are
 * not read.
 *
 * NOTE(review): barrier() only synchronises the work-items of one
 * work-group while the work split uses global ids, so this presumably has to
 * be enqueued with a single work-group - confirm against the host code.
 */
const char * const matrix_row_align1_trans_lower_triangular_substitute_inplace =
"__kernel void trans_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" if (get_global_id(0) == 0)\n"
" vector[row] /= matrix[row+row*matrix_internal_cols];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim];\n"
" }\n"
"}\n"
; //matrix_row_align1_trans_lower_triangular_substitute_inplace
473
/**
 * @brief OpenCL C source of the `vec_mul` kernel (matrix-vector product).
 *
 * Rows are distributed over work-items (start get_global_id(0), step
 * get_global_size(0)). For each such row the kernel accumulates
 *   sum over col < matrix_cols of
 *     matrix[row * matrix_internal_cols + col] * vector[col]
 * in a private float and stores it in result[row].
 * matrix_internal_rows is accepted but not read by the kernel body.
 */
const char * const matrix_row_align1_vec_mul =
"\n"
"\n"
"\n"
"__kernel void vec_mul(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector, \n"
" __global float * result) \n"
"{ \n"
" for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" float dot_prod = 0.0f;\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" dot_prod += matrix[row*matrix_internal_cols + col] * vector[col];\n"
" result[row] = dot_prod;\n"
" }\n"
"}\n"
"\n"
"\n"
; //matrix_row_align1_vec_mul
498
/**
 * @brief OpenCL C source of the `lu_factorize` kernel (in-place, no
 *        pivoting visible in the kernel).
 *
 * For i = 1 .. matrix_rows-1 and k = 0 .. i-1: work-item 0 divides
 * matrix[i][k] by the diagonal entry matrix[k][k]; after a barrier every
 * work-item reads that quotient and subtracts quotient * matrix[k][j] from
 * matrix[i][j] for its share of the columns j > k. Entries are addressed as
 * matrix[row * matrix_internal_cols + col]. matrix_cols and
 * matrix_internal_rows are not read.
 *
 * NOTE(review): the column loop is bounded by matrix_rows, which equals the
 * column count only for square matrices - presumably the only supported
 * case; confirm. barrier() only synchronises the work-items of one
 * work-group while the work split uses global ids, so this presumably has to
 * be enqueued with a single work-group - confirm against the host code.
 */
const char * const matrix_row_align1_lu_factorize =
"__kernel void lu_factorize(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols) \n"
"{ \n"
" float temp;\n"
" unsigned rowi;\n"
" unsigned rowk;\n"
" for (unsigned int i=1; i<matrix_rows; ++i)\n"
" {\n"
" rowi = i * matrix_internal_cols;\n"
" for (unsigned int k=0; k<i; ++k)\n"
" {\n"
" rowk = k * matrix_internal_cols;\n"
" if (get_global_id(0) == 0)\n"
" matrix[rowi + k] /= matrix[rowk + k];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = matrix[rowi + k];\n"
" \n"
" //parallel subtraction:\n"
" for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0))\n"
" matrix[rowi + j] -= temp * matrix[rowk + j];\n"
" }\n"
" }\n"
"} \n"
; //matrix_row_align1_lu_factorize
528
/**
 * @brief OpenCL C source of the `trans_vec_mul` kernel (product of the
 *        transposed matrix with a vector).
 *
 * `row` runs over the columns of the stored matrix (row < matrix_cols,
 * start get_global_id(0), step get_global_size(0)). For each such row the
 * kernel accumulates
 *   sum over col < matrix_rows of
 *     matrix[row + col * matrix_internal_cols] * vector[col]
 * in a private float and stores it in result[row].
 * matrix_internal_rows is accepted but not read by the kernel body.
 */
const char * const matrix_row_align1_trans_vec_mul =
"__kernel void trans_vec_mul(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector, \n"
" __global float * result) \n"
"{ \n"
" //row and col indicate indices within transposed matrix\n"
" for (unsigned int row = get_global_id(0); row < matrix_cols; row += get_global_size(0))\n"
" {\n"
" float dot_prod2 = 0.0f;\n"
" for (unsigned int col = 0; col < matrix_rows; ++col)\n"
" dot_prod2 += matrix[row + col*matrix_internal_cols] * vector[col];\n"
" result[row] = dot_prod2;\n"
" }\n"
"}\n"
; //matrix_row_align1_trans_vec_mul
549
/**
 * @brief OpenCL C source of the `fft_direct` kernel (direct evaluation of
 *        the discrete Fourier sum, quadratic in `size`).
 *
 * For every batch and every output index k < size handled by the calling
 * work-item (start get_global_id(0), step get_global_size(0)) the kernel
 * sums input[batch_id * stride + n] * (cos(arg), sin(arg)) over all
 * n < size, with arg = sign * 2 * PI * k / size * n, and writes the sum to
 * output[batch_id * stride + k]. Input and output are separate buffers.
 *
 * Change relative to the listed kernel text: NUM_PI is initialised from a
 * float literal (f suffix), so the kernel does not depend on how a device
 * without double support treats an unsuffixed (double) literal.
 */
const char * const matrix_row_align1_fft_direct =
"// naive fourier transform (quadratic complexity, use for reference only)\n"
"__kernel void fft_direct(__global float2* input,\n"
" __global float2* output,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" unsigned int batch_num,\n"
" float sign) {\n"
"// unsigned int base_offset = 0;\n"
" const float NUM_PI = 3.14159265358979323846f;\n"
" \n"
" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
" for(unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) {\n"
" float2 f = 0.0f;\n"
" for(unsigned int n = 0; n < size; n++) {\n"
" float2 in = input[batch_id * stride + n]; //input index here\n"
" float sn, cs;\n"
" float arg = sign * 2 * NUM_PI * k / size * n;\n"
" sn = sincos(arg, &cs);\n"
" float2 ex = (float2)(cs, sn);\n"
" f = f + (float2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x);\n"
" }\n"
" output[batch_id * stride + k] = f;// output index here\n"
" }\n"
"// base_offset += stride;\n"
" }\n"
"}\n"
; //matrix_row_align1_fft_direct
578
/**
 * @brief OpenCL C source of the `inplace_sub` kernel.
 *
 * Performs vec1[i] -= vec2[i] for every i < size handled by the calling
 * work-item (start get_global_id(0), step get_global_size(0)).
 * The buffers are addressed as flat arrays of `size` floats.
 */
const char * const matrix_row_align1_inplace_sub =
"__kernel void inplace_sub(\n"
" __global float * vec1,\n"
" __global const float * vec2,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec1[i] -= vec2[i];\n"
"}\n"
; //matrix_row_align1_inplace_sub
589
590
}
//namespace kernels
591
}
//namespace linalg
592
}
//namespace viennacl
593
#endif
Generated on Fri Jul 27 2012 22:02:50 for ViennaCL - The Vienna Computing Library by Doxygen 1.8.1.2