ViennaCL - The Vienna Computing Library
1.2.0
Main Page
Namespaces
Data Structures
Files
File List
Globals
viennacl
linalg
kernels
matrix_col_source.h
Go to the documentation of this file.
1
#ifndef VIENNACL_LINALG_KERNELS_MATRIX_COL_SOURCE_HPP_
2
#define VIENNACL_LINALG_KERNELS_MATRIX_COL_SOURCE_HPP_
3
//Automatically generated file from auxiliary-directory, do not edit manually!
4
namespace
viennacl
5
{
6
namespace
linalg
7
{
8
namespace
kernels
9
{
10
/**
 * OpenCL C source of the `rank1_update` kernel (single precision).
 *
 * For every row handled by a work-item (starting at get_global_id(0) and
 * advancing by get_global_size(0)) the kernel adds
 * vector1[row] * vector2[col] to matrix[row + col * matrix_internal_rows]
 * for all col < matrix_cols, i.e. column-major addressing with leading
 * dimension matrix_internal_rows.
 *
 * Note: the comment embedded in the kernel text says "A += x * x^T", but the
 * code uses two independent vectors (A += vector1 * vector2^T).
 * matrix_internal_cols is accepted but not read by the kernel body.
 */
const char * const matrix_col_align1_rank1_update =
  "//perform a rank-1 update of the matrix, i.e. A += x * x^T\n"
  "__kernel void rank1_update(\n"
  " __global float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global const float * vector1, \n"
  " __global const float * vector2) \n"
  "{ \n"
  " float tmp;\n"
  " for (unsigned int row= get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
  " {\n"
  " tmp = vector1[row];\n"
  " for (unsigned int col = 0; col < matrix_cols; ++col)\n"
  " matrix[row + col * matrix_internal_rows] += tmp * vector2[col];\n"
  " }\n"
  "}\n"
; //matrix_col_align1_rank1_update
/**
 * OpenCL C source of the `inplace_mult` kernel (single precision).
 *
 * Reads the scale factor from the __global buffer `fac` (first element) and
 * multiplies the first `size` entries of `vec` by it. Each work-item starts
 * at get_global_id(0) and advances by get_global_size(0).
 */
const char * const matrix_col_align1_inplace_mult =
  "__kernel void inplace_mult(\n"
  " __global float * vec,\n"
  " __global const float * fac, \n"
  " unsigned int size) \n"
  "{ \n"
  " float factor = *fac;\n"
  " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
  " vec[i] *= factor;\n"
  "}\n"
; //matrix_col_align1_inplace_mult
/**
 * OpenCL C source of the `fft_radix2_local` kernel plus its helper
 * `get_reorder_num` (single precision, complex values stored as float2).
 *
 * get_reorder_num() reverses the 32 bits of v and shifts the result right by
 * (32 - bit_size), i.e. it returns the bit-reversed index within bit_size bits.
 *
 * The kernel processes one batch per work-group iteration
 * (batch_id = group id, advancing by the number of groups):
 *   1. copies element p of the batch (input[p * stride + batch_id]) into
 *      lcl_input at its bit-reversed position,
 *   2. runs bit_size radix-2 butterfly stages on lcl_input, separated by
 *      local-memory barriers,
 *   3. copies lcl_input back to the same global positions.
 *
 * Changes relative to the generated text this block replaces:
 *   - The butterfly loop now runs over size/2 work items per stage. With
 *     tid in [size/2, size) the computed pos is >= size, so the old bound
 *     `tid < size` touched lcl_input[size .. 2*size-1], outside the `size`
 *     elements that are loaded and written back. The global-memory variant
 *     (`fft_radix2`) already uses size >> 1.
 *   - A barrier is issued after the copy-back, so a work-item that proceeds
 *     to the next batch cannot overwrite lcl_input entries another work-item
 *     of the same group has not yet copied back.
 *
 * NOTE(review): this file is marked as generated from the auxiliary
 * directory; the same change has to be made in the generator input.
 * NOTE(review): lcl_input must hold at least `size` float2 values - confirm
 * against the host-side local memory argument.
 */
const char * const matrix_col_align1_fft_radix2_local =
  "unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) {\n"
  " v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
  " v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
  " v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
  " v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
  " v = (v >> 16) | (v << 16);\n"
  " v = v >> (32 - bit_size);\n"
  " return v;\n"
  "}\n"
  "__kernel void fft_radix2_local(__global float2* input,\n"
  " __local float2* lcl_input,\n"
  " unsigned int bit_size,\n"
  " unsigned int size,\n"
  " unsigned int stride,\n"
  " unsigned int batch_num,\n"
  " float sign) {\n"
  " unsigned int grp_id = get_group_id(0);\n"
  " unsigned int grp_num = get_num_groups(0);\n"
  " unsigned int lcl_sz = get_local_size(0);\n"
  " unsigned int lcl_id = get_local_id(0);\n"
  " const float NUM_PI = 3.14159265358979323846;\n"
  " for(unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) {\n"
  " //unsigned int base_offset = stride * batch_id;\n"
  " //copy chunk of global memory to local\n"
  " for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
  " unsigned int v = get_reorder_num(p, bit_size);\n"
  " lcl_input[v] = input[p * stride + batch_id];//index\n"
  " }\n"
  " barrier(CLK_LOCAL_MEM_FENCE);\n"
  " //performs Cooley-Tukey FFT on local array\n"
  " for(unsigned int s = 0; s < bit_size; s++) {\n"
  " unsigned int ss = 1 << s;\n"
  " float cs, sn;\n"
  " for(unsigned int tid = lcl_id; tid < (size >> 1); tid += lcl_sz) {\n"
  " unsigned int group = (tid & (ss - 1));\n"
  " unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
  " float2 in1 = lcl_input[pos];\n"
  " float2 in2 = lcl_input[pos + ss];\n"
  " float arg = group * sign * NUM_PI / ss;\n"
  " sn = sincos(arg, &cs);\n"
  " float2 ex = (float2)(cs, sn);\n"
  " float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
  " lcl_input[pos + ss] = in1 - tmp;\n"
  " lcl_input[pos] = in1 + tmp;\n"
  " }\n"
  " barrier(CLK_LOCAL_MEM_FENCE);\n"
  " }\n"
  " //copy local array back to global memory\n"
  " for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
  " input[p * stride + batch_id] = lcl_input[p];//index\n"
  " }\n"
  " //make sure the copy-back is complete before the next batch reuses lcl_input\n"
  " barrier(CLK_LOCAL_MEM_FENCE);\n"
  " }\n"
  "}\n"
; //matrix_col_align1_fft_radix2_local
/**
 * OpenCL C source of the `inplace_divide` kernel (single precision).
 *
 * Reads the divisor from the __global buffer `fac` (first element) and
 * divides the first `size` entries of `vec` by it. Each work-item starts at
 * get_global_id(0) and advances by get_global_size(0). No check for a zero
 * divisor is made in the kernel.
 */
const char * const matrix_col_align1_inplace_divide =
  "__kernel void inplace_divide(\n"
  " __global float * vec,\n"
  " __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
  " unsigned int size) \n"
  "{ \n"
  " float factor = *fac;\n"
  " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
  " vec[i] /= factor;\n"
  "}\n"
; //matrix_col_align1_inplace_divide
/**
 * OpenCL C source of the `trans_unit_lower_triangular_substitute_inplace`
 * kernel (single precision).
 *
 * Forward substitution on `vector`, in place: for row = 0 .. matrix_rows-1
 * every work-item reads vector[row] and subtracts
 * vector[row] * matrix[elim * matrix_internal_rows + row] from vector[elim]
 * for its share of the indices elim > row. The matrix is read at
 * (row, elim) in column-major addressing, i.e. the transposed operand is
 * applied. The diagonal is never read (unit diagonal).
 *
 * NOTE(review): barrier() only synchronizes the work-items of one
 * work-group, so the per-row ordering presumably relies on the kernel being
 * launched with a single work-group - confirm against the host code.
 */
const char * const matrix_col_align1_trans_unit_lower_triangular_substitute_inplace =
  "\n"
  "__kernel void trans_unit_lower_triangular_substitute_inplace(\n"
  " __global const float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global float * vector)\n"
  "{\n"
  " float temp;\n"
  " for (int row = 0; row < matrix_rows; ++row)\n"
  " {\n"
  " barrier(CLK_GLOBAL_MEM_FENCE);\n"
  "\n"
  " temp = vector[row];\n"
  "\n"
  " for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
  " vector[elim] -= temp * matrix[elim * matrix_internal_rows + row];\n"
  " }\n"
  "}\n"
  "\n"
  "\n"
; //matrix_col_align1_trans_unit_lower_triangular_substitute_inplace
/**
 * OpenCL C source of the `lower_triangular_substitute_inplace` kernel
 * (single precision).
 *
 * Forward substitution on `vector`, in place: for row = 0 .. matrix_rows-1
 * the work-item with global id 0 divides vector[row] by the diagonal entry
 * matrix[row + row * matrix_internal_rows]; after a barrier every work-item
 * subtracts vector[row] * matrix[row * matrix_internal_rows + elim] from
 * vector[elim] for its share of the indices elim > row (column-major
 * addressing of element (elim, row)).
 *
 * NOTE(review): barrier() only synchronizes the work-items of one
 * work-group, so this presumably requires a single-work-group launch -
 * confirm against the host code.
 */
const char * const matrix_col_align1_lower_triangular_substitute_inplace =
  "__kernel void lower_triangular_substitute_inplace(\n"
  " __global const float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global float * vector)\n"
  "{\n"
  " float temp;\n"
  " for (int row = 0; row < matrix_rows; ++row)\n"
  " {\n"
  " barrier(CLK_GLOBAL_MEM_FENCE);\n"
  " if (get_global_id(0) == 0)\n"
  " vector[row] /= matrix[row+row*matrix_internal_rows];\n"
  " barrier(CLK_GLOBAL_MEM_FENCE);\n"
  " temp = vector[row];\n"
  " for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
  " vector[elim] -= temp * matrix[row * matrix_internal_rows + elim];\n"
  " }\n"
  "}\n"
; //matrix_col_align1_lower_triangular_substitute_inplace
/**
 * OpenCL C source of the `inplace_add` kernel (single precision).
 *
 * Uses a two-dimensional index space: the work-item with ids
 * (get_global_id(0), get_global_id(1)) = (i, j) performs, if i < A_row_size
 * and j < A_col_size,
 *   A[(i + A_row_start) + (j + A_col_start) * A_internal_rows]
 *     += B[(i + B_row_start) + (j + B_col_start) * B_internal_rows]
 * (column-major addressing of a sub-block of each matrix). There is no loop,
 * so entries beyond the launched global range are not touched.
 * A_internal_cols, B_row_size, B_col_size and B_internal_cols are accepted
 * but not read by the kernel body.
 */
const char * const matrix_col_align1_inplace_add =
  "__kernel void inplace_add(\n"
  " __global float * A,\n"
  " unsigned int A_row_start,\n"
  " unsigned int A_col_start,\n"
  " unsigned int A_row_size,\n"
  " unsigned int A_col_size,\n"
  " unsigned int A_internal_rows,\n"
  " unsigned int A_internal_cols,\n"
  " __global const float * B, \n"
  " unsigned int B_row_start,\n"
  " unsigned int B_col_start,\n"
  " unsigned int B_row_size,\n"
  " unsigned int B_col_size,\n"
  " unsigned int B_internal_rows,\n"
  " unsigned int B_internal_cols)\n"
  "{ \n"
  " if ( get_global_id(0) < A_row_size\n"
  " && get_global_id(1) < A_col_size\n"
  " )\n"
  " A[ (get_global_id(0) + A_row_start)\n"
  " + (get_global_id(1) + A_col_start) * A_internal_rows] \n"
  " += B[ (get_global_id(0) + B_row_start)\n"
  " + (get_global_id(1) + B_col_start) * B_internal_rows];\n"
  "}\n"
; //matrix_col_align1_inplace_add
/**
 * OpenCL C source of the `add` kernel (single precision).
 *
 * Writes result[i] = vec1[i] + vec2[i] for i < size. Each work-item starts
 * at get_global_id(0) and advances by get_global_size(0).
 */
const char * const matrix_col_align1_add =
  "__kernel void add(\n"
  " __global const float * vec1,\n"
  " __global const float * vec2, \n"
  " __global float * result,\n"
  " unsigned int size) \n"
  "{ \n"
  " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
  " result[i] = vec1[i] + vec2[i];\n"
  "}\n"
; //matrix_col_align1_add
/**
 * OpenCL C source of the `sub` kernel (single precision).
 *
 * Writes result[i] = vec1[i] - vec2[i] for i < size. Each work-item starts
 * at get_global_id(0) and advances by get_global_size(0).
 */
const char * const matrix_col_align1_sub =
  "__kernel void sub(\n"
  " __global const float * vec1,\n"
  " __global const float * vec2, \n"
  " __global float * result,\n"
  " unsigned int size)\n"
  "{ \n"
  " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
  " result[i] = vec1[i] - vec2[i];\n"
  "}\n"
; //matrix_col_align1_sub
/**
 * OpenCL C source of the `trans_unit_upper_triangular_substitute_inplace`
 * kernel (single precision).
 *
 * Backward substitution on `vector`, in place: for row = matrix_rows-1 down
 * to 0 every work-item reads vector[row] and subtracts
 * vector[row] * matrix[row + elim * matrix_internal_rows] from vector[elim]
 * for its share of the indices elim < row. The matrix is read at
 * (row, elim) in column-major addressing, i.e. from the stored lower
 * triangle, which matches the embedded "transposed lower triangular matrix"
 * remark. The diagonal is never read (unit diagonal).
 *
 * NOTE(review): barrier() only synchronizes the work-items of one
 * work-group, so this presumably requires a single-work-group launch -
 * confirm against the host code.
 */
const char * const matrix_col_align1_trans_unit_upper_triangular_substitute_inplace =
  "//transposed lower triangular matrix\n"
  "__kernel void trans_unit_upper_triangular_substitute_inplace(\n"
  " __global const float * matrix, \n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global float * vector) \n"
  "{ \n"
  " float temp; \n"
  " for (int row = matrix_rows-1; row > -1; --row) \n"
  " { \n"
  " barrier(CLK_GLOBAL_MEM_FENCE); \n"
  " \n"
  " temp = vector[row]; \n"
  " //eliminate column with index 'row' in parallel: \n"
  " for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
  " vector[elim] -= temp * matrix[row + elim * matrix_internal_rows]; \n"
  " } \n"
  " \n"
  "}\n"
; //matrix_col_align1_trans_unit_upper_triangular_substitute_inplace
/**
 * OpenCL C source of the `scaled_rank1_update` kernel (single precision).
 *
 * For every row handled by a work-item (starting at get_global_id(0) and
 * advancing by get_global_size(0)) the kernel adds
 * val * vector1[row] * vector2[col] to
 * matrix[row + col * matrix_internal_rows] for all col < matrix_cols
 * (column-major addressing). `val` is passed by value.
 * matrix_internal_cols is accepted but not read by the kernel body.
 */
const char * const matrix_col_align1_scaled_rank1_update =
  "__kernel void scaled_rank1_update(\n"
  " __global float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " float val,\n"
  " __global const float * vector1, \n"
  " __global const float * vector2) \n"
  "{ \n"
  " float tmp;\n"
  " for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
  " {\n"
  " tmp = val * vector1[row];\n"
  " for (unsigned int col = 0; col < matrix_cols; ++col)\n"
  " matrix[row + col*matrix_internal_rows] += tmp * vector2[col];\n"
  " }\n"
  "}\n"
; //matrix_col_align1_scaled_rank1_update
/**
 * OpenCL C source of the `unit_upper_triangular_substitute_inplace` kernel
 * (single precision).
 *
 * Backward substitution on `vector`, in place: for row = matrix_rows-1 down
 * to 0 every work-item reads vector[row] and subtracts
 * vector[row] * matrix[elim + row * matrix_internal_rows] from vector[elim]
 * for its share of the indices elim < row (column-major addressing of
 * element (elim, row), i.e. the stored upper triangle). The diagonal is
 * never read (unit diagonal).
 *
 * NOTE(review): barrier() only synchronizes the work-items of one
 * work-group, so this presumably requires a single-work-group launch -
 * confirm against the host code.
 */
const char * const matrix_col_align1_unit_upper_triangular_substitute_inplace =
  "__kernel void unit_upper_triangular_substitute_inplace( \n"
  " __global const float * matrix, \n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global float * vector) \n"
  "{ \n"
  " float temp; \n"
  " for (int row = matrix_rows-1; row > -1; --row) \n"
  " { \n"
  " barrier(CLK_GLOBAL_MEM_FENCE); \n"
  " \n"
  " temp = vector[row]; \n"
  " //eliminate column with index 'row' in parallel: \n"
  " for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
  " vector[elim] -= temp * matrix[elim + row * matrix_internal_rows]; \n"
  " } \n"
  " \n"
  "}\n"
; //matrix_col_align1_unit_upper_triangular_substitute_inplace
/**
 * OpenCL C source of the `trans_upper_triangular_substitute_inplace` kernel
 * (single precision).
 *
 * Backward substitution on `vector`, in place: for row = matrix_rows-1 down
 * to 0 the work-item with global id 0 divides vector[row] by the diagonal
 * entry matrix[row + row * matrix_internal_rows]; after a barrier every
 * work-item subtracts vector[row] * matrix[row + elim * matrix_internal_rows]
 * from vector[elim] for its share of the indices elim < row. The matrix is
 * read at (row, elim) in column-major addressing, i.e. from the stored
 * lower triangle ("transposed lower triangular matrix").
 *
 * NOTE(review): barrier() only synchronizes the work-items of one
 * work-group, so this presumably requires a single-work-group launch -
 * confirm against the host code.
 */
const char * const matrix_col_align1_trans_upper_triangular_substitute_inplace =
  "//transposed lower triangular matrix\n"
  "__kernel void trans_upper_triangular_substitute_inplace(\n"
  " __global const float * matrix, \n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global float * vector) \n"
  "{ \n"
  " float temp; \n"
  " for (int row = matrix_rows-1; row > -1; --row) \n"
  " { \n"
  " barrier(CLK_GLOBAL_MEM_FENCE); \n"
  " if (get_global_id(0) == 0) \n"
  " vector[row] /= matrix[row + row*matrix_internal_rows]; \n"
  " \n"
  " barrier(CLK_GLOBAL_MEM_FENCE); \n"
  " temp = vector[row]; \n"
  " //eliminate column with index 'row' in parallel: \n"
  " for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
  " vector[elim] -= temp * matrix[row + elim * matrix_internal_rows]; \n"
  " } \n"
  " \n"
  "}\n"
; //matrix_col_align1_trans_upper_triangular_substitute_inplace
/**
 * OpenCL C source of the `upper_triangular_substitute_inplace` kernel
 * (single precision).
 *
 * Backward substitution on `vector`, in place: for row = matrix_rows-1 down
 * to 0 the work-item with global id 0 divides vector[row] by the diagonal
 * entry matrix[row + row * matrix_internal_rows]; after a barrier every
 * work-item subtracts vector[row] * matrix[elim + row * matrix_internal_rows]
 * from vector[elim] for its share of the indices elim < row (column-major
 * addressing of element (elim, row), i.e. the stored upper triangle).
 *
 * NOTE(review): barrier() only synchronizes the work-items of one
 * work-group, so this presumably requires a single-work-group launch -
 * confirm against the host code.
 */
const char * const matrix_col_align1_upper_triangular_substitute_inplace =
  "__kernel void upper_triangular_substitute_inplace( \n"
  " __global const float * matrix, \n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global float * vector) \n"
  "{ \n"
  " float temp; \n"
  " for (int row = matrix_rows-1; row > -1; --row) \n"
  " { \n"
  " barrier(CLK_GLOBAL_MEM_FENCE); \n"
  " if (get_global_id(0) == 0) \n"
  " vector[row] /= matrix[row + row*matrix_internal_rows]; \n"
  " \n"
  " barrier(CLK_GLOBAL_MEM_FENCE); \n"
  " temp = vector[row]; \n"
  " //eliminate column with index 'row' in parallel: \n"
  " for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
  " vector[elim] -= temp * matrix[elim + row * matrix_internal_rows]; \n"
  " } \n"
  " \n"
  "}\n"
; //matrix_col_align1_upper_triangular_substitute_inplace
/**
 * OpenCL C source of the `unit_lower_triangular_substitute_inplace` kernel
 * (single precision).
 *
 * Forward substitution on `vector`, in place: for row = 0 .. matrix_rows-1
 * every work-item reads vector[row] and subtracts
 * vector[row] * matrix[row * matrix_internal_rows + elim] from vector[elim]
 * for its share of the indices elim > row (column-major addressing of
 * element (elim, row), i.e. the stored lower triangle). The diagonal is
 * never read (unit diagonal).
 *
 * NOTE(review): barrier() only synchronizes the work-items of one
 * work-group, so this presumably requires a single-work-group launch -
 * confirm against the host code.
 */
const char * const matrix_col_align1_unit_lower_triangular_substitute_inplace =
  "__kernel void unit_lower_triangular_substitute_inplace(\n"
  " __global const float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global float * vector)\n"
  "{\n"
  " float temp;\n"
  " for (int row = 0; row < matrix_rows; ++row)\n"
  " {\n"
  " barrier(CLK_GLOBAL_MEM_FENCE);\n"
  " temp = vector[row];\n"
  " for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
  " vector[elim] -= temp * matrix[row * matrix_internal_rows + elim];\n"
  " }\n"
  "}\n"
; //matrix_col_align1_unit_lower_triangular_substitute_inplace
/**
 * OpenCL C source of the `fft_radix2` kernel (single precision, complex
 * values stored as float2).
 *
 * Performs ONE radix-2 butterfly stage, selected by the argument `s`
 * (ss = 1 << s), directly in global memory. For every batch and every
 * butterfly index tid < size / 2 the kernel combines the two elements at
 * pos and pos + ss (element p of a batch lives at input[p * stride + batch_id])
 * using the factor (cos(arg), sin(arg)) with arg = group * sign * pi / ss.
 * Work-items start at get_global_id(0) and advance by get_global_size(0).
 * `bit_size` is accepted but not read by the kernel body.
 *
 * NOTE(review): the kernel contains no reordering step, so the input is
 * presumably expected in bit-reversed order and the host presumably launches
 * the kernel once per stage - confirm against the caller.
 */
const char * const matrix_col_align1_fft_radix2 =
  "__kernel void fft_radix2(__global float2* input,\n"
  " unsigned int s,\n"
  " unsigned int bit_size,\n"
  " unsigned int size,\n"
  " unsigned int stride,\n"
  " unsigned int batch_num,\n"
  " float sign) {\n"
  " unsigned int ss = 1 << s;\n"
  " unsigned int half_size = size >> 1;\n"
  " float cs, sn;\n"
  " const float NUM_PI = 3.14159265358979323846;\n"
  " unsigned int glb_id = get_global_id(0);\n"
  " unsigned int glb_sz = get_global_size(0);\n"
  " \n"
  " for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
  " for(unsigned int tid = glb_id; tid < half_size; tid += glb_sz) {\n"
  " unsigned int group = (tid & (ss - 1));\n"
  " unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
  " unsigned int offset = pos * stride + batch_id;\n"
  " float2 in1 = input[offset];//index\n"
  " float2 in2 = input[offset + ss * stride];//index\n"
  " float arg = group * sign * NUM_PI / ss;\n"
  " sn = sincos(arg, &cs);\n"
  " float2 ex = (float2)(cs, sn);\n"
  " float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
  " input[offset + ss * stride] = in1 - tmp;//index\n"
  " input[offset] = in1 + tmp;//index\n"
  " }\n"
  " }\n"
  "}\n"
; //matrix_col_align1_fft_radix2
/**
 * OpenCL C source of the `fft_reorder` kernel plus its helper
 * `get_reorder_num_2` (complex values stored as float2).
 *
 * get_reorder_num_2() reverses the 32 bits of v and shifts the result right
 * by (32 - bit_size), i.e. it returns the bit-reversed index within
 * bit_size bits.
 *
 * For every batch and every index i < size the kernel computes the
 * bit-reversed index v and, only when i < v, swaps the elements at
 * i * stride + batch_id and v * stride + batch_id, so each pair is swapped
 * once. Work-items start at get_global_id(0) and advance by
 * get_global_size(0).
 *
 * Note: batch_num is declared as (signed) int here, while the loop counter
 * compared against it is unsigned int.
 */
const char * const matrix_col_align1_fft_reorder =
  "/*\n"
  "* Performs reordering of input data in bit-reversal order\n"
  "* Probably it's better to do in host side,\n"
  "*/\n"
  "unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) {\n"
  " v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
  " v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
  " v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
  " v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
  " v = (v >> 16) | (v << 16);\n"
  " v = v >> (32 - bit_size);\n"
  " return v;\n"
  "}\n"
  "__kernel void fft_reorder(__global float2* input,\n"
  " unsigned int bit_size,\n"
  " unsigned int size,\n"
  " unsigned int stride,\n"
  " int batch_num) {\n"
  " unsigned int glb_id = get_global_id(0);\n"
  " unsigned int glb_sz = get_global_size(0);\n"
  " \n"
  " for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
  " for(unsigned int i = glb_id; i < size; i += glb_sz) {\n"
  " unsigned int v = get_reorder_num_2(i, bit_size);\n"
  " if(i < v) {\n"
  " float2 tmp = input[i * stride + batch_id]; // index\n"
  " input[i * stride + batch_id] = input[v * stride + batch_id]; //index\n"
  " input[v * stride + batch_id] = tmp; //index\n"
  " }\n"
  " }\n"
  " }\n"
  "}\n"
; //matrix_col_align1_fft_reorder
/**
 * OpenCL C source of the `cpu_inplace_mult` kernel (single precision).
 *
 * Multiplies the first `size` entries of `vec` by `factor`, which is passed
 * by value as a kernel argument (in contrast to `inplace_mult`, which reads
 * it from a __global buffer). Each work-item starts at get_global_id(0) and
 * advances by get_global_size(0).
 */
const char * const matrix_col_align1_cpu_inplace_mult =
  "__kernel void cpu_inplace_mult(\n"
  " __global float * vec,\n"
  " float factor, \n"
  " unsigned int size) \n"
  "{ \n"
  " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
  " vec[i] *= factor;\n"
  "}\n"
; //matrix_col_align1_cpu_inplace_mult
/**
 * OpenCL C source of the `clear` kernel (single precision).
 *
 * Sets the first `size` entries of `vec` to 0. Each work-item starts at
 * get_global_id(0) and advances by get_global_size(0).
 */
const char * const matrix_col_align1_clear =
  "__kernel void clear(\n"
  " __global float * vec,\n"
  " unsigned int size) \n"
  "{ \n"
  " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
  " vec[i] = 0;\n"
  "}\n"
; //matrix_col_align1_clear
/**
 * OpenCL C source of the `trans_lower_triangular_substitute_inplace` kernel
 * (single precision).
 *
 * Forward substitution on `vector`, in place: for row = 0 .. matrix_rows-1
 * the work-item with global id 0 divides vector[row] by the diagonal entry
 * matrix[row + row * matrix_internal_rows]; after a barrier every work-item
 * subtracts vector[row] * matrix[elim * matrix_internal_rows + row] from
 * vector[elim] for its share of the indices elim > row. The matrix is read
 * at (row, elim) in column-major addressing, i.e. the transposed operand is
 * applied.
 *
 * NOTE(review): barrier() only synchronizes the work-items of one
 * work-group, so this presumably requires a single-work-group launch -
 * confirm against the host code.
 */
const char * const matrix_col_align1_trans_lower_triangular_substitute_inplace =
  "__kernel void trans_lower_triangular_substitute_inplace(\n"
  " __global const float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global float * vector)\n"
  "{\n"
  " float temp;\n"
  " for (int row = 0; row < matrix_rows; ++row)\n"
  " {\n"
  " barrier(CLK_GLOBAL_MEM_FENCE);\n"
  " if (get_global_id(0) == 0)\n"
  " vector[row] /= matrix[row+row*matrix_internal_rows];\n"
  " barrier(CLK_GLOBAL_MEM_FENCE);\n"
  " temp = vector[row];\n"
  " for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
  " vector[elim] -= temp * matrix[elim * matrix_internal_rows + row];\n"
  " }\n"
  "}\n"
; //matrix_col_align1_trans_lower_triangular_substitute_inplace
/**
 * OpenCL C source of the `vec_mul` kernel (single precision
 * matrix-vector product).
 *
 * For every row handled by a work-item (starting at get_global_id(0) and
 * advancing by get_global_size(0)) the kernel accumulates
 * matrix[row + col * matrix_internal_rows] * vector[col] over
 * col < matrix_cols (column-major addressing) and stores the sum in
 * result[row]. matrix_internal_cols is accepted but not read by the kernel
 * body.
 */
const char * const matrix_col_align1_vec_mul =
  "__kernel void vec_mul(\n"
  " __global const float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global const float * vector, \n"
  " __global float * result) \n"
  "{ \n"
  " for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
  " {\n"
  " float dot_prod = 0.0f;\n"
  " for (unsigned int col = 0; col < matrix_cols; ++col)\n"
  " dot_prod += matrix[row + col*matrix_internal_rows] * vector[col];\n"
  " result[row] = dot_prod;\n"
  " }\n"
  "}\n"
; //matrix_col_align1_vec_mul
/**
 * OpenCL C source of the `lu_factorize` kernel (single precision).
 *
 * In-place elimination without any pivoting step: for each row i >= 1 and
 * each k < i, the work-item with global id 0 divides element (i, k) by the
 * diagonal element (k, k); after a barrier every work-item reads that
 * multiplier and subtracts multiplier * element (k, j) from element (i, j)
 * for its share of the columns j > k. Element (r, c) is addressed as
 * matrix[r + c * matrix_internal_rows] (column-major). The multipliers stay
 * stored in place of the eliminated entries. matrix_internal_cols is
 * accepted but not read by the kernel body.
 *
 * NOTE(review): barrier() only synchronizes the work-items of one
 * work-group, so this presumably requires a single-work-group launch -
 * confirm against the host code.
 */
const char * const matrix_col_align1_lu_factorize =
  "__kernel void lu_factorize(\n"
  " __global float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols) \n"
  "{ \n"
  " float temp;\n"
  " for (unsigned int i=1; i<matrix_rows; ++i)\n"
  " {\n"
  " for (unsigned int k=0; k<i; ++k)\n"
  " {\n"
  " if (get_global_id(0) == 0)\n"
  " matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows];\n"
  " barrier(CLK_GLOBAL_MEM_FENCE);\n"
  " temp = matrix[i + k*matrix_internal_rows];\n"
  " \n"
  " //parallel subtraction:\n"
  " for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0))\n"
  " matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows];\n"
  " }\n"
  " }\n"
  "} \n"
; //matrix_col_align1_lu_factorize
/**
 * OpenCL C source of the `trans_vec_mul` kernel (single precision product
 * of the transposed matrix with a vector).
 *
 * `row` and `col` inside the kernel are indices of the transposed matrix:
 * for every row < matrix_cols handled by a work-item (starting at
 * get_global_id(0) and advancing by get_global_size(0)) the kernel
 * accumulates matrix[row * matrix_internal_rows + col] * vector[col] over
 * col < matrix_rows and stores the sum in result[row].
 * matrix_internal_cols is accepted but not read by the kernel body.
 */
const char * const matrix_col_align1_trans_vec_mul =
  "__kernel void trans_vec_mul(\n"
  " __global const float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global const float * vector, \n"
  " __global float * result) \n"
  "{ \n"
  " //row and col indicate indices within transposed matrix\n"
  " for (unsigned int row = get_global_id(0); row < matrix_cols; row += get_global_size(0))\n"
  " {\n"
  " float dot_prod2 = 0.0f;\n"
  " for (unsigned int col = 0; col < matrix_rows; ++col)\n"
  " dot_prod2 += matrix[row * matrix_internal_rows + col] * vector[col];\n"
  " result[row] = dot_prod2;\n"
  " }\n"
  "}\n"
; //matrix_col_align1_trans_vec_mul
/**
 * OpenCL C source of the `fft_direct` kernel (single precision, complex
 * values stored as float2).
 *
 * Evaluates the discrete Fourier sum directly: for every batch and every
 * output index k < size it accumulates, over n < size, the complex product
 * of input[n * stride + batch_id] with (cos(arg), sin(arg)), where
 * arg = sign * 2 * pi * k / size * n, and writes the sum to
 * output[k * stride + batch_id]. The two nested loops over `size` give the
 * quadratic cost mentioned in the embedded comment. Work-items start at
 * get_global_id(0) and advance by get_global_size(0).
 */
const char * const matrix_col_align1_fft_direct =
  "// Direct FFT computation (quadratic complexity - use for reference only)\n"
  "__kernel void fft_direct(__global float2* input,\n"
  " __global float2* output,\n"
  " unsigned int size,\n"
  " unsigned int stride,\n"
  " unsigned int batch_num,\n"
  " float sign) {\n"
  " \n"
  " const float NUM_PI = 3.14159265358979323846;\n"
  " \n"
  " for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
  " for(unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) {\n"
  " float2 f = 0.0f;\n"
  " for(unsigned int n = 0; n < size; n++) {\n"
  " float2 in = input[n * stride + batch_id]; //input index here\n"
  " float sn, cs;\n"
  " float arg = sign * 2 * NUM_PI * k / size * n;\n"
  " sn = sincos(arg, &cs);\n"
  " float2 ex = (float2)(cs, sn);\n"
  " f = f + (float2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x);\n"
  " }\n"
  " output[k * stride + batch_id] = f;// output index here\n"
  " }\n"
  " }\n"
  "}\n"
; //matrix_col_align1_fft_direct
/**
 * OpenCL C source of the `inplace_sub` kernel (single precision).
 *
 * Performs vec1[i] -= vec2[i] for i < size. Each work-item starts at
 * get_global_id(0) and advances by get_global_size(0).
 */
const char * const matrix_col_align1_inplace_sub =
  "__kernel void inplace_sub(\n"
  " __global float * vec1,\n"
  " __global const float * vec2,\n"
  " unsigned int size) \n"
  "{ \n"
  " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
  " vec1[i] -= vec2[i];\n"
  "}\n"
; //matrix_col_align1_inplace_sub
}
//namespace kernels
570
}
//namespace linalg
571
}
//namespace viennacl
572
#endif
Generated on Fri Jul 27 2012 22:02:50 for ViennaCL - The Vienna Computing Library by Doxygen 1.8.1.2