ViennaCL - The Vienna Computing Library
1.2.0
Main Page
Namespaces
Data Structures
Files
File List
Globals
viennacl
linalg
kernels
vector_source.h
Go to the documentation of this file.
1
#ifndef VIENNACL_LINALG_KERNELS_VECTOR_SOURCE_HPP_
2
#define VIENNACL_LINALG_KERNELS_VECTOR_SOURCE_HPP_
3
//Automatically generated file from auxiliary-directory, do not edit manually!
4
namespace
viennacl
5
{
6
namespace
linalg
7
{
8
namespace
kernels
9
{
10
// OpenCL source: in-place scaling vec[start1 .. start1+size1) *= *fac,
// with the scalar factor read from device memory (GPU scalar variant).
const char * const vector_align1_inplace_mult = 
"__kernel void inplace_mult(\n"
" __global float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec[i+start1] *= factor;\n"
"}\n"
; //vector_align1_inplace_mult
22
23
// OpenCL source: vec1 += vec2 * (*fac), element-wise over size1 entries,
// factor read from device memory.
const char * const vector_align1_inplace_mul_add = 
"__kernel void inplace_mul_add(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global const float * fac) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec1[i+start1] += vec2[i+start2] * factor;\n"
"}\n"
; //vector_align1_inplace_mul_add
38
39
// OpenCL source: vec1 -= vec2 / (*fac), element-wise; factor read from device memory.
const char * const vector_align1_inplace_div_sub = 
"///// divide substract:\n"
"__kernel void inplace_div_sub(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global const float * fac) //CPU variant is mapped to mult_add\n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec1[i+start1] -= vec2[i+start2] / factor;\n"
"}\n"
; //vector_align1_inplace_div_sub
55
56
// OpenCL source: in-place division vec /= *fac; factor read from device memory.
const char * const vector_align1_inplace_divide = 
"__kernel void inplace_divide(\n"
" __global float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac) //note: CPU variant is mapped to prod_scalar\n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec[i+start1] /= factor;\n"
"}\n"
; //vector_align1_inplace_divide
68
69
// OpenCL source: vec1 -= vec2 * (*fac), element-wise; factor read from device memory.
const char * const vector_align1_inplace_mul_sub = 
"__kernel void inplace_mul_sub(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global const float * fac) //CPU variant is mapped to mult_add\n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec1[i+start1] -= vec2[i+start2] * factor;\n"
"}\n"
; //vector_align1_inplace_mul_sub
84
85
// OpenCL source: vec1 += vec2, element-wise over size1 entries.
const char * const vector_align1_inplace_add = 
"__kernel void inplace_add(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec1[i+start1] += vec2[i+start2];\n"
"}\n"
; //vector_align1_inplace_add
98
99
// OpenCL source: result = vec1 + vec2, element-wise into a third vector.
const char * const vector_align1_add = 
"__kernel void add(\n"
" __global const float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float * result,\n"
" unsigned int start3,\n"
" unsigned int size3) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] + vec2[i+start2];\n"
"}\n"
; //vector_align1_add
115
116
// OpenCL source: result = vec1 - vec2, element-wise into a third vector.
const char * const vector_align1_sub = 
"__kernel void sub(\n"
" __global const float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2, \n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float * result,\n"
" unsigned int start3,\n"
" unsigned int size3)\n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] - vec2[i+start2];\n"
"}\n"
; //vector_align1_sub
132
133
// OpenCL source: two-stage inner product. Each work-group computes the partial
// dot product of its slice (local-memory tree reduction), and work-item 0 of
// each group writes the partial result to group_buffer; the final summation
// over group_buffer is done elsewhere (host or a second kernel).
const char * const vector_align1_inner_prod = 
"//helper:\n"
"void helper_inner_prod_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (get_local_id(0) < stride)\n"
" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
" }\n"
"}\n"
"//////// inner products:\n"
"float impl_inner_prod(\n"
" __global const float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __local float * tmp_buffer)\n"
"{\n"
" float tmp = 0;\n"
" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0))\n"
" tmp += vec1[i+start1] * vec2[i+start2];\n"
" tmp_buffer[get_local_id(0)] = tmp;\n"
" \n"
" helper_inner_prod_parallel_reduction(tmp_buffer);\n"
" \n"
" return tmp_buffer[0];\n"
"}\n"
"__kernel void inner_prod(\n"
" __global const float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __local float * tmp_buffer,\n"
" global float * group_buffer)\n"
"{\n"
" float tmp = impl_inner_prod(vec1,\n"
" ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
" ((get_group_id(0) + 1) * size1) / get_num_groups(0) - ( get_group_id(0) * size1) / get_num_groups(0),\n"
" vec2,\n"
" ( get_group_id(0) * size2) / get_num_groups(0) + start2,\n"
" ((get_group_id(0) + 1) * size2) / get_num_groups(0) - ( get_group_id(0) * size2) / get_num_groups(0),\n"
" tmp_buffer);\n"
" \n"
" if (get_local_id(0) == 0)\n"
" group_buffer[get_group_id(0)] = tmp;\n"
" \n"
"}\n"
; //vector_align1_inner_prod
186
187
// OpenCL source: destructive tree reduction of vec1[start1 .. start1+size1)
// in global memory; the total ends up at vec1[start1] and is copied to *result.
// NOTE(review): CLK_GLOBAL_MEM_FENCE only synchronizes within one work-group,
// so this kernel is only correct when launched with a single work-group whose
// size is a power of 2 (as the embedded comment demands) — confirm at call site.
// Fixed: the final read used vec1[0], but the reduction accumulates its total
// at vec1[start1]; the result was wrong whenever start1 != 0.
const char * const vector_align1_sum = 
"__kernel void sum(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global float * result) \n"
"{ \n"
" //parallel reduction on global memory (make sure get_global_size(0) is a power of 2)\n"
" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" if (get_global_id(0) < stride)\n"
" vec1[get_global_id(0)+start1] += vec1[get_global_id(0)+start1+stride];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" }\n"
" \n"
" if (get_global_id(0) == 0)\n"
" *result = vec1[start1]; \n"
"}\n"
; //vector_align1_sum
206
207
// OpenCL source: result = vec1 * factor + vec2, with the scalar factor passed
// by value from the host (CPU scalar variant).
const char * const vector_align1_cpu_mul_add = 
"__kernel void cpu_mul_add(\n"
" __global const float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" float factor,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float * result,\n"
" unsigned int start3,\n"
" unsigned int size3\n"
" ) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
"}\n"
; //vector_align1_cpu_mul_add
225
226
// OpenCL source: two-stage 1-norm. Each work-group sums |vec[i]| over its slice
// via a local-memory tree reduction and writes its partial sum to group_buffer;
// final accumulation happens elsewhere.
// Fixed: the helper reduced the __local buffer with get_global_id/get_global_size,
// which indexes past the per-group buffer as soon as more than one work-group is
// launched (the enclosing kernel clearly partitions work per group). Now uses
// get_local_id/get_local_size like the inner_prod helper. Also removed the stray
// ';' after the impl_norm_1 function body.
const char * const vector_align1_norm_1 = 
"//helper:\n"
"void helper_norm1_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (get_local_id(0) < stride)\n"
" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
" }\n"
"}\n"
"\n"
"////// norm_1\n"
"float impl_norm_1(\n"
" __global const float * vec,\n"
" unsigned int start_index,\n"
" unsigned int end_index,\n"
" __local float * tmp_buffer)\n"
"{\n"
" float tmp = 0;\n"
" for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
" tmp += fabs(vec[i]);\n"
" \n"
" tmp_buffer[get_local_id(0)] = tmp;\n"
" \n"
" helper_norm1_parallel_reduction(tmp_buffer);\n"
" \n"
" return tmp_buffer[0];\n"
"}\n"
"\n"
"__kernel void norm_1(\n"
" __global const float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __local float * tmp_buffer,\n"
" global float * group_buffer)\n"
"{\n"
" float tmp = impl_norm_1(vec,\n"
" ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
" ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
" tmp_buffer);\n"
" \n"
" if (get_local_id(0) == 0)\n"
" group_buffer[get_group_id(0)] = tmp; \n"
"}\n"
"\n"
; //vector_align1_norm_1
273
274
// OpenCL source: result = vec1 * (*fac) - vec2; factor read from device memory.
const char * const vector_align1_mul_sub = 
"///// multiply subtract:\n"
"__kernel void mul_sub(\n"
" __global const float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float * result,\n"
" unsigned int start3,\n"
" unsigned int size3\n"
" ) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] * factor - vec2[i+start2];\n"
"}\n"
; //vector_align1_mul_sub
294
295
// OpenCL source: result = vec1 * (*fac) + vec2; factor read from device memory.
const char * const vector_align1_mul_add = 
"__kernel void mul_add(\n"
" __global const float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float * result,\n"
" unsigned int start3,\n"
" unsigned int size3\n"
" ) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
"}\n"
; //vector_align1_mul_add
314
315
// OpenCL source: element-wise swap of two vector ranges of length size1.
const char * const vector_align1_swap = 
"////// swap:\n"
"__kernel void swap(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2\n"
" ) \n"
"{ \n"
" float tmp;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" {\n"
" tmp = vec2[i+start2];\n"
" vec2[i+start2] = vec1[i+start1];\n"
" vec1[i+start1] = tmp;\n"
" }\n"
"}\n"
" \n"
; //vector_align1_swap
336
337
// OpenCL source: vec1 += vec2 / (*fac); factor read from device memory.
// Fixed: the body used '-=', an evident copy-paste of inplace_div_sub — the
// kernel name and its "///// divide add:" banner both say "add". The align4
// variant had the same defect.
const char * const vector_align1_inplace_div_add = 
"///// divide add:\n"
"__kernel void inplace_div_add(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global const float * fac) //CPU variant is mapped to mult_add\n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec1[i+start1] += vec2[i+start2] / factor;\n"
"}\n"
; //vector_align1_inplace_div_add
353
354
// OpenCL source: destructive in-place global-memory reduction of vec1, then
// *result = sqrt(total).
// NOTE(review): CLK_GLOBAL_MEM_FENCE does not synchronize across work-groups;
// correct only for a single power-of-2-sized work-group — confirm at call site.
const char * const vector_align1_sqrt_sum = 
"__kernel void sqrt_sum(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global float * result) \n"
"{ \n"
" //parallel reduction on global memory: (make sure get_global_size(0) is a power of 2)\n"
" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" if (get_global_id(0) < stride)\n"
" vec1[get_global_id(0)+start1] += vec1[get_global_id(0)+start1+stride];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" }\n"
" \n"
" if (get_global_id(0) == 0)\n"
" *result = sqrt(vec1[start1]);\n"
" \n"
"}\n"
; //vector_align1_sqrt_sum
374
375
// OpenCL source: diagonal (Jacobi) preconditioner application x *= diag_A_inv,
// element-wise.
const char * const vector_align1_diag_precond = 
"__kernel void diag_precond(\n"
" __global const float * diag_A_inv, \n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global float * x, \n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" x[i+start2] *= diag_A_inv[i+start1];\n"
"}\n"
; //vector_align1_diag_precond
388
389
// OpenCL source: in-place scaling vec *= factor, factor passed by value from
// the host (CPU scalar variant).
const char * const vector_align1_cpu_inplace_mult = 
"__kernel void cpu_inplace_mult(\n"
" __global float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" float factor) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec[i+start1] *= factor;\n"
"}\n"
; //vector_align1_cpu_inplace_mult
400
401
// OpenCL source: zero out vec[start1 .. start1+size1).
const char * const vector_align1_clear = 
"__kernel void clear(\n"
" __global float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec[i+start1] = 0;\n"
"}\n"
; //vector_align1_clear
411
412
// OpenCL source: vec1 += vec2 * factor, factor passed by value from the host.
const char * const vector_align1_cpu_inplace_mul_add = 
"__kernel void cpu_inplace_mul_add(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" float factor) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec1[i+start1] += vec2[i+start2] * factor;\n"
"}\n"
; //vector_align1_cpu_inplace_mul_add
426
427
// OpenCL source: destructive in-place global-memory max-reduction of vec1;
// the maximum ends up at vec1[start1] and is written to *result.
// NOTE(review): CLK_GLOBAL_MEM_FENCE does not synchronize across work-groups;
// correct only for a single power-of-2-sized work-group — confirm at call site.
const char * const vector_align1_vmax = 
"__kernel void vmax(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global float * result) \n"
"{ \n"
" //parallel reduction on global memory (make sure that size is a power of 2)\n"
" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" if (get_global_id(0) < stride)\n"
" vec1[get_global_id(0)+start1] = fmax(vec1[get_global_id(0)+start1+stride], vec1[get_global_id(0)+start1]);\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" }\n"
" \n"
" if (get_global_id(0) == 0)\n"
" *result = vec1[start1];\n"
"}\n"
; //vector_align1_vmax
446
447
// OpenCL source: two-stage infinity norm. Each work-group computes
// max(|vec[i]|) over its slice with a local-memory tree reduction and writes
// its partial maximum to group_buffer; final accumulation happens elsewhere.
// Fixed: the reduction inside impl_norm_inf indexed the __local buffer with
// get_global_id/get_global_size, which runs out of bounds as soon as more than
// one work-group is launched; now uses get_local_id/get_local_size.
const char * const vector_align1_norm_inf = 
"\n"
"////// norm_inf\n"
"float impl_norm_inf(\n"
" __global const float * vec,\n"
" unsigned int start_index,\n"
" unsigned int end_index,\n"
" __local float * tmp_buffer)\n"
"{\n"
" float tmp = 0;\n"
" for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
" tmp = fmax(fabs(vec[i]), tmp);\n"
" tmp_buffer[get_local_id(0)] = tmp;\n"
" \n"
" //step 2: parallel reduction:\n"
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (get_local_id(0) < stride)\n"
" tmp_buffer[get_local_id(0)] = fmax(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]);\n"
" }\n"
" \n"
" return tmp_buffer[0];\n"
"}\n"
"\n"
"__kernel void norm_inf(\n"
" __global const float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __local float * tmp_buffer,\n"
" global float * group_buffer)\n"
"{\n"
" float tmp = impl_norm_inf(vec,\n"
" ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
" ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
" tmp_buffer);\n"
" \n"
" if (get_local_id(0) == 0)\n"
" group_buffer[get_group_id(0)] = tmp; \n"
"}\n"
; //vector_align1_norm_inf
488
489
// OpenCL source: result = vec * factor, factor passed by value from the host.
const char * const vector_align1_cpu_mult = 
"__kernel void cpu_mult(\n"
" __global const float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" float factor, \n"
" __global float * result,\n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" result[i+start2] = vec[i+start1] * factor;\n"
"}\n"
; //vector_align1_cpu_mult
503
504
// OpenCL source: index of the entry with maximum absolute value (argmax of
// |vec[i]|), computed via per-work-item scan plus local-memory reduction.
// NOTE(review): the local buffers are indexed with get_global_id and the
// reduction uses CLK_LOCAL_MEM_FENCE, so this is only correct when launched
// with a single work-group — confirm at call site.
// Fixed: float_buffer/index_buffer entries were written only when a work-item
// found a value larger than its running maximum; items whose slice contains
// only zeros left their slots uninitialized and the reduction then compared
// garbage. Both buffers are now zero-initialized up front.
const char * const vector_align1_index_norm_inf = 
"//index_norm_inf:\n"
"unsigned int float_vector1_index_norm_inf_impl(\n"
" __global const float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __local float * float_buffer,\n"
" __local unsigned int * index_buffer)\n"
"{\n"
" //step 1: fill buffer:\n"
" float_buffer[get_global_id(0)] = 0.0f;\n"
" index_buffer[get_global_id(0)] = 0;\n"
" float cur_max = 0.0f;\n"
" float tmp;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" {\n"
" tmp = fabs(vec[i+start1]);\n"
" if (cur_max < tmp)\n"
" {\n"
" float_buffer[get_global_id(0)] = tmp;\n"
" index_buffer[get_global_id(0)] = i;\n"
" cur_max = tmp;\n"
" }\n"
" }\n"
" \n"
" //step 2: parallel reduction:\n"
" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (get_global_id(0) < stride)\n"
" {\n"
" //find the first occurring index\n"
" if (float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride])\n"
" {\n"
" index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride];\n"
" float_buffer[get_global_id(0)] = float_buffer[get_global_id(0)+stride];\n"
" }\n"
" \n"
" //index_buffer[get_global_id(0)] = float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride] ? index_buffer[get_global_id(0)+stride] : index_buffer[get_global_id(0)];\n"
" //float_buffer[get_global_id(0)] = max(float_buffer[get_global_id(0)], float_buffer[get_global_id(0)+stride]);\n"
" }\n"
" }\n"
" \n"
" return index_buffer[0];\n"
"}\n"
"\n"
"__kernel void index_norm_inf(\n"
" __global float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __local float * float_buffer,\n"
" __local unsigned int * index_buffer,\n"
" global unsigned int * result) \n"
"{ \n"
" unsigned int tmp = float_vector1_index_norm_inf_impl(vec, start1, size1, float_buffer, index_buffer);\n"
" if (get_global_id(0) == 0) *result = tmp;\n"
"}\n"
"\n"
"\n"
; //vector_align1_index_norm_inf
562
563
// OpenCL source: two-stage 2-norm. Each work-group accumulates sum(vec[i]^2)
// over its slice via a local-memory tree reduction and writes the partial sum
// to group_buffer; the final sum (and square root) is taken elsewhere.
// Fixed: the helper reduced the __local buffer with get_global_id/
// get_global_size, which runs out of bounds with more than one work-group;
// now uses get_local_id/get_local_size. Also removed the stray ';' after the
// impl_norm_2 function body.
const char * const vector_align1_norm_2 = 
"//helper:\n"
"void helper_norm2_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (get_local_id(0) < stride)\n"
" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
" }\n"
"}\n"
"\n"
"////// norm_2\n"
"float impl_norm_2(\n"
" __global const float * vec,\n"
" unsigned int start_index,\n"
" unsigned int end_index,\n"
" __local float * tmp_buffer)\n"
"{\n"
" float tmp = 0;\n"
" float vec_entry = 0;\n"
" for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
" {\n"
" vec_entry = vec[i];\n"
" tmp += vec_entry * vec_entry;\n"
" }\n"
" tmp_buffer[get_local_id(0)] = tmp;\n"
" \n"
" helper_norm2_parallel_reduction(tmp_buffer);\n"
" \n"
" return tmp_buffer[0];\n"
"}\n"
"\n"
"__kernel void norm_2(\n"
" __global const float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __local float * tmp_buffer,\n"
" global float * group_buffer)\n"
"{\n"
" float tmp = impl_norm_2(vec,\n"
" ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
" ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
" tmp_buffer);\n"
" \n"
" if (get_local_id(0) == 0)\n"
" group_buffer[get_group_id(0)] = tmp; \n"
"}\n"
"\n"
; //vector_align1_norm_2
613
614
// OpenCL source: result = vec * (*fac); factor read from device memory.
const char * const vector_align1_mult = 
"__kernel void mult(\n"
" __global const float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac, \n"
" __global float * result,\n"
" unsigned int start3,\n"
" unsigned int size3) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" result[i+start3] = vec[i+start1] * factor;\n"
"}\n"
; //vector_align1_mult
629
630
// OpenCL source: result = vec / (*fac); factor read from device memory.
// Kernel is named 'divide' because 'div' is rejected by the jit-compiler.
const char * const vector_align1_divide = 
"// Note: name 'div' is not allowed by the jit-compiler\n"
"__kernel void divide(\n"
" __global const float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
" __global float * result,\n"
" unsigned int start3,\n"
" unsigned int size3) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" result[i+start3] = vec[i+start1] / factor;\n"
"}\n"
; //vector_align1_divide
646
647
// OpenCL source: vec1 -= vec2, element-wise over size1 entries.
const char * const vector_align1_inplace_sub = 
"__kernel void inplace_sub(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec1[i+start1] -= vec2[i+start2];\n"
"}\n"
; //vector_align1_inplace_sub
660
661
// OpenCL source: Givens plane rotation applied to a pair of vectors:
// (x, y) <- (alpha*x + beta*y, -beta*x + alpha*y), alpha/beta passed by value.
// NOTE(review): "\a" and "\b" in the embedded comment line are C escape
// sequences (BEL/BS), not the literal text "\alpha"/"\beta"; harmless since
// they land inside an OpenCL '//' comment, but worth confirming intent.
const char * const vector_align1_plane_rotation = 
"////// plane rotation: (x,y) <- (\alpha x + \beta y, -\beta x + \alpha y)\n"
"__kernel void plane_rotation(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global float * vec2, \n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" float alpha,\n"
" float beta) \n"
"{ \n"
" float tmp1 = 0;\n"
" float tmp2 = 0;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" {\n"
" tmp1 = vec1[i+start1];\n"
" tmp2 = vec2[i+start2];\n"
" \n"
" vec1[i+start1] = alpha * tmp1 + beta * tmp2;\n"
" vec2[i+start2] = alpha * tmp2 - beta * tmp1;\n"
" }\n"
"}\n"
; //vector_align1_plane_rotation
685
686
// OpenCL source (16-way vectorized, float16 chunks): vec *= *fac.
// size1 is the element count; the loop runs over size1/16 float16 chunks and
// start1 is a chunk offset.
const char * const vector_align16_inplace_mult = 
"__kernel void inplace_mult(\n"
" __global float16 * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac) \n"
"{ \n"
" float factor = *fac;\n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec[i+start1] *= factor;\n"
"}\n"
; //vector_align16_inplace_mult
699
700
// OpenCL source (16-way vectorized): vec /= *fac over size1/16 float16 chunks.
const char * const vector_align16_inplace_divide = 
"__kernel void inplace_divide(\n"
" __global float16 * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac) //note: CPU variant is mapped to prod_scalar\n"
"{ \n"
" float factor = *fac;\n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec[i+start1] /= factor;\n"
"}\n"
; //vector_align16_inplace_divide
713
714
// OpenCL source (16-way vectorized): vec1 += vec2 over size1/16 float16 chunks.
const char * const vector_align16_inplace_add = 
"__kernel void inplace_add(\n"
" __global float16 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float16 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec1[i+start1] += vec2[i+start2];\n"
"}\n"
; //vector_align16_inplace_add
728
729
// OpenCL source (16-way vectorized): vec *= factor, factor passed by value.
// NOTE(review): the constant is named ..._cpu_inplace_mul while every sibling
// uses ..._cpu_inplace_mult; renaming would break lookups elsewhere, so the
// inconsistency is only flagged here.
const char * const vector_align16_cpu_inplace_mul = 
"\n"
"__kernel void cpu_inplace_mult(\n"
" __global float16 * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" float factor) \n"
"{ \n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec[i+start1] *= factor;\n"
"}\n"
"\n"
; //vector_align16_cpu_inplace_mul
743
744
// OpenCL source (16-way vectorized): result = vec1 + vec2 over float16 chunks.
// Fixed: the chunk count used the undeclared identifier 'size' ("size/16"),
// which makes the OpenCL jit-compiler reject the program; every sibling kernel
// uses size1, so the bound is now size1/16.
const char * const vector_align16_add = 
"__kernel void add(\n"
" __global const float16 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float16 * vec2, \n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float16 * result,\n"
" unsigned int start3,\n"
" unsigned int size3)\n"
"{ \n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] + vec2[i+start2];\n"
"}\n"
; //vector_align16_add
761
762
// OpenCL source (16-way vectorized): result = vec1 - vec2 over float16 chunks.
const char * const vector_align16_sub = 
"__kernel void sub(\n"
" __global const float16 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float16 * vec2, \n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float16 * result,\n"
" unsigned int start3,\n"
" unsigned int size3)\n"
"{ \n"
" unsigned int i_end = size1 / 16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] - vec2[i+start2];\n"
"}\n"
; //vector_align16_sub
779
780
// OpenCL source (16-way vectorized): result = vec * factor, factor by value.
const char * const vector_align16_cpu_mult = 
"__kernel void cpu_mult(\n"
" __global const float16 * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" float factor, \n"
" __global float16 * result,\n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" result[i+start2] = vec[i+start1] * factor;\n"
"}\n"
; //vector_align16_cpu_mult
795
796
// OpenCL source (16-way vectorized): result = vec * (*fac); factor from device
// memory.
const char * const vector_align16_mult = 
"__kernel void mult(\n"
" __global const float16 * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac, \n"
" __global float16 * result,\n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" float factor = *fac;\n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" result[i+start2] = vec[i+start1] * factor;\n"
"}\n"
; //vector_align16_mult
812
813
// OpenCL source (16-way vectorized): result = vec / (*fac).
// Fixed: the chunk count used the undeclared identifier 'size' ("size/16"),
// an OpenCL jit-compile error; now size1/16 like the sibling kernels.
const char * const vector_align16_divide = 
"//Note: 'div' cannot be used because of complaints by the jit-compiler\n"
"__kernel void divide(\n"
" __global const float16 * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
" __global float16 * result,\n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" float factor = *fac;\n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" result[i+start2] = vec[i+start1] / factor;\n"
"}\n"
; //vector_align16_divide
830
831
// OpenCL source (16-way vectorized): vec1 -= vec2 over size1/16 float16 chunks.
const char * const vector_align16_inplace_sub = 
"__kernel void inplace_sub(\n"
" __global float16 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float16 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec1[i+start1] -= vec2[i+start2];\n"
"}\n"
; //vector_align16_inplace_sub
845
846
// OpenCL source (4-way vectorized, float4 chunks): vec1 += vec2 * (*fac).
const char * const vector_align4_inplace_mul_add = 
"__kernel void inplace_mul_add(\n"
" __global float4 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float4 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global const float * fac) \n"
"{ \n"
" float factor = *fac;\n"
" unsigned int size_div_4 = size1/4;\n"
" for (unsigned int i = get_global_id(0); i < size_div_4; i += get_global_size(0))\n"
" vec1[i+start1] += vec2[i+start2] * factor;\n"
"}\n"
; //vector_align4_inplace_mul_add
862
863
// OpenCL source (4-way vectorized): vec1 -= vec2 / (*fac).
const char * const vector_align4_inplace_div_sub = 
"__kernel void inplace_div_sub(\n"
" __global float4 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float4 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global const float * fac) //CPU variant is mapped to mult_add\n"
"{ \n"
" float factor = *fac;\n"
" unsigned int i_end = size1/4;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec1[i+start1] -= vec2[i+start2] / factor;\n"
"}\n"
; //vector_align4_inplace_div_sub
879
880
// OpenCL source (4-way vectorized): vec1 -= vec2 * (*fac).
// Fixed: the chunk count used the undeclared identifier 'size' ("size/4"),
// an OpenCL jit-compile error; now size1/4 like the sibling kernels.
const char * const vector_align4_inplace_mul_sub = 
"__kernel void inplace_mul_sub(\n"
" __global float4 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float4 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global const float * fac) //CPU variant is mapped to mult_add\n"
"{ \n"
" float factor = *fac;\n"
" unsigned int i_end = size1/4;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec1[i+start1] -= vec2[i+start2] * factor;\n"
"}\n"
; //vector_align4_inplace_mul_sub
896
897
// OpenCL source (4-way vectorized): result = vec1 * factor + vec2, factor
// passed by value from the host.
const char * const vector_align4_cpu_mul_add = 
"__kernel void cpu_mul_add(\n"
" __global const float4 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" float factor,\n"
" __global const float4 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float4 * result,\n"
" unsigned int start3,\n"
" unsigned int size3) \n"
"{ \n"
" unsigned int i_end = size1/4;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
"}\n"
; //vector_align4_cpu_mul_add
915
916
// OpenCL source (4-way vectorized): result = vec1 * (*fac) + vec2; factor
// read from device memory.
const char * const vector_align4_mul_add = 
"__kernel void mul_add(\n"
" __global const float4 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac,\n"
" __global const float4 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float4 * result,\n"
" unsigned int start3,\n"
" unsigned int size3) \n"
"{ \n"
" float factor = *fac;\n"
" unsigned int i_end = size1/4;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
"}\n"
; //vector_align4_mul_add
935
936
// OpenCL source (4-way vectorized): vec1 += vec2 / (*fac).
// Fixed: the body used '-=' — identical to inplace_div_sub — although the
// kernel is named inplace_div_add; now performs the addition its name promises
// (matching the fix applied to the align1 variant).
const char * const vector_align4_inplace_div_add = 
"__kernel void inplace_div_add(\n"
" __global float4 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float4 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global const float * fac) //CPU variant is mapped to mult_add\n"
"{ \n"
" float factor = *fac;\n"
" unsigned int i_end = size1 / 4;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec1[i+start1] += vec2[i+start2] / factor;\n"
"}\n"
; //vector_align4_inplace_div_add
952
953
// OpenCL source (4-way vectorized): vec1 += vec2 * factor, factor passed by
// value from the host.
const char * const vector_align4_cpu_inplace_mul_add = 
"__kernel void cpu_inplace_mul_add(\n"
" __global float4 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float4 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" float factor) \n"
"{ \n"
" unsigned int i_end = size1/4;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec1[i+start1] += vec2[i+start2] * factor;\n"
"}\n"
; //vector_align4_cpu_inplace_mul_add
968
969
}
//namespace kernels
970
}
//namespace linalg
971
}
//namespace viennacl
972
#endif
Generated on Fri Jul 27 2012 22:02:51 for ViennaCL - The Vienna Computing Library by
1.8.1.2