ViennaCL - The Vienna Computing Library  1.2.0
vector_source.h
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_KERNELS_VECTOR_SOURCE_HPP_
2 #define VIENNACL_LINALG_KERNELS_VECTOR_SOURCE_HPP_
3 //Automatically generated file from auxiliary-directory, do not edit manually!
4 namespace viennacl
5 {
6  namespace linalg
7  {
8  namespace kernels
9  {
// OpenCL kernel: vec[start1 + i] *= *fac for i in [0, size1); the scalar
// factor lives in device memory and is dereferenced once per work item.
const char * const vector_align1_inplace_mult =
"__kernel void inplace_mult(\n"
" __global float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec[i+start1] *= factor;\n"
"}\n"
; //vector_align1_inplace_mult
22 
// OpenCL kernel: vec1 += vec2 * (*fac), elementwise over size1 entries
// (offsets start1/start2); scalar factor read from device memory.
const char * const vector_align1_inplace_mul_add =
"__kernel void inplace_mul_add(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global const float * fac) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec1[i+start1] += vec2[i+start2] * factor;\n"
"}\n"
; //vector_align1_inplace_mul_add
38 
// OpenCL kernel: vec1 -= vec2 / (*fac) ("divide and subtract").
const char * const vector_align1_inplace_div_sub =
"///// divide substract:\n"
"__kernel void inplace_div_sub(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global const float * fac) //CPU variant is mapped to mult_add\n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec1[i+start1] -= vec2[i+start2] / factor;\n"
"}\n"
; //vector_align1_inplace_div_sub
55 
// OpenCL kernel: vec /= *fac over size1 entries starting at start1.
const char * const vector_align1_inplace_divide =
"__kernel void inplace_divide(\n"
" __global float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac) //note: CPU variant is mapped to prod_scalar\n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec[i+start1] /= factor;\n"
"}\n"
; //vector_align1_inplace_divide
68 
// OpenCL kernel: vec1 -= vec2 * (*fac), elementwise over size1 entries.
const char * const vector_align1_inplace_mul_sub =
"__kernel void inplace_mul_sub(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global const float * fac) //CPU variant is mapped to mult_add\n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec1[i+start1] -= vec2[i+start2] * factor;\n"
"}\n"
; //vector_align1_inplace_mul_sub
84 
// OpenCL kernel: vec1 += vec2, elementwise over size1 entries.
const char * const vector_align1_inplace_add =
"__kernel void inplace_add(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec1[i+start1] += vec2[i+start2];\n"
"}\n"
; //vector_align1_inplace_add
98 
// OpenCL kernel: result = vec1 + vec2 (out-of-place, offsets start1/2/3).
const char * const vector_align1_add =
"__kernel void add(\n"
" __global const float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float * result,\n"
" unsigned int start3,\n"
" unsigned int size3) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] + vec2[i+start2];\n"
"}\n"
; //vector_align1_add
115 
// OpenCL kernel: result = vec1 - vec2 (out-of-place, offsets start1/2/3).
const char * const vector_align1_sub =
"__kernel void sub(\n"
" __global const float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2, \n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float * result,\n"
" unsigned int start3,\n"
" unsigned int size3)\n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] - vec2[i+start2];\n"
"}\n"
; //vector_align1_sub
132 
// OpenCL kernel: partial inner products. Each work group handles a contiguous
// slice of the vectors, reduces it in __local memory (correctly using
// get_local_id/get_local_size), and writes its partial sum to
// group_buffer[group_id]; the final sum over groups is done by the caller.
const char * const vector_align1_inner_prod =
"//helper:\n"
"void helper_inner_prod_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (get_local_id(0) < stride)\n"
" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
" }\n"
"}\n"
"//////// inner products:\n"
"float impl_inner_prod(\n"
" __global const float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __local float * tmp_buffer)\n"
"{\n"
" float tmp = 0;\n"
" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0))\n"
" tmp += vec1[i+start1] * vec2[i+start2];\n"
" tmp_buffer[get_local_id(0)] = tmp;\n"
" \n"
" helper_inner_prod_parallel_reduction(tmp_buffer);\n"
" \n"
" return tmp_buffer[0];\n"
"}\n"
"__kernel void inner_prod(\n"
" __global const float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __local float * tmp_buffer,\n"
" global float * group_buffer)\n"
"{\n"
" float tmp = impl_inner_prod(vec1,\n"
" ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
" ((get_group_id(0) + 1) * size1) / get_num_groups(0) - ( get_group_id(0) * size1) / get_num_groups(0),\n"
" vec2,\n"
" ( get_group_id(0) * size2) / get_num_groups(0) + start2,\n"
" ((get_group_id(0) + 1) * size2) / get_num_groups(0) - ( get_group_id(0) * size2) / get_num_groups(0),\n"
" tmp_buffer);\n"
" \n"
" if (get_local_id(0) == 0)\n"
" group_buffer[get_group_id(0)] = tmp;\n"
" \n"
"}\n"
; //vector_align1_inner_prod
186 
// OpenCL kernel: destructively reduces vec1[start1 .. start1+size1) to its sum
// and stores it in *result.
// NOTE(review): barrier(CLK_GLOBAL_MEM_FENCE) only synchronizes work items of
// a single work group, so this kernel presumably runs as one work group whose
// size is a power of 2 -- confirm against the host launch code.
// Fix: the final read used vec1[0], but the reduction accumulates the total at
// offset start1 (compare sqrt_sum, which already reads vec1[start1]).
const char * const vector_align1_sum =
"__kernel void sum(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global float * result) \n"
"{ \n"
" //parallel reduction on global memory (make sure get_global_size(0) is a power of 2)\n"
" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" if (get_global_id(0) < stride)\n"
" vec1[get_global_id(0)+start1] += vec1[get_global_id(0)+start1+stride];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" }\n"
" \n"
" if (get_global_id(0) == 0)\n"
" *result = vec1[start1]; \n"
"}\n"
; //vector_align1_sum
206 
// OpenCL kernel: result = vec1 * factor + vec2, with the scalar factor passed
// by value from the host (the "cpu_" prefix marks host-side scalars).
const char * const vector_align1_cpu_mul_add =
"__kernel void cpu_mul_add(\n"
" __global const float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" float factor,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float * result,\n"
" unsigned int start3,\n"
" unsigned int size3\n"
" ) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
"}\n"
; //vector_align1_cpu_mul_add
225 
// OpenCL kernel: 1-norm. Each work group reduces its slice of |vec[i]| into
// __local memory and writes the partial sum to group_buffer[group_id]; the
// caller finishes the sum over groups.
// Fix: the __local-memory reduction indexed the buffer with
// get_global_id/get_global_size, which is wrong whenever more than one work
// group runs (and this kernel writes one partial result per group). Switched
// to get_local_id/get_local_size, matching helper_inner_prod_parallel_reduction.
// Also removed the stray ';' after the impl_norm_1 body.
const char * const vector_align1_norm_1 =
"//helper:\n"
"void helper_norm1_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (get_local_id(0) < stride)\n"
" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
" }\n"
"}\n"
"\n"
"////// norm_1\n"
"float impl_norm_1(\n"
" __global const float * vec,\n"
" unsigned int start_index,\n"
" unsigned int end_index,\n"
" __local float * tmp_buffer)\n"
"{\n"
" float tmp = 0;\n"
" for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
" tmp += fabs(vec[i]);\n"
" \n"
" tmp_buffer[get_local_id(0)] = tmp;\n"
" \n"
" helper_norm1_parallel_reduction(tmp_buffer);\n"
" \n"
" return tmp_buffer[0];\n"
"}\n"
"\n"
"__kernel void norm_1(\n"
" __global const float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __local float * tmp_buffer,\n"
" global float * group_buffer)\n"
"{\n"
" float tmp = impl_norm_1(vec,\n"
" ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
" ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
" tmp_buffer);\n"
" \n"
" if (get_local_id(0) == 0)\n"
" group_buffer[get_group_id(0)] = tmp; \n"
"}\n"
"\n"
; //vector_align1_norm_1
273 
// OpenCL kernel: result = vec1 * (*fac) - vec2 (scalar in device memory).
const char * const vector_align1_mul_sub =
"///// multiply subtract:\n"
"__kernel void mul_sub(\n"
" __global const float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float * result,\n"
" unsigned int start3,\n"
" unsigned int size3\n"
" ) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] * factor - vec2[i+start2];\n"
"}\n"
; //vector_align1_mul_sub
294 
// OpenCL kernel: result = vec1 * (*fac) + vec2 (scalar in device memory).
const char * const vector_align1_mul_add =
"__kernel void mul_add(\n"
" __global const float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float * result,\n"
" unsigned int start3,\n"
" unsigned int size3\n"
" ) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
"}\n"
; //vector_align1_mul_add
314 
// OpenCL kernel: elementwise swap of vec1 and vec2 over size1 entries.
const char * const vector_align1_swap =
"////// swap:\n"
"__kernel void swap(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2\n"
" ) \n"
"{ \n"
" float tmp;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" {\n"
" tmp = vec2[i+start2];\n"
" vec2[i+start2] = vec1[i+start1];\n"
" vec1[i+start1] = tmp;\n"
" }\n"
"}\n"
" \n"
; //vector_align1_swap
336 
// OpenCL kernel: vec1 += vec2 / (*fac) ("divide and add").
// Fix: the body used '-=', making it byte-identical to inplace_div_sub and
// contradicting the kernel name; the subtracting variant already exists as
// inplace_div_sub, so this one must add.
const char * const vector_align1_inplace_div_add =
"///// divide add:\n"
"__kernel void inplace_div_add(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global const float * fac) //CPU variant is mapped to mult_add\n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec1[i+start1] += vec2[i+start2] / factor;\n"
"}\n"
; //vector_align1_inplace_div_add
353 
// OpenCL kernel: destructively reduces vec1[start1..) to its sum and stores
// sqrt(sum) in *result.
// NOTE(review): barrier(CLK_GLOBAL_MEM_FENCE) only synchronizes one work
// group, so this presumably runs as a single power-of-2-sized work group --
// confirm against the host launch code.
const char * const vector_align1_sqrt_sum =
"__kernel void sqrt_sum(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global float * result) \n"
"{ \n"
" //parallel reduction on global memory: (make sure get_global_size(0) is a power of 2)\n"
" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" if (get_global_id(0) < stride)\n"
" vec1[get_global_id(0)+start1] += vec1[get_global_id(0)+start1+stride];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" }\n"
" \n"
" if (get_global_id(0) == 0)\n"
" *result = sqrt(vec1[start1]);\n"
" \n"
"}\n"
; //vector_align1_sqrt_sum
374 
// OpenCL kernel: Jacobi-style preconditioning, x[i] *= diag_A_inv[i].
const char * const vector_align1_diag_precond =
"__kernel void diag_precond(\n"
" __global const float * diag_A_inv, \n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global float * x, \n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" x[i+start2] *= diag_A_inv[i+start1];\n"
"}\n"
; //vector_align1_diag_precond
388 
// OpenCL kernel: vec *= factor, scalar passed by value from the host.
const char * const vector_align1_cpu_inplace_mult =
"__kernel void cpu_inplace_mult(\n"
" __global float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" float factor) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec[i+start1] *= factor;\n"
"}\n"
; //vector_align1_cpu_inplace_mult
400 
// OpenCL kernel: zero-fill vec[start1 .. start1+size1).
const char * const vector_align1_clear =
"__kernel void clear(\n"
" __global float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec[i+start1] = 0;\n"
"}\n"
; //vector_align1_clear
411 
// OpenCL kernel: vec1 += vec2 * factor, scalar passed by value from the host.
// Fix: the 'const char * const ... =' declaration line was missing, leaving
// the string literal dangling at namespace scope (a syntax error); restored
// from the trailing '//vector_align1_cpu_inplace_mul_add' comment.
const char * const vector_align1_cpu_inplace_mul_add =
"__kernel void cpu_inplace_mul_add(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" float factor) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec1[i+start1] += vec2[i+start2] * factor;\n"
"}\n"
; //vector_align1_cpu_inplace_mul_add
426 
// OpenCL kernel: destructively reduces vec1[start1..) to its maximum and
// stores it in *result.
// NOTE(review): CLK_GLOBAL_MEM_FENCE barriers only synchronize within one
// work group -- presumably launched as a single power-of-2-sized group;
// confirm against the host code.
const char * const vector_align1_vmax =
"__kernel void vmax(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global float * result) \n"
"{ \n"
" //parallel reduction on global memory (make sure that size is a power of 2)\n"
" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" if (get_global_id(0) < stride)\n"
" vec1[get_global_id(0)+start1] = fmax(vec1[get_global_id(0)+start1+stride], vec1[get_global_id(0)+start1]);\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" }\n"
" \n"
" if (get_global_id(0) == 0)\n"
" *result = vec1[start1];\n"
"}\n"
; //vector_align1_vmax
446 
// OpenCL kernel: infinity norm. Each work group computes max(|vec[i]|) over
// its slice via a __local-memory reduction and writes the partial result to
// group_buffer[group_id]; the caller takes the max over groups.
// Fix: the reduction over the __local buffer indexed it with
// get_global_id/get_global_size, which is wrong whenever more than one work
// group runs (and the kernel emits one partial result per group). Switched to
// get_local_id/get_local_size, consistent with impl_inner_prod.
const char * const vector_align1_norm_inf =
"\n"
"////// norm_inf\n"
"float impl_norm_inf(\n"
" __global const float * vec,\n"
" unsigned int start_index,\n"
" unsigned int end_index,\n"
" __local float * tmp_buffer)\n"
"{\n"
" float tmp = 0;\n"
" for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
" tmp = fmax(fabs(vec[i]), tmp);\n"
" tmp_buffer[get_local_id(0)] = tmp;\n"
" \n"
" //step 2: parallel reduction:\n"
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (get_local_id(0) < stride)\n"
" tmp_buffer[get_local_id(0)] = fmax(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]);\n"
" }\n"
" \n"
" return tmp_buffer[0];\n"
"}\n"
"\n"
"__kernel void norm_inf(\n"
" __global const float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __local float * tmp_buffer,\n"
" global float * group_buffer)\n"
"{\n"
" float tmp = impl_norm_inf(vec,\n"
" ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
" ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
" tmp_buffer);\n"
" \n"
" if (get_local_id(0) == 0)\n"
" group_buffer[get_group_id(0)] = tmp; \n"
"}\n"
; //vector_align1_norm_inf
488 
// OpenCL kernel: result = vec * factor, scalar passed by value from the host.
const char * const vector_align1_cpu_mult =
"__kernel void cpu_mult(\n"
" __global const float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" float factor, \n"
" __global float * result,\n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" result[i+start2] = vec[i+start1] * factor;\n"
"}\n"
; //vector_align1_cpu_mult
503 
// OpenCL kernel: index of the entry with the largest absolute value.
// NOTE(review): the reduction indexes __local buffers with get_global_id, so
// this kernel presumably runs as a single work group -- confirm against the
// host code.
// Fix: float_buffer/index_buffer entries were only written when a work item
// improved on cur_max; a work item whose slice never beats 0.0f (e.g. all
// zeros) left its slots uninitialized, and the reduction then compared
// garbage. Both buffers are now zero-initialized up front.
const char * const vector_align1_index_norm_inf =
"//index_norm_inf:\n"
"unsigned int float_vector1_index_norm_inf_impl(\n"
" __global const float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __local float * float_buffer,\n"
" __local unsigned int * index_buffer)\n"
"{\n"
" //step 1: fill buffer:\n"
" float_buffer[get_global_id(0)] = 0.0f;\n"
" index_buffer[get_global_id(0)] = 0;\n"
" float cur_max = 0.0f;\n"
" float tmp;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" {\n"
" tmp = fabs(vec[i+start1]);\n"
" if (cur_max < tmp)\n"
" {\n"
" float_buffer[get_global_id(0)] = tmp;\n"
" index_buffer[get_global_id(0)] = i;\n"
" cur_max = tmp;\n"
" }\n"
" }\n"
" \n"
" //step 2: parallel reduction:\n"
" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (get_global_id(0) < stride)\n"
" {\n"
" //find the first occurring index\n"
" if (float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride])\n"
" {\n"
" index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride];\n"
" float_buffer[get_global_id(0)] = float_buffer[get_global_id(0)+stride];\n"
" }\n"
" }\n"
" }\n"
" \n"
" return index_buffer[0];\n"
"}\n"
"\n"
"__kernel void index_norm_inf(\n"
" __global float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __local float * float_buffer,\n"
" __local unsigned int * index_buffer,\n"
" global unsigned int * result) \n"
"{ \n"
" unsigned int tmp = float_vector1_index_norm_inf_impl(vec, start1, size1, float_buffer, index_buffer);\n"
" if (get_global_id(0) == 0) *result = tmp;\n"
"}\n"
"\n"
"\n"
; //vector_align1_index_norm_inf
562 
// OpenCL kernel: partial sums of squares for the 2-norm. Each work group
// reduces its slice in __local memory and writes the partial sum to
// group_buffer[group_id]; the caller sums over groups and takes the sqrt.
// Fix: the __local-memory reduction indexed the buffer with
// get_global_id/get_global_size, which is wrong with more than one work group
// (the kernel emits one partial result per group). Switched to
// get_local_id/get_local_size, matching helper_inner_prod_parallel_reduction.
// Also removed the stray ';' after the impl_norm_2 body.
const char * const vector_align1_norm_2 =
"//helper:\n"
"void helper_norm2_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (get_local_id(0) < stride)\n"
" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
" }\n"
"}\n"
"\n"
"////// norm_2\n"
"float impl_norm_2(\n"
" __global const float * vec,\n"
" unsigned int start_index,\n"
" unsigned int end_index,\n"
" __local float * tmp_buffer)\n"
"{\n"
" float tmp = 0;\n"
" float vec_entry = 0;\n"
" for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
" {\n"
" vec_entry = vec[i];\n"
" tmp += vec_entry * vec_entry;\n"
" }\n"
" tmp_buffer[get_local_id(0)] = tmp;\n"
" \n"
" helper_norm2_parallel_reduction(tmp_buffer);\n"
" \n"
" return tmp_buffer[0];\n"
"}\n"
"\n"
"__kernel void norm_2(\n"
" __global const float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __local float * tmp_buffer,\n"
" global float * group_buffer)\n"
"{\n"
" float tmp = impl_norm_2(vec,\n"
" ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
" ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
" tmp_buffer);\n"
" \n"
" if (get_local_id(0) == 0)\n"
" group_buffer[get_group_id(0)] = tmp; \n"
"}\n"
"\n"
; //vector_align1_norm_2
613 
// OpenCL kernel: result = vec * (*fac), scalar read from device memory.
const char * const vector_align1_mult =
"__kernel void mult(\n"
" __global const float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac, \n"
" __global float * result,\n"
" unsigned int start3,\n"
" unsigned int size3) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" result[i+start3] = vec[i+start1] * factor;\n"
"}\n"
; //vector_align1_mult
629 
// OpenCL kernel: result = vec / (*fac), scalar read from device memory.
const char * const vector_align1_divide =
"// Note: name 'div' is not allowed by the jit-compiler\n"
"__kernel void divide(\n"
" __global const float * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
" __global float * result,\n"
" unsigned int start3,\n"
" unsigned int size3) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" result[i+start3] = vec[i+start1] / factor;\n"
"}\n"
; //vector_align1_divide
646 
// OpenCL kernel: vec1 -= vec2, elementwise over size1 entries.
const char * const vector_align1_inplace_sub =
"__kernel void inplace_sub(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" vec1[i+start1] -= vec2[i+start2];\n"
"}\n"
; //vector_align1_inplace_sub
660 
// OpenCL kernel: Givens plane rotation applied in place to (vec1, vec2) with
// host-supplied scalars alpha/beta (BLAS-1 'rot').
const char * const vector_align1_plane_rotation =
"////// plane rotation: (x,y) <- (\alpha x + \beta y, -\beta x + \alpha y)\n"
"__kernel void plane_rotation(\n"
" __global float * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global float * vec2, \n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" float alpha,\n"
" float beta) \n"
"{ \n"
" float tmp1 = 0;\n"
" float tmp2 = 0;\n"
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
" {\n"
" tmp1 = vec1[i+start1];\n"
" tmp2 = vec2[i+start2];\n"
" \n"
" vec1[i+start1] = alpha * tmp1 + beta * tmp2;\n"
" vec2[i+start2] = alpha * tmp2 - beta * tmp1;\n"
" }\n"
"}\n"
; //vector_align1_plane_rotation
685 
// OpenCL kernel (16-wide vectorized): vec *= *fac, processing float16 chunks.
// NOTE(review): indexing is in float16 units, so start1 appears to be a
// chunk offset here (unlike the align1 kernels) -- confirm against the host.
const char * const vector_align16_inplace_mult =
"__kernel void inplace_mult(\n"
" __global float16 * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac) \n"
"{ \n"
" float factor = *fac;\n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec[i+start1] *= factor;\n"
"}\n"
; //vector_align16_inplace_mult
699 
// OpenCL kernel (16-wide vectorized): vec /= *fac over size1/16 float16 chunks.
const char * const vector_align16_inplace_divide =
"__kernel void inplace_divide(\n"
" __global float16 * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac) //note: CPU variant is mapped to prod_scalar\n"
"{ \n"
" float factor = *fac;\n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec[i+start1] /= factor;\n"
"}\n"
; //vector_align16_inplace_divide
713 
// OpenCL kernel (16-wide vectorized): vec1 += vec2 over size1/16 float16 chunks.
const char * const vector_align16_inplace_add =
"__kernel void inplace_add(\n"
" __global float16 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float16 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec1[i+start1] += vec2[i+start2];\n"
"}\n"
; //vector_align16_inplace_add
728 
// OpenCL kernel (16-wide vectorized): vec *= factor, scalar passed by value
// from the host.
const char * const vector_align16_cpu_inplace_mul =
"\n"
"__kernel void cpu_inplace_mult(\n"
" __global float16 * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" float factor) \n"
"{ \n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec[i+start1] *= factor;\n"
"}\n"
"\n"
; //vector_align16_cpu_inplace_mul
743 
// OpenCL kernel (16-wide vectorized): result = vec1 + vec2 over size1/16
// float16 chunks.
// Fix: the chunk count used 'size/16', but no parameter named 'size' exists
// (the kernel would fail to JIT-compile); every sibling kernel uses size1.
const char * const vector_align16_add =
"__kernel void add(\n"
" __global const float16 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float16 * vec2, \n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float16 * result,\n"
" unsigned int start3,\n"
" unsigned int size3)\n"
"{ \n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] + vec2[i+start2];\n"
"}\n"
; //vector_align16_add
761 
// OpenCL kernel (16-wide vectorized): result = vec1 - vec2 over size1/16
// float16 chunks.
const char * const vector_align16_sub =
"__kernel void sub(\n"
" __global const float16 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float16 * vec2, \n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float16 * result,\n"
" unsigned int start3,\n"
" unsigned int size3)\n"
"{ \n"
" unsigned int i_end = size1 / 16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] - vec2[i+start2];\n"
"}\n"
; //vector_align16_sub
779 
// OpenCL kernel (16-wide vectorized): result = vec * factor, scalar passed by
// value from the host.
const char * const vector_align16_cpu_mult =
"__kernel void cpu_mult(\n"
" __global const float16 * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" float factor, \n"
" __global float16 * result,\n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" result[i+start2] = vec[i+start1] * factor;\n"
"}\n"
; //vector_align16_cpu_mult
795 
// OpenCL kernel (16-wide vectorized): result = vec * (*fac), scalar read from
// device memory.
const char * const vector_align16_mult =
"__kernel void mult(\n"
" __global const float16 * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac, \n"
" __global float16 * result,\n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" float factor = *fac;\n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" result[i+start2] = vec[i+start1] * factor;\n"
"}\n"
; //vector_align16_mult
812 
// OpenCL kernel (16-wide vectorized): result = vec / (*fac) over size1/16
// float16 chunks.
// Fix: the chunk count used 'size/16', but no parameter named 'size' exists
// (the kernel would fail to JIT-compile); every sibling kernel uses size1.
const char * const vector_align16_divide =
"//Note: 'div' cannot be used because of complaints by the jit-compiler\n"
"__kernel void divide(\n"
" __global const float16 * vec,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
" __global float16 * result,\n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" float factor = *fac;\n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" result[i+start2] = vec[i+start1] / factor;\n"
"}\n"
; //vector_align16_divide
830 
// OpenCL kernel (16-wide vectorized): vec1 -= vec2 over size1/16 float16 chunks.
const char * const vector_align16_inplace_sub =
"__kernel void inplace_sub(\n"
" __global float16 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float16 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2) \n"
"{ \n"
" unsigned int i_end = size1/16;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec1[i+start1] -= vec2[i+start2];\n"
"}\n"
; //vector_align16_inplace_sub
845 
// OpenCL kernel (4-wide vectorized): vec1 += vec2 * (*fac) over size1/4
// float4 chunks.
const char * const vector_align4_inplace_mul_add =
"__kernel void inplace_mul_add(\n"
" __global float4 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float4 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global const float * fac) \n"
"{ \n"
" float factor = *fac;\n"
" unsigned int size_div_4 = size1/4;\n"
" for (unsigned int i = get_global_id(0); i < size_div_4; i += get_global_size(0))\n"
" vec1[i+start1] += vec2[i+start2] * factor;\n"
"}\n"
; //vector_align4_inplace_mul_add
862 
// OpenCL kernel (4-wide vectorized): vec1 -= vec2 / (*fac) over size1/4
// float4 chunks.
const char * const vector_align4_inplace_div_sub =
"__kernel void inplace_div_sub(\n"
" __global float4 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float4 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global const float * fac) //CPU variant is mapped to mult_add\n"
"{ \n"
" float factor = *fac;\n"
" unsigned int i_end = size1/4;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec1[i+start1] -= vec2[i+start2] / factor;\n"
"}\n"
; //vector_align4_inplace_div_sub
879 
// OpenCL kernel (4-wide vectorized): vec1 -= vec2 * (*fac) over size1/4
// float4 chunks.
// Fix: the chunk count used 'size/4', but no parameter named 'size' exists
// (the kernel would fail to JIT-compile); sibling kernels use size1.
const char * const vector_align4_inplace_mul_sub =
"__kernel void inplace_mul_sub(\n"
" __global float4 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float4 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global const float * fac) //CPU variant is mapped to mult_add\n"
"{ \n"
" float factor = *fac;\n"
" unsigned int i_end = size1/4;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec1[i+start1] -= vec2[i+start2] * factor;\n"
"}\n"
; //vector_align4_inplace_mul_sub
896 
// OpenCL kernel (4-wide vectorized): result = vec1 * factor + vec2 over
// size1/4 float4 chunks; scalar passed by value from the host.
const char * const vector_align4_cpu_mul_add =
"__kernel void cpu_mul_add(\n"
" __global const float4 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" float factor,\n"
" __global const float4 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float4 * result,\n"
" unsigned int start3,\n"
" unsigned int size3) \n"
"{ \n"
" unsigned int i_end = size1/4;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
"}\n"
; //vector_align4_cpu_mul_add
915 
// OpenCL kernel (4-wide vectorized): result = vec1 * (*fac) + vec2 over
// size1/4 float4 chunks; scalar read from device memory.
const char * const vector_align4_mul_add =
"__kernel void mul_add(\n"
" __global const float4 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float * fac,\n"
" __global const float4 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global float4 * result,\n"
" unsigned int start3,\n"
" unsigned int size3) \n"
"{ \n"
" float factor = *fac;\n"
" unsigned int i_end = size1/4;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
"}\n"
; //vector_align4_mul_add
935 
// OpenCL kernel (4-wide vectorized): vec1 += vec2 / (*fac).
// Fix: the body used '-=', making it identical to vector_align4_inplace_div_sub
// and contradicting the kernel name; the subtracting variant already exists,
// so this one must add (consistent with the align1 fix).
const char * const vector_align4_inplace_div_add =
"__kernel void inplace_div_add(\n"
" __global float4 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float4 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" __global const float * fac) //CPU variant is mapped to mult_add\n"
"{ \n"
" float factor = *fac;\n"
" unsigned int i_end = size1 / 4;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec1[i+start1] += vec2[i+start2] / factor;\n"
"}\n"
; //vector_align4_inplace_div_add
952 
// OpenCL kernel (4-wide vectorized): vec1 += vec2 * factor, scalar passed by
// value from the host.
// Fix: the 'const char * const ... =' declaration line was missing, leaving
// the string literal dangling at namespace scope (a syntax error); restored
// from the trailing '//vector_align4_cpu_inplace_mul_add' comment.
const char * const vector_align4_cpu_inplace_mul_add =
"__kernel void cpu_inplace_mul_add(\n"
" __global float4 * vec1,\n"
" unsigned int start1,\n"
" unsigned int size1,\n"
" __global const float4 * vec2,\n"
" unsigned int start2,\n"
" unsigned int size2,\n"
" float factor) \n"
"{ \n"
" unsigned int i_end = size1/4;\n"
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
" vec1[i+start1] += vec2[i+start2] * factor;\n"
"}\n"
; //vector_align4_cpu_inplace_mul_add
968 
969  } //namespace kernels
970  } //namespace linalg
971 } //namespace viennacl
972 #endif