Open3D (C++ API)  0.15.1
SlabHashBackendImpl.h
1// ----------------------------------------------------------------------------
2// - Open3D: www.open3d.org -
3// ----------------------------------------------------------------------------
4// The MIT License (MIT)
5//
6// Copyright (c) 2018-2021 www.open3d.org
7//
8// Permission is hereby granted, free of charge, to any person obtaining a copy
9// of this software and associated documentation files (the "Software"), to deal
10// in the Software without restriction, including without limitation the rights
11// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12// copies of the Software, and to permit persons to whom the Software is
13// furnished to do so, subject to the following conditions:
14//
15// The above copyright notice and this permission notice shall be included in
16// all copies or substantial portions of the Software.
17//
18// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24// IN THE SOFTWARE.
25// ----------------------------------------------------------------------------
26
27// Copyright 2019 Saman Ashkiani
28//
29// Licensed under the Apache License, Version 2.0 (the "License");
30// you may not use this file except in compliance with the License.
31// You may obtain a copy of the License at
32//
33// http://www.apache.org/licenses/LICENSE-2.0
34//
35// Unless required by applicable law or agreed to in writing, software
36// distributed under the License is distributed on an "AS IS" BASIS,
37// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
38// implied. See the License for the specific language governing permissions
39// and limitations under the License.
40
41#pragma once
42
43#include "open3d/core/CUDAUtils.h"
44#include "open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h"
45#include "open3d/core/hashmap/CUDA/SlabNodeManager.h"
46#include "open3d/core/hashmap/CUDA/SlabTraits.h"
47#include "open3d/core/hashmap/HashBackendBuffer.h"
48
49
50namespace open3d {
51namespace core {
52
53// Each slab contains a collection of uint32_t entries.
54// Each uint32_t entry can represent:
55// 0) an empty placeholder;
56// 1) a stored buf_index;
57// 2) a ptr to the next slab if at the end of the slab.
58// In cases 0) and 1), it is interpreted as a buf_index_t.
59// In case 2), it is interpreted as uint32_t.
60// They are equivalent, but we differentiate them in the implementation to
61// emphasize the differences.
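//
// Illustration: one slab is read by a full warp, one uint32_t entry per lane.
// The lanes selected by kNodePtrLanesMask hold buf_index entries (or
// kEmptyNodeAddr when vacant), while lane kNextSlabPtrLaneId (lane 31) holds
// the pointer to the next slab (or kEmptySlabAddr at the end of the list).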
62
63template <typename Key, typename Hash, typename Eq>
64class SlabHashBackendImpl {
65public:
66    SlabHashBackendImpl();
67
68 __host__ void Setup(int64_t init_buckets,
69 const SlabNodeManagerImpl& node_mgr_impl,
70 const CUDAHashBackendBufferAccessor& buffer_accessor);
71
72    /// Warp-insert a pre-allocated buf_index at key.
73 __device__ bool Insert(bool lane_active,
74 uint32_t lane_id,
75 uint32_t bucket_id,
76 const Key& key,
77 buf_index_t buf_index);
78
79    /// Warp-find a buf_index and its mask at key.
80 __device__ Pair<buf_index_t, bool> Find(bool lane_active,
81 uint32_t lane_id,
82 uint32_t bucket_id,
83 const Key& key);
84
85    /// Warp-erase an entry at key.
86 __device__ Pair<buf_index_t, bool> Erase(bool lane_active,
87 uint32_t lane_id,
88 uint32_t bucket_id,
89 const Key& key);
90
91    /// Warp-synchronize a key in a slab.
92 __device__ void WarpSyncKey(const Key& key, uint32_t lane_id, Key& ret_key);
93
94    /// Warp-find a key in a slab.
95 __device__ int32_t WarpFindKey(const Key& src_key,
96 uint32_t lane_id,
97 uint32_t slab_entry);
98
99    /// Warp-find the first empty slot in a slab.
100 __device__ int32_t WarpFindEmpty(uint32_t slab_entry);
101
102 // Hash function.
103 __device__ int64_t ComputeBucket(const Key& key) const;
104
105 // Node manager.
106 __device__ uint32_t AllocateSlab(uint32_t lane_id);
107 __device__ void FreeSlab(uint32_t slab_ptr);
108
109 // Helpers.
110 __device__ uint32_t* SlabEntryPtr(uint32_t bucket_id,
111 uint32_t lane_id,
112 uint32_t slab_ptr) {
113 return (slab_ptr == kHeadSlabAddr)
114 ? SlabEntryPtrFromHead(bucket_id, lane_id)
115 : SlabEntryPtrFromNodes(slab_ptr, lane_id);
116 }
117
118    __device__ uint32_t* SlabEntryPtrFromNodes(uint32_t slab_ptr,
119                                               uint32_t lane_id) {
120 return node_mgr_impl_.get_unit_ptr_from_slab(slab_ptr, lane_id);
121 }
122    __device__ uint32_t* SlabEntryPtrFromHead(uint32_t bucket_id,
123                                              uint32_t lane_id) {
124 return reinterpret_cast<uint32_t*>(bucket_list_head_) +
125 bucket_id * kWarpSize + lane_id;
126 }
127
128public:
129    Hash hash_fn_;
130    Eq eq_fn_;
131    int64_t bucket_count_;
132
133    Slab* bucket_list_head_;
134    SlabNodeManagerImpl node_mgr_impl_;
135    CUDAHashBackendBufferAccessor buffer_accessor_;
136
137 // TODO: verify size with alignment
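    // WarpSyncKey exchanges a key int-by-int via __shfl_sync, so sizeof(Key)
    // is assumed to be a multiple of sizeof(int).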
138 int key_size_in_int_ = sizeof(Key) / sizeof(int);
139};
140
141/// Kernels.
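// InsertKernelPass0 writes the batch of keys into the key buffer at
// pre-allocated heap slots; InsertKernelPass1 warp-cooperatively links the
// resulting buf_indices into the per-bucket slab lists; InsertKernelPass2
// copies values for successful insertions and frees the buffer slots of
// rejected duplicates.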
142template <typename Key, typename Hash, typename Eq>
143__global__ void InsertKernelPass0(SlabHashBackendImpl<Key, Hash, Eq> impl,
144                                  const void* input_keys,
145 buf_index_t* output_buf_indices,
146 int heap_counter_prev,
147 int64_t count);
148
149template <typename Key, typename Hash, typename Eq>
150__global__ void InsertKernelPass1(SlabHashBackendImpl<Key, Hash, Eq> impl,
151                                  const void* input_keys,
152 buf_index_t* output_buf_indices,
153 bool* output_masks,
154 int64_t count);
155
156template <typename Key, typename Hash, typename Eq, typename block_t>
157__global__ void InsertKernelPass2(SlabHashBackendImpl<Key, Hash, Eq> impl,
158                                  const void* const* input_values_soa,
159 buf_index_t* output_buf_indices,
160 bool* output_masks,
161 int64_t count,
162 int64_t n_values);
163
164template <typename Key, typename Hash, typename Eq>
165__global__ void FindKernel(SlabHashBackendImpl<Key, Hash, Eq> impl,
166                           const void* input_keys,
167 buf_index_t* output_buf_indices,
168 bool* output_masks,
169 int64_t count);
170
171template <typename Key, typename Hash, typename Eq>
172__global__ void EraseKernelPass0(SlabHashBackendImpl<Key, Hash, Eq> impl,
173                                 const void* input_keys,
174 buf_index_t* output_buf_indices,
175 bool* output_masks,
176 int64_t count);
177
178template <typename Key, typename Hash, typename Eq>
179__global__ void EraseKernelPass1(SlabHashBackendImpl<Key, Hash, Eq> impl,
180                                 buf_index_t* output_buf_indices,
181 bool* output_masks,
182 int64_t count);
183
184template <typename Key, typename Hash, typename Eq>
185__global__ void GetActiveIndicesKernel(SlabHashBackendImpl<Key, Hash, Eq> impl,
186                                       buf_index_t* output_buf_indices,
187 uint32_t* output_count);
188
189template <typename Key, typename Hash, typename Eq>
190__global__ void CountElemsPerBucketKernel(
191 SlabHashBackendImpl<Key, Hash, Eq> impl, int64_t* bucket_elem_counts);
192
193template <typename Key, typename Hash, typename Eq>
194SlabHashBackendImpl<Key, Hash, Eq>::SlabHashBackendImpl()
195    : bucket_count_(0), bucket_list_head_(nullptr) {}
196
197template <typename Key, typename Hash, typename Eq>
198__host__ void SlabHashBackendImpl<Key, Hash, Eq>::Setup(
199        int64_t init_buckets,
200 const SlabNodeManagerImpl& allocator_impl,
201 const CUDAHashBackendBufferAccessor& buffer_accessor) {
202 bucket_count_ = init_buckets;
203 node_mgr_impl_ = allocator_impl;
204 buffer_accessor_ = buffer_accessor;
205}
206
207template <typename Key, typename Hash, typename Eq>
208__device__ bool SlabHashBackendImpl<Key, Hash, Eq>::Insert(
209        bool lane_active,
210 uint32_t lane_id,
211 uint32_t bucket_id,
212 const Key& key,
213 buf_index_t buf_index) {
214 uint32_t work_queue = 0;
215 uint32_t prev_work_queue = 0;
216 uint32_t slab_ptr = kHeadSlabAddr;
217 Key src_key;
218
219 bool mask = false;
220
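    // Work-queue pattern: each iteration, the warp ballots the lanes that
    // still need to insert, picks the lowest such lane (src_lane), broadcasts
    // its bucket and key, and all 32 lanes then scan one slab cooperatively.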
221 // > Loop when we have active lanes
222 while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
223 // 0. Restart from linked list head if last insertion is finished
224 slab_ptr = (prev_work_queue != work_queue) ? kHeadSlabAddr : slab_ptr;
225 uint32_t src_lane = __ffs(work_queue) - 1;
226 uint32_t src_bucket =
227 __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
228 WarpSyncKey(key, src_lane, src_key);
229
230 uint32_t slab_entry = *SlabEntryPtr(src_bucket, lane_id, slab_ptr);
231
232 int32_t lane_found = WarpFindKey(src_key, lane_id, slab_entry);
233 int32_t lane_empty = WarpFindEmpty(slab_entry);
234
235        // Branch 1: the key already exists, ABORT
236 if (lane_found >= 0) {
237 if (lane_id == src_lane) {
238 lane_active = false;
239 }
240 }
241
242 // Branch 2: empty slot available, try to insert
243 else if (lane_empty >= 0) {
244            // Cannot merge the two if statements,
245            // otherwise the warp flow will be interrupted.
246 if (lane_id == src_lane) {
247 // Now regard the entry as a value of buf_index
248 const uint32_t* empty_entry_ptr =
249 SlabEntryPtr(src_bucket, lane_empty, slab_ptr);
250
251 uint32_t old_empty_entry_value =
252 atomicCAS((unsigned int*)empty_entry_ptr,
253 kEmptyNodeAddr, buf_index);
254
255 // Branch 2.1: SUCCEED
256 if (old_empty_entry_value == kEmptyNodeAddr) {
257 lane_active = false;
258 mask = true;
259 }
260 // Branch 2.2: failed: RESTART
261                // In the subsequent attempt,
262 // > if the same key was inserted in this slot,
263 // we fall back to Branch 1;
264 // > if a different key was inserted,
265 // we go to Branch 2 or 3.
266 }
267 }
268
269        // Branch 3: nothing found in this slab, go to the next slab
270 else {
271 // broadcast next slab
272 uint32_t next_slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
273 kNextSlabPtrLaneId, kWarpSize);
274
275 // Branch 3.1: next slab existing, RESTART at updated slab ptr
276 if (next_slab_ptr != kEmptySlabAddr) {
277 slab_ptr = next_slab_ptr;
278 }
279
280 // Branch 3.2: next slab empty, try to allocate one from the Slab
281 // buffer.
282 else {
283 // Warp allocate, must be outside the condition clause.
284 uint32_t new_next_slab_ptr = AllocateSlab(lane_id);
285
286 if (lane_id == kNextSlabPtrLaneId) {
287 const uint32_t* next_slab_entry_ptr = SlabEntryPtr(
288 src_bucket, kNextSlabPtrLaneId, slab_ptr);
289
290 uint32_t old_next_slab_entry_value =
291 atomicCAS((unsigned int*)next_slab_entry_ptr,
292 kEmptySlabAddr, new_next_slab_ptr);
293
294                // Branch 3.2.1: another thread has allocated;
295                // RESTART. In the subsequent attempt, go to Branch 2.
296 if (old_next_slab_entry_value != kEmptySlabAddr) {
297 FreeSlab(new_next_slab_ptr);
298 }
299
300                // Branch 3.2.2: this thread allocated successfully.
301                // RESTART, go to Branch 2.
302 }
303 }
304 }
305
306 prev_work_queue = work_queue;
307 }
308
309 return mask;
310}
311
312template <typename Key, typename Hash, typename Eq>
313__device__ Pair<buf_index_t, bool> SlabHashBackendImpl<Key, Hash, Eq>::Find(
314        bool lane_active,
315 uint32_t lane_id,
316 uint32_t bucket_id,
317 const Key& query_key) {
318 uint32_t work_queue = 0;
319 uint32_t prev_work_queue = work_queue;
320 uint32_t slab_ptr = kHeadSlabAddr;
321
322 buf_index_t buf_index = kNullAddr;
323 bool mask = false;
324
325 // > Loop when we have active lanes.
326 while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
327 // 0. Restart from linked list head if the last query is finished.
328 slab_ptr = (prev_work_queue != work_queue) ? kHeadSlabAddr : slab_ptr;
329 uint32_t src_lane = __ffs(work_queue) - 1;
330 uint32_t src_bucket =
331 __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
332
333 Key src_key;
334 WarpSyncKey(query_key, src_lane, src_key);
335
336 // Each lane in the warp reads a unit in the slab in parallel.
337 const uint32_t slab_entry =
338 *SlabEntryPtr(src_bucket, lane_id, slab_ptr);
339
340 int32_t lane_found = WarpFindKey(src_key, lane_id, slab_entry);
341
342 // 1. Found in this slab, SUCCEED.
343 if (lane_found >= 0) {
344 // broadcast found value
345 uint32_t found_buf_index = __shfl_sync(kSyncLanesMask, slab_entry,
346 lane_found, kWarpSize);
347
348 if (lane_id == src_lane) {
349 lane_active = false;
350 buf_index = found_buf_index;
351 mask = true;
352 }
353 }
354
355 // 2. Not found in this slab.
356 else {
357 // Broadcast next slab: lane 31 reads 'next'.
358 uint32_t next_slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
359 kNextSlabPtrLaneId, kWarpSize);
360
361 // 2.1. Next slab is empty, ABORT.
362 if (next_slab_ptr == kEmptySlabAddr) {
363 if (lane_id == src_lane) {
364 lane_active = false;
365 }
366 }
367 // 2.2. Next slab exists, RESTART.
368 else {
369 slab_ptr = next_slab_ptr;
370 }
371 }
372
373 prev_work_queue = work_queue;
374 }
375
376 return make_pair(buf_index, mask);
377}
378
379template <typename Key, typename Hash, typename Eq>
380__device__ Pair<buf_index_t, bool> SlabHashBackendImpl<Key, Hash, Eq>::Erase(
381        bool lane_active,
382 uint32_t lane_id,
383 uint32_t bucket_id,
384 const Key& key) {
385 uint32_t work_queue = 0;
386 uint32_t prev_work_queue = 0;
387 uint32_t slab_ptr = kHeadSlabAddr;
388 Key src_key;
389
390 buf_index_t buf_index = 0;
391 bool mask = false;
392
393 // > Loop when we have active lanes.
394 while ((work_queue = __ballot_sync(kSyncLanesMask, lane_active))) {
395        // 0. Restart from the linked list head if the last erase is finished.
396 slab_ptr = (prev_work_queue != work_queue) ? kHeadSlabAddr : slab_ptr;
397 uint32_t src_lane = __ffs(work_queue) - 1;
398 uint32_t src_bucket =
399 __shfl_sync(kSyncLanesMask, bucket_id, src_lane, kWarpSize);
400
401 WarpSyncKey(key, src_lane, src_key);
402
403 const uint32_t slab_entry =
404 *SlabEntryPtr(src_bucket, lane_id, slab_ptr);
405
406 int32_t lane_found = WarpFindKey(src_key, lane_id, slab_entry);
407
408 // Branch 1: key found.
409 if (lane_found >= 0) {
410 if (lane_id == src_lane) {
411 uint32_t* found_entry_ptr =
412 SlabEntryPtr(src_bucket, lane_found, slab_ptr);
413
414 uint32_t old_found_entry_value = atomicExch(
415 (unsigned int*)found_entry_ptr, kEmptyNodeAddr);
416
417                // Branch 1.2: another thread might have already erased
418                // the entry; avoid a double free.
419 mask = (old_found_entry_value != kEmptyNodeAddr);
420 buf_index = old_found_entry_value;
421 }
422 } else { // no matching slot found:
423 uint32_t next_slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
424 kNextSlabPtrLaneId, kWarpSize);
425 if (next_slab_ptr == kEmptySlabAddr) {
426 // not found:
427 if (lane_id == src_lane) {
428 lane_active = false;
429 }
430 } else {
431 slab_ptr = next_slab_ptr;
432 }
433 }
434 prev_work_queue = work_queue;
435 }
436
437 return make_pair(buf_index, mask);
438}
439
440template <typename Key, typename Hash, typename Eq>
441__device__ void SlabHashBackendImpl<Key, Hash, Eq>::WarpSyncKey(
442        const Key& key, uint32_t lane_id, Key& ret_key) {
443 auto dst_key_ptr = reinterpret_cast<int*>(&ret_key);
444 auto src_key_ptr = reinterpret_cast<const int*>(&key);
445 for (int i = 0; i < key_size_in_int_; ++i) {
446 dst_key_ptr[i] =
447 __shfl_sync(kSyncLanesMask, src_key_ptr[i], lane_id, kWarpSize);
448 }
449}
450
451template <typename Key, typename Hash, typename Eq>
452__device__ int32_t SlabHashBackendImpl<Key, Hash, Eq>::WarpFindKey(
453        const Key& key, uint32_t lane_id, uint32_t slab_entry) {
454 bool is_lane_found =
455 // Select key lanes.
456 ((1 << lane_id) & kNodePtrLanesMask)
457 // Validate key buf_indices.
458 && (slab_entry != kEmptyNodeAddr)
459 // Find keys in buffer. Now slab_entry is interpreted as buf_index.
460 &&
461 eq_fn_(*static_cast<Key*>(buffer_accessor_.GetKeyPtr(slab_entry)),
462 key);
463
464 return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_found)) - 1;
465}
466
467template <typename Key, typename Hash, typename Eq>
468__device__ int32_t
469SlabHashBackendImpl<Key, Hash, Eq>::WarpFindEmpty(uint32_t slab_entry) {
470    bool is_lane_empty = (slab_entry == kEmptyNodeAddr);
471 return __ffs(__ballot_sync(kNodePtrLanesMask, is_lane_empty)) - 1;
472}
473
474template <typename Key, typename Hash, typename Eq>
475__device__ int64_t
476SlabHashBackendImpl<Key, Hash, Eq>::ComputeBucket(const Key& key) const {
477    return hash_fn_(key) % bucket_count_;
478}
479
480template <typename Key, typename Hash, typename Eq>
481__device__ uint32_t
482SlabHashBackendImpl<Key, Hash, Eq>::AllocateSlab(uint32_t lane_id) {
483    return node_mgr_impl_.WarpAllocate(lane_id);
484}
485
486template <typename Key, typename Hash, typename Eq>
487__device__ __forceinline__ void SlabHashBackendImpl<Key, Hash, Eq>::FreeSlab(
488 uint32_t slab_ptr) {
489 node_mgr_impl_.FreeUntouched(slab_ptr);
490}
491
492template <typename Key, typename Hash, typename Eq>
493__global__ void InsertKernelPass0(SlabHashBackendImpl<Key, Hash, Eq> impl,
494                                  const void* input_keys,
495 buf_index_t* output_buf_indices,
496 int heap_counter_prev,
497 int64_t count) {
498 const Key* input_keys_templated = static_cast<const Key*>(input_keys);
499 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
500
501 if (tid < count) {
502 // First write ALL input_keys to avoid potential thread conflicts.
503 buf_index_t buf_index =
504 impl.buffer_accessor_.heap_[heap_counter_prev + tid];
505 void* key = impl.buffer_accessor_.GetKeyPtr(buf_index);
506 *static_cast<Key*>(key) = input_keys_templated[tid];
507 output_buf_indices[tid] = buf_index;
508 }
509}
510
511template <typename Key, typename Hash, typename Eq>
512__global__ void InsertKernelPass1(SlabHashBackendImpl<Key, Hash, Eq> impl,
513                                  const void* input_keys,
514 buf_index_t* output_buf_indices,
515 bool* output_masks,
516 int64_t count) {
517 const Key* input_keys_templated = static_cast<const Key*>(input_keys);
518 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
519 uint32_t lane_id = tid & 0x1F;
520
521 if (tid - lane_id >= count) {
522 return;
523 }
524
525 impl.node_mgr_impl_.Init(tid, lane_id);
526
527 bool lane_active = false;
528 uint32_t bucket_id = 0;
529 buf_index_t buf_index = 0;
530
531 // Dummy for warp sync.
532 Key key;
533 if (tid < count) {
534 lane_active = true;
535 key = input_keys_templated[tid];
536 buf_index = output_buf_indices[tid];
537 bucket_id = impl.ComputeBucket(key);
538 }
539
540 // Index out-of-bound threads still have to run for warp synchronization.
541 bool mask = impl.Insert(lane_active, lane_id, bucket_id, key, buf_index);
542
543 if (tid < count) {
544 output_masks[tid] = mask;
545 }
546}
547
548template <typename Key, typename Hash, typename Eq, typename block_t>
549__global__ void InsertKernelPass2(SlabHashBackendImpl<Key, Hash, Eq> impl,
550                                  const void* const* input_values_soa,
551 buf_index_t* output_buf_indices,
552 bool* output_masks,
553 int64_t count,
554 int64_t n_values) {
555 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
556
557 if (tid < count) {
558 buf_index_t buf_index = output_buf_indices[tid];
559
560 if (output_masks[tid]) {
561 for (int j = 0; j < n_values; ++j) {
562 int64_t blocks_per_element =
563                        impl.buffer_accessor_.value_blocks_per_element_[j];
564
565 block_t* dst_value = static_cast<block_t*>(
566 impl.buffer_accessor_.GetValuePtr(buf_index, j));
567 const block_t* src_value =
568 static_cast<const block_t*>(input_values_soa[j]) +
569 blocks_per_element * tid;
570 for (int b = 0; b < blocks_per_element; ++b) {
571 dst_value[b] = src_value[b];
572 }
573 }
574 } else {
575 impl.buffer_accessor_.DeviceFree(buf_index);
576 }
577 }
578}
579
580template <typename Key, typename Hash, typename Eq>
581__global__ void FindKernel(SlabHashBackendImpl<Key, Hash, Eq> impl,
582                           const void* input_keys,
583 buf_index_t* output_buf_indices,
584 bool* output_masks,
585 int64_t count) {
586 const Key* input_keys_templated = static_cast<const Key*>(input_keys);
587 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
588 uint32_t lane_id = threadIdx.x & 0x1F;
589
590 // This warp is idle.
591 if ((tid - lane_id) >= count) {
592 return;
593 }
594
595 // Initialize the memory allocator on each warp.
596 impl.node_mgr_impl_.Init(tid, lane_id);
597
598 bool lane_active = false;
599 uint32_t bucket_id = 0;
600
601 // Dummy for warp sync
602 Key key;
603    Pair<buf_index_t, bool> result;
604
605 if (tid < count) {
606 lane_active = true;
607 key = input_keys_templated[tid];
608 bucket_id = impl.ComputeBucket(key);
609 }
610
611 result = impl.Find(lane_active, lane_id, bucket_id, key);
612
613 if (tid < count) {
614 output_buf_indices[tid] = result.first;
615 output_masks[tid] = result.second;
616 }
617}
618
619template <typename Key, typename Hash, typename Eq>
620__global__ void EraseKernelPass0(SlabHashBackendImpl<Key, Hash, Eq> impl,
621                                 const void* input_keys,
622 buf_index_t* output_buf_indices,
623 bool* output_masks,
624 int64_t count) {
625 const Key* input_keys_templated = static_cast<const Key*>(input_keys);
626 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
627 uint32_t lane_id = threadIdx.x & 0x1F;
628
629 if (tid - lane_id >= count) {
630 return;
631 }
632
633 impl.node_mgr_impl_.Init(tid, lane_id);
634
635 bool lane_active = false;
636 uint32_t bucket_id = 0;
637
638 // Dummy for warp sync
639 Key key;
640 if (tid < count) {
641 lane_active = true;
642 key = input_keys_templated[tid];
643 bucket_id = impl.ComputeBucket(key);
644 }
645
646 auto result = impl.Erase(lane_active, lane_id, bucket_id, key);
647
648 if (tid < count) {
649 output_buf_indices[tid] = result.first;
650 output_masks[tid] = result.second;
651 }
652}
653
654template <typename Key, typename Hash, typename Eq>
655__global__ void EraseKernelPass1(SlabHashBackendImpl<Key, Hash, Eq> impl,
656                                 buf_index_t* output_buf_indices,
657 bool* output_masks,
658 int64_t count) {
659 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
660 if (tid < count && output_masks[tid]) {
661 impl.buffer_accessor_.DeviceFree(output_buf_indices[tid]);
662 }
663}
664
665template <typename Key, typename Hash, typename Eq>
666__global__ void GetActiveIndicesKernel(SlabHashBackendImpl<Key, Hash, Eq> impl,
667                                       buf_index_t* output_buf_indices,
668 uint32_t* output_count) {
669 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
670 uint32_t lane_id = threadIdx.x & 0x1F;
671
672 // Assigning a warp per bucket.
673 uint32_t bucket_id = tid >> 5;
674 if (bucket_id >= impl.bucket_count_) {
675 return;
676 }
677
678 impl.node_mgr_impl_.Init(tid, lane_id);
679
680 uint32_t slab_entry = *impl.SlabEntryPtrFromHead(bucket_id, lane_id);
681 bool is_active = slab_entry != kEmptyNodeAddr;
682
683 if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
684 uint32_t index = atomicAdd(output_count, 1);
685 output_buf_indices[index] = slab_entry;
686 }
687
688 uint32_t slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
689 kNextSlabPtrLaneId, kWarpSize);
690
691    // Collect indices from the following nodes.
692 while (slab_ptr != kEmptySlabAddr) {
693 slab_entry = *impl.SlabEntryPtrFromNodes(slab_ptr, lane_id);
694 is_active = (slab_entry != kEmptyNodeAddr);
695
696 if (is_active && ((1 << lane_id) & kNodePtrLanesMask)) {
697 uint32_t index = atomicAdd(output_count, 1);
698 output_buf_indices[index] = slab_entry;
699 }
700 slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry, kNextSlabPtrLaneId,
701 kWarpSize);
702 }
703}
704
705template <typename Key, typename Hash, typename Eq>
706__global__ void CountElemsPerBucketKernel(
707        SlabHashBackendImpl<Key, Hash, Eq> impl, int64_t* bucket_elem_counts) {
708 uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
709 uint32_t lane_id = threadIdx.x & 0x1F;
710
711 // Assigning a warp per bucket.
712 uint32_t bucket_id = tid >> 5;
713 if (bucket_id >= impl.bucket_count_) {
714 return;
715 }
716
717 impl.node_mgr_impl_.Init(tid, lane_id);
718
719 uint32_t count = 0;
720
721 // Count head node.
722 uint32_t slab_entry = *impl.SlabEntryPtrFromHead(bucket_id, lane_id);
723 count += __popc(
724 __ballot_sync(kNodePtrLanesMask, slab_entry != kEmptyNodeAddr));
725 uint32_t slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry,
726 kNextSlabPtrLaneId, kWarpSize);
727
728 // Count following nodes.
729 while (slab_ptr != kEmptySlabAddr) {
730 slab_entry = *impl.SlabEntryPtrFromNodes(slab_ptr, lane_id);
731 count += __popc(
732 __ballot_sync(kNodePtrLanesMask, slab_entry != kEmptyNodeAddr));
733 slab_ptr = __shfl_sync(kSyncLanesMask, slab_entry, kNextSlabPtrLaneId,
734 kWarpSize);
735 }
736
737 // Write back the results.
738 if (lane_id == 0) {
739 bucket_elem_counts[bucket_id] = count;
740 }
741}
742
743} // namespace core
744} // namespace open3d
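
The kernels above are meant to be launched from host code. As a minimal usage sketch, assuming the header's include path, the helper name LaunchBatchedInsert, a 128-thread launch configuration, and int as the value block type (none of these come from this header; in Open3D the dispatch is done by the host-side slab hash backend), one batched insertion could be sequenced as follows:

// Hypothetical helper, not Open3D API: sequence the three insert passes for a
// batch of `count` keys. All pointers are device pointers sized to `count`.
#include "open3d/core/hashmap/CUDA/SlabHashBackendImpl.h"

using namespace open3d::core;

template <typename Key, typename Hash, typename Eq>
void LaunchBatchedInsert(SlabHashBackendImpl<Key, Hash, Eq> impl,
                         const void* input_keys,
                         const void* const* input_values_soa,
                         buf_index_t* output_buf_indices,
                         bool* output_masks,
                         int heap_counter_prev,
                         int64_t count,
                         int64_t n_values) {
    if (count == 0) return;
    const uint32_t block_size = 128;  // assumed launch configuration
    const uint32_t num_blocks =
            static_cast<uint32_t>((count + block_size - 1) / block_size);

    // Pass 0: write the keys into the key buffer at the pre-allocated heap
    // slots and record their buf_indices.
    InsertKernelPass0<<<num_blocks, block_size>>>(
            impl, input_keys, output_buf_indices, heap_counter_prev, count);

    // Pass 1: warp-cooperatively link the buf_indices into the per-bucket
    // slab lists; output_masks marks which insertions succeeded.
    InsertKernelPass1<<<num_blocks, block_size>>>(
            impl, input_keys, output_buf_indices, output_masks, count);

    // Pass 2: copy values for successful insertions and free the buffer slots
    // of rejected duplicates. Using int as block_t here is an assumption; the
    // backend chooses block_t from the value element size.
    InsertKernelPass2<Key, Hash, Eq, int><<<num_blocks, block_size>>>(
            impl, input_values_soa, output_buf_indices, output_masks, count,
            n_values);
}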