Open3D (C++ API)  0.15.1
SlabNodeManager.h
Go to the documentation of this file.
1// ----------------------------------------------------------------------------
2// - Open3D: www.open3d.org -
3// ----------------------------------------------------------------------------
4// The MIT License (MIT)
5//
6// Copyright (c) 2018-2021 www.open3d.org
7//
8// Permission is hereby granted, free of charge, to any person obtaining a copy
9// of this software and associated documentation files (the "Software"), to deal
10// in the Software without restriction, including without limitation the rights
11// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12// copies of the Software, and to permit persons to whom the Software is
13// furnished to do so, subject to the following conditions:
14//
15// The above copyright notice and this permission notice shall be included in
16// all copies or substantial portions of the Software.
17//
18// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24// IN THE SOFTWARE.
25// ----------------------------------------------------------------------------
26
27// Copyright 2019 Saman Ashkiani
28// Rewritten by Wei Dong 2019 - 2020
29// Licensed under the Apache License, Version 2.0 (the "License");
30// you may not use this file except in compliance with the License.
31// You may obtain a copy of the License at
32//
33// http://www.apache.org/licenses/LICENSE-2.0
34//
35// Unless required by applicable law or agreed to in writing, software
36// distributed under the License is distributed on an "AS IS" BASIS,
37// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
38// implied. See the License for the specific language governing permissions
39// and limitations under the License.
40
41#pragma once
42
43#include <thrust/device_vector.h>
44
45#include <memory>
46#include <random>
47
52
53namespace open3d {
54namespace core {
55
58class Slab {
59public:
62 buf_index_t kv_pair_ptrs[kWarpSize - 1];
65};
66
68public:
70 : super_blocks_(nullptr),
71 hash_coef_(0),
72 num_attempts_(0),
73 memory_block_index_(0),
74 super_block_index_(0) {}
75
76 __device__ __forceinline__ uint32_t* get_unit_ptr_from_slab(
77 const buf_index_t& next_slab_ptr, const uint32_t& lane_id) {
78 return super_blocks_ + addressDecoder(next_slab_ptr) + lane_id;
79 }
80 __device__ __forceinline__ uint32_t* get_ptr_for_bitmap(
81 const uint32_t super_block_idx, const uint32_t bitmap_idx) {
82 return super_blocks_ + super_block_idx * kUIntsPerSuperBlock +
83 bitmap_idx;
84 }
85
86 // Objective: each warp selects its own memory_block warp allocator.
87 __device__ void Init(uint32_t& tid, uint32_t& lane_id) {
88 // Hashing the memory block to be used.
89 createMemBlockIndex(tid >> 5);
90
91 // Loading the assigned memory block.
92 memory_block_bitmap_ =
93 super_blocks_[super_block_index_ * kUIntsPerSuperBlock +
94 memory_block_index_ * kSlabsPerBlock + lane_id];
95 }
96
97 __device__ uint32_t WarpAllocate(const uint32_t& lane_id) {
98 // Try and allocate a new memory units within the memory_block memory
99 // block if it returns 0xFFFFFFFF, then there was not any empty memory
100 // unit a new memory_block block should be chosen, and repeat again
101 // allocated result: 5 bits: super_block_index
102 // 17 bits: memory block index
103 // 5 bits: memory unit index (hi-bits of 10bit)
104 // 5 bits: memory unit index (lo-bits of 10bit)
105 int empty_lane = -1;
106 uint32_t free_lane;
107 uint32_t read_bitmap = memory_block_bitmap_;
108 uint32_t allocated_result = kNotFoundFlag;
109 // Works as long as <31 bit are used in the allocated_result
110 // in other words, if there are 32 super blocks and at most 64k blocks
111 // per super block.
112
113 while (allocated_result == kNotFoundFlag) {
114 empty_lane = __ffs(~memory_block_bitmap_) - 1;
115 free_lane = __ballot_sync(kSyncLanesMask, empty_lane >= 0);
116 if (free_lane == 0) {
117 // all bitmaps are full: need to be rehashed again.
118 updateMemBlockIndex((threadIdx.x + blockIdx.x * blockDim.x) >>
119 5);
120 read_bitmap = memory_block_bitmap_;
121 continue;
122 }
123 uint32_t src_lane = __ffs(free_lane) - 1;
124 if (src_lane == lane_id) {
125 read_bitmap = atomicCAS(
127 super_block_index_ * kUIntsPerSuperBlock +
128 memory_block_index_ * kSlabsPerBlock + lane_id,
129 memory_block_bitmap_,
130 memory_block_bitmap_ | (1 << empty_lane));
131 if (read_bitmap == memory_block_bitmap_) {
132 // Successful attempt.
133 memory_block_bitmap_ |= (1 << empty_lane);
134 allocated_result =
135 (super_block_index_ << kSuperBlockMaskBits) |
136 (memory_block_index_ << kBlockMaskBits) |
137 (lane_id << kSlabMaskBits) | empty_lane;
138 } else {
139 // Not successful: updating the current bitmap.
140 memory_block_bitmap_ = read_bitmap;
141 }
142 }
143 // Asking for the allocated result.
144 allocated_result =
145 __shfl_sync(kSyncLanesMask, allocated_result, src_lane);
146 }
147 return allocated_result;
148 }
149
150 // This function, frees a recently allocated memory unit by a single thread.
151 // Since it is untouched, there shouldn't be any worries for the actual
152 // memory contents to be reset again.
153 __device__ void FreeUntouched(buf_index_t ptr) {
154 atomicAnd(super_blocks_ +
155 getSuperBlockIndex(ptr) * kUIntsPerSuperBlock +
156 getMemBlockIndex(ptr) * kSlabsPerBlock +
157 (getMemUnitIndex(ptr) >> 5),
158 ~(1 << (getMemUnitIndex(ptr) & 0x1F)));
159 }
160
161private:
162 __device__ __host__ __forceinline__ uint32_t
163 getSuperBlockIndex(buf_index_t address) const {
164 return address >> kSuperBlockMaskBits;
165 }
166 __device__ __host__ __forceinline__ uint32_t
167 getMemBlockIndex(buf_index_t address) const {
168 return ((address >> kBlockMaskBits) & 0x1FFFF);
169 }
170 __device__ __host__ __forceinline__ buf_index_t
171 getMemBlockAddress(buf_index_t address) const {
172 return (kBitmapsPerSuperBlock +
173 getMemBlockIndex(address) * kUIntsPerBlock);
174 }
175 __device__ __host__ __forceinline__ uint32_t
176 getMemUnitIndex(buf_index_t address) const {
177 return address & 0x3FF;
178 }
179 __device__ __host__ __forceinline__ buf_index_t
180 getMemUnitAddress(buf_index_t address) {
181 return getMemUnitIndex(address) * kWarpSize;
182 }
183
184 // Called at the beginning of the kernel.
185 __device__ void createMemBlockIndex(uint32_t global_warp_id) {
186 super_block_index_ = global_warp_id % kSuperBlocks;
187 memory_block_index_ = (hash_coef_ * global_warp_id) >>
188 (32 - kBlocksPerSuperBlockInBits);
189 }
190
191 // Called when the allocator fails to find an empty unit to allocate.
192 __device__ void updateMemBlockIndex(uint32_t global_warp_id) {
193 num_attempts_++;
194 super_block_index_++;
195 super_block_index_ =
196 (super_block_index_ == kSuperBlocks) ? 0 : super_block_index_;
197 memory_block_index_ = (hash_coef_ * (global_warp_id + num_attempts_)) >>
198 (32 - kBlocksPerSuperBlockInBits);
199 // Loading the assigned memory block.
200 memory_block_bitmap_ =
201 *((super_blocks_ + super_block_index_ * kUIntsPerSuperBlock) +
202 memory_block_index_ * kSlabsPerBlock + (threadIdx.x & 0x1f));
203 }
204
205 __host__ __device__ buf_index_t
206 addressDecoder(buf_index_t address_ptr_index) {
207 return getSuperBlockIndex(address_ptr_index) * kUIntsPerSuperBlock +
208 getMemBlockAddress(address_ptr_index) +
209 getMemUnitIndex(address_ptr_index) * kWarpSize;
210 }
211
212 __host__ __device__ void print_address(buf_index_t address_ptr_index) {
213 printf("Super block Index: %d, Memory block index: %d, Memory unit "
214 "index: "
215 "%d\n",
216 getSuperBlockIndex(address_ptr_index),
217 getMemBlockIndex(address_ptr_index),
218 getMemUnitIndex(address_ptr_index));
219 }
220
221public:
225 uint32_t hash_coef_; // A random 32-bit.
226
227private:
229 uint32_t num_attempts_;
230 uint32_t memory_block_index_;
231 uint32_t memory_block_bitmap_;
232 uint32_t super_block_index_;
233};
234
236 uint32_t* slabs_per_superblock);
237
239public:
240 SlabNodeManager(const Device& device) : device_(device) {
242 std::mt19937 rng(time(0));
243 impl_.hash_coef_ = rng();
244
248 kUIntsPerSuperBlock * kSuperBlocks * sizeof(uint32_t),
249 device_));
250 Reset();
251 }
252
254
255 void Reset() {
256 OPEN3D_CUDA_CHECK(cudaMemset(
257 impl_.super_blocks_, 0xFF,
258 kUIntsPerSuperBlock * kSuperBlocks * sizeof(uint32_t)));
259
260 for (uint32_t i = 0; i < kSuperBlocks; i++) {
261 // setting bitmaps into zeros:
262 OPEN3D_CUDA_CHECK(cudaMemset(
263 impl_.super_blocks_ + i * kUIntsPerSuperBlock, 0x00,
264 kBlocksPerSuperBlock * kSlabsPerBlock * sizeof(uint32_t)));
265 }
267 OPEN3D_CUDA_CHECK(cudaGetLastError());
268 }
269
270 std::vector<int> CountSlabsPerSuperblock() {
271 const uint32_t num_super_blocks = kSuperBlocks;
272
273 thrust::device_vector<uint32_t> slabs_per_superblock(kSuperBlocks);
274 thrust::fill(slabs_per_superblock.begin(), slabs_per_superblock.end(),
275 0);
276
277 // Counting total number of allocated memory units.
278 int num_mem_units = kBlocksPerSuperBlock * 32;
279 int num_cuda_blocks =
280 (num_mem_units + kThreadsPerBlock - 1) / kThreadsPerBlock;
281 CountSlabsPerSuperblockKernel<<<num_cuda_blocks, kThreadsPerBlock, 0,
282 core::cuda::GetStream()>>>(
283 impl_, thrust::raw_pointer_cast(slabs_per_superblock.data()));
285 OPEN3D_CUDA_CHECK(cudaGetLastError());
286
287 std::vector<int> result(num_super_blocks);
288 thrust::copy(slabs_per_superblock.begin(), slabs_per_superblock.end(),
289 result.begin());
290
291 return result;
292 }
293
294public:
297};
298} // namespace core
299} // namespace open3d
Common CUDA utilities.
#define OPEN3D_CUDA_CHECK(err)
Definition: CUDAUtils.h:66
Definition: Device.h:39
static void * Malloc(size_t byte_size, const Device &device)
Definition: MemoryManager.cpp:41
static void Free(void *ptr, const Device &device)
Frees previously allocated memory at address ptr on device device.
Definition: MemoryManager.cpp:47
Definition: SlabNodeManager.h:58
buf_index_t kv_pair_ptrs[kWarpSize - 1]
Definition: SlabNodeManager.h:62
buf_index_t next_slab_ptr
An internal ptr managed by InternalNodeManager.
Definition: SlabNodeManager.h:64
Definition: SlabNodeManager.h:238
std::vector< int > CountSlabsPerSuperblock()
Definition: SlabNodeManager.h:270
SlabNodeManager(const Device &device)
Definition: SlabNodeManager.h:240
void Reset()
Definition: SlabNodeManager.h:255
~SlabNodeManager()
Definition: SlabNodeManager.h:253
SlabNodeManagerImpl impl_
Definition: SlabNodeManager.h:295
Device device_
Definition: SlabNodeManager.h:296
Definition: SlabNodeManager.h:67
uint32_t * super_blocks_
A pointer to each super-block.
Definition: SlabNodeManager.h:223
__device__ void Init(uint32_t &tid, uint32_t &lane_id)
Definition: SlabNodeManager.h:87
__device__ __forceinline__ uint32_t * get_unit_ptr_from_slab(const buf_index_t &next_slab_ptr, const uint32_t &lane_id)
Definition: SlabNodeManager.h:76
uint32_t hash_coef_
hash_coef (register): used as (16 bits, 16 bits) for hashing.
Definition: SlabNodeManager.h:225
__device__ __forceinline__ uint32_t * get_ptr_for_bitmap(const uint32_t super_block_idx, const uint32_t bitmap_idx)
Definition: SlabNodeManager.h:80
__device__ void FreeUntouched(buf_index_t ptr)
Definition: SlabNodeManager.h:153
__device__ uint32_t WarpAllocate(const uint32_t &lane_id)
Definition: SlabNodeManager.h:97
SlabNodeManagerImpl()
Definition: SlabNodeManager.h:69
void Synchronize()
Definition: CUDAUtils.cpp:78
uint32_t buf_index_t
Definition: HashBackendBuffer.h:63
__global__ void CountSlabsPerSuperblockKernel(SlabNodeManagerImpl impl, uint32_t *slabs_per_superblock)
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle uint32_t
Definition: K4aPlugin.cpp:567
Definition: PinholeCameraIntrinsic.cpp:35