赞
踩
rasterizer_impl.cu:
// 查找最高有效位(most significant bit),输入变量n表示tile编号最大值x、y的乘积 uint32_t getHigherMsb(uint32_t n) { uint32_t msb = sizeof(n) * 4; //4*4=16 uint32_t step = msb; while (step > 1) { step /= 2; //缩小2倍 if (n >> msb) //右移16位,相当于除以2的16次方 msb += step; else msb -= step; } if (n >> msb) //如果n的最高位大于0,则msb+1 msb++; return msb; } // Wrapper method to call auxiliary coarse frustum containment test. // Mark all Gaussians that pass it. __global__ void checkFrustum(int P, const float* orig_points, const float* viewmatrix, const float* projmatrix, bool* present) { auto idx = cg::this_grid().thread_rank(); if (idx >= P) return; float3 p_view; present[idx] = in_frustum(idx, orig_points, viewmatrix, projmatrix, false, p_view); } //计算2d高斯椭圆中心点points_xy在2d像素平面上占据的tile的tileID,并将tileID|depth组合成64位的key值,value值为高斯球的编号 __global__ void duplicateWithKeys( int P, const float2* points_xy, const float* depths, const uint32_t* offsets, //累计的tiles数量的数组 uint64_t* gaussian_keys_unsorted, //未排序的key(tileID|depth) uint32_t* gaussian_values_unsorted, //未排序的valu(depth) int* radii, //高斯球的半径 dim3 grid) //block编号的xy两个极大值 { auto idx = cg::this_grid().thread_rank(); if (idx >= P) return; // Generate no key/value pair for invisible Gaussians if (radii[idx] > 0) { // Find this Gaussian's offset in buffer for writing keys/values. uint32_t off = (idx == 0) ? 0 : offsets[idx - 1]; //第idx个高斯球前面已经占据的tiles总数 uint2 rect_min, rect_max; //计算像素点points_xy[idx]在半径为radii[idx]的圆所占据的网格编号的最小值和最大值 getRect(points_xy[idx], radii[idx], rect_min, rect_max, grid); // For each tile that the bounding rect overlaps, emit a // key/value pair. The key is | tile ID | depth |, // and the value is the ID of the Gaussian. Sorting the values // with this key yields Gaussian IDs in a list, such that they // are first sorted by tile and then by depth. for (int y = rect_min.y; y < rect_max.y; y++) { for (int x = rect_min.x; x < rect_max.x; x++) { uint64_t key = y * grid.x + x; //计算当前block所在的tileID key <<= 32; //左移32位,把后面的32位空出来 key |= *((uint32_t*)&depths[idx]); //后32位存高斯的深度值 //把上步得到的key,存到第off个(也即是当前高斯球前面的所有高斯所占据的tiles数量)gaussian_keys_unsorted的uint64_t数组中 gaussian_keys_unsorted[off] = key; //values设置为idx gaussian_values_unsorted[off] = idx; off++; } } } } // Check keys to see if it is at the start/end of one tile's range in // the full sorted list. If yes, write start/end of this tile. // Run once per instanced (duplicated) Gaussian ID. //一个thread处理一个point_list_keys中的tile,总共L个tile;point_list_keys:已经排序过的key列表,tileID从小到大排列(优先),depth从小到大排列; //ranges:每一项存储对应tile的的id范围[0,L-1],这个id表示的是在point_list_keys中的索引,通过binningState.point_list找到对应高斯球编号 //例如:point_list_keys值如下:tileID:0 0 0 0 1 1 1 2 2 3 4 4... // depth: 1 2 3 4 1 4 5 3 4 2 3 5... ,那么point_list_keys[0]中的tileID即为0,ranges[0].x = 0 __global__ void identifyTileRanges(int L, uint64_t* point_list_keys, uint2* ranges) { auto idx = cg::this_grid().thread_rank(); if (idx >= L) return; // Read tile ID from key. Update start/end of tile range if at limit. uint64_t key = point_list_keys[idx]; uint32_t currtile = key >> 32; //key右移32位,得到tileID if (idx == 0) //第0个point_list_keys,也即tileID最小且depth最小的point_list_keys项 ranges[currtile].x = 0; //tileID最小的ranges项的x值赋值为0 else //如果idx=4,那么currtile=1,prevtile=0,就设置ranges[0].y=4,ranges[1].x=4, { uint32_t prevtile = point_list_keys[idx - 1] >> 32; //前一个tileID if (currtile != prevtile) //与前一个tileID不相等的话,设置前一个ranges.y和自己的ranges.x { ranges[prevtile].y = idx; ranges[currtile].x = idx; } } if (idx == L - 1) //最后一个point_list_keys项的话,设置ranges[currtile].y为L ranges[currtile].y = L; } // Mark Gaussians as visible/invisible, based on view frustum testing void CudaRasterizer::Rasterizer::markVisible( int P, float* means3D, float* viewmatrix, float* projmatrix, bool* present) { checkFrustum << <(P + 255) / 256, 256 >> > ( P, means3D, viewmatrix, projmatrix, present); } //给GeometryState geom分配内存,P:高斯球数量 CudaRasterizer::GeometryState CudaRasterizer::GeometryState::fromChunk(char*& chunk, size_t P) { GeometryState geom; obtain(chunk, geom.depths, P, 128); obtain(chunk, geom.clamped, P * 3, 128); obtain(chunk, geom.internal_radii, P, 128); obtain(chunk, geom.means2D, P, 128); obtain(chunk, geom.cov3D, P * 6, 128); obtain(chunk, geom.conic_opacity, P, 128); obtain(chunk, geom.rgb, P * 3, 128); obtain(chunk, geom.tiles_touched, P, 128); //计算数组的前缀和,InclusiveSum表示包括自身,ExclusiveSum表示不包括自身 //参数:第三个in,第四个out,最后一个num。当第一个参数为NULL时, 所需的分配大小被写入第二个参数,并且不执行任何工作 //https://github.com/dmlc/cub/blob/master/cub/device/device_scan.cuh cub::DeviceScan::InclusiveSum(nullptr, geom.scan_size, geom.tiles_touched, geom.tiles_touched, P); //计算所需内存大小 obtain(chunk, geom.scanning_space, geom.scan_size, 128); obtain(chunk, geom.point_offsets, P, 128); return geom; } //给ImageState img分配内存,N:图片中tile的总数 CudaRasterizer::ImageState CudaRasterizer::ImageState::fromChunk(char*& chunk, size_t N) { ImageState img; obtain(chunk, img.accum_alpha, N, 128); obtain(chunk, img.n_contrib, N, 128); obtain(chunk, img.ranges, N, 128); return img; } //给BinningState binning分配内存,表示装箱状态,P:高斯球数量 CudaRasterizer::BinningState CudaRasterizer::BinningState::fromChunk(char*& chunk, size_t P) { BinningState binning; obtain(chunk, binning.point_list, P, 128); obtain(chunk, binning.point_list_unsorted, P, 128); obtain(chunk, binning.point_list_keys, P, 128); obtain(chunk, binning.point_list_keys_unsorted, P, 128); cub::DeviceRadixSort::SortPairs( //基数排序, nullptr, binning.sorting_size, binning.point_list_keys_unsorted, binning.point_list_keys, binning.point_list_unsorted, binning.point_list, P); obtain(chunk, binning.list_sorting_space, binning.sorting_size, 128); return binning; } // Forward rendering procedure for differentiable rasterization // of Gaussians. int CudaRasterizer::Rasterizer::forward( std::function<char* (size_t)> geometryBuffer, std::function<char* (size_t)> binningBuffer, std::function<char* (size_t)> imageBuffer, const int P, int D, int M, const float* background, const int width, int height, const float* means3D, const float* shs, const float* colors_precomp, const float* opacities, const float* scales, const float scale_modifier, const float* rotations, const float* cov3D_precomp, const float* viewmatrix, const float* projmatrix, const float* cam_pos, const float tan_fovx, float tan_fovy, const bool prefiltered, float* out_color, int* radii, bool debug) { //计算焦距fx,fy const float focal_y = height / (2.0f * tan_fovy); const float focal_x = width / (2.0f * tan_fovx); //模版函数required调用fromChunk函数来获取内存,返回结束地址,也即所需存储大小 size_t chunk_size = required<GeometryState>(P); //调用rasterize_points.cu文件中的resizeFunctional函数里面嵌套的匿名函数lambda来调整显存块大小,并返回首地址 char* chunkptr = geometryBuffer(chunk_size); // 用显存块首地址作为参数,调用本文件上的fromChunk函数来申请显存 GeometryState geomState = GeometryState::fromChunk(chunkptr, P); if (radii == nullptr) { radii = geomState.internal_radii; } //网格中block块的数量,确保覆盖全图 dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1); //块中线程数量,16*16 dim3 block(BLOCK_X, BLOCK_Y, 1); // Dynamically resize image-based auxiliary buffers during training //模版函数required调用fromChunk函数来获取内存,返回结束地址,也即所需存储大小 size_t img_chunk_size = required<ImageState>(width * height); char* img_chunkptr = imageBuffer(img_chunk_size); ImageState imgState = ImageState::fromChunk(img_chunkptr, width * height); if (NUM_CHANNELS != 3 && colors_precomp == nullptr) { throw std::runtime_error("For non-RGB, provide precomputed Gaussian colors!"); } // Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB) CHECK_CUDA(FORWARD::preprocess( P, D, M, means3D, (glm::vec3*)scales, scale_modifier, (glm::vec4*)rotations, opacities, shs, geomState.clamped, cov3D_precomp, colors_precomp, viewmatrix, projmatrix, (glm::vec3*)cam_pos, width, height, focal_x, focal_y, tan_fovx, tan_fovy, radii, geomState.means2D, geomState.depths, geomState.cov3D, geomState.rgb, geomState.conic_opacity, tile_grid, geomState.tiles_touched, prefiltered ), debug) // Compute prefix sum over full list of touched tile counts by Gaussians // E.g., [2, 3, 0, 2, 1] -> [2, 5, 5, 7, 8] //同步运行InclusiveSum,获取tiles_touched数组的前缀和,存到point_offsets中 CHECK_CUDA(cub::DeviceScan::InclusiveSum(geomState.scanning_space, geomState.scan_size, geomState.tiles_touched, geomState.point_offsets, P), debug) // Retrieve total number of Gaussian instances to launch and resize aux buffers int num_rendered; // 把指针(point_offsets + P - 1),也就是point_offsets数组的最后一个元素的值,赋给num_rendered,也就是总共覆盖的tiles数量 CHECK_CUDA(cudaMemcpy(&num_rendered, geomState.point_offsets + P - 1, sizeof(int), cudaMemcpyDeviceToHost), debug); //计算所需的BinningState的数量,即每个高斯球覆盖的tile都有对应的装箱状态BinningState数据 size_t binning_chunk_size = required<BinningState>(num_rendered); //调整显存块大小,并返回首地址 char* binning_chunkptr = binningBuffer(binning_chunk_size); //用显存块首地址作为参数,调用fromChunk函数来申请显存 BinningState binningState = BinningState::fromChunk(binning_chunkptr, num_rendered); // For each instance to be rendered, produce adequate [ tile | depth ] key // and corresponding dublicated Gaussian indices to be sorted duplicateWithKeys << <(P + 255) / 256, 256 >> > ( P, geomState.means2D, geomState.depths, geomState.point_offsets, // 这里用到上步InclusiveSum得到的累计高斯球touch的tiles数 binningState.point_list_keys_unsorted, // 存储key(tileID|depth) binningState.point_list_unsorted, // 存储对应的高斯球idx radii, // 像素平面上高斯圆的半径,最长轴的3倍 tile_grid) // 全图中tile的数量 CHECK_CUDA(, debug) // 同步,并检查错误 //查找tile_grid.x * tile_grid.y的最高位 int bit = getHigherMsb(tile_grid.x * tile_grid.y); // Sort complete list of (duplicated) Gaussian indices by keys//依据key进行基数排序, //https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html CHECK_CUDA(cub::DeviceRadixSort::SortPairs( binningState.list_sorting_space, //辅助空间 binningState.sorting_size, //辅助空间大小 binningState.point_list_keys_unsorted, binningState.point_list_keys, //d_keys_in,d_keys_out binningState.point_list_unsorted, binningState.point_list, //d_values_in, d_values_out num_rendered, 0, 32 + bit), debug) //总共覆盖的tiles数量,开始bit位,结束比特位 //imgState.ranges的值设置成0,长度为tile_grid.x * tile_grid.y * sizeof(uint2) CHECK_CUDA(cudaMemset(imgState.ranges, 0, tile_grid.x * tile_grid.y * sizeof(uint2)), debug); // Identify start and end of per-tile workloads in sorted list if (num_rendered > 0) identifyTileRanges << <(num_rendered + 255) / 256, 256 >> > ( num_rendered, binningState.point_list_keys, imgState.ranges); CHECK_CUDA(, debug) // Let each tile blend its range of Gaussians independently in parallel const float* feature_ptr = colors_precomp != nullptr ? colors_precomp : geomState.rgb; CHECK_CUDA(FORWARD::render( tile_grid, block, imgState.ranges, binningState.point_list, width, height, geomState.means2D, feature_ptr, geomState.conic_opacity, imgState.accum_alpha, imgState.n_contrib, background, out_color), debug) return num_rendered; }
forward.cu:
// 2D协方差矩阵计算的前向版本,mean:高斯点在世界坐标系的坐标,focla_x,focal_y:相机焦距,tan_fovx,tan_fovy:相机视场角的tan值,cov3D:3d椭球的协方差矩阵,viewmatrix:视角变换矩阵 // 计算结果存在cov中,并返回 __device__ float3 computeCov2D(const float3& mean, float focal_x, float focal_y, float tan_fovx, float tan_fovy, const float* cov3D, const float* viewmatrix) { // The following models the steps outlined by equations 29 // and 31 in "EWA Splatting" (Zwicker et al., 2002). // Additionally considers aspect / scaling of viewport. // Transposes used to account for row-/column-major conventions. float3 t = transformPoint4x3(mean, viewmatrix); const float limx = 1.3f * tan_fovx; const float limy = 1.3f * tan_fovy; const float txtz = t.x / t.z; const float tytz = t.y / t.z; t.x = min(limx, max(-limx, txtz)) * t.z; t.y = min(limy, max(-limy, tytz)) * t.z; glm::mat3 J = glm::mat3( focal_x / t.z, 0.0f, -(focal_x * t.x) / (t.z * t.z), 0.0f, focal_y / t.z, -(focal_y * t.y) / (t.z * t.z), 0, 0, 0); glm::mat3 W = glm::mat3( viewmatrix[0], viewmatrix[4], viewmatrix[8], viewmatrix[1], viewmatrix[5], viewmatrix[9], viewmatrix[2], viewmatrix[6], viewmatrix[10]); glm::mat3 T = W * J; //Vrk:3d协方差矩阵 glm::mat3 Vrk = glm::mat3( cov3D[0], cov3D[1], cov3D[2], cov3D[1], cov3D[3], cov3D[4], cov3D[2], cov3D[4], cov3D[5]); //cov: (W^T*J)^T * Vrk^T * (W^T*J) == (J*W)* Vrk^T * (W^T*J^T) glm::mat3 cov = glm::transpose(T) * glm::transpose(Vrk) * T; // Apply low-pass filter: every Gaussian should be at least // one pixel wide/high. Discard 3rd row and column.//确保协方差矩阵正定,数值稳定性考虑 cov[0][0] += 0.3f; cov[1][1] += 0.3f; return { float(cov[0][0]), float(cov[0][1]), float(cov[1][1]) }; } // 在世界坐标系中,将高斯的旋转和缩放属性转换为3d协方差矩阵,同时注意四元数归一化。 // scale:缩放,mod:缩放的模长,rot:四元数旋转 // cov3D:返回的协方差矩阵,RS*(RS)T __device__ void computeCov3D(const glm::vec3 scale, float mod, const glm::vec4 rot, float* cov3D) { // Create scaling matrix glm::mat3 S = glm::mat3(1.0f); S[0][0] = mod * scale.x; S[1][1] = mod * scale.y; S[2][2] = mod * scale.z; // Normalize quaternion to get valid rotation//从四元数计算旋转矩阵 glm::vec4 q = rot;// / glm::length(rot); float r = q.x; float x = q.y; float y = q.z; float z = q.w; // Compute rotation matrix from quaternion glm::mat3 R = glm::mat3( 1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y), 2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x), 2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y) ); glm::mat3 M = S * R; // Compute 3D world covariance matrix Sigma glm::mat3 Sigma = glm::transpose(M) * M; // Covariance is symmetric, only store upper right cov3D[0] = Sigma[0][0]; cov3D[1] = Sigma[0][1]; cov3D[2] = Sigma[0][2]; cov3D[3] = Sigma[1][1]; cov3D[4] = Sigma[1][2]; cov3D[5] = Sigma[2][2]; } // 计算每一个高斯球投影出来的圆半径及覆盖的范围,一个线程处理一个高斯球 template<int C> __global__ void preprocessCUDA(int P, int D, int M, // P:高斯球的数量,D:,M: const float* orig_points, const glm::vec3* scales, const float scale_modifier, const glm::vec4* rotations, const float* opacities, const float* shs, bool* clamped, const float* cov3D_precomp, const float* colors_precomp, const float* viewmatrix, const float* projmatrix, const glm::vec3* cam_pos, const int W, int H, const float tan_fovx, float tan_fovy, const float focal_x, float focal_y, int* radii, float2* points_xy_image, float* depths, float* cov3Ds, float* rgb, float4* conic_opacity, const dim3 grid, //block块的范围 uint32_t* tiles_touched, // bool prefiltered) { auto idx = cg::this_grid().thread_rank(); // 线程在组内的标号,区间为[0,num_threads) if (idx >= P) // 如果编号大于高斯球的数量,则返回,也就是说一个线程对应一个高斯球处理 return; // Initialize radius and touched tiles to 0. If this isn't changed, // this Gaussian will not be processed further.//半径和tile数 radii[idx] = 0; tiles_touched[idx] = 0; // Perform near culling, quit if outside执行近剔除,如果超出范围则退出.idx:0-255,thread的id号; float3 p_view; if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view)) return; // Transform point by projecting //p_orig:空间3d点,projmatrix:投影矩阵c2w float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] }; float4 p_hom = transformPoint4x4(p_orig, projmatrix); float p_w = 1.0f / (p_hom.w + 0.0000001f); //投影到相机ndc坐标系后的高斯球中心坐标 float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; // If 3D covariance matrix is precomputed, use it, otherwise compute from scaling and rotation parameters. const float* cov3D; if (cov3D_precomp != nullptr) { cov3D = cov3D_precomp + idx * 6; //第idx个高斯球的3d协方差 } else { //用缩放和旋转计算三个轴长度(3d协方差矩阵) computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6); cov3D = cov3Ds + idx * 6; } // Compute 2D screen-space covariance matrix//计算椭球投影成椭圆的样子,用对称2D矩阵记录(abbc) float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D, viewmatrix); // Invert covariance (EWA algorithm)//数值求2*2方阵[cov.x,cov.y,cov.y,cov.z]abbc的特征值 float det = (cov.x * cov.z - cov.y * cov.y); //2*2矩阵的模,ac - b^2 if (det == 0.0f) return; float det_inv = 1.f / det; float3 conic = { cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv }; //2d协方差矩阵的逆 det*(c,-b,a) // Compute extent in screen space (by finding eigenvalues of // 2D covariance matrix). Use extent to compute a bounding rectangle // of screen-space tiles that this Gaussian overlaps with. Quit if // rectangle covers 0 tiles. float mid = 0.5f * (cov.x + cov.z); float lambda1 = mid + sqrt(max(0.1f, mid * mid - det)); float lambda2 = mid - sqrt(max(0.1f, mid * mid - det)); float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2))); //最长轴的3倍,覆盖99%的区域 float2 point_image = { ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H) }; //高斯球中心的像素坐标 uint2 rect_min, rect_max; //计算圆覆盖了哪些tile,返回rect_min, rect_max getRect(point_image, my_radius, rect_min, rect_max, grid); if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0) return; // If colors have been precomputed, use them, otherwise convert // spherical harmonics coefficients to RGB color.//从球谐函数系数求投影点颜色 if (colors_precomp == nullptr) { glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped); rgb[idx * C + 0] = result.x; //C为模版参数,表示通道数,这里是3 rgb[idx * C + 1] = result.y; rgb[idx * C + 2] = result.z; } // Store some useful helper data for the next steps. depths[idx] = p_view.z; //第idx个高斯球的中心点的深度值 radii[idx] = my_radius; //高斯椭圆的长轴半径,超过这个范围高斯分布的值很小 points_xy_image[idx] = point_image; //第idx个高斯椭圆的像素坐标 // Inverse 2D covariance and opacity neatly pack into one float4 conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] }; //2d协方差矩阵的逆和不透明度 tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x); //存储所覆盖的tile的数量 } // Main rasterization method. Collaboratively works on one tile per // block, each thread treats one pixel. Alternates between fetching // and rasterizing data.//渲染,一个block一个tile,一个thread一个像素 template <uint32_t CHANNELS> __global__ void __launch_bounds__(BLOCK_X * BLOCK_Y) renderCUDA( const uint2* __restrict__ ranges, const uint32_t* __restrict__ point_list, int W, int H, const float2* __restrict__ points_xy_image, const float* __restrict__ features, const float4* __restrict__ conic_opacity, float* __restrict__ final_T, uint32_t* __restrict__ n_contrib, const float* __restrict__ bg_color, float* __restrict__ out_color) { // Identify current tile and associated min/max pixel range. auto block = cg::this_thread_block(); //创建线程块block uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X; //水平块数 //group_index():当前block在grid中的三维索引,相当于blockIdx uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y }; //最小像素坐标 uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) }; //最大像素坐标 //thread_index():当前thread在block中的三维索引,相当于threadIdx uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y }; //当前线程正在处理的像素坐标 uint32_t pix_id = W * pix.y + pix.x; //像素id,二维展平成一维后 float2 pixf = { (float)pix.x, (float)pix.y }; //浮点型像素坐标 // Check if this thread is associated with a valid pixel or outside.//在图像内的像素点 bool inside = pix.x < W&& pix.y < H; // Done threads can help with fetching, but don't rasterize //渲染结束标志,初始值取inside的相反值 bool done = !inside; // Load start/end range of IDs to process in bit sorted list. // 当前tile在point_list_keys中的范围,包括x下界和y上届 uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x]; // rounds表示每个线程需要处理的高斯球的数量 const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE); // toDo=range.y-range.x表示当前tile(block)对应的需要计算的高斯球的数量 int toDo = range.y - range.x; // Allocate storage for batches of collectively fetched data. //共享显存,被同一个block的thread共享的存储 __shared__ int collected_id[BLOCK_SIZE]; // 记录各线程处理的高斯球的编号 __shared__ float2 collected_xy[BLOCK_SIZE]; // 记录各线程处理的高斯球中心在2d平面的投影坐标 __shared__ float4 collected_conic_opacity[BLOCK_SIZE]; // 记录各线程处理的高斯球的2d协方差矩阵的逆和不透明度 // Initialize helper variables float T = 1.0f; // 透射率 uint32_t contributor = 0; // 计算经过了多少高斯 uint32_t last_contributor = 0; // 存储最终经过的高斯球数量 float C[CHANNELS] = { 0 }; // 最后渲染的颜色 // Iterate over batches until all done or range is complete for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE) { // 如果一个block里面全部thread完成,则退出循环 int num_done = __syncthreads_count(done); if (num_done == BLOCK_SIZE) break; // 共同从全局显存读取每一个高斯数据到共享显存,已经结束的thread去取 int progress = i * BLOCK_SIZE + block.thread_rank(); //thread_rank:当前线程在组内的标号,区间为[0, num_threads) if (range.x + progress < range.y) //如果当前线程有效,即处理的高斯球不越界 { //point_list表示与已排序的point_list_keys对应的高斯球编号,coll_id为当前线程处理的高斯球编号 int coll_id = point_list[range.x + progress]; collected_id[block.thread_rank()] = coll_id; //当前线程处理的高斯球id collected_xy[block.thread_rank()] = points_xy_image[coll_id]; //points_xy_image:高斯椭圆中心在像素平面的坐标 collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id]; //2d协方差矩阵的逆和不透明度 } block.sync(); // 每个线程遍历当前batch一堆高斯球 for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++) { // Keep track of current position in range contributor++; // Resample using conic matrix (cf. "Surface // Splatting" by Zwicker et al., 2001)//圆锥形矩阵 float2 xy = collected_xy[j]; //2d高斯椭圆中心像素坐标 float2 d = { xy.x - pixf.x, xy.y - pixf.y }; //像素点与2d高斯中心点的距离 float4 con_o = collected_conic_opacity[j]; //2d协方差矩阵的逆和不透明度 float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y; // 计算高斯分布概率的指数部分 if (power > 0.0f) continue; // Eq. (2) from 3D Gaussian splatting paper.// 通过与高斯不透明度相乘获得alpha值及其从平均值开始的指数衰减。避免数值不稳定性 // Obtain alpha by multiplying with Gaussian opacity // and its exponential falloff from mean. // Avoid numerical instabilities (see paper appendix). float alpha = min(0.99f, con_o.w * exp(power)); //通过power值和不透明度计算alpha值 if (alpha < 1.0f / 255.0f) continue; float test_T = T * (1 - alpha); //由alpha计算透射率 if (test_T < 0.0001f) { done = true; continue; } // Eq. (3) from 3D Gaussian splatting paper.//颜色alpha合成 for (int ch = 0; ch < CHANNELS; ch++) C[ch] += features[collected_id[j] * CHANNELS + ch] * alpha * T; T = test_T; // Keep track of last range entry to update this // pixel. last_contributor = contributor; } } // 所有处理有效像素的thread都会将其最终渲染数据 // 写入帧缓冲区和辅助缓冲区 if (inside) { final_T[pix_id] = T; // 输出最终pix_id像素点的T值 n_contrib[pix_id] = last_contributor; // 输出贡献的高斯球数 for (int ch = 0; ch < CHANNELS; ch++) out_color[ch * H * W + pix_id] = C[ch] + T * bg_color[ch]; // } }
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。