A better approach is to go through shared memory: load an entire warpSize x warpSize tile into shared memory with coalesced reads, transpose it by swapping the indices when reading it back out of shared memory, and write the result with coalesced stores as well. Each tile can be read, transposed, and written back either by a single warp or by a whole thread block. For small and medium matrices the latter is preferable: having one thread block transpose one warpSize x warpSize tile creates more threads, raises hardware utilization, and performs better.
Reference CUDA implementation:
```cpp
#include <stdio.h>
#include <iostream>
using namespace std;

#include <cuda_runtime.h>
#include "utils/cuda_mem_helper.h"
#include "utils/cuda_stream_helper.h"

#define THREAD_PER_WARP 32
#define WARP_PER_BLOCK 8
#define BLOCK_RD_NUM (THREAD_PER_WARP / WARP_PER_BLOCK)

/*
 * Use one warp to transpose one warpSize x warpSize tile.
 */
template <typename T>
__global__ void transpose_2d_warp(const T* __restrict__ in, T* __restrict__ out,
                                  const int row, const int col,
                                  int warp_row, int warp_col, int total_warp) {
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;  // global thread id
    const int warp_bid = threadIdx.x / THREAD_PER_WARP;     // warp id in thread block
    const int warp_gid = tid / THREAD_PER_WARP;             // warp id in grid

    const int lane = threadIdx.x % THREAD_PER_WARP;         // thread id in warp
    const int warp_id_c = warp_gid % warp_col;              // tile column index
    const int warp_id_r = warp_gid / warp_col;              // tile row index

    // the grid is rounded up to whole blocks, so drop the extra warps
    if (warp_gid >= total_warp) {
        return;
    }

    // pad the last dimension to avoid shared-memory bank conflicts
    __shared__ T block_data[WARP_PER_BLOCK][THREAD_PER_WARP][THREAD_PER_WARP + 1];

    const int row_bias = warp_id_r * THREAD_PER_WARP;
    const int col_bias = warp_id_c * THREAD_PER_WARP;

    // read the tile from the input (coalesced along the row)
    for (int i = 0; i < THREAD_PER_WARP; i++) {
        int src_r = row_bias + i;
        int src_c = col_bias + lane;
        if ((src_r < row) && (src_c < col)) {
            block_data[warp_bid][i][lane] = in[src_r * col + src_c];
        }
    }
    // only this warp touches block_data[warp_bid], so a warp-level sync is enough
    __syncwarp();

    // write the tile to the output (coalesced along the transposed row)
    for (int i = 0; i < THREAD_PER_WARP; i++) {
        int tgt_c = col_bias + i;
        int tgt_r = row_bias + lane;
        if ((tgt_r < row) && (tgt_c < col)) {
            out[tgt_c * row + tgt_r] = block_data[warp_bid][lane][i];
        }
    }
}

/*
 * Use one thread block to transpose one warpSize x warpSize tile.
 * Assumes row and col are multiples of warpSize.
 */
template <typename T>
__global__ void transpose_2d_block(const T* __restrict__ in, T* __restrict__ out,
                                   const int row, const int col) {
    int block_id = blockIdx.x;
    int block_num_col = col / warpSize;

    int block_id_row = block_id / block_num_col;
    int block_id_col = block_id % block_num_col;

    int row_offset = block_id_row * warpSize;
    int col_offset = block_id_col * warpSize;

    int row_id = threadIdx.x / warpSize;  // which row group this warp handles
    int col_id = threadIdx.x % warpSize;  // lane within the warp

    // pad the last dimension to avoid shared-memory bank conflicts
    __shared__ T block_data[THREAD_PER_WARP][THREAD_PER_WARP + 1];

    #pragma unroll
    for (int i = 0; i < BLOCK_RD_NUM; i++) {
        int row_pos = i * WARP_PER_BLOCK + row_id;
        int in_addr = (row_offset + row_pos) * col + col_offset + col_id;
        block_data[row_pos][col_id] = in[in_addr];
    }
    __syncthreads();

    #pragma unroll
    for (int i = 0; i < BLOCK_RD_NUM; i++) {
        int row_pos = i * WARP_PER_BLOCK + row_id;
        int out_addr = (col_offset + row_pos) * row + row_offset + col_id;
        out[out_addr] = block_data[col_id][row_pos];
    }
}

template <typename T>
void Transpose2DWarp(const T* in, T* out, const int row, const int col, cudaStream_t& stream) {
    const int warp_row = (row + THREAD_PER_WARP - 1) / THREAD_PER_WARP;
    const int warp_col = (col + THREAD_PER_WARP - 1) / THREAD_PER_WARP;
    const int total_warp = warp_row * warp_col;

    const int block_size = THREAD_PER_WARP * WARP_PER_BLOCK;
    const int grid_size = (total_warp + WARP_PER_BLOCK - 1) / WARP_PER_BLOCK;

    transpose_2d_warp<<<grid_size, block_size, 0, stream>>>(in, out, row, col, warp_row, warp_col, total_warp);
}

template <typename T>
void Transpose2DBlock(const T* in, T* out, const int row, const int col, cudaStream_t& stream) {
    const int block_row = (row + THREAD_PER_WARP - 1) / THREAD_PER_WARP;
    const int block_col = (col + THREAD_PER_WARP - 1) / THREAD_PER_WARP;
    const int total_block = block_row * block_col;

    const int block_size = THREAD_PER_WARP * WARP_PER_BLOCK;
    const int grid_size = total_block;

    transpose_2d_block<<<grid_size, block_size, 0, stream>>>(in, out, row, col);
}

int main(void) {
    int row = 256;
    int col = 256;

    CudaMemoryHelper<float> data_in({row, col});
    CudaMemoryHelper<float> data_out({row, col});

    data_in.StepInitHostMem(1.0f);
    data_in.CopyMemH2D();
    data_in.PrintElems(4, 512, col);

    CudaStreamHelper stream_helper;
    auto& stream = stream_helper.stream;

    int eval_num = 20;

    for (int i = 0; i < eval_num; i++) {
        Transpose2DWarp(data_in.d_mem, data_out.d_mem, row, col, stream);
    }
    stream_helper.Sync();

    for (int i = 0; i < eval_num; i++) {
        Transpose2DBlock(data_in.d_mem, data_out.d_mem, row, col, stream);
    }
    stream_helper.Sync();

    data_out.CopyMemD2H();

    printf("results0:\n");
    // print part of the output for a quick check
    data_out.PrintElems(1, 1024, row);

    return 0;
}
```
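The main() above only prints a slice of the output; to actually confirm the transpose, the result can be compared against a trivial CPU reference. Below is a minimal sketch of such a check. It works on raw host pointers, so it does not assume anything about the members of CudaMemoryHelper; h_in and h_out are illustrative names for the host copies of the input and output buffers after CopyMemD2H().

```cpp
// Minimal CPU reference check (sketch): element (r, c) of the row x col
// input must appear at (c, r) in the col x row output.
// h_in / h_out are illustrative names for host copies of the buffers.
template <typename T>
bool VerifyTranspose(const T* h_in, const T* h_out, int row, int col) {
    for (int r = 0; r < row; r++) {
        for (int c = 0; c < col; c++) {
            if (h_out[c * row + r] != h_in[r * col + c]) {
                printf("mismatch at (%d, %d)\n", r, c);
                return false;
            }
        }
    }
    return true;
}
```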
See also: "CUDA学习(二)矩阵转置及优化(合并访问、共享内存、bank conflict)" on Zhihu.
What about device-side (mobile) GPUs that have no shared memory?
One option is to have each thread transpose a 4x4 tile: read four consecutive rows, each with a packed (pack4) load, transpose the tile in registers, and write it back, again with pack4 stores.
Reference OpenCL implementation:
```cpp
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include <chrono>
#include <algorithm>
#include <cstring>

#include "mem_helper.h"

#define CL_HPP_TARGET_OPENCL_VERSION 300
#include <CL/opencl.hpp>

using TEST_DTYPE = half;

using namespace std;

// must match the tile sizes hard-coded in the kernel source below
#define TH_ROW_SIZE 4
#define TH_COL_SIZE 4

// should be used only for small channel sizes (<= 768);
// assumes height and width are multiples of 4
std::string kernel_source{R"(
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define TH_ROW_SIZE 4
#define TH_COL_SIZE 4
#define DTYPE half
#define DTYPE_PACK4 half4
// each work-item transposes one TH_ROW_SIZE x TH_COL_SIZE tile in registers
kernel void transpose_kernel(__global const DTYPE* restrict d_in, __global DTYPE* restrict d_out,
                             int height, int width) {
    int gid = get_global_id(0);
    int batch_id = get_global_id(1);
    int batch_addr = batch_id * height * width;
    int col_th_num = (width + TH_COL_SIZE - 1) / TH_COL_SIZE;
    int row_id = gid / col_th_num;
    int col_id = gid % col_th_num;
    int row_pos = row_id * TH_ROW_SIZE;
    int col_pos = col_id * TH_COL_SIZE;
    // read a 4 x 4 tile, one packed (half4) load per row
    DTYPE_PACK4 in_datas[TH_ROW_SIZE];
    #pragma unroll
    for (int rcnt = 0; rcnt < TH_ROW_SIZE; rcnt++) {
        int in_addr = batch_addr + (row_pos + rcnt) * width + col_pos;
        const __global DTYPE_PACK4* d_in_p4 = (const __global DTYPE_PACK4*)&d_in[in_addr];
        in_datas[rcnt] = d_in_p4[0];
    }
    // transpose the tile in registers
    DTYPE_PACK4 out_datas[TH_COL_SIZE];
    out_datas[0] = (DTYPE_PACK4)(in_datas[0].s0, in_datas[1].s0, in_datas[2].s0, in_datas[3].s0);
    out_datas[1] = (DTYPE_PACK4)(in_datas[0].s1, in_datas[1].s1, in_datas[2].s1, in_datas[3].s1);
    out_datas[2] = (DTYPE_PACK4)(in_datas[0].s2, in_datas[1].s2, in_datas[2].s2, in_datas[3].s2);
    out_datas[3] = (DTYPE_PACK4)(in_datas[0].s3, in_datas[1].s3, in_datas[2].s3, in_datas[3].s3);
    // write the tile back, one packed (half4) store per transposed row
    #pragma unroll
    for (int ccnt = 0; ccnt < TH_COL_SIZE; ccnt++) {
        int out_addr = batch_addr + (col_pos + ccnt) * height + row_pos;
        __global DTYPE_PACK4* d_out_p4 = (__global DTYPE_PACK4*)&d_out[out_addr];
        d_out_p4[0] = out_datas[ccnt];
    }
}
)"};

int main() {
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    std::cout << "get platform num:" << platforms.size() << std::endl;

    cl::Platform plat;
    for (auto& p : platforms) {
        std::string platver = p.getInfo<CL_PLATFORM_VERSION>();
        if (platver.find("OpenCL 2.") != std::string::npos || platver.find("OpenCL 3.") != std::string::npos) {
            // Note: an OpenCL 3.x platform may not support all required features!
            plat = p;
        }
    }
    if (plat() == 0) {
        std::cout << "No OpenCL 2.0 or newer platform found.\n";
        return -1;
    }

    std::cout << "platform name:" << plat.getInfo<CL_PLATFORM_NAME>() << std::endl;

    cl::Platform newP = cl::Platform::setDefault(plat);
    if (newP != plat) {
        std::cout << "Error setting default platform.\n";
        return -1;
    }

    // get the GPU devices of the default platform
    std::vector<cl::Device> all_devices;
    newP.getDevices(CL_DEVICE_TYPE_GPU, &all_devices);  // or CL_DEVICE_TYPE_ALL
    std::cout << "get all_devices num:" << all_devices.size() << std::endl;

    if (all_devices.size() == 0) {
        std::cout << " No devices found. Check OpenCL installation!\n";
        exit(1);
    }

    // cl::Device default_device = cl::Device::getDefault();
    cl::Device default_device = all_devices[0];
    std::cout << "device name: " << default_device.getInfo<CL_DEVICE_NAME>() << std::endl;

    // a context is the "runtime link" to the device and platform
    cl::Context context({default_device});
    cl::CommandQueue queue(context, default_device);

    int batch = 4;
    int height = 1024;
    int width = 1024;
    vector<int> shape1 = {batch, height, width};

    MemoryHelper<TEST_DTYPE> mem_in(shape1);
    MemoryHelper<TEST_DTYPE> mem_out(shape1);
    mem_in.StepInit(1.0f);

    // CL_MEM_WRITE_ONLY / CL_MEM_READ_ONLY / CL_MEM_READ_WRITE
    cl::Buffer d_in = cl::Buffer(context, CL_MEM_READ_WRITE, mem_in.bytes);
    cl::Buffer d_out = cl::Buffer(context, CL_MEM_READ_WRITE, mem_out.bytes);

    memset(mem_out.Mem(), 0, mem_out.bytes);

    // push write commands to the queue
    queue.enqueueWriteBuffer(d_in, CL_TRUE, 0, mem_in.bytes, mem_in.Mem());

    std::vector<std::string> programStrings;
    programStrings.push_back(kernel_source);
    cl::Program program(context, programStrings);

    if (program.build({default_device}, "-cl-std=CL3.0") != CL_SUCCESS) {
        std::cout << "Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device) << std::endl;
        exit(1);
    }

    auto cl_kernel = cl::KernelFunctor<cl::Buffer, cl::Buffer, int, int>(program, "transpose_kernel");

    int local_thread_num = 128 * 4;

    int row_th_num = (height + TH_ROW_SIZE - 1) / TH_ROW_SIZE;
    int col_th_num = (width + TH_COL_SIZE - 1) / TH_COL_SIZE;
    int total_thread_num = row_th_num * col_th_num;

    local_thread_num = std::min(local_thread_num, total_thread_num);

    cout << "total_thread_num: " << total_thread_num << endl;
    cout << "local_thread_num: " << local_thread_num << endl;

    // EnqueueArgs overloads: (queue, global), (queue, global, local), (queue, offset, global, local)
    cl::EnqueueArgs kernel_args(queue, cl::NDRange(total_thread_num, batch), cl::NDRange(local_thread_num, 1));

    int warmup_num = 50;
    int eval_num = 50;
    for (int i = 0; i < warmup_num; i++) {
        cl_kernel(kernel_args, d_in, d_out, height, width);
    }
    queue.finish();

    auto t1 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < eval_num; i++) {
        cl_kernel(kernel_args, d_in, d_out, height, width);
    }
    queue.finish();
    auto t2 = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
    float mean_time_ms = duration / 1000.0f / eval_num;

    printf("exec time us: %lld %d\n", (long long)duration, eval_num);
    printf("exec time: %f ms\n", mean_time_ms);
    printf("batch, height, width: %d %d %d\n", batch, height, width);

    queue.enqueueReadBuffer(d_out, CL_TRUE, 0, mem_out.bytes, mem_out.Mem());
    mem_in.PrintElems(1, 512, width);
    mem_out.PrintElems(1, 512, height);
    return 0;
}
```
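A transpose kernel is purely memory-bound, so the mean time above is easiest to interpret when converted into effective bandwidth. The lines below are a sketch of that conversion and would go right after the timing printfs in main(); the estimate assumes the kernel reads and writes every element exactly once per pass.

```cpp
// Effective-bandwidth estimate (sketch): one read plus one write of each
// half element per pass, using batch / height / width / mean_time_ms
// from the benchmark above.
double bytes_moved = 2.0 * batch * height * width * sizeof(TEST_DTYPE);
double gb_per_s = bytes_moved / (mean_time_ms * 1e-3) / 1e9;
printf("effective bandwidth: %.2f GB/s\n", gb_per_s);
```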