Some of the Python-side array operations are completed in the ndarray.py file.

For the C++ backend, the core idea of the strided operations is, conceptually:

out[cnt++] = in[strides[0]*i + strides[1]*j + strides[2]*k];

(this is written for a three-dimensional array; the code below generalizes it to any number of dimensions via a while loop)

Compact then reduces to a single call into a shared traversal helper:

_strided_index_setter(&a, out, shape, strides, offset, INDEX_IN);

// For each dimension, from last to first, the dimension's stride, the offset, and the
// loop counters determine which element of a is copied this round;
// in effect, strides and offset turn a multidimensional position into a flat index.
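The mode enum is not shown in this excerpt; a minimal definition consistent with the switch statement below would be:

enum strided_index_mode { INDEX_OUT, INDEX_IN, SET_VAL };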
void _strided_index_setter(const AlignedArray* a, AlignedArray* out, std::vector<uint32_t> shape,
                           std::vector<uint32_t> strides, size_t offset, strided_index_mode mode,
                           scalar_t val = -1) {
  int depth = shape.size();
  std::vector<uint32_t> loop(depth, 0);  // multidimensional loop counters
  int cnt = 0;                           // flat index into the compact array
  while (true) {
    // compute the strided (non-compact) index for the current counters
    int index = offset;
    for (int i = 0; i < depth; i++) {
      index += strides[i] * loop[i];
    }
    switch (mode) {
      case INDEX_OUT: out->ptr[index] = a->ptr[cnt++]; break;  // EwiseSetitem
      case INDEX_IN:  out->ptr[cnt++] = a->ptr[index]; break;  // Compact
      case SET_VAL:   out->ptr[index] = val; break;            // ScalarSetitem
    }
    // increment the innermost counter
    loop[depth - 1]++;
    // carry: when a counter reaches its extent, reset it and bump the next one out
    int idx = depth - 1;
    while (loop[idx] == shape[idx]) {
      if (idx == 0) {
        return;  // carried past the outermost dimension: traversal complete
      }
      loop[idx--] = 0;
      loop[idx]++;
    }
  }
}
EwiseSetitem and ScalarSetitem reuse the same traversal with the other two modes:

_strided_index_setter(&a, out, shape, strides, offset, INDEX_OUT);
_strided_index_setter(nullptr, out, shape, strides, offset, SET_VAL, val);
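For context, here is how those one-line calls fit into their wrappers; this is a sketch assuming the standard needle CPU skeleton, with signatures mirroring the CUDA versions further down:

void Compact(const AlignedArray& a, AlignedArray* out, std::vector<uint32_t> shape,
             std::vector<uint32_t> strides, size_t offset) {
  _strided_index_setter(&a, out, shape, strides, offset, INDEX_IN);
}

void EwiseSetitem(const AlignedArray& a, AlignedArray* out, std::vector<uint32_t> shape,
                  std::vector<uint32_t> strides, size_t offset) {
  _strided_index_setter(&a, out, shape, strides, offset, INDEX_OUT);
}

void ScalarSetitem(const size_t size, scalar_t val, AlignedArray* out, std::vector<uint32_t> shape,
                   std::vector<uint32_t> strides, size_t offset) {
  _strided_index_setter(nullptr, out, shape, strides, offset, SET_VAL, val);
}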
void ReduceMax(const AlignedArray& a, AlignedArray* out, size_t reduce_size) {
  // each output element is the max over one contiguous group of reduce_size inputs
  for (size_t i = 0; i < out->size; i++) {
    scalar_t max = a.ptr[i * reduce_size];
    for (size_t j = 1; j < reduce_size; j++) {
      max = std::max(max, a.ptr[i * reduce_size + j]);
    }
    out->ptr[i] = max;
  }
}
void ReduceSum(const AlignedArray& a, AlignedArray* out, size_t reduce_size) {
  for (size_t i = 0; i < out->size; i++) {
    scalar_t sum = 0;
    for (size_t j = 0; j < reduce_size; j++) {
      sum += a.ptr[i * reduce_size + j];
    }
    out->ptr[i] = sum;
  }
}
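Both reductions assume the reduced axis has already been moved to the end and the array compacted, so that output element i aggregates the contiguous slice a[i*reduce_size .. (i+1)*reduce_size). For example, reducing a 2×3 array over its last axis gives out->size = 2 with reduce_size = 3.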
Matmul:

void Matmul(const AlignedArray& a, const AlignedArray& b, AlignedArray* out, uint32_t m, uint32_t n,
            uint32_t p) {
  // naive triple loop: out (m x p) = a (m x n) * b (n x p)
  for (uint32_t i = 0; i < m; i++) {
    for (uint32_t j = 0; j < p; j++) {
      out->ptr[i * p + j] = 0;
      for (uint32_t k = 0; k < n; k++) {
        out->ptr[i * p + j] += a.ptr[i * n + k] * b.ptr[k * p + j];
      }
    }
  }
}
MatmulTiled and AlignedDot: the former determines, from the tile size, which elements (which tiles) of a, b, and out participate in the current step; the latter performs the computation on the tiles passed in (an a tile times a b tile, accumulated into an out tile).

void MatmulTiled(const AlignedArray& a, const AlignedArray& b, AlignedArray* out, uint32_t m,
                 uint32_t n, uint32_t p) {
  // zero the output, then accumulate tile products
  for (uint32_t i = 0; i < m * p; i++) out->ptr[i] = 0;
  for (uint32_t i = 0; i < m / TILE; i++) {
    for (uint32_t j = 0; j < p / TILE; j++) {
      for (uint32_t k = 0; k < n / TILE; k++) {
        // tile (i, k) of a, tile (k, j) of b, tile (i, j) of out;
        // each TILE x TILE tile is stored contiguously
        AlignedDot(&a.ptr[i * n * TILE + k * TILE * TILE],
                   &b.ptr[k * p * TILE + j * TILE * TILE],
                   &out->ptr[i * p * TILE + j * TILE * TILE]);
      }
    }
  }
}
inline void AlignedDot(const float* __restrict__ a,
                       const float* __restrict__ b,
                       float* __restrict__ out) {
  // __restrict__ plus these alignment hints promise the compiler that the three
  // tiles neither alias nor straddle alignment boundaries, enabling vectorization
  a = (const float*)__builtin_assume_aligned(a, TILE * ELEM_SIZE);
  b = (const float*)__builtin_assume_aligned(b, TILE * ELEM_SIZE);
  out = (float*)__builtin_assume_aligned(out, TILE * ELEM_SIZE);
  for (int i = 0; i < TILE; i++) {
    for (int j = 0; j < TILE; j++) {
      for (int k = 0; k < TILE; k++) {
        out[i * TILE + j] += a[i * TILE + k] * b[k * TILE + j];
      }
    }
  }
}
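The tiled layout assumed here stores each TILE×TILE tile contiguously, with tiles ordered row-major. As an illustration, a hypothetical packing helper (ToTiled is my name, not part of the assignment code) that converts an ordinary row-major m×n matrix into this layout could look like:

void ToTiled(const float* src, float* dst, uint32_t m, uint32_t n) {
  // assumes m and n are multiples of TILE
  for (uint32_t i = 0; i < m; i++) {
    for (uint32_t j = 0; j < n; j++) {
      uint32_t tile_idx = (i / TILE) * (n / TILE) + (j / TILE);  // which tile
      uint32_t in_tile = (i % TILE) * TILE + (j % TILE);         // position inside the tile
      dst[tile_idx * TILE * TILE + in_tile] = src[i * n + j];
    }
  }
}

Note that tile (ti, tj) lands at offset (ti * (n / TILE) + tj) * TILE * TILE = ti * n * TILE + tj * TILE * TILE, which matches the pointer arithmetic in MatmulTiled above.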
__device__ size_t index_transform(size_t index, CudaVec shape, CudaVec strides, size_t offset) {
  size_t idxs[MAX_VEC_SIZE];
  size_t cur_size, pre_size = 1;
  // Map the linear index of the compact array back to a multidimensional index,
  // i.e. work out which position `index` corresponds to along each dimension.
  // Walk shape from back to front: pre_size is how many elements the already-
  // processed (inner) dimensions span, and cur_size extends that span by the
  // current dimension. index % cur_size isolates the position within the current
  // dimension's span, and dividing by pre_size gives the coordinate along it.
  for (int i = shape.size - 1; i >= 0; i--) {
    cur_size = pre_size * shape.data[i];
    idxs[i] = index % cur_size / pre_size;
    pre_size = cur_size;
  }
  // Use the multidimensional index just computed to get the linear index into
  // the original (non-compact, strided) array.
  size_t comp_idx = offset;
  for (int i = 0; i < shape.size; i++)
    comp_idx += idxs[i] * strides.data[i];
  return comp_idx;
}
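To make the arithmetic concrete, here is a host-side replica with a worked check; index_transform_host and the test values are illustrations of mine, not part of the assignment:

#include <cassert>
#include <cstddef>
#include <vector>

size_t index_transform_host(size_t index, const std::vector<size_t>& shape,
                            const std::vector<size_t>& strides, size_t offset) {
  std::vector<size_t> idxs(shape.size());
  size_t cur_size, pre_size = 1;
  for (int i = (int)shape.size() - 1; i >= 0; i--) {
    cur_size = pre_size * shape[i];
    idxs[i] = index % cur_size / pre_size;
    pre_size = cur_size;
  }
  size_t comp_idx = offset;
  for (size_t i = 0; i < shape.size(); i++) comp_idx += idxs[i] * strides[i];
  return comp_idx;
}

int main() {
  // A 3x2 transposed view of a compact 2x3 array: shape (3,2), strides (1,3).
  // Linear output index 4 -> multi-index (2,0) -> source index 0 + 2*1 + 0*3 = 2.
  assert(index_transform_host(4, {3, 2}, {1, 3}, 0) == 2);
  return 0;
}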
__global__ void CompactKernel(const scalar_t* a, scalar_t* out, size_t size, CudaVec shape,
                              CudaVec strides, size_t offset) {
  size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
  /// BEGIN YOUR SOLUTION
  if (gid < size)
    out[gid] = a[index_transform(gid, shape, strides, offset)];
  /// END YOUR SOLUTION
}

void Compact(const CudaArray& a, CudaArray* out, std::vector<uint32_t> shape,
             std::vector<uint32_t> strides, size_t offset) {
  CudaDims dim = CudaOneDim(out->size);
  CompactKernel<<<dim.grid, dim.block>>>(a.ptr, out->ptr, out->size, VecToCuda(shape),
                                         VecToCuda(strides), offset);
}
__global__ void EwiseSetitemKernel(const scalar_t* a, scalar_t* out, size_t size, CudaVec shape,
                                   CudaVec strides, size_t offset) {
  size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid < size)
    out[index_transform(gid, shape, strides, offset)] = a[gid];
}

void EwiseSetitem(const CudaArray& a, CudaArray* out, std::vector<uint32_t> shape,
                  std::vector<uint32_t> strides, size_t offset) {
  /// BEGIN YOUR SOLUTION
  CudaDims dim = CudaOneDim(out->size);
  // note: the kernel's size is a.size, the element count of the compact source
  EwiseSetitemKernel<<<dim.grid, dim.block>>>(a.ptr, out->ptr, a.size, VecToCuda(shape),
                                              VecToCuda(strides), offset);
  /// END YOUR SOLUTION
}
__global__ void ScalarSetitemKernel(size_t size, scalar_t val, scalar_t* out, CudaVec shape,
                                    CudaVec strides, size_t offset) {
  size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid < size)
    out[index_transform(gid, shape, strides, offset)] = val;
}

void ScalarSetitem(size_t size, scalar_t val, CudaArray* out, std::vector<uint32_t> shape,
                   std::vector<uint32_t> strides, size_t offset) {
  /// BEGIN YOUR SOLUTION
  CudaDims dim = CudaOneDim(out->size);
  ScalarSetitemKernel<<<dim.grid, dim.block>>>(size, val, out->ptr, VecToCuda(shape),
                                               VecToCuda(strides), offset);
  /// END YOUR SOLUTION
}
__global__ void EwiseMulKernel(const scalar_t* a, const scalar_t* b, scalar_t* out, size_t size) {
  size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid < size) {
    out[gid] = a[gid] * b[gid];
  }
}

void EwiseMul(const CudaArray& a, const CudaArray& b, CudaArray* out) {
  CudaDims dim = CudaOneDim(out->size);
  EwiseMulKernel<<<dim.grid, dim.block>>>(a.ptr, b.ptr, out->ptr, out->size);
}
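The remaining element-wise operators (EwiseDiv, EwiseMaximum, and so on) all follow this same kernel-plus-launcher shape; a convenience macro like the following (my own sketch, not part of the assignment skeleton) could stamp them out:

#define EWISE_BINARY_OP(Name, expr)                                                 \
  __global__ void Name##Kernel(const scalar_t* a, const scalar_t* b, scalar_t* out, \
                               size_t size) {                                       \
    size_t gid = blockIdx.x * blockDim.x + threadIdx.x;                             \
    if (gid < size) out[gid] = (expr);                                              \
  }                                                                                 \
  void Name(const CudaArray& a, const CudaArray& b, CudaArray* out) {               \
    CudaDims dim = CudaOneDim(out->size);                                           \
    Name##Kernel<<<dim.grid, dim.block>>>(a.ptr, b.ptr, out->ptr, out->size);       \
  }

EWISE_BINARY_OP(EwiseDiv, a[gid] / b[gid])
EWISE_BINARY_OP(EwiseMaximum, max(a[gid], b[gid]))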
__global__ void ReduceMaxKernel(const scalar_t* a, scalar_t* out, size_t reduce_size, size_t size) {
  size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid < size) {
    // each thread reduces one contiguous group of reduce_size elements
    size_t offset = gid * reduce_size;
    scalar_t reduce_max = a[offset];
    for (size_t i = 1; i < reduce_size; i++) {
      reduce_max = max(reduce_max, a[offset + i]);
    }
    out[gid] = reduce_max;
  }
}

void ReduceMax(const CudaArray& a, CudaArray* out, size_t reduce_size) {
  CudaDims dim = CudaOneDim(out->size);
  ReduceMaxKernel<<<dim.grid, dim.block>>>(a.ptr, out->ptr, reduce_size, out->size);
}
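A matching ReduceSum follows by direct analogy with ReduceMaxKernel and the CPU ReduceSum above (a sketch, not the skeleton's verbatim code):

__global__ void ReduceSumKernel(const scalar_t* a, scalar_t* out, size_t reduce_size, size_t size) {
  size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid < size) {
    size_t offset = gid * reduce_size;
    scalar_t reduce_sum = 0;
    for (size_t i = 0; i < reduce_size; i++) {
      reduce_sum += a[offset + i];
    }
    out[gid] = reduce_sum;
  }
}

void ReduceSum(const CudaArray& a, CudaArray* out, size_t reduce_size) {
  CudaDims dim = CudaOneDim(out->size);
  ReduceSumKernel<<<dim.grid, dim.block>>>(a.ptr, out->ptr, reduce_size, out->size);
}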
__global__ void MatmulKernel(const scalar_t* a, const scalar_t* b, scalar_t* out, uint32_t M,
                             uint32_t N, uint32_t P) {
  // one thread per output element: i indexes rows of out, j indexes columns
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  size_t j = blockIdx.y * blockDim.y + threadIdx.y;
  if (i < M && j < P) {
    out[i * P + j] = 0;
    for (uint32_t k = 0; k < N; k++) {
      out[i * P + j] += a[i * N + k] * b[k * P + j];
    }
  }
}

void Matmul(const CudaArray& a, const CudaArray& b, CudaArray* out, uint32_t M, uint32_t N,
            uint32_t P) {
  // 2D launch over the output matrix. A block of BASE_THREAD_NUM threads in each
  // dimension would exceed the 1024-threads-per-block limit, so a fixed 16x16
  // tile of threads is used, with enough blocks to cover all of out.
  dim3 block(16, 16, 1);
  dim3 grid((M + block.x - 1) / block.x, (P + block.y - 1) / block.y, 1);
  MatmulKernel<<<grid, block>>>(a.ptr, b.ptr, out->ptr, M, N, P);
}
Finally, the launch helpers used throughout the CUDA code above. First, the CUDA dimensions are defined:
struct CudaDims {
dim3 block, grid;
};
CudaOneDim then decides, from the number of elements, how many blocks and how many threads are needed (both along a single dimension):
CudaDims CudaOneDim(size_t size) {
  /**
   * Utility function to get cuda dimensions for 1D call
   */
  CudaDims dim;
  size_t num_blocks = (size + BASE_THREAD_NUM - 1) / BASE_THREAD_NUM;
  dim.block = dim3(BASE_THREAD_NUM, 1, 1);  // threads per block
  dim.grid = dim3(num_blocks, 1, 1);        // blocks per grid
  return dim;
}
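The ceiling division guarantees full coverage: for example, with BASE_THREAD_NUM = 256 and size = 1000, num_blocks = (1000 + 255) / 256 = 4, launching 1024 threads in total, of which the trailing 24 are masked off by the gid < size guards in the kernels above.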