向量加法:
#include <iostream>

// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, n).
//
// Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
// index and advances by the total number of launched threads, so every launch
// size covers all n elements (not only launches with at least n threads).
//
// Parameters:
//   n    - number of elements; n <= 0 makes the kernel a no-op.
//   a, b - input arrays, read at indices [0, n).
//   c    - output array, written at indices [0, n).
__global__ void vectorAdd(int n, const float* a, const float* b, float* c) {
    // Clamp negative sizes to zero before switching to unsigned arithmetic.
    const size_t count = n > 0 ? static_cast<size_t>(n) : 0;
    // Total number of threads in the grid; computed in size_t so the product
    // cannot wrap in 32 bits.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < count; i += stride) {
        c[i] = a[i] + b[i];
    }
}
// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Adds two vectors of 2^20 floats on the GPU and prints the first, middle and
// last element of the result. Returns 0 on success, 1 if any CUDA call fails.
int main() {
    const int n = 1 << 20;
    const size_t size = n * sizeof(float);

    // The buffers are filled by the host loop below and then handed to the
    // kernel, so they are allocated with cudaMallocManaged. Pointers start as
    // null so the cleanup at the end is safe after a partial allocation failure.
    float *a = nullptr, *b = nullptr, *c = nullptr;
    bool ok = cudaOk(cudaMallocManaged(&a, size), "cudaMallocManaged(a)") &&
              cudaOk(cudaMallocManaged(&b, size), "cudaMallocManaged(b)") &&
              cudaOk(cudaMallocManaged(&c, size), "cudaMallocManaged(c)");

    if (ok) {
        for (int i = 0; i < n; i++) {
            a[i] = 1.1f;
            b[i] = 2.3f;
        }

        const int threadPerBlock = 256;
        // Ceil-division so the last, partially filled block is still launched.
        const int blockPerGrid = (n + threadPerBlock - 1) / threadPerBlock;
        vectorAdd<<<blockPerGrid, threadPerBlock>>>(n, a, b, c);

        // A kernel launch returns no status: configuration errors are reported
        // by cudaGetLastError(), execution errors by the synchronizing call.
        // The synchronization is also required before the host reads c.
        ok = cudaOk(cudaGetLastError(), "vectorAdd launch") &&
             cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    if (ok) {
        std::cout << c[0] << " " << c[n / 2] << " " << c[n - 1] << std::endl;
    }

    // cudaFree on a null pointer is a no-op, so this is valid on every path.
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return ok ? 0 : 1;
}
矩阵乘法
#include <iostream>

// Dense matrix product c = a @ b with shapes (m,n) @ (n,k) = (m,k).
// All three matrices are stored row-major in flat arrays.
//
// Launch layout: 2-D grid of 2-D blocks; the x dimension spans output columns
// and the y dimension spans output rows. Each thread computes one element of
// c; threads that fall outside the (m,k) output do nothing.
//
// Parameters:
//   m, n, k - matrix dimensions as described above.
//   a       - left operand, m * n floats.
//   b       - right operand, n * k floats.
//   c       - output, m * k floats.
__global__ void matrixMul(int m, int n, int k, const float* a, const float* b, float* c) {
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    if (col >= k || row >= m) {
        return;
    }

    // Flat offsets are computed in size_t: products such as row * n or i * k
    // can exceed the range of a 32-bit int for large matrices.
    const size_t rowStartA = static_cast<size_t>(row) * n;
    const size_t rowStartC = static_cast<size_t>(row) * k;

    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        sum += a[rowStartA + i] * b[static_cast<size_t>(i) * k + col];
    }
    c[rowStartC + col] = sum;
}
// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Multiplies a 3x2 matrix filled with 2.0 by a 2x4 matrix filled with 3.0 on
// the GPU and prints three elements of the 3x4 result (each is n * 2 * 3 = 12).
// Returns 0 on success, 1 if any CUDA call fails.
int main() {
    const int m = 3;
    const int n = 2;
    const int k = 4;
    const size_t size_a = m * n * sizeof(float);
    const size_t size_b = n * k * sizeof(float);
    const size_t size_c = m * k * sizeof(float);

    // Pointers start as null so the cleanup at the end is safe even when an
    // allocation fails part-way through.
    float *a = nullptr, *b = nullptr, *c = nullptr;
    bool ok = cudaOk(cudaMallocManaged(&a, size_a), "cudaMallocManaged(a)") &&
              cudaOk(cudaMallocManaged(&b, size_b), "cudaMallocManaged(b)") &&
              cudaOk(cudaMallocManaged(&c, size_c), "cudaMallocManaged(c)");

    if (ok) {
        for (int i = 0; i < m * n; i++) {
            a[i] = 2.0f;
        }
        for (int i = 0; i < n * k; i++) {
            b[i] = 3.0f;
        }

        // x covers the k output columns, y covers the m output rows;
        // ceil-division keeps partially filled blocks at the edges.
        dim3 threadPerBlock(16, 16);
        dim3 blockPerGrid((k + threadPerBlock.x - 1) / threadPerBlock.x,
                          (m + threadPerBlock.y - 1) / threadPerBlock.y);
        matrixMul<<<blockPerGrid, threadPerBlock>>>(m, n, k, a, b, c);

        // A kernel launch returns no status: configuration errors are reported
        // by cudaGetLastError(), execution errors by the synchronizing call.
        // The synchronization is also required before the host reads c.
        ok = cudaOk(cudaGetLastError(), "matrixMul launch") &&
             cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    if (ok) {
        // First element, first element of row 2 (valid because m == 3), and
        // the last element of the result.
        std::cout << c[0] << " " << c[2 * k] << " " << c[(m - 1) * k + k - 1]
                  << std::endl;
    }

    // cudaFree on a null pointer is a no-op, so this is valid on every path.
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return ok ? 0 : 1;
}
无通道的卷积
#include <iostream>

// Single-image, single-kernel 2-D convolution (no channel or batch dimension).
// The kernel is applied without flipping, and input positions that fall
// outside the image are skipped, which is equivalent to zero padding.
//
// Launch layout: 2-D grid of 2-D blocks; x spans output columns and y spans
// output rows. Each thread produces one output element; threads outside the
// (Hout, Wout) output return immediately.
//
// Parameters:
//   H, W             - input image height and width.
//   kH, kW           - kernel height and width.
//   padH, padW       - padding applied on each side, vertically / horizontally.
//   strideH, strideW - vertical / horizontal stride.
//   Hout, Wout       - output height and width (computed by the caller).
//   img              - input image, H * W floats, row-major.
//   kernel           - filter weights, kH * kW floats, row-major.
//   output           - result, Hout * Wout floats, row-major.
__global__ void conv2D(int H, int W, int kH, int kW, int padH, int padW,
                       int strideH, int strideW, int Hout, int Wout,
                       float* img, float* kernel, float* output) {
    int ox = blockDim.x * blockIdx.x + threadIdx.x;  // current output column
    int oy = blockDim.y * blockIdx.y + threadIdx.y;  // current output row
    if (ox >= Wout || oy >= Hout) {
        return;
    }

    float sum = 0.0f;
    // Top-left input coordinate of the window covered by this output element;
    // negative when the window starts inside the padding.
    int in_x0 = ox * strideW - padW;
    int in_y0 = oy * strideH - padH;
    for (int i = 0; i < kH; i++) {
        for (int j = 0; j < kW; j++) {
            int in_xi = in_x0 + j;
            int in_yi = in_y0 + i;
            // Taps that land in the padding contribute nothing.
            if (in_xi >= 0 && in_xi < W && in_yi >= 0 && in_yi < H) {
                sum += img[in_yi * W + in_xi] * kernel[i * kW + j];
            }
        }
    }
    output[oy * Wout + ox] = sum;
}

// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Convolves a 12x12 image filled with 10.0 with a 3x3 kernel filled with 0.5
// (padding 1, stride 1) on the GPU and prints the output matrix row by row.
// Returns 0 on success, 1 if any CUDA call fails.
int main() {
    const int H = 12, W = 12;
    const int kH = 3, kW = 3;
    const int padH = 1, padW = 1;
    const int strideH = 1, strideW = 1;
    // Output extent for the given input size, padding, kernel size and stride.
    const int Hout = (H + 2 * padH - kH) / strideH + 1;
    const int Wout = (W + 2 * padW - kW) / strideW + 1;

    const size_t img_size = H * W * sizeof(float);
    const size_t kernel_size = kH * kW * sizeof(float);
    const size_t out_size = Hout * Wout * sizeof(float);

    // Pointers start as null so the cleanup at the end is safe even when an
    // allocation fails part-way through.
    float *img = nullptr, *kernel = nullptr, *output = nullptr;
    bool ok = cudaOk(cudaMallocManaged(&img, img_size), "cudaMallocManaged(img)") &&
              cudaOk(cudaMallocManaged(&kernel, kernel_size), "cudaMallocManaged(kernel)") &&
              cudaOk(cudaMallocManaged(&output, out_size), "cudaMallocManaged(output)");

    if (ok) {
        for (int i = 0; i < H * W; i++) {
            img[i] = 10.0f;
        }
        for (int i = 0; i < kH * kW; i++) {
            kernel[i] = 0.5f;
        }

        // x covers output columns, y covers output rows; ceil-division keeps
        // partially filled blocks at the edges.
        dim3 threadPerBlock(16, 16);
        dim3 blockPerGrid((Wout + threadPerBlock.x - 1) / threadPerBlock.x,
                          (Hout + threadPerBlock.y - 1) / threadPerBlock.y);
        conv2D<<<blockPerGrid, threadPerBlock>>>(H, W, kH, kW, padH, padW, strideH,
                                                 strideW, Hout, Wout, img, kernel,
                                                 output);

        // A kernel launch returns no status: configuration errors are reported
        // by cudaGetLastError(), execution errors by the synchronizing call.
        // The synchronization is also required before the host reads output.
        ok = cudaOk(cudaGetLastError(), "conv2D launch") &&
             cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    if (ok) {
        for (int i = 0; i < Hout; i++) {
            for (int j = 0; j < Wout; j++) {
                std::cout << output[i * Wout + j] << " ";
            }
            std::cout << std::endl;
        }
    }

    // Release the managed buffers (the original never freed them).
    // cudaFree on a null pointer is a no-op, so this is valid on every path.
    cudaFree(img);
    cudaFree(kernel);
    cudaFree(output);
    return ok ? 0 : 1;
}
参考文献
- CUDA 入门教程:更简单的介绍 (更新版)