测试cudaStream队列的深度 一.代码 二.编译运行[得出队列深度为512] 以下代码片段用于测试cudaStream队列的深度。方法:主线程持续提交任务,另起一个线程通过cudaEventQuery查询已完成的任务;两个计数器的差值即为已提交但尚未完成的任务数,当提交开始阻塞时,该差值就是队列深度。
一.代码
#include iostream
#include thread
#include … 测试cudaStream队列的深度 一.代码 二.编译运行[得出队列深度为512] 以下代码片段用于测试cudaStream队列的深度。方法:主线程持续提交任务,另起一个线程通过cudaEventQuery查询已完成的任务;两个计数器的差值即为已提交但尚未完成的任务数,当提交开始阻塞时,该差值就是队列深度。
一.代码
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

#include <cuda_runtime.h>
#include <time.h>

/**
 * Placeholder workload launched repeatedly to fill a stream's pending-work queue.
 *
 * Launch layout: 1-D grid of 1-D blocks; no shared memory is used.
 *
 * @param data Device pointer supplied by the caller; the kernel never reads or
 *             writes it.
 */
__global__ void time_consuming_kernel(int *data) {
    // 64-bit index: the driver program launches with the device's maximum 1-D
    // grid size, where blockIdx.x * blockDim.x no longer fits in 32 bits.
    const long long idx =
        static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    float value = 0.0f;
    for (int j = 0; j < 1000; j++) {
        for (long long i = 0; i < 1000000000; i++) {
            value += sinf(static_cast<float>(idx + i));
        }
    }
    // NOTE(review): `value` is never stored anywhere, so the compiler is free to
    // eliminate both loops; the kernel's run time may then come only from the
    // size of the launch. Left as-is to keep the measurement unchanged.
}

/**
 * Minimal blocking FIFO shared between threads.
 *
 * Every operation takes the internal mutex; dequeue() sleeps on a condition
 * variable until an element is available.
 */
template <typename T>
class ThreadSafeQueue {
public:
    ThreadSafeQueue() = default;

    // Non-copyable: the mutex and condition variable must not be duplicated.
    ThreadSafeQueue(const ThreadSafeQueue &) = delete;
    ThreadSafeQueue &operator=(const ThreadSafeQueue &) = delete;

    /** Appends `item` at the tail and wakes one waiting consumer. */
    void enqueue(T item) {
        std::lock_guard<std::mutex> lock(mutex_);
        queue_.push(std::move(item));
        cond_var_.notify_one();
    }

    /**
     * Removes the head element into `item`, blocking while the queue is empty.
     *
     * @return Always true (there is no timeout or shutdown path).
     */
    bool dequeue(T &item) {
        std::unique_lock<std::mutex> lock(mutex_);
        // The predicate form of wait() re-checks after spurious wakeups.
        cond_var_.wait(lock, [this] { return !queue_.empty(); });
        item = std::move(queue_.front());
        queue_.pop();
        return true;
    }

    /** Returns true when the queue holds no elements at the time of the call. */
    bool empty() const {
        std::lock_guard<std::mutex> lock(mutex_);
        return queue_.empty();
    }

    /** Returns the number of queued elements at the time of the call. */
    size_t size() const {
        std::lock_guard<std::mutex> lock(mutex_);
        return queue_.size();
    }

private:
    std::queue<T> queue_;
    mutable std::mutex mutex_;  // mutable so the const observers can lock it
    std::condition_variable cond_var_;
};

// Number of events the polling thread has observed as completed.
std::atomic<unsigned int> recv_counter{0};
// Number of kernel launches (and their events) submitted by the main thread.
std::atomic<unsigned int> snd_counter{0};
// Hands recorded events from the submitting thread to the polling thread.
ThreadSafeQueue<cudaEvent_t> tsQueue;

// Below: the thread function that polls the stream's events for completed work.
// Aborts with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        const cudaError_t cuda_check_err_ = (call);                          \
        if (cuda_check_err_ != cudaSuccess) {                                \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,         \
                         __LINE__, cudaGetErrorString(cuda_check_err_));     \
            std::abort();                                                    \
        }                                                                    \
    } while (0)

/**
 * Polling-thread body: takes events from tsQueue in submission order, waits for
 * each one to complete, bumps recv_counter and destroys the event.
 *
 * A null event is treated as a stop request and ends the loop.
 *
 * @param stream Unused; kept so the thread entry point's signature is unchanged.
 */
void query_stream(cudaStream_t stream) {
    (void)stream;
    while (true) {
        cudaEvent_t event;
        tsQueue.dequeue(event);
        if (event == nullptr) {
            break;  // stop sentinel pushed by main() after a bounded run
        }

        // cudaErrorNotReady only means "still running": poll once per
        // millisecond. Any other non-success status is a real failure and must
        // not be retried forever.
        cudaError_t status = cudaEventQuery(event);
        while (status == cudaErrorNotReady) {
            std::this_thread::sleep_for(std::chrono::milliseconds(1));
            status = cudaEventQuery(event);
        }
        CUDA_CHECK(status);

        ++recv_counter;
        CUDA_CHECK(cudaEventDestroy(event));
    }
}

/**
 * Driver: keeps launching time_consuming_kernel on one stream, recording an
 * event after each launch, and prints how many launches are still outstanding
 * (snd - rcv) together with how long the launch + record calls took on the
 * host. A sudden jump in that duration shows the submission call has started
 * to block.
 *
 * Usage: demo [max_launches]
 *   max_launches  optional launch count; absent, 0 or unparsable means run
 *                 until interrupted (the original behaviour).
 */
int main(int argc, char *argv[]) {
    unsigned long max_launches = 0;
    if (argc > 1) {
        max_launches = std::strtoul(argv[1], nullptr, 10);
    }

    const int dev = 0;
    CUDA_CHECK(cudaSetDevice(dev));
    cudaDeviceProp device_prop;
    CUDA_CHECK(cudaGetDeviceProperties(&device_prop, dev));
    const int max_threads_per_block = device_prop.maxThreadsPerBlock;
    const int max_blocks_per_grid_dim = device_prop.maxGridSize[0];

    int *d_data = nullptr;
    CUDA_CHECK(cudaMalloc(&d_data, sizeof(int)));
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // Start the thread that polls the recorded events.
    std::thread query_thread(query_stream, stream);

    for (unsigned long launched = 0;
         max_launches == 0 || launched < max_launches; ++launched) {
        cudaEvent_t ev;
        CUDA_CHECK(cudaEventCreate(&ev));

        // Only the launch and the event record are timed; the error checks
        // run after `end` so they do not affect the measurement.
        auto start = std::chrono::high_resolution_clock::now();
        time_consuming_kernel<<<max_blocks_per_grid_dim, max_threads_per_block,
                                0, stream>>>(d_data);
        const cudaError_t record_status = cudaEventRecord(ev, stream);
        auto end = std::chrono::high_resolution_clock::now();
        CUDA_CHECK(cudaGetLastError());  // launch-configuration errors
        CUDA_CHECK(record_status);

        std::chrono::duration<double, std::milli> diff = end - start;
        tsQueue.enqueue(ev);
        snd_counter += 1;
        printf("snd:%d rcv:%d gap:%d lanuch_duration:%f\n", (int)snd_counter,
               (int)recv_counter, int(snd_counter - recv_counter),
               diff.count());
    }

    // Bounded run only: the sentinel is queued behind every real event, so the
    // polling thread drains all outstanding work before it exits.
    tsQueue.enqueue(static_cast<cudaEvent_t>(nullptr));
    query_thread.join();

    // Release resources.
    CUDA_CHECK(cudaStreamDestroy(stream));
    CUDA_CHECK(cudaFree(d_data));
    return 0;
}

二.编译运行[得出队列深度为512]
/usr/local/cuda/bin/nvcc -o demo main.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64
./demo
输出
snd:509 rcv:0 gap:509 lanuch_duration:0.004661
snd:510 rcv:0 gap:510 lanuch_duration:0.003677
snd:511 rcv:0 gap:511 lanuch_duration:0.004387
snd:512 rcv:0 gap:512 lanuch_duration:11307.932958
snd:513 rcv:1 gap:512 lanuch_duration:11302.601698
snd:514 rcv:2 gap:512 lanuch_duration:11302.245001