Vulkan_多线程渲染

用了这么长时间的vulkan渲染，我们本次主要来尝试使用vulkan相对于其他图形api的一个优势：多线程渲染。

本次我们主要来实现：在多个线程中平分渲染1024个模型并且在cpu中根据模型位置实现一个简单的视锥体剔除。

一、理论基础

文章开始之前，我们先来介绍两个vulkan常用功能：Fence及副命令缓冲区。
栅栏(fence)：当主机需要等待设备完成某次提交中的大量工作时使用，通常需要操作系统的协助，是一种中等量级的同步原语。
副命令缓冲区(secondary command buffer，下文也称“辅助命令缓冲区”)：是可以由主命令缓冲区调用的命令缓冲区。副命令缓冲区可以从主命令缓冲区中继承部分状态，从而减少重置整个管线状态所导致的巨大开销。

二、数据准备

2.1 数据定义

2.1.1 多线程类

自定义一个线程管理类来实现多线程处理

#include <vector>
#include <thread>
#include <queue>
#include <mutex>
#include <condition_variable>
#include <functional>
#include <cstdint>
#include <memory>
#include <utility>

/// Builds a T on the heap from the forwarded arguments and returns the owning
/// std::unique_ptr. Local equivalent of C++14's std::make_unique, declared in
/// the global namespace.
template<typename T, typename ...Args>
std::unique_ptr<T> make_unique(Args&& ...args)
{
    return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
}

namespace vks
{
    /// A single worker thread that executes queued jobs one at a time, in the
    /// order they were added.
    class Thread
    {
    private:
        bool destroying = false;
        std::thread worker;
        std::queue<std::function<void()>> jobQueue;
        std::mutex queueMutex;
        std::condition_variable condition;

        /// Worker entry point: sleeps until a job is queued (or shutdown is
        /// requested), runs the job, then removes it from the queue.
        void queueLoop()
        {
            while (true)
            {
                std::function<void()> job;
                {
                    std::unique_lock<std::mutex> lock(queueMutex);
                    condition.wait(lock, [this] { return !jobQueue.empty() || destroying; });
                    if (destroying)
                    {
                        break;
                    }
                    // Move the callable out instead of copying it. The (now
                    // moved-from) entry stays in the queue until the job has
                    // finished, so wait() keeps blocking while it is running.
                    job = std::move(jobQueue.front());
                }

                // Run the job without holding the lock so addJob() is never blocked by it.
                job();

                {
                    std::lock_guard<std::mutex> lock(queueMutex);
                    jobQueue.pop();
                    // Wake a thread blocked in wait() so it can re-check the queue.
                    condition.notify_one();
                }
            }
        }

    public:
        Thread()
        {
            // Started in the constructor body (not the initializer list) so the
            // queue, mutex and condition variable are fully constructed before
            // queueLoop() can touch them.
            worker = std::thread(&Thread::queueLoop, this);
        }

        // The worker holds a pointer to this object, so it must not be copied.
        Thread(const Thread&) = delete;
        Thread& operator=(const Thread&) = delete;

        /// Waits for all queued jobs to finish, then stops and joins the worker.
        ~Thread()
        {
            if (worker.joinable())
            {
                wait();
                {
                    std::lock_guard<std::mutex> lock(queueMutex);
                    destroying = true;
                    condition.notify_one();
                }
                worker.join();
            }
        }

        /// Appends a job to this thread's queue and wakes the worker.
        void addJob(std::function<void()> function)
        {
            std::lock_guard<std::mutex> lock(queueMutex);
            jobQueue.push(std::move(function));
            condition.notify_one();
        }

        /// Blocks the caller until the job queue is empty, i.e. every job added
        /// so far has finished executing.
        void wait()
        {
            std::unique_lock<std::mutex> lock(queueMutex);
            condition.wait(lock, [this]() { return jobQueue.empty(); });
        }
    };

    /// Owns a set of Thread workers.
    class ThreadPool
    {
    public:
        std::vector<std::unique_ptr<Thread>> threads;

        /// Destroys the current workers (each one finishes its pending jobs
        /// first) and creates `count` new ones.
        void setThreadCount(uint32_t count)
        {
            threads.clear();
            threads.reserve(count);
            for (uint32_t i = 0; i < count; i++)
            {
                // Qualified so the global helper is always chosen, even if
                // std::make_unique is visible through a using-directive.
                threads.push_back(::make_unique<Thread>());
            }
        }

        /// Blocks until every worker has emptied its job queue.
        void wait()
        {
            for (auto &thread : threads)
            {
                thread->wait();
            }
        }
    };
}

这里将每个Thread需要执行的任务放在了一个jobQueue中，在jobQueue中没有任何任务时，将当前线程睡眠，而当有新的任务加入进来以后，唤醒该线程执行任务。
ThreadPool负责创建Thread，在每一帧中通过wait函数来等待每个线程中的所有任务都结束。

2.1.2 视锥体类

我们实现一个视锥体类来实现判断模型是否在视锥体之中：

#include <array>
#include <math.h>
#include <glm/glm.hpp>namespace vks
{class Frustum{public:enum side { LEFT = 0, RIGHT = 1, TOP = 2, BOTTOM = 3, BACK = 4, FRONT = 5 };std::array<glm::vec4, 6> planes;void update(glm::mat4 matrix){planes[LEFT].x = matrix[0].w + matrix[0].x;planes[LEFT].y = matrix[1].w + matrix[1].x;planes[LEFT].z = matrix[2].w + matrix[2].x;planes[LEFT].w = matrix[3].w + matrix[3].x;planes[RIGHT].x = matrix[0].w - matrix[0].x;planes[RIGHT].y = matrix[1].w - matrix[1].x;planes[RIGHT].z = matrix[2].w - matrix[2].x;planes[RIGHT].w = matrix[3].w - matrix[3].x;planes[TOP].x = matrix[0].w - matrix[0].y;planes[TOP].y = matrix[1].w - matrix[1].y;planes[TOP].z = matrix[2].w - matrix[2].y;planes[TOP].w = matrix[3].w - matrix[3].y;planes[BOTTOM].x = matrix[0].w + matrix[0].y;planes[BOTTOM].y = matrix[1].w + matrix[1].y;planes[BOTTOM].z = matrix[2].w + matrix[2].y;planes[BOTTOM].w = matrix[3].w + matrix[3].y;planes[BACK].x = matrix[0].w + matrix[0].z;planes[BACK].y = matrix[1].w + matrix[1].z;planes[BACK].z = matrix[2].w + matrix[2].z;planes[BACK].w = matrix[3].w + matrix[3].z;planes[FRONT].x = matrix[0].w - matrix[0].z;planes[FRONT].y = matrix[1].w - matrix[1].z;planes[FRONT].z = matrix[2].w - matrix[2].z;planes[FRONT].w = matrix[3].w - matrix[3].z;for (auto i = 0; i < planes.size(); i++){float length = sqrtf(planes[i].x * planes[i].x + planes[i].y * planes[i].y + planes[i].z * planes[i].z);planes[i] /= length;}}bool checkSphere(glm::vec3 pos, float radius){for (auto i = 0; i < planes.size(); i++){if ((planes[i].x * pos.x) + (planes[i].y * pos.y) + (planes[i].z * pos.z) + planes[i].w <= -radius){return false;}}return true;}};
}

2.1.3 场景数据

除了上边两个类外，我们需要在场景中定义一些基本的数据结构来存放多线程场景所需要的数据：

...// 用于线程推常量块的共享矩阵 struct {glm::mat4 projection;glm::mat4 view;} matrices;// 每个线程中的绘制个数uint32_t numObjectsPerThread = totalObjects/numThreads; // 模型总个数uint32_t totalObjects = 1024; // 并发线程数uint32_t numThreads= std::thread::hardware_concurrency();// 使用推入常量更新着色器struct ThreadPushConstantBlock {glm::mat4 mvp;glm::vec3 color;};//模型数据信息struct ObjectData {glm::mat4 model;glm::vec3 pos;glm::vec3 rotation;float rotationDir;float rotationSpeed;float scale;float deltaT;float stateT = 0;bool visible = true;};struct ThreadData {VkCommandPool commandPool;// 每个渲染对象对应一个命令缓冲区std::vector<VkCommandBuffer> commandBuffer;// 每个渲染对象对应一个推入常量块std::vector<ThreadPushConstantBlock> pushConstBlock;// 每个对象的信息(位置、旋转等)std::vector<ObjectData> objectData;};std::vector<ThreadData> threadData;vks::ThreadPool threadPool;// 栅栏等待所有命令缓冲区完成后再呈现给交换链VkFence renderFence = {};// 场景大小float objectSphereDim;// frustum用于剔除不可见对象vks::Frustum frustum;//生成随机数用std::default_random_engine rndEngine;

2.2 初始化多线程

首先我们来创建一个函数prepareMultiThreadedRenderer，根据本机支持的并发线程数初始化所有线程数据，并创建推入常量数据：

 void prepareMultiThreadedRenderer(){// 因为这个演示更新每个帧上的命令缓冲区 我们不使用基类中的per framebuffer命令缓冲区，而是创建一个主命令缓冲区VkCommandBufferAllocateInfo cmdBufAllocateInfo =vks::initializers::commandBufferAllocateInfo(cmdPool,VK_COMMAND_BUFFER_LEVEL_PRIMARY,1);VK_CHECK_RESULT(vkAllocateCommandBuffers(device, &cmdBufAllocateInfo, &primaryCommandBuffer));threadData.resize(numThreads);for (uint32_t i = 0; i < numThreads; i++) {ThreadData *thread = &threadData[i];// 为每个线程创建一个命令池VkCommandPoolCreateInfo cmdPoolInfo = vks::initializers::commandPoolCreateInfo();cmdPoolInfo.queueFamilyIndex = swapChain.queueNodeIndex;cmdPoolInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;VK_CHECK_RESULT(vkCreateCommandPool(device, &cmdPoolInfo, nullptr, &thread->commandPool));// 每个由这个线程更新的对象都有一个辅助命令缓冲区thread->commandBuffer.resize(numObjectsPerThread);// 为每个线程生成辅助命令缓冲区VkCommandBufferAllocateInfo secondaryCmdBufAllocateInfo =vks::initializers::commandBufferAllocateInfo(thread->commandPool,VK_COMMAND_BUFFER_LEVEL_SECONDARY,thread->commandBuffer.size());VK_CHECK_RESULT(vkAllocateCommandBuffers(device, &secondaryCmdBufAllocateInfo, thread->commandBuffer.data()));thread->pushConstBlock.resize(numObjectsPerThread);thread->objectData.resize(numObjectsPerThread);for (uint32_t j = 0; j < numObjectsPerThread; j++) {float theta = 2.0f * float(M_PI) * rnd(1.0f);float phi = acos(1.0f - 2.0f * rnd(1.0f));thread->objectData[j].pos = glm::vec3(sin(phi) * cos(theta), 0.0f, cos(phi)) * 35.0f;thread->objectData[j].rotation = glm::vec3(0.0f, rnd(360.0f), 0.0f);thread->objectData[j].deltaT = rnd(1.0f);thread->objectData[j].rotationDir = (rnd(100.0f) < 50.0f) ? 
1.0f : -1.0f;thread->objectData[j].rotationSpeed = (2.0f + rnd(4.0f)) * thread->objectData[j].rotationDir;thread->objectData[j].scale = 0.75f + rnd(0.5f);thread->pushConstBlock[j].color = glm::vec3(rnd(1.0f), rnd(1.0f), rnd(1.0f));}}//根据相机更新共享矩阵matrices.projection = camera.matrices.perspective;matrices.view = camera.matrices.view;frustum.update(matrices.projection * matrices.view);}

三、多线程渲染

3.1 栅栏的创建与使用

首先我们在初始时候创建一个栅栏用于同步数据

 //为同步创建一个fenceVkFenceCreateInfo fenceCreateInfo {};fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;fenceCreateInfo.flags = VK_FENCE_CREATE_SIGNALED_BIT;vkCreateFence(device, &fenceCreateInfo, nullptr, &renderFence);

创建好栅栏后，我们在renderLoop的每一帧中使用它来进行同步：

 //等待fence发出信号，表明所有命令缓冲区都准备好了VkResult fenceRes;do {fenceRes = vkWaitForFences(device, 1, &renderFence, VK_TRUE, 100000000);} while (fenceRes == VK_TIMEOUT);VK_CHECK_RESULT(fenceRes);vkResetFences(device, 1, &renderFence);VulkanExampleBase::prepareFrame();//下文讲到updateCommandBuffers(frameBuffers[currentBuffer]);submitInfo.commandBufferCount = 1;submitInfo.pCommandBuffers = &primaryCommandBuffer;VK_CHECK_RESULT(vkQueueSubmit(queue, 1, &submitInfo, renderFence));

3.2 多线程更新命令缓冲区

我们主要是创建一个updateCommandBuffers函数来使用线程池更新辅助命令缓冲区，并将它们放入主命令缓冲区中，主命令缓冲区之后提交给队列以进行呈现：

 // Re-records the primary command buffer for the given framebuffer.
 // One job per object is queued on the thread pool; each job records that
 // object's secondary command buffer (see threadRenderCode). After all jobs
 // have finished, the secondary buffers of the visible objects are executed
 // from inside the primary buffer's render pass.
 void updateCommandBuffers(VkFramebuffer frameBuffer)
 {
     // Secondary command buffers that will be submitted this frame
     std::vector<VkCommandBuffer> commandBuffers;

     VkCommandBufferBeginInfo cmdBufInfo = vks::initializers::commandBufferBeginInfo();

     VkClearValue clearValues[2];
     clearValues[0].color = defaultClearColor;
     clearValues[1].depthStencil = { 1.0f, 0 };

     VkRenderPassBeginInfo renderPassBeginInfo = vks::initializers::renderPassBeginInfo();
     renderPassBeginInfo.renderPass = renderPass;
     renderPassBeginInfo.renderArea.offset.x = 0;
     renderPassBeginInfo.renderArea.offset.y = 0;
     renderPassBeginInfo.renderArea.extent.width = width;
     renderPassBeginInfo.renderArea.extent.height = height;
     renderPassBeginInfo.clearValueCount = 2;
     renderPassBeginInfo.pClearValues = clearValues;
     // Set the target frame buffer
     renderPassBeginInfo.framebuffer = frameBuffer;

     VK_CHECK_RESULT(vkBeginCommandBuffer(primaryCommandBuffer, &cmdBufInfo));

     // The primary command buffer does not contain any rendering commands;
     // these are stored in (and retrieved from) the secondary command buffers.
     vkCmdBeginRenderPass(primaryCommandBuffer, &renderPassBeginInfo, VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS);

     // Inheritance info for the secondary command buffers
     VkCommandBufferInheritanceInfo inheritanceInfo = vks::initializers::commandBufferInheritanceInfo();
     inheritanceInfo.renderPass = renderPass;
     // Secondary command buffers also use the currently active framebuffer
     inheritanceInfo.framebuffer = frameBuffer;

     // Add one job to the owning thread's queue for every object to be rendered.
     // [=] copies t, i and inheritanceInfo into each job.
     for (uint32_t t = 0; t < numThreads; t++)
     {
         for (uint32_t i = 0; i < numObjectsPerThread; i++)
         {
             threadPool.threads[t]->addJob([=] { threadRenderCode(t, i, inheritanceInfo); });
         }
     }

     // All secondary command buffers must be fully recorded before they are executed.
     threadPool.wait();

     // Only submit objects that are inside the current view frustum
     commandBuffers.reserve(static_cast<size_t>(numThreads) * numObjectsPerThread);
     for (uint32_t t = 0; t < numThreads; t++)
     {
         for (uint32_t i = 0; i < numObjectsPerThread; i++)
         {
             if (threadData[t].objectData[i].visible)
             {
                 commandBuffers.push_back(threadData[t].commandBuffer[i]);
             }
         }
     }

     // Execute the render commands from the secondary command buffers.
     // vkCmdExecuteCommands requires commandBufferCount > 0, so skip the call
     // when every object has been culled.
     if (!commandBuffers.empty())
     {
         vkCmdExecuteCommands(primaryCommandBuffer, static_cast<uint32_t>(commandBuffers.size()), commandBuffers.data());
     }

     vkCmdEndRenderPass(primaryCommandBuffer);

     VK_CHECK_RESULT(vkEndCommandBuffer(primaryCommandBuffer));
 }

这是在每一帧调用的总的更新函数，可以看到，所有线程的CommandBuffer，都是内嵌在一个大的CommandBuffer的一个RenderPass内部的。每个模型对应一个CommandBuffer，一个线程在一帧内要处理多个CommandBuffer。

3.3 为每个线程构建辅助命令缓冲区

在上述流程中，我们需要为每个线程构建辅助命令缓冲区,即threadRenderCode函数所做的内容：

 void threadRenderCode(uint32_t threadIndex, uint32_t cmdBufferIndex, VkCommandBufferInheritanceInfo inheritanceInfo){ThreadData *thread = &threadData[threadIndex];ObjectData *objectData = &thread->objectData[cmdBufferIndex];// 检查是否在视锥体范围内objectData->visible = frustum.checkSphere(objectData->pos, objectSphereDim * 0.5f); if (!objectData->visible)return;VkCommandBufferBeginInfo commandBufferBeginInfo = vks::initializers::commandBufferBeginInfo();commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;commandBufferBeginInfo.pInheritanceInfo = &inheritanceInfo;VkCommandBuffer cmdBuffer = thread->commandBuffer[cmdBufferIndex];VK_CHECK_RESULT(vkBeginCommandBuffer(cmdBuffer, &commandBufferBeginInfo));VkViewport viewport = vks::initializers::viewport((float)width, (float)height, 0.0f, 1.0f);vkCmdSetViewport(cmdBuffer, 0, 1, &viewport);VkRect2D scissor = vks::initializers::rect2D(width, height, 0, 0);vkCmdSetScissor(cmdBuffer, 0, 1, &scissor);vkCmdBindPipeline(cmdBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipelines.phong);// 实时更新各model的位置等数据if (!paused) {objectData->rotation.y += 2.5f * objectData->rotationSpeed * frameTimer;if (objectData->rotation.y > 360.0f) {objectData->rotation.y -= 360.0f;}objectData->deltaT += 0.15f * frameTimer;if (objectData->deltaT > 1.0f)objectData->deltaT -= 1.0f;objectData->pos.y = sin(glm::radians(objectData->deltaT * 360.0f)) * 2.5f;}objectData->model = glm::translate(glm::mat4(1.0f), objectData->pos);objectData->model = glm::rotate(objectData->model, -sinf(glm::radians(objectData->deltaT * 360.0f)) * 0.25f, glm::vec3(objectData->rotationDir, 0.0f, 0.0f));objectData->model = glm::rotate(objectData->model, glm::radians(objectData->rotation.y), glm::vec3(0.0f, objectData->rotationDir, 0.0f));objectData->model = glm::rotate(objectData->model, glm::radians(objectData->deltaT * 360.0f), glm::vec3(0.0f, objectData->rotationDir, 0.0f));objectData->model = glm::scale(objectData->model, 
glm::vec3(objectData->scale));thread->pushConstBlock[cmdBufferIndex].mvp = matrices.projection * matrices.view * objectData->model;// 更新着色器推送常量块，其中包含模型视图矩阵vkCmdPushConstants(cmdBuffer,pipelineLayout,VK_SHADER_STAGE_VERTEX_BIT,0,sizeof(ThreadPushConstantBlock),&thread->pushConstBlock[cmdBufferIndex]);VkDeviceSize offsets[1] = { 0 };vkCmdBindVertexBuffers(cmdBuffer, 0, 1, &models.ufo.vertices.buffer, offsets);vkCmdBindIndexBuffer(cmdBuffer, models.ufo.indices.buffer, 0, VK_INDEX_TYPE_UINT32);vkCmdDrawIndexed(cmdBuffer, models.ufo.indexCount, 1, 0, 0, 0);VK_CHECK_RESULT(vkEndCommandBuffer(cmdBuffer));}

上面就是每个线程所要执行的具体任务，比较直观：在获取到模型对应的CommandBuffer后，先对飞碟本身的数据信息进行更新，然后对CommandBuffer进行重新写入。尽管感觉上我们只需要重新提交一次PushConstant命令，但是其他所有不变的状态(比如VertexBuffer、IndexBuffer、Scissor、Viewport)也需要再提交一次。这里就体现出与传统API的区别了：在D3D11中，我们只需要修改某个ConstantBuffer，其他的都不需要动，然后直接提交DrawCall就行；但是在Vulkan中，每个CommandBuffer内的状态只要有一点需要修改，其他所有的状态就都要跟着重新设定一次。
经历了上述一大串的操作之后，我们可以看到，将1024个模型分配到多个线程中共同渲染的结果如下：