Relay外部库使用

Relay外部库使用
本文介绍如何将cuDNN或cuBLAS等外部库与Relay一起使用。
Relay内部使用TVM生成目标特定的代码。例如，使用cuda后端，TVM为用户提供的网络中的所有层生成cuda内核。有时将各种供应商开发的外部库合并到Relay中也很有帮助。幸运的是，TVM具有透明地调用这些库的机制。对于Relay用户，要做的只是适当地设置目标字符串。
在可以使用Relay的外部库之前，TVM必须与要使用的库一起构建。例如，要使用cuDNN，需要启用cmake / config.cmake中的USE_CUDNN选项，并在必要时指定cuDNN include和库目录。
首先，导入Relay和TVM。
import tvm
from tvm import te
import numpy as np
from tvm.contrib import graph_runtime as runtime
from tvm import relay
from tvm.relay import testing
import tvm.testing
创建一个简单的网络
创建一个非常简单的网络进行演示。由卷积，批处理归一化和ReLU激活组成。
out_channels = 16
batch_size = 1

data = relay.var(“data”, relay.TensorType((batch_size, 3, 224, 224), “float32”))
weight = relay.var(“weight”)
bn_gamma = relay.var(“bn_gamma”)
bn_beta = relay.var(“bn_beta”)
bn_mmean = relay.var(“bn_mean”)
bn_mvar = relay.var(“bn_var”)

simple_net = relay.nn.conv2d(
data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, padding=(1, 1)
)
simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, bn_mvar)[0]
simple_net = relay.nn.relu(simple_net)
simple_net = relay.Function(relay.analysis.free_vars(simple_net), simple_net)

data_shape = (batch_size, 3, 224, 224)
net, params = testing.create_workload(simple_net)
使用cuda后端构建并运行
像往常一样，使用cuda后端构建并运行此网络。通过将日志记录级别设置为DEBUG，Relay图编译的结果将作为伪代码转储。
import logging

logging.basicConfig(level=logging.DEBUG) # to dump TVM IR after fusion

target = “cuda”
lib = relay.build_module.build(net, target, params=params)

ctx = tvm.context(target, 0)
data = np.random.uniform(-1, 1, size=data_shape).astype(“float32”)
module = runtime.GraphModule(lib"default")
module.set_input(“data”, data)
module.run()
out_shape = (batch_size, out_channels, 224, 224)
out = module.get_output(0, tvm.nd.empty(out_shape))
out_cuda = out.asnumpy()
生成的伪代码应如下所示。注意如何将偏差添加，批处理规范化和ReLU激活融合到卷积内核中。TVM根据此表示生成单个融合内核。
produce tensor {
// attr [iter_var(blockIdx.z, , blockIdx.z)] thread_extent = 1
// attr [compute] storage_scope = “local”
allocate compute[float32 * 32]
// attr [pad_temp.shared] storage_scope = “shared”
allocate pad_temp.shared[float32 * 180]
// attr [placeholder.shared] storage_scope = “shared”
allocate placeholder.shared[float32 * 144]
// attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 28
// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 14
// attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 4
// attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 1
// attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 16
produce compute {
compute[0] = 0.000000f
compute[1] = 0.000000f
compute[2] = 0.000000f
compute[3] = 0.000000f
compute[4] = 0.000000f
compute[5] = 0.000000f
compute[6] = 0.000000f
compute[7] = 0.000000f
compute[8] = 0.000000f
compute[9] = 0.000000f
compute[10] = 0.000000f
compute[11] = 0.000000f
compute[12] = 0.000000f
compute[13] = 0.000000f
compute[14] = 0.000000f
compute[15] = 0.000000f
compute[16] = 0.000000f
compute[17] = 0.000000f
compute[18] = 0.000000f
compute[19] = 0.000000f
compute[20] = 0.000000f
compute[21] = 0.000000f
compute[22] = 0.000000f
compute[23] = 0.000000f
compute[24] = 0.000000f
compute[25] = 0.000000f
compute[26] = 0.000000f
compute[27] = 0.000000f
compute[28] = 0.000000f
compute[29] = 0.000000f
compute[30] = 0.000000f
compute[31] = 0.000000f
for (rc.outer, 0, 3) {
produce pad_temp.shared {
// attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 4
// attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 1
// attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 16
if (likely(((threadIdx.z15) < (60 - threadIdx.x)))) {
if (likely((threadIdx.x < 15))) {
pad_temp.shared[(((((threadIdx.z15) + threadIdx.x)/60)180) + ((((((threadIdx.z15) + threadIdx.x)/6) % 10)18) + ((((threadIdx.z3) + threadIdx.x)3) % 18)))] = tvm_if_then_else((((((1 - ((((threadIdx.z15) + threadIdx.x)/6) % 10)) <= (blockIdx.y8)) && ((blockIdx.y8) < (225 - ((((threadIdx.z15) + threadIdx.x)/6) % 10)))) && ((1 - ((((threadIdx.z3) + threadIdx.x)3) % 18)) <= (blockIdx.x16))) && ((blockIdx.x16) < (225 - ((((threadIdx.z3) + threadIdx.x)3) % 18)))), placeholder[((((((((blockIdx.y112) + blockIdx.x) + (rc.outer3136)) + ((((threadIdx.z15) + threadIdx.x)/60)9408))16) + ((((threadIdx.z3) + threadIdx.x)3) % 18)) + (((((threadIdx.z15) + threadIdx.x)/6) % 10)224)) + -225)], 0.000000f)
pad_temp.shared[(((((((threadIdx.z15) + threadIdx.x)3) + 1)/180)180) + ((((((((threadIdx.z15) + threadIdx.x)3) + 1)/18) % 10)18) + (((((threadIdx.z3) + threadIdx.x)3) + 1) % 18)))] = tvm_if_then_else((((((1 - ((((((threadIdx.z15) + threadIdx.x)3) + 1)/18) % 10)) <= (blockIdx.y8)) && ((blockIdx.y8) < (225 - ((((((threadIdx.z15) + threadIdx.x)3) + 1)/18) % 10)))) && ((1 - (((((threadIdx.z3) + threadIdx.x)3) + 1) % 18)) <= (blockIdx.x16))) && ((blockIdx.x16) < (225 - (((((threadIdx.z3) + threadIdx.x)3) + 1) % 18)))), placeholder[((((((((blockIdx.y112) + blockIdx.x) + (rc.outer3136)) + ((((((threadIdx.z15) + threadIdx.x)3) + 1)/180)9408))16) + (((((threadIdx.z3) + threadIdx.x)3) + 1) % 18)) + (((((((threadIdx.z15) + threadIdx.x)3) + 1)/18) % 10)224)) + -225)], 0.000000f)
pad_temp.shared[(((((((threadIdx.z15) + threadIdx.x)3) + 2)/180)180) + ((((((((threadIdx.z15) + threadIdx.x)3) + 2)/18) % 10)18) + (((((threadIdx.z3) + threadIdx.x)3) + 2) % 18)))] = tvm_if_then_else((((((1 - ((((((threadIdx.z15) + threadIdx.x)3) + 2)/18) % 10)) <= (blockIdx.y8)) && ((blockIdx.y8) < (225 - ((((((threadIdx.z15) + threadIdx.x)3) + 2)/18) % 10)))) && ((1 - (((((threadIdx.z3) + threadIdx.x)3) + 2) % 18)) <= (blockIdx.x16))) && ((blockIdx.x16) < (225 - (((((threadIdx.z3) + threadIdx.x)3) + 2) % 18)))), placeholder[((((((((blockIdx.y112) + blockIdx.x) + (rc.outer3136)) + ((((((threadIdx.z15) + threadIdx.x)3) + 2)/180)9408))16) + (((((threadIdx.z3) + threadIdx.x)3) + 2) % 18)) + (((((((threadIdx.z15) + threadIdx.x)3) + 2)/18) % 10)224)) + -225)], 0.000000f)
}
}
}
produce placeholder.shared {
// attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 4
// attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 1
// attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 16
if (likely(((threadIdx.z4) < (16 - (threadIdx.x/3))))) {
if (likely(((threadIdx.z12) < (48 - threadIdx.x)))) {
if (likely((threadIdx.x < 12))) {
placeholder.shared[(((((threadIdx.z4) + (threadIdx.x/3))3) + (threadIdx.x % 3))3)] = placeholder[(((((rc.outer + (threadIdx.z12)) + ((threadIdx.x/3)3))3) + (threadIdx.x % 3))3)]
placeholder.shared[((((((threadIdx.z4) + (threadIdx.x/3))3) + (threadIdx.x % 3))3) + 1)] = placeholder[((((((rc.outer + (threadIdx.z12)) + ((threadIdx.x/3)3))3) + (threadIdx.x % 3))3) + 1)]
placeholder.shared[((((((threadIdx.z4) + (threadIdx.x/3))3) + (threadIdx.x % 3))3) + 2)] = placeholder[((((((rc.outer + (threadIdx.z12)) + ((threadIdx.x/3)3))3) + (threadIdx.x % 3))3) + 2)]
}
}
}
}
compute[0] = (compute[0] + (pad_temp.shared[threadIdx.x]placeholder.shared[(threadIdx.z36)]))
compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 18)]placeholder.shared[(threadIdx.z36)]))
compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 36)]placeholder.shared[(threadIdx.z36)]))
compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 54)]placeholder.shared[(threadIdx.z36)]))
compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 72)]placeholder.shared[(threadIdx.z36)]))
compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 90)]placeholder.shared[(threadIdx.z36)]))
compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 108)]placeholder.shared[(threadIdx.z36)]))
compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 126)]placeholder.shared[(threadIdx.z36)]))
compute[8] = (compute[8] + (pad_temp.shared[threadIdx.x]placeholder.shared[((threadIdx.z36) + 9)]))
compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 18)]placeholder.shared[((threadIdx.z36) + 9)]))
compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 36)]placeholder.shared[((threadIdx.z36) + 9)]))
compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 54)]placeholder.shared[((threadIdx.z36) + 9)]))
compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 72)]placeholder.shared[((threadIdx.z36) + 9)]))
compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 90)]placeholder.shared[((threadIdx.z36) + 9)]))
compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 108)]placeholder.shared[((threadIdx.z36) + 9)]))
compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 126)]placeholder.shared[((threadIdx.z36) + 9)]))
compute[16] = (compute[16] + (pad_temp.shared[threadIdx.x]placeholder.shared[((threadIdx.z36) + 18)]))
compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 18)]placeholder.shared[((threadIdx.z36) + 18)]))
compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 36)]placeholder.shared[((threadIdx.z36) + 18)]))
compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 54)]placeholder.shared[((threadIdx.z36) + 18)]))
compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 72)]placeholder.shared[((threadIdx.z36) + 18)]))
compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 90)]placeholder.shared[((threadIdx.z36) + 18)]))
compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 108)]placeholder.shared[((threadIdx.z36) + 18)]))
compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 126)]placeholder.shared[((threadIdx.z36) + 18)]))
compute[24] = (compute[24] + (pad_temp.shared[threadIdx.x]placeholder.shared[((threadIdx.z36) + 27)]))
compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 18)]placeholder.shared[((threadIdx.z36) + 27)]))
compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 36)]placeholder.shared[((threadIdx.z36) + 27)]))
compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 54)]placeholder.shared[((threadIdx.z36) + 27)]))
compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 72)]placeholder.shared[((threadIdx.z36) + 27)]))
compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 90)]placeholder.shared[((threadIdx.z36) + 27)]))
compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 108)]placeholder.shared[((threadIdx.z36) + 27)]))
compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 126)]placeholder.shared[((threadIdx.z36) + 27)]))
compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 1)]placeholder.shared[((threadIdx.z36) + 1)]))
compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 19)]placeholder.shared[((threadIdx.z36) + 1)]))
compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 37)]placeholder.shared[((threadIdx.z36) + 1)]))
compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 55)]placeholder.shared[((threadIdx.z36) + 1)]))
compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 73)]placeholder.shared[((threadIdx.z36) + 1)]))
compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 91)]placeholder.shared[((threadIdx.z36) + 1)]))
compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 109)]placeholder.shared[((threadIdx.z36) + 1)]))
compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 127)]placeholder.shared[((threadIdx.z36) + 1)]))
compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 1)]placeholder.shared[((threadIdx.z36) + 10)]))
compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 19)]placeholder.shared[((threadIdx.z36) + 10)]))
compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 37)]placeholder.shared[((threadIdx.z36) + 10)]))
compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 55)]placeholder.shared[((threadIdx.z36) + 10)]))
compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 73)]placeholder.shared[((threadIdx.z36) + 10)]))
compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 91)]placeholder.shared[((threadIdx.z36) + 10)]))
compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 109)]placeholder.shared[((threadIdx.z36) + 10)]))
compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 127)]placeholder.shared[((threadIdx.z36) + 10)]))
compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 1)]placeholder.shared[((threadIdx.z36) + 19)]))
compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 19)]placeholder.shared[((threadIdx.z36) + 19)]))
compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 37)]placeholder.shared[((threadIdx.z36) + 19)]))
compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 55)]placeholder.shared[((threadIdx.z36) + 19)]))
compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 73)]placeholder.shared[((threadIdx.z36) + 19)]))
compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 91)]placeholder.shared[((threadIdx.z36) + 19)]))
compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 109)]placeholder.shared[((threadIdx.z36) + 19)]))
compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 127)]placeholder.shared[((threadIdx.z36) + 19)]))
compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 1)]placeholder.shared[((threadIdx.z36) + 28)]))
compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 19)]placeholder.shared[((threadIdx.z36) + 28)]))
compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 37)]placeholder.shared[((threadIdx.z36) + 28)]))
compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 55)]placeholder.shared[((threadIdx.z36) + 28)]))
compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 73)]placeholder.shared[((threadIdx.z36) + 28)]))
compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 91)]placeholder.shared[((threadIdx.z36) + 28)]))
compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 109)]placeholder.shared[((threadIdx.z36) + 28)]))
compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 127)]placeholder.shared[((threadIdx.z36) + 28)]))
compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 2)]placeholder.shared[((threadIdx.z36) + 2)]))
compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 20)]placeholder.shared[((threadIdx.z36) + 2)]))
compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 38)]placeholder.shared[((threadIdx.z36) + 2)]))
compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 56)]placeholder.shared[((threadIdx.z36) + 2)]))
compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 74)]placeholder.shared[((threadIdx.z36) + 2)]))
compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 92)]placeholder.shared[((threadIdx.z36) + 2)]))
compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 110)]placeholder.shared[((threadIdx.z36) + 2)]))
compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 128)]placeholder.shared[((threadIdx.z36) + 2)]))
compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 2)]placeholder.shared[((threadIdx.z36) + 11)]))
compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 20)]placeholder.shared[((threadIdx.z36) + 11)]))
compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 38)]placeholder.shared[((threadIdx.z36) + 11)]))
compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 56)]placeholder.shared[((threadIdx.z36) + 11)]))
compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 74)]placeholder.shared[((threadIdx.z36) + 11)]))
compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 92)]placeholder.shared[((threadIdx.z36) + 11)]))
compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 110)]placeholder.shared[((threadIdx.z36) + 11)]))
compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 128)]placeholder.shared[((threadIdx.z36) + 11)]))
compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 2)]placeholder.shared[((threadIdx.z36) + 20)]))
compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 20)]placeholder.shared[((threadIdx.z36) + 20)]))
compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 38)]placeholder.shared[((threadIdx.z36) + 20)]))
compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 56)]placeholder.shared[((threadIdx.z36) + 20)]))
compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 74)]placeholder.shared[((threadIdx.z36) + 20)]))
compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 92)]placeholder.shared[((threadIdx.z36) + 20)]))
compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 110)]placeholder.shared[((threadIdx.z36) + 20)]))
compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 128)]placeholder.shared[((threadIdx.z36) + 20)]))
compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 2)]placeholder.shared[((threadIdx.z36) + 29)]))
compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 20)]placeholder.shared[((threadIdx.z36) + 29)]))
compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 38)]placeholder.shared[((threadIdx.z36) + 29)]))
compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 56)]placeholder.shared[((threadIdx.z36) + 29)]))
compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 74)]placeholder.shared[((threadIdx.z36) + 29)]))
compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 92)]placeholder.shared[((threadIdx.z36) + 29)]))
compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 110)]placeholder.shared[((threadIdx.z36) + 29)]))
compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 128)]placeholder.shared[((threadIdx.z36) + 29)]))
compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 18)]placeholder.shared[((threadIdx.z36) + 3)]))
compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 36)]placeholder.shared[((threadIdx.z36) + 3)]))
compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 54)]placeholder.shared[((threadIdx.z36) + 3)]))
compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 72)]placeholder.shared[((threadIdx.z36) + 3)]))
compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 90)]placeholder.shared[((threadIdx.z36) + 3)]))
compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 108)]placeholder.shared[((threadIdx.z36) + 3)]))
compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 126)]placeholder.shared[((threadIdx.z36) + 3)]))
compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 144)]placeholder.shared[((threadIdx.z36) + 3)]))
compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 18)]placeholder.shared[((threadIdx.z36) + 12)]))
compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 36)]placeholder.shared[((threadIdx.z36) + 12)]))
compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 54)]placeholder.shared[((threadIdx.z36) + 12)]))
compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 72)]placeholder.shared[((threadIdx.z36) + 12)]))
compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 90)]placeholder.shared[((threadIdx.z36) + 12)]))
compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 108)]placeholder.shared[((threadIdx.z36) + 12)]))
compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 126)]placeholder.shared[((threadIdx.z36) + 12)]))
compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 144)]placeholder.shared[((threadIdx.z36) + 12)]))
compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 18)]placeholder.shared[((threadIdx.z36) + 21)]))
compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 36)]placeholder.shared[((threadIdx.z36) + 21)]))
compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 54)]placeholder.shared[((threadIdx.z36) + 21)]))
compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 72)]placeholder.shared[((threadIdx.z36) + 21)]))
compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 90)]placeholder.shared[((threadIdx.z36) + 21)]))
compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 108)]placeholder.shared[((threadIdx.z36) + 21)]))
compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 126)]placeholder.shared[((threadIdx.z36) + 21)]))
compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 144)]placeholder.shared[((threadIdx.z36) + 21)]))
compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 18)]placeholder.shared[((threadIdx.z36) + 30)]))
compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 36)]placeholder.shared[((threadIdx.z36) + 30)]))
compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 54)]placeholder.shared[((threadIdx.z36) + 30)]))
compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 72)]placeholder.shared[((threadIdx.z36) + 30)]))
compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 90)]placeholder.shared[((threadIdx.z36) + 30)]))
compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 108)]placeholder.shared[((threadIdx.z36) + 30)]))
compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 126)]placeholder.shared[((threadIdx.z36) + 30)]))
compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 144)]placeholder.shared[((threadIdx.z36) + 30)]))
compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 19)]placeholder.shared[((threadIdx.z36) + 4)]))
compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 37)]placeholder.shared[((threadIdx.z36) + 4)]))
compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 55)]placeholder.shared[((threadIdx.z36) + 4)]))
compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 73)]placeholder.shared[((threadIdx.z36) + 4)]))
compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 91)]placeholder.shared[((threadIdx.z36) + 4)]))
compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 109)]placeholder.shared[((threadIdx.z36) + 4)]))
compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 127)]placeholder.shared[((threadIdx.z36) + 4)]))
compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 145)]placeholder.shared[((threadIdx.z36) + 4)]))
compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 19)]placeholder.shared[((threadIdx.z36) + 13)]))
compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 37)]placeholder.shared[((threadIdx.z36) + 13)]))
compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 55)]placeholder.shared[((threadIdx.z36) + 13)]))
compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 73)]placeholder.shared[((threadIdx.z36) + 13)]))
compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 91)]placeholder.shared[((threadIdx.z36) + 13)]))
compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 109)]placeholder.shared[((threadIdx.z36) + 13)]))
compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 127)]placeholder.shared[((threadIdx.z36) + 13)]))
compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 145)]placeholder.shared[((threadIdx.z36) + 13)]))
compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 19)]placeholder.shared[((threadIdx.z36) + 22)]))
compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 37)]placeholder.shared[((threadIdx.z36) + 22)]))
compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 55)]placeholder.shared[((threadIdx.z36) + 22)]))
compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 73)]placeholder.shared[((threadIdx.z36) + 22)]))
compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 91)]placeholder.shared[((threadIdx.z36) + 22)]))
compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 109)]placeholder.shared[((threadIdx.z36) + 22)]))
compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 127)]placeholder.shared[((threadIdx.z36) + 22)]))
compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 145)]placeholder.shared[((threadIdx.z36) + 22)]))
compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 19)]placeholder.shared[((threadIdx.z36) + 31)]))
compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 37)]placeholder.shared[((threadIdx.z36) + 31)]))
compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 55)]placeholder.shared[((threadIdx.z36) + 31)]))
compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 73)]placeholder.shared[((threadIdx.z36) + 31)]))
compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 91)]placeholder.shared[((threadIdx.z36) + 31)]))
compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 109)]placeholder.shared[((threadIdx.z36) + 31)]))
compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 127)]placeholder.shared[((threadIdx.z36) + 31)]))
compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 145)]placeholder.shared[((threadIdx.z36) + 31)]))
compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 20)]placeholder.shared[((threadIdx.z36) + 5)]))
compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 38)]placeholder.shared[((threadIdx.z36) + 5)]))
compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 56)]placeholder.shared[((threadIdx.z36) + 5)]))
compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 74)]placeholder.shared[((threadIdx.z36) + 5)]))
compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 92)]placeholder.shared[((threadIdx.z36) + 5)]))
compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 110)]placeholder.shared[((threadIdx.z36) + 5)]))
compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 128)]placeholder.shared[((threadIdx.z36) + 5)]))
compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 146)]placeholder.shared[((threadIdx.z36) + 5)]))
compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 20)]placeholder.shared[((threadIdx.z36) + 14)]))
compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 38)]placeholder.shared[((threadIdx.z36) + 14)]))
compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 56)]placeholder.shared[((threadIdx.z36) + 14)]))
compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 74)]placeholder.shared[((threadIdx.z36) + 14)]))
compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 92)]placeholder.shared[((threadIdx.z36) + 14)]))
compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 110)]placeholder.shared[((threadIdx.z36) + 14)]))
compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 128)]placeholder.shared[((threadIdx.z36) + 14)]))
compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 146)]placeholder.shared[((threadIdx.z36) + 14)]))
compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 20)]placeholder.shared[((threadIdx.z36) + 23)]))
compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 38)]placeholder.shared[((threadIdx.z36) + 23)]))
compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 56)]placeholder.shared[((threadIdx.z36) + 23)]))
compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 74)]placeholder.shared[((threadIdx.z36) + 23)]))
compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 92)]placeholder.shared[((threadIdx.z36) + 23)]))
compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 110)]placeholder.shared[((threadIdx.z36) + 23)]))
compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 128)]placeholder.shared[((threadIdx.z36) + 23)]))
compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 146)]placeholder.shared[((threadIdx.z36) + 23)]))
compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 20)]placeholder.shared[((threadIdx.z36) + 32)]))
compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 38)]placeholder.shared[((threadIdx.z36) + 32)]))
compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 56)]placeholder.shared[((threadIdx.z36) + 32)]))
compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 74)]placeholder.shared[((threadIdx.z36) + 32)]))
compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 92)]placeholder.shared[((threadIdx.z36) + 32)]))
compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 110)]placeholder.shared[((threadIdx.z36) + 32)]))
compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 128)]placeholder.shared[((threadIdx.z36) + 32)]))
compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 146)]placeholder.shared[((threadIdx.z36) + 32)]))
compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 36)]placeholder.shared[((threadIdx.z36) + 6)]))
compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 54)]placeholder.shared[((threadIdx.z36) + 6)]))
compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 72)]placeholder.shared[((threadIdx.z36) + 6)]))
compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 90)]placeholder.shared[((threadIdx.z36) + 6)]))
compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 108)]placeholder.shared[((threadIdx.z36) + 6)]))
compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 126)]placeholder.shared[((threadIdx.z36) + 6)]))
compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 144)]placeholder.shared[((threadIdx.z36) + 6)]))
compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 162)]placeholder.shared[((threadIdx.z36) + 6)]))
compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 36)]placeholder.shared[((threadIdx.z36) + 15)]))
compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 54)]placeholder.shared[((threadIdx.z36) + 15)]))
compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 72)]placeholder.shared[((threadIdx.z36) + 15)]))
compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 90)]placeholder.shared[((threadIdx.z36) + 15)]))
compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 108)]placeholder.shared[((threadIdx.z36) + 15)]))
compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 126)]placeholder.shared[((threadIdx.z36) + 15)]))
compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 144)]placeholder.shared[((threadIdx.z36) + 15)]))
compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 162)]placeholder.shared[((threadIdx.z36) + 15)]))
compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 36)]placeholder.shared[((threadIdx.z36) + 24)]))
compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 54)]placeholder.shared[((threadIdx.z36) + 24)]))
compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 72)]placeholder.shared[((threadIdx.z36) + 24)]))
compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 90)]placeholder.shared[((threadIdx.z36) + 24)]))
compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 108)]placeholder.shared[((threadIdx.z36) + 24)]))
compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 126)]placeholder.shared[((threadIdx.z36) + 24)]))
compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 144)]placeholder.shared[((threadIdx.z36) + 24)]))
compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 162)]placeholder.shared[((threadIdx.z36) + 24)]))
compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 36)]placeholder.shared[((threadIdx.z36) + 33)]))
compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 54)]placeholder.shared[((threadIdx.z36) + 33)]))
compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 72)]placeholder.shared[((threadIdx.z36) + 33)]))
compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 90)]placeholder.shared[((threadIdx.z36) + 33)]))
compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 108)]placeholder.shared[((threadIdx.z36) + 33)]))
compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 126)]placeholder.shared[((threadIdx.z36) + 33)]))
compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 144)]placeholder.shared[((threadIdx.z36) + 33)]))
compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 162)]placeholder.shared[((threadIdx.z36) + 33)]))
compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 37)]placeholder.shared[((threadIdx.z36) + 7)]))
compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 55)]placeholder.shared[((threadIdx.z36) + 7)]))
compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 73)]placeholder.shared[((threadIdx.z36) + 7)]))
compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 91)]placeholder.shared[((threadIdx.z36) + 7)]))
compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 109)]placeholder.shared[((threadIdx.z36) + 7)]))
compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 127)]placeholder.shared[((threadIdx.z36) + 7)]))
compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 145)]placeholder.shared[((threadIdx.z36) + 7)]))
compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 163)]placeholder.shared[((threadIdx.z36) + 7)]))
compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 37)]placeholder.shared[((threadIdx.z36) + 16)]))
compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 55)]placeholder.shared[((threadIdx.z36) + 16)]))
compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 73)]placeholder.shared[((threadIdx.z36) + 16)]))
compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 91)]placeholder.shared[((threadIdx.z36) + 16)]))
compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 109)]placeholder.shared[((threadIdx.z36) + 16)]))
compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 127)]placeholder.shared[((threadIdx.z36) + 16)]))
compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 145)]placeholder.shared[((threadIdx.z36) + 16)]))
compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 163)]placeholder.shared[((threadIdx.z36) + 16)]))
compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 37)]placeholder.shared[((threadIdx.z36) + 25)]))
compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 55)]placeholder.shared[((threadIdx.z36) + 25)]))
compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 73)]placeholder.shared[((threadIdx.z36) + 25)]))
compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 91)]placeholder.shared[((threadIdx.z36) + 25)]))
compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 109)]placeholder.shared[((threadIdx.z36) + 25)]))
compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 127)]placeholder.shared[((threadIdx.z36) + 25)]))
compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 145)]placeholder.shared[((threadIdx.z36) + 25)]))
compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 163)]placeholder.shared[((threadIdx.z36) + 25)]))
compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 37)]placeholder.shared[((threadIdx.z36) + 34)]))
compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 55)]placeholder.shared[((threadIdx.z36) + 34)]))
compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 73)]placeholder.shared[((threadIdx.z36) + 34)]))
compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 91)]placeholder.shared[((threadIdx.z36) + 34)]))
compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 109)]placeholder.shared[((threadIdx.z36) + 34)]))
compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 127)]placeholder.shared[((threadIdx.z36) + 34)]))
compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 145)]placeholder.shared[((threadIdx.z36) + 34)]))
compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 163)]placeholder.shared[((threadIdx.z36) + 34)]))
compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 38)]placeholder.shared[((threadIdx.z36) + 8)]))
compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 56)]placeholder.shared[((threadIdx.z36) + 8)]))
compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 74)]placeholder.shared[((threadIdx.z36) + 8)]))
compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 92)]placeholder.shared[((threadIdx.z36) + 8)]))
compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 110)]placeholder.shared[((threadIdx.z36) + 8)]))
compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 128)]placeholder.shared[((threadIdx.z36) + 8)]))
compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 146)]placeholder.shared[((threadIdx.z36) + 8)]))
compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 164)]placeholder.shared[((threadIdx.z36) + 8)]))
compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 38)]placeholder.shared[((threadIdx.z36) + 17)]))
compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 56)]placeholder.shared[((threadIdx.z36) + 17)]))
compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 74)]placeholder.shared[((threadIdx.z36) + 17)]))
compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 92)]placeholder.shared[((threadIdx.z36) + 17)]))
compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 110)]placeholder.shared[((threadIdx.z36) + 17)]))
compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 128)]placeholder.shared[((threadIdx.z36) + 17)]))
compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 146)]placeholder.shared[((threadIdx.z36) + 17)]))
compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 164)]placeholder.shared[((threadIdx.z36) + 17)]))
compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 38)]placeholder.shared[((threadIdx.z36) + 26)]))
compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 56)]placeholder.shared[((threadIdx.z36) + 26)]))
compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 74)]placeholder.shared[((threadIdx.z36) + 26)]))
compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 92)]placeholder.shared[((threadIdx.z36) + 26)]))
compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 110)]placeholder.shared[((threadIdx.z36) + 26)]))
compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 128)]placeholder.shared[((threadIdx.z36) + 26)]))
compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 146)]placeholder.shared[((threadIdx.z36) + 26)]))
compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 164)]placeholder.shared[((threadIdx.z36) + 26)]))
compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 38)]placeholder.shared[((threadIdx.z36) + 35)]))
compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 56)]placeholder.shared[((threadIdx.z36) + 35)]))
compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 74)]placeholder.shared[((threadIdx.z36) + 35)]))
compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 92)]placeholder.shared[((threadIdx.z36) + 35)]))
compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 110)]placeholder.shared[((threadIdx.z36) + 35)]))
compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 128)]placeholder.shared[((threadIdx.z36) + 35)]))
compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 146)]placeholder.shared[((threadIdx.z36) + 35)]))
compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 164)]placeholder.shared[((threadIdx.z36) + 35)]))
}
}
tensor[(((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x)] = max(((compute[0]placeholder[(threadIdx.z4)]) + placeholder[(threadIdx.z4)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 224)] = max(((compute[1]placeholder[(threadIdx.z4)]) + placeholder[(threadIdx.z4)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 448)] = max(((compute[2]placeholder[(threadIdx.z4)]) + placeholder[(threadIdx.z4)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 672)] = max(((compute[3]placeholder[(threadIdx.z4)]) + placeholder[(threadIdx.z4)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 896)] = max(((compute[4]placeholder[(threadIdx.z4)]) + placeholder[(threadIdx.z4)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 1120)] = max(((compute[5]placeholder[(threadIdx.z4)]) + placeholder[(threadIdx.z4)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 1344)] = max(((compute[6]placeholder[(threadIdx.z4)]) + placeholder[(threadIdx.z4)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 1568)] = max(((compute[7]placeholder[(threadIdx.z4)]) + placeholder[(threadIdx.z4)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 50176)] = max(((compute[8]placeholder[((threadIdx.z4) + 1)]) + placeholder[((threadIdx.z4) + 1)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 50400)] = max(((compute[9]placeholder[((threadIdx.z4) + 1)]) + placeholder[((threadIdx.z4) + 1)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 50624)] = max(((compute[10]placeholder[((threadIdx.z4) + 1)]) + placeholder[((threadIdx.z4) + 1)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 50848)] = max(((compute[11]placeholder[((threadIdx.z4) + 1)]) + placeholder[((threadIdx.z4) + 1)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 51072)] = max(((compute[12]placeholder[((threadIdx.z4) + 1)]) + placeholder[((threadIdx.z4) + 1)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 51296)] = max(((compute[13]placeholder[((threadIdx.z4) + 1)]) + placeholder[((threadIdx.z4) + 1)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 51520)] = max(((compute[14]placeholder[((threadIdx.z4) + 1)]) + placeholder[((threadIdx.z4) + 1)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 51744)] = max(((compute[15]placeholder[((threadIdx.z4) + 1)]) + placeholder[((threadIdx.z4) + 1)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 100352)] = max(((compute[16]placeholder[((threadIdx.z4) + 2)]) + placeholder[((threadIdx.z4) + 2)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 100576)] = max(((compute[17]placeholder[((threadIdx.z4) + 2)]) + placeholder[((threadIdx.z4) + 2)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 100800)] = max(((compute[18]placeholder[((threadIdx.z4) + 2)]) + placeholder[((threadIdx.z4) + 2)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 101024)] = max(((compute[19]placeholder[((threadIdx.z4) + 2)]) + placeholder[((threadIdx.z4) + 2)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 101248)] = max(((compute[20]placeholder[((threadIdx.z4) + 2)]) + placeholder[((threadIdx.z4) + 2)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 101472)] = max(((compute[21]placeholder[((threadIdx.z4) + 2)]) + placeholder[((threadIdx.z4) + 2)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 101696)] = max(((compute[22]placeholder[((threadIdx.z4) + 2)]) + placeholder[((threadIdx.z4) + 2)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 101920)] = max(((compute[23]placeholder[((threadIdx.z4) + 2)]) + placeholder[((threadIdx.z4) + 2)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 150528)] = max(((compute[24]placeholder[((threadIdx.z4) + 3)]) + placeholder[((threadIdx.z4) + 3)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 150752)] = max(((compute[25]placeholder[((threadIdx.z4) + 3)]) + placeholder[((threadIdx.z4) + 3)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 150976)] = max(((compute[26]placeholder[((threadIdx.z4) + 3)]) + placeholder[((threadIdx.z4) + 3)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 151200)] = max(((compute[27]placeholder[((threadIdx.z4) + 3)]) + placeholder[((threadIdx.z4) + 3)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 151424)] = max(((compute[28]placeholder[((threadIdx.z4) + 3)]) + placeholder[((threadIdx.z4) + 3)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 151648)] = max(((compute[29]placeholder[((threadIdx.z4) + 3)]) + placeholder[((threadIdx.z4) + 3)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z12544))16) + threadIdx.x) + 151872)] = max(((compute[30]placeholder[((threadIdx.z4) + 3)]) + placeholder[((threadIdx.z4) + 3)]), 0.000000f)
tensor[((((((blockIdx.y112) + blockIdx.x) + (threadIdx.z*12544))16) + threadIdx.x) + 152096)] = max(((compute[31]placeholder[((threadIdx.z4) + 3)]) + placeholder[((threadIdx.z4) + 3)]), 0.000000f)
}
将cuDNN用于卷积层
可以使用cuDNN将卷积内核替换为cuDNN。将选项“ -libs = cudnn”附加到目标字符串。
net, params = testing.create_workload(simple_net)
target = “cuda -libs=cudnn” # use cudnn for convolution
lib = relay.build_module.build(net, target, params=params)

Relay外部库使用相关推荐

Ch2r_ood_understanding 本文档为论文限定领域口语对话系统中超出领域话语的对话行为识别的部分实验代码。代码基于Python，需要用到的外部库有： Keras（搭建神经网络） S
Ch2r_ood_understanding 本文档为论文限定领域口语对话系统中超出领域话语的对话行为识别的部分实验代码.代码基于Python,需要用到的外部库有: Keras(搭建神经网络) Sci ...
Linux下GCC的安装，GCC链接外部库
GCC的安装环境RedHatLinuxAS5 安装盘:RedHatLinuxAS5正式版DVD.iso 查看gcc版本: # gcc -v 检查是否安装rpm包 # rpm -qa | grep r ...
python外部库是什么_如何使用Portable Python安装外部库？
由于管理员权限,我无法在我的机器上安装Python,但我成功下载/打开了Portable Python.我在Windows 7 64位机器上.我怎样才能使用以前的外部库,比如Numpy或Gmpy? 解 ...
Visual Studio引入外部库 ---- 弄懂静态库lib和动态库dll
这两天由于想要研究一下socket的相关内容,但是没想到引入外部库还有这么多门道. 根据维基百科定义:一个现代编译器的主要工作流程如下:源代码(source code)→ 预处理器(preproces ...
react引入外部js_React.js：无需使用外部库即可实现拖放功能
react引入外部js by Rajesh Pillai 由Rajesh Pillai React.js:无需使用外部库即可实现拖放功能 (React.js: implement the drag a ...
java使用外部库_在Java中使用外部库
java使用外部库 Java附带了一组核心库,其中包括定义常用数据类型和相关行为的库,例如String或Date : 与主机操作系统进行交互的实用程序,例如System或File : 有用的子系统来管 ...
android 粘性view_android - 如何在RecyclerView中制作粘性标头？（有或没有外部库） - SO中文参考 - www.soinside.com...
我正在使用room db,我有一张桌子,可以从其中获取LiveData列表.在该表中,有一个"日期"列,用于存储当前日期.当前日期默认为选中状态,但是在数据库中插入数据时,用户也可 ...
cmake使用教程（五）调用opencv外部库和自己生成的库
这一节是我真正使用时遇到的,就是我需要构建动态库,但是这个动态库依赖外部的一些库如opencv.matlab等,那么若何构建这样的动态库呢?构建好如何测试呢?本篇将详细介绍其过程: 1.ubuntu安 ...
cmake添加pthread外部库方法
CMakeLists.txt文件 cmake_minimum_required(VERSION 3.10) project(nnb VERSION 1.0)#设置c++标准 set(CMAKE_CXX ...

Relay外部库使用

Relay外部库使用相关推荐

最新文章

热门文章