PhysFormer: Facial Video-based Physiological Measurement with Temporal Difference Transformer


Recent deep learning approaches focus on mining subtle rPPG clues using convolutional neural networks with limited spatio-temporal receptive fields, which neglect the long-range spatio-temporal perception and interaction for rPPG modeling. (长程时间关系)

反例:Unifying frame rate and temporal dilations for improved remote pulse detection(SCI三区水论文)

the temporal difference transformers

提出了:global spatio-temporal attention based on the fine-grained temporal skin color differences


  • subtle skin color changes

  • long-time monitoring task

  • a video sequence to signal sequence problem

we also propose the label distribution learning and a curriculum learning inspired dynamic constraint in frequency domain, which provide elaborate supervisions for PhysFormer and alleviate overfitting.



埋个伏笔,下次再讲:差分卷积在计算机视觉中的应用 - 知乎

class CDC_T(nn.Module):
    """3D convolution with a temporal central-difference term.

    The output is ``conv(x) - theta * diff(x)``, where ``diff`` is a 1x1x1
    convolution whose kernel is the spatial sum of the first (index 0) and
    third (index 2) temporal slices of the main convolution's weight.

    Args:
        in_channels, out_channels, kernel_size, stride, padding, dilation,
        groups, bias: forwarded unchanged to ``nn.Conv3d``.
        theta: weight of the difference term; a value of 0 makes the layer
            behave as a plain ``nn.Conv3d``.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1,
                 padding=1, dilation=1, groups=1, bias=False, theta=0.6):
        super().__init__()
        self.conv = nn.Conv3d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )
        self.theta = theta

    def forward(self, x):
        """Apply the convolution and subtract the scaled temporal-difference term.

        Args:
            x: input accepted by the wrapped ``nn.Conv3d``.

        Returns:
            ``conv(x)`` when ``theta`` is (numerically) zero or the temporal
            kernel size is 1; otherwise ``conv(x) - theta * diff(x)``.
        """
        out_normal = self.conv(x)

        # theta == 0 disables the difference term entirely.
        if math.fabs(self.theta) < 1e-8:
            return out_normal

        weight = self.conv.weight
        # The difference term is only defined for a temporal kernel size > 1.
        if weight.shape[2] <= 1:
            return out_normal

        # NOTE(review): indices 0 and 2 assume a temporal kernel size of 3
        # (size 2 raises IndexError, larger sizes pick the third slice rather
        # than the last) -- confirm before using other temporal kernel sizes.
        kernel_diff = (
            weight[:, :, 0, :, :].sum(2).sum(2)
            + weight[:, :, 2, :, :].sum(2).sum(2)
        )
        # Reshape to a 1x1x1 kernel: (C_out, C_in / groups, 1, 1, 1).
        kernel_diff = kernel_diff[:, :, None, None, None]

        # padding=0 because the kernel is 1x1x1; the result only lines up with
        # out_normal when the main conv's padding preserves the input size.
        out_diff = F.conv3d(
            input=x,
            weight=kernel_diff,
            bias=self.conv.bias,
            stride=self.conv.stride,
            padding=0,
            dilation=self.conv.dilation,
            groups=self.conv.groups,
        )
        return out_normal - self.theta * out_diff


    def forward(self, x, gra_sharp):
        """Multi-head self-attention with projections applied on a folded grid.

        The token axis of ``x`` is folded into a ``(P // 16, 4, 4)`` grid
        before ``proj_q`` / ``proj_k`` / ``proj_v`` are applied, then flattened
        back into a token axis and split into ``self.n_heads`` heads.

        Args:
            x: tensor of shape (B, P, C); the ``view`` below requires P to be
                a multiple of 16.
            gra_sharp: divisor applied to the raw attention logits before the
                softmax.

        Returns:
            A ``(h, scores)`` pair: the attended features after
            ``merge_last(..., 2)``, and the attention weights after softmax
            and ``self.drop``. The weights are also stored on ``self.scores``.
        """
        batch, n_tokens, channels = x.shape

        # (B, P, C) -> (B, C, P // 16, 4, 4)
        grid = x.transpose(1, 2).view(batch, channels, n_tokens // 16, 4, 4)

        def to_heads(projected):
            # Flatten the grid back to tokens, then split the last axis into
            # (n_heads, width) and move the head axis in front of the tokens.
            tokens = projected.flatten(2).transpose(1, 2)
            return split_last(tokens, (self.n_heads, -1)).transpose(1, 2)

        # All three projections run first, in q, k, v order.
        proj_query = self.proj_q(grid)
        proj_key = self.proj_k(grid)
        proj_value = self.proj_v(grid)

        query = to_heads(proj_query)
        key = to_heads(proj_key)
        value = to_heads(proj_value)

        # Token-to-token logits, scaled down by gra_sharp.
        logits = query @ key.transpose(-2, -1) / gra_sharp
        attn = self.drop(F.softmax(logits, dim=-1))

        # Weighted sum of values, with the head axis moved back behind the tokens.
        attended = (attn @ value).transpose(1, 2).contiguous()
        # presumably folds (heads, width) back into one feature axis -- see merge_last
        attended = merge_last(attended, 2)

        self.scores = attn
        return attended, attn


    def forward(self, x, gra_sharp):
        """Run stem, patch embedding, three transformer stages and the rPPG head.

        Args:
            x: 5-D input tensor; only its batch size (axis 0) and length along
                axis 2 are read here. Presumably (B, C, T, H, W) video clips
                -- confirm against the caller.
            gra_sharp: passed unchanged to each transformer stage.

        Returns:
            ``(rPPG, Score1, Score2, Score3)``: the output of
            ``ConvBlockLast`` with axis 1 squeezed, followed by the second
            return value of each of the three transformer stages.
        """
        # Unpacking five names keeps the original requirement of a 5-D input.
        batch, _, frames, _, _ = x.shape

        x = self.Stem0(x)
        x = self.Stem1(x)
        x = self.Stem2(x)
        x = self.patch_embedding(x)
        # Collapse every axis after the channel axis into one token axis:
        # (B, C', ...) -> (B, tokens, C').
        x = x.flatten(2).transpose(1, 2)

        trans_features, score1 = self.transformer1(x, gra_sharp)
        trans_features2, score2 = self.transformer2(trans_features, gra_sharp)
        trans_features3, score3 = self.transformer3(trans_features2, gra_sharp)

        # Fold the tokens back into a (frames // 4, 4, 4) grid with self.dim
        # channels; this requires tokens == (frames // 4) * 16.
        features_last = trans_features3.transpose(1, 2).view(
            batch, self.dim, frames // 4, 4, 4
        )
        features_last = self.upsample(features_last)
        features_last = self.upsample2(features_last)

        # Average away the two trailing axes one at a time, leaving
        # (B, channels, length).
        features_last = torch.mean(features_last, 3)
        features_last = torch.mean(features_last, 3)

        rPPG = self.ConvBlockLast(features_last)
        rPPG = rPPG.squeeze(1)  # drop axis 1 when it has size 1

        return rPPG, score1, score2, score3

Label Distribution Learning


Curriculum Learning Guided Dynamic Loss

