import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
import math

class Conv1d(nn.Module):
    """Conv1d -> BatchNorm1d -> ReLU block with an optional residual connection."""
    def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.conv_block = nn.Sequential(
            nn.Conv1d(cin, cout, kernel_size, stride, padding),
            nn.BatchNorm1d(cout)
        )
        self.act = nn.ReLU()
        self.residual = residual

    def forward(self, x):
        out = self.conv_block(x)
        if self.residual:
            out += x
        return self.act(out)
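
# Shape sketch for Conv1d (illustrative, assumed shapes): a stride-2 block
# halves the temporal axis, e.g.
#   Conv1d(4, 8, kernel_size=3, stride=2, padding=1)(torch.rand(2, 4, 10))
# yields a tensor of shape [2, 8, 5]. Note that residual=True requires
# cin == cout and stride == 1; otherwise `out += x` raises a shape mismatch.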

class LossScale(nn.Module):
    """Learnable scale/bias for a sync loss (defined here but not used elsewhere in this file)."""
    def __init__(self, init_w=10.0, init_b=-5.0):
        super(LossScale, self).__init__()
        self.wC = nn.Parameter(torch.tensor(init_w))
        self.bC = nn.Parameter(torch.tensor(init_b))

class CLIPLoss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, audio_features, motion_features, logit_scale, clip_mask=None):
        logits_per_audio = logit_scale * audio_features @ motion_features.T   # [B, B]
        logits_per_motion = logit_scale * motion_features @ audio_features.T  # [B, B]
        if clip_mask is not None:
            logits_per_audio += clip_mask
            logits_per_motion += clip_mask
        # The i-th audio should match the i-th motion, so the targets are the diagonal indices.
        labels = torch.arange(logits_per_motion.shape[0]).to(logits_per_motion.device)
        motion_loss = F.cross_entropy(logits_per_motion, labels)
        audio_loss = F.cross_entropy(logits_per_audio, labels)
        clip_loss = (motion_loss + audio_loss) / 2
        ret = {
            "audio_loss": audio_loss,
            "motion_loss": motion_loss,
            "clip_loss": clip_loss
        }
        return ret

    def compute_sync_conf(self, audio_features, motion_features, return_matrix=False):
        logits_per_audio = audio_features @ motion_features.T  # [B, B]
        if return_matrix:
            return logits_per_audio
        # Diagonal entries: similarity of each matched audio/motion pair.
        return logits_per_audio[range(len(audio_features)), range(len(audio_features))]
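
# Usage note: CLIPLoss implements a symmetric InfoNCE objective over [B, C]
# L2-normalized features: entry (i, i) of the [B, B] similarity matrix is the
# positive pair and the rest of row/column i serve as in-batch negatives. The
# optional additive clip_mask of shape [B, B] (e.g. a large negative value at
# entries that must not act as negatives) can exclude specific pairs.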

class LandmarkHubertSyncNet(nn.Module):
    """Audio-landmark sync network: two conv encoders producing L2-normalized embeddings."""
    def __init__(self, lm_dim=60, audio_dim=1024, num_layers_per_block=3, base_hid_size=128, out_dim=512):
        super(LandmarkHubertSyncNet, self).__init__()
        self.clip_loss_fn = CLIPLoss()
        # The trailing `* 0` zeroes these out and makes them plain tensors rather
        # than registered Parameters, so the learned temperature is effectively
        # disabled; the loss methods below use a fixed logit_scale of 1.
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) * 0
        self.logit_scale_2 = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) * 0
        self.logit_scale_max = math.log(1. / 0.01)
        # Audio branch; input is (B, 1024, T=10) HuBERT features.
        hubert_layers = [
            Conv1d(audio_dim, base_hid_size, kernel_size=3, stride=1, padding=1)
        ]
        hubert_layers.append(
            Conv1d(base_hid_size, base_hid_size, kernel_size=3, stride=1, padding=1),
        )
        hubert_layers += [
            Conv1d(base_hid_size, base_hid_size, kernel_size=3, stride=1, padding=1, residual=True) for _ in range(num_layers_per_block-1)
        ]
        hubert_layers.append(
            Conv1d(base_hid_size, 2*base_hid_size, kernel_size=3, stride=2, padding=1),
        )
        hubert_layers += [
            Conv1d(2*base_hid_size, 2*base_hid_size, kernel_size=3, stride=1, padding=1, residual=True) for _ in range(num_layers_per_block-1)
        ]
        hubert_layers.append(
            Conv1d(2*base_hid_size, 4*base_hid_size, kernel_size=3, stride=2, padding=1),
        )
        hubert_layers += [
            Conv1d(4*base_hid_size, 4*base_hid_size, kernel_size=3, stride=1, padding=1, residual=True) for _ in range(num_layers_per_block-1)
        ]
        hubert_layers += [
            Conv1d(4*base_hid_size, 4*base_hid_size, kernel_size=3, stride=1, padding=1),
            Conv1d(4*base_hid_size, 4*base_hid_size, kernel_size=3, stride=1, padding=0),
            Conv1d(4*base_hid_size, 4*base_hid_size, kernel_size=1, stride=1, padding=0),
            Conv1d(4*base_hid_size, out_dim, kernel_size=1, stride=1, padding=0),
        ]
        self.hubert_encoder = nn.Sequential(*hubert_layers)
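
        # Temporal trace for the audio branch (assuming the default T=10 window
        # and num_layers_per_block=3): the two stride-2 blocks give
        # 10 -> 5 -> 3, and the padding-0 k=3 conv gives 3 -> 1, so the
        # encoder emits (B, out_dim, 1).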
        # Landmark branch; input is (B, lm_dim, T=5) mouth landmarks
        # (lm_dim=60 corresponds to 20 xyz keypoints).
        mouth_layers = [
            Conv1d(lm_dim, 96, kernel_size=3, stride=1, padding=1)
        ]
        mouth_layers.append(
            Conv1d(96, base_hid_size, kernel_size=3, stride=1, padding=1),
        )
        mouth_layers += [
            Conv1d(base_hid_size, base_hid_size, kernel_size=3, stride=1, padding=1, residual=True) for _ in range(num_layers_per_block-1)
        ]
        mouth_layers.append(
            Conv1d(base_hid_size, 2*base_hid_size, kernel_size=3, stride=2, padding=1),
        )
        mouth_layers += [
            Conv1d(2*base_hid_size, 2*base_hid_size, kernel_size=3, stride=1, padding=1, residual=True) for _ in range(num_layers_per_block-1)
        ]
        mouth_layers.append(
            Conv1d(2*base_hid_size, 4*base_hid_size, kernel_size=3, stride=1, padding=1),
        )
        mouth_layers += [
            Conv1d(4*base_hid_size, 4*base_hid_size, kernel_size=3, stride=1, padding=1, residual=True) for _ in range(num_layers_per_block-1)
        ]
        mouth_layers += [
            Conv1d(4*base_hid_size, 4*base_hid_size, kernel_size=3, stride=1, padding=1),
            Conv1d(4*base_hid_size, 4*base_hid_size, kernel_size=3, stride=1, padding=0),
            Conv1d(4*base_hid_size, 4*base_hid_size, kernel_size=1, stride=1, padding=0),
            Conv1d(4*base_hid_size, out_dim, kernel_size=1, stride=1, padding=0),
        ]
        self.mouth_encoder = nn.Sequential(*mouth_layers)
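
        # Temporal trace for the landmark branch (assuming the default T=5
        # window and num_layers_per_block=3): the single stride-2 block gives
        # 5 -> 3, and the padding-0 k=3 conv gives 3 -> 1, so this branch also
        # emits (B, out_dim, 1).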

        self.lm_dim = lm_dim
        self.audio_dim = audio_dim
        self.logloss = nn.BCELoss()

    def forward(self, hubert, mouth_lm):
        # hubert:   (B, T=10, C=1024)
        # mouth_lm: (B, T=5,  C=lm_dim)
        hubert = hubert.transpose(1, 2)
        mouth_lm = mouth_lm.transpose(1, 2)
        mouth_embedding = self.mouth_encoder(mouth_lm)
        audio_embedding = self.hubert_encoder(hubert)
        audio_embedding = audio_embedding.view(audio_embedding.size(0), -1)
        mouth_embedding = mouth_embedding.view(mouth_embedding.size(0), -1)
        audio_embedding = F.normalize(audio_embedding, p=2, dim=1)
        mouth_embedding = F.normalize(mouth_embedding, p=2, dim=1)
        return audio_embedding, mouth_embedding

    def cal_sync_loss(self, audio_embedding, mouth_embedding, label, reduction='none'):
        if isinstance(label, torch.Tensor):  # fine-grained per-sample label
            gt_d = label.float().view(-1).to(audio_embedding.device)
        else:  # scalar global label: 1 denotes positive, 0 denotes negative (used when computing sync loss for other models)
            gt_d = (torch.ones([audio_embedding.shape[0]]) * label).float().to(audio_embedding.device)
        d = F.cosine_similarity(audio_embedding, mouth_embedding)  # [B]
        loss = F.binary_cross_entropy(d.reshape([audio_embedding.shape[0], ]), gt_d, reduction=reduction)
        return loss, d
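
    # Note on cal_sync_loss: both encoders end in a ReLU before L2
    # normalization, so the embeddings are non-negative and the cosine
    # similarity stays in [0, 1], the valid input range for
    # F.binary_cross_entropy.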

    def cal_clip_loss(self, audio_embedding, mouth_embedding, clip_mask=None):
        # logit_scale = torch.clamp(self.logit_scale, max=self.logit_scale_max).exp()
        logit_scale = 1
        clip_ret = self.clip_loss_fn(audio_embedding, mouth_embedding, logit_scale, clip_mask=clip_mask)
        loss = clip_ret['clip_loss']
        return loss

    def cal_clip_loss_local(self, audio_embedding, mouth_embedding, clip_mask=None):
        # logit_scale = torch.clamp(self.logit_scale_2, max=self.logit_scale_max).exp()
        logit_scale = 1
        clip_ret = self.clip_loss_fn(audio_embedding, mouth_embedding, logit_scale, clip_mask=clip_mask)
        loss = clip_ret['clip_loss']
        return loss

    def compute_sync_conf(self, audio_embedding, mouth_embedding, return_matrix=False):
        # logit_scale = torch.clamp(self.logit_scale, max=self.logit_scale_max).exp()
        return self.clip_loss_fn.compute_sync_conf(audio_embedding, mouth_embedding, return_matrix)
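
# Usage sketch (variable names hypothetical): given `a, m = syncnet(hubert, lm)`,
# syncnet.compute_sync_conf(a, m) returns the per-pair cosine similarity [B];
# with return_matrix=True it returns the full [B, B] similarity matrix, which
# can for instance be scanned over candidate audio-visual offsets to pick the
# best alignment.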

if __name__ == '__main__':
    syncnet = LandmarkHubertSyncNet(lm_dim=204)  # 204 = 68 landmarks x 3 coords
    hubert = torch.rand(2, 10, 1024)
    lm = torch.rand(2, 5, 204)
    audio_embedding, mouth_embedding = syncnet(hubert, lm)
    label = torch.tensor([1., 0.])
    loss, sim = syncnet.cal_sync_loss(audio_embedding, mouth_embedding, label)
    print(loss, sim)