CMU-Multimodal SDK Version 1.2.0(mmsdk)Windows 配置与使用 + PyTorch 代码 demo





1. 需要下载的内容

步骤1:下载官方github的SDK包:CMU-MultiComp-Lab/CMU-MultimodalSDK (github.com)





步骤2:在 Anaconda 虚拟环境路径下的 Lib\site-packages 目录中新建一个文本文档,命名为“mypkpath”,在该文档中写入上一步下载的 SDK 所在路径,保存后将文件后缀改为“.pth”(即文件名为 mypkpath.pth)。


pip install h5py validators tqdm numpy argparse requests









from mmsdk import mmdatasdk as md

3. 下载数据集


  from mmsdk import mmdatasdk as md

  DATASET = md.cmu_mosi
  DATA_PATH = './data'

  # Download the CMU-MOSI high-level feature files into DATA_PATH.
  # Per the official SDK tutorial, mmdataset raises RuntimeError when a target
  # file already exists, so only that case is reported as "already downloaded".
  # Any other failure (network error, bad path, ...) propagates instead of
  # being silently swallowed by a bare `except:`.
  try:
      md.mmdataset(DATASET.highlevel, DATA_PATH)
  except RuntimeError:
      print('have been downloaded')


  import os  # required by os.listdir / os.path.join below

  from mmsdk import mmdatasdk as md

  DATASET = md.cmu_mosi
  DATA_PATH = './data'

  # List the files that are present in the data directory.
  data_files = os.listdir(DATA_PATH)
  print('\n'.join(data_files))

  # Names of the computational sequences to load; each one is stored in
  # DATA_PATH as "<name>.csd".
  visual_field = 'CMU_MOSI_Visual_Facet_41'
  acoustic_field = 'CMU_MOSI_COVAREP'
  text_field = 'CMU_MOSI_ModifiedTimestampedWords'
  features = [
      text_field,
      visual_field,
      acoustic_field,
  ]

  # Map every sequence name to its .csd path and load them as one dataset.
  recipe = {feat: os.path.join(DATA_PATH, feat) + '.csd' for feat in features}
  dataset = md.mmdataset(recipe)
  print(list(dataset[text_field].keys())[55])
  print('done!')


  import numpy as np


  def avg(intervals: np.ndarray, features: np.ndarray) -> np.ndarray:
      """Collapse the feature rows of one aligned interval by averaging.

      Meant to be passed in ``collapse_functions`` to ``dataset.align``.

      Args:
          intervals: Time intervals of the rows being collapsed; not used.
          features: Feature rows to collapse; averaged over axis 0.

      Returns:
          The mean of ``features`` over axis 0, or ``features`` unchanged when
          the values cannot be averaged numerically (e.g. byte-string words).
      """
      try:
          return np.average(features, axis=0)
      except TypeError:
          # Non-numeric features (the word tokens) cannot be averaged; pass
          # them through untouched. Other errors are real bugs and propagate.
          return features
  # Step 1: align every sequence to the word-level text field. The features that
  # fall into one word interval are collapsed with the functions given in
  # collapse_functions (a list of callables), here the averaging function.
  dataset.align(text_field, collapse_functions=[avg])

  # Step 2: add the opinion labels and align to them to obtain labeled segments.
  # No collapse function is passed this time, so the temporal sequences are preserved.
  label_field = 'CMU_MOSI_Opinion_Labels'
  label_recipe = {
      label_field: os.path.join(DATA_PATH, label_field + '.csd'),
  }
  dataset.add_computational_sequences(label_recipe, destination=None)
  dataset.align(label_field, replace=True)
  print(list(dataset[text_field].keys())[55])
  print('done!')

  # Step 3: save the aligned dataset (every sequence key is mapped to itself in
  # deploy_files) and load it back from the output directory.
  deploy_files = {name: name for name in dataset.computational_sequences.keys()}
  dataset.deploy("./deployed", deploy_files)
  aligned_cmumosi_highlevel = md.mmdataset('./deployed')

4. 一次小小的完整训练

  1. from mmsdk import mmdatasdk
  2. import os, re
  3. import numpy as np
  4. from torch.utils.data import DataLoader
  5. from collections import defaultdict
  6. import torch
  7. import torch.nn as nn
  8. from tqdm import tqdm_notebook
  9. from torch.optim import Adam, SGD
  10. from sklearn.metrics import accuracy_score
  11. import torch, os
  12. from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
  13. from torch.utils.data import DataLoader, Dataset
  14. from tqdm import tqdm_notebook
  # Auto-growing vocabulary: looking up an unseen word stores and returns the
  # current dictionary size, i.e. the next free integer id.
  word2id = defaultdict(lambda: len(word2id))
  # Register the special tokens first so that they receive the first two ids
  # ('<unk>' is looked up before '<pad>').
  UNK, PAD = word2id['<unk>'], word2id['<pad>']
  def splittraindevtest(train_split, dev_split, test_split, dataset, features):
      """Build train/dev/test sample lists from a label-aligned dataset.

      Every segment key of the label sequence is looked up in the text, visual
      and acoustic sequences. Speech-pause tokens (``b'sp'``) are removed
      together with their visual/acoustic rows, words are mapped to integer ids
      through the module-level ``word2id``, and the visual/acoustic features
      are z-normalised per segment.

      Args:
          train_split: Video ids that belong to the training fold.
          dev_split: Video ids that belong to the validation fold.
          test_split: Video ids that belong to the test fold.
          dataset: Mapping such that ``dataset[field][segment]['features']``
              is the feature array of ``segment`` in sequence ``field``.
          features: ``(text_field, visual_field, acoustic_field, label_field)``.

      Returns:
          ``(train, dev, test)``: three lists of
          ``((words, visual, acoustic), label, segment)`` tuples.

      Side effects:
          New words are added to the global ``word2id``; before returning, its
          ``default_factory`` is replaced so that unseen words map to ``UNK``.
      """
      EPS = 0
      text_field, visual_field, acoustic_field, label_field = features

      # Sets give O(1) membership tests inside the loop.
      train_ids, dev_ids, test_ids = set(train_split), set(dev_split), set(test_split)

      # Place holders for the final train/dev/test dataset.
      train, dev, test = [], [], []

      # Segment keys look like "<video id>[<index>]"; group 1 is the video id.
      # A raw string avoids the invalid "\[" escape of a plain literal.
      pattern = re.compile(r'(.*)\[.*\]')
      num_drop = 0  # how many data points were dropped because of processing issues

      for segment in dataset[label_field].keys():
          # get the video ID and the features out of the aligned dataset
          vid = pattern.search(segment).group(1)
          label = dataset[label_field][segment]['features']
          _words = dataset[text_field][segment]['features']
          _visual = dataset[visual_field][segment]['features']
          _acoustic = dataset[acoustic_field][segment]['features']

          # If the modalities differ in length after alignment, something is
          # wrong with the data point: report it and drop it.
          if not _words.shape[0] == _visual.shape[0] == _acoustic.shape[0]:
              print(
                  f"Encountered datapoint {vid} with text shape {_words.shape}, visual shape {_visual.shape}, acoustic shape {_acoustic.shape}")
              num_drop += 1
              continue

          # remove nan values
          label = np.nan_to_num(label)
          _visual = np.nan_to_num(_visual)
          _acoustic = np.nan_to_num(_acoustic)

          # Remove speech-pause tokens. The same rows are removed from the
          # visual/acoustic features so that the modalities stay aligned.
          keep = [i for i, word in enumerate(_words) if word[0] != b'sp']
          if not keep:
              # Nothing but pauses: an empty sequence cannot be normalised or
              # packed for the RNNs later, so drop the segment.
              print(f"Encountered datapoint {vid} with no words left after removing speech pauses")
              num_drop += 1
              continue

          # The SDK stores strings as bytes; decode them before the id lookup.
          words = np.asarray([word2id[_words[i][0].decode('utf-8')] for i in keep])
          visual = np.asarray(_visual[keep, :])
          acoustic = np.asarray(_acoustic[keep, :])

          # z-normalization per instance and remove nan/infs
          visual = np.nan_to_num(
              (visual - visual.mean(0, keepdims=True)) / (EPS + np.std(visual, axis=0, keepdims=True)))
          acoustic = np.nan_to_num(
              (acoustic - acoustic.mean(0, keepdims=True)) / (EPS + np.std(acoustic, axis=0, keepdims=True)))

          sample = ((words, visual, acoustic), label, segment)
          if vid in train_ids:
              train.append(sample)
          elif vid in dev_ids:
              dev.append(sample)
          elif vid in test_ids:
              test.append(sample)
          else:
              print(f"Found video that doesn't belong to any splits: {vid}")

      # Freeze the vocabulary: from now on unknown words map to UNK instead of
      # receiving a fresh id.
      def return_unk():
          return UNK

      word2id.default_factory = return_unk
      print(f"Total number of {num_drop} datapoints have been dropped.")
      return train, dev, test
  def multi_collate(batch):
      """Collate samples into padded batch tensors, longest sequence first.

      Assumes ``batch = [Dataset[i] for i in index_set]`` where every sample is
      indexable as ``sample[0] = (words, visual, acoustic)`` and
      ``sample[1] = label`` (numpy arrays).

      Returns:
          ``(sentences, visual, acoustic, labels, lengths)``; the three
          sequence tensors are padded with ``pad_sequence`` (word ids with
          ``PAD``, features with its default padding value), ``labels`` is the
          concatenation of the label arrays along dim 0 and ``lengths`` holds
          the unpadded word-sequence lengths.
      """
      # Descending length order, needed later when the sequences are packed.
      ordered = sorted(batch, key=lambda sample: sample[0][0].shape[0], reverse=True)

      label_list, word_seqs, visual_seqs, acoustic_seqs, length_list = [], [], [], [], []
      for sample in ordered:
          modalities = sample[0]
          label_list.append(torch.from_numpy(sample[1]))
          word_seqs.append(torch.LongTensor(modalities[0]))
          visual_seqs.append(torch.FloatTensor(modalities[1]))
          acoustic_seqs.append(torch.FloatTensor(modalities[2]))
          length_list.append(modalities[0].shape[0])

      labels = torch.cat(label_list, dim=0)
      sentences = pad_sequence(word_seqs, padding_value=PAD)
      visual = pad_sequence(visual_seqs)
      acoustic = pad_sequence(acoustic_seqs)
      # The lengths are used later when feeding the RNNs.
      lengths = torch.LongTensor(length_list)
      return sentences, visual, acoustic, labels, lengths
  class LFLSTM(nn.Module):
      """Late-fusion model with one two-layer bidirectional LSTM per modality.

      Text (word ids), visual and acoustic sequences are encoded separately;
      the final hidden states of both LSTM layers of all three modalities are
      concatenated, batch-normalised and passed through two linear layers.
      """

      def __init__(self, input_sizes, hidden_sizes, fc1_size, output_size, dropout_rate, vocab_size=None):
          """Create the sub-modules.

          Args:
              input_sizes: ``[text, visual, acoustic]`` input feature sizes;
                  the text entry is the word-embedding dimension.
              hidden_sizes: ``[text, visual, acoustic]`` LSTM hidden sizes.
              fc1_size: Width of the first fully connected layer.
              output_size: Width of the output layer.
              dropout_rate: Dropout probability applied after ``fc1``.
              vocab_size: Number of rows of the embedding table. Defaults to
                  ``len(word2id)`` of the module-level vocabulary.
          """
          super().__init__()
          self.input_size = input_sizes
          self.hidden_size = hidden_sizes
          self.fc1_size = fc1_size
          self.output_size = output_size
          self.dropout_rate = dropout_rate

          if vocab_size is None:
              vocab_size = len(word2id)

          # defining modules - two layer bidirectional LSTM with layer norm in between
          self.embed = nn.Embedding(vocab_size, input_sizes[0])
          self.trnn1 = nn.LSTM(input_sizes[0], hidden_sizes[0], bidirectional=True)
          self.trnn2 = nn.LSTM(2 * hidden_sizes[0], hidden_sizes[0], bidirectional=True)
          self.vrnn1 = nn.LSTM(input_sizes[1], hidden_sizes[1], bidirectional=True)
          self.vrnn2 = nn.LSTM(2 * hidden_sizes[1], hidden_sizes[1], bidirectional=True)
          self.arnn1 = nn.LSTM(input_sizes[2], hidden_sizes[2], bidirectional=True)
          self.arnn2 = nn.LSTM(2 * hidden_sizes[2], hidden_sizes[2], bidirectional=True)
          # 2 layers x 2 directions per modality -> 4 * sum(hidden_sizes) fused features
          self.fc1 = nn.Linear(sum(hidden_sizes) * 4, fc1_size)
          self.fc2 = nn.Linear(fc1_size, output_size)
          self.relu = nn.ReLU()
          self.dropout = nn.Dropout(dropout_rate)
          self.tlayer_norm = nn.LayerNorm((hidden_sizes[0] * 2,))
          self.vlayer_norm = nn.LayerNorm((hidden_sizes[1] * 2,))
          self.alayer_norm = nn.LayerNorm((hidden_sizes[2] * 2,))
          self.bn = nn.BatchNorm1d(sum(hidden_sizes) * 4)

      def extract_features(self, sequence, lengths, rnn1, rnn2, layer_norm):
          """Run ``sequence`` through ``rnn1`` -> ``layer_norm`` -> ``rnn2``.

          ``lengths`` must be in descending order (``pack_padded_sequence`` is
          called with its default ``enforce_sorted``).

          Returns:
              The final hidden states of the first and of the second LSTM.
          """
          # pack_padded_sequence expects the lengths on the CPU
          packed_sequence = pack_padded_sequence(sequence, lengths.cpu())
          packed_h1, (final_h1, _) = rnn1(packed_sequence)
          padded_h1, _ = pad_packed_sequence(packed_h1)
          normed_h1 = layer_norm(padded_h1)
          packed_normed_h1 = pack_padded_sequence(normed_h1, lengths.cpu())
          _, (final_h2, _) = rnn2(packed_normed_h1)
          return final_h1, final_h2

      def fusion(self, sentences, visual, acoustic, lengths):
          """Encode the three modalities and return the normalised fused vector."""
          batch_size = lengths.size(0)
          sentences = self.embed(sentences)
          # extract features from text modality
          final_h1t, final_h2t = self.extract_features(sentences, lengths, self.trnn1, self.trnn2, self.tlayer_norm)
          # extract features from visual modality
          final_h1v, final_h2v = self.extract_features(visual, lengths, self.vrnn1, self.vrnn2, self.vlayer_norm)
          # extract features from acoustic modality
          final_h1a, final_h2a = self.extract_features(acoustic, lengths, self.arnn1, self.arnn2, self.alayer_norm)
          # simple late fusion -- concatenation + normalization
          h = torch.cat((final_h1t, final_h2t, final_h1v, final_h2v, final_h1a, final_h2a),
                        dim=2).permute(1, 0, 2).contiguous().view(batch_size, -1)
          return self.bn(h)

      def forward(self, sentences, visual, acoustic, lengths):
          """Return the prediction for a batch (fused features -> fc1 -> dropout -> ReLU -> fc2)."""
          h = self.fusion(sentences, visual, acoustic, lengths)
          h = self.fc1(h)
          h = self.dropout(h)
          h = self.relu(h)
          o = self.fc2(h)
          return o
  def load_emb(w2i, path_to_embedding, embedding_size=300, embedding_vocab=2196017, init_emb=None):
      """Build an embedding matrix for ``w2i`` from a text embedding file.

      Every line of the file is expected to be ``<word> <v1> ... <vN>`` with
      ``N == embedding_size``; the word itself may contain spaces.

      Args:
          w2i: Mapping from word to row index.
          path_to_embedding: Path of the UTF-8 text embedding file.
          embedding_size: Number of values per vector.
          embedding_vocab: Number of lines in the file; only used as the
              progress-bar total.
          init_emb: Optional matrix to fill in place; when ``None`` a random
              normal ``(len(w2i), embedding_size)`` matrix is created.

      Returns:
          The embedding matrix as a float ``torch`` tensor.
      """
      if init_emb is None:
          emb_mat = np.random.randn(len(w2i), embedding_size)
      else:
          emb_mat = init_emb

      found = 0
      # `with` closes the file even on errors. The explicit encoding keeps the
      # read independent of the platform default (not UTF-8 on many Windows
      # setups).
      with open(path_to_embedding, 'r', encoding='utf-8') as f:
          for line in tqdm_notebook(f, total=embedding_vocab):
              content = line.strip().split()
              if len(content) <= embedding_size:
                  # blank or malformed line: no word in front of the vector
                  continue
              # The last `embedding_size` tokens are the vector, everything
              # before them is the (possibly multi-token) word.
              vector = np.asarray([float(x) for x in content[-embedding_size:]])
              word = ' '.join(content[:-embedding_size])
              if word in w2i:
                  emb_mat[w2i[word], :] = vector
                  found += 1
      print(f"Found {found} words in the embedding file.")
      return torch.tensor(emb_mat).float()
  def run(train_loader, dev_loader, test_loader):
      """Train an LFLSTM, select the best epoch on the dev set, report on the test set.

      The model is trained with a summed L1 loss and Adam. After every epoch
      the dev loss is computed; the best model and optimizer state are written
      to ``model.std`` / ``optim.std`` in the working directory. When patience
      runs out the best state is restored and the learning rate is decayed;
      after ``num_trials`` such restarts training stops early. Finally the best
      model is evaluated on the test set (mean L1 loss and binary accuracy,
      where a value counts as positive when it is >= 0).

      Args:
          train_loader: DataLoader over the training samples.
          dev_loader: DataLoader over the validation samples.
          test_loader: DataLoader over the test samples.
              All three must yield ``(sentences, visual, acoustic, labels,
              lengths)`` batches and expose ``.dataset`` with a length.
      """
      torch.manual_seed(123)
      torch.cuda.manual_seed_all(123)
      # Use the GPU only when one is actually available, otherwise stay on the CPU.
      CUDA = torch.cuda.is_available()
      device = torch.device('cuda' if CUDA else 'cpu')
      MAX_EPOCH = 5
      text_size = 300
      visual_size = 47
      acoustic_size = 74
      # define some model settings and hyper-parameters
      input_sizes = [text_size, visual_size, acoustic_size]
      hidden_sizes = [int(text_size * 1.5), int(visual_size * 1.5), int(acoustic_size * 1.5)]
      fc1_size = sum(hidden_sizes) // 2
      dropout = 0.25
      output_size = 1
      curr_patience = patience = 8
      num_trials = 3
      grad_clip_value = 1.0
      weight_decay = 0.1

      # Sample counts are taken from the loaders that were passed in, not from
      # module-level variables.
      n_train = len(train_loader.dataset)
      n_dev = len(dev_loader.dataset)
      n_test = len(test_loader.dataset)

      CACHE_PATH = r'D:\Speech\jupyter_notes_zyx\CMU-MultimodalSDK-Tutorials-master\CMU-MultimodalSDK-Tutorials-master\data\embedding_and_mapping.pt'
      if os.path.exists(CACHE_PATH):
          # NOTE(review): the cached word->id mapping is loaded but not used;
          # the samples were indexed with the module-level word2id. Confirm
          # that both mappings agree, otherwise the rows are misaligned.
          pretrained_emb, _cached_word2id = torch.load(CACHE_PATH)
      else:
          pretrained_emb = None

      model = LFLSTM(input_sizes, hidden_sizes, fc1_size, output_size, dropout)
      if pretrained_emb is not None:
          model.embed.weight.data = pretrained_emb
          # Freeze the embedding table: requires_grad has to be set on the
          # parameter itself, setting it on the module has no effect.
          model.embed.weight.requires_grad = False
      optimizer = Adam([param for param in model.parameters() if param.requires_grad], weight_decay=weight_decay)
      model.to(device)

      criterion = nn.L1Loss(reduction='sum')
      criterion_test = nn.L1Loss(reduction='sum')
      best_valid_loss = float('inf')
      # The scheduler is stepped only when patience runs out. Stepping it once
      # up front (needed by PyTorch < 1.1) would already decay the learning
      # rate by `gamma` on current versions.
      lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

      def to_device(batch):
          """Move every tensor of a batch to the selected device."""
          return tuple(tensor.to(device) for tensor in batch)

      def evaluate(loader, loss_fn):
          """Return (summed loss, predictions, targets) over ``loader`` without gradients."""
          total = 0.0
          predictions, targets = [], []
          model.eval()
          with torch.no_grad():
              for batch in loader:
                  t, v, a, y, l = to_device(batch)
                  y_tilde = model(t, v, a, l)
                  total += loss_fn(y_tilde, y).item()
                  predictions.append(y_tilde.detach().cpu().numpy())
                  targets.append(y.detach().cpu().numpy())
          return total, predictions, targets

      train_losses = []
      valid_losses = []
      for e in range(MAX_EPOCH):
          model.train()
          train_iter = tqdm_notebook(train_loader)
          train_loss = 0.0
          for batch in train_iter:
              model.zero_grad()
              t, v, a, y, l = to_device(batch)
              # The sequence tensors are time-major, so the batch size is read
              # from the lengths vector (one entry per sample).
              batch_size = l.size(0)
              y_tilde = model(t, v, a, l)
              loss = criterion(y_tilde, y)
              loss.backward()
              torch.nn.utils.clip_grad_value_([param for param in model.parameters() if param.requires_grad], grad_clip_value)
              optimizer.step()
              train_iter.set_description(f"Epoch {e}/{MAX_EPOCH}, current batch loss: {round(loss.item() / batch_size, 4)}")
              train_loss += loss.item()
          train_loss = train_loss / n_train
          train_losses.append(train_loss)
          print(f"Training loss: {round(train_loss, 4)}")

          valid_loss, _, _ = evaluate(dev_loader, criterion)
          valid_loss = valid_loss / n_dev
          valid_losses.append(valid_loss)
          print(f"Validation loss: {round(valid_loss, 4)}")
          print(f"Current patience: {curr_patience}, current trial: {num_trials}.")
          if valid_loss <= best_valid_loss:
              best_valid_loss = valid_loss
              print("Found new best model on dev set!")
              torch.save(model.state_dict(), 'model.std')
              torch.save(optimizer.state_dict(), 'optim.std')
              curr_patience = patience
          else:
              curr_patience -= 1
              if curr_patience <= -1:
                  print("Running out of patience, loading previous best model.")
                  num_trials -= 1
                  curr_patience = patience
                  model.load_state_dict(torch.load('model.std'))
                  optimizer.load_state_dict(torch.load('optim.std'))
                  lr_scheduler.step()
                  print(f"Current learning rate: {optimizer.state_dict()['param_groups'][0]['lr']}")
          if num_trials <= 0:
              print("Running out of patience, early stopping.")
              break

      # Evaluate the best checkpoint on the test set.
      model.load_state_dict(torch.load('model.std'))
      test_loss, y_pred, y_true = evaluate(test_loader, criterion_test)
      print(f"Test set performance: {test_loss / n_test}")
      y_true = np.concatenate(y_true, axis=0)
      y_pred = np.concatenate(y_pred, axis=0)
      # Binary accuracy: a value >= 0 counts as the positive class.
      y_true_bin = y_true >= 0
      y_pred_bin = y_pred >= 0
      bin_acc = accuracy_score(y_true_bin, y_pred_bin)
      print(f"Test set accuracy is {bin_acc}")
  if __name__=='__main__':
      # Names of the computational sequences that make up the aligned dataset.
      visual_field1 = 'CMU_MOSI_Visual_Facet_41'
      acoustic_field1 = 'CMU_MOSI_COVAREP'
      text_field1 = 'CMU_MOSI_ModifiedTimestampedWords'
      label_field1 = 'CMU_MOSI_Opinion_Labels'
      features1 = [text_field1, visual_field1, acoustic_field1, label_field1]

      # NOTE(review): the alignment listing deploys to './deployed' while this
      # script reads from './mymydeployed' - confirm which directory is intended.
      DATA_PATH1 = './mymydeployed'
      recipe1 = {}
      for feat in features1:
          recipe1[feat] = os.path.join(DATA_PATH1, feat) + '.csd'
      dataset1 = mmdatasdk.mmdataset(recipe1)
      print(list(dataset1[text_field1].keys())[55])

      # NOTE(review): `tensors` and `fold_names` are not used by the rest of
      # this script; the training data below comes from splittraindevtest.
      tensors = dataset1.get_tensors(
          seq_len=25,
          non_sequences=["Opinion Segment Labels"],
          direction=False,
          folds=[
              mmdatasdk.cmu_mosi.standard_folds.standard_train_fold,
              mmdatasdk.cmu_mosi.standard_folds.standard_valid_fold,
              mmdatasdk.cmu_mosi.standard_folds.standard_test_fold,
          ],
      )
      fold_names = ["train", "valid", "test"]

      # Split the segments by video id according to the standard CMU-MOSI folds.
      train, dev, test = splittraindevtest(
          mmdatasdk.cmu_mosi.standard_folds.standard_train_fold,
          mmdatasdk.cmu_mosi.standard_folds.standard_valid_fold,
          mmdatasdk.cmu_mosi.standard_folds.standard_test_fold,
          dataset1,
          features1,
      )

      # Only the training data is shuffled; dev/test use three times the batch size.
      batch_sz = 56
      train_loader = DataLoader(train, shuffle=True, batch_size=batch_sz, collate_fn=multi_collate)
      dev_loader = DataLoader(dev, shuffle=False, batch_size=batch_sz * 3, collate_fn=multi_collate)
      test_loader = DataLoader(test, shuffle=False, batch_size=batch_sz * 3, collate_fn=multi_collate)
      run(train_loader, dev_loader, test_loader)

