1 下载数据
# A highlighted block
from mmsdk import mmdatasdk as md

# CMU-MOSI dataset definition exposed by the SDK.
DATASET = md.cmu_mosi

# Download
# NOTE(review): the actual download call appears to be missing from this
# snippet, so the message below is printed unconditionally -- confirm against
# the original tutorial before relying on it.
print('have been downloaded')
2 看一下下载好的文件
我们可以看到在DATA_PATH的文件路径中,都是以.csd结尾的文件,这是SDK中的一种称之为计算序列(computational sequences)的数据结构。
3 载入多模态数据
构建一个字典,格式为 {modality_file_name: csd_path},然后将其传给 md.mmdataset,构建一个数据集。
# File names of the computational sequences (.csd files) for each modality.
visual_field = 'CMU_MOSI_Visual_Facet_41.csd'
acoustic_field = 'CMU_MOSI_COVAREP.csd'
text_field = 'CMU_MOSI_ModifiedTimestampedWords.csd'

# The modalities to load; each entry doubles as the key used to index the
# resulting dataset.
features = [
    text_field,
    visual_field,
    acoustic_field,
]

# Recipe: {modality file name: path of that file under DATA_PATH}.
recipe = {feat: os.path.join(DATA_PATH, feat) for feat in features}
dataset = md.mmdataset(recipe)
4 看一下这个数据集
# Keys of the dataset (one per loaded computational sequence).
print(list(dataset.keys()))
print("=" * 50)

# First ten keys of the visual modality, i.e. the first ten video ids.
print(list(dataset[visual_field].keys())[:10])
print("=" * 50)

# Keys stored for the video at index 15.
some_id = list(dataset[visual_field].keys())[15]
print(list(dataset[visual_field][some_id].keys()))
print("=" * 50)

# Shape of the timestamps (intervals) of that video.
print(list(dataset[visual_field][some_id]['intervals'].shape))
print("=" * 50)

# Shape of the features of each modality for that video.
print(list(dataset[visual_field][some_id]['features'].shape))
print(list(dataset[text_field][some_id]['features'].shape))
print(list(dataset[acoustic_field][some_id]['features'].shape))

# Different modalities have a different number of time steps.
print("Different modalities have different number of time steps!")
5 对齐不同的time step
主要思想是:将其他模态对齐到文本模态上,使得所有模态的time step长度一致。首先把其他模态的特征按照每个词的时间戳放到对应的“桶”中,然后用一个函数对每个桶进行处理,这个函数叫做 collapse function,它主要做的是pooling操作(例如下面的取平均)。
# we define a simple averaging function that does not depend on intervals
def avg(intervals: np.ndarray, features: np.ndarray) -> np.ndarray:
    """Collapse a bucket of feature rows into their mean over axis 0.

    Args:
        intervals: Timestamps of the rows in the bucket. Unused here, but the
            parameter must stay in the signature; omitting it raises an error
            when the function is used as a collapse function.
        features: Feature rows of the bucket.

    Returns:
        The average of ``features`` along axis 0, or ``features`` unchanged
        when it cannot be averaged.
    """
    try:
        return np.average(features, axis=0)
    except (TypeError, ValueError):
        # Non-numeric features cannot be averaged (presumably the byte-string
        # word features of the text modality) -- hand them back untouched.
        return features
# First align every modality to the word-level text sequence. The
# collapse_functions argument receives a list of functions; here `avg` is the
# only one, so the aligned features are averaged.
dataset.align(text_field, collapse_functions=[avg])
6 将数据对齐到标签
label_field = 'CMU_MOSI_Opinion_Labels'  # name of the label computational sequence

# Build the recipe {sequence name: path of its .csd file} for the labels and
# add it to the already loaded dataset.
label_recipe = {label_field: os.path.join(DATA_PATH, label_field + '.csd')}
dataset.add_computational_sequences(label_recipe, destination=None)

# we add and align to labels to obtain labeled segments
# this time we don't apply collapse functions so that the temporal sequences are preserved
dataset.align(label_field)
7 分割数据集
SDK为train/dev/test每个划分提供了对应的视频id列表,供我们分割数据集。但是在对齐之后,key已经从id变成了id[seg],因此我们需要利用正则表达式匹配出每一个视频id,并将数据放到相应的数据集中。此外,对视觉和声学特征都做了z-normalization,并且将每个词用唯一的id来替代。
from collections import defaultdict

# NOTE(review): this cell relies on `re`, `np`, `EPS`, `DATASET`, `dataset`
# and the *_field names being defined earlier in the file -- confirm.

# Video ids of the standard train / dev / test folds.
train_split = DATASET.standard_folds.standard_train_fold
dev_split = DATASET.standard_folds.standard_valid_fold
test_split = DATASET.standard_folds.standard_test_fold

# Vocabulary: a word seen for the first time gets the next free integer id.
word2id = defaultdict(lambda: len(word2id))
UNK = word2id['<unk>']
PAD = word2id['<pad>']

train = []
test = []
dev = []

# Segment keys have the form "id[seg]"; the first group captures the video id.
pattern = re.compile(r'(.*)\[.*\]')
num_drop = 0

for segment in dataset[label_field].keys():
    # segment is "id[seg]"; recover the video id it belongs to
    vid = re.search(pattern, segment).group(1)
    label = dataset[label_field][segment]['features']
    _words = dataset[text_field][segment]['features']
    _visual = dataset[visual_field][segment]['features']
    _acoustic = dataset[acoustic_field][segment]['features']

    # Boundary check: all modalities must have the same number of time steps.
    if not (_words.shape[0] == _visual.shape[0] == _acoustic.shape[0]):
        print('the length of these modalities is different,drop!')
        num_drop += 1
        continue

    # Replace NaNs.
    label = np.nan_to_num(label)
    _visual = np.nan_to_num(_visual)
    _acoustic = np.nan_to_num(_acoustic)

    # Drop the pauses (b'sp' tokens) and keep the matching rows of the other
    # two modalities so that all three stay aligned step by step.
    words = []
    visual = []
    acoustic = []
    for i, word in enumerate(_words):
        if word[0] != b'sp':
            # the vocabulary grows here, while the lists are being filled
            words.append(word2id[word[0].decode('utf-8')])
            visual.append(_visual[i, :])
            acoustic.append(_acoustic[i, :])

    # Turn the lists into arrays.
    words = np.asarray(words)
    visual = np.asarray(visual)
    acoustic = np.asarray(acoustic)

    # z-normalization per feature dimension over the time steps of the segment
    visual = np.nan_to_num(
        (visual - visual.mean(0, keepdims=True))
        / (EPS + np.std(visual, axis=0, keepdims=True))
    )
    acoustic = np.nan_to_num(
        (acoustic - acoustic.mean(0, keepdims=True))
        / (EPS + np.std(acoustic, axis=0, keepdims=True))
    )

    # Route the sample to its fold by video id. Each sample is a tuple of
    # three parts: (words, visual, acoustic), label, segment.
    if vid in train_split:
        train.append(((words, visual, acoustic), label, segment))
    elif vid in dev_split:
        dev.append(((words, visual, acoustic), label, segment))
    elif vid in test_split:
        test.append(((words, visual, acoustic), label, segment))
    else:
        print(f"Found video that doesn't belong to any splits: {vid}")

print(f"Total number of {num_drop} datapoints have been dropped.")


# turn off the word2id - define a named function here to allow for pickling
# The vocabulary is complete at this point and must not grow any further:
# looking up a word that is not in it returns UNK.
def return_unk():
    return UNK


word2id.default_factory = return_unk
8 pytorch中的collate function以及构建DataLoader
我们已经得到了train/test/dev set,它们的格式是list。在pytorch中,可以使用collate function(即DataLoader的collate_fn参数)从数据集中收集批量数据。
def multi_collate(batch):
    '''
    Collate functions assume batch = [Dataset[i] for i in index_set]

    Each sample is ((words, visual, acoustic), label, segment). Returns the
    tuple (sentences, visual, acoustic, labels, lengths), where the three
    sequence tensors are padded to the longest sequence in the batch.
    '''
    # for later use we sort the batch in descending order of length
    batch = sorted(batch, key=lambda x: x[0][0].shape[0], reverse=True)

    # get the data out of the batch - use pad sequence util functions from PyTorch to pad things
    labels = torch.cat([torch.from_numpy(sample[1]) for sample in batch], dim=0)
    # pad_sequence returns a Tensor of size T x B x * or B x T x *
    # where T is the length of the longest sequence.
    sentences = pad_sequence([torch.LongTensor(sample[0][0]) for sample in batch], padding_value=PAD)
    visual = pad_sequence([torch.FloatTensor(sample[0][1]) for sample in batch])
    acoustic = pad_sequence([torch.FloatTensor(sample[0][2]) for sample in batch])

    # lengths are useful later in using RNNs
    lengths = torch.LongTensor([sample[0][0].shape[0] for sample in batch])
    return sentences, visual, acoustic, labels, lengths


# construct dataloaders, dev and test could use around ~X3 times batch size since no_grad is used during eval
batch_sz = 56
train_loader = DataLoader(train, shuffle=True, batch_size=batch_sz, collate_fn=multi_collate)
dev_loader = DataLoader(dev, shuffle=False, batch_size=batch_sz*3, collate_fn=multi_collate)
test_loader = DataLoader(test, shuffle=False, batch_size=batch_sz*3, collate_fn=multi_collate)

# let's create a temporary dataloader just to see how the batch looks like,
# note that its batch size is 8
temp_loader = iter(DataLoader(test, shuffle=True, batch_size=8, collate_fn=multi_collate))
batch = next(temp_loader)

# sizes are T x B x *
print(batch[0].shape)  # word vectors, padded to maxlen
print(batch[1].shape)  # visual features
print(batch[2].shape)  # acoustic features
print(batch[3])  # labels
print(batch[4])  # lengths
