赞
踩
问题描述:
在这里基于卷积循环神经网络,做一个图像序列的预测。输入为连续的16张图像帧,每帧图像大小为(3,128,128)。先利用卷积网络(Encoder)对这连续的16张图像逐帧提取特征,然后将提取到的特征序列输入到循环神经网络(LSTM)中,最后通过Decoder反卷积还原成与原图像大小相同的张量,即 torch.Size([3,128,128])。这也可以看作根据前16帧生成第17帧图像:以原序列的第17帧图像作为label,计算loss。
数据预处理:
1、想得到一个txt文本,里面每一行记录连续的17帧图像的路径
2、如何生成我们想要的txt路径文件 ?
数据集存储方式:
1)文件夹名按数字顺序命名:0,1,2……
2)每个文件夹下面都是一个视频的分解帧,帧图像以"数字编号+扩展名"的方式命名(例如 0.jpg、1.jpg、2.jpg……;原文此处的示意图已缺失)。
3、代码:
"""Generate ``img_path.txt``: one line per window of consecutive frame paths.

Each output line holds ``seq_size`` space-separated image paths taken from a
single video folder (the first ``seq_size - 1`` frames are the model input and
the last one is the prediction target).
"""
import os


def numeric_stem(name):
    """Return the integer encoded in a file name, ignoring its extension.

    ``os.listdir`` returns names in arbitrary order, and a plain string sort
    would put ``10.jpg`` before ``2.jpg``; sorting by this key restores the
    numeric frame order regardless of the extension length.
    """
    return int(os.path.splitext(name)[0])


def sliding_windows(items, seq_size, stride):
    """Return every ``seq_size``-long slice of ``items``, ``stride`` apart.

    Args:
        items: Sequence to slice (e.g. the sorted frame names of one video).
        seq_size: Number of elements per window; must be positive.
        stride: Distance between the starts of consecutive windows; must be
            positive.

    Returns:
        A list of windows. It is empty when ``items`` is shorter than
        ``seq_size``.

    Raises:
        ValueError: If ``seq_size`` or ``stride`` is not positive.
    """
    if seq_size <= 0 or stride <= 0:
        raise ValueError('seq_size and stride must be positive')
    # The last valid start index is len(items) - seq_size, so the range bound
    # needs "+ 1"; without it a folder holding exactly seq_size frames (or a
    # final window ending on the last frame) would be silently dropped.
    last_start = len(items) - seq_size
    return [items[i:i + seq_size] for i in range(0, last_start + 1, stride)]


def write_sequence_paths(root_dir, out_path, seq_size=17, stride=8):
    """Write one line of ``seq_size`` frame paths per sliding window.

    Args:
        root_dir: Directory whose sub-folders are named with integers and each
            contain the frames of one video, named ``<integer><extension>``.
        out_path: Text file to (over)write.
        seq_size: Frames per line.
        stride: Step of the sliding window, in frames.
    """
    with open(out_path, 'w') as fp:
        # Folder names are plain integers; sort them numerically.
        for folder in sorted(os.listdir(root_dir), key=int):
            folder_path = os.path.join(root_dir, folder)
            frames = sorted(os.listdir(folder_path), key=numeric_stem)
            for window in sliding_windows(frames, seq_size, stride):
                paths = (os.path.join(folder_path, frame) for frame in window)
                fp.write(' '.join(paths) + '\n')


if __name__ == '__main__':
    write_sequence_paths('/home/lab226/wdf/imgsrc', './img_path.txt')
数据加载:
我写了自己的SeqDataset,改写了Dataset类中的__getitem__()函数,使得每次迭代返回连续的16张图像和第17张标签图像。详细代码如下:
class SeqDataset(Dataset):
    """Dataset of fixed-length frame sequences listed in a text file.

    Every non-blank line of ``txt`` holds whitespace-separated image paths.
    All paths but the last are the input frames; the last path is the target
    frame. Paths therefore must not contain whitespace.

    Args:
        txt: Path of the sequence list file.
        transform: Optional callable applied to every input frame, and to the
            target frame when ``target_transform`` is not given.
        target_transform: Optional callable applied to the target frame.
        loader: Callable that loads one image from a path.
    """

    def __init__(self, txt, transform=None, target_transform=None, loader=default_loader):
        with open(txt, 'r') as fh:
            # Blank lines would become samples with no paths and fail later
            # in __getitem__, so drop them here.
            self.imgseqs = [line.strip() for line in fh if line.strip()]
        self.num_samples = len(self.imgseqs)
        self.transform = transform
        self.target_transform = target_transform
        self.loader = loader

    def __len__(self):
        """Return the number of sequences; samplers need this to draw indices."""
        return self.num_samples

    def __getitem__(self, index):
        """Return ``(frames, label)`` for the sequence at ``index``.

        ``frames`` is the input frames stacked along a new leading axis with
        ``np.stack``; ``label`` is the (transformed) last frame of the line.
        """
        # Honour the requested index: shuffling is the sampler's job, and
        # drawing a random line here would make epochs skip/repeat samples.
        paths = self.imgseqs[index].split()
        frame_paths, label_path = paths[:-1], paths[-1]

        frames = []
        for frame_path in frame_paths:
            img = self.loader(frame_path)
            if self.transform is not None:
                img = self.transform(img)
            frames.append(img)

        label = self.loader(label_path)
        # Prefer the dedicated target transform; fall back to the input
        # transform so inputs and label stay in the same representation.
        label_transform = self.target_transform
        if label_transform is None:
            label_transform = self.transform
        if label_transform is not None:
            label = label_transform(label)

        return np.stack(frames, axis=0), label


transform_list = [
    transforms.ToTensor()
]
data_transforms = transforms.Compose(transform_list)

train_data = SeqDataset(txt='./img_path.txt', transform=data_transforms)
train_loader = DataLoader(train_data, shuffle=True, num_workers=20, batch_size=BATCH_SIZE)
模型介绍:
由Encoder+LSTM和Decoder这两部分组成
具体代码:
class EncoderMUG2d_LSTM(nn.Module):
    """Per-frame convolutional encoder followed by an LSTM over the sequence.

    Args:
        input_nc: Number of channels of each input frame.
        encode_dim: Size of the per-frame feature fed to the LSTM.
        lstm_hidden_size: Hidden size of the LSTM.
        seq_len: Number of frames per input sequence.
        num_lstm_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM is bidirectional.
    """

    def __init__(self, input_nc=3, encode_dim=1024, lstm_hidden_size=1024, seq_len=SEQ_SIZE, num_lstm_layers=1, bidirectional=False):
        super(EncoderMUG2d_LSTM, self).__init__()
        self.seq_len = seq_len
        self.num_directions = 2 if bidirectional else 1
        self.num_lstm_layers = num_lstm_layers
        self.lstm_hidden_size = lstm_hidden_size

        # Each Conv2d(kernel=4, stride=2, padding=1) halves the spatial size.
        # Shapes below are for a 3*128*128 input frame.
        self.encoder = nn.Sequential(
            nn.Conv2d(input_nc, 32, 4, 2, 1),  # 32*64*64
            nn.BatchNorm2d(32),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(32, 64, 4, 2, 1),  # 64*32*32
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(64, 128, 4, 2, 1),  # 128*16*16
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(128, 256, 4, 2, 1),  # 256*8*8
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(256, 512, 4, 2, 1),  # 512*4*4
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(512, 512, 4, 2, 1),  # 512*2*2
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(512, 1024, 4, 2, 1),  # 1024*1*1
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.2, inplace=True),
        )

        self.fc = nn.Linear(1024, encode_dim)
        # Build the LSTM from the same settings init_hidden() uses; otherwise
        # any non-default hidden size / layer count / direction produces an
        # initial state whose shape the LSTM rejects.
        self.lstm = nn.LSTM(
            encode_dim,
            lstm_hidden_size,
            num_layers=num_lstm_layers,
            bidirectional=bidirectional,
            batch_first=True,
        )

    def init_hidden(self, x):
        """Return zero ``(h0, c0)`` on the same device/dtype as ``x``.

        Each state has shape
        ``[num_directions * num_lstm_layers, x.size(0), lstm_hidden_size]``.
        """
        state_shape = (
            self.num_directions * self.num_lstm_layers,
            x.size(0),
            self.lstm_hidden_size,
        )
        return x.new_zeros(state_shape), x.new_zeros(state_shape)

    def forward(self, x):
        """Encode a batch of frame sequences.

        Args:
            x: Tensor of shape ``[batch, seq_len, C, H, W]``; with the default
                encoder the frames must be 128*128 so the conv stack ends at
                1*1.

        Returns:
            The LSTM's final hidden state ``hn`` with shape
            ``[num_directions * num_lstm_layers, batch, lstm_hidden_size]``.
        """
        batch_size = x.size(0)
        # Fold the time axis into the batch axis so the 2-D conv stack sees
        # one frame per batch entry: [batch*seq_len, C, H, W].
        x = x.view(batch_size * self.seq_len, *x.shape[2:])
        x = self.encoder(x)
        # [batch*seq_len, 1024, 1, 1] -> [batch*seq_len, 1024]. Keeping the
        # first axis fixed makes a wrong frame size fail loudly in self.fc
        # instead of silently mixing features across batch entries.
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        # [batch*seq_len, encode_dim] -> [batch, seq_len, encode_dim]
        x = x.view(-1, self.seq_len, x.size(1))
        h0, c0 = self.init_hidden(x)
        output, (hn, cn) = self.lstm(x, (h0, c0))
        return hn


class DecoderMUG2d(nn.Module):
    """Transposed-convolution decoder from a feature vector to an image.

    Args:
        output_nc: Number of channels of the generated image.
        encode_dim: Size of the incoming feature vector.
    """

    def __init__(self, output_nc=3, encode_dim=1024):
        super(DecoderMUG2d, self).__init__()

        self.project = nn.Sequential(
            nn.Linear(encode_dim, 1024 * 1 * 1),
            nn.ReLU(inplace=True)
        )
        # Spatial sizes below follow (in - 1) * stride - 2 * padding + kernel,
        # starting from a 1*1 map; the final output is output_nc*128*128.
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(1024, 512, 4),  # 512*4*4
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            nn.ConvTranspose2d(512, 256, 4, stride=2),  # 256*10*10
            nn.BatchNorm2d(256),
            nn.ReLU(True),
            nn.ConvTranspose2d(256, 128, 4),  # 128*13*13
            nn.BatchNorm2d(128),
            nn.ReLU(True),
            nn.ConvTranspose2d(128, 64, 4, stride=2),  # 64*28*28
            nn.BatchNorm2d(64),
            nn.ReLU(True),
            nn.ConvTranspose2d(64, 32, 4),  # 32*31*31
            nn.BatchNorm2d(32),
            nn.ReLU(True),
            nn.ConvTranspose2d(32, 16, 4, stride=2),  # 16*64*64
            nn.BatchNorm2d(16),
            nn.ReLU(True),
            nn.ConvTranspose2d(16, output_nc, 4, stride=2, padding=1),  # 3*128*128
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Decode features whose last dimension is ``encode_dim`` into images.

        All leading dimensions of ``x`` are merged into the batch axis, so the
        result has shape ``[N, output_nc, 128, 128]``.
        """
        x = self.project(x)
        x = x.view(-1, 1024, 1, 1)
        decode = self.decoder(x)
        return decode


class net(nn.Module):
    """Encoder+LSTM followed by the decoder, both with default settings."""

    def __init__(self):
        super(net, self).__init__()
        self.n1 = EncoderMUG2d_LSTM()
        self.n2 = DecoderMUG2d()

    def forward(self, x):
        """Map ``[B, seq_len, 3, 128, 128]`` frames to a ``[B, 3, 128, 128]`` image."""
        # With the default single-layer unidirectional LSTM the encoder
        # returns [1, B, 1024], which the decoder flattens to B samples.
        output = self.n1(x)
        output = self.n2(output)
        return output
训练过程:
if __name__ == '__main__':
    # Put the model and every batch on the same device. Calling .cuda() on
    # the batches unconditionally would crash on a machine without a GPU
    # even though the model itself had stayed on the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = net().to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_func = nn.MSELoss()

    for epoch in range(10):
        print('epoch {}'.format(epoch + 1))
        for batch_x, batch_y in train_loader:
            inputs, label = batch_x.to(device), batch_y.to(device)
            output = model(inputs)
            # NOTE(review): nn.MSELoss already averages over all elements by
            # default, so this extra division by the batch size only rescales
            # the loss; kept as-is to preserve the reported values.
            loss = loss_func(output, label) / label.shape[0]
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Reports the loss of the last batch of the epoch.
        print('epoch: {}, Loss: {:.4f}'.format(epoch + 1, loss.item()))

        if (epoch + 1) % 5 == 0:
            # Every 5 epochs, save the decoded images next to their targets
            # (both taken from the last batch of the epoch).
            pic = to_img(output.detach().cpu())
            img = to_img(label.detach().cpu())
            os.makedirs('./conv_autoencoder', exist_ok=True)
            save_image(pic, './conv_autoencoder/decode_image_{}.png'.format(epoch + 1))
            save_image(img, './conv_autoencoder/raw_image_{}.png'.format(epoch + 1))

    torch.save(model.state_dict(), PATH_SAVE)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。