```bash
# 1. Install PyTorch for your environment
# 2. Install the dependencies (ftfy, regex, tqdm)
pip install ftfy regex tqdm
# 3. Install CLIP
pip install git+https://github.com/openai/CLIP.git
# On a restricted internal network, install via the mirror:
# pip install git+https://github.91chi.fun/https://github.com/openai/CLIP.git
```
```python
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. List the available pretrained models
clip.available_models()
# ['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64',
#  'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']

# 2. Load a model together with its image preprocessor
model, preprocess = clip.load("ViT-B/32", device=device)

# 3. preprocess turns a PIL Image into a tensor of shape [3, 224, 224];
#    unsqueeze(0) adds the batch dimension, giving [batch_size, 3, 224, 224],
#    which is what the model expects
image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)

# 4. Tokenize a list of sentences into a tensor [batch_size, context_length].
#    Each sentence is wrapped in a BOS token (49406) and an EOS token (49407),
#    then padded to context_length (default 77). If the tokenized text is longer
#    than context_length - 2, pass truncate=True; the result is still
#    BOS + content + EOS, i.e. the EOS token is never cut off.
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)  # [3, 77]

# 5. Encode a batch of images into feature vectors
image_features = model.encode_image(image)

# 6. Encode a batch of texts into feature vectors
text_features = model.encode_text(text)

# 7. Similarity logits between every image and every text: cosine similarities
#    of the normalized features, scaled by the learned logit_scale (about 100)
logits_per_image, logits_per_text = model(image, text)
```
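A quick way to sanity-check point 4 is to look at the token ids directly. A minimal sketch (the 100-word sentence is just a made-up example to force truncation):

```python
import clip

tokens = clip.tokenize(["a diagram", "a dog", "a cat"])
print(tokens.shape)                     # torch.Size([3, 77])
print(tokens[0][0].item())              # 49406: every sequence starts with BOS
print((tokens[0] == 49407).nonzero())   # position of EOS; everything after is zero padding

# Text longer than context_length - 2 raises a RuntimeError unless truncate=True
long_text = " ".join(["word"] * 100)
truncated = clip.tokenize([long_text], truncate=True)
print(truncated[0][-1].item())          # 49407: EOS is kept even after truncation
```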
```python
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937 0.00421068 0.00299572]]
```
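Note that `logits_per_image` is not a matrix of raw cosine similarities: inside `forward`, CLIP normalizes both feature sets and multiplies their dot products by the exponentiated learned temperature `logit_scale` (roughly 100 after training). A minimal sketch, reusing the tensors from the block above, that reproduces the logits by hand:

```python
with torch.no_grad():
    img = image_features / image_features.norm(dim=-1, keepdim=True)
    txt = text_features / text_features.norm(dim=-1, keepdim=True)
    # model.logit_scale stores the log of the temperature; exp() recovers it
    manual = model.logit_scale.exp() * img @ txt.t()
    print(torch.allclose(manual.float(), logits_per_image.float(), atol=1e-2))  # True (up to fp16 rounding)
```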
```python
import os
import clip
import torch
from torchvision.datasets import CIFAR100

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Download the dataset
cifar100 = CIFAR100(root=os.path.expanduser("~/.cache"), download=True, train=False)

# Prepare the inputs
image, class_id = cifar100[3637]
image_input = preprocess(image).unsqueeze(0).to(device)
text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in cifar100.classes]).to(device)

# Calculate features
with torch.no_grad():
    image_features = model.encode_image(image_input)
    text_features = model.encode_text(text_inputs)

# Pick the top 5 most similar labels for the image
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
# The factor of 100 mirrors CLIP's learned logit scale: it acts as a softmax
# temperature that sharpens the distribution, so it does change the probabilities
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
values, indices = similarity[0].topk(5)

# Print the result
print("\nTop predictions:\n")
for value, index in zip(values, indices):
    print(f"{cifar100.classes[index]:>16s}: {100 * value.item():.2f}%")
```
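The commented scale factor deserves emphasis: multiplying by 100 before the softmax is a temperature, not cosmetics. Cosine similarities between CLIP features typically sit in a narrow band, and without the scale the softmax comes out nearly uniform. A small self-contained sketch with made-up similarity values:

```python
import torch

sims = torch.tensor([0.31, 0.28, 0.05])  # hypothetical cosine similarities
print(sims.softmax(dim=-1))              # tensor([0.3648, 0.3540, 0.2813]): nearly flat
print((100 * sims).softmax(dim=-1))      # tensor([9.5258e-01, 4.7426e-02, 4.8e-12]): sharply peaked
```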
```python
import os
import clip
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
from tqdm import tqdm

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Load the dataset
root = os.path.expanduser("~/.cache")
train = CIFAR100(root, download=True, train=True, transform=preprocess)
test = CIFAR100(root, download=True, train=False, transform=preprocess)

def get_features(dataset):
    all_features = []
    all_labels = []
    with torch.no_grad():
        for images, labels in tqdm(DataLoader(dataset, batch_size=100)):
            features = model.encode_image(images.to(device))
            all_features.append(features)
            all_labels.append(labels)
    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()

# Calculate the image features
train_features, train_labels = get_features(train)
test_features, test_labels = get_features(test)

# Perform logistic regression
classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1)
classifier.fit(train_features, train_labels)

# Evaluate using the logistic regression classifier
predictions = classifier.predict(test_features)
# np.float was removed in NumPy 1.24; use the built-in float instead
accuracy = np.mean((test_labels == predictions).astype(float)) * 100.
print(f"Accuracy = {accuracy:.3f}")
```
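The value `C=0.316` comes from the CLIP repository's example, and the repository notes it should be determined via a hyperparameter sweep on a validation split for other backbones or datasets. A minimal sketch of such a sweep, reusing `train_features`/`train_labels` from above (the candidate values are illustrative):

```python
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

tr_x, val_x, tr_y, val_y = train_test_split(
    train_features, train_labels, test_size=0.1, random_state=0)
for C in (0.01, 0.1, 0.316, 1.0, 10.0):  # illustrative grid
    clf = LogisticRegression(random_state=0, C=C, max_iter=1000)
    clf.fit(tr_x, tr_y)
    print(f"C={C}: val accuracy {clf.score(val_x, val_y):.4f}")
```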
Because the model below only uses CLIP's visual encoder, it is enough to check whether the visual encoder's parameters changed:

- With neither position 1 nor position 2 enabled, every line prints False, i.e. all parameters were updated.
- With only position 1 enabled, the CLIP parameters print True and the Linear parameters print False, i.e. only the Linear layer was updated.
- With only position 2 enabled, the CLIP parameters print False and the Linear parameters print True, i.e. only the CLIP parameters were updated.

```python
import os
import clip
from torch import nn
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from torch.nn import functional as F
import torch

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.model, self.preprocess = clip.load('ViT-B/32', 'cpu')
        self.linear = nn.Linear(512, 10)
        # Position 2: freeze the linear head
        for param in self.linear.parameters():
            param.requires_grad = False

    def forward(self, x):
        features = self.model.encode_image(x)
        # Position 1: cut the gradient flow back into CLIP
        # features = features.detach()
        return self.linear(features)

net = Net()
optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

root = os.path.expanduser("~/.cache")
train = CIFAR10(root, download=True, train=True, transform=net.preprocess)
train = next(iter(DataLoader(train, batch_size=8)))  # a single batch is enough for this experiment

# Snapshot the initial parameters so we can detect updates later
storeParam = {}
for name, param in net.model.visual.named_parameters():
    storeParam[name] = param.detach().clone()
for name, param in net.linear.named_parameters():
    storeParam[name] = param.detach().clone()

for i in range(10):
    out = net(train[0])
    loss = F.cross_entropy(out, train[1])
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(loss.item())

# True means the parameter is unchanged; False means it was updated
for name, param in net.model.visual.named_parameters():
    print(f"{name} {torch.equal(param, storeParam[name])}")
for name, param in net.linear.named_parameters():
    print(f"{name} {torch.equal(param, storeParam[name])}")
```
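For reference, an equivalent and more common way to get the "only the Linear layer updates" behavior of position 1 is to freeze the backbone's parameters and hand the optimizer only the trainable ones. A sketch reusing the `Net` class above:

```python
net = Net()
# Undo position 2 so the linear head can train
for param in net.linear.parameters():
    param.requires_grad = True
# Freeze the whole CLIP backbone instead of detaching features in forward
for param in net.model.parameters():
    param.requires_grad = False
# Give the optimizer only the parameters that should actually update
optimizer = torch.optim.SGD(
    [p for p in net.parameters() if p.requires_grad], lr=1e-2)
```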