OpenAI CLIP
Model weights: https://huggingface.co/openai/clip-vit-base-patch32
from PIL import Image
import torch
import torch.nn as nn
from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Local copy of the sample COCO image
image_path = "000000039769.jpg"
image = Image.open(image_path)

inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True)

print("inputs:", inputs.keys())

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)      # we can take the softmax to get the label probabilities


class ImgModelWrapper(nn.Module):
    def __init__(self, model):
        super(ImgModelWrapper, self).__init__()
        self.model = model

    def forward(self, pixel_values):
        image_features = self.model.get_image_features(pixel_values=pixel_values)
        return image_features


class TxtModelWrapper(nn.Module):
    def __init__(self, model):
        super(TxtModelWrapper, self).__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        text_features = self.model.get_text_features(input_ids=input_ids, attention_mask=attention_mask)
        return text_features


img_model = ImgModelWrapper(model)
txt_model = TxtModelWrapper(model)


torch.onnx.export(img_model,                    # model being run
                  (inputs.pixel_values,),       # model input (or a tuple for multiple inputs)
                  "clip_img.onnx",              # where to save the model (can be a file or file-like object)
                  export_params=True,           # store the trained parameter weights inside the model file
                  opset_version=15,             # the ONNX version to export the model to
                  do_constant_folding=False,    # whether to execute constant folding for optimization
                  input_names=['pixel_values'], # the model's input names
                  # output_names=['output'],    # the model's output names
                  # dynamic_axes={'pixel_values': {0: 'batch', 2: 'height', 3: 'width'}},
                  )

torch.onnx.export(txt_model,                                   # model being run
                  (inputs.input_ids, inputs.attention_mask),   # model input (or a tuple for multiple inputs)
                  "clip_txt.onnx",                             # where to save the model (can be a file or file-like object)
                  export_params=True,                          # store the trained parameter weights inside the model file
                  opset_version=15,                            # the ONNX version to export the model to
                  do_constant_folding=False,                   # whether to execute constant folding for optimization
                  input_names=['input_ids', 'attention_mask'], # the model's input names
                  # output_names=['output'],                   # the model's output names
                  dynamic_axes={'input_ids': {0: 'batch', 1: 'seq'},
                                'attention_mask': {0: 'batch', 1: 'seq'}},
                  )
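After the export, the ONNX files can be sanity-checked against the PyTorch features. The following is a minimal sketch, not part of the original post: it assumes onnxruntime and numpy are installed and reuses the inputs, img_model, and txt_model objects from the script above.

import numpy as np
import onnxruntime as ort

# Load the two exported encoders (assumes clip_img.onnx / clip_txt.onnx are in the working directory)
img_sess = ort.InferenceSession("clip_img.onnx")
txt_sess = ort.InferenceSession("clip_txt.onnx")

# Feed the same tensors produced by the CLIPProcessor, converted to numpy
onnx_img_feat = img_sess.run(None, {"pixel_values": inputs.pixel_values.numpy()})[0]
onnx_txt_feat = txt_sess.run(None, {"input_ids": inputs.input_ids.numpy(),
                                    "attention_mask": inputs.attention_mask.numpy()})[0]

# Compare against the PyTorch wrappers; small numerical differences are expected
with torch.no_grad():
    torch_img_feat = img_model(inputs.pixel_values).numpy()
    torch_txt_feat = txt_model(inputs.input_ids, inputs.attention_mask).numpy()

print("image feature max diff:", np.abs(onnx_img_feat - torch_img_feat).max())
print("text feature max diff:", np.abs(onnx_txt_feat - torch_txt_feat).max())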

Chinese-CLIP can be exported with the same approach as above, as sketched below.
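As a rough sketch only (not verified against the original post): the wrapper classes above can be reused with the Chinese-CLIP classes in transformers. The checkpoint name OFA-Sys/chinese-clip-vit-base-patch16, the example caption, and the output file names below are assumptions for illustration.

from PIL import Image
import torch
from transformers import ChineseCLIPProcessor, ChineseCLIPModel

# Assumed checkpoint; other Chinese-CLIP checkpoints on the Hub should work the same way
model = ChineseCLIPModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")
processor = ChineseCLIPProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")

image = Image.open("000000039769.jpg")
inputs = processor(text=["一张猫的照片"], images=image, return_tensors="pt", padding=True)

# ChineseCLIPModel also exposes get_image_features / get_text_features,
# so ImgModelWrapper / TxtModelWrapper defined above can be reused unchanged
img_model = ImgModelWrapper(model)
txt_model = TxtModelWrapper(model)

torch.onnx.export(img_model, (inputs.pixel_values,), "chinese_clip_img.onnx",
                  export_params=True, opset_version=15,
                  input_names=['pixel_values'])

torch.onnx.export(txt_model, (inputs.input_ids, inputs.attention_mask), "chinese_clip_txt.onnx",
                  export_params=True, opset_version=15,
                  input_names=['input_ids', 'attention_mask'],
                  dynamic_axes={'input_ids': {0: 'batch', 1: 'seq'},
                                'attention_mask': {0: 'batch', 1: 'seq'}})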