- from transformers import AutoModelForCausalLM
- model = AutoModelForCausalLM.from_pretrained(
- "mistralai/Mistral-7B-v0.1", device_map="auto", load_in_4bit=True
- )
- from transformers import AutoTokenizer
- tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")
- model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")
- generated_ids = model.generate(**model_inputs)
- tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
- 'A list of colors: red, blue, green, yellow, orange, purple, pink,'
- tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default
- model_inputs = tokenizer(
- ["A list of colors: red, blue", "Portugal is"], return_tensors="pt", padding=True
- ).to("cuda")
- generated_ids = model.generate(**model_inputs)
- tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
- ['A list of colors: red, blue, green, yellow, orange, purple, pink,',
- 'Portugal is a country in southwestern Europe, on the Iber']
- model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to("cuda")
- # By default, the output will contain up to 20 tokens
- generated_ids = model.generate(**model_inputs)
- tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
- 'A sequence of numbers: 1, 2, 3, 4, 5'
- # Setting `max_new_tokens` allows you to control the maximum length
- generated_ids = model.generate(**model_inputs, max_new_tokens=50)
- tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
- 'A sequence of numbers: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,'
默认情况下,除非在 `GenerationConfig` 中另行指定,`generate` 会在每一步迭代中选择概率最高的标记(即贪心解码,greedy decoding)。
- # Set seed for reproducibility -- you don't need this unless you want full reproducibility
- from transformers import set_seed
- set_seed(42)
- model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to("cuda")
- # LLM + greedy decoding = repetitive, boring output
- generated_ids = model.generate(**model_inputs)
- tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
- 'I am a cat. I am a cat. I am a cat. I am a cat'
- # With sampling, the output becomes more creative!
- generated_ids = model.generate(**model_inputs, do_sample=True)
- tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
- 'I am a cat. Specifically, I am an indoor-only cat. I'
- # A tokenizer loaded without `padding_side="left"` (unlike the one created earlier) has right-padding
- # active by default: the 1st sequence, which is shorter, has padding on the right side.
- # Generation fails to capture the logic.
- model_inputs = tokenizer(
- ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
- ).to("cuda")
- generated_ids = model.generate(**model_inputs)
- tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
- '1, 2, 33333333333'
- # With left-padding, it works as expected!
- tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")
- tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default
- model_inputs = tokenizer(
- ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
- ).to("cuda")
- generated_ids = model.generate(**model_inputs)
- tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
- '1, 2, 3, 4, 5, 6,'

- tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")
- model = AutoModelForCausalLM.from_pretrained(
- "HuggingFaceH4/zephyr-7b-alpha", device_map="auto", load_in_4bit=True
- )
- set_seed(0)
- prompt = """How many helicopters can a human eat in one sitting? Reply as a thug."""
- model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
- input_length = model_inputs.input_ids.shape[1]
- generated_ids = model.generate(**model_inputs, max_new_tokens=20)
- print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])
- "I'm not a thug, but i can tell you that a human cannot eat"
- # Oh no, it did not follow our instruction to reply as a thug! Let's see what happens when we write
- # a better prompt and use the right template for this model (through `tokenizer.apply_chat_template`)
- set_seed(0)
- messages = [
- {
- "role": "system",
- "content": "You are a friendly chatbot who always responds in the style of a thug",
- },
- {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
- ]
- model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to("cuda")
- input_length = model_inputs.shape[1]
- generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=20)
- print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])
- 'None, you thug. How bout you try to focus on more useful questions?'
- # As we can see, it followed a proper thug style

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Cpp五条/article/detail/322600

推荐阅读
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。