Download

Download the third-party copy of the weights from Hugging Face: https://huggingface.co/NousResearch/Llama-2-7b-hf (this copy is already in the Hugging Face format).

Alternatively, download from a mirror site.
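
One possible way to fetch the weights (a sketch, assuming the huggingface_hub CLI is installed; the HF_ENDPOINT line is only needed when going through a mirror such as hf-mirror.com, and the local directory is a placeholder):

export HF_ENDPOINT=https://hf-mirror.com   # optional: route requests through a mirror
huggingface-cli download NousResearch/Llama-2-7b-hf --local-dir ./Llama-2-7b-hf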

Run

Figuring out the download alone took quite a while, so here is an updated record of getting the model to run.

Run the model with the scripts/inference/inference_hf.py script from the Chinese-LLaMA-Alpaca project (a Chinese LLaMA/Alpaca model repo). Make sure the dependencies are installed; if something goes wrong, the dependency versions may not match requirements.txt.
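
For example, inside a clone of the Chinese-LLaMA-Alpaca repository the dependencies can be installed with:

pip install -r requirements.txt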

Run the command:

python /home/jc/workspace/exp/test_llama.py \
--base_model /home/jc/workspace/llama/Llama-2-7b-hf \
--with_prompt \
--interactive

Here --base_model is the path of the folder downloaded in the download step above.

With that, the first run works. At this point the script is in interactive mode, i.e. a turn-by-turn question-and-answer mode similar to the ChatGPT web UI.

The other mode is batch mode, enabled by simply dropping --interactive: the data is fed to the model in bulk and the results are written to a file.
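
As an illustration, a batch run could look like the following (a sketch: --data_file and --predictions_file mirror the options of inference_hf.py that also appear in settings.py further below, and both file paths are placeholders):

python /home/jc/workspace/exp/test_llama.py \
--base_model /home/jc/workspace/llama/Llama-2-7b-hf \
--with_prompt \
--data_file ./instructions.txt \
--predictions_file ./predictions.json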

Attached below is a run script I modified myself, implemented as a class:

ModelInterface.py

import json
from shutil import copyfile

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# os, torch, GenerationConfig, model_path, PROMPT_TEMPLATE, parse_args and
# apply_settings all come from settings.py via the star import below.
from settings import *


def gen_prompt(instruction, input=None, template=PROMPT_TEMPLATE):
    """Generate the prompt before feeding inputs into the LLM."""
    if input:
        instruction = instruction + '\n' + input
    return template.format_map({'instruction': instruction})


class ModelInterface:
    def __init__(self):
        self.args = None
        self.generation_config = None
        self.model_path = None
        self.model = None
        self.tokenizer = None
        self.device = None
        self.setup()

    def setup(self):
        self.args = parse_args()
        self.device = apply_settings(args=self.args)
        self.model_path = model_path[self.args.model_name]

        self.generation_config = GenerationConfig(
            temperature=self.args.temperature,
            repetition_penalty=self.args.repetition_penalty,
            do_sample=self.args.do_sample,
            num_beams=self.args.num_beams,
            top_p=self.args.top_p,
            top_k=self.args.top_k,
            max_new_tokens=self.args.max_new_tokens,
        )
        # if not self.args.eval_only:
        self.load_model()

    def load_model(self):
        # bnb_config = BitsAndBytesConfig(
        #     load_in_4bit=True,  # dynamic 4-bit quantization greatly reduces memory usage
        #     # bnb_4bit_use_double_quant=True,
        #     # bnb_4bit_quant_type="nf4",
        #     # bnb_4bit_compute_dtype=torch.bfloat16
        # )
        tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        base_model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            device_map='auto',  # dispatch the model onto the available GPU(s)
            load_in_4bit=True
            # quantization_config=bnb_config
        )

        # Compare the vocabulary sizes of the model and the tokenizer
        model_vocab_size = base_model.get_input_embeddings().weight.size(0)
        tokenizer_vocab_size = len(tokenizer)
        print(f">>> Model name: {self.args.model_name}")
        print(f">>> Vocab of the base model: {model_vocab_size}")
        print(f">>> Vocab of the tokenizer: {tokenizer_vocab_size}")
        if model_vocab_size != tokenizer_vocab_size:
            # The tokenizer vocab must not be smaller than the model vocab,
            # otherwise some tokens would have no embedding.
            assert tokenizer_vocab_size > model_vocab_size
            print(">>> Resize model embeddings to fit tokenizer")
            base_model.resize_token_embeddings(tokenizer_vocab_size)

        # Merge the fine-tuned LoRA weights, if any
        if self.args.lora_model is not None:
            print(">>> Loading peft model")
            load_type = torch.float16
            model = PeftModel.from_pretrained(
                base_model,
                self.args.lora_model,
                torch_dtype=load_type,
                device_map='auto',
            )
        else:
            model = base_model

        if self.device == torch.device('cpu'):
            model.float()  # cast to float32 so the model can run on CPU

        model.eval()  # switch to evaluation mode
        print(">>> Finish loading model and tokenizer")

        self.model, self.tokenizer = model, tokenizer

    def run_iter(self):
        """Run the LLM interactively."""
        print(">>> Running LLM iteratively")
        with torch.no_grad():
            while True:
                raw_input_text = input(">>> Input: ")
                if len(raw_input_text.strip()) == 0:
                    break

                if self.args.with_prompt:
                    input_text = gen_prompt(instruction=raw_input_text)
                else:
                    input_text = raw_input_text

                model_inputs = self.tokenizer(input_text, return_tensors="pt")
                generated_ids = self.model.generate(
                    input_ids=model_inputs["input_ids"].to(self.device),
                    attention_mask=model_inputs['attention_mask'].to(self.device),
                    eos_token_id=self.tokenizer.eos_token_id,
                    pad_token_id=self.tokenizer.pad_token_id,
                    generation_config=self.generation_config
                )[0]
                output = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
                if self.args.with_prompt:
                    response = output.split("### Response:")[1].strip()
                else:
                    response = output
                print(">>> Response: ", response, "\n")

    def run_batch(self, data, file_out):
        print(">>> Feeding data to LLM in a batch")
        results = []
        with torch.no_grad():
            for index, example in enumerate(data):
                if self.args.with_prompt is True:
                    input_text = gen_prompt(instruction=example)
                else:
                    input_text = example
                inputs = self.tokenizer(input_text, return_tensors="pt")  # add_special_tokens=False ?
                # print(len(inputs["input_ids"][0]))
                generated_ids = self.model.generate(
                    input_ids=inputs["input_ids"].to(self.device),
                    attention_mask=inputs['attention_mask'].to(self.device),
                    # eos_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=2,
                    # pad_token_id=self.tokenizer.pad_token_id,
                    pad_token_id=2,
                    generation_config=self.generation_config
                )[0]
                output = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
                if self.args.with_prompt:
                    response = output.split("### Response:")[1].strip()
                else:
                    response = output
                print(f"======={index}=======")
                print(f"Input: \n{example}\n")
                print(f"Output: \n{response}\n")

                results.append({"Input": input_text, "Output": response})

        if file_out:
            # Back up any existing output file before overwriting it
            if os.path.exists(file_out):
                copyfile(file_out, file_out + ".bak")
            with open(file_out, 'w') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)
        else:
            return results


if __name__ == '__main__':
    mi = ModelInterface()  # setup() and model loading already run inside __init__

    data = [
        "Please tell me a joke.",
        "Please ...",
    ]
    file_out = "test.json"
    mi.run_batch(data, file_out)
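
Assuming the "your_path" placeholders in settings.py (shown next) have been filled in, the class-based script is launched much like the official one, for example:

python ModelInterface.py --model_name llama-7b --with_prompt

The __main__ block above runs the batch example; replace the last line with mi.run_iter() to get the interactive question-and-answer loop instead.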

settings.py

import os
import sys
import torch
import argparse

from transformers import GenerationConfig, BitsAndBytesConfig

# Paths to the downloaded (HF-format) model folders; fill these in.
model_path = {
    'llama-7b': "your_path",
    'llama-13b': "your_path",
}
PROMPT_TEMPLATE = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n\n{instruction}\n\n### Response:\n\n"
)

def parse_args():
    parser = argparse.ArgumentParser()

    # For model / setup
    parser.add_argument('--model_name', default="llama-7b", type=str, help="Must be a key of model_path above")
    # parser.add_argument('--base_model', default=LLM_PATH, type=str)
    parser.add_argument('--lora_model', default=None, type=str, help="If None, perform inference on the base model")
    # parser.add_argument('--tokenizer_path', default=TOKENIZER_PATH, type=str)
    # parser.add_argument('--data_file', default=None, type=str, help="A file that contains instructions (one instruction per line)")
    parser.add_argument('--out_dir', default="~/log", type=str, help="Output directory of test log files")
    parser.add_argument('--interactive', default=True, action='store_true', help="Run in the instruction mode (single-turn)")
    parser.add_argument('--predictions_file', default='./predictions.json', type=str)
    parser.add_argument('--gpus', default="0", type=str)
    parser.add_argument('--only_cpu', action='store_true', help="Only use CPU for inference")
    parser.add_argument('--alpha', type=str, default="1.0", help="The scaling factor of the NTK method; can be a float or 'auto'")
    parser.add_argument('--load_in_8bit', action='store_true', help="Load the LLM in 8-bit mode")

    # For prompt
    parser.add_argument('--with_prompt', default=False, action='store_true', help="Wrap the input with the text prompt template automatically")

    # Generation config -- all defaults
    parser.add_argument('--temperature', default=1.0, type=float)
    parser.add_argument('--top_k', default=50, type=int)
    parser.add_argument('--top_p', default=1, type=float)
    parser.add_argument('--do_sample', default=False, action='store_true')
    parser.add_argument('--num_beams', default=1, type=int)
    parser.add_argument('--repetition_penalty', default=1.0, type=float)
    parser.add_argument('--max_new_tokens', default=300, type=int)

    args = parser.parse_args()
    return args

def apply_settings(args):
    if args.only_cpu is True:
        args.gpus = ""
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus

    if torch.cuda.is_available():
        device = torch.device(0)
    else:
        device = torch.device('cpu')

    return device

# apply_attention_patch(use_memory_efficient_attention=True)
# apply_ntk_scaling_patch(args.alpha)
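
To see exactly what gets fed to the model when --with_prompt is set, and why the scripts recover the answer by splitting on "### Response:", the template can be expanded by hand (a small sketch using only PROMPT_TEMPLATE from settings.py):

from settings import PROMPT_TEMPLATE

# Wrap a single instruction in the Alpaca-style template, exactly as gen_prompt() does
print(PROMPT_TEMPLATE.format_map({'instruction': "Please tell me a joke."}))
# Below is an instruction that describes a task. Write a response that appropriately completes the request.
#
# ### Instruction:
#
# Please tell me a joke.
#
# ### Response:
#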
