Download

Download the third-party copy of the weights from Hugging Face: https://huggingface.co/NousResearch/Llama-2-7b-hf (this copy is already in the Hugging Face format).

Alternatively, download from a mirror site.
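
One possible way to fetch the weights (a sketch, assuming the huggingface_hub CLI is installed; the HF_ENDPOINT line is only needed when going through a mirror such as hf-mirror.com, and the local directory is a placeholder):

export HF_ENDPOINT=https://hf-mirror.com   # optional: route requests through a mirror
huggingface-cli download NousResearch/Llama-2-7b-hf --local-dir ./Llama-2-7b-hf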

Run

Figuring out the download alone took quite a while, so here is an updated record of getting the model to run.

Run the model with the scripts/inference/inference_hf.py script from the Chinese-LLaMA-Alpaca project (a Chinese LLaMA/Alpaca model repo). Make sure the dependencies are installed; if something goes wrong, the dependency versions may not match requirements.txt.
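
For example, inside a clone of the Chinese-LLaMA-Alpaca repository the dependencies can be installed with:

pip install -r requirements.txt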

Run the command:

python /home/jc/workspace/exp/test_llama.py \
--base_model /home/jc/workspace/llama/Llama-2-7b-hf \
--with_prompt \
--interactive

Here --base_model is the path of the folder downloaded in the download step above.

With that, the first run works. At this point the script is in interactive mode, i.e. a turn-by-turn question-and-answer mode similar to the ChatGPT web UI.

The other mode is batch mode, enabled by simply dropping --interactive: the data is fed to the model in bulk and the results are written to a file.
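
As an illustration, a batch run could look like the following (a sketch: --data_file and --predictions_file mirror the options of inference_hf.py that also appear in settings.py further below, and both file paths are placeholders):

python /home/jc/workspace/exp/test_llama.py \
--base_model /home/jc/workspace/llama/Llama-2-7b-hf \
--with_prompt \
--data_file ./instructions.txt \
--predictions_file ./predictions.json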

Attached below is a run script I modified myself, implemented as a class:

ModelInterface.py

import json
from shutil import copyfile

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# os, torch, GenerationConfig, model_path, PROMPT_TEMPLATE, parse_args and
# apply_settings all come from settings.py via the star import below.
from settings import *


def gen_prompt(instruction, input=None, template=PROMPT_TEMPLATE):
    """Generate the prompt before feeding inputs into the LLM."""
    if input:
        instruction = instruction + '\n' + input
    return template.format_map({'instruction': instruction})


class ModelInterface:
    def __init__(self):
        self.args = None
        self.generation_config = None
        self.model_path = None
        self.model = None
        self.tokenizer = None
        self.device = None
        self.setup()

    def setup(self):
        self.args = parse_args()
        self.device = apply_settings(args=self.args)
        self.model_path = model_path[self.args.model_name]

        self.generation_config = GenerationConfig(
            temperature=self.args.temperature,
            repetition_penalty=self.args.repetition_penalty,
            do_sample=self.args.do_sample,
            num_beams=self.args.num_beams,
            top_p=self.args.top_p,
            top_k=self.args.top_k,
            max_new_tokens=self.args.max_new_tokens,
        )
        # if not self.args.eval_only:
        self.load_model()

    def load_model(self):
        # bnb_config = BitsAndBytesConfig(
        #     load_in_4bit=True,  # dynamic 4-bit quantization greatly reduces memory usage
        #     # bnb_4bit_use_double_quant=True,
        #     # bnb_4bit_quant_type="nf4",
        #     # bnb_4bit_compute_dtype=torch.bfloat16
        # )
        tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        base_model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            device_map='auto',  # dispatch the model onto the available GPU(s)
            load_in_4bit=True
            # quantization_config=bnb_config
        )

        # Compare the vocabulary sizes of the model and the tokenizer
        model_vocab_size = base_model.get_input_embeddings().weight.size(0)
        tokenizer_vocab_size = len(tokenizer)
        print(f">>> Model name: {self.args.model_name}")
        print(f">>> Vocab of the base model: {model_vocab_size}")
        print(f">>> Vocab of the tokenizer: {tokenizer_vocab_size}")
        if model_vocab_size != tokenizer_vocab_size:
            # The tokenizer vocab must not be smaller than the model vocab,
            # otherwise some tokens would have no embedding.
            assert tokenizer_vocab_size > model_vocab_size
            print(">>> Resize model embeddings to fit tokenizer")
            base_model.resize_token_embeddings(tokenizer_vocab_size)

        # Merge the fine-tuned LoRA weights, if any
        if self.args.lora_model is not None:
            print(">>> Loading peft model")
            load_type = torch.float16
            model = PeftModel.from_pretrained(
                base_model,
                self.args.lora_model,
                torch_dtype=load_type,
                device_map='auto',
            )
        else:
            model = base_model

        if self.device == torch.device('cpu'):
            model.float()  # cast to float32 so the model can run on CPU

        model.eval()  # switch to evaluation mode
        print(">>> Finish loading model and tokenizer")

        self.model, self.tokenizer = model, tokenizer

    def run_iter(self):
        """Run the LLM interactively."""
        print(">>> Running LLM iteratively")
        with torch.no_grad():
            while True:
                raw_input_text = input(">>> Input: ")
                if len(raw_input_text.strip()) == 0:
                    break

                if self.args.with_prompt:
                    input_text = gen_prompt(instruction=raw_input_text)
                else:
                    input_text = raw_input_text

                model_inputs = self.tokenizer(input_text, return_tensors="pt")
                generated_ids = self.model.generate(
                    input_ids=model_inputs["input_ids"].to(self.device),
                    attention_mask=model_inputs['attention_mask'].to(self.device),
                    eos_token_id=self.tokenizer.eos_token_id,
                    pad_token_id=self.tokenizer.pad_token_id,
                    generation_config=self.generation_config
                )[0]
                output = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
                if self.args.with_prompt:
                    response = output.split("### Response:")[1].strip()
                else:
                    response = output
                print(">>> Response: ", response, "\n")

    def run_batch(self, data, file_out):
        print(">>> Feeding data to LLM in a batch")
        results = []
        with torch.no_grad():
            for index, example in enumerate(data):
                if self.args.with_prompt is True:
                    input_text = gen_prompt(instruction=example)
                else:
                    input_text = example
                inputs = self.tokenizer(input_text, return_tensors="pt")  # add_special_tokens=False ?
                # print(len(inputs["input_ids"][0]))
                generated_ids = self.model.generate(
                    input_ids=inputs["input_ids"].to(self.device),
                    attention_mask=inputs['attention_mask'].to(self.device),
                    # eos_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=2,
                    # pad_token_id=self.tokenizer.pad_token_id,
                    pad_token_id=2,
                    generation_config=self.generation_config
                )[0]
                output = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
                if self.args.with_prompt:
                    response = output.split("### Response:")[1].strip()
                else:
                    response = output
                print(f"======={index}=======")
                print(f"Input: \n{example}\n")
                print(f"Output: \n{response}\n")

                results.append({"Input": input_text, "Output": response})

        if file_out:
            # Back up any existing output file before overwriting it
            if os.path.exists(file_out):
                copyfile(file_out, file_out + ".bak")
            with open(file_out, 'w') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)
        else:
            return results


if __name__ == '__main__':
    mi = ModelInterface()  # setup() and model loading already run inside __init__

    data = [
        "Please tell me a joke.",
        "Please ...",
    ]
    file_out = "test.json"
    mi.run_batch(data, file_out)
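
Assuming the "your_path" placeholders in settings.py (shown next) have been filled in, the class-based script is launched much like the official one, for example:

python ModelInterface.py --model_name llama-7b --with_prompt

The __main__ block above runs the batch example; replace the last line with mi.run_iter() to get the interactive question-and-answer loop instead.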

settings.py

import os
import sys
import torch
import argparse

from transformers import GenerationConfig, BitsAndBytesConfig

# Paths to the downloaded (HF-format) model folders; fill these in.
model_path = {
    'llama-7b': "your_path",
    'llama-13b': "your_path",
}
PROMPT_TEMPLATE = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n\n{instruction}\n\n### Response:\n\n"
)

def parse_args():
    parser = argparse.ArgumentParser()

    # For model / setup
    parser.add_argument('--model_name', default="llama-7b", type=str, help="Must be a key of model_path above")
    # parser.add_argument('--base_model', default=LLM_PATH, type=str)
    parser.add_argument('--lora_model', default=None, type=str, help="If None, perform inference on the base model")
    # parser.add_argument('--tokenizer_path', default=TOKENIZER_PATH, type=str)
    # parser.add_argument('--data_file', default=None, type=str, help="A file that contains instructions (one instruction per line)")
    parser.add_argument('--out_dir', default="~/log", type=str, help="Output directory of test log files")
    parser.add_argument('--interactive', default=True, action='store_true', help="Run in the instruction mode (single-turn)")
    parser.add_argument('--predictions_file', default='./predictions.json', type=str)
    parser.add_argument('--gpus', default="0", type=str)
    parser.add_argument('--only_cpu', action='store_true', help="Only use CPU for inference")
    parser.add_argument('--alpha', type=str, default="1.0", help="The scaling factor of the NTK method; can be a float or 'auto'")
    parser.add_argument('--load_in_8bit', action='store_true', help="Load the LLM in 8-bit mode")

    # For prompt
    parser.add_argument('--with_prompt', default=False, action='store_true', help="Wrap the input with the text prompt template automatically")

    # Generation config -- all defaults
    parser.add_argument('--temperature', default=1.0, type=float)
    parser.add_argument('--top_k', default=50, type=int)
    parser.add_argument('--top_p', default=1, type=float)
    parser.add_argument('--do_sample', default=False, action='store_true')
    parser.add_argument('--num_beams', default=1, type=int)
    parser.add_argument('--repetition_penalty', default=1.0, type=float)
    parser.add_argument('--max_new_tokens', default=300, type=int)

    args = parser.parse_args()
    return args

def apply_settings(args):
    if args.only_cpu is True:
        args.gpus = ""
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus

    if torch.cuda.is_available():
        device = torch.device(0)
    else:
        device = torch.device('cpu')

    return device

# apply_attention_patch(use_memory_efficient_attention=True)
# apply_ntk_scaling_patch(args.alpha)
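
To see exactly what gets fed to the model when --with_prompt is set, and why the scripts recover the answer by splitting on "### Response:", the template can be expanded by hand (a small sketch using only PROMPT_TEMPLATE from settings.py):

from settings import PROMPT_TEMPLATE

# Wrap a single instruction in the Alpaca-style template, exactly as gen_prompt() does
print(PROMPT_TEMPLATE.format_map({'instruction': "Please tell me a joke."}))
# Below is an instruction that describes a task. Write a response that appropriately completes the request.
#
# ### Instruction:
#
# Please tell me a joke.
#
# ### Response:
#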
