fb700 committed
Commit 7759fd7 · 1 Parent(s): ccc1b16

Delete chatllm.py

Files changed (1):
  1. chatllm.py +0 -160
chatllm.py DELETED
@@ -1,160 +0,0 @@
-
- import os
- from typing import Dict, List, Optional, Tuple, Union
-
- import torch
- from langchain.llms.base import LLM
- from langchain.llms.utils import enforce_stop_tokens
- from transformers import AutoModel, AutoTokenizer
-
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
- DEVICE = "cuda"
- DEVICE_ID = "0"
- CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}" if DEVICE_ID else DEVICE
-
-
- def torch_gc():
-     if torch.cuda.is_available():
-         with torch.cuda.device(CUDA_DEVICE):
-             torch.cuda.empty_cache()
-             torch.cuda.ipc_collect()
-
- def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
-     # transformer.word_embeddings takes up 1 slot
-     # transformer.final_layernorm and lm_head together take up 1 slot
-     # transformer.layers takes up 28 slots
-     # 30 slots in total, distributed across num_gpus GPUs
-     num_trans_layers = 28
-     per_gpu_layers = 30 / num_gpus
-
-     # bugfix: on Linux the weight and input passed to torch.embedding may sit on different devices, raising a RuntimeError
-     # on Windows, model.device is set to transformer.word_embeddings.device
-     # on Linux, model.device is set to lm_head.device
-     # when chat or stream_chat is called, input_ids is moved to model.device
-     # if transformer.word_embeddings.device and model.device differ, a RuntimeError follows
-     # so transformer.word_embeddings, transformer.final_layernorm and lm_head are all placed on the first GPU
-     device_map = {'transformer.word_embeddings': 0,
-                   'transformer.final_layernorm': 0, 'lm_head': 0}
-
-     used = 2
-     gpu_target = 0
-     for i in range(num_trans_layers):
-         if used >= per_gpu_layers:
-             gpu_target += 1
-             used = 0
-         assert gpu_target < num_gpus
-         device_map[f'transformer.layers.{i}'] = gpu_target
-         used += 1
-
-     return device_map
-
-
-
- class ChatLLM(LLM):
-     max_token: int = 10000
-     temperature: float = 0.1
-     top_p = 0.9
-     history = []
-     tokenizer: object = None
-     model: object = None
-
-     def __init__(self):
-         super().__init__()
-
-     @property
-     def _llm_type(self) -> str:
-         return "ChatLLM"
-
-     def _call(self,
-               prompt: str,
-               stop: Optional[List[str]] = None) -> str:
-
-         if self.model == 'Minimax':
-             import requests
-
-             group_id = os.getenv('group_id')
-             api_key = os.getenv('api_key')
-
-             url = f'https://api.minimax.chat/v1/text/chatcompletion?GroupId={group_id}'
-             headers = {
-                 "Authorization": f"Bearer {api_key}",
-                 "Content-Type": "application/json"
-             }
-             request_body = {
-                 "model": "abab5-chat",
-                 "tokens_to_generate": 512,
-                 'messages': []
-             }
-
-             for i in self.history:
-                 h_input = i[0]
-                 h_reply = i[1]
-                 request_body['messages'].append({
-                     "sender_type": "USER",
-                     "text": h_input
-                 })
-                 request_body['messages'].append({"sender_type": "BOT", "text": h_reply})
-
-             request_body['messages'].append({"sender_type": "USER", "text": prompt})
-             resp = requests.post(url, headers=headers, json=request_body)
-             response = resp.json()['reply']
-             # append this turn's AI reply to messages
-             request_body['messages'].append({"sender_type": "BOT", "text": response})
-             self.history.append((prompt, response))
-
-         else:
-
-             response, _ = self.model.chat(
-                 self.tokenizer,
-                 prompt,
-                 history=self.history,
-                 max_length=self.max_token,
-                 temperature=self.temperature,
-             )
-             torch_gc()
-             if stop is not None:
-                 response = enforce_stop_tokens(response, stop)
-             self.history = self.history + [[None, response]]
-         return response
-
-     def load_model(self,
-                    model_name_or_path: str = "fb700/chatglm-fitness-RLHF",
-                    llm_device=DEVICE,
-                    device_map: Optional[Dict[str, int]] = None,
-                    **kwargs):
-         self.tokenizer = AutoTokenizer.from_pretrained(
-             model_name_or_path,
-             trust_remote_code=True
-         )
-         if torch.cuda.is_available() and llm_device.lower().startswith("cuda"):
-             # decide between single- and multi-GPU deployment from the number of GPUs available
-             num_gpus = torch.cuda.device_count()
-             if num_gpus < 2 and device_map is None:
-                 self.model = (
-                     AutoModel.from_pretrained(
-                         model_name_or_path,
-                         trust_remote_code=True,
-                         **kwargs)
-                     .half()
-                     .quantize(8)
-                     .cuda()
-                 )
-             else:
-                 from accelerate import dispatch_model
-
-                 model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True, **kwargs).half()
-                 # a custom device_map may be passed in to control placement per GPU
-                 if device_map is None:
-                     device_map = auto_configure_device_map(num_gpus)
-
-                 self.model = dispatch_model(model, device_map=device_map)
-         else:
-             self.model = (
-                 AutoModel.from_pretrained(
-                     model_name_or_path,
-                     trust_remote_code=True)
-                 .float()
-                 .to(llm_device)
-             )
-         self.model = self.model.eval()
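
For context on the sharding logic being deleted: auto_configure_device_map treats the model as 30 "slots" (28 transformer.layers, plus one slot for transformer.word_embeddings and one shared slot for transformer.final_layernorm/lm_head) and deals them out evenly across the cards. A minimal standalone sketch, reproducing the deleted function so the resulting allocation for two GPUs can be checked; the asserts at the end are illustrative and not part of the original file:

from typing import Dict

def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
    # 28 transformer layers, plus one slot for word_embeddings and one
    # shared slot for final_layernorm/lm_head: 30 slots in total.
    num_trans_layers = 28
    per_gpu_layers = 30 / num_gpus
    # These three modules are pinned to GPU 0 so input_ids and the
    # embedding weight always share a device (the bugfix noted above).
    device_map = {'transformer.word_embeddings': 0,
                  'transformer.final_layernorm': 0, 'lm_head': 0}
    used, gpu_target = 2, 0  # GPU 0 already carries 2 of its slots
    for i in range(num_trans_layers):
        if used >= per_gpu_layers:
            gpu_target += 1
            used = 0
        assert gpu_target < num_gpus
        device_map[f'transformer.layers.{i}'] = gpu_target
        used += 1
    return device_map

# With 2 GPUs: GPU 0 gets the 3 pinned modules plus layers 0-12,
# GPU 1 gets layers 13-27, i.e. 15 of the 30 slots per card.
mapping = auto_configure_device_map(2)
assert sum(v == 0 for v in mapping.values()) == 16
assert sum(v == 1 for v in mapping.values()) == 15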
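
And for anyone pinning the parent commit ccc1b16 to keep using the class: a minimal driving sketch, assuming a CUDA machine and the langchain 0.0.x LLM interface the file was written against. The file ships no usage example, so the call pattern below is inferred from the class rather than taken from the repo:

from chatllm import ChatLLM  # the module this commit deletes

llm = ChatLLM()
# Downloads fb700/chatglm-fitness-RLHF; on a single GPU the weights are
# loaded in half precision and int8-quantized, on several GPUs they are
# sharded with accelerate.dispatch_model using the device map above.
llm.load_model(model_name_or_path="fb700/chatglm-fitness-RLHF")

# langchain's LLM base class routes plain calls through _call, so the
# object is invoked directly; replies accumulate in llm.history.
print(llm("Briefly introduce ChatGLM."))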