Commit a559e1c

Merge branch 'dev'
2 parents: c752b01 + e345eea

98 files changed: +4475 −1160 lines changed


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -20,3 +20,4 @@ llama-cpp-env/
 *-env/
 build-envs/
 portable-git/
+llm_cache/*

LlamaCPP/llama_adapter.py

Lines changed: 30 additions & 3 deletions
@@ -1,6 +1,7 @@
 import threading
 from queue import Empty, Queue
 import json
+import time
 import traceback
 from typing import Dict, List, Callable
 #from model_downloader import NotEnoughDiskSpaceException, DownloadException
@@ -80,7 +81,7 @@ def error_callback(self, ex: Exception):
         elif isinstance(ex, RuntimeError):
             self.put_msg({"type": "error", "err_type": "runtime_error"})
         else:
-            self.put_msg({"type": "error", "err_type": "unknow_exception"})
+            self.put_msg({"type": "error", "err_type": "unknown_exception"})
         print(f"exception:{str(ex)}")

     def text_conversation(self, params: LLMParams):
@@ -92,7 +93,13 @@ def text_conversation(self, params: LLMParams):
         return self.generator()


-    def stream_function(self, stream):
+    def stream_function(self, stream):
+        num_tokens = 0
+        start_time = time.time()
+        is_first = True
+        first_token_time = 0.0
+        last_token_time = 0.0
+
         for output in stream:
             if self.llm_interface.stop_generate:
                 self.llm_interface.stop_generate = False
@@ -104,6 +111,26 @@ def stream_function(self, stream):
             else:
                 # openai style
                 self.text_out_callback(output["choices"][0]["delta"].get("content",""))
+            num_tokens += 1
+
+            if is_first:
+                first_token_time = time.time()
+                is_first = False
+
+            last_token_time = time.time()
+
+        metrics_data = {
+            "type": "metrics",
+            "num_tokens": num_tokens,
+            "total_time": last_token_time - start_time,
+            "overall_tokens_per_second": num_tokens / (last_token_time - start_time),
+            "second_plus_tokens_per_second": (num_tokens - 1) / (last_token_time - first_token_time),
+            "first_token_latency": first_token_time - start_time,
+            "after_token_latency": (last_token_time - first_token_time) / (num_tokens - 1) if num_tokens > 1 else None
+        }
+
+        self.put_msg(metrics_data)
+
         self.put_msg({"type": "finish"})

     def text_conversation_run(
@@ -121,7 +148,7 @@ def text_conversation_run(
             )

             full_prompt = convert_prompt(prompt)
-            stream = self.llm_interface.create_chat_completion(full_prompt)
+            stream = self.llm_interface.create_chat_completion(full_prompt, params.max_tokens)
             self.stream_function(stream)

         except Exception as ex:
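
The metrics message added to stream_function distinguishes overall throughput, which includes the wait for the first token, from the steady-state rate measured once tokens are flowing. A minimal standalone sketch of the same arithmetic, using a hypothetical compute_stream_metrics helper that is not part of this commit:

def compute_stream_metrics(start_time: float, first_token_time: float,
                           last_token_time: float, num_tokens: int) -> dict:
    # Mirrors the metrics_data payload built in stream_function above.
    total_time = last_token_time - start_time
    decode_time = last_token_time - first_token_time
    return {
        "type": "metrics",
        "num_tokens": num_tokens,
        "total_time": total_time,
        # Includes prompt processing, i.e. the wait for the first token.
        "overall_tokens_per_second": num_tokens / total_time if total_time > 0 else None,
        # Steady-state decode rate, counted from the first token onward.
        "second_plus_tokens_per_second": (num_tokens - 1) / decode_time if num_tokens > 1 else None,
        "first_token_latency": first_token_time - start_time,
        # Average gap between consecutive tokens after the first one.
        "after_token_latency": decode_time / (num_tokens - 1) if num_tokens > 1 else None,
    }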

LlamaCPP/llama_cpp_backend.py

Lines changed: 2 additions & 1 deletion
@@ -34,9 +34,10 @@ def load_model(self, params: LLMParams, n_gpu_layers: int = -1, context_length:
             if callback is not None:
                 callback("finish")

-    def create_chat_completion(self, messages: List[Dict[str, str]]):
+    def create_chat_completion(self, messages: List[Dict[str, str]], max_tokens: int = 1024):
         completion: Iterator[CreateChatCompletionStreamResponse] = self._model.create_chat_completion(
             messages=messages,
+            max_tokens=max_tokens,
             stream=True,
         )
         return completion
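
With the new parameter, the adapter can cap the response length per request instead of relying on the llama-cpp-python default. A rough usage sketch; the LlamaCpp class name and an already constructed params object (see the llama_params.py change that follows) are assumptions, and the chunk layout is the OpenAI-style dict that stream_function already consumes:

# Hypothetical wiring; backend class name and params object are assumed for this sketch.
backend = LlamaCpp()
backend.load_model(params)

messages = [{"role": "user", "content": "Summarize llama.cpp in one sentence."}]
stream = backend.create_chat_completion(messages, max_tokens=params.max_tokens)

for chunk in stream:
    # OpenAI-style streaming chunks: text arrives under choices[0]["delta"]["content"].
    print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)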

LlamaCPP/llama_params.py

Lines changed: 7 additions & 3 deletions
@@ -1,15 +1,19 @@
-from typing import Dict, List
+from typing import Any, Dict, List

 class LLMParams:
     prompt: List[Dict[str, str]]
     device: int
     enable_rag: bool
     model_repo_id: str
+    max_tokens: int
+    generation_parameters: Dict[str, Any]

     def __init__(
-        self, prompt: list, device: int, enable_rag: bool, model_repo_id: str
+        self, prompt: list, device: int, enable_rag: bool, model_repo_id: str, max_tokens: int, **kwargs
     ) -> None:
         self.prompt = prompt
         self.device = device
         self.enable_rag = enable_rag
-        self.model_repo_id = model_repo_id
+        self.model_repo_id = model_repo_id
+        self.max_tokens = max_tokens
+        self.generation_parameters = kwargs
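
Any keyword arguments beyond the declared fields fall through **kwargs into generation_parameters, so a request payload can carry extra sampling options without breaking the constructor. An illustrative construction, assuming a JSON-style request dict; the temperature key and the model id are hypothetical:

# Illustrative request payload; "temperature" is an undeclared extra key.
request = {
    "prompt": [{"question": "What does this commit change?"}],
    "device": 0,
    "enable_rag": False,
    "model_repo_id": "example/some-gguf-model",
    "max_tokens": 256,
    "temperature": 0.7,
}

params = LLMParams(**request)
assert params.max_tokens == 256
assert params.generation_parameters == {"temperature": 0.7}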

OpenVINO/.gitignore

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+.vscode/
+__pycache__/
+models/llm/
+temp/
+test/
+dist/
+build/
+cache/
+test/
+env/
+
+!tools/*.exe
+llm_cache/
+TinyLlama-*
+laion/
+db/

OpenVINO/openvino_adapter.py

Lines changed: 202 additions & 0 deletions
@@ -0,0 +1,202 @@
+import threading
+from queue import Empty, Queue
+import json
+import time
+import traceback
+from typing import Dict, List, Callable
+from openvino_interface import LLMInterface
+from openvino_params import LLMParams
+
+RAG_PROMPT_FORMAT = "Answer the questions based on the information below. \n{context}\n\nQuestion: {prompt}"
+
+class LLM_SSE_Adapter:
+    msg_queue: Queue
+    finish: bool
+    singal: threading.Event
+    llm_interface: LLMInterface
+    should_stop: bool
+
+    def __init__(self, llm_interface: LLMInterface):
+        self.msg_queue = Queue(-1)
+        self.finish = False
+        self.singal = threading.Event()
+        self.llm_interface = llm_interface
+        self.should_stop = False
+        self.num_tokens = 0
+        self.start_time = 0
+        self.first_token_time = 0
+        self.last_token_time = 0
+        self.is_first_token = True
+
+    def put_msg(self, data):
+        self.msg_queue.put_nowait(data)
+        self.singal.set()
+
+    def load_model_callback(self, event: str):
+        data = {"type": "load_model", "event": event}
+        self.put_msg(data)
+
+    def text_in_callback(self, msg: str):
+        data = {"type": "text_in", "value": msg}
+        self.put_msg(data)
+
+    def text_out_callback(self, msg: str, type=1):
+        data = {"type": "text_out", "value": msg, "dtype": type}
+        self.put_msg(data)
+
+    def first_latency_callback(self, first_latency: str):
+        data = {"type": "first_token_latency", "value": first_latency}
+        self.put_msg(data)
+
+    def after_latency_callback(self, after_latency: str):
+        data = {"type": "after_token_latency", "value": after_latency}
+        self.put_msg(data)
+
+    def sr_latency_callback(self, sr_latency: str):
+        data = {"type": "sr_latency", "value": sr_latency}
+        self.put_msg(data)
+
+    def error_callback(self, ex: Exception):
+        if (
+            isinstance(ex, NotImplementedError)
+            and ex.__str__() == "Access to repositories lists is not implemented."
+        ):
+            self.put_msg(
+                {
+                    "type": "error",
+                    "err_type": "repositories_not_found",
+                }
+            )
+        # elif isinstance(ex, NotEnoughDiskSpaceException):
+        #     self.put_msg(
+        #         {
+        #             "type": "error",
+        #             "err_type": "not_enough_disk_space",
+        #             "need": bytes2human(ex.requires_space),
+        #             "free": bytes2human(ex.free_space),
+        #         }
+        #     )
+        # elif isinstance(ex, DownloadException):
+        #     self.put_msg({"type": "error", "err_type": "download_exception"})
+        # # elif isinstance(ex, llm_biz.StopGenerateException):
+        # #     pass
+        elif isinstance(ex, RuntimeError):
+            self.put_msg({"type": "error", "err_type": "runtime_error"})
+        else:
+            self.put_msg({"type": "error", "err_type": "unknown_exception"})
+        self.put_msg(f"exception:{str(ex)}")
+
+    def text_conversation(self, params: LLMParams):
+        thread = threading.Thread(
+            target=self.text_conversation_run,
+            args=[params],
+        )
+        thread.start()
+        return self.generator()
+
+
+    def stream_function(self, output):
+        if self.is_first_token:
+            self.first_token_time = time.time()
+            self.is_first_token = False
+
+        self.text_out_callback(output)
+        self.num_tokens += 1
+
+        if self.llm_interface.stop_generate:
+            self.put_msg("Stopping generation.")
+            return True  # Stop generation
+
+        return False
+
+
+    def text_conversation_run(
+        self,
+        params: LLMParams,
+    ):
+        try:
+            self.llm_interface.load_model(params, callback=self.load_model_callback)
+
+            # Reset metrics tracking
+            self.num_tokens = 0
+            self.start_time = time.time()
+            self.first_token_time = 0
+            self.last_token_time = 0
+            self.is_first_token = True
+
+            prompt = params.prompt
+            full_prompt = convert_prompt(prompt)
+            self.llm_interface.create_chat_completion(full_prompt, self.stream_function, params.max_tokens)
+
+            # Calculate and send metrics
+            self.last_token_time = time.time()
+            metrics_data = {
+                "type": "metrics",
+                "num_tokens": self.num_tokens,
+                "total_time": self.last_token_time - self.start_time,
+                "overall_tokens_per_second": self.num_tokens / (self.last_token_time - self.start_time) if self.num_tokens > 0 else 0,
+                "second_plus_tokens_per_second": (self.num_tokens - 1) / (self.last_token_time - self.first_token_time) if self.num_tokens > 1 else None,
+                "first_token_latency": self.first_token_time - self.start_time if self.num_tokens > 0 else None,
+                "after_token_latency": (self.last_token_time - self.first_token_time) / (self.num_tokens - 1) if self.num_tokens > 1 else None
+            }
+            self.put_msg(metrics_data)
+            self.put_msg({"type": "finish"})
+
+        except Exception as ex:
+            traceback.print_exc()
+            self.error_callback(ex)
+        finally:
+            self.llm_interface.stop_generate = False
+            self.finish = True
+            self.singal.set()
+
+    def generator(self):
+        while True:
+            while not self.msg_queue.empty():
+                try:
+                    data = self.msg_queue.get_nowait()
+                    msg = f"data:{json.dumps(data)}\0"
+                    print(msg)
+                    yield msg
+                except Empty:
+                    break
+            if not self.finish:
+                self.singal.clear()
+                self.singal.wait()
+            else:
+                break
+
+
+_default_prompt = {
+    "role": "system",
+    "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user. Please keep the output text language the same as the user input.",
+}
+
+def convert_prompt(prompt: List[Dict[str, str]]):
+    chat_history = [_default_prompt]
+    prompt_len = prompt.__len__()
+    i = 0
+    while i < prompt_len:
+        chat_history.append({"role": "user", "content": prompt[i].get("question")})
+        if i < prompt_len - 1:
+            chat_history.append(
+                {"role": "assistant", "content": prompt[i].get("answer")}
+            )
+        i = i + 1
+    return chat_history
+
+
+def process_rag(
+    prompt: str,
+    device: str,
+    text_out_callback: Callable[[str, int], None] = None,
+):
+    import rag
+    rag.to(device)
+    query_success, context, rag_source = rag.query(prompt)
+    if query_success:
+        print("rag query input\r\n{}output:\r\n{}".format(prompt, context))
+        prompt = RAG_PROMPT_FORMAT.format(prompt=prompt, context=context)
+        if text_out_callback is not None:
+            text_out_callback(rag_source, 2)
+    return prompt
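
generator() yields frames of the form data:{json}\0, so a client is expected to split the stream on the NUL terminator, decode each JSON payload, and branch on the type field (text_out, metrics, finish, error). A minimal client-side sketch; parse_llm_events is an illustrative helper, not part of the commit:

import json

def parse_llm_events(chunks):
    # Reassembles the "data:{json}\0" frames emitted by LLM_SSE_Adapter.generator().
    buffer = ""
    for chunk in chunks:
        buffer += chunk
        while "\0" in buffer:
            frame, buffer = buffer.split("\0", 1)
            if frame.startswith("data:"):
                yield json.loads(frame[len("data:"):])

# Hypothetical usage with any iterable of text chunks from the HTTP response:
# for event in parse_llm_events(response_chunks):
#     if isinstance(event, dict) and event.get("type") == "text_out":
#         print(event["value"], end="")
#     elif isinstance(event, dict) and event.get("type") == "metrics":
#         print("\n", event["overall_tokens_per_second"], "tokens/s")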

OpenVINO/openvino_backend.py

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+from typing import Dict, List, Callable
+from os import path
+from openvino_interface import LLMInterface
+import openvino_genai
+from openvino_params import LLMParams
+import openvino_model_config as model_config
+import gc
+
+class OpenVino(LLMInterface):
+    def __init__(self):
+        self._model = None
+        self.stop_generate = False
+        self._last_repo_id = None
+
+    def load_model(self, params: LLMParams, callback: Callable[[str], None] = None):
+        model_repo_id = params.model_repo_id
+        if self._model is None or self._last_repo_id != model_repo_id:
+            if callback is not None:
+                callback("start")
+            self.unload_model()
+            callback(params.model_repo_id)
+
+            model_base_path = model_config.openVINOConfig.get("openvinoLLM")
+            model_name = model_repo_id.replace("/", "---")
+            model_path = path.abspath(path.join(model_base_path, model_name))
+
+            enable_compile_cache = dict()
+            enable_compile_cache["CACHE_DIR"] = "llm_cache"
+            self._model = openvino_genai.LLMPipeline(model_path, "AUTO", **enable_compile_cache)
+            self._tokenizer = self._model.get_tokenizer()
+
+            self._last_repo_id = model_repo_id
+            if callback is not None:
+                callback("finish")
+
+
+    def create_chat_completion(self, messages: List[Dict[str, str]], streamer: Callable[[str], None], max_tokens: int = 1024):
+        config = openvino_genai.GenerationConfig()
+        config.max_new_tokens = max_tokens
+
+        full_prompt = self._tokenizer.apply_chat_template(messages, add_generation_prompt=True)
+        return self._model.generate(full_prompt, config, streamer)
+
+
+    def unload_model(self):
+        if self._model is not None:
+            del self._model
+            gc.collect()
+        self._model = None
+
+    def get_backend_type(self):
+        return "openvino"
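
create_chat_completion hands the streamer callback to openvino_genai, which invokes it with each newly decoded piece of text and stops early when the callback returns True; that is what lets LLM_SSE_Adapter.stream_function cancel generation. A rough driver sketch, assuming an already exported OpenVINO model directory, an example model id, and that OpenVINO/openvino_params.py mirrors the LlamaCPP LLMParams signature:

# Hypothetical driver for the new backend; the model id and LLMParams signature
# are assumptions for this sketch, not part of the commit.
from openvino_backend import OpenVino
from openvino_params import LLMParams

def print_streamer(token: str) -> bool:
    # Called by openvino_genai for each decoded text piece.
    print(token, end="", flush=True)
    return False  # returning True would ask the pipeline to stop generating

params = LLMParams(
    prompt=[{"question": "Explain what an NPU is."}],
    device=0,
    enable_rag=False,
    model_repo_id="OpenVINO/Phi-3-mini-4k-instruct-int4-ov",  # example id
    max_tokens=128,
)

backend = OpenVino()
backend.load_model(params)

messages = [{"role": "user", "content": "Explain what an NPU is."}]
backend.create_chat_completion(messages, print_streamer, params.max_tokens)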
