Commit a559e1c

Merge branch 'dev'
2 parents: c752b01 + e345eea

98 files changed: +4475 −1160 lines changed


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -20,3 +20,4 @@ llama-cpp-env/
 *-env/
 build-envs/
 portable-git/
+llm_cache/*

LlamaCPP/llama_adapter.py

Lines changed: 30 additions & 3 deletions
@@ -1,6 +1,7 @@
 import threading
 from queue import Empty, Queue
 import json
+import time
 import traceback
 from typing import Dict, List, Callable
 #from model_downloader import NotEnoughDiskSpaceException, DownloadException
@@ -80,7 +81,7 @@ def error_callback(self, ex: Exception):
         elif isinstance(ex, RuntimeError):
             self.put_msg({"type": "error", "err_type": "runtime_error"})
         else:
-            self.put_msg({"type": "error", "err_type": "unknow_exception"})
+            self.put_msg({"type": "error", "err_type": "unknown_exception"})
         print(f"exception:{str(ex)}")

     def text_conversation(self, params: LLMParams):
@@ -92,7 +93,13 @@ def text_conversation(self, params: LLMParams):
         return self.generator()


-    def stream_function(self, stream):
+    def stream_function(self, stream):
+        num_tokens = 0
+        start_time = time.time()
+        is_first = True
+        first_token_time = 0.0
+        last_token_time = 0.0
+
         for output in stream:
             if self.llm_interface.stop_generate:
                 self.llm_interface.stop_generate = False
@@ -104,6 +111,26 @@ def stream_function(self, stream):
             else:
                 # openai style
                 self.text_out_callback(output["choices"][0]["delta"].get("content",""))
+            num_tokens += 1
+
+            if is_first:
+                first_token_time = time.time()
+                is_first = False
+
+            last_token_time = time.time()
+
+        metrics_data = {
+            "type": "metrics",
+            "num_tokens": num_tokens,
+            "total_time": last_token_time - start_time,
+            "overall_tokens_per_second": num_tokens / (last_token_time - start_time),
+            "second_plus_tokens_per_second": (num_tokens - 1) / (last_token_time - first_token_time),
+            "first_token_latency": first_token_time - start_time,
+            "after_token_latency": (last_token_time - first_token_time) / (num_tokens - 1) if num_tokens > 1 else None
+        }
+
+        self.put_msg(metrics_data)
+
         self.put_msg({"type": "finish"})

     def text_conversation_run(
@@ -121,7 +148,7 @@ def text_conversation_run(
             )

             full_prompt = convert_prompt(prompt)
-            stream = self.llm_interface.create_chat_completion(full_prompt)
+            stream = self.llm_interface.create_chat_completion(full_prompt, params.max_tokens)
             self.stream_function(stream)

         except Exception as ex:
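
The metrics message added to stream_function distinguishes overall throughput, which includes the wait for the first token, from the steady-state rate measured once tokens are flowing. A minimal standalone sketch of the same arithmetic, using a hypothetical compute_stream_metrics helper that is not part of this commit:

def compute_stream_metrics(start_time: float, first_token_time: float,
                           last_token_time: float, num_tokens: int) -> dict:
    # Mirrors the metrics_data payload built in stream_function above.
    total_time = last_token_time - start_time
    decode_time = last_token_time - first_token_time
    return {
        "type": "metrics",
        "num_tokens": num_tokens,
        "total_time": total_time,
        # Includes prompt processing, i.e. the wait for the first token.
        "overall_tokens_per_second": num_tokens / total_time if total_time > 0 else None,
        # Steady-state decode rate, counted from the first token onward.
        "second_plus_tokens_per_second": (num_tokens - 1) / decode_time if num_tokens > 1 else None,
        "first_token_latency": first_token_time - start_time,
        # Average gap between consecutive tokens after the first one.
        "after_token_latency": decode_time / (num_tokens - 1) if num_tokens > 1 else None,
    }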

LlamaCPP/llama_cpp_backend.py

Lines changed: 2 additions & 1 deletion
@@ -34,9 +34,10 @@ def load_model(self, params: LLMParams, n_gpu_layers: int = -1, context_length:
             if callback is not None:
                 callback("finish")

-    def create_chat_completion(self, messages: List[Dict[str, str]]):
+    def create_chat_completion(self, messages: List[Dict[str, str]], max_tokens: int = 1024):
         completion: Iterator[CreateChatCompletionStreamResponse] = self._model.create_chat_completion(
             messages=messages,
+            max_tokens=max_tokens,
             stream=True,
         )
         return completion
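
With the new parameter, the adapter can cap the response length per request instead of relying on the llama-cpp-python default. A rough usage sketch; the LlamaCpp class name and an already constructed params object (see the llama_params.py change that follows) are assumptions, and the chunk layout is the OpenAI-style dict that stream_function already consumes:

# Hypothetical wiring; backend class name and params object are assumed for this sketch.
backend = LlamaCpp()
backend.load_model(params)

messages = [{"role": "user", "content": "Summarize llama.cpp in one sentence."}]
stream = backend.create_chat_completion(messages, max_tokens=params.max_tokens)

for chunk in stream:
    # OpenAI-style streaming chunks: text arrives under choices[0]["delta"]["content"].
    print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)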

LlamaCPP/llama_params.py

Lines changed: 7 additions & 3 deletions
@@ -1,15 +1,19 @@
-from typing import Dict, List
+from typing import Any, Dict, List

 class LLMParams:
     prompt: List[Dict[str, str]]
     device: int
     enable_rag: bool
     model_repo_id: str
+    max_tokens: int
+    generation_parameters: Dict[str, Any]

     def __init__(
-        self, prompt: list, device: int, enable_rag: bool, model_repo_id: str
+        self, prompt: list, device: int, enable_rag: bool, model_repo_id: str, max_tokens: int, **kwargs
     ) -> None:
         self.prompt = prompt
         self.device = device
         self.enable_rag = enable_rag
-        self.model_repo_id = model_repo_id
+        self.model_repo_id = model_repo_id
+        self.max_tokens = max_tokens
+        self.generation_parameters = kwargs
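
Any keyword arguments beyond the declared fields fall through **kwargs into generation_parameters, so a request payload can carry extra sampling options without breaking the constructor. An illustrative construction, assuming a JSON-style request dict; the temperature key and the model id are hypothetical:

# Illustrative request payload; "temperature" is an undeclared extra key.
request = {
    "prompt": [{"question": "What does this commit change?"}],
    "device": 0,
    "enable_rag": False,
    "model_repo_id": "example/some-gguf-model",
    "max_tokens": 256,
    "temperature": 0.7,
}

params = LLMParams(**request)
assert params.max_tokens == 256
assert params.generation_parameters == {"temperature": 0.7}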

OpenVINO/.gitignore

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+.vscode/
+__pycache__/
+models/llm/
+temp/
+test/
+dist/
+build/
+cache/
+test/
+env/
+
+!tools/*.exe
+llm_cache/
+TinyLlama-*
+laion/
+db/

OpenVINO/openvino_adapter.py

Lines changed: 202 additions & 0 deletions
@@ -0,0 +1,202 @@
+import threading
+from queue import Empty, Queue
+import json
+import time
+import traceback
+from typing import Dict, List, Callable
+from openvino_interface import LLMInterface
+from openvino_params import LLMParams
+
+RAG_PROMPT_FORMAT = "Answer the questions based on the information below. \n{context}\n\nQuestion: {prompt}"
+
+class LLM_SSE_Adapter:
+    msg_queue: Queue
+    finish: bool
+    singal: threading.Event
+    llm_interface: LLMInterface
+    should_stop: bool
+
+    def __init__(self, llm_interface: LLMInterface):
+        self.msg_queue = Queue(-1)
+        self.finish = False
+        self.singal = threading.Event()
+        self.llm_interface = llm_interface
+        self.should_stop = False
+        self.num_tokens = 0
+        self.start_time = 0
+        self.first_token_time = 0
+        self.last_token_time = 0
+        self.is_first_token = True
+
+    def put_msg(self, data):
+        self.msg_queue.put_nowait(data)
+        self.singal.set()
+
+    def load_model_callback(self, event: str):
+        data = {"type": "load_model", "event": event}
+        self.put_msg(data)
+
+    def text_in_callback(self, msg: str):
+        data = {"type": "text_in", "value": msg}
+        self.put_msg(data)
+
+    def text_out_callback(self, msg: str, type=1):
+        data = {"type": "text_out", "value": msg, "dtype": type}
+        self.put_msg(data)
+
+    def first_latency_callback(self, first_latency: str):
+        data = {"type": "first_token_latency", "value": first_latency}
+        self.put_msg(data)
+
+    def after_latency_callback(self, after_latency: str):
+        data = {"type": "after_token_latency", "value": after_latency}
+        self.put_msg(data)
+
+    def sr_latency_callback(self, sr_latency: str):
+        data = {"type": "sr_latency", "value": sr_latency}
+        self.put_msg(data)
+
+    def error_callback(self, ex: Exception):
+        if (
+            isinstance(ex, NotImplementedError)
+            and ex.__str__() == "Access to repositories lists is not implemented."
+        ):
+            self.put_msg(
+                {
+                    "type": "error",
+                    "err_type": "repositories_not_found",
+                }
+            )
+        # elif isinstance(ex, NotEnoughDiskSpaceException):
+        #     self.put_msg(
+        #         {
+        #             "type": "error",
+        #             "err_type": "not_enough_disk_space",
+        #             "need": bytes2human(ex.requires_space),
+        #             "free": bytes2human(ex.free_space),
+        #         }
+        #     )
+        # elif isinstance(ex, DownloadException):
+        #     self.put_msg({"type": "error", "err_type": "download_exception"})
+        # # elif isinstance(ex, llm_biz.StopGenerateException):
+        # #     pass
+        elif isinstance(ex, RuntimeError):
+            self.put_msg({"type": "error", "err_type": "runtime_error"})
+        else:
+            self.put_msg({"type": "error", "err_type": "unknown_exception"})
+        self.put_msg(f"exception:{str(ex)}")
+
+    def text_conversation(self, params: LLMParams):
+        thread = threading.Thread(
+            target=self.text_conversation_run,
+            args=[params],
+        )
+        thread.start()
+        return self.generator()
+
+
+    def stream_function(self, output):
+        if self.is_first_token:
+            self.first_token_time = time.time()
+            self.is_first_token = False
+
+        self.text_out_callback(output)
+        self.num_tokens += 1
+
+        if self.llm_interface.stop_generate:
+            self.put_msg("Stopping generation.")
+            return True  # Stop generation
+
+        return False
+
+
+    def text_conversation_run(
+        self,
+        params: LLMParams,
+    ):
+        try:
+            self.llm_interface.load_model(params, callback=self.load_model_callback)
+
+            # Reset metrics tracking
+            self.num_tokens = 0
+            self.start_time = time.time()
+            self.first_token_time = 0
+            self.last_token_time = 0
+            self.is_first_token = True
+
+            prompt = params.prompt
+            full_prompt = convert_prompt(prompt)
+            self.llm_interface.create_chat_completion(full_prompt, self.stream_function, params.max_tokens)
+
+            # Calculate and send metrics
+            self.last_token_time = time.time()
+            metrics_data = {
+                "type": "metrics",
+                "num_tokens": self.num_tokens,
+                "total_time": self.last_token_time - self.start_time,
+                "overall_tokens_per_second": self.num_tokens / (self.last_token_time - self.start_time) if self.num_tokens > 0 else 0,
+                "second_plus_tokens_per_second": (self.num_tokens - 1) / (self.last_token_time - self.first_token_time) if self.num_tokens > 1 else None,
+                "first_token_latency": self.first_token_time - self.start_time if self.num_tokens > 0 else None,
+                "after_token_latency": (self.last_token_time - self.first_token_time) / (self.num_tokens - 1) if self.num_tokens > 1 else None
+            }
+            self.put_msg(metrics_data)
+            self.put_msg({"type": "finish"})
+
+        except Exception as ex:
+            traceback.print_exc()
+            self.error_callback(ex)
+        finally:
+            self.llm_interface.stop_generate = False
+            self.finish = True
+            self.singal.set()
+
+    def generator(self):
+        while True:
+            while not self.msg_queue.empty():
+                try:
+                    data = self.msg_queue.get_nowait()
+                    msg = f"data:{json.dumps(data)}\0"
+                    print(msg)
+                    yield msg
+                except Empty:
+                    break
+            if not self.finish:
+                self.singal.clear()
+                self.singal.wait()
+            else:
+                break
+
+
+_default_prompt = {
+    "role": "system",
+    "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user. Please keep the output text language the same as the user input.",
+}
+
+def convert_prompt(prompt: List[Dict[str, str]]):
+    chat_history = [_default_prompt]
+    prompt_len = prompt.__len__()
+    i = 0
+    while i < prompt_len:
+        chat_history.append({"role": "user", "content": prompt[i].get("question")})
+        if i < prompt_len - 1:
+            chat_history.append(
+                {"role": "assistant", "content": prompt[i].get("answer")}
+            )
+        i = i + 1
+    return chat_history
+
+
+def process_rag(
+    prompt: str,
+    device: str,
+    text_out_callback: Callable[[str, int], None] = None,
+):
+    import rag
+    rag.to(device)
+    query_success, context, rag_source = rag.query(prompt)
+    if query_success:
+        print("rag query input\r\n{}output:\r\n{}".format(prompt, context))
+        prompt = RAG_PROMPT_FORMAT.format(prompt=prompt, context=context)
+        if text_out_callback is not None:
+            text_out_callback(rag_source, 2)
+    return prompt
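
generator() yields frames of the form data:{json}\0, so a client is expected to split the stream on the NUL terminator, decode each JSON payload, and branch on the type field (text_out, metrics, finish, error). A minimal client-side sketch; parse_llm_events is an illustrative helper, not part of the commit:

import json

def parse_llm_events(chunks):
    # Reassembles the "data:{json}\0" frames emitted by LLM_SSE_Adapter.generator().
    buffer = ""
    for chunk in chunks:
        buffer += chunk
        while "\0" in buffer:
            frame, buffer = buffer.split("\0", 1)
            if frame.startswith("data:"):
                yield json.loads(frame[len("data:"):])

# Hypothetical usage with any iterable of text chunks from the HTTP response:
# for event in parse_llm_events(response_chunks):
#     if isinstance(event, dict) and event.get("type") == "text_out":
#         print(event["value"], end="")
#     elif isinstance(event, dict) and event.get("type") == "metrics":
#         print("\n", event["overall_tokens_per_second"], "tokens/s")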

OpenVINO/openvino_backend.py

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
+from typing import Dict, List, Callable
+from os import path
+from openvino_interface import LLMInterface
+import openvino_genai
+from openvino_params import LLMParams
+import openvino_model_config as model_config
+import gc
+
+class OpenVino(LLMInterface):
+    def __init__(self):
+        self._model = None
+        self.stop_generate = False
+        self._last_repo_id = None
+
+    def load_model(self, params: LLMParams, callback: Callable[[str], None] = None):
+        model_repo_id = params.model_repo_id
+        if self._model is None or self._last_repo_id != model_repo_id:
+            if callback is not None:
+                callback("start")
+            self.unload_model()
+            callback(params.model_repo_id)
+
+            model_base_path = model_config.openVINOConfig.get("openvinoLLM")
+            model_name = model_repo_id.replace("/", "---")
+            model_path = path.abspath(path.join(model_base_path, model_name))
+
+            enable_compile_cache = dict()
+            enable_compile_cache["CACHE_DIR"] = "llm_cache"
+            self._model = openvino_genai.LLMPipeline(model_path, "AUTO", **enable_compile_cache)
+            self._tokenizer = self._model.get_tokenizer()
+
+            self._last_repo_id = model_repo_id
+            if callback is not None:
+                callback("finish")
+
+
+    def create_chat_completion(self, messages: List[Dict[str, str]], streamer: Callable[[str], None], max_tokens: int = 1024):
+        config = openvino_genai.GenerationConfig()
+        config.max_new_tokens = max_tokens
+
+        full_prompt = self._tokenizer.apply_chat_template(messages, add_generation_prompt=True)
+        return self._model.generate(full_prompt, config, streamer)
+
+
+    def unload_model(self):
+        if self._model is not None:
+            del self._model
+            gc.collect()
+        self._model = None
+
+    def get_backend_type(self):
+        return "openvino"
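
create_chat_completion hands the streamer callback to openvino_genai, which invokes it with each newly decoded piece of text and stops early when the callback returns True; that is what lets LLM_SSE_Adapter.stream_function cancel generation. A rough driver sketch, assuming an already exported OpenVINO model directory, an example model id, and that OpenVINO/openvino_params.py mirrors the LlamaCPP LLMParams signature:

# Hypothetical driver for the new backend; the model id and LLMParams signature
# are assumptions for this sketch, not part of the commit.
from openvino_backend import OpenVino
from openvino_params import LLMParams

def print_streamer(token: str) -> bool:
    # Called by openvino_genai for each decoded text piece.
    print(token, end="", flush=True)
    return False  # returning True would ask the pipeline to stop generating

params = LLMParams(
    prompt=[{"question": "Explain what an NPU is."}],
    device=0,
    enable_rag=False,
    model_repo_id="OpenVINO/Phi-3-mini-4k-instruct-int4-ov",  # example id
    max_tokens=128,
)

backend = OpenVino()
backend.load_model(params)

messages = [{"role": "user", "content": "Explain what an NPU is."}]
backend.create_chat_completion(messages, print_streamer, params.max_tokens)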
