Describe the bug
In our application we need to switch models at runtime due to resource constraints. When we compared model loading time in OVMS against native OpenVINO, we found the overhead to be quite heavy.
To Reproduce
Steps to reproduce the behavior:
- Update the testing script below with your OVMS root directory and the model files exported for OVMS:
import gc
import itertools
import json
import time
from pathlib import Path

import httpx
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer


def ov_load_model(model_path: Path):
    # Load and compile the model directly through optimum-intel / native OpenVINO.
    ov_config = {"INFERENCE_PRECISION_HINT": "f16"}
    model = OVModelForCausalLM.from_pretrained(model_path, compile=True, compile_only=True, device="GPU", ov_config=ov_config)
    model.compile()
    tok = AutoTokenizer.from_pretrained(model_path)
    return model, tok


class OVMS:
    _ovms_process = None
    port = 9091
    rest_port = 8081
    model_config = Path("model_config.json")
    client = httpx.Client(trust_env=False)

    @classmethod
    def prompt_to_start_ovms(cls, ovms_root: Path):
        ovms_binary_path = ovms_root / "ovms.exe"
        assert ovms_binary_path.exists()
        # setupvars.bat sets the environment variables needed by ovms.exe
        cmd_start_server = f'"{ovms_root / "setupvars.bat"}" && "{ovms_binary_path}" --port {cls.port} --rest_port {cls.rest_port} --config_path "{cls.model_config.absolute()}" --file_system_poll_wait_seconds 0'
        print(cmd_start_server)
        input(f"Use command to start OVMS server: {cmd_start_server}\nPress Enter to continue...")

    @classmethod
    def update_ovms_config_file(cls, model_path: Path):
        # Rewrite the config file so that only the requested model is served.
        config_path = cls.model_config
        if not config_path.exists():
            raise FileNotFoundError(f"OVMS config file {config_path} does not exist.")
        config_data = {
            "mediapipe_config_list": [{
                "name": model_path.name,
                "base_path": str(model_path)
            }],
            "model_config_list": []
        }
        with config_path.open("w", encoding="utf-8") as f:
            json.dump(config_data, f, indent=4, ensure_ascii=False)

    @classmethod
    def wait_until_model_status(cls, model_name: str, wait_ready=True):
        # Poll the readiness endpoint until the model is ready (or unloaded).
        url = f"http://localhost:{cls.rest_port}/v2/models/{model_name}/ready"
        while True:
            try:
                response = cls.client.get(url)
                if wait_ready and response.status_code == 200:
                    return
                elif not wait_ready and response.status_code == 404:
                    return
            except httpx.RequestError as e:
                print(f"Request error: {e}, retrying...")
            time.sleep(0.1)

    @classmethod
    def reload_ovms_config(cls):
        url = f"http://localhost:{cls.rest_port}/v1/config/reload"
        response = cls.client.post(url, timeout=60)
        if response.status_code != 200:
            raise RuntimeError(f"Failed to reload OVMS config: {response.text}")


def ovms_load_model(model_path: Path):
    OVMS.update_ovms_config_file(model_path)
    OVMS.reload_ovms_config()


class TimerContext:
    def __init__(self, name: str):
        self.name = name
        self.start_time = None

    def __enter__(self):
        self.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        elapsed_time = time.time() - self.start_time
        print(f"{self.name} took {elapsed_time:.2f} seconds")


def test_ovms_load_model(ovms_root: Path, model_a: Path, model_b: Path, iterations=10):
    OVMS.update_ovms_config_file(model_b)
    OVMS.prompt_to_start_ovms(ovms_root)
    OVMS.wait_until_model_status(model_b.name, wait_ready=True)
    for i, model_path in itertools.product(range(iterations), (model_a, model_b)):
        with TimerContext(f"Loading model {model_path.name:40}"):
            ovms_load_model(model_path)
        time.sleep(1)  # wait a bit before the next reload unloads the current model


def test_ov_load_model(model_a: Path, model_b: Path, iterations=10):
    with TimerContext(f"Initialize by loading model {model_b.name:40}"):
        model, tok = ov_load_model(model_b)
    for i, model_path in itertools.product(range(iterations), (model_a, model_b)):
        with TimerContext(f"Loading model {model_path.name:40}"):
            del model, tok
            gc.collect()
            model, tok = ov_load_model(model_path)


if __name__ == "__main__":
    ovms_root = Path(r"c:\workspace\ovms")
    model_a = Path(r"C:\workspace\local_models\Qwen2.5-7B-Instruct-int4-ov")
    model_b = Path(r"C:\workspace\local_models\Phi-3-mini-4k-instruct-int4-ov")
    if not model_a.exists() or not model_b.exists():
        raise FileNotFoundError("Model paths do not exist.")
    print("Testing OVMS model loading...")
    test_ovms_load_model(ovms_root, model_a, model_b, 5)
    input("Terminate ovms process and press Enter to continue...")
    print("Testing OpenVINO model loading...")
    test_ov_load_model(model_a, model_b, 5)
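For reference, update_ovms_config_file rewrites model_config.json so that only the model being loaded is listed; for model_a above the generated file should look roughly like this (output of json.dump with indent=4):

{
    "mediapipe_config_list": [
        {
            "name": "Qwen2.5-7B-Instruct-int4-ov",
            "base_path": "C:\\workspace\\local_models\\Qwen2.5-7B-Instruct-int4-ov"
        }
    ],
    "model_config_list": []
}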
- Run the test script in a command prompt and follow the printed command to start (and later stop) OVMS.
- Sample output:
Testing OVMS model loading...
"C:\workspace\qually\applications.ai.superbuilder\build\Release\pyllmsrv\ovms\setupvars.bat" && "C:\workspace\qually\applications.ai.superbuilder\build\Release\pyllmsrv\ovms\ovms.exe" --port 9091 --rest_port 8081 --config_path "C:\workspace\qually\applications.ai.superbuilder\pybackend\model_config.json" --file_system_poll_wait_seconds 0
Use command to start OVMS server: "C:\workspace\qually\applications.ai.superbuilder\build\Release\pyllmsrv\ovms\setupvars.bat" && "C:\workspace\qually\applications.ai.superbuilder\build\Release\pyllmsrv\ovms\ovms.exe" --port 9091 --rest_port 8081 --config_path "C:\workspace\qually\applications.ai.superbuilder\pybackend\model_config.json" --file_system_poll_wait_seconds 0
Press Enter to continue...
Loading model Qwen2.5-7B-Instruct-int4-ov took 6.59 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 4.49 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 5.99 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 3.98 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 6.19 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 3.95 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 6.13 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 3.95 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 6.10 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 4.15 seconds
Terminate ovms process and press Enter to continue...
Testing OpenVINO model loading...
Initialize by loading model Phi-3-mini-4k-instruct-int4-ov took 3.54 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 6.56 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 1.66 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 3.54 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 1.65 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 3.52 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 1.64 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 3.46 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 1.63 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 3.46 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 1.65 seconds
The overhead is quite heavy: roughly 3.5 s -> 6.1 s for the 7B model and 1.65 s -> 4 s for the 3.8B model (native OpenVINO -> OVMS).
Expected behavior
Similar or better model loading time compared to native OpenVINO.
Logs
Logs from OVMS, ideally with --log_level DEBUG. Logs from client.
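If more detail is needed, debug logs can be captured by rerunning the same start command shown above with --log_level DEBUG appended (config path below is illustrative; use the absolute path printed by the script):

"c:\workspace\ovms\setupvars.bat" && "c:\workspace\ovms\ovms.exe" --port 9091 --rest_port 8081 --config_path "<absolute path to model_config.json>" --file_system_poll_wait_seconds 0 --log_level DEBUG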
Configuration
- OVMS version: 2025.2.1
- OVMS config.json file: script generated
- pbtxt:
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node: {
name: "LLMExecutor"
calculator: "HttpLLMCalculator"
input_stream: "LOOPBACK:loopback"
input_stream: "HTTP_REQUEST_PAYLOAD:input"
input_side_packet: "LLM_NODE_RESOURCES:llm"
output_stream: "LOOPBACK:loopback"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
input_stream_info: {
tag_index: 'LOOPBACK:0',
back_edge: true
}
node_options: {
[type.googleapis.com / mediapipe.LLMCalculatorOptions]: {
models_path: "./",
plugin_config: '{}',
enable_prefix_caching: false,
cache_size: 2,
max_num_seqs: 256,
device: "GPU",
}
}
input_stream_handler {
input_stream_handler: "SyncSetInputStreamHandler",
options {
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
sync_set {
tag_index: "LOOPBACK:0"
}
}
}
}
}
- OVMS will also crash from time to time:
Script output:
Loading model Qwen2.5-7B-Instruct-int4-ov took 8.65 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 4.18 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 4.81 seconds
Traceback (most recent call last):
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_transports\default.py", line 101, in map_httpcore_exceptions
yield
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_transports\default.py", line 250, in handle_request
resp = self._pool.handle_request(req)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\connection_pool.py", line 256, in handle_request
raise exc from None
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\connection_pool.py", line 236, in handle_request
response = connection.handle_request(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\connection.py", line 103, in handle_request
return self._connection.handle_request(request)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\http11.py", line 136, in handle_request
raise exc
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\http11.py", line 106, in handle_request
) = self._receive_response_headers(**kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\http11.py", line 177, in _receive_response_headers
event = self._receive_event(timeout=timeout)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\http11.py", line 217, in _receive_event
data = self._network_stream.read(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_backends\sync.py", line 126, in read
with map_exceptions(exc_map):
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\qually\miniforge3\envs\superpy312\Lib\contextlib.py", line 158, in __exit__
self.gen.throw(value)
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_exceptions.py", line 14, in map_exceptions
raise to_exc(exc) from exc
httpcore.ReadError: [WinError 10054] An existing connection was forcibly closed by the remote host
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\try_model_loading.py", line 123, in <module>
test_ovms_load_model(ovms_root, model_a, model_b, 5)
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\try_model_loading.py", line 101, in test_ovms_load_model
ovms_load_model(model_path)
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\try_model_loading.py", line 79, in ovms_load_model
OVMS.reload_ovms_config()
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\try_model_loading.py", line 73, in reload_ovms_config
response = cls.client.post(url, timeout=60)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 1144, in post
return self.request(
^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 825, in request
return self.send(request, auth=auth, follow_redirects=follow_redirects)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 914, in send
response = self._send_handling_auth(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 942, in _send_handling_auth
response = self._send_handling_redirects(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 979, in _send_handling_redirects
response = self._send_single_request(request)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 1014, in _send_single_request
response = transport.handle_request(request)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_transports\default.py", line 249, in handle_request
with map_httpcore_exceptions():
^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\qually\miniforge3\envs\superpy312\Lib\contextlib.py", line 158, in __exit__
self.gen.throw(value)
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_transports\default.py", line 118, in map_httpcore_exceptions
raise mapped_exc(message) from exc
httpx.ReadError: [WinError 10054] An existing connection was forcibly closed by the remote host
and the corresponding OVMS output:
[2025-07-29 11:58:26.840][44664][llm_executor][info][llm_executor.hpp:90] All requests: 0; Scheduled requests: 0;
[2025-07-29 11:58:26.892][114868][modelmanager][info][../dags/pipelinedefinitionstatus.hpp:60] Mediapipe: Qwen2.5-7B-Instruct-int4-ov state changed to: RETIRED after handling: RetireEvent:
[2025-07-29 11:58:27.897][114868][modelmanager][error][src/modelmanager.cpp:960] Configuration file doesn't have models property.
[2025-07-29 11:58:27.897][114868][modelmanager][info][src/modelmanager.cpp:658] Configuration file doesn't have custom node libraries property.
[2025-07-29 11:58:27.897][114868][modelmanager][info][src/modelmanager.cpp:701] Configuration file doesn't have pipelines property.
[2025-07-29 11:58:27.897][114868][modelmanager][info][src/mediapipe_internal/mediapipefactory.cpp:103] Reloading mediapipe graph: Qwen2.5-7B-Instruct-int4-ov
[2025-07-29 11:58:27.897][114868][modelmanager][info][../dags/pipelinedefinitionstatus.hpp:60] Mediapipe: Qwen2.5-7B-Instruct-int4-ov state changed to: RELOADING after handling: ReloadEvent:
[2025-07-29 11:58:27.898][114868][serving][info][src/mediapipe_internal/mediapipegraphdefinition.cpp:421] MediapipeGraphDefinition initializing graph nodes
[2025-07-29 11:58:27.898][114868][modelmanager][info][src/llm/servable_initializer.cpp:268] Initializing Language Model Continuous Batching servable
(superpy312) qually@HP-LNL C:\workspace\qually\applications.ai.superbuilder>
Additional context
Add any other context about the problem here.