Describe the bug
In our application we need to switch models at runtime due to resource constraints. When we compared model loading time in OVMS against native OpenVINO, we found the overhead to be quite heavy.
To Reproduce
Steps to reproduce the behavior:
- Update the testing script below with your OVMS root directory and the model files exported for OVMS:
import gc
import itertools
import json
import time
from pathlib import Path

import httpx
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer


def ov_load_model(model_path: Path):
    # Load and compile the model directly through optimum-intel / native OpenVINO.
    ov_config = {"INFERENCE_PRECISION_HINT": "f16"}
    model = OVModelForCausalLM.from_pretrained(model_path, compile=True, compile_only=True, device="GPU", ov_config=ov_config)
    model.compile()
    tok = AutoTokenizer.from_pretrained(model_path)
    return model, tok


class OVMS:
    _ovms_process = None
    port = 9091
    rest_port = 8081
    model_config = Path("model_config.json")
    client = httpx.Client(trust_env=False)

    @classmethod
    def prompt_to_start_ovms(cls, ovms_root: Path):
        ovms_binary_path = ovms_root / "ovms.exe"
        assert ovms_binary_path.exists()
        # setupvars.bat sets the environment variables needed by ovms.exe
        cmd_start_server = f'"{ovms_root / "setupvars.bat"}" && "{ovms_binary_path}" --port {cls.port} --rest_port {cls.rest_port} --config_path "{cls.model_config.absolute()}" --file_system_poll_wait_seconds 0'
        print(cmd_start_server)
        input(f"Use command to start OVMS server: {cmd_start_server}\nPress Enter to continue...")

    @classmethod
    def update_ovms_config_file(cls, model_path: Path):
        # Rewrite the config file so that only the requested model is served.
        config_path = cls.model_config
        if not config_path.exists():
            raise FileNotFoundError(f"OVMS config file {config_path} does not exist.")
        config_data = {
            "mediapipe_config_list": [{
                "name": model_path.name,
                "base_path": str(model_path)
            }],
            "model_config_list": []
        }
        with config_path.open("w", encoding="utf-8") as f:
            json.dump(config_data, f, indent=4, ensure_ascii=False)

    @classmethod
    def wait_until_model_status(cls, model_name: str, wait_ready=True):
        # Poll the readiness endpoint until the model is ready (or unloaded).
        url = f"http://localhost:{cls.rest_port}/v2/models/{model_name}/ready"
        while True:
            try:
                response = cls.client.get(url)
                if wait_ready and response.status_code == 200:
                    return
                elif not wait_ready and response.status_code == 404:
                    return
            except httpx.RequestError as e:
                print(f"Request error: {e}, retrying...")
            time.sleep(0.1)

    @classmethod
    def reload_ovms_config(cls):
        url = f"http://localhost:{cls.rest_port}/v1/config/reload"
        response = cls.client.post(url, timeout=60)
        if response.status_code != 200:
            raise RuntimeError(f"Failed to reload OVMS config: {response.text}")


def ovms_load_model(model_path: Path):
    OVMS.update_ovms_config_file(model_path)
    OVMS.reload_ovms_config()


class TimerContext:
    def __init__(self, name: str):
        self.name = name
        self.start_time = None

    def __enter__(self):
        self.start_time = time.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        elapsed_time = time.time() - self.start_time
        print(f"{self.name} took {elapsed_time:.2f} seconds")


def test_ovms_load_model(ovms_root: Path, model_a: Path, model_b: Path, iterations=10):
    OVMS.update_ovms_config_file(model_b)
    OVMS.prompt_to_start_ovms(ovms_root)
    OVMS.wait_until_model_status(model_b.name, wait_ready=True)
    for i, model_path in itertools.product(range(iterations), (model_a, model_b)):
        with TimerContext(f"Loading model {model_path.name:40}"):
            ovms_load_model(model_path)
        time.sleep(1)  # wait a bit before the next reload unloads the current model


def test_ov_load_model(model_a: Path, model_b: Path, iterations=10):
    with TimerContext(f"Initialize by loading model {model_b.name:40}"):
        model, tok = ov_load_model(model_b)
    for i, model_path in itertools.product(range(iterations), (model_a, model_b)):
        with TimerContext(f"Loading model {model_path.name:40}"):
            del model, tok
            gc.collect()
            model, tok = ov_load_model(model_path)


if __name__ == "__main__":
    ovms_root = Path(r"c:\workspace\ovms")
    model_a = Path(r"C:\workspace\local_models\Qwen2.5-7B-Instruct-int4-ov")
    model_b = Path(r"C:\workspace\local_models\Phi-3-mini-4k-instruct-int4-ov")
    if not model_a.exists() or not model_b.exists():
        raise FileNotFoundError("Model paths do not exist.")
    print("Testing OVMS model loading...")
    test_ovms_load_model(ovms_root, model_a, model_b, 5)
    input("Terminate ovms process and press Enter to continue...")
    print("Testing OpenVINO model loading...")
    test_ov_load_model(model_a, model_b, 5)
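For reference, update_ovms_config_file rewrites model_config.json so that only the model being loaded is listed; for model_a above the generated file should look roughly like this (output of json.dump with indent=4):

{
    "mediapipe_config_list": [
        {
            "name": "Qwen2.5-7B-Instruct-int4-ov",
            "base_path": "C:\\workspace\\local_models\\Qwen2.5-7B-Instruct-int4-ov"
        }
    ],
    "model_config_list": []
}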
- Run the test script in a command prompt and follow the printed command to start (and later stop) OVMS.
- Sample output:
Testing OVMS model loading...
"C:\workspace\qually\applications.ai.superbuilder\build\Release\pyllmsrv\ovms\setupvars.bat" && "C:\workspace\qually\applications.ai.superbuilder\build\Release\pyllmsrv\ovms\ovms.exe" --port 9091 --rest_port 8081 --config_path "C:\workspace\qually\applications.ai.superbuilder\pybackend\model_config.json" --file_system_poll_wait_seconds 0
Use command to start OVMS server: "C:\workspace\qually\applications.ai.superbuilder\build\Release\pyllmsrv\ovms\setupvars.bat" && "C:\workspace\qually\applications.ai.superbuilder\build\Release\pyllmsrv\ovms\ovms.exe" --port 9091 --rest_port 8081 --config_path "C:\workspace\qually\applications.ai.superbuilder\pybackend\model_config.json" --file_system_poll_wait_seconds 0
Press Enter to continue...
Loading model Qwen2.5-7B-Instruct-int4-ov took 6.59 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 4.49 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 5.99 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 3.98 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 6.19 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 3.95 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 6.13 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 3.95 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 6.10 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 4.15 seconds
Terminate ovms process and press Enter to continue...
Testing OpenVINO model loading...
Initialize by loading model Phi-3-mini-4k-instruct-int4-ov took 3.54 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 6.56 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 1.66 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 3.54 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 1.65 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 3.52 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 1.64 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 3.46 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 1.63 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 3.46 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 1.65 seconds
The overhead is quite heavy: roughly 3.5 s -> 6.1 s for the 7B model and 1.65 s -> 4 s for the 3.8B model (native OpenVINO -> OVMS).
Expected behavior
Similar or better model loading time compared to native OpenVINO.
Logs
Logs from OVMS, ideally with --log_level DEBUG. Logs from client.
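If more detail is needed, debug logs can be captured by rerunning the same start command shown above with --log_level DEBUG appended (config path below is illustrative; use the absolute path printed by the script):

"c:\workspace\ovms\setupvars.bat" && "c:\workspace\ovms\ovms.exe" --port 9091 --rest_port 8081 --config_path "<absolute path to model_config.json>" --file_system_poll_wait_seconds 0 --log_level DEBUG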
Configuration
- OVMS version: 2025.2.1
- OVMS config.json file: script generated
- pbtxt:
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node: {
name: "LLMExecutor"
calculator: "HttpLLMCalculator"
input_stream: "LOOPBACK:loopback"
input_stream: "HTTP_REQUEST_PAYLOAD:input"
input_side_packet: "LLM_NODE_RESOURCES:llm"
output_stream: "LOOPBACK:loopback"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
input_stream_info: {
tag_index: 'LOOPBACK:0',
back_edge: true
}
node_options: {
[type.googleapis.com / mediapipe.LLMCalculatorOptions]: {
models_path: "./",
plugin_config: '{}',
enable_prefix_caching: false,
cache_size: 2,
max_num_seqs: 256,
device: "GPU",
}
}
input_stream_handler {
input_stream_handler: "SyncSetInputStreamHandler",
options {
[mediapipe.SyncSetInputStreamHandlerOptions.ext] {
sync_set {
tag_index: "LOOPBACK:0"
}
}
}
}
}
- OVMS will also crash from time to time:
Script output:
Loading model Qwen2.5-7B-Instruct-int4-ov took 8.65 seconds
Loading model Phi-3-mini-4k-instruct-int4-ov took 4.18 seconds
Loading model Qwen2.5-7B-Instruct-int4-ov took 4.81 seconds
Traceback (most recent call last):
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_transports\default.py", line 101, in map_httpcore_exceptions
yield
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_transports\default.py", line 250, in handle_request
resp = self._pool.handle_request(req)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\connection_pool.py", line 256, in handle_request
raise exc from None
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\connection_pool.py", line 236, in handle_request
response = connection.handle_request(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\connection.py", line 103, in handle_request
return self._connection.handle_request(request)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\http11.py", line 136, in handle_request
raise exc
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\http11.py", line 106, in handle_request
) = self._receive_response_headers(**kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\http11.py", line 177, in _receive_response_headers
event = self._receive_event(timeout=timeout)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_sync\http11.py", line 217, in _receive_event
data = self._network_stream.read(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_backends\sync.py", line 126, in read
with map_exceptions(exc_map):
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\qually\miniforge3\envs\superpy312\Lib\contextlib.py", line 158, in __exit__
self.gen.throw(value)
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpcore\_exceptions.py", line 14, in map_exceptions
raise to_exc(exc) from exc
httpcore.ReadError: [WinError 10054] An existing connection was forcibly closed by the remote host
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\try_model_loading.py", line 123, in <module>
test_ovms_load_model(ovms_root, model_a, model_b, 5)
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\try_model_loading.py", line 101, in test_ovms_load_model
ovms_load_model(model_path)
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\try_model_loading.py", line 79, in ovms_load_model
OVMS.reload_ovms_config()
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\try_model_loading.py", line 73, in reload_ovms_config
response = cls.client.post(url, timeout=60)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 1144, in post
return self.request(
^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 825, in request
return self.send(request, auth=auth, follow_redirects=follow_redirects)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 914, in send
response = self._send_handling_auth(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 942, in _send_handling_auth
response = self._send_handling_redirects(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 979, in _send_handling_redirects
response = self._send_single_request(request)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_client.py", line 1014, in _send_single_request
response = transport.handle_request(request)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_transports\default.py", line 249, in handle_request
with map_httpcore_exceptions():
^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\qually\miniforge3\envs\superpy312\Lib\contextlib.py", line 158, in __exit__
self.gen.throw(value)
File "C:\workspace\qually\applications.ai.superbuilder\pybackend\.venv\Lib\site-packages\httpx\_transports\default.py", line 118, in map_httpcore_exceptions
raise mapped_exc(message) from exc
httpx.ReadError: [WinError 10054] An existing connection was forcibly closed by the remote host
and the corresponding OVMS output:
[2025-07-29 11:58:26.840][44664][llm_executor][info][llm_executor.hpp:90] All requests: 0; Scheduled requests: 0;
[2025-07-29 11:58:26.892][114868][modelmanager][info][../dags/pipelinedefinitionstatus.hpp:60] Mediapipe: Qwen2.5-7B-Instruct-int4-ov state changed to: RETIRED after handling: RetireEvent:
[2025-07-29 11:58:27.897][114868][modelmanager][error][src/modelmanager.cpp:960] Configuration file doesn't have models property.
[2025-07-29 11:58:27.897][114868][modelmanager][info][src/modelmanager.cpp:658] Configuration file doesn't have custom node libraries property.
[2025-07-29 11:58:27.897][114868][modelmanager][info][src/modelmanager.cpp:701] Configuration file doesn't have pipelines property.
[2025-07-29 11:58:27.897][114868][modelmanager][info][src/mediapipe_internal/mediapipefactory.cpp:103] Reloading mediapipe graph: Qwen2.5-7B-Instruct-int4-ov
[2025-07-29 11:58:27.897][114868][modelmanager][info][../dags/pipelinedefinitionstatus.hpp:60] Mediapipe: Qwen2.5-7B-Instruct-int4-ov state changed to: RELOADING after handling: ReloadEvent:
[2025-07-29 11:58:27.898][114868][serving][info][src/mediapipe_internal/mediapipegraphdefinition.cpp:421] MediapipeGraphDefinition initializing graph nodes
[2025-07-29 11:58:27.898][114868][modelmanager][info][src/llm/servable_initializer.cpp:268] Initializing Language Model Continuous Batching servable
(superpy312) qually@HP-LNL C:\workspace\qually\applications.ai.superbuilder>
Additional context
Add any other context about the problem here.