Design and Architecture
=======================

Introduction
------------

Motivation
~~~~~~~~~~
LLM inference is moving from single-instance execution to a cluster-level disaggregated architecture. Among these efforts, prefill-decode disaggregation is probably the most prominent change: the prefill phase requires more computational power, while the decode phase places a greater demand on memory. Given this observation, disaggregating the prefill and decode phases is an important way to improve inference engine performance.
In addition to prefill-decode disaggregation, a distributed KV cache can also increase the prefix KV cache hit rate, leading to higher GPU resource utilization.
Several related systems have been published, and some of them are already in production:

 - Mooncake: Kimi's production serving platform. A global KV store is built from the distributed DDR and SSD of each GPU host.
 - Splitwise: A prefill-decode disaggregation system that transfers the KV cache between machines.
 - AttentionStore: Similar to Mooncake, but it targets multi-turn conversation inference and separates positional encoding from the KV cache on a single node.
 - MemServe: An elastic memory pool managing distributed memory and KV caches across serving instances.

While analyzing the works above, we identified many potential improvements and new techniques for building a high-performance, scalable cluster-level inference system, such as:

 - Improving the request scheduler to make it more extensible and scalable,
 - Integrating with specific inference engine features (for example, extending the existing APC feature in vLLM),
 - New algorithms to better scale the memory pool and re-balance hot sequences,
 - Exploring new techniques such as decoupled positional encoding.

We are trying to build a high-performance open-source implementation that incorporates all the potential innovations mentioned above, so that different customers do not have to build their own.


Features
--------

Compared to a single-instance vLLM, vLLM + InfiniStore supports the following new features:

- Prefill-decode disaggregated architecture
- Historical KV cache in DRAM and SSD: a much larger pool than the current Automatic Prefix Caching (APC) feature in vLLM, which is limited to GPU HBM.
- Cross-host KV cache: one host can reuse the historical KV cache on another host.


Architecture
------------

.. image:: img/arch.png
   :align: center

1. Infinistore and vLLM are deployed on the same server, reusing the local CPU and memory resources.

2. Memory copies within the same machine are significantly faster than RDMA, so it is recommended to use the local GPU copy path when reading from and writing to the local Infinistore.

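   As an illustration only, the snippet below sketches how a caller might pick between the two transports based on locality; the names are hypothetical and do not reflect the actual Infinistore client API.

   .. code-block:: python

      # Hypothetical sketch: choose the transport for a KV-cache access.
      # LOCAL_GPU_COPY / RDMA are illustrative constants, not real Infinistore symbols.
      LOCAL_GPU_COPY = "local_gpu_copy"
      RDMA = "rdma"

      def choose_transport(store_host: str, worker_host: str) -> str:
          # Same machine: a GPU/pinned-memory copy avoids the NIC entirely and is
          # significantly faster than an RDMA round trip.
          if store_host == worker_host:
              return LOCAL_GPU_COPY
          # Remote store: the data has to cross the network, so use RDMA.
          return RDMA

      assert choose_transport("node-1", "node-1") == LOCAL_GPU_COPY
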
3. Infinistore uses a traditional key-value structure with variable-length keys, which makes it easy to encode information such as the model_id, the request, and a token hash in the key.
   Because RDMA memory registration is very slow, Infinistore pre-registers memory for RDMA during startup and manages it with a memory pool.
   The memory pool currently supports two allocation algorithms, bitmap and jemalloc, with bitmap being the default.

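   The exact key layout is an implementation detail; the sketch below only illustrates how a variable-length key can encode the model, the request, and a hash of the token prefix (the helper and field layout are hypothetical, not the actual Infinistore format).

   .. code-block:: python

      # Hypothetical sketch of a variable-length KV-cache key.
      import hashlib

      def make_kv_key(model_id: str, request_id: str, tokens: list[int], layer: int) -> str:
          # Hash the token prefix so that identical prefixes map to the same entry.
          token_hash = hashlib.sha256(
              b"".join(t.to_bytes(4, "little") for t in tokens)
          ).hexdigest()[:16]
          # Variable-length keys let model, request and token information live
          # directly in the key instead of a fixed-size binary layout.
          return f"{model_id}/{request_id}/layer{layer}/{token_hash}"

      key = make_kv_key("llama-3-8b", "req-42", [101, 2009, 2003], layer=0)
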
4. Read and Write Process:

   a. Prefill Stage:
      vLLM writes the KV cache to Infinistore layer by layer during the prefill stage. Communication can use either local GPU copy or RDMA.
      Practical experience shows that the layer-by-layer approach overlaps network communication with GPU computation; measurements indicate that during the prefill stage the network adds no more than 1% overhead.
      For a demo implementation, refer to demo_prefill.py.

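      The stand-alone sketch below shows how a per-layer upload can overlap with the computation of the next layer; ``store.write_kv`` and the layer interface are assumed placeholders, not the actual Infinistore or vLLM APIs (see demo_prefill.py for the real integration).

      .. code-block:: python

         # Illustrative sketch only: "store" and write_kv() are placeholders.
         import torch

         def prefill_with_layerwise_offload(layers, hidden, store, request_key):
             copy_stream = torch.cuda.Stream()
             for i, layer in enumerate(layers):
                 hidden, kv = layer(hidden)  # compute layer i on the default stream
                 # Enqueue the KV upload for layer i on a side stream so the copy or
                 # RDMA transfer overlaps with the computation of layer i + 1.
                 copy_stream.wait_stream(torch.cuda.current_stream())
                 with torch.cuda.stream(copy_stream):
                     store.write_kv(f"{request_key}/layer{i}", kv)
             # All uploads must finish before the request is handed off to decode.
             copy_stream.synchronize()
             return hidden
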
   b. Decode Stage:
      In the decode stage, a separate thread in vLLM downloads the KV cache and then notifies the scheduler to start decoding.
      Unlike the current community implementation of vLLM, an additional thread is required to download the data so that network operations do not block the GPU during the decode stage.

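      A minimal sketch of this pattern, with a hypothetical ``store.read_kv`` call and ``scheduler.mark_ready`` callback standing in for the real vLLM hooks:

      .. code-block:: python

         # Illustrative sketch only: "store" and "scheduler" are placeholders.
         import threading

         def prefetch_kv_async(store, scheduler, request_id, layer_keys):
             """Download the historical KV cache off the GPU's critical path, then
             tell the scheduler that the request may start decoding."""
             def _worker():
                 kv_blocks = [store.read_kv(k) for k in layer_keys]  # network I/O only
                 scheduler.mark_ready(request_id, kv_blocks)         # unblock decoding
             thread = threading.Thread(target=_worker, daemon=True)
             thread.start()
             return thread
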
Communications
--------------

Local GPU copy
~~~~~~~~~~~~~~

.. image:: img/local_gpu_cpy.png
   :align: center


RDMA write
~~~~~~~~~~

.. image:: img/rdma_write.png
   :align: center


RDMA read
~~~~~~~~~

.. image:: img/rdma_read.png
   :align: center