vLLM 在 Windows 上的安装与部署

原创已于 2026-05-04 10:42:06 修改 · 549 阅读

4 ·

本内容遵循CC 4.0 BY-SA版权协议

GEO检测

标签

#vllm #windows #人工智能

于 2026-05-04 10:10:16 首次发布

vLLM 0.19.1 在 Windows 上的安装与运行技术报告

环境信息

OS: Windows 10/11
GPU: NVIDIA Quadro RTX 3000 with Max-Q Design（计算能力 7.5，6GB VRAM）
CUDA: 12.6
Python: 3.x（Miniconda 环境）
vLLM 版本: 0.19.1+cu126
模型: Qwen/Qwen3-0.6B

一、背景

vLLM 官方主要面向 Linux 环境开发，Windows 支持属于社区实验性质。直接在 Windows 上运行 vllm serve 会遇到多个兼容性问题，需要手动修改源码打补丁。本报告记录了完整的 Bug 排查过程和最终可用的修复方案。

二、安装步骤

# 创建 conda 环境
conda create -n vllm python=3.10
conda activate vllm

# 安装 vLLM（CUDA 12.6 版本）
pip install vllm==0.19.1+cu126 --extra-index-url https://download.pytorch.org/whl/cu126

三、Bug记录与修复

Bug 1：ZMQError: not a socket

错误信息

zmq.error.ZMQError: not a socket
  File "vllm\v1\engine\utils.py", line 1039, in wait_for_engine_startup
    events = poller.poll(STARTUP_POLL_PERIOD_MS)

原因

vLLM V1 引擎默认使用多进程模式（AsyncMPClient），通过 ZMQ 跨进程通信。Windows 使用 spawn 方式创建子进程，父进程中创建的 ZMQ socket 无法被子进程继承，导致 “not a socket” 错误。

修复方法

设置环境变量禁用多进程模式：

setx VLLM_ENABLE_V1_MULTIPROCESSING 0

⚠️ 注意：setx 写入的是持久化的环境变量（不加 /M 时为用户级），不会影响当前已打开的 CMD 窗口，必须重新打开 CMD 后才生效。如果只想临时生效，也可以在当前 CMD 中执行 set VLLM_ENABLE_V1_MULTIPROCESSING=0：由该会话启动的进程（包括 vllm serve 及其子进程）会继承这个变量，但关闭窗口后即失效。

Bug 2：AttributeError: ‘InprocClient’ object has no attribute ‘engine_ranks_managed’

错误信息

AttributeError: 'InprocClient' object has no attribute 'engine_ranks_managed'
  File "vllm\v1\engine\async_llm.py", line 168, in __init__
    engine_idxs=self.engine_core.engine_ranks_managed,

原因

禁用多进程后走 InprocClient 路径，但 InprocClient 是为同步的 LLM 类设计的，缺少 AsyncLLM 所需的 engine_ranks_managed 属性。

修复位置

C:\Users\sheng\miniconda3\envs\vllm\lib\site-packages\vllm\v1\engine\core_client.py

修复方法

在 InprocClient.__init__ 中添加该属性（见Bug 3 的完整代码）。

Bug 3：NotImplementedError — InprocClient 缺少所有 async 方法

错误信息

NotImplementedError
  File "core_client.py", line 221, in get_output_async
  File "core_client.py", line 235, in reset_mm_cache_async

原因

AsyncLLM 调用的全是 _async 后缀的方法，而 InprocClient 只实现了同步版本，父类 EngineCoreClient 中的 async 方法全部 raise NotImplementedError。

修复方法

用以下完整代码替换 core_client.py 中的整个 InprocClient 类：

class InprocClient(EngineCoreClient):
    """
    In-process EngineCore client, patched for Windows.

    Supplies the ``*_async`` methods that AsyncLLM calls and drives the
    engine from a background thread; results reach the asyncio side through
    an ``asyncio.Queue``.
    """

    def __init__(self, *args, **kwargs):
        """Create the in-process EngineCore; all arguments are forwarded to it."""
        import asyncio
        import threading
        self.engine_core = EngineCore(*args, **kwargs)
        # AsyncLLM.__init__ reads engine_core.engine_ranks_managed.
        self.engine_ranks_managed = [0]
        # Set while the step thread should keep stepping without waiting.
        self._has_requests = threading.Event()
        # Queue and loop are bound lazily by the first get_output_async() call.
        self._output_queue: asyncio.Queue | None = None
        self._loop: asyncio.AbstractEventLoop | None = None
        self._step_thread: threading.Thread | None = None
        self._stopped = False

    def _ensure_step_thread(self):
        """Start the background step thread; later calls are no-ops."""
        if self._step_thread is not None:
            return
        import threading
        self._step_thread = threading.Thread(
            target=self._step_loop,
            name="InprocClientStepThread",
            daemon=True,
        )
        self._step_thread.start()

    def _publish(self, item):
        """Queue *item* (engine outputs or an exception) on the event loop."""
        if self._loop is not None and self._output_queue is not None:
            # The queue belongs to the event loop, so the put is scheduled
            # on the loop instead of being executed from this thread.
            self._loop.call_soon_threadsafe(self._output_queue.put_nowait, item)

    def _step_loop(self):
        """Background-thread body: step the engine and publish its outputs.

        The loop steps whenever ``_has_requests`` is set, and additionally
        once per second when the wait times out (polling safety net).
        """
        while not self._stopped:
            woken = self._has_requests.wait(timeout=1.0)
            if self._stopped:
                break
            # Clear *before* stepping: a set() issued by add_request() while
            # the step is running must survive, otherwise the new request
            # would only be noticed by the next timeout poll.
            self._has_requests.clear()
            try:
                outputs, model_executed = self.engine_core.step_fn()
                self.engine_core.post_step(model_executed=model_executed)
            except Exception as exc:
                # Without this the thread would die silently and
                # get_output_async() would wait forever.
                self._publish(exc)
                break
            result = outputs.get(0) if outputs else None
            has_payload = result is not None and (
                result.outputs or result.scheduler_stats
            )
            if has_payload:
                self._publish(result)
                stats = result.scheduler_stats
                # Keep stepping back-to-back while requests are running.
                keep_stepping = bool(stats) and (
                    getattr(stats, 'num_running_reqs', 0) > 0
                )
            else:
                # Nothing to publish: keep whatever state the event had.
                keep_stepping = woken
            if keep_stepping:
                self._has_requests.set()

    async def get_output_async(self) -> EngineCoreOutputs:
        """Return the next engine output, starting the step thread on first use.

        Raises:
            Exception: whatever the engine step raised in the step thread.
        """
        import asyncio
        if self._output_queue is None:
            self._output_queue = asyncio.Queue()
            self._loop = asyncio.get_running_loop()
            self._ensure_step_thread()
        item = await self._output_queue.get()
        if isinstance(item, Exception):
            raise item
        return item

    def add_request(self, request: EngineCoreRequest) -> None:
        """Hand *request* to the engine and wake the step thread."""
        req, request_wave = self.engine_core.preprocess_add_request(request)
        self.engine_core.add_request(req, request_wave)
        self._has_requests.set()

    async def add_request_async(self, request: EngineCoreRequest) -> None:
        self.add_request(request)

    def get_output(self) -> EngineCoreOutputs:
        """Run one engine step synchronously and return its outputs."""
        outputs, model_executed = self.engine_core.step_fn()
        self.engine_core.post_step(model_executed=model_executed)
        return (outputs.get(0) if outputs else None) or EngineCoreOutputs()

    # ------------------------------------------------------------------
    # Everything below delegates to EngineCore; each ``*_async`` variant
    # simply calls its synchronous counterpart.
    # ------------------------------------------------------------------

    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
        return self.engine_core.get_supported_tasks()

    async def get_supported_tasks_async(self) -> tuple[SupportedTask, ...]:
        return self.get_supported_tasks()

    def abort_requests(self, request_ids: list[str]) -> None:
        if request_ids:
            self.engine_core.abort_requests(request_ids)

    async def abort_requests_async(self, request_ids: list[str]) -> None:
        self.abort_requests(request_ids)

    def shutdown(self, timeout: float | None = None) -> None:
        """Stop the step thread, then shut the engine down.

        Args:
            timeout: maximum seconds to wait for the step thread to finish
                its current step; ``None`` waits until it has exited.
        """
        import threading
        self._stopped = True
        self._has_requests.set()  # wake the thread so it can see _stopped
        step_thread = self._step_thread
        if step_thread is not None and step_thread is not threading.current_thread():
            # Do not tear the engine down underneath a step in progress.
            step_thread.join(timeout=timeout)
        self.engine_core.shutdown()

    def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
        self.engine_core.profile(is_start, profile_prefix)

    async def profile_async(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
        self.profile(is_start, profile_prefix)

    def reset_mm_cache(self) -> None:
        self.engine_core.reset_mm_cache()

    async def reset_mm_cache_async(self) -> None:
        self.reset_mm_cache()

    def reset_prefix_cache(
        self, reset_running_requests: bool = False, reset_connector: bool = False
    ) -> bool:
        return self.engine_core.reset_prefix_cache(reset_running_requests, reset_connector)

    async def reset_prefix_cache_async(
        self, reset_running_requests: bool = False, reset_connector: bool = False
    ) -> bool:
        return self.reset_prefix_cache(reset_running_requests, reset_connector)

    def reset_encoder_cache(self) -> None:
        self.engine_core.reset_encoder_cache()

    async def reset_encoder_cache_async(self) -> None:
        self.reset_encoder_cache()

    def sleep(self, level: int = 1, mode: PauseMode = "abort") -> None:
        """Put the engine to sleep; the "wait" pause mode is rejected."""
        if mode == "wait":
            raise ValueError("'wait' pause mode is not supported in inproc-engine mode")
        result = self.engine_core.sleep(level, mode)
        assert result is None

    async def sleep_async(self, level: int = 1, mode: PauseMode = "abort") -> None:
        self.sleep(level, mode)

    def wake_up(self, tags: list[str] | None = None) -> None:
        self.engine_core.wake_up(tags)

    async def wake_up_async(self, tags: list[str] | None = None) -> None:
        self.wake_up(tags)

    def is_sleeping(self) -> bool:
        return self.engine_core.is_sleeping()

    async def is_sleeping_async(self) -> bool:
        return self.is_sleeping()

    def execute_dummy_batch(self) -> None:
        self.engine_core.execute_dummy_batch()

    async def execute_dummy_batch_async(self) -> None:
        self.execute_dummy_batch()

    def add_lora(self, lora_request: LoRARequest) -> bool:
        return self.engine_core.add_lora(lora_request)

    async def add_lora_async(self, lora_request: LoRARequest) -> bool:
        return self.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        return self.engine_core.remove_lora(lora_id)

    async def remove_lora_async(self, lora_id: int) -> bool:
        return self.remove_lora(lora_id)

    def list_loras(self) -> set[int]:
        return self.engine_core.list_loras()

    async def list_loras_async(self) -> set[int]:
        return self.list_loras()

    def pin_lora(self, lora_id: int) -> bool:
        return self.engine_core.pin_lora(lora_id)

    async def pin_lora_async(self, lora_id: int) -> bool:
        return self.pin_lora(lora_id)

    def save_sharded_state(
        self, path: str, pattern: str | None = None, max_size: int | None = None
    ) -> None:
        self.engine_core.save_sharded_state(path, pattern, max_size)

    async def save_sharded_state_async(
        self, path: str, pattern: str | None = None, max_size: int | None = None
    ) -> None:
        self.save_sharded_state(path, pattern, max_size)

    def collective_rpc(
        self,
        method: str | Callable[..., _R],
        timeout: float | None = None,
        args: tuple = (),
        kwargs: dict[str, Any] | None = None,
    ) -> list[_R]:
        return self.engine_core.collective_rpc(method, timeout, args, kwargs)

    async def collective_rpc_async(
        self,
        method: str | Callable[..., _R],
        timeout: float | None = None,
        args: tuple = (),
        kwargs: dict[str, Any] | None = None,
    ) -> list[_R]:
        return self.collective_rpc(method, timeout, args, kwargs)

    # Scheduler pause/resume are stubs in this client: pausing does nothing
    # and the scheduler is always reported as not paused.

    async def pause_scheduler_async(
        self, mode: PauseMode = "abort", clear_cache: bool = True
    ) -> None:
        pass

    async def resume_scheduler_async(self) -> None:
        pass

    async def is_scheduler_paused_async(self) -> bool:
        return False

    def dp_engines_running(self) -> bool:
        return False

Bug 4：NotImplementedError — loop.add_signal_handler 不支持

错误信息

NotImplementedError
  File "vllm\entrypoints\launcher.py", line 103, in serve_http
    loop.add_signal_handler(signal.SIGINT, signal_handler)

原因

Windows 不支持 UNIX 信号机制，asyncio 的 loop.add_signal_handler() 在 Windows 上抛出 NotImplementedError。

修复位置

C:\Users\sheng\miniconda3\envs\vllm\lib\site-packages\vllm\entrypoints\launcher.py

修复方法

找到 add_signal_handler 相关代码，改为：

import sys

if sys.platform != "win32":
    loop.add_signal_handler(signal.SIGINT, signal_handler)
    loop.add_signal_handler(signal.SIGTERM, signal_handler)
else:
    # Windows event loops do not implement add_signal_handler(), so fall
    # back to signal.signal(). Such a handler must not touch the event loop
    # directly; it only schedules the real handler on the loop.
    def _win_signal_handler(signum, frame):
        loop.call_soon_threadsafe(signal_handler)

    signal.signal(signal.SIGINT, _win_signal_handler)
    signal.signal(signal.SIGTERM, _win_signal_handler)

Bug 5：AttributeError: ‘InprocClient’ object has no attribute ‘resources’

错误信息

AttributeError: 'InprocClient' object has no attribute 'resources'
  File "vllm\v1\engine\async_llm.py", line 1027, in errored
    return self.engine_core.resources.engine_dead or not self.is_running

原因

AsyncLLM.errored 属性访问 engine_core.resources.engine_dead，InprocClient 没有 resources 对象。

修复方法

在 InprocClient.__init__ 中加入 _FakeResources，同时补上 ensure_alive 方法：

def __init__(self, *args, **kwargs):
    """Create the in-process EngineCore and the client's bookkeeping state.

    All positional and keyword arguments are forwarded to ``EngineCore``.
    """
    import asyncio
    import threading
    self.engine_core = EngineCore(*args, **kwargs)
    self.engine_ranks_managed = [0]
    self._has_requests = threading.Event()
    self._output_queue: asyncio.Queue | None = None
    self._loop: asyncio.AbstractEventLoop | None = None
    self._step_thread: threading.Thread | None = None
    self._stopped = False

    # Windows compatibility: AsyncLLM reads resources.engine_dead
    class _FakeResources:
        engine_dead = False
    self.resources = _FakeResources()

def ensure_alive(self):
    """Liveness check; a no-op for this client."""
    pass  # InprocClient is always alive

四、启动命令

所有补丁打完后，用以下命令启动服务：

vllm serve Qwen/Qwen3-0.6B --max-model-len 128 --max-num-seqs 1 --gpu-memory-utilization 0.80

启动成功标志：

INFO:     Application startup complete.

五、测试请求

Windows CMD 不把单引号当作引号处理，因此 -d 后面的 JSON 需用双引号包裹并转义其中的双引号，或使用文件方式：

方式一：转义双引号（CMD）

curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d "{\"model\":\"Qwen/Qwen3-0.6B\",\"messages\":[{\"role\":\"user\",\"content\":\"你好\"}]}"

方式二：使用 JSON 文件（推荐）

新建 req.json：

{
  "model": "Qwen/Qwen3-0.6B",
  "messages": [{"role": "user", "content": "你好"}]
}

curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d @req.json

方式三：使用 PowerShell（Windows PowerShell 5.x 中 curl 是 Invoke-WebRequest 的别名；PowerShell 7 中请直接写 Invoke-WebRequest）

curl -Uri http://localhost:8000/v1/chat/completions `
  -Method POST `
  -Headers @{"Content-Type"="application/json"} `
  -Body '{"model": "Qwen/Qwen3-0.6B", "messages": [{"role":"user", "content":"你好"}]}' `
  -UseBasicParsing

或者使用 PowerShell 原生的 Invoke-RestMethod

Invoke-RestMethod -Uri http://localhost:8000/v1/chat/completions `
  -Method POST `
  -Headers @{"Content-Type"="application/json"} `
  -Body '{"model": "Qwen/Qwen3-0.6B", "messages": [{"role":"user", "content":"你好"}]}' `
  -UseBasicParsing

方式四：使用 Python 脚本

import requests

# Send one chat-completion request to the local vLLM server.
response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    headers={"Content-Type": "application/json"},
    json={
        "model": "Qwen/Qwen3-0.6B",
        "messages": [{"role": "user", "content": "你好"}],
        "temperature": 0.6,
        "top_p": 0.95
    },
    # requests has no default timeout; without one a hung server blocks forever.
    timeout=120,
)
# Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
response.raise_for_status()
print(response.json())

六、修改文件汇总

文件路径	修改内容
`vllm\v1\engine\core_client.py`	重写 `InprocClient` 类（补全所有 async 方法，添加后台步进线程），并在 `make_client` 和 `make_async_mp_client` 方法中增加对 win32 的判断
`vllm\entrypoints\launcher.py`	替换 `add_signal_handler` 为 Windows 兼容写法
系统环境变量	添加 `VLLM_ENABLE_V1_MULTIPROCESSING=0`

七、架构说明

Windows 补丁后的运行架构如下：

┌─────────────────────────────────────────────┐
│                API Server (主进程)            │
│                                              │
│  FastAPI / uvicorn                           │
│       │                                      │
│  AsyncLLM                                    │
│       │                                      │
│  InprocClient (补丁版)                        │
│  ┌────────────────────────────────────┐      │
│  │  asyncio 事件循环                   │      │
│  │    output_handler ──► Queue.get()  │      │
│  │                          ▲         │      │
│  │  后台线程 (StepThread)    │         │      │
│  │    step_fn() ────────► Queue       │      │
│  │    (有请求时才运行)                  │      │
│  └────────────────────────────────────┘      │
│       │                                      │
│  EngineCore (同进程，GPU 推理)                 │
└─────────────────────────────────────────────┘

原生 Linux 架构是多进程通过 ZMQ 通信，Windows 补丁将其改为单进程内后台线程 + asyncio Queue 的方式，规避了 Windows 的多进程和信号限制。

八、注意事项

性能：单进程模式下引擎占用主进程资源，高并发性能不如 Linux 多进程模式。
并发限制：InprocClient 的步进线程是单线程的，--max-num-seqs 1 适合测试，生产环境不建议使用此方案。
升级风险：以上修改针对 vLLM 0.19.1，升级版本后需重新验证兼容性。
推荐方案：生产环境建议使用 WSL2（Windows Subsystem for Linux）运行 vLLM，可完全规避上述所有问题。

九、快速复现检查清单

setx VLLM_ENABLE_V1_MULTIPROCESSING 0，重新打开 CMD 验证
修改 core_client.py：替换 InprocClient 完整类
修改 launcher.py：替换 add_signal_handler 为 Windows 兼容写法
运行 vllm serve 命令，等待 Application startup complete
用 @req.json 方式发送测试请求验证推理正常

十、改动代码附录

v1/engine/core_client.py

class EngineCoreClient(ABC):
    """
    EngineCoreClient: subclasses handle different methods for pushing
        and pulling from the EngineCore for asyncio / multiprocessing.

    Subclasses:
    * InprocClient: In process EngineCore (for V0-style LLMEngine use; with
      the Windows patch below it also replaces the multiprocess clients
      on win32)
    * SyncMPClient: ZMQ + background proc EngineCore (for LLM)
    * AsyncMPClient: ZMQ + background proc EngineCore w/ asyncio (for AsyncLLM)
    """

    @staticmethod
    def make_client(
        multiprocess_mode: bool,
        asyncio_mode: bool,
        vllm_config: VllmConfig,
        executor_class: type[Executor],
        log_stats: bool,
    ) -> "EngineCoreClient":
        """Pick and construct the client matching the requested modes.

        Raises:
            NotImplementedError: if ``asyncio_mode`` is requested without
                ``multiprocess_mode``.
        """
        # Local import, mirroring make_async_mp_client, so the platform
        # check below does not depend on a module-level ``import sys``.
        import sys

        # TODO: support this for debugging purposes.
        if asyncio_mode and not multiprocess_mode:
            raise NotImplementedError(
                "Running EngineCore in asyncio without multiprocessing "
                "is not currently supported."
            )

        if multiprocess_mode and asyncio_mode:
            return EngineCoreClient.make_async_mp_client(
                vllm_config, executor_class, log_stats
            )

        # === modified code (Windows patch) ===
        if multiprocess_mode and not asyncio_mode:
            if sys.platform == "win32":
                # Windows doesn't support fork or IPC sockets reliably,
                # fall back to in-process engine core
                return InprocClient(vllm_config, executor_class, log_stats)
            return SyncMPClient(vllm_config, executor_class, log_stats)

        return InprocClient(vllm_config, executor_class, log_stats)

    @staticmethod
    @instrument(span_name="Overall Loading")
    def make_async_mp_client(
        vllm_config: VllmConfig,
        executor_class: type[Executor],
        log_stats: bool,
        client_addresses: dict[str, str] | None = None,
        client_count: int = 1,
        client_index: int = 0,
    ) -> "AsyncMPClient":
        """Construct the asyncio client for the given configuration.

        NOTE: on win32 this returns an ``InprocClient`` instead, so the
        ``AsyncMPClient`` return annotation only holds on other platforms.
        """

        # Windows workaround: force inproc mode
        import sys
        if sys.platform == "win32":
            return InprocClient(vllm_config, executor_class, log_stats)

        parallel_config = vllm_config.parallel_config
        client_args = (
            vllm_config,
            executor_class,
            log_stats,
            client_addresses,
            client_count,
            client_index,
        )
        if parallel_config.data_parallel_size > 1:
            if parallel_config.data_parallel_external_lb:
                # External load balancer - client per DP rank.
                return DPAsyncMPClient(*client_args)
            # Internal load balancer - client balances to all DP ranks.
            return DPLBAsyncMPClient(*client_args)
        return AsyncMPClient(*client_args)
    
class InprocClient(EngineCoreClient):
    """
    In-process EngineCore client, patched for Windows.

    Supplies the ``*_async`` methods and attributes that AsyncLLM uses and
    drives the engine from a background thread; results reach the asyncio
    side through an ``asyncio.Queue``.
    """

    # === modified code (Windows patch) ===
    def __init__(self, *args, **kwargs):
        """Create the in-process EngineCore; all arguments are forwarded to it."""
        import asyncio
        import threading
        self.engine_core = EngineCore(*args, **kwargs)
        # AsyncLLM.__init__ reads engine_core.engine_ranks_managed.
        self.engine_ranks_managed = [0]
        # Set while the step thread should keep stepping without waiting.
        self._has_requests = threading.Event()
        # Queue and loop are bound lazily by the first get_output_async() call.
        self._output_queue: asyncio.Queue | None = None
        self._loop: asyncio.AbstractEventLoop | None = None
        self._step_thread: threading.Thread | None = None
        self._stopped = False

        # Windows compatibility: AsyncLLM.errored reads resources.engine_dead
        class _FakeResources:
            engine_dead = False
        self.resources = _FakeResources()

    def ensure_alive(self):
        """Liveness check; a no-op for this client."""
        pass

    def _ensure_step_thread(self):
        """Start the background step thread; later calls are no-ops."""
        if self._step_thread is not None:
            return
        import threading
        self._step_thread = threading.Thread(
            target=self._step_loop,
            name="InprocClientStepThread",
            daemon=True,
        )
        self._step_thread.start()

    def _publish(self, item):
        """Queue *item* (engine outputs or an exception) on the event loop."""
        if self._loop is not None and self._output_queue is not None:
            # The queue belongs to the event loop, so the put is scheduled
            # on the loop instead of being executed from this thread.
            self._loop.call_soon_threadsafe(self._output_queue.put_nowait, item)

    def _step_loop(self):
        """Background-thread body: step the engine and publish its outputs.

        The loop steps whenever ``_has_requests`` is set, and additionally
        once per second when the wait times out (polling safety net).
        """
        while not self._stopped:
            woken = self._has_requests.wait(timeout=1.0)
            if self._stopped:
                break
            # Clear *before* stepping: a set() issued by add_request() while
            # the step is running must survive, otherwise the new request
            # would only be noticed by the next timeout poll.
            self._has_requests.clear()
            try:
                outputs, model_executed = self.engine_core.step_fn()
                self.engine_core.post_step(model_executed=model_executed)
            except Exception as exc:
                # Without this the thread would die silently and
                # get_output_async() would wait forever. Flag the failure
                # where AsyncLLM.errored looks for it, then report it.
                self.resources.engine_dead = True
                self._publish(exc)
                break
            result = outputs.get(0) if outputs else None
            has_payload = result is not None and (
                result.outputs or result.scheduler_stats
            )
            if has_payload:
                self._publish(result)
                stats = result.scheduler_stats
                # Keep stepping back-to-back while requests are running.
                keep_stepping = bool(stats) and (
                    getattr(stats, 'num_running_reqs', 0) > 0
                )
            else:
                # Nothing to publish: keep whatever state the event had.
                keep_stepping = woken
            if keep_stepping:
                self._has_requests.set()

    async def get_output_async(self) -> EngineCoreOutputs:
        """Return the next engine output, starting the step thread on first use.

        Raises:
            Exception: whatever the engine step raised in the step thread.
        """
        import asyncio
        # The queue and loop are bound once, on the first call.
        if self._output_queue is None:
            self._output_queue = asyncio.Queue()
            self._loop = asyncio.get_running_loop()
            self._ensure_step_thread()
        item = await self._output_queue.get()
        if isinstance(item, Exception):
            raise item
        return item

    def add_request(self, request: EngineCoreRequest) -> None:
        """Hand *request* to the engine and wake the step thread."""
        req, request_wave = self.engine_core.preprocess_add_request(request)
        self.engine_core.add_request(req, request_wave)
        self._has_requests.set()

    async def add_request_async(self, request: EngineCoreRequest) -> None:
        self.add_request(request)

    def get_output(self) -> EngineCoreOutputs:
        """Run one engine step synchronously and return its outputs."""
        outputs, model_executed = self.engine_core.step_fn()
        self.engine_core.post_step(model_executed=model_executed)
        return (outputs.get(0) if outputs else None) or EngineCoreOutputs()

    # ------------------------------------------------------------------
    # Everything below delegates to EngineCore; each ``*_async`` variant
    # simply calls its synchronous counterpart.
    # ------------------------------------------------------------------

    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
        return self.engine_core.get_supported_tasks()

    async def get_supported_tasks_async(self) -> tuple[SupportedTask, ...]:
        return self.get_supported_tasks()

    def abort_requests(self, request_ids: list[str]) -> None:
        if request_ids:
            self.engine_core.abort_requests(request_ids)

    async def abort_requests_async(self, request_ids: list[str]) -> None:
        self.abort_requests(request_ids)

    def shutdown(self, timeout: float | None = None) -> None:
        """Stop the step thread, then shut the engine down.

        Args:
            timeout: maximum seconds to wait for the step thread to finish
                its current step; ``None`` waits until it has exited.
        """
        import threading
        self._stopped = True
        self._has_requests.set()  # wake the thread so it can see _stopped
        step_thread = self._step_thread
        if step_thread is not None and step_thread is not threading.current_thread():
            # Do not tear the engine down underneath a step in progress.
            step_thread.join(timeout=timeout)
        self.engine_core.shutdown()

    def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
        self.engine_core.profile(is_start, profile_prefix)

    async def profile_async(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
        self.profile(is_start, profile_prefix)

    def reset_mm_cache(self) -> None:
        self.engine_core.reset_mm_cache()

    async def reset_mm_cache_async(self) -> None:
        self.reset_mm_cache()

    def reset_prefix_cache(
        self, reset_running_requests: bool = False, reset_connector: bool = False
    ) -> bool:
        return self.engine_core.reset_prefix_cache(
            reset_running_requests, reset_connector
        )

    async def reset_prefix_cache_async(
        self, reset_running_requests: bool = False, reset_connector: bool = False
    ) -> bool:
        return self.reset_prefix_cache(reset_running_requests, reset_connector)

    def reset_encoder_cache(self) -> None:
        self.engine_core.reset_encoder_cache()

    async def reset_encoder_cache_async(self) -> None:
        self.reset_encoder_cache()

    def sleep(self, level: int = 1, mode: PauseMode = "abort") -> None:
        """Put the engine to sleep; the "wait" pause mode is rejected."""
        if mode == "wait":
            raise ValueError("'wait' pause mode is not supported in inproc-engine mode")
        result = self.engine_core.sleep(level, mode)
        assert result is None

    async def sleep_async(self, level: int = 1, mode: PauseMode = "abort") -> None:
        self.sleep(level, mode)

    def wake_up(self, tags: list[str] | None = None) -> None:
        self.engine_core.wake_up(tags)

    async def wake_up_async(self, tags: list[str] | None = None) -> None:
        self.wake_up(tags)

    def is_sleeping(self) -> bool:
        return self.engine_core.is_sleeping()

    async def is_sleeping_async(self) -> bool:
        return self.is_sleeping()

    def execute_dummy_batch(self) -> None:
        self.engine_core.execute_dummy_batch()

    async def execute_dummy_batch_async(self) -> None:
        self.execute_dummy_batch()

    def add_lora(self, lora_request: LoRARequest) -> bool:
        return self.engine_core.add_lora(lora_request)

    async def add_lora_async(self, lora_request: LoRARequest) -> bool:
        return self.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        return self.engine_core.remove_lora(lora_id)

    async def remove_lora_async(self, lora_id: int) -> bool:
        return self.remove_lora(lora_id)

    def list_loras(self) -> set[int]:
        return self.engine_core.list_loras()

    async def list_loras_async(self) -> set[int]:
        return self.list_loras()

    def pin_lora(self, lora_id: int) -> bool:
        return self.engine_core.pin_lora(lora_id)

    async def pin_lora_async(self, lora_id: int) -> bool:
        return self.pin_lora(lora_id)

    def save_sharded_state(
        self, path: str, pattern: str | None = None, max_size: int | None = None
    ) -> None:
        self.engine_core.save_sharded_state(path, pattern, max_size)

    async def save_sharded_state_async(
        self, path: str, pattern: str | None = None, max_size: int | None = None
    ) -> None:
        self.save_sharded_state(path, pattern, max_size)

    def collective_rpc(
        self,
        method: str | Callable[..., _R],
        timeout: float | None = None,
        args: tuple = (),
        kwargs: dict[str, Any] | None = None,
    ) -> list[_R]:
        return self.engine_core.collective_rpc(method, timeout, args, kwargs)

    async def collective_rpc_async(
        self,
        method: str | Callable[..., _R],
        timeout: float | None = None,
        args: tuple = (),
        kwargs: dict[str, Any] | None = None,
    ) -> list[_R]:
        return self.collective_rpc(method, timeout, args, kwargs)

    # Scheduler pause/resume are stubs in this client: pausing does nothing
    # and the scheduler is always reported as not paused.

    async def pause_scheduler_async(
        self, mode: PauseMode = "abort", clear_cache: bool = True
    ) -> None:
        pass

    async def resume_scheduler_async(self) -> None:
        pass

    async def is_scheduler_paused_async(self) -> bool:
        return False

    def dp_engines_running(self) -> bool:
        return False

entrypoints/launcher.py

async def serve_http(
    app: FastAPI,
    sock: socket.socket | None,
    enable_ssl_refresh: bool = False,
    **uvicorn_kwargs: Any,
):
    """
    Start a FastAPI app using Uvicorn, with support for custom Uvicorn config
    options.  Supports http header limits via h11_max_incomplete_event_size and
    h11_max_header_count.
    """
    logger.info("Available routes are:")
    # post endpoints
    for route in app.routes:
        methods = getattr(route, "methods", None)
        path = getattr(route, "path", None)

        if methods is None or path is None:
            continue

        logger.info("Route: %s, Methods: %s", path, ", ".join(methods))

    # other endpoints
    for route in app.routes:
        endpoint = getattr(route, "endpoint", None)
        methods = getattr(route, "methods", None)
        path = getattr(route, "path", None)

        if endpoint is None or path is None or methods is not None:
            continue

        logger.info("Route: %s, Endpoint: %s", path, endpoint.__name__)

    # Extract header limit options if present
    h11_max_incomplete_event_size = uvicorn_kwargs.pop(
        "h11_max_incomplete_event_size", None
    )
    h11_max_header_count = uvicorn_kwargs.pop("h11_max_header_count", None)

    # Set safe defaults if not provided
    if h11_max_incomplete_event_size is None:
        h11_max_incomplete_event_size = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
    if h11_max_header_count is None:
        h11_max_header_count = H11_MAX_HEADER_COUNT_DEFAULT

    config = uvicorn.Config(app, **uvicorn_kwargs)
    # Set header limits
    config.h11_max_incomplete_event_size = h11_max_incomplete_event_size
    config.h11_max_header_count = h11_max_header_count
    config.load()
    server = uvicorn.Server(config)
    app.state.server = server

    loop = asyncio.get_running_loop()

    watchdog_task = loop.create_task(watchdog_loop(server, app.state.engine_client))
    server_task = loop.create_task(server.serve(sockets=[sock] if sock else None))

    ssl_cert_refresher = (
        None
        if not enable_ssl_refresh
        else SSLCertRefresher(
            ssl_context=config.ssl,
            key_path=config.ssl_keyfile,
            cert_path=config.ssl_certfile,
            ca_path=config.ssl_ca_certs,
        )
    )

    shutdown_event = asyncio.Event()

    def signal_handler() -> None:
        shutdown_event.set()

    async def dummy_shutdown() -> None:
        pass

    # Windows patch: the unconditional loop.add_signal_handler() calls that
    # used to be here raise NotImplementedError on Windows, so they are now
    # limited to the other platforms.
    import sys
    if sys.platform != "win32":
        loop.add_signal_handler(signal.SIGINT, signal_handler)
        loop.add_signal_handler(signal.SIGTERM, signal_handler)
    else:
        # Fall back to signal.signal(). Such a handler must not touch the
        # event loop directly (signal_handler sets an asyncio.Event), so it
        # only schedules the real handler on the loop.
        def _win_signal_handler(signum: int, frame: Any) -> None:
            loop.call_soon_threadsafe(signal_handler)

        signal.signal(signal.SIGINT, _win_signal_handler)
        signal.signal(signal.SIGTERM, _win_signal_handler)

在这里插入图片描述