vLLM 在 Windows 上的安装与部署

vLLM 0.19.1 在 Windows 上的安装与运行技术报告

环境信息

  • OS: Windows 10/11
  • GPU: NVIDIA Quadro RTX 3000 with Max-Q Design(计算能力 7.5,6GB VRAM)
  • CUDA: 12.6
  • Python: 3.x(Miniconda 环境)
  • vLLM 版本: 0.19.1+cu126
  • 模型: Qwen/Qwen3-0.6B

一、背景

vLLM 官方主要面向 Linux 环境开发,Windows 支持属于社区实验性质。直接在 Windows 上运行 vllm serve 会遇到多个兼容性问题,需要手动修改源码打补丁。本报告记录了完整的 Bug 排查过程和最终可用的修复方案。


二、安装步骤

# 创建 conda 环境
conda create -n vllm python=3.10
conda activate vllm

# 安装 vLLM(CUDA 12.6 版本)
pip install vllm==0.19.1+cu126 --extra-index-url https://download.pytorch.org/whl/cu126

三、Bug记录与修复

Bug 1:ZMQError: not a socket

错误信息

zmq.error.ZMQError: not a socket
  File "vllm\v1\engine\utils.py", line 1039, in wait_for_engine_startup
    events = poller.poll(STARTUP_POLL_PERIOD_MS)

原因

vLLM V1 引擎默认使用多进程模式(AsyncMPClient),通过 ZMQ 跨进程通信。Windows 使用 spawn 方式创建子进程,父进程中创建的 ZMQ socket 无法被子进程继承,导致 “not a socket” 错误。

修复方法

设置环境变量禁用多进程模式:

setx VLLM_ENABLE_V1_MULTIPROCESSING 0

⚠️ 注意:setx 写入的是持久化的用户环境变量,对当前已经打开的 CMD 窗口不生效,必须重新打开 CMD 窗口后才能读到。如果只想临时生效,也可以在当前窗口执行 set VLLM_ENABLE_V1_MULTIPROCESSING=0,然后在同一个窗口中运行 vllm serve:由该窗口启动的进程及其子进程都会继承这个变量,但关闭窗口后即失效。


Bug 2:AttributeError: ‘InprocClient’ object has no attribute ‘engine_ranks_managed’

错误信息

AttributeError: 'InprocClient' object has no attribute 'engine_ranks_managed'
  File "vllm\v1\engine\async_llm.py", line 168, in __init__
    engine_idxs=self.engine_core.engine_ranks_managed,

原因

禁用多进程后走 InprocClient 路径,但 InprocClient 是为同步的 LLM 类设计的,缺少 AsyncLLM 所需的 engine_ranks_managed 属性。

修复位置

C:\Users\sheng\miniconda3\envs\vllm\lib\site-packages\vllm\v1\engine\core_client.py

修复方法

在 InprocClient.__init__ 中添加该属性(见 Bug 3 的完整代码)。


Bug 3:NotImplementedError — InprocClient 缺少所有 async 方法

错误信息

NotImplementedError
  File "core_client.py", line 221, in get_output_async
  File "core_client.py", line 235, in reset_mm_cache_async

原因

AsyncLLM 调用的全是 _async 后缀的方法,而 InprocClient 只实现了同步版本,父类 EngineCoreClient 中的 async 方法全部 raise NotImplementedError。

修复方法

用以下完整代码替换 core_client.py 中的整个 InprocClient 类:

class InprocClient(EngineCoreClient):
    """
    InprocClient: Windows-compatible variant that runs EngineCore in-process.

    Every ``*_async`` method called by AsyncLLM is implemented by delegating
    to its synchronous counterpart. For streaming output, a background thread
    steps the engine and hands each result to the event loop through an
    asyncio.Queue.

    NOTE(review): engine_core is used from both the event-loop thread
    (add_request, abort_requests, ...) and the step thread without any
    locking -- confirm this is safe for the intended workload.
    """

    # Upper bound (seconds) on waiting for the step thread during shutdown
    # when the caller does not pass an explicit timeout.
    _JOIN_TIMEOUT_S = 5.0

    def __init__(self, *args, **kwargs):
        import asyncio
        import threading
        self.engine_core = EngineCore(*args, **kwargs)
        # Expose a single engine with index 0.
        self.engine_ranks_managed = [0]
        # Set while the step thread should keep stepping the engine.
        self._has_requests = threading.Event()
        # Queue and loop are bound lazily by the first get_output_async().
        self._output_queue: asyncio.Queue | None = None
        self._loop: asyncio.AbstractEventLoop | None = None
        self._step_thread: threading.Thread | None = None
        self._stopped = False

    def _ensure_step_thread(self):
        """Start the background step thread; later calls are no-ops."""
        if self._step_thread is not None:
            return
        import threading
        self._step_thread = threading.Thread(
            target=self._step_loop,
            name="InprocClientStepThread",
            daemon=True,
        )
        self._step_thread.start()

    def _engine_has_work(self, result) -> bool:
        """Return True if the engine should be stepped again right away.

        Uses ``engine_core.scheduler.has_requests()`` when the engine exposes
        it; otherwise falls back to the scheduler stats attached to *result*
        (which may be absent, e.g. when stats are not collected).
        """
        scheduler = getattr(self.engine_core, "scheduler", None)
        has_requests = getattr(scheduler, "has_requests", None)
        if callable(has_requests):
            return bool(has_requests())
        stats = getattr(result, "scheduler_stats", None)
        if stats is None:
            # Nothing to consult: assume more work only if this step produced
            # output; the wait() timeout in _step_loop covers a wrong guess.
            return bool(getattr(result, "outputs", None))
        running = getattr(stats, "num_running_reqs", 0)
        waiting = getattr(stats, "num_waiting_reqs", 0)
        return (running + waiting) > 0

    def _publish(self, item) -> None:
        """Hand *item* (a result or an exception) to the asyncio side."""
        loop, out_queue = self._loop, self._output_queue
        if loop is None or out_queue is None:
            return
        try:
            # asyncio.Queue is not thread-safe, so the put is scheduled on
            # the loop's own thread.
            loop.call_soon_threadsafe(out_queue.put_nowait, item)
        except RuntimeError:
            # The event loop is already closed (shutdown in progress);
            # there is nobody left to deliver to.
            pass

    def _step_loop(self):
        """Background thread body: step the engine while there is work."""
        while not self._stopped:
            # The timeout doubles as a safety net: even if a wake-up were
            # missed, the engine is stepped again within one second.
            self._has_requests.wait(timeout=1.0)
            if self._stopped:
                break
            try:
                outputs, model_executed = self.engine_core.step_fn()
                self.engine_core.post_step(model_executed=model_executed)
            except Exception as exc:
                # Surface the failure through get_output_async() instead of
                # letting this thread die silently and the consumer hang.
                self._publish(exc)
                break
            result = outputs.get(0) if outputs else None
            # Clear first and re-check afterwards, so that a request added
            # concurrently by add_request() cannot have its wake-up erased.
            self._has_requests.clear()
            if self._engine_has_work(result):
                self._has_requests.set()
            if result is not None and (result.outputs or result.scheduler_stats):
                self._publish(result)

    async def get_output_async(self) -> EngineCoreOutputs:
        """Wait for the next engine output produced by the step thread.

        Raises:
            Exception: whatever the engine step raised in the step thread.
        """
        import asyncio
        if self._output_queue is None:
            # First call: bind to the running loop and start stepping.
            self._output_queue = asyncio.Queue()
            self._loop = asyncio.get_running_loop()
            self._ensure_step_thread()
        item = await self._output_queue.get()
        if isinstance(item, Exception):
            raise item
        return item

    def add_request(self, request: EngineCoreRequest) -> None:
        req, request_wave = self.engine_core.preprocess_add_request(request)
        self.engine_core.add_request(req, request_wave)
        # Wake the step thread.
        self._has_requests.set()

    # The *_async methods below run their synchronous counterpart inline on
    # the calling (event-loop) thread.

    async def add_request_async(self, request: EngineCoreRequest) -> None:
        self.add_request(request)

    def get_output(self) -> EngineCoreOutputs:
        """Synchronously run one engine step and return its output."""
        outputs, model_executed = self.engine_core.step_fn()
        self.engine_core.post_step(model_executed=model_executed)
        return (outputs.get(0) if outputs else None) or EngineCoreOutputs()

    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
        return self.engine_core.get_supported_tasks()

    async def get_supported_tasks_async(self) -> tuple[SupportedTask, ...]:
        return self.get_supported_tasks()

    def abort_requests(self, request_ids: list[str]) -> None:
        if request_ids:
            self.engine_core.abort_requests(request_ids)

    async def abort_requests_async(self, request_ids: list[str]) -> None:
        self.abort_requests(request_ids)

    def shutdown(self, timeout: float | None = None) -> None:
        """Stop the step thread, then shut the engine down.

        Args:
            timeout: maximum seconds to wait for the step thread to finish
                its current step; ``None`` uses ``_JOIN_TIMEOUT_S``.
        """
        import threading
        self._stopped = True
        self._has_requests.set()  # wake the thread so it can exit
        step_thread = self._step_thread
        if step_thread is not None and step_thread is not threading.current_thread():
            # Avoid tearing the engine down in the middle of a step.
            step_thread.join(
                timeout=self._JOIN_TIMEOUT_S if timeout is None else timeout
            )
        self.engine_core.shutdown()

    def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
        self.engine_core.profile(is_start, profile_prefix)

    async def profile_async(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
        self.profile(is_start, profile_prefix)

    def reset_mm_cache(self) -> None:
        self.engine_core.reset_mm_cache()

    async def reset_mm_cache_async(self) -> None:
        self.reset_mm_cache()

    def reset_prefix_cache(
        self, reset_running_requests: bool = False, reset_connector: bool = False
    ) -> bool:
        return self.engine_core.reset_prefix_cache(reset_running_requests, reset_connector)

    async def reset_prefix_cache_async(
        self, reset_running_requests: bool = False, reset_connector: bool = False
    ) -> bool:
        return self.reset_prefix_cache(reset_running_requests, reset_connector)

    def reset_encoder_cache(self) -> None:
        self.engine_core.reset_encoder_cache()

    async def reset_encoder_cache_async(self) -> None:
        self.reset_encoder_cache()

    def sleep(self, level: int = 1, mode: PauseMode = "abort") -> None:
        if mode == "wait":
            raise ValueError("'wait' pause mode is not supported in inproc-engine mode")
        result = self.engine_core.sleep(level, mode)
        assert result is None

    async def sleep_async(self, level: int = 1, mode: PauseMode = "abort") -> None:
        self.sleep(level, mode)

    def wake_up(self, tags: list[str] | None = None) -> None:
        self.engine_core.wake_up(tags)

    async def wake_up_async(self, tags: list[str] | None = None) -> None:
        self.wake_up(tags)

    def is_sleeping(self) -> bool:
        return self.engine_core.is_sleeping()

    async def is_sleeping_async(self) -> bool:
        return self.is_sleeping()

    def execute_dummy_batch(self) -> None:
        self.engine_core.execute_dummy_batch()

    async def execute_dummy_batch_async(self) -> None:
        self.execute_dummy_batch()

    def add_lora(self, lora_request: LoRARequest) -> bool:
        return self.engine_core.add_lora(lora_request)

    async def add_lora_async(self, lora_request: LoRARequest) -> bool:
        return self.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        return self.engine_core.remove_lora(lora_id)

    async def remove_lora_async(self, lora_id: int) -> bool:
        return self.remove_lora(lora_id)

    def list_loras(self) -> set[int]:
        return self.engine_core.list_loras()

    async def list_loras_async(self) -> set[int]:
        return self.list_loras()

    def pin_lora(self, lora_id: int) -> bool:
        return self.engine_core.pin_lora(lora_id)

    async def pin_lora_async(self, lora_id: int) -> bool:
        return self.pin_lora(lora_id)

    def save_sharded_state(
        self, path: str, pattern: str | None = None, max_size: int | None = None
    ) -> None:
        self.engine_core.save_sharded_state(path, pattern, max_size)

    async def save_sharded_state_async(
        self, path: str, pattern: str | None = None, max_size: int | None = None
    ) -> None:
        self.save_sharded_state(path, pattern, max_size)

    def collective_rpc(
        self,
        method: str | Callable[..., _R],
        timeout: float | None = None,
        args: tuple = (),
        kwargs: dict[str, Any] | None = None,
    ) -> list[_R]:
        return self.engine_core.collective_rpc(method, timeout, args, kwargs)

    async def collective_rpc_async(
        self,
        method: str | Callable[..., _R],
        timeout: float | None = None,
        args: tuple = (),
        kwargs: dict[str, Any] | None = None,
    ) -> list[_R]:
        return self.collective_rpc(method, timeout, args, kwargs)

    # Scheduler pausing is not implemented by this client: pause/resume are
    # accepted as no-ops and the scheduler is always reported as not paused.

    async def pause_scheduler_async(
        self, mode: PauseMode = "abort", clear_cache: bool = True
    ) -> None:
        pass

    async def resume_scheduler_async(self) -> None:
        pass

    async def is_scheduler_paused_async(self) -> bool:
        return False

    def dp_engines_running(self) -> bool:
        return False

Bug 4:NotImplementedError — loop.add_signal_handler 不支持

错误信息

NotImplementedError
  File "vllm\entrypoints\launcher.py", line 103, in serve_http
    loop.add_signal_handler(signal.SIGINT, signal_handler)

原因

Windows 不支持 UNIX 信号机制,asyncio 的 loop.add_signal_handler() 在 Windows 上会抛出 NotImplementedError。

修复位置

C:\Users\sheng\miniconda3\envs\vllm\lib\site-packages\vllm\entrypoints\launcher.py

修复方法

找到 add_signal_handler 相关代码,改为:

import sys
if sys.platform != "win32":
    loop.add_signal_handler(signal.SIGINT, signal_handler)
    loop.add_signal_handler(signal.SIGTERM, signal_handler)
else:
    # Windows event loops do not implement add_signal_handler(), so fall
    # back to signal.signal(). A plain signal handler must not touch asyncio
    # objects directly; hand the callback to the loop thread-safely, which
    # also wakes the loop if it is blocked waiting for I/O.
    def _on_signal(signum, frame):
        loop.call_soon_threadsafe(signal_handler)

    signal.signal(signal.SIGINT, _on_signal)
    signal.signal(signal.SIGTERM, _on_signal)

Bug 5:AttributeError: ‘InprocClient’ object has no attribute ‘resources’

错误信息

AttributeError: 'InprocClient' object has no attribute 'resources'
  File "vllm\v1\engine\async_llm.py", line 1027, in errored
    return self.engine_core.resources.engine_dead or not self.is_running

原因

AsyncLLM.errored 属性会访问 engine_core.resources.engine_dead,但 InprocClient 没有 resources 对象。

修复方法

InprocClient.__init__ 中加入 _FakeResources,同时补上 ensure_alive 方法:

def __init__(self, *args, **kwargs):
    """Create the in-process engine and the state used by the step thread."""
    import asyncio
    import threading

    # Windows compatibility: AsyncLLM reads ``resources.engine_dead``, so
    # provide a minimal stand-in object that always reports a live engine.
    class _FakeResources:
        engine_dead = False

    self.engine_core = EngineCore(*args, **kwargs)
    self.engine_ranks_managed = [0]
    self.resources = _FakeResources()

    # Step-thread plumbing; the queue and loop are filled in later.
    self._stopped = False
    self._has_requests = threading.Event()
    self._step_thread: threading.Thread | None = None
    self._loop: asyncio.AbstractEventLoop | None = None
    self._output_queue: asyncio.Queue | None = None

def ensure_alive(self):
    """Liveness check that never fails: InprocClient is always alive."""
    return None

四、启动命令

所有补丁打完后,用以下命令启动服务:

vllm serve Qwen/Qwen3-0.6B --max-model-len 128 --max-num-seqs 1 --gpu-memory-utilization 0.80

启动成功标志:

INFO:     Application startup complete.

五、测试请求

Windows CMD 不把单引号当作引号处理,因此 curl 的 JSON 参数需要用双引号包裹并对内部双引号转义,或者改用文件方式:

方式一:转义双引号(CMD)

curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d "{\"model\":\"Qwen/Qwen3-0.6B\",\"messages\":[{\"role\":\"user\",\"content\":\"你好\"}]}"

方式二:使用 JSON 文件(推荐)

新建 req.json

{
  "model": "Qwen/Qwen3-0.6B",
  "messages": [{"role": "user", "content": "你好"}]
}
curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d @req.json

方式三:使用 PowerShell(在 Windows PowerShell 5.1 中 curl 是 Invoke-WebRequest 的别名;PowerShell 7 中请直接写 Invoke-WebRequest)

curl -Uri http://localhost:8000/v1/chat/completions `
  -Method POST `
  -Headers @{"Content-Type"="application/json"} `
  -Body '{"model": "Qwen/Qwen3-0.6B", "messages": [{"role":"user", "content":"你好"}]}' `
  -UseBasicParsing

或者使用 PowerShell 原生的 Invoke-RestMethod

Invoke-RestMethod -Uri http://localhost:8000/v1/chat/completions `
  -Method POST `
  -Headers @{"Content-Type"="application/json"} `
  -Body '{"model": "Qwen/Qwen3-0.6B", "messages": [{"role":"user", "content":"你好"}]}' `
  -UseBasicParsing

方式四:使用 Python 脚本

import requests

# Send one chat-completion request to the local vLLM server and print the
# decoded JSON reply.
response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    headers={"Content-Type": "application/json"},
    json={
        "model": "Qwen/Qwen3-0.6B",
        "messages": [{"role": "user", "content": "你好"}],
        "temperature": 0.6,
        "top_p": 0.95,
    },
    # Without a timeout, requests waits forever if the server stalls.
    timeout=120,
)
# Fail loudly on an HTTP error status instead of printing an error body.
response.raise_for_status()
print(response.json())

六、修改文件汇总

文件路径 | 修改内容
vllm\v1\engine\core_client.py | 重写 InprocClient 类(补全所有 async 方法,添加后台步进线程);在 make_client / make_async_mp_client 方法中增加对 win32 的判断
vllm\entrypoints\launcher.py | 替换 add_signal_handler 为 Windows 兼容写法
系统环境变量 | 添加 VLLM_ENABLE_V1_MULTIPROCESSING=0

七、架构说明

Windows 补丁后的运行架构如下:

┌─────────────────────────────────────────────┐
│                API Server (主进程)            │
│                                              │
│  FastAPI / uvicorn                           │
│       │                                      │
│  AsyncLLM                                    │
│       │                                      │
│  InprocClient (补丁版)                        │
│  ┌────────────────────────────────────┐      │
│  │  asyncio 事件循环                   │      │
│  │    output_handler ──► Queue.get()  │      │
│  │                          ▲         │      │
│  │  后台线程 (StepThread)    │         │      │
│  │    step_fn() ────────► Queue       │      │
│  │    (有请求时才运行)                  │      │
│  └────────────────────────────────────┘      │
│       │                                      │
│  EngineCore (同进程,GPU 推理)                 │
└─────────────────────────────────────────────┘

原生 Linux 架构是多进程通过 ZMQ 通信,Windows 补丁将其改为单进程内后台线程 + asyncio Queue 的方式,规避了 Windows 的多进程和信号限制。


八、注意事项

  1. 性能:单进程模式下引擎占用主进程资源,高并发性能不如 Linux 多进程模式。
  2. 并发限制:InprocClient 的步进线程是单线程的,--max-num-seqs 1 适合测试,生产环境不建议使用此方案。
  3. 升级风险:以上修改针对 vLLM 0.19.1,升级版本后需重新验证兼容性。
  4. 推荐方案:生产环境建议使用 WSL2(Windows Subsystem for Linux)运行 vLLM,可完全规避上述所有问题。

九、快速复现检查清单

  • setx VLLM_ENABLE_V1_MULTIPROCESSING 0,重新打开 CMD 验证
  • 修改 core_client.py:替换 InprocClient 完整类
  • 修改 launcher.py:替换 add_signal_handler 为 Windows 兼容写法
  • 运行 vllm serve 命令,等待 Application startup complete
  • 用 curl -d @req.json 的方式发送测试请求,验证推理正常

十、改动代码附录

v1/engine/core_client.py

class EngineCoreClient(ABC):
    """
    EngineCoreClient: subclasses handle different methods for pushing
        and pulling from the EngineCore for asyncio / multiprocessing.

    Subclasses:
    * InprocClient: In process EngineCore (for V0-style LLMEngine use;
        also used as the fallback for every mode on Windows)
    * SyncMPClient: ZMQ + background proc EngineCore (for LLM)
    * AsyncMPClient: ZMQ + background proc EngineCore w/ asyncio (for AsyncLLM)
    """

    @staticmethod
    def make_client(
        multiprocess_mode: bool,
        asyncio_mode: bool,
        vllm_config: VllmConfig,
        executor_class: type[Executor],
        log_stats: bool,
    ) -> "EngineCoreClient":
        """Build the client matching the requested process / asyncio mode.

        Raises:
            NotImplementedError: if asyncio mode is requested without
                multiprocessing.
        """
        # Imported locally, like in make_async_mp_client, so this method does
        # not depend on a module-level ``import sys`` being present.
        import sys

        # TODO: support this for debugging purposes.
        if asyncio_mode and not multiprocess_mode:
            raise NotImplementedError(
                "Running EngineCore in asyncio without multiprocessing "
                "is not currently supported."
            )

        if multiprocess_mode and asyncio_mode:
            return EngineCoreClient.make_async_mp_client(
                vllm_config, executor_class, log_stats
            )

        # === modified code ===
        if multiprocess_mode and not asyncio_mode:
            if sys.platform == "win32":
                # Windows doesn't support fork or IPC sockets reliably,
                # fall back to in-process engine core
                return InprocClient(vllm_config, executor_class, log_stats)
            return SyncMPClient(vllm_config, executor_class, log_stats)

        return InprocClient(vllm_config, executor_class, log_stats)

    @staticmethod
    @instrument(span_name="Overall Loading")
    def make_async_mp_client(
        vllm_config: VllmConfig,
        executor_class: type[Executor],
        log_stats: bool,
        client_addresses: dict[str, str] | None = None,
        client_count: int = 1,
        client_index: int = 0,
    ) -> "EngineCoreClient":
        """Build the asyncio client for AsyncLLM.

        Returns an AsyncMPClient (or one of its data-parallel variants),
        except on Windows, where an InprocClient is returned instead -- hence
        the EngineCoreClient return annotation.
        """
        # Windows workaround: force inproc mode
        import sys
        if sys.platform == "win32":
            return InprocClient(vllm_config, executor_class, log_stats)

        parallel_config = vllm_config.parallel_config
        client_args = (
            vllm_config,
            executor_class,
            log_stats,
            client_addresses,
            client_count,
            client_index,
        )
        if parallel_config.data_parallel_size > 1:
            if parallel_config.data_parallel_external_lb:
                # External load balancer - client per DP rank.
                return DPAsyncMPClient(*client_args)
            # Internal load balancer - client balances to all DP ranks.
            return DPLBAsyncMPClient(*client_args)
        return AsyncMPClient(*client_args)
    
class InprocClient(EngineCoreClient):
    """
    InprocClient: Windows-compatible variant that runs EngineCore in-process.

    Every ``*_async`` method called by AsyncLLM is implemented by delegating
    to its synchronous counterpart. For streaming output, a background thread
    steps the engine and hands each result to the event loop through an
    asyncio.Queue.

    NOTE(review): engine_core is used from both the event-loop thread
    (add_request, abort_requests, ...) and the step thread without any
    locking -- confirm this is safe for the intended workload.
    """

    # Upper bound (seconds) on waiting for the step thread during shutdown
    # when the caller does not pass an explicit timeout.
    _JOIN_TIMEOUT_S = 5.0

    # === modified code ===
    def __init__(self, *args, **kwargs):
        import asyncio
        import threading
        self.engine_core = EngineCore(*args, **kwargs)
        # Expose a single engine with index 0.
        self.engine_ranks_managed = [0]
        # Set while the step thread should keep stepping the engine.
        self._has_requests = threading.Event()
        # Queue and loop are bound lazily by the first get_output_async().
        self._output_queue: asyncio.Queue | None = None
        self._loop: asyncio.AbstractEventLoop | None = None
        self._step_thread: threading.Thread | None = None
        self._stopped = False

        # Windows compatibility: AsyncLLM reads resources.engine_dead.
        class _FakeResources:
            engine_dead = False
        self.resources = _FakeResources()

    def ensure_alive(self):
        """Liveness check; a no-op for the in-process client."""
        pass

    def _ensure_step_thread(self):
        """Start the background step thread; later calls are no-ops."""
        if self._step_thread is not None:
            return
        import threading
        self._step_thread = threading.Thread(
            target=self._step_loop,
            name="InprocClientStepThread",
            daemon=True,
        )
        self._step_thread.start()

    def _engine_has_work(self, result) -> bool:
        """Return True if the engine should be stepped again right away.

        Uses ``engine_core.scheduler.has_requests()`` when the engine exposes
        it; otherwise falls back to the scheduler stats attached to *result*
        (which may be absent, e.g. when stats are not collected).
        """
        scheduler = getattr(self.engine_core, "scheduler", None)
        has_requests = getattr(scheduler, "has_requests", None)
        if callable(has_requests):
            return bool(has_requests())
        stats = getattr(result, "scheduler_stats", None)
        if stats is None:
            # Nothing to consult: assume more work only if this step produced
            # output; the wait() timeout in _step_loop covers a wrong guess.
            return bool(getattr(result, "outputs", None))
        running = getattr(stats, "num_running_reqs", 0)
        waiting = getattr(stats, "num_waiting_reqs", 0)
        return (running + waiting) > 0

    def _publish(self, item) -> None:
        """Hand *item* (a result or an exception) to the asyncio side."""
        loop, out_queue = self._loop, self._output_queue
        if loop is None or out_queue is None:
            return
        try:
            # asyncio.Queue is not thread-safe, so the put is scheduled on
            # the loop's own thread.
            loop.call_soon_threadsafe(out_queue.put_nowait, item)
        except RuntimeError:
            # The event loop is already closed (shutdown in progress);
            # there is nobody left to deliver to.
            pass

    def _step_loop(self):
        """Background thread body: step the engine while there is work."""
        while not self._stopped:
            # The timeout doubles as a safety net: even if a wake-up were
            # missed, the engine is stepped again within one second.
            self._has_requests.wait(timeout=1.0)
            if self._stopped:
                break
            try:
                outputs, model_executed = self.engine_core.step_fn()
                self.engine_core.post_step(model_executed=model_executed)
            except Exception as exc:
                # Report the engine as dead and surface the failure through
                # get_output_async() instead of dying silently.
                self.resources.engine_dead = True
                self._publish(exc)
                break
            result = outputs.get(0) if outputs else None
            # Clear first and re-check afterwards, so that a request added
            # concurrently by add_request() cannot have its wake-up erased.
            self._has_requests.clear()
            if self._engine_has_work(result):
                self._has_requests.set()
            if result is not None and (result.outputs or result.scheduler_stats):
                self._publish(result)

    async def get_output_async(self) -> EngineCoreOutputs:
        """Wait for the next engine output produced by the step thread.

        Raises:
            Exception: whatever the engine step raised in the step thread.
        """
        import asyncio
        if self._output_queue is None:
            # First call: bind to the running loop and start stepping.
            self._output_queue = asyncio.Queue()
            self._loop = asyncio.get_running_loop()
            self._ensure_step_thread()
        item = await self._output_queue.get()
        if isinstance(item, Exception):
            raise item
        return item

    def add_request(self, request: EngineCoreRequest) -> None:
        req, request_wave = self.engine_core.preprocess_add_request(request)
        self.engine_core.add_request(req, request_wave)
        # Wake the step thread.
        self._has_requests.set()

    # The *_async methods below run their synchronous counterpart inline on
    # the calling (event-loop) thread.

    async def add_request_async(self, request: EngineCoreRequest) -> None:
        self.add_request(request)

    def get_output(self) -> EngineCoreOutputs:
        """Synchronously run one engine step and return its output."""
        outputs, model_executed = self.engine_core.step_fn()
        self.engine_core.post_step(model_executed=model_executed)
        return (outputs.get(0) if outputs else None) or EngineCoreOutputs()

    def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
        return self.engine_core.get_supported_tasks()

    async def get_supported_tasks_async(self) -> tuple[SupportedTask, ...]:
        return self.get_supported_tasks()

    def abort_requests(self, request_ids: list[str]) -> None:
        if request_ids:
            self.engine_core.abort_requests(request_ids)

    async def abort_requests_async(self, request_ids: list[str]) -> None:
        self.abort_requests(request_ids)

    def shutdown(self, timeout: float | None = None) -> None:
        """Stop the step thread, then shut the engine down.

        Args:
            timeout: maximum seconds to wait for the step thread to finish
                its current step; ``None`` uses ``_JOIN_TIMEOUT_S``.
        """
        import threading
        self._stopped = True
        self._has_requests.set()  # wake the thread so it can exit
        step_thread = self._step_thread
        if step_thread is not None and step_thread is not threading.current_thread():
            # Avoid tearing the engine down in the middle of a step.
            step_thread.join(
                timeout=self._JOIN_TIMEOUT_S if timeout is None else timeout
            )
        self.engine_core.shutdown()

    def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
        self.engine_core.profile(is_start, profile_prefix)

    async def profile_async(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
        self.profile(is_start, profile_prefix)

    def reset_mm_cache(self) -> None:
        self.engine_core.reset_mm_cache()

    async def reset_mm_cache_async(self) -> None:
        self.reset_mm_cache()

    def reset_prefix_cache(
        self, reset_running_requests: bool = False, reset_connector: bool = False
    ) -> bool:
        return self.engine_core.reset_prefix_cache(
            reset_running_requests, reset_connector
        )

    async def reset_prefix_cache_async(
        self, reset_running_requests: bool = False, reset_connector: bool = False
    ) -> bool:
        return self.reset_prefix_cache(reset_running_requests, reset_connector)

    def reset_encoder_cache(self) -> None:
        self.engine_core.reset_encoder_cache()

    async def reset_encoder_cache_async(self) -> None:
        self.reset_encoder_cache()

    def sleep(self, level: int = 1, mode: PauseMode = "abort") -> None:
        if mode == "wait":
            raise ValueError("'wait' pause mode is not supported in inproc-engine mode")
        result = self.engine_core.sleep(level, mode)
        assert result is None

    async def sleep_async(self, level: int = 1, mode: PauseMode = "abort") -> None:
        self.sleep(level, mode)

    def wake_up(self, tags: list[str] | None = None) -> None:
        self.engine_core.wake_up(tags)

    async def wake_up_async(self, tags: list[str] | None = None) -> None:
        self.wake_up(tags)

    def is_sleeping(self) -> bool:
        return self.engine_core.is_sleeping()

    async def is_sleeping_async(self) -> bool:
        return self.is_sleeping()

    def execute_dummy_batch(self) -> None:
        self.engine_core.execute_dummy_batch()

    async def execute_dummy_batch_async(self) -> None:
        self.execute_dummy_batch()

    def add_lora(self, lora_request: LoRARequest) -> bool:
        return self.engine_core.add_lora(lora_request)

    async def add_lora_async(self, lora_request: LoRARequest) -> bool:
        return self.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        return self.engine_core.remove_lora(lora_id)

    async def remove_lora_async(self, lora_id: int) -> bool:
        return self.remove_lora(lora_id)

    def list_loras(self) -> set[int]:
        return self.engine_core.list_loras()

    async def list_loras_async(self) -> set[int]:
        return self.list_loras()

    def pin_lora(self, lora_id: int) -> bool:
        return self.engine_core.pin_lora(lora_id)

    async def pin_lora_async(self, lora_id: int) -> bool:
        return self.pin_lora(lora_id)

    def save_sharded_state(
        self, path: str, pattern: str | None = None, max_size: int | None = None
    ) -> None:
        self.engine_core.save_sharded_state(path, pattern, max_size)

    async def save_sharded_state_async(
        self, path: str, pattern: str | None = None, max_size: int | None = None
    ) -> None:
        self.save_sharded_state(path, pattern, max_size)

    def collective_rpc(
        self,
        method: str | Callable[..., _R],
        timeout: float | None = None,
        args: tuple = (),
        kwargs: dict[str, Any] | None = None,
    ) -> list[_R]:
        return self.engine_core.collective_rpc(method, timeout, args, kwargs)

    async def collective_rpc_async(
        self,
        method: str | Callable[..., _R],
        timeout: float | None = None,
        args: tuple = (),
        kwargs: dict[str, Any] | None = None,
    ) -> list[_R]:
        return self.collective_rpc(method, timeout, args, kwargs)

    # Scheduler pausing is not implemented by this client: pause/resume are
    # accepted as no-ops and the scheduler is always reported as not paused.

    async def pause_scheduler_async(
        self, mode: PauseMode = "abort", clear_cache: bool = True
    ) -> None:
        pass

    async def resume_scheduler_async(self) -> None:
        pass

    async def is_scheduler_paused_async(self) -> bool:
        return False

    def dp_engines_running(self) -> bool:
        return False

entrypoints/launcher.py

async def serve_http(
    app: FastAPI,
    sock: socket.socket | None,
    enable_ssl_refresh: bool = False,
    **uvicorn_kwargs: Any,
):
    """
    Start a FastAPI app using Uvicorn, with support for custom Uvicorn config
    options.  Supports http header limits via h11_max_incomplete_event_size and
    h11_max_header_count.

    SIGINT / SIGTERM set an internal shutdown event. On Windows, where the
    event loop has no add_signal_handler(), the handlers are installed with
    signal.signal() and forwarded to the loop thread-safely.
    """
    logger.info("Available routes are:")
    # post endpoints
    for route in app.routes:
        methods = getattr(route, "methods", None)
        path = getattr(route, "path", None)

        if methods is None or path is None:
            continue

        logger.info("Route: %s, Methods: %s", path, ", ".join(methods))

    # other endpoints
    for route in app.routes:
        endpoint = getattr(route, "endpoint", None)
        methods = getattr(route, "methods", None)
        path = getattr(route, "path", None)

        if endpoint is None or path is None or methods is not None:
            continue

        logger.info("Route: %s, Endpoint: %s", path, endpoint.__name__)

    # Extract header limit options if present
    h11_max_incomplete_event_size = uvicorn_kwargs.pop(
        "h11_max_incomplete_event_size", None
    )
    h11_max_header_count = uvicorn_kwargs.pop("h11_max_header_count", None)

    # Set safe defaults if not provided
    if h11_max_incomplete_event_size is None:
        h11_max_incomplete_event_size = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
    if h11_max_header_count is None:
        h11_max_header_count = H11_MAX_HEADER_COUNT_DEFAULT

    config = uvicorn.Config(app, **uvicorn_kwargs)
    # Set header limits
    config.h11_max_incomplete_event_size = h11_max_incomplete_event_size
    config.h11_max_header_count = h11_max_header_count
    config.load()
    server = uvicorn.Server(config)
    app.state.server = server

    loop = asyncio.get_running_loop()

    watchdog_task = loop.create_task(watchdog_loop(server, app.state.engine_client))
    server_task = loop.create_task(server.serve(sockets=[sock] if sock else None))

    ssl_cert_refresher = (
        None
        if not enable_ssl_refresh
        else SSLCertRefresher(
            ssl_context=config.ssl,
            key_path=config.ssl_keyfile,
            cert_path=config.ssl_certfile,
            ca_path=config.ssl_ca_certs,
        )
    )

    shutdown_event = asyncio.Event()

    def signal_handler() -> None:
        shutdown_event.set()

    async def dummy_shutdown() -> None:
        pass

    import sys
    if sys.platform != "win32":
        loop.add_signal_handler(signal.SIGINT, signal_handler)
        loop.add_signal_handler(signal.SIGTERM, signal_handler)
    else:
        # Windows event loops do not implement add_signal_handler(), so fall
        # back to signal.signal(). A plain signal handler must not touch
        # asyncio objects directly; hand the callback to the loop
        # thread-safely, which also wakes the loop if it is blocked on I/O.
        def _on_signal(signum, frame):
            loop.call_soon_threadsafe(signal_handler)

        signal.signal(signal.SIGINT, _on_signal)
        signal.signal(signal.SIGTERM, _on_signal)

    # NOTE(review): this excerpt ends after signal registration; the tasks
    # created above are presumably awaited in code that is not shown here.

在这里插入图片描述
在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值