vLLM 0.19.1 在 Windows 上的安装与运行技术报告
环境信息
- OS: Windows 10/11
- GPU: NVIDIA Quadro RTX 3000 with Max-Q Design(计算能力 7.5,6GB VRAM)
- CUDA: 12.6
- Python: 3.x(Miniconda 环境)
- vLLM 版本: 0.19.1+cu126
- 模型: Qwen/Qwen3-0.6B
一、背景
vLLM 官方主要面向 Linux 环境开发,Windows 支持属于社区实验性质。直接在 Windows 上运行 vllm serve 会遇到多个兼容性问题,需要手动修改源码打补丁。本报告记录了完整的Bug过程和最终可用的修复方案。
二、安装步骤
# 创建 conda 环境
conda create -n vllm python=3.10
conda activate vllm
# 安装 vLLM(CUDA 12.6 版本)
pip install vllm==0.19.1+cu126 --extra-index-url https://download.pytorch.org/whl/cu126
三、Bug记录与修复
Bug 1:ZMQError: not a socket
错误信息
zmq.error.ZMQError: not a socket
File "vllm\v1\engine\utils.py", line 1039, in wait_for_engine_startup
events = poller.poll(STARTUP_POLL_PERIOD_MS)
原因
vLLM V1 引擎默认使用多进程模式(AsyncMPClient),通过 ZMQ 跨进程通信。Windows 使用 spawn 方式创建子进程,父进程中创建的 ZMQ socket 无法被子进程继承,导致 “not a socket” 错误。
修复方法
设置环境变量禁用多进程模式:
setx VLLM_ENABLE_V1_MULTIPROCESSING 0
⚠️ 注意:必须用
setx写入系统环境变量后重新打开 CMD 窗口才生效。用set命令只在当前会话有效,但 vLLM serve 会启动子进程,子进程无法继承父进程的set变量。
Bug 2:AttributeError: ‘InprocClient’ object has no attribute ‘engine_ranks_managed’
错误信息
AttributeError: 'InprocClient' object has no attribute 'engine_ranks_managed'
File "vllm\v1\engine\async_llm.py", line 168, in __init__
engine_idxs=self.engine_core.engine_ranks_managed,
原因
禁用多进程后走 InprocClient 路径,但 InprocClient 是为同步的 LLM 类设计的,缺少 AsyncLLM 所需的 engine_ranks_managed 属性。
修复位置
C:\Users\sheng\miniconda3\envs\vllm\lib\site-packages\vllm\v1\engine\core_client.py
修复方法
在 InprocClient.__init__ 中添加该属性(见Bug 3 的完整代码)。
Bug 3:NotImplementedError — InprocClient 缺少所有 async 方法
错误信息
NotImplementedError
File "core_client.py", line 221, in get_output_async
File "core_client.py", line 235, in reset_mm_cache_async
原因
AsyncLLM 调用的全是 _async 后缀的方法,而 InprocClient 只实现了同步版本,父类 EngineCoreClient 中的 async 方法全部 raise NotImplementedError。
修复方法
用以下完整代码替换 core_client.py 中的整个 InprocClient 类:
class InprocClient(EngineCoreClient):
"""
InprocClient: Windows 兼容版本,补全所有 async 方法,
并使用后台线程 + asyncio.Queue 驱动引擎步进。
"""
def __init__(self, *args, **kwargs):
import asyncio
import threading
self.engine_core = EngineCore(*args, **kwargs)
self.engine_ranks_managed = [0]
self._has_requests = threading.Event()
self._output_queue: asyncio.Queue | None = None
self._loop: asyncio.AbstractEventLoop | None = None
self._step_thread: threading.Thread | None = None
self._stopped = False
def _ensure_step_thread(self):
if self._step_thread is not None:
return
import threading
self._step_thread = threading.Thread(
target=self._step_loop,
name="InprocClientStepThread",
daemon=True,
)
self._step_thread.start()
def _step_loop(self):
while not self._stopped:
self._has_requests.wait(timeout=1.0)
if self._stopped:
break
outputs, model_executed = self.engine_core.step_fn()
self.engine_core.post_step(model_executed=model_executed)
result = outputs and outputs.get(0) or None
if result is not None and (result.outputs or result.scheduler_stats):
if self._loop and self._output_queue:
self._loop.call_soon_threadsafe(
self._output_queue.put_nowait, result
)
if not (result.scheduler_stats and
getattr(result.scheduler_stats, 'num_running_reqs', 0) > 0):
self._has_requests.clear()
async def get_output_async(self) -> EngineCoreOutputs:
import asyncio
if self._output_queue is None:
self._output_queue = asyncio.Queue()
self._loop = asyncio.get_running_loop()
self._ensure_step_thread()
return await self._output_queue.get()
def add_request(self, request: EngineCoreRequest) -> None:
req, request_wave = self.engine_core.preprocess_add_request(request)
self.engine_core.add_request(req, request_wave)
self._has_requests.set()
async def add_request_async(self, request: EngineCoreRequest) -> None:
self.add_request(request)
def get_output(self) -> EngineCoreOutputs:
outputs, model_executed = self.engine_core.step_fn()
self.engine_core.post_step(model_executed=model_executed)
return outputs and outputs.get(0) or EngineCoreOutputs()
def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
return self.engine_core.get_supported_tasks()
async def get_supported_tasks_async(self) -> tuple[SupportedTask, ...]:
return self.get_supported_tasks()
def abort_requests(self, request_ids: list[str]) -> None:
if len(request_ids) > 0:
self.engine_core.abort_requests(request_ids)
async def abort_requests_async(self, request_ids: list[str]) -> None:
self.abort_requests(request_ids)
def shutdown(self, timeout: float | None = None) -> None:
self._stopped = True
self._has_requests.set()
self.engine_core.shutdown()
def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
self.engine_core.profile(is_start, profile_prefix)
async def profile_async(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
self.profile(is_start, profile_prefix)
def reset_mm_cache(self) -> None:
self.engine_core.reset_mm_cache()
async def reset_mm_cache_async(self) -> None:
self.reset_mm_cache()
def reset_prefix_cache(
self, reset_running_requests: bool = False, reset_connector: bool = False
) -> bool:
return self.engine_core.reset_prefix_cache(reset_running_requests, reset_connector)
async def reset_prefix_cache_async(
self, reset_running_requests: bool = False, reset_connector: bool = False
) -> bool:
return self.reset_prefix_cache(reset_running_requests, reset_connector)
def reset_encoder_cache(self) -> None:
self.engine_core.reset_encoder_cache()
async def reset_encoder_cache_async(self) -> None:
self.reset_encoder_cache()
def sleep(self, level: int = 1, mode: PauseMode = "abort") -> None:
if mode == "wait":
raise ValueError("'wait' pause mode is not supported in inproc-engine mode")
result = self.engine_core.sleep(level, mode)
assert result is None
async def sleep_async(self, level: int = 1, mode: PauseMode = "abort") -> None:
self.sleep(level, mode)
def wake_up(self, tags: list[str] | None = None) -> None:
self.engine_core.wake_up(tags)
async def wake_up_async(self, tags: list[str] | None = None) -> None:
self.wake_up(tags)
def is_sleeping(self) -> bool:
return self.engine_core.is_sleeping()
async def is_sleeping_async(self) -> bool:
return self.is_sleeping()
def execute_dummy_batch(self) -> None:
self.engine_core.execute_dummy_batch()
async def execute_dummy_batch_async(self) -> None:
self.execute_dummy_batch()
def add_lora(self, lora_request: LoRARequest) -> bool:
return self.engine_core.add_lora(lora_request)
async def add_lora_async(self, lora_request: LoRARequest) -> bool:
return self.add_lora(lora_request)
def remove_lora(self, lora_id: int) -> bool:
return self.engine_core.remove_lora(lora_id)
async def remove_lora_async(self, lora_id: int) -> bool:
return self.remove_lora(lora_id)
def list_loras(self) -> set[int]:
return self.engine_core.list_loras()
async def list_loras_async(self) -> set[int]:
return self.list_loras()
def pin_lora(self, lora_id: int) -> bool:
return self.engine_core.pin_lora(lora_id)
async def pin_lora_async(self, lora_id: int) -> bool:
return self.pin_lora(lora_id)
def save_sharded_state(
self, path: str, pattern: str | None = None, max_size: int | None = None
) -> None:
self.engine_core.save_sharded_state(path, pattern, max_size)
async def save_sharded_state_async(
self, path: str, pattern: str | None = None, max_size: int | None = None
) -> None:
self.save_sharded_state(path, pattern, max_size)
def collective_rpc(
self,
method: str | Callable[..., _R],
timeout: float | None = None,
args: tuple = (),
kwargs: dict[str, Any] | None = None,
) -> list[_R]:
return self.engine_core.collective_rpc(method, timeout, args, kwargs)
async def collective_rpc_async(
self,
method: str | Callable[..., _R],
timeout: float | None = None,
args: tuple = (),
kwargs: dict[str, Any] | None = None,
) -> list[_R]:
return self.collective_rpc(method, timeout, args, kwargs)
async def pause_scheduler_async(
self, mode: PauseMode = "abort", clear_cache: bool = True
) -> None:
pass
async def resume_scheduler_async(self) -> None:
pass
async def is_scheduler_paused_async(self) -> bool:
return False
def dp_engines_running(self) -> bool:
return False
Bug 4:NotImplementedError — loop.add_signal_handler 不支持
错误信息
NotImplementedError
File "vllm\entrypoints\launcher.py", line 103, in serve_http
loop.add_signal_handler(signal.SIGINT, signal_handler)
原因
Windows 不支持 UNIX 信号机制,asyncio 的 loop.add_signal_handler() 在 Windows 上抛出 NotImplementedError。
修复位置
C:\Users\sheng\miniconda3\envs\vllm\lib\site-packages\vllm\entrypoints\launcher.py
修复方法
找到 add_signal_handler 相关代码,改为:
import sys
if sys.platform != "win32":
loop.add_signal_handler(signal.SIGINT, signal_handler)
loop.add_signal_handler(signal.SIGTERM, signal_handler)
else:
import signal as _signal
_signal.signal(_signal.SIGINT, lambda s, f: signal_handler())
_signal.signal(_signal.SIGTERM, lambda s, f: signal_handler())
Bug 5:AttributeError: ‘InprocClient’ object has no attribute ‘resources’
错误信息
AttributeError: 'InprocClient' object has no attribute 'resources'
File "vllm\v1\engine\async_llm.py", line 1027, in errored
return self.engine_core.resources.engine_dead or not self.is_running
原因
AsyncLLM.errored 属性访问 engine_core.resources.engine_dead,InprocClient 没有 resources 对象。
修复方法
在 InprocClient.__init__ 中加入 _FakeResources,同时补上 ensure_alive 方法:
def __init__(self, *args, **kwargs):
import asyncio
import threading
self.engine_core = EngineCore(*args, **kwargs)
self.engine_ranks_managed = [0]
self._has_requests = threading.Event()
self._output_queue: asyncio.Queue | None = None
self._loop: asyncio.AbstractEventLoop | None = None
self._step_thread: threading.Thread | None = None
self._stopped = False
# Windows 兼容:AsyncLLM 会访问 resources.engine_dead
class _FakeResources:
engine_dead = False
self.resources = _FakeResources()
def ensure_alive(self):
pass # InprocClient 始终存活
四、启动命令
所有补丁打完后,用以下命令启动服务:
vllm serve Qwen/Qwen3-0.6B --max-model-len 128 --max-num-seqs 1 --gpu-memory-utilization 0.80
启动成功标志:
INFO: Application startup complete.
五、测试请求
Windows CMD 下 curl 不支持单引号,需用双引号并转义,或使用文件方式:
方式一:转义双引号(CMD)
curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d "{\"model\":\"Qwen/Qwen3-0.6B\",\"messages\":[{\"role\":\"user\",\"content\":\"你好\"}]}"
方式二:使用 JSON 文件(推荐)
新建 req.json:
{
"model": "Qwen/Qwen3-0.6B",
"messages": [{"role": "user", "content": "你好"}]
}
curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d @req.json
方式三:使用 PowerShell
curl -Uri http://localhost:8000/v1/chat/completions `
-Method POST `
-Headers @{"Content-Type"="application/json"} `
-Body '{"model": "Qwen/Qwen3-0.6B", "messages": [{"role":"user", "content":"你好"}]}' `
-UseBasicParsing
或者使用 PowerShell 原生的 Invoke-RestMethod
Invoke-RestMethod -Uri http://localhost:8000/v1/chat/completions `
-Method POST `
-Headers @{"Content-Type"="application/json"} `
-Body '{"model": "Qwen/Qwen3-0.6B", "messages": [{"role":"user", "content":"你好"}]}' `
-UseBasicParsing
方法 4:使用 Python 脚本
import requests
response = requests.post(
"http://localhost:8000/v1/chat/completions",
headers={"Content-Type": "application/json"},
json={
"model": "Qwen/Qwen3-0.6B",
"messages": [{"role": "user", "content": "你好"}],
"temperature": 0.6,
"top_p": 0.95
}
)
print(response.json())
六、修改文件汇总
| 文件路径 | 修改内容 |
|---|---|
vllm\v1\engine\core_client.py | 重写 InprocClient 类,make_async_mp_client方法中增加对win32的判断,补全所有 async 方法,添加后台步进线程 |
vllm\entrypoints\launcher.py | 替换 add_signal_handler 为 Windows 兼容写法 |
| 系统环境变量 | 添加 VLLM_ENABLE_V1_MULTIPROCESSING=0 |
七、架构说明
Windows 补丁后的运行架构如下:
┌─────────────────────────────────────────────┐
│ API Server (主进程) │
│ │
│ FastAPI / uvicorn │
│ │ │
│ AsyncLLM │
│ │ │
│ InprocClient (补丁版) │
│ ┌────────────────────────────────────┐ │
│ │ asyncio 事件循环 │ │
│ │ output_handler ──► Queue.get() │ │
│ │ ▲ │ │
│ │ 后台线程 (StepThread) │ │ │
│ │ step_fn() ────────► Queue │ │
│ │ (有请求时才运行) │ │
│ └────────────────────────────────────┘ │
│ │ │
│ EngineCore (同进程,GPU 推理) │
└─────────────────────────────────────────────┘
原生 Linux 架构是多进程通过 ZMQ 通信,Windows 补丁将其改为单进程内后台线程 + asyncio Queue 的方式,规避了 Windows 的多进程和信号限制。
八、注意事项
- 性能:单进程模式下引擎占用主进程资源,高并发性能不如 Linux 多进程模式。
- 并发限制:
InprocClient的步进线程是单线程的,--max-num-seqs 1适合测试,生产环境不建议使用此方案。 - 升级风险:以上修改针对 vLLM 0.19.1,升级版本后需重新验证兼容性。
- 推荐方案:生产环境建议使用 WSL2(Windows Subsystem for Linux)运行 vLLM,可完全规避上述所有问题。
九、快速复现检查清单
-
setx VLLM_ENABLE_V1_MULTIPROCESSING 0,重新打开 CMD 验证 - 修改
core_client.py:替换InprocClient完整类 - 修改
launcher.py:替换add_signal_handler为 Windows 兼容写法 - 运行
vllm serve命令,等待Application startup complete - 用
@req.json方式发送测试请求验证推理正常
十、改动代码附录
v1/engine/core_client.py
class EngineCoreClient(ABC):
"""
EngineCoreClient: subclasses handle different methods for pushing
and pulling from the EngineCore for asyncio / multiprocessing.
Subclasses:
* InprocClient: In process EngineCore (for V0-style LLMEngine use)
* SyncMPClient: ZMQ + background proc EngineCore (for LLM)
* AsyncMPClient: ZMQ + background proc EngineCore w/ asyncio (for AsyncLLM)
"""
@staticmethod
def make_client(
multiprocess_mode: bool,
asyncio_mode: bool,
vllm_config: VllmConfig,
executor_class: type[Executor],
log_stats: bool,
) -> "EngineCoreClient":
# TODO: support this for debugging purposes.
if asyncio_mode and not multiprocess_mode:
raise NotImplementedError(
"Running EngineCore in asyncio without multiprocessing "
"is not currently supported."
)
if multiprocess_mode and asyncio_mode:
return EngineCoreClient.make_async_mp_client(
vllm_config, executor_class, log_stats
)
# === 改动代码 ===
if multiprocess_mode and not asyncio_mode:
if sys.platform == "win32":
# Windows doesn't support fork or IPC sockets reliably,
# fall back to in-process engine core
return InprocClient(vllm_config, executor_class, log_stats)
return SyncMPClient(vllm_config, executor_class, log_stats)
return InprocClient(vllm_config, executor_class, log_stats)
@staticmethod
@instrument(span_name="Overall Loading")
def make_async_mp_client(
vllm_config: VllmConfig,
executor_class: type[Executor],
log_stats: bool,
client_addresses: dict[str, str] | None = None,
client_count: int = 1,
client_index: int = 0,
) -> "AsyncMPClient":
# Windows workaround: force inproc mode
import sys
if sys.platform == "win32":
return InprocClient(vllm_config, executor_class, log_stats)
parallel_config = vllm_config.parallel_config
client_args = (
vllm_config,
executor_class,
log_stats,
client_addresses,
client_count,
client_index,
)
if parallel_config.data_parallel_size > 1:
if parallel_config.data_parallel_external_lb:
# External load balancer - client per DP rank.
return DPAsyncMPClient(*client_args)
# Internal load balancer - client balances to all DP ranks.
return DPLBAsyncMPClient(*client_args)
return AsyncMPClient(*client_args)
class InprocClient(EngineCoreClient):
# === 改动代码 ===
def __init__(self, *args, **kwargs):
import asyncio
import threading
self.engine_core = EngineCore(*args, **kwargs)
self.engine_ranks_managed = [0]
self._has_requests = threading.Event()
self._output_queue: asyncio.Queue | None = None
self._loop: asyncio.AbstractEventLoop | None = None
self._step_thread: threading.Thread | None = None
self._stopped = False
# Windows 兼容:AsyncLLM 会访问 resources.engine_dead
class _FakeResources:
engine_dead = False
self.resources = _FakeResources()
def ensure_alive(self):
pass # InprocClient 始终存活
def _ensure_step_thread(self):
"""启动后台步进线程(只启动一次)"""
if self._step_thread is not None:
return
import threading
self._step_thread = threading.Thread(
target=self._step_loop,
name="InprocClientStepThread",
daemon=True,
)
self._step_thread.start()
def _step_loop(self):
"""后台线程:持续步进引擎,有请求时才运行"""
import asyncio
while not self._stopped:
# 等待有请求才步进
self._has_requests.wait(timeout=1.0)
if self._stopped:
break
outputs, model_executed = self.engine_core.step_fn()
self.engine_core.post_step(model_executed=model_executed)
result = outputs and outputs.get(0) or None
if result is not None and (result.outputs or result.scheduler_stats):
# 把结果放入 asyncio queue(线程安全方式)
if self._loop and self._output_queue:
self._loop.call_soon_threadsafe(
self._output_queue.put_nowait, result
)
# 检查是否还有运行中的请求
if not (result.scheduler_stats and
getattr(result.scheduler_stats, 'num_running_reqs', 0) > 0):
self._has_requests.clear()
async def get_output_async(self) -> EngineCoreOutputs:
import asyncio
# 初始化 asyncio queue 和 loop(只做一次)
if self._output_queue is None:
self._output_queue = asyncio.Queue()
self._loop = asyncio.get_running_loop()
self._ensure_step_thread()
return await self._output_queue.get()
def add_request(self, request: EngineCoreRequest) -> None:
req, request_wave = self.engine_core.preprocess_add_request(request)
self.engine_core.add_request(req, request_wave)
# 通知步进线程有新请求
self._has_requests.set()
async def add_request_async(self, request: EngineCoreRequest) -> None:
self.add_request(request)
def get_output(self) -> EngineCoreOutputs:
outputs, model_executed = self.engine_core.step_fn()
self.engine_core.post_step(model_executed=model_executed)
return outputs and outputs.get(0) or EngineCoreOutputs()
def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
return self.engine_core.get_supported_tasks()
async def get_supported_tasks_async(self) -> tuple[SupportedTask, ...]:
return self.get_supported_tasks()
def abort_requests(self, request_ids: list[str]) -> None:
if len(request_ids) > 0:
self.engine_core.abort_requests(request_ids)
async def abort_requests_async(self, request_ids: list[str]) -> None:
self.abort_requests(request_ids)
def shutdown(self, timeout: float | None = None) -> None:
self._stopped = True
self._has_requests.set() # 唤醒线程让其退出
self.engine_core.shutdown()
def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
self.engine_core.profile(is_start, profile_prefix)
async def profile_async(self, is_start: bool = True, profile_prefix: str | None = None) -> None:
self.profile(is_start, profile_prefix)
def reset_mm_cache(self) -> None:
self.engine_core.reset_mm_cache()
async def reset_mm_cache_async(self) -> None:
self.reset_mm_cache()
def reset_prefix_cache(
self, reset_running_requests: bool = False, reset_connector: bool = False
) -> bool:
return self.engine_core.reset_prefix_cache(
reset_running_requests, reset_connector
)
async def reset_prefix_cache_async(
self, reset_running_requests: bool = False, reset_connector: bool = False
) -> bool:
return self.reset_prefix_cache(reset_running_requests, reset_connector)
def reset_encoder_cache(self) -> None:
self.engine_core.reset_encoder_cache()
async def reset_encoder_cache_async(self) -> None:
self.reset_encoder_cache()
def sleep(self, level: int = 1, mode: PauseMode = "abort") -> None:
if mode == "wait":
raise ValueError("'wait' pause mode is not supported in inproc-engine mode")
result = self.engine_core.sleep(level, mode)
assert result is None
async def sleep_async(self, level: int = 1, mode: PauseMode = "abort") -> None:
self.sleep(level, mode)
def wake_up(self, tags: list[str] | None = None) -> None:
self.engine_core.wake_up(tags)
async def wake_up_async(self, tags: list[str] | None = None) -> None:
self.wake_up(tags)
def is_sleeping(self) -> bool:
return self.engine_core.is_sleeping()
async def is_sleeping_async(self) -> bool:
return self.is_sleeping()
def execute_dummy_batch(self) -> None:
self.engine_core.execute_dummy_batch()
async def execute_dummy_batch_async(self) -> None:
self.execute_dummy_batch()
def add_lora(self, lora_request: LoRARequest) -> bool:
return self.engine_core.add_lora(lora_request)
async def add_lora_async(self, lora_request: LoRARequest) -> bool:
return self.add_lora(lora_request)
def remove_lora(self, lora_id: int) -> bool:
return self.engine_core.remove_lora(lora_id)
async def remove_lora_async(self, lora_id: int) -> bool:
return self.remove_lora(lora_id)
def list_loras(self) -> set[int]:
return self.engine_core.list_loras()
async def list_loras_async(self) -> set[int]:
return self.list_loras()
def pin_lora(self, lora_id: int) -> bool:
return self.engine_core.pin_lora(lora_id)
async def pin_lora_async(self, lora_id: int) -> bool:
return self.pin_lora(lora_id)
def save_sharded_state(
self, path: str, pattern: str | None = None, max_size: int | None = None
) -> None:
self.engine_core.save_sharded_state(path, pattern, max_size)
async def save_sharded_state_async(
self, path: str, pattern: str | None = None, max_size: int | None = None
) -> None:
self.save_sharded_state(path, pattern, max_size)
def collective_rpc(
self,
method: str | Callable[..., _R],
timeout: float | None = None,
args: tuple = (),
kwargs: dict[str, Any] | None = None,
) -> list[_R]:
return self.engine_core.collective_rpc(method, timeout, args, kwargs)
async def collective_rpc_async(
self,
method: str | Callable[..., _R],
timeout: float | None = None,
args: tuple = (),
kwargs: dict[str, Any] | None = None,
) -> list[_R]:
return self.collective_rpc(method, timeout, args, kwargs)
async def pause_scheduler_async(
self, mode: PauseMode = "abort", clear_cache: bool = True
) -> None:
pass
async def resume_scheduler_async(self) -> None:
pass
async def is_scheduler_paused_async(self) -> bool:
return False
def dp_engines_running(self) -> bool:
return False
entrypoints/launcher.py
async def serve_http(
app: FastAPI,
sock: socket.socket | None,
enable_ssl_refresh: bool = False,
**uvicorn_kwargs: Any,
):
"""
Start a FastAPI app using Uvicorn, with support for custom Uvicorn config
options. Supports http header limits via h11_max_incomplete_event_size and
h11_max_header_count.
"""
logger.info("Available routes are:")
# post endpoints
for route in app.routes:
methods = getattr(route, "methods", None)
path = getattr(route, "path", None)
if methods is None or path is None:
continue
logger.info("Route: %s, Methods: %s", path, ", ".join(methods))
# other endpoints
for route in app.routes:
endpoint = getattr(route, "endpoint", None)
methods = getattr(route, "methods", None)
path = getattr(route, "path", None)
if endpoint is None or path is None or methods is not None:
continue
logger.info("Route: %s, Endpoint: %s", path, endpoint.__name__)
# Extract header limit options if present
h11_max_incomplete_event_size = uvicorn_kwargs.pop(
"h11_max_incomplete_event_size", None
)
h11_max_header_count = uvicorn_kwargs.pop("h11_max_header_count", None)
# Set safe defaults if not provided
if h11_max_incomplete_event_size is None:
h11_max_incomplete_event_size = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
if h11_max_header_count is None:
h11_max_header_count = H11_MAX_HEADER_COUNT_DEFAULT
config = uvicorn.Config(app, **uvicorn_kwargs)
# Set header limits
config.h11_max_incomplete_event_size = h11_max_incomplete_event_size
config.h11_max_header_count = h11_max_header_count
config.load()
server = uvicorn.Server(config)
app.state.server = server
loop = asyncio.get_running_loop()
watchdog_task = loop.create_task(watchdog_loop(server, app.state.engine_client))
server_task = loop.create_task(server.serve(sockets=[sock] if sock else None))
ssl_cert_refresher = (
None
if not enable_ssl_refresh
else SSLCertRefresher(
ssl_context=config.ssl,
key_path=config.ssl_keyfile,
cert_path=config.ssl_certfile,
ca_path=config.ssl_ca_certs,
)
)
shutdown_event = asyncio.Event()
def signal_handler() -> None:
shutdown_event.set()
async def dummy_shutdown() -> None:
pass
# loop.add_signal_handler(signal.SIGINT, signal_handler)
# loop.add_signal_handler(signal.SIGTERM, signal_handler)
import sys
if sys.platform != "win32":
loop.add_signal_handler(signal.SIGINT, signal_handler)
loop.add_signal_handler(signal.SIGTERM, signal_handler)
else:
# Windows 不支持 add_signal_handler,用 signal.signal 代替
import signal as _signal
_signal.signal(_signal.SIGINT, lambda s, f: signal_handler())
_signal.signal(_signal.SIGTERM, lambda s, f: signal_handler())



8338

被折叠的 条评论
为什么被折叠?



