[Core] Fix circular reference which leaked llm instance in local dev env (#4737)

Storing an exception's frame is extremely prone to circular references because the frame holds references to the objects in its scope.
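As a minimal sketch of the failure mode (names hypothetical, not part of this commit): an exception's `__traceback__` keeps the frame it was raised in alive, and that frame's locals can point back at the object storing the exception, closing a cycle that plain reference counting can never free.

```python
import gc
import weakref


class Holder:
    def fail(self):
        try:
            raise RuntimeError("boom")
        except RuntimeError as e:
            # Cycle: self -> e -> __traceback__ -> frame -> locals['self']
            self.error = e


h = Holder()
h.fail()
ref = weakref.ref(h)
del h
assert ref() is not None  # the cycle keeps the object alive past `del`
gc.collect()              # only the cyclic collector can reclaim it
assert ref() is None
```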

When tensorizer is not installed, the llm instance is leaked because the stored error's traceback frames hold references to various modules, which causes a circular reference problem.
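A sketch of the pattern that caused it, assuming the pre-fix code stored the `ImportError` object itself (the tensorizer diff below switches to storing only the message):

```python
try:
    import tensorizer  # noqa: F401 -- absent in local dev environments
    tensorizer_load_fail = None
except ImportError as e:
    # Cycle at module level: module globals -> e -> __traceback__
    # -> frame -> f_globals (this module's own dict).
    tensorizer_load_fail = e

# The fix keeps only the message, which references nothing else:
#     tensorizer_error_msg = str(e)
```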

I also found that spec decoding has a circular reference issue, and I solved it using `weakref.proxy`.
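A minimal sketch of the `weakref.proxy` pattern (Worker/Proposer names hypothetical): the proxy lets the child call through to its parent without holding a strong reference, so no cycle forms. The trade-off is that attribute access through the proxy raises `ReferenceError` if the parent has already been collected, which is fine here because the proposer never outlives its worker.

```python
import weakref


class Proposer:
    def __init__(self, worker):
        self._worker = worker  # weak proxy: no strong back-reference


class Worker:
    def __init__(self):
        # Passing plain `self` here would create Worker -> Proposer -> Worker.
        self._proposer = Proposer(weakref.proxy(self))


w = Worker()
ref = weakref.ref(w)
del w
assert ref() is None  # no cycle, so refcounting frees it immediately
```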
SangBin Cho 2024-05-10 23:54:32 +09:00 committed by GitHub
parent dac6a3f6ed
commit 6a0f617210
4 changed files with 22 additions and 7 deletions

tests/basic_correctness/test_basic_correctness.py

@@ -3,9 +3,12 @@
 Run `pytest tests/basic_correctness/test_basic_correctness.py`.
 """
 import os
+import weakref
 
 import pytest
+
 from vllm import LLM
+
 MODELS = [
     "facebook/opt-125m",
     "meta-llama/Llama-2-7b-hf",
@@ -13,6 +16,16 @@ MODELS = [
 VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND"
 
 
+def test_vllm_gc_ed():
+    """Verify vllm instance is GC'ed when it is deleted"""
+    llm = LLM("facebook/opt-125m")
+    weak_llm = weakref.ref(llm)
+    del llm
+    # If there's any circular reference to vllm, this fails
+    # because llm instance is not GC'ed.
+    assert weak_llm() is None
+
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])

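Note that the assertion above runs immediately after `del llm`, without a `gc.collect()`: it passes only if reference counting alone frees the instance, i.e. only if the `LLM` object is not part of any reference cycle.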
vllm/model_executor/model_loader/tensorizer.py

@@ -19,7 +19,7 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 
-tensorizer_load_fail = None
+tensorizer_error_msg = None
 
 try:
     from tensorizer import (DecryptionParams, EncryptionParams,
@@ -28,7 +28,7 @@ try:
     from tensorizer.utils import (convert_bytes, get_mem_usage,
                                   no_init_or_tensor)
 except ImportError as e:
-    tensorizer_load_fail = e
+    tensorizer_error_msg = str(e)
 
 __all__ = [
     'EncryptionParams', 'DecryptionParams', 'TensorDeserializer',
@@ -254,11 +254,11 @@ class TensorizerAgent:
 
     def __init__(self, tensorizer_config: TensorizerConfig,
                  quant_config: QuantizationConfig, **extra_kwargs):
-        if tensorizer_load_fail is not None:
+        if tensorizer_error_msg is not None:
             raise ImportError(
                 "Tensorizer is not installed. Please install tensorizer "
-                "to use this feature with `pip install vllm[tensorizer]`."
-            ) from tensorizer_load_fail
+                "to use this feature with `pip install vllm[tensorizer]`. "
+                "Error message: {}".format(tensorizer_error_msg))
 
         self.tensorizer_config = tensorizer_config
         self.tensorizer_args = (

vllm/spec_decode/multi_step_worker.py

@@ -1,4 +1,5 @@
 import copy
+import weakref
 from typing import List, Tuple
 
 import torch
@@ -32,7 +33,7 @@ class MultiStepWorker(Worker):
         super().init_device()
 
         self._proposer = Top1Proposer(
-            self,
+            weakref.proxy(self),
             self.device,
             self.vocab_size,
             max_proposal_len=self.max_model_len,

vllm/spec_decode/ngram_worker.py

@@ -1,3 +1,4 @@
+import weakref
 from typing import List, Optional, Tuple
 
 import torch
@@ -37,7 +38,7 @@ class NGramWorker(LoraNotSupportedWorkerBase):
         # Current only support Top1Proposer
         self._proposer = Top1Proposer(
-            self,
+            weakref.proxy(self),
             device=self.device,
             vocab_size=self.vocab_size,
         )