[Misc] Optional installation of audio related packages (#8063)
This commit is contained in:
parent
5231f0898e
commit
5b86b19954
@ -22,8 +22,6 @@ typing_extensions >= 4.10
|
|||||||
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
|
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
|
||||||
pyzmq
|
pyzmq
|
||||||
msgspec
|
msgspec
|
||||||
librosa # Required for audio processing
|
|
||||||
soundfile # Required for audio processing
|
|
||||||
gguf == 0.9.1
|
gguf == 0.9.1
|
||||||
importlib_metadata
|
importlib_metadata
|
||||||
mistral_common >= 1.3.4
|
mistral_common >= 1.3.4
|
||||||
|
|||||||
@ -13,10 +13,12 @@ pytest-shard
|
|||||||
awscli
|
awscli
|
||||||
einops # required for MPT, qwen-vl and Mamba
|
einops # required for MPT, qwen-vl and Mamba
|
||||||
httpx
|
httpx
|
||||||
|
librosa # required for audio test
|
||||||
peft
|
peft
|
||||||
requests
|
requests
|
||||||
ray
|
ray
|
||||||
sentence-transformers # required for embedding
|
sentence-transformers # required for embedding
|
||||||
|
soundfile # required for audio test
|
||||||
compressed-tensors==0.4.0 # required for compressed-tensors
|
compressed-tensors==0.4.0 # required for compressed-tensors
|
||||||
timm # required for internvl test
|
timm # required for internvl test
|
||||||
transformers_stream_generator # required for qwen-vl test
|
transformers_stream_generator # required for qwen-vl test
|
||||||
|
|||||||
1
setup.py
1
setup.py
@ -501,6 +501,7 @@ setup(
|
|||||||
ext_modules=ext_modules,
|
ext_modules=ext_modules,
|
||||||
extras_require={
|
extras_require={
|
||||||
"tensorizer": ["tensorizer>=2.9.0"],
|
"tensorizer": ["tensorizer>=2.9.0"],
|
||||||
|
"audio": ["librosa", "soundfile"] # Required for audio processing
|
||||||
},
|
},
|
||||||
cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
|
cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
|
||||||
package_data=package_data,
|
package_data=package_data,
|
||||||
|
|||||||
@ -1,11 +1,9 @@
|
|||||||
from typing import List, Optional, Tuple, Type
|
from typing import List, Optional, Tuple, Type
|
||||||
|
|
||||||
import librosa
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pytest
|
import pytest
|
||||||
from transformers import AutoModel, AutoTokenizer, BatchEncoding
|
from transformers import AutoModel, AutoTokenizer, BatchEncoding
|
||||||
|
|
||||||
from vllm.assets.audio import AudioAsset
|
|
||||||
from vllm.sequence import SampleLogprobs
|
from vllm.sequence import SampleLogprobs
|
||||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||||
|
|
||||||
@ -21,6 +19,7 @@ AudioTuple = Tuple[np.ndarray, int]
|
|||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def audio_and_sample_rate():
|
def audio_and_sample_rate():
|
||||||
|
from vllm.assets.audio import AudioAsset
|
||||||
return AudioAsset("mary_had_lamb").audio_and_sample_rate
|
return AudioAsset("mary_had_lamb").audio_and_sample_rate
|
||||||
|
|
||||||
|
|
||||||
@ -109,6 +108,7 @@ def run_test(
|
|||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
postprocess_inputs=process,
|
postprocess_inputs=process,
|
||||||
auto_cls=AutoModel) as hf_model:
|
auto_cls=AutoModel) as hf_model:
|
||||||
|
import librosa
|
||||||
|
|
||||||
hf_outputs_per_audio = [
|
hf_outputs_per_audio = [
|
||||||
hf_model.generate_greedy_logprobs_limit(
|
hf_model.generate_greedy_logprobs_limit(
|
||||||
|
|||||||
@ -8,7 +8,6 @@ from functools import lru_cache
|
|||||||
from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
|
from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
|
||||||
TypedDict, Union, cast)
|
TypedDict, Union, cast)
|
||||||
|
|
||||||
import librosa
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.checkpoint
|
import torch.utils.checkpoint
|
||||||
@ -107,6 +106,11 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
|
|||||||
feature_extractor = whisper_feature_extractor(ctx)
|
feature_extractor = whisper_feature_extractor(ctx)
|
||||||
|
|
||||||
if sr != feature_extractor.sampling_rate:
|
if sr != feature_extractor.sampling_rate:
|
||||||
|
try:
|
||||||
|
import librosa
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"Please install vllm[audio] for audio support.") from None
|
||||||
audio = librosa.resample(audio,
|
audio = librosa.resample(audio,
|
||||||
orig_sr=sr,
|
orig_sr=sr,
|
||||||
target_sr=feature_extractor.sampling_rate)
|
target_sr=feature_extractor.sampling_rate)
|
||||||
|
|||||||
@ -1,11 +1,9 @@
|
|||||||
import base64
|
import base64
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from typing import List, Optional, Tuple, TypeVar, Union
|
from typing import Any, List, Optional, Tuple, TypeVar, Union
|
||||||
|
|
||||||
import librosa
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import soundfile
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from vllm.connections import global_http_connection
|
from vllm.connections import global_http_connection
|
||||||
@ -73,10 +71,22 @@ async def async_fetch_image(image_url: str,
|
|||||||
return image.convert(image_mode)
|
return image.convert(image_mode)
|
||||||
|
|
||||||
|
|
||||||
|
def try_import_audio_packages() -> Tuple[Any, Any]:
|
||||||
|
try:
|
||||||
|
import librosa
|
||||||
|
import soundfile
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"Please install vllm[audio] for audio support.") from None
|
||||||
|
return librosa, soundfile
|
||||||
|
|
||||||
|
|
||||||
def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
|
def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
|
||||||
"""
|
"""
|
||||||
Load audio from a URL.
|
Load audio from a URL.
|
||||||
"""
|
"""
|
||||||
|
librosa, _ = try_import_audio_packages()
|
||||||
|
|
||||||
if audio_url.startswith("http"):
|
if audio_url.startswith("http"):
|
||||||
audio_bytes = global_http_connection.get_bytes(
|
audio_bytes = global_http_connection.get_bytes(
|
||||||
audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT)
|
audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT)
|
||||||
@ -95,6 +105,8 @@ async def async_fetch_audio(
|
|||||||
"""
|
"""
|
||||||
Asynchronously fetch audio from a URL.
|
Asynchronously fetch audio from a URL.
|
||||||
"""
|
"""
|
||||||
|
librosa, _ = try_import_audio_packages()
|
||||||
|
|
||||||
if audio_url.startswith("http"):
|
if audio_url.startswith("http"):
|
||||||
audio_bytes = await global_http_connection.async_get_bytes(
|
audio_bytes = await global_http_connection.async_get_bytes(
|
||||||
audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT)
|
audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT)
|
||||||
@ -123,6 +135,8 @@ def encode_audio_base64(
|
|||||||
sampling_rate: int,
|
sampling_rate: int,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Encode audio as base64."""
|
"""Encode audio as base64."""
|
||||||
|
_, soundfile = try_import_audio_packages()
|
||||||
|
|
||||||
buffered = BytesIO()
|
buffered = BytesIO()
|
||||||
soundfile.write(buffered, audio, sampling_rate, format="WAV")
|
soundfile.write(buffered, audio, sampling_rate, format="WAV")
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user