[Bugfix] Ensure special tokens are properly filtered out for guided structured output with MistralTokenizer (#10363)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
This commit is contained in:
parent
3a763ba0c3
commit
691a3ec047
@ -17,7 +17,7 @@ pillow # Required for image processing
|
|||||||
prometheus_client >= 0.18.0
|
prometheus_client >= 0.18.0
|
||||||
prometheus-fastapi-instrumentator >= 7.0.0
|
prometheus-fastapi-instrumentator >= 7.0.0
|
||||||
tiktoken >= 0.6.0 # Required for DBRX tokenizer
|
tiktoken >= 0.6.0 # Required for DBRX tokenizer
|
||||||
lm-format-enforcer == 0.10.6
|
lm-format-enforcer >= 0.10.9, < 0.11
|
||||||
outlines >= 0.0.43, < 0.1
|
outlines >= 0.0.43, < 0.1
|
||||||
typing_extensions >= 4.10
|
typing_extensions >= 4.10
|
||||||
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
|
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
|
||||||
|
|||||||
@ -174,18 +174,29 @@ class MistralTokenizer:
|
|||||||
revision=revision)
|
revision=revision)
|
||||||
return tokenizer_file
|
return tokenizer_file
|
||||||
|
|
||||||
# the following attributes are set to fit VLLM's design
|
# the following attributes are set to fit VLLM's design and are used
|
||||||
|
# by the guided structured output backends.
|
||||||
@property
|
@property
|
||||||
def all_special_tokens_extended(self) -> List[str]:
|
def all_special_tokens_extended(self) -> List[str]:
|
||||||
return []
|
# tekken defines its own extended special tokens list
|
||||||
|
if hasattr(self.tokenizer, "SPECIAL_TOKENS"):
|
||||||
|
special_tokens = self.tokenizer.SPECIAL_TOKENS
|
||||||
|
else:
|
||||||
|
special_tokens = list(SpecialTokens)
|
||||||
|
return [
|
||||||
|
s.value if isinstance(s, SpecialTokens) else s
|
||||||
|
for s in special_tokens
|
||||||
|
]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def all_special_tokens(self) -> List[str]:
|
def all_special_tokens(self) -> List[str]:
|
||||||
return []
|
return self.all_special_tokens_extended
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def all_special_ids(self) -> List[int]:
|
def all_special_ids(self) -> List[int]:
|
||||||
return []
|
return [
|
||||||
|
self.all_special_tokens.index(t) for t in self.all_special_tokens
|
||||||
|
]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def bos_token_id(self) -> int:
|
def bos_token_id(self) -> int:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user