fix: enable trust-remote-code in api server & benchmark. (#509)
commit 8c4b2592fb (parent cf21a9bd5c)

@@ -21,6 +21,7 @@ def main(args: argparse.Namespace):
         tensor_parallel_size=args.tensor_parallel_size,
         max_num_seqs=args.batch_size,
         max_num_batched_tokens=args.batch_size * args.input_len,
+        trust_remote_code=args.trust_remote_code,
     )
 
     sampling_params = SamplingParams(
@@ -74,5 +75,7 @@ if __name__ == '__main__':
     parser.add_argument('--use-beam-search', action='store_true')
     parser.add_argument('--num-iters', type=int, default=3,
                         help='Number of iterations to run.')
+    parser.add_argument('--trust-remote-code', action='store_true',
+                        help='trust remote code from huggingface')
     args = parser.parse_args()
     main(args)
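
Taken together, the two hunks above thread one boolean from the command line into the engine constructor. A minimal sketch of that flow, assuming the surrounding LLM(...) call and the --model argument, which are not shown in this diff:

    import argparse

    from vllm import LLM

    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default='facebook/opt-125m')  # assumed default
    parser.add_argument('--trust-remote-code', action='store_true',
                        help='trust remote code from huggingface')
    args = parser.parse_args()

    # store_true makes the flag default to False, so models that ship
    # custom code stay rejected unless the user opts in explicitly.
    llm = LLM(model=args.model, trust_remote_code=args.trust_remote_code)
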
@@ -177,7 +177,7 @@ def main(args: argparse.Namespace):
     np.random.seed(args.seed)
 
     api_url = f"http://{args.host}:{args.port}/generate"
-    tokenizer = get_tokenizer(args.tokenizer)
+    tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
 
     benchmark_start_time = time.time()
@@ -227,5 +227,7 @@ if __name__ == "__main__":
                              "Otherwise, we use Poisson process to synthesize "
                              "the request arrival times.")
     parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument('--trust-remote-code', action='store_true',
+                        help='trust remote code from huggingface')
     args = parser.parse_args()
     main(args)
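
The serving benchmark never constructs the engine locally; it only needs a tokenizer to count prompt and completion tokens, which is why the flag stops at get_tokenizer here. As a rough sketch of what such a helper forwards to Hugging Face (this body is an illustration, not vLLM's actual implementation):

    from transformers import AutoTokenizer, PreTrainedTokenizerBase

    def get_tokenizer(tokenizer_name: str,
                      trust_remote_code: bool = False,
                      **kwargs) -> PreTrainedTokenizerBase:
        # Some hub repos ship their own tokenizer classes; transformers
        # refuses to execute that code unless trust_remote_code=True.
        return AutoTokenizer.from_pretrained(
            tokenizer_name, trust_remote_code=trust_remote_code, **kwargs)
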
@@ -74,7 +74,7 @@ def run_vllm(
         tokenizer=tokenizer,
         tensor_parallel_size=tensor_parallel_size,
         seed=seed,
-        trust_remote_code=trust_remote_code
+        trust_remote_code=trust_remote_code,
     )
 
     # Add the requests to the engine.
@@ -111,7 +111,8 @@ def run_hf(
     trust_remote_code: bool,
 ) -> float:
     assert not use_beam_search
-    llm = AutoModelForCausalLM.from_pretrained(model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
+    llm = AutoModelForCausalLM.from_pretrained(model,
+        torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
     if llm.config.model_type == "llama":
         # To enable padding in the HF backend.
         tokenizer.pad_token = tokenizer.eos_token
@@ -173,8 +174,9 @@ def main(args: argparse.Namespace):
             args.seed, args.n, args.use_beam_search, args.trust_remote_code)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
-        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.use_beam_search, args.hf_max_batch_size)
+        elapsed_time = run_hf(
+            requests, args.model, tokenizer, args.n, args.use_beam_search,
+            args.hf_max_batch_size, args.trust_remote_code)
     else:
         raise ValueError(f"Unknown backend: {args.backend}")
     total_num_tokens = sum(
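
The throughput benchmark has two backends, and run_hf builds the model with plain transformers rather than through vLLM, so it needs the flag at from_pretrained time as well. A self-contained sketch of the pattern (the repo id below is a placeholder):

    import torch
    from transformers import AutoModelForCausalLM

    # trust_remote_code=True allows from_pretrained to execute modeling code
    # bundled inside the checkpoint repo, e.g. architectures that are not
    # built into transformers. Only enable it for repos you trust.
    llm = AutoModelForCausalLM.from_pretrained(
        "some-org/custom-model",  # placeholder repo id
        torch_dtype=torch.float16,
        trust_remote_code=True)
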
@@ -585,7 +585,8 @@ if __name__ == "__main__":
 
     # A separate tokenizer to map token IDs to strings.
     tokenizer = get_tokenizer(engine_args.tokenizer,
-                              tokenizer_mode=engine_args.tokenizer_mode)
+                              tokenizer_mode=engine_args.tokenizer_mode,
+                              trust_remote_code=engine_args.trust_remote_code)
 
     uvicorn.run(app,
                 host=args.host,
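
Before this change the API server's detokenization tokenizer ignored the engine's trust_remote_code setting: the engine itself could load a custom-code model, but mapping generated token IDs back to strings would then fail. A condensed sketch of the corrected wiring, assuming the import paths below (they are not shown in the diff):

    import argparse

    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.transformers_utils.tokenizer import get_tokenizer

    parser = argparse.ArgumentParser()
    parser = AsyncEngineArgs.add_cli_args(parser)  # registers --trust-remote-code
    args = parser.parse_args()
    engine_args = AsyncEngineArgs.from_cli_args(args)

    # The fix: the standalone tokenizer now honors the same flag as the
    # engine instead of silently defaulting to False.
    tokenizer = get_tokenizer(engine_args.tokenizer,
                              tokenizer_mode=engine_args.tokenizer_mode,
                              trust_remote_code=engine_args.trust_remote_code)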