From 150a1ffbfd3d0429d30fa5ab841f53903a0a8a62 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Fri, 26 Jul 2024 17:39:10 -0400
Subject: [PATCH] [Doc] Update SkyPilot doc for wrong indents and instructions
 for update service (#4283)

---
 docs/source/serving/run_on_sky.rst | 400 ++++++++++++++++-------------
 1 file changed, 228 insertions(+), 172 deletions(-)

diff --git a/docs/source/serving/run_on_sky.rst b/docs/source/serving/run_on_sky.rst
index bd33c76c..674b14a8 100644
--- a/docs/source/serving/run_on_sky.rst
+++ b/docs/source/serving/run_on_sky.rst
@@ -5,9 +5,9 @@ Deploying and scaling up with SkyPilot
 
 .. raw:: html
 
-  <p align="center">
-    <img src="..." alt="vLLM"/>
-  </p>
+    <p align="center">
+        <img src="..." alt="vLLM"/>
+    </p>
 vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with `SkyPilot <https://github.com/skypilot-org/skypilot>`__, an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3 and Mixtral, can be found in the `SkyPilot AI gallery <https://skypilot.readthedocs.io/en/latest/gallery/index.html>`__.
 
@@ -21,8 +21,8 @@ Prerequisites
 
 .. code-block:: console
 
-  pip install skypilot-nightly
-  sky check
+    pip install skypilot-nightly
+    sky check
 
 
 Run on a single instance
 ------------------------
 
@@ -32,64 +32,64 @@ See the vLLM SkyPilot YAML for serving, `serving.yaml <https://github.com/skypi
 
 .. code-block:: yaml
 
-  resources:
-    accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
-    use_spot: True
-    disk_size: 512 # Ensure model checkpoints can fit.
-    disk_tier: best
-    ports: 8081 # Expose to internet traffic.
+    resources:
+      accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}  # We can use cheaper accelerators for 8B model.
+      use_spot: True
+      disk_size: 512  # Ensure model checkpoints can fit.
+      disk_tier: best
+      ports: 8081  # Expose to internet traffic.
 
-  envs:
-    MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
-    HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
+    envs:
+      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+      HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
 
-  setup: |
-    conda create -n vllm python=3.10 -y
-    conda activate vllm
-
-    pip install vllm==0.4.0.post1
-    # Install Gradio for web UI.
-    pip install gradio openai
-    pip install flash-attn==2.5.7
+    setup: |
+      conda create -n vllm python=3.10 -y
+      conda activate vllm
+
+      pip install vllm==0.4.0.post1
+      # Install Gradio for web UI.
+      pip install gradio openai
+      pip install flash-attn==2.5.7
 
-  run: |
-    conda activate vllm
-    echo 'Starting vllm api server...'
-    python -u -m vllm.entrypoints.openai.api_server \
-      --port 8081 \
-      --model $MODEL_NAME \
-      --trust-remote-code \
-      --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
-      2>&1 | tee api_server.log &
-
-    echo 'Waiting for vllm api server to start...'
-    while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
-
-    echo 'Starting gradio server...'
-    git clone https://github.com/vllm-project/vllm.git || true
-    python vllm/examples/gradio_openai_chatbot_webserver.py \
-      -m $MODEL_NAME \
-      --port 8811 \
-      --model-url http://localhost:8081/v1 \
-      --stop-token-ids 128009,128001
+    run: |
+      conda activate vllm
+      echo 'Starting vllm api server...'
+      python -u -m vllm.entrypoints.openai.api_server \
+        --port 8081 \
+        --model $MODEL_NAME \
+        --trust-remote-code \
+        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+        2>&1 | tee api_server.log &
+
+      echo 'Waiting for vllm api server to start...'
+      while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
+
+      echo 'Starting gradio server...'
+      git clone https://github.com/vllm-project/vllm.git || true
+      python vllm/examples/gradio_openai_chatbot_webserver.py \
+        -m $MODEL_NAME \
+        --port 8811 \
+        --model-url http://localhost:8081/v1 \
+        --stop-token-ids 128009,128001
 
 Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...):
 
 .. code-block:: console
 
-  HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN
+    HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN
 
 Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model for text completion.
 
 .. code-block:: console
 
-  (task, pid=7431) Running on public URL: https://<random-id>.gradio.live
+    (task, pid=7431) Running on public URL: https://<random-id>.gradio.live
 
 **Optional**: Serve the 70B model instead of the default 8B and use more GPUs:
 
 .. code-block:: console
 
-  HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct
+    HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct
 
 
 Scale up to multiple replicas
 -----------------------------
 
@@ -99,151 +99,212 @@ SkyPilot can scale up the service to multiple service replicas with built-in au
 
 You can do it by adding a ``service`` section to the YAML file.
 
 .. code-block:: yaml
 
-  service:
-    replicas: 2
-    # An actual request for readiness probe.
-    readiness_probe:
-      path: /v1/chat/completions
-      post_data:
-        model: $MODEL_NAME
-        messages:
-          - role: user
-            content: Hello! What is your name?
-        max_tokens: 1
+    service:
+      replicas: 2
+      # An actual request for readiness probe.
+      readiness_probe:
+        path: /v1/chat/completions
+        post_data:
+          model: $MODEL_NAME
+          messages:
+            - role: user
+              content: Hello! What is your name?
+          max_tokens: 1
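+
+The readiness probe above is an ordinary chat-completion request, so you can reproduce it by hand to see exactly what the load balancer will send to each replica (a minimal sketch, not part of the original recipe; ``<api-server-ip>`` is a placeholder for the IP of an instance started with the recipe above, which exposes port 8081):
+
+.. code-block:: console
+
+    curl http://<api-server-ip>:8081/v1/chat/completions \
+        -H "Content-Type: application/json" \
+        -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "user", "content": "Hello! What is your name?"}], "max_tokens": 1}'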
 
 .. raw:: html
 
-  <details>
-  <summary>Click to see the full recipe YAML</summary>
+    <details>
+    <summary>Click to see the full recipe YAML</summary>
 
 
 .. code-block:: yaml
 
-  service:
-    replicas: 2
-    # An actual request for readiness probe.
-    readiness_probe:
-      path: /v1/chat/completions
-      post_data:
-        model: $MODEL_NAME
-        messages:
-          - role: user
-            content: Hello! What is your name?
-        max_tokens: 1
+    service:
+      replicas: 2
+      # An actual request for readiness probe.
+      readiness_probe:
+        path: /v1/chat/completions
+        post_data:
+          model: $MODEL_NAME
+          messages:
+            - role: user
+              content: Hello! What is your name?
+          max_tokens: 1
 
-  resources:
-    accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
-    use_spot: True
-    disk_size: 512 # Ensure model checkpoints can fit.
-    disk_tier: best
-    ports: 8081 # Expose to internet traffic.
+    resources:
+      accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}  # We can use cheaper accelerators for 8B model.
+      use_spot: True
+      disk_size: 512  # Ensure model checkpoints can fit.
+      disk_tier: best
+      ports: 8081  # Expose to internet traffic.
 
-  envs:
-    MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
-    HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
+    envs:
+      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+      HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
 
-  setup: |
-    conda create -n vllm python=3.10 -y
-    conda activate vllm
-
-    pip install vllm==0.4.0.post1
-    # Install Gradio for web UI.
-    pip install gradio openai
-    pip install flash-attn==2.5.7
+    setup: |
+      conda create -n vllm python=3.10 -y
+      conda activate vllm
+
+      pip install vllm==0.4.0.post1
+      # Install Gradio for web UI.
+      pip install gradio openai
+      pip install flash-attn==2.5.7
 
-  run: |
-    conda activate vllm
-    echo 'Starting vllm api server...'
-    python -u -m vllm.entrypoints.openai.api_server \
-      --port 8081 \
-      --model $MODEL_NAME \
-      --trust-remote-code \
-      --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
-      2>&1 | tee api_server.log &
-
-    echo 'Waiting for vllm api server to start...'
-    while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
-
-    echo 'Starting gradio server...'
-    git clone https://github.com/vllm-project/vllm.git || true
-    python vllm/examples/gradio_openai_chatbot_webserver.py \
-      -m $MODEL_NAME \
-      --port 8811 \
-      --model-url http://localhost:8081/v1 \
-      --stop-token-ids 128009,128001
+    run: |
+      conda activate vllm
+      echo 'Starting vllm api server...'
+      python -u -m vllm.entrypoints.openai.api_server \
+        --port 8081 \
+        --model $MODEL_NAME \
+        --trust-remote-code \
+        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+        2>&1 | tee api_server.log
 
 .. raw:: html
 
-  </details>
+    </details>
 
 Start serving the Llama-3 8B model on multiple replicas:
 
 .. code-block:: console
 
-  HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN
+    HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN
 
 Wait until the service is ready:
 
 .. code-block:: console
 
-  watch -n10 sky serve status vllm
+    watch -n10 sky serve status vllm
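+
+While waiting, you can also stream the logs of an individual replica to watch the API server come up (for example, replica 1; see ``sky serve logs --help`` for the full syntax):
+
+.. code-block:: console
+
+    sky serve logs vllm 1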
 
 .. raw:: html
 
-  <details>
-  <summary>Example outputs:</summary>
+    <details>
+    <summary>Example outputs:</summary>
 
 .. code-block:: console
 
-  Services
-  NAME  VERSION  UPTIME  STATUS  REPLICAS  ENDPOINT
-  vllm  1        35s     READY   2/2       xx.yy.zz.100:30001
-
-  Service Replicas
-  SERVICE_NAME  ID  VERSION  IP            LAUNCHED     RESOURCES          STATUS  REGION
-  vllm          1   1        xx.yy.zz.121  18 mins ago  1x GCP({'L4': 1})  READY   us-east4
-  vllm          2   1        xx.yy.zz.245  18 mins ago  1x GCP({'L4': 1})  READY   us-east4
+    Services
+    NAME  VERSION  UPTIME  STATUS  REPLICAS  ENDPOINT
+    vllm  1        35s     READY   2/2       xx.yy.zz.100:30001
+
+    Service Replicas
+    SERVICE_NAME  ID  VERSION  IP            LAUNCHED     RESOURCES                STATUS  REGION
+    vllm          1   1        xx.yy.zz.121  18 mins ago  1x GCP([Spot]{'L4': 1})  READY   us-east4
+    vllm          2   1        xx.yy.zz.245  18 mins ago  1x GCP([Spot]{'L4': 1})  READY   us-east4
 
 .. raw:: html
 
-  </details>
+    </details>
 
 After the service is READY, you can find a single endpoint for the service and access the service via that endpoint:
 
 .. code-block:: console
 
-  ENDPOINT=$(sky serve status --endpoint 8081 vllm)
-  curl -L http://$ENDPOINT/v1/chat/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
-      "messages": [
-        {
-          "role": "system",
-          "content": "You are a helpful assistant."
-        },
-        {
-          "role": "user",
-          "content": "Who are you?"
-        }
-      ],
-      "stop_token_ids": [128009, 128001]
-    }'
+    ENDPOINT=$(sky serve status --endpoint 8081 vllm)
+    curl -L http://$ENDPOINT/v1/chat/completions \
+        -H "Content-Type: application/json" \
+        -d '{
+          "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+          "messages": [
+            {
+              "role": "system",
+              "content": "You are a helpful assistant."
+            },
+            {
+              "role": "user",
+              "content": "Who are you?"
+            }
+          ],
+          "stop_token_ids": [128009, 128001]
+        }'
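+
+The endpoint is OpenAI-compatible, so the same request pattern works for the other standard routes as well. As a quick sanity check (a minimal sketch, not part of the original recipe), you can stream tokens back by adding the standard OpenAI ``stream`` field:
+
+.. code-block:: console
+
+    curl -L http://$ENDPOINT/v1/completions \
+        -H "Content-Type: application/json" \
+        -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "prompt": "Hello, my name is", "max_tokens": 16, "stream": true}'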
 
+To enable autoscaling, you could replace ``replicas`` with the following configs in the ``service`` section:
 
 .. code-block:: yaml
 
-  services:
-    replica_policy:
-      min_replicas: 0
-      max_replicas: 3
-      target_qps_per_replica: 2
+    service:
+      replica_policy:
+        min_replicas: 2
+        max_replicas: 4
+        target_qps_per_replica: 2
 
 This will scale the service up when the QPS exceeds 2 for each replica, up to the configured maximum of 4 replicas.
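+
+If the service is allowed to go fully idle, the same fields also support scaling down to zero replicas (a sketch using the values from the earlier revision of this snippet; replicas are torn down when there is no traffic):
+
+.. code-block:: yaml
+
+    service:
+      replica_policy:
+        min_replicas: 0
+        max_replicas: 3
+        target_qps_per_replica: 2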
 
 .. raw:: html
 
+    <details>
+    <summary>Click to see the full recipe YAML</summary>
 
 
 .. code-block:: yaml
 
+    service:
+      replica_policy:
+        min_replicas: 2
+        max_replicas: 4
+        target_qps_per_replica: 2
+      # An actual request for readiness probe.
+      readiness_probe:
+        path: /v1/chat/completions
+        post_data:
+          model: $MODEL_NAME
+          messages:
+            - role: user
+              content: Hello! What is your name?
+          max_tokens: 1
+
+    resources:
+      accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}  # We can use cheaper accelerators for 8B model.
+      use_spot: True
+      disk_size: 512  # Ensure model checkpoints can fit.
+      disk_tier: best
+      ports: 8081  # Expose to internet traffic.
+
+    envs:
+      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+      HF_TOKEN: <your-huggingface-token>  # Change to your own huggingface token, or use --env to pass.
+
+    setup: |
+      conda create -n vllm python=3.10 -y
+      conda activate vllm
+
+      pip install vllm==0.4.0.post1
+      # Install Gradio for web UI.
+      pip install gradio openai
+      pip install flash-attn==2.5.7
+
+    run: |
+      conda activate vllm
+      echo 'Starting vllm api server...'
+      python -u -m vllm.entrypoints.openai.api_server \
+        --port 8081 \
+        --model $MODEL_NAME \
+        --trust-remote-code \
+        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+        2>&1 | tee api_server.log
 
 
 .. raw:: html
 
+    </details>
 
+To update the service with the new config:
+
+.. code-block:: console
+
+    HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN
+
+
+To stop the service:
+
+.. code-block:: console
+
+    sky serve down vllm
 
 
 **Optional**: Connect a GUI to the endpoint
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -253,58 +314,53 @@ **Optional**: Connect a GUI to the endpoint
 
 It is also possible to access the Llama-3 service with a separate GUI frontend, so that user requests sent to the GUI are load-balanced across replicas.
 
 .. raw:: html
 
-  <details>
-  <summary>Click to see the full GUI YAML</summary>
+    <details>
+    <summary>Click to see the full GUI YAML</summary>
 
 .. code-block:: yaml
 
-  envs:
-    MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct
-    ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm.
-
-  resources:
-    cpus: 2
-
-  setup: |
-    conda activate vllm
-    if [ $? -ne 0 ]; then
-      conda create -n vllm python=3.10 -y
-      conda activate vllm
-    fi
-
-    # Install Gradio for web UI.
-    pip install gradio openai
-
-  run: |
-    conda activate vllm
-    export PATH=$PATH:/sbin
-    WORKER_IP=$(hostname -I | cut -d' ' -f1)
-    CONTROLLER_PORT=21001
-    WORKER_PORT=21002
-
-    echo 'Starting gradio server...'
-    git clone https://github.com/vllm-project/vllm.git || true
-    python vllm/examples/gradio_openai_chatbot_webserver.py \
-      -m $MODEL_NAME \
-      --port 8811 \
-      --model-url http://$ENDPOINT/v1 \
-      --stop-token-ids 128009,128001 | tee ~/gradio.log
+    envs:
+      MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+      ENDPOINT: x.x.x.x:3031  # Address of the API server running vllm.
+
+    resources:
+      cpus: 2
+
+    setup: |
+      conda create -n vllm python=3.10 -y
+      conda activate vllm
+
+      # Install Gradio for web UI.
+      pip install gradio openai
+
+    run: |
+      conda activate vllm
+      export PATH=$PATH:/sbin
+
+      echo 'Starting gradio server...'
+      git clone https://github.com/vllm-project/vllm.git || true
+      python vllm/examples/gradio_openai_chatbot_webserver.py \
+        -m $MODEL_NAME \
+        --port 8811 \
+        --model-url http://$ENDPOINT/v1 \
+        --stop-token-ids 128009,128001 | tee ~/gradio.log
 
 .. raw:: html
 
-  </details>
+    </details>
 
 1. Start the chat web UI:
 
 .. code-block:: console
 
-  sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
+    sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
 
 2. Then, we can access the GUI at the returned gradio link:
 
 .. code-block:: console
 
-  | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live
+    | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live
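+
+When you are done with the GUI, terminate its cluster (named ``gui`` in the command above); this is separate from taking the service itself down with ``sky serve down vllm``:
+
+.. code-block:: console
+
+    sky down gui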