diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index 477ce9bc..9f185569 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -32,19 +32,20 @@ jobs:
         pip install types-setuptools
     - name: Mypy
       run: |
-        mypy vllm/attention/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/attention --config-file pyproject.toml
+        # TODO(sang): Fix nested dir
         mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/distributed/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/entrypoints/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/executor/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/usage/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/transformers_utils/*.py --follow-imports=skip --config-file pyproject.toml
-
-        mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/spec_decode/*.py --follow-imports=skip --config-file pyproject.toml
-        mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
-        # TODO(sang): Follow up
-        # mypy vllm/lora/*.py --follow-imports=skip --config-file pyproject.toml
+        mypy vllm/distributed --config-file pyproject.toml
+        mypy vllm/entrypoints --config-file pyproject.toml
+        mypy vllm/executor --config-file pyproject.toml
+        mypy vllm/usage --config-file pyproject.toml
+        mypy vllm/*.py --config-file pyproject.toml
+        mypy vllm/transformers_utils --config-file pyproject.toml
+        mypy vllm/engine --config-file pyproject.toml
+        mypy vllm/worker --config-file pyproject.toml
+        mypy vllm/spec_decode --config-file pyproject.toml
+        # TODO(sang): Fix nested dir
+        mypy vllm/model_executor/*.py --config-file pyproject.toml
+        # TODO(sang): Fix nested dir
+        # mypy vllm/lora/*.py --config-file pyproject.toml
 
diff --git a/format.sh b/format.sh
index 84ee88b5..bd2e9e89 100755
--- a/format.sh
+++ b/format.sh
@@ -94,21 +94,19 @@ echo 'vLLM yapf: Done'
 
 # Run mypy
 echo 'vLLM mypy:'
-mypy vllm/attention/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/attention --config-file pyproject.toml
 mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/distributed/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/entrypoints/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/executor/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/usage/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/transformers_utils/*.py --follow-imports=skip --config-file pyproject.toml
-
-# TODO(sang): Follow up
-mypy vllm/engine/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/worker/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/spec_decode/*.py --follow-imports=skip --config-file pyproject.toml
-mypy vllm/model_executor/*.py --follow-imports=skip --config-file pyproject.toml
-# mypy vllm/lora/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/distributed --config-file pyproject.toml
+mypy vllm/entrypoints --config-file pyproject.toml
+mypy vllm/executor --config-file pyproject.toml
+mypy vllm/usage --config-file pyproject.toml
+mypy vllm/*.py --config-file pyproject.toml
+mypy vllm/transformers_utils --config-file pyproject.toml
+mypy vllm/engine --config-file pyproject.toml
+mypy vllm/worker --config-file pyproject.toml
+mypy vllm/spec_decode --config-file pyproject.toml
+mypy vllm/model_executor/*.py --config-file pyproject.toml
+# mypy vllm/lora/*.py --config-file pyproject.toml
 
 
 CODESPELL_EXCLUDES=(
diff --git a/pyproject.toml b/pyproject.toml
index b870a4b8..a171d45b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,15 +46,17 @@ ignore = [
 python_version = "3.8"
 
 ignore_missing_imports = true
-    check_untyped_defs = true
+check_untyped_defs = true
+follow_imports = "skip"
 
 files = "vllm"
 # TODO(woosuk): Include the code from Megatron and HuggingFace.
 exclude = [
     "vllm/model_executor/parallel_utils/|vllm/model_executor/models/",
+    # Ignore triton kernels in ops.
+    'vllm/attention/ops/.*\.py$'
 ]
-
 
 [tool.codespell]
 ignore-words-list = "dout, te, indicies"
 skip = "./tests/prompts,./benchmarks/sonnet.txt"
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index 7a4ccecf..be747c99 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -116,7 +116,7 @@ class AttentionImpl(ABC):
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata[AttentionMetadataPerStage],
+        attn_metadata: AttentionMetadata,
         kv_scale: float,
     ) -> torch.Tensor:
         raise NotImplementedError
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index dbaa71fd..7c5863a0 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -248,6 +248,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
 
         if prefill_meta := attn_metadata.prefill_metadata:
             # Prompt run.
+            assert prefill_meta.prompt_lens is not None
             if kv_cache is None or prefill_meta.block_tables.numel() == 0:
                 # triton attention
                 # When block_tables are not filled, it means q and k are the
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index d21b54b1..55a7ce59 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -106,7 +106,7 @@ class TorchSDPABackendImpl(AttentionImpl):
         key: torch.Tensor,
         value: torch.Tensor,
         kv_cache: Optional[torch.Tensor],
-        attn_metadata: TorchSDPAMetadata,
+        attn_metadata: TorchSDPAMetadata,  # type: ignore
         kv_scale: float,
     ) -> torch.Tensor:
         """Forward pass with torch SDPA and PagedAttention.
@@ -136,6 +136,7 @@ class TorchSDPABackendImpl(AttentionImpl):
                                                 kv_scale)
 
         if attn_metadata.is_prompt:
+            assert attn_metadata.prompt_lens is not None
             if (kv_cache is None or attn_metadata.block_tables.numel() == 0):
                 if self.num_kv_heads != self.num_heads:
                     key = key.repeat_interleave(self.num_queries_per_kv, dim=1)
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index b745a04a..572a4dc7 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -288,6 +288,7 @@ class XFormersImpl(AttentionImpl):
             value: shape = [num_prefill_tokens, num_kv_heads, head_size]
             attn_metadata: Metadata for attention.
         """
+        assert attn_metadata.prompt_lens is not None
         original_query = query
         if self.num_kv_heads != self.num_heads:
             # GQA/MQA requires the shape [B, M, G, H, K].
diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index 560267e5..f1b65b25 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -104,6 +104,7 @@ class BlockTable:
             token_ids (List[int]): The sequence of token IDs to be appended.
         """
         assert self._is_allocated
+        assert self._blocks is not None
 
         self.ensure_num_empty_slots(num_empty_slots=len(token_ids) +
                                     num_lookahead_slots)
diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py
index 50c70533..f11234a0 100644
--- a/vllm/core/block/common.py
+++ b/vllm/core/block/common.py
@@ -99,7 +99,7 @@ class CopyOnWriteTracker:
         refcounter: RefCounter,
         allocator: BlockAllocator,
     ):
-        self._copy_on_writes = defaultdict(list)
+        self._copy_on_writes: Dict[BlockId, List[BlockId]] = defaultdict(list)
         self._refcounter = refcounter
         self._allocator = allocator
 
@@ -138,6 +138,8 @@ class CopyOnWriteTracker:
                     prev_block=block.prev_block).block_id
 
                 # Track src/dst copy.
+                assert src_block_id is not None
+                assert block_id is not None
                 self._copy_on_writes[src_block_id].append(block_id)
 
         return block_id
@@ -180,6 +182,6 @@ def get_all_blocks_recursively(last_block: Block) -> List[Block]:
             recurse(block.prev_block, lst)
         lst.append(block)
 
-    all_blocks = []
+    all_blocks: List[Block] = []
     recurse(last_block, all_blocks)
     return all_blocks
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py
index fbceacf0..50ce9221 100644
--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@@ -52,8 +52,7 @@ class Block(ABC):
 class BlockAllocator(ABC):
 
     @abstractmethod
-    def allocate_mutable(self, prev_block: Optional[Block],
-                         device: Device) -> Block:
+    def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
         pass
 
     @abstractmethod
@@ -98,8 +97,7 @@ class BlockAllocator(ABC):
 class DeviceAwareBlockAllocator(BlockAllocator):
 
     @abstractmethod
-    def allocate_mutable(self, prev_block: Optional[Block],
-                         device: Device) -> Block:
+    def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
         pass
 
     @abstractmethod
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index 58cbe77b..9dbb427d 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -1,6 +1,6 @@
 import os
 from contextlib import contextmanager
-from typing import List, Optional
+from typing import Any, List, Optional
 
 import torch
 import torch.distributed as dist
@@ -18,7 +18,7 @@ except ImportError:
 
 logger = init_logger(__name__)
 
-_CA_HANDLE = None
+_CA_HANDLE: Optional["CustomAllreduce"] = None
 _IS_CAPTURING = False
 _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
 
@@ -51,7 +51,7 @@ def init_custom_ar() -> None:
             "Cannot test GPU P2P because not all GPUs are visible to the "
            "current process. This might be the case if 'CUDA_VISIBLE_DEVICES'"
             " is set.")
-        return False
+        return
     # test nvlink first, this will filter out most of the cases
     # where custom allreduce is not supported
     if "CUDA_VISIBLE_DEVICES" in os.environ:
@@ -117,7 +117,7 @@ def custom_all_reduce(input: torch.Tensor) -> Optional[torch.Tensor]:
     ca_handle = get_handle()
     # when custom allreduce is disabled, this will be None
     if ca_handle is None:
-        return
+        return None
     if is_capturing():
         if torch.cuda.is_current_stream_capturing():
             if ca_handle.should_custom_ar(input):
@@ -135,6 +135,8 @@ def custom_all_reduce(input: torch.Tensor) -> Optional[torch.Tensor]:
         if ca_handle.should_custom_ar(input):
             return ca_handle.all_reduce_unreg(input)
 
+    return None
+
 
 @contextmanager
 def _nvml():
@@ -224,14 +226,14 @@ class CustomAllreduce:
         return self._gather_ipc_meta(shard_data)
 
     def _gather_ipc_meta(self, shard_data):
-        all_data = [None] * self.world_size
+        all_data: List[Optional[Any]] = [None] * self.world_size
         dist.all_gather_object(all_data, shard_data)
 
         handles = []
         offsets = []
         for i in range(len(all_data)):
-            handles.append(all_data[i][0])
-            offsets.append(all_data[i][1])
+            handles.append(all_data[i][0])  # type: ignore
+            offsets.append(all_data[i][1])  # type: ignore
         return handles, offsets
 
     def register_buffer(self, inp: torch.Tensor):
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index c57a4f59..0707afe9 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -107,9 +107,10 @@ _c_ncclCommInitRank.argtypes = [
     ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, NcclUniqueId, ctypes.c_int
 ]
 
+ncclDataType_t = ctypes.c_int
 
-# enums
-class ncclDataType_t(ctypes.c_int):
+
+class ncclDataTypeEnum:
     ncclInt8 = 0
     ncclChar = 0
     ncclUint8 = 1
@@ -128,7 +129,7 @@ class ncclDataType_t(ctypes.c_int):
     ncclNumTypes = 10
 
     @classmethod
-    def from_torch(cls, dtype: torch.dtype) -> 'ncclDataType_t':
+    def from_torch(cls, dtype: torch.dtype) -> int:
         if dtype == torch.int8:
             return cls.ncclInt8
         if dtype == torch.uint8:
@@ -148,7 +149,10 @@ class ncclDataType_t(ctypes.c_int):
         raise ValueError(f"Unsupported dtype: {dtype}")
 
 
-class ncclRedOp_t(ctypes.c_int):
+ncclRedOp_t = ctypes.c_int
+
+
+class ncclRedOpTypeEnum:
     ncclSum = 0
     ncclProd = 1
     ncclMax = 2
@@ -157,7 +161,7 @@ class ncclRedOp_t(ctypes.c_int):
     ncclNumOps = 5
 
     @classmethod
-    def from_torch(cls, op: ReduceOp) -> 'ncclRedOp_t':
+    def from_torch(cls, op: ReduceOp) -> int:
         if op == ReduceOp.SUM:
             return cls.ncclSum
         if op == ReduceOp.PRODUCT:
@@ -180,8 +184,8 @@ class ncclRedOp_t(ctypes.c_int):
 _c_ncclAllReduce = nccl.ncclAllReduce
 _c_ncclAllReduce.restype = ctypes.c_int
 _c_ncclAllReduce.argtypes = [
-    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ncclDataType_t,
-    ncclRedOp_t, ctypes.c_void_p, ctypes.c_void_p
+    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ncclRedOp_t,
+    ncclDataType_t, ctypes.c_void_p, ctypes.c_void_p
 ]
 
 # equivalent to c declaration:
@@ -251,8 +255,8 @@ class NCCLCommunicator:
         result = _c_ncclAllReduce(ctypes.c_void_p(tensor.data_ptr()),
                                   ctypes.c_void_p(tensor.data_ptr()),
                                   tensor.numel(),
-                                  ncclDataType_t.from_torch(tensor.dtype),
-                                  ncclRedOp_t.from_torch(op), self.comm,
+                                  ncclDataTypeEnum.from_torch(tensor.dtype),
+                                  ncclRedOpTypeEnum.from_torch(op), self.comm,
                                   ctypes.c_void_p(stream.cuda_stream))
         assert result == 0
 
diff --git a/vllm/distributed/device_communicators/pynccl_utils.py b/vllm/distributed/device_communicators/pynccl_utils.py
index aeb73015..916dc814 100644
--- a/vllm/distributed/device_communicators/pynccl_utils.py
+++ b/vllm/distributed/device_communicators/pynccl_utils.py
@@ -30,6 +30,7 @@ def is_initialized() -> bool:
 def set_pynccl_stream(stream: torch.cuda.Stream):
     """Set the cuda stream for communication"""
     try:
+        assert comm is not None
         comm.stream = stream
         yield
     finally:
@@ -52,6 +53,7 @@ def init_process_group(world_size: int,
 def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None:
     """All-reduces the input tensor across the process group."""
     assert input_.is_cuda, f"{input_} should be a cuda tensor"
+    assert comm is not None
     comm.all_reduce(input_, op)
 
 
@@ -62,8 +64,9 @@ def destroy_process_group() -> None:
 
 def get_world_size() -> int:
     """Returns the world size."""
+    assert comm is not None
     return comm.world_size
 
 
-def get_nccl_backend():
+def get_nccl_backend() -> Optional["NCCLCommunicator"]:
     return comm
diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py
index 9ddac7a0..f307ea4d 100644
--- a/vllm/engine/output_processor/interfaces.py
+++ b/vllm/engine/output_processor/interfaces.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Callable, Iterable, List
+from typing import Callable, List
 
 from transformers import PreTrainedTokenizer
 
@@ -8,6 +8,7 @@ from vllm.core.scheduler import Scheduler
 from vllm.engine.output_processor.stop_checker import StopChecker
 from vllm.sequence import Sequence, SequenceGroup, SequenceGroupOutput
 from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.utils import Counter
 
 
 class SequenceGroupOutputProcessor(ABC):
@@ -27,7 +28,7 @@ class SequenceGroupOutputProcessor(ABC):
         scheduler_config: SchedulerConfig,
         detokenizer: Detokenizer,
         scheduler: Scheduler,
-        seq_counter: Iterable[int],
+        seq_counter: Counter,
         get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
         stop_checker: "StopChecker",
     ):
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
index 50da0d35..39e99d06 100644
--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -1,4 +1,4 @@
-from typing import Callable, Iterable, List
+from typing import Callable, List
 
 from transformers import PreTrainedTokenizer
 
@@ -11,6 +11,7 @@ from vllm.sampling_params import SamplingParams
 from vllm.sequence import (Logprob, Sequence, SequenceGroup,
                            SequenceGroupOutput, SequenceOutput, SequenceStatus)
 from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.utils import Counter
 
 logger = init_logger(__name__)
 
@@ -33,7 +34,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
         self,
         detokenizer: Detokenizer,
         scheduler: Scheduler,
-        seq_counter: Iterable[int],
+        seq_counter: Counter,
         get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
         stop_checker: StopChecker,
     ):
diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py
index b3293732..7e9d6524 100644
--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@@ -1,4 +1,4 @@
-from typing import Iterable, List, Tuple, Union
+from typing import Dict, List, Tuple, Union
 
 from vllm.config import SchedulerConfig
 from vllm.core.scheduler import Scheduler
@@ -10,6 +10,7 @@ from vllm.sampling_params import SamplingParams
 from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput,
                            SequenceOutput, SequenceStatus)
 from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.utils import Counter
 
 logger = init_logger(__name__)
 
@@ -33,7 +34,7 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
         scheduler_config: SchedulerConfig,
         detokenizer: Detokenizer,
         scheduler: Scheduler,
-        seq_counter: Iterable[int],
+        seq_counter: Counter,
         stop_checker: StopChecker,
     ):
         self.scheduler_config = scheduler_config
@@ -69,7 +70,7 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
         samples = outputs.samples
         parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
         existing_finished_seqs = seq_group.get_finished_seqs()
-        parent_child_dict = {
+        parent_child_dict: Dict[int, List[SequenceOutput]] = {
             parent_seq.seq_id: []
             for parent_seq in parent_seqs
         }
@@ -92,7 +93,7 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
                 continue
             # Fork the parent sequence if there are multiple child samples.
             for child_sample in child_samples[:-1]:
-                new_child_seq_id = next(self.seq_counter)
+                new_child_seq_id: int = next(self.seq_counter)
                 child = parent.fork(new_child_seq_id)
                 child.append_token_id(child_sample.output_token,
                                       child_sample.logprobs)
diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py
index 5fbb09a8..d076fee8 100644
--- a/vllm/engine/output_processor/util.py
+++ b/vllm/engine/output_processor/util.py
@@ -8,7 +8,9 @@ def create_output_by_sequence_group(sampler_outputs: List[SamplerOutput],
     """Helper method which transforms a 2d list organized by
     [step][sequence group] into [sequence group][step].
     """
-    output_by_sequence_group = [[] for _ in range(num_seq_groups)]
+    output_by_sequence_group: List[List[SamplerOutput]] = [
+        [] for _ in range(num_seq_groups)
+    ]
     for step in sampler_outputs:
         for i, sequence_group_output in enumerate(step):
             output_by_sequence_group[i].append(sequence_group_output)
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index d6673976..37d76b8e 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -18,6 +18,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.cli_args import make_arg_parser
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              ChatCompletionResponse,
                                               CompletionRequest, ErrorResponse)
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
@@ -26,8 +27,8 @@ from vllm.usage.usage_lib import UsageContext
 
 TIMEOUT_KEEP_ALIVE = 5  # seconds
 
-openai_serving_chat: OpenAIServingChat = None
-openai_serving_completion: OpenAIServingCompletion = None
+openai_serving_chat: OpenAIServingChat
+openai_serving_completion: OpenAIServingCompletion
 logger = init_logger(__name__)
 
 
@@ -95,6 +96,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
         return StreamingResponse(content=generator,
                                  media_type="text/event-stream")
     else:
+        assert isinstance(generator, ChatCompletionResponse)
         return JSONResponse(content=generator.model_dump())
 
 
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index cf779d44..d9763d02 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -4,7 +4,8 @@ import time
 from typing import Dict, List, Literal, Optional, Union
 
 import torch
-from pydantic import BaseModel, Field, conint, model_validator
+from pydantic import BaseModel, Field, model_validator
+from typing_extensions import Annotated
 
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
@@ -30,7 +31,7 @@ class ModelPermission(BaseModel):
     allow_fine_tuning: bool = False
     organization: str = "*"
     group: Optional[str] = None
-    is_blocking: str = False
+    is_blocking: bool = False
 
 
 class ModelCard(BaseModel):
@@ -56,7 +57,7 @@ class UsageInfo(BaseModel):
 
 class ResponseFormat(BaseModel):
     # type must be "json_object" or "text"
-    type: str = Literal["text", "json_object"]
+    type: Literal["text", "json_object"]
 
 
 class ChatCompletionRequest(BaseModel):
@@ -152,6 +153,7 @@ class ChatCompletionRequest(BaseModel):
             def logit_bias_logits_processor(
                     token_ids: List[int],
                     logits: torch.Tensor) -> torch.Tensor:
+                assert self.logit_bias is not None
                 for token_id, bias in self.logit_bias.items():
                     # Clamp the bias between -100 and 100 per OpenAI API spec
                     bias = min(100, max(-100, bias))
@@ -213,7 +215,7 @@ class CompletionRequest(BaseModel):
     logit_bias: Optional[Dict[str, float]] = None
     logprobs: Optional[int] = None
     max_tokens: Optional[int] = 16
-    n: Optional[int] = 1
+    n: int = 1
     presence_penalty: Optional[float] = 0.0
     seed: Optional[int] = None
     stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
@@ -235,7 +237,7 @@ class CompletionRequest(BaseModel):
     min_tokens: Optional[int] = 0
     skip_special_tokens: Optional[bool] = True
     spaces_between_special_tokens: Optional[bool] = True
-    truncate_prompt_tokens: Optional[conint(ge=1)] = None
+    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
     # doc: end-completion-sampling-params
 
     # doc: begin-completion-extra-params
@@ -289,6 +291,7 @@ class CompletionRequest(BaseModel):
             def logit_bias_logits_processor(
                     token_ids: List[int],
                     logits: torch.Tensor) -> torch.Tensor:
+                assert self.logit_bias is not None
                 for token_id, bias in self.logit_bias.items():
                     # Clamp the bias between -100 and 100 per OpenAI API spec
                     bias = min(100, max(-100, bias))
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index f35eab15..d502dd0a 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -115,12 +115,12 @@ class OpenAIServingChat(OpenAIServing):
 
         first_iteration = True
         # Send response for each token for each request.n (index)
+        assert request.n is not None
         previous_texts = [""] * request.n
         previous_num_tokens = [0] * request.n
         finish_reason_sent = [False] * request.n
         try:
             async for res in result_generator:
-                res: RequestOutput
                 # We need to do it here, because if there are exceptions in
                 # the result_generator, it needs to be sent as the FIRST
                 # response (by the try...catch).
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index b7e2530a..211b2e04 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -185,6 +185,7 @@ class OpenAIServingCompletion(OpenAIServing):
         model_name: str,
         num_prompts: int,
     ) -> AsyncGenerator[str, None]:
+        assert request.n is not None
         previous_texts = [""] * request.n * num_prompts
         previous_num_tokens = [0] * request.n * num_prompts
         has_echoed = [False] * request.n * num_prompts
@@ -202,6 +203,7 @@ class OpenAIServingCompletion(OpenAIServing):
 
                     # TODO(simon): optimize the performance by avoiding full
                     # text O(n^2) sending.
+                    assert request.max_tokens is not None
                     if request.echo and request.max_tokens == 0:
                         # only return the prompt
                         delta_text = res.prompt
@@ -279,7 +281,7 @@ class OpenAIServingCompletion(OpenAIServing):
         created_time: int,
         model_name: str,
     ) -> CompletionResponse:
-        choices = []
+        choices: List[CompletionResponseChoice] = []
         num_prompt_tokens = 0
         num_generated_tokens = 0
         for final_res in final_res_batch:
@@ -289,6 +291,7 @@ class OpenAIServingCompletion(OpenAIServing):
             prompt_text = final_res.prompt
 
             for output in final_res.outputs:
+                assert request.max_tokens is not None
                 if request.echo and request.max_tokens == 0:
                     token_ids = prompt_token_ids
                     top_logprobs = prompt_logprobs
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 376b5810..610e807c 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -4,7 +4,9 @@ from dataclasses import dataclass
 from http import HTTPStatus
 from typing import Dict, List, Optional, Tuple, Union
 
-from pydantic import conint
+from pydantic import Field
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+from typing_extensions import Annotated
 
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
@@ -45,7 +47,8 @@ class OpenAIServing:
         ]
 
         self.max_model_len = 0
-        self.tokenizer = None
+        # Lazy initialized
+        self.tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
 
         try:
             event_loop = asyncio.get_running_loop()
@@ -92,7 +95,7 @@ class OpenAIServing:
     def _create_logprobs(
         self,
         token_ids: List[int],
-        top_logprobs: Optional[List[Optional[Dict[int, Logprob]]]] = None,
+        top_logprobs: List[Optional[Dict[int, Logprob]]],
         num_output_top_logprobs: Optional[int] = None,
         initial_text_offset: int = 0,
     ) -> LogProbs:
@@ -108,6 +111,7 @@ class OpenAIServing:
                 token = self.tokenizer.decode(token_id)
                 logprobs.tokens.append(token)
                 logprobs.token_logprobs.append(None)
+                assert logprobs.top_logprobs is not None
                 logprobs.top_logprobs.append(None)
             else:
                 token_logprob = step_top_logprobs[token_id].logprob
@@ -116,6 +120,7 @@ class OpenAIServing:
                 logprobs.token_logprobs.append(token_logprob)
 
                 if num_output_top_logprobs:
+                    assert logprobs.top_logprobs is not None
                     logprobs.top_logprobs.append({
                         # Convert float("-inf") to the
                         # JSON-serializable float that OpenAI uses
@@ -155,9 +160,9 @@ class OpenAIServing:
 
     async def _check_model(self, request) -> Optional[ErrorResponse]:
         if request.model in self.served_model_names:
-            return
+            return None
         if request.model in [lora.lora_name for lora in self.lora_requests]:
-            return
+            return None
         return self.create_error_response(
             message=f"The model `{request.model}` does not exist.",
             err_type="NotFoundError",
@@ -165,7 +170,7 @@ class OpenAIServing:
 
     def _maybe_get_lora(self, request) -> Optional[LoRARequest]:
         if request.model in self.served_model_names:
-            return
+            return None
         for lora in self.lora_requests:
             if request.model == lora.lora_name:
                 return lora
@@ -177,7 +182,7 @@ class OpenAIServing:
         request: Union[ChatCompletionRequest, CompletionRequest],
         prompt: Optional[str] = None,
         prompt_ids: Optional[List[int]] = None,
-        truncate_prompt_tokens: Optional[conint(ge=1)] = None
+        truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
     ) -> Tuple[List[int], str]:
         if not (prompt or prompt_ids):
             raise ValueError("Either prompt or prompt_ids should be provided.")
diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py
index 21c2196e..fefad167 100644
--- a/vllm/lora/lora.py
+++ b/vllm/lora/lora.py
@@ -33,7 +33,7 @@ class LoRALayerWeights:
     def optimize(self) -> "LoRALayerWeights":
         """Optimize the LoRA by merging the scaling into lora_b."""
         if self.scaling == 1:
-            return
+            return self
         self.lora_b *= self.scaling
         self.scaling = 1
         return self
diff --git a/vllm/model_executor/layers/ops/sample.py b/vllm/model_executor/layers/ops/sample.py
index a19e9461..d08ae606 100644
--- a/vllm/model_executor/layers/ops/sample.py
+++ b/vllm/model_executor/layers/ops/sample.py
@@ -29,8 +29,8 @@ def _multi_split_sample(
     sampled_tokens_size: Tuple[int, int],
     sampled_logprobs_size: Tuple[int, int],
     sample_indices: torch.Tensor,
+    logprobs: torch.Tensor,
     *,
-    logprobs: Optional[torch.Tensor] = None,
     modify_greedy_probs: bool = False,
     save_logprobs: bool = False,
 ):
@@ -167,6 +167,7 @@ def sample(
         sampled_logprobs_size = (0, 0)
         logprobs = probs
 
+    assert logprobs is not None
     if _save_modified_probs:
         sampled_modified_probs_size = sampled_tokens_size
     else:
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 6519781c..a5225148 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -108,7 +108,8 @@ class RotaryEmbedding(nn.Module):
         query_pass = query[..., self.rotary_dim:]
         key_pass = key[..., self.rotary_dim:]
 
-        self.cos_sin_cache = self.cos_sin_cache.to(positions.device)
+        self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(
+            positions.device)
         cos_sin = self.cos_sin_cache[torch.add(positions, offsets)
                                      if offsets is not None else positions]
         cos, sin = cos_sin.chunk(2, dim=-1)
diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py
index 94f43871..b06a946f 100644
--- a/vllm/transformers_utils/configs/jais.py
+++ b/vllm/transformers_utils/configs/jais.py
@@ -222,13 +222,15 @@ class JAISConfig(PretrainedConfig):
                 f"got {alibi_scaling_type}")
         if (alibi_scaling_factor is not None
                 and not isinstance(alibi_scaling_factor, float)
-                or alibi_scaling_factor <= 1.0):
+                or (alibi_scaling_factor is not None
+                    and alibi_scaling_factor <= 1.0)):
             raise ValueError(
                 f"`alibi_scaling`'s factor field must be a float > 1.0,"
                 f"got {alibi_scaling_factor}")
         if (alibi_dynamic_scaling is not None
                 and not isinstance(alibi_dynamic_scaling, int)
-                or alibi_dynamic_scaling <= 1):
+                or (alibi_dynamic_scaling is not None
+                    and alibi_dynamic_scaling <= 1)):
             raise ValueError(
                 f"`alibi_scaling`'s `train_seq_len` field must be an"
                 f"integer > 1, got {alibi_dynamic_scaling}")
diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py
index a3b979e8..69380d67 100644
--- a/vllm/transformers_utils/tokenizer_group/__init__.py
+++ b/vllm/transformers_utils/tokenizer_group/__init__.py
@@ -11,7 +11,7 @@ if ray:
     from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
         RayTokenizerGroupPool)
 else:
-    RayTokenizerGroupPool = None
+    RayTokenizerGroupPool = None  # type: ignore
 
 
 def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig],
diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
index c00b02fd..f3cdc005 100644
--- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
@@ -89,6 +89,7 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
         This is blocking.
         """
         self._ensure_queue_initialized()
+        assert self._idle_actors is not None
 
         if self._idle_actors.empty():
             raise RuntimeError("No idle actors available.")
@@ -120,6 +121,7 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
         This is non-blocking.
         """
         self._ensure_queue_initialized()
+        assert self._idle_actors is not None
 
         actor = await self._idle_actors.get()
         try:
diff --git a/vllm/transformers_utils/tokenizers/baichuan.py b/vllm/transformers_utils/tokenizers/baichuan.py
index 79894035..76daabc4 100644
--- a/vllm/transformers_utils/tokenizers/baichuan.py
+++ b/vllm/transformers_utils/tokenizers/baichuan.py
@@ -114,9 +114,9 @@ class BaichuanTokenizer(PreTrainedTokenizer):
         token = self.sp_model.IdToPiece(index)
         return token
 
-    def convert_tokens_to_string(self, tokens):
+    def convert_tokens_to_string(self, tokens: List[str]):
         """Converts a sequence of tokens (string) in a single string."""
-        current_sub_tokens = []
+        current_sub_tokens: List[str] = []
         out_string = ""
         prev_is_special = False
         for i, token in enumerate(tokens):