class OutputProcessor:
"""Process EngineCoreOutputs into RequestOutputs."""
def __init__(
self,
tokenizer: TokenizerLike | None,
log_stats: bool,
stream_interval: int = 1,
):
self.log_stats = log_stats
self.tokenizer = tokenizer
self.stream_interval = stream_interval
self.request_states: dict[str, RequestState] = {}
self.parent_requests: dict[str, ParentRequest] = {}
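        # Maps an external (client-supplied) request ID to the internal request
        # IDs generated for it (one external ID may map to multiple requests).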
self.external_req_ids: defaultdict[str, list[str]] = defaultdict(list)
self.lora_states = LoRARequestStates(log_stats)
self.tracing_enabled: bool = False
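        # Set whenever there are no in-flight request states; cleared when a
        # request is added. Awaited by wait_for_requests_to_drain().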
self._requests_drained = asyncio.Event()
self._requests_drained.set()
    def get_num_unfinished_requests(self) -> int:
return len(self.request_states)
def has_unfinished_requests(self) -> bool:
return len(self.request_states) > 0
async def wait_for_requests_to_drain(self) -> None:
if not self.request_states:
return
await self._requests_drained.wait()
def propagate_error(self, e: Exception):
"""Propagate error to all generate() tasks."""
        for state in self.request_states.values():
assert state.queue is not None
state.queue.put(e)
def abort_requests(self, request_ids: Iterable[str], internal: bool) -> list[str]:
"""Abort a list of requests.
The request_ids may be either external request IDs (those passed to
InputProcessor.process_inputs()) or internal request IDs (those randomly
generated when creating the EngineCoreRequest).
        If an external request ID was used for multiple requests, all requests
        associated with that external ID are aborted.
        In the case of parallel sampling, a request ID may identify a parent
        request, in which case its child requests are also aborted.
"""
internal_req_ids = []
for request_id in request_ids:
if internal:
# Internal ID - this may be a parent request
internal_req_ids.append(request_id)
# Remove internal ID from the external->internal mapping
if req_state := self.request_states.get(request_id):
external_req_id = req_state.external_req_id
internal_ids = self.external_req_ids[external_req_id]
internal_ids.remove(request_id)
if not internal_ids:
del self.external_req_ids[external_req_id]
elif internal_ids := self.external_req_ids.pop(request_id, []):
# External ID - abort all requests in the external->internal mapping
internal_req_ids.extend(internal_ids)
request_ids_to_abort = []
for request_id in internal_req_ids:
req_state = self.request_states.pop(request_id, None)
if req_state is not None:
self.lora_states.request_finished(request_id, req_state.lora_name)
request_ids_to_abort.append(request_id)
# Produce final abort output.
if req_state.queue is not None and (
request_output := req_state.make_request_output(
new_token_ids=[],
                        # Pass a non-None pooling_output for pooling requests
                        # (no detokenizer) so that the pooling abort branch is
                        # taken when building the final output.
pooling_output=EMPTY_CPU_TENSOR
if req_state.detokenizer is None
else None,
finish_reason=FinishReason.ABORT,
stop_reason=None,
kv_transfer_params=None,
)
):
req_state.queue.put(request_output)
elif parent := self.parent_requests.get(request_id):
# Abort children prior to removing the parent.
if parent.child_requests:
child_reqs = list(parent.child_requests)
child_reqs = self.abort_requests(child_reqs, internal=True)
request_ids_to_abort.extend(child_reqs)
self.parent_requests.pop(request_id, None)
if not self.request_states:
self._requests_drained.set()
return request_ids_to_abort
def add_request(
self,
request: EngineCoreRequest,
prompt: str | None,
parent_req: ParentRequest | None = None,
request_index: int = 0,
queue: RequestOutputCollector | None = None,
) -> None:
request_id = request.request_id
req_state = self.request_states.get(request_id)
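        # An existing state means this is a follow-up chunk of a streaming
        # (resumable) request; fold the new input into the existing state
        # instead of creating a new one.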
if req_state is not None:
self._update_streaming_request_state(req_state, request, prompt)
return
req_state = RequestState.from_new_request(
tokenizer=self.tokenizer,
request=request,
prompt=prompt,
parent_req=parent_req,
request_index=request_index,
queue=queue,
log_stats=self.log_stats,
stream_interval=self.stream_interval,
)
if self._requests_drained.is_set():
self._requests_drained.clear()
self.request_states[request_id] = req_state
if parent_req:
self.parent_requests[parent_req.request_id] = parent_req
# Track the external_req_id -> [internal_req_id, ...] mapping
self.external_req_ids[req_state.external_req_id].append(request_id)
def _update_streaming_request_state(
self, req_state: RequestState, request: EngineCoreRequest, prompt: str | None
) -> None:
"""Queue a streaming update instead of immediately applying it."""
if not request.resumable:
# Final request - just mark completion, don't add its dummy tokens.
if req_state.input_chunk_queue is None:
# Engine already finished - emit final output and clean up.
self._finish_request(req_state)
if req_state.queue is not None:
# Emit a final output with finished=True
# to unblock the generate() loop.
req_state.queue.put(STREAM_FINISHED)
elif req_state.input_chunk_queue:
req_state.input_chunk_queue[-1].final = True
else:
req_state.streaming_input = False
return
update = StreamingUpdate(
prompt=prompt,
prompt_token_ids=request.prompt_token_ids,
arrival_time=request.arrival_time,
)
# Apply request updates now if the last input already completed.
if req_state.input_chunk_queue is None:
req_state.apply_streaming_update(update)
req_state.input_chunk_queue = deque()
else:
# Queue the streaming update otherwise.
req_state.input_chunk_queue.append(update)
def process_outputs(
self,
engine_core_outputs: list[EngineCoreOutput],
engine_core_timestamp: float | None = None,
iteration_stats: IterationStats | None = None,
) -> OutputProcessorOutput:
"""
Process the EngineCoreOutputs:
1) Compute stats for logging
2) Detokenize
3) Create and handle RequestOutput objects:
* If there is a queue (for usage with AsyncLLM),
put the RequestOutput objects into the queue for
handling by the per-request generate() tasks.
* If there is no queue (for usage with LLMEngine),
return a list of RequestOutput objects.
NOTE FOR DEVELOPERS
        vLLM V1 minimizes the number of Python loops over the full
batch to ensure system overheads are minimized. This is the
only function that should loop over EngineCoreOutputs.
If you need to touch every element of the batch, do it from
within the loop below.
"""
request_outputs: list[RequestOutput | PoolingRequestOutput] = []
reqs_to_abort: list[str] = []
for engine_core_output in engine_core_outputs:
req_id = engine_core_output.request_id
req_state = self.request_states.get(req_id)
if req_state is None:
# Ignore output for already-aborted request.
continue
# 1) Compute stats for this iteration.
self._update_stats_from_output(
req_state, engine_core_output, engine_core_timestamp, iteration_stats
)
new_token_ids = engine_core_output.new_token_ids
pooling_output = engine_core_output.pooling_output
finish_reason = engine_core_output.finish_reason
stop_reason = engine_core_output.stop_reason
kv_transfer_params = engine_core_output.kv_transfer_params
routed_experts = engine_core_output.routed_experts
req_state.num_cached_tokens = engine_core_output.num_cached_tokens
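            # Receiving any output for this request means the prefill phase is
            # over; record that for per-request stats.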
req_state.is_prefilling = False
if pooling_output is None:
assert req_state.detokenizer is not None
assert req_state.logprobs_processor is not None
# 2) Detokenize the token ids into text and perform stop checks.
stop_string = req_state.detokenizer.update(
new_token_ids, finish_reason == FinishReason.STOP
)
if stop_string:
finish_reason = FinishReason.STOP
stop_reason = stop_string
# 3) Compute sample and prompt logprobs for request,
# if required.
req_state.logprobs_processor.update_from_output(engine_core_output)
# 4) Create and handle RequestOutput objects.
if request_output := req_state.make_request_output(
new_token_ids,
pooling_output,
finish_reason,
stop_reason,
kv_transfer_params,
routed_experts,
):
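                # While the request is still accepting streaming input chunks,
                # never report an intermediate output as finished.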
if req_state.streaming_input:
request_output.finished = False
if req_state.queue is not None:
# AsyncLLM: put into queue for handling by generate().
req_state.queue.put(request_output)
else:
# LLMEngine: return list of RequestOutputs.
request_outputs.append(request_output)
# Free completed requests.
if finish_reason is not None:
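                # For a streaming-input request this may only be the end of the
                # current input chunk; otherwise the request is truly finished
                # and its state can be freed.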
if req_state.streaming_input:
if req_state.input_chunk_queue:
update = req_state.input_chunk_queue.popleft()
req_state.apply_streaming_update(update)
else:
req_state.input_chunk_queue = None
else:
self._finish_request(req_state)
if not engine_core_output.finished:
                        # The request was not finished in EngineCore, but the
                        # detokenizer detected a stop string, so EngineCore
                        # still needs to be told to abort it.
reqs_to_abort.append(req_id)
# Track per-request stats
self._update_stats_from_finished(
req_state, finish_reason, iteration_stats
)
if self.tracing_enabled:
self.do_tracing(engine_core_output, req_state, iteration_stats)
return OutputProcessorOutput(
request_outputs=request_outputs,
reqs_to_abort=reqs_to_abort,
)
def _finish_request(self, req_state: RequestState) -> None:
req_id = req_state.request_id
self.request_states.pop(req_id)
internal_ids = self.external_req_ids[req_state.external_req_id]
internal_ids.remove(req_id)
if not internal_ids:
del self.external_req_ids[req_state.external_req_id]
# Remove parent request if applicable.
parent_req = req_state.parent_req
if parent_req and not parent_req.child_requests:
self.parent_requests.pop(parent_req.request_id, None)
if not self.request_states:
self._requests_drained.set()
def update_scheduler_stats(self, scheduler_stats: SchedulerStats | None):
self.lora_states.update_scheduler_stats(scheduler_stats)
def do_tracing(
self,
engine_core_output: EngineCoreOutput,
req_state: RequestState,
iteration_stats: IterationStats | None,
) -> None:
assert req_state.stats is not None
assert iteration_stats is not None
metrics = req_state.stats
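        # Span start times are expressed in nanoseconds, hence the 1e9
        # conversion from the arrival timestamp (seconds).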
arrival_time_ns = int(metrics.arrival_time * 1e9)
trace_context = extract_trace_context(engine_core_output.trace_headers)
prompt_length = length_from_prompt_token_ids_or_embeds(
req_state.prompt_token_ids, req_state.prompt_embeds
)
# Calculate timing metrics
e2e_time = iteration_stats.iteration_timestamp - metrics.arrival_time
queued_time = metrics.scheduled_ts - metrics.queued_ts
prefill_time = metrics.first_token_ts - metrics.scheduled_ts
decode_time = metrics.last_token_ts - metrics.first_token_ts
inference_time = metrics.last_token_ts - metrics.scheduled_ts
# Build attributes dict
attributes: dict[str, Any] = {
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN: (
metrics.first_token_latency
),
SpanAttributes.GEN_AI_LATENCY_E2E: e2e_time,
SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE: queued_time,
SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS: prompt_length,
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS: (
metrics.num_generation_tokens
),
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL: prefill_time,
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE: decode_time,
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE: inference_time,
SpanAttributes.GEN_AI_REQUEST_ID: req_state.external_req_id,
}
# Add optional request parameters
if req_state.top_p:
attributes[SpanAttributes.GEN_AI_REQUEST_TOP_P] = req_state.top_p
if req_state.max_tokens_param:
attributes[SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS] = (
req_state.max_tokens_param
)
if req_state.temperature:
attributes[SpanAttributes.GEN_AI_REQUEST_TEMPERATURE] = (
req_state.temperature
)
if req_state.n:
attributes[SpanAttributes.GEN_AI_REQUEST_N] = req_state.n
instrument_manual(
span_name="llm_request",
start_time=arrival_time_ns,
attributes=attributes,
context=trace_context,
kind=SpanKind.SERVER,
)
def _update_stats_from_output(
self,
req_state: RequestState,
engine_core_output: EngineCoreOutput,
engine_core_timestamp: float | None,
iteration_stats: IterationStats | None,
):
if iteration_stats is None:
return
assert engine_core_timestamp is not None
assert req_state.stats is not None
iteration_stats.update_from_output(
engine_core_output,
engine_core_timestamp,
req_state.is_prefilling,
req_state.prompt_len,
req_state.stats,
self.lora_states,
req_state.lora_name,
)
def _update_stats_from_finished(
self,
req_state: RequestState,
finish_reason: FinishReason | None,
iteration_stats: IterationStats | None,
):
if iteration_stats is None:
return
assert finish_reason is not None
assert req_state.stats is not None
iteration_stats.update_from_finished_request(
finish_reason=finish_reason,
num_prompt_tokens=req_state.prompt_len,
max_tokens_param=req_state.max_tokens_param,
req_stats=req_state.stats,
num_cached_tokens=req_state.num_cached_tokens,
)
self.lora_states.request_finished(req_state.request_id, req_state.lora_name)
ParentRequest.observe_finished_request(
req_state.parent_req, iteration_stats, req_state.stats.num_generation_tokens
)