class OutputProcessor:
"""Process EngineCoreOutputs into RequestOutputs."""
def __init__(
self,
tokenizer: TokenizerLike | None,
log_stats: bool,
stream_interval: int = 1,
):
self.log_stats = log_stats
self.tokenizer = tokenizer
self.stream_interval = stream_interval
self.request_states: dict[str, RequestState] = {}
self.parent_requests: dict[str, ParentRequest] = {}
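        # Maps an external (client-supplied) request ID to the internal request
        # IDs generated for it (one external ID may map to multiple requests).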
self.external_req_ids: defaultdict[str, list[str]] = defaultdict(list)
self.lora_states = LoRARequestStates(log_stats)
self.tracing_enabled: bool = False
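        # Set whenever there are no in-flight request states; cleared when a
        # request is added. Awaited by wait_for_requests_to_drain().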
self._requests_drained = asyncio.Event()
self._requests_drained.set()
    def get_num_unfinished_requests(self) -> int:
return len(self.request_states)
def has_unfinished_requests(self) -> bool:
return len(self.request_states) > 0
async def wait_for_requests_to_drain(self) -> None:
if not self.request_states:
return
await self._requests_drained.wait()
def propagate_error(self, e: Exception):
"""Propagate error to all generate() tasks."""
        for state in self.request_states.values():
assert state.queue is not None
state.queue.put(e)
def abort_requests(self, request_ids: Iterable[str], internal: bool) -> list[str]:
"""Abort a list of requests.
The request_ids may be either external request IDs (those passed to
InputProcessor.process_inputs()) or internal request IDs (those randomly
generated when creating the EngineCoreRequest).
        If an external request ID was used for multiple requests, all requests
        associated with that external ID are aborted.
        In the case of parallel sampling, a request ID may identify a parent
        request, in which case its child requests are also aborted.
"""
internal_req_ids = []
for request_id in request_ids:
if internal:
# Internal ID - this may be a parent request
internal_req_ids.append(request_id)
# Remove internal ID from the external->internal mapping
if req_state := self.request_states.get(request_id):
external_req_id = req_state.external_req_id
internal_ids = self.external_req_ids[external_req_id]
internal_ids.remove(request_id)
if not internal_ids:
del self.external_req_ids[external_req_id]
elif internal_ids := self.external_req_ids.pop(request_id, []):
# External ID - abort all requests in the external->internal mapping
internal_req_ids.extend(internal_ids)
request_ids_to_abort = []
for request_id in internal_req_ids:
req_state = self.request_states.pop(request_id, None)
if req_state is not None:
self.lora_states.request_finished(request_id, req_state.lora_name)
request_ids_to_abort.append(request_id)
# Produce final abort output.
if req_state.queue is not None and (
request_output := req_state.make_request_output(
new_token_ids=[],
                        # Pass a non-None pooling_output for pooling requests
                        # (no detokenizer) so that the pooling abort branch is
                        # taken when building the final output.
pooling_output=EMPTY_CPU_TENSOR
if req_state.detokenizer is None
else None,
finish_reason=FinishReason.ABORT,
stop_reason=None,
kv_transfer_params=None,
)
):
req_state.queue.put(request_output)
elif parent := self.parent_requests.get(request_id):
# Abort children prior to removing the parent.
if parent.child_requests:
child_reqs = list(parent.child_requests)
child_reqs = self.abort_requests(child_reqs, internal=True)
request_ids_to_abort.extend(child_reqs)
self.parent_requests.pop(request_id, None)
if not self.request_states:
self._requests_drained.set()
return request_ids_to_abort
def add_request(
self,
request: EngineCoreRequest,
prompt: str | None,
parent_req: ParentRequest | None = None,
request_index: int = 0,
queue: RequestOutputCollector | None = None,
) -> None:
request_id = request.request_id
req_state = self.request_states.get(request_id)
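        # An existing state means this is a follow-up chunk of a streaming
        # (resumable) request; fold the new input into the existing state
        # instead of creating a new one.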
if req_state is not None:
self._update_streaming_request_state(req_state, request, prompt)
return
req_state = RequestState.from_new_request(
tokenizer=self.tokenizer,
request=request,
prompt=prompt,
parent_req=parent_req,
request_index=request_index,
queue=queue,
log_stats=self.log_stats,
stream_interval=self.stream_interval,
)
if self._requests_drained.is_set():
self._requests_drained.clear()
self.request_states[request_id] = req_state
if parent_req:
self.parent_requests[parent_req.request_id] = parent_req
# Track the external_req_id -> [internal_req_id, ...] mapping
self.external_req_ids[req_state.external_req_id].append(request_id)
def _update_streaming_request_state(
self, req_state: RequestState, request: EngineCoreRequest, prompt: str | None
) -> None:
"""Queue a streaming update instead of immediately applying it."""
if not request.resumable:
# Final request - just mark completion, don't add its dummy tokens.
if req_state.input_chunk_queue is None:
# Engine already finished - emit final output and clean up.
self._finish_request(req_state)
if req_state.queue is not None:
# Emit a final output with finished=True
# to unblock the generate() loop.
req_state.queue.put(STREAM_FINISHED)
elif req_state.input_chunk_queue:
req_state.input_chunk_queue[-1].final = True
else:
req_state.streaming_input = False
return
update = StreamingUpdate(
prompt=prompt,
prompt_token_ids=request.prompt_token_ids,
arrival_time=request.arrival_time,
)
# Apply request updates now if the last input already completed.
if req_state.input_chunk_queue is None:
req_state.apply_streaming_update(update)
req_state.input_chunk_queue = deque()
else:
# Queue the streaming update otherwise.
req_state.input_chunk_queue.append(update)
def process_outputs(
self,
engine_core_outputs: list[EngineCoreOutput],
engine_core_timestamp: float | None = None,
iteration_stats: IterationStats | None = None,
) -> OutputProcessorOutput:
"""
Process the EngineCoreOutputs:
1) Compute stats for logging
2) Detokenize
3) Create and handle RequestOutput objects:
* If there is a queue (for usage with AsyncLLM),
put the RequestOutput objects into the queue for
handling by the per-request generate() tasks.
* If there is no queue (for usage with LLMEngine),
return a list of RequestOutput objects.
NOTE FOR DEVELOPERS
        vLLM V1 minimizes the number of Python loops over the full
batch to ensure system overheads are minimized. This is the
only function that should loop over EngineCoreOutputs.
If you need to touch every element of the batch, do it from
within the loop below.
"""
request_outputs: list[RequestOutput | PoolingRequestOutput] = []
reqs_to_abort: list[str] = []
for engine_core_output in engine_core_outputs:
req_id = engine_core_output.request_id
req_state = self.request_states.get(req_id)
if req_state is None:
# Ignore output for already-aborted request.
continue
# 1) Compute stats for this iteration.
self._update_stats_from_output(
req_state, engine_core_output, engine_core_timestamp, iteration_stats
)
new_token_ids = engine_core_output.new_token_ids
pooling_output = engine_core_output.pooling_output
finish_reason = engine_core_output.finish_reason
stop_reason = engine_core_output.stop_reason
kv_transfer_params = engine_core_output.kv_transfer_params
routed_experts = engine_core_output.routed_experts
req_state.num_cached_tokens = engine_core_output.num_cached_tokens
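            # Receiving any output for this request means the prefill phase is
            # over; record that for per-request stats.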
req_state.is_prefilling = False
if pooling_output is None:
assert req_state.detokenizer is not None
assert req_state.logprobs_processor is not None
# 2) Detokenize the token ids into text and perform stop checks.
stop_string = req_state.detokenizer.update(
new_token_ids, finish_reason == FinishReason.STOP
)
if stop_string:
finish_reason = FinishReason.STOP
stop_reason = stop_string
# 3) Compute sample and prompt logprobs for request,
# if required.
req_state.logprobs_processor.update_from_output(engine_core_output)
# 4) Create and handle RequestOutput objects.
if request_output := req_state.make_request_output(
new_token_ids,
pooling_output,
finish_reason,
stop_reason,
kv_transfer_params,
routed_experts,
):
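                # While the request is still accepting streaming input chunks,
                # never report an intermediate output as finished.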
if req_state.streaming_input:
request_output.finished = False
if req_state.queue is not None:
# AsyncLLM: put into queue for handling by generate().
req_state.queue.put(request_output)
else:
# LLMEngine: return list of RequestOutputs.
request_outputs.append(request_output)
# Free completed requests.
if finish_reason is not None:
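                # For a streaming-input request this may only be the end of the
                # current input chunk; otherwise the request is truly finished
                # and its state can be freed.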
if req_state.streaming_input:
if req_state.input_chunk_queue:
update = req_state.input_chunk_queue.popleft()
req_state.apply_streaming_update(update)
else:
req_state.input_chunk_queue = None
else:
self._finish_request(req_state)
if not engine_core_output.finished:
                        # The request was not finished in EngineCore, but the
                        # detokenizer detected a stop string, so EngineCore
                        # still needs to be told to abort it.
reqs_to_abort.append(req_id)
# Track per-request stats
self._update_stats_from_finished(
req_state, finish_reason, iteration_stats
)
if self.tracing_enabled:
self.do_tracing(engine_core_output, req_state, iteration_stats)
return OutputProcessorOutput(
request_outputs=request_outputs,
reqs_to_abort=reqs_to_abort,
)
def _finish_request(self, req_state: RequestState) -> None:
req_id = req_state.request_id
self.request_states.pop(req_id)
internal_ids = self.external_req_ids[req_state.external_req_id]
internal_ids.remove(req_id)
if not internal_ids:
del self.external_req_ids[req_state.external_req_id]
# Remove parent request if applicable.
parent_req = req_state.parent_req
if parent_req and not parent_req.child_requests:
self.parent_requests.pop(parent_req.request_id, None)
if not self.request_states:
self._requests_drained.set()
def update_scheduler_stats(self, scheduler_stats: SchedulerStats | None):
self.lora_states.update_scheduler_stats(scheduler_stats)
def do_tracing(
self,
engine_core_output: EngineCoreOutput,
req_state: RequestState,
iteration_stats: IterationStats | None,
) -> None:
assert req_state.stats is not None
assert iteration_stats is not None
metrics = req_state.stats
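        # Span start times are expressed in nanoseconds, hence the 1e9
        # conversion from the arrival timestamp (seconds).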
arrival_time_ns = int(metrics.arrival_time * 1e9)
trace_context = extract_trace_context(engine_core_output.trace_headers)
prompt_length = length_from_prompt_token_ids_or_embeds(
req_state.prompt_token_ids, req_state.prompt_embeds
)
# Calculate timing metrics
e2e_time = iteration_stats.iteration_timestamp - metrics.arrival_time
queued_time = metrics.scheduled_ts - metrics.queued_ts
prefill_time = metrics.first_token_ts - metrics.scheduled_ts
decode_time = metrics.last_token_ts - metrics.first_token_ts
inference_time = metrics.last_token_ts - metrics.scheduled_ts
# Build attributes dict
attributes: dict[str, Any] = {
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN: (
metrics.first_token_latency
),
SpanAttributes.GEN_AI_LATENCY_E2E: e2e_time,
SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE: queued_time,
SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS: prompt_length,
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS: (
metrics.num_generation_tokens
),
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_PREFILL: prefill_time,
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_DECODE: decode_time,
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_INFERENCE: inference_time,
SpanAttributes.GEN_AI_REQUEST_ID: req_state.external_req_id,
}
# Add optional request parameters
if req_state.top_p:
attributes[SpanAttributes.GEN_AI_REQUEST_TOP_P] = req_state.top_p
if req_state.max_tokens_param:
attributes[SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS] = (
req_state.max_tokens_param
)
if req_state.temperature:
attributes[SpanAttributes.GEN_AI_REQUEST_TEMPERATURE] = (
req_state.temperature
)
if req_state.n:
attributes[SpanAttributes.GEN_AI_REQUEST_N] = req_state.n
instrument_manual(
span_name="llm_request",
start_time=arrival_time_ns,
attributes=attributes,
context=trace_context,
kind=SpanKind.SERVER,
)
def _update_stats_from_output(
self,
req_state: RequestState,
engine_core_output: EngineCoreOutput,
engine_core_timestamp: float | None,
iteration_stats: IterationStats | None,
):
if iteration_stats is None:
return
assert engine_core_timestamp is not None
assert req_state.stats is not None
iteration_stats.update_from_output(
engine_core_output,
engine_core_timestamp,
req_state.is_prefilling,
req_state.prompt_len,
req_state.stats,
self.lora_states,
req_state.lora_name,
)
def _update_stats_from_finished(
self,
req_state: RequestState,
finish_reason: FinishReason | None,
iteration_stats: IterationStats | None,
):
if iteration_stats is None:
return
assert finish_reason is not None
assert req_state.stats is not None
iteration_stats.update_from_finished_request(
finish_reason=finish_reason,
num_prompt_tokens=req_state.prompt_len,
max_tokens_param=req_state.max_tokens_param,
req_stats=req_state.stats,
num_cached_tokens=req_state.num_cached_tokens,
)
self.lora_states.request_finished(req_state.request_id, req_state.lora_name)
ParentRequest.observe_finished_request(
req_state.parent_req, iteration_stats, req_state.stats.num_generation_tokens
)