Skip to content

Vllm in process

VllmInProcess

Bases: LLM

In-process vLLM backend using vllm.LLM.chat() so the model's chat template is applied automatically.

Supports guided decoding via StructuredOutputsParams(json=...).

In offline mode, vLLM does not automatically split reasoning vs final content for you; we do it here using the configured ReasoningParser (and a Harmony fallback).

Source code in src/kibad_llm/llms/vllm_in_process.py
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
class VllmInProcess(LLM):
    """
    In-process vLLM backend using vllm.LLM.chat() so the model's chat template
    is applied automatically.

    Supports guided decoding via StructuredOutputsParams(json=...).

    In offline mode, vLLM does not automatically split reasoning vs final content
    for you; we do it here using the configured ReasoningParser (and a Harmony fallback).
    """

    def __init__(
        self,
        *,
        model: str,
        vllm_kwargs: dict[str, Any] | None = None,
        lazy: bool = False,
        # for compatibility with other LlamaIndex LLMs (but directly supported kwargs take precedence)
        additional_kwargs: dict[str, Any] | None = None,
        **default_request_kwargs: Any,
    ) -> None:
        """Configure the backend.

        Args:
            model: Model name/path passed straight to ``vllm.LLM``.
            vllm_kwargs: Extra keyword arguments for the ``vllm.LLM`` constructor.
            lazy: If False (default), construct the vLLM engine and reasoning
                parser now so configuration errors surface immediately.
            additional_kwargs: Default per-request kwargs (LlamaIndex-style);
                overridden by ``default_request_kwargs`` on key collision.
            **default_request_kwargs: Default per-request kwargs that take
                precedence over ``additional_kwargs``.
        """
        self._model_name = model
        self._vllm_kwargs = vllm_kwargs or {}

        # Populate all plain attributes BEFORE eager initialization so the
        # instance is in a consistent state even if vLLM setup raises below
        # (this also keeps __del__/destroy safe on a half-built object).
        self._default_request_kwargs: dict[str, Any] = additional_kwargs or {}
        self._default_request_kwargs.update(default_request_kwargs)

        if not lazy:
            # trigger vLLM initialization now (instead of waiting for first call)
            # so that any errors are raised during LLM setup instead of at call time
            _ = self.llm
            _ = self.reasoning_parser

    @property
    def llm(self) -> VllmLLM:
        # Lazily constructed; destroy() deletes the attribute so a later
        # access would transparently re-create the engine.
        if not hasattr(self, "_llm"):
            self._llm = VllmLLM(model=self._model_name, **self._vllm_kwargs)

        return self._llm

    @property
    def reasoning_parser(self) -> ReasoningParser | None:
        """Reasoning parser derived from the engine config, or None if unset."""
        if not hasattr(self, "_reasoning_parser"):
            # Uses vllm_config.structured_outputs_config.reasoning_parser
            # to create a ReasoningParser (if configured).
            structured_output_manager = StructuredOutputManager(
                vllm_config=self.llm.llm_engine.vllm_config
            )
            self._reasoning_parser: ReasoningParser | None = structured_output_manager.reasoner
            if self._reasoning_parser is not None:
                logger.info(
                    f"Using reasoning parser: {type(self._reasoning_parser).__name__} "
                    f"for model {self._model_name} to separate reasoning from final content."
                )
            else:
                logger.info(
                    f"No reasoning parser configured for model {self._model_name}. "
                    f"Assuming no reasoning content in outputs."
                )

        return self._reasoning_parser

    def destroy(self) -> None:
        """Clean up vLLM resources."""
        if hasattr(self, "_llm"):
            del self._llm
        if hasattr(self, "_reasoning_parser"):
            del self._reasoning_parser
        cleanup()

    def __del__(self):
        # __del__ can run during interpreter shutdown, when module globals
        # (e.g. cleanup) may already be torn down, or on a partially
        # initialized instance if __init__ raised. Exceptions must never
        # escape a finalizer, so swallow them here.
        try:
            self.destroy()
        except Exception:
            pass

    def call_llm_chat_with_guided_decoding(
        self,
        messages: list[SimpleChatMessage],
        *,
        json_schema: dict[str, Any] | None = None,
        **request_kwargs: Any,
    ) -> ChatResponse:
        """Run one chat completion, optionally constrained to a JSON schema.

        Args:
            messages: Conversation to send (converted to vLLM chat params).
            json_schema: If given, enables guided decoding via
                ``StructuredOutputsParams(json=...)``.
            **request_kwargs: Per-call overrides; merged over the instance
                defaults. Keys in ``_VLLM_CHAT_KWARGS`` go to ``chat()``,
                the rest become ``SamplingParams``.

        Returns:
            ChatResponse whose message may carry extracted reasoning under
            ``additional_kwargs["reasoning"]``; ``raw`` holds the vLLM outputs.
        """
        convo = [_chat_message_to_vllm_param(m) for m in messages]

        # Per-call kwargs override the instance-level defaults.
        sampling_kwargs = {**self._default_request_kwargs, **request_kwargs}

        # pull out vLLM chat() kwargs; everything else goes into SamplingParams
        chat_kwargs: dict[str, Any] = {"use_tqdm": False}
        for k in list(sampling_kwargs.keys()):
            if k in _VLLM_CHAT_KWARGS:
                chat_kwargs[k] = sampling_kwargs.pop(k)

        if json_schema is not None:
            sampling_kwargs["structured_outputs"] = StructuredOutputsParams(json=json_schema)

        sampling_params = SamplingParams(**sampling_kwargs)
        req_outputs = self.llm.chat(convo, sampling_params=sampling_params, **chat_kwargs)
        # take the first output (we only sent one conversation) and first generation
        out = req_outputs[0].outputs[0]

        if self.reasoning_parser is not None:
            if isinstance(self.reasoning_parser, GptOssReasoningParser):
                # Harmony (gpt-oss): split via token ids
                reasoning, content, _is_tool_call = parse_chat_output(out.token_ids)
            else:
                # create dummy request object for reasoning extraction
                request_obj = ChatCompletionRequest(messages=convo, model=self._model_name, seed=0)
                reasoning, content = self.reasoning_parser.extract_reasoning(
                    model_output=out.text, request=request_obj
                )
        else:
            reasoning = None
            content = out.text

        msg = LlamaIndexChatMessage(role=MessageRole.ASSISTANT, content=content)
        if reasoning is not None:
            msg.additional_kwargs["reasoning"] = reasoning

        return ChatResponse(message=msg, raw=req_outputs)

    def get_reasoning_from_chat_response(self, response: ChatResponse) -> str | None:
        """Extract reasoning from a chat response.

        Returns None when no reasoning parser is configured; raises
        ReasoningExtractionError / EmptyReasoningError on missing or
        blank reasoning.
        """

        # don't attempt extraction if no reasoning parser configured (and thus don't raise errors)
        if self.reasoning_parser is None:
            return None

        reasoning = response.message.additional_kwargs.get("reasoning")
        if not isinstance(reasoning, str):
            raise ReasoningExtractionError("Could not extract reasoning from chat response.")
        if not reasoning.strip():
            raise EmptyReasoningError("Extracted reasoning is empty.")
        return reasoning

destroy()

Clean up vLLM resources.

Source code in src/kibad_llm/llms/vllm_in_process.py
123
124
125
126
127
128
129
def destroy(self) -> None:
    """Clean up vLLM resources."""
    # Drop the lazily-created engine and parser (the owning properties
    # re-create them on next access), then run vLLM's global cleanup.
    for cached in ("_llm", "_reasoning_parser"):
        if hasattr(self, cached):
            delattr(self, cached)
    cleanup()

get_reasoning_from_chat_response(response)

Extract reasoning from a chat response.

Source code in src/kibad_llm/llms/vllm_in_process.py
179
180
181
182
183
184
185
186
187
188
189
190
191
def get_reasoning_from_chat_response(self, response: ChatResponse) -> str | None:
    """Extract reasoning from a chat response."""

    # Without a configured reasoning parser there is nothing to extract,
    # so quietly report "no reasoning" instead of raising.
    if self.reasoning_parser is None:
        return None

    extracted = response.message.additional_kwargs.get("reasoning")
    if not isinstance(extracted, str):
        raise ReasoningExtractionError("Could not extract reasoning from chat response.")
    if extracted.strip():
        return extracted
    raise EmptyReasoningError("Extracted reasoning is empty.")