62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
class VllmInProcess(LLM):
    """
    In-process vLLM backend using vllm.LLM.chat() so the model's chat template
    is applied automatically.

    Supports guided decoding via StructuredOutputsParams(json=...).

    In offline mode, vLLM does not automatically split reasoning vs final content
    for you; we do it here using the configured ReasoningParser (and a Harmony
    fallback).
    """

    def __init__(
        self,
        *,
        model: str,
        vllm_kwargs: dict[str, Any] | None = None,
        lazy: bool = False,
        # for compatibility with other LlamaIndex LLMs (but directly supported kwargs take precedence)
        additional_kwargs: dict[str, Any] | None = None,
        **default_request_kwargs: Any,
    ) -> None:
        """Configure the backend and, unless ``lazy``, start vLLM immediately.

        Args:
            model: Model identifier forwarded to the vLLM engine constructor.
            vllm_kwargs: Extra keyword arguments forwarded to the vLLM engine
                constructor.
            lazy: When true, defer engine creation until the first access of
                ``llm`` / ``reasoning_parser``.
            additional_kwargs: Default per-request kwargs; entries here are
                overridden by same-named ``default_request_kwargs``.
            **default_request_kwargs: Default per-request kwargs merged into
                every call (and overridable per call).
        """
        self._model_name = model
        # Copy caller-owned dicts: the original aliased ``additional_kwargs`` and
        # then ``update``d it, silently mutating the caller's dictionary.
        self._vllm_kwargs: dict[str, Any] = dict(vllm_kwargs or {})
        self._default_request_kwargs: dict[str, Any] = {
            **(additional_kwargs or {}),
            **default_request_kwargs,
        }
        if not lazy:
            # trigger vLLM initialization now (instead of waiting for first call)
            # so that any errors are raised during LLM setup instead of at call time
            _ = self.llm
            _ = self.reasoning_parser

    @property
    def llm(self) -> VllmLLM:
        """Return the vLLM engine, creating and caching it on first access."""
        if not hasattr(self, "_llm"):
            self._llm = VllmLLM(model=self._model_name, **self._vllm_kwargs)
        return self._llm

    @property
    def reasoning_parser(self) -> ReasoningParser | None:
        """Return the cached reasoning parser, or ``None`` if none is configured.

        The first access builds a ``StructuredOutputManager`` from the engine's
        vLLM config (which forces engine creation) and logs which parser, if
        any, was selected.
        """
        if not hasattr(self, "_reasoning_parser"):
            # Uses vllm_config.structured_outputs_config.reasoning_parser
            # to create a ReasoningParser (if configured).
            structured_output_manager = StructuredOutputManager(
                vllm_config=self.llm.llm_engine.vllm_config
            )
            self._reasoning_parser: ReasoningParser | None = structured_output_manager.reasoner
            if self._reasoning_parser is not None:
                logger.info(
                    f"Using reasoning parser: {type(self._reasoning_parser).__name__} "
                    f"for model {self._model_name} to separate reasoning from final content."
                )
            else:
                logger.info(
                    f"No reasoning parser configured for model {self._model_name}. "
                    f"Assuming no reasoning content in outputs."
                )
        return self._reasoning_parser

    def destroy(self) -> None:
        """Clean up vLLM resources.

        Drops the cached engine and reasoning parser (if they were created) and
        then calls the module-level ``cleanup()``. The cached attributes are
        looked up with ``hasattr`` so this also works on a lazily configured or
        partially constructed instance.
        """
        if hasattr(self, "_llm"):
            del self._llm
        if hasattr(self, "_reasoning_parser"):
            del self._reasoning_parser
        cleanup()

    def __del__(self):
        """Release vLLM resources when the instance is garbage collected."""
        self.destroy()

    def call_llm_chat_with_guided_decoding(
        self,
        messages: list[SimpleChatMessage],
        *,
        json_schema: dict[str, Any] | None = None,
        **request_kwargs: Any,
    ) -> ChatResponse:
        """Run one chat completion, optionally constrained to a JSON schema.

        Args:
            messages: Conversation to send; each message is converted with
                ``_chat_message_to_vllm_param``.
            json_schema: When given, passed as
                ``StructuredOutputsParams(json=json_schema)`` under the
                ``structured_outputs`` sampling parameter.
            **request_kwargs: Per-call overrides of the instance's default
                request kwargs. Keys listed in ``_VLLM_CHAT_KWARGS`` go to
                ``chat()``; all others go to ``SamplingParams``.

        Returns:
            A ``ChatResponse`` whose message holds the final content, with the
            reasoning (when one was extracted) stored under
            ``additional_kwargs["reasoning"]`` and the raw vLLM outputs in ``raw``.
        """
        convo = [_chat_message_to_vllm_param(m) for m in messages]
        sampling_kwargs = {**self._default_request_kwargs, **request_kwargs}

        # pull out vLLM chat() kwargs; everything else goes into SamplingParams
        chat_kwargs: dict[str, Any] = {"use_tqdm": False}
        for key in list(sampling_kwargs):  # snapshot: we pop while scanning
            if key in _VLLM_CHAT_KWARGS:
                chat_kwargs[key] = sampling_kwargs.pop(key)

        if json_schema is not None:
            sampling_kwargs["structured_outputs"] = StructuredOutputsParams(json=json_schema)

        sampling_params = SamplingParams(**sampling_kwargs)
        req_outputs = self.llm.chat(convo, sampling_params=sampling_params, **chat_kwargs)

        # take the first output (we only sent one conversation) and first generation
        out = req_outputs[0].outputs[0]
        reasoning, content = self._split_reasoning(out, convo)

        msg = LlamaIndexChatMessage(role=MessageRole.ASSISTANT, content=content)
        if reasoning is not None:
            msg.additional_kwargs["reasoning"] = reasoning
        return ChatResponse(message=msg, raw=req_outputs)

    def _split_reasoning(self, out: Any, convo: list[Any]) -> tuple[Any, Any]:
        """Split one generation into ``(reasoning, content)`` using the configured parser."""
        parser = self.reasoning_parser
        if parser is None:
            # No parser configured: treat the whole text as final content.
            return None, out.text

        if isinstance(parser, GptOssReasoningParser):
            # Harmony (gpt-oss): split via token ids
            reasoning, content, _is_tool_call = parse_chat_output(out.token_ids)
            return reasoning, content

        # create dummy request object for reasoning extraction
        request_obj = ChatCompletionRequest(messages=convo, model=self._model_name, seed=0)
        reasoning, content = parser.extract_reasoning(
            model_output=out.text, request=request_obj
        )
        return reasoning, content

    def get_reasoning_from_chat_response(self, response: ChatResponse) -> str | None:
        """Extract reasoning from a chat response.

        Returns:
            The reasoning string, or ``None`` when no reasoning parser is
            configured.

        Raises:
            ReasoningExtractionError: If a parser is configured but the response
                carries no string under ``additional_kwargs["reasoning"]``.
            EmptyReasoningError: If the stored reasoning is empty or whitespace.
        """
        # don't attempt extraction if no reasoning parser configured (and thus don't raise errors)
        if self.reasoning_parser is None:
            return None
        reasoning = response.message.additional_kwargs.get("reasoning")
        if not isinstance(reasoning, str):
            raise ReasoningExtractionError("Could not extract reasoning from chat response.")
        if not reasoning.strip():
            raise EmptyReasoningError("Extracted reasoning is empty.")
        return reasoning
|