Skip to content

OpenAI-like vLLM

OpenAILikeVllm

Bases: LLM

Simple wrapper around OpenAI-like LLMs to indicate vLLM usage in extractors.extract_from_text

Source code in src/kibad_llm/llms/openai_like_vllm.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
class OpenAILikeVllm(LLM):
    """Thin wrapper over an OpenAI-compatible client, marking the model as vLLM-hosted
    so that extractors.extract_from_text can select vLLM-specific behavior."""

    def __init__(self, *args, **kwargs) -> None:
        # All construction arguments are forwarded verbatim to the wrapped client.
        self.model = OpenAILike(*args, **kwargs)

    def call_llm_chat_with_guided_decoding(
        self,
        messages: list[SimpleChatMessage],
        *,
        json_schema: dict[str, Any] | None = None,
        **request_kwargs,
    ) -> ChatResponse:
        """Send a chat request, optionally constraining output to ``json_schema``
        via vLLM's guided decoding. Raises ValueError on a bad request."""
        if json_schema is not None:
            # vLLM-hosted models take guided-decoding config through extra_body.
            extra_body = request_kwargs.setdefault("extra_body", {})
            if "structured_outputs" in extra_body:
                warn_once(
                    f'Overwriting existing "structured_outputs": '
                    f'{extra_body["structured_outputs"]} '
                    'in request_parameters["extra_body"] with provided json schema for '
                    'guided decoding ("structured_outputs": {"json": schema}).'
                )
            extra_body["structured_outputs"] = {"json": json_schema}

        converted = [
            LlamaIndexChatMessage(role=message.role, content=message.content)
            for message in messages
        ]
        try:
            return self.model.chat(converted, **request_kwargs)
        except BadRequestError as err:
            # Keep the error type consistent with in-process LLM backends.
            raise ValueError(err.message) from err

    def get_reasoning_from_chat_response(self, response: ChatResponse) -> str:
        """Extract reasoning from a chat response."""

        raw = self.get_raw_message_from_chat_response(response)

        # vLLM: prefer the `reasoning` field; fall back to legacy `reasoning_content`.
        reasoning = getattr(raw, "reasoning", None)
        if not reasoning:
            reasoning = getattr(raw, "reasoning_content", None)

        if not isinstance(reasoning, str):
            raise ReasoningExtractionError("Could not extract reasoning from chat response.")
        if not reasoning.strip():
            raise EmptyReasoningError("Extracted reasoning is empty.")

        return reasoning

get_reasoning_from_chat_response(response)

Extract reasoning from a chat response.

Source code in src/kibad_llm/llms/openai_like_vllm.py
52
53
54
55
56
57
58
59
60
61
62
63
64
def get_reasoning_from_chat_response(self, response: ChatResponse) -> str:
    """Extract reasoning from a chat response."""

    raw = self.get_raw_message_from_chat_response(response)

    # vLLM: prefer the `reasoning` field; fall back to legacy `reasoning_content`.
    reasoning = getattr(raw, "reasoning", None)
    if not reasoning:
        reasoning = getattr(raw, "reasoning_content", None)

    if not isinstance(reasoning, str):
        raise ReasoningExtractionError("Could not extract reasoning from chat response.")
    if not reasoning.strip():
        raise EmptyReasoningError("Extracted reasoning is empty.")

    return reasoning