diff --git a/integrations/ollama/src/haystack_integrations/components/generators/ollama/chat/chat_generator.py b/integrations/ollama/src/haystack_integrations/components/generators/ollama/chat/chat_generator.py
index 951c3bfe9..56fe9d6d3 100644
--- a/integrations/ollama/src/haystack_integrations/components/generators/ollama/chat/chat_generator.py
+++ b/integrations/ollama/src/haystack_integrations/components/generators/ollama/chat/chat_generator.py
@@ -109,7 +109,6 @@ def _convert_ollama_response_to_chatmessage(ollama_response: "ChatResponse") ->
     response_dict = ollama_response.model_dump()
 
     ollama_message = response_dict["message"]
-    text = ollama_message["content"]
 
     tool_calls = []
 
@@ -122,6 +121,12 @@ def _convert_ollama_response_to_chatmessage(ollama_response: "ChatResponse") ->
     message = ChatMessage.from_assistant(text=text, tool_calls=tool_calls)
 
     message._meta = _convert_ollama_meta_to_openai_format(response_dict)
+
+    thinking = ollama_message.get("thinking", None)
+
+    if thinking is not None:
+        message._meta["thinking"] = thinking
+
     return message
 
 
@@ -156,6 +161,7 @@ def __init__(
         url: str = "http://localhost:11434",
         generation_kwargs: Optional[Dict[str, Any]] = None,
         timeout: int = 120,
+        think: bool = False,
         keep_alive: Optional[Union[float, str]] = None,
         streaming_callback: Optional[Callable[[StreamingChunk], None]] = None,
         tools: Optional[List[Tool]] = None,
@@ -172,6 +178,8 @@ def __init__(
             [Ollama docs](https://github.com/jmorganca/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values).
         :param timeout:
             The number of seconds before throwing a timeout error from the Ollama API.
+        :param think:
+            If True, the model's "thinking" output is requested and stored in the reply's metadata.
         :param keep_alive:
             The option that controls how long the model will stay loaded into memory following the request.
             If not set, it will use the default value from the Ollama (5 minutes).
@@ -200,6 +208,7 @@ def __init__(
         self.generation_kwargs = generation_kwargs or {}
         self.url = url
         self.model = model
+        self.think = think
         self.keep_alive = keep_alive
         self.streaming_callback = streaming_callback
         self.tools = tools
@@ -329,6 +338,7 @@ def run(
             messages=ollama_messages,
             tools=ollama_tools,
             stream=stream,
+            think=self.think,
             keep_alive=self.keep_alive,
             options=generation_kwargs,
             format=self.response_format,
diff --git a/integrations/ollama/src/haystack_integrations/components/generators/ollama/generator.py b/integrations/ollama/src/haystack_integrations/components/generators/ollama/generator.py
index 64e497489..2b2ae91da 100644
--- a/integrations/ollama/src/haystack_integrations/components/generators/ollama/generator.py
+++ b/integrations/ollama/src/haystack_integrations/components/generators/ollama/generator.py
@@ -36,6 +36,7 @@ def __init__(
         template: Optional[str] = None,
         raw: bool = False,
         timeout: int = 120,
+        think: bool = False,
         keep_alive: Optional[Union[float, str]] = None,
         streaming_callback: Optional[Callable[[StreamingChunk], None]] = None,
     ):
@@ -57,6 +58,8 @@ def __init__(
             if you are specifying a full templated prompt in your API request.
         :param timeout:
             The number of seconds before throwing a timeout error from the Ollama API.
+        :param think:
+            If True, the model's "thinking" output is requested and returned in the response metadata.
         :param streaming_callback:
             A callback function that is called when a new token is received from the stream.
             The callback function accepts StreamingChunk as an argument.
@@ -75,6 +78,7 @@ def __init__(
         self.system_prompt = system_prompt
         self.model = model
         self.url = url
+        self.think = think
         self.keep_alive = keep_alive
         self.generation_kwargs = generation_kwargs or {}
         self.streaming_callback = streaming_callback
@@ -194,6 +198,7 @@ def run(
             model=self.model,
             prompt=prompt,
             stream=stream,
+            think=self.think,
             keep_alive=self.keep_alive,
             options=generation_kwargs,
         )
diff --git a/integrations/ollama/tests/test_chat_generator.py b/integrations/ollama/tests/test_chat_generator.py
index 7e9a0e68d..bf649ea83 100644
--- a/integrations/ollama/tests/test_chat_generator.py
+++ b/integrations/ollama/tests/test_chat_generator.py
@@ -508,6 +508,17 @@ def test_run_with_chat_history(self):
             city.lower() in response["replies"][-1].text.lower() for city in ["Manchester", "Birmingham", "Glasgow"]
         )
 
+    @pytest.mark.integration
+    def test_live_run_with_thinking(self):
+        chat_generator = OllamaChatGenerator(model="qwen3:1.7b", think=True)
+
+        message = ChatMessage.from_user("How many times does the letter 'r' appear in the word 'strawberry'?")
+        response = chat_generator.run([message])
+
+        assert isinstance(response, dict)
+        assert isinstance(response["replies"], list)
+        assert "thinking" in response["replies"][0]._meta
+
     @pytest.mark.integration
     def test_run_model_unavailable(self):
         component = OllamaChatGenerator(model="unknown_model")
diff --git a/integrations/ollama/tests/test_generator.py b/integrations/ollama/tests/test_generator.py
index fb3cdbf06..e9359fe92 100644
--- a/integrations/ollama/tests/test_generator.py
+++ b/integrations/ollama/tests/test_generator.py
@@ -29,6 +29,17 @@ def test_run_capital_cities(self):
         assert "meta" in results
         assert answer in response
 
+    @pytest.mark.integration
+    def test_run_with_thinking(self):
+        prompt = "How many times does the letter 'r' appear in the word 'strawberry'?"
+        component = OllamaGenerator(model="qwen3:1.7b", think=True)
+
+        results = component.run(prompt=prompt)
+
+        assert "replies" in results
+        assert "meta" in results
+        assert "thinking" in results["meta"][0]
+
     @pytest.mark.integration
     def test_run_model_unavailable(self):
         component = OllamaGenerator(model="Alistair_is_great")
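Usage sketch: a minimal example of how the new think flag is exercised, mirroring the integration tests above. It assumes a running local Ollama server with the qwen3:1.7b model pulled; the import paths are inferred from the package layout in the file paths above and are not part of this diff.

# Hypothetical usage sketch for the new think flag (assumes a local Ollama server
# with qwen3:1.7b pulled; import paths inferred from the package layout above).
from haystack.dataclasses import ChatMessage

from haystack_integrations.components.generators.ollama import OllamaChatGenerator, OllamaGenerator

# Chat generator: the thinking trace is stored in the reply message's metadata.
chat_generator = OllamaChatGenerator(model="qwen3:1.7b", think=True)
reply = chat_generator.run([ChatMessage.from_user("Why is the sky blue?")])["replies"][0]
print(reply.text)                   # final answer
print(reply.meta.get("thinking"))   # reasoning trace, if the model returned one

# Plain generator: the thinking trace is stored in the per-reply meta dict.
generator = OllamaGenerator(model="qwen3:1.7b", think=True)
results = generator.run(prompt="Why is the sky blue?")
print(results["replies"][0])
print(results["meta"][0].get("thinking"))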