feat: Add "think" parameter for Ollama #1948

Status: Open. Wants to merge 2 commits into main.

@@ -109,7 +109,6 @@ def _convert_ollama_response_to_chatmessage(ollama_response: "ChatResponse") ->
response_dict = ollama_response.model_dump()

ollama_message = response_dict["message"]

text = ollama_message["content"]

tool_calls = []
@@ -122,6 +121,12 @@ def _convert_ollama_response_to_chatmessage(ollama_response: "ChatResponse") ->
message = ChatMessage.from_assistant(text=text, tool_calls=tool_calls)

message._meta = _convert_ollama_meta_to_openai_format(response_dict)

thinking = ollama_message.get("thinking", None)

if thinking is not None:
message._meta["thinking"] = thinking

return message
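
For context, here is a minimal sketch, using a made-up response payload, of what the hunk above does: when the Ollama message carries a "thinking" field, it is copied into the message meta so callers can inspect the intermediate reasoning next to the final answer.

# Hypothetical sketch, not part of the diff: mirrors the ollama_message.get("thinking", None)
# logic above. The payload is an invented example of what a thinking-capable model
# might return when think=True.
response_dict = {
    "model": "qwen3:1.7b",
    "message": {
        "role": "assistant",
        "content": "The letter 'r' appears 3 times.",
        "thinking": "s-t-r-a-w-b-e-r-r-y: 'r' shows up at positions 3, 8 and 9, so 3 times.",
    },
}

ollama_message = response_dict["message"]
meta = {}

thinking = ollama_message.get("thinking", None)
if thinking is not None:
    meta["thinking"] = thinking

print(meta["thinking"])  # intermediate reasoning, kept alongside the final answer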


@@ -156,6 +161,7 @@ def __init__(
url: str = "http://localhost:11434",
generation_kwargs: Optional[Dict[str, Any]] = None,
timeout: int = 120,
think=False,
Reviewer comment (Member):
I would put this new parameter at the end, to make this change non-breaking; see the sketch after the parameter list below.

keep_alive: Optional[Union[float, str]] = None,
streaming_callback: Optional[Callable[[StreamingChunk], None]] = None,
tools: Optional[List[Tool]] = None,
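
A hedged sketch of the ordering the reviewer suggests: keep the existing parameters in their current order and append the new keyword argument at the end, so callers that pass arguments positionally are not broken. The parameter list below is partial (only arguments visible in this diff are shown), and the simplified Callable/List[Any] hints stand in for the real StreamingChunk and Tool annotations.

from typing import Any, Callable, Dict, List, Optional, Union


class OllamaChatGenerator:
    def __init__(
        self,
        # ...parameters that precede url (e.g. model) are omitted in this sketch...
        url: str = "http://localhost:11434",
        generation_kwargs: Optional[Dict[str, Any]] = None,
        timeout: int = 120,
        keep_alive: Optional[Union[float, str]] = None,
        streaming_callback: Optional[Callable] = None,
        tools: Optional[List[Any]] = None,
        think: bool = False,  # new parameter appended last, so existing positional calls keep working
    ):
        self.think = think
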
@@ -172,6 +178,8 @@ def __init__(
[Ollama docs](https://github.com/jmorganca/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values).
:param timeout:
The number of seconds before throwing a timeout error from the Ollama API.
:param think:
Enables the model's "thinking" process.
Reviewer comment (Member):
I would expand this explanation to something like

Suggested change
Enables the model's "thinking" process.
If True, the model will "think" before producing a response.
Only [thinking models](https://ollama.com/search?c=thinking) support this feature.
The intermediate "thinking" output can be found in the `meta` property of the returned `ChatMessage`.

:param keep_alive:
The option that controls how long the model will stay loaded into memory following the request.
If not set, it will use the default value from Ollama (5 minutes).
@@ -200,6 +208,7 @@ def __init__(
self.generation_kwargs = generation_kwargs or {}
self.url = url
self.model = model
self.think = think
self.keep_alive = keep_alive
self.streaming_callback = streaming_callback
self.tools = tools
@@ -329,6 +338,7 @@ def run(
messages=ollama_messages,
tools=ollama_tools,
stream=stream,
think=self.think,
keep_alive=self.keep_alive,
options=generation_kwargs,
format=self.response_format,
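
To show how the new parameter is meant to be used end to end, here is a hedged usage sketch. It assumes a locally running Ollama server, a thinking-capable model such as qwen3:1.7b (as in the integration tests below), and the integration's usual import path; the intermediate reasoning is read back from the reply's meta, mirroring the meta["thinking"] assignment above.

from haystack.dataclasses import ChatMessage
from haystack_integrations.components.generators.ollama import OllamaChatGenerator

# Assumes an Ollama server is running locally and the model has been pulled,
# e.g. `ollama pull qwen3:1.7b` (see https://ollama.com/search?c=thinking).
generator = OllamaChatGenerator(model="qwen3:1.7b", think=True)

result = generator.run([ChatMessage.from_user("How many times does 'r' appear in 'strawberry'?")])
reply = result["replies"][0]

print(reply.text)                  # the final answer
print(reply.meta.get("thinking"))  # the intermediate "thinking" output, when available
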
@@ -36,6 +36,7 @@ def __init__(
template: Optional[str] = None,
raw: bool = False,
timeout: int = 120,
think: bool = False,
Reviewer comment (Member):
We are trying to introduce new features in the Chat Generators only. In the long run, we may deprecate Generators and keep only Chat Generators.

For this reason, I won't introduce support for thinking in Generators.

keep_alive: Optional[Union[float, str]] = None,
streaming_callback: Optional[Callable[[StreamingChunk], None]] = None,
):
@@ -57,6 +58,8 @@ def __init__(
if you are specifying a full templated prompt in your API request.
:param timeout:
The number of seconds before throwing a timeout error from the Ollama API.
:param think:
Enables the model's "thinking" process.
:param streaming_callback:
A callback function that is called when a new token is received from the stream.
The callback function accepts StreamingChunk as an argument.
@@ -75,6 +78,7 @@ def __init__(
self.system_prompt = system_prompt
self.model = model
self.url = url
self.think = think
self.keep_alive = keep_alive
self.generation_kwargs = generation_kwargs or {}
self.streaming_callback = streaming_callback
@@ -194,6 +198,7 @@ def run(
model=self.model,
prompt=prompt,
stream=stream,
think=self.think,
keep_alive=self.keep_alive,
options=generation_kwargs,
)
11 changes: 11 additions & 0 deletions integrations/ollama/tests/test_chat_generator.py
@@ -508,6 +508,17 @@ def test_run_with_chat_history(self):
city.lower() in response["replies"][-1].text.lower() for city in ["Manchester", "Birmingham", "Glasgow"]
)

@pytest.mark.integration
def test_live_run_with_thinking(self):
chat_generator = OllamaChatGenerator(model="qwen3:1.7b", think=True)
Reviewer comment (Member):
To use this model in an integration test, you should also change the following line:

LLM_FOR_TESTS: "llama3.2:3b"

However, I would recommend using qwen3:0.6b if possible: based on my experiments, it works quite well with our tests and, being very small, it speeds up download and inference times.


message = ChatMessage.from_user("How many times does the letter 'r' appear in the word 'strawberry'?")
response = chat_generator.run([message])

assert isinstance(response, dict)
assert isinstance(response["replies"], list)
assert "thinking" in response["replies"][0]._meta

@pytest.mark.integration
def test_run_model_unavailable(self):
component = OllamaChatGenerator(model="unknown_model")
Expand Down
11 changes: 11 additions & 0 deletions integrations/ollama/tests/test_generator.py
@@ -29,6 +29,17 @@ def test_run_capital_cities(self):
assert "meta" in results
assert answer in response

@pytest.mark.integration
def test_run_with_thinking(self):
prompt = "How many times does the letter 'r' appear in the word 'strawberry'?"
component = OllamaGenerator(model="qwen3:1.7b", think=True)

results = component.run(prompt=prompt)

assert "replies" in results
assert "meta" in results
assert "thinking" in results["meta"][0]

@pytest.mark.integration
def test_run_model_unavailable(self):
component = OllamaGenerator(model="Alistair_is_great")