mirror of https://github.com/amithkoujalgi/ollama4j.git (synced 2025-09-16 03:39:05 +02:00)
Refactor OllamaAPI for improved async handling and response streaming
Updated OllamaAPI to support separate thinking and response stream handlers, enhancing the asynchronous generation of responses. Adjusted related models and observers to accommodate the new streaming logic. Improved the handling of response data in OllamaResult and OllamaGenerateResponseModel, adding new properties for better tracking of response metrics. Refined integration tests to reflect the changed method signatures and to ensure proper logging of streamed responses.
parent 5f5fa8ecae
commit c754bd11da
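
For orientation, here is a minimal sketch of how the reworked synchronous API might be called after this change. It is illustrative only: the model names and OptionsBuilder usage are assumptions, and OllamaStreamHandler is assumed to be a plain String consumer, as the observer changes below suggest.

    OllamaAPI ollamaAPI = new OllamaAPI("http://localhost:11434");
    Options options = new OptionsBuilder().build();

    // Non-thinking overload: a single response handler; think is forced to false internally.
    OllamaResult result = ollamaAPI.generate("llama3", "Why is the sky blue?", false, options,
            responseToken -> System.out.print(responseToken));

    // Thinking overload: separate handlers for thinking tokens and response tokens.
    OllamaResult thinkingResult = ollamaAPI.generate("deepseek-r1:7b", "Why is the sky blue?", false, options,
            thinkingToken -> System.out.print(thinkingToken),
            responseToken -> System.out.print(responseToken));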
OllamaAPI.java
@@ -52,7 +52,7 @@ import java.util.stream.Collectors;
 /**
  * The base Ollama API class.
  */
-@SuppressWarnings({"DuplicatedCode", "resource"})
+@SuppressWarnings({ "DuplicatedCode", "resource" })
 public class OllamaAPI {
 
     private static final Logger logger = LoggerFactory.getLogger(OllamaAPI.class);
@@ -101,7 +101,7 @@ public class OllamaAPI {
      * Default is 0 (no retries).
      */
     @Setter
-    @SuppressWarnings({"FieldMayBeFinal", "FieldCanBeLocal"})
+    @SuppressWarnings({ "FieldMayBeFinal", "FieldCanBeLocal" })
     private int numberOfRetriesForModelPull = 0;
 
     /**
@@ -244,7 +244,7 @@ public class OllamaAPI {
      * tags, tag count, and the time when model was updated.
      *
      * @return A list of {@link LibraryModel} objects representing the models
      * available in the Ollama library.
      * @throws OllamaBaseException If the HTTP request fails or the response is not
      * successful (non-200 status code).
      * @throws IOException If an I/O error occurs during the HTTP request
@@ -312,7 +312,7 @@ public class OllamaAPI {
      * of the library model
      * for which the tags need to be fetched.
      * @return a list of {@link LibraryModelTag} objects containing the extracted
      * tags and their associated metadata.
      * @throws OllamaBaseException if the HTTP response status code indicates an
      * error (i.e., not 200 OK),
      * or if there is any other issue during the
@@ -371,9 +371,12 @@ public class OllamaAPI {
     /**
      * Finds a specific model using model name and tag from Ollama library.
      * <p>
-     * <b>Deprecated:</b> This method relies on the HTML structure of the Ollama website,
-     * which is subject to change at any time. As a result, it is difficult to keep this API
-     * method consistently updated and reliable. Therefore, this method is deprecated and
+     * <b>Deprecated:</b> This method relies on the HTML structure of the Ollama
+     * website,
+     * which is subject to change at any time. As a result, it is difficult to keep
+     * this API
+     * method consistently updated and reliable. Therefore, this method is
+     * deprecated and
      * may be removed in future releases.
      * <p>
      * This method retrieves the model from the Ollama library by its name, then
@@ -386,14 +389,16 @@ public class OllamaAPI {
      * @param modelName The name of the model to search for in the library.
      * @param tag The tag name to search for within the specified model.
      * @return The {@link LibraryModelTag} associated with the specified model and
      * tag.
      * @throws OllamaBaseException If there is a problem with the Ollama library
      * operations.
      * @throws IOException If an I/O error occurs during the operation.
      * @throws URISyntaxException If there is an error with the URI syntax.
      * @throws InterruptedException If the operation is interrupted.
      * @throws NoSuchElementException If the model or the tag is not found.
-     * @deprecated This method relies on the HTML structure of the Ollama website, which can change at any time and break this API. It is deprecated and may be removed in the future.
+     * @deprecated This method relies on the HTML structure of the Ollama website,
+     * which can change at any time and break this API. It is deprecated
+     * and may be removed in the future.
      */
     @Deprecated
     public LibraryModelTag findModelTagFromLibrary(String modelName, String tag)
@@ -453,12 +458,13 @@ public class OllamaAPI {
     /**
      * Handles retry backoff for pullModel.
      */
-    private void handlePullRetry(String modelName, int currentRetry, int maxRetries, long baseDelayMillis) throws InterruptedException {
+    private void handlePullRetry(String modelName, int currentRetry, int maxRetries, long baseDelayMillis)
+            throws InterruptedException {
         int attempt = currentRetry + 1;
         if (attempt < maxRetries) {
             long backoffMillis = baseDelayMillis * (1L << currentRetry);
             logger.error("Failed to pull model {}, retrying in {}s... (attempt {}/{})",
-                    modelName, backoffMillis/1000, attempt, maxRetries);
+                    modelName, backoffMillis / 1000, attempt, maxRetries);
             try {
                 Thread.sleep(backoffMillis);
             } catch (InterruptedException ie) {
@@ -470,7 +476,6 @@ public class OllamaAPI {
         }
     }
 
-
     private void doPullModel(String modelName)
             throws OllamaBaseException, IOException, URISyntaxException, InterruptedException {
         String url = this.host + "/api/pull";
@@ -825,36 +830,74 @@ public class OllamaAPI {
 
     /**
      * Generate response for a question to a model running on Ollama server. This is
-     * a sync/blocking
-     * call.
+     * a sync/blocking call. This API does not support "thinking" models.
      *
      * @param model the ollama model to ask the question to
      * @param prompt the prompt/question text
-     * @param raw if true no formatting will be applied to the prompt. You
-     * may choose to use the raw parameter if you are
-     * specifying a full templated prompt in your request to
-     * the API
-     * @param think if true the model will "think" step-by-step before
-     * generating the final response
+     * @param raw if true no formatting will be applied to the
+     * prompt. You
+     * may choose to use the raw parameter if you are
+     * specifying a full templated prompt in your
+     * request to
+     * the API
      * @param options the Options object - <a
      * href=
      * "https://github.com/jmorganca/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values">More
      * details on the options</a>
-     * @param streamHandler optional callback consumer that will be applied every
-     * time a streamed response is received. If not set, the
-     * stream parameter of the request is set to false.
+     * @param responseStreamHandler optional callback consumer that will be applied
+     * every
+     * time a streamed response is received. If not
+     * set, the
+     * stream parameter of the request is set to false.
      * @return OllamaResult that includes response text and time taken for response
      * @throws OllamaBaseException if the response indicates an error status
      * @throws IOException if an I/O error occurs during the HTTP request
      * @throws InterruptedException if the operation is interrupted
      */
-    public OllamaResult generate(String model, String prompt, boolean raw, boolean think, Options options,
-            OllamaStreamHandler streamHandler) throws OllamaBaseException, IOException, InterruptedException {
+    public OllamaResult generate(String model, String prompt, boolean raw, Options options,
+            OllamaStreamHandler responseStreamHandler) throws OllamaBaseException, IOException, InterruptedException {
         OllamaGenerateRequest ollamaRequestModel = new OllamaGenerateRequest(model, prompt);
         ollamaRequestModel.setRaw(raw);
-        ollamaRequestModel.setThink(think);
+        ollamaRequestModel.setThink(false);
         ollamaRequestModel.setOptions(options.getOptionsMap());
-        return generateSyncForOllamaRequestModel(ollamaRequestModel, streamHandler);
+        return generateSyncForOllamaRequestModel(ollamaRequestModel, null, responseStreamHandler);
+    }
+
+    /**
+     * Generate thinking and response tokens for a question to a thinking model
+     * running on Ollama server. This is
+     * a sync/blocking call.
+     *
+     * @param model the ollama model to ask the question to
+     * @param prompt the prompt/question text
+     * @param raw if true no formatting will be applied to the
+     * prompt. You
+     * may choose to use the raw parameter if you are
+     * specifying a full templated prompt in your
+     * request to
+     * the API
+     * @param options the Options object - <a
+     * href=
+     * "https://github.com/jmorganca/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values">More
+     * details on the options</a>
+     * @param responseStreamHandler optional callback consumer that will be applied
+     * every
+     * time a streamed response is received. If not
+     * set, the
+     * stream parameter of the request is set to false.
+     * @return OllamaResult that includes response text and time taken for response
+     * @throws OllamaBaseException if the response indicates an error status
+     * @throws IOException if an I/O error occurs during the HTTP request
+     * @throws InterruptedException if the operation is interrupted
+     */
+    public OllamaResult generate(String model, String prompt, boolean raw, Options options,
+            OllamaStreamHandler thinkingStreamHandler, OllamaStreamHandler responseStreamHandler)
+            throws OllamaBaseException, IOException, InterruptedException {
+        OllamaGenerateRequest ollamaRequestModel = new OllamaGenerateRequest(model, prompt);
+        ollamaRequestModel.setRaw(raw);
+        ollamaRequestModel.setThink(true);
+        ollamaRequestModel.setOptions(options.getOptionsMap());
+        return generateSyncForOllamaRequestModel(ollamaRequestModel, thinkingStreamHandler, responseStreamHandler);
     }
 
     /**
@@ -862,7 +905,7 @@ public class OllamaAPI {
      * mode).
      * <p>
      * Uses
-     * {@link #generate(String, String, boolean, boolean, Options, OllamaStreamHandler)}
+     * {@link #generate(String, String, boolean, Options, OllamaStreamHandler)}
      *
      * @param model The name or identifier of the AI model to use for generating
      * the response.
@@ -871,10 +914,10 @@ public class OllamaAPI {
      * and provide a full prompt. In this case, you can use the raw
      * parameter to disable templating. Also note that raw mode will
      * not return a context.
-     * @param think If set to true, the model will "think" step-by-step before
-     * generating the final response.
      * @param options Additional options or configurations to use when generating
      * the response.
+     * @param think if true the model will "think" step-by-step before
+     * generating the final response
      * @return {@link OllamaResult}
      * @throws OllamaBaseException if the response indicates an error status
      * @throws IOException if an I/O error occurs during the HTTP request
@@ -882,7 +925,11 @@ public class OllamaAPI {
      */
     public OllamaResult generate(String model, String prompt, boolean raw, boolean think, Options options)
             throws OllamaBaseException, IOException, InterruptedException {
-        return generate(model, prompt, raw, think, options, null);
+        if (think) {
+            return generate(model, prompt, raw, options, null, null);
+        } else {
+            return generate(model, prompt, raw, options, null);
+        }
     }
 
     /**
@@ -896,7 +943,7 @@ public class OllamaAPI {
      * @param format A map containing the format specification for the structured
      * output.
      * @return An instance of {@link OllamaResult} containing the structured
      * response.
      * @throws OllamaBaseException if the response indicates an error status.
      * @throws IOException if an I/O error occurs during the HTTP request.
      * @throws InterruptedException if the operation is interrupted.
@@ -958,18 +1005,16 @@ public class OllamaAPI {
      * @param model The name or identifier of the AI model to use for generating
      * the response.
      * @param prompt The input text or prompt to provide to the AI model.
-     * @param think If set to true, the model will "think" step-by-step before
-     * generating the final response.
      * @param options Additional options or configurations to use when generating
      * the response.
      * @return {@link OllamaToolsResult} An OllamaToolsResult object containing the
      * response from the AI model and the results of invoking the tools on
      * that output.
      * @throws OllamaBaseException if the response indicates an error status
      * @throws IOException if an I/O error occurs during the HTTP request
      * @throws InterruptedException if the operation is interrupted
      */
-    public OllamaToolsResult generateWithTools(String model, String prompt, boolean think, Options options)
+    public OllamaToolsResult generateWithTools(String model, String prompt, Options options)
             throws OllamaBaseException, IOException, InterruptedException, ToolInvocationException {
         boolean raw = true;
         OllamaToolsResult toolResult = new OllamaToolsResult();
@@ -984,7 +1029,7 @@ public class OllamaAPI {
             prompt = promptBuilder.build();
         }
 
-        OllamaResult result = generate(model, prompt, raw, think, options, null);
+        OllamaResult result = generate(model, prompt, raw, options, null);
         toolResult.setModelResult(result);
 
         String toolsResponse = result.getResponse();
@@ -1014,19 +1059,47 @@ public class OllamaAPI {
     }
 
     /**
-     * Generate response for a question to a model running on Ollama server and get
-     * a callback handle
-     * that can be used to check for status and get the response from the model
-     * later. This would be
-     * an async/non-blocking call.
+     * Asynchronously generates a response for a prompt using a model running on the
+     * Ollama server.
+     * <p>
+     * This method returns an {@link OllamaAsyncResultStreamer} handle that can be
+     * used to poll for
+     * status and retrieve streamed "thinking" and response tokens from the model.
+     * The call is non-blocking.
+     * </p>
      *
-     * @param model the ollama model to ask the question to
-     * @param prompt the prompt/question text
-     * @return the ollama async result callback handle
+     * <p>
+     * <b>Example usage:</b>
+     * </p>
+     *
+     * <pre>{@code
+     * OllamaAsyncResultStreamer resultStreamer = ollamaAPI.generateAsync("gpt-oss:20b", "Who are you", false, true);
+     * int pollIntervalMilliseconds = 1000;
+     * while (true) {
+     *     String thinkingTokens = resultStreamer.getThinkingResponseStream().poll();
+     *     String responseTokens = resultStreamer.getResponseStream().poll();
+     *     System.out.print(thinkingTokens != null ? thinkingTokens.toUpperCase() : "");
+     *     System.out.print(responseTokens != null ? responseTokens.toLowerCase() : "");
+     *     Thread.sleep(pollIntervalMilliseconds);
+     *     if (!resultStreamer.isAlive())
+     *         break;
+     * }
+     * System.out.println("Complete thinking response: " + resultStreamer.getCompleteThinkingResponse());
+     * System.out.println("Complete response: " + resultStreamer.getCompleteResponse());
+     * }</pre>
+     *
+     * @param model the Ollama model to use for generating the response
+     * @param prompt the prompt or question text to send to the model
+     * @param raw if {@code true}, returns the raw response from the model
+     * @param think if {@code true}, streams "thinking" tokens as well as response
+     * tokens
+     * @return an {@link OllamaAsyncResultStreamer} handle for polling and
+     * retrieving streamed results
      */
-    public OllamaAsyncResultStreamer generateAsync(String model, String prompt, boolean raw) {
+    public OllamaAsyncResultStreamer generateAsync(String model, String prompt, boolean raw, boolean think) {
         OllamaGenerateRequest ollamaRequestModel = new OllamaGenerateRequest(model, prompt);
         ollamaRequestModel.setRaw(raw);
+        ollamaRequestModel.setThink(think);
         URI uri = URI.create(this.host + "/api/generate");
         OllamaAsyncResultStreamer ollamaAsyncResultStreamer = new OllamaAsyncResultStreamer(
                 getRequestBuilderDefault(uri), ollamaRequestModel, requestTimeoutSeconds);
@@ -1055,14 +1128,14 @@ public class OllamaAPI {
      * @throws InterruptedException if the operation is interrupted
      */
     public OllamaResult generateWithImageFiles(String model, String prompt, List<File> imageFiles, Options options,
             OllamaStreamHandler streamHandler) throws OllamaBaseException, IOException, InterruptedException {
         List<String> images = new ArrayList<>();
         for (File imageFile : imageFiles) {
             images.add(encodeFileToBase64(imageFile));
         }
         OllamaGenerateRequest ollamaRequestModel = new OllamaGenerateRequest(model, prompt, images);
         ollamaRequestModel.setOptions(options.getOptionsMap());
-        return generateSyncForOllamaRequestModel(ollamaRequestModel, streamHandler);
+        return generateSyncForOllamaRequestModel(ollamaRequestModel, null, streamHandler);
     }
 
     /**
@@ -1102,7 +1175,7 @@ public class OllamaAPI {
      * @throws URISyntaxException if the URI for the request is malformed
      */
     public OllamaResult generateWithImageURLs(String model, String prompt, List<String> imageURLs, Options options,
             OllamaStreamHandler streamHandler)
             throws OllamaBaseException, IOException, InterruptedException, URISyntaxException {
         List<String> images = new ArrayList<>();
         for (String imageURL : imageURLs) {
@@ -1110,7 +1183,7 @@ public class OllamaAPI {
         }
         OllamaGenerateRequest ollamaRequestModel = new OllamaGenerateRequest(model, prompt, images);
         ollamaRequestModel.setOptions(options.getOptionsMap());
-        return generateSyncForOllamaRequestModel(ollamaRequestModel, streamHandler);
+        return generateSyncForOllamaRequestModel(ollamaRequestModel, null, streamHandler);
     }
 
     /**
@@ -1144,20 +1217,20 @@ public class OllamaAPI {
      * @param streamHandler optional callback that will be invoked with each
      * streamed response; if null, streaming is disabled
      * @return OllamaResult containing the response text and the time taken for the
      * response
      * @throws OllamaBaseException if the response indicates an error status
      * @throws IOException if an I/O error occurs during the HTTP request
      * @throws InterruptedException if the operation is interrupted
      */
     public OllamaResult generateWithImages(String model, String prompt, List<byte[]> images, Options options,
             OllamaStreamHandler streamHandler) throws OllamaBaseException, IOException, InterruptedException {
         List<String> encodedImages = new ArrayList<>();
         for (byte[] image : images) {
             encodedImages.add(encodeByteArrayToBase64(image));
         }
         OllamaGenerateRequest ollamaRequestModel = new OllamaGenerateRequest(model, prompt, encodedImages);
         ollamaRequestModel.setOptions(options.getOptionsMap());
-        return generateSyncForOllamaRequestModel(ollamaRequestModel, streamHandler);
+        return generateSyncForOllamaRequestModel(ollamaRequestModel, null, streamHandler);
     }
 
     /**
@@ -1184,7 +1257,7 @@ public class OllamaAPI {
      * @param model the ollama model to ask the question to
     * @param messages chat history / message stack to send to the model
      * @return {@link OllamaChatResult} containing the api response and the message
      * history including the newly acquired assistant response.
      * @throws OllamaBaseException any response code than 200 has been returned
      * @throws IOException in case the responseStream can not be read
      * @throws InterruptedException in case the server is not reachable or
@@ -1223,7 +1296,7 @@ public class OllamaAPI {
      */
     public OllamaChatResult chat(OllamaChatRequest request)
             throws OllamaBaseException, IOException, InterruptedException, ToolInvocationException {
-        return chat(request, null);
+        return chat(request, null, null);
     }
 
     /**
@@ -1232,10 +1305,11 @@ public class OllamaAPI {
      * <p>
      * Hint: the OllamaChatRequestModel#getStream() property is not implemented.
      *
      * @param request request object to be sent to the server
-     * @param streamHandler callback handler to handle the last message from stream
-     * (caution: all previous tokens from stream will be
-     * concatenated)
+     * @param responseStreamHandler callback handler to handle the last message from
+     * stream
+     * @param thinkingStreamHandler callback handler to handle the last thinking
+     * message from stream
      * @return {@link OllamaChatResult}
      * @throws OllamaBaseException any response code than 200 has been returned
      * @throws IOException in case the responseStream can not be read
@@ -1248,9 +1322,10 @@ public class OllamaAPI {
      * @throws InterruptedException if the operation is interrupted
      * @throws ToolInvocationException if the tool invocation fails
      */
-    public OllamaChatResult chat(OllamaChatRequest request, OllamaStreamHandler streamHandler)
+    public OllamaChatResult chat(OllamaChatRequest request, OllamaStreamHandler thinkingStreamHandler,
+            OllamaStreamHandler responseStreamHandler)
             throws OllamaBaseException, IOException, InterruptedException, ToolInvocationException {
-        return chatStreaming(request, new OllamaChatStreamObserver(streamHandler));
+        return chatStreaming(request, new OllamaChatStreamObserver(thinkingStreamHandler, responseStreamHandler));
     }
 
     /**
@@ -1393,7 +1468,7 @@ public class OllamaAPI {
                 registerAnnotatedTools(provider.getDeclaredConstructor().newInstance());
             }
         } catch (InstantiationException | NoSuchMethodException | IllegalAccessException
                 | InvocationTargetException e) {
             throw new RuntimeException(e);
         }
     }
@@ -1518,10 +1593,12 @@ public class OllamaAPI {
      * the request will be streamed; otherwise, a regular synchronous request will
      * be made.
      *
-     * @param ollamaRequestModel the request model containing necessary parameters
-     * for the Ollama API request.
-     * @param streamHandler the stream handler to process streaming responses,
-     * or null for non-streaming requests.
+     * @param ollamaRequestModel the request model containing necessary
+     * parameters
+     * for the Ollama API request.
+     * @param responseStreamHandler the stream handler to process streaming
+     * responses,
+     * or null for non-streaming requests.
      * @return the result of the Ollama API request.
      * @throws OllamaBaseException if the request fails due to an issue with the
      * Ollama API.
@@ -1530,13 +1607,14 @@ public class OllamaAPI {
      * @throws InterruptedException if the thread is interrupted during the request.
      */
     private OllamaResult generateSyncForOllamaRequestModel(OllamaGenerateRequest ollamaRequestModel,
-            OllamaStreamHandler streamHandler) throws OllamaBaseException, IOException, InterruptedException {
+            OllamaStreamHandler thinkingStreamHandler, OllamaStreamHandler responseStreamHandler)
+            throws OllamaBaseException, IOException, InterruptedException {
         OllamaGenerateEndpointCaller requestCaller = new OllamaGenerateEndpointCaller(host, auth, requestTimeoutSeconds,
                 verbose);
         OllamaResult result;
-        if (streamHandler != null) {
+        if (responseStreamHandler != null) {
             ollamaRequestModel.setStream(true);
-            result = requestCaller.call(ollamaRequestModel, streamHandler);
+            result = requestCaller.call(ollamaRequestModel, thinkingStreamHandler, responseStreamHandler);
         } else {
             result = requestCaller.callSync(ollamaRequestModel);
         }
OllamaChatStreamObserver.java
@@ -6,27 +6,46 @@ import lombok.RequiredArgsConstructor;
 
 @RequiredArgsConstructor
 public class OllamaChatStreamObserver implements OllamaTokenHandler {
-    private final OllamaStreamHandler streamHandler;
+    private final OllamaStreamHandler thinkingStreamHandler;
+    private final OllamaStreamHandler responseStreamHandler;
+
     private String message = "";
 
     @Override
     public void accept(OllamaChatResponseModel token) {
-        if (streamHandler == null || token == null || token.getMessage() == null) {
+        if (responseStreamHandler == null || token == null || token.getMessage() == null) {
             return;
         }
 
-        String content = token.getMessage().getContent();
         String thinking = token.getMessage().getThinking();
+        String content = token.getMessage().getContent();
 
-        boolean hasContent = !content.isEmpty();
         boolean hasThinking = thinking != null && !thinking.isEmpty();
+        boolean hasContent = !content.isEmpty();
 
-        if (hasThinking && !hasContent) {
-            message += thinking;
-        } else {
-            message += content;
+        // if (hasThinking && !hasContent) {
+        //// message += thinking;
+        // message = thinking;
+        // } else {
+        //// message += content;
+        // message = content;
+        // }
+        //
+        // responseStreamHandler.accept(message);
+
+
+        if (!hasContent && hasThinking && thinkingStreamHandler != null) {
+            // message = message + thinking;
+
+            // use only new tokens received, instead of appending the tokens to the previous
+            // ones and sending the full string again
+            thinkingStreamHandler.accept(thinking);
+        } else if (hasContent && responseStreamHandler != null) {
+            // message = message + response;
+
+            // use only new tokens received, instead of appending the tokens to the previous
+            // ones and sending the full string again
+            responseStreamHandler.accept(content);
         }
-
-        streamHandler.accept(message);
     }
 }
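
Note the behavioral shift in the observer above: each handler now receives only the newly streamed token rather than the accumulated message. A sketch of what that implies for callers of the new chat overload (the request builder calls are assumptions based on the ollama4j API; callers accumulate text themselves):

    OllamaChatRequest request = OllamaChatRequestBuilder.getInstance("deepseek-r1:7b")
            .withMessage(OllamaChatMessageRole.USER, "What is 2 + 2?")
            .build();

    StringBuilder thinking = new StringBuilder();
    StringBuilder answer = new StringBuilder();

    // Each handler sees only the new token, so accumulate if the full text is needed.
    OllamaChatResult chatResult = ollamaAPI.chat(request,
            token -> thinking.append(token),
            token -> answer.append(token));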
OllamaGenerateResponseModel.java
@@ -4,6 +4,7 @@ import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
 import com.fasterxml.jackson.annotation.JsonProperty;
 
 import java.util.List;
+
 import lombok.Data;
 
 @Data
@@ -14,11 +15,12 @@ public class OllamaGenerateResponseModel {
     private String response;
     private String thinking;
     private boolean done;
+    private @JsonProperty("done_reason") String doneReason;
     private List<Integer> context;
     private @JsonProperty("total_duration") Long totalDuration;
     private @JsonProperty("load_duration") Long loadDuration;
-    private @JsonProperty("prompt_eval_duration") Long promptEvalDuration;
-    private @JsonProperty("eval_duration") Long evalDuration;
     private @JsonProperty("prompt_eval_count") Integer promptEvalCount;
+    private @JsonProperty("prompt_eval_duration") Long promptEvalDuration;
     private @JsonProperty("eval_count") Integer evalCount;
+    private @JsonProperty("eval_duration") Long evalDuration;
 }
OllamaGenerateStreamObserver.java
@@ -5,14 +5,16 @@ import java.util.List;
 
 public class OllamaGenerateStreamObserver {
 
-    private OllamaStreamHandler streamHandler;
+    private final OllamaStreamHandler thinkingStreamHandler;
+    private final OllamaStreamHandler responseStreamHandler;
 
-    private List<OllamaGenerateResponseModel> responseParts = new ArrayList<>();
+    private final List<OllamaGenerateResponseModel> responseParts = new ArrayList<>();
 
     private String message = "";
 
-    public OllamaGenerateStreamObserver(OllamaStreamHandler streamHandler) {
-        this.streamHandler = streamHandler;
+    public OllamaGenerateStreamObserver(OllamaStreamHandler thinkingStreamHandler, OllamaStreamHandler responseStreamHandler) {
+        this.responseStreamHandler = responseStreamHandler;
+        this.thinkingStreamHandler = thinkingStreamHandler;
     }
 
     public void notify(OllamaGenerateResponseModel currentResponsePart) {
@@ -27,11 +29,18 @@ public class OllamaGenerateStreamObserver {
         boolean hasResponse = response != null && !response.isEmpty();
         boolean hasThinking = thinking != null && !thinking.isEmpty();
 
-        if (!hasResponse && hasThinking) {
-            message = message + thinking;
-        } else if (hasResponse) {
-            message = message + response;
+        if (!hasResponse && hasThinking && thinkingStreamHandler != null) {
+            // message = message + thinking;
+
+            // use only new tokens received, instead of appending the tokens to the previous
+            // ones and sending the full string again
+            thinkingStreamHandler.accept(thinking);
+        } else if (hasResponse && responseStreamHandler != null) {
+            // message = message + response;
+
+            // use only new tokens received, instead of appending the tokens to the previous
+            // ones and sending the full string again
+            responseStreamHandler.accept(response);
         }
-        streamHandler.accept(message);
     }
 }
OllamaGenerateEndpointCaller.java
@@ -27,7 +27,7 @@ public class OllamaGenerateEndpointCaller extends OllamaEndpointCaller {
 
     private static final Logger LOG = LoggerFactory.getLogger(OllamaGenerateEndpointCaller.class);
 
-    private OllamaGenerateStreamObserver streamObserver;
+    private OllamaGenerateStreamObserver responseStreamObserver;
 
     public OllamaGenerateEndpointCaller(String host, Auth basicAuth, long requestTimeoutSeconds, boolean verbose) {
         super(host, basicAuth, requestTimeoutSeconds, verbose);
@@ -48,8 +48,8 @@ public class OllamaGenerateEndpointCaller extends OllamaEndpointCaller {
             if (ollamaResponseModel.getThinking() != null) {
                 thinkingBuffer.append(ollamaResponseModel.getThinking());
             }
-            if (streamObserver != null) {
-                streamObserver.notify(ollamaResponseModel);
+            if (responseStreamObserver != null) {
+                responseStreamObserver.notify(ollamaResponseModel);
             }
             return ollamaResponseModel.isDone();
         } catch (JsonProcessingException e) {
@@ -58,9 +58,8 @@ public class OllamaGenerateEndpointCaller extends OllamaEndpointCaller {
         }
     }
 
-    public OllamaResult call(OllamaRequestBody body, OllamaStreamHandler streamHandler)
-            throws OllamaBaseException, IOException, InterruptedException {
-        streamObserver = new OllamaGenerateStreamObserver(streamHandler);
+    public OllamaResult call(OllamaRequestBody body, OllamaStreamHandler thinkingStreamHandler, OllamaStreamHandler responseStreamHandler) throws OllamaBaseException, IOException, InterruptedException {
+        responseStreamObserver = new OllamaGenerateStreamObserver(thinkingStreamHandler, responseStreamHandler);
         return callSync(body);
     }
 
@@ -73,47 +72,41 @@ public class OllamaGenerateEndpointCaller extends OllamaEndpointCaller {
      * @throws IOException in case the responseStream can not be read
      * @throws InterruptedException in case the server is not reachable or network issues happen
      */
+    @SuppressWarnings("DuplicatedCode")
     public OllamaResult callSync(OllamaRequestBody body) throws OllamaBaseException, IOException, InterruptedException {
         // Create Request
         long startTime = System.currentTimeMillis();
         HttpClient httpClient = HttpClient.newHttpClient();
         URI uri = URI.create(getHost() + getEndpointSuffix());
-        HttpRequest.Builder requestBuilder =
-                getRequestBuilderDefault(uri)
-                        .POST(
-                                body.getBodyPublisher());
+        HttpRequest.Builder requestBuilder = getRequestBuilderDefault(uri).POST(body.getBodyPublisher());
         HttpRequest request = requestBuilder.build();
-        if (isVerbose()) LOG.info("Asking model: " + body.toString());
-        HttpResponse<InputStream> response =
-                httpClient.send(request, HttpResponse.BodyHandlers.ofInputStream());
+        if (isVerbose()) LOG.info("Asking model: {}", body);
+        HttpResponse<InputStream> response = httpClient.send(request, HttpResponse.BodyHandlers.ofInputStream());
 
         int statusCode = response.statusCode();
         InputStream responseBodyStream = response.body();
         StringBuilder responseBuffer = new StringBuilder();
         StringBuilder thinkingBuffer = new StringBuilder();
-        try (BufferedReader reader =
-                new BufferedReader(new InputStreamReader(responseBodyStream, StandardCharsets.UTF_8))) {
+        OllamaGenerateResponseModel ollamaGenerateResponseModel = null;
+        try (BufferedReader reader = new BufferedReader(new InputStreamReader(responseBodyStream, StandardCharsets.UTF_8))) {
             String line;
             while ((line = reader.readLine()) != null) {
                 if (statusCode == 404) {
                     LOG.warn("Status code: 404 (Not Found)");
-                    OllamaErrorResponse ollamaResponseModel =
-                            Utils.getObjectMapper().readValue(line, OllamaErrorResponse.class);
+                    OllamaErrorResponse ollamaResponseModel = Utils.getObjectMapper().readValue(line, OllamaErrorResponse.class);
                     responseBuffer.append(ollamaResponseModel.getError());
                 } else if (statusCode == 401) {
                     LOG.warn("Status code: 401 (Unauthorized)");
-                    OllamaErrorResponse ollamaResponseModel =
-                            Utils.getObjectMapper()
-                                    .readValue("{\"error\":\"Unauthorized\"}", OllamaErrorResponse.class);
+                    OllamaErrorResponse ollamaResponseModel = Utils.getObjectMapper().readValue("{\"error\":\"Unauthorized\"}", OllamaErrorResponse.class);
                     responseBuffer.append(ollamaResponseModel.getError());
                 } else if (statusCode == 400) {
                     LOG.warn("Status code: 400 (Bad Request)");
-                    OllamaErrorResponse ollamaResponseModel = Utils.getObjectMapper().readValue(line,
-                            OllamaErrorResponse.class);
+                    OllamaErrorResponse ollamaResponseModel = Utils.getObjectMapper().readValue(line, OllamaErrorResponse.class);
                     responseBuffer.append(ollamaResponseModel.getError());
                 } else {
                     boolean finished = parseResponseAndAddToBuffer(line, responseBuffer, thinkingBuffer);
                     if (finished) {
+                        ollamaGenerateResponseModel = Utils.getObjectMapper().readValue(line, OllamaGenerateResponseModel.class);
                         break;
                     }
                 }
@@ -121,13 +114,25 @@ public class OllamaGenerateEndpointCaller extends OllamaEndpointCaller {
             }
         }
 
         if (statusCode != 200) {
-            LOG.error("Status code " + statusCode);
+            LOG.error("Status code: {}", statusCode);
            throw new OllamaBaseException(responseBuffer.toString());
         } else {
             long endTime = System.currentTimeMillis();
-            OllamaResult ollamaResult =
-                    new OllamaResult(responseBuffer.toString(), thinkingBuffer.toString(), endTime - startTime, statusCode);
-            if (isVerbose()) LOG.info("Model response: " + ollamaResult);
+            OllamaResult ollamaResult = new OllamaResult(responseBuffer.toString(), thinkingBuffer.toString(), endTime - startTime, statusCode);
+            ollamaResult.setModel(ollamaGenerateResponseModel.getModel());
+            ollamaResult.setCreatedAt(ollamaGenerateResponseModel.getCreatedAt());
+            ollamaResult.setDone(ollamaGenerateResponseModel.isDone());
+            ollamaResult.setDoneReason(ollamaGenerateResponseModel.getDoneReason());
+            ollamaResult.setContext(ollamaGenerateResponseModel.getContext());
+            ollamaResult.setTotalDuration(ollamaGenerateResponseModel.getTotalDuration());
+            ollamaResult.setLoadDuration(ollamaGenerateResponseModel.getLoadDuration());
+            ollamaResult.setPromptEvalCount(ollamaGenerateResponseModel.getPromptEvalCount());
+            ollamaResult.setPromptEvalDuration(ollamaGenerateResponseModel.getPromptEvalDuration());
+            ollamaResult.setEvalCount(ollamaGenerateResponseModel.getEvalCount());
+            ollamaResult.setEvalDuration(ollamaGenerateResponseModel.getEvalDuration());
+
+            if (isVerbose()) LOG.info("Model response: {}", ollamaResult);
             return ollamaResult;
         }
     }
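
With the endpoint caller now copying the final chunk's metadata onto OllamaResult, callers can read generation metrics directly. A hedged sketch, assuming Lombok generates getters for the new fields and that the duration values are nanoseconds as reported by Ollama:

    OllamaResult result = ollamaAPI.generate("llama3", "Why is the sky blue?", false,
            new OptionsBuilder().build(), null);

    // New metric fields populated from the final streamed response chunk.
    System.out.println("Done reason:         " + result.getDoneReason());
    System.out.println("Prompt eval count:   " + result.getPromptEvalCount());
    System.out.println("Eval count:          " + result.getEvalCount());
    System.out.println("Total duration (ns): " + result.getTotalDuration());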
@ -26,8 +26,10 @@ import java.time.Duration;
|
|||||||
public class OllamaAsyncResultStreamer extends Thread {
|
public class OllamaAsyncResultStreamer extends Thread {
|
||||||
private final HttpRequest.Builder requestBuilder;
|
private final HttpRequest.Builder requestBuilder;
|
||||||
private final OllamaGenerateRequest ollamaRequestModel;
|
private final OllamaGenerateRequest ollamaRequestModel;
|
||||||
private final OllamaResultStream stream = new OllamaResultStream();
|
private final OllamaResultStream thinkingResponseStream = new OllamaResultStream();
|
||||||
|
private final OllamaResultStream responseStream = new OllamaResultStream();
|
||||||
private String completeResponse;
|
private String completeResponse;
|
||||||
|
private String completeThinkingResponse;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -54,14 +56,11 @@ public class OllamaAsyncResultStreamer extends Thread {
|
|||||||
@Getter
|
@Getter
|
||||||
private long responseTime = 0;
|
private long responseTime = 0;
|
||||||
|
|
||||||
public OllamaAsyncResultStreamer(
|
public OllamaAsyncResultStreamer(HttpRequest.Builder requestBuilder, OllamaGenerateRequest ollamaRequestModel, long requestTimeoutSeconds) {
|
||||||
HttpRequest.Builder requestBuilder,
|
|
||||||
OllamaGenerateRequest ollamaRequestModel,
|
|
||||||
long requestTimeoutSeconds) {
|
|
||||||
this.requestBuilder = requestBuilder;
|
this.requestBuilder = requestBuilder;
|
||||||
this.ollamaRequestModel = ollamaRequestModel;
|
this.ollamaRequestModel = ollamaRequestModel;
|
||||||
this.completeResponse = "";
|
this.completeResponse = "";
|
||||||
this.stream.add("");
|
this.responseStream.add("");
|
||||||
this.requestTimeoutSeconds = requestTimeoutSeconds;
|
this.requestTimeoutSeconds = requestTimeoutSeconds;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -71,16 +70,8 @@ public class OllamaAsyncResultStreamer extends Thread {
|
|||||||
HttpClient httpClient = HttpClient.newHttpClient();
|
HttpClient httpClient = HttpClient.newHttpClient();
|
||||||
long startTime = System.currentTimeMillis();
|
long startTime = System.currentTimeMillis();
|
||||||
try {
|
try {
|
||||||
HttpRequest request =
|
HttpRequest request = requestBuilder.POST(HttpRequest.BodyPublishers.ofString(Utils.getObjectMapper().writeValueAsString(ollamaRequestModel))).header(Constants.HttpConstants.HEADER_KEY_CONTENT_TYPE, Constants.HttpConstants.APPLICATION_JSON).timeout(Duration.ofSeconds(requestTimeoutSeconds)).build();
|
||||||
requestBuilder
|
HttpResponse<InputStream> response = httpClient.send(request, HttpResponse.BodyHandlers.ofInputStream());
|
||||||
.POST(
|
|
||||||
HttpRequest.BodyPublishers.ofString(
|
|
||||||
Utils.getObjectMapper().writeValueAsString(ollamaRequestModel)))
|
|
||||||
.header(Constants.HttpConstants.HEADER_KEY_CONTENT_TYPE, Constants.HttpConstants.APPLICATION_JSON)
|
|
||||||
.timeout(Duration.ofSeconds(requestTimeoutSeconds))
|
|
||||||
.build();
|
|
||||||
HttpResponse<InputStream> response =
|
|
||||||
httpClient.send(request, HttpResponse.BodyHandlers.ofInputStream());
|
|
||||||
int statusCode = response.statusCode();
|
int statusCode = response.statusCode();
|
||||||
this.httpStatusCode = statusCode;
|
this.httpStatusCode = statusCode;
|
||||||
|
|
||||||
@@ -89,25 +80,33 @@ public class OllamaAsyncResultStreamer extends Thread {
         try {
             reader = new BufferedReader(new InputStreamReader(responseBodyStream, StandardCharsets.UTF_8));
             String line;
+            StringBuilder thinkingBuffer = new StringBuilder();
             StringBuilder responseBuffer = new StringBuilder();
             while ((line = reader.readLine()) != null) {
                 if (statusCode == 404) {
-                    OllamaErrorResponse ollamaResponseModel =
-                            Utils.getObjectMapper().readValue(line, OllamaErrorResponse.class);
-                    stream.add(ollamaResponseModel.getError());
+                    OllamaErrorResponse ollamaResponseModel = Utils.getObjectMapper().readValue(line, OllamaErrorResponse.class);
+                    responseStream.add(ollamaResponseModel.getError());
                     responseBuffer.append(ollamaResponseModel.getError());
                 } else {
-                    OllamaGenerateResponseModel ollamaResponseModel =
-                            Utils.getObjectMapper().readValue(line, OllamaGenerateResponseModel.class);
-                    String res = ollamaResponseModel.getResponse();
-                    stream.add(res);
+                    OllamaGenerateResponseModel ollamaResponseModel = Utils.getObjectMapper().readValue(line, OllamaGenerateResponseModel.class);
+                    String thinkingTokens = ollamaResponseModel.getThinking();
+                    String responseTokens = ollamaResponseModel.getResponse();
+                    if (thinkingTokens == null) {
+                        thinkingTokens = "";
+                    }
+                    if (responseTokens == null) {
+                        responseTokens = "";
+                    }
+                    thinkingResponseStream.add(thinkingTokens);
+                    responseStream.add(responseTokens);
                     if (!ollamaResponseModel.isDone()) {
-                        responseBuffer.append(res);
+                        responseBuffer.append(responseTokens);
+                        thinkingBuffer.append(thinkingTokens);
                     }
                 }
             }
 
             this.succeeded = true;
+            this.completeThinkingResponse = thinkingBuffer.toString();
             this.completeResponse = responseBuffer.toString();
             long endTime = System.currentTimeMillis();
             responseTime = endTime - startTime;
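Note: the run() loop above now pushes thinking tokens and response tokens onto separate streams and accumulates them in separate buffers. A minimal consumer sketch, assuming Lombok-generated getters for the streamer fields shown in this diff (getThinkingResponseStream(), getResponseStream() are inferred names) and that the stream type supports Queue-style poll(), as a LinkedList-backed stream would:

    // Sketch: drain both token queues while the streamer thread is alive.
    OllamaAsyncResultStreamer streamer = ollamaAPI.generateAsync(model, prompt, false, true);
    while (streamer.isAlive()) {
        String thinkingToken = streamer.getThinkingResponseStream().poll(); // may be null between tokens
        if (thinkingToken != null) {
            System.out.print(thinkingToken);
        }
        String responseToken = streamer.getResponseStream().poll();
        if (responseToken != null) {
            System.out.print(responseToken);
        }
    }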
@@ -1,15 +1,18 @@
 package io.github.ollama4j.models.response;
 
 import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonProperty;
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.core.type.TypeReference;
 
+import io.github.ollama4j.models.generate.OllamaGenerateResponseModel;
 import lombok.Data;
 import lombok.Getter;
 
 import static io.github.ollama4j.utils.Utils.getObjectMapper;
 
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 
 /**
@@ -21,30 +24,34 @@ import java.util.Map;
 @JsonIgnoreProperties(ignoreUnknown = true)
 public class OllamaResult {
     /**
-     * -- GETTER --
      * Get the completion/response text
-     *
-     * @return String completion/response text
      */
     private final String response;
-    private final String thinking;
 
     /**
-     * -- GETTER --
+     * Get the thinking text (if available)
+     */
+    private final String thinking;
+
+    /**
      * Get the response status code.
-     *
-     * @return int - response status code
      */
     private int httpStatusCode;
 
     /**
-     * -- GETTER --
      * Get the response time in milliseconds.
-     *
-     * @return long - response time in milliseconds
      */
     private long responseTime = 0;
 
+    private String model;
+    private String createdAt;
+    private boolean done;
+    private String doneReason;
+    private List<Integer> context;
+    private Long totalDuration;
+    private Long loadDuration;
+    private Integer promptEvalCount;
+    private Long promptEvalDuration;
+    private Integer evalCount;
+    private Long evalDuration;
+
     public OllamaResult(String response, String thinking, long responseTime, int httpStatusCode) {
         this.response = response;
         this.thinking = thinking;
@@ -60,6 +67,17 @@ public class OllamaResult {
             responseMap.put("thinking", this.thinking);
             responseMap.put("httpStatusCode", this.httpStatusCode);
             responseMap.put("responseTime", this.responseTime);
+            responseMap.put("model", this.model);
+            responseMap.put("createdAt", this.createdAt);
+            responseMap.put("done", this.done);
+            responseMap.put("doneReason", this.doneReason);
+            responseMap.put("context", this.context);
+            responseMap.put("totalDuration", this.totalDuration);
+            responseMap.put("loadDuration", this.loadDuration);
+            responseMap.put("promptEvalCount", this.promptEvalCount);
+            responseMap.put("promptEvalDuration", this.promptEvalDuration);
+            responseMap.put("evalCount", this.evalCount);
+            responseMap.put("evalDuration", this.evalDuration);
             return getObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(responseMap);
         } catch (JsonProcessingException e) {
             throw new RuntimeException(e);
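Note: the new entries mirror the metric fields Ollama reports on a final generate response. A rough sketch of deriving a tokens-per-second figure from them, assuming Lombok getters for the new properties and nanosecond durations as in the Ollama API:

    // Sketch: compute generation throughput from the metrics now carried on OllamaResult.
    Integer evalCount = result.getEvalCount();
    Long evalDuration = result.getEvalDuration(); // nanoseconds, per the Ollama API
    if (evalCount != null && evalDuration != null && evalDuration > 0) {
        double tokensPerSecond = evalCount / (evalDuration / 1_000_000_000.0);
        System.out.printf("eval rate: %.2f tokens/s%n", tokensPerSecond);
    }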
@@ -202,14 +202,12 @@ class OllamaAPIIntegrationTest {
             throws OllamaBaseException, IOException, URISyntaxException, InterruptedException {
         api.pullModel(GENERAL_PURPOSE_MODEL);
         boolean raw = false;
-        boolean thinking = false;
         StringBuffer sb = new StringBuffer();
         OllamaResult result = api.generate(GENERAL_PURPOSE_MODEL,
                 "What is the capital of France? And what's France's connection with Mona Lisa?", raw,
-                thinking, new OptionsBuilder().build(), (s) -> {
-                    String substring = s.substring(sb.toString().length());
-                    LOG.info(substring);
-                    sb.append(substring);
+                new OptionsBuilder().build(), (s) -> {
+                    LOG.info(s);
+                    sb.append(s);
                 });
 
         assertNotNull(result);
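Note: the handler body no longer diffs the callback value against an accumulator, which suggests the stream handler now receives each new token chunk rather than the full response so far. A usage sketch under that reading (the model name is illustrative):

    // Sketch: accumulate streamed token chunks into the final answer.
    StringBuffer sb = new StringBuffer();
    OllamaResult result = api.generate("gemma2", "Why is the sky blue?", false,
            new OptionsBuilder().build(), (token) -> {
                LOG.info(token);
                sb.append(token);
            });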
@@ -355,12 +353,10 @@ class OllamaAPIIntegrationTest {
                 .withKeepAlive("0m").withOptions(new OptionsBuilder().setTemperature(0.9f).build())
                 .build();
 
-        StringBuffer sb = new StringBuffer();
-
         OllamaChatResult chatResult = api.chat(requestModel, (s) -> {
-            String substring = s.substring(sb.toString().length());
-            sb.append(substring);
-            LOG.info(substring);
+            LOG.info(s.toUpperCase());
+        }, (s) -> {
+            LOG.info(s.toLowerCase());
         });
 
         assertNotNull(chatResult, "chatResult should not be null");
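Note: chat now accepts two stream handlers, the first for thinking tokens and the second for response tokens (the test above tells them apart by logging one in upper case and the other in lower case). A sketch routing them to separate buffers, assuming that handler order:

    // Sketch: keep thinking tokens and answer tokens in separate buffers.
    StringBuilder thinking = new StringBuilder();
    StringBuilder answer = new StringBuilder();
    OllamaChatResult chatResult = api.chat(requestModel,
            (thinkingToken) -> thinking.append(thinkingToken),
            (responseToken) -> answer.append(responseToken));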
@@ -468,9 +464,11 @@ class OllamaAPIIntegrationTest {
         StringBuffer sb = new StringBuffer();
 
         OllamaChatResult chatResult = api.chat(requestModel, (s) -> {
-            String substring = s.substring(sb.toString().length());
-            sb.append(substring);
-            LOG.info(substring);
+            LOG.info(s.toUpperCase());
+            sb.append(s);
+        }, (s) -> {
+            LOG.info(s.toLowerCase());
+            sb.append(s);
         });
         assertNotNull(chatResult);
         assertNotNull(chatResult.getResponseModel());
@@ -491,10 +489,13 @@ class OllamaAPIIntegrationTest {
         StringBuffer sb = new StringBuffer();
 
         OllamaChatResult chatResult = api.chat(requestModel, (s) -> {
-            String substring = s.substring(sb.toString().length());
-            sb.append(substring);
-            LOG.info(substring);
+            sb.append(s);
+            LOG.info(s.toUpperCase());
+        }, (s) -> {
+            sb.append(s);
+            LOG.info(s.toLowerCase());
         });
 
         assertNotNull(chatResult);
         assertNotNull(chatResult.getResponseModel());
         assertNotNull(chatResult.getResponseModel().getMessage());
@@ -586,9 +587,8 @@ class OllamaAPIIntegrationTest {
 
         OllamaResult result = api.generateWithImageFiles(VISION_MODEL, "What is in this image?",
                 List.of(imageFile), new OptionsBuilder().build(), (s) -> {
-                    String substring = s.substring(sb.toString().length());
-                    LOG.info(substring);
-                    sb.append(substring);
+                    LOG.info(s);
+                    sb.append(s);
                 });
         assertNotNull(result);
         assertNotNull(result.getResponse());
@@ -603,10 +603,10 @@ class OllamaAPIIntegrationTest {
         api.pullModel(THINKING_TOOL_MODEL);
 
         boolean raw = false;
-        boolean thinking = true;
+        boolean think = true;
 
-        OllamaResult result = api.generate(THINKING_TOOL_MODEL, "Who are you?", raw, thinking,
-                new OptionsBuilder().build(), null);
+        OllamaResult result = api.generate(THINKING_TOOL_MODEL, "Who are you?", raw, think,
+                new OptionsBuilder().build());
         assertNotNull(result);
         assertNotNull(result.getResponse());
         assertFalse(result.getResponse().isEmpty());
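Note: with think = true and no stream handlers, the thinking text should land on the returned result. A sketch, assuming the getter Lombok derives from OllamaResult's thinking field:

    // Sketch: blocking generate with thinking enabled; inspect both parts afterwards.
    OllamaResult result = api.generate(THINKING_TOOL_MODEL, "Who are you?", false, true,
            new OptionsBuilder().build());
    LOG.info("thinking: {}", result.getThinking());
    LOG.info("answer: {}", result.getResponse());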
@@ -621,15 +621,19 @@ class OllamaAPIIntegrationTest {
         api.pullModel(THINKING_TOOL_MODEL);
 
         boolean raw = false;
-        boolean thinking = true;
 
         StringBuffer sb = new StringBuffer();
-        OllamaResult result = api.generate(THINKING_TOOL_MODEL, "Who are you?", raw, thinking,
-                new OptionsBuilder().build(), (s) -> {
-                    String substring = s.substring(sb.toString().length());
-                    sb.append(substring);
-                    LOG.info(substring);
-                });
+        OllamaResult result = api.generate(THINKING_TOOL_MODEL, "Who are you?", raw,
+                new OptionsBuilder().build(),
+                (thinkingToken) -> {
+                    sb.append(thinkingToken);
+                    LOG.info(thinkingToken);
+                },
+                (resToken) -> {
+                    sb.append(resToken);
+                    LOG.info(resToken);
+                }
+        );
         assertNotNull(result);
         assertNotNull(result.getResponse());
         assertFalse(result.getResponse().isEmpty());
@@ -155,7 +155,7 @@ class TestMockedAPIs {
         try {
             when(ollamaAPI.generateWithImageFiles(
                     model, prompt, Collections.emptyList(), new OptionsBuilder().build()))
-                    .thenReturn(new OllamaResult("","", 0, 200));
+                    .thenReturn(new OllamaResult("", "", 0, 200));
             ollamaAPI.generateWithImageFiles(
                     model, prompt, Collections.emptyList(), new OptionsBuilder().build());
             verify(ollamaAPI, times(1))
@@ -174,7 +174,7 @@ class TestMockedAPIs {
         try {
             when(ollamaAPI.generateWithImageURLs(
                     model, prompt, Collections.emptyList(), new OptionsBuilder().build()))
-                    .thenReturn(new OllamaResult("","", 0, 200));
+                    .thenReturn(new OllamaResult("", "", 0, 200));
             ollamaAPI.generateWithImageURLs(
                     model, prompt, Collections.emptyList(), new OptionsBuilder().build());
             verify(ollamaAPI, times(1))
@@ -190,10 +190,10 @@ class TestMockedAPIs {
         OllamaAPI ollamaAPI = Mockito.mock(OllamaAPI.class);
         String model = OllamaModelType.LLAMA2;
         String prompt = "some prompt text";
-        when(ollamaAPI.generateAsync(model, prompt, false))
+        when(ollamaAPI.generateAsync(model, prompt, false, false))
                 .thenReturn(new OllamaAsyncResultStreamer(null, null, 3));
-        ollamaAPI.generateAsync(model, prompt, false);
-        verify(ollamaAPI, times(1)).generateAsync(model, prompt, false);
+        ollamaAPI.generateAsync(model, prompt, false, false);
+        verify(ollamaAPI, times(1)).generateAsync(model, prompt, false, false);
     }
 
     @Test
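Note: generateAsync now takes two booleans; going by the synchronous generate signature, the pair reads as raw followed by think, though only the mock above pins the arity. A sketch of a blocking caller under that assumption, with getCompleteResponse() taken to be the Lombok getter for the completeResponse field:

    // Sketch: fire an async generation (raw=false, think=true) and wait for it to finish.
    OllamaAsyncResultStreamer streamer = ollamaAPI.generateAsync(model, prompt, false, true);
    streamer.join(); // OllamaAsyncResultStreamer extends Thread; declare or catch InterruptedException
    System.out.println(streamer.getCompleteResponse());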