mirror of https://github.com/amithkoujalgi/ollama4j.git (synced 2025-09-16 03:39:05 +02:00)
Refactor OllamaAPI for improved async handling and response streaming
Updated OllamaAPI to support separate thinking and response stream handlers, improving both the synchronous and asynchronous generation of responses. Adjusted the related models and observers to accommodate the new streaming logic. Improved the handling of response data in OllamaResult and OllamaGenerateResponseModel, adding new properties for better tracking of response metrics. Refined the integration tests to reflect the changed method signatures and to verify proper logging of streamed responses.
This commit is contained in:
parent 5f5fa8ecae
commit c754bd11da
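
To make the new surface concrete, here is a minimal sketch of the updated synchronous and asynchronous call sites, based on the signatures in this diff; the host URL and model names are placeholders, it assumes a reachable Ollama server, and checked exceptions are omitted for brevity:

    OllamaAPI api = new OllamaAPI("http://localhost:11434");

    // Non-thinking generate: a single response stream handler. Handlers now
    // receive only the newly streamed tokens, not the accumulated string.
    OllamaResult result = api.generate("llama3", "Why is the sky blue?", false,
            new OptionsBuilder().build(),
            (responseToken) -> System.out.print(responseToken));

    // Thinking generate: separate handlers for thinking tokens and response tokens.
    OllamaResult thinkingResult = api.generate("deepseek-r1:7b", "Why is the sky blue?", false,
            new OptionsBuilder().build(),
            (thinkingToken) -> System.out.print(thinkingToken),
            (responseToken) -> System.out.print(responseToken));

    // Async generate with thinking: poll the streamer handle (see the javadoc
    // example in the diff below for the full polling loop).
    OllamaAsyncResultStreamer streamer = api.generateAsync("gpt-oss:20b", "Who are you?", false, true);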
@@ -52,7 +52,7 @@ import java.util.stream.Collectors;
/**
* The base Ollama API class.
*/
@SuppressWarnings({"DuplicatedCode", "resource"})
@SuppressWarnings({ "DuplicatedCode", "resource" })
public class OllamaAPI {

private static final Logger logger = LoggerFactory.getLogger(OllamaAPI.class);
@@ -101,7 +101,7 @@ public class OllamaAPI {
* Default is 0 (no retries).
*/
@Setter
@SuppressWarnings({"FieldMayBeFinal", "FieldCanBeLocal"})
@SuppressWarnings({ "FieldMayBeFinal", "FieldCanBeLocal" })
private int numberOfRetriesForModelPull = 0;

/**
@@ -371,9 +371,12 @@ public class OllamaAPI {
/**
* Finds a specific model using model name and tag from Ollama library.
* <p>
* <b>Deprecated:</b> This method relies on the HTML structure of the Ollama website,
* which is subject to change at any time. As a result, it is difficult to keep this API
* method consistently updated and reliable. Therefore, this method is deprecated and
* <b>Deprecated:</b> This method relies on the HTML structure of the Ollama
* website,
* which is subject to change at any time. As a result, it is difficult to keep
* this API
* method consistently updated and reliable. Therefore, this method is
* deprecated and
* may be removed in future releases.
* <p>
* This method retrieves the model from the Ollama library by its name, then
@@ -393,7 +396,9 @@ public class OllamaAPI {
* @throws URISyntaxException If there is an error with the URI syntax.
* @throws InterruptedException If the operation is interrupted.
* @throws NoSuchElementException If the model or the tag is not found.
* @deprecated This method relies on the HTML structure of the Ollama website, which can change at any time and break this API. It is deprecated and may be removed in the future.
* @deprecated This method relies on the HTML structure of the Ollama website,
* which can change at any time and break this API. It is deprecated
* and may be removed in the future.
*/
@Deprecated
public LibraryModelTag findModelTagFromLibrary(String modelName, String tag)
@@ -453,12 +458,13 @@ public class OllamaAPI {
/**
* Handles retry backoff for pullModel.
*/
private void handlePullRetry(String modelName, int currentRetry, int maxRetries, long baseDelayMillis) throws InterruptedException {
private void handlePullRetry(String modelName, int currentRetry, int maxRetries, long baseDelayMillis)
throws InterruptedException {
int attempt = currentRetry + 1;
if (attempt < maxRetries) {
long backoffMillis = baseDelayMillis * (1L << currentRetry);
logger.error("Failed to pull model {}, retrying in {}s... (attempt {}/{})",
modelName, backoffMillis/1000, attempt, maxRetries);
modelName, backoffMillis / 1000, attempt, maxRetries);
try {
Thread.sleep(backoffMillis);
} catch (InterruptedException ie) {
@@ -470,7 +476,6 @@ public class OllamaAPI {
}
}

private void doPullModel(String modelName)
throws OllamaBaseException, IOException, URISyntaxException, InterruptedException {
String url = this.host + "/api/pull";
@@ -825,36 +830,74 @@ public class OllamaAPI {

/**
* Generate response for a question to a model running on Ollama server. This is
* a sync/blocking
* call.
* a sync/blocking call. This API does not support "thinking" models.
*
* @param model the ollama model to ask the question to
* @param prompt the prompt/question text
* @param raw if true no formatting will be applied to the prompt. You
* @param raw if true no formatting will be applied to the
* prompt. You
* may choose to use the raw parameter if you are
* specifying a full templated prompt in your request to
* specifying a full templated prompt in your
* request to
* the API
* @param think if true the model will "think" step-by-step before
* generating the final response
* @param options the Options object - <a
* href=
* "https://github.com/jmorganca/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values">More
* details on the options</a>
* @param streamHandler optional callback consumer that will be applied every
* time a streamed response is received. If not set, the
* @param responseStreamHandler optional callback consumer that will be applied
* every
* time a streamed response is received. If not
* set, the
* stream parameter of the request is set to false.
* @return OllamaResult that includes response text and time taken for response
* @throws OllamaBaseException if the response indicates an error status
* @throws IOException if an I/O error occurs during the HTTP request
* @throws InterruptedException if the operation is interrupted
*/
public OllamaResult generate(String model, String prompt, boolean raw, boolean think, Options options,
OllamaStreamHandler streamHandler) throws OllamaBaseException, IOException, InterruptedException {
public OllamaResult generate(String model, String prompt, boolean raw, Options options,
OllamaStreamHandler responseStreamHandler) throws OllamaBaseException, IOException, InterruptedException {
OllamaGenerateRequest ollamaRequestModel = new OllamaGenerateRequest(model, prompt);
ollamaRequestModel.setRaw(raw);
ollamaRequestModel.setThink(think);
ollamaRequestModel.setThink(false);
ollamaRequestModel.setOptions(options.getOptionsMap());
return generateSyncForOllamaRequestModel(ollamaRequestModel, streamHandler);
return generateSyncForOllamaRequestModel(ollamaRequestModel, null, responseStreamHandler);
}

/**
* Generate thinking and response tokens for a question to a thinking model
* running on Ollama server. This is
* a sync/blocking call.
*
* @param model the ollama model to ask the question to
* @param prompt the prompt/question text
* @param raw if true no formatting will be applied to the
* prompt. You
* may choose to use the raw parameter if you are
* specifying a full templated prompt in your
* request to
* the API
* @param options the Options object - <a
* href=
* "https://github.com/jmorganca/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values">More
* details on the options</a>
* @param responseStreamHandler optional callback consumer that will be applied
* every
* time a streamed response is received. If not
* set, the
* stream parameter of the request is set to false.
* @return OllamaResult that includes response text and time taken for response
* @throws OllamaBaseException if the response indicates an error status
* @throws IOException if an I/O error occurs during the HTTP request
* @throws InterruptedException if the operation is interrupted
*/
public OllamaResult generate(String model, String prompt, boolean raw, Options options,
OllamaStreamHandler thinkingStreamHandler, OllamaStreamHandler responseStreamHandler)
throws OllamaBaseException, IOException, InterruptedException {
OllamaGenerateRequest ollamaRequestModel = new OllamaGenerateRequest(model, prompt);
ollamaRequestModel.setRaw(raw);
ollamaRequestModel.setThink(true);
ollamaRequestModel.setOptions(options.getOptionsMap());
return generateSyncForOllamaRequestModel(ollamaRequestModel, thinkingStreamHandler, responseStreamHandler);
}

/**
@@ -862,7 +905,7 @@ public class OllamaAPI {
* mode).
* <p>
* Uses
* {@link #generate(String, String, boolean, boolean, Options, OllamaStreamHandler)}
* {@link #generate(String, String, boolean, Options, OllamaStreamHandler)}
*
* @param model The name or identifier of the AI model to use for generating
* the response.
@@ -871,10 +914,10 @@ public class OllamaAPI {
* and provide a full prompt. In this case, you can use the raw
* parameter to disable templating. Also note that raw mode will
* not return a context.
* @param think If set to true, the model will "think" step-by-step before
* generating the final response.
* @param options Additional options or configurations to use when generating
* the response.
* @param think if true the model will "think" step-by-step before
* generating the final response
* @return {@link OllamaResult}
* @throws OllamaBaseException if the response indicates an error status
* @throws IOException if an I/O error occurs during the HTTP request
@@ -882,7 +925,11 @@ public class OllamaAPI {
*/
public OllamaResult generate(String model, String prompt, boolean raw, boolean think, Options options)
throws OllamaBaseException, IOException, InterruptedException {
return generate(model, prompt, raw, think, options, null);
if (think) {
return generate(model, prompt, raw, options, null, null);
} else {
return generate(model, prompt, raw, options, null);
}
}

/**
@@ -958,8 +1005,6 @@ public class OllamaAPI {
* @param model The name or identifier of the AI model to use for generating
* the response.
* @param prompt The input text or prompt to provide to the AI model.
* @param think If set to true, the model will "think" step-by-step before
* generating the final response.
* @param options Additional options or configurations to use when generating
* the response.
* @return {@link OllamaToolsResult} An OllamaToolsResult object containing the
@@ -969,7 +1014,7 @@ public class OllamaAPI {
* @throws IOException if an I/O error occurs during the HTTP request
* @throws InterruptedException if the operation is interrupted
*/
public OllamaToolsResult generateWithTools(String model, String prompt, boolean think, Options options)
public OllamaToolsResult generateWithTools(String model, String prompt, Options options)
throws OllamaBaseException, IOException, InterruptedException, ToolInvocationException {
boolean raw = true;
OllamaToolsResult toolResult = new OllamaToolsResult();
@@ -984,7 +1029,7 @@ public class OllamaAPI {
prompt = promptBuilder.build();
}

OllamaResult result = generate(model, prompt, raw, think, options, null);
OllamaResult result = generate(model, prompt, raw, options, null);
toolResult.setModelResult(result);

String toolsResponse = result.getResponse();
@@ -1014,19 +1059,47 @@ public class OllamaAPI {
}

/**
* Generate response for a question to a model running on Ollama server and get
* a callback handle
* that can be used to check for status and get the response from the model
* later. This would be
* an async/non-blocking call.
* Asynchronously generates a response for a prompt using a model running on the
* Ollama server.
* <p>
* This method returns an {@link OllamaAsyncResultStreamer} handle that can be
* used to poll for
* status and retrieve streamed "thinking" and response tokens from the model.
* The call is non-blocking.
* </p>
*
* @param model the ollama model to ask the question to
* @param prompt the prompt/question text
* @return the ollama async result callback handle
* <p>
* <b>Example usage:</b>
* </p>
*
* <pre>{@code
* OllamaAsyncResultStreamer resultStreamer = ollamaAPI.generateAsync("gpt-oss:20b", "Who are you", false, true);
* int pollIntervalMilliseconds = 1000;
* while (true) {
* String thinkingTokens = resultStreamer.getThinkingResponseStream().poll();
* String responseTokens = resultStreamer.getResponseStream().poll();
* System.out.print(thinkingTokens != null ? thinkingTokens.toUpperCase() : "");
* System.out.print(responseTokens != null ? responseTokens.toLowerCase() : "");
* Thread.sleep(pollIntervalMilliseconds);
* if (!resultStreamer.isAlive())
* break;
* }
* System.out.println("Complete thinking response: " + resultStreamer.getCompleteThinkingResponse());
* System.out.println("Complete response: " + resultStreamer.getCompleteResponse());
* }</pre>
*
* @param model the Ollama model to use for generating the response
* @param prompt the prompt or question text to send to the model
* @param raw if {@code true}, returns the raw response from the model
* @param think if {@code true}, streams "thinking" tokens as well as response
* tokens
* @return an {@link OllamaAsyncResultStreamer} handle for polling and
* retrieving streamed results
*/
public OllamaAsyncResultStreamer generateAsync(String model, String prompt, boolean raw) {
public OllamaAsyncResultStreamer generateAsync(String model, String prompt, boolean raw, boolean think) {
OllamaGenerateRequest ollamaRequestModel = new OllamaGenerateRequest(model, prompt);
ollamaRequestModel.setRaw(raw);
ollamaRequestModel.setThink(think);
URI uri = URI.create(this.host + "/api/generate");
OllamaAsyncResultStreamer ollamaAsyncResultStreamer = new OllamaAsyncResultStreamer(
getRequestBuilderDefault(uri), ollamaRequestModel, requestTimeoutSeconds);
@@ -1062,7 +1135,7 @@ public class OllamaAPI {
}
OllamaGenerateRequest ollamaRequestModel = new OllamaGenerateRequest(model, prompt, images);
ollamaRequestModel.setOptions(options.getOptionsMap());
return generateSyncForOllamaRequestModel(ollamaRequestModel, streamHandler);
return generateSyncForOllamaRequestModel(ollamaRequestModel, null, streamHandler);
}

/**
@@ -1110,7 +1183,7 @@ public class OllamaAPI {
}
OllamaGenerateRequest ollamaRequestModel = new OllamaGenerateRequest(model, prompt, images);
ollamaRequestModel.setOptions(options.getOptionsMap());
return generateSyncForOllamaRequestModel(ollamaRequestModel, streamHandler);
return generateSyncForOllamaRequestModel(ollamaRequestModel, null, streamHandler);
}

/**
@@ -1157,7 +1230,7 @@ public class OllamaAPI {
}
OllamaGenerateRequest ollamaRequestModel = new OllamaGenerateRequest(model, prompt, encodedImages);
ollamaRequestModel.setOptions(options.getOptionsMap());
return generateSyncForOllamaRequestModel(ollamaRequestModel, streamHandler);
return generateSyncForOllamaRequestModel(ollamaRequestModel, null, streamHandler);
}

/**
@@ -1223,7 +1296,7 @@ public class OllamaAPI {
*/
public OllamaChatResult chat(OllamaChatRequest request)
throws OllamaBaseException, IOException, InterruptedException, ToolInvocationException {
return chat(request, null);
return chat(request, null, null);
}

/**
@@ -1233,9 +1306,10 @@ public class OllamaAPI {
* Hint: the OllamaChatRequestModel#getStream() property is not implemented.
*
* @param request request object to be sent to the server
* @param streamHandler callback handler to handle the last message from stream
* (caution: all previous tokens from stream will be
* concatenated)
* @param responseStreamHandler callback handler to handle the last message from
* stream
* @param thinkingStreamHandler callback handler to handle the last thinking
* message from stream
* @return {@link OllamaChatResult}
* @throws OllamaBaseException any response code than 200 has been returned
* @throws IOException in case the responseStream can not be read
@@ -1248,9 +1322,10 @@ public class OllamaAPI {
* @throws InterruptedException if the operation is interrupted
* @throws ToolInvocationException if the tool invocation fails
*/
public OllamaChatResult chat(OllamaChatRequest request, OllamaStreamHandler streamHandler)
public OllamaChatResult chat(OllamaChatRequest request, OllamaStreamHandler thinkingStreamHandler,
OllamaStreamHandler responseStreamHandler)
throws OllamaBaseException, IOException, InterruptedException, ToolInvocationException {
return chatStreaming(request, new OllamaChatStreamObserver(streamHandler));
return chatStreaming(request, new OllamaChatStreamObserver(thinkingStreamHandler, responseStreamHandler));
}

/**
@@ -1518,9 +1593,11 @@ public class OllamaAPI {
* the request will be streamed; otherwise, a regular synchronous request will
* be made.
*
* @param ollamaRequestModel the request model containing necessary parameters
* @param ollamaRequestModel the request model containing necessary
* parameters
* for the Ollama API request.
* @param streamHandler the stream handler to process streaming responses,
* @param responseStreamHandler the stream handler to process streaming
* responses,
* or null for non-streaming requests.
* @return the result of the Ollama API request.
* @throws OllamaBaseException if the request fails due to an issue with the
@@ -1530,13 +1607,14 @@ public class OllamaAPI {
* @throws InterruptedException if the thread is interrupted during the request.
*/
private OllamaResult generateSyncForOllamaRequestModel(OllamaGenerateRequest ollamaRequestModel,
OllamaStreamHandler streamHandler) throws OllamaBaseException, IOException, InterruptedException {
OllamaStreamHandler thinkingStreamHandler, OllamaStreamHandler responseStreamHandler)
throws OllamaBaseException, IOException, InterruptedException {
OllamaGenerateEndpointCaller requestCaller = new OllamaGenerateEndpointCaller(host, auth, requestTimeoutSeconds,
verbose);
OllamaResult result;
if (streamHandler != null) {
if (responseStreamHandler != null) {
ollamaRequestModel.setStream(true);
result = requestCaller.call(ollamaRequestModel, streamHandler);
result = requestCaller.call(ollamaRequestModel, thinkingStreamHandler, responseStreamHandler);
} else {
result = requestCaller.callSync(ollamaRequestModel);
}

@@ -6,27 +6,46 @@ import lombok.RequiredArgsConstructor;

@RequiredArgsConstructor
public class OllamaChatStreamObserver implements OllamaTokenHandler {
private final OllamaStreamHandler streamHandler;
private final OllamaStreamHandler thinkingStreamHandler;
private final OllamaStreamHandler responseStreamHandler;

private String message = "";

@Override
public void accept(OllamaChatResponseModel token) {
if (streamHandler == null || token == null || token.getMessage() == null) {
if (responseStreamHandler == null || token == null || token.getMessage() == null) {
return;
}

String content = token.getMessage().getContent();
String thinking = token.getMessage().getThinking();
String content = token.getMessage().getContent();

boolean hasContent = !content.isEmpty();
boolean hasThinking = thinking != null && !thinking.isEmpty();
boolean hasContent = !content.isEmpty();

if (hasThinking && !hasContent) {
message += thinking;
} else {
message += content;
// if (hasThinking && !hasContent) {
//// message += thinking;
// message = thinking;
// } else {
//// message += content;
// message = content;
// }
//
// responseStreamHandler.accept(message);

if (!hasContent && hasThinking && thinkingStreamHandler != null) {
// message = message + thinking;

// use only new tokens received, instead of appending the tokens to the previous
// ones and sending the full string again
thinkingStreamHandler.accept(thinking);
} else if (hasContent && responseStreamHandler != null) {
// message = message + response;

// use only new tokens received, instead of appending the tokens to the previous
// ones and sending the full string again
responseStreamHandler.accept(content);
}

streamHandler.accept(message);
}
}

@@ -4,6 +4,7 @@ import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;

import java.util.List;

import lombok.Data;

@Data
@@ -14,11 +15,12 @@ public class OllamaGenerateResponseModel {
private String response;
private String thinking;
private boolean done;
private @JsonProperty("done_reason") String doneReason;
private List<Integer> context;
private @JsonProperty("total_duration") Long totalDuration;
private @JsonProperty("load_duration") Long loadDuration;
private @JsonProperty("prompt_eval_duration") Long promptEvalDuration;
private @JsonProperty("eval_duration") Long evalDuration;
private @JsonProperty("prompt_eval_count") Integer promptEvalCount;
private @JsonProperty("prompt_eval_duration") Long promptEvalDuration;
private @JsonProperty("eval_count") Integer evalCount;
private @JsonProperty("eval_duration") Long evalDuration;
}

@@ -5,14 +5,16 @@ import java.util.List;

public class OllamaGenerateStreamObserver {

private OllamaStreamHandler streamHandler;
private final OllamaStreamHandler thinkingStreamHandler;
private final OllamaStreamHandler responseStreamHandler;

private List<OllamaGenerateResponseModel> responseParts = new ArrayList<>();
private final List<OllamaGenerateResponseModel> responseParts = new ArrayList<>();

private String message = "";

public OllamaGenerateStreamObserver(OllamaStreamHandler streamHandler) {
this.streamHandler = streamHandler;
public OllamaGenerateStreamObserver(OllamaStreamHandler thinkingStreamHandler, OllamaStreamHandler responseStreamHandler) {
this.responseStreamHandler = responseStreamHandler;
this.thinkingStreamHandler = thinkingStreamHandler;
}

public void notify(OllamaGenerateResponseModel currentResponsePart) {
@@ -27,11 +29,18 @@ public class OllamaGenerateStreamObserver {
boolean hasResponse = response != null && !response.isEmpty();
boolean hasThinking = thinking != null && !thinking.isEmpty();

if (!hasResponse && hasThinking) {
message = message + thinking;
} else if (hasResponse) {
message = message + response;
if (!hasResponse && hasThinking && thinkingStreamHandler != null) {
// message = message + thinking;

// use only new tokens received, instead of appending the tokens to the previous
// ones and sending the full string again
thinkingStreamHandler.accept(thinking);
} else if (hasResponse && responseStreamHandler != null) {
// message = message + response;

// use only new tokens received, instead of appending the tokens to the previous
// ones and sending the full string again
responseStreamHandler.accept(response);
}
streamHandler.accept(message);
}
}

@@ -27,7 +27,7 @@ public class OllamaGenerateEndpointCaller extends OllamaEndpointCaller {

private static final Logger LOG = LoggerFactory.getLogger(OllamaGenerateEndpointCaller.class);

private OllamaGenerateStreamObserver streamObserver;
private OllamaGenerateStreamObserver responseStreamObserver;

public OllamaGenerateEndpointCaller(String host, Auth basicAuth, long requestTimeoutSeconds, boolean verbose) {
super(host, basicAuth, requestTimeoutSeconds, verbose);
@@ -48,8 +48,8 @@ public class OllamaGenerateEndpointCaller extends OllamaEndpointCaller {
if (ollamaResponseModel.getThinking() != null) {
thinkingBuffer.append(ollamaResponseModel.getThinking());
}
if (streamObserver != null) {
streamObserver.notify(ollamaResponseModel);
if (responseStreamObserver != null) {
responseStreamObserver.notify(ollamaResponseModel);
}
return ollamaResponseModel.isDone();
} catch (JsonProcessingException e) {
@@ -58,9 +58,8 @@ public class OllamaGenerateEndpointCaller extends OllamaEndpointCaller {
}
}

public OllamaResult call(OllamaRequestBody body, OllamaStreamHandler streamHandler)
throws OllamaBaseException, IOException, InterruptedException {
streamObserver = new OllamaGenerateStreamObserver(streamHandler);
public OllamaResult call(OllamaRequestBody body, OllamaStreamHandler thinkingStreamHandler, OllamaStreamHandler responseStreamHandler) throws OllamaBaseException, IOException, InterruptedException {
responseStreamObserver = new OllamaGenerateStreamObserver(thinkingStreamHandler, responseStreamHandler);
return callSync(body);
}

@@ -73,47 +72,41 @@ public class OllamaGenerateEndpointCaller extends OllamaEndpointCaller {
* @throws IOException in case the responseStream can not be read
* @throws InterruptedException in case the server is not reachable or network issues happen
*/
@SuppressWarnings("DuplicatedCode")
public OllamaResult callSync(OllamaRequestBody body) throws OllamaBaseException, IOException, InterruptedException {
// Create Request
long startTime = System.currentTimeMillis();
HttpClient httpClient = HttpClient.newHttpClient();
URI uri = URI.create(getHost() + getEndpointSuffix());
HttpRequest.Builder requestBuilder =
getRequestBuilderDefault(uri)
.POST(
body.getBodyPublisher());
HttpRequest.Builder requestBuilder = getRequestBuilderDefault(uri).POST(body.getBodyPublisher());
HttpRequest request = requestBuilder.build();
if (isVerbose()) LOG.info("Asking model: " + body.toString());
HttpResponse<InputStream> response =
httpClient.send(request, HttpResponse.BodyHandlers.ofInputStream());
if (isVerbose()) LOG.info("Asking model: {}", body);
HttpResponse<InputStream> response = httpClient.send(request, HttpResponse.BodyHandlers.ofInputStream());

int statusCode = response.statusCode();
InputStream responseBodyStream = response.body();
StringBuilder responseBuffer = new StringBuilder();
StringBuilder thinkingBuffer = new StringBuilder();
try (BufferedReader reader =
new BufferedReader(new InputStreamReader(responseBodyStream, StandardCharsets.UTF_8))) {
OllamaGenerateResponseModel ollamaGenerateResponseModel = null;
try (BufferedReader reader = new BufferedReader(new InputStreamReader(responseBodyStream, StandardCharsets.UTF_8))) {
String line;
while ((line = reader.readLine()) != null) {
if (statusCode == 404) {
LOG.warn("Status code: 404 (Not Found)");
OllamaErrorResponse ollamaResponseModel =
Utils.getObjectMapper().readValue(line, OllamaErrorResponse.class);
OllamaErrorResponse ollamaResponseModel = Utils.getObjectMapper().readValue(line, OllamaErrorResponse.class);
responseBuffer.append(ollamaResponseModel.getError());
} else if (statusCode == 401) {
LOG.warn("Status code: 401 (Unauthorized)");
OllamaErrorResponse ollamaResponseModel =
Utils.getObjectMapper()
.readValue("{\"error\":\"Unauthorized\"}", OllamaErrorResponse.class);
OllamaErrorResponse ollamaResponseModel = Utils.getObjectMapper().readValue("{\"error\":\"Unauthorized\"}", OllamaErrorResponse.class);
responseBuffer.append(ollamaResponseModel.getError());
} else if (statusCode == 400) {
LOG.warn("Status code: 400 (Bad Request)");
OllamaErrorResponse ollamaResponseModel = Utils.getObjectMapper().readValue(line,
OllamaErrorResponse.class);
OllamaErrorResponse ollamaResponseModel = Utils.getObjectMapper().readValue(line, OllamaErrorResponse.class);
responseBuffer.append(ollamaResponseModel.getError());
} else {
boolean finished = parseResponseAndAddToBuffer(line, responseBuffer, thinkingBuffer);
if (finished) {
ollamaGenerateResponseModel = Utils.getObjectMapper().readValue(line, OllamaGenerateResponseModel.class);
break;
}
}
@@ -121,13 +114,25 @@ public class OllamaGenerateEndpointCaller extends OllamaEndpointCaller {
}

if (statusCode != 200) {
LOG.error("Status code " + statusCode);
LOG.error("Status code: {}", statusCode);
throw new OllamaBaseException(responseBuffer.toString());
} else {
long endTime = System.currentTimeMillis();
OllamaResult ollamaResult =
new OllamaResult(responseBuffer.toString(), thinkingBuffer.toString(), endTime - startTime, statusCode);
if (isVerbose()) LOG.info("Model response: " + ollamaResult);
OllamaResult ollamaResult = new OllamaResult(responseBuffer.toString(), thinkingBuffer.toString(), endTime - startTime, statusCode);

ollamaResult.setModel(ollamaGenerateResponseModel.getModel());
ollamaResult.setCreatedAt(ollamaGenerateResponseModel.getCreatedAt());
ollamaResult.setDone(ollamaGenerateResponseModel.isDone());
ollamaResult.setDoneReason(ollamaGenerateResponseModel.getDoneReason());
ollamaResult.setContext(ollamaGenerateResponseModel.getContext());
ollamaResult.setTotalDuration(ollamaGenerateResponseModel.getTotalDuration());
ollamaResult.setLoadDuration(ollamaGenerateResponseModel.getLoadDuration());
ollamaResult.setPromptEvalCount(ollamaGenerateResponseModel.getPromptEvalCount());
ollamaResult.setPromptEvalDuration(ollamaGenerateResponseModel.getPromptEvalDuration());
ollamaResult.setEvalCount(ollamaGenerateResponseModel.getEvalCount());
ollamaResult.setEvalDuration(ollamaGenerateResponseModel.getEvalDuration());

if (isVerbose()) LOG.info("Model response: {}", ollamaResult);
return ollamaResult;
}
}

@@ -26,8 +26,10 @@ import java.time.Duration;
public class OllamaAsyncResultStreamer extends Thread {
private final HttpRequest.Builder requestBuilder;
private final OllamaGenerateRequest ollamaRequestModel;
private final OllamaResultStream stream = new OllamaResultStream();
private final OllamaResultStream thinkingResponseStream = new OllamaResultStream();
private final OllamaResultStream responseStream = new OllamaResultStream();
private String completeResponse;
private String completeThinkingResponse;

/**
@@ -54,14 +56,11 @@ public class OllamaAsyncResultStreamer extends Thread {
@Getter
private long responseTime = 0;

public OllamaAsyncResultStreamer(
HttpRequest.Builder requestBuilder,
OllamaGenerateRequest ollamaRequestModel,
long requestTimeoutSeconds) {
public OllamaAsyncResultStreamer(HttpRequest.Builder requestBuilder, OllamaGenerateRequest ollamaRequestModel, long requestTimeoutSeconds) {
this.requestBuilder = requestBuilder;
this.ollamaRequestModel = ollamaRequestModel;
this.completeResponse = "";
this.stream.add("");
this.responseStream.add("");
this.requestTimeoutSeconds = requestTimeoutSeconds;
}

@@ -71,16 +70,8 @@ public class OllamaAsyncResultStreamer extends Thread {
HttpClient httpClient = HttpClient.newHttpClient();
long startTime = System.currentTimeMillis();
try {
HttpRequest request =
requestBuilder
.POST(
HttpRequest.BodyPublishers.ofString(
Utils.getObjectMapper().writeValueAsString(ollamaRequestModel)))
.header(Constants.HttpConstants.HEADER_KEY_CONTENT_TYPE, Constants.HttpConstants.APPLICATION_JSON)
.timeout(Duration.ofSeconds(requestTimeoutSeconds))
.build();
HttpResponse<InputStream> response =
httpClient.send(request, HttpResponse.BodyHandlers.ofInputStream());
HttpRequest request = requestBuilder.POST(HttpRequest.BodyPublishers.ofString(Utils.getObjectMapper().writeValueAsString(ollamaRequestModel))).header(Constants.HttpConstants.HEADER_KEY_CONTENT_TYPE, Constants.HttpConstants.APPLICATION_JSON).timeout(Duration.ofSeconds(requestTimeoutSeconds)).build();
HttpResponse<InputStream> response = httpClient.send(request, HttpResponse.BodyHandlers.ofInputStream());
int statusCode = response.statusCode();
this.httpStatusCode = statusCode;

@@ -89,25 +80,33 @@ public class OllamaAsyncResultStreamer extends Thread {
try {
reader = new BufferedReader(new InputStreamReader(responseBodyStream, StandardCharsets.UTF_8));
String line;
StringBuilder thinkingBuffer = new StringBuilder();
StringBuilder responseBuffer = new StringBuilder();
while ((line = reader.readLine()) != null) {
if (statusCode == 404) {
OllamaErrorResponse ollamaResponseModel =
Utils.getObjectMapper().readValue(line, OllamaErrorResponse.class);
stream.add(ollamaResponseModel.getError());
OllamaErrorResponse ollamaResponseModel = Utils.getObjectMapper().readValue(line, OllamaErrorResponse.class);
responseStream.add(ollamaResponseModel.getError());
responseBuffer.append(ollamaResponseModel.getError());
} else {
OllamaGenerateResponseModel ollamaResponseModel =
Utils.getObjectMapper().readValue(line, OllamaGenerateResponseModel.class);
String res = ollamaResponseModel.getResponse();
stream.add(res);
OllamaGenerateResponseModel ollamaResponseModel = Utils.getObjectMapper().readValue(line, OllamaGenerateResponseModel.class);
String thinkingTokens = ollamaResponseModel.getThinking();
String responseTokens = ollamaResponseModel.getResponse();
if (thinkingTokens == null) {
thinkingTokens = "";
}
if (responseTokens == null) {
responseTokens = "";
}
thinkingResponseStream.add(thinkingTokens);
responseStream.add(responseTokens);
if (!ollamaResponseModel.isDone()) {
responseBuffer.append(res);
responseBuffer.append(responseTokens);
thinkingBuffer.append(thinkingTokens);
}
}
}

this.succeeded = true;
this.completeThinkingResponse = thinkingBuffer.toString();
this.completeResponse = responseBuffer.toString();
long endTime = System.currentTimeMillis();
responseTime = endTime - startTime;

@@ -1,15 +1,18 @@
package io.github.ollama4j.models.response;

import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.type.TypeReference;

import io.github.ollama4j.models.generate.OllamaGenerateResponseModel;
import lombok.Data;
import lombok.Getter;

import static io.github.ollama4j.utils.Utils.getObjectMapper;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
@@ -21,30 +24,34 @@ import java.util.Map;
@JsonIgnoreProperties(ignoreUnknown = true)
public class OllamaResult {
/**
* -- GETTER --
* Get the completion/response text
*
* @return String completion/response text
*/
private final String response;
private final String thinking;

/**
* -- GETTER --
* Get the thinking text (if available)
*/
private final String thinking;
/**
* Get the response status code.
*
* @return int - response status code
*/
private int httpStatusCode;

/**
* -- GETTER --
* Get the response time in milliseconds.
*
* @return long - response time in milliseconds
*/
private long responseTime = 0;

private String model;
private String createdAt;
private boolean done;
private String doneReason;
private List<Integer> context;
private Long totalDuration;
private Long loadDuration;
private Integer promptEvalCount;
private Long promptEvalDuration;
private Integer evalCount;
private Long evalDuration;

public OllamaResult(String response, String thinking, long responseTime, int httpStatusCode) {
this.response = response;
this.thinking = thinking;
@@ -60,6 +67,17 @@ public class OllamaResult {
responseMap.put("thinking", this.thinking);
responseMap.put("httpStatusCode", this.httpStatusCode);
responseMap.put("responseTime", this.responseTime);
responseMap.put("model", this.model);
responseMap.put("createdAt", this.createdAt);
responseMap.put("done", this.done);
responseMap.put("doneReason", this.doneReason);
responseMap.put("context", this.context);
responseMap.put("totalDuration", this.totalDuration);
responseMap.put("loadDuration", this.loadDuration);
responseMap.put("promptEvalCount", this.promptEvalCount);
responseMap.put("promptEvalDuration", this.promptEvalDuration);
responseMap.put("evalCount", this.evalCount);
responseMap.put("evalDuration", this.evalDuration);
return getObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(responseMap);
} catch (JsonProcessingException e) {
throw new RuntimeException(e);

@@ -202,14 +202,12 @@ class OllamaAPIIntegrationTest {
throws OllamaBaseException, IOException, URISyntaxException, InterruptedException {
api.pullModel(GENERAL_PURPOSE_MODEL);
boolean raw = false;
boolean thinking = false;
StringBuffer sb = new StringBuffer();
OllamaResult result = api.generate(GENERAL_PURPOSE_MODEL,
"What is the capital of France? And what's France's connection with Mona Lisa?", raw,
thinking, new OptionsBuilder().build(), (s) -> {
String substring = s.substring(sb.toString().length());
LOG.info(substring);
sb.append(substring);
new OptionsBuilder().build(), (s) -> {
LOG.info(s);
sb.append(s);
});

assertNotNull(result);
@@ -355,12 +353,10 @@ class OllamaAPIIntegrationTest {
.withKeepAlive("0m").withOptions(new OptionsBuilder().setTemperature(0.9f).build())
.build();

StringBuffer sb = new StringBuffer();

OllamaChatResult chatResult = api.chat(requestModel, (s) -> {
String substring = s.substring(sb.toString().length());
sb.append(substring);
LOG.info(substring);
LOG.info(s.toUpperCase());
}, (s) -> {
LOG.info(s.toLowerCase());
});

assertNotNull(chatResult, "chatResult should not be null");
@@ -468,9 +464,11 @@ class OllamaAPIIntegrationTest {
StringBuffer sb = new StringBuffer();

OllamaChatResult chatResult = api.chat(requestModel, (s) -> {
String substring = s.substring(sb.toString().length());
sb.append(substring);
LOG.info(substring);
LOG.info(s.toUpperCase());
sb.append(s);
}, (s) -> {
LOG.info(s.toLowerCase());
sb.append(s);
});
assertNotNull(chatResult);
assertNotNull(chatResult.getResponseModel());
@@ -491,10 +489,13 @@ class OllamaAPIIntegrationTest {
StringBuffer sb = new StringBuffer();

OllamaChatResult chatResult = api.chat(requestModel, (s) -> {
String substring = s.substring(sb.toString().length());
sb.append(substring);
LOG.info(substring);
sb.append(s);
LOG.info(s.toUpperCase());
}, (s) -> {
sb.append(s);
LOG.info(s.toLowerCase());
});

assertNotNull(chatResult);
assertNotNull(chatResult.getResponseModel());
assertNotNull(chatResult.getResponseModel().getMessage());
@@ -586,9 +587,8 @@ class OllamaAPIIntegrationTest {

OllamaResult result = api.generateWithImageFiles(VISION_MODEL, "What is in this image?",
List.of(imageFile), new OptionsBuilder().build(), (s) -> {
String substring = s.substring(sb.toString().length());
LOG.info(substring);
sb.append(substring);
LOG.info(s);
sb.append(s);
});
assertNotNull(result);
assertNotNull(result.getResponse());
@@ -603,10 +603,10 @@ class OllamaAPIIntegrationTest {
api.pullModel(THINKING_TOOL_MODEL);

boolean raw = false;
boolean thinking = true;
boolean think = true;

OllamaResult result = api.generate(THINKING_TOOL_MODEL, "Who are you?", raw, thinking,
new OptionsBuilder().build(), null);
OllamaResult result = api.generate(THINKING_TOOL_MODEL, "Who are you?", raw, think,
new OptionsBuilder().build());
assertNotNull(result);
assertNotNull(result.getResponse());
assertFalse(result.getResponse().isEmpty());
@@ -621,15 +621,19 @@ class OllamaAPIIntegrationTest {
api.pullModel(THINKING_TOOL_MODEL);

boolean raw = false;
boolean thinking = true;

StringBuffer sb = new StringBuffer();
OllamaResult result = api.generate(THINKING_TOOL_MODEL, "Who are you?", raw, thinking,
new OptionsBuilder().build(), (s) -> {
String substring = s.substring(sb.toString().length());
sb.append(substring);
LOG.info(substring);
});
OllamaResult result = api.generate(THINKING_TOOL_MODEL, "Who are you?", raw,
new OptionsBuilder().build(),
(thinkingToken) -> {
sb.append(thinkingToken);
LOG.info(thinkingToken);
},
(resToken) -> {
sb.append(resToken);
LOG.info(resToken);
}
);
assertNotNull(result);
assertNotNull(result.getResponse());
assertFalse(result.getResponse().isEmpty());

@@ -155,7 +155,7 @@ class TestMockedAPIs {
try {
when(ollamaAPI.generateWithImageFiles(
model, prompt, Collections.emptyList(), new OptionsBuilder().build()))
.thenReturn(new OllamaResult("","", 0, 200));
.thenReturn(new OllamaResult("", "", 0, 200));
ollamaAPI.generateWithImageFiles(
model, prompt, Collections.emptyList(), new OptionsBuilder().build());
verify(ollamaAPI, times(1))
@@ -174,7 +174,7 @@ class TestMockedAPIs {
try {
when(ollamaAPI.generateWithImageURLs(
model, prompt, Collections.emptyList(), new OptionsBuilder().build()))
.thenReturn(new OllamaResult("","", 0, 200));
.thenReturn(new OllamaResult("", "", 0, 200));
ollamaAPI.generateWithImageURLs(
model, prompt, Collections.emptyList(), new OptionsBuilder().build());
verify(ollamaAPI, times(1))
@@ -190,10 +190,10 @@ class TestMockedAPIs {
OllamaAPI ollamaAPI = Mockito.mock(OllamaAPI.class);
String model = OllamaModelType.LLAMA2;
String prompt = "some prompt text";
when(ollamaAPI.generateAsync(model, prompt, false))
when(ollamaAPI.generateAsync(model, prompt, false, false))
.thenReturn(new OllamaAsyncResultStreamer(null, null, 3));
ollamaAPI.generateAsync(model, prompt, false);
verify(ollamaAPI, times(1)).generateAsync(model, prompt, false);
ollamaAPI.generateAsync(model, prompt, false, false);
verify(ollamaAPI, times(1)).generateAsync(model, prompt, false, false);
}

@Test
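
The chat path follows the same split-handler pattern. Here is a hedged sketch of the new chat overload, mirroring the integration tests above; the model name is a placeholder and the builder usage is drawn from the tests rather than from this diff:

    OllamaChatRequest requestModel = OllamaChatRequestBuilder.getInstance("llama3")
            .withMessage(OllamaChatMessageRole.USER, "What is the capital of France?")
            .build();

    // The first handler receives thinking tokens (nothing for non-thinking models),
    // the second receives response tokens; each callback now carries only the
    // newly streamed tokens, not the accumulated message.
    OllamaChatResult chatResult = api.chat(requestModel,
            (thinkingToken) -> System.out.print(thinkingToken),
            (responseToken) -> System.out.print(responseToken));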