import asyncio, base64, json, litellm, mimetypes, random, string, ast
from typing import Optional,Callable
from html import escape
from litellm import (acompletion, completion, stream_chunk_builder, Message,
ModelResponse, ModelResponseStream, get_model_info, register_model, Usage)
from litellm.utils import function_to_dict, StreamingChoices, Delta, ChatCompletionMessageToolCall, Function, Choices
from toolslm.funccall import mk_ns, call_func, call_func_async, get_schema
from fastcore.utils import *
from fastcore.meta import delegates
from fastcore import imghdr
from dataclasses import dataclass
from litellm.exceptions import ContextWindowExceededError
Caching
Anthropic
We use explicit caching via cache control checkpoints. Anthropic requires an exact match with the cached tokens, and even a small change invalidates the cache.
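For orientation, here is a rough sketch of what such a checkpoint looks like in a raw litellm call (the model name is just an example, and the cache=True flag used below presumably inserts an equivalent checkpoint for us):
# Illustrative sketch, not the library's implementation: an explicit Anthropic
# cache checkpoint in a raw litellm call. Marking the final content block with
# cache_control lets Anthropic cache everything up to and including that block.
async def raw_cache_checkpoint_example():
    big_context = "What is 1+1?\n" * 250  # must exceed Anthropic's minimum cacheable size
    return await acompletion(
        model="anthropic/claude-3-5-sonnet-20241022",  # example model name only
        messages=[{"role": "user",
                   "content": [{"type": "text", "text": big_context,
                                "cache_control": {"type": "ephemeral"}}]}])  # cache checkpoint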
disable_cachy()
a,b = random.randint(0,100), random.randint(0,100)
hist = [[f"What is {a}+{b}?\n" * 250], f"It's {a+b}", ['hi'], "Hello"]In this first api call we will see cache creation until the last user msg:
# #| notest
# # TODO: flaky
# sleep(5)
# chat = AsyncChat(ms[3], cache=True, hist=hist)
# rs = await chat('hi again', stream=True, stream_options={"include_usage": True})
# async for o in rs:
#     if isinstance(o, ModelResponse): print(o.usage)
# #| notest
# # TODO: flaky
# test_eq(o.usage.cache_creation_input_tokens > 1000, True)
# test_eq(o.usage.cache_read_input_tokens, 0)
The subsequent call should re-use the existing cache:
# #| notest
# # TODO: flaky
# hist.extend([['hi again'], 'how may i help you?'])
# chat = AsyncChat(ms[3], cache=True, hist=hist)
# rs = await chat('bye!', stream=True, stream_options={"include_usage": True})
# async for o in rs:
#     if isinstance(o, ModelResponse): print(o.usage)
# #| notest
# # TODO: flaky
# test_eq(o.usage.cache_read_input_tokens > 1000, True)
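For reference, the two Anthropic-specific usage fields checked above can be pulled off the final streamed ModelResponse like this (a minimal sketch; the helper name is ours):
# Minimal sketch (helper name is ours): summarise Anthropic cache usage from the
# final ModelResponse of a streamed call made with include_usage enabled.
def anthropic_cache_summary(resp: ModelResponse) -> dict:
    u = resp.usage
    return {"created": getattr(u, "cache_creation_input_tokens", 0),  # tokens written to the cache
            "read": getattr(u, "cache_read_input_tokens", 0)}         # tokens served from the cache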
Gemini
Gemini uses implicit caching and supports partial token matches. The usage metadata only reports cache hits via the cached_tokens field, so to see them we need to run a completion at least twice.
Testing with gemini-2.5-flash until gemini-3-pro-preview is more reliable
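As a quick reference for what the assertions below check, this is roughly how the cache-hit count can be read off a litellm response (a minimal sketch; the helper name is ours):
# Minimal sketch (helper name is ours): return Gemini's implicit-cache hit count
# from a litellm ModelResponse. Only a second, near-identical call should report
# a non-zero value.
def gemini_cached_tokens(resp: ModelResponse) -> int:
    details = resp.usage.prompt_tokens_details     # litellm's normalised usage details
    return getattr(details, "cached_tokens", 0) or 0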
# #| notest
# # TODO: flaky
# chat = AsyncChat(ms[2], cache=True, hist=hist)
# rs = await chat('hi again', stream=True, stream_options={"include_usage": True})
# async for o in rs:
#     if isinstance(o, ModelResponse): print(o.usage)
Running the same completion again:
# #| notest
# # TODO: flaky
# sleep(5) # it takes a while for cached tokens to be avail.
# chat = AsyncChat(ms[2], cache=True, hist=hist)
# rs = await chat('hi again', stream=True, stream_options={"include_usage": True})
# async for o in rs:
#     if isinstance(o, ModelResponse): print(o.usage)
# #| notest
# # TODO: flaky
# test_eq(o.usage.prompt_tokens_details.cached_tokens > 1800, True)
# #| notest
# # TODO: flaky
# hist.extend([['hi again'], 'how may i help you?'])
# chat = AsyncChat(ms[2], cache=True, hist=hist)
# rs = await chat('bye!', stream=True, stream_options={"include_usage": True})
# async for o in rs:
#     if isinstance(o, ModelResponse): print(o.usage)
# #| notest
# # TODO: flaky
# test_eq(o.usage.prompt_tokens_details.cached_tokens > 1800, True)
Let’s modify the cached content and see that partial matching works:
# #| notest
# # TODO: flaky
# c = hist[0][0]
# hist[0][0] = c[:int(len(c)*0.75)] + " Some extra text"
# hist.extend([['hi again'], 'how may i help you?'])
# chat = AsyncChat(ms[2], cache=True, hist=hist)
# rs = await chat('bye!', stream=True, stream_options={"include_usage": True})
# async for o in rs:
#     if isinstance(o, ModelResponse): print(o.usage)
# #| notest
# # TODO: flaky
# test_eq(o.usage.prompt_tokens_details.cached_tokens > 900, True)