import litellm, importlib, httpx, time
from fastcore.utils import *  # patch, ifnone, dict2obj, L
from lisette.core import Chat, AsyncChat, patch_litellm
from cachy import enable_cachy, disable_cachy
from fastcore.test import *

Usage
Lisette Usage Logger
_ = importlib.reload(litellm) # to re-run the notebook without kernel restart
# litellm._turn_on_debug()
patch_litellm()

Usage
Usage ()
Initialize self. See help(type(self)) for accurate signature.
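The signature above says little about the fields, so here is a sketch of the shape this notebook relies on, inferred from the slc column list used in the examples below (names and types are guesses, not the library's definition):

from dataclasses import dataclass

@dataclass
class UsageRowSketch:  # hypothetical stand-in for lisette's Usage, inferred from the slc columns
    id: int|None = None
    model: str = ''
    user_id: str|None = None
    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0
    cached_tokens: int = 0
    cache_creation_tokens: int = 0
    cache_read_tokens: int = 0
    web_search_requests: int|None = None
    response_cost: float = 0.0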
Anthropic provides web search request counts directly via usage.server_tool_use.web_search_requests, billed at $10 per 1,000 searches (pricing). Gemini returns queries in groundingMetadata.webSearchQueries—each query counts as a separate billable use—with 5,000 free prompts per month, then $14 per 1,000 search queries (coming soon) (pricing, grounding docs).
search_count
search_count (r)
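To make the provider differences concrete, here is a rough sketch of the extraction logic they imply. The field locations are assumptions taken from the notes above, not lisette's actual search_count implementation:

def search_count_sketch(r):
    "Best-effort web search count; where the fields live is assumed from the provider notes above."
    u = getattr(r, 'usage', None)
    # Anthropic exposes a direct counter: usage.server_tool_use.web_search_requests
    stu = getattr(u, 'server_tool_use', None) if u is not None else None
    n = getattr(stu, 'web_search_requests', None) if stu is not None else None
    if n is not None: return n
    # Gemini reports the queries themselves; each entry in webSearchQueries is one billable search.
    # Exactly where groundingMetadata surfaces on a litellm response varies, so treat r as a dict here.
    gm = (r if isinstance(r, dict) else {}).get('groundingMetadata', {})
    return len(gm.get('webSearchQueries') or []) or None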
According to the litellm docs, the precomputed response cost is available in kwargs['response_cost']:
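For example, any litellm custom logger can pick the value up in its success hooks. A minimal sketch (this is not LisetteUsageLogger, just an illustration):

from litellm.integrations.custom_logger import CustomLogger

class CostPrinter(CustomLogger):
    # litellm passes the precomputed cost in kwargs['response_cost'] on success
    def log_success_event(self, kwargs, response_obj, start_time, end_time):
        print('response_cost:', kwargs.get('response_cost'))

    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
        print('response_cost:', kwargs.get('response_cost'))

Registering it works the same way as for the real logger below: litellm.callbacks = [CostPrinter()].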
LisetteUsageLogger
LisetteUsageLogger (db_path)
Args:
    turn_off_message_logging: bool - if True, message logging will be turned off; message and response will be redacted from StandardLoggingPayload.
    message_logging: bool - deprecated param, use turn_off_message_logging instead
Cost Utils
class PrefixDict(dict):  # falls back to any stored key that is a prefix of the lookup key
def __getitem__(self, key):
if key in self.keys(): return super().__getitem__(key)
for k in self.keys():
if key.startswith(k): return super().__getitem__(k)
        raise KeyError(key)

model_prices = PrefixDict({
    'claude-sonnet-4-5': dict(input_prc = 3/1e6, cache_write_prc = 3.75/1e6, cache_read_prc = 0.3/1e6, output_prc = 15/1e6, web_search_prc = 10/1e3)
})
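The prefix lookup means a dated model id resolves to its base pricing entry:

# 'claude-sonnet-4-5-20250929' starts with 'claude-sonnet-4-5', so the lookup falls back to that entry
test_eq(model_prices['claude-sonnet-4-5-20250929']['input_prc'], 3/1e6)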
Simplified cost utils to demonstrate total cost calculation (use Usage.response_cost in prod):
@patch(as_prop=True)
def inp_cost(self:Usage): return model_prices[self.model]['input_prc'] * (self.prompt_tokens - self.cache_read_tokens)
@patch(as_prop=True)
def cache_write_cost(self:Usage): return model_prices[self.model]['cache_write_prc'] * self.cache_creation_tokens
@patch(as_prop=True)
def cache_read_cost(self:Usage): return model_prices[self.model]['cache_read_prc'] * self.cache_read_tokens
@patch(as_prop=True)
def out_cost(self:Usage): return model_prices[self.model]['output_prc'] * self.completion_tokens
@patch(as_prop=True)
def web_cost(self:Usage): return model_prices[self.model]['web_search_prc'] * ifnone(self.web_search_requests, 0)
@patch(as_prop=True)
def cost(self:Usage): return self.inp_cost + self.cache_write_cost + self.cache_read_cost + self.out_cost + self.web_cost
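For intuition, here is the same formula applied to a hypothetical request; the token counts below are made up, not from a real response:

# Hypothetical token counts for one claude-sonnet-4-5 request with a single web search
prompt_tokens, cache_read, cache_write, completion, searches = 1000, 800, 0, 200, 1
p = model_prices['claude-sonnet-4-5']
cost = (p['input_prc']        * (prompt_tokens - cache_read)  # 200 uncached input tokens -> $0.00060
        + p['cache_write_prc'] * cache_write                  # no cache writes           -> $0
        + p['cache_read_prc']  * cache_read                   # 800 cached tokens read    -> $0.00024
        + p['output_prc']      * completion                   # 200 output tokens         -> $0.00300
        + p['web_search_prc']  * searches)                    # 1 web search              -> $0.01000
cost  # ~0.01384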
A mapping of model pricing is also available in litellm, which is used to calculate the response_cost:

model_pricing = dict2obj(httpx.get(litellm.model_cost_map_url).json())
# model_pricing['claude-sonnet-4-5']
# model_pricing['gemini-3-pro-preview']

Examples
from tempfile import NamedTemporaryFile

@patch
def user_id_fn(self:LisetteUsageLogger): return 'user-123'  # supplies the user_id stored with each logged row

tf = NamedTemporaryFile(suffix='.db')
logger = LisetteUsageLogger(tf.name)
litellm.callbacks = [logger]

slc = ','.join('id model user_id prompt_tokens completion_tokens total_tokens cached_tokens cache_creation_tokens cache_read_tokens web_search_requests response_cost'.split())
# litellm.set_verbose = True

A simple example:
chat = Chat('claude-sonnet-4-5-20250929')
r = chat("What is 2+2?")
time.sleep(3) # wait for callback db write
u = logger.usage(select=slc)[-1]; u

Our calculated cost matches litellm's response_cost. In some cases it is better to use the custom calculation, as we'll see in the remainder of this notebook:
test_eq(u.cost, u.response_cost)

Now, let's test with streaming:
chat = Chat('claude-sonnet-4-5')
res = chat("Count from 1 to 5", stream=True)
for o in res: pass
time.sleep(3)
u = logger.usage(select=slc)[-1]; u
test_eq(u.cost, u.response_cost)

Streaming logged successfully. Let's also verify async chat calls are logged properly.
chat_async = AsyncChat('claude-sonnet-4-5-20250929')
await chat_async("What is 3+3?")
time.sleep(3)
u = logger.usage(select=slc)[-1]; u
test_eq(u.cost, u.response_cost)

Finally, let's test async streaming to ensure all API patterns are covered.
res = await chat_async("Count from 10 to 15", stream=True)
async for o in res: pass
print(o)
time.sleep(3)
u = logger.usage(select=slc)[-1]; u
test_eq(u.cost, u.response_cost)

Search
Now let’s run a prompt with web search:
chat = Chat('gemini/gemini-2.5-flash')
chat("What is the weather like in NYC? Search web.", search="m")
time.sleep(3)
u = logger.usage(select=slc)[-1]; u
test_eq(u.web_search_requests, 1)

chat = Chat('claude-sonnet-4-5-20250929')
r = chat("What is the weather like in NYC? Search web.", search="m")
time.sleep(3)
u = logger.usage(select=slc)[-1]; u
test_eq(u.web_search_requests, 1)

Litellm's response_cost doesn't take web search request cost into account!
This is a case where the custom calculation is better, since it also includes the web search request cost:
test_eq(u.cost, u.response_cost + u.web_search_requests * model_prices[u.model]['web_search_prc'])

Search with streaming
Web search with streaming:
chat = Chat('gemini/gemini-2.5-flash')
res = chat("What is the weather like in NYC? Search web.", search="m", stream=True, stream_options={"include_usage": True})
for o in res: pass
# print(o)
time.sleep(3)
u = logger.usage(select=slc)[-1]; u

Anthropic web search requests are available in usage.server_tool_use:
chat = Chat('claude-sonnet-4-5')
res = chat("What is the weather like in NYC now? Search web.", search="m", stream=True, stream_options={"include_usage": True})
for o in res: pass
# print(o)
time.sleep(3)
u = logger.usage(select=slc)[-1]; u
test_eq(u.cost, u.response_cost + u.web_search_requests * model_prices[u.model]['web_search_prc'])
test_eq(len(logger.usage()), 8)

Usage.total_cost
Usage.total_cost (sc=0.01)
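There is no docstring above, but given that response_cost omits web search charges (shown earlier) and that $10 per 1,000 searches works out to $0.01 each, sc looks like a per-search surcharge. A guess at the behaviour, not the library's implementation:

def total_cost_sketch(u, sc=0.01):
    # Assumed: litellm's precomputed cost plus sc dollars per web search request
    return u.response_cost + sc * ifnone(u.web_search_requests, 0)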
L(logger.usage()).attrgot('response_cost').sum()

disable_cachy()

A simple Gemini example (it requires a minimum number of prompt tokens and a second run before cached_tokens shows up):
# #| notest
# chat = Chat('gemini/gemini-2.5-flash')
# chat("What is 2+2?"* 500)
# time.sleep(5)
# chat("What is 2+2?"* 500)

# #| notest
# time.sleep(3) # wait for callback db write
# u = logger.usage(select=slc)[-1];u

# #| notest
# test_eq(len(logger.usage()), 10)
# test_eq(logger.usage()[-1].cached_tokens > 3000, True)

tf.close()