Usage

Lisette usage and cost monitoring

import litellm, importlib, httpx
from lisette.core import Chat, AsyncChat, patch_litellm
from cachy import enable_cachy, disable_cachy
from fastcore.test import *

Lisette Usage Logger

_ = importlib.reload(litellm) # to re-run the notebook without kernel restart

# litellm._turn_on_debug()

patch_litellm()

source

Usage


def Usage(
    args:VAR_POSITIONAL, kwargs:VAR_KEYWORD
):

Initialize self. See help(type(self)) for accurate signature.

Anthropic provides web search request counts directly via usage.server_tool_use.web_search_requests, billed at $10 per 1,000 searches (pricing). Gemini returns queries in groundingMetadata.webSearchQueries—each query counts as a separate billable use—with 5,000 free prompts per month, then $14 per 1,000 search queries (coming soon) (pricing, grounding docs).

source

search_count


def search_count(
    r
):

The precomputed response cost is available in kwargs['response_cost'], according to the litellm docs:

source

LisetteUsageLogger


def LisetteUsageLogger(
    db_path
):

Args: turn_off_message_logging: bool - if True, the message logging will be turned off. Message and response will be redacted from StandardLoggingPayload. message_logging: bool - deprecated param, use turn_off_message_logging instead

Cost Utils

class PrefixDict(dict):
    """Dict whose `[]` lookup falls back to the longest stored key that is a prefix of the requested key.

    An exact match always wins. Otherwise, among the stored string keys that the
    requested key starts with, the longest (most specific) one is used, so
    registering both 'claude-sonnet-4' and 'claude-sonnet-4-5' resolves
    'claude-sonnet-4-5-20250929' to the latter regardless of insertion order.

    Only `__getitem__` is overridden here.

    Raises:
        KeyError: if there is no exact match and no stored string key prefixes `key`
            (including when `key` is not a string).
    """
    def __getitem__(self, key):
        # Exact match first: a single hash lookup instead of `in` + lookup.
        try:
            return super().__getitem__(key)
        except KeyError:
            pass
        # Prefix matching only makes sense between strings; anything else is a plain miss.
        if isinstance(key, str):
            prefixes = [k for k in self if isinstance(k, str) and key.startswith(k)]
            if prefixes:
                # Prefer the most specific (longest) prefix over insertion order.
                return super().__getitem__(max(prefixes, key=len))
        raise KeyError(key)

# Unit prices keyed by model-name prefix, so dated model ids such as
# 'claude-sonnet-4-5-20250929' resolve to the 'claude-sonnet-4-5' entry.
# Token prices are written as <price per 1e6 tokens> / 1e6 and the web search
# price as <price per 1e3 requests> / 1e3; the expressions are kept as-is so
# the resulting floats are exactly the ones the cost helpers multiply with.
model_prices = PrefixDict({
    'claude-sonnet-4-5': {
        'input_prc': 3/1e6,
        'cache_write_prc': 3.75/1e6,
        'cache_read_prc': 0.3/1e6,
        'output_prc': 15/1e6,
        'web_search_prc': 10/1e3,
    },
})

Simplified cost utils to demonstrate total cost calculation (use Usage.response_cost in prod):

@patch(as_prop=True)
def inp_cost(self:Usage):
    "Cost of the prompt tokens that were not read from cache (a `None` cache-read count is treated as 0)."
    return model_prices[self.model]['input_prc'] * (self.prompt_tokens - ifnone(self.cache_read_tokens, 0))

@patch(as_prop=True)
def cache_write_cost(self:Usage):
    "Cost of the tokens written to the cache (a `None` count is treated as 0)."
    return model_prices[self.model]['cache_write_prc'] * ifnone(self.cache_creation_tokens, 0)

@patch(as_prop=True)
def cache_read_cost(self:Usage):
    "Cost of the tokens read from the cache (a `None` count is treated as 0)."
    return model_prices[self.model]['cache_read_prc'] * ifnone(self.cache_read_tokens, 0)

@patch(as_prop=True)
def out_cost(self:Usage):
    "Cost of the completion tokens."
    return model_prices[self.model]['output_prc'] * self.completion_tokens

@patch(as_prop=True)
def web_cost(self:Usage):
    "Cost of the web search requests (a `None` count is treated as 0)."
    return model_prices[self.model]['web_search_prc'] * ifnone(self.web_search_requests, 0)

@patch(as_prop=True)
def cost(self:Usage):
    "Total cost: input + cache write + cache read + output + web search."
    # Summation order is kept fixed so the float result stays comparable with `response_cost`.
    return self.inp_cost + self.cache_write_cost + self.cache_read_cost + self.out_cost + self.web_cost

A mapping of model pricing is also available in litellm, which is used to calculate the response_cost

model_pricing = dict2obj(httpx.get(litellm.model_cost_map_url).json())

# model_pricing['claude-sonnet-4-5']

# model_pricing['gemini-3-pro-preview']

Examples

from tempfile import NamedTemporaryFile

@patch
def user_id_fn(self:LisetteUsageLogger): return 'user-123'

# A single temporary file backs the usage database for all examples below;
# it is released by the `tf.close()` call at the end of the notebook.
tf = NamedTemporaryFile(suffix='.db')
logger = LisetteUsageLogger(tf.name)
# Register the logger so litellm invokes it for every completion call.
litellm.callbacks = [logger]

slc = ','.join('id model user_id prompt_tokens completion_tokens total_tokens cached_tokens cache_creation_tokens cache_read_tokens web_search_requests response_cost'.split())

# litellm.set_verbose = True

A simple example:

chat = Chat('claude-sonnet-4-5-20250929')
r = chat("What is 2+2?")

time.sleep(3) # wait for callback db write
u = logger.usage(select=slc)[-1]; u

Usage(id=1, timestamp=UNSET, model='claude-sonnet-4-5-20250929', user_id='user-123', prompt_tokens=14, completion_tokens=11, total_tokens=25, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=0, response_cost=0.000207)

Our calculated cost matches litellm’s response_cost. In some cases it might be better to use the custom calculation, as we’ll see in the remainder of this notebook:

test_eq(u.cost, u.response_cost)

Now, let’s test with streaming:

chat = Chat('claude-sonnet-4-5')
res = chat("Count from 1 to 5", stream=True)
for o in res: pass

time.sleep(3)
u = logger.usage(select=slc)[-1]; u

Usage(id=2, timestamp=UNSET, model='claude-sonnet-4-5', user_id='user-123', prompt_tokens=15, completion_tokens=17, total_tokens=32, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=0, response_cost=0.00030000000000000003)

test_eq(u.cost, u.response_cost)

Streaming logged successfully. Let’s also verify async chat calls are logged properly.

chat_async = AsyncChat('claude-sonnet-4-5-20250929')
await chat_async("What is 3+3?")

3 + 3 = 6

id: chatcmpl-xxx
model: claude-sonnet-4-5-20250929
finish_reason: stop
usage: Usage(completion_tokens=13, prompt_tokens=14, total_tokens=27, completion_tokens_details=None, prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=None, cached_tokens=0, text_tokens=None, image_tokens=None, cache_creation_tokens=0, cache_creation_token_details=CacheCreationTokenDetails(ephemeral_5m_input_tokens=0, ephemeral_1h_input_tokens=0)), cache_creation_input_tokens=0, cache_read_input_tokens=0)

time.sleep(3)
u = logger.usage(select=slc)[-1]; u

Usage(id=3, timestamp=UNSET, model='claude-sonnet-4-5-20250929', user_id='user-123', prompt_tokens=14, completion_tokens=13, total_tokens=27, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=0, response_cost=0.00023700000000000001)

test_eq(u.cost, u.response_cost)

Finally, let’s test async streaming to ensure all API patterns are covered.

res = await chat_async("Count from 10 to 15", stream=True)
async for o in res: pass
print(o)

ModelResponse(id='chatcmpl-xxx', created=1000000000, model='claude-sonnet-4-5-20250929', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='stop', index=0, message=Message(content='10, 11, 12, 13, 14, 15', role='assistant', tool_calls=None, function_call=None, provider_specific_fields=None))], usage=Usage(completion_tokens=20, prompt_tokens=38, total_tokens=58, completion_tokens_details=CompletionTokensDetailsWrapper(accepted_prediction_tokens=None, audio_tokens=None, reasoning_tokens=0, rejected_prediction_tokens=None, text_tokens=None, image_tokens=None), prompt_tokens_details=None))

time.sleep(3)
u = logger.usage(select=slc)[-1]; u

Usage(id=4, timestamp=UNSET, model='claude-sonnet-4-5-20250929', user_id='user-123', prompt_tokens=38, completion_tokens=20, total_tokens=58, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=0, response_cost=0.00041400000000000003)

test_eq(u.cost, u.response_cost)

Search

Now let’s run a prompt with web search:

flash = 'gemini/gemini-3-flash-preview'
chat = Chat(flash)
chat("What is the weather like in NYC? Search web.", search="m")

New York City is currently experiencing a severe winter weather event. As of Friday, February 6, 2026, the city is under a Weather Alert issued by NYC Emergency Management due to dangerously cold temperatures, strong winds, and snow.

Current Conditions & Immediate Forecast

Temperature: Currently around 31°F (-1°C), but temperatures are expected to drop steadily throughout the day.
Snow: Light snow and flurries are expected this afternoon and evening, with accumulations of 0.5 to 1 inch likely by Saturday morning.
Wind: An arctic cold front is moving in, which will bring a rapid increase in wind speeds tonight.

Extreme Cold Warning (Saturday – Sunday)

The National Weather Service has issued an Extreme Cold Warning effective from 10:00 AM Saturday, Feb 7, through 1:00 PM Sunday, Feb 8. * Wind Chills: Dangerously cold wind chills could reach as low as -20°F (-29°C). At these temperatures, frostbite can occur on exposed skin in as little as 5 minutes. * Winds: Sustained winds of 20–30 mph are expected Saturday, with gusts up to 50 mph, potentially causing blowing snow and hazardous travel conditions. * Low Temperatures: Saturday night is forecast to reach a low of approximately 5°F (-15°C).

City Response & Safety

Warming Centers: The city has opened warming centers in all five boroughs. You can find locations by calling 311 or visiting the NYC Warming Centers website.
Code Blue: An “Enhanced Code Blue” is in effect to protect vulnerable populations and those experiencing homelessness.
Travel: Officials urge New Yorkers to stay indoors if possible. If you must go out, dress in multiple warm layers and cover all exposed skin.

id: chatcmpl-xxx
model: gemini-3-flash-preview
finish_reason: stop
usage: Usage(completion_tokens=438, prompt_tokens=12, total_tokens=450, completion_tokens_details=None, prompt_tokens_details=PromptTokensDetailsWrapper(audio_tokens=None, cached_tokens=None, text_tokens=12, image_tokens=None))

time.sleep(3)
u = logger.usage(select=slc)[-1]; u

Usage(id=5, timestamp=UNSET, model='gemini-3-flash-preview', user_id='user-123', prompt_tokens=12, completion_tokens=438, total_tokens=450, cached_tokens=None, cache_creation_tokens=None, cache_read_tokens=None, web_search_requests=3, response_cost=0.00132)

assert u.web_search_requests

chat = Chat('claude-sonnet-4-5-20250929')
r = chat("What is the weather like in NYC? Search web.", search="m")

time.sleep(3)
u = logger.usage(select=slc)[-1]; u

Usage(id=6, timestamp=UNSET, model='claude-sonnet-4-5-20250929', user_id='user-123', prompt_tokens=10303, completion_tokens=303, total_tokens=10606, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=1, response_cost=0.035454)

assert u.web_search_requests

Important

Litellm’s response_cost doesn’t take web search request cost into account!

This is a case where the custom calculation is the better choice, because it also includes the web search request cost:

test_eq(u.cost, u.response_cost + u.web_search_requests * model_prices[u.model]['web_search_prc'])

Search with streaming

Web search with streaming:

Important

Gemini web search requests are part of prompt_tokens_details which is only included with stream_options={"include_usage": True} when stream=True.

There is currently a bug with Gemini web search request counts (see the Issue and PR). The fix is waiting for the litellm 1.80.11 PyPI release.

chat = Chat(flash)
res = chat("What is the weather like in NYC? Search web.", search="m", stream=True, stream_options={"include_usage": True})
for o in res: pass
# print(o)

time.sleep(3)
u = logger.usage(select=slc)[-1]; u

Usage(id=7, timestamp=UNSET, model='gemini-3-flash-preview', user_id='user-123', prompt_tokens=12, completion_tokens=446, total_tokens=458, cached_tokens=None, cache_creation_tokens=None, cache_read_tokens=None, web_search_requests=2, response_cost=0.071344)

Important

Anthropic web search requests are available in usage.server_tool_use

chat = Chat('claude-sonnet-4-5')
res = chat("What is the weather like in NYC now? Search web.", search="m", stream=True, stream_options={"include_usage": True})
for o in res: pass
# print(o)

time.sleep(3)
u = logger.usage(select=slc)[-1]; u

Usage(id=8, timestamp=UNSET, model='claude-sonnet-4-5', user_id='user-123', prompt_tokens=10305, completion_tokens=269, total_tokens=10574, cached_tokens=0, cache_creation_tokens=0, cache_read_tokens=0, web_search_requests=1, response_cost=0.03495)

test_eq(u.cost, u.response_cost + u.web_search_requests * model_prices[u.model]['web_search_prc'])

test_eq(len(logger.usage()), 8)

source

Usage.total_cost


def total_cost(
    sc:float=0.01
):

L(logger.usage()).attrgot('response_cost').sum()

0.14422600000000002

disable_cachy()

A simple Gemini example (requires min tokens and running twice to see cached_tokens):

# #| notest
# chat = Chat('gemini/gemini-2.5-flash')
# chat("What is 2+2?"* 500)
# time.sleep(5)
# chat("What is 2+2?"* 500)

# #| notest
# time.sleep(3) # wait for callback db write
# u = logger.usage(select=slc)[-1];u

# #| notest
# test_eq(len(logger.usage()), 10)
# test_eq(logger.usage()[-1].cached_tokens > 3000, True)

tf.close()