Client Examples

Your client is an async function with this signature:

async def client(model, messages, temp, max_tokens) -> tuple[str, int, int]:
    """
    Args:
        model: Model identifier string
        messages: List of {"role": str, "content": str} dicts
        temp: Sampling temperature
        max_tokens: Maximum response tokens

    Returns:
        Tuple of (response_text, input_tokens, output_tokens)
    """

OpenAI

from openai import AsyncOpenAI

client = AsyncOpenAI()

async def openai_client(model, messages, temp, max_tokens):
    resp = await client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temp,
        max_tokens=max_tokens,
    )
    return (
        resp.choices[0].message.content,
        resp.usage.prompt_tokens,
        resp.usage.completion_tokens,
    )

OpenAI + Anthropic (Multi-Provider)

Anthropic exposes an OpenAI-compatible endpoint, so you can use two AsyncOpenAI instances with different base URLs:

import os
from openai import AsyncOpenAI

openai_client = AsyncOpenAI()
anthropic_client = AsyncOpenAI(
    base_url="https://api.anthropic.com/v1/",
    api_key=os.environ["ANTHROPIC_API_KEY"],
)

async def multi_provider_client(model, messages, temp, max_tokens):
    client = anthropic_client if model.startswith("claude") else openai_client
    # GPT-5: max_completion_tokens, no temperature, minimal reasoning
    is_gpt5 = model.startswith("gpt-5")
    params = {"model": model, "messages": messages}
    if is_gpt5:
        params.update(max_completion_tokens=max_tokens, reasoning_effort="minimal")
    else:
        params.update(max_tokens=max_tokens, temperature=temp)
    resp = await client.chat.completions.create(**params)
    return (
        resp.choices[0].message.content,
        resp.usage.prompt_tokens,
        resp.usage.completion_tokens,
    )

Example pipeline mixing providers:

from mixture_llm import Propose, Aggregate

pipeline = [
    Propose(["gpt-5-nano-2025-08-07", "claude-sonnet-4-5", "gpt-5-nano-2025-08-07"]),
    Aggregate("claude-sonnet-4-5"),
]
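
To execute it, pass the pipeline, the query, and the client to run, as in the error-handling example below. A sketch, assuming run is importable from the package top level like Propose and Aggregate:

import asyncio

from mixture_llm import run  # assumed top-level import

async def main():
    answer, history = await run(
        pipeline,
        "Summarize the trade-offs between REST and gRPC.",
        multi_provider_client,
    )
    print(answer)

asyncio.run(main())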

OpenRouter (All Models via One API)

OpenRouter provides access to hundreds of models through a single API:

import os
from openai import AsyncOpenAI

client = AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENROUTER_API_KEY"],
)

async def openrouter_client(model, messages, temp, max_tokens):
    resp = await client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temp,
        max_tokens=max_tokens,
    )
    return (
        resp.choices[0].message.content,
        resp.usage.prompt_tokens,
        resp.usage.completion_tokens,
    )

Together MoA models via OpenRouter:

PROPOSERS = [
    "qwen/qwen-2.5-72b-instruct",
    "meta-llama/llama-3.3-70b-instruct",
    "mistralai/mixtral-8x22b-instruct",
]

together_moa_openrouter = [
    Propose(PROPOSERS, temp=0.7, max_tokens=512),
    Synthesize(PROPOSERS, temp=0.7, max_tokens=512),
    Aggregate("qwen/qwen-2.5-72b-instruct", max_tokens=1024),
]

Groq via LiteLLM (Free Tier)

Groq offers free access to several models with blazing fast inference:

from litellm import acompletion

async def groq_client(model, messages, temp, max_tokens):
    resp = await acompletion(
        model=f"groq/{model}",
        messages=messages,
        temperature=temp,
        max_tokens=max_tokens,
    )
    return (
        resp.choices[0].message.content,
        resp.usage.prompt_tokens,
        resp.usage.completion_tokens,
    )
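
LiteLLM picks up your Groq key from the GROQ_API_KEY environment variable.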

Free Groq models (check console.groq.com/docs/rate-limits for current list):

GROQ_FREE = [
    "llama-3.3-70b-versatile",
    "llama-3.1-8b-instant",
    "qwen/qwen3-32b",
    "meta-llama/llama-4-scout-17b-16e-instruct",
]

free_moa = [
    Propose(GROQ_FREE, temp=0.7, max_tokens=512),
    Aggregate("llama-3.3-70b-versatile"),
]

Self-MoA with Groq:

free_self_moa = [
    Propose(["llama-3.3-70b-versatile"] * 4, temp=0.7),
    Aggregate("llama-3.3-70b-versatile"),
]

Rate limits

The Groq free tier has rate limits (30 RPM for most models). For production workloads, consider paid tiers or other providers.
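
One way to stay under the cap is to throttle the client itself. A minimal sketch (the throttled_groq_client wrapper is hypothetical; it assumes the groq_client defined above and a 30 RPM limit) that enforces a minimum interval between calls, even when a Propose step fires several requests at once:

import asyncio
import time

MIN_INTERVAL = 60 / 30   # seconds between requests for a 30 RPM cap; adjust to your tier
_lock = asyncio.Lock()
_last_call = 0.0

async def throttled_groq_client(model, messages, temp, max_tokens):
    # Space out calls before delegating to groq_client, so concurrent
    # proposers don't burst past the per-minute limit.
    global _last_call
    async with _lock:
        wait = MIN_INTERVAL - (time.monotonic() - _last_call)
        if wait > 0:
            await asyncio.sleep(wait)
        _last_call = time.monotonic()
    return await groq_client(model, messages, temp, max_tokens)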

Together AI

import os
from openai import AsyncOpenAI

client = AsyncOpenAI(
    base_url="https://api.together.xyz/v1",
    api_key=os.environ["TOGETHER_API_KEY"],
)

async def together_client(model, messages, temp, max_tokens):
    resp = await client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temp,
        max_tokens=max_tokens,
    )
    return (
        resp.choices[0].message.content,
        resp.usage.prompt_tokens,
        resp.usage.completion_tokens,
    )

Error Handling

The library handles errors gracefully—failed LLM calls are excluded from results:

async def robust_client(model, messages, temp, max_tokens):
    try:
        # Your API call
        ...
    except Exception:
        raise  # Library catches this and records the error for that call

# Errors appear in history
result, history = await run(pipeline, query, robust_client)
for step in history:
    for call in step["llm_calls"]:
        if "error" in call:
            print(f"{call['model']} failed: {call['error']}")

Custom Retry Logic

Add retries for rate limits:

import asyncio
from openai import AsyncOpenAI, RateLimitError

client = AsyncOpenAI()

async def retry_client(model, messages, temp, max_tokens):
    for delay in [1, 2, 4, 8]:
        try:
            resp = await client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temp,
                max_tokens=max_tokens,
            )
            return (
                resp.choices[0].message.content,
                resp.usage.prompt_tokens,
                resp.usage.completion_tokens,
            )
        except RateLimitError:
            await asyncio.sleep(delay)
    raise Exception(f"Rate limited after retries: {model}")