Text Generation

Generate text responses using on-device language models with streaming or one-shot methods.

Creating a Session

Start by creating a KuzcoSession with your desired model:

import Kuzco

// Basic session using the library defaults.
let session = try await KuzcoSession(model: .qwen3_4b)

// Session with custom sampling configuration.
let config = KuzcoConfiguration(
    temperature: 0.7,
    maxTokens: 2048,
    topP: 0.9
)
// Use a distinct name: redeclaring `session` with `let` in the same scope
// would not compile if both examples are pasted into one file.
let configuredSession = try await KuzcoSession(model: .qwen3_8b, configuration: config)

Streaming Responses

Streaming is ideal for chat interfaces, providing real-time text as it's generated:

StreamingChat.swift
import SwiftUI
import Kuzco
/// Chat screen that streams model output into a live assistant bubble.
struct ChatView: View {
    /// Completed conversation turns shown in the transcript.
    @State private var messages: [ChatMessage] = []
    /// Partial assistant text accumulated while a response streams in.
    @State private var currentResponse = ""
    /// Disables input while a generation is in flight.
    @State private var isStreaming = false
    /// Created when the view appears; nil until the model finishes loading.
    @State private var session: KuzcoSession?

    var body: some View {
        VStack {
            ScrollView {
                ForEach(messages) { message in
                    MessageBubble(message: message)
                }
                // Render the in-progress response as a live assistant bubble.
                if !currentResponse.isEmpty {
                    MessageBubble(
                        message: ChatMessage(role: .assistant, content: currentResponse)
                    )
                }
            }
            ChatInput { text in
                sendMessage(text)
            }
            .disabled(isStreaming)
        }
        .task {
            // NOTE(review): `try?` silently drops load failures; surface the
            // error to the user in production code.
            session = try? await KuzcoSession(model: .qwen3_4b)
        }
    }

    /// Appends the user's message, streams the reply, and records the outcome
    /// (success or error) in the permanent transcript.
    func sendMessage(_ text: String) {
        guard let session else { return }
        messages.append(ChatMessage(role: .user, content: text))
        isStreaming = true
        currentResponse = ""
        Task {
            do {
                for try await partial in session.streamResponse(to: text) {
                    currentResponse += partial.text
                }
                // Move the completed response into the transcript.
                messages.append(ChatMessage(role: .assistant, content: currentResponse))
            } catch {
                // Record the failure in the transcript instead of leaving the
                // error text stranded in the transient streaming bubble.
                messages.append(
                    ChatMessage(role: .assistant, content: "Error: \(error.localizedDescription)")
                )
            }
            // Always clear transient state, on success and on failure alike.
            currentResponse = ""
            isStreaming = false
        }
    }
}

Stream with Token Usage

Track token usage during streaming for analytics or rate limiting:

for try await partial in session.streamResponse(to: prompt) {
    // Emit each chunk without a trailing newline.
    print(partial.text, terminator: "")

    // Every partial carries cumulative token accounting when available.
    if let usage = partial.tokenUsage {
        print("Tokens used: \(usage.totalTokens)")
    }

    // The last partial in the stream is flagged as complete.
    if partial.isComplete {
        print("\nGeneration finished!")
    }
}

One-Shot Responses

For simpler use cases, get the complete response at once:

// Simple one-shot request: the full response arrives at once.
let response = try await session.oneShot("What is the speed of light?")
print(response.text)

// Inspect token accounting on the completed response.
let usage = response.tokenUsage
print("Prompt tokens: \(usage?.promptTokens ?? 0)")
print("Completion tokens: \(usage?.completionTokens ?? 0)")

System Prompts

Customize the model's behavior with a system prompt:

// A session whose persona is fixed by a system prompt.
let coachSession = try await KuzcoSession(model: .qwen3_4b)

coachSession.setSystemPrompt("""
You are a helpful fitness coach. Provide exercise recommendations
based on the user's goals. Always emphasize proper form and safety.
Include warm-up suggestions when relevant.
""")

// Every subsequent response is shaped by the persona set above.
let response = try await coachSession.oneShot("I want to build upper body strength")
print(response.text)

Conversation History

Sessions maintain conversation history automatically:

let session = try await KuzcoSession(model: .qwen3_4b)

// The session threads prior turns into each new request automatically.
let firstReply = try await session.oneShot("My name is Alex")
print(firstReply.text) // "Nice to meet you, Alex!"

// The follow-up resolves against the remembered context.
let followUp = try await session.oneShot("What's my name?")
print(followUp.text) // "Your name is Alex."

// Walk the stored transcript.
for entry in session.conversationHistory {
    print("\(entry.role): \(entry.content)")
}

// Drop all prior turns to begin a fresh conversation.
session.clearHistory()

Advanced: Manual Messages

For more control, add messages manually to the conversation:

let session = try await KuzcoSession(model: .qwen3_4b)

// Seed the transcript by hand instead of calling oneShot/streamResponse.
session.addMessage(role: .system, content: "You are a pirate captain.")
session.addMessage(role: .user, content: "Where should we sail?")
session.addMessage(role: .assistant, content: "Arrr! To the Caribbean!")
session.addMessage(role: .user, content: "What treasure should we seek?")

// Ask the model to continue from the seeded context.
let response = try await session.generateResponse()
print(response.text) // Response in pirate voice with context

Cancellation

Cancel ongoing generation using Swift's task cancellation:

// Store the task reference
// Keep a handle to the in-flight generation so it can be cancelled.
var generationTask: Task<Void, Never>?

/// Starts streaming a response, cancelling any generation already running
/// so two tasks never append to `response` concurrently.
func startGeneration() {
    generationTask?.cancel()
    generationTask = Task {
        do {
            for try await partial in session.streamResponse(to: prompt) {
                // Cooperative cancellation: bail out between chunks.
                try Task.checkCancellation()
                response += partial.text
            }
        } catch is CancellationError {
            print("Generation cancelled")
        } catch {
            print("Error: \(error)")
        }
    }
}

/// Cancels the current generation, if any.
func cancelGeneration() {
    generationTask?.cancel()
}

Best Practices

Reuse Sessions

Create one session and reuse it for multiple requests. Creating new sessions has initialization overhead.

Use Streaming for Chat

Streaming provides better UX for conversational interfaces by showing text as it's generated.

Handle Errors Gracefully

Always wrap Kuzco calls in do-catch blocks and provide user-friendly error messages.

Clear History Appropriately

Long conversations consume the model's limited context window. Clear history when starting a new topic or when approaching the context limit.