Vision AI

Analyze images, answer visual questions, and extract information from images using on-device vision-language models.

Available Vision Models

Qwen3 VL (.qwen3VL)

Full-featured vision-language model with excellent image understanding and detailed descriptions. ~4 GB, best quality.

SmolVLM (.smolVLM)

Compact vision model for efficient image analysis on memory-constrained devices. ~2 GB, lower memory use.
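
Both models plug into the same KuzcoSession initializer, so the choice can be made at runtime. A minimal sketch that prefers the larger model only when the device reports plenty of physical memory (the 6 GB threshold is an illustrative assumption, not a Kuzco recommendation):

import Foundation
import Kuzco

// Prefer Qwen3 VL on devices with ample RAM, fall back to SmolVLM elsewhere.
let hasAmpleMemory = ProcessInfo.processInfo.physicalMemory >= 6 * 1_024 * 1_024 * 1_024
let session = try await KuzcoSession(model: hasAmpleMemory ? .qwen3VL : .smolVLM)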

Basic Image Analysis

Analyze an image and get a description:

ImageAnalysis.swift
import SwiftUI
import Kuzco
import PhotosUI

struct ImageAnalysisView: View {
    @State private var selectedImage: UIImage?
    @State private var analysis = ""
    @State private var isAnalyzing = false
    @State private var photoItem: PhotosPickerItem?

    var body: some View {
        VStack(spacing: 20) {
            if let image = selectedImage {
                Image(uiImage: image)
                    .resizable()
                    .scaledToFit()
                    .frame(maxHeight: 300)
                    .cornerRadius(12)
            }

            PhotosPicker(selection: $photoItem, matching: .images) {
                Label("Select Photo", systemImage: "photo")
            }
            .onChange(of: photoItem) { _, item in
                Task {
                    if let data = try? await item?.loadTransferable(type: Data.self),
                       let image = UIImage(data: data) {
                        selectedImage = image
                        analyzeImage(image)
                    }
                }
            }

            if isAnalyzing {
                ProgressView("Analyzing...")
            } else if !analysis.isEmpty {
                Text(analysis)
                    .padding()
                    .background(Color.secondary.opacity(0.1))
                    .cornerRadius(8)
            }
        }
        .padding()
    }

    func analyzeImage(_ image: UIImage) {
        isAnalyzing = true
        analysis = ""

        Task {
            do {
                let session = try await KuzcoSession(model: .qwen3VL)
                let response = try await session.analyzeImage(
                    image,
                    prompt: "Describe this image in detail."
                )
                analysis = response.text
            } catch {
                analysis = "Error: \(error.localizedDescription)"
            }
            isAnalyzing = false
        }
    }
}

Visual Question Answering

Ask specific questions about image content:

let session = try await KuzcoSession(model: .qwen3VL)

// Ask about image content
let response = try await session.analyzeImage(
    photo,
    prompt: "What breed of dog is in this image?"
)
print(response.text)

// Count objects
let countResponse = try await session.analyzeImage(
    photo,
    prompt: "How many people are in this photo?"
)

// Read text from images
let ocrResponse = try await session.analyzeImage(
    documentPhoto,
    prompt: "What text is visible in this image?"
)
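
To ask several questions about the same photo, reuse one session and loop over the prompts. This is a minimal sketch built only on the analyzeImage call shown above; the questions run one at a time:

// Collect answers to a list of questions about a single image
let questions = [
    "What breed of dog is in this image?",
    "How many people are in this photo?",
    "What text is visible in this image?"
]

var answers: [String: String] = [:]
for question in questions {
    let result = try await session.analyzeImage(photo, prompt: question)
    answers[question] = result.text
}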

Streaming Vision Responses

Stream the analysis for longer descriptions:

let session = try await KuzcoSession(model: .qwen3VL)

var description = ""
for try await partial in session.streamImageAnalysis(
    image,
    prompt: "Provide a detailed description of everything in this image."
) {
    description += partial.text
    // Update UI with streaming text
}
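
In a SwiftUI app the accumulated text usually lives in observable state so the view refreshes as each chunk arrives. A minimal sketch (the view model type and its property names are illustrative, not part of Kuzco):

import SwiftUI
import UIKit
import Kuzco

@MainActor
final class ImageDescriber: ObservableObject {
    // Published so a SwiftUI view re-renders as each streamed chunk arrives
    @Published var text = ""

    func describe(_ image: UIImage) async {
        text = ""
        do {
            let session = try await KuzcoSession(model: .qwen3VL)
            for try await partial in session.streamImageAnalysis(
                image,
                prompt: "Provide a detailed description of everything in this image."
            ) {
                text += partial.text
            }
        } catch {
            text = "Error: \(error.localizedDescription)"
        }
    }
}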

Multi-Turn Visual Conversations

Have follow-up conversations about an image:

let session = try await KuzcoSession(model: .qwen3VL)

// Initial image analysis
let initial = try await session.analyzeImage(
    photo,
    prompt: "What's in this image?"
)
print(initial.text) // "This is a photo of a kitchen with modern appliances..."

// Follow-up questions (image context is maintained)
let followUp = try await session.oneShot("What color is the refrigerator?")
print(followUp.text) // "The refrigerator is stainless steel..."

let another = try await session.oneShot("Is there a dishwasher visible?")
print(another.text) // "Yes, there's a dishwasher to the left of..."
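
Because the session holds the image and conversation context, reuse the same KuzcoSession instance for follow-up questions. To talk about a different image without carrying the old context along, start a fresh session (otherPhoto below is an illustrative placeholder):

// Start a new conversation about a different image
let newSession = try await KuzcoSession(model: .qwen3VL)
let fresh = try await newSession.analyzeImage(
    otherPhoto,
    prompt: "What's in this image?"
)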

Use Cases

Accessibility

Generate alt text and image descriptions for visually impaired users.

let altText = try await session.analyzeImage(
    image,
    prompt: "Write a brief, accessible alt text for this image."
)
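
In SwiftUI the result can be attached directly with the standard accessibility modifier, for example:

Image(uiImage: image)
    .resizable()
    .scaledToFit()
    .accessibilityLabel(altText)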

Document Processing

Extract information from receipts, business cards, or documents.

let receipt = try await session.analyzeImage(
    receiptPhoto,
    prompt: "Extract the total amount and date from this receipt."
)
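
For downstream processing, you can prompt for JSON and decode it with Codable. This is a sketch, not a guarantee: the model may add extra prose around the JSON, so treat decoding failures as expected and keep the raw text as a fallback. ReceiptInfo is a hypothetical type defined here for illustration.

import Foundation

// Hypothetical shape for the fields we ask the model to extract
struct ReceiptInfo: Codable {
    let total: String
    let date: String
}

let structured = try await session.analyzeImage(
    receiptPhoto,
    prompt: "Extract the total amount and date from this receipt. Respond with JSON only, using keys \"total\" and \"date\"."
)

// Decoding may fail if the model strays from pure JSON
if let data = structured.text.data(using: .utf8),
   let info = try? JSONDecoder().decode(ReceiptInfo.self, from: data) {
    print("Total: \(info.total), Date: \(info.date)")
} else {
    print(structured.text)
}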

Product Identification

Identify products, plants, landmarks, or other objects.

let plant = try await session.analyzeImage(
    plantPhoto,
    prompt: "What type of plant is this? Include care tips."
)

Content Moderation

Analyze images for appropriate content.

let check = try await session.analyzeImage(
    userPhoto,
    prompt: "Is this image appropriate for a family-friendly app? Answer yes or no with brief explanation."
)
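
Since the prompt asks for a yes/no answer, a conservative check on the response text can drive the decision (this assumes the model follows the instruction; treat anything other than a clear "yes" as a rejection):

let verdict = check.text.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
let isAppropriate = verdict.hasPrefix("yes")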

Image Preprocessing

For best results, preprocess images before analysis:

import UIKit

extension UIImage {
    func resizedForAnalysis(maxDimension: CGFloat = 1024) -> UIImage {
        let ratio = min(maxDimension / size.width, maxDimension / size.height)
        if ratio >= 1 { return self }

        let newSize = CGSize(
            width: size.width * ratio,
            height: size.height * ratio
        )

        UIGraphicsBeginImageContextWithOptions(newSize, false, 1.0)
        draw(in: CGRect(origin: .zero, size: newSize))
        let resized = UIGraphicsGetImageFromCurrentImageContext()
        UIGraphicsEndImageContext()
        return resized ?? self
    }
}

// Usage
let optimized = originalImage.resizedForAnalysis()
let response = try await session.analyzeImage(optimized, prompt: "Describe this.")
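
Downscaling up front keeps memory use and preprocessing time in check; vision-language models typically resize inputs to a fixed resolution internally, so feeding a full-resolution photo rarely improves results.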

Camera Integration

Analyze images directly from the camera:

CameraAnalysis.swift
import SwiftUI
import Kuzco

struct CameraAnalysisView: View {
    @State private var showCamera = false
    @State private var capturedImage: UIImage?
    @State private var analysis = ""

    var body: some View {
        VStack {
            if let image = capturedImage {
                Image(uiImage: image)
                    .resizable()
                    .scaledToFit()
                Text(analysis)
            }

            Button("Take Photo") {
                showCamera = true
            }
        }
        .sheet(isPresented: $showCamera) {
            // CameraPicker is your own camera wrapper (e.g. around UIImagePickerController); not shown here
            CameraPicker { image in
                capturedImage = image
                analyzeCapture(image)
            }
        }
    }

    func analyzeCapture(_ image: UIImage) {
        Task {
            do {
                let session = try await KuzcoSession(model: .smolVLM)
                let response = try await session.analyzeImage(
                    image,
                    prompt: "What am I looking at?"
                )
                analysis = response.text
            } catch {
                analysis = "Error: \(error.localizedDescription)"
            }
        }
    }
}
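
Camera capture requires an NSCameraUsageDescription entry in your app's Info.plist; without it, iOS terminates the app the first time the camera is requested.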