How to build object detection in SwiftUI
Feed live camera frames into a VNCoreMLRequest backed by a VNCoreMLModel, then map the returned
VNRecognizedObjectObservation bounding boxes onto a SwiftUI overlay using GeometryReader.
// Minimal: run a CoreML object-detection model on one UIImage
import UIKit
import Vision
import CoreML

func detect(in image: UIImage) throws -> [VNRecognizedObjectObservation] {
    // Replace YOLOv3 with your model's generated class name.
    let model = try VNCoreMLModel(for: YOLOv3().model)
    let request = VNCoreMLRequest(model: model)
    request.imageCropAndScaleOption = .scaleFill
    let handler = VNImageRequestHandler(cgImage: image.cgImage!, options: [:])
    try handler.perform([request])
    return (request.results as? [VNRecognizedObjectObservation]) ?? []
}
Full implementation
The full pipeline wraps AVCaptureSession inside an @Observable class that publishes
detected objects whenever a new frame arrives. A SwiftUI view layers bounding-box rectangles over a live camera
preview using GeometryReader to convert Vision's normalised coordinates (origin bottom-left)
into view-space coordinates (origin top-left).
import SwiftUI
import AVFoundation
import Vision
import CoreML
// MARK: - Detected object model
struct DetectedObject: Identifiable {
    let id = UUID()
    let label: String
    let confidence: Float
    /// Normalised rect in Vision coords (origin bottom-left, 0-1 range)
    let boundingBox: CGRect
}
// MARK: - Camera + Vision coordinator
@Observable
final class ObjectDetector: NSObject, AVCaptureVideoDataOutputSampleBufferDelegate {
    var detections: [DetectedObject] = []

    private let session = AVCaptureSession()
    private let videoOutput = AVCaptureVideoDataOutput()
    private let visionQueue = DispatchQueue(label: "com.soarias.vision", qos: .userInteractive)
    private var visionRequest: VNCoreMLRequest?

    override init() {
        super.init()
        setupModel()
        setupCamera()
    }

    // 1. Load CoreML model
    private func setupModel() {
        do {
            // Replace YOLOv3 with your .mlmodel class name.
            let coreMLModel = try VNCoreMLModel(for: YOLOv3(configuration: MLModelConfiguration()).model)
            let request = VNCoreMLRequest(model: coreMLModel) { [weak self] req, _ in
                self?.handleResults(req.results)
            }
            request.imageCropAndScaleOption = .scaleFill
            visionRequest = request
        } catch {
            print("Model load failed: \(error)")
        }
    }
    // 2. Configure AVCaptureSession
    private func setupCamera() {
        session.sessionPreset = .hd1280x720
        guard
            let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .back),
            let input = try? AVCaptureDeviceInput(device: device),
            session.canAddInput(input)
        else { return }
        session.addInput(input)

        videoOutput.setSampleBufferDelegate(self, queue: visionQueue)
        videoOutput.alwaysDiscardsLateVideoFrames = true
        if session.canAddOutput(videoOutput) { session.addOutput(videoOutput) }

        // Match preview orientation
        if let conn = videoOutput.connection(with: .video) {
            conn.videoRotationAngle = 90
        }
    }

    func start() { Task.detached { self.session.startRunning() } }
    func stop() { session.stopRunning() }
    // 3. Receive frames and run Vision
    func captureOutput(_ output: AVCaptureOutput,
                       didOutput sampleBuffer: CMSampleBuffer,
                       from connection: AVCaptureConnection) {
        guard
            let request = visionRequest,
            let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer)
        else { return }
        let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer,
                                            orientation: .right,
                                            options: [:])
        try? handler.perform([request])
    }

    // 4. Publish results on main actor
    private func handleResults(_ results: [VNObservation]?) {
        let observations = (results as? [VNRecognizedObjectObservation]) ?? []
        let mapped = observations.compactMap { obs -> DetectedObject? in
            guard let top = obs.labels.first, top.confidence > 0.4 else { return nil }
            return DetectedObject(label: top.identifier,
                                  confidence: top.confidence,
                                  boundingBox: obs.boundingBox)
        }
        DispatchQueue.main.async { self.detections = mapped }
    }

    // Expose session for preview layer
    var captureSession: AVCaptureSession { session }
}
// MARK: - Camera preview (UIViewRepresentable)
struct CameraPreview: UIViewRepresentable {
    let session: AVCaptureSession

    func makeUIView(context: Context) -> PreviewView {
        let view = PreviewView()
        view.previewLayer.session = session
        view.previewLayer.videoGravity = .resizeAspectFill
        return view
    }

    func updateUIView(_ uiView: PreviewView, context: Context) {}

    class PreviewView: UIView {
        override class var layerClass: AnyClass { AVCaptureVideoPreviewLayer.self }
        var previewLayer: AVCaptureVideoPreviewLayer { layer as! AVCaptureVideoPreviewLayer }
    }
}
// MARK: - Bounding box overlay
struct BoundingBoxOverlay: View {
    let detections: [DetectedObject]
    let size: CGSize

    var body: some View {
        ForEach(detections) { obj in
            let rect = visionToView(obj.boundingBox, in: size)
            ZStack(alignment: .topLeading) {
                Rectangle()
                    .stroke(Color.yellow, lineWidth: 2)
                    .frame(width: rect.width, height: rect.height)
                Text("\(obj.label) \(Int(obj.confidence * 100))%")
                    .font(.caption2.bold())
                    .foregroundStyle(.black)
                    .padding(2)
                    .background(Color.yellow)
                    .offset(y: -18)
            }
            .position(x: rect.midX, y: rect.midY)
            .accessibilityLabel("\(obj.label), \(Int(obj.confidence * 100)) percent confidence")
        }
    }

    /// Flip Vision's bottom-left origin to SwiftUI's top-left origin.
    private func visionToView(_ box: CGRect, in size: CGSize) -> CGRect {
        CGRect(
            x: box.minX * size.width,
            y: (1 - box.maxY) * size.height,
            width: box.width * size.width,
            height: box.height * size.height
        )
    }
}
// MARK: - Root view
struct ObjectDetectionView: View {
    @State private var detector = ObjectDetector()

    var body: some View {
        GeometryReader { geo in
            ZStack {
                CameraPreview(session: detector.captureSession)
                    .ignoresSafeArea()
                BoundingBoxOverlay(detections: detector.detections, size: geo.size)
            }
        }
        .ignoresSafeArea()
        .onAppear { detector.start() }
        .onDisappear { detector.stop() }
        .accessibilityLabel("Object detection camera view")
    }
}

#Preview {
    ObjectDetectionView()
}
How it works
- Model loading (setupModel()). A VNCoreMLModel wraps the compiled .mlmodelc bundle. Setting imageCropAndScaleOption = .scaleFill prevents letterboxing artefacts that would skew the bounding-box coordinates returned by the model.
- Camera pipeline (setupCamera()). AVCaptureVideoDataOutput calls the delegate's captureOutput(_:didOutput:from:) on visionQueue, a private serial queue, so Vision never blocks the main thread. alwaysDiscardsLateVideoFrames = true keeps latency low.
- Per-frame inference. Each CVPixelBuffer is wrapped in a VNImageRequestHandler with orientation: .right to match the sensor's natural landscape orientation rotated for portrait display. perform([request]) runs synchronously on visionQueue.
- Result filtering (handleResults). Only observations whose top label exceeds 40% confidence are kept. Results are published on DispatchQueue.main, triggering a SwiftUI re-render via the @Observable macro.
- Coordinate flip (visionToView). Vision uses a unit-space coordinate system with the origin at the bottom-left; SwiftUI's GeometryReader origin is top-left. The transform y = (1 − box.maxY) × height performs the vertical flip before scaling to view points (a worked example follows this list).
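As a quick sanity check of that flip, here is a worked example with made-up numbers (a hypothetical detection in a 390 × 844-point view), using the same arithmetic as visionToView:
import CoreGraphics

// Hypothetical Vision box: x 0.1, y 0.6, width 0.3, height 0.2 (origin bottom-left).
let box = CGRect(x: 0.1, y: 0.6, width: 0.3, height: 0.2)
let size = CGSize(width: 390, height: 844)       // the overlay size from GeometryReader

let viewRect = CGRect(
    x: box.minX * size.width,                    // 0.1 × 390 = 39
    y: (1 - box.maxY) * size.height,             // (1 − 0.8) × 844 = 168.8
    width: box.width * size.width,               // 0.3 × 390 = 117
    height: box.height * size.height             // 0.2 × 844 = 168.8
)
// The box whose top edge sits 0.2 units below the top of the frame in Vision space
// ends up 168.8 points from SwiftUI's top edge: that is the vertical flip in action.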
Variants
Detect objects in a still image (Photo Picker)
import PhotosUI

struct ImagePickerDetector: View {
    @State private var pickerItem: PhotosPickerItem?
    @State private var detections: [DetectedObject] = []
    @State private var pickedImage: UIImage?

    var body: some View {
        VStack {
            PhotosPicker("Choose photo", selection: $pickerItem, matching: .images)
                .onChange(of: pickerItem) { _, item in
                    Task {
                        guard
                            let data = try? await item?.loadTransferable(type: Data.self),
                            let ui = UIImage(data: data),
                            let cg = ui.cgImage
                        else { return }
                        pickedImage = ui

                        let model = try VNCoreMLModel(for: YOLOv3().model)
                        let req = VNCoreMLRequest(model: model)
                        req.imageCropAndScaleOption = .scaleFill
                        try VNImageRequestHandler(cgImage: cg, options: [:]).perform([req])
                        detections = ((req.results as? [VNRecognizedObjectObservation]) ?? [])
                            .compactMap { obs in
                                obs.labels.first.map {
                                    DetectedObject(label: $0.identifier,
                                                   confidence: $0.confidence,
                                                   boundingBox: obs.boundingBox)
                                }
                            }
                    }
                }

            if let img = pickedImage {
                GeometryReader { geo in
                    Image(uiImage: img).resizable().scaledToFit()
                    BoundingBoxOverlay(detections: detections, size: geo.size)
                }
            }
        }
        .padding()
    }
}
Using the built-in object saliency model (no custom .mlmodel)
If you only need to know where objects are without classification labels, skip the CoreML model entirely.
Use VNGenerateObjectnessBasedSaliencyImageRequest — it ships in the OS, requires zero download,
and returns VNSaliencyImageObservation with salientObjects bounding boxes. Swap the
VNCoreMLRequest for it and read obs.salientObjects instead of
obs.labels. No model file, no bundle size hit.
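A minimal sketch of that swap, reusing the DetectedObject type from the full implementation and a placeholder "object" label (saliency gives locations, not class names):
import Vision

// Sketch: model-free detection via the built-in objectness saliency request.
func detectSalientObjects(in pixelBuffer: CVPixelBuffer) -> [DetectedObject] {
    let request = VNGenerateObjectnessBasedSaliencyImageRequest()
    let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, orientation: .right, options: [:])
    try? handler.perform([request])

    guard let salient = request.results?.first?.salientObjects else { return [] }
    return salient.map {
        // No classification labels here, so use a generic placeholder.
        DetectedObject(label: "object", confidence: $0.confidence, boundingBox: $0.boundingBox)
    }
}
The salientObjects rectangles use the same normalised, bottom-left-origin coordinates as VNRecognizedObjectObservation, so the existing BoundingBoxOverlay works unchanged.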
Common pitfalls
- iOS version / privacy manifest. Camera access requires NSCameraUsageDescription in Info.plist. On iOS 17+ you also need a Privacy Manifest (PrivacyInfo.xcprivacy) if you distribute on the App Store; missing it can cause App Store rejection. (A minimal authorization-check sketch follows this list.)
- Coordinate system mismatch. The most common bug is forgetting that Vision's boundingBox origin is bottom-left. Failing to apply the y = (1 − box.maxY) flip produces boxes that appear mirrored vertically.
- Main-thread Vision calls. Calling VNImageRequestHandler.perform on the main thread blocks the UI for every frame. Always dispatch to a dedicated serial background queue, and only push final results back to @MainActor / DispatchQueue.main.
- Model accuracy vs. imageCropAndScaleOption. The default .centerCrop option can silently discard objects near the frame edges. Use .scaleFill for models trained on full-frame input (e.g. YOLOv3, Create ML detectors).
- Accessibility. Bounding boxes drawn with ZStack are invisible to VoiceOver by default. Add .accessibilityLabel to each overlay so VoiceOver users hear detected labels and confidence percentages.
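One addition worth making (it is not shown in the full implementation above) is an explicit camera-authorization check before detector.start(). A minimal sketch, assuming you call it from the root view's .task or .onAppear:
import AVFoundation

// Sketch: request camera access before starting the capture session.
// NSCameraUsageDescription must still be declared in Info.plist.
func ensureCameraAccess() async -> Bool {
    switch AVCaptureDevice.authorizationStatus(for: .video) {
    case .authorized:
        return true
    case .notDetermined:
        return await AVCaptureDevice.requestAccess(for: .video)
    default:
        return false   // .denied / .restricted: direct the user to Settings
    }
}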
Prompt this with Claude Code
When using Soarias or Claude Code directly to implement this:
Implement object detection in SwiftUI for iOS 17+. Use Vision/CoreML (VNCoreMLRequest, VNCoreMLModel, VNRecognizedObjectObservation). Stream live camera frames via AVCaptureVideoDataOutput on a background serial queue. Draw bounding-box overlays with GeometryReader, flipping Vision's bottom-left origin. Make it accessible (VoiceOver labels on each bounding box). Add a #Preview with realistic sample data.
In Soarias' Build phase, paste this prompt into the implementation step so Claude Code scaffolds the
AVCaptureSession, @Observable detector class, and SwiftUI overlay in one pass —
leaving you to drop in your .mlmodel file and adjust the confidence threshold.
FAQ
Does this work on iOS 16?
The Vision and CoreML APIs used here are long-standing (VNCoreMLRequest since iOS 11, VNRecognizedObjectObservation since iOS 12), but the code relies on the @Observable macro, the #Preview macro, and AVCaptureConnection.videoRotationAngle, which require iOS 17 / Xcode 15+. For iOS 16 support, replace @Observable with ObservableObject + @Published, swap #Preview for a PreviewProvider, and set the connection's videoOrientation instead of videoRotationAngle. A sketch of the observable fallback follows.
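A minimal sketch of that substitution (the Legacy* names are placeholders; the Vision and AVFoundation code stays exactly as in ObjectDetector):
import SwiftUI

// iOS 16 fallback: ObservableObject + @Published instead of the @Observable macro.
final class LegacyObjectDetector: NSObject, ObservableObject {
    @Published var detections: [DetectedObject] = []
    // setupModel(), setupCamera() and captureOutput(_:didOutput:from:) are unchanged;
    // handleResults assigns to the @Published property on the main queue.
}

struct LegacyObjectDetectionView: View {
    @StateObject private var detector = LegacyObjectDetector()   // @StateObject instead of @State

    var body: some View {
        Text("Detections: \(detector.detections.count)")         // stand-in for the real overlay
    }
}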
Which CoreML model should I use for general object detection?
Apple's YOLOv3 and YOLOv3-Tiny are free to download from the
Apple ML Models page
and drop straight into an Xcode project. For custom classes, train a
Create ML Object Detector project in Xcode; it exports a .mlmodel with the same
VNCoreMLRequest interface, so no code changes are needed.
What's the UIKit equivalent?
In UIKit you would use the same AVCaptureVideoDataOutput +
VNImageRequestHandler pipeline, but draw bounding boxes on a
CAShapeLayer overlaid on the AVCaptureVideoPreviewLayer instead of using
SwiftUI's GeometryReader. The coordinate flip (y = 1 − maxY) applies
identically — it is a Vision requirement, not a SwiftUI one.
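A rough sketch of that UIKit drawing path, assuming a view that already hosts the AVCaptureVideoPreviewLayer and the same DetectedObject type:
import UIKit

// Sketch: stroke one detection as a CAShapeLayer over the preview view.
func addBoundingBoxLayer(for object: DetectedObject, to previewView: UIView) {
    let size = previewView.bounds.size

    // Same flip as visionToView: Vision is bottom-left, UIKit/CALayer is top-left.
    let rect = CGRect(x: object.boundingBox.minX * size.width,
                      y: (1 - object.boundingBox.maxY) * size.height,
                      width: object.boundingBox.width * size.width,
                      height: object.boundingBox.height * size.height)

    let boxLayer = CAShapeLayer()
    boxLayer.path = UIBezierPath(rect: rect).cgPath
    boxLayer.strokeColor = UIColor.yellow.cgColor
    boxLayer.fillColor = UIColor.clear.cgColor
    boxLayer.lineWidth = 2
    previewView.layer.addSublayer(boxLayer)
}
In a live pipeline you would also remove the previous frame's box layers before adding the new ones.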
Last reviewed: 2026-05-11 by the Soarias team.