
How to build object detection in SwiftUI

iOS 17+ · Xcode 16+ · Advanced · APIs: Vision / CoreML · Updated: May 11, 2026
TL;DR

Feed live camera frames into a VNCoreMLRequest backed by a VNCoreMLModel, then map the returned VNRecognizedObjectObservation bounding boxes onto a SwiftUI overlay using GeometryReader.

// Minimal: run a CoreML object-detection model on one UIImage
import UIKit
import Vision
import CoreML

func detect(in image: UIImage) throws -> [VNRecognizedObjectObservation] {
    let model = try VNCoreMLModel(for: YOLOv3(configuration: MLModelConfiguration()).model)
    let request = VNCoreMLRequest(model: model)
    request.imageCropAndScaleOption = .scaleFill

    guard let cgImage = image.cgImage else { return [] }
    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    try handler.perform([request])

    return (request.results as? [VNRecognizedObjectObservation]) ?? []
}

Full implementation

The full pipeline wraps AVCaptureSession inside an @Observable class that publishes detected objects whenever a new frame arrives. A SwiftUI view layers bounding-box rectangles over a live camera preview using GeometryReader to convert Vision's normalised coordinates (origin bottom-left) into view-space coordinates (origin top-left).

import SwiftUI
import AVFoundation
import Vision
import CoreML

// MARK: - Detected object model
struct DetectedObject: Identifiable {
    let id = UUID()
    let label: String
    let confidence: Float
    /// Normalised rect in Vision coords (origin bottom-left, 0-1 range)
    let boundingBox: CGRect
}

// MARK: - Camera + Vision coordinator
@Observable
final class ObjectDetector: NSObject, AVCaptureVideoDataOutputSampleBufferDelegate {

    var detections: [DetectedObject] = []

    private let session = AVCaptureSession()
    private let videoOutput = AVCaptureVideoDataOutput()
    private let visionQueue = DispatchQueue(label: "com.soarias.vision", qos: .userInteractive)
    private var visionRequest: VNCoreMLRequest?

    override init() {
        super.init()
        setupModel()
        setupCamera()
    }

    // 1. Load CoreML model
    private func setupModel() {
        do {
            // Replace YOLOv3 with your .mlmodel class name.
            let coreMLModel = try VNCoreMLModel(for: YOLOv3(configuration: MLModelConfiguration()).model)
            let request = VNCoreMLRequest(model: coreMLModel) { [weak self] req, _ in
                self?.handleResults(req.results)
            }
            request.imageCropAndScaleOption = .scaleFill
            visionRequest = request
        } catch {
            print("Model load failed: \(error)")
        }
    }

    // 2. Configure AVCaptureSession
    private func setupCamera() {
        session.sessionPreset = .hd1280x720
        guard
            let device = AVCaptureDevice.default(.builtInWideAngleCamera, for: .video, position: .back),
            let input = try? AVCaptureDeviceInput(device: device),
            session.canAddInput(input)
        else { return }

        session.addInput(input)
        videoOutput.setSampleBufferDelegate(self, queue: visionQueue)
        videoOutput.alwaysDiscardsLateVideoFrames = true
        if session.canAddOutput(videoOutput) { session.addOutput(videoOutput) }

        // Match preview orientation (videoRotationAngle is an iOS 17+ API)
        if let conn = videoOutput.connection(with: .video),
           conn.isVideoRotationAngleSupported(90) {
            conn.videoRotationAngle = 90
        }
    }

    // startRunning()/stopRunning() block, so keep them off the main thread.
    func start() { visionQueue.async { [weak self] in self?.session.startRunning() } }
    func stop()  { visionQueue.async { [weak self] in self?.session.stopRunning() } }

    // 3. Receive frames and run Vision
    func captureOutput(_ output: AVCaptureOutput,
                       didOutput sampleBuffer: CMSampleBuffer,
                       from connection: AVCaptureConnection) {
        guard
            let request = visionRequest,
            let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer)
        else { return }

        let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer,
                                            orientation: .right,
                                            options: [:])
        try? handler.perform([request])
    }

    // 4. Publish results on main actor
    private func handleResults(_ results: [VNObservation]?) {
        let observations = (results as? [VNRecognizedObjectObservation]) ?? []
        let mapped = observations.compactMap { obs -> DetectedObject? in
            guard let top = obs.labels.first, top.confidence > 0.4 else { return nil }
            return DetectedObject(label: top.identifier,
                                  confidence: top.confidence,
                                  boundingBox: obs.boundingBox)
        }
        DispatchQueue.main.async { self.detections = mapped }
    }

    // Expose session for preview layer
    var captureSession: AVCaptureSession { session }
}

// MARK: - Camera preview (UIViewRepresentable)
struct CameraPreview: UIViewRepresentable {
    let session: AVCaptureSession

    func makeUIView(context: Context) -> PreviewView {
        let view = PreviewView()
        view.previewLayer.session = session
        view.previewLayer.videoGravity = .resizeAspectFill
        return view
    }
    func updateUIView(_ uiView: PreviewView, context: Context) {}

    class PreviewView: UIView {
        override class var layerClass: AnyClass { AVCaptureVideoPreviewLayer.self }
        var previewLayer: AVCaptureVideoPreviewLayer { layer as! AVCaptureVideoPreviewLayer }
    }
}

// MARK: - Bounding box overlay
struct BoundingBoxOverlay: View {
    let detections: [DetectedObject]
    let size: CGSize

    var body: some View {
        ForEach(detections) { obj in
            let rect = visionToView(obj.boundingBox, in: size)
            ZStack(alignment: .topLeading) {
                Rectangle()
                    .stroke(Color.yellow, lineWidth: 2)
                    .frame(width: rect.width, height: rect.height)
                Text("\(obj.label) \(Int(obj.confidence * 100))%")
                    .font(.caption2.bold())
                    .foregroundStyle(.black)
                    .padding(2)
                    .background(Color.yellow)
                    .offset(y: -18)
            }
            .position(x: rect.midX, y: rect.midY)
            .accessibilityLabel("\(obj.label), \(Int(obj.confidence * 100)) percent confidence")
        }
    }

    /// Flip Vision's bottom-left origin to SwiftUI's top-left origin.
    private func visionToView(_ box: CGRect, in size: CGSize) -> CGRect {
        CGRect(
            x: box.minX * size.width,
            y: (1 - box.maxY) * size.height,
            width: box.width * size.width,
            height: box.height * size.height
        )
    }
}

// MARK: - Root view
struct ObjectDetectionView: View {
    @State private var detector = ObjectDetector()

    var body: some View {
        GeometryReader { geo in
            ZStack {
                CameraPreview(session: detector.captureSession)
                    .ignoresSafeArea()

                BoundingBoxOverlay(detections: detector.detections, size: geo.size)
            }
        }
        .ignoresSafeArea()
        .onAppear { detector.start() }
        .onDisappear { detector.stop() }
        .accessibilityLabel("Object detection camera view")
    }
}

#Preview {
    ObjectDetectionView()
}

How it works

  1. Model loading (setupModel()). A VNCoreMLModel wraps the compiled .mlmodelc bundle. Setting imageCropAndScaleOption = .scaleFill prevents letterboxing artefacts that would skew bounding-box coordinates returned by the model.
  2. Camera pipeline (setupCamera()). AVCaptureVideoDataOutput calls the delegate's captureOutput(_:didOutput:from:) on visionQueue — a private serial queue — so Vision never blocks the main thread. alwaysDiscardsLateVideoFrames = true keeps latency low.
  3. Per-frame inference. Each CVPixelBuffer is wrapped in a VNImageRequestHandler with orientation: .right to match the sensor's natural landscape orientation rotated for portrait display. perform([request]) runs synchronously on visionQueue.
  4. Result filtering (handleResults). Only observations whose top label exceeds 40 % confidence are kept. Results are published on DispatchQueue.main, triggering a SwiftUI re-render via the @Observable macro.
  5. Coordinate flip (visionToView). Vision uses a unit-space coordinate system with the origin at the bottom-left; SwiftUI's GeometryReader origin is top-left. The transform y = (1 − box.maxY) × height performs the vertical flip before scaling to view points.
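
As a quick sanity check on step 5, here is the same transform applied to a made-up box in an iPhone-sized view (all numbers are purely illustrative):

import CoreGraphics

// Hypothetical Vision box: x 0.2, y 0.6, width 0.3, height 0.25 (bottom-left origin, unit space)
let box  = CGRect(x: 0.2, y: 0.6, width: 0.3, height: 0.25)
let size = CGSize(width: 390, height: 844)

let viewRect = CGRect(
    x: box.minX * size.width,          // 0.2  * 390 = 78
    y: (1 - box.maxY) * size.height,   // (1 - 0.85) * 844 = 126.6  <- the vertical flip
    width: box.width * size.width,     // 0.3  * 390 = 117
    height: box.height * size.height   // 0.25 * 844 = 211
)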

Variants

Detect objects in a still image (Photo Picker)

import SwiftUI
import PhotosUI
import Vision
import CoreML

struct ImagePickerDetector: View {
    @State private var pickerItem: PhotosPickerItem?
    @State private var detections: [DetectedObject] = []
    @State private var pickedImage: UIImage?

    var body: some View {
        VStack {
            PhotosPicker("Choose photo", selection: $pickerItem, matching: .images)
                .onChange(of: pickerItem) { _, item in
                    Task {
                        guard
                            let data = try? await item?.loadTransferable(type: Data.self),
                            let ui = UIImage(data: data),
                            let cg = ui.cgImage
                        else { return }
                        pickedImage = ui
                        let model = try VNCoreMLModel(for: YOLOv3(configuration: MLModelConfiguration()).model)
                        let req   = VNCoreMLRequest(model: model)
                        req.imageCropAndScaleOption = .scaleFill
                        try VNImageRequestHandler(cgImage: cg).perform([req])
                        detections = ((req.results as? [VNRecognizedObjectObservation]) ?? [])
                            .compactMap { obs in
                                obs.labels.first.map {
                                    DetectedObject(label: $0.identifier,
                                                   confidence: $0.confidence,
                                                   boundingBox: obs.boundingBox)
                                }
                            }
                    }
                }

            if let img = pickedImage {
                GeometryReader { geo in
                    Image(uiImage: img).resizable().scaledToFit()
                    BoundingBoxOverlay(detections: detections, size: geo.size)
                }
            }
        }
        .padding()
    }
}

Using the built-in object saliency model (no custom .mlmodel)

If you only need to know where objects are without classification labels, skip the CoreML model entirely. Use VNGenerateObjectnessBasedSaliencyImageRequest — it ships in the OS, requires zero download, and returns VNSaliencyImageObservation with salientObjects bounding boxes. Swap the VNCoreMLRequest for it and read obs.salientObjects instead of obs.labels. No model file, no bundle size hit.
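
A minimal sketch of that swap, assuming you already have a CGImage called cg (for example from the photo-picker variant above):

import Vision

// Built-in objectness saliency: bounding boxes only, no class labels, no bundled model.
func salientObjectBoxes(in cg: CGImage) throws -> [CGRect] {
    let request = VNGenerateObjectnessBasedSaliencyImageRequest()
    try VNImageRequestHandler(cgImage: cg, options: [:]).perform([request])

    let observation = (request.results as? [VNSaliencyImageObservation])?.first
    // salientObjects are VNRectangleObservations in the same bottom-left-origin
    // unit space, so the visionToView(_:in:) flip above applies unchanged.
    return observation?.salientObjects?.map(\.boundingBox) ?? []
}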

Common pitfalls

Forgetting the coordinate flip: Vision boxes drawn without converting the bottom-left origin appear vertically mirrored on screen.
Blocking the main thread: perform([request]) runs synchronously, so keep it on a background queue (here, visionQueue) or the UI will stutter.
Skipping the confidence filter: without the 0.4 threshold in handleResults, low-confidence observations flood the overlay with flickering boxes.

Prompt this with Claude Code

When using Soarias or Claude Code directly to implement this:

Implement object detection in SwiftUI for iOS 17+.
Use Vision/CoreML (VNCoreMLRequest, VNCoreMLModel, VNRecognizedObjectObservation).
Stream live camera frames via AVCaptureVideoDataOutput on a background serial queue.
Draw bounding-box overlays with GeometryReader, flipping Vision's bottom-left origin.
Make it accessible (VoiceOver labels on each bounding box).
Add a #Preview with realistic sample data.

In Soarias' Build phase, paste this prompt into the implementation step so Claude Code scaffolds the AVCaptureSession, @Observable detector class, and SwiftUI overlay in one pass — leaving you to drop in your .mlmodel file and adjust the confidence threshold.

FAQ

Does this work on iOS 16?

The Vision and CoreML APIs used here (VNCoreMLRequest, VNRecognizedObjectObservation) are available back to iOS 11, but the code relies on the @Observable macro and the #Preview macro which require iOS 17 / Xcode 15+. For iOS 16 support, replace @Observable with ObservableObject + @Published and swap #Preview for a PreviewProvider.
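
A rough sketch of that swap, reusing the DetectedObject and BoundingBoxOverlay types from above (the Legacy* names are just placeholders):

import SwiftUI
import Combine

// iOS 16 fallback: ObservableObject + @Published instead of the @Observable macro.
final class LegacyObjectDetector: NSObject, ObservableObject {
    @Published var detections: [DetectedObject] = []
    // The AVCaptureSession + Vision pipeline is identical to ObjectDetector above.
}

struct LegacyObjectDetectionView: View {
    @StateObject private var detector = LegacyObjectDetector()

    var body: some View {
        GeometryReader { geo in
            BoundingBoxOverlay(detections: detector.detections, size: geo.size)
        }
    }
}

// PreviewProvider replaces the #Preview macro on Xcode 14 / iOS 16.
struct LegacyObjectDetectionView_Previews: PreviewProvider {
    static var previews: some View { LegacyObjectDetectionView() }
}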

Which CoreML model should I use for general object detection?

Apple's YOLOv3 and YOLOv3-Tiny are free to download from Apple's Core ML models page and drop straight into an Xcode project. For custom classes, train an Object Detection model with Create ML; it exports a .mlmodel that works through the same VNCoreMLRequest interface, so no code changes are needed.

What's the UIKit equivalent?

In UIKit you would use the same AVCaptureVideoDataOutput + VNImageRequestHandler pipeline, but draw bounding boxes on a CAShapeLayer overlaid on the AVCaptureVideoPreviewLayer instead of using SwiftUI's GeometryReader. The coordinate flip (y = 1 − maxY) applies identically — it is a Vision requirement, not a SwiftUI one.
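
For reference, a small sketch of that UIKit drawing step (makeBoxLayer is a hypothetical helper; it assumes the overlay layer shares the preview layer's bounds):

import UIKit

// Convert one Vision bounding box (unit space, bottom-left origin) into a CAShapeLayer.
func makeBoxLayer(for normalizedBox: CGRect, in bounds: CGRect) -> CAShapeLayer {
    // Same flip as visionToView: Core Animation's origin is top-left.
    let rect = CGRect(x: normalizedBox.minX * bounds.width,
                      y: (1 - normalizedBox.maxY) * bounds.height,
                      width: normalizedBox.width * bounds.width,
                      height: normalizedBox.height * bounds.height)

    let layer = CAShapeLayer()
    layer.path = UIBezierPath(rect: rect).cgPath
    layer.strokeColor = UIColor.yellow.cgColor
    layer.fillColor = nil
    layer.lineWidth = 2
    return layer
}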

Last reviewed: 2026-05-11 by the Soarias team.
