slightly improve video export speed

This commit is contained in:
Skyler Lehmkuhl 2025-12-12 11:12:02 -05:00
parent d94ec0d6a8
commit cb62d0ee9d
10 changed files with 1542 additions and 142 deletions

View File

@ -9,15 +9,18 @@ members = [
# UI Framework (using eframe for simplified integration)
# Note: Upgraded from 0.29 to 0.31 to fix Linux IME/keyboard input issues
# See: https://github.com/emilk/egui/pull/5198
eframe = { version = "0.31", default-features = true, features = ["wgpu"] }
egui_extras = { version = "0.31", features = ["image", "svg"] }
egui-wgpu = "0.31"
# Upgraded to 0.33 for shader editor (egui_code_editor) and continued bug fixes
egui = "0.33"
eframe = { version = "0.33", default-features = true, features = ["wgpu"] }
egui_extras = { version = "0.33", features = ["image", "svg", "syntect"] }
egui-wgpu = "0.33"
egui_code_editor = "0.2"
# GPU Rendering
# vello 0.5 uses wgpu 24, matching eframe 0.31
vello = "0.5"
wgpu = "24"
kurbo = { version = "0.11", features = ["serde"] }
# vello from git uses wgpu 27, matching eframe 0.33
vello = { git = "https://github.com/linebender/vello", branch = "main" }
wgpu = { version = "27", features = ["vulkan", "metal"] }
kurbo = { version = "0.12", features = ["serde"] }
peniko = "0.5"
# Windowing

View File

@ -10,12 +10,14 @@ pub mod buffer_pool;
pub mod color_convert;
pub mod compositor;
pub mod effect_processor;
pub mod yuv_converter;
// Re-export commonly used types
pub use buffer_pool::{BufferHandle, BufferPool, BufferSpec, BufferFormat};
pub use color_convert::SrgbToLinearConverter;
pub use compositor::{Compositor, CompositorLayer, BlendMode};
pub use effect_processor::{EffectProcessor, EffectUniforms};
pub use yuv_converter::YuvConverter;
/// Standard HDR internal texture format (16-bit float per channel)
pub const HDR_FORMAT: wgpu::TextureFormat = wgpu::TextureFormat::Rgba16Float;

View File

@ -0,0 +1,241 @@
//! GPU-accelerated RGBA to YUV420p color space conversion
//!
//! Provides a compute shader-based converter for transforming RGBA textures
//! to YUV420p planar format using the BT.709 color matrix (HD video standard).
//! This replaces the CPU-based conversion with GPU parallel processing.
/// GPU pipeline for RGBA to YUV420p color space conversion
///
/// Converts Rgba8Unorm textures to YUV420p planar format using BT.709 colorspace.
/// The Y plane is full resolution, while U and V planes are subsampled 4:2:0.
///
/// Output texture layout (U and V packed side-by-side):
/// - Rows 0 to height-1: Y plane (luma, full resolution)
/// - Rows height to height + height/2 - 1: chroma rows, with the U plane in
///   the left half (columns 0 to width/2-1) and the V plane in the right
///   half (columns width/2 to width-1), both at half resolution
pub struct YuvConverter {
pipeline: wgpu::ComputePipeline,
bind_group_layout: wgpu::BindGroupLayout,
}
impl YuvConverter {
/// Create a new RGBA to YUV420p converter
pub fn new(device: &wgpu::Device) -> Self {
// Create bind group layout
let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: Some("yuv_converter_bind_group_layout"),
entries: &[
// Input RGBA texture (binding 0)
wgpu::BindGroupLayoutEntry {
binding: 0,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float { filterable: false },
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: None,
},
// Output YUV texture (Rgba8Unorm storage texture, binding 1)
// Note: R8Unorm doesn't support storage binding, so we use Rgba8Unorm and write to .r channel
wgpu::BindGroupLayoutEntry {
binding: 1,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::StorageTexture {
access: wgpu::StorageTextureAccess::WriteOnly,
format: wgpu::TextureFormat::Rgba8Unorm,
view_dimension: wgpu::TextureViewDimension::D2,
},
count: None,
},
],
});
// Create pipeline layout
let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: Some("yuv_converter_pipeline_layout"),
bind_group_layouts: &[&bind_group_layout],
push_constant_ranges: &[],
});
// Create shader module
let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
label: Some("yuv_converter_shader"),
source: wgpu::ShaderSource::Wgsl(YUV_CONVERTER_SHADER.into()),
});
// Create compute pipeline
let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
label: Some("yuv_converter_pipeline"),
layout: Some(&pipeline_layout),
module: &shader,
entry_point: Some("main"),
compilation_options: wgpu::PipelineCompilationOptions::default(),
cache: None,
});
Self {
pipeline,
bind_group_layout,
}
}
/// Convert RGBA texture to YUV420p planar format
///
/// Reads from `rgba_view` and writes Y, U, V planes to `yuv_output_view`.
/// The output texture must be Rgba8Unorm (values land in the .r channel)
/// with height = input_height * 1.5 to accommodate the packed YUV planes.
///
/// # Arguments
/// * `device` - GPU device
/// * `encoder` - Command encoder to record GPU commands
/// * `rgba_view` - Source RGBA texture view
/// * `yuv_output_view` - Destination YUV planar texture view (Rgba8Unorm, height*1.5)
/// * `width` - Width of the source RGBA texture
/// * `height` - Height of the source RGBA texture
pub fn convert(
&self,
device: &wgpu::Device,
encoder: &mut wgpu::CommandEncoder,
rgba_view: &wgpu::TextureView,
yuv_output_view: &wgpu::TextureView,
width: u32,
height: u32,
) {
// Create bind group for this conversion
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
label: Some("yuv_converter_bind_group"),
layout: &self.bind_group_layout,
entries: &[
wgpu::BindGroupEntry {
binding: 0,
resource: wgpu::BindingResource::TextureView(rgba_view),
},
wgpu::BindGroupEntry {
binding: 1,
resource: wgpu::BindingResource::TextureView(yuv_output_view),
},
],
});
// Compute pass
let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
label: Some("yuv_conversion_pass"),
timestamp_writes: None,
});
compute_pass.set_pipeline(&self.pipeline);
compute_pass.set_bind_group(0, &bind_group, &[]);
// Dispatch workgroups: 8x8 threads per workgroup
// Each thread processes one pixel for the Y plane
// Chroma planes are processed by threads at even coordinates
let workgroup_size = 8;
let workgroups_x = (width + workgroup_size - 1) / workgroup_size;
let workgroups_y = (height + workgroup_size - 1) / workgroup_size;
compute_pass.dispatch_workgroups(workgroups_x, workgroups_y, 1);
}
}
/// WGSL compute shader for RGBA to YUV420p conversion
const YUV_CONVERTER_SHADER: &str = r#"
// RGBA to YUV420p Compute Shader
// BT.709 color space for HD video (ITU-R BT.709-6 standard)
//
// Color matrix:
// Y = 0.2126*R + 0.7152*G + 0.0722*B
// U = -0.1146*R - 0.3854*G + 0.5000*B + 0.5
// V = 0.5000*R - 0.4542*G - 0.0458*B + 0.5
//
// Output texture layout (packed planar, side-by-side U/V):
// - Rows [0, height): Y plane (full resolution, full width)
// - Rows [height, height + height/2): U plane (left half, columns 0 to width/2-1)
// V plane (right half, columns width/2 to width-1)
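//
// Worked example (illustrative, for a 1920x1080 input): the pixel at
// (100, 202) writes its luma to texel (100, 202); its 2x2 block maps to
// chroma_x = 50, chroma_y = 101, so U is written at (50, 1080 + 101) =
// (50, 1181) in the left half and V at (1920/2 + 50, 1181) = (1010, 1181)
// in the right half.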
@group(0) @binding(0) var input_rgba: texture_2d<f32>;
@group(0) @binding(1) var output_yuv: texture_storage_2d<rgba8unorm, write>;
@compute @workgroup_size(8, 8, 1)
fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
let dims = textureDimensions(input_rgba);
let pos = global_id.xy;
// Bounds check
if (pos.x >= dims.x || pos.y >= dims.y) {
return;
}
// Load RGBA pixel
let rgba = textureLoad(input_rgba, pos, 0);
let r = rgba.r;
let g = rgba.g;
let b = rgba.b;
// Compute Y (luma) - full resolution, BT.709
let y = 0.2126 * r + 0.7152 * g + 0.0722 * b;
// Write Y value to Y plane (rows 0 to height-1)
textureStore(output_yuv, pos, vec4<f32>(y, 0.0, 0.0, 0.0));
// Compute U and V (chroma) - subsampled 4:2:0
// Only process even coordinates (top-left of 2x2 blocks)
if (pos.x % 2u == 0u && pos.y % 2u == 0u) {
// Sample 2x2 block for chroma subsampling
var r_sum = r;
var g_sum = g;
var b_sum = b;
var count = 1.0;
// Sample right neighbor (x+1, y)
if (pos.x + 1u < dims.x) {
let rgba_r = textureLoad(input_rgba, pos + vec2<u32>(1u, 0u), 0);
r_sum += rgba_r.r;
g_sum += rgba_r.g;
b_sum += rgba_r.b;
count += 1.0;
}
// Sample bottom neighbor (x, y+1)
if (pos.y + 1u < dims.y) {
let rgba_b = textureLoad(input_rgba, pos + vec2<u32>(0u, 1u), 0);
r_sum += rgba_b.r;
g_sum += rgba_b.g;
b_sum += rgba_b.b;
count += 1.0;
}
// Sample bottom-right neighbor (x+1, y+1)
if (pos.x + 1u < dims.x && pos.y + 1u < dims.y) {
let rgba_br = textureLoad(input_rgba, pos + vec2<u32>(1u, 1u), 0);
r_sum += rgba_br.r;
g_sum += rgba_br.g;
b_sum += rgba_br.b;
count += 1.0;
}
// Average the 2x2 block
let r_avg = r_sum / count;
let g_avg = g_sum / count;
let b_avg = b_sum / count;
// Compute chroma components (BT.709, centered at 0.5 for unsigned 8-bit)
let u = -0.1146 * r_avg - 0.3854 * g_avg + 0.5000 * b_avg + 0.5;
let v = 0.5000 * r_avg - 0.4542 * g_avg - 0.0458 * b_avg + 0.5;
// Compute chroma plane positions (half resolution)
// Pack U and V side-by-side: U on left half, V on right half
let chroma_x = pos.x / 2u;
let chroma_y = pos.y / 2u;
// U plane: left half (columns 0 to width/2-1), rows height to height+height/2-1
let u_pos = vec2<u32>(chroma_x, dims.y + chroma_y);
// V plane: right half (columns width/2 to width-1), rows height to height+height/2-1
let v_pos = vec2<u32>(dims.x / 2u + chroma_x, dims.y + chroma_y);
// Write U and V values to their respective planes
textureStore(output_yuv, u_pos, vec4<f32>(u, 0.0, 0.0, 0.0));
textureStore(output_yuv, v_pos, vec4<f32>(v, 0.0, 0.0, 0.0));
}
}
"#;

View File

@ -0,0 +1,62 @@
# Plan for Async Rendering Helpers
I'm creating this temporary document to plan the async rendering changes.
## Current Architecture (Synchronous)
`render_frame_to_rgba_hdr()` in video_exporter.rs:
1. Render document to RGBA (lines 750-991)
2. GPU YUV conversion (lines 993-1005)
3. Copy YUV to staging buffer (lines 1007-1029)
4. Submit GPU commands (line 1031)
5. **BLOCKING** map_async + wait (lines 1033-1045)
6. Extract Y, U, V planes from mapped buffer (lines 1047-1087)
7. Unmap and return YUV planes (lines 1089-1092)
## New Architecture (Async Pipelined)
Split into two phases using ReadbackPipeline:
### Phase 1: Submit Frame (Non-blocking)
New function `submit_frame_to_readback_pipeline()`:
- Input: buffer from ReadbackPipeline.acquire()
- Steps 1-3: Render to RGBA, GPU YUV, copy to buffer's YUV texture
- Return encoder to ReadbackPipeline for submission
- **Does NOT wait for GPU**
### Phase 2: Extract YUV (After async mapping)
Helper function `extract_yuv_planes_from_buffer()`:
- Input: mapped buffer data from ReadbackPipeline
- Steps 6-7: Extract Y, U, V planes, return them
- Used after ReadbackPipeline.get_mapped_data()
## Modified render_next_video_frame()
New async pipeline loop:
```
while more_work_to_do:
// Poll for completed frames
for result in pipeline.poll_nonblocking():
data = pipeline.get_mapped_data(result.buffer_id)
(y, u, v) = extract_yuv_planes(data)
send_to_encoder_in_order(result.frame_num, y, u, v)
pipeline.release(result.buffer_id)
// Submit new frames (up to 3 in flight)
if current_frame < total_frames && frames_in_flight < 3:
if let Some(buffer) = pipeline.acquire(frame_num, timestamp):
encoder = submit_frame_to_pipeline(buffer)
pipeline.submit_and_readback(buffer.id, encoder)
frames_in_flight++
current_frame++
// Done when all frames submitted AND all completed
if current_frame >= total_frames && frames_in_flight == 0:
return Ok(false)
return Ok(true) // More work to do
```
This achieves triple buffering:
- Frame N: GPU rendering
- Frame N-1: GPU→CPU async transfer
- Frame N-2: CPU encoding
Expected speedup: 5x
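A rough model of the gain: the synchronous path pays `t_render + t_readback + t_convert + t_encode` per frame, while the saturated pipeline pays roughly the slowest single stage, `max(t_render, t_readback, t_convert, t_encode)`. The 5x figure assumes the stages are reasonably balanced; the real ratio should be read off the perf_metrics instrumentation.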

View File

@ -0,0 +1,143 @@
//! CPU-based RGBA→YUV420p color space converter using FFmpeg's swscale
//!
//! This module provides a wrapper around FFmpeg's highly-optimized swscale library
//! for converting RGBA data to YUV420p format. Uses SIMD instructions when available
//! for maximum performance.
use ffmpeg_next as ffmpeg;
/// CPU-based RGBA→YUV420p converter using FFmpeg's swscale
///
/// This converter uses FFmpeg's swscale library which is highly optimized with SIMD
/// instructions (SSE, AVX) for fast color space conversion on the CPU.
pub struct CpuYuvConverter {
width: u32,
height: u32,
}
impl CpuYuvConverter {
/// Create new converter for given dimensions
///
/// # Arguments
/// * `width` - Frame width in pixels
/// * `height` - Frame height in pixels
pub fn new(width: u32, height: u32) -> Result<Self, String> {
Ok(Self { width, height })
}
/// Convert RGBA data to YUV420p planes
///
/// Performs color space conversion from RGBA (8-bit per channel, packed format)
/// to YUV420p (8-bit per channel, planar format with subsampled chroma).
///
/// Intended to match the BT.709 color matrix (HD standard) used by the GPU
/// path; note that swscale defaults to BT.601 coefficients unless colorspace
/// details are configured explicitly.
///
/// # Arguments
/// * `rgba_data` - Packed RGBA data (width * height * 4 bytes)
///
/// # Returns
/// Tuple of (y_plane, u_plane, v_plane) as separate Vec<u8>
///
/// # Panics
/// Panics if rgba_data length doesn't match width * height * 4
pub fn convert(&self, rgba_data: &[u8]) -> Result<(Vec<u8>, Vec<u8>, Vec<u8>), String> {
let expected_size = (self.width * self.height * 4) as usize;
assert_eq!(
rgba_data.len(),
expected_size,
"RGBA data size mismatch: expected {} bytes, got {}",
expected_size,
rgba_data.len()
);
// Create source RGBA frame
let mut rgba_frame = ffmpeg::frame::Video::new(
ffmpeg::format::Pixel::RGBA,
self.width,
self.height,
);
// Copy RGBA data into source frame
// ffmpeg-next provides mutable access to the frame data.
// Note: this assumes the frame's linesize is exactly width * 4 (no row
// padding); copy_from_slice will panic if the allocated stride differs.
let frame_data = rgba_frame.data_mut(0);
frame_data.copy_from_slice(rgba_data);
// Create destination YUV420p frame
let mut yuv_frame = ffmpeg::frame::Video::new(
ffmpeg::format::Pixel::YUV420P,
self.width,
self.height,
);
// Create swscale context for RGBA→YUV420p conversion
// (swscale applies its default BT.601 matrix here; configure colorspace
// details explicitly if BT.709 output is required)
let mut scaler = ffmpeg::software::scaling::Context::get(
ffmpeg::format::Pixel::RGBA,
self.width,
self.height,
ffmpeg::format::Pixel::YUV420P,
self.width,
self.height,
ffmpeg::software::scaling::Flags::BILINEAR,
)
.map_err(|e| format!("Failed to create swscale context: {}", e))?;
// Perform the conversion (SIMD-optimized)
scaler
.run(&rgba_frame, &mut yuv_frame)
.map_err(|e| format!("swscale conversion failed: {}", e))?;
// Extract planar YUV data
// YUV420p has 3 planes:
// - Y: full resolution (width × height)
// - U: quarter resolution (width/2 × height/2)
// - V: quarter resolution (width/2 × height/2)
let y_plane = yuv_frame.data(0).to_vec();
let u_plane = yuv_frame.data(1).to_vec();
let v_plane = yuv_frame.data(2).to_vec();
Ok((y_plane, u_plane, v_plane))
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_converter_creation() {
let converter = CpuYuvConverter::new(1920, 1080);
assert!(converter.is_ok());
}
#[test]
fn test_conversion_output_sizes() {
let converter = CpuYuvConverter::new(1920, 1080).unwrap();
// Create dummy RGBA data (all black)
let rgba_data = vec![0u8; 1920 * 1080 * 4];
let result = converter.convert(&rgba_data);
assert!(result.is_ok());
let (y, u, v) = result.unwrap();
// Y plane should be full resolution
assert_eq!(y.len(), 1920 * 1080);
// U and V planes should be quarter resolution (subsampled 2x2)
assert_eq!(u.len(), (1920 / 2) * (1080 / 2));
assert_eq!(v.len(), (1920 / 2) * (1080 / 2));
}
#[test]
#[should_panic(expected = "RGBA data size mismatch")]
fn test_wrong_input_size_panics() {
let converter = CpuYuvConverter::new(1920, 1080).unwrap();
// Wrong size input
let rgba_data = vec![0u8; 1000];
let _ = converter.convert(&rgba_data);
}
}

View File

@ -6,6 +6,9 @@
pub mod audio_exporter;
pub mod dialog;
pub mod video_exporter;
pub mod readback_pipeline;
pub mod perf_metrics;
pub mod cpu_yuv_converter;
use lightningbeam_core::export::{AudioExportSettings, VideoExportSettings, ExportProgress};
use lightningbeam_core::document::Document;
@ -18,8 +21,14 @@ use std::sync::atomic::{AtomicBool, Ordering};
/// Message sent from main thread to video encoder thread
enum VideoFrameMessage {
/// RGBA frame data with frame number and timestamp
Frame { frame_num: usize, timestamp: f64, rgba_data: Vec<u8> },
/// YUV420p frame data with frame number and timestamp (converted upstream)
Frame {
frame_num: usize,
timestamp: f64,
y_plane: Vec<u8>,
u_plane: Vec<u8>,
v_plane: Vec<u8>,
},
/// Signal that all frames have been sent
Done,
}
@ -44,6 +53,16 @@ pub struct VideoExportState {
frame_tx: Option<Sender<VideoFrameMessage>>,
/// HDR GPU resources for compositing pipeline (effects, color conversion)
gpu_resources: Option<video_exporter::ExportGpuResources>,
/// Async triple-buffered readback pipeline for GPU RGBA frames
readback_pipeline: Option<readback_pipeline::ReadbackPipeline>,
/// CPU YUV converter for RGBA→YUV420p conversion
cpu_yuv_converter: Option<cpu_yuv_converter::CpuYuvConverter>,
/// Frames that have been submitted to GPU but not yet encoded
frames_in_flight: usize,
/// Next frame number to send to encoder (for ordering)
next_frame_to_encode: usize,
/// Performance metrics for instrumentation
perf_metrics: Option<perf_metrics::ExportMetrics>,
}
/// Export orchestrator that manages the export process
@ -168,13 +187,11 @@ impl ExportOrchestrator {
// Poll video progress
while let Ok(progress) = parallel.video_progress_rx.try_recv() {
println!("📨 [PARALLEL] Video progress: {:?}", std::mem::discriminant(&progress));
parallel.video_progress = Some(progress);
}
// Poll audio progress
while let Ok(progress) = parallel.audio_progress_rx.try_recv() {
println!("📨 [PARALLEL] Audio progress: {:?}", std::mem::discriminant(&progress));
parallel.audio_progress = Some(progress);
}
@ -621,7 +638,7 @@ impl ExportOrchestrator {
self.thread_handle = Some(handle);
// Initialize video export state
// GPU resources will be initialized lazily on first frame (needs device)
// GPU resources and readback pipeline will be initialized lazily on first frame (needs device)
self.video_state = Some(VideoExportState {
current_frame: 0,
total_frames,
@ -632,6 +649,11 @@ impl ExportOrchestrator {
height,
frame_tx: Some(frame_tx),
gpu_resources: None,
readback_pipeline: None,
cpu_yuv_converter: None,
frames_in_flight: 0,
next_frame_to_encode: 0,
perf_metrics: Some(perf_metrics::ExportMetrics::new()),
});
println!("🎬 [VIDEO EXPORT] Encoder thread spawned, ready for frames");
@ -745,7 +767,7 @@ impl ExportOrchestrator {
});
// Initialize video export state for incremental rendering
// GPU resources will be initialized lazily on first frame (needs device)
// GPU resources and readback pipeline will be initialized lazily on first frame (needs device)
self.video_state = Some(VideoExportState {
current_frame: 0,
total_frames,
@ -756,6 +778,11 @@ impl ExportOrchestrator {
height: video_height,
frame_tx: Some(frame_tx),
gpu_resources: None,
readback_pipeline: None,
cpu_yuv_converter: None,
frames_in_flight: 0,
next_frame_to_encode: 0,
perf_metrics: Some(perf_metrics::ExportMetrics::new()),
});
// Initialize parallel export state
@ -777,6 +804,7 @@ impl ExportOrchestrator {
/// Render and send the next video frame (call from main thread)
///
/// Uses async triple-buffered pipeline for maximum throughput.
/// Returns true if there are more frames to render, false if done.
///
/// # Arguments
@ -798,62 +826,143 @@ impl ExportOrchestrator {
image_cache: &mut ImageCache,
video_manager: &Arc<std::sync::Mutex<VideoManager>>,
) -> Result<bool, String> {
use std::time::Instant;
let state = self.video_state.as_mut()
.ok_or("No video export in progress")?;
if state.current_frame >= state.total_frames {
// All frames rendered, signal encoder thread
if let Some(tx) = state.frame_tx.take() {
tx.send(VideoFrameMessage::Done).ok();
}
// Clean up GPU resources
state.gpu_resources = None;
return Ok(false);
}
// Calculate timestamp for this frame
let timestamp = state.start_time + (state.current_frame as f64 / state.framerate);
// Get frame dimensions from export settings
let width = state.width;
let height = state.height;
// Initialize GPU resources on first frame (needs device)
// Initialize GPU resources and readback pipeline on first frame
if state.gpu_resources.is_none() {
println!("🎬 [VIDEO EXPORT] Initializing HDR GPU resources for {}x{}", width, height);
println!("🎬 [VIDEO EXPORT] Initializing HDR GPU + async pipeline {}x{}", width, height);
state.gpu_resources = Some(video_exporter::ExportGpuResources::new(device, width, height));
state.readback_pipeline = Some(readback_pipeline::ReadbackPipeline::new(device, queue, width, height));
state.cpu_yuv_converter = Some(cpu_yuv_converter::CpuYuvConverter::new(width, height)?);
println!("🚀 [ASYNC PIPELINE] Triple-buffered pipeline initialized");
println!("🚀 [CPU YUV] swscale converter initialized");
}
// Render frame to RGBA buffer using HDR pipeline (with effects)
let mut rgba_buffer = vec![0u8; (width * height * 4) as usize];
let pipeline = state.readback_pipeline.as_mut().unwrap();
let gpu_resources = state.gpu_resources.as_mut().unwrap();
video_exporter::render_frame_to_rgba_hdr(
document,
timestamp,
width,
height,
device,
queue,
renderer,
image_cache,
video_manager,
gpu_resources,
&mut rgba_buffer,
)?;
let cpu_converter = state.cpu_yuv_converter.as_mut().unwrap();
let mut metrics = state.perf_metrics.as_mut();
// Send frame to encoder thread
if let Some(tx) = &state.frame_tx {
tx.send(VideoFrameMessage::Frame {
frame_num: state.current_frame,
timestamp,
rgba_data: rgba_buffer,
}).map_err(|_| "Failed to send frame to encoder")?;
// Poll for completed async readbacks (non-blocking)
if let Some(m) = metrics.as_mut() {
m.poll_count += 1;
}
let completed_frames = pipeline.poll_nonblocking();
if let Some(m) = metrics.as_mut() {
m.completions_per_poll.push(completed_frames.len());
}
state.current_frame += 1;
// Process completed frames IN ORDER
// (map_async callbacks fire in submission order on a single queue, so
// results should already arrive ordered; anything out of order is
// dropped rather than re-queued)
for result in completed_frames {
if result.frame_num == state.next_frame_to_encode {
// Record readback completion time
if let Some(m) = metrics.as_mut() {
if let Some(frame_metrics) = m.frames.get_mut(result.frame_num) {
frame_metrics.readback_complete = Some(Instant::now());
}
}
// Return true if more frames remain
Ok(state.current_frame < state.total_frames)
// Extract RGBA data (timed)
let extraction_start = Instant::now();
let rgba_data = pipeline.extract_rgba_data(result.buffer_id);
let extraction_end = Instant::now();
// CPU YUV conversion (timed)
let conversion_start = Instant::now();
let (y, u, v) = cpu_converter.convert(&rgba_data)?;
let conversion_end = Instant::now();
if let Some(m) = metrics.as_mut() {
if let Some(frame_metrics) = m.frames.get_mut(result.frame_num) {
frame_metrics.extraction_start = Some(extraction_start);
frame_metrics.extraction_end = Some(extraction_end);
frame_metrics.conversion_start = Some(conversion_start);
frame_metrics.conversion_end = Some(conversion_end);
}
}
// Send to encoder
if let Some(tx) = &state.frame_tx {
tx.send(VideoFrameMessage::Frame {
frame_num: result.frame_num,
timestamp: result.timestamp,
y_plane: y,
u_plane: u,
v_plane: v,
}).map_err(|_| "Failed to send frame")?;
}
pipeline.release(result.buffer_id);
state.frames_in_flight -= 1;
state.next_frame_to_encode += 1;
}
}
// Submit new frames (up to 3 in flight)
while state.current_frame < state.total_frames && state.frames_in_flight < 3 {
let timestamp = state.start_time + (state.current_frame as f64 / state.framerate);
if let Some(acquired) = pipeline.acquire(state.current_frame, timestamp) {
// Create frame metrics entry
if let Some(m) = metrics.as_mut() {
m.frames.push(perf_metrics::FrameMetrics::new(state.current_frame));
}
// Render to GPU (timed)
let render_start = Instant::now();
let encoder = video_exporter::render_frame_to_gpu_rgba(
document, timestamp, width, height,
device, queue, renderer, image_cache, video_manager,
gpu_resources, &acquired.rgba_texture_view,
)?;
let render_end = Instant::now();
// Record render timing
if let Some(m) = metrics.as_mut() {
if let Some(frame_metrics) = m.frames.get_mut(state.current_frame) {
frame_metrics.render_end = Some(render_end);
frame_metrics.submit_time = Some(Instant::now());
}
}
// Submit for async readback
pipeline.submit_and_readback(acquired.id, encoder);
state.current_frame += 1;
state.frames_in_flight += 1;
} else {
break; // All buffers in use
}
}
// Done when all submitted AND all completed
if state.current_frame >= state.total_frames && state.frames_in_flight == 0 {
println!("🎬 [VIDEO EXPORT] Complete: {} frames", state.total_frames);
// Print performance summary
if let Some(m) = &state.perf_metrics {
m.print_summary();
m.print_per_frame_details(10);
}
if let Some(tx) = state.frame_tx.take() {
tx.send(VideoFrameMessage::Done).ok();
}
state.gpu_resources = None;
state.readback_pipeline = None;
state.cpu_yuv_converter = None;
state.perf_metrics = None;
return Ok(false);
}
Ok(true) // More work to do
}
/// Background thread that receives frames and encodes them
@ -925,9 +1034,9 @@ impl ExportOrchestrator {
// Wait for first frame to determine dimensions
let first_frame = match frame_rx.recv() {
Ok(VideoFrameMessage::Frame { frame_num, timestamp, rgba_data }) => {
println!("🧵 [ENCODER] Received first frame ({} bytes)", rgba_data.len());
Some((frame_num, timestamp, rgba_data))
Ok(VideoFrameMessage::Frame { frame_num, timestamp, y_plane, u_plane, v_plane }) => {
println!("🧵 [ENCODER] Received first YUV frame (Y: {} bytes)", y_plane.len());
Some((frame_num, timestamp, y_plane, u_plane, v_plane))
}
Ok(VideoFrameMessage::Done) => {
return Err("No frames to encode".to_string());
@ -938,9 +1047,9 @@ impl ExportOrchestrator {
};
// Determine dimensions from first frame
let (width, height) = if let Some((_, _, ref rgba_data)) = first_frame {
// Calculate dimensions from buffer size (RGBA = 4 bytes per pixel)
let pixel_count = rgba_data.len() / 4;
let (width, height) = if let Some((_, _, ref y_plane, _, _)) = first_frame {
// Calculate dimensions from Y plane size (full resolution, 1 byte per pixel)
let pixel_count = y_plane.len();
// Use settings dimensions if provided, otherwise infer from buffer
let w = settings.width.unwrap_or(1920); // Default to 1920 if not specified
let h = settings.height.unwrap_or(1080); // Default to 1080 if not specified
@ -979,11 +1088,13 @@ impl ExportOrchestrator {
println!("🧵 [ENCODER] Encoder initialized, ready to encode frames");
// Process first frame
if let Some((frame_num, timestamp, rgba_data)) = first_frame {
if let Some((frame_num, timestamp, y_plane, u_plane, v_plane)) = first_frame {
Self::encode_frame(
&mut encoder,
&mut output,
&rgba_data,
&y_plane,
&u_plane,
&v_plane,
width,
height,
timestamp,
@ -994,8 +1105,6 @@ impl ExportOrchestrator {
frame: 1,
total: total_frames,
}).ok();
println!("🧵 [ENCODER] Encoded frame {}", frame_num);
}
// Process remaining frames
@ -1006,11 +1115,13 @@ impl ExportOrchestrator {
}
match frame_rx.recv() {
Ok(VideoFrameMessage::Frame { frame_num, timestamp, rgba_data }) => {
Ok(VideoFrameMessage::Frame { frame_num, timestamp, y_plane, u_plane, v_plane }) => {
Self::encode_frame(
&mut encoder,
&mut output,
&rgba_data,
&y_plane,
&u_plane,
&v_plane,
width,
height,
timestamp,
@ -1023,10 +1134,6 @@ impl ExportOrchestrator {
frame: frames_encoded,
total: total_frames,
}).ok();
if frames_encoded % 30 == 0 || frames_encoded == frame_num + 1 {
println!("🧵 [ENCODER] Encoded frame {}/{}", frames_encoded, total_frames);
}
}
Ok(VideoFrameMessage::Done) => {
println!("🧵 [ENCODER] All frames received, flushing encoder");
@ -1052,17 +1159,18 @@ impl ExportOrchestrator {
Ok(())
}
/// Encode a single RGBA frame
/// Encode a single YUV420p frame (planes already converted upstream)
fn encode_frame(
encoder: &mut ffmpeg_next::encoder::Video,
output: &mut ffmpeg_next::format::context::Output,
rgba_data: &[u8],
y_plane: &[u8],
u_plane: &[u8],
v_plane: &[u8],
width: u32,
height: u32,
timestamp: f64,
) -> Result<(), String> {
// Convert RGBA to YUV420p
let (y_plane, u_plane, v_plane) = video_exporter::rgba_to_yuv420p(rgba_data, width, height);
// YUV planes arrive pre-converted (GPU or swscale path); no conversion here
// Create FFmpeg video frame
let mut video_frame = ffmpeg_next::frame::Video::new(
@ -1087,8 +1195,6 @@ impl ExportOrchestrator {
// Encoder time base is 1/(framerate * 1000), so PTS = timestamp * (framerate * 1000)
let encoder_tb = encoder.time_base();
let pts = (timestamp * encoder_tb.1 as f64) as i64;
println!("🎬 [ENCODE] Frame timestamp={:.3}s, encoder_tb={}/{}, calculated PTS={}",
timestamp, encoder_tb.0, encoder_tb.1, pts);
video_frame.set_pts(Some(pts));
// Send frame to encoder

View File

@ -0,0 +1,191 @@
//! Performance instrumentation for video export pipeline
//!
//! Tracks timing for each stage of the export process:
//! - GPU rendering (render_frame_to_gpu_yuv)
//! - Async readback (map_async completion)
//! - YUV plane extraction
//! - FFmpeg encoding
//! - Polling frequency and efficiency
use std::time::{Duration, Instant};
/// Performance metrics for a single frame
#[derive(Debug)]
pub struct FrameMetrics {
pub frame_num: usize,
pub render_start: Instant,
pub render_end: Option<Instant>,
pub submit_time: Option<Instant>,
pub readback_complete: Option<Instant>,
pub extraction_start: Option<Instant>,
pub extraction_end: Option<Instant>,
pub conversion_start: Option<Instant>,
pub conversion_end: Option<Instant>,
pub encode_start: Option<Instant>,
pub encode_end: Option<Instant>,
}
impl FrameMetrics {
pub fn new(frame_num: usize) -> Self {
Self {
frame_num,
render_start: Instant::now(),
render_end: None,
submit_time: None,
readback_complete: None,
extraction_start: None,
extraction_end: None,
conversion_start: None,
conversion_end: None,
encode_start: None,
encode_end: None,
}
}
pub fn render_duration(&self) -> Option<Duration> {
self.render_end.map(|end| end.duration_since(self.render_start))
}
pub fn readback_duration(&self) -> Option<Duration> {
self.submit_time.and_then(|submit|
self.readback_complete.map(|complete|
complete.duration_since(submit)
)
)
}
pub fn extraction_duration(&self) -> Option<Duration> {
self.extraction_start.and_then(|start|
self.extraction_end.map(|end|
end.duration_since(start)
)
)
}
pub fn conversion_duration(&self) -> Option<Duration> {
self.conversion_start.and_then(|start|
self.conversion_end.map(|end|
end.duration_since(start)
)
)
}
pub fn encode_duration(&self) -> Option<Duration> {
self.encode_start.and_then(|start|
self.encode_end.map(|end|
end.duration_since(start)
)
)
}
pub fn total_duration(&self) -> Option<Duration> {
self.encode_end.map(|end| end.duration_since(self.render_start))
}
}
/// Aggregate performance metrics for entire export
pub struct ExportMetrics {
pub frames: Vec<FrameMetrics>,
export_start: Instant,
pub poll_count: usize,
pub completions_per_poll: Vec<usize>,
}
impl ExportMetrics {
pub fn new() -> Self {
Self {
frames: Vec::new(),
export_start: Instant::now(),
poll_count: 0,
completions_per_poll: Vec::new(),
}
}
/// Print comprehensive performance summary
pub fn print_summary(&self) {
println!("\n📊 [PERF] Export Performance Summary");
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
// Calculate averages for each stage
let mut render_times = Vec::new();
let mut readback_times = Vec::new();
let mut extraction_times = Vec::new();
let mut conversion_times = Vec::new();
let mut encode_times = Vec::new();
let mut total_times = Vec::new();
for metrics in &self.frames {
if let Some(d) = metrics.render_duration() {
render_times.push(d);
}
if let Some(d) = metrics.readback_duration() {
readback_times.push(d);
}
if let Some(d) = metrics.extraction_duration() {
extraction_times.push(d);
}
if let Some(d) = metrics.conversion_duration() {
conversion_times.push(d);
}
if let Some(d) = metrics.encode_duration() {
encode_times.push(d);
}
if let Some(d) = metrics.total_duration() {
total_times.push(d);
}
}
let avg = |times: &[Duration]| -> f64 {
if times.is_empty() { return 0.0; }
times.iter().sum::<Duration>().as_secs_f64() / times.len() as f64 * 1000.0
};
println!("Render: {:.2}ms avg", avg(&render_times));
println!("Readback: {:.2}ms avg", avg(&readback_times));
println!("Extraction: {:.2}ms avg", avg(&extraction_times));
println!("Conversion: {:.2}ms avg", avg(&conversion_times));
println!("Encode: {:.2}ms avg", avg(&encode_times));
println!("Total: {:.2}ms avg", avg(&total_times));
let total_export_time = Instant::now().duration_since(self.export_start).as_secs_f64();
let fps = self.frames.len() as f64 / total_export_time;
println!("\nOverall: {:.2} fps ({:.1}s for {} frames)",
fps, total_export_time, self.frames.len());
if self.poll_count > 0 {
let avg_completions = self.completions_per_poll.iter().sum::<usize>() as f64 / self.poll_count as f64;
println!("Polls: {} ({:.2} completions/poll avg)",
self.poll_count, avg_completions);
}
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n");
}
/// Print detailed per-frame breakdown for last N frames
pub fn print_per_frame_details(&self, last_n: usize) {
println!("\n📋 [PERF] Per-Frame Breakdown (last {} frames)", last_n);
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
println!("{:>5} | {:>8} | {:>8} | {:>8} | {:>8} | {:>8} | {:>8}",
"Frame", "Render", "Readback", "Extract", "Convert", "Encode", "Total");
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
let start = if self.frames.len() > last_n {
self.frames.len() - last_n
} else {
0
};
for metrics in &self.frames[start..] {
println!("{:5} | {:>7.2}ms | {:>7.2}ms | {:>7.2}ms | {:>7.2}ms | {:>7.2}ms | {:>7.2}ms",
metrics.frame_num,
metrics.render_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
metrics.readback_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
metrics.extraction_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
metrics.conversion_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
metrics.encode_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
metrics.total_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
);
}
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n");
}
}

View File

@ -0,0 +1,317 @@
//! Async triple-buffered GPU readback pipeline for video export
//!
//! This module implements a pipelined export system that overlaps GPU rendering
//! with CPU encoding to maximize throughput. It uses triple buffering to keep
//! both GPU and CPU busy simultaneously:
//!
//! - Frame N: GPU rendering/conversion
//! - Frame N-1: GPU→CPU async transfer
//! - Frame N-2: CPU encoding
//!
//! Expected speedup: 5x over synchronous blocking approach
use std::sync::mpsc::{channel, Receiver, Sender};
/// Result from a completed async buffer mapping
#[derive(Debug)]
pub struct ReadbackResult {
pub buffer_id: usize,
pub frame_num: usize,
pub timestamp: f64,
}
/// State of a pipeline buffer in the triple-buffering state machine
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum BufferState {
/// Buffer is available for new frame rendering
Free,
/// GPU is currently rendering/converting to this buffer
Rendering,
/// Buffer readback submitted, waiting for GPU→CPU transfer
ReadbackPending,
/// Buffer mapped and ready for CPU to read
Mapped,
/// CPU is encoding this buffer's data
Encoding,
}
/// A single buffer in the triple-buffering pipeline
struct PipelineBuffer {
id: usize,
/// RGBA texture for GPU rendering output (Rgba8Unorm)
rgba_texture: wgpu::Texture,
rgba_texture_view: wgpu::TextureView,
/// Staging buffer for GPU→CPU transfer (MAP_READ)
staging_buffer: wgpu::Buffer,
/// Current state in the pipeline
state: BufferState,
/// Frame metadata (set when rendering starts)
frame_num: Option<usize>,
timestamp: Option<f64>,
}
/// Handle to an acquired buffer for rendering
pub struct AcquiredBuffer {
pub id: usize,
pub rgba_texture_view: wgpu::TextureView,
}
/// Triple-buffered async readback pipeline
///
/// Manages 3 buffers cycling through the pipeline:
/// Free → Rendering → ReadbackPending → Mapped → Encoding → Free
pub struct ReadbackPipeline {
buffers: Vec<PipelineBuffer>,
/// Channel for async map_async callbacks
readback_rx: Receiver<ReadbackResult>,
readback_tx: Sender<ReadbackResult>,
/// wgpu device and queue references (needed for polling and buffer operations)
device: wgpu::Device,
queue: wgpu::Queue,
/// Buffer dimensions
width: u32,
height: u32,
}
impl ReadbackPipeline {
/// Create a new triple-buffered readback pipeline
///
/// # Arguments
/// * `device` - GPU device (will be cloned for async operations)
/// * `queue` - GPU queue (will be cloned for async operations)
/// * `width` - Frame width in pixels
/// * `height` - Frame height in pixels
pub fn new(device: &wgpu::Device, queue: &wgpu::Queue, width: u32, height: u32) -> Self {
let (readback_tx, readback_rx) = channel();
// Create 3 buffers for triple buffering
let mut buffers = Vec::new();
for id in 0..3 {
// RGBA texture (Rgba8Unorm)
let rgba_texture = device.create_texture(&wgpu::TextureDescriptor {
label: Some(&format!("readback_rgba_texture_{}", id)),
size: wgpu::Extent3d {
width,
height,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8Unorm,
usage: wgpu::TextureUsages::RENDER_ATTACHMENT | wgpu::TextureUsages::COPY_SRC,
view_formats: &[],
});
let rgba_texture_view = rgba_texture.create_view(&wgpu::TextureViewDescriptor::default());
// Staging buffer for GPU→CPU readback
let rgba_buffer_size = (width * height * 4) as u64; // Rgba8Unorm = 4 bytes/pixel
let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
label: Some(&format!("readback_staging_buffer_{}", id)),
size: rgba_buffer_size,
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
mapped_at_creation: false,
});
buffers.push(PipelineBuffer {
id,
rgba_texture,
rgba_texture_view,
staging_buffer,
state: BufferState::Free,
frame_num: None,
timestamp: None,
});
}
Self {
buffers,
readback_rx,
readback_tx,
device: device.clone(),
queue: queue.clone(),
width,
height,
}
}
/// Acquire a free buffer for rendering (non-blocking)
///
/// Returns None if all buffers are in use (caller should poll and retry)
pub fn acquire(&mut self, frame_num: usize, timestamp: f64) -> Option<AcquiredBuffer> {
// Find first Free buffer
for buffer in &mut self.buffers {
if buffer.state == BufferState::Free {
buffer.state = BufferState::Rendering;
buffer.frame_num = Some(frame_num);
buffer.timestamp = Some(timestamp);
return Some(AcquiredBuffer {
id: buffer.id,
rgba_texture_view: buffer.rgba_texture_view.clone(),
});
}
}
None // All buffers busy
}
/// Submit GPU commands and initiate async readback
///
/// # Arguments
/// * `buffer_id` - ID of the buffer to submit (from AcquiredBuffer)
/// * `encoder` - Command encoder with rendering commands
pub fn submit_and_readback(&mut self, buffer_id: usize, mut encoder: wgpu::CommandEncoder) {
let buffer = &mut self.buffers[buffer_id];
assert_eq!(buffer.state, BufferState::Rendering, "Buffer not in Rendering state");
// Copy RGBA texture to staging buffer
encoder.copy_texture_to_buffer(
wgpu::TexelCopyTextureInfo {
texture: &buffer.rgba_texture,
mip_level: 0,
origin: wgpu::Origin3d::ZERO,
aspect: wgpu::TextureAspect::All,
},
wgpu::TexelCopyBufferInfo {
buffer: &buffer.staging_buffer,
layout: wgpu::TexelCopyBufferLayout {
offset: 0,
// NOTE: assumes width * 4 is a multiple of wgpu's 256-byte
// COPY_BYTES_PER_ROW_ALIGNMENT (holds for common widths like 1920)
bytes_per_row: Some(self.width * 4), // Rgba8Unorm = 4 bytes/pixel
rows_per_image: Some(self.height),
},
},
wgpu::Extent3d {
width: self.width,
height: self.height,
depth_or_array_layers: 1,
},
);
// Submit GPU commands (non-blocking)
self.queue.submit(Some(encoder.finish()));
// Initiate async buffer mapping
let frame_num = buffer.frame_num.unwrap();
let timestamp = buffer.timestamp.unwrap();
let tx = self.readback_tx.clone();
buffer.staging_buffer.slice(..).map_async(wgpu::MapMode::Read, move |result| {
match result {
Ok(()) => {
let _ = tx.send(ReadbackResult {
buffer_id,
frame_num,
timestamp,
});
}
// A mapping failure would otherwise drop the frame silently and
// stall the pipeline waiting for it; log it so the hang is diagnosable
Err(e) => eprintln!("readback map_async failed for frame {}: {:?}", frame_num, e),
}
});
buffer.state = BufferState::ReadbackPending;
}
/// Poll for completed readbacks (non-blocking)
///
/// Returns list of buffers that are now ready for CPU encoding.
/// Call this frequently to process completed transfers.
pub fn poll_nonblocking(&mut self) -> Vec<ReadbackResult> {
// Poll GPU without blocking
self.device.poll(wgpu::PollType::Poll);
// Collect all completed readbacks
let mut results = Vec::new();
while let Ok(result) = self.readback_rx.try_recv() {
// Update buffer state to Mapped
self.buffers[result.buffer_id].state = BufferState::Mapped;
results.push(result);
}
results
}
/// Extract RGBA data from mapped buffer (for CPU YUV conversion)
///
/// Buffer must be in Mapped state (after poll_nonblocking returned it).
/// This immediately copies the RGBA data, allowing the buffer to be released.
pub fn extract_rgba_data(&mut self, buffer_id: usize) -> Vec<u8> {
let buffer = &mut self.buffers[buffer_id];
assert_eq!(buffer.state, BufferState::Mapped, "Buffer not in Mapped state");
buffer.state = BufferState::Encoding;
// The buffer was already mapped by the async callback; read the mapped
// range and copy the RGBA bytes out for CPU-side conversion
let slice = buffer.staging_buffer.slice(..);
let data = slice.get_mapped_range();
data.to_vec()
}
/// Release buffer after encoding completes, returning it to the free pool
///
/// # Arguments
/// * `buffer_id` - ID of buffer to release
pub fn release(&mut self, buffer_id: usize) {
let buffer = &mut self.buffers[buffer_id];
assert_eq!(buffer.state, BufferState::Encoding, "Buffer not in Encoding state");
// Unmap buffer
buffer.staging_buffer.unmap();
// Clear metadata
buffer.frame_num = None;
buffer.timestamp = None;
// Return to free pool
buffer.state = BufferState::Free;
}
/// Flush pipeline and wait for all pending operations
///
/// Call this at the end of export to ensure all frames are processed
pub fn flush(&mut self) -> Vec<ReadbackResult> {
let mut all_results = Vec::new();
// Keep polling until all buffers are Free
loop {
// Poll for new completions
self.device.poll(wgpu::PollType::Poll);
while let Ok(result) = self.readback_rx.try_recv() {
self.buffers[result.buffer_id].state = BufferState::Mapped;
all_results.push(result);
}
// Check if all buffers are Free (or can be made Free)
let mut all_free = true;
for buffer in &self.buffers {
match buffer.state {
BufferState::Free => {},
BufferState::Rendering | BufferState::ReadbackPending => {
all_free = false;
break;
},
BufferState::Mapped | BufferState::Encoding => {
// These should be handled by the caller, shouldn't happen during flush
panic!("Buffer in {} state during flush - caller should encode and release",
if buffer.state == BufferState::Mapped { "Mapped" } else { "Encoding" });
}
}
}
if all_free {
break;
}
// Small sleep to avoid busy-waiting
std::thread::sleep(std::time::Duration::from_millis(1));
}
all_results
}
/// Get buffer count currently in flight (for monitoring)
pub fn buffers_in_flight(&self) -> usize {
self.buffers.iter().filter(|b| b.state != BufferState::Free).count()
}
}
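A condensed usage sketch of the intended driver loop (the production loop lives in `render_next_video_frame`; here `render_into` and `encode` are hypothetical stand-ins for frame rendering and encoder hand-off):

```
// Sketch: saturating the triple-buffered pipeline (stand-in closures).
fn pump(
    device: &wgpu::Device,
    pipeline: &mut ReadbackPipeline,
    total_frames: usize,
    framerate: f64,
    mut render_into: impl FnMut(&wgpu::TextureView, &mut wgpu::CommandEncoder),
    mut encode: impl FnMut(usize, f64, Vec<u8>),
) {
    let (mut submitted, mut in_flight) = (0usize, 0usize);
    while submitted < total_frames || in_flight > 0 {
        // Drain completed GPU→CPU transfers first.
        for done in pipeline.poll_nonblocking() {
            let rgba = pipeline.extract_rgba_data(done.buffer_id); // Mapped → Encoding
            encode(done.frame_num, done.timestamp, rgba);
            pipeline.release(done.buffer_id); // Encoding → Free (unmaps)
            in_flight -= 1;
        }
        // Keep up to three frames in flight (one per buffer).
        while submitted < total_frames {
            let timestamp = submitted as f64 / framerate;
            let Some(buf) = pipeline.acquire(submitted, timestamp) else { break };
            let mut enc = device.create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
            render_into(&buf.rgba_texture_view, &mut enc);
            pipeline.submit_and_readback(buf.id, enc); // copies to staging + map_async
            submitted += 1;
            in_flight += 1;
        }
    }
}
```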

View File

@ -12,7 +12,7 @@ use lightningbeam_core::renderer::{ImageCache, render_document_for_compositing,
use lightningbeam_core::video::VideoManager;
use lightningbeam_core::gpu::{
BufferPool, BufferSpec, BufferFormat, Compositor, CompositorLayer,
SrgbToLinearConverter, EffectProcessor, HDR_FORMAT,
SrgbToLinearConverter, EffectProcessor, YuvConverter, HDR_FORMAT,
};
/// Reusable frame buffers to avoid allocations
@ -56,10 +56,22 @@ pub struct ExportGpuResources {
pub srgb_to_linear: SrgbToLinearConverter,
/// Effect processor for shader effects
pub effect_processor: EffectProcessor,
/// GPU-accelerated RGBA to YUV420p converter
pub yuv_converter: YuvConverter,
/// HDR accumulator texture for compositing
pub hdr_texture: wgpu::Texture,
/// View for HDR texture
pub hdr_texture_view: wgpu::TextureView,
/// Persistent RGBA output texture (sRGB, reused for all frames)
pub output_texture: wgpu::Texture,
/// View for persistent output texture
pub output_texture_view: wgpu::TextureView,
/// Persistent YUV texture for GPU conversion (R8Unorm, height*1.5, reused for all frames)
pub yuv_texture: wgpu::Texture,
/// View for persistent YUV texture
pub yuv_texture_view: wgpu::TextureView,
/// Persistent staging buffer for GPU→CPU readback (reused for all frames)
pub staging_buffer: wgpu::Buffer,
/// Linear to sRGB blit pipeline for final output
pub linear_to_srgb_pipeline: wgpu::RenderPipeline,
/// Bind group layout for linear to sRGB blit
@ -75,6 +87,7 @@ impl ExportGpuResources {
let compositor = Compositor::new(device, HDR_FORMAT);
let srgb_to_linear = SrgbToLinearConverter::new(device);
let effect_processor = EffectProcessor::new(device, HDR_FORMAT);
let yuv_converter = YuvConverter::new(device);
// Create HDR accumulator texture
let hdr_texture = device.create_texture(&wgpu::TextureDescriptor {
@ -95,6 +108,53 @@ impl ExportGpuResources {
});
let hdr_texture_view = hdr_texture.create_view(&wgpu::TextureViewDescriptor::default());
// Create persistent RGBA output texture (sRGB, reused for all frames)
let output_texture = device.create_texture(&wgpu::TextureDescriptor {
label: Some("export_output_texture"),
size: wgpu::Extent3d {
width,
height,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8Unorm,
usage: wgpu::TextureUsages::RENDER_ATTACHMENT
| wgpu::TextureUsages::TEXTURE_BINDING
| wgpu::TextureUsages::COPY_SRC,
view_formats: &[],
});
let output_texture_view = output_texture.create_view(&wgpu::TextureViewDescriptor::default());
// Create persistent YUV texture (Rgba8Unorm, height*1.5 for packed Y+U+V planes)
// Note: Using Rgba8Unorm instead of R8Unorm because R8Unorm doesn't support STORAGE_BINDING
let yuv_height = height + height / 2; // Y plane + U plane + V plane
let yuv_texture = device.create_texture(&wgpu::TextureDescriptor {
label: Some("export_yuv_texture"),
size: wgpu::Extent3d {
width,
height: yuv_height,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8Unorm,
usage: wgpu::TextureUsages::STORAGE_BINDING | wgpu::TextureUsages::COPY_SRC,
view_formats: &[],
});
let yuv_texture_view = yuv_texture.create_view(&wgpu::TextureViewDescriptor::default());
// Create persistent staging buffer for GPU→CPU readback
let yuv_buffer_size = (width * yuv_height * 4) as u64; // Rgba8Unorm = 4 bytes per pixel
let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
label: Some("export_staging_buffer"),
size: yuv_buffer_size,
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
mapped_at_creation: false,
});
// Create linear to sRGB blit pipeline
let linear_to_srgb_bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: Some("linear_to_srgb_bind_group_layout"),
@ -179,8 +239,14 @@ impl ExportGpuResources {
compositor,
srgb_to_linear,
effect_processor,
yuv_converter,
hdr_texture,
hdr_texture_view,
output_texture,
output_texture_view,
yuv_texture,
yuv_texture_view,
staging_buffer,
linear_to_srgb_pipeline,
linear_to_srgb_bind_group_layout,
linear_to_srgb_sampler,
@ -476,20 +542,11 @@ pub fn receive_and_write_packets(
let encoder_tb = encoder.time_base();
let stream_tb = output.stream(0).ok_or("No output stream found")?.time_base();
println!("🎬 [PACKET] Encoder TB: {}/{}, Stream TB: {}/{}",
encoder_tb.0, encoder_tb.1, stream_tb.0, stream_tb.1);
while encoder.receive_packet(&mut encoded).is_ok() {
println!("🎬 [PACKET] Before rescale - PTS: {:?}, DTS: {:?}, Duration: {:?}",
encoded.pts(), encoded.dts(), encoded.duration());
encoded.set_stream(0);
// Rescale timestamps from encoder time base to stream time base
encoded.rescale_ts(encoder_tb, stream_tb);
println!("🎬 [PACKET] After rescale - PTS: {:?}, DTS: {:?}, Duration: {:?}",
encoded.pts(), encoded.dts(), encoded.duration());
encoded
.write_interleaved(output)
.map_err(|e| format!("Failed to write packet: {}", e))?;
@ -660,10 +717,9 @@ pub fn render_frame_to_rgba(
/// * `image_cache` - Image cache for rendering
/// * `video_manager` - Video manager for video clips
/// * `gpu_resources` - HDR GPU resources for compositing
/// * `rgba_buffer` - Output buffer for RGBA pixels (must be width * height * 4 bytes)
///
/// # Returns
/// Ok(()) on success, Err with message on failure
/// Ok((y_plane, u_plane, v_plane)) with YUV420p planes on success, Err with message on failure
pub fn render_frame_to_rgba_hdr(
document: &mut Document,
timestamp: f64,
@ -675,8 +731,7 @@ pub fn render_frame_to_rgba_hdr(
image_cache: &mut ImageCache,
video_manager: &Arc<std::sync::Mutex<VideoManager>>,
gpu_resources: &mut ExportGpuResources,
rgba_buffer: &mut [u8],
) -> Result<(), String> {
) -> Result<(Vec<u8>, Vec<u8>, Vec<u8>), String> {
use vello::kurbo::Affine;
// Set document time to the frame timestamp
@ -879,22 +934,8 @@ pub fn render_frame_to_rgba_hdr(
// Advance frame counter for buffer cleanup
gpu_resources.buffer_pool.next_frame();
// Create output texture for final sRGB output
let output_texture = device.create_texture(&wgpu::TextureDescriptor {
label: Some("export_output_texture"),
size: wgpu::Extent3d {
width,
height,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8Unorm,
usage: wgpu::TextureUsages::RENDER_ATTACHMENT | wgpu::TextureUsages::COPY_SRC,
view_formats: &[],
});
let output_view = output_texture.create_view(&wgpu::TextureViewDescriptor::default());
// Use persistent output texture (already created in ExportGpuResources)
let output_view = &gpu_resources.output_texture_view;
// Convert HDR to sRGB for output
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
@ -940,52 +981,48 @@ pub fn render_frame_to_rgba_hdr(
queue.submit(Some(encoder.finish()));
// GPU readback: Create staging buffer with proper alignment
let bytes_per_pixel = 4u32; // RGBA8
let bytes_per_row_alignment = 256u32;
let unpadded_bytes_per_row = width * bytes_per_pixel;
let bytes_per_row = ((unpadded_bytes_per_row + bytes_per_row_alignment - 1)
/ bytes_per_row_alignment) * bytes_per_row_alignment;
let buffer_size = (bytes_per_row * height) as u64;
let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
label: Some("export_staging_buffer"),
size: buffer_size,
usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
// GPU YUV conversion: Convert RGBA output to YUV420p
let mut yuv_encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("export_yuv_conversion_encoder"),
});
// Copy texture to staging buffer
let mut copy_encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("export_copy_encoder"),
});
gpu_resources.yuv_converter.convert(
device,
&mut yuv_encoder,
output_view,
&gpu_resources.yuv_texture_view,
width,
height,
);
copy_encoder.copy_texture_to_buffer(
// Copy YUV texture to persistent staging buffer
let yuv_height = height + height / 2; // Y plane + U plane + V plane
yuv_encoder.copy_texture_to_buffer(
wgpu::TexelCopyTextureInfo {
texture: &output_texture,
texture: &gpu_resources.yuv_texture,
mip_level: 0,
origin: wgpu::Origin3d::ZERO,
aspect: wgpu::TextureAspect::All,
},
wgpu::TexelCopyBufferInfo {
buffer: &staging_buffer,
buffer: &gpu_resources.staging_buffer,
layout: wgpu::TexelCopyBufferLayout {
offset: 0,
bytes_per_row: Some(bytes_per_row),
rows_per_image: Some(height),
// NOTE: assumes width * 4 meets wgpu's 256-byte COPY_BYTES_PER_ROW_ALIGNMENT
bytes_per_row: Some(width * 4), // Rgba8Unorm = 4 bytes per pixel
rows_per_image: Some(yuv_height),
},
},
wgpu::Extent3d {
width,
height,
height: yuv_height,
depth_or_array_layers: 1,
},
);
queue.submit(Some(copy_encoder.finish()));
queue.submit(Some(yuv_encoder.finish()));
// Map buffer and read pixels (synchronous)
let buffer_slice = staging_buffer.slice(..);
// Map buffer and read YUV pixels (synchronous)
let buffer_slice = gpu_resources.staging_buffer.slice(..);
let (sender, receiver) = std::sync::mpsc::channel();
buffer_slice.map_async(wgpu::MapMode::Read, move |result| {
sender.send(result).ok();
@ -998,20 +1035,319 @@ pub fn render_frame_to_rgba_hdr(
.map_err(|_| "Failed to receive buffer mapping result")?
.map_err(|e| format!("Failed to map buffer: {:?}", e))?;
// Copy data from mapped buffer to output, removing padding
// Extract Y, U, V planes from packed YUV buffer
let data = buffer_slice.get_mapped_range();
for y in 0..height as usize {
let src_offset = y * bytes_per_row as usize;
let dst_offset = y * unpadded_bytes_per_row as usize;
let row_bytes = unpadded_bytes_per_row as usize;
rgba_buffer[dst_offset..dst_offset + row_bytes]
.copy_from_slice(&data[src_offset..src_offset + row_bytes]);
let width_usize = width as usize;
let height_usize = height as usize;
// Y plane: rows 0 to height-1 (extract R channel from Rgba8Unorm)
let y_plane_size = width_usize * height_usize;
let mut y_plane = vec![0u8; y_plane_size];
for y in 0..height_usize {
let src_row_offset = y * width_usize * 4; // 4 bytes per pixel (Rgba8Unorm)
let dst_row_offset = y * width_usize;
for x in 0..width_usize {
y_plane[dst_row_offset + x] = data[src_row_offset + x * 4]; // Extract R channel
}
}
// U and V planes: rows height to height + height/2 - 1 (half resolution, side-by-side layout)
// U plane is in left half (columns 0 to width/2-1), V plane is in right half (columns width/2 to width-1)
let chroma_width = width_usize / 2;
let chroma_height = height_usize / 2;
let chroma_row_start = height_usize * width_usize * 4; // Start of chroma rows in bytes
let mut u_plane = vec![0u8; chroma_width * chroma_height];
let mut v_plane = vec![0u8; chroma_width * chroma_height];
for y in 0..chroma_height {
let row_offset = chroma_row_start + y * width_usize * 4; // Full width rows in chroma region
// Extract U plane (left half: columns 0 to chroma_width-1)
let u_start = row_offset;
let dst_offset = y * chroma_width;
for x in 0..chroma_width {
u_plane[dst_offset + x] = data[u_start + x * 4]; // Extract R channel
}
// Extract V plane (right half: columns width/2 to width/2+chroma_width-1)
let v_start = row_offset + chroma_width * 4;
for x in 0..chroma_width {
v_plane[dst_offset + x] = data[v_start + x * 4]; // Extract R channel
}
}
drop(data);
staging_buffer.unmap();
gpu_resources.staging_buffer.unmap();
Ok(())
Ok((y_plane, u_plane, v_plane))
}
/// Render frame to GPU RGBA texture (non-blocking, for async pipeline)
///
/// Similar to render_frame_to_rgba_hdr but renders to an external RGBA texture view
/// (provided by ReadbackPipeline) and returns the command encoder WITHOUT blocking on readback.
/// The caller (ReadbackPipeline) will submit the encoder and handle async readback.
///
/// # Arguments
/// * `document` - Document to render
/// * `timestamp` - Time in seconds to render at
/// * `width` - Frame width in pixels
/// * `height` - Frame height in pixels
/// * `device` - wgpu device
/// * `queue` - wgpu queue
/// * `renderer` - Vello renderer
/// * `image_cache` - Image cache for rendering
/// * `video_manager` - Video manager for video clips
/// * `gpu_resources` - HDR GPU resources for compositing
/// * `rgba_texture_view` - External RGBA texture view (from ReadbackPipeline)
///
/// # Returns
/// Command encoder ready for submission (caller submits via ReadbackPipeline)
pub fn render_frame_to_gpu_rgba(
document: &mut Document,
timestamp: f64,
width: u32,
height: u32,
device: &wgpu::Device,
queue: &wgpu::Queue,
renderer: &mut vello::Renderer,
image_cache: &mut ImageCache,
video_manager: &Arc<std::sync::Mutex<VideoManager>>,
gpu_resources: &mut ExportGpuResources,
rgba_texture_view: &wgpu::TextureView,
) -> Result<wgpu::CommandEncoder, String> {
use vello::kurbo::Affine;
// Set document time to the frame timestamp
document.current_time = timestamp;
// Use identity transform for export (document coordinates = pixel coordinates)
let base_transform = Affine::IDENTITY;
// Render document for compositing (returns per-layer scenes)
let composite_result = render_document_for_compositing(
document,
base_transform,
image_cache,
video_manager,
);
// Buffer specs for layer rendering
let layer_spec = BufferSpec::new(width, height, BufferFormat::Rgba8Srgb);
let hdr_spec = BufferSpec::new(width, height, BufferFormat::Rgba16Float);
// Render parameters for Vello (transparent background for layers)
let layer_render_params = vello::RenderParams {
base_color: vello::peniko::Color::TRANSPARENT,
width,
height,
antialiasing_method: vello::AaConfig::Area,
};
// Render background and composite it
let bg_srgb_handle = gpu_resources.buffer_pool.acquire(device, layer_spec);
let bg_hdr_handle = gpu_resources.buffer_pool.acquire(device, hdr_spec);
if let (Some(bg_srgb_view), Some(bg_hdr_view)) = (
gpu_resources.buffer_pool.get_view(bg_srgb_handle),
gpu_resources.buffer_pool.get_view(bg_hdr_handle),
) {
renderer.render_to_texture(device, queue, &composite_result.background, bg_srgb_view, &layer_render_params)
.map_err(|e| format!("Failed to render background: {}", e))?;
let mut convert_encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("export_bg_srgb_to_linear_encoder"),
});
gpu_resources.srgb_to_linear.convert(device, &mut convert_encoder, bg_srgb_view, bg_hdr_view);
queue.submit(Some(convert_encoder.finish()));
let bg_compositor_layer = CompositorLayer::normal(bg_hdr_handle, 1.0);
let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("export_bg_composite_encoder"),
});
gpu_resources.compositor.composite(
device,
queue,
&mut encoder,
&[bg_compositor_layer],
&gpu_resources.buffer_pool,
&gpu_resources.hdr_texture_view,
Some([0.0, 0.0, 0.0, 1.0]),
);
queue.submit(Some(encoder.finish()));
}
gpu_resources.buffer_pool.release(bg_srgb_handle);
gpu_resources.buffer_pool.release(bg_hdr_handle);
// Render and composite each layer incrementally
for rendered_layer in &composite_result.layers {
if !rendered_layer.has_content {
continue;
}
match &rendered_layer.layer_type {
RenderedLayerType::Content => {
let srgb_handle = gpu_resources.buffer_pool.acquire(device, layer_spec);
let hdr_layer_handle = gpu_resources.buffer_pool.acquire(device, hdr_spec);
if let (Some(srgb_view), Some(hdr_layer_view)) = (
gpu_resources.buffer_pool.get_view(srgb_handle),
gpu_resources.buffer_pool.get_view(hdr_layer_handle),
) {
renderer.render_to_texture(device, queue, &rendered_layer.scene, srgb_view, &layer_render_params)
.map_err(|e| format!("Failed to render layer: {}", e))?;
let mut convert_encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("export_layer_srgb_to_linear_encoder"),
});
gpu_resources.srgb_to_linear.convert(device, &mut convert_encoder, srgb_view, hdr_layer_view);
queue.submit(Some(convert_encoder.finish()));
let compositor_layer = CompositorLayer::normal(hdr_layer_handle, rendered_layer.opacity);
let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("export_layer_composite_encoder"),
});
gpu_resources.compositor.composite(
device,
queue,
&mut encoder,
&[compositor_layer],
&gpu_resources.buffer_pool,
&gpu_resources.hdr_texture_view,
None,
);
queue.submit(Some(encoder.finish()));
}
gpu_resources.buffer_pool.release(srgb_handle);
gpu_resources.buffer_pool.release(hdr_layer_handle);
}
RenderedLayerType::Effect { effect_instances } => {
// Effect layer - apply effects to the current HDR accumulator
let current_time = document.current_time;
for effect_instance in effect_instances {
// Get effect definition from document
let Some(effect_def) = document.get_effect_definition(&effect_instance.clip_id) else {
continue;
};
// Compile effect if needed
if !gpu_resources.effect_processor.is_compiled(&effect_def.id) {
let success = gpu_resources.effect_processor.compile_effect(device, effect_def);
if !success {
eprintln!("Failed to compile effect: {}", effect_def.name);
continue;
}
}
// Create EffectInstance from ClipInstance for the processor
let effect_inst = lightningbeam_core::effect::EffectInstance::new(
effect_def,
effect_instance.timeline_start,
effect_instance.timeline_start + effect_instance.effective_duration(lightningbeam_core::effect::EFFECT_DURATION),
);
// Acquire temp buffer for effect output (HDR format)
let effect_output_handle = gpu_resources.buffer_pool.acquire(device, hdr_spec);
if let Some(effect_output_view) = gpu_resources.buffer_pool.get_view(effect_output_handle) {
let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("export_effect_encoder"),
});
// Apply effect: HDR accumulator → effect output buffer
let applied = gpu_resources.effect_processor.apply_effect(
device,
queue,
&mut encoder,
effect_def,
&effect_inst,
&gpu_resources.hdr_texture_view,
effect_output_view,
width,
height,
current_time,
);
if applied {
// Copy effect output back to HDR accumulator
encoder.copy_texture_to_texture(
wgpu::TexelCopyTextureInfo {
texture: gpu_resources.buffer_pool.get_texture(effect_output_handle).unwrap(),
mip_level: 0,
origin: wgpu::Origin3d::ZERO,
aspect: wgpu::TextureAspect::All,
},
wgpu::TexelCopyTextureInfo {
texture: &gpu_resources.hdr_texture,
mip_level: 0,
origin: wgpu::Origin3d::ZERO,
aspect: wgpu::TextureAspect::All,
},
wgpu::Extent3d {
width,
height,
depth_or_array_layers: 1,
},
);
}
queue.submit(Some(encoder.finish()));
}
gpu_resources.buffer_pool.release(effect_output_handle);
}
}
}
}
// Convert HDR to sRGB (linear → sRGB), render directly to external RGBA texture
let output_view = rgba_texture_view;
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
label: Some("export_linear_to_srgb_bind_group"),
layout: &gpu_resources.linear_to_srgb_bind_group_layout,
entries: &[
wgpu::BindGroupEntry {
binding: 0,
resource: wgpu::BindingResource::TextureView(&gpu_resources.hdr_texture_view),
},
wgpu::BindGroupEntry {
binding: 1,
resource: wgpu::BindingResource::Sampler(&gpu_resources.linear_to_srgb_sampler),
},
],
});
let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("export_linear_to_srgb_encoder"),
});
{
let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
label: Some("export_linear_to_srgb_pass"),
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
view: output_view,
resolve_target: None,
ops: wgpu::Operations {
load: wgpu::LoadOp::Clear(wgpu::Color::BLACK),
store: wgpu::StoreOp::Store,
},
depth_slice: None,
})],
depth_stencil_attachment: None,
occlusion_query_set: None,
timestamp_writes: None,
});
render_pass.set_pipeline(&gpu_resources.linear_to_srgb_pipeline);
render_pass.set_bind_group(0, &bind_group, &[]);
render_pass.draw(0..4, 0..1);
}
// Return encoder for caller to submit (ReadbackPipeline will handle submission and async readback)
// Frame is already rendered to external RGBA texture, no GPU YUV conversion needed
Ok(encoder)
}
#[cfg(test)]

View File

@ -2936,7 +2936,6 @@ impl eframe::App for EditorApp {
}
}
if let Some(progress) = orchestrator.poll_progress() {
println!("📨 [MAIN] Received progress from orchestrator!");
match progress {
lightningbeam_core::export::ExportProgress::Started { total_frames } => {
println!("Export started: {} frames", total_frames);