slightly improve video export speed

This commit is contained in:
Skyler Lehmkuhl 2025-12-12 11:12:02 -05:00
parent d94ec0d6a8
commit cb62d0ee9d
10 changed files with 1542 additions and 142 deletions

View File

@ -9,15 +9,18 @@ members = [
# UI Framework (using eframe for simplified integration)
# Note: Upgraded from 0.29 to 0.31 to fix Linux IME/keyboard input issues
# See: https://github.com/emilk/egui/pull/5198
eframe = { version = "0.31", default-features = true, features = ["wgpu"] }
egui_extras = { version = "0.31", features = ["image", "svg"] }
egui-wgpu = "0.31"
# Upgraded to 0.33 for shader editor (egui_code_editor) and continued bug fixes
egui = "0.33"
eframe = { version = "0.33", default-features = true, features = ["wgpu"] }
egui_extras = { version = "0.33", features = ["image", "svg", "syntect"] }
egui-wgpu = "0.33"
egui_code_editor = "0.2"
# GPU Rendering
# vello 0.5 uses wgpu 24, matching eframe 0.31
vello = "0.5"
wgpu = "24"
kurbo = { version = "0.11", features = ["serde"] }
# vello from git uses wgpu 27, matching eframe 0.33
vello = { git = "https://github.com/linebender/vello", branch = "main" }
wgpu = { version = "27", features = ["vulkan", "metal"] }
kurbo = { version = "0.12", features = ["serde"] }
peniko = "0.5"
# Windowing

View File

@ -10,12 +10,14 @@ pub mod buffer_pool;
pub mod color_convert;
pub mod compositor;
pub mod effect_processor;
pub mod yuv_converter;
// Re-export commonly used types
pub use buffer_pool::{BufferHandle, BufferPool, BufferSpec, BufferFormat};
pub use color_convert::SrgbToLinearConverter;
pub use compositor::{Compositor, CompositorLayer, BlendMode};
pub use effect_processor::{EffectProcessor, EffectUniforms};
pub use yuv_converter::YuvConverter;
/// Standard HDR internal texture format (16-bit float per channel)
pub const HDR_FORMAT: wgpu::TextureFormat = wgpu::TextureFormat::Rgba16Float;

View File

@ -0,0 +1,241 @@
//! GPU-accelerated RGBA to YUV420p color space conversion
//!
//! Provides a compute shader-based converter for transforming RGBA textures
//! to YUV420p planar format using the BT.709 color matrix (HD video standard).
//! This replaces the CPU-based conversion with GPU parallel processing.
/// GPU pipeline for RGBA to YUV420p color space conversion
///
/// Converts Rgba8Unorm textures to YUV420p planar format using BT.709 colorspace.
/// The Y plane is full resolution, while U and V planes are subsampled 4:2:0.
///
/// Output texture layout (U and V packed side-by-side):
/// - Rows 0 to height-1: Y plane (luma, full resolution)
/// - Rows height to height + height/2 - 1: chroma rows, with the U plane in
///   the left half (columns 0 to width/2-1) and the V plane in the right
///   half (columns width/2 to width-1), both at half resolution
pub struct YuvConverter {
pipeline: wgpu::ComputePipeline,
bind_group_layout: wgpu::BindGroupLayout,
}
impl YuvConverter {
/// Create a new RGBA to YUV420p converter
pub fn new(device: &wgpu::Device) -> Self {
// Create bind group layout
let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: Some("yuv_converter_bind_group_layout"),
entries: &[
// Input RGBA texture (binding 0)
wgpu::BindGroupLayoutEntry {
binding: 0,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Texture {
sample_type: wgpu::TextureSampleType::Float { filterable: false },
view_dimension: wgpu::TextureViewDimension::D2,
multisampled: false,
},
count: None,
},
// Output YUV texture (Rgba8Unorm storage texture, binding 1)
// Note: R8Unorm doesn't support storage binding, so we use Rgba8Unorm and write to .r channel
wgpu::BindGroupLayoutEntry {
binding: 1,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::StorageTexture {
access: wgpu::StorageTextureAccess::WriteOnly,
format: wgpu::TextureFormat::Rgba8Unorm,
view_dimension: wgpu::TextureViewDimension::D2,
},
count: None,
},
],
});
// Create pipeline layout
let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
label: Some("yuv_converter_pipeline_layout"),
bind_group_layouts: &[&bind_group_layout],
push_constant_ranges: &[],
});
// Create shader module
let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
label: Some("yuv_converter_shader"),
source: wgpu::ShaderSource::Wgsl(YUV_CONVERTER_SHADER.into()),
});
// Create compute pipeline
let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
label: Some("yuv_converter_pipeline"),
layout: Some(&pipeline_layout),
module: &shader,
entry_point: Some("main"),
compilation_options: wgpu::PipelineCompilationOptions::default(),
cache: None,
});
Self {
pipeline,
bind_group_layout,
}
}
/// Convert RGBA texture to YUV420p planar format
///
/// Reads from `rgba_view` and writes Y, U, V planes to `yuv_output_view`.
/// The output texture must be Rgba8Unorm (values land in the .r channel)
/// with height = input_height * 1.5 to accommodate the packed YUV planes.
///
/// # Arguments
/// * `device` - GPU device
/// * `encoder` - Command encoder to record GPU commands
/// * `rgba_view` - Source RGBA texture view
/// * `yuv_output_view` - Destination YUV planar texture view (Rgba8Unorm, height*1.5)
/// * `width` - Width of the source RGBA texture
/// * `height` - Height of the source RGBA texture
pub fn convert(
&self,
device: &wgpu::Device,
encoder: &mut wgpu::CommandEncoder,
rgba_view: &wgpu::TextureView,
yuv_output_view: &wgpu::TextureView,
width: u32,
height: u32,
) {
// Create bind group for this conversion
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
label: Some("yuv_converter_bind_group"),
layout: &self.bind_group_layout,
entries: &[
wgpu::BindGroupEntry {
binding: 0,
resource: wgpu::BindingResource::TextureView(rgba_view),
},
wgpu::BindGroupEntry {
binding: 1,
resource: wgpu::BindingResource::TextureView(yuv_output_view),
},
],
});
// Compute pass
let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
label: Some("yuv_conversion_pass"),
timestamp_writes: None,
});
compute_pass.set_pipeline(&self.pipeline);
compute_pass.set_bind_group(0, &bind_group, &[]);
// Dispatch workgroups: 8x8 threads per workgroup
// Each thread processes one pixel for the Y plane
// Chroma planes are processed by threads at even coordinates
let workgroup_size = 8;
let workgroups_x = (width + workgroup_size - 1) / workgroup_size;
let workgroups_y = (height + workgroup_size - 1) / workgroup_size;
compute_pass.dispatch_workgroups(workgroups_x, workgroups_y, 1);
}
}
/// WGSL compute shader for RGBA to YUV420p conversion
const YUV_CONVERTER_SHADER: &str = r#"
// RGBA to YUV420p Compute Shader
// BT.709 color space for HD video (ITU-R BT.709-6 standard)
//
// Color matrix:
// Y = 0.2126*R + 0.7152*G + 0.0722*B
// U = -0.1146*R - 0.3854*G + 0.5000*B + 0.5
// V = 0.5000*R - 0.4542*G - 0.0458*B + 0.5
//
// Output texture layout (packed planar, side-by-side U/V):
// - Rows [0, height): Y plane (full resolution, full width)
// - Rows [height, height + height/2): U plane (left half, columns 0 to width/2-1)
// V plane (right half, columns width/2 to width-1)
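//
// Worked example (illustrative, for a 1920x1080 input): the pixel at
// (100, 202) writes its luma to texel (100, 202); its 2x2 block maps to
// chroma_x = 50, chroma_y = 101, so U is written at (50, 1080 + 101) =
// (50, 1181) in the left half and V at (1920/2 + 50, 1181) = (1010, 1181)
// in the right half.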
@group(0) @binding(0) var input_rgba: texture_2d<f32>;
@group(0) @binding(1) var output_yuv: texture_storage_2d<rgba8unorm, write>;
@compute @workgroup_size(8, 8, 1)
fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
let dims = textureDimensions(input_rgba);
let pos = global_id.xy;
// Bounds check
if (pos.x >= dims.x || pos.y >= dims.y) {
return;
}
// Load RGBA pixel
let rgba = textureLoad(input_rgba, pos, 0);
let r = rgba.r;
let g = rgba.g;
let b = rgba.b;
// Compute Y (luma) - full resolution, BT.709
let y = 0.2126 * r + 0.7152 * g + 0.0722 * b;
// Write Y value to Y plane (rows 0 to height-1)
textureStore(output_yuv, pos, vec4<f32>(y, 0.0, 0.0, 0.0));
// Compute U and V (chroma) - subsampled 4:2:0
// Only process even coordinates (top-left of 2x2 blocks)
if (pos.x % 2u == 0u && pos.y % 2u == 0u) {
// Sample 2x2 block for chroma subsampling
var r_sum = r;
var g_sum = g;
var b_sum = b;
var count = 1.0;
// Sample right neighbor (x+1, y)
if (pos.x + 1u < dims.x) {
let rgba_r = textureLoad(input_rgba, pos + vec2<u32>(1u, 0u), 0);
r_sum += rgba_r.r;
g_sum += rgba_r.g;
b_sum += rgba_r.b;
count += 1.0;
}
// Sample bottom neighbor (x, y+1)
if (pos.y + 1u < dims.y) {
let rgba_b = textureLoad(input_rgba, pos + vec2<u32>(0u, 1u), 0);
r_sum += rgba_b.r;
g_sum += rgba_b.g;
b_sum += rgba_b.b;
count += 1.0;
}
// Sample bottom-right neighbor (x+1, y+1)
if (pos.x + 1u < dims.x && pos.y + 1u < dims.y) {
let rgba_br = textureLoad(input_rgba, pos + vec2<u32>(1u, 1u), 0);
r_sum += rgba_br.r;
g_sum += rgba_br.g;
b_sum += rgba_br.b;
count += 1.0;
}
// Average the 2x2 block
let r_avg = r_sum / count;
let g_avg = g_sum / count;
let b_avg = b_sum / count;
// Compute chroma components (BT.709, centered at 0.5 for unsigned 8-bit)
let u = -0.1146 * r_avg - 0.3854 * g_avg + 0.5000 * b_avg + 0.5;
let v = 0.5000 * r_avg - 0.4542 * g_avg - 0.0458 * b_avg + 0.5;
// Compute chroma plane positions (half resolution)
// Pack U and V side-by-side: U on left half, V on right half
let chroma_x = pos.x / 2u;
let chroma_y = pos.y / 2u;
// U plane: left half (columns 0 to width/2-1), rows height to height+height/2-1
let u_pos = vec2<u32>(chroma_x, dims.y + chroma_y);
// V plane: right half (columns width/2 to width-1), rows height to height+height/2-1
let v_pos = vec2<u32>(dims.x / 2u + chroma_x, dims.y + chroma_y);
// Write U and V values to their respective planes
textureStore(output_yuv, u_pos, vec4<f32>(u, 0.0, 0.0, 0.0));
textureStore(output_yuv, v_pos, vec4<f32>(v, 0.0, 0.0, 0.0));
}
}
"#;

View File

@ -0,0 +1,62 @@
# Plan for Async Rendering Helpers
I'm creating this temporary document to plan the async rendering changes.
## Current Architecture (Synchronous)
`render_frame_to_rgba_hdr()` in video_exporter.rs:
1. Render document to RGBA (lines 750-991)
2. GPU YUV conversion (lines 993-1005)
3. Copy YUV to staging buffer (lines 1007-1029)
4. Submit GPU commands (line 1031)
5. **BLOCKING** map_async + wait (lines 1033-1045)
6. Extract Y, U, V planes from mapped buffer (lines 1047-1087)
7. Unmap and return YUV planes (lines 1089-1092)
## New Architecture (Async Pipelined)
Split into two phases using ReadbackPipeline:
### Phase 1: Submit Frame (Non-blocking)
New function `submit_frame_to_readback_pipeline()`:
- Input: buffer from ReadbackPipeline.acquire()
- Steps 1-3: Render to RGBA, GPU YUV, copy to buffer's YUV texture
- Return encoder to ReadbackPipeline for submission
- **Does NOT wait for GPU**
### Phase 2: Extract YUV (After async mapping)
Helper function `extract_yuv_planes_from_buffer()`:
- Input: mapped buffer data from ReadbackPipeline
- Steps 6-7: Extract Y, U, V planes, return them
- Used after ReadbackPipeline.get_mapped_data()
## Modified render_next_video_frame()
New async pipeline loop:
```
while more_work_to_do:
// Poll for completed frames
for result in pipeline.poll_nonblocking():
data = pipeline.get_mapped_data(result.buffer_id)
(y, u, v) = extract_yuv_planes(data)
send_to_encoder_in_order(result.frame_num, y, u, v)
pipeline.release(result.buffer_id)
// Submit new frames (up to 3 in flight)
if current_frame < total_frames && frames_in_flight < 3:
if let Some(buffer) = pipeline.acquire(frame_num, timestamp):
encoder = submit_frame_to_pipeline(buffer)
pipeline.submit_and_readback(buffer.id, encoder)
frames_in_flight++
current_frame++
// Done when all frames submitted AND all completed
if current_frame >= total_frames && frames_in_flight == 0:
return Ok(false)
return Ok(true) // More work to do
```
This achieves triple buffering:
- Frame N: GPU rendering
- Frame N-1: GPU→CPU async transfer
- Frame N-2: CPU encoding
Expected speedup: 5x
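A rough model of the gain: the synchronous path pays `t_render + t_readback + t_convert + t_encode` per frame, while the saturated pipeline pays roughly the slowest single stage, `max(t_render, t_readback, t_convert, t_encode)`. The 5x figure assumes the stages are reasonably balanced; the real ratio should be read off the perf_metrics instrumentation.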

View File

@ -0,0 +1,143 @@
//! CPU-based RGBA→YUV420p color space converter using FFmpeg's swscale
//!
//! This module provides a wrapper around FFmpeg's highly-optimized swscale library
//! for converting RGBA data to YUV420p format. Uses SIMD instructions when available
//! for maximum performance.
use ffmpeg_next as ffmpeg;
/// CPU-based RGBA→YUV420p converter using FFmpeg's swscale
///
/// This converter uses FFmpeg's swscale library which is highly optimized with SIMD
/// instructions (SSE, AVX) for fast color space conversion on the CPU.
pub struct CpuYuvConverter {
width: u32,
height: u32,
}
impl CpuYuvConverter {
/// Create new converter for given dimensions
///
/// # Arguments
/// * `width` - Frame width in pixels
/// * `height` - Frame height in pixels
pub fn new(width: u32, height: u32) -> Result<Self, String> {
Ok(Self { width, height })
}
/// Convert RGBA data to YUV420p planes
///
/// Performs color space conversion from RGBA (8-bit per channel, packed format)
/// to YUV420p (8-bit per channel, planar format with subsampled chroma).
///
/// Intended to match the BT.709 color matrix (HD standard) used by the GPU
/// path; note that swscale defaults to BT.601 coefficients unless colorspace
/// details are configured explicitly.
///
/// # Arguments
/// * `rgba_data` - Packed RGBA data (width * height * 4 bytes)
///
/// # Returns
/// Tuple of (y_plane, u_plane, v_plane) as separate Vec<u8>
///
/// # Panics
/// Panics if rgba_data length doesn't match width * height * 4
pub fn convert(&self, rgba_data: &[u8]) -> Result<(Vec<u8>, Vec<u8>, Vec<u8>), String> {
let expected_size = (self.width * self.height * 4) as usize;
assert_eq!(
rgba_data.len(),
expected_size,
"RGBA data size mismatch: expected {} bytes, got {}",
expected_size,
rgba_data.len()
);
// Create source RGBA frame
let mut rgba_frame = ffmpeg::frame::Video::new(
ffmpeg::format::Pixel::RGBA,
self.width,
self.height,
);
// Copy RGBA data into source frame
// ffmpeg-next provides mutable access to the frame data.
// Note: this assumes the frame's linesize is exactly width * 4 (no row
// padding); copy_from_slice will panic if the allocated stride differs.
let frame_data = rgba_frame.data_mut(0);
frame_data.copy_from_slice(rgba_data);
// Create destination YUV420p frame
let mut yuv_frame = ffmpeg::frame::Video::new(
ffmpeg::format::Pixel::YUV420P,
self.width,
self.height,
);
// Create swscale context for RGBA→YUV420p conversion
// (swscale applies its default BT.601 matrix here; configure colorspace
// details explicitly if BT.709 output is required)
let mut scaler = ffmpeg::software::scaling::Context::get(
ffmpeg::format::Pixel::RGBA,
self.width,
self.height,
ffmpeg::format::Pixel::YUV420P,
self.width,
self.height,
ffmpeg::software::scaling::Flags::BILINEAR,
)
.map_err(|e| format!("Failed to create swscale context: {}", e))?;
// Perform the conversion (SIMD-optimized)
scaler
.run(&rgba_frame, &mut yuv_frame)
.map_err(|e| format!("swscale conversion failed: {}", e))?;
// Extract planar YUV data
// YUV420p has 3 planes:
// - Y: full resolution (width × height)
// - U: quarter resolution (width/2 × height/2)
// - V: quarter resolution (width/2 × height/2)
let y_plane = yuv_frame.data(0).to_vec();
let u_plane = yuv_frame.data(1).to_vec();
let v_plane = yuv_frame.data(2).to_vec();
Ok((y_plane, u_plane, v_plane))
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_converter_creation() {
let converter = CpuYuvConverter::new(1920, 1080);
assert!(converter.is_ok());
}
#[test]
fn test_conversion_output_sizes() {
let converter = CpuYuvConverter::new(1920, 1080).unwrap();
// Create dummy RGBA data (all black)
let rgba_data = vec![0u8; 1920 * 1080 * 4];
let result = converter.convert(&rgba_data);
assert!(result.is_ok());
let (y, u, v) = result.unwrap();
// Y plane should be full resolution
assert_eq!(y.len(), 1920 * 1080);
// U and V planes should be quarter resolution (subsampled 2x2)
assert_eq!(u.len(), (1920 / 2) * (1080 / 2));
assert_eq!(v.len(), (1920 / 2) * (1080 / 2));
}
#[test]
#[should_panic(expected = "RGBA data size mismatch")]
fn test_wrong_input_size_panics() {
let converter = CpuYuvConverter::new(1920, 1080).unwrap();
// Wrong size input
let rgba_data = vec![0u8; 1000];
let _ = converter.convert(&rgba_data);
}
}

View File

@ -6,6 +6,9 @@
pub mod audio_exporter;
pub mod dialog;
pub mod video_exporter;
pub mod readback_pipeline;
pub mod perf_metrics;
pub mod cpu_yuv_converter;
use lightningbeam_core::export::{AudioExportSettings, VideoExportSettings, ExportProgress};
use lightningbeam_core::document::Document;
@ -18,8 +21,14 @@ use std::sync::atomic::{AtomicBool, Ordering};
/// Message sent from main thread to video encoder thread
enum VideoFrameMessage {
/// RGBA frame data with frame number and timestamp
Frame { frame_num: usize, timestamp: f64, rgba_data: Vec<u8> },
/// YUV420p frame data with frame number and timestamp (converted upstream)
Frame {
frame_num: usize,
timestamp: f64,
y_plane: Vec<u8>,
u_plane: Vec<u8>,
v_plane: Vec<u8>,
},
/// Signal that all frames have been sent
Done,
}
@ -44,6 +53,16 @@ pub struct VideoExportState {
frame_tx: Option<Sender<VideoFrameMessage>>,
/// HDR GPU resources for compositing pipeline (effects, color conversion)
gpu_resources: Option<video_exporter::ExportGpuResources>,
/// Async triple-buffered readback pipeline for GPU RGBA frames
readback_pipeline: Option<readback_pipeline::ReadbackPipeline>,
/// CPU YUV converter for RGBA→YUV420p conversion
cpu_yuv_converter: Option<cpu_yuv_converter::CpuYuvConverter>,
/// Frames that have been submitted to GPU but not yet encoded
frames_in_flight: usize,
/// Next frame number to send to encoder (for ordering)
next_frame_to_encode: usize,
/// Performance metrics for instrumentation
perf_metrics: Option<perf_metrics::ExportMetrics>,
}
/// Export orchestrator that manages the export process
@ -168,13 +187,11 @@ impl ExportOrchestrator {
// Poll video progress
while let Ok(progress) = parallel.video_progress_rx.try_recv() {
println!("📨 [PARALLEL] Video progress: {:?}", std::mem::discriminant(&progress));
parallel.video_progress = Some(progress);
}
// Poll audio progress
while let Ok(progress) = parallel.audio_progress_rx.try_recv() {
println!("📨 [PARALLEL] Audio progress: {:?}", std::mem::discriminant(&progress));
parallel.audio_progress = Some(progress);
}
@ -621,7 +638,7 @@ impl ExportOrchestrator {
self.thread_handle = Some(handle);
// Initialize video export state
// GPU resources will be initialized lazily on first frame (needs device)
// GPU resources and readback pipeline will be initialized lazily on first frame (needs device)
self.video_state = Some(VideoExportState {
current_frame: 0,
total_frames,
@ -632,6 +649,11 @@ impl ExportOrchestrator {
height,
frame_tx: Some(frame_tx),
gpu_resources: None,
readback_pipeline: None,
cpu_yuv_converter: None,
frames_in_flight: 0,
next_frame_to_encode: 0,
perf_metrics: Some(perf_metrics::ExportMetrics::new()),
});
println!("🎬 [VIDEO EXPORT] Encoder thread spawned, ready for frames");
@ -745,7 +767,7 @@ impl ExportOrchestrator {
});
// Initialize video export state for incremental rendering
// GPU resources will be initialized lazily on first frame (needs device)
// GPU resources and readback pipeline will be initialized lazily on first frame (needs device)
self.video_state = Some(VideoExportState {
current_frame: 0,
total_frames,
@ -756,6 +778,11 @@ impl ExportOrchestrator {
height: video_height,
frame_tx: Some(frame_tx),
gpu_resources: None,
readback_pipeline: None,
cpu_yuv_converter: None,
frames_in_flight: 0,
next_frame_to_encode: 0,
perf_metrics: Some(perf_metrics::ExportMetrics::new()),
});
// Initialize parallel export state
@ -777,6 +804,7 @@ impl ExportOrchestrator {
/// Render and send the next video frame (call from main thread)
///
/// Uses async triple-buffered pipeline for maximum throughput.
/// Returns true if there are more frames to render, false if done.
///
/// # Arguments
@ -798,62 +826,143 @@ impl ExportOrchestrator {
image_cache: &mut ImageCache,
video_manager: &Arc<std::sync::Mutex<VideoManager>>,
) -> Result<bool, String> {
use std::time::Instant;
let state = self.video_state.as_mut()
.ok_or("No video export in progress")?;
if state.current_frame >= state.total_frames {
// All frames rendered, signal encoder thread
if let Some(tx) = state.frame_tx.take() {
tx.send(VideoFrameMessage::Done).ok();
}
// Clean up GPU resources
state.gpu_resources = None;
return Ok(false);
}
// Calculate timestamp for this frame
let timestamp = state.start_time + (state.current_frame as f64 / state.framerate);
// Get frame dimensions from export settings
let width = state.width;
let height = state.height;
// Initialize GPU resources on first frame (needs device)
// Initialize GPU resources and readback pipeline on first frame
if state.gpu_resources.is_none() {
println!("🎬 [VIDEO EXPORT] Initializing HDR GPU resources for {}x{}", width, height);
println!("🎬 [VIDEO EXPORT] Initializing HDR GPU + async pipeline {}x{}", width, height);
state.gpu_resources = Some(video_exporter::ExportGpuResources::new(device, width, height));
state.readback_pipeline = Some(readback_pipeline::ReadbackPipeline::new(device, queue, width, height));
state.cpu_yuv_converter = Some(cpu_yuv_converter::CpuYuvConverter::new(width, height)?);
println!("🚀 [ASYNC PIPELINE] Triple-buffered pipeline initialized");
println!("🚀 [CPU YUV] swscale converter initialized");
}
// Render frame to RGBA buffer using HDR pipeline (with effects)
let mut rgba_buffer = vec![0u8; (width * height * 4) as usize];
let pipeline = state.readback_pipeline.as_mut().unwrap();
let gpu_resources = state.gpu_resources.as_mut().unwrap();
video_exporter::render_frame_to_rgba_hdr(
document,
timestamp,
width,
height,
device,
queue,
renderer,
image_cache,
video_manager,
gpu_resources,
&mut rgba_buffer,
)?;
let cpu_converter = state.cpu_yuv_converter.as_mut().unwrap();
let mut metrics = state.perf_metrics.as_mut();
// Send frame to encoder thread
if let Some(tx) = &state.frame_tx {
tx.send(VideoFrameMessage::Frame {
frame_num: state.current_frame,
timestamp,
rgba_data: rgba_buffer,
}).map_err(|_| "Failed to send frame to encoder")?;
// Poll for completed async readbacks (non-blocking)
if let Some(m) = metrics.as_mut() {
m.poll_count += 1;
}
let completed_frames = pipeline.poll_nonblocking();
if let Some(m) = metrics.as_mut() {
m.completions_per_poll.push(completed_frames.len());
}
state.current_frame += 1;
// Process completed frames IN ORDER
// (map_async callbacks fire in submission order on a single queue, so
// results should already arrive ordered; anything out of order is
// dropped rather than re-queued)
for result in completed_frames {
if result.frame_num == state.next_frame_to_encode {
// Record readback completion time
if let Some(m) = metrics.as_mut() {
if let Some(frame_metrics) = m.frames.get_mut(result.frame_num) {
frame_metrics.readback_complete = Some(Instant::now());
}
}
// Return true if more frames remain
Ok(state.current_frame < state.total_frames)
// Extract RGBA data (timed)
let extraction_start = Instant::now();
let rgba_data = pipeline.extract_rgba_data(result.buffer_id);
let extraction_end = Instant::now();
// CPU YUV conversion (timed)
let conversion_start = Instant::now();
let (y, u, v) = cpu_converter.convert(&rgba_data)?;
let conversion_end = Instant::now();
if let Some(m) = metrics.as_mut() {
if let Some(frame_metrics) = m.frames.get_mut(result.frame_num) {
frame_metrics.extraction_start = Some(extraction_start);
frame_metrics.extraction_end = Some(extraction_end);
frame_metrics.conversion_start = Some(conversion_start);
frame_metrics.conversion_end = Some(conversion_end);
}
}
// Send to encoder
if let Some(tx) = &state.frame_tx {
tx.send(VideoFrameMessage::Frame {
frame_num: result.frame_num,
timestamp: result.timestamp,
y_plane: y,
u_plane: u,
v_plane: v,
}).map_err(|_| "Failed to send frame")?;
}
pipeline.release(result.buffer_id);
state.frames_in_flight -= 1;
state.next_frame_to_encode += 1;
}
}
// Submit new frames (up to 3 in flight)
while state.current_frame < state.total_frames && state.frames_in_flight < 3 {
let timestamp = state.start_time + (state.current_frame as f64 / state.framerate);
if let Some(acquired) = pipeline.acquire(state.current_frame, timestamp) {
// Create frame metrics entry
if let Some(m) = metrics.as_mut() {
m.frames.push(perf_metrics::FrameMetrics::new(state.current_frame));
}
// Render to GPU (timed)
let render_start = Instant::now();
let encoder = video_exporter::render_frame_to_gpu_rgba(
document, timestamp, width, height,
device, queue, renderer, image_cache, video_manager,
gpu_resources, &acquired.rgba_texture_view,
)?;
let render_end = Instant::now();
// Record render timing
if let Some(m) = metrics.as_mut() {
if let Some(frame_metrics) = m.frames.get_mut(state.current_frame) {
frame_metrics.render_end = Some(render_end);
frame_metrics.submit_time = Some(Instant::now());
}
}
// Submit for async readback
pipeline.submit_and_readback(acquired.id, encoder);
state.current_frame += 1;
state.frames_in_flight += 1;
} else {
break; // All buffers in use
}
}
// Done when all submitted AND all completed
if state.current_frame >= state.total_frames && state.frames_in_flight == 0 {
println!("🎬 [VIDEO EXPORT] Complete: {} frames", state.total_frames);
// Print performance summary
if let Some(m) = &state.perf_metrics {
m.print_summary();
m.print_per_frame_details(10);
}
if let Some(tx) = state.frame_tx.take() {
tx.send(VideoFrameMessage::Done).ok();
}
state.gpu_resources = None;
state.readback_pipeline = None;
state.cpu_yuv_converter = None;
state.perf_metrics = None;
return Ok(false);
}
Ok(true) // More work to do
}
/// Background thread that receives frames and encodes them
@ -925,9 +1034,9 @@ impl ExportOrchestrator {
// Wait for first frame to determine dimensions
let first_frame = match frame_rx.recv() {
Ok(VideoFrameMessage::Frame { frame_num, timestamp, rgba_data }) => {
println!("🧵 [ENCODER] Received first frame ({} bytes)", rgba_data.len());
Some((frame_num, timestamp, rgba_data))
Ok(VideoFrameMessage::Frame { frame_num, timestamp, y_plane, u_plane, v_plane }) => {
println!("🧵 [ENCODER] Received first YUV frame (Y: {} bytes)", y_plane.len());
Some((frame_num, timestamp, y_plane, u_plane, v_plane))
}
Ok(VideoFrameMessage::Done) => {
return Err("No frames to encode".to_string());
@ -938,9 +1047,9 @@ impl ExportOrchestrator {
};
// Determine dimensions from first frame
let (width, height) = if let Some((_, _, ref rgba_data)) = first_frame {
// Calculate dimensions from buffer size (RGBA = 4 bytes per pixel)
let pixel_count = rgba_data.len() / 4;
let (width, height) = if let Some((_, _, ref y_plane, _, _)) = first_frame {
// Calculate dimensions from Y plane size (full resolution, 1 byte per pixel)
let pixel_count = y_plane.len();
// Use settings dimensions if provided, otherwise infer from buffer
let w = settings.width.unwrap_or(1920); // Default to 1920 if not specified
let h = settings.height.unwrap_or(1080); // Default to 1080 if not specified
@ -979,11 +1088,13 @@ impl ExportOrchestrator {
println!("🧵 [ENCODER] Encoder initialized, ready to encode frames");
// Process first frame
if let Some((frame_num, timestamp, rgba_data)) = first_frame {
if let Some((frame_num, timestamp, y_plane, u_plane, v_plane)) = first_frame {
Self::encode_frame(
&mut encoder,
&mut output,
&rgba_data,
&y_plane,
&u_plane,
&v_plane,
width,
height,
timestamp,
@ -994,8 +1105,6 @@ impl ExportOrchestrator {
frame: 1,
total: total_frames,
}).ok();
println!("🧵 [ENCODER] Encoded frame {}", frame_num);
}
// Process remaining frames
@ -1006,11 +1115,13 @@ impl ExportOrchestrator {
}
match frame_rx.recv() {
Ok(VideoFrameMessage::Frame { frame_num, timestamp, rgba_data }) => {
Ok(VideoFrameMessage::Frame { frame_num, timestamp, y_plane, u_plane, v_plane }) => {
Self::encode_frame(
&mut encoder,
&mut output,
&rgba_data,
&y_plane,
&u_plane,
&v_plane,
width,
height,
timestamp,
@ -1023,10 +1134,6 @@ impl ExportOrchestrator {
frame: frames_encoded,
total: total_frames,
}).ok();
if frames_encoded % 30 == 0 || frames_encoded == frame_num + 1 {
println!("🧵 [ENCODER] Encoded frame {}/{}", frames_encoded, total_frames);
}
}
Ok(VideoFrameMessage::Done) => {
println!("🧵 [ENCODER] All frames received, flushing encoder");
@ -1052,17 +1159,18 @@ impl ExportOrchestrator {
Ok(())
}
/// Encode a single RGBA frame
/// Encode a single YUV420p frame (planes already converted upstream)
fn encode_frame(
encoder: &mut ffmpeg_next::encoder::Video,
output: &mut ffmpeg_next::format::context::Output,
rgba_data: &[u8],
y_plane: &[u8],
u_plane: &[u8],
v_plane: &[u8],
width: u32,
height: u32,
timestamp: f64,
) -> Result<(), String> {
// Convert RGBA to YUV420p
let (y_plane, u_plane, v_plane) = video_exporter::rgba_to_yuv420p(rgba_data, width, height);
// YUV planes arrive pre-converted (GPU or swscale path); no conversion here
// Create FFmpeg video frame
let mut video_frame = ffmpeg_next::frame::Video::new(
@ -1087,8 +1195,6 @@ impl ExportOrchestrator {
// Encoder time base is 1/(framerate * 1000), so PTS = timestamp * (framerate * 1000)
let encoder_tb = encoder.time_base();
let pts = (timestamp * encoder_tb.1 as f64) as i64;
println!("🎬 [ENCODE] Frame timestamp={:.3}s, encoder_tb={}/{}, calculated PTS={}",
timestamp, encoder_tb.0, encoder_tb.1, pts);
video_frame.set_pts(Some(pts));
// Send frame to encoder

View File

@ -0,0 +1,191 @@
//! Performance instrumentation for video export pipeline
//!
//! Tracks timing for each stage of the export process:
//! - GPU rendering (render_frame_to_gpu_yuv)
//! - Async readback (map_async completion)
//! - YUV plane extraction
//! - FFmpeg encoding
//! - Polling frequency and efficiency
use std::time::{Duration, Instant};
/// Performance metrics for a single frame
#[derive(Debug)]
pub struct FrameMetrics {
pub frame_num: usize,
pub render_start: Instant,
pub render_end: Option<Instant>,
pub submit_time: Option<Instant>,
pub readback_complete: Option<Instant>,
pub extraction_start: Option<Instant>,
pub extraction_end: Option<Instant>,
pub conversion_start: Option<Instant>,
pub conversion_end: Option<Instant>,
pub encode_start: Option<Instant>,
pub encode_end: Option<Instant>,
}
impl FrameMetrics {
pub fn new(frame_num: usize) -> Self {
Self {
frame_num,
render_start: Instant::now(),
render_end: None,
submit_time: None,
readback_complete: None,
extraction_start: None,
extraction_end: None,
conversion_start: None,
conversion_end: None,
encode_start: None,
encode_end: None,
}
}
pub fn render_duration(&self) -> Option<Duration> {
self.render_end.map(|end| end.duration_since(self.render_start))
}
pub fn readback_duration(&self) -> Option<Duration> {
self.submit_time.and_then(|submit|
self.readback_complete.map(|complete|
complete.duration_since(submit)
)
)
}
pub fn extraction_duration(&self) -> Option<Duration> {
self.extraction_start.and_then(|start|
self.extraction_end.map(|end|
end.duration_since(start)
)
)
}
pub fn conversion_duration(&self) -> Option<Duration> {
self.conversion_start.and_then(|start|
self.conversion_end.map(|end|
end.duration_since(start)
)
)
}
pub fn encode_duration(&self) -> Option<Duration> {
self.encode_start.and_then(|start|
self.encode_end.map(|end|
end.duration_since(start)
)
)
}
pub fn total_duration(&self) -> Option<Duration> {
self.encode_end.map(|end| end.duration_since(self.render_start))
}
}
/// Aggregate performance metrics for entire export
pub struct ExportMetrics {
pub frames: Vec<FrameMetrics>,
export_start: Instant,
pub poll_count: usize,
pub completions_per_poll: Vec<usize>,
}
impl ExportMetrics {
pub fn new() -> Self {
Self {
frames: Vec::new(),
export_start: Instant::now(),
poll_count: 0,
completions_per_poll: Vec::new(),
}
}
/// Print comprehensive performance summary
pub fn print_summary(&self) {
println!("\n📊 [PERF] Export Performance Summary");
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
// Calculate averages for each stage
let mut render_times = Vec::new();
let mut readback_times = Vec::new();
let mut extraction_times = Vec::new();
let mut conversion_times = Vec::new();
let mut encode_times = Vec::new();
let mut total_times = Vec::new();
for metrics in &self.frames {
if let Some(d) = metrics.render_duration() {
render_times.push(d);
}
if let Some(d) = metrics.readback_duration() {
readback_times.push(d);
}
if let Some(d) = metrics.extraction_duration() {
extraction_times.push(d);
}
if let Some(d) = metrics.conversion_duration() {
conversion_times.push(d);
}
if let Some(d) = metrics.encode_duration() {
encode_times.push(d);
}
if let Some(d) = metrics.total_duration() {
total_times.push(d);
}
}
let avg = |times: &[Duration]| -> f64 {
if times.is_empty() { return 0.0; }
times.iter().sum::<Duration>().as_secs_f64() / times.len() as f64 * 1000.0
};
println!("Render: {:.2}ms avg", avg(&render_times));
println!("Readback: {:.2}ms avg", avg(&readback_times));
println!("Extraction: {:.2}ms avg", avg(&extraction_times));
println!("Conversion: {:.2}ms avg", avg(&conversion_times));
println!("Encode: {:.2}ms avg", avg(&encode_times));
println!("Total: {:.2}ms avg", avg(&total_times));
let total_export_time = Instant::now().duration_since(self.export_start).as_secs_f64();
let fps = self.frames.len() as f64 / total_export_time;
println!("\nOverall: {:.2} fps ({:.1}s for {} frames)",
fps, total_export_time, self.frames.len());
if self.poll_count > 0 {
let avg_completions = self.completions_per_poll.iter().sum::<usize>() as f64 / self.poll_count as f64;
println!("Polls: {} ({:.2} completions/poll avg)",
self.poll_count, avg_completions);
}
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n");
}
/// Print detailed per-frame breakdown for last N frames
pub fn print_per_frame_details(&self, last_n: usize) {
println!("\n📋 [PERF] Per-Frame Breakdown (last {} frames)", last_n);
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
println!("{:>5} | {:>8} | {:>8} | {:>8} | {:>8} | {:>8} | {:>8}",
"Frame", "Render", "Readback", "Extract", "Convert", "Encode", "Total");
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
let start = if self.frames.len() > last_n {
self.frames.len() - last_n
} else {
0
};
for metrics in &self.frames[start..] {
println!("{:5} | {:>7.2}ms | {:>7.2}ms | {:>7.2}ms | {:>7.2}ms | {:>7.2}ms | {:>7.2}ms",
metrics.frame_num,
metrics.render_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
metrics.readback_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
metrics.extraction_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
metrics.conversion_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
metrics.encode_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
metrics.total_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
);
}
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n");
}
}

View File

@ -0,0 +1,317 @@
//! Async triple-buffered GPU readback pipeline for video export
//!
//! This module implements a pipelined export system that overlaps GPU rendering
//! with CPU encoding to maximize throughput. It uses triple buffering to keep
//! both GPU and CPU busy simultaneously:
//!
//! - Frame N: GPU rendering/conversion
//! - Frame N-1: GPU→CPU async transfer
//! - Frame N-2: CPU encoding
//!
//! Expected speedup: 5x over synchronous blocking approach
use std::sync::mpsc::{channel, Receiver, Sender};
/// Result from a completed async buffer mapping
#[derive(Debug)]
pub struct ReadbackResult {
pub buffer_id: usize,
pub frame_num: usize,
pub timestamp: f64,
}
/// State of a pipeline buffer in the triple-buffering state machine
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum BufferState {
/// Buffer is available for new frame rendering
Free,
/// GPU is currently rendering/converting to this buffer
Rendering,
/// Buffer readback submitted, waiting for GPU→CPU transfer
ReadbackPending,
/// Buffer mapped and ready for CPU to read
Mapped,
/// CPU is encoding this buffer's data
Encoding,
}
/// A single buffer in the triple-buffering pipeline
struct PipelineBuffer {
id: usize,
/// RGBA texture for GPU rendering output (Rgba8Unorm)
rgba_texture: wgpu::Texture,
rgba_texture_view: wgpu::TextureView,
/// Staging buffer for GPU→CPU transfer (MAP_READ)
staging_buffer: wgpu::Buffer,
/// Current state in the pipeline
state: BufferState,
/// Frame metadata (set when rendering starts)
frame_num: Option<usize>,
timestamp: Option<f64>,
}
/// Handle to an acquired buffer for rendering
pub struct AcquiredBuffer {
pub id: usize,
pub rgba_texture_view: wgpu::TextureView,
}
/// Triple-buffered async readback pipeline
///
/// Manages 3 buffers cycling through the pipeline:
/// Free → Rendering → ReadbackPending → Mapped → Encoding → Free
pub struct ReadbackPipeline {
buffers: Vec<PipelineBuffer>,
/// Channel for async map_async callbacks
readback_rx: Receiver<ReadbackResult>,
readback_tx: Sender<ReadbackResult>,
/// wgpu device and queue references (needed for polling and buffer operations)
device: wgpu::Device,
queue: wgpu::Queue,
/// Buffer dimensions
width: u32,
height: u32,
}
impl ReadbackPipeline {
/// Create a new triple-buffered readback pipeline
///
/// # Arguments
/// * `device` - GPU device (will be cloned for async operations)
/// * `queue` - GPU queue (will be cloned for async operations)
/// * `width` - Frame width in pixels
/// * `height` - Frame height in pixels
pub fn new(device: &wgpu::Device, queue: &wgpu::Queue, width: u32, height: u32) -> Self {
let (readback_tx, readback_rx) = channel();
// Create 3 buffers for triple buffering
let mut buffers = Vec::new();
for id in 0..3 {
// RGBA texture (Rgba8Unorm)
let rgba_texture = device.create_texture(&wgpu::TextureDescriptor {
label: Some(&format!("readback_rgba_texture_{}", id)),
size: wgpu::Extent3d {
width,
height,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8Unorm,
usage: wgpu::TextureUsages::RENDER_ATTACHMENT | wgpu::TextureUsages::COPY_SRC,
view_formats: &[],
});
let rgba_texture_view = rgba_texture.create_view(&wgpu::TextureViewDescriptor::default());
// Staging buffer for GPU→CPU readback
let rgba_buffer_size = (width * height * 4) as u64; // Rgba8Unorm = 4 bytes/pixel
let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
label: Some(&format!("readback_staging_buffer_{}", id)),
size: rgba_buffer_size,
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
mapped_at_creation: false,
});
buffers.push(PipelineBuffer {
id,
rgba_texture,
rgba_texture_view,
staging_buffer,
state: BufferState::Free,
frame_num: None,
timestamp: None,
});
}
Self {
buffers,
readback_rx,
readback_tx,
device: device.clone(),
queue: queue.clone(),
width,
height,
}
}
/// Acquire a free buffer for rendering (non-blocking)
///
/// Returns None if all buffers are in use (caller should poll and retry)
pub fn acquire(&mut self, frame_num: usize, timestamp: f64) -> Option<AcquiredBuffer> {
// Find first Free buffer
for buffer in &mut self.buffers {
if buffer.state == BufferState::Free {
buffer.state = BufferState::Rendering;
buffer.frame_num = Some(frame_num);
buffer.timestamp = Some(timestamp);
return Some(AcquiredBuffer {
id: buffer.id,
rgba_texture_view: buffer.rgba_texture_view.clone(),
});
}
}
None // All buffers busy
}
/// Submit GPU commands and initiate async readback
///
/// # Arguments
/// * `buffer_id` - ID of the buffer to submit (from AcquiredBuffer)
/// * `encoder` - Command encoder with rendering commands
pub fn submit_and_readback(&mut self, buffer_id: usize, mut encoder: wgpu::CommandEncoder) {
let buffer = &mut self.buffers[buffer_id];
assert_eq!(buffer.state, BufferState::Rendering, "Buffer not in Rendering state");
// Copy RGBA texture to staging buffer
encoder.copy_texture_to_buffer(
wgpu::TexelCopyTextureInfo {
texture: &buffer.rgba_texture,
mip_level: 0,
origin: wgpu::Origin3d::ZERO,
aspect: wgpu::TextureAspect::All,
},
wgpu::TexelCopyBufferInfo {
buffer: &buffer.staging_buffer,
layout: wgpu::TexelCopyBufferLayout {
offset: 0,
// NOTE: assumes width * 4 is a multiple of wgpu's 256-byte
// COPY_BYTES_PER_ROW_ALIGNMENT (holds for common widths like 1920)
bytes_per_row: Some(self.width * 4), // Rgba8Unorm = 4 bytes/pixel
rows_per_image: Some(self.height),
},
},
wgpu::Extent3d {
width: self.width,
height: self.height,
depth_or_array_layers: 1,
},
);
// Submit GPU commands (non-blocking)
self.queue.submit(Some(encoder.finish()));
// Initiate async buffer mapping
let frame_num = buffer.frame_num.unwrap();
let timestamp = buffer.timestamp.unwrap();
let tx = self.readback_tx.clone();
buffer.staging_buffer.slice(..).map_async(wgpu::MapMode::Read, move |result| {
match result {
Ok(()) => {
let _ = tx.send(ReadbackResult {
buffer_id,
frame_num,
timestamp,
});
}
// A mapping failure would otherwise drop the frame silently and
// stall the pipeline waiting for it; log it so the hang is diagnosable
Err(e) => eprintln!("readback map_async failed for frame {}: {:?}", frame_num, e),
}
});
buffer.state = BufferState::ReadbackPending;
}
/// Poll for completed readbacks (non-blocking)
///
/// Returns list of buffers that are now ready for CPU encoding.
/// Call this frequently to process completed transfers.
pub fn poll_nonblocking(&mut self) -> Vec<ReadbackResult> {
// Poll GPU without blocking
self.device.poll(wgpu::PollType::Poll);
// Collect all completed readbacks
let mut results = Vec::new();
while let Ok(result) = self.readback_rx.try_recv() {
// Update buffer state to Mapped
self.buffers[result.buffer_id].state = BufferState::Mapped;
results.push(result);
}
results
}
/// Extract RGBA data from mapped buffer (for CPU YUV conversion)
///
/// Buffer must be in Mapped state (after poll_nonblocking returned it).
/// This immediately copies the RGBA data, allowing the buffer to be released.
pub fn extract_rgba_data(&mut self, buffer_id: usize) -> Vec<u8> {
let buffer = &mut self.buffers[buffer_id];
assert_eq!(buffer.state, BufferState::Mapped, "Buffer not in Mapped state");
buffer.state = BufferState::Encoding;
// The buffer was already mapped by the async callback; read the mapped
// range and copy the RGBA bytes out for CPU-side conversion
let slice = buffer.staging_buffer.slice(..);
let data = slice.get_mapped_range();
data.to_vec()
}
/// Release buffer after encoding completes, returning it to the free pool
///
/// # Arguments
/// * `buffer_id` - ID of buffer to release
pub fn release(&mut self, buffer_id: usize) {
let buffer = &mut self.buffers[buffer_id];
assert_eq!(buffer.state, BufferState::Encoding, "Buffer not in Encoding state");
// Unmap buffer
buffer.staging_buffer.unmap();
// Clear metadata
buffer.frame_num = None;
buffer.timestamp = None;
// Return to free pool
buffer.state = BufferState::Free;
}
/// Flush pipeline and wait for all pending operations
///
/// Call this at the end of export to ensure all frames are processed
pub fn flush(&mut self) -> Vec<ReadbackResult> {
let mut all_results = Vec::new();
// Keep polling until all buffers are Free
loop {
// Poll for new completions
self.device.poll(wgpu::PollType::Poll);
while let Ok(result) = self.readback_rx.try_recv() {
self.buffers[result.buffer_id].state = BufferState::Mapped;
all_results.push(result);
}
// Check if all buffers are Free (or can be made Free)
let mut all_free = true;
for buffer in &self.buffers {
match buffer.state {
BufferState::Free => {},
BufferState::Rendering | BufferState::ReadbackPending => {
all_free = false;
break;
},
BufferState::Mapped | BufferState::Encoding => {
// These should be handled by the caller, shouldn't happen during flush
panic!("Buffer in {} state during flush - caller should encode and release",
if buffer.state == BufferState::Mapped { "Mapped" } else { "Encoding" });
}
}
}
if all_free {
break;
}
// Small sleep to avoid busy-waiting
std::thread::sleep(std::time::Duration::from_millis(1));
}
all_results
}
/// Get buffer count currently in flight (for monitoring)
pub fn buffers_in_flight(&self) -> usize {
self.buffers.iter().filter(|b| b.state != BufferState::Free).count()
}
}
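A condensed usage sketch of the intended driver loop (the production loop lives in `render_next_video_frame`; here `render_into` and `encode` are hypothetical stand-ins for frame rendering and encoder hand-off):

```
// Sketch: saturating the triple-buffered pipeline (stand-in closures).
fn pump(
    device: &wgpu::Device,
    pipeline: &mut ReadbackPipeline,
    total_frames: usize,
    framerate: f64,
    mut render_into: impl FnMut(&wgpu::TextureView, &mut wgpu::CommandEncoder),
    mut encode: impl FnMut(usize, f64, Vec<u8>),
) {
    let (mut submitted, mut in_flight) = (0usize, 0usize);
    while submitted < total_frames || in_flight > 0 {
        // Drain completed GPU→CPU transfers first.
        for done in pipeline.poll_nonblocking() {
            let rgba = pipeline.extract_rgba_data(done.buffer_id); // Mapped → Encoding
            encode(done.frame_num, done.timestamp, rgba);
            pipeline.release(done.buffer_id); // Encoding → Free (unmaps)
            in_flight -= 1;
        }
        // Keep up to three frames in flight (one per buffer).
        while submitted < total_frames {
            let timestamp = submitted as f64 / framerate;
            let Some(buf) = pipeline.acquire(submitted, timestamp) else { break };
            let mut enc = device.create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
            render_into(&buf.rgba_texture_view, &mut enc);
            pipeline.submit_and_readback(buf.id, enc); // copies to staging + map_async
            submitted += 1;
            in_flight += 1;
        }
    }
}
```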

View File

@ -12,7 +12,7 @@ use lightningbeam_core::renderer::{ImageCache, render_document_for_compositing,
use lightningbeam_core::video::VideoManager;
use lightningbeam_core::gpu::{
BufferPool, BufferSpec, BufferFormat, Compositor, CompositorLayer,
SrgbToLinearConverter, EffectProcessor, HDR_FORMAT,
SrgbToLinearConverter, EffectProcessor, YuvConverter, HDR_FORMAT,
};
/// Reusable frame buffers to avoid allocations
@ -56,10 +56,22 @@ pub struct ExportGpuResources {
pub srgb_to_linear: SrgbToLinearConverter,
/// Effect processor for shader effects
pub effect_processor: EffectProcessor,
/// GPU-accelerated RGBA to YUV420p converter
pub yuv_converter: YuvConverter,
/// HDR accumulator texture for compositing
pub hdr_texture: wgpu::Texture,
/// View for HDR texture
pub hdr_texture_view: wgpu::TextureView,
/// Persistent RGBA output texture (sRGB, reused for all frames)
pub output_texture: wgpu::Texture,
/// View for persistent output texture
pub output_texture_view: wgpu::TextureView,
/// Persistent YUV texture for GPU conversion (R8Unorm, height*1.5, reused for all frames)
pub yuv_texture: wgpu::Texture,
/// View for persistent YUV texture
pub yuv_texture_view: wgpu::TextureView,
/// Persistent staging buffer for GPU→CPU readback (reused for all frames)
pub staging_buffer: wgpu::Buffer,
/// Linear to sRGB blit pipeline for final output
pub linear_to_srgb_pipeline: wgpu::RenderPipeline,
/// Bind group layout for linear to sRGB blit
@ -75,6 +87,7 @@ impl ExportGpuResources {
let compositor = Compositor::new(device, HDR_FORMAT);
let srgb_to_linear = SrgbToLinearConverter::new(device);
let effect_processor = EffectProcessor::new(device, HDR_FORMAT);
let yuv_converter = YuvConverter::new(device);
// Create HDR accumulator texture
let hdr_texture = device.create_texture(&wgpu::TextureDescriptor {
@ -95,6 +108,53 @@ impl ExportGpuResources {
});
let hdr_texture_view = hdr_texture.create_view(&wgpu::TextureViewDescriptor::default());
// Create persistent RGBA output texture (sRGB, reused for all frames)
let output_texture = device.create_texture(&wgpu::TextureDescriptor {
label: Some("export_output_texture"),
size: wgpu::Extent3d {
width,
height,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8Unorm,
usage: wgpu::TextureUsages::RENDER_ATTACHMENT
| wgpu::TextureUsages::TEXTURE_BINDING
| wgpu::TextureUsages::COPY_SRC,
view_formats: &[],
});
let output_texture_view = output_texture.create_view(&wgpu::TextureViewDescriptor::default());
// Create persistent YUV texture (Rgba8Unorm, height*1.5 for packed Y+U+V planes)
// Note: Using Rgba8Unorm instead of R8Unorm because R8Unorm doesn't support STORAGE_BINDING
let yuv_height = height + height / 2; // Y plane + U plane + V plane
let yuv_texture = device.create_texture(&wgpu::TextureDescriptor {
label: Some("export_yuv_texture"),
size: wgpu::Extent3d {
width,
height: yuv_height,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8Unorm,
usage: wgpu::TextureUsages::STORAGE_BINDING | wgpu::TextureUsages::COPY_SRC,
view_formats: &[],
});
let yuv_texture_view = yuv_texture.create_view(&wgpu::TextureViewDescriptor::default());
// Create persistent staging buffer for GPU→CPU readback
let yuv_buffer_size = (width * yuv_height * 4) as u64; // Rgba8Unorm = 4 bytes per pixel
let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
label: Some("export_staging_buffer"),
size: yuv_buffer_size,
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
mapped_at_creation: false,
});
// Create linear to sRGB blit pipeline
let linear_to_srgb_bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
label: Some("linear_to_srgb_bind_group_layout"),
@ -179,8 +239,14 @@ impl ExportGpuResources {
compositor,
srgb_to_linear,
effect_processor,
yuv_converter,
hdr_texture,
hdr_texture_view,
output_texture,
output_texture_view,
yuv_texture,
yuv_texture_view,
staging_buffer,
linear_to_srgb_pipeline,
linear_to_srgb_bind_group_layout,
linear_to_srgb_sampler,
@ -476,20 +542,11 @@ pub fn receive_and_write_packets(
let encoder_tb = encoder.time_base();
let stream_tb = output.stream(0).ok_or("No output stream found")?.time_base();
println!("🎬 [PACKET] Encoder TB: {}/{}, Stream TB: {}/{}",
encoder_tb.0, encoder_tb.1, stream_tb.0, stream_tb.1);
while encoder.receive_packet(&mut encoded).is_ok() {
println!("🎬 [PACKET] Before rescale - PTS: {:?}, DTS: {:?}, Duration: {:?}",
encoded.pts(), encoded.dts(), encoded.duration());
encoded.set_stream(0);
// Rescale timestamps from encoder time base to stream time base
encoded.rescale_ts(encoder_tb, stream_tb);
println!("🎬 [PACKET] After rescale - PTS: {:?}, DTS: {:?}, Duration: {:?}",
encoded.pts(), encoded.dts(), encoded.duration());
encoded
.write_interleaved(output)
.map_err(|e| format!("Failed to write packet: {}", e))?;
@ -660,10 +717,9 @@ pub fn render_frame_to_rgba(
/// * `image_cache` - Image cache for rendering
/// * `video_manager` - Video manager for video clips
/// * `gpu_resources` - HDR GPU resources for compositing
/// * `rgba_buffer` - Output buffer for RGBA pixels (must be width * height * 4 bytes)
///
/// # Returns
/// Ok(()) on success, Err with message on failure
/// Ok((y_plane, u_plane, v_plane)) with YUV420p planes on success, Err with message on failure
pub fn render_frame_to_rgba_hdr(
document: &mut Document,
timestamp: f64,
@ -675,8 +731,7 @@ pub fn render_frame_to_rgba_hdr(
image_cache: &mut ImageCache,
video_manager: &Arc<std::sync::Mutex<VideoManager>>,
gpu_resources: &mut ExportGpuResources,
rgba_buffer: &mut [u8],
) -> Result<(), String> {
) -> Result<(Vec<u8>, Vec<u8>, Vec<u8>), String> {
use vello::kurbo::Affine;
// Set document time to the frame timestamp
@ -879,22 +934,8 @@ pub fn render_frame_to_rgba_hdr(
// Advance frame counter for buffer cleanup
gpu_resources.buffer_pool.next_frame();
// Create output texture for final sRGB output
let output_texture = device.create_texture(&wgpu::TextureDescriptor {
label: Some("export_output_texture"),
size: wgpu::Extent3d {
width,
height,
depth_or_array_layers: 1,
},
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D2,
format: wgpu::TextureFormat::Rgba8Unorm,
usage: wgpu::TextureUsages::RENDER_ATTACHMENT | wgpu::TextureUsages::COPY_SRC,
view_formats: &[],
});
let output_view = output_texture.create_view(&wgpu::TextureViewDescriptor::default());
// Use persistent output texture (already created in ExportGpuResources)
let output_view = &gpu_resources.output_texture_view;
// Convert HDR to sRGB for output
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
@ -940,52 +981,48 @@ pub fn render_frame_to_rgba_hdr(
queue.submit(Some(encoder.finish()));
// GPU readback: Create staging buffer with proper alignment
let bytes_per_pixel = 4u32; // RGBA8
let bytes_per_row_alignment = 256u32;
let unpadded_bytes_per_row = width * bytes_per_pixel;
let bytes_per_row = ((unpadded_bytes_per_row + bytes_per_row_alignment - 1)
/ bytes_per_row_alignment) * bytes_per_row_alignment;
let buffer_size = (bytes_per_row * height) as u64;
let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
label: Some("export_staging_buffer"),
size: buffer_size,
usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
mapped_at_creation: false,
// GPU YUV conversion: Convert RGBA output to YUV420p
let mut yuv_encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("export_yuv_conversion_encoder"),
});
// Copy texture to staging buffer
let mut copy_encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("export_copy_encoder"),
});
gpu_resources.yuv_converter.convert(
device,
&mut yuv_encoder,
output_view,
&gpu_resources.yuv_texture_view,
width,
height,
);
copy_encoder.copy_texture_to_buffer(
// Copy YUV texture to persistent staging buffer
let yuv_height = height + height / 2; // Y plane + U plane + V plane
yuv_encoder.copy_texture_to_buffer(
wgpu::TexelCopyTextureInfo {
texture: &output_texture,
texture: &gpu_resources.yuv_texture,
mip_level: 0,
origin: wgpu::Origin3d::ZERO,
aspect: wgpu::TextureAspect::All,
},
wgpu::TexelCopyBufferInfo {
buffer: &staging_buffer,
buffer: &gpu_resources.staging_buffer,
layout: wgpu::TexelCopyBufferLayout {
offset: 0,
bytes_per_row: Some(bytes_per_row),
rows_per_image: Some(height),
// NOTE: assumes width * 4 meets wgpu's 256-byte COPY_BYTES_PER_ROW_ALIGNMENT
bytes_per_row: Some(width * 4), // Rgba8Unorm = 4 bytes per pixel
rows_per_image: Some(yuv_height),
},
},
wgpu::Extent3d {
width,
height,
height: yuv_height,
depth_or_array_layers: 1,
},
);
queue.submit(Some(copy_encoder.finish()));
queue.submit(Some(yuv_encoder.finish()));
// Map buffer and read pixels (synchronous)
let buffer_slice = staging_buffer.slice(..);
// Map buffer and read YUV pixels (synchronous)
let buffer_slice = gpu_resources.staging_buffer.slice(..);
let (sender, receiver) = std::sync::mpsc::channel();
buffer_slice.map_async(wgpu::MapMode::Read, move |result| {
sender.send(result).ok();
@ -998,20 +1035,319 @@ pub fn render_frame_to_rgba_hdr(
.map_err(|_| "Failed to receive buffer mapping result")?
.map_err(|e| format!("Failed to map buffer: {:?}", e))?;
// Copy data from mapped buffer to output, removing padding
// Extract Y, U, V planes from packed YUV buffer
let data = buffer_slice.get_mapped_range();
for y in 0..height as usize {
let src_offset = y * bytes_per_row as usize;
let dst_offset = y * unpadded_bytes_per_row as usize;
let row_bytes = unpadded_bytes_per_row as usize;
rgba_buffer[dst_offset..dst_offset + row_bytes]
.copy_from_slice(&data[src_offset..src_offset + row_bytes]);
let width_usize = width as usize;
let height_usize = height as usize;
// Y plane: rows 0 to height-1 (extract R channel from Rgba8Unorm)
let y_plane_size = width_usize * height_usize;
let mut y_plane = vec![0u8; y_plane_size];
for y in 0..height_usize {
let src_row_offset = y * width_usize * 4; // 4 bytes per pixel (Rgba8Unorm)
let dst_row_offset = y * width_usize;
for x in 0..width_usize {
y_plane[dst_row_offset + x] = data[src_row_offset + x * 4]; // Extract R channel
}
}
// U and V planes: rows height to height + height/2 - 1 (half resolution, side-by-side layout)
// U plane is in left half (columns 0 to width/2-1), V plane is in right half (columns width/2 to width-1)
let chroma_width = width_usize / 2;
let chroma_height = height_usize / 2;
let chroma_row_start = height_usize * width_usize * 4; // Start of chroma rows in bytes
let mut u_plane = vec![0u8; chroma_width * chroma_height];
let mut v_plane = vec![0u8; chroma_width * chroma_height];
for y in 0..chroma_height {
let row_offset = chroma_row_start + y * width_usize * 4; // Full width rows in chroma region
// Extract U plane (left half: columns 0 to chroma_width-1)
let u_start = row_offset;
let dst_offset = y * chroma_width;
for x in 0..chroma_width {
u_plane[dst_offset + x] = data[u_start + x * 4]; // Extract R channel
}
// Extract V plane (right half: columns width/2 to width/2+chroma_width-1)
let v_start = row_offset + chroma_width * 4;
for x in 0..chroma_width {
v_plane[dst_offset + x] = data[v_start + x * 4]; // Extract R channel
}
}
drop(data);
staging_buffer.unmap();
gpu_resources.staging_buffer.unmap();
Ok(())
Ok((y_plane, u_plane, v_plane))
}
/// Render frame to GPU RGBA texture (non-blocking, for async pipeline)
///
/// Similar to render_frame_to_rgba_hdr but renders to an external RGBA texture view
/// (provided by ReadbackPipeline) and returns the command encoder WITHOUT blocking on readback.
/// The caller (ReadbackPipeline) will submit the encoder and handle async readback.
///
/// # Arguments
/// * `document` - Document to render
/// * `timestamp` - Time in seconds to render at
/// * `width` - Frame width in pixels
/// * `height` - Frame height in pixels
/// * `device` - wgpu device
/// * `queue` - wgpu queue
/// * `renderer` - Vello renderer
/// * `image_cache` - Image cache for rendering
/// * `video_manager` - Video manager for video clips
/// * `gpu_resources` - HDR GPU resources for compositing
/// * `rgba_texture_view` - External RGBA texture view (from ReadbackPipeline)
///
/// # Returns
/// Command encoder ready for submission (caller submits via ReadbackPipeline)
pub fn render_frame_to_gpu_rgba(
document: &mut Document,
timestamp: f64,
width: u32,
height: u32,
device: &wgpu::Device,
queue: &wgpu::Queue,
renderer: &mut vello::Renderer,
image_cache: &mut ImageCache,
video_manager: &Arc<std::sync::Mutex<VideoManager>>,
gpu_resources: &mut ExportGpuResources,
rgba_texture_view: &wgpu::TextureView,
) -> Result<wgpu::CommandEncoder, String> {
use vello::kurbo::Affine;
// Set document time to the frame timestamp
document.current_time = timestamp;
// Use identity transform for export (document coordinates = pixel coordinates)
let base_transform = Affine::IDENTITY;
// Render document for compositing (returns per-layer scenes)
let composite_result = render_document_for_compositing(
document,
base_transform,
image_cache,
video_manager,
);
// Buffer specs for layer rendering
let layer_spec = BufferSpec::new(width, height, BufferFormat::Rgba8Srgb);
let hdr_spec = BufferSpec::new(width, height, BufferFormat::Rgba16Float);
// Render parameters for Vello (transparent background for layers)
let layer_render_params = vello::RenderParams {
base_color: vello::peniko::Color::TRANSPARENT,
width,
height,
antialiasing_method: vello::AaConfig::Area,
};
// Render background and composite it
let bg_srgb_handle = gpu_resources.buffer_pool.acquire(device, layer_spec);
let bg_hdr_handle = gpu_resources.buffer_pool.acquire(device, hdr_spec);
if let (Some(bg_srgb_view), Some(bg_hdr_view)) = (
gpu_resources.buffer_pool.get_view(bg_srgb_handle),
gpu_resources.buffer_pool.get_view(bg_hdr_handle),
) {
renderer.render_to_texture(device, queue, &composite_result.background, bg_srgb_view, &layer_render_params)
.map_err(|e| format!("Failed to render background: {}", e))?;
let mut convert_encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("export_bg_srgb_to_linear_encoder"),
});
gpu_resources.srgb_to_linear.convert(device, &mut convert_encoder, bg_srgb_view, bg_hdr_view);
queue.submit(Some(convert_encoder.finish()));
let bg_compositor_layer = CompositorLayer::normal(bg_hdr_handle, 1.0);
let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("export_bg_composite_encoder"),
});
gpu_resources.compositor.composite(
device,
queue,
&mut encoder,
&[bg_compositor_layer],
&gpu_resources.buffer_pool,
&gpu_resources.hdr_texture_view,
Some([0.0, 0.0, 0.0, 1.0]),
);
queue.submit(Some(encoder.finish()));
}
gpu_resources.buffer_pool.release(bg_srgb_handle);
gpu_resources.buffer_pool.release(bg_hdr_handle);
// Render and composite each layer incrementally
for rendered_layer in &composite_result.layers {
if !rendered_layer.has_content {
continue;
}
match &rendered_layer.layer_type {
RenderedLayerType::Content => {
let srgb_handle = gpu_resources.buffer_pool.acquire(device, layer_spec);
let hdr_layer_handle = gpu_resources.buffer_pool.acquire(device, hdr_spec);
if let (Some(srgb_view), Some(hdr_layer_view)) = (
gpu_resources.buffer_pool.get_view(srgb_handle),
gpu_resources.buffer_pool.get_view(hdr_layer_handle),
) {
renderer.render_to_texture(device, queue, &rendered_layer.scene, srgb_view, &layer_render_params)
.map_err(|e| format!("Failed to render layer: {}", e))?;
let mut convert_encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("export_layer_srgb_to_linear_encoder"),
});
gpu_resources.srgb_to_linear.convert(device, &mut convert_encoder, srgb_view, hdr_layer_view);
queue.submit(Some(convert_encoder.finish()));
let compositor_layer = CompositorLayer::normal(hdr_layer_handle, rendered_layer.opacity);
let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("export_layer_composite_encoder"),
});
gpu_resources.compositor.composite(
device,
queue,
&mut encoder,
&[compositor_layer],
&gpu_resources.buffer_pool,
&gpu_resources.hdr_texture_view,
None,
);
queue.submit(Some(encoder.finish()));
}
gpu_resources.buffer_pool.release(srgb_handle);
gpu_resources.buffer_pool.release(hdr_layer_handle);
}
RenderedLayerType::Effect { effect_instances } => {
// Effect layer - apply effects to the current HDR accumulator
let current_time = document.current_time;
for effect_instance in effect_instances {
// Get effect definition from document
let Some(effect_def) = document.get_effect_definition(&effect_instance.clip_id) else {
continue;
};
// Compile effect if needed
if !gpu_resources.effect_processor.is_compiled(&effect_def.id) {
let success = gpu_resources.effect_processor.compile_effect(device, effect_def);
if !success {
eprintln!("Failed to compile effect: {}", effect_def.name);
continue;
}
}
// Create EffectInstance from ClipInstance for the processor
let effect_inst = lightningbeam_core::effect::EffectInstance::new(
effect_def,
effect_instance.timeline_start,
effect_instance.timeline_start + effect_instance.effective_duration(lightningbeam_core::effect::EFFECT_DURATION),
);
// Acquire temp buffer for effect output (HDR format)
let effect_output_handle = gpu_resources.buffer_pool.acquire(device, hdr_spec);
if let Some(effect_output_view) = gpu_resources.buffer_pool.get_view(effect_output_handle) {
let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("export_effect_encoder"),
});
// Apply effect: HDR accumulator → effect output buffer
let applied = gpu_resources.effect_processor.apply_effect(
device,
queue,
&mut encoder,
effect_def,
&effect_inst,
&gpu_resources.hdr_texture_view,
effect_output_view,
width,
height,
current_time,
);
if applied {
// Copy effect output back to HDR accumulator
encoder.copy_texture_to_texture(
wgpu::TexelCopyTextureInfo {
texture: gpu_resources.buffer_pool.get_texture(effect_output_handle).unwrap(),
mip_level: 0,
origin: wgpu::Origin3d::ZERO,
aspect: wgpu::TextureAspect::All,
},
wgpu::TexelCopyTextureInfo {
texture: &gpu_resources.hdr_texture,
mip_level: 0,
origin: wgpu::Origin3d::ZERO,
aspect: wgpu::TextureAspect::All,
},
wgpu::Extent3d {
width,
height,
depth_or_array_layers: 1,
},
);
}
queue.submit(Some(encoder.finish()));
}
gpu_resources.buffer_pool.release(effect_output_handle);
}
}
}
}
// Convert HDR to sRGB (linear → sRGB), render directly to external RGBA texture
let output_view = rgba_texture_view;
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
label: Some("export_linear_to_srgb_bind_group"),
layout: &gpu_resources.linear_to_srgb_bind_group_layout,
entries: &[
wgpu::BindGroupEntry {
binding: 0,
resource: wgpu::BindingResource::TextureView(&gpu_resources.hdr_texture_view),
},
wgpu::BindGroupEntry {
binding: 1,
resource: wgpu::BindingResource::Sampler(&gpu_resources.linear_to_srgb_sampler),
},
],
});
let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("export_linear_to_srgb_encoder"),
});
{
let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
label: Some("export_linear_to_srgb_pass"),
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
view: output_view,
resolve_target: None,
ops: wgpu::Operations {
load: wgpu::LoadOp::Clear(wgpu::Color::BLACK),
store: wgpu::StoreOp::Store,
},
depth_slice: None,
})],
depth_stencil_attachment: None,
occlusion_query_set: None,
timestamp_writes: None,
});
render_pass.set_pipeline(&gpu_resources.linear_to_srgb_pipeline);
render_pass.set_bind_group(0, &bind_group, &[]);
render_pass.draw(0..4, 0..1);
}
// Return encoder for caller to submit (ReadbackPipeline will handle submission and async readback)
// Frame is already rendered to external RGBA texture, no GPU YUV conversion needed
Ok(encoder)
}
#[cfg(test)]

View File

@ -2936,7 +2936,6 @@ impl eframe::App for EditorApp {
}
}
if let Some(progress) = orchestrator.poll_progress() {
println!("📨 [MAIN] Received progress from orchestrator!");
match progress {
lightningbeam_core::export::ExportProgress::Started { total_frames } => {
println!("Export started: {} frames", total_frames);