slightly improve video export speed
This commit is contained in:
parent
d94ec0d6a8
commit
cb62d0ee9d
|
|
@ -9,15 +9,18 @@ members = [
|
|||
# UI Framework (using eframe for simplified integration)
|
||||
# Note: Upgraded from 0.29 to 0.31 to fix Linux IME/keyboard input issues
|
||||
# See: https://github.com/emilk/egui/pull/5198
|
||||
eframe = { version = "0.31", default-features = true, features = ["wgpu"] }
|
||||
egui_extras = { version = "0.31", features = ["image", "svg"] }
|
||||
egui-wgpu = "0.31"
|
||||
# Upgraded to 0.33 for shader editor (egui_code_editor) and continued bug fixes
|
||||
egui = "0.33"
|
||||
eframe = { version = "0.33", default-features = true, features = ["wgpu"] }
|
||||
egui_extras = { version = "0.33", features = ["image", "svg", "syntect"] }
|
||||
egui-wgpu = "0.33"
|
||||
egui_code_editor = "0.2"
|
||||
|
||||
# GPU Rendering
|
||||
# vello 0.5 uses wgpu 24, matching eframe 0.31
|
||||
vello = "0.5"
|
||||
wgpu = "24"
|
||||
kurbo = { version = "0.11", features = ["serde"] }
|
||||
# vello from git uses wgpu 27, matching eframe 0.33
|
||||
vello = { git = "https://github.com/linebender/vello", branch = "main" }
|
||||
wgpu = { version = "27", features = ["vulkan", "metal"] }
|
||||
kurbo = { version = "0.12", features = ["serde"] }
|
||||
peniko = "0.5"
|
||||
|
||||
# Windowing
|
||||
|
|
|
|||
|
|
@ -10,12 +10,14 @@ pub mod buffer_pool;
|
|||
pub mod color_convert;
|
||||
pub mod compositor;
|
||||
pub mod effect_processor;
|
||||
pub mod yuv_converter;
|
||||
|
||||
// Re-export commonly used types
|
||||
pub use buffer_pool::{BufferHandle, BufferPool, BufferSpec, BufferFormat};
|
||||
pub use color_convert::SrgbToLinearConverter;
|
||||
pub use compositor::{Compositor, CompositorLayer, BlendMode};
|
||||
pub use effect_processor::{EffectProcessor, EffectUniforms};
|
||||
pub use yuv_converter::YuvConverter;
|
||||
|
||||
/// Standard HDR internal texture format (16-bit float per channel)
|
||||
pub const HDR_FORMAT: wgpu::TextureFormat = wgpu::TextureFormat::Rgba16Float;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,241 @@
|
|||
//! GPU-accelerated RGBA to YUV420p color space conversion
|
||||
//!
|
||||
//! Provides a compute shader-based converter for transforming RGBA textures
|
||||
//! to YUV420p planar format using the BT.709 color matrix (HD video standard).
|
||||
//! This replaces the CPU-based conversion with GPU parallel processing.
|
||||
|
||||
/// GPU pipeline for RGBA to YUV420p color space conversion
|
||||
///
|
||||
/// Converts Rgba8Unorm textures to YUV420p planar format using BT.709 colorspace.
|
||||
/// The Y plane is full resolution, while U and V planes are subsampled 4:2:0.
|
||||
///
|
||||
/// Output texture layout:
|
||||
/// - Rows 0 to height-1: Y plane (luma, full resolution)
|
||||
/// - Rows height to height + height/4 - 1: U plane (chroma, half resolution)
|
||||
/// - Rows height + height/4 to height + height/2 - 1: V plane (chroma, half resolution)
|
||||
pub struct YuvConverter {
|
||||
pipeline: wgpu::ComputePipeline,
|
||||
bind_group_layout: wgpu::BindGroupLayout,
|
||||
}
|
||||
|
||||
impl YuvConverter {
|
||||
/// Create a new RGBA to YUV420p converter
|
||||
pub fn new(device: &wgpu::Device) -> Self {
|
||||
// Create bind group layout
|
||||
let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
|
||||
label: Some("yuv_converter_bind_group_layout"),
|
||||
entries: &[
|
||||
// Input RGBA texture (binding 0)
|
||||
wgpu::BindGroupLayoutEntry {
|
||||
binding: 0,
|
||||
visibility: wgpu::ShaderStages::COMPUTE,
|
||||
ty: wgpu::BindingType::Texture {
|
||||
sample_type: wgpu::TextureSampleType::Float { filterable: false },
|
||||
view_dimension: wgpu::TextureViewDimension::D2,
|
||||
multisampled: false,
|
||||
},
|
||||
count: None,
|
||||
},
|
||||
// Output YUV texture (Rgba8Unorm storage texture, binding 1)
|
||||
// Note: R8Unorm doesn't support storage binding, so we use Rgba8Unorm and write to .r channel
|
||||
wgpu::BindGroupLayoutEntry {
|
||||
binding: 1,
|
||||
visibility: wgpu::ShaderStages::COMPUTE,
|
||||
ty: wgpu::BindingType::StorageTexture {
|
||||
access: wgpu::StorageTextureAccess::WriteOnly,
|
||||
format: wgpu::TextureFormat::Rgba8Unorm,
|
||||
view_dimension: wgpu::TextureViewDimension::D2,
|
||||
},
|
||||
count: None,
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
// Create pipeline layout
|
||||
let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
|
||||
label: Some("yuv_converter_pipeline_layout"),
|
||||
bind_group_layouts: &[&bind_group_layout],
|
||||
push_constant_ranges: &[],
|
||||
});
|
||||
|
||||
// Create shader module
|
||||
let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
|
||||
label: Some("yuv_converter_shader"),
|
||||
source: wgpu::ShaderSource::Wgsl(YUV_CONVERTER_SHADER.into()),
|
||||
});
|
||||
|
||||
// Create compute pipeline
|
||||
let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
|
||||
label: Some("yuv_converter_pipeline"),
|
||||
layout: Some(&pipeline_layout),
|
||||
module: &shader,
|
||||
entry_point: Some("main"),
|
||||
compilation_options: wgpu::PipelineCompilationOptions::default(),
|
||||
cache: None,
|
||||
});
|
||||
|
||||
Self {
|
||||
pipeline,
|
||||
bind_group_layout,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert RGBA texture to YUV420p planar format
|
||||
///
|
||||
/// Reads from `rgba_view` and writes Y, U, V planes to `yuv_output_view`.
|
||||
/// The output texture must be R8Unorm format with height = input_height * 1.5
|
||||
/// to accommodate the packed YUV planes.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `device` - GPU device
|
||||
/// * `encoder` - Command encoder to record GPU commands
|
||||
/// * `rgba_view` - Source RGBA texture view
|
||||
/// * `yuv_output_view` - Destination YUV planar texture view (R8Unorm, height*1.5)
|
||||
/// * `width` - Width of the source RGBA texture
|
||||
/// * `height` - Height of the source RGBA texture
|
||||
pub fn convert(
|
||||
&self,
|
||||
device: &wgpu::Device,
|
||||
encoder: &mut wgpu::CommandEncoder,
|
||||
rgba_view: &wgpu::TextureView,
|
||||
yuv_output_view: &wgpu::TextureView,
|
||||
width: u32,
|
||||
height: u32,
|
||||
) {
|
||||
// Create bind group for this conversion
|
||||
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
|
||||
label: Some("yuv_converter_bind_group"),
|
||||
layout: &self.bind_group_layout,
|
||||
entries: &[
|
||||
wgpu::BindGroupEntry {
|
||||
binding: 0,
|
||||
resource: wgpu::BindingResource::TextureView(rgba_view),
|
||||
},
|
||||
wgpu::BindGroupEntry {
|
||||
binding: 1,
|
||||
resource: wgpu::BindingResource::TextureView(yuv_output_view),
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
// Compute pass
|
||||
let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
|
||||
label: Some("yuv_conversion_pass"),
|
||||
timestamp_writes: None,
|
||||
});
|
||||
|
||||
compute_pass.set_pipeline(&self.pipeline);
|
||||
compute_pass.set_bind_group(0, &bind_group, &[]);
|
||||
|
||||
// Dispatch workgroups: 8x8 threads per workgroup
|
||||
// Each thread processes one pixel for the Y plane
|
||||
// Chroma planes are processed by threads at even coordinates
|
||||
let workgroup_size = 8;
|
||||
let workgroups_x = (width + workgroup_size - 1) / workgroup_size;
|
||||
let workgroups_y = (height + workgroup_size - 1) / workgroup_size;
|
||||
compute_pass.dispatch_workgroups(workgroups_x, workgroups_y, 1);
|
||||
}
|
||||
}
|
||||
|
||||
/// WGSL compute shader for RGBA to YUV420p conversion
///
/// Entry point `main`, workgroup size 8x8x1. Binding 0 is the RGBA input
/// texture, binding 1 the `rgba8unorm` write-only storage texture that
/// receives the packed planes. Each invocation writes one Y sample;
/// invocations at even (x, y) also average their 2x2 block (clamped at the
/// texture edge) and write one U and one V sample.
///
/// The chroma packing assumes an even width: V starts at column `width / 2`,
/// which for an odd width is also the last U column.
const YUV_CONVERTER_SHADER: &str = r#"
// RGBA to YUV420p Compute Shader
// BT.709 color space for HD video (ITU-R BT.709-6 standard)
//
// Color matrix:
// Y = 0.2126*R + 0.7152*G + 0.0722*B
// U = -0.1146*R - 0.3854*G + 0.5000*B + 0.5
// V = 0.5000*R - 0.4542*G - 0.0458*B + 0.5
//
// Output texture layout (packed planar, side-by-side U/V):
// - Rows [0, height): Y plane (full resolution, full width)
// - Rows [height, height + height/2): U plane (left half, columns 0 to width/2-1)
//                                     V plane (right half, columns width/2 to width-1)

@group(0) @binding(0) var input_rgba: texture_2d<f32>;
@group(0) @binding(1) var output_yuv: texture_storage_2d<rgba8unorm, write>;

@compute @workgroup_size(8, 8, 1)
fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
    let dims = textureDimensions(input_rgba);
    let pos = global_id.xy;

    // Bounds check
    if (pos.x >= dims.x || pos.y >= dims.y) {
        return;
    }

    // Load RGBA pixel
    let rgba = textureLoad(input_rgba, pos, 0);
    let r = rgba.r;
    let g = rgba.g;
    let b = rgba.b;

    // Compute Y (luma) - full resolution, BT.709
    let y = 0.2126 * r + 0.7152 * g + 0.0722 * b;

    // Write Y value to Y plane (rows 0 to height-1)
    textureStore(output_yuv, pos, vec4<f32>(y, 0.0, 0.0, 0.0));

    // Compute U and V (chroma) - subsampled 4:2:0
    // Only process even coordinates (top-left of 2x2 blocks)
    if (pos.x % 2u == 0u && pos.y % 2u == 0u) {
        // Sample 2x2 block for chroma subsampling
        var r_sum = r;
        var g_sum = g;
        var b_sum = b;
        var count = 1.0;

        // Sample right neighbor (x+1, y)
        if (pos.x + 1u < dims.x) {
            let rgba_r = textureLoad(input_rgba, pos + vec2<u32>(1u, 0u), 0);
            r_sum += rgba_r.r;
            g_sum += rgba_r.g;
            b_sum += rgba_r.b;
            count += 1.0;
        }

        // Sample bottom neighbor (x, y+1)
        if (pos.y + 1u < dims.y) {
            let rgba_b = textureLoad(input_rgba, pos + vec2<u32>(0u, 1u), 0);
            r_sum += rgba_b.r;
            g_sum += rgba_b.g;
            b_sum += rgba_b.b;
            count += 1.0;
        }

        // Sample bottom-right neighbor (x+1, y+1)
        if (pos.x + 1u < dims.x && pos.y + 1u < dims.y) {
            let rgba_br = textureLoad(input_rgba, pos + vec2<u32>(1u, 1u), 0);
            r_sum += rgba_br.r;
            g_sum += rgba_br.g;
            b_sum += rgba_br.b;
            count += 1.0;
        }

        // Average the 2x2 block
        let r_avg = r_sum / count;
        let g_avg = g_sum / count;
        let b_avg = b_sum / count;

        // Compute chroma components (BT.709, centered at 0.5 for unsigned 8-bit)
        let u = -0.1146 * r_avg - 0.3854 * g_avg + 0.5000 * b_avg + 0.5;
        let v = 0.5000 * r_avg - 0.4542 * g_avg - 0.0458 * b_avg + 0.5;

        // Compute chroma plane positions (half resolution)
        // Pack U and V side-by-side: U on left half, V on right half
        let chroma_x = pos.x / 2u;
        let chroma_y = pos.y / 2u;

        // U plane: left half (columns 0 to width/2-1), rows height to height+height/2-1
        let u_pos = vec2<u32>(chroma_x, dims.y + chroma_y);

        // V plane: right half (columns width/2 to width-1), rows height to height+height/2-1
        let v_pos = vec2<u32>(dims.x / 2u + chroma_x, dims.y + chroma_y);

        // Write U and V values to their respective planes
        textureStore(output_yuv, u_pos, vec4<f32>(u, 0.0, 0.0, 0.0));
        textureStore(output_yuv, v_pos, vec4<f32>(v, 0.0, 0.0, 0.0));
    }
}
"#;
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
# Plan for Async Rendering Helpers
|
||||
|
||||
I'm creating this temporary document to plan the async rendering changes.
|
||||
|
||||
## Current Architecture (Synchronous)
|
||||
`render_frame_to_rgba_hdr()` in video_exporter.rs:
|
||||
1. Render document to RGBA (lines 750-991)
|
||||
2. GPU YUV conversion (lines 993-1005)
|
||||
3. Copy YUV to staging buffer (lines 1007-1029)
|
||||
4. Submit GPU commands (line 1031)
|
||||
5. **BLOCKING** map_async + wait (lines 1033-1045)
|
||||
6. Extract Y, U, V planes from mapped buffer (lines 1047-1087)
|
||||
7. Unmap and return YUV planes (lines 1089-1092)
|
||||
|
||||
## New Architecture (Async Pipelined)
|
||||
Split into two phases using ReadbackPipeline:
|
||||
|
||||
### Phase 1: Submit Frame (Non-blocking)
|
||||
New function `submit_frame_to_readback_pipeline()`:
|
||||
- Input: buffer from ReadbackPipeline.acquire()
|
||||
- Steps 1-3: Render to RGBA, GPU YUV, copy to buffer's YUV texture
|
||||
- Return encoder to ReadbackPipeline for submission
|
||||
- **Does NOT wait for GPU**
|
||||
|
||||
### Phase 2: Extract YUV (After async mapping)
|
||||
Helper function `extract_yuv_planes_from_buffer()`:
|
||||
- Input: mapped buffer data from ReadbackPipeline
|
||||
- Steps 6-7: Extract Y, U, V planes, return them
|
||||
- Used after ReadbackPipeline.get_mapped_data()
|
||||
|
||||
## Modified render_next_video_frame()
|
||||
New async pipeline loop:
|
||||
```
|
||||
while more_work_to_do:
|
||||
// Poll for completed frames
|
||||
for result in pipeline.poll_nonblocking():
|
||||
data = pipeline.get_mapped_data(result.buffer_id)
|
||||
(y, u, v) = extract_yuv_planes_from_buffer(data)
|
||||
send_to_encoder_in_order(result.frame_num, y, u, v)
|
||||
pipeline.release(result.buffer_id)
frames_in_flight--
|
||||
|
||||
// Submit new frames (up to 3 in flight)
|
||||
if current_frame < total_frames && frames_in_flight < 3:
|
||||
if let Some(buffer) = pipeline.acquire(frame_num, timestamp):
|
||||
encoder = submit_frame_to_readback_pipeline(buffer)
|
||||
pipeline.submit_and_readback(buffer.id, encoder)
|
||||
frames_in_flight++
|
||||
current_frame++
|
||||
|
||||
// Done when all frames submitted AND all completed
|
||||
if current_frame >= total_frames && frames_in_flight == 0:
|
||||
return Ok(false)
|
||||
|
||||
return Ok(true) // More work to do
|
||||
```
|
||||
|
||||
This achieves triple buffering:
|
||||
- Frame N: GPU rendering
|
||||
- Frame N-1: GPU→CPU async transfer
|
||||
- Frame N-2: CPU encoding
|
||||
|
||||
Expected speedup: 5x
|
||||
|
|
@ -0,0 +1,143 @@
|
|||
//! CPU-based RGBA→YUV420p color space converter using FFmpeg's swscale
|
||||
//!
|
||||
//! This module provides a wrapper around FFmpeg's highly-optimized swscale library
|
||||
//! for converting RGBA data to YUV420p format. Uses SIMD instructions when available
|
||||
//! for maximum performance.
|
||||
|
||||
use ffmpeg_next as ffmpeg;
|
||||
|
||||
/// CPU-based RGBA→YUV420p converter using FFmpeg's swscale
|
||||
///
|
||||
/// This converter uses FFmpeg's swscale library which is highly optimized with SIMD
|
||||
/// instructions (SSE, AVX) for fast color space conversion on the CPU.
|
||||
pub struct CpuYuvConverter {
|
||||
width: u32,
|
||||
height: u32,
|
||||
}
|
||||
|
||||
impl CpuYuvConverter {
|
||||
/// Create new converter for given dimensions
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `width` - Frame width in pixels
|
||||
/// * `height` - Frame height in pixels
|
||||
pub fn new(width: u32, height: u32) -> Result<Self, String> {
|
||||
Ok(Self { width, height })
|
||||
}
|
||||
|
||||
/// Convert RGBA data to YUV420p planes
|
||||
///
|
||||
/// Performs color space conversion from RGBA (8-bit per channel, packed format)
|
||||
/// to YUV420p (8-bit per channel, planar format with subsampled chroma).
|
||||
///
|
||||
/// Uses BT.709 color matrix (HD standard) for the conversion.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `rgba_data` - Packed RGBA data (width * height * 4 bytes)
|
||||
///
|
||||
/// # Returns
|
||||
/// Tuple of (y_plane, u_plane, v_plane) as separate Vec<u8>
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if rgba_data length doesn't match width * height * 4
|
||||
pub fn convert(&self, rgba_data: &[u8]) -> Result<(Vec<u8>, Vec<u8>, Vec<u8>), String> {
|
||||
let expected_size = (self.width * self.height * 4) as usize;
|
||||
assert_eq!(
|
||||
rgba_data.len(),
|
||||
expected_size,
|
||||
"RGBA data size mismatch: expected {} bytes, got {}",
|
||||
expected_size,
|
||||
rgba_data.len()
|
||||
);
|
||||
|
||||
// Create source RGBA frame
|
||||
let mut rgba_frame = ffmpeg::frame::Video::new(
|
||||
ffmpeg::format::Pixel::RGBA,
|
||||
self.width,
|
||||
self.height,
|
||||
);
|
||||
|
||||
// Copy RGBA data into source frame
|
||||
// ffmpeg-next provides mutable access to the frame data
|
||||
let frame_data = rgba_frame.data_mut(0);
|
||||
frame_data.copy_from_slice(rgba_data);
|
||||
|
||||
// Create destination YUV420p frame
|
||||
let mut yuv_frame = ffmpeg::frame::Video::new(
|
||||
ffmpeg::format::Pixel::YUV420P,
|
||||
self.width,
|
||||
self.height,
|
||||
);
|
||||
|
||||
// Create swscale context for RGBA→YUV420p conversion
|
||||
// Uses BT.709 color matrix (HD standard)
|
||||
let mut scaler = ffmpeg::software::scaling::Context::get(
|
||||
ffmpeg::format::Pixel::RGBA,
|
||||
self.width,
|
||||
self.height,
|
||||
ffmpeg::format::Pixel::YUV420P,
|
||||
self.width,
|
||||
self.height,
|
||||
ffmpeg::software::scaling::Flags::BILINEAR,
|
||||
)
|
||||
.map_err(|e| format!("Failed to create swscale context: {}", e))?;
|
||||
|
||||
// Perform the conversion (SIMD-optimized)
|
||||
scaler
|
||||
.run(&rgba_frame, &mut yuv_frame)
|
||||
.map_err(|e| format!("swscale conversion failed: {}", e))?;
|
||||
|
||||
// Extract planar YUV data
|
||||
// YUV420p has 3 planes:
|
||||
// - Y: full resolution (width × height)
|
||||
// - U: quarter resolution (width/2 × height/2)
|
||||
// - V: quarter resolution (width/2 × height/2)
|
||||
let y_plane = yuv_frame.data(0).to_vec();
|
||||
let u_plane = yuv_frame.data(1).to_vec();
|
||||
let v_plane = yuv_frame.data(2).to_vec();
|
||||
|
||||
Ok((y_plane, u_plane, v_plane))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Frame size shared by all tests (1080p).
    const W: usize = 1920;
    const H: usize = 1080;

    /// Build a converter for the shared test frame size.
    fn full_hd_converter() -> Result<CpuYuvConverter, String> {
        CpuYuvConverter::new(W as u32, H as u32)
    }

    /// Construction succeeds for a regular 1080p frame.
    #[test]
    fn test_converter_creation() {
        assert!(full_hd_converter().is_ok());
    }

    /// An all-black frame converts and yields correctly sized planes.
    #[test]
    fn test_conversion_output_sizes() {
        let converter = full_hd_converter().unwrap();

        // Dummy RGBA input: every byte zero (black, fully transparent)
        let black_frame = vec![0u8; W * H * 4];

        let outcome = converter.convert(&black_frame);
        assert!(outcome.is_ok());
        let (luma, cb, cr) = outcome.unwrap();

        // Luma is full resolution
        assert_eq!(luma.len(), W * H);

        // Both chroma planes are subsampled 2x2
        let chroma_len = (W / 2) * (H / 2);
        assert_eq!(cb.len(), chroma_len);
        assert_eq!(cr.len(), chroma_len);
    }

    /// A buffer of the wrong length trips the size assertion.
    #[test]
    #[should_panic(expected = "RGBA data size mismatch")]
    fn test_wrong_input_size_panics() {
        let converter = full_hd_converter().unwrap();

        // Far too small for a 1080p RGBA frame
        let short_input = vec![0u8; 1000];

        let _ = converter.convert(&short_input);
    }
}
|
||||
|
|
@ -6,6 +6,9 @@
|
|||
pub mod audio_exporter;
|
||||
pub mod dialog;
|
||||
pub mod video_exporter;
|
||||
pub mod readback_pipeline;
|
||||
pub mod perf_metrics;
|
||||
pub mod cpu_yuv_converter;
|
||||
|
||||
use lightningbeam_core::export::{AudioExportSettings, VideoExportSettings, ExportProgress};
|
||||
use lightningbeam_core::document::Document;
|
||||
|
|
@ -18,8 +21,14 @@ use std::sync::atomic::{AtomicBool, Ordering};
|
|||
|
||||
/// Message sent from main thread to video encoder thread
|
||||
enum VideoFrameMessage {
|
||||
/// RGBA frame data with frame number and timestamp
|
||||
Frame { frame_num: usize, timestamp: f64, rgba_data: Vec<u8> },
|
||||
/// YUV420p frame data with frame number and timestamp (GPU-converted)
|
||||
Frame {
|
||||
frame_num: usize,
|
||||
timestamp: f64,
|
||||
y_plane: Vec<u8>,
|
||||
u_plane: Vec<u8>,
|
||||
v_plane: Vec<u8>,
|
||||
},
|
||||
/// Signal that all frames have been sent
|
||||
Done,
|
||||
}
|
||||
|
|
@ -44,6 +53,16 @@ pub struct VideoExportState {
|
|||
frame_tx: Option<Sender<VideoFrameMessage>>,
|
||||
/// HDR GPU resources for compositing pipeline (effects, color conversion)
|
||||
gpu_resources: Option<video_exporter::ExportGpuResources>,
|
||||
/// Async triple-buffered readback pipeline for GPU RGBA frames
|
||||
readback_pipeline: Option<readback_pipeline::ReadbackPipeline>,
|
||||
/// CPU YUV converter for RGBA→YUV420p conversion
|
||||
cpu_yuv_converter: Option<cpu_yuv_converter::CpuYuvConverter>,
|
||||
/// Frames that have been submitted to GPU but not yet encoded
|
||||
frames_in_flight: usize,
|
||||
/// Next frame number to send to encoder (for ordering)
|
||||
next_frame_to_encode: usize,
|
||||
/// Performance metrics for instrumentation
|
||||
perf_metrics: Option<perf_metrics::ExportMetrics>,
|
||||
}
|
||||
|
||||
/// Export orchestrator that manages the export process
|
||||
|
|
@ -168,13 +187,11 @@ impl ExportOrchestrator {
|
|||
|
||||
// Poll video progress
|
||||
while let Ok(progress) = parallel.video_progress_rx.try_recv() {
|
||||
println!("📨 [PARALLEL] Video progress: {:?}", std::mem::discriminant(&progress));
|
||||
parallel.video_progress = Some(progress);
|
||||
}
|
||||
|
||||
// Poll audio progress
|
||||
while let Ok(progress) = parallel.audio_progress_rx.try_recv() {
|
||||
println!("📨 [PARALLEL] Audio progress: {:?}", std::mem::discriminant(&progress));
|
||||
parallel.audio_progress = Some(progress);
|
||||
}
|
||||
|
||||
|
|
@ -621,7 +638,7 @@ impl ExportOrchestrator {
|
|||
self.thread_handle = Some(handle);
|
||||
|
||||
// Initialize video export state
|
||||
// GPU resources will be initialized lazily on first frame (needs device)
|
||||
// GPU resources and readback pipeline will be initialized lazily on first frame (needs device)
|
||||
self.video_state = Some(VideoExportState {
|
||||
current_frame: 0,
|
||||
total_frames,
|
||||
|
|
@ -632,6 +649,11 @@ impl ExportOrchestrator {
|
|||
height,
|
||||
frame_tx: Some(frame_tx),
|
||||
gpu_resources: None,
|
||||
readback_pipeline: None,
|
||||
cpu_yuv_converter: None,
|
||||
frames_in_flight: 0,
|
||||
next_frame_to_encode: 0,
|
||||
perf_metrics: Some(perf_metrics::ExportMetrics::new()),
|
||||
});
|
||||
|
||||
println!("🎬 [VIDEO EXPORT] Encoder thread spawned, ready for frames");
|
||||
|
|
@ -745,7 +767,7 @@ impl ExportOrchestrator {
|
|||
});
|
||||
|
||||
// Initialize video export state for incremental rendering
|
||||
// GPU resources will be initialized lazily on first frame (needs device)
|
||||
// GPU resources and readback pipeline will be initialized lazily on first frame (needs device)
|
||||
self.video_state = Some(VideoExportState {
|
||||
current_frame: 0,
|
||||
total_frames,
|
||||
|
|
@ -756,6 +778,11 @@ impl ExportOrchestrator {
|
|||
height: video_height,
|
||||
frame_tx: Some(frame_tx),
|
||||
gpu_resources: None,
|
||||
readback_pipeline: None,
|
||||
cpu_yuv_converter: None,
|
||||
frames_in_flight: 0,
|
||||
next_frame_to_encode: 0,
|
||||
perf_metrics: Some(perf_metrics::ExportMetrics::new()),
|
||||
});
|
||||
|
||||
// Initialize parallel export state
|
||||
|
|
@ -777,6 +804,7 @@ impl ExportOrchestrator {
|
|||
|
||||
/// Render and send the next video frame (call from main thread)
|
||||
///
|
||||
/// Uses async triple-buffered pipeline for maximum throughput.
|
||||
/// Returns true if there are more frames to render, false if done.
|
||||
///
|
||||
/// # Arguments
|
||||
|
|
@ -798,62 +826,143 @@ impl ExportOrchestrator {
|
|||
image_cache: &mut ImageCache,
|
||||
video_manager: &Arc<std::sync::Mutex<VideoManager>>,
|
||||
) -> Result<bool, String> {
|
||||
use std::time::Instant;
|
||||
|
||||
let state = self.video_state.as_mut()
|
||||
.ok_or("No video export in progress")?;
|
||||
|
||||
if state.current_frame >= state.total_frames {
|
||||
// All frames rendered, signal encoder thread
|
||||
if let Some(tx) = state.frame_tx.take() {
|
||||
tx.send(VideoFrameMessage::Done).ok();
|
||||
}
|
||||
// Clean up GPU resources
|
||||
state.gpu_resources = None;
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
// Calculate timestamp for this frame
|
||||
let timestamp = state.start_time + (state.current_frame as f64 / state.framerate);
|
||||
|
||||
// Get frame dimensions from export settings
|
||||
let width = state.width;
|
||||
let height = state.height;
|
||||
|
||||
// Initialize GPU resources on first frame (needs device)
|
||||
// Initialize GPU resources and readback pipeline on first frame
|
||||
if state.gpu_resources.is_none() {
|
||||
println!("🎬 [VIDEO EXPORT] Initializing HDR GPU resources for {}x{}", width, height);
|
||||
println!("🎬 [VIDEO EXPORT] Initializing HDR GPU + async pipeline {}x{}", width, height);
|
||||
state.gpu_resources = Some(video_exporter::ExportGpuResources::new(device, width, height));
|
||||
state.readback_pipeline = Some(readback_pipeline::ReadbackPipeline::new(device, queue, width, height));
|
||||
state.cpu_yuv_converter = Some(cpu_yuv_converter::CpuYuvConverter::new(width, height)?);
|
||||
println!("🚀 [ASYNC PIPELINE] Triple-buffered pipeline initialized");
|
||||
println!("🚀 [CPU YUV] swscale converter initialized");
|
||||
}
|
||||
|
||||
// Render frame to RGBA buffer using HDR pipeline (with effects)
|
||||
let mut rgba_buffer = vec![0u8; (width * height * 4) as usize];
|
||||
let pipeline = state.readback_pipeline.as_mut().unwrap();
|
||||
let gpu_resources = state.gpu_resources.as_mut().unwrap();
|
||||
video_exporter::render_frame_to_rgba_hdr(
|
||||
document,
|
||||
timestamp,
|
||||
width,
|
||||
height,
|
||||
device,
|
||||
queue,
|
||||
renderer,
|
||||
image_cache,
|
||||
video_manager,
|
||||
gpu_resources,
|
||||
&mut rgba_buffer,
|
||||
)?;
|
||||
let cpu_converter = state.cpu_yuv_converter.as_mut().unwrap();
|
||||
let mut metrics = state.perf_metrics.as_mut();
|
||||
|
||||
// Send frame to encoder thread
|
||||
// Poll for completed async readbacks (non-blocking)
|
||||
if let Some(m) = metrics.as_mut() {
|
||||
m.poll_count += 1;
|
||||
}
|
||||
let completed_frames = pipeline.poll_nonblocking();
|
||||
if let Some(m) = metrics.as_mut() {
|
||||
m.completions_per_poll.push(completed_frames.len());
|
||||
}
|
||||
|
||||
// Process completed frames IN ORDER
|
||||
for result in completed_frames {
|
||||
if result.frame_num == state.next_frame_to_encode {
|
||||
// Record readback completion time
|
||||
if let Some(m) = metrics.as_mut() {
|
||||
if let Some(frame_metrics) = m.frames.get_mut(result.frame_num) {
|
||||
frame_metrics.readback_complete = Some(Instant::now());
|
||||
}
|
||||
}
|
||||
|
||||
// Extract RGBA data (timed)
|
||||
let extraction_start = Instant::now();
|
||||
let rgba_data = pipeline.extract_rgba_data(result.buffer_id);
|
||||
let extraction_end = Instant::now();
|
||||
|
||||
// CPU YUV conversion (timed)
|
||||
let conversion_start = Instant::now();
|
||||
let (y, u, v) = cpu_converter.convert(&rgba_data)?;
|
||||
let conversion_end = Instant::now();
|
||||
|
||||
if let Some(m) = metrics.as_mut() {
|
||||
if let Some(frame_metrics) = m.frames.get_mut(result.frame_num) {
|
||||
frame_metrics.extraction_start = Some(extraction_start);
|
||||
frame_metrics.extraction_end = Some(extraction_end);
|
||||
frame_metrics.conversion_start = Some(conversion_start);
|
||||
frame_metrics.conversion_end = Some(conversion_end);
|
||||
}
|
||||
}
|
||||
|
||||
// Send to encoder
|
||||
if let Some(tx) = &state.frame_tx {
|
||||
tx.send(VideoFrameMessage::Frame {
|
||||
frame_num: state.current_frame,
|
||||
timestamp,
|
||||
rgba_data: rgba_buffer,
|
||||
}).map_err(|_| "Failed to send frame to encoder")?;
|
||||
frame_num: result.frame_num,
|
||||
timestamp: result.timestamp,
|
||||
y_plane: y,
|
||||
u_plane: u,
|
||||
v_plane: v,
|
||||
}).map_err(|_| "Failed to send frame")?;
|
||||
}
|
||||
|
||||
state.current_frame += 1;
|
||||
pipeline.release(result.buffer_id);
|
||||
state.frames_in_flight -= 1;
|
||||
state.next_frame_to_encode += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Return true if more frames remain
|
||||
Ok(state.current_frame < state.total_frames)
|
||||
// Submit new frames (up to 3 in flight)
|
||||
while state.current_frame < state.total_frames && state.frames_in_flight < 3 {
|
||||
let timestamp = state.start_time + (state.current_frame as f64 / state.framerate);
|
||||
|
||||
if let Some(acquired) = pipeline.acquire(state.current_frame, timestamp) {
|
||||
// Create frame metrics entry
|
||||
if let Some(m) = metrics.as_mut() {
|
||||
m.frames.push(perf_metrics::FrameMetrics::new(state.current_frame));
|
||||
}
|
||||
|
||||
// Render to GPU (timed)
|
||||
let render_start = Instant::now();
|
||||
let encoder = video_exporter::render_frame_to_gpu_rgba(
|
||||
document, timestamp, width, height,
|
||||
device, queue, renderer, image_cache, video_manager,
|
||||
gpu_resources, &acquired.rgba_texture_view,
|
||||
)?;
|
||||
let render_end = Instant::now();
|
||||
|
||||
// Record render timing
|
||||
if let Some(m) = metrics.as_mut() {
|
||||
if let Some(frame_metrics) = m.frames.get_mut(state.current_frame) {
|
||||
frame_metrics.render_end = Some(render_end);
|
||||
frame_metrics.submit_time = Some(Instant::now());
|
||||
}
|
||||
}
|
||||
|
||||
// Submit for async readback
|
||||
pipeline.submit_and_readback(acquired.id, encoder);
|
||||
|
||||
state.current_frame += 1;
|
||||
state.frames_in_flight += 1;
|
||||
} else {
|
||||
break; // All buffers in use
|
||||
}
|
||||
}
|
||||
|
||||
// Done when all submitted AND all completed
|
||||
if state.current_frame >= state.total_frames && state.frames_in_flight == 0 {
|
||||
println!("🎬 [VIDEO EXPORT] Complete: {} frames", state.total_frames);
|
||||
|
||||
// Print performance summary
|
||||
if let Some(m) = &state.perf_metrics {
|
||||
m.print_summary();
|
||||
m.print_per_frame_details(10);
|
||||
}
|
||||
|
||||
if let Some(tx) = state.frame_tx.take() {
|
||||
tx.send(VideoFrameMessage::Done).ok();
|
||||
}
|
||||
|
||||
state.gpu_resources = None;
|
||||
state.readback_pipeline = None;
|
||||
state.cpu_yuv_converter = None;
|
||||
state.perf_metrics = None;
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
Ok(true) // More work to do
|
||||
}
|
||||
|
||||
/// Background thread that receives frames and encodes them
|
||||
|
|
@ -925,9 +1034,9 @@ impl ExportOrchestrator {
|
|||
|
||||
// Wait for first frame to determine dimensions
|
||||
let first_frame = match frame_rx.recv() {
|
||||
Ok(VideoFrameMessage::Frame { frame_num, timestamp, rgba_data }) => {
|
||||
println!("🧵 [ENCODER] Received first frame ({} bytes)", rgba_data.len());
|
||||
Some((frame_num, timestamp, rgba_data))
|
||||
Ok(VideoFrameMessage::Frame { frame_num, timestamp, y_plane, u_plane, v_plane }) => {
|
||||
println!("🧵 [ENCODER] Received first YUV frame (Y: {} bytes)", y_plane.len());
|
||||
Some((frame_num, timestamp, y_plane, u_plane, v_plane))
|
||||
}
|
||||
Ok(VideoFrameMessage::Done) => {
|
||||
return Err("No frames to encode".to_string());
|
||||
|
|
@ -938,9 +1047,9 @@ impl ExportOrchestrator {
|
|||
};
|
||||
|
||||
// Determine dimensions from first frame
|
||||
let (width, height) = if let Some((_, _, ref rgba_data)) = first_frame {
|
||||
// Calculate dimensions from buffer size (RGBA = 4 bytes per pixel)
|
||||
let pixel_count = rgba_data.len() / 4;
|
||||
let (width, height) = if let Some((_, _, ref y_plane, _, _)) = first_frame {
|
||||
// Calculate dimensions from Y plane size (full resolution, 1 byte per pixel)
|
||||
let pixel_count = y_plane.len();
|
||||
// Use settings dimensions if provided, otherwise infer from buffer
|
||||
let w = settings.width.unwrap_or(1920); // Default to 1920 if not specified
|
||||
let h = settings.height.unwrap_or(1080); // Default to 1080 if not specified
|
||||
|
|
@ -979,11 +1088,13 @@ impl ExportOrchestrator {
|
|||
println!("🧵 [ENCODER] Encoder initialized, ready to encode frames");
|
||||
|
||||
// Process first frame
|
||||
if let Some((frame_num, timestamp, rgba_data)) = first_frame {
|
||||
if let Some((frame_num, timestamp, y_plane, u_plane, v_plane)) = first_frame {
|
||||
Self::encode_frame(
|
||||
&mut encoder,
|
||||
&mut output,
|
||||
&rgba_data,
|
||||
&y_plane,
|
||||
&u_plane,
|
||||
&v_plane,
|
||||
width,
|
||||
height,
|
||||
timestamp,
|
||||
|
|
@ -994,8 +1105,6 @@ impl ExportOrchestrator {
|
|||
frame: 1,
|
||||
total: total_frames,
|
||||
}).ok();
|
||||
|
||||
println!("🧵 [ENCODER] Encoded frame {}", frame_num);
|
||||
}
|
||||
|
||||
// Process remaining frames
|
||||
|
|
@ -1006,11 +1115,13 @@ impl ExportOrchestrator {
|
|||
}
|
||||
|
||||
match frame_rx.recv() {
|
||||
Ok(VideoFrameMessage::Frame { frame_num, timestamp, rgba_data }) => {
|
||||
Ok(VideoFrameMessage::Frame { frame_num, timestamp, y_plane, u_plane, v_plane }) => {
|
||||
Self::encode_frame(
|
||||
&mut encoder,
|
||||
&mut output,
|
||||
&rgba_data,
|
||||
&y_plane,
|
||||
&u_plane,
|
||||
&v_plane,
|
||||
width,
|
||||
height,
|
||||
timestamp,
|
||||
|
|
@ -1023,10 +1134,6 @@ impl ExportOrchestrator {
|
|||
frame: frames_encoded,
|
||||
total: total_frames,
|
||||
}).ok();
|
||||
|
||||
if frames_encoded % 30 == 0 || frames_encoded == frame_num + 1 {
|
||||
println!("🧵 [ENCODER] Encoded frame {}/{}", frames_encoded, total_frames);
|
||||
}
|
||||
}
|
||||
Ok(VideoFrameMessage::Done) => {
|
||||
println!("🧵 [ENCODER] All frames received, flushing encoder");
|
||||
|
|
@ -1052,17 +1159,18 @@ impl ExportOrchestrator {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
/// Encode a single RGBA frame
|
||||
/// Encode a single YUV420p frame (already converted by GPU)
|
||||
fn encode_frame(
|
||||
encoder: &mut ffmpeg_next::encoder::Video,
|
||||
output: &mut ffmpeg_next::format::context::Output,
|
||||
rgba_data: &[u8],
|
||||
y_plane: &[u8],
|
||||
u_plane: &[u8],
|
||||
v_plane: &[u8],
|
||||
width: u32,
|
||||
height: u32,
|
||||
timestamp: f64,
|
||||
) -> Result<(), String> {
|
||||
// Convert RGBA to YUV420p
|
||||
let (y_plane, u_plane, v_plane) = video_exporter::rgba_to_yuv420p(rgba_data, width, height);
|
||||
// YUV planes already converted by GPU (no CPU conversion needed)
|
||||
|
||||
// Create FFmpeg video frame
|
||||
let mut video_frame = ffmpeg_next::frame::Video::new(
|
||||
|
|
@ -1087,8 +1195,6 @@ impl ExportOrchestrator {
|
|||
// Encoder time base is 1/(framerate * 1000), so PTS = timestamp * (framerate * 1000)
|
||||
let encoder_tb = encoder.time_base();
|
||||
let pts = (timestamp * encoder_tb.1 as f64) as i64;
|
||||
println!("🎬 [ENCODE] Frame timestamp={:.3}s, encoder_tb={}/{}, calculated PTS={}",
|
||||
timestamp, encoder_tb.0, encoder_tb.1, pts);
|
||||
video_frame.set_pts(Some(pts));
|
||||
|
||||
// Send frame to encoder
|
||||
|
|
|
|||
|
|
@ -0,0 +1,191 @@
|
|||
//! Performance instrumentation for video export pipeline
|
||||
//!
|
||||
//! Tracks timing for each stage of the export process:
|
||||
//! - GPU rendering (render_frame_to_gpu_yuv)
|
||||
//! - Async readback (map_async completion)
|
||||
//! - YUV plane extraction
|
||||
//! - FFmpeg encoding
|
||||
//! - Polling frequency and efficiency
|
||||
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
/// Timing checkpoints recorded for a single exported frame.
///
/// `render_start` is stamped when the value is created by [`FrameMetrics::new`].
/// Every other checkpoint starts as `None` and is expected to be filled in by
/// whichever pipeline stage reaches it; a stage duration is only reported once
/// both of its checkpoints are present.
#[derive(Debug)]
pub struct FrameMetrics {
    /// Index of the frame these checkpoints belong to.
    pub frame_num: usize,
    /// When this record was created (taken as the start of rendering).
    pub render_start: Instant,
    pub render_end: Option<Instant>,
    pub submit_time: Option<Instant>,
    pub readback_complete: Option<Instant>,
    pub extraction_start: Option<Instant>,
    pub extraction_end: Option<Instant>,
    pub conversion_start: Option<Instant>,
    pub conversion_end: Option<Instant>,
    pub encode_start: Option<Instant>,
    pub encode_end: Option<Instant>,
}

impl FrameMetrics {
    /// Starts a record for `frame_num`, stamping `render_start` with the current time.
    pub fn new(frame_num: usize) -> Self {
        Self {
            frame_num,
            render_start: Instant::now(),
            render_end: None,
            submit_time: None,
            readback_complete: None,
            extraction_start: None,
            extraction_end: None,
            conversion_start: None,
            conversion_end: None,
            encode_start: None,
            encode_end: None,
        }
    }

    /// Elapsed time between two optional checkpoints.
    ///
    /// Returns `None` until both checkpoints are set. The checkpoints are public
    /// fields written independently, so nothing guarantees `end >= start`; the
    /// saturating subtraction clamps such a span to zero so that instrumentation
    /// can never panic an export.
    fn span(start: Option<Instant>, end: Option<Instant>) -> Option<Duration> {
        let (start, end) = start.zip(end)?;
        Some(end.saturating_duration_since(start))
    }

    /// Time from `render_start` to `render_end`.
    pub fn render_duration(&self) -> Option<Duration> {
        Self::span(Some(self.render_start), self.render_end)
    }

    /// Time from `submit_time` to `readback_complete`.
    pub fn readback_duration(&self) -> Option<Duration> {
        Self::span(self.submit_time, self.readback_complete)
    }

    /// Time from `extraction_start` to `extraction_end`.
    pub fn extraction_duration(&self) -> Option<Duration> {
        Self::span(self.extraction_start, self.extraction_end)
    }

    /// Time from `conversion_start` to `conversion_end`.
    pub fn conversion_duration(&self) -> Option<Duration> {
        Self::span(self.conversion_start, self.conversion_end)
    }

    /// Time from `encode_start` to `encode_end`.
    pub fn encode_duration(&self) -> Option<Duration> {
        Self::span(self.encode_start, self.encode_end)
    }

    /// Time from `render_start` to `encode_end`, i.e. the whole recorded lifetime of the frame.
    pub fn total_duration(&self) -> Option<Duration> {
        Self::span(Some(self.render_start), self.encode_end)
    }
}
|
||||
|
||||
/// Aggregate performance metrics for entire export
|
||||
pub struct ExportMetrics {
|
||||
pub frames: Vec<FrameMetrics>,
|
||||
export_start: Instant,
|
||||
pub poll_count: usize,
|
||||
pub completions_per_poll: Vec<usize>,
|
||||
}
|
||||
|
||||
impl ExportMetrics {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
frames: Vec::new(),
|
||||
export_start: Instant::now(),
|
||||
poll_count: 0,
|
||||
completions_per_poll: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Print comprehensive performance summary
|
||||
pub fn print_summary(&self) {
|
||||
println!("\n📊 [PERF] Export Performance Summary");
|
||||
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
|
||||
|
||||
// Calculate averages for each stage
|
||||
let mut render_times = Vec::new();
|
||||
let mut readback_times = Vec::new();
|
||||
let mut extraction_times = Vec::new();
|
||||
let mut conversion_times = Vec::new();
|
||||
let mut encode_times = Vec::new();
|
||||
let mut total_times = Vec::new();
|
||||
|
||||
for metrics in &self.frames {
|
||||
if let Some(d) = metrics.render_duration() {
|
||||
render_times.push(d);
|
||||
}
|
||||
if let Some(d) = metrics.readback_duration() {
|
||||
readback_times.push(d);
|
||||
}
|
||||
if let Some(d) = metrics.extraction_duration() {
|
||||
extraction_times.push(d);
|
||||
}
|
||||
if let Some(d) = metrics.conversion_duration() {
|
||||
conversion_times.push(d);
|
||||
}
|
||||
if let Some(d) = metrics.encode_duration() {
|
||||
encode_times.push(d);
|
||||
}
|
||||
if let Some(d) = metrics.total_duration() {
|
||||
total_times.push(d);
|
||||
}
|
||||
}
|
||||
|
||||
let avg = |times: &[Duration]| -> f64 {
|
||||
if times.is_empty() { return 0.0; }
|
||||
times.iter().sum::<Duration>().as_secs_f64() / times.len() as f64 * 1000.0
|
||||
};
|
||||
|
||||
println!("Render: {:.2}ms avg", avg(&render_times));
|
||||
println!("Readback: {:.2}ms avg", avg(&readback_times));
|
||||
println!("Extraction: {:.2}ms avg", avg(&extraction_times));
|
||||
println!("Conversion: {:.2}ms avg", avg(&conversion_times));
|
||||
println!("Encode: {:.2}ms avg", avg(&encode_times));
|
||||
println!("Total: {:.2}ms avg", avg(&total_times));
|
||||
|
||||
let total_export_time = Instant::now().duration_since(self.export_start).as_secs_f64();
|
||||
let fps = self.frames.len() as f64 / total_export_time;
|
||||
println!("\nOverall: {:.2} fps ({:.1}s for {} frames)",
|
||||
fps, total_export_time, self.frames.len());
|
||||
|
||||
if self.poll_count > 0 {
|
||||
let avg_completions = self.completions_per_poll.iter().sum::<usize>() as f64 / self.poll_count as f64;
|
||||
println!("Polls: {} ({:.2} completions/poll avg)",
|
||||
self.poll_count, avg_completions);
|
||||
}
|
||||
|
||||
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n");
|
||||
}
|
||||
|
||||
/// Print detailed per-frame breakdown for last N frames
|
||||
pub fn print_per_frame_details(&self, last_n: usize) {
|
||||
println!("\n📋 [PERF] Per-Frame Breakdown (last {} frames)", last_n);
|
||||
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
|
||||
println!("{:>5} | {:>8} | {:>8} | {:>8} | {:>8} | {:>8} | {:>8}",
|
||||
"Frame", "Render", "Readback", "Extract", "Convert", "Encode", "Total");
|
||||
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
|
||||
|
||||
let start = if self.frames.len() > last_n {
|
||||
self.frames.len() - last_n
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
for metrics in &self.frames[start..] {
|
||||
println!("{:5} | {:>7.2}ms | {:>7.2}ms | {:>7.2}ms | {:>7.2}ms | {:>7.2}ms | {:>7.2}ms",
|
||||
metrics.frame_num,
|
||||
metrics.render_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
|
||||
metrics.readback_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
|
||||
metrics.extraction_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
|
||||
metrics.conversion_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
|
||||
metrics.encode_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
|
||||
metrics.total_duration().map(|d| d.as_secs_f64() * 1000.0).unwrap_or(0.0),
|
||||
);
|
||||
}
|
||||
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n");
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,317 @@
|
|||
//! Async triple-buffered GPU readback pipeline for video export
|
||||
//!
|
||||
//! This module implements a pipelined export system that overlaps GPU rendering
|
||||
//! with CPU encoding to maximize throughput. It uses triple buffering to keep
|
||||
//! both GPU and CPU busy simultaneously:
|
||||
//!
|
||||
//! - Frame N: GPU rendering/conversion
|
||||
//! - Frame N-1: GPU→CPU async transfer
|
||||
//! - Frame N-2: CPU encoding
|
||||
//!
|
||||
//! Expected speedup: 5x over synchronous blocking approach
|
||||
|
||||
use std::sync::mpsc::{channel, Receiver, Sender};
|
||||
|
||||
/// Result from a completed async buffer mapping
///
/// Plain scalar data, so it is cheap to copy and can be compared directly.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ReadbackResult {
    /// Index of the pipeline buffer whose staging buffer finished mapping.
    pub buffer_id: usize,
    /// Frame number that was rendered into that buffer.
    pub frame_num: usize,
    /// Timestamp associated with the frame when the buffer was acquired.
    pub timestamp: f64,
}
|
||||
|
||||
/// Lifecycle stage of one pipeline buffer.
///
/// A buffer is expected to advance through the variants in declaration order
/// and then return to `Free`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum BufferState {
    /// Not in use; may be handed out for a new frame.
    Free,
    /// Handed out for rendering; GPU work for the frame has not been submitted yet.
    Rendering,
    /// GPU work submitted and mapping requested; waiting for the GPU→CPU transfer.
    ReadbackPending,
    /// Mapping finished; the CPU may read the staging buffer.
    Mapped,
    /// The CPU side has taken the data and is encoding it.
    Encoding,
}
|
||||
|
||||
/// A single buffer in the triple-buffering pipeline
|
||||
struct PipelineBuffer {
|
||||
id: usize,
|
||||
/// RGBA texture for GPU rendering output (Rgba8Unorm)
|
||||
rgba_texture: wgpu::Texture,
|
||||
rgba_texture_view: wgpu::TextureView,
|
||||
/// Staging buffer for GPU→CPU transfer (MAP_READ)
|
||||
staging_buffer: wgpu::Buffer,
|
||||
/// Current state in the pipeline
|
||||
state: BufferState,
|
||||
/// Frame metadata (set when rendering starts)
|
||||
frame_num: Option<usize>,
|
||||
timestamp: Option<f64>,
|
||||
}
|
||||
|
||||
/// Handle to an acquired buffer for rendering
|
||||
pub struct AcquiredBuffer {
|
||||
pub id: usize,
|
||||
pub rgba_texture_view: wgpu::TextureView,
|
||||
}
|
||||
|
||||
/// Triple-buffered async readback pipeline
|
||||
///
|
||||
/// Manages 3 buffers cycling through the pipeline:
|
||||
/// Free → Rendering → ReadbackPending → Mapped → Encoding → Free
|
||||
pub struct ReadbackPipeline {
|
||||
buffers: Vec<PipelineBuffer>,
|
||||
/// Channel for async map_async callbacks
|
||||
readback_rx: Receiver<ReadbackResult>,
|
||||
readback_tx: Sender<ReadbackResult>,
|
||||
/// wgpu device and queue references (needed for polling and buffer operations)
|
||||
device: wgpu::Device,
|
||||
queue: wgpu::Queue,
|
||||
/// Buffer dimensions
|
||||
width: u32,
|
||||
height: u32,
|
||||
}
|
||||
|
||||
impl ReadbackPipeline {
|
||||
/// Create a new triple-buffered readback pipeline
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `device` - GPU device (will be cloned for async operations)
|
||||
/// * `queue` - GPU queue (will be cloned for async operations)
|
||||
/// * `width` - Frame width in pixels
|
||||
/// * `height` - Frame height in pixels
|
||||
pub fn new(device: &wgpu::Device, queue: &wgpu::Queue, width: u32, height: u32) -> Self {
|
||||
let (readback_tx, readback_rx) = channel();
|
||||
|
||||
// Create 3 buffers for triple buffering
|
||||
let mut buffers = Vec::new();
|
||||
for id in 0..3 {
|
||||
// RGBA texture (Rgba8Unorm)
|
||||
let rgba_texture = device.create_texture(&wgpu::TextureDescriptor {
|
||||
label: Some(&format!("readback_rgba_texture_{}", id)),
|
||||
size: wgpu::Extent3d {
|
||||
width,
|
||||
height,
|
||||
depth_or_array_layers: 1,
|
||||
},
|
||||
mip_level_count: 1,
|
||||
sample_count: 1,
|
||||
dimension: wgpu::TextureDimension::D2,
|
||||
format: wgpu::TextureFormat::Rgba8Unorm,
|
||||
usage: wgpu::TextureUsages::RENDER_ATTACHMENT | wgpu::TextureUsages::COPY_SRC,
|
||||
view_formats: &[],
|
||||
});
|
||||
|
||||
let rgba_texture_view = rgba_texture.create_view(&wgpu::TextureViewDescriptor::default());
|
||||
|
||||
// Staging buffer for GPU→CPU readback
|
||||
let rgba_buffer_size = (width * height * 4) as u64; // Rgba8Unorm = 4 bytes/pixel
|
||||
let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
|
||||
label: Some(&format!("readback_staging_buffer_{}", id)),
|
||||
size: rgba_buffer_size,
|
||||
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
|
||||
mapped_at_creation: false,
|
||||
});
|
||||
|
||||
buffers.push(PipelineBuffer {
|
||||
id,
|
||||
rgba_texture,
|
||||
rgba_texture_view,
|
||||
staging_buffer,
|
||||
state: BufferState::Free,
|
||||
frame_num: None,
|
||||
timestamp: None,
|
||||
});
|
||||
}
|
||||
|
||||
Self {
|
||||
buffers,
|
||||
readback_rx,
|
||||
readback_tx,
|
||||
device: device.clone(),
|
||||
queue: queue.clone(),
|
||||
width,
|
||||
height,
|
||||
}
|
||||
}
|
||||
|
||||
/// Acquire a free buffer for rendering (non-blocking)
|
||||
///
|
||||
/// Returns None if all buffers are in use (caller should poll and retry)
|
||||
pub fn acquire(&mut self, frame_num: usize, timestamp: f64) -> Option<AcquiredBuffer> {
|
||||
// Find first Free buffer
|
||||
for buffer in &mut self.buffers {
|
||||
if buffer.state == BufferState::Free {
|
||||
buffer.state = BufferState::Rendering;
|
||||
buffer.frame_num = Some(frame_num);
|
||||
buffer.timestamp = Some(timestamp);
|
||||
|
||||
return Some(AcquiredBuffer {
|
||||
id: buffer.id,
|
||||
rgba_texture_view: buffer.rgba_texture_view.clone(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
None // All buffers busy
|
||||
}
|
||||
|
||||
/// Submit GPU commands and initiate async readback
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `buffer_id` - ID of the buffer to submit (from AcquiredBuffer)
|
||||
/// * `encoder` - Command encoder with rendering commands
|
||||
pub fn submit_and_readback(&mut self, buffer_id: usize, mut encoder: wgpu::CommandEncoder) {
|
||||
let buffer = &mut self.buffers[buffer_id];
|
||||
assert_eq!(buffer.state, BufferState::Rendering, "Buffer not in Rendering state");
|
||||
|
||||
// Copy RGBA texture to staging buffer
|
||||
encoder.copy_texture_to_buffer(
|
||||
wgpu::TexelCopyTextureInfo {
|
||||
texture: &buffer.rgba_texture,
|
||||
mip_level: 0,
|
||||
origin: wgpu::Origin3d::ZERO,
|
||||
aspect: wgpu::TextureAspect::All,
|
||||
},
|
||||
wgpu::TexelCopyBufferInfo {
|
||||
buffer: &buffer.staging_buffer,
|
||||
layout: wgpu::TexelCopyBufferLayout {
|
||||
offset: 0,
|
||||
bytes_per_row: Some(self.width * 4), // Rgba8Unorm
|
||||
rows_per_image: Some(self.height),
|
||||
},
|
||||
},
|
||||
wgpu::Extent3d {
|
||||
width: self.width,
|
||||
height: self.height,
|
||||
depth_or_array_layers: 1,
|
||||
},
|
||||
);
|
||||
|
||||
// Submit GPU commands (non-blocking)
|
||||
self.queue.submit(Some(encoder.finish()));
|
||||
|
||||
// Initiate async buffer mapping
|
||||
let frame_num = buffer.frame_num.unwrap();
|
||||
let timestamp = buffer.timestamp.unwrap();
|
||||
let tx = self.readback_tx.clone();
|
||||
|
||||
buffer.staging_buffer.slice(..).map_async(wgpu::MapMode::Read, move |result| {
|
||||
if result.is_ok() {
|
||||
let _ = tx.send(ReadbackResult {
|
||||
buffer_id,
|
||||
frame_num,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
buffer.state = BufferState::ReadbackPending;
|
||||
}
|
||||
|
||||
/// Poll for completed readbacks (non-blocking)
|
||||
///
|
||||
/// Returns list of buffers that are now ready for CPU encoding.
|
||||
/// Call this frequently to process completed transfers.
|
||||
pub fn poll_nonblocking(&mut self) -> Vec<ReadbackResult> {
|
||||
// Poll GPU without blocking
|
||||
self.device.poll(wgpu::PollType::Poll);
|
||||
|
||||
// Collect all completed readbacks
|
||||
let mut results = Vec::new();
|
||||
while let Ok(result) = self.readback_rx.try_recv() {
|
||||
// Update buffer state to Mapped
|
||||
self.buffers[result.buffer_id].state = BufferState::Mapped;
|
||||
results.push(result);
|
||||
}
|
||||
|
||||
results
|
||||
}
|
||||
|
||||
/// Extract RGBA data from mapped buffer (for CPU YUV conversion)
|
||||
///
|
||||
/// Buffer must be in Mapped state (after poll_nonblocking returned it).
|
||||
/// This immediately copies the RGBA data, allowing the buffer to be released.
|
||||
pub fn extract_rgba_data(&mut self, buffer_id: usize) -> Vec<u8> {
|
||||
let buffer = &mut self.buffers[buffer_id];
|
||||
assert_eq!(buffer.state, BufferState::Mapped, "Buffer not in Mapped state");
|
||||
|
||||
buffer.state = BufferState::Encoding;
|
||||
|
||||
// Map the buffer and copy RGBA data
|
||||
let slice = buffer.staging_buffer.slice(..);
|
||||
let data = slice.get_mapped_range();
|
||||
|
||||
// Simple copy - RGBA data goes to CPU for conversion
|
||||
data.to_vec()
|
||||
}
|
||||
|
||||
/// Release buffer after encoding completes, returning it to the free pool
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `buffer_id` - ID of buffer to release
|
||||
pub fn release(&mut self, buffer_id: usize) {
|
||||
let buffer = &mut self.buffers[buffer_id];
|
||||
assert_eq!(buffer.state, BufferState::Encoding, "Buffer not in Encoding state");
|
||||
|
||||
// Unmap buffer
|
||||
buffer.staging_buffer.unmap();
|
||||
|
||||
// Clear metadata
|
||||
buffer.frame_num = None;
|
||||
buffer.timestamp = None;
|
||||
|
||||
// Return to free pool
|
||||
buffer.state = BufferState::Free;
|
||||
}
|
||||
|
||||
/// Flush pipeline and wait for all pending operations
|
||||
///
|
||||
/// Call this at the end of export to ensure all frames are processed
|
||||
pub fn flush(&mut self) -> Vec<ReadbackResult> {
|
||||
let mut all_results = Vec::new();
|
||||
|
||||
// Keep polling until all buffers are Free
|
||||
loop {
|
||||
// Poll for new completions
|
||||
self.device.poll(wgpu::PollType::Poll);
|
||||
|
||||
while let Ok(result) = self.readback_rx.try_recv() {
|
||||
self.buffers[result.buffer_id].state = BufferState::Mapped;
|
||||
all_results.push(result);
|
||||
}
|
||||
|
||||
// Check if all buffers are Free (or can be made Free)
|
||||
let mut all_free = true;
|
||||
for buffer in &self.buffers {
|
||||
match buffer.state {
|
||||
BufferState::Free => {},
|
||||
BufferState::Rendering | BufferState::ReadbackPending => {
|
||||
all_free = false;
|
||||
break;
|
||||
},
|
||||
BufferState::Mapped | BufferState::Encoding => {
|
||||
// These should be handled by the caller, shouldn't happen during flush
|
||||
panic!("Buffer in {} state during flush - caller should encode and release",
|
||||
if buffer.state == BufferState::Mapped { "Mapped" } else { "Encoding" });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if all_free {
|
||||
break;
|
||||
}
|
||||
|
||||
// Small sleep to avoid busy-waiting
|
||||
std::thread::sleep(std::time::Duration::from_millis(1));
|
||||
}
|
||||
|
||||
all_results
|
||||
}
|
||||
|
||||
/// Get buffer count currently in flight (for monitoring)
|
||||
pub fn buffers_in_flight(&self) -> usize {
|
||||
self.buffers.iter().filter(|b| b.state != BufferState::Free).count()
|
||||
}
|
||||
}
|
||||
|
|
@ -12,7 +12,7 @@ use lightningbeam_core::renderer::{ImageCache, render_document_for_compositing,
|
|||
use lightningbeam_core::video::VideoManager;
|
||||
use lightningbeam_core::gpu::{
|
||||
BufferPool, BufferSpec, BufferFormat, Compositor, CompositorLayer,
|
||||
SrgbToLinearConverter, EffectProcessor, HDR_FORMAT,
|
||||
SrgbToLinearConverter, EffectProcessor, YuvConverter, HDR_FORMAT,
|
||||
};
|
||||
|
||||
/// Reusable frame buffers to avoid allocations
|
||||
|
|
@ -56,10 +56,22 @@ pub struct ExportGpuResources {
|
|||
pub srgb_to_linear: SrgbToLinearConverter,
|
||||
/// Effect processor for shader effects
|
||||
pub effect_processor: EffectProcessor,
|
||||
/// GPU-accelerated RGBA to YUV420p converter
|
||||
pub yuv_converter: YuvConverter,
|
||||
/// HDR accumulator texture for compositing
|
||||
pub hdr_texture: wgpu::Texture,
|
||||
/// View for HDR texture
|
||||
pub hdr_texture_view: wgpu::TextureView,
|
||||
/// Persistent RGBA output texture (sRGB, reused for all frames)
|
||||
pub output_texture: wgpu::Texture,
|
||||
/// View for persistent output texture
|
||||
pub output_texture_view: wgpu::TextureView,
|
||||
/// Persistent YUV texture for GPU conversion (R8Unorm, height*1.5, reused for all frames)
|
||||
pub yuv_texture: wgpu::Texture,
|
||||
/// View for persistent YUV texture
|
||||
pub yuv_texture_view: wgpu::TextureView,
|
||||
/// Persistent staging buffer for GPU→CPU readback (reused for all frames)
|
||||
pub staging_buffer: wgpu::Buffer,
|
||||
/// Linear to sRGB blit pipeline for final output
|
||||
pub linear_to_srgb_pipeline: wgpu::RenderPipeline,
|
||||
/// Bind group layout for linear to sRGB blit
|
||||
|
|
@ -75,6 +87,7 @@ impl ExportGpuResources {
|
|||
let compositor = Compositor::new(device, HDR_FORMAT);
|
||||
let srgb_to_linear = SrgbToLinearConverter::new(device);
|
||||
let effect_processor = EffectProcessor::new(device, HDR_FORMAT);
|
||||
let yuv_converter = YuvConverter::new(device);
|
||||
|
||||
// Create HDR accumulator texture
|
||||
let hdr_texture = device.create_texture(&wgpu::TextureDescriptor {
|
||||
|
|
@ -95,6 +108,53 @@ impl ExportGpuResources {
|
|||
});
|
||||
let hdr_texture_view = hdr_texture.create_view(&wgpu::TextureViewDescriptor::default());
|
||||
|
||||
// Create persistent RGBA output texture (sRGB, reused for all frames)
|
||||
let output_texture = device.create_texture(&wgpu::TextureDescriptor {
|
||||
label: Some("export_output_texture"),
|
||||
size: wgpu::Extent3d {
|
||||
width,
|
||||
height,
|
||||
depth_or_array_layers: 1,
|
||||
},
|
||||
mip_level_count: 1,
|
||||
sample_count: 1,
|
||||
dimension: wgpu::TextureDimension::D2,
|
||||
format: wgpu::TextureFormat::Rgba8Unorm,
|
||||
usage: wgpu::TextureUsages::RENDER_ATTACHMENT
|
||||
| wgpu::TextureUsages::TEXTURE_BINDING
|
||||
| wgpu::TextureUsages::COPY_SRC,
|
||||
view_formats: &[],
|
||||
});
|
||||
let output_texture_view = output_texture.create_view(&wgpu::TextureViewDescriptor::default());
|
||||
|
||||
// Create persistent YUV texture (Rgba8Unorm, height*1.5 for packed Y+U+V planes)
|
||||
// Note: Using Rgba8Unorm instead of R8Unorm because R8Unorm doesn't support STORAGE_BINDING
|
||||
let yuv_height = height + height / 2; // Y plane + U plane + V plane
|
||||
let yuv_texture = device.create_texture(&wgpu::TextureDescriptor {
|
||||
label: Some("export_yuv_texture"),
|
||||
size: wgpu::Extent3d {
|
||||
width,
|
||||
height: yuv_height,
|
||||
depth_or_array_layers: 1,
|
||||
},
|
||||
mip_level_count: 1,
|
||||
sample_count: 1,
|
||||
dimension: wgpu::TextureDimension::D2,
|
||||
format: wgpu::TextureFormat::Rgba8Unorm,
|
||||
usage: wgpu::TextureUsages::STORAGE_BINDING | wgpu::TextureUsages::COPY_SRC,
|
||||
view_formats: &[],
|
||||
});
|
||||
let yuv_texture_view = yuv_texture.create_view(&wgpu::TextureViewDescriptor::default());
|
||||
|
||||
// Create persistent staging buffer for GPU→CPU readback
|
||||
let yuv_buffer_size = (width * yuv_height * 4) as u64; // Rgba8Unorm = 4 bytes per pixel
|
||||
let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
|
||||
label: Some("export_staging_buffer"),
|
||||
size: yuv_buffer_size,
|
||||
usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
|
||||
mapped_at_creation: false,
|
||||
});
|
||||
|
||||
// Create linear to sRGB blit pipeline
|
||||
let linear_to_srgb_bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
|
||||
label: Some("linear_to_srgb_bind_group_layout"),
|
||||
|
|
@ -179,8 +239,14 @@ impl ExportGpuResources {
|
|||
compositor,
|
||||
srgb_to_linear,
|
||||
effect_processor,
|
||||
yuv_converter,
|
||||
hdr_texture,
|
||||
hdr_texture_view,
|
||||
output_texture,
|
||||
output_texture_view,
|
||||
yuv_texture,
|
||||
yuv_texture_view,
|
||||
staging_buffer,
|
||||
linear_to_srgb_pipeline,
|
||||
linear_to_srgb_bind_group_layout,
|
||||
linear_to_srgb_sampler,
|
||||
|
|
@ -476,20 +542,11 @@ pub fn receive_and_write_packets(
|
|||
let encoder_tb = encoder.time_base();
|
||||
let stream_tb = output.stream(0).ok_or("No output stream found")?.time_base();
|
||||
|
||||
println!("🎬 [PACKET] Encoder TB: {}/{}, Stream TB: {}/{}",
|
||||
encoder_tb.0, encoder_tb.1, stream_tb.0, stream_tb.1);
|
||||
|
||||
while encoder.receive_packet(&mut encoded).is_ok() {
|
||||
println!("🎬 [PACKET] Before rescale - PTS: {:?}, DTS: {:?}, Duration: {:?}",
|
||||
encoded.pts(), encoded.dts(), encoded.duration());
|
||||
|
||||
encoded.set_stream(0);
|
||||
// Rescale timestamps from encoder time base to stream time base
|
||||
encoded.rescale_ts(encoder_tb, stream_tb);
|
||||
|
||||
println!("🎬 [PACKET] After rescale - PTS: {:?}, DTS: {:?}, Duration: {:?}",
|
||||
encoded.pts(), encoded.dts(), encoded.duration());
|
||||
|
||||
encoded
|
||||
.write_interleaved(output)
|
||||
.map_err(|e| format!("Failed to write packet: {}", e))?;
|
||||
|
|
@ -660,10 +717,9 @@ pub fn render_frame_to_rgba(
|
|||
/// * `image_cache` - Image cache for rendering
|
||||
/// * `video_manager` - Video manager for video clips
|
||||
/// * `gpu_resources` - HDR GPU resources for compositing
|
||||
/// * `rgba_buffer` - Output buffer for RGBA pixels (must be width * height * 4 bytes)
|
||||
///
|
||||
/// # Returns
|
||||
/// Ok(()) on success, Err with message on failure
|
||||
/// Ok((y_plane, u_plane, v_plane)) with YUV420p planes on success, Err with message on failure
|
||||
pub fn render_frame_to_rgba_hdr(
|
||||
document: &mut Document,
|
||||
timestamp: f64,
|
||||
|
|
@ -675,8 +731,7 @@ pub fn render_frame_to_rgba_hdr(
|
|||
image_cache: &mut ImageCache,
|
||||
video_manager: &Arc<std::sync::Mutex<VideoManager>>,
|
||||
gpu_resources: &mut ExportGpuResources,
|
||||
rgba_buffer: &mut [u8],
|
||||
) -> Result<(), String> {
|
||||
) -> Result<(Vec<u8>, Vec<u8>, Vec<u8>), String> {
|
||||
use vello::kurbo::Affine;
|
||||
|
||||
// Set document time to the frame timestamp
|
||||
|
|
@ -879,22 +934,8 @@ pub fn render_frame_to_rgba_hdr(
|
|||
// Advance frame counter for buffer cleanup
|
||||
gpu_resources.buffer_pool.next_frame();
|
||||
|
||||
// Create output texture for final sRGB output
|
||||
let output_texture = device.create_texture(&wgpu::TextureDescriptor {
|
||||
label: Some("export_output_texture"),
|
||||
size: wgpu::Extent3d {
|
||||
width,
|
||||
height,
|
||||
depth_or_array_layers: 1,
|
||||
},
|
||||
mip_level_count: 1,
|
||||
sample_count: 1,
|
||||
dimension: wgpu::TextureDimension::D2,
|
||||
format: wgpu::TextureFormat::Rgba8Unorm,
|
||||
usage: wgpu::TextureUsages::RENDER_ATTACHMENT | wgpu::TextureUsages::COPY_SRC,
|
||||
view_formats: &[],
|
||||
});
|
||||
let output_view = output_texture.create_view(&wgpu::TextureViewDescriptor::default());
|
||||
// Use persistent output texture (already created in ExportGpuResources)
|
||||
let output_view = &gpu_resources.output_texture_view;
|
||||
|
||||
// Convert HDR to sRGB for output
|
||||
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
|
||||
|
|
@ -940,52 +981,48 @@ pub fn render_frame_to_rgba_hdr(
|
|||
|
||||
queue.submit(Some(encoder.finish()));
|
||||
|
||||
// GPU readback: Create staging buffer with proper alignment
|
||||
let bytes_per_pixel = 4u32; // RGBA8
|
||||
let bytes_per_row_alignment = 256u32;
|
||||
let unpadded_bytes_per_row = width * bytes_per_pixel;
|
||||
let bytes_per_row = ((unpadded_bytes_per_row + bytes_per_row_alignment - 1)
|
||||
/ bytes_per_row_alignment) * bytes_per_row_alignment;
|
||||
let buffer_size = (bytes_per_row * height) as u64;
|
||||
|
||||
let staging_buffer = device.create_buffer(&wgpu::BufferDescriptor {
|
||||
label: Some("export_staging_buffer"),
|
||||
size: buffer_size,
|
||||
usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
|
||||
mapped_at_creation: false,
|
||||
// GPU YUV conversion: Convert RGBA output to YUV420p
|
||||
let mut yuv_encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
|
||||
label: Some("export_yuv_conversion_encoder"),
|
||||
});
|
||||
|
||||
// Copy texture to staging buffer
|
||||
let mut copy_encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
|
||||
label: Some("export_copy_encoder"),
|
||||
});
|
||||
gpu_resources.yuv_converter.convert(
|
||||
device,
|
||||
&mut yuv_encoder,
|
||||
output_view,
|
||||
&gpu_resources.yuv_texture_view,
|
||||
width,
|
||||
height,
|
||||
);
|
||||
|
||||
copy_encoder.copy_texture_to_buffer(
|
||||
// Copy YUV texture to persistent staging buffer
|
||||
let yuv_height = height + height / 2; // Y plane + U plane + V plane
|
||||
yuv_encoder.copy_texture_to_buffer(
|
||||
wgpu::TexelCopyTextureInfo {
|
||||
texture: &output_texture,
|
||||
texture: &gpu_resources.yuv_texture,
|
||||
mip_level: 0,
|
||||
origin: wgpu::Origin3d::ZERO,
|
||||
aspect: wgpu::TextureAspect::All,
|
||||
},
|
||||
wgpu::TexelCopyBufferInfo {
|
||||
buffer: &staging_buffer,
|
||||
buffer: &gpu_resources.staging_buffer,
|
||||
layout: wgpu::TexelCopyBufferLayout {
|
||||
offset: 0,
|
||||
bytes_per_row: Some(bytes_per_row),
|
||||
rows_per_image: Some(height),
|
||||
bytes_per_row: Some(width * 4), // Rgba8Unorm = 4 bytes per pixel
|
||||
rows_per_image: Some(yuv_height),
|
||||
},
|
||||
},
|
||||
wgpu::Extent3d {
|
||||
width,
|
||||
height,
|
||||
height: yuv_height,
|
||||
depth_or_array_layers: 1,
|
||||
},
|
||||
);
|
||||
|
||||
queue.submit(Some(copy_encoder.finish()));
|
||||
queue.submit(Some(yuv_encoder.finish()));
|
||||
|
||||
// Map buffer and read pixels (synchronous)
|
||||
let buffer_slice = staging_buffer.slice(..);
|
||||
// Map buffer and read YUV pixels (synchronous)
|
||||
let buffer_slice = gpu_resources.staging_buffer.slice(..);
|
||||
let (sender, receiver) = std::sync::mpsc::channel();
|
||||
buffer_slice.map_async(wgpu::MapMode::Read, move |result| {
|
||||
sender.send(result).ok();
|
||||
|
|
@ -998,20 +1035,319 @@ pub fn render_frame_to_rgba_hdr(
|
|||
.map_err(|_| "Failed to receive buffer mapping result")?
|
||||
.map_err(|e| format!("Failed to map buffer: {:?}", e))?;
|
||||
|
||||
// Copy data from mapped buffer to output, removing padding
|
||||
// Extract Y, U, V planes from packed YUV buffer
|
||||
let data = buffer_slice.get_mapped_range();
|
||||
for y in 0..height as usize {
|
||||
let src_offset = y * bytes_per_row as usize;
|
||||
let dst_offset = y * unpadded_bytes_per_row as usize;
|
||||
let row_bytes = unpadded_bytes_per_row as usize;
|
||||
rgba_buffer[dst_offset..dst_offset + row_bytes]
|
||||
.copy_from_slice(&data[src_offset..src_offset + row_bytes]);
|
||||
let width_usize = width as usize;
|
||||
let height_usize = height as usize;
|
||||
|
||||
// Y plane: rows 0 to height-1 (extract R channel from Rgba8Unorm)
|
||||
let y_plane_size = width_usize * height_usize;
|
||||
let mut y_plane = vec![0u8; y_plane_size];
|
||||
for y in 0..height_usize {
|
||||
let src_row_offset = y * width_usize * 4; // 4 bytes per pixel (Rgba8Unorm)
|
||||
let dst_row_offset = y * width_usize;
|
||||
for x in 0..width_usize {
|
||||
y_plane[dst_row_offset + x] = data[src_row_offset + x * 4]; // Extract R channel
|
||||
}
|
||||
}
|
||||
|
||||
// U and V planes: rows height to height + height/2 - 1 (half resolution, side-by-side layout)
|
||||
// U plane is in left half (columns 0 to width/2-1), V plane is in right half (columns width/2 to width-1)
|
||||
let chroma_width = width_usize / 2;
|
||||
let chroma_height = height_usize / 2;
|
||||
let chroma_row_start = height_usize * width_usize * 4; // Start of chroma rows in bytes
|
||||
|
||||
let mut u_plane = vec![0u8; chroma_width * chroma_height];
|
||||
let mut v_plane = vec![0u8; chroma_width * chroma_height];
|
||||
|
||||
for y in 0..chroma_height {
|
||||
let row_offset = chroma_row_start + y * width_usize * 4; // Full width rows in chroma region
|
||||
|
||||
// Extract U plane (left half: columns 0 to chroma_width-1)
|
||||
let u_start = row_offset;
|
||||
let dst_offset = y * chroma_width;
|
||||
for x in 0..chroma_width {
|
||||
u_plane[dst_offset + x] = data[u_start + x * 4]; // Extract R channel
|
||||
}
|
||||
|
||||
// Extract V plane (right half: columns width/2 to width/2+chroma_width-1)
|
||||
let v_start = row_offset + chroma_width * 4;
|
||||
for x in 0..chroma_width {
|
||||
v_plane[dst_offset + x] = data[v_start + x * 4]; // Extract R channel
|
||||
}
|
||||
}
|
||||
|
||||
drop(data);
|
||||
staging_buffer.unmap();
|
||||
gpu_resources.staging_buffer.unmap();
|
||||
|
||||
Ok(())
|
||||
Ok((y_plane, u_plane, v_plane))
|
||||
}
|
||||
|
||||
/// Render frame to GPU RGBA texture (non-blocking, for async pipeline)
|
||||
///
|
||||
/// Similar to render_frame_to_rgba_hdr but renders to an external RGBA texture view
|
||||
/// (provided by ReadbackPipeline) and returns the command encoder WITHOUT blocking on readback.
|
||||
/// The caller (ReadbackPipeline) will submit the encoder and handle async readback.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `document` - Document to render
|
||||
/// * `timestamp` - Time in seconds to render at
|
||||
/// * `width` - Frame width in pixels
|
||||
/// * `height` - Frame height in pixels
|
||||
/// * `device` - wgpu device
|
||||
/// * `queue` - wgpu queue
|
||||
/// * `renderer` - Vello renderer
|
||||
/// * `image_cache` - Image cache for rendering
|
||||
/// * `video_manager` - Video manager for video clips
|
||||
/// * `gpu_resources` - HDR GPU resources for compositing
|
||||
/// * `rgba_texture_view` - External RGBA texture view (from ReadbackPipeline)
|
||||
///
|
||||
/// # Returns
|
||||
/// Command encoder ready for submission (caller submits via ReadbackPipeline)
|
||||
pub fn render_frame_to_gpu_rgba(
|
||||
document: &mut Document,
|
||||
timestamp: f64,
|
||||
width: u32,
|
||||
height: u32,
|
||||
device: &wgpu::Device,
|
||||
queue: &wgpu::Queue,
|
||||
renderer: &mut vello::Renderer,
|
||||
image_cache: &mut ImageCache,
|
||||
video_manager: &Arc<std::sync::Mutex<VideoManager>>,
|
||||
gpu_resources: &mut ExportGpuResources,
|
||||
rgba_texture_view: &wgpu::TextureView,
|
||||
) -> Result<wgpu::CommandEncoder, String> {
|
||||
use vello::kurbo::Affine;
|
||||
|
||||
// Set document time to the frame timestamp
|
||||
document.current_time = timestamp;
|
||||
|
||||
// Use identity transform for export (document coordinates = pixel coordinates)
|
||||
let base_transform = Affine::IDENTITY;
|
||||
|
||||
// Render document for compositing (returns per-layer scenes)
|
||||
let composite_result = render_document_for_compositing(
|
||||
document,
|
||||
base_transform,
|
||||
image_cache,
|
||||
video_manager,
|
||||
);
|
||||
|
||||
// Buffer specs for layer rendering
|
||||
let layer_spec = BufferSpec::new(width, height, BufferFormat::Rgba8Srgb);
|
||||
let hdr_spec = BufferSpec::new(width, height, BufferFormat::Rgba16Float);
|
||||
|
||||
// Render parameters for Vello (transparent background for layers)
|
||||
let layer_render_params = vello::RenderParams {
|
||||
base_color: vello::peniko::Color::TRANSPARENT,
|
||||
width,
|
||||
height,
|
||||
antialiasing_method: vello::AaConfig::Area,
|
||||
};
|
||||
|
||||
// Render background and composite it
|
||||
let bg_srgb_handle = gpu_resources.buffer_pool.acquire(device, layer_spec);
|
||||
let bg_hdr_handle = gpu_resources.buffer_pool.acquire(device, hdr_spec);
|
||||
|
||||
if let (Some(bg_srgb_view), Some(bg_hdr_view)) = (
|
||||
gpu_resources.buffer_pool.get_view(bg_srgb_handle),
|
||||
gpu_resources.buffer_pool.get_view(bg_hdr_handle),
|
||||
) {
|
||||
renderer.render_to_texture(device, queue, &composite_result.background, bg_srgb_view, &layer_render_params)
|
||||
.map_err(|e| format!("Failed to render background: {}", e))?;
|
||||
|
||||
let mut convert_encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
|
||||
label: Some("export_bg_srgb_to_linear_encoder"),
|
||||
});
|
||||
gpu_resources.srgb_to_linear.convert(device, &mut convert_encoder, bg_srgb_view, bg_hdr_view);
|
||||
queue.submit(Some(convert_encoder.finish()));
|
||||
|
||||
let bg_compositor_layer = CompositorLayer::normal(bg_hdr_handle, 1.0);
|
||||
let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
|
||||
label: Some("export_bg_composite_encoder"),
|
||||
});
|
||||
gpu_resources.compositor.composite(
|
||||
device,
|
||||
queue,
|
||||
&mut encoder,
|
||||
&[bg_compositor_layer],
|
||||
&gpu_resources.buffer_pool,
|
||||
&gpu_resources.hdr_texture_view,
|
||||
Some([0.0, 0.0, 0.0, 1.0]),
|
||||
);
|
||||
queue.submit(Some(encoder.finish()));
|
||||
}
|
||||
gpu_resources.buffer_pool.release(bg_srgb_handle);
|
||||
gpu_resources.buffer_pool.release(bg_hdr_handle);
|
||||
|
||||
// Render and composite each layer incrementally
|
||||
for rendered_layer in &composite_result.layers {
|
||||
if !rendered_layer.has_content {
|
||||
continue;
|
||||
}
|
||||
|
||||
match &rendered_layer.layer_type {
|
||||
RenderedLayerType::Content => {
|
||||
let srgb_handle = gpu_resources.buffer_pool.acquire(device, layer_spec);
|
||||
let hdr_layer_handle = gpu_resources.buffer_pool.acquire(device, hdr_spec);
|
||||
|
||||
if let (Some(srgb_view), Some(hdr_layer_view)) = (
|
||||
gpu_resources.buffer_pool.get_view(srgb_handle),
|
||||
gpu_resources.buffer_pool.get_view(hdr_layer_handle),
|
||||
) {
|
||||
renderer.render_to_texture(device, queue, &rendered_layer.scene, srgb_view, &layer_render_params)
|
||||
.map_err(|e| format!("Failed to render layer: {}", e))?;
|
||||
|
||||
let mut convert_encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
|
||||
label: Some("export_layer_srgb_to_linear_encoder"),
|
||||
});
|
||||
gpu_resources.srgb_to_linear.convert(device, &mut convert_encoder, srgb_view, hdr_layer_view);
|
||||
queue.submit(Some(convert_encoder.finish()));
|
||||
|
||||
let compositor_layer = CompositorLayer::normal(hdr_layer_handle, rendered_layer.opacity);
|
||||
let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
|
||||
label: Some("export_layer_composite_encoder"),
|
||||
});
|
||||
gpu_resources.compositor.composite(
|
||||
device,
|
||||
queue,
|
||||
&mut encoder,
|
||||
&[compositor_layer],
|
||||
&gpu_resources.buffer_pool,
|
||||
&gpu_resources.hdr_texture_view,
|
||||
None,
|
||||
);
|
||||
queue.submit(Some(encoder.finish()));
|
||||
}
|
||||
gpu_resources.buffer_pool.release(srgb_handle);
|
||||
gpu_resources.buffer_pool.release(hdr_layer_handle);
|
||||
}
|
||||
RenderedLayerType::Effect { effect_instances } => {
|
||||
// Effect layer - apply effects to the current HDR accumulator
|
||||
let current_time = document.current_time;
|
||||
|
||||
for effect_instance in effect_instances {
|
||||
// Get effect definition from document
|
||||
let Some(effect_def) = document.get_effect_definition(&effect_instance.clip_id) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
// Compile effect if needed
|
||||
if !gpu_resources.effect_processor.is_compiled(&effect_def.id) {
|
||||
let success = gpu_resources.effect_processor.compile_effect(device, effect_def);
|
||||
if !success {
|
||||
eprintln!("Failed to compile effect: {}", effect_def.name);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Create EffectInstance from ClipInstance for the processor
|
||||
let effect_inst = lightningbeam_core::effect::EffectInstance::new(
|
||||
effect_def,
|
||||
effect_instance.timeline_start,
|
||||
effect_instance.timeline_start + effect_instance.effective_duration(lightningbeam_core::effect::EFFECT_DURATION),
|
||||
);
|
||||
|
||||
// Acquire temp buffer for effect output (HDR format)
|
||||
let effect_output_handle = gpu_resources.buffer_pool.acquire(device, hdr_spec);
|
||||
|
||||
if let Some(effect_output_view) = gpu_resources.buffer_pool.get_view(effect_output_handle) {
|
||||
let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
|
||||
label: Some("export_effect_encoder"),
|
||||
});
|
||||
|
||||
// Apply effect: HDR accumulator → effect output buffer
|
||||
let applied = gpu_resources.effect_processor.apply_effect(
|
||||
device,
|
||||
queue,
|
||||
&mut encoder,
|
||||
effect_def,
|
||||
&effect_inst,
|
||||
&gpu_resources.hdr_texture_view,
|
||||
effect_output_view,
|
||||
width,
|
||||
height,
|
||||
current_time,
|
||||
);
|
||||
|
||||
if applied {
|
||||
// Copy effect output back to HDR accumulator
|
||||
encoder.copy_texture_to_texture(
|
||||
wgpu::TexelCopyTextureInfo {
|
||||
texture: gpu_resources.buffer_pool.get_texture(effect_output_handle).unwrap(),
|
||||
mip_level: 0,
|
||||
origin: wgpu::Origin3d::ZERO,
|
||||
aspect: wgpu::TextureAspect::All,
|
||||
},
|
||||
wgpu::TexelCopyTextureInfo {
|
||||
texture: &gpu_resources.hdr_texture,
|
||||
mip_level: 0,
|
||||
origin: wgpu::Origin3d::ZERO,
|
||||
aspect: wgpu::TextureAspect::All,
|
||||
},
|
||||
wgpu::Extent3d {
|
||||
width,
|
||||
height,
|
||||
depth_or_array_layers: 1,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
queue.submit(Some(encoder.finish()));
|
||||
}
|
||||
|
||||
gpu_resources.buffer_pool.release(effect_output_handle);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Convert HDR to sRGB (linear → sRGB), render directly to external RGBA texture
|
||||
let output_view = rgba_texture_view;
|
||||
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
|
||||
label: Some("export_linear_to_srgb_bind_group"),
|
||||
layout: &gpu_resources.linear_to_srgb_bind_group_layout,
|
||||
entries: &[
|
||||
wgpu::BindGroupEntry {
|
||||
binding: 0,
|
||||
resource: wgpu::BindingResource::TextureView(&gpu_resources.hdr_texture_view),
|
||||
},
|
||||
wgpu::BindGroupEntry {
|
||||
binding: 1,
|
||||
resource: wgpu::BindingResource::Sampler(&gpu_resources.linear_to_srgb_sampler),
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
|
||||
label: Some("export_linear_to_srgb_encoder"),
|
||||
});
|
||||
|
||||
{
|
||||
let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
|
||||
label: Some("export_linear_to_srgb_pass"),
|
||||
color_attachments: &[Some(wgpu::RenderPassColorAttachment {
|
||||
view: &output_view,
|
||||
resolve_target: None,
|
||||
ops: wgpu::Operations {
|
||||
load: wgpu::LoadOp::Clear(wgpu::Color::BLACK),
|
||||
store: wgpu::StoreOp::Store,
|
||||
},
|
||||
depth_slice: None,
|
||||
})],
|
||||
depth_stencil_attachment: None,
|
||||
occlusion_query_set: None,
|
||||
timestamp_writes: None,
|
||||
});
|
||||
|
||||
render_pass.set_pipeline(&gpu_resources.linear_to_srgb_pipeline);
|
||||
render_pass.set_bind_group(0, &bind_group, &[]);
|
||||
render_pass.draw(0..4, 0..1);
|
||||
}
|
||||
|
||||
// Return encoder for caller to submit (ReadbackPipeline will handle submission and async readback)
|
||||
// Frame is already rendered to external RGBA texture, no GPU YUV conversion needed
|
||||
Ok(encoder)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
|
|||
|
|
@ -2936,7 +2936,6 @@ impl eframe::App for EditorApp {
|
|||
}
|
||||
}
|
||||
if let Some(progress) = orchestrator.poll_progress() {
|
||||
println!("📨 [MAIN] Received progress from orchestrator!");
|
||||
match progress {
|
||||
lightningbeam_core::export::ExportProgress::Started { total_frames } => {
|
||||
println!("Export started: {} frames", total_frames);
|
||||
|
|
|
|||
Loading…
Reference in New Issue