diff --git a/lightningbeam-ui/lightningbeam-core/src/brush_engine.rs b/lightningbeam-ui/lightningbeam-core/src/brush_engine.rs index ecdad50..eb2389e 100644 --- a/lightningbeam-ui/lightningbeam-core/src/brush_engine.rs +++ b/lightningbeam-ui/lightningbeam-core/src/brush_engine.rs @@ -38,6 +38,46 @@ use image::RgbaImage; use crate::raster_layer::{RasterBlendMode, StrokeRecord}; +/// A single brush dab ready for GPU dispatch. +/// +/// Padded to 64 bytes (4 × 16 bytes) for WGSL struct alignment in a storage buffer. +#[repr(C)] +#[derive(Clone, Copy, Debug, bytemuck::Pod, bytemuck::Zeroable)] +pub struct GpuDab { + /// Dab centre X (canvas pixels) + pub x: f32, + /// Dab centre Y (canvas pixels) + pub y: f32, + /// Dab radius (pixels) + pub radius: f32, + /// Hardness 0.0–1.0 (controls the falloff curve shape) + pub hardness: f32, + + /// Composite opacity for this dab + pub opacity: f32, + /// Brush color R (linear, premultiplied) + pub color_r: f32, + /// Brush color G + pub color_g: f32, + /// Brush color B + pub color_b: f32, + + /// Brush color A + pub color_a: f32, + /// Normalized stroke direction X (smudge only; 0 otherwise) + pub ndx: f32, + /// Normalized stroke direction Y (smudge only; 0 otherwise) + pub ndy: f32, + /// Distance to sample behind stroke for smudge (smudge only; 0 otherwise) + pub smudge_dist: f32, + + /// Blend mode: 0 = Normal, 1 = Erase, 2 = Smudge + pub blend_mode: u32, + pub _pad0: u32, + pub _pad1: u32, + pub _pad2: u32, +} + /// Transient brush stroke state (tracks partial dab position between segments) pub struct StrokeState { /// Distance along the path already "consumed" toward the next dab (in pixels) @@ -58,6 +98,114 @@ impl Default for StrokeState { pub struct BrushEngine; impl BrushEngine { + /// Compute the list of GPU dabs for a stroke segment. 
+    ///
+    /// Uses the same dab-spacing logic as [`apply_stroke_with_state`] but produces
+    /// [`GpuDab`] structs for upload to the GPU compute pipeline instead of painting
+    /// into a pixel buffer.
+    ///
+    /// Also returns the union bounding box of all dabs as `(x0, y0, x1, y1)` in
+    /// integer canvas pixel coordinates. The box is NOT clamped here — it may
+    /// extend past the canvas edges and go negative (the GPU caller clamps it);
+    /// `x0 == i32::MAX` when the returned Vec is empty.
+    pub fn compute_dabs(
+        stroke: &StrokeRecord,
+        state: &mut StrokeState,
+    ) -> (Vec<GpuDab>, (i32, i32, i32, i32)) {
+        let mut dabs: Vec<GpuDab> = Vec::new();
+        let mut bbox = (i32::MAX, i32::MAX, i32::MIN, i32::MIN);
+
+        let blend_mode_u = match stroke.blend_mode {
+            RasterBlendMode::Normal => 0u32,
+            RasterBlendMode::Erase => 1u32,
+            RasterBlendMode::Smudge => 2u32,
+        };
+
+        let mut push_dab = |dabs: &mut Vec<GpuDab>,
+                            bbox: &mut (i32, i32, i32, i32),
+                            x: f32, y: f32,
+                            radius: f32, opacity: f32,
+                            ndx: f32, ndy: f32, smudge_dist: f32| {
+            let r_fringe = radius + 1.0;
+            bbox.0 = bbox.0.min((x - r_fringe).floor() as i32);
+            bbox.1 = bbox.1.min((y - r_fringe).floor() as i32);
+            bbox.2 = bbox.2.max((x + r_fringe).ceil() as i32);
+            bbox.3 = bbox.3.max((y + r_fringe).ceil() as i32);
+            dabs.push(GpuDab {
+                x, y, radius,
+                hardness: stroke.brush_settings.hardness,
+                opacity,
+                color_r: stroke.color[0],
+                color_g: stroke.color[1],
+                color_b: stroke.color[2],
+                color_a: stroke.color[3],
+                ndx, ndy, smudge_dist,
+                blend_mode: blend_mode_u,
+                _pad0: 0, _pad1: 0, _pad2: 0,
+            });
+        };
+
+        if stroke.points.len() < 2 {
+            if let Some(pt) = stroke.points.first() {
+                let r = stroke.brush_settings.radius_at_pressure(pt.pressure);
+                let o = stroke.brush_settings.opacity_at_pressure(pt.pressure);
+                // Single-tap smudge has no direction — skip (same as CPU engine)
+                if !matches!(stroke.blend_mode, RasterBlendMode::Smudge) {
+                    push_dab(&mut dabs, &mut bbox, pt.x, pt.y, r, o, 0.0, 0.0, 0.0);
+                }
+                state.distance_since_last_dab = 0.0;
+            }
+            return (dabs, bbox);
+        }
+
+        for window in
stroke.points.windows(2) { + let p0 = &window[0]; + let p1 = &window[1]; + + let dx = p1.x - p0.x; + let dy = p1.y - p0.y; + let seg_len = (dx * dx + dy * dy).sqrt(); + if seg_len < 1e-4 { continue; } + + let mut t = 0.0f32; + while t < 1.0 { + let pressure = p0.pressure + t * (p1.pressure - p0.pressure); + let radius = stroke.brush_settings.radius_at_pressure(pressure); + let spacing = (radius * stroke.brush_settings.dabs_per_radius).max(0.5); + + let dist_to_next = spacing - state.distance_since_last_dab; + let seg_t_to_next = (dist_to_next / seg_len).max(0.0); + + if seg_t_to_next > 1.0 - t { + state.distance_since_last_dab += seg_len * (1.0 - t); + break; + } + + t += seg_t_to_next; + let x2 = p0.x + t * dx; + let y2 = p0.y + t * dy; + let pressure2 = p0.pressure + t * (p1.pressure - p0.pressure); + let radius2 = stroke.brush_settings.radius_at_pressure(pressure2); + let opacity2 = stroke.brush_settings.opacity_at_pressure(pressure2); + + if matches!(stroke.blend_mode, RasterBlendMode::Smudge) { + let ndx = dx / seg_len; + let ndy = dy / seg_len; + let smudge_dist = + (radius2 * stroke.brush_settings.dabs_per_radius).max(1.0); + push_dab(&mut dabs, &mut bbox, + x2, y2, radius2, opacity2, ndx, ndy, smudge_dist); + } else { + push_dab(&mut dabs, &mut bbox, + x2, y2, radius2, opacity2, 0.0, 0.0, 0.0); + } + + state.distance_since_last_dab = 0.0; + } + } + + (dabs, bbox) + } + /// Apply a complete stroke to a pixel buffer. 
/// /// A fresh [`StrokeState`] is created for each stroke (starts with full dab diff --git a/lightningbeam-ui/lightningbeam-core/src/renderer.rs b/lightningbeam-ui/lightningbeam-core/src/renderer.rs index 649535f..d622cc8 100644 --- a/lightningbeam-ui/lightningbeam-core/src/renderer.rs +++ b/lightningbeam-ui/lightningbeam-core/src/renderer.rs @@ -358,7 +358,10 @@ fn render_raster_layer_to_scene( format: ImageFormat::Rgba8, width: kf.width, height: kf.height, - alpha_type: ImageAlphaType::Alpha, + // raw_pixels stores sRGB-encoded premultiplied RGBA (channels are + // gamma-encoded, alpha is linear). Premultiplied tells Vello to + // decode the sRGB channels without premultiplying again. + alpha_type: ImageAlphaType::AlphaPremultiplied, }; let brush = ImageBrush::new(image_data); let canvas_rect = Rect::new(0.0, 0.0, kf.width as f64, kf.height as f64); diff --git a/lightningbeam-ui/lightningbeam-editor/src/gpu_brush.rs b/lightningbeam-ui/lightningbeam-editor/src/gpu_brush.rs new file mode 100644 index 0000000..b4092ef --- /dev/null +++ b/lightningbeam-ui/lightningbeam-editor/src/gpu_brush.rs @@ -0,0 +1,691 @@ +//! GPU-accelerated raster brush engine. +//! +//! [`GpuBrushEngine`] wraps the `brush_dab.wgsl` compute pipeline and manages +//! per-keyframe canvas texture pairs (ping-pong) used as the live canvas during +//! raster painting. +//! +//! ## Lifecycle +//! +//! 1. **Stroke start** — caller supplies the initial pixel data; the engine uploads +//! it to both canvas textures so either can serve as source/dest. +//! 2. **Each drag event** — [`GpuBrushEngine::render_dabs`] copies src→dst, +//! dispatches the compute shader, then swaps src/dst. +//! 3. **Stroke end** — [`GpuBrushEngine::readback_canvas`] copies the current +//! source texture into a staging buffer and returns the raw RGBA bytes +//! (blocking — uses `device.poll(Maintain::Wait)`). +//! 4. **Idle** — canvas textures are kept alive for the next stroke (no re-upload +//! 
needed if the layer has not changed).

+use std::collections::HashMap;
+use uuid::Uuid;
+use lightningbeam_core::brush_engine::GpuDab;
+
+// ---------------------------------------------------------------------------
+// Colour-space helpers
+// ---------------------------------------------------------------------------
+
+/// Decode one sRGB-encoded channel value in [0, 1] to its linear value in [0, 1].
+fn srgb_to_linear(c: f32) -> f32 {
+    if c <= 0.04045 {
+        c / 12.92
+    } else {
+        ((c + 0.055) / 1.055).powf(2.4)
+    }
+}
+
+/// Encode one linear-encoded channel byte to an sRGB-encoded byte.
+fn linear_to_srgb_byte(c: u8) -> u8 {
+    let f = c as f32 / 255.0;
+    let encoded = if f <= 0.0031308 {
+        f * 12.92
+    } else {
+        1.055 * f.powf(1.0 / 2.4) - 0.055
+    };
+    (encoded * 255.0 + 0.5) as u8
+}
+
+// ---------------------------------------------------------------------------
+// Per-keyframe canvas texture pair (ping-pong)
+// ---------------------------------------------------------------------------
+
+/// A pair of textures used for double-buffered canvas rendering.
+///
+/// `current` indexes the texture that holds the up-to-date canvas state.
+pub struct CanvasPair {
+    pub textures: [wgpu::Texture; 2],
+    pub views: [wgpu::TextureView; 2],
+    /// Index (0 or 1) of the texture that is the current "source" (authoritative).
+ pub current: usize, + pub width: u32, + pub height: u32, +} + +impl CanvasPair { + pub fn new(device: &wgpu::Device, width: u32, height: u32) -> Self { + let desc = wgpu::TextureDescriptor { + label: Some("raster_canvas"), + size: wgpu::Extent3d { width, height, depth_or_array_layers: 1 }, + mip_level_count: 1, + sample_count: 1, + dimension: wgpu::TextureDimension::D2, + format: wgpu::TextureFormat::Rgba8Unorm, + usage: wgpu::TextureUsages::TEXTURE_BINDING + | wgpu::TextureUsages::STORAGE_BINDING + | wgpu::TextureUsages::COPY_SRC + | wgpu::TextureUsages::COPY_DST, + view_formats: &[], + }; + let t0 = device.create_texture(&desc); + let t1 = device.create_texture(&desc); + let v0 = t0.create_view(&wgpu::TextureViewDescriptor::default()); + let v1 = t1.create_view(&wgpu::TextureViewDescriptor::default()); + Self { + textures: [t0, t1], + views: [v0, v1], + current: 0, + width, + height, + } + } + + /// Upload raw RGBA bytes to both textures (call once at stroke start). + /// + /// `pixels` is expected to be **sRGB-encoded premultiplied** (the format stored + /// in `raw_pixels` / PNG files). The values are decoded to linear premultiplied + /// before being written to the canvas, which operates entirely in linear space. + pub fn upload(&self, queue: &wgpu::Queue, pixels: &[u8]) { + // Decode sRGB-premultiplied → linear premultiplied for the GPU canvas. 
+ let linear: Vec = pixels.chunks_exact(4).flat_map(|p| { + let r = (srgb_to_linear(p[0] as f32 / 255.0) * 255.0 + 0.5) as u8; + let g = (srgb_to_linear(p[1] as f32 / 255.0) * 255.0 + 0.5) as u8; + let b = (srgb_to_linear(p[2] as f32 / 255.0) * 255.0 + 0.5) as u8; + [r, g, b, p[3]] + }).collect(); + + let layout = wgpu::TexelCopyBufferLayout { + offset: 0, + bytes_per_row: Some(self.width * 4), + rows_per_image: Some(self.height), + }; + let extent = wgpu::Extent3d { + width: self.width, + height: self.height, + depth_or_array_layers: 1, + }; + for tex in &self.textures { + queue.write_texture( + wgpu::TexelCopyTextureInfo { + texture: tex, + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + &linear, + layout, + extent, + ); + } + } + + /// Source (current, authoritative) texture. + pub fn src(&self) -> &wgpu::Texture { &self.textures[self.current] } + /// Source texture view. + pub fn src_view(&self) -> &wgpu::TextureView { &self.views[self.current] } + /// Destination (write target) texture. + pub fn dst(&self) -> &wgpu::Texture { &self.textures[1 - self.current] } + /// Destination texture view. + pub fn dst_view(&self) -> &wgpu::TextureView { &self.views[1 - self.current] } + /// Commit the just-completed dispatch: make dst the new source. + pub fn swap(&mut self) { self.current = 1 - self.current; } +} + +// --------------------------------------------------------------------------- +// GpuBrushEngine +// --------------------------------------------------------------------------- + +/// GPU brush engine — holds the compute pipeline and per-keyframe canvas pairs. +pub struct GpuBrushEngine { + compute_pipeline: wgpu::ComputePipeline, + compute_bg_layout: wgpu::BindGroupLayout, + + /// Canvas texture pairs keyed by keyframe UUID. + pub canvases: HashMap, +} + +/// CPU-side parameters uniform for the compute shader. 
+#[repr(C)] +#[derive(Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)] +struct DabParams { + bbox_x0: i32, + bbox_y0: i32, + bbox_w: u32, + bbox_h: u32, + num_dabs: u32, + canvas_w: u32, + canvas_h: u32, + _pad: u32, +} + +impl GpuBrushEngine { + /// Create the pipeline. Returns `Err` if the device lacks the required + /// storage-texture capability for `Rgba8Unorm`. + pub fn new(device: &wgpu::Device) -> Self { + let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor { + label: Some("brush_dab_shader"), + source: wgpu::ShaderSource::Wgsl( + include_str!("panes/shaders/brush_dab.wgsl").into(), + ), + }); + + let compute_bg_layout = device.create_bind_group_layout( + &wgpu::BindGroupLayoutDescriptor { + label: Some("brush_dab_bgl"), + entries: &[ + // 0: dab storage buffer (read-only) + wgpu::BindGroupLayoutEntry { + binding: 0, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: true }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + // 1: params uniform + wgpu::BindGroupLayoutEntry { + binding: 1, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Uniform, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + // 2: canvas source (sampled) + wgpu::BindGroupLayoutEntry { + binding: 2, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::Texture { + sample_type: wgpu::TextureSampleType::Float { filterable: true }, + view_dimension: wgpu::TextureViewDimension::D2, + multisampled: false, + }, + count: None, + }, + // 3: canvas destination (write-only storage) + wgpu::BindGroupLayoutEntry { + binding: 3, + visibility: wgpu::ShaderStages::COMPUTE, + ty: wgpu::BindingType::StorageTexture { + access: wgpu::StorageTextureAccess::WriteOnly, + format: wgpu::TextureFormat::Rgba8Unorm, + view_dimension: wgpu::TextureViewDimension::D2, + }, + count: None, + 
}, + ], + }, + ); + + let pipeline_layout = device.create_pipeline_layout( + &wgpu::PipelineLayoutDescriptor { + label: Some("brush_dab_pl"), + bind_group_layouts: &[&compute_bg_layout], + push_constant_ranges: &[], + }, + ); + + let compute_pipeline = device.create_compute_pipeline( + &wgpu::ComputePipelineDescriptor { + label: Some("brush_dab_pipeline"), + layout: Some(&pipeline_layout), + module: &shader, + entry_point: Some("main"), + compilation_options: Default::default(), + cache: None, + }, + ); + + Self { + compute_pipeline, + compute_bg_layout, + canvases: HashMap::new(), + } + } + + /// Ensure a canvas pair exists for `keyframe_id` at the given dimensions. + /// + /// If the canvas exists but has different dimensions it is replaced. + pub fn ensure_canvas( + &mut self, + device: &wgpu::Device, + keyframe_id: Uuid, + width: u32, + height: u32, + ) -> &mut CanvasPair { + let needs_new = self.canvases.get(&keyframe_id) + .map_or(true, |c| c.width != width || c.height != height); + if needs_new { + self.canvases.insert(keyframe_id, CanvasPair::new(device, width, height)); + } + self.canvases.get_mut(&keyframe_id).unwrap() + } + + /// Dispatch the brush compute shader for `dabs` onto the canvas of `keyframe_id`. + /// + /// * Pre-fills `dst` from `src` so untouched pixels are preserved. + /// * Dispatches the compute shader. + /// * Swaps src/dst so the just-written texture becomes the new source. + /// + /// `dab_bbox` is `(x0, y0, x1, y1)` — the union bounding box of all dabs. + /// If `dabs` is empty or the bbox is invalid, does nothing. 
+ pub fn render_dabs( + &mut self, + device: &wgpu::Device, + queue: &wgpu::Queue, + keyframe_id: Uuid, + dabs: &[GpuDab], + bbox: (i32, i32, i32, i32), + canvas_w: u32, + canvas_h: u32, + ) { + if dabs.is_empty() || bbox.0 == i32::MAX { return; } + + let canvas = match self.canvases.get_mut(&keyframe_id) { + Some(c) => c, + None => return, + }; + + // Clamp bbox to canvas bounds + let x0 = bbox.0.max(0) as u32; + let y0 = bbox.1.max(0) as u32; + let x1 = (bbox.2.min(canvas_w as i32 - 1)).max(0) as u32; + let y1 = (bbox.3.min(canvas_h as i32 - 1)).max(0) as u32; + if x1 < x0 || y1 < y0 { return; } + + let bbox_w = x1 - x0 + 1; + let bbox_h = y1 - y0 + 1; + + // --- Pre-fill dst from src: copy the ENTIRE canvas so every pixel outside + // the dab bounding box is preserved across the ping-pong swap. + // Copying only the bbox would leave dst with data from two frames ago + // in all other regions, causing missing dabs on alternating frames. --- + let mut copy_encoder = device.create_command_encoder( + &wgpu::CommandEncoderDescriptor { label: Some("canvas_copy_encoder") }, + ); + let full_extent = wgpu::Extent3d { + width: canvas.width, + height: canvas.height, + depth_or_array_layers: 1, + }; + copy_encoder.copy_texture_to_texture( + wgpu::TexelCopyTextureInfo { + texture: canvas.src(), + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + wgpu::TexelCopyTextureInfo { + texture: canvas.dst(), + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + full_extent, + ); + queue.submit(Some(copy_encoder.finish())); + + // --- Upload dab data and params --- + let dab_bytes = bytemuck::cast_slice(dabs); + let dab_buf = device.create_buffer(&wgpu::BufferDescriptor { + label: Some("dab_storage_buf"), + size: dab_bytes.len() as u64, + usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + queue.write_buffer(&dab_buf, 0, dab_bytes); + + let params = DabParams 
{ + bbox_x0: x0 as i32, + bbox_y0: y0 as i32, + bbox_w, + bbox_h, + num_dabs: dabs.len() as u32, + canvas_w, + canvas_h, + _pad: 0, + }; + let params_buf = device.create_buffer(&wgpu::BufferDescriptor { + label: Some("dab_params_buf"), + size: std::mem::size_of::() as u64, + usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + queue.write_buffer(¶ms_buf, 0, bytemuck::bytes_of(¶ms)); + + let bg = device.create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("brush_dab_bg"), + layout: &self.compute_bg_layout, + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: dab_buf.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 1, + resource: params_buf.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 2, + resource: wgpu::BindingResource::TextureView(canvas.src_view()), + }, + wgpu::BindGroupEntry { + binding: 3, + resource: wgpu::BindingResource::TextureView(canvas.dst_view()), + }, + ], + }); + + // --- Dispatch --- + let mut compute_encoder = device.create_command_encoder( + &wgpu::CommandEncoderDescriptor { label: Some("brush_dab_encoder") }, + ); + { + let mut pass = compute_encoder.begin_compute_pass( + &wgpu::ComputePassDescriptor { + label: Some("brush_dab_pass"), + timestamp_writes: None, + }, + ); + pass.set_pipeline(&self.compute_pipeline); + pass.set_bind_group(0, &bg, &[]); + let wg_x = bbox_w.div_ceil(8); + let wg_y = bbox_h.div_ceil(8); + pass.dispatch_workgroups(wg_x, wg_y, 1); + } + queue.submit(Some(compute_encoder.finish())); + + // Swap: dst is now the authoritative source + canvas.swap(); + } + + /// Read the current canvas back to a CPU `Vec` (raw RGBA, row-major). + /// + /// **Blocks** until the GPU work is complete (`Maintain::Wait`). + /// Should only be called at stroke end, not every frame. + /// + /// Returns `None` if no canvas exists for `keyframe_id`. 
+ pub fn readback_canvas( + &self, + device: &wgpu::Device, + queue: &wgpu::Queue, + keyframe_id: Uuid, + ) -> Option> { + let canvas = self.canvases.get(&keyframe_id)?; + let width = canvas.width; + let height = canvas.height; + + // wgpu requires bytes_per_row to be a multiple of 256 + let bytes_per_row_aligned = + ((width * 4 + 255) / 256) * 256; + let total_bytes = (bytes_per_row_aligned * height) as u64; + + let staging = device.create_buffer(&wgpu::BufferDescriptor { + label: Some("canvas_readback_buf"), + size: total_bytes, + usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + + let mut encoder = device.create_command_encoder( + &wgpu::CommandEncoderDescriptor { label: Some("canvas_readback_encoder") }, + ); + encoder.copy_texture_to_buffer( + wgpu::TexelCopyTextureInfo { + texture: canvas.src(), + mip_level: 0, + origin: wgpu::Origin3d::ZERO, + aspect: wgpu::TextureAspect::All, + }, + wgpu::TexelCopyBufferInfo { + buffer: &staging, + layout: wgpu::TexelCopyBufferLayout { + offset: 0, + bytes_per_row: Some(bytes_per_row_aligned), + rows_per_image: Some(height), + }, + }, + wgpu::Extent3d { width, height, depth_or_array_layers: 1 }, + ); + queue.submit(Some(encoder.finish())); + + // Block until complete + let slice = staging.slice(..); + let (tx, rx) = std::sync::mpsc::channel(); + slice.map_async(wgpu::MapMode::Read, move |r| { let _ = tx.send(r); }); + let _ = device.poll(wgpu::PollType::wait_indefinitely()); + if rx.recv().ok()?.is_err() { return None; } + + let mapped = slice.get_mapped_range(); + + // De-stride: copy only `width * 4` bytes per row (drop alignment padding) + let bytes_per_row_tight = (width * 4) as usize; + let bytes_per_row_src = bytes_per_row_aligned as usize; + let mut pixels = vec![0u8; (width * height * 4) as usize]; + for row in 0..height as usize { + let src = &mapped[row * bytes_per_row_src .. 
row * bytes_per_row_src + bytes_per_row_tight];
+            let dst = &mut pixels[row * bytes_per_row_tight .. (row + 1) * bytes_per_row_tight];
+            dst.copy_from_slice(src);
+        }
+
+        drop(mapped);
+        staging.unmap();
+
+        // Encode linear premultiplied → sRGB-encoded premultiplied so the returned
+        // bytes match what Vello expects (ImageAlphaType::AlphaPremultiplied with
+        // sRGB channels). Alpha is left unchanged.
+        for pixel in pixels.chunks_exact_mut(4) {
+            pixel[0] = linear_to_srgb_byte(pixel[0]);
+            pixel[1] = linear_to_srgb_byte(pixel[1]);
+            pixel[2] = linear_to_srgb_byte(pixel[2]);
+        }
+
+        Some(pixels)
+    }
+
+    /// Remove the canvas pair for a keyframe (e.g. when the layer is deleted).
+    pub fn remove_canvas(&mut self, keyframe_id: &Uuid) {
+        self.canvases.remove(keyframe_id);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Canvas blit pipeline (renders canvas texture to layer sRGB buffer)
+// ---------------------------------------------------------------------------
+
+/// Bind group layout + pipeline for blitting a canvas texture (at document
+/// resolution) into a layer render buffer (at viewport resolution), applying
+/// the camera transform.
+pub struct CanvasBlitPipeline {
+    pub pipeline: wgpu::RenderPipeline,
+    pub bg_layout: wgpu::BindGroupLayout,
+    pub sampler: wgpu::Sampler,
+}
+
+/// Camera parameters uniform for canvas_blit.wgsl.
+#[repr(C)] +#[derive(Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)] +pub struct CameraParams { + pub pan_x: f32, + pub pan_y: f32, + pub zoom: f32, + pub canvas_w: f32, + pub canvas_h: f32, + pub viewport_w: f32, + pub viewport_h: f32, + pub _pad: f32, +} + +impl CanvasBlitPipeline { + pub fn new(device: &wgpu::Device) -> Self { + let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor { + label: Some("canvas_blit_shader"), + source: wgpu::ShaderSource::Wgsl( + include_str!("panes/shaders/canvas_blit.wgsl").into(), + ), + }); + + let bg_layout = device.create_bind_group_layout( + &wgpu::BindGroupLayoutDescriptor { + label: Some("canvas_blit_bgl"), + entries: &[ + wgpu::BindGroupLayoutEntry { + binding: 0, + visibility: wgpu::ShaderStages::FRAGMENT, + ty: wgpu::BindingType::Texture { + sample_type: wgpu::TextureSampleType::Float { filterable: true }, + view_dimension: wgpu::TextureViewDimension::D2, + multisampled: false, + }, + count: None, + }, + wgpu::BindGroupLayoutEntry { + binding: 1, + visibility: wgpu::ShaderStages::FRAGMENT, + ty: wgpu::BindingType::Sampler(wgpu::SamplerBindingType::Filtering), + count: None, + }, + wgpu::BindGroupLayoutEntry { + binding: 2, + visibility: wgpu::ShaderStages::FRAGMENT, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Uniform, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + ], + }, + ); + + let pipeline_layout = device.create_pipeline_layout( + &wgpu::PipelineLayoutDescriptor { + label: Some("canvas_blit_pl"), + bind_group_layouts: &[&bg_layout], + push_constant_ranges: &[], + }, + ); + + let pipeline = device.create_render_pipeline( + &wgpu::RenderPipelineDescriptor { + label: Some("canvas_blit_pipeline"), + layout: Some(&pipeline_layout), + vertex: wgpu::VertexState { + module: &shader, + entry_point: Some("vs_main"), + buffers: &[], + compilation_options: Default::default(), + }, + fragment: Some(wgpu::FragmentState { + module: &shader, + entry_point: 
Some("fs_main"), + targets: &[Some(wgpu::ColorTargetState { + format: wgpu::TextureFormat::Rgba8Unorm, + blend: None, // canvas already stores premultiplied alpha + write_mask: wgpu::ColorWrites::ALL, + })], + compilation_options: Default::default(), + }), + primitive: wgpu::PrimitiveState { + topology: wgpu::PrimitiveTopology::TriangleStrip, + ..Default::default() + }, + depth_stencil: None, + multisample: wgpu::MultisampleState::default(), + multiview: None, + cache: None, + }, + ); + + let sampler = device.create_sampler(&wgpu::SamplerDescriptor { + label: Some("canvas_blit_sampler"), + address_mode_u: wgpu::AddressMode::ClampToEdge, + address_mode_v: wgpu::AddressMode::ClampToEdge, + address_mode_w: wgpu::AddressMode::ClampToEdge, + mag_filter: wgpu::FilterMode::Linear, + min_filter: wgpu::FilterMode::Linear, + mipmap_filter: wgpu::FilterMode::Nearest, + ..Default::default() + }); + + Self { pipeline, bg_layout, sampler } + } + + /// Render the canvas texture into `target_view` (Rgba8Unorm) with the given camera. + /// + /// `target_view` is cleared to transparent before writing. 
+ pub fn blit( + &self, + device: &wgpu::Device, + queue: &wgpu::Queue, + canvas_view: &wgpu::TextureView, + target_view: &wgpu::TextureView, + camera: &CameraParams, + ) { + // Upload camera params + let cam_buf = device.create_buffer(&wgpu::BufferDescriptor { + label: Some("canvas_blit_cam_buf"), + size: std::mem::size_of::() as u64, + usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); + queue.write_buffer(&cam_buf, 0, bytemuck::bytes_of(camera)); + + let bg = device.create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("canvas_blit_bg"), + layout: &self.bg_layout, + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: wgpu::BindingResource::TextureView(canvas_view), + }, + wgpu::BindGroupEntry { + binding: 1, + resource: wgpu::BindingResource::Sampler(&self.sampler), + }, + wgpu::BindGroupEntry { + binding: 2, + resource: cam_buf.as_entire_binding(), + }, + ], + }); + + let mut encoder = device.create_command_encoder( + &wgpu::CommandEncoderDescriptor { label: Some("canvas_blit_encoder") }, + ); + { + let mut rp = encoder.begin_render_pass(&wgpu::RenderPassDescriptor { + label: Some("canvas_blit_pass"), + color_attachments: &[Some(wgpu::RenderPassColorAttachment { + view: target_view, + resolve_target: None, + depth_slice: None, + ops: wgpu::Operations { + load: wgpu::LoadOp::Clear(wgpu::Color::TRANSPARENT), + store: wgpu::StoreOp::Store, + }, + })], + depth_stencil_attachment: None, + occlusion_query_set: None, + timestamp_writes: None, + }); + rp.set_pipeline(&self.pipeline); + rp.set_bind_group(0, &bg, &[]); + rp.draw(0..4, 0..1); + } + queue.submit(Some(encoder.finish())); + } +} diff --git a/lightningbeam-ui/lightningbeam-editor/src/main.rs b/lightningbeam-ui/lightningbeam-editor/src/main.rs index 97c1c12..84bf768 100644 --- a/lightningbeam-ui/lightningbeam-editor/src/main.rs +++ b/lightningbeam-ui/lightningbeam-editor/src/main.rs @@ -23,6 +23,7 @@ use theme::{Theme, ThemeMode}; mod 
waveform_gpu; mod cqt_gpu; +mod gpu_brush; mod config; use config::AppConfig; diff --git a/lightningbeam-ui/lightningbeam-editor/src/panes/shaders/brush_dab.wgsl b/lightningbeam-ui/lightningbeam-editor/src/panes/shaders/brush_dab.wgsl new file mode 100644 index 0000000..f4a500e --- /dev/null +++ b/lightningbeam-ui/lightningbeam-editor/src/panes/shaders/brush_dab.wgsl @@ -0,0 +1,152 @@ +// GPU brush dab compute shader. +// +// Renders all dabs for one stroke segment into the raster canvas. +// Uses a ping-pong pair: reads from `canvas_src` (texture_2d) via textureLoad, +// writes to `canvas_dst` (storage, write-only). +// +// `textureSample` is forbidden in compute shaders; bilinear filtering for the +// smudge tool is implemented manually using four textureLoad calls. +// +// Before this dispatch the caller copies `canvas_src` → `canvas_dst` so that pixels +// outside the union dab bounding box (not touched by the shader) remain unchanged. +// +// Dispatch: ceil(bbox_w / 8) × ceil(bbox_h / 8) × 1 +// Each thread covers one pixel in the bounding-box-clamped canvas region. + +// --------------------------------------------------------------------------- +// Data layout must match GpuDab in brush_engine.rs (64 bytes, 16-byte aligned). 
+// ---------------------------------------------------------------------------
+struct GpuDab {
+    x: f32, y: f32, radius: f32, hardness: f32,             // bytes 0–15
+    opacity: f32, color_r: f32, color_g: f32, color_b: f32, // bytes 16–31
+    color_a: f32, ndx: f32, ndy: f32, smudge_dist: f32,     // bytes 32–47
+    blend_mode: u32, _pad0: u32, _pad1: u32, _pad2: u32,    // bytes 48–63
+}
+
+struct Params {
+    bbox_x0: i32,
+    bbox_y0: i32,
+    bbox_w: u32,
+    bbox_h: u32,
+    num_dabs: u32,
+    canvas_w: u32,
+    canvas_h: u32,
+    _pad: u32,
+}
+
+@group(0) @binding(0) var<storage, read> dabs: array<GpuDab>;
+@group(0) @binding(1) var<uniform> params: Params;
+@group(0) @binding(2) var canvas_src: texture_2d<f32>;
+@group(0) @binding(3) var canvas_dst: texture_storage_2d<rgba8unorm, write>;
+
+// ---------------------------------------------------------------------------
+// Manual bilinear sample from canvas_src at sub-pixel coordinates (px, py).
+// Out-of-bounds texels clamp to the canvas edge (replicates ClampToEdge).
+// textureSample is forbidden in compute shaders; we use four textureLoad calls.
+// ---------------------------------------------------------------------------
+fn bilinear_sample(px: f32, py: f32) -> vec4<f32> {
+    let cw = i32(params.canvas_w);
+    let ch = i32(params.canvas_h);
+
+    // Integer coords of the top-left sample
+    let ix = i32(floor(px - 0.5));
+    let iy = i32(floor(py - 0.5));
+
+    // Fractional weights
+    let fx = fract(px - 0.5);
+    let fy = fract(py - 0.5);
+
+    // Clamp to [0, dim-1]
+    let x0 = clamp(ix, 0, cw - 1);
+    let x1 = clamp(ix + 1, 0, cw - 1);
+    let y0 = clamp(iy, 0, ch - 1);
+    let y1 = clamp(iy + 1, 0, ch - 1);
+
+    let s00 = textureLoad(canvas_src, vec2<i32>(x0, y0), 0);
+    let s10 = textureLoad(canvas_src, vec2<i32>(x1, y0), 0);
+    let s01 = textureLoad(canvas_src, vec2<i32>(x0, y1), 0);
+    let s11 = textureLoad(canvas_src, vec2<i32>(x1, y1), 0);
+
+    return mix(mix(s00, s10, fx), mix(s01, s11, fx), fy);
+}
+
+// ---------------------------------------------------------------------------
+// Apply a single dab to `current` and return the updated colour.
+// ---------------------------------------------------------------------------
+fn apply_dab(current: vec4<f32>, dab: GpuDab, px: i32, py: i32) -> vec4<f32> {
+    let dx = f32(px) + 0.5 - dab.x;
+    let dy = f32(py) + 0.5 - dab.y;
+    let rr = (dx * dx + dy * dy) / (dab.radius * dab.radius);
+    if rr > 1.0 { return current; }
+
+    // Two-segment linear falloff (identical to libmypaint calculate_opa)
+    let h = clamp(dab.hardness, 0.001, 1.0);
+    var opa_weight: f32;
+    if rr <= h {
+        opa_weight = 1.0 + rr * (-(1.0 / h - 1.0));
+    } else {
+        opa_weight = h / (1.0 - h) + rr * (-h / (1.0 - h));
+    }
+    opa_weight = clamp(opa_weight, 0.0, 1.0);
+
+    if dab.blend_mode == 0u {
+        // Normal: "over" operator
+        let dab_a = opa_weight * dab.opacity * dab.color_a;
+        if dab_a <= 0.0 { return current; }
+        let ba = 1.0 - dab_a;
+        return vec4<f32>(
+            dab_a * dab.color_r + ba * current.r,
+            dab_a * dab.color_g + ba * current.g,
+            dab_a * dab.color_b + ba * current.b,
+            dab_a + ba * current.a,
+        );
+    } else if dab.blend_mode == 1u {
+        // Erase:
multiplicative alpha reduction
+        let dab_a = opa_weight * dab.opacity * dab.color_a;
+        if dab_a <= 0.0 { return current; }
+        let new_a = current.a * (1.0 - dab_a);
+        let scale = select(0.0, new_a / current.a, current.a > 1e-6);
+        return vec4<f32>(current.r * scale, current.g * scale, current.b * scale, new_a);
+    } else {
+        // Smudge: directional warp — sample from position behind the stroke direction
+        let alpha = opa_weight * dab.opacity;
+        if alpha <= 0.0 { return current; }
+        let src_x = f32(px) + 0.5 - dab.ndx * dab.smudge_dist;
+        let src_y = f32(py) + 0.5 - dab.ndy * dab.smudge_dist;
+        let src = bilinear_sample(src_x, src_y);
+        let da = 1.0 - alpha;
+        return vec4<f32>(
+            alpha * src.r + da * current.r,
+            alpha * src.g + da * current.g,
+            alpha * src.b + da * current.b,
+            alpha * src.a + da * current.a,
+        );
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Main entry point
+// ---------------------------------------------------------------------------
+@compute @workgroup_size(8, 8)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    // Bounds check within the bounding box
+    if gid.x >= params.bbox_w || gid.y >= params.bbox_h { return; }
+
+    let px = i32(gid.x) + params.bbox_x0;
+    let py = i32(gid.y) + params.bbox_y0;
+
+    // Bounds check within the canvas (bbox may extend past canvas edges)
+    if px < 0 || py < 0 || u32(px) >= params.canvas_w || u32(py) >= params.canvas_h { return; }
+
+    // Read current pixel from source (canvas_dst was pre-filled from canvas_src
+    // by the caller, but we read from canvas_src to ensure consistency)
+    var current = textureLoad(canvas_src, vec2<i32>(px, py), 0);
+
+    // Apply all dabs for this frame (sequential in the thread, no races between threads
+    // since each thread owns a unique output pixel)
+    for (var i = 0u; i < params.num_dabs; i++) {
+        current = apply_dab(current, dabs[i], px, py);
+    }
+
+    textureStore(canvas_dst, vec2<i32>(px, py), current);
+}
diff --git
a/lightningbeam-ui/lightningbeam-editor/src/panes/shaders/canvas_blit.wgsl b/lightningbeam-ui/lightningbeam-editor/src/panes/shaders/canvas_blit.wgsl
new file mode 100644
index 0000000..dc132d0
--- /dev/null
+++ b/lightningbeam-ui/lightningbeam-editor/src/panes/shaders/canvas_blit.wgsl
@@ -0,0 +1,83 @@
+// Canvas blit shader.
+//
+// Renders a GPU raster canvas (at document resolution) into the layer's sRGB
+// render buffer (at viewport resolution), applying the camera transform
+// (pan + zoom) to map document-space pixels to viewport-space pixels.
+//
+// Any viewport pixel whose corresponding document coordinate falls outside
+// [0, canvas_w) × [0, canvas_h) outputs transparent black.
+
+struct CameraParams {
+    pan_x: f32,
+    pan_y: f32,
+    zoom: f32,
+    canvas_w: f32,
+    canvas_h: f32,
+    viewport_w: f32,
+    viewport_h: f32,
+    _pad: f32,
+}
+
+@group(0) @binding(0) var canvas_tex: texture_2d<f32>;
+@group(0) @binding(1) var canvas_sampler: sampler;
+@group(0) @binding(2) var<uniform> camera: CameraParams;
+
+struct VertexOutput {
+    @builtin(position) position: vec4<f32>,
+    @location(0) uv: vec2<f32>,
+}
+
+// Generates a fullscreen triangle strip (same pattern as blit.wgsl)
+@vertex
+fn vs_main(@builtin(vertex_index) vertex_index: u32) -> VertexOutput {
+    var out: VertexOutput;
+    let x = f32((vertex_index & 1u) << 1u);
+    let y = f32(vertex_index & 2u);
+    out.position = vec4<f32>(x * 2.0 - 1.0, 1.0 - y * 2.0, 0.0, 1.0);
+    out.uv = vec2<f32>(x, y);
+    return out;
+}
+
+// Linear → sRGB encoding for a single channel.
+// Applied to premultiplied linear values so the downstream srgb_to_linear
+// pass round-trips correctly without darkening semi-transparent edges.
+fn linear_to_srgb(c: f32) -> f32 {
+    return select(
+        1.055 * pow(max(c, 0.0), 1.0 / 2.4) - 0.055,
+        c * 12.92,
+        c <= 0.0031308,
+    );
+}
+
+@fragment
+fn fs_main(in: VertexOutput) -> @location(0) vec4<f32> {
+    // Map viewport UV [0,1] → viewport pixel
+    let vp = in.uv * vec2<f32>(camera.viewport_w, camera.viewport_h);
+
+    // Map viewport pixel → document pixel (inverse camera transform)
+    let doc = (vp - vec2<f32>(camera.pan_x, camera.pan_y)) / camera.zoom;
+
+    // Map document pixel → canvas UV [0,1]
+    let canvas_uv = doc / vec2<f32>(camera.canvas_w, camera.canvas_h);
+
+    // Out-of-bounds → transparent
+    if canvas_uv.x < 0.0 || canvas_uv.x > 1.0
+        || canvas_uv.y < 0.0 || canvas_uv.y > 1.0 {
+        return vec4<f32>(0.0, 0.0, 0.0, 0.0);
+    }
+
+    // The canvas stores premultiplied linear RGBA.
+    // The srgb_to_linear converter downstream applies the sRGB gamma formula
+    // channel-by-channel without alpha awareness. To make the round-trip
+    // transparent we pre-encode with linear_to_srgb here:
+    //   canvas (linear premul) → sRGB buffer → srgb_to_linear → linear premul ✓
+    // Without this, srgb_to_linear darkens small premultiplied values
+    // (e.g. white at 10% opacity: 0.1 → 0.01), producing a grey halo.
+    let c = textureSample(canvas_tex, canvas_sampler, canvas_uv);
+    return vec4<f32>(
+        linear_to_srgb(c.r),
+        linear_to_srgb(c.g),
+        linear_to_srgb(c.b),
+        c.a,
+    );
+}
diff --git a/lightningbeam-ui/lightningbeam-editor/src/panes/stage.rs b/lightningbeam-ui/lightningbeam-editor/src/panes/stage.rs
index 33eb262..1bbc8fd 100644
--- a/lightningbeam-ui/lightningbeam-editor/src/panes/stage.rs
+++ b/lightningbeam-ui/lightningbeam-editor/src/panes/stage.rs
@@ -36,6 +36,10 @@ struct SharedVelloResources {
    effect_processor: Mutex,
    /// sRGB to linear color converter (for Vello output)
    srgb_to_linear: SrgbToLinearConverter,
+    /// GPU raster brush engine (compute pipeline + canvas texture cache)
+    gpu_brush: Mutex<crate::gpu_brush::GpuBrushEngine>,
+    /// Canvas blit pipeline (renders GPU canvas to layer sRGB buffer)
+    canvas_blit: crate::gpu_brush::CanvasBlitPipeline,
 }
 
 /// Per-instance Vello resources (created for each Stage pane)
@@ -206,7 +210,11 @@ impl SharedVelloResources {
         // Initialize sRGB to linear converter for Vello output
         let srgb_to_linear = SrgbToLinearConverter::new(device);
 
-        println!("✅ Vello shared resources initialized (renderer, shaders, HDR compositor, effect processor, and color converter)");
+        // Initialize GPU raster brush engine
+        let gpu_brush = crate::gpu_brush::GpuBrushEngine::new(device);
+        let canvas_blit = crate::gpu_brush::CanvasBlitPipeline::new(device);
+
+        println!("✅ Vello shared resources initialized (renderer, shaders, HDR compositor, effect processor, color converter, and GPU brush engine)");
 
         Ok(Self {
             renderer: Arc::new(Mutex::new(renderer)),
@@ -220,6 +228,8 @@ impl SharedVelloResources {
             compositor,
             effect_processor: Mutex::new(effect_processor),
             srgb_to_linear,
+            gpu_brush: Mutex::new(gpu_brush),
+            canvas_blit,
         })
     }
 }
@@ -390,6 +400,15 @@ struct VelloRenderContext {
    mouse_world_pos: Option,
    /// Latest webcam frame for live preview (if any camera is active)
    webcam_frame: Option,
+    /// GPU brush dabs to dispatch in this frame's prepare() call.
+    pending_raster_dabs: Option<PendingRasterDabs>,
+    /// Instance ID (for storing readback results in the global map).
+    instance_id_for_readback: u64,
+    /// The (layer_id, keyframe_id) of the raster layer with a live GPU canvas.
+    /// Present for the entire stroke duration, not just frames with new dabs.
+    painting_canvas: Option<(uuid::Uuid, uuid::Uuid)>,
+    /// GPU canvas keyframe to remove at the top of this prepare() call.
+    pending_canvas_removal: Option<uuid::Uuid>,
 }
 
 /// Callback for Vello rendering within egui
@@ -470,6 +489,77 @@ impl egui_wgpu::CallbackTrait for VelloCallback {
         // This means we only need 1 layer buffer at a time (plus the HDR accumulator)
         instance_resources.ensure_hdr_texture(device, &shared, width, height);
 
+        // --- Deferred GPU canvas removal ---
+        // The previous frame's render_content consumed a readback result and updated
+        // raw_pixels. Now that the Vello scene is current we can safely drop the
+        // GPU canvas; painting_canvas was already cleared so the compositor will use
+        // the Vello scene from here on.
+        if let Some(kf_id) = self.ctx.pending_canvas_removal {
+            if let Ok(mut gpu_brush) = shared.gpu_brush.lock() {
+                gpu_brush.remove_canvas(&kf_id);
+            }
+        }
+
+        // --- GPU brush dispatch ---
+        // Dispatch the compute shader for any pending raster dabs from this frame's
+        // input event. Must happen before compositing so the updated canvas texture
+        // is sampled correctly when the layer is blitted.
+ if let Some(ref pending) = self.ctx.pending_raster_dabs { + if let Ok(mut gpu_brush) = shared.gpu_brush.lock() { + // Ensure the canvas pair exists (creates it if missing or wrong size) + gpu_brush.ensure_canvas( + device, + pending.keyframe_id, + pending.canvas_width, + pending.canvas_height, + ); + // On stroke start, upload the pre-stroke pixel data to both textures + if let Some(ref pixels) = pending.initial_pixels { + if let Some(canvas) = gpu_brush.canvases.get(&pending.keyframe_id) { + canvas.upload(queue, pixels); + } + } + // Dispatch the compute shader for this frame's dabs + if !pending.dabs.is_empty() { + gpu_brush.render_dabs( + device, + queue, + pending.keyframe_id, + &pending.dabs, + pending.dab_bbox, + pending.canvas_width, + pending.canvas_height, + ); + } + // On stroke end, read back the finished canvas and store it so + // the next ui() call can create the undo action. + if pending.wants_final_readback { + if let Some(pixels) = gpu_brush.readback_canvas( + device, + queue, + pending.keyframe_id, + ) { + let results = RASTER_READBACK_RESULTS.get_or_init(|| { + Arc::new(Mutex::new(std::collections::HashMap::new())) + }); + if let Ok(mut map) = results.lock() { + map.insert(self.ctx.instance_id_for_readback, RasterReadbackResult { + layer_id: pending.layer_id, + time: pending.time, + canvas_width: pending.canvas_width, + canvas_height: pending.canvas_height, + pixels, + }); + } + // Canvas is kept alive: the compositor will still blit it + // this frame (painting_canvas is still Some). render_content + // will clear painting_canvas and set pending_canvas_removal, + // so the texture is freed at the top of the next prepare(). 
+                    }
+                }
+            }
+        }
+
         let mut image_cache = shared.image_cache.lock().unwrap();
 
         let composite_result = lightningbeam_core::renderer::render_document_for_compositing(
@@ -558,7 +648,14 @@ impl egui_wgpu::CallbackTrait for VelloCallback {
 
         // Now render and composite each layer incrementally
         for rendered_layer in &composite_result.layers {
-            if !rendered_layer.has_content {
+            // Check if this raster layer has a live GPU canvas that should be
+            // blitted every frame, even when no new dabs arrived this frame.
+            // `painting_canvas` persists for the entire stroke duration.
+            let gpu_canvas_kf: Option<uuid::Uuid> = self.ctx.painting_canvas
+                .filter(|(layer_id, _)| *layer_id == rendered_layer.layer_id)
+                .map(|(_, kf_id)| kf_id);
+
+            if !rendered_layer.has_content && gpu_canvas_kf.is_none() {
                 continue;
             }
 
@@ -573,9 +670,42 @@ impl egui_wgpu::CallbackTrait for VelloCallback {
                 buffer_pool.get_view(hdr_layer_handle),
                 &instance_resources.hdr_texture_view,
             ) {
-                // Render layer scene to sRGB buffer
-                if let Ok(mut renderer) = shared.renderer.lock() {
-                    renderer.render_to_texture(device, queue, &rendered_layer.scene, srgb_view, &layer_render_params).ok();
+                // GPU canvas blit path: if a live GPU canvas exists for this
+                // raster layer, sample it directly instead of rendering the Vello
+                // scene (which lags until raw_pixels is updated after readback).
+                let used_gpu_canvas = if let Some(kf_id) = gpu_canvas_kf {
+                    let mut used = false;
+                    if let Ok(gpu_brush) = shared.gpu_brush.lock() {
+                        if let Some(canvas) = gpu_brush.canvases.get(&kf_id) {
+                            let camera = crate::gpu_brush::CameraParams {
+                                pan_x: self.ctx.pan_offset.x,
+                                pan_y: self.ctx.pan_offset.y,
+                                zoom: self.ctx.zoom,
+                                canvas_w: canvas.width as f32,
+                                canvas_h: canvas.height as f32,
+                                viewport_w: width as f32,
+                                viewport_h: height as f32,
+                                _pad: 0.0,
+                            };
+                            shared.canvas_blit.blit(
+                                device, queue,
+                                canvas.src_view(),
+                                srgb_view,
+                                &camera,
+                            );
+                            used = true;
+                        }
+                    }
+                    used
+                } else {
+                    false
+                };
+
+                if !used_gpu_canvas {
+                    // Render layer scene to sRGB buffer
+                    if let Ok(mut renderer) = shared.renderer.lock() {
+                        renderer.render_to_texture(device, queue, &rendered_layer.scene, srgb_view, &layer_render_params).ok();
+                    }
                 }
 
                 // Convert sRGB to linear HDR
@@ -2140,6 +2270,20 @@ pub struct StagePane {
    raster_stroke_state: Option<(uuid::Uuid, f64, lightningbeam_core::brush_engine::StrokeState, Vec<u8>)>,
    // Last raster stroke point (for incremental segment painting)
    raster_last_point: Option,
+    /// GPU dabs computed during this frame's drag event — consumed by prepare().
+    pending_raster_dabs: Option<PendingRasterDabs>,
+    /// Undo snapshot info captured at mouse-down; claimed when readback completes.
+    /// (layer_id, time, canvas_w, canvas_h, buffer_before)
+    pending_undo_before: Option<(uuid::Uuid, f64, u32, u32, Vec<u8>)>,
+    /// The (layer_id, keyframe_id) of the raster layer whose GPU canvas is live.
+    /// Set on mouse-down, cleared when the readback result is consumed.
+    /// Used every frame to blit the GPU canvas instead of the stale Vello scene.
+    painting_canvas: Option<(uuid::Uuid, uuid::Uuid)>,
+    /// Keyframe UUID whose GPU canvas should be removed at the start of the next
+    /// prepare() call. Set by render_content after consuming the readback result
+    /// and updating raw_pixels, so the canvas lives one extra composite frame to
+    /// avoid a flash of the stale Vello scene.
+    pending_canvas_removal: Option<uuid::Uuid>,
    /// Synthetic drag/click override for test mode replay (debug builds only)
    #[cfg(debug_assertions)]
    replay_override: Option,
 }
@@ -2171,6 +2315,46 @@ static INSTANCE_COUNTER: std::sync::atomic::AtomicU64 = std::sync::atomic::Atomi
 // Global storage for eyedropper results (instance_id -> (color, color_mode))
 static EYEDROPPER_RESULTS: OnceLock>>> = OnceLock::new();
 
+/// Pending GPU dabs for a single drag event.
+///
+/// Created by the event handler (`handle_raster_stroke_tool`) and consumed once
+/// by `VelloCallback::prepare()`.
+struct PendingRasterDabs {
+    /// Keyframe UUID — indexes the canvas texture pair in `GpuBrushEngine`.
+    keyframe_id: uuid::Uuid,
+    /// Layer UUID — used for the undo readback result.
+    layer_id: uuid::Uuid,
+    /// Playback time of the keyframe.
+    time: f64,
+    /// Canvas dimensions (pixels).
+    canvas_width: u32,
+    canvas_height: u32,
+    /// Raw RGBA pixel data to upload to the canvas texture on the very first dab of
+    /// a stroke (i.e., when the stroke starts). `None` on subsequent drag events.
+    initial_pixels: Option<Vec<u8>>,
+    /// Dab list computed by `BrushEngine::compute_dabs()`.
+    dabs: Vec<lightningbeam_core::brush_engine::GpuDab>,
+    /// Union bounding box of `dabs` (x0, y0, x1, y1) in canvas pixel coords.
+    dab_bbox: (i32, i32, i32, i32),
+    /// When `true`, perform a full canvas readback after dispatching and store
+    /// the result in `RASTER_READBACK_RESULTS` so the next frame can create
+    /// the undo action.
+    wants_final_readback: bool,
+}
+
+/// Result stored by `prepare()` after a stroke-end readback.
+struct RasterReadbackResult {
+    layer_id: uuid::Uuid,
+    time: f64,
+    canvas_width: u32,
+    canvas_height: u32,
+    /// Raw RGBA pixels from the completed stroke.
+    pixels: Vec<u8>,
+}
+
+// Global storage for raster readback results (instance_id -> result)
+static RASTER_READBACK_RESULTS: OnceLock<Arc<Mutex<std::collections::HashMap<u64, RasterReadbackResult>>>> = OnceLock::new();
+
 /// Cached 2x2 stipple image brush for selection overlay.
/// Pattern: [[black, transparent], [transparent, white]] /// Tiled with nearest-neighbor sampling so each pixel stays crisp. @@ -2217,6 +2401,10 @@ impl StagePane { current_snap: None, raster_stroke_state: None, raster_last_point: None, + pending_raster_dabs: None, + pending_undo_before: None, + painting_canvas: None, + pending_canvas_removal: None, #[cfg(debug_assertions)] replay_override: None, } @@ -4183,9 +4371,12 @@ impl StagePane { /// Handle raster stroke tool input (Draw/Erase/Smudge on a raster layer). /// - /// Paints incrementally into `document_mut()` on every drag event so the - /// result is visible immediately. On mouse-up the pre/post raw-pixel - /// buffers are wrapped in a `RasterStrokeAction` for undo/redo. + /// Computes GPU dab lists for each drag event and stores them in + /// `self.pending_raster_dabs` for dispatch by `VelloCallback::prepare()`. + /// + /// The actual pixel rendering happens on the GPU (compute shader). The CPU + /// only does dab placement arithmetic (cheap). On stroke end a readback is + /// requested so the undo system can capture the final pixel state. 
fn handle_raster_stroke_tool( &mut self, ui: &mut egui::Ui, @@ -4197,7 +4388,7 @@ impl StagePane { use lightningbeam_core::tool::ToolState; use lightningbeam_core::layer::AnyLayer; use lightningbeam_core::raster_layer::StrokePoint; - use lightningbeam_core::brush_engine::{BrushEngine, StrokeState, image_from_raw}; + use lightningbeam_core::brush_engine::{BrushEngine, StrokeState}; use lightningbeam_core::raster_layer::StrokeRecord; let active_layer_id = match *shared.active_layer_id { @@ -4209,9 +4400,7 @@ impl StagePane { let is_raster = shared.action_executor.document() .get_layer(&active_layer_id) .map_or(false, |l| matches!(l, AnyLayer::Raster(_))); - if !is_raster { - return; - } + if !is_raster { return; } let brush = { use lightningbeam_core::brush_settings::BrushSettings; @@ -4235,73 +4424,143 @@ impl StagePane { [c.r() as f32 / 255.0, c.g() as f32 / 255.0, c.b() as f32 / 255.0, c.a() as f32 / 255.0] }; - // Mouse down: snapshot buffer_before, init stroke state, paint first dab + // ---------------------------------------------------------------- + // Mouse down: capture buffer_before, start stroke, compute first dab + // ---------------------------------------------------------------- if self.rsp_drag_started(response) || self.rsp_clicked(response) { - let (doc_width, doc_height, buffer_before) = { + let (doc_width, doc_height) = { let doc = shared.action_executor.document(); - let buf = doc.get_layer(&active_layer_id) - .and_then(|l| if let AnyLayer::Raster(rl) = l { - rl.keyframe_at(*shared.playback_time).map(|kf| kf.raw_pixels.clone()) - } else { None }) - .unwrap_or_default(); - (doc.width as u32, doc.height as u32, buf) + (doc.width as u32, doc.height as u32) }; - // Start a fresh stroke state; MAX distance ensures first point gets a dab - let mut stroke_state = StrokeState::new(); - stroke_state.distance_since_last_dab = f32::MAX; - - let first_pt = StrokePoint { x: world_pos.x, y: world_pos.y, pressure: 1.0, tilt_x: 0.0, tilt_y: 0.0, timestamp: 0.0 
}; - - // Paint the first dab directly into the document + // Ensure the keyframe exists BEFORE reading its ID, so we always get + // the real UUID. Previously we read the ID first and fell back to a + // randomly-generated UUID when no keyframe existed; that fake UUID was + // stored in painting_canvas but subsequent drag frames used the real UUID + // from keyframe_at(), causing the GPU canvas to be a different object from + // the one being composited. { let doc = shared.action_executor.document_mut(); if let Some(AnyLayer::Raster(rl)) = doc.get_layer_mut(&active_layer_id) { - let kf = rl.ensure_keyframe_at(*shared.playback_time, doc_width, doc_height); - let mut img = image_from_raw(std::mem::take(&mut kf.raw_pixels), kf.width, kf.height); - let single = StrokeRecord { - brush_settings: brush.clone(), - color, - blend_mode, - points: vec![first_pt.clone()], - }; - BrushEngine::apply_stroke_with_state(&mut img, &single, &mut stroke_state); - kf.raw_pixels = img.into_raw(); + rl.ensure_keyframe_at(*shared.playback_time, doc_width, doc_height); } } - self.raster_stroke_state = Some((active_layer_id, *shared.playback_time, stroke_state, buffer_before)); + // Now read the guaranteed-to-exist keyframe to get the real UUID. 
+ let (keyframe_id, canvas_width, canvas_height, buffer_before, initial_pixels) = { + let doc = shared.action_executor.document(); + if let Some(AnyLayer::Raster(rl)) = doc.get_layer(&active_layer_id) { + if let Some(kf) = rl.keyframe_at(*shared.playback_time) { + let raw = kf.raw_pixels.clone(); + let init = if raw.is_empty() { + vec![0u8; (kf.width * kf.height * 4) as usize] + } else { + raw.clone() + }; + (kf.id, kf.width, kf.height, raw, init) + } else { + return; // shouldn't happen after ensure_keyframe_at + } + } else { + return; + } + }; + + // Compute the first dab (single-point tap) + let mut stroke_state = StrokeState::new(); + stroke_state.distance_since_last_dab = f32::MAX; + + let first_pt = StrokePoint { + x: world_pos.x, y: world_pos.y, + pressure: 1.0, tilt_x: 0.0, tilt_y: 0.0, timestamp: 0.0, + }; + let single = StrokeRecord { + brush_settings: brush.clone(), + color, + blend_mode, + points: vec![first_pt.clone()], + }; + let (dabs, dab_bbox) = BrushEngine::compute_dabs(&single, &mut stroke_state); + + self.painting_canvas = Some((active_layer_id, keyframe_id)); + self.pending_undo_before = Some(( + active_layer_id, + *shared.playback_time, + canvas_width, + canvas_height, + buffer_before, + )); + self.pending_raster_dabs = Some(PendingRasterDabs { + keyframe_id, + layer_id: active_layer_id, + time: *shared.playback_time, + canvas_width, + canvas_height, + initial_pixels: Some(initial_pixels), + dabs, + dab_bbox, + wants_final_readback: false, + }); + self.raster_stroke_state = Some(( + active_layer_id, + *shared.playback_time, + stroke_state, + Vec::new(), // buffer_before now lives in pending_undo_before + )); self.raster_last_point = Some(first_pt); *shared.tool_state = ToolState::DrawingRasterStroke { points: vec![] }; } - // Mouse drag: paint each new segment immediately + // ---------------------------------------------------------------- + // Mouse drag: compute dabs for this segment + // 
---------------------------------------------------------------- if self.rsp_dragged(response) { if let Some((layer_id, time, ref mut stroke_state, _)) = self.raster_stroke_state { if let Some(prev_pt) = self.raster_last_point.take() { - let curr_pt = StrokePoint { x: world_pos.x, y: world_pos.y, pressure: 1.0, tilt_x: 0.0, tilt_y: 0.0, timestamp: 0.0 }; + let curr_pt = StrokePoint { + x: world_pos.x, y: world_pos.y, + pressure: 1.0, tilt_x: 0.0, tilt_y: 0.0, timestamp: 0.0, + }; - // Skip if not moved enough const MIN_DIST_SQ: f32 = 1.5 * 1.5; let dx = curr_pt.x - prev_pt.x; let dy = curr_pt.y - prev_pt.y; - let moved_pt = if dx * dx + dy * dy >= MIN_DIST_SQ { curr_pt.clone() } else { prev_pt.clone() }; + let moved_pt = if dx * dx + dy * dy >= MIN_DIST_SQ { + curr_pt.clone() + } else { + prev_pt.clone() + }; if dx * dx + dy * dy >= MIN_DIST_SQ { - let doc = shared.action_executor.document_mut(); - if let Some(AnyLayer::Raster(rl)) = doc.get_layer_mut(&layer_id) { - if let Some(kf) = rl.keyframe_at_mut(time) { - let mut img = image_from_raw(std::mem::take(&mut kf.raw_pixels), kf.width, kf.height); - let seg = StrokeRecord { - brush_settings: brush.clone(), - color, - blend_mode, - points: vec![prev_pt, curr_pt], - }; - BrushEngine::apply_stroke_with_state(&mut img, &seg, stroke_state); - kf.raw_pixels = img.into_raw(); - } - } + // Get keyframe info (needed for canvas dimensions) + let (kf_id, kw, kh) = { + let doc = shared.action_executor.document(); + if let Some(AnyLayer::Raster(rl)) = doc.get_layer(&layer_id) { + if let Some(kf) = rl.keyframe_at(time) { + (kf.id, kf.width, kf.height) + } else { self.raster_last_point = Some(moved_pt); return; } + } else { self.raster_last_point = Some(moved_pt); return; } + }; + + let seg = StrokeRecord { + brush_settings: brush.clone(), + color, + blend_mode, + points: vec![prev_pt, curr_pt], + }; + let (dabs, dab_bbox) = BrushEngine::compute_dabs(&seg, stroke_state); + + self.pending_raster_dabs = Some(PendingRasterDabs { + 
keyframe_id: kf_id, + layer_id, + time, + canvas_width: kw, + canvas_height: kh, + initial_pixels: None, + dabs, + dab_bbox, + wants_final_readback: false, + }); } self.raster_last_point = Some(moved_pt); @@ -4309,37 +4568,44 @@ impl StagePane { } } - // Mouse up: wrap the pre/post buffers in an undo action + // ---------------------------------------------------------------- + // Mouse up: request a full-canvas readback for the undo snapshot + // ---------------------------------------------------------------- if self.rsp_drag_stopped(response) || (self.rsp_any_released(ui) && matches!(*shared.tool_state, ToolState::DrawingRasterStroke { .. })) { - if let Some((layer_id, time, _, buffer_before)) = self.raster_stroke_state.take() { - use lightningbeam_core::actions::RasterStrokeAction; - - let (doc_width, doc_height, buffer_after) = { - let doc = shared.action_executor.document(); - let buf = doc.get_layer(&layer_id) - .and_then(|l| if let AnyLayer::Raster(rl) = l { - rl.keyframe_at(time).map(|kf| kf.raw_pixels.clone()) - } else { None }) - .unwrap_or_default(); - (doc.width as u32, doc.height as u32, buf) - }; - - let action = RasterStrokeAction::new( - layer_id, - time, - buffer_before, - buffer_after, - doc_width, - doc_height, - ); - // execute is a no-op for the first call (pixels already in document), - // but registers the action in the undo stack - let _ = shared.action_executor.execute(Box::new(action)); - } + self.raster_stroke_state = None; self.raster_last_point = None; *shared.tool_state = ToolState::Idle; + + // Mark the pending dabs (if any this frame) for final readback. + // If there are no pending dabs this frame, create a "readback only" entry. 
+ if let Some(ref mut pending) = self.pending_raster_dabs { + pending.wants_final_readback = true; + } else if let Some((ub_layer, ub_time, ub_cw, ub_ch, _)) = + self.pending_undo_before.as_ref() + { + let (ub_layer, ub_time, ub_cw, ub_ch) = (*ub_layer, *ub_time, *ub_cw, *ub_ch); + // Get keyframe_id for the canvas texture lookup + let kf_id = shared.action_executor.document() + .get_layer(&ub_layer) + .and_then(|l| if let AnyLayer::Raster(rl) = l { + rl.keyframe_at(ub_time).map(|kf| kf.id) + } else { None }); + if let Some(kf_id) = kf_id { + self.pending_raster_dabs = Some(PendingRasterDabs { + keyframe_id: kf_id, + layer_id: ub_layer, + time: ub_time, + canvas_width: ub_cw, + canvas_height: ub_ch, + initial_pixels: None, + dabs: Vec::new(), + dab_bbox: (i32::MAX, i32::MAX, i32::MIN, i32::MIN), + wants_final_readback: true, + }); + } + } } } @@ -6830,6 +7096,35 @@ impl PaneRenderer for StagePane { self.pan_offset = viewport_center - canvas_center; } + // Check for completed raster stroke readbacks and create undo actions + if let Ok(mut results) = RASTER_READBACK_RESULTS + .get_or_init(|| Arc::new(Mutex::new(std::collections::HashMap::new()))) + .lock() { + if let Some(readback) = results.remove(&self.instance_id) { + if let Some((layer_id, time, w, h, buffer_before)) = self.pending_undo_before.take() { + use lightningbeam_core::actions::RasterStrokeAction; + let action = RasterStrokeAction::new( + layer_id, + time, + buffer_before, + readback.pixels.clone(), + w, + h, + ); + // execute() sets raw_pixels = buffer_after so future Vello renders + // and file saves see the completed stroke. + let _ = shared.action_executor.execute(Box::new(action)); + } + // raw_pixels is now up to date; switch compositing back to the Vello + // scene. Schedule the GPU canvas for removal at the start of the next + // prepare() — keeping it alive for this frame's composite avoids a + // one-frame flash of the stale Vello scene. 
+ if let Some((_, kf_id)) = self.painting_canvas.take() { + self.pending_canvas_removal = Some(kf_id); + } + } + } + // Check for completed eyedropper samples from GPU readback and apply them if let Ok(mut results) = EYEDROPPER_RESULTS .get_or_init(|| Arc::new(Mutex::new(std::collections::HashMap::new()))) @@ -7176,6 +7471,10 @@ impl PaneRenderer for StagePane { region_selection: shared.region_selection.clone(), mouse_world_pos, webcam_frame: shared.webcam_frame.clone(), + pending_raster_dabs: self.pending_raster_dabs.take(), + instance_id_for_readback: self.instance_id, + painting_canvas: self.painting_canvas, + pending_canvas_removal: self.pending_canvas_removal.take(), }}; let cb = egui_wgpu::Callback::new_paint_callback(