Move raster editing to GPU

This commit is contained in:
Skyler Lehmkuhl 2026-03-01 15:41:28 -05:00
parent e85efe7405
commit da02edb9f5
7 changed files with 1460 additions and 83 deletions

View File

@ -38,6 +38,46 @@
use image::RgbaImage;
use crate::raster_layer::{RasterBlendMode, StrokeRecord};
/// A single brush dab ready for GPU dispatch.
///
/// Padded to 64 bytes (4 × 16 bytes) for WGSL struct alignment in a storage buffer.
/// Field order and size must match `struct GpuDab` in `brush_dab.wgsl`.
#[repr(C)]
#[derive(Clone, Copy, Debug, bytemuck::Pod, bytemuck::Zeroable)]
pub struct GpuDab {
    /// Dab centre X (canvas pixels)
    pub x: f32,
    /// Dab centre Y (canvas pixels)
    pub y: f32,
    /// Dab radius (pixels)
    pub radius: f32,
    /// Hardness 0.0–1.0 (controls the falloff curve shape)
    pub hardness: f32,
    /// Composite opacity for this dab
    pub opacity: f32,
    /// Brush color R (linear, premultiplied)
    pub color_r: f32,
    /// Brush color G
    pub color_g: f32,
    /// Brush color B
    pub color_b: f32,
    /// Brush color A
    pub color_a: f32,
    /// Normalized stroke direction X (smudge only; 0 otherwise)
    pub ndx: f32,
    /// Normalized stroke direction Y (smudge only; 0 otherwise)
    pub ndy: f32,
    /// Distance to sample behind stroke for smudge (smudge only; 0 otherwise)
    pub smudge_dist: f32,
    /// Blend mode: 0 = Normal, 1 = Erase, 2 = Smudge
    pub blend_mode: u32,
    /// Explicit padding rounding the struct up to 64 bytes (always 0).
    pub _pad0: u32,
    pub _pad1: u32,
    pub _pad2: u32,
}
/// Transient brush stroke state (tracks partial dab position between segments)
pub struct StrokeState {
/// Distance along the path already "consumed" toward the next dab (in pixels)
@ -58,6 +98,114 @@ impl Default for StrokeState {
pub struct BrushEngine;
impl BrushEngine {
    /// Compute the list of GPU dabs for a stroke segment.
    ///
    /// Uses the same dab-spacing logic as [`apply_stroke_with_state`] but produces
    /// [`GpuDab`] structs for upload to the GPU compute pipeline instead of painting
    /// into a pixel buffer.
    ///
    /// Also returns the union bounding box of all dabs (grown by a 1-pixel fringe)
    /// as `(x0, y0, x1, y1)` in integer canvas pixel coordinates. The box is NOT
    /// clamped here — coordinates may be negative or extend past the canvas edges;
    /// callers (e.g. the GPU dispatch) clamp it to the canvas. `x0 == i32::MAX`
    /// is the sentinel value when the returned Vec is empty.
    pub fn compute_dabs(
        stroke: &StrokeRecord,
        state: &mut StrokeState,
    ) -> (Vec<GpuDab>, (i32, i32, i32, i32)) {
        let mut dabs: Vec<GpuDab> = Vec::new();
        // (min_x, min_y, max_x, max_y); starts inverted so the first dab initializes it.
        let mut bbox = (i32::MAX, i32::MIN.max(i32::MAX), i32::MIN, i32::MIN);
        // Numeric blend-mode tag understood by the WGSL shader (see GpuDab::blend_mode).
        let blend_mode_u = match stroke.blend_mode {
            RasterBlendMode::Normal => 0u32,
            RasterBlendMode::Erase => 1u32,
            RasterBlendMode::Smudge => 2u32,
        };
        // Shared push helper: records one dab and grows the union bbox by the dab
        // radius plus a 1-pixel anti-aliasing fringe.
        let mut push_dab = |dabs: &mut Vec<GpuDab>,
                            bbox: &mut (i32, i32, i32, i32),
                            x: f32, y: f32,
                            radius: f32, opacity: f32,
                            ndx: f32, ndy: f32, smudge_dist: f32| {
            let r_fringe = radius + 1.0;
            bbox.0 = bbox.0.min((x - r_fringe).floor() as i32);
            bbox.1 = bbox.1.min((y - r_fringe).floor() as i32);
            bbox.2 = bbox.2.max((x + r_fringe).ceil() as i32);
            bbox.3 = bbox.3.max((y + r_fringe).ceil() as i32);
            dabs.push(GpuDab {
                x, y, radius,
                hardness: stroke.brush_settings.hardness,
                opacity,
                color_r: stroke.color[0],
                color_g: stroke.color[1],
                color_b: stroke.color[2],
                color_a: stroke.color[3],
                ndx, ndy, smudge_dist,
                blend_mode: blend_mode_u,
                _pad0: 0, _pad1: 0, _pad2: 0,
            });
        };
        // Degenerate stroke: zero or one point — a single dab at the point, if any.
        if stroke.points.len() < 2 {
            if let Some(pt) = stroke.points.first() {
                let r = stroke.brush_settings.radius_at_pressure(pt.pressure);
                let o = stroke.brush_settings.opacity_at_pressure(pt.pressure);
                // Single-tap smudge has no direction — skip (same as CPU engine)
                if !matches!(stroke.blend_mode, RasterBlendMode::Smudge) {
                    push_dab(&mut dabs, &mut bbox, pt.x, pt.y, r, o, 0.0, 0.0, 0.0);
                }
                state.distance_since_last_dab = 0.0;
            }
            return (dabs, bbox);
        }
        // Walk consecutive point pairs, emitting dabs at `spacing`-pixel intervals
        // along the polyline; leftover distance carries over in `state` so spacing
        // stays continuous across segments and across calls.
        for window in stroke.points.windows(2) {
            let p0 = &window[0];
            let p1 = &window[1];
            let dx = p1.x - p0.x;
            let dy = p1.y - p0.y;
            let seg_len = (dx * dx + dy * dy).sqrt();
            if seg_len < 1e-4 { continue; }
            let mut t = 0.0f32;
            while t < 1.0 {
                // Pressure (and hence radius/spacing) interpolates along the segment.
                let pressure = p0.pressure + t * (p1.pressure - p0.pressure);
                let radius = stroke.brush_settings.radius_at_pressure(pressure);
                // NOTE(review): despite the name, `dabs_per_radius` multiplies the
                // radius to give the spacing — confirm against the CPU engine.
                let spacing = (radius * stroke.brush_settings.dabs_per_radius).max(0.5);
                let dist_to_next = spacing - state.distance_since_last_dab;
                let seg_t_to_next = (dist_to_next / seg_len).max(0.0);
                if seg_t_to_next > 1.0 - t {
                    // Next dab falls beyond this segment: bank the remaining distance.
                    state.distance_since_last_dab += seg_len * (1.0 - t);
                    break;
                }
                t += seg_t_to_next;
                let x2 = p0.x + t * dx;
                let y2 = p0.y + t * dy;
                let pressure2 = p0.pressure + t * (p1.pressure - p0.pressure);
                let radius2 = stroke.brush_settings.radius_at_pressure(pressure2);
                let opacity2 = stroke.brush_settings.opacity_at_pressure(pressure2);
                if matches!(stroke.blend_mode, RasterBlendMode::Smudge) {
                    // Smudge dabs carry the (normalized) stroke direction and a
                    // pick-up distance so the shader can sample "behind" the stroke.
                    let ndx = dx / seg_len;
                    let ndy = dy / seg_len;
                    let smudge_dist =
                        (radius2 * stroke.brush_settings.dabs_per_radius).max(1.0);
                    push_dab(&mut dabs, &mut bbox,
                        x2, y2, radius2, opacity2, ndx, ndy, smudge_dist);
                } else {
                    push_dab(&mut dabs, &mut bbox,
                        x2, y2, radius2, opacity2, 0.0, 0.0, 0.0);
                }
                state.distance_since_last_dab = 0.0;
            }
        }
        (dabs, bbox)
    }
/// Apply a complete stroke to a pixel buffer.
///
/// A fresh [`StrokeState`] is created for each stroke (starts with full dab

View File

@ -358,7 +358,10 @@ fn render_raster_layer_to_scene(
format: ImageFormat::Rgba8,
width: kf.width,
height: kf.height,
alpha_type: ImageAlphaType::Alpha,
// raw_pixels stores sRGB-encoded premultiplied RGBA (channels are
// gamma-encoded, alpha is linear). Premultiplied tells Vello to
// decode the sRGB channels without premultiplying again.
alpha_type: ImageAlphaType::AlphaPremultiplied,
};
let brush = ImageBrush::new(image_data);
let canvas_rect = Rect::new(0.0, 0.0, kf.width as f64, kf.height as f64);

View File

@ -0,0 +1,691 @@
//! GPU-accelerated raster brush engine.
//!
//! [`GpuBrushEngine`] wraps the `brush_dab.wgsl` compute pipeline and manages
//! per-keyframe canvas texture pairs (ping-pong) used as the live canvas during
//! raster painting.
//!
//! ## Lifecycle
//!
//! 1. **Stroke start** — caller supplies the initial pixel data; the engine uploads
//! it to both canvas textures so either can serve as source/dest.
//! 2. **Each drag event** — [`GpuBrushEngine::render_dabs`] copies src→dst,
//! dispatches the compute shader, then swaps src/dst.
//! 3. **Stroke end** — [`GpuBrushEngine::readback_canvas`] copies the current
//! source texture into a staging buffer and returns the raw RGBA bytes
//! (blocking — uses `device.poll(Maintain::Wait)`).
//! 4. **Idle** — canvas textures are kept alive for the next stroke (no re-upload
//! needed if the layer has not changed).
use std::collections::HashMap;
use uuid::Uuid;
use lightningbeam_core::brush_engine::GpuDab;
// ---------------------------------------------------------------------------
// Colour-space helpers
// ---------------------------------------------------------------------------
/// Decode one sRGB-encoded channel value in `[0, 1]` to its linear equivalent.
///
/// Standard sRGB EOTF: a linear segment below the 0.04045 cutoff, a 2.4-power
/// curve above it.
fn srgb_to_linear(c: f32) -> f32 {
    const CUTOFF: f32 = 0.04045;
    if c > CUTOFF {
        ((c + 0.055) / 1.055).powf(2.4)
    } else {
        c / 12.92
    }
}
/// Encode one linear channel byte (`0..=255`, interpreted as linear `[0, 1]`)
/// to its sRGB-encoded byte with round-to-nearest.
fn linear_to_srgb_byte(c: u8) -> u8 {
    let lin = f32::from(c) / 255.0;
    // Standard sRGB OETF: linear segment below 0.0031308, 1/2.4-power curve above.
    let encoded = match lin {
        l if l <= 0.0031308 => l * 12.92,
        l => 1.055 * l.powf(1.0 / 2.4) - 0.055,
    };
    (encoded * 255.0 + 0.5) as u8
}
// ---------------------------------------------------------------------------
// Per-keyframe canvas texture pair (ping-pong)
// ---------------------------------------------------------------------------
/// A pair of textures used for double-buffered canvas rendering.
///
/// `current` indexes the texture that holds the up-to-date canvas state.
pub struct CanvasPair {
    /// The two ping-pong textures (`Rgba8Unorm`, document resolution).
    pub textures: [wgpu::Texture; 2],
    /// Default views over `textures`, in the same order.
    pub views: [wgpu::TextureView; 2],
    /// Index (0 or 1) of the texture that is the current "source" (authoritative).
    pub current: usize,
    /// Canvas width in pixels.
    pub width: u32,
    /// Canvas height in pixels.
    pub height: u32,
}
impl CanvasPair {
    /// Create a ping-pong texture pair at the given pixel size.
    ///
    /// Both textures are `Rgba8Unorm` with TEXTURE_BINDING / STORAGE_BINDING /
    /// COPY_SRC / COPY_DST usage so either can act as sampled source, storage
    /// destination, or copy endpoint.
    pub fn new(device: &wgpu::Device, width: u32, height: u32) -> Self {
        let desc = wgpu::TextureDescriptor {
            label: Some("raster_canvas"),
            size: wgpu::Extent3d { width, height, depth_or_array_layers: 1 },
            mip_level_count: 1,
            sample_count: 1,
            dimension: wgpu::TextureDimension::D2,
            format: wgpu::TextureFormat::Rgba8Unorm,
            usage: wgpu::TextureUsages::TEXTURE_BINDING
                | wgpu::TextureUsages::STORAGE_BINDING
                | wgpu::TextureUsages::COPY_SRC
                | wgpu::TextureUsages::COPY_DST,
            view_formats: &[],
        };
        let t0 = device.create_texture(&desc);
        let t1 = device.create_texture(&desc);
        let v0 = t0.create_view(&wgpu::TextureViewDescriptor::default());
        let v1 = t1.create_view(&wgpu::TextureViewDescriptor::default());
        Self {
            textures: [t0, t1],
            views: [v0, v1],
            current: 0,
            width,
            height,
        }
    }
    /// Upload raw RGBA bytes to both textures (call once at stroke start).
    ///
    /// `pixels` is expected to be **sRGB-encoded premultiplied** (the format stored
    /// in `raw_pixels` / PNG files). The values are decoded to linear premultiplied
    /// before being written to the canvas, which operates entirely in linear space.
    pub fn upload(&self, queue: &wgpu::Queue, pixels: &[u8]) {
        // Inputs are bytes, so precompute the sRGB→linear decode once per byte
        // value (256 powf calls) instead of per channel per pixel (3·W·H calls).
        // Rounding is identical to the previous per-pixel computation.
        let mut decode_lut = [0u8; 256];
        for (b, slot) in decode_lut.iter_mut().enumerate() {
            *slot = (srgb_to_linear(b as f32 / 255.0) * 255.0 + 0.5) as u8;
        }
        // Decode sRGB-premultiplied → linear premultiplied; alpha is already linear.
        let linear: Vec<u8> = pixels.chunks_exact(4).flat_map(|p| {
            [
                decode_lut[p[0] as usize],
                decode_lut[p[1] as usize],
                decode_lut[p[2] as usize],
                p[3],
            ]
        }).collect();
        let layout = wgpu::TexelCopyBufferLayout {
            offset: 0,
            bytes_per_row: Some(self.width * 4),
            rows_per_image: Some(self.height),
        };
        let extent = wgpu::Extent3d {
            width: self.width,
            height: self.height,
            depth_or_array_layers: 1,
        };
        // Write the same decoded data to BOTH textures so either side of the
        // ping-pong can serve as the first source.
        for tex in &self.textures {
            queue.write_texture(
                wgpu::TexelCopyTextureInfo {
                    texture: tex,
                    mip_level: 0,
                    origin: wgpu::Origin3d::ZERO,
                    aspect: wgpu::TextureAspect::All,
                },
                &linear,
                layout,
                extent,
            );
        }
    }
    /// Source (current, authoritative) texture.
    pub fn src(&self) -> &wgpu::Texture { &self.textures[self.current] }
    /// Source texture view.
    pub fn src_view(&self) -> &wgpu::TextureView { &self.views[self.current] }
    /// Destination (write target) texture.
    pub fn dst(&self) -> &wgpu::Texture { &self.textures[1 - self.current] }
    /// Destination texture view.
    pub fn dst_view(&self) -> &wgpu::TextureView { &self.views[1 - self.current] }
    /// Commit the just-completed dispatch: make dst the new source.
    pub fn swap(&mut self) { self.current = 1 - self.current; }
}
// ---------------------------------------------------------------------------
// GpuBrushEngine
// ---------------------------------------------------------------------------
/// GPU brush engine — holds the compute pipeline and per-keyframe canvas pairs.
pub struct GpuBrushEngine {
    /// Compute pipeline running `brush_dab.wgsl`.
    compute_pipeline: wgpu::ComputePipeline,
    /// Layout for the per-dispatch bind group (dab buffer, params, src, dst).
    compute_bg_layout: wgpu::BindGroupLayout,
    /// Canvas texture pairs keyed by keyframe UUID.
    pub canvases: HashMap<Uuid, CanvasPair>,
}
/// CPU-side parameters uniform for the compute shader.
///
/// Must match `struct Params` in `brush_dab.wgsl` (32 bytes).
#[repr(C)]
#[derive(Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)]
struct DabParams {
    /// Bounding-box origin in canvas pixels (already clamped to the canvas).
    bbox_x0: i32,
    bbox_y0: i32,
    /// Bounding-box size in pixels; the dispatch grid covers this region.
    bbox_w: u32,
    bbox_h: u32,
    /// Number of valid entries in the dab storage buffer.
    num_dabs: u32,
    /// Full canvas dimensions in pixels.
    canvas_w: u32,
    canvas_h: u32,
    /// Pad the struct to a 16-byte multiple for uniform-buffer layout.
    _pad: u32,
}
impl GpuBrushEngine {
    /// Create the brush compute pipeline, bind-group layout, and an empty canvas
    /// cache.
    ///
    /// Compiles `brush_dab.wgsl` and builds the bind group layout it expects:
    /// dab storage buffer, params uniform, sampled source texture, write-only
    /// storage destination. Canvas pairs are created lazily via
    /// [`GpuBrushEngine::ensure_canvas`]. (This is infallible — wgpu reports an
    /// unsupported `Rgba8Unorm` storage binding through its validation-error
    /// machinery, not through a `Result` here.)
    pub fn new(device: &wgpu::Device) -> Self {
        let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
            label: Some("brush_dab_shader"),
            source: wgpu::ShaderSource::Wgsl(
                include_str!("panes/shaders/brush_dab.wgsl").into(),
            ),
        });
        let compute_bg_layout = device.create_bind_group_layout(
            &wgpu::BindGroupLayoutDescriptor {
                label: Some("brush_dab_bgl"),
                entries: &[
                    // 0: dab storage buffer (read-only)
                    wgpu::BindGroupLayoutEntry {
                        binding: 0,
                        visibility: wgpu::ShaderStages::COMPUTE,
                        ty: wgpu::BindingType::Buffer {
                            ty: wgpu::BufferBindingType::Storage { read_only: true },
                            has_dynamic_offset: false,
                            min_binding_size: None,
                        },
                        count: None,
                    },
                    // 1: params uniform
                    wgpu::BindGroupLayoutEntry {
                        binding: 1,
                        visibility: wgpu::ShaderStages::COMPUTE,
                        ty: wgpu::BindingType::Buffer {
                            ty: wgpu::BufferBindingType::Uniform,
                            has_dynamic_offset: false,
                            min_binding_size: None,
                        },
                        count: None,
                    },
                    // 2: canvas source (sampled)
                    wgpu::BindGroupLayoutEntry {
                        binding: 2,
                        visibility: wgpu::ShaderStages::COMPUTE,
                        ty: wgpu::BindingType::Texture {
                            sample_type: wgpu::TextureSampleType::Float { filterable: true },
                            view_dimension: wgpu::TextureViewDimension::D2,
                            multisampled: false,
                        },
                        count: None,
                    },
                    // 3: canvas destination (write-only storage)
                    wgpu::BindGroupLayoutEntry {
                        binding: 3,
                        visibility: wgpu::ShaderStages::COMPUTE,
                        ty: wgpu::BindingType::StorageTexture {
                            access: wgpu::StorageTextureAccess::WriteOnly,
                            format: wgpu::TextureFormat::Rgba8Unorm,
                            view_dimension: wgpu::TextureViewDimension::D2,
                        },
                        count: None,
                    },
                ],
            },
        );
        let pipeline_layout = device.create_pipeline_layout(
            &wgpu::PipelineLayoutDescriptor {
                label: Some("brush_dab_pl"),
                bind_group_layouts: &[&compute_bg_layout],
                push_constant_ranges: &[],
            },
        );
        let compute_pipeline = device.create_compute_pipeline(
            &wgpu::ComputePipelineDescriptor {
                label: Some("brush_dab_pipeline"),
                layout: Some(&pipeline_layout),
                module: &shader,
                entry_point: Some("main"),
                compilation_options: Default::default(),
                cache: None,
            },
        );
        Self {
            compute_pipeline,
            compute_bg_layout,
            canvases: HashMap::new(),
        }
    }
    /// Ensure a canvas pair exists for `keyframe_id` at the given dimensions.
    ///
    /// If the canvas exists but has different dimensions it is replaced
    /// (previous contents are NOT preserved across a resize).
    pub fn ensure_canvas(
        &mut self,
        device: &wgpu::Device,
        keyframe_id: Uuid,
        width: u32,
        height: u32,
    ) -> &mut CanvasPair {
        let needs_new = self.canvases.get(&keyframe_id)
            .map_or(true, |c| c.width != width || c.height != height);
        if needs_new {
            self.canvases.insert(keyframe_id, CanvasPair::new(device, width, height));
        }
        self.canvases.get_mut(&keyframe_id).unwrap()
    }
    /// Dispatch the brush compute shader for `dabs` onto the canvas of `keyframe_id`.
    ///
    /// * Pre-fills `dst` from `src` so untouched pixels are preserved.
    /// * Dispatches the compute shader.
    /// * Swaps src/dst so the just-written texture becomes the new source.
    ///
    /// `dab_bbox` is `(x0, y0, x1, y1)` — the union bounding box of all dabs.
    /// If `dabs` is empty or the bbox is invalid, does nothing.
    pub fn render_dabs(
        &mut self,
        device: &wgpu::Device,
        queue: &wgpu::Queue,
        keyframe_id: Uuid,
        dabs: &[GpuDab],
        bbox: (i32, i32, i32, i32),
        canvas_w: u32,
        canvas_h: u32,
    ) {
        // bbox.0 == i32::MAX is the "no dabs" sentinel from compute_dabs.
        if dabs.is_empty() || bbox.0 == i32::MAX { return; }
        let canvas = match self.canvases.get_mut(&keyframe_id) {
            Some(c) => c,
            None => return,
        };
        // Clamp bbox to canvas bounds
        let x0 = bbox.0.max(0) as u32;
        let y0 = bbox.1.max(0) as u32;
        let x1 = (bbox.2.min(canvas_w as i32 - 1)).max(0) as u32;
        let y1 = (bbox.3.min(canvas_h as i32 - 1)).max(0) as u32;
        if x1 < x0 || y1 < y0 { return; }
        let bbox_w = x1 - x0 + 1;
        let bbox_h = y1 - y0 + 1;
        // --- Pre-fill dst from src: copy the ENTIRE canvas so every pixel outside
        // the dab bounding box is preserved across the ping-pong swap.
        // Copying only the bbox would leave dst with data from two frames ago
        // in all other regions, causing missing dabs on alternating frames. ---
        let mut copy_encoder = device.create_command_encoder(
            &wgpu::CommandEncoderDescriptor { label: Some("canvas_copy_encoder") },
        );
        let full_extent = wgpu::Extent3d {
            width: canvas.width,
            height: canvas.height,
            depth_or_array_layers: 1,
        };
        copy_encoder.copy_texture_to_texture(
            wgpu::TexelCopyTextureInfo {
                texture: canvas.src(),
                mip_level: 0,
                origin: wgpu::Origin3d::ZERO,
                aspect: wgpu::TextureAspect::All,
            },
            wgpu::TexelCopyTextureInfo {
                texture: canvas.dst(),
                mip_level: 0,
                origin: wgpu::Origin3d::ZERO,
                aspect: wgpu::TextureAspect::All,
            },
            full_extent,
        );
        // NOTE(review): the copy and the compute pass below could share one
        // encoder/submission; kept as two submits to preserve behavior exactly.
        queue.submit(Some(copy_encoder.finish()));
        // --- Upload dab data and params ---
        let dab_bytes = bytemuck::cast_slice(dabs);
        let dab_buf = device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("dab_storage_buf"),
            size: dab_bytes.len() as u64,
            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
            mapped_at_creation: false,
        });
        queue.write_buffer(&dab_buf, 0, dab_bytes);
        let params = DabParams {
            bbox_x0: x0 as i32,
            bbox_y0: y0 as i32,
            bbox_w,
            bbox_h,
            num_dabs: dabs.len() as u32,
            canvas_w,
            canvas_h,
            _pad: 0,
        };
        let params_buf = device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("dab_params_buf"),
            size: std::mem::size_of::<DabParams>() as u64,
            usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
            mapped_at_creation: false,
        });
        queue.write_buffer(&params_buf, 0, bytemuck::bytes_of(&params));
        let bg = device.create_bind_group(&wgpu::BindGroupDescriptor {
            label: Some("brush_dab_bg"),
            layout: &self.compute_bg_layout,
            entries: &[
                wgpu::BindGroupEntry {
                    binding: 0,
                    resource: dab_buf.as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 1,
                    resource: params_buf.as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 2,
                    resource: wgpu::BindingResource::TextureView(canvas.src_view()),
                },
                wgpu::BindGroupEntry {
                    binding: 3,
                    resource: wgpu::BindingResource::TextureView(canvas.dst_view()),
                },
            ],
        });
        // --- Dispatch: one 8×8 workgroup tile per 8×8 pixels of the bbox ---
        let mut compute_encoder = device.create_command_encoder(
            &wgpu::CommandEncoderDescriptor { label: Some("brush_dab_encoder") },
        );
        {
            let mut pass = compute_encoder.begin_compute_pass(
                &wgpu::ComputePassDescriptor {
                    label: Some("brush_dab_pass"),
                    timestamp_writes: None,
                },
            );
            pass.set_pipeline(&self.compute_pipeline);
            pass.set_bind_group(0, &bg, &[]);
            let wg_x = bbox_w.div_ceil(8);
            let wg_y = bbox_h.div_ceil(8);
            pass.dispatch_workgroups(wg_x, wg_y, 1);
        }
        queue.submit(Some(compute_encoder.finish()));
        // Swap: dst is now the authoritative source
        canvas.swap();
    }
    /// Read the current canvas back to a CPU `Vec<u8>` (raw RGBA, row-major).
    ///
    /// **Blocks** until the GPU work is complete (`Maintain::Wait`).
    /// Should only be called at stroke end, not every frame.
    ///
    /// Returns `None` if no canvas exists for `keyframe_id` or the map fails.
    pub fn readback_canvas(
        &self,
        device: &wgpu::Device,
        queue: &wgpu::Queue,
        keyframe_id: Uuid,
    ) -> Option<Vec<u8>> {
        let canvas = self.canvases.get(&keyframe_id)?;
        let width = canvas.width;
        let height = canvas.height;
        // wgpu requires bytes_per_row to be a multiple of 256
        // (COPY_BYTES_PER_ROW_ALIGNMENT); round up with div_ceil, matching the
        // dispatch-size computation in render_dabs.
        let bytes_per_row_aligned =
            (width * 4).div_ceil(wgpu::COPY_BYTES_PER_ROW_ALIGNMENT)
                * wgpu::COPY_BYTES_PER_ROW_ALIGNMENT;
        let total_bytes = (bytes_per_row_aligned * height) as u64;
        let staging = device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("canvas_readback_buf"),
            size: total_bytes,
            usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
            mapped_at_creation: false,
        });
        let mut encoder = device.create_command_encoder(
            &wgpu::CommandEncoderDescriptor { label: Some("canvas_readback_encoder") },
        );
        encoder.copy_texture_to_buffer(
            wgpu::TexelCopyTextureInfo {
                texture: canvas.src(),
                mip_level: 0,
                origin: wgpu::Origin3d::ZERO,
                aspect: wgpu::TextureAspect::All,
            },
            wgpu::TexelCopyBufferInfo {
                buffer: &staging,
                layout: wgpu::TexelCopyBufferLayout {
                    offset: 0,
                    bytes_per_row: Some(bytes_per_row_aligned),
                    rows_per_image: Some(height),
                },
            },
            wgpu::Extent3d { width, height, depth_or_array_layers: 1 },
        );
        queue.submit(Some(encoder.finish()));
        // Block until complete
        let slice = staging.slice(..);
        let (tx, rx) = std::sync::mpsc::channel();
        slice.map_async(wgpu::MapMode::Read, move |r| { let _ = tx.send(r); });
        let _ = device.poll(wgpu::PollType::wait_indefinitely());
        if rx.recv().ok()?.is_err() { return None; }
        let mapped = slice.get_mapped_range();
        // De-stride: copy only `width * 4` bytes per row (drop alignment padding)
        let bytes_per_row_tight = (width * 4) as usize;
        let bytes_per_row_src = bytes_per_row_aligned as usize;
        let mut pixels = vec![0u8; (width * height * 4) as usize];
        for row in 0..height as usize {
            let src = &mapped[row * bytes_per_row_src .. row * bytes_per_row_src + bytes_per_row_tight];
            let dst = &mut pixels[row * bytes_per_row_tight .. (row + 1) * bytes_per_row_tight];
            dst.copy_from_slice(src);
        }
        drop(mapped);
        staging.unmap();
        // Encode linear premultiplied → sRGB-encoded premultiplied so the returned
        // bytes match what Vello expects (ImageAlphaType::AlphaPremultiplied with
        // sRGB channels). Alpha is left unchanged.
        for pixel in pixels.chunks_exact_mut(4) {
            pixel[0] = linear_to_srgb_byte(pixel[0]);
            pixel[1] = linear_to_srgb_byte(pixel[1]);
            pixel[2] = linear_to_srgb_byte(pixel[2]);
        }
        Some(pixels)
    }
    /// Remove the canvas pair for a keyframe (e.g. when the layer is deleted).
    pub fn remove_canvas(&mut self, keyframe_id: &Uuid) {
        self.canvases.remove(keyframe_id);
    }
}
// ---------------------------------------------------------------------------
// Canvas blit pipeline (renders canvas texture to layer sRGB buffer)
// ---------------------------------------------------------------------------
/// Bind group layout + pipeline for blitting a canvas texture (at document
/// resolution) into a layer render buffer (at viewport resolution), applying
/// the camera transform.
pub struct CanvasBlitPipeline {
    /// Fullscreen-strip render pipeline running `canvas_blit.wgsl`.
    pub pipeline: wgpu::RenderPipeline,
    /// Layout for the per-blit bind group (canvas texture, sampler, camera uniform).
    pub bg_layout: wgpu::BindGroupLayout,
    /// Shared bilinear clamp-to-edge sampler used for every blit.
    pub sampler: wgpu::Sampler,
}
/// Camera parameters uniform for canvas_blit.wgsl.
///
/// Must match `struct CameraParams` in the shader (32 bytes).
#[repr(C)]
#[derive(Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)]
pub struct CameraParams {
    /// Pan offset X in viewport pixels.
    pub pan_x: f32,
    /// Pan offset Y in viewport pixels.
    pub pan_y: f32,
    /// Document→viewport scale factor.
    pub zoom: f32,
    /// Document canvas width in pixels.
    pub canvas_w: f32,
    /// Document canvas height in pixels.
    pub canvas_h: f32,
    /// Render-target width in pixels.
    pub viewport_w: f32,
    /// Render-target height in pixels.
    pub viewport_h: f32,
    /// Pad to a 16-byte multiple for uniform-buffer layout.
    pub _pad: f32,
}
impl CanvasBlitPipeline {
    /// Build the blit render pipeline, its bind group layout, and the shared sampler.
    pub fn new(device: &wgpu::Device) -> Self {
        let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
            label: Some("canvas_blit_shader"),
            source: wgpu::ShaderSource::Wgsl(
                include_str!("panes/shaders/canvas_blit.wgsl").into(),
            ),
        });
        let bg_layout = device.create_bind_group_layout(
            &wgpu::BindGroupLayoutDescriptor {
                label: Some("canvas_blit_bgl"),
                entries: &[
                    // 0: canvas texture (sampled in the fragment shader)
                    wgpu::BindGroupLayoutEntry {
                        binding: 0,
                        visibility: wgpu::ShaderStages::FRAGMENT,
                        ty: wgpu::BindingType::Texture {
                            sample_type: wgpu::TextureSampleType::Float { filterable: true },
                            view_dimension: wgpu::TextureViewDimension::D2,
                            multisampled: false,
                        },
                        count: None,
                    },
                    // 1: sampler
                    wgpu::BindGroupLayoutEntry {
                        binding: 1,
                        visibility: wgpu::ShaderStages::FRAGMENT,
                        ty: wgpu::BindingType::Sampler(wgpu::SamplerBindingType::Filtering),
                        count: None,
                    },
                    // 2: camera uniform
                    wgpu::BindGroupLayoutEntry {
                        binding: 2,
                        visibility: wgpu::ShaderStages::FRAGMENT,
                        ty: wgpu::BindingType::Buffer {
                            ty: wgpu::BufferBindingType::Uniform,
                            has_dynamic_offset: false,
                            min_binding_size: None,
                        },
                        count: None,
                    },
                ],
            },
        );
        let pipeline_layout = device.create_pipeline_layout(
            &wgpu::PipelineLayoutDescriptor {
                label: Some("canvas_blit_pl"),
                bind_group_layouts: &[&bg_layout],
                push_constant_ranges: &[],
            },
        );
        let pipeline = device.create_render_pipeline(
            &wgpu::RenderPipelineDescriptor {
                label: Some("canvas_blit_pipeline"),
                layout: Some(&pipeline_layout),
                vertex: wgpu::VertexState {
                    module: &shader,
                    // vs_main generates a fullscreen strip; no vertex buffers needed.
                    entry_point: Some("vs_main"),
                    buffers: &[],
                    compilation_options: Default::default(),
                },
                fragment: Some(wgpu::FragmentState {
                    module: &shader,
                    entry_point: Some("fs_main"),
                    targets: &[Some(wgpu::ColorTargetState {
                        format: wgpu::TextureFormat::Rgba8Unorm,
                        blend: None, // canvas already stores premultiplied alpha
                        write_mask: wgpu::ColorWrites::ALL,
                    })],
                    compilation_options: Default::default(),
                }),
                primitive: wgpu::PrimitiveState {
                    topology: wgpu::PrimitiveTopology::TriangleStrip,
                    ..Default::default()
                },
                depth_stencil: None,
                multisample: wgpu::MultisampleState::default(),
                multiview: None,
                cache: None,
            },
        );
        let sampler = device.create_sampler(&wgpu::SamplerDescriptor {
            label: Some("canvas_blit_sampler"),
            address_mode_u: wgpu::AddressMode::ClampToEdge,
            address_mode_v: wgpu::AddressMode::ClampToEdge,
            address_mode_w: wgpu::AddressMode::ClampToEdge,
            mag_filter: wgpu::FilterMode::Linear,
            min_filter: wgpu::FilterMode::Linear,
            mipmap_filter: wgpu::FilterMode::Nearest,
            ..Default::default()
        });
        Self { pipeline, bg_layout, sampler }
    }
    /// Render the canvas texture into `target_view` (Rgba8Unorm) with the given camera.
    ///
    /// `target_view` is cleared to transparent before writing.
    pub fn blit(
        &self,
        device: &wgpu::Device,
        queue: &wgpu::Queue,
        canvas_view: &wgpu::TextureView,
        target_view: &wgpu::TextureView,
        camera: &CameraParams,
    ) {
        // Upload camera params (a fresh small uniform buffer per blit)
        let cam_buf = device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("canvas_blit_cam_buf"),
            size: std::mem::size_of::<CameraParams>() as u64,
            usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
            mapped_at_creation: false,
        });
        queue.write_buffer(&cam_buf, 0, bytemuck::bytes_of(camera));
        let bg = device.create_bind_group(&wgpu::BindGroupDescriptor {
            label: Some("canvas_blit_bg"),
            layout: &self.bg_layout,
            entries: &[
                wgpu::BindGroupEntry {
                    binding: 0,
                    resource: wgpu::BindingResource::TextureView(canvas_view),
                },
                wgpu::BindGroupEntry {
                    binding: 1,
                    resource: wgpu::BindingResource::Sampler(&self.sampler),
                },
                wgpu::BindGroupEntry {
                    binding: 2,
                    resource: cam_buf.as_entire_binding(),
                },
            ],
        });
        let mut encoder = device.create_command_encoder(
            &wgpu::CommandEncoderDescriptor { label: Some("canvas_blit_encoder") },
        );
        {
            let mut rp = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
                label: Some("canvas_blit_pass"),
                color_attachments: &[Some(wgpu::RenderPassColorAttachment {
                    view: target_view,
                    resolve_target: None,
                    depth_slice: None,
                    ops: wgpu::Operations {
                        // Clear so out-of-strip regions are transparent.
                        load: wgpu::LoadOp::Clear(wgpu::Color::TRANSPARENT),
                        store: wgpu::StoreOp::Store,
                    },
                })],
                depth_stencil_attachment: None,
                occlusion_query_set: None,
                timestamp_writes: None,
            });
            rp.set_pipeline(&self.pipeline);
            rp.set_bind_group(0, &bg, &[]);
            // Four vertices = two triangles of the oversized fullscreen strip.
            rp.draw(0..4, 0..1);
        }
        queue.submit(Some(encoder.finish()));
    }
}

View File

@ -23,6 +23,7 @@ use theme::{Theme, ThemeMode};
mod waveform_gpu;
mod cqt_gpu;
mod gpu_brush;
mod config;
use config::AppConfig;

View File

@ -0,0 +1,152 @@
// GPU brush dab compute shader.
//
// Renders all dabs for one stroke segment into the raster canvas.
// Uses a ping-pong pair: reads from `canvas_src` (texture_2d) via textureLoad,
// writes to `canvas_dst` (storage, write-only).
//
// `textureSample` is forbidden in compute shaders; bilinear filtering for the
// smudge tool is implemented manually using four textureLoad calls.
//
// Before this dispatch the caller copies `canvas_src` → `canvas_dst` so that pixels
// outside the union dab bounding box (not touched by the shader) remain unchanged.
//
// Dispatch: ceil(bbox_w / 8) × ceil(bbox_h / 8) × 1
// Each thread covers one pixel in the bounding-box-clamped canvas region.
// ---------------------------------------------------------------------------
// Data layout must match GpuDab in brush_engine.rs (64 bytes, 16-byte aligned).
// ---------------------------------------------------------------------------
struct GpuDab {
    // Field-for-field mirror of `GpuDab` in brush_engine.rs.
    x: f32, y: f32, radius: f32, hardness: f32,             // bytes 0-15
    opacity: f32, color_r: f32, color_g: f32, color_b: f32, // bytes 16-31
    color_a: f32, ndx: f32, ndy: f32, smudge_dist: f32,     // bytes 32-47
    blend_mode: u32, _pad0: u32, _pad1: u32, _pad2: u32,    // bytes 48-63
}
struct Params {
    // Must match `DabParams` in gpu_brush.rs (32 bytes).
    bbox_x0: i32,   // bbox origin in canvas pixels (clamped on the CPU side)
    bbox_y0: i32,
    bbox_w: u32,    // bbox size; dispatch covers ceil(w/8) x ceil(h/8) workgroups
    bbox_h: u32,
    num_dabs: u32,  // number of valid entries in `dabs`
    canvas_w: u32,  // full canvas dimensions in pixels
    canvas_h: u32,
    _pad: u32,      // pad to a 16-byte multiple
}
@group(0) @binding(0) var<storage, read> dabs: array<GpuDab>;  // all dabs for this segment
@group(0) @binding(1) var<uniform> params: Params;
@group(0) @binding(2) var canvas_src: texture_2d<f32>;         // read-only current canvas
@group(0) @binding(3) var canvas_dst: texture_storage_2d<rgba8unorm, write>; // write target
// ---------------------------------------------------------------------------
// Manual bilinear sample from canvas_src at sub-pixel coordinates (px, py).
// Out-of-bounds texels clamp to the canvas edge (replicates ClampToEdge).
// textureSample is forbidden in compute shaders; we use four textureLoad calls.
// ---------------------------------------------------------------------------
// Manually bilinear-filter canvas_src at the sub-pixel point (px, py).
// Texel centres sit at half-integer coordinates, hence the -0.5 shift before
// taking floor/fract. Out-of-range texels clamp to the edge (ClampToEdge
// behaviour); textureSample is unavailable in compute, so four textureLoad
// calls are blended by hand.
fn bilinear_sample(px: f32, py: f32) -> vec4<f32> {
    let max_x = i32(params.canvas_w) - 1;
    let max_y = i32(params.canvas_h) - 1;
    // Shift so the top-left contributing texel and the blend weights fall out
    // of floor/fract directly.
    let sx = px - 0.5;
    let sy = py - 0.5;
    let bx = i32(floor(sx));
    let by = i32(floor(sy));
    let wx = fract(sx);
    let wy = fract(sy);
    let xa = clamp(bx, 0, max_x);
    let xb = clamp(bx + 1, 0, max_x);
    let ya = clamp(by, 0, max_y);
    let yb = clamp(by + 1, 0, max_y);
    let top = mix(textureLoad(canvas_src, vec2<i32>(xa, ya), 0),
                  textureLoad(canvas_src, vec2<i32>(xb, ya), 0), wx);
    let bottom = mix(textureLoad(canvas_src, vec2<i32>(xa, yb), 0),
                     textureLoad(canvas_src, vec2<i32>(xb, yb), 0), wx);
    return mix(top, bottom, wy);
}
// ---------------------------------------------------------------------------
// Apply a single dab to `current` and return the updated colour.
// ---------------------------------------------------------------------------
fn apply_dab(current: vec4<f32>, dab: GpuDab, px: i32, py: i32) -> vec4<f32> {
    // Squared distance from the pixel centre to the dab centre, normalized so
    // rr == 1 exactly at the dab radius.
    let dx = f32(px) + 0.5 - dab.x;
    let dy = f32(py) + 0.5 - dab.y;
    let rr = (dx * dx + dy * dy) / (dab.radius * dab.radius);
    if rr > 1.0 { return current; }
    // Two-segment linear falloff (identical to libmypaint calculate_opa):
    // weight 1 at the centre, `h` at rr == h, 0 at rr == 1.
    let h = clamp(dab.hardness, 0.001, 1.0);
    var opa_weight: f32;
    if rr <= h {
        opa_weight = 1.0 + rr * (-(1.0 / h - 1.0));
    } else {
        opa_weight = h / (1.0 - h) + rr * (-h / (1.0 - h));
    }
    opa_weight = clamp(opa_weight, 0.0, 1.0);
    if dab.blend_mode == 0u {
        // Normal: "over" operator on premultiplied RGBA
        let dab_a = opa_weight * dab.opacity * dab.color_a;
        if dab_a <= 0.0 { return current; }
        let ba = 1.0 - dab_a;
        return vec4<f32>(
            dab_a * dab.color_r + ba * current.r,
            dab_a * dab.color_g + ba * current.g,
            dab_a * dab.color_b + ba * current.b,
            dab_a + ba * current.a,
        );
    } else if dab.blend_mode == 1u {
        // Erase: multiplicative alpha reduction; colour channels rescaled to
        // keep the premultiplied invariant. Note select() evaluates both
        // operands, so new_a / current.a may divide by ~0 when current.a is
        // tiny — harmless for WGSL floats (no trap); the 0.0 arm is selected.
        let dab_a = opa_weight * dab.opacity * dab.color_a;
        if dab_a <= 0.0 { return current; }
        let new_a = current.a * (1.0 - dab_a);
        let scale = select(0.0, new_a / current.a, current.a > 1e-6);
        return vec4<f32>(current.r * scale, current.g * scale, current.b * scale, new_a);
    } else {
        // Smudge: directional warp sample from position behind the stroke direction
        let alpha = opa_weight * dab.opacity;
        if alpha <= 0.0 { return current; }
        let src_x = f32(px) + 0.5 - dab.ndx * dab.smudge_dist;
        let src_y = f32(py) + 0.5 - dab.ndy * dab.smudge_dist;
        let src = bilinear_sample(src_x, src_y);
        // Blend the picked-up colour over the current pixel.
        let da = 1.0 - alpha;
        return vec4<f32>(
            alpha * src.r + da * current.r,
            alpha * src.g + da * current.g,
            alpha * src.b + da * current.b,
            alpha * src.a + da * current.a,
        );
    }
}
// ---------------------------------------------------------------------------
// Main entry point
// ---------------------------------------------------------------------------
@compute @workgroup_size(8, 8)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    // One thread per pixel of the dab bounding box; gid is relative to bbox origin.
    if gid.x >= params.bbox_w || gid.y >= params.bbox_h { return; }
    let px = i32(gid.x) + params.bbox_x0;
    let py = i32(gid.y) + params.bbox_y0;
    // Bounds check within the canvas (bbox may extend past canvas edges)
    if px < 0 || py < 0 || u32(px) >= params.canvas_w || u32(py) >= params.canvas_h { return; }
    // Read current pixel from source (canvas_dst was pre-filled from canvas_src
    // by the caller, but we read from canvas_src to ensure consistency)
    var current = textureLoad(canvas_src, vec2<i32>(px, py), 0);
    // Fold every dab into this pixel in stroke order (sequential in the thread,
    // no races between threads since each thread owns a unique output pixel)
    for (var i = 0u; i < params.num_dabs; i++) {
        current = apply_dab(current, dabs[i], px, py);
    }
    textureStore(canvas_dst, vec2<i32>(px, py), current);
}

View File

@ -0,0 +1,83 @@
// Canvas blit shader.
//
// Renders a GPU raster canvas (at document resolution) into the layer's sRGB
// render buffer (at viewport resolution), applying the camera transform
// (pan + zoom) to map document-space pixels to viewport-space pixels.
//
// Any viewport pixel whose corresponding document coordinate falls outside
// [0, canvas_w) × [0, canvas_h) outputs transparent black.
struct CameraParams {
    // Must match `CameraParams` in gpu_brush.rs (32 bytes).
    pan_x: f32,      // pan offset X in viewport pixels
    pan_y: f32,      // pan offset Y in viewport pixels
    zoom: f32,       // document-to-viewport scale factor
    canvas_w: f32,   // document canvas size in pixels
    canvas_h: f32,
    viewport_w: f32, // render-target size in pixels
    viewport_h: f32,
    _pad: f32,       // pad to a 16-byte multiple
}
@group(0) @binding(0) var canvas_tex: texture_2d<f32>;  // document-resolution canvas
@group(0) @binding(1) var canvas_sampler: sampler;      // bilinear, clamp-to-edge
@group(0) @binding(2) var<uniform> camera: CameraParams;
struct VertexOutput {
    @builtin(position) position: vec4<f32>,
    @location(0) uv: vec2<f32>, // spans [0,1] over the on-screen area
}
// Generates a fullscreen triangle strip (same pattern as blit.wgsl)
@vertex
fn vs_main(@builtin(vertex_index) vertex_index: u32) -> VertexOutput {
    // Oversized-strip trick: four vertices whose clip positions run past the
    // screen edges; uv interpolates to exactly [0,1] over the visible area.
    let u = 2.0 * f32(vertex_index & 1u);
    let v = f32(vertex_index & 2u);
    var out: VertexOutput;
    out.uv = vec2<f32>(u, v);
    out.position = vec4<f32>(u * 2.0 - 1.0, 1.0 - v * 2.0, 0.0, 1.0);
    return out;
}
// Linear → sRGB encoding for a single channel (IEC 61966-2-1 piecewise curve).
// Applied to premultiplied linear values so the downstream srgb_to_linear
// pass round-trips correctly without darkening semi-transparent edges.
fn linear_to_srgb(c: f32) -> f32 {
    // Linear toe segment near black; also absorbs any negative inputs.
    if c <= 0.0031308 {
        return c * 12.92;
    }
    // Gamma segment; max() guards pow() against negatives (unreachable here,
    // kept for parity with the reference formula).
    return 1.055 * pow(max(c, 0.0), 1.0 / 2.4) - 0.055;
}
@fragment
fn fs_main(in: VertexOutput) -> @location(0) vec4<f32> {
    // Map viewport UV [0,1] → viewport pixel
    let vp = in.uv * vec2<f32>(camera.viewport_w, camera.viewport_h);
    // Map viewport pixel → document pixel (inverse camera transform)
    let doc = (vp - vec2<f32>(camera.pan_x, camera.pan_y)) / camera.zoom;
    // Map document pixel → canvas UV [0,1]
    let canvas_uv = doc / vec2<f32>(camera.canvas_w, camera.canvas_h);
    // Out-of-bounds → transparent black
    if canvas_uv.x < 0.0 || canvas_uv.x > 1.0
        || canvas_uv.y < 0.0 || canvas_uv.y > 1.0 {
        return vec4<f32>(0.0, 0.0, 0.0, 0.0);
    }
    // The canvas stores premultiplied linear RGBA.
    // The srgb_to_linear converter downstream applies the sRGB gamma formula
    // channel-by-channel without alpha awareness. To make the round-trip
    // transparent we pre-encode with linear_to_srgb here:
    //   canvas (linear premul) → sRGB buffer → srgb_to_linear → linear premul
    // Without this, srgb_to_linear darkens small premultiplied values
    // (e.g. white at 10% opacity: 0.1 → 0.01), producing a grey halo.
    //
    // textureSampleLevel, not textureSample: implicit-derivative sampling is
    // invalid after the non-uniform early return above (WGSL uniformity
    // analysis rejects it), and the canvas has a single mip level anyway.
    let c = textureSampleLevel(canvas_tex, canvas_sampler, canvas_uv, 0.0);
    return vec4<f32>(
        linear_to_srgb(c.r),
        linear_to_srgb(c.g),
        linear_to_srgb(c.b),
        c.a,
    );
}

View File

@ -36,6 +36,10 @@ struct SharedVelloResources {
effect_processor: Mutex<EffectProcessor>,
/// sRGB to linear color converter (for Vello output)
srgb_to_linear: SrgbToLinearConverter,
/// GPU raster brush engine (compute pipeline + canvas texture cache)
gpu_brush: Mutex<crate::gpu_brush::GpuBrushEngine>,
/// Canvas blit pipeline (renders GPU canvas to layer sRGB buffer)
canvas_blit: crate::gpu_brush::CanvasBlitPipeline,
}
/// Per-instance Vello resources (created for each Stage pane)
@ -206,7 +210,11 @@ impl SharedVelloResources {
// Initialize sRGB to linear converter for Vello output
let srgb_to_linear = SrgbToLinearConverter::new(device);
println!("✅ Vello shared resources initialized (renderer, shaders, HDR compositor, effect processor, and color converter)");
// Initialize GPU raster brush engine
let gpu_brush = crate::gpu_brush::GpuBrushEngine::new(device);
let canvas_blit = crate::gpu_brush::CanvasBlitPipeline::new(device);
println!("✅ Vello shared resources initialized (renderer, shaders, HDR compositor, effect processor, color converter, and GPU brush engine)");
Ok(Self {
renderer: Arc::new(Mutex::new(renderer)),
@ -220,6 +228,8 @@ impl SharedVelloResources {
compositor,
effect_processor: Mutex::new(effect_processor),
srgb_to_linear,
gpu_brush: Mutex::new(gpu_brush),
canvas_blit,
})
}
}
@ -390,6 +400,15 @@ struct VelloRenderContext {
mouse_world_pos: Option<vello::kurbo::Point>,
/// Latest webcam frame for live preview (if any camera is active)
webcam_frame: Option<lightningbeam_core::webcam::CaptureFrame>,
/// GPU brush dabs to dispatch in this frame's prepare() call.
pending_raster_dabs: Option<PendingRasterDabs>,
/// Instance ID (for storing readback results in the global map).
instance_id_for_readback: u64,
/// The (layer_id, keyframe_id) of the raster layer with a live GPU canvas.
/// Present for the entire stroke duration, not just frames with new dabs.
painting_canvas: Option<(uuid::Uuid, uuid::Uuid)>,
/// GPU canvas keyframe to remove at the top of this prepare() call.
pending_canvas_removal: Option<uuid::Uuid>,
}
/// Callback for Vello rendering within egui
@ -470,6 +489,77 @@ impl egui_wgpu::CallbackTrait for VelloCallback {
// This means we only need 1 layer buffer at a time (plus the HDR accumulator)
instance_resources.ensure_hdr_texture(device, &shared, width, height);
// --- Deferred GPU canvas removal ---
// The previous frame's render_content consumed a readback result and updated
// raw_pixels. Now that the Vello scene is current we can safely drop the
// GPU canvas; painting_canvas was already cleared so the compositor will use
// the Vello scene from here on.
if let Some(kf_id) = self.ctx.pending_canvas_removal {
if let Ok(mut gpu_brush) = shared.gpu_brush.lock() {
gpu_brush.remove_canvas(&kf_id);
}
}
// --- GPU brush dispatch ---
// Dispatch the compute shader for any pending raster dabs from this frame's
// input event. Must happen before compositing so the updated canvas texture
// is sampled correctly when the layer is blitted.
if let Some(ref pending) = self.ctx.pending_raster_dabs {
if let Ok(mut gpu_brush) = shared.gpu_brush.lock() {
// Ensure the canvas pair exists (creates it if missing or wrong size)
gpu_brush.ensure_canvas(
device,
pending.keyframe_id,
pending.canvas_width,
pending.canvas_height,
);
// On stroke start, upload the pre-stroke pixel data to both textures
if let Some(ref pixels) = pending.initial_pixels {
if let Some(canvas) = gpu_brush.canvases.get(&pending.keyframe_id) {
canvas.upload(queue, pixels);
}
}
// Dispatch the compute shader for this frame's dabs
if !pending.dabs.is_empty() {
gpu_brush.render_dabs(
device,
queue,
pending.keyframe_id,
&pending.dabs,
pending.dab_bbox,
pending.canvas_width,
pending.canvas_height,
);
}
// On stroke end, read back the finished canvas and store it so
// the next ui() call can create the undo action.
if pending.wants_final_readback {
if let Some(pixels) = gpu_brush.readback_canvas(
device,
queue,
pending.keyframe_id,
) {
let results = RASTER_READBACK_RESULTS.get_or_init(|| {
Arc::new(Mutex::new(std::collections::HashMap::new()))
});
if let Ok(mut map) = results.lock() {
map.insert(self.ctx.instance_id_for_readback, RasterReadbackResult {
layer_id: pending.layer_id,
time: pending.time,
canvas_width: pending.canvas_width,
canvas_height: pending.canvas_height,
pixels,
});
}
// Canvas is kept alive: the compositor will still blit it
// this frame (painting_canvas is still Some). render_content
// will clear painting_canvas and set pending_canvas_removal,
// so the texture is freed at the top of the next prepare().
}
}
}
}
let mut image_cache = shared.image_cache.lock().unwrap();
let composite_result = lightningbeam_core::renderer::render_document_for_compositing(
@ -558,7 +648,14 @@ impl egui_wgpu::CallbackTrait for VelloCallback {
// Now render and composite each layer incrementally
for rendered_layer in &composite_result.layers {
if !rendered_layer.has_content {
// Check if this raster layer has a live GPU canvas that should be
// blitted every frame, even when no new dabs arrived this frame.
// `painting_canvas` persists for the entire stroke duration.
let gpu_canvas_kf: Option<uuid::Uuid> = self.ctx.painting_canvas
.filter(|(layer_id, _)| *layer_id == rendered_layer.layer_id)
.map(|(_, kf_id)| kf_id);
if !rendered_layer.has_content && gpu_canvas_kf.is_none() {
continue;
}
@ -573,9 +670,42 @@ impl egui_wgpu::CallbackTrait for VelloCallback {
buffer_pool.get_view(hdr_layer_handle),
&instance_resources.hdr_texture_view,
) {
// Render layer scene to sRGB buffer
if let Ok(mut renderer) = shared.renderer.lock() {
renderer.render_to_texture(device, queue, &rendered_layer.scene, srgb_view, &layer_render_params).ok();
// GPU canvas blit path: if a live GPU canvas exists for this
// raster layer, sample it directly instead of rendering the Vello
// scene (which lags until raw_pixels is updated after readback).
let used_gpu_canvas = if let Some(kf_id) = gpu_canvas_kf {
let mut used = false;
if let Ok(gpu_brush) = shared.gpu_brush.lock() {
if let Some(canvas) = gpu_brush.canvases.get(&kf_id) {
let camera = crate::gpu_brush::CameraParams {
pan_x: self.ctx.pan_offset.x,
pan_y: self.ctx.pan_offset.y,
zoom: self.ctx.zoom,
canvas_w: canvas.width as f32,
canvas_h: canvas.height as f32,
viewport_w: width as f32,
viewport_h: height as f32,
_pad: 0.0,
};
shared.canvas_blit.blit(
device, queue,
canvas.src_view(),
srgb_view,
&camera,
);
used = true;
}
}
used
} else {
false
};
if !used_gpu_canvas {
// Render layer scene to sRGB buffer
if let Ok(mut renderer) = shared.renderer.lock() {
renderer.render_to_texture(device, queue, &rendered_layer.scene, srgb_view, &layer_render_params).ok();
}
}
// Convert sRGB to linear HDR
@ -2140,6 +2270,20 @@ pub struct StagePane {
raster_stroke_state: Option<(uuid::Uuid, f64, lightningbeam_core::brush_engine::StrokeState, Vec<u8>)>,
// Last raster stroke point (for incremental segment painting)
raster_last_point: Option<lightningbeam_core::raster_layer::StrokePoint>,
/// GPU dabs computed during this frame's drag event — consumed by prepare().
pending_raster_dabs: Option<PendingRasterDabs>,
/// Undo snapshot info captured at mouse-down; claimed when readback completes.
/// (layer_id, time, canvas_w, canvas_h, buffer_before)
pending_undo_before: Option<(uuid::Uuid, f64, u32, u32, Vec<u8>)>,
/// The (layer_id, keyframe_id) of the raster layer whose GPU canvas is live.
/// Set on mouse-down, cleared when the readback result is consumed.
/// Used every frame to blit the GPU canvas instead of the stale Vello scene.
painting_canvas: Option<(uuid::Uuid, uuid::Uuid)>,
/// Keyframe UUID whose GPU canvas should be removed at the start of the next
/// prepare() call. Set by render_content after consuming the readback result
/// and updating raw_pixels, so the canvas lives one extra composite frame to
/// avoid a flash of the stale Vello scene.
pending_canvas_removal: Option<uuid::Uuid>,
/// Synthetic drag/click override for test mode replay (debug builds only)
#[cfg(debug_assertions)]
replay_override: Option<ReplayDragState>,
@ -2171,6 +2315,46 @@ static INSTANCE_COUNTER: std::sync::atomic::AtomicU64 = std::sync::atomic::Atomi
// Global storage for eyedropper results (instance_id -> (color, color_mode))
static EYEDROPPER_RESULTS: OnceLock<Arc<Mutex<std::collections::HashMap<u64, (egui::Color32, super::ColorMode)>>>> = OnceLock::new();
/// Pending GPU dabs for a single drag event.
///
/// Created by the event handler (`handle_raster_stroke_tool`) and consumed once
/// by `VelloCallback::prepare()`.
struct PendingRasterDabs {
    /// Keyframe UUID — indexes the canvas texture pair in `GpuBrushEngine`.
    keyframe_id: uuid::Uuid,
    /// Layer UUID — identifies the layer in the undo readback result.
    layer_id: uuid::Uuid,
    /// Playback time of the keyframe (seconds).
    time: f64,
    /// Canvas width in pixels.
    canvas_width: u32,
    /// Canvas height in pixels.
    canvas_height: u32,
    /// Raw RGBA pixel data to upload to the canvas texture on the very first dab of
    /// a stroke (i.e., when the stroke starts). `None` on subsequent drag events.
    initial_pixels: Option<Vec<u8>>,
    /// Dab list computed by `BrushEngine::compute_dabs()`; may be empty for a
    /// "readback only" entry created on mouse-up with no new movement.
    dabs: Vec<lightningbeam_core::brush_engine::GpuDab>,
    /// Union bounding box of `dabs` as `(x0, y0, x1, y1)` in canvas pixel
    /// coordinates. When `dabs` is empty this is the inverted sentinel
    /// `(i32::MAX, i32::MAX, i32::MIN, i32::MIN)`.
    dab_bbox: (i32, i32, i32, i32),
    /// When `true`, perform a full canvas readback after dispatching and store
    /// the result in `RASTER_READBACK_RESULTS` so the next frame can create
    /// the undo action.
    wants_final_readback: bool,
}
/// Result stored by `prepare()` after a stroke-end readback.
/// Result stored by `prepare()` after a stroke-end readback.
///
/// Consumed by `render_content` on a later frame to build the
/// `RasterStrokeAction` undo entry (keyed by instance id in
/// `RASTER_READBACK_RESULTS`).
struct RasterReadbackResult {
    /// Layer the completed stroke belongs to.
    layer_id: uuid::Uuid,
    /// Playback time of the keyframe that was painted.
    time: f64,
    /// Canvas width in pixels (matches `pixels` layout).
    canvas_width: u32,
    /// Canvas height in pixels (matches `pixels` layout).
    canvas_height: u32,
    /// Raw RGBA pixels from the completed stroke.
    pixels: Vec<u8>,
}
// Global storage for raster readback results (instance_id -> result)
static RASTER_READBACK_RESULTS: OnceLock<Arc<Mutex<std::collections::HashMap<u64, RasterReadbackResult>>>> = OnceLock::new();
/// Cached 2x2 stipple image brush for selection overlay.
/// Pattern: [[black, transparent], [transparent, white]]
/// Tiled with nearest-neighbor sampling so each pixel stays crisp.
@ -2217,6 +2401,10 @@ impl StagePane {
current_snap: None,
raster_stroke_state: None,
raster_last_point: None,
pending_raster_dabs: None,
pending_undo_before: None,
painting_canvas: None,
pending_canvas_removal: None,
#[cfg(debug_assertions)]
replay_override: None,
}
@ -4183,9 +4371,12 @@ impl StagePane {
/// Handle raster stroke tool input (Draw/Erase/Smudge on a raster layer).
///
/// Paints incrementally into `document_mut()` on every drag event so the
/// result is visible immediately. On mouse-up the pre/post raw-pixel
/// buffers are wrapped in a `RasterStrokeAction` for undo/redo.
/// Computes GPU dab lists for each drag event and stores them in
/// `self.pending_raster_dabs` for dispatch by `VelloCallback::prepare()`.
///
/// The actual pixel rendering happens on the GPU (compute shader). The CPU
/// only does dab placement arithmetic (cheap). On stroke end a readback is
/// requested so the undo system can capture the final pixel state.
fn handle_raster_stroke_tool(
&mut self,
ui: &mut egui::Ui,
@ -4197,7 +4388,7 @@ impl StagePane {
use lightningbeam_core::tool::ToolState;
use lightningbeam_core::layer::AnyLayer;
use lightningbeam_core::raster_layer::StrokePoint;
use lightningbeam_core::brush_engine::{BrushEngine, StrokeState, image_from_raw};
use lightningbeam_core::brush_engine::{BrushEngine, StrokeState};
use lightningbeam_core::raster_layer::StrokeRecord;
let active_layer_id = match *shared.active_layer_id {
@ -4209,9 +4400,7 @@ impl StagePane {
let is_raster = shared.action_executor.document()
.get_layer(&active_layer_id)
.map_or(false, |l| matches!(l, AnyLayer::Raster(_)));
if !is_raster {
return;
}
if !is_raster { return; }
let brush = {
use lightningbeam_core::brush_settings::BrushSettings;
@ -4235,73 +4424,143 @@ impl StagePane {
[c.r() as f32 / 255.0, c.g() as f32 / 255.0, c.b() as f32 / 255.0, c.a() as f32 / 255.0]
};
// Mouse down: snapshot buffer_before, init stroke state, paint first dab
// ----------------------------------------------------------------
// Mouse down: capture buffer_before, start stroke, compute first dab
// ----------------------------------------------------------------
if self.rsp_drag_started(response) || self.rsp_clicked(response) {
let (doc_width, doc_height, buffer_before) = {
let (doc_width, doc_height) = {
let doc = shared.action_executor.document();
let buf = doc.get_layer(&active_layer_id)
.and_then(|l| if let AnyLayer::Raster(rl) = l {
rl.keyframe_at(*shared.playback_time).map(|kf| kf.raw_pixels.clone())
} else { None })
.unwrap_or_default();
(doc.width as u32, doc.height as u32, buf)
(doc.width as u32, doc.height as u32)
};
// Start a fresh stroke state; MAX distance ensures first point gets a dab
let mut stroke_state = StrokeState::new();
stroke_state.distance_since_last_dab = f32::MAX;
let first_pt = StrokePoint { x: world_pos.x, y: world_pos.y, pressure: 1.0, tilt_x: 0.0, tilt_y: 0.0, timestamp: 0.0 };
// Paint the first dab directly into the document
// Ensure the keyframe exists BEFORE reading its ID, so we always get
// the real UUID. Previously we read the ID first and fell back to a
// randomly-generated UUID when no keyframe existed; that fake UUID was
// stored in painting_canvas but subsequent drag frames used the real UUID
// from keyframe_at(), causing the GPU canvas to be a different object from
// the one being composited.
{
let doc = shared.action_executor.document_mut();
if let Some(AnyLayer::Raster(rl)) = doc.get_layer_mut(&active_layer_id) {
let kf = rl.ensure_keyframe_at(*shared.playback_time, doc_width, doc_height);
let mut img = image_from_raw(std::mem::take(&mut kf.raw_pixels), kf.width, kf.height);
let single = StrokeRecord {
brush_settings: brush.clone(),
color,
blend_mode,
points: vec![first_pt.clone()],
};
BrushEngine::apply_stroke_with_state(&mut img, &single, &mut stroke_state);
kf.raw_pixels = img.into_raw();
rl.ensure_keyframe_at(*shared.playback_time, doc_width, doc_height);
}
}
self.raster_stroke_state = Some((active_layer_id, *shared.playback_time, stroke_state, buffer_before));
// Now read the guaranteed-to-exist keyframe to get the real UUID.
let (keyframe_id, canvas_width, canvas_height, buffer_before, initial_pixels) = {
let doc = shared.action_executor.document();
if let Some(AnyLayer::Raster(rl)) = doc.get_layer(&active_layer_id) {
if let Some(kf) = rl.keyframe_at(*shared.playback_time) {
let raw = kf.raw_pixels.clone();
let init = if raw.is_empty() {
vec![0u8; (kf.width * kf.height * 4) as usize]
} else {
raw.clone()
};
(kf.id, kf.width, kf.height, raw, init)
} else {
return; // shouldn't happen after ensure_keyframe_at
}
} else {
return;
}
};
// Compute the first dab (single-point tap)
let mut stroke_state = StrokeState::new();
stroke_state.distance_since_last_dab = f32::MAX;
let first_pt = StrokePoint {
x: world_pos.x, y: world_pos.y,
pressure: 1.0, tilt_x: 0.0, tilt_y: 0.0, timestamp: 0.0,
};
let single = StrokeRecord {
brush_settings: brush.clone(),
color,
blend_mode,
points: vec![first_pt.clone()],
};
let (dabs, dab_bbox) = BrushEngine::compute_dabs(&single, &mut stroke_state);
self.painting_canvas = Some((active_layer_id, keyframe_id));
self.pending_undo_before = Some((
active_layer_id,
*shared.playback_time,
canvas_width,
canvas_height,
buffer_before,
));
self.pending_raster_dabs = Some(PendingRasterDabs {
keyframe_id,
layer_id: active_layer_id,
time: *shared.playback_time,
canvas_width,
canvas_height,
initial_pixels: Some(initial_pixels),
dabs,
dab_bbox,
wants_final_readback: false,
});
self.raster_stroke_state = Some((
active_layer_id,
*shared.playback_time,
stroke_state,
Vec::new(), // buffer_before now lives in pending_undo_before
));
self.raster_last_point = Some(first_pt);
*shared.tool_state = ToolState::DrawingRasterStroke { points: vec![] };
}
// Mouse drag: paint each new segment immediately
// ----------------------------------------------------------------
// Mouse drag: compute dabs for this segment
// ----------------------------------------------------------------
if self.rsp_dragged(response) {
if let Some((layer_id, time, ref mut stroke_state, _)) = self.raster_stroke_state {
if let Some(prev_pt) = self.raster_last_point.take() {
let curr_pt = StrokePoint { x: world_pos.x, y: world_pos.y, pressure: 1.0, tilt_x: 0.0, tilt_y: 0.0, timestamp: 0.0 };
let curr_pt = StrokePoint {
x: world_pos.x, y: world_pos.y,
pressure: 1.0, tilt_x: 0.0, tilt_y: 0.0, timestamp: 0.0,
};
// Skip if not moved enough
const MIN_DIST_SQ: f32 = 1.5 * 1.5;
let dx = curr_pt.x - prev_pt.x;
let dy = curr_pt.y - prev_pt.y;
let moved_pt = if dx * dx + dy * dy >= MIN_DIST_SQ { curr_pt.clone() } else { prev_pt.clone() };
let moved_pt = if dx * dx + dy * dy >= MIN_DIST_SQ {
curr_pt.clone()
} else {
prev_pt.clone()
};
if dx * dx + dy * dy >= MIN_DIST_SQ {
let doc = shared.action_executor.document_mut();
if let Some(AnyLayer::Raster(rl)) = doc.get_layer_mut(&layer_id) {
if let Some(kf) = rl.keyframe_at_mut(time) {
let mut img = image_from_raw(std::mem::take(&mut kf.raw_pixels), kf.width, kf.height);
let seg = StrokeRecord {
brush_settings: brush.clone(),
color,
blend_mode,
points: vec![prev_pt, curr_pt],
};
BrushEngine::apply_stroke_with_state(&mut img, &seg, stroke_state);
kf.raw_pixels = img.into_raw();
}
}
// Get keyframe info (needed for canvas dimensions)
let (kf_id, kw, kh) = {
let doc = shared.action_executor.document();
if let Some(AnyLayer::Raster(rl)) = doc.get_layer(&layer_id) {
if let Some(kf) = rl.keyframe_at(time) {
(kf.id, kf.width, kf.height)
} else { self.raster_last_point = Some(moved_pt); return; }
} else { self.raster_last_point = Some(moved_pt); return; }
};
let seg = StrokeRecord {
brush_settings: brush.clone(),
color,
blend_mode,
points: vec![prev_pt, curr_pt],
};
let (dabs, dab_bbox) = BrushEngine::compute_dabs(&seg, stroke_state);
self.pending_raster_dabs = Some(PendingRasterDabs {
keyframe_id: kf_id,
layer_id,
time,
canvas_width: kw,
canvas_height: kh,
initial_pixels: None,
dabs,
dab_bbox,
wants_final_readback: false,
});
}
self.raster_last_point = Some(moved_pt);
@ -4309,37 +4568,44 @@ impl StagePane {
}
}
// Mouse up: wrap the pre/post buffers in an undo action
// ----------------------------------------------------------------
// Mouse up: request a full-canvas readback for the undo snapshot
// ----------------------------------------------------------------
if self.rsp_drag_stopped(response)
|| (self.rsp_any_released(ui) && matches!(*shared.tool_state, ToolState::DrawingRasterStroke { .. }))
{
if let Some((layer_id, time, _, buffer_before)) = self.raster_stroke_state.take() {
use lightningbeam_core::actions::RasterStrokeAction;
let (doc_width, doc_height, buffer_after) = {
let doc = shared.action_executor.document();
let buf = doc.get_layer(&layer_id)
.and_then(|l| if let AnyLayer::Raster(rl) = l {
rl.keyframe_at(time).map(|kf| kf.raw_pixels.clone())
} else { None })
.unwrap_or_default();
(doc.width as u32, doc.height as u32, buf)
};
let action = RasterStrokeAction::new(
layer_id,
time,
buffer_before,
buffer_after,
doc_width,
doc_height,
);
// execute is a no-op for the first call (pixels already in document),
// but registers the action in the undo stack
let _ = shared.action_executor.execute(Box::new(action));
}
self.raster_stroke_state = None;
self.raster_last_point = None;
*shared.tool_state = ToolState::Idle;
// Mark the pending dabs (if any this frame) for final readback.
// If there are no pending dabs this frame, create a "readback only" entry.
if let Some(ref mut pending) = self.pending_raster_dabs {
pending.wants_final_readback = true;
} else if let Some((ub_layer, ub_time, ub_cw, ub_ch, _)) =
self.pending_undo_before.as_ref()
{
let (ub_layer, ub_time, ub_cw, ub_ch) = (*ub_layer, *ub_time, *ub_cw, *ub_ch);
// Get keyframe_id for the canvas texture lookup
let kf_id = shared.action_executor.document()
.get_layer(&ub_layer)
.and_then(|l| if let AnyLayer::Raster(rl) = l {
rl.keyframe_at(ub_time).map(|kf| kf.id)
} else { None });
if let Some(kf_id) = kf_id {
self.pending_raster_dabs = Some(PendingRasterDabs {
keyframe_id: kf_id,
layer_id: ub_layer,
time: ub_time,
canvas_width: ub_cw,
canvas_height: ub_ch,
initial_pixels: None,
dabs: Vec::new(),
dab_bbox: (i32::MAX, i32::MAX, i32::MIN, i32::MIN),
wants_final_readback: true,
});
}
}
}
}
@ -6830,6 +7096,35 @@ impl PaneRenderer for StagePane {
self.pan_offset = viewport_center - canvas_center;
}
// Check for completed raster stroke readbacks and create undo actions
if let Ok(mut results) = RASTER_READBACK_RESULTS
.get_or_init(|| Arc::new(Mutex::new(std::collections::HashMap::new())))
.lock() {
if let Some(readback) = results.remove(&self.instance_id) {
if let Some((layer_id, time, w, h, buffer_before)) = self.pending_undo_before.take() {
use lightningbeam_core::actions::RasterStrokeAction;
let action = RasterStrokeAction::new(
layer_id,
time,
buffer_before,
readback.pixels.clone(),
w,
h,
);
// execute() sets raw_pixels = buffer_after so future Vello renders
// and file saves see the completed stroke.
let _ = shared.action_executor.execute(Box::new(action));
}
// raw_pixels is now up to date; switch compositing back to the Vello
// scene. Schedule the GPU canvas for removal at the start of the next
// prepare() — keeping it alive for this frame's composite avoids a
// one-frame flash of the stale Vello scene.
if let Some((_, kf_id)) = self.painting_canvas.take() {
self.pending_canvas_removal = Some(kf_id);
}
}
}
// Check for completed eyedropper samples from GPU readback and apply them
if let Ok(mut results) = EYEDROPPER_RESULTS
.get_or_init(|| Arc::new(Mutex::new(std::collections::HashMap::new())))
@ -7176,6 +7471,10 @@ impl PaneRenderer for StagePane {
region_selection: shared.region_selection.clone(),
mouse_world_pos,
webcam_frame: shared.webcam_frame.clone(),
pending_raster_dabs: self.pending_raster_dabs.take(),
instance_id_for_readback: self.instance_id,
painting_canvas: self.painting_canvas,
pending_canvas_removal: self.pending_canvas_removal.take(),
}};
let cb = egui_wgpu::Callback::new_paint_callback(