Refactor tools and fix bugs

2026-03-08 18:44:32 -04:00 · 2026-03-08 18:44:32 -04:00 · 09856ab52c
parent 0d2609c064
commit 09856ab52c
9 changed files with 1588 additions and 28 deletions
--- a/lightningbeam-ui/lightningbeam-core/src/actions/raster_fill.rs
+++ b/lightningbeam-ui/lightningbeam-core/src/actions/raster_fill.rs
@ -43,6 +43,7 @@ impl Action for RasterFillAction {
        };
        let kf = raster.ensure_keyframe_at(self.time, self.width, self.height);
        kf.raw_pixels = self.buffer_after.clone();
+        kf.texture_dirty = true;
        Ok(())
    }

@ -55,6 +56,7 @@ impl Action for RasterFillAction {
        };
        let kf = raster.ensure_keyframe_at(self.time, self.width, self.height);
        kf.raw_pixels = self.buffer_before.clone();
+        kf.texture_dirty = true;
        Ok(())
    }

--- a/lightningbeam-ui/lightningbeam-core/src/actions/raster_stroke.rs
+++ b/lightningbeam-ui/lightningbeam-core/src/actions/raster_stroke.rs
@ -49,12 +49,14 @@ impl Action for RasterStrokeAction {
    fn execute(&mut self, document: &mut Document) -> Result<(), String> {
        let kf = get_keyframe_mut(document, &self.layer_id, self.time, self.width, self.height)?;
        kf.raw_pixels = self.buffer_after.clone();
+        kf.texture_dirty = true; // pixels were replaced with the post-stroke buffer: flag the keyframe so any cached GPU texture can be refreshed
        Ok(())
    }

    fn rollback(&mut self, document: &mut Document) -> Result<(), String> {
        let kf = get_keyframe_mut(document, &self.layer_id, self.time, self.width, self.height)?;
        kf.raw_pixels = self.buffer_before.clone();
+        kf.texture_dirty = true; // pixels were restored to the pre-stroke buffer: flag the keyframe so any cached GPU texture can be refreshed
        Ok(())
    }

--- a/lightningbeam-ui/lightningbeam-core/src/raster_layer.rs
+++ b/lightningbeam-ui/lightningbeam-core/src/raster_layer.rs
@ -139,6 +139,7 @@ impl RasterKeyframe {
            stroke_log: Vec::new(),
            tween_after: TweenType::Hold,
            raw_pixels: Vec::new(),
+            texture_dirty: true,
        }
    }
 }
--- a/lightningbeam-ui/lightningbeam-editor/src/debug_overlay.rs
+++ b/lightningbeam-ui/lightningbeam-editor/src/debug_overlay.rs
@ -5,9 +5,40 @@

 use eframe::egui;
 use std::collections::VecDeque;
+use std::sync::{Mutex, OnceLock};
 use std::time::{Duration, Instant};

 const FRAME_HISTORY_SIZE: usize = 60; // Track last 60 frames for FPS stats
+
+/// Timing breakdown (all fields in milliseconds) for the GPU prepare() pass, written via `update_prepare_timing`.
+#[derive(Debug, Clone, Default)]
+pub struct PrepareTiming {
+    pub total_ms: f64, // headline value, shown as "GPU prepare" in the debug overlay
+    pub removals_ms: f64, // shown as "removals" in the debug overlay
+    pub gpu_dispatches_ms: f64, // shown as "gpu_dispatch" in the debug overlay
+    pub scene_build_ms: f64, // shown as "scene_build" in the debug overlay
+    pub composite_ms: f64, // shown as "composite" in the debug overlay
+}
+
+static LAST_PREPARE_TIMING: OnceLock<Mutex<PrepareTiming>> = OnceLock::new(); // empty until the first update_prepare_timing() call
+
+/// Overwrites the shared timing snapshot; intended to be called from `VelloCallback::prepare()` every frame.
+pub fn update_prepare_timing(
+    total_ms: f64,
+    removals_ms: f64,
+    gpu_dispatches_ms: f64,
+    scene_build_ms: f64,
+    composite_ms: f64,
+) {
+    let cell = LAST_PREPARE_TIMING.get_or_init(|| Mutex::new(PrepareTiming::default())); // first caller creates the zeroed snapshot
+    if let Ok(mut t) = cell.lock() { // a poisoned lock is skipped silently, leaving the previous snapshot untouched
+        t.total_ms         = total_ms;
+        t.removals_ms      = removals_ms;
+        t.gpu_dispatches_ms = gpu_dispatches_ms;
+        t.scene_build_ms   = scene_build_ms;
+        t.composite_ms     = composite_ms;
+    }
+}
 const DEVICE_REFRESH_INTERVAL: Duration = Duration::from_secs(2); // Refresh devices every 2 seconds
 const MEMORY_REFRESH_INTERVAL: Duration = Duration::from_millis(500); // Refresh memory every 500ms

@ -28,6 +59,9 @@ pub struct DebugStats {
    pub audio_input_devices: Vec<String>,
    pub has_pointer: bool,

+    // GPU prepare() timing breakdown (from render thread)
+    pub prepare_timing: PrepareTiming,
+
    // Performance metrics for each section
    pub timing_memory_us: u64,
    pub timing_gpu_us: u64,
@ -170,6 +204,12 @@ impl DebugStatsCollector {

        let timing_total_us = collection_start.elapsed().as_micros() as u64;

+        let prepare_timing = LAST_PREPARE_TIMING
+            .get()
+            .and_then(|m| m.lock().ok())
+            .map(|t| t.clone())
+            .unwrap_or_default();
+
        DebugStats {
            fps_current,
            fps_min,
@ -184,6 +224,7 @@ impl DebugStatsCollector {
            midi_devices,
            audio_input_devices,
            has_pointer,
+            prepare_timing,
            timing_memory_us,
            timing_gpu_us,
            timing_midi_us,
@ -231,6 +272,16 @@ pub fn render_debug_overlay(ctx: &egui::Context, stats: &DebugStats) {

                    ui.add_space(8.0);

+                    // GPU prepare() timing section
+                    let pt = &stats.prepare_timing;
+                    ui.colored_label(egui::Color32::YELLOW, format!("GPU prepare: {:.2} ms", pt.total_ms));
+                    ui.label(format!("  removals:      {:.2} ms", pt.removals_ms));
+                    ui.label(format!("  gpu_dispatch:  {:.2} ms", pt.gpu_dispatches_ms));
+                    ui.label(format!("  scene_build:   {:.2} ms", pt.scene_build_ms));
+                    ui.label(format!("  composite:     {:.2} ms", pt.composite_ms));
+
+                    ui.add_space(8.0);
+
                    // Memory section with timing
                    ui.colored_label(egui::Color32::YELLOW, format!("Memory: ({}µs)", stats.timing_memory_us));
                    ui.label(format!("Physical: {} MB", stats.memory_physical_mb));
--- a/lightningbeam-ui/lightningbeam-editor/src/gpu_brush.rs
+++ b/lightningbeam-ui/lightningbeam-editor/src/gpu_brush.rs
@ -775,6 +775,71 @@ impl GradientFillPipeline {
    }
 }

+// ── AlphaCompositePipeline ───────────────────────────────────────────────────
+
+/// Compute pipeline: composites the scratch buffer C over the source A → output B.
+///
+/// Binding layout of bind group 0 (must stay in sync with `alpha_composite.wgsl`):
+///   0 = tex_a (texture_2d<f32>, Rgba8Unorm, sampled, not filterable)
+///   1 = tex_c (texture_2d<f32>, Rgba8Unorm, sampled, not filterable)
+///   2 = tex_b (texture_storage_2d<rgba8unorm, write>)
+struct AlphaCompositePipeline {
+    pipeline:  wgpu::ComputePipeline, // compute pipeline running the shader's `main` entry point
+    bg_layout: wgpu::BindGroupLayout, // layout used to build the per-dispatch bind group
+}
+
+impl AlphaCompositePipeline {
+    fn new(device: &wgpu::Device) -> Self { // builds the shader module, bind-group layout and compute pipeline on `device`
+        let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
+            label:  Some("alpha_composite_shader"),
+            source: wgpu::ShaderSource::Wgsl(
+                include_str!("panes/shaders/alpha_composite.wgsl").into(), // WGSL source is embedded at compile time
+            ),
+        });
+        let sampled_entry = |binding: u32| wgpu::BindGroupLayoutEntry { // shared shape of the two read-only input textures
+            binding,
+            visibility: wgpu::ShaderStages::COMPUTE,
+            ty: wgpu::BindingType::Texture {
+                sample_type:    wgpu::TextureSampleType::Float { filterable: false },
+                view_dimension: wgpu::TextureViewDimension::D2,
+                multisampled:   false,
+            },
+            count: None,
+        };
+        let bg_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+            label:   Some("alpha_composite_bgl"),
+            entries: &[
+                sampled_entry(0), // tex_a: source / background (A)
+                sampled_entry(1), // tex_c: accumulated dabs / foreground (C)
+                wgpu::BindGroupLayoutEntry { // tex_b: write-only storage texture receiving the result (B)
+                    binding:    2,
+                    visibility: wgpu::ShaderStages::COMPUTE,
+                    ty: wgpu::BindingType::StorageTexture {
+                        access:         wgpu::StorageTextureAccess::WriteOnly,
+                        format:         wgpu::TextureFormat::Rgba8Unorm,
+                        view_dimension: wgpu::TextureViewDimension::D2,
+                    },
+                    count: None,
+                },
+            ],
+        });
+        let layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+            label:                Some("alpha_composite_layout"),
+            bind_group_layouts:   &[&bg_layout], // a single bind group (group 0)
+            push_constant_ranges: &[],
+        });
+        let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+            label:               Some("alpha_composite_pipeline"),
+            layout:              Some(&layout),
+            module:              &shader,
+            entry_point:         Some("main"), // must match the `fn main` compute entry point in the shader
+            compilation_options: Default::default(),
+            cache:               None,
+        });
+        Self { pipeline, bg_layout }
+    }
+}
+
 // GpuBrushEngine
 // ---------------------------------------------------------------------------

@ -792,12 +857,21 @@ pub struct GpuBrushEngine {
    liquify_brush_pipeline: Option<LiquifyBrushPipeline>,
    /// Lazily created on first gradient fill use.
    gradient_fill_pipeline: Option<GradientFillPipeline>,
+    /// Lazily created on first unified-tool composite dispatch.
+    composite_pipeline: Option<AlphaCompositePipeline>,

    /// Canvas texture pairs keyed by keyframe UUID.
    pub canvases: HashMap<Uuid, CanvasPair>,

    /// Displacement map buffers keyed by a caller-supplied UUID.
    pub displacement_bufs: HashMap<Uuid, DisplacementBuffer>,
+
+    /// Persistent `Rgba8Unorm` textures for idle raster layers.
+    ///
+    /// Keyed by keyframe UUID (same ID space as `canvases`).  Entries are uploaded
+    /// once when `RasterKeyframe::texture_dirty` is set, then reused every frame.
+    /// Separate from `canvases` so tool teardown never accidentally removes them.
+    pub raster_layer_cache: HashMap<Uuid, CanvasPair>,
 }

 /// CPU-side parameters uniform for the compute shader.
@ -903,8 +977,10 @@ impl GpuBrushEngine {
            warp_apply_pipeline:    None,
            liquify_brush_pipeline: None,
            gradient_fill_pipeline: None,
+            composite_pipeline: None,
            canvases:           HashMap::new(),
            displacement_bufs:  HashMap::new(),
+            raster_layer_cache: HashMap::new(),
        }
    }

@ -1264,6 +1340,126 @@ impl GpuBrushEngine {
        self.canvases.remove(keyframe_id);
    }

+    // ── Raster-layer texture cache ────────────────────────────────────────────
+
+    /// Ensure a cached display texture exists for `kf_id`.
+    ///
+    /// The canvas is (re)created and `pixels` uploaded when `dirty` is `true`, when
+    /// no entry exists yet, or when the cached size differs from `w` × `h`.  With
+    /// `dirty = false` an entry of matching size is reused and nothing is uploaded.
+    ///
+    /// `pixels` must be sRGB-premultiplied RGBA with length `w * h * 4`.
+    /// Panics in debug builds if the length does not match.
+    pub fn ensure_layer_texture(
+        &mut self,
+        device:  &wgpu::Device,
+        queue:   &wgpu::Queue,
+        kf_id:   Uuid,
+        pixels:  &[u8],
+        w:       u32,
+        h:       u32,
+        dirty:   bool,
+    ) {
+        // Widen before multiplying: `w * h * 4` evaluated in `u32` overflows for very large canvases.
+        let expected_len = (w as usize) * (h as usize) * 4;
+        debug_assert_eq!(
+            pixels.len(), expected_len,
+            "ensure_layer_texture: pixel buffer length mismatch (got {}, expected {})",
+            pixels.len(), expected_len,
+        );
+        let needs_new = dirty || self.raster_layer_cache.get(&kf_id)
+            .map_or(true, |c| c.width != w || c.height != h); // missing entry or size change also forces a rebuild
+        if needs_new {
+            let canvas = CanvasPair::new(device, w, h);
+            if !pixels.is_empty() { // nothing to upload for an empty buffer
+                canvas.upload(queue, pixels);
+            }
+            self.raster_layer_cache.insert(kf_id, canvas); // replaces (and drops) any previous entry
+        }
+    }
+
+    /// Look up the cached display texture for a raster layer keyframe; `None` when `kf_id` has no cache entry.
+    pub fn get_layer_texture(&self, kf_id: &Uuid) -> Option<&CanvasPair> {
+        self.raster_layer_cache.get(kf_id)
+    }
+
+    /// Remove the cached texture for a raster layer keyframe (e.g. when deleted); a missing entry is a no-op.
+    pub fn remove_layer_texture(&mut self, kf_id: &Uuid) {
+        self.raster_layer_cache.remove(kf_id);
+    }
+
+    /// Composite the accumulated-dab scratch buffer C over the source A, writing the
+    /// result into B:  `B = C + A × (1 − C.a)` (Porter-Duff src-over).
+    ///
+    /// All three canvases must already exist in `self.canvases` (created by
+    /// [`ensure_canvas`] from the [`WorkspaceInitPacket`] in `prepare()`); if any is
+    /// missing the call returns early and nothing is dispatched or swapped.
+    /// After dispatch, B's ping-pong index is swapped so `B.src_view()` holds the
+    /// composite result and the compositor can blit it.
+    pub fn composite_a_c_to_b(
+        &mut self,
+        device:  &wgpu::Device,
+        queue:   &wgpu::Queue,
+        a_id:    Uuid,
+        c_id:    Uuid,
+        b_id:    Uuid,
+        width:   u32, // used only to size the dispatch grid
+        height:  u32, // used only to size the dispatch grid
+    ) {
+        // Create the pipeline on first use; later calls reuse it.
+        if self.composite_pipeline.is_none() {
+            self.composite_pipeline = Some(AlphaCompositePipeline::new(device));
+        }
+
+        // Build bind group and command buffer (all immutable borrows of self).
+        let cmd_buf = {
+            let pipeline = self.composite_pipeline.as_ref().unwrap(); // cannot fail: initialised just above
+            let Some(a) = self.canvases.get(&a_id) else { return; };
+            let Some(c) = self.canvases.get(&c_id) else { return; };
+            let Some(b) = self.canvases.get(&b_id) else { return; };
+
+            let bg = device.create_bind_group(&wgpu::BindGroupDescriptor {
+                label:   Some("alpha_composite_bg"),
+                layout:  &pipeline.bg_layout,
+                entries: &[
+                    wgpu::BindGroupEntry {
+                        binding:  0,
+                        resource: wgpu::BindingResource::TextureView(a.src_view()), // tex_a: background
+                    },
+                    wgpu::BindGroupEntry {
+                        binding:  1,
+                        resource: wgpu::BindingResource::TextureView(c.src_view()), // tex_c: foreground dabs
+                    },
+                    wgpu::BindGroupEntry {
+                        binding:  2,
+                        resource: wgpu::BindingResource::TextureView(b.dst_view()), // tex_b: written now, becomes src after swap()
+                    },
+                ],
+            });
+
+            let mut enc = device.create_command_encoder(
+                &wgpu::CommandEncoderDescriptor { label: Some("alpha_composite_enc") },
+            );
+            {
+                let mut pass = enc.begin_compute_pass(&wgpu::ComputePassDescriptor {
+                    label:            Some("alpha_composite"),
+                    timestamp_writes: None,
+                });
+                pass.set_pipeline(&pipeline.pipeline);
+                pass.set_bind_group(0, &bg, &[]);
+                pass.dispatch_workgroups((width + 7) / 8, (height + 7) / 8, 1); // ceil(width/8) × ceil(height/8) groups of 8×8 threads
+            }
+            enc.finish()
+        }; // Immutable borrows (pipeline, a, c, b) released here.
+
+        queue.submit(std::iter::once(cmd_buf));
+
+        // Swap B: src now holds the composite result.
+        if let Some(b) = self.canvases.get_mut(&b_id) {
+            b.swap();
+        }
+    }
+
    /// Dispatch the affine-resample transform shader from `anchor_id` → `float_id`.
    ///
    /// Reads from the anchor canvas's source view, writes into the float canvas's
--- a/lightningbeam-ui/lightningbeam-editor/src/main.rs
+++ b/lightningbeam-ui/lightningbeam-editor/src/main.rs
@ -28,6 +28,8 @@ mod waveform_gpu;
 mod cqt_gpu;
 mod gpu_brush;

+mod raster_tool;
+
 mod config;
 use config::AppConfig;

@ -954,6 +956,9 @@ impl EditorApp {
        #[cfg(debug_assertions)]
        cc.egui_ctx.style_mut(|style| style.debug.show_unaligned = false);

+        // Disable egui's built-in Ctrl+Plus/Minus zoom — we handle zoom ourselves.
+        cc.egui_ctx.options_mut(|o| o.zoom_with_keyboard = false);
+
        // Load application config
        let config = AppConfig::load();

@ -4453,16 +4458,9 @@ impl eframe::App for EditorApp {
    fn update(&mut self, ctx: &egui::Context, frame: &mut eframe::Frame) {
        let _frame_start = std::time::Instant::now();

-        // Disable egui's built-in Ctrl+Plus/Minus zoom behavior
-        // We handle zoom ourselves for the Stage pane
-        ctx.options_mut(|o| {
-            o.zoom_with_keyboard = false;
-        });
-
        // Force continuous repaint if we have pending waveform updates
        // This ensures thumbnails update immediately when waveform data arrives
        if !self.audio_pools_with_new_waveforms.is_empty() {
-            println!("🔄 [UPDATE] Pending waveform updates for pools: {:?}", self.audio_pools_with_new_waveforms);
            ctx.request_repaint();
        }

--- a/lightningbeam-ui/lightningbeam-editor/src/panes/shaders/alpha_composite.wgsl
+++ b/lightningbeam-ui/lightningbeam-editor/src/panes/shaders/alpha_composite.wgsl
@ -0,0 +1,27 @@
+// Alpha composite compute shader.
+//
+// Composites the accumulated-dab scratch buffer C on top of the source buffer A,
+// writing the result into the output buffer B:
+//
+//   B[px] = C[px] + A[px] * (1 − C[px].a)    (Porter-Duff src-over, C over A)
+//
+// All textures are Rgba8Unorm, linear premultiplied RGBA.
+// Dispatch: ceil(w/8) × ceil(h/8) × 1 workgroups of 8 × 8 threads, one thread per pixel.
+
+@group(0) @binding(0) var tex_a: texture_2d<f32>;                        // source (A)
+@group(0) @binding(1) var tex_c: texture_2d<f32>;                        // accumulated dabs (C)
+@group(0) @binding(2) var tex_b: texture_storage_2d<rgba8unorm, write>;  // output (B)
+
+@compute @workgroup_size(8, 8)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let dims = textureDimensions(tex_a); // bounds come from A only; C and B are presumably the same size — TODO confirm
+    if gid.x >= dims.x || gid.y >= dims.y { return; } // threads in the rounded-up part of the grid do nothing
+
+    let coord = vec2<i32>(i32(gid.x), i32(gid.y));
+    let a = textureLoad(tex_a, coord, 0); // mip level 0
+    let c = textureLoad(tex_c, coord, 0); // mip level 0
+
+    // Porter-Duff src-over on premultiplied colour: C is the foreground (dabs),
+    // A is the background, so all four channels use the same formula.
+    textureStore(tex_b, coord, c + a * (1.0 - c.a));
+}
--- a/lightningbeam-ui/lightningbeam-editor/src/panes/stage.rs
+++ b/lightningbeam-ui/lightningbeam-editor/src/panes/stage.rs
@ -427,6 +427,30 @@ struct VelloRenderContext {
    /// the infopanel converts the pixel data to egui TextureHandles.
    /// Each entry is `(width, height, sRGB-premultiplied RGBA bytes)`.
    brush_preview_pixels: std::sync::Arc<std::sync::Mutex<Vec<(u32, u32, Vec<u8>)>>>,
+
+    // ── New unified raster tool rendering ─────────────────────────────────────
+
+    /// When `Some`, the compositor blits B (the tool output canvas) at the layer
+    /// or float slot described here, instead of the Vello scene / idle raster texture.
+    active_tool_render: Option<crate::raster_tool::ActiveToolRender>,
+    /// Canvas UUIDs to remove from `GpuBrushEngine` at the top of the next `prepare()`.
+    /// Replaced the single `pending_canvas_removal` field.
+    pending_canvas_removals: Vec<uuid::Uuid>,
+    /// First-frame canvas initialization for the active raster tool workspace.
+    /// `prepare()` creates A/B/C canvases and uploads source pixels on the same frame
+    /// the tool starts (mousedown).  Cleared after one consume.
+    pending_workspace_init: Option<crate::raster_tool::WorkspaceInitPacket>,
+    /// GPU work extracted from the active `RasterTool` this frame via
+    /// `take_pending_gpu_work()`.  Executed in `prepare()` before compositing.
+    pending_tool_gpu_work: Option<Box<dyn crate::raster_tool::PendingGpuWork>>,
+    /// Raster layer keyframe UUIDs whose `raster_layer_cache` entry should be
+    /// removed at the top of `prepare()` so the fresh `raw_pixels` are re-uploaded.
+    /// Populated by the pre-callback dirty-keyframe scan (for undo/redo) and by
+    /// stroke/fill/warp commit handlers.
+    pending_layer_cache_removals: Vec<uuid::Uuid>,
+    /// When `Some`, readback this B-canvas into `RASTER_READBACK_RESULTS` after
+    /// dispatching GPU tool work.  Set on mouseup by the unified raster tool commit path.
+    pending_tool_readback_b: Option<uuid::Uuid>,
 }

 /// Callback for Vello rendering within egui
@ -500,6 +524,10 @@ impl egui_wgpu::CallbackTrait for VelloCallback {
            camera_transform
        };

+        // Timing instrumentation: track where frame budget is spent.
+        // Prints to stderr when any section exceeds 2 ms, or total > 8 ms.
+        let _t_prepare_start = std::time::Instant::now();
+
        // Choose rendering path based on HDR compositing flag
        let mut scene = if USE_HDR_COMPOSITING {
            // HDR Compositing Pipeline: render each layer separately for proper opacity
@ -517,6 +545,75 @@ impl egui_wgpu::CallbackTrait for VelloCallback {
                    gpu_brush.remove_canvas(&kf_id);
                }
            }
+            // Process the bulk-removal list (A/B/C canvases from finished tool ops).
+            // The Vec was moved into this callback by StagePane via std::mem::take,
+            // so it is already gone from StagePane; no drain needed.
+            if !self.ctx.pending_canvas_removals.is_empty() {
+                if let Ok(mut gpu_brush) = shared.gpu_brush.lock() {
+                    for id in &self.ctx.pending_canvas_removals {
+                        gpu_brush.remove_canvas(id);
+                    }
+                }
+            }
+            // Invalidate raster_layer_cache entries whose raw_pixels changed (undo/redo,
+            // stroke commit, fill commit, etc.).  Removing the entry here causes the
+            // raster-cache section below to re-upload the fresh pixels on the same frame.
+            if !self.ctx.pending_layer_cache_removals.is_empty() {
+                if let Ok(mut gpu_brush) = shared.gpu_brush.lock() {
+                    for id in &self.ctx.pending_layer_cache_removals {
+                        gpu_brush.remove_layer_texture(id);
+                    }
+                }
+            }
+            let _t_after_removals = std::time::Instant::now();
+
+            // First-frame canvas initialization for the unified raster tool workspace.
+            // Creates A (source), B (output) and C (scratch) canvases; uploads pixels to A.
+            // B and C start zero-initialized (transparent).
+            if let Some(ref init) = self.ctx.pending_workspace_init {
+                if let Ok(mut gpu_brush) = shared.gpu_brush.lock() {
+                    // A canvas: source pixels.
+                    gpu_brush.ensure_canvas(device, init.a_canvas_id, init.width, init.height);
+                    if let Some(canvas) = gpu_brush.canvases.get(&init.a_canvas_id) {
+                        canvas.upload(queue, &init.a_pixels);
+                    }
+                    // B canvas: output (zero-initialized by GPU allocation).
+                    gpu_brush.ensure_canvas(device, init.b_canvas_id, init.width, init.height);
+                    // C canvas: scratch (zero-initialized by GPU allocation).
+                    gpu_brush.ensure_canvas(device, init.c_canvas_id, init.width, init.height);
+                }
+            }
+
+            // Unified raster tool GPU dispatch (dab shaders, composite pass, etc.).
+            if let Some(ref work) = self.ctx.pending_tool_gpu_work {
+                if let Ok(mut gpu_brush) = shared.gpu_brush.lock() {
+                    work.execute(device, queue, &mut *gpu_brush);
+                }
+            }
+
+            // Unified tool B-canvas readback on mouseup (commit path).
+            // Triggered when the active RasterTool's finish() returns true.
+            if let Some(b_id) = self.ctx.pending_tool_readback_b {
+                if let Ok(mut gpu_brush) = shared.gpu_brush.lock() {
+                    let dims = gpu_brush.canvases.get(&b_id).map(|c| (c.width, c.height));
+                    if let Some((w, h)) = dims {
+                        if let Some(pixels) = gpu_brush.readback_canvas(device, queue, b_id) {
+                            let results = RASTER_READBACK_RESULTS.get_or_init(|| {
+                                Arc::new(Mutex::new(std::collections::HashMap::new()))
+                            });
+                            if let Ok(mut map) = results.lock() {
+                                map.insert(self.ctx.instance_id_for_readback, RasterReadbackResult {
+                                    layer_id: uuid::Uuid::nil(), // unused; routing via pending_undo_before
+                                    time: 0.0,
+                                    canvas_width: w,
+                                    canvas_height: h,
+                                    pixels,
+                                });
+                            }
+                        }
+                    }
+                }
+            }

            // Lazy float GPU canvas initialization.
            // If a float exists but its GPU canvas hasn't been created yet, upload float.pixels now.
@ -791,6 +888,8 @@ impl egui_wgpu::CallbackTrait for VelloCallback {
                }
            }

+            let _t_after_gpu_dispatches = std::time::Instant::now();
+
            let mut image_cache = shared.image_cache.lock().unwrap();

            let composite_result = lightningbeam_core::renderer::render_document_for_compositing(
@ -801,6 +900,7 @@ impl egui_wgpu::CallbackTrait for VelloCallback {
                self.ctx.webcam_frame.as_ref(),
            );
            drop(image_cache);
+            let _t_after_scene_build = std::time::Instant::now();

            // Get buffer pool for layer rendering
            let mut buffer_pool = shared.buffer_pool.lock().unwrap();
@ -937,25 +1037,95 @@ impl egui_wgpu::CallbackTrait for VelloCallback {

            // Now render and composite each layer incrementally
            for rendered_layer in &composite_result.layers {
-                // Check if this raster layer has a live GPU canvas that should be
-                // blitted every frame, even when no new dabs arrived this frame.
-                // `painting_canvas` persists for the entire stroke duration.
-                // When painting into float (B), the GPU canvas is B's canvas — don't
-                // use it to replace the Vello scene for the layer (A must still render
-                // via Vello).
-                let gpu_canvas_kf: Option<uuid::Uuid> = if self.ctx.painting_float {
-                    None
-                } else {
-                    self.ctx.painting_canvas
-                        .filter(|(layer_id, _)| *layer_id == rendered_layer.layer_id)
-                        .map(|(_, kf_id)| kf_id)
-                        // Warp/Liquify: show display canvas in place of layer.
-                        .or_else(|| self.ctx.warp_display
+                // Determine which GPU canvas (if any) to blit for this layer.
+                //
+                // Priority order:
+                // 1. Active tool B canvas (new unified tool render).
+                // 2. Legacy painting_canvas (old per-tool render path, kept during migration).
+                // 3. Warp/Liquify display canvas.
+                // 4. Raster layer texture cache (idle raster layers — bypasses Vello).
+                // 5. None → fall through to Vello scene rendering.
+                //
+                // When painting_float is true, the active tool is working on the float,
+                // so the layer itself should still render normally (via Vello or cache).
+                let gpu_canvas_kf: Option<uuid::Uuid> = {
+                    // 1. New unified tool render: B canvas replaces this layer.
+                    let from_tool = self.ctx.active_tool_render.as_ref()
+                        .filter(|tr| tr.layer_id == Some(rendered_layer.layer_id))
+                        .map(|tr| tr.b_canvas_id);
+
+                    // 2. Legacy painting_canvas (old stroke path).
+                    let from_legacy = if self.ctx.painting_float {
+                        None
+                    } else {
+                        self.ctx.painting_canvas
                            .filter(|(layer_id, _)| *layer_id == rendered_layer.layer_id)
-                            .map(|(_, display_id)| display_id))
+                            .map(|(_, kf_id)| kf_id)
+                    };
+
+                    // 3. Warp/Liquify display canvas.
+                    let from_warp = self.ctx.warp_display
+                        .filter(|(layer_id, _)| *layer_id == rendered_layer.layer_id)
+                        .map(|(_, display_id)| display_id);
+
+                    from_tool.or(from_legacy).or(from_warp)
                };

-                if !rendered_layer.has_content && gpu_canvas_kf.is_none() {
+                // 4. Raster layer texture cache: for idle raster layers (no active tool canvas).
+                // Upload raw_pixels on a cache miss (stale entries were evicted earlier in prepare()); then use the cache entry.
+                let raster_cache_kf: Option<uuid::Uuid> = if gpu_canvas_kf.is_none() {
+                    // Find the active keyframe for this raster layer.
+                    let doc = &self.ctx.document;
+                    let raster_kf_id = doc.get_layer(&rendered_layer.layer_id)
+                        .and_then(|l| match l {
+                            lightningbeam_core::layer::AnyLayer::Raster(rl) => {
+                                rl.keyframe_at(self.ctx.playback_time)
+                            }
+                            _ => None,
+                        })
+                        .map(|kf| kf.id);
+
+                    if let Some(kf_id) = raster_kf_id {
+                        if let Ok(mut gpu_brush) = shared.gpu_brush.lock() {
+                            // Check if we have pixels to upload.
+                            let kf_data = doc.get_layer(&rendered_layer.layer_id)
+                                .and_then(|l| match l {
+                                    lightningbeam_core::layer::AnyLayer::Raster(rl) => {
+                                        rl.keyframe_at(self.ctx.playback_time)
+                                    }
+                                    _ => None,
+                                });
+                            if let Some(kf) = kf_data {
+                                if !kf.raw_pixels.is_empty() {
+                                    // Pass dirty=false: the cache entry was already removed
+                                    // above via pending_layer_cache_removals when raw_pixels
+                                    // changed (undo/redo, stroke commit, etc.).  A cache miss
+                                    // triggers upload; a cache hit skips the expensive sRGB
+                                    // conversion + GPU write that was firing every frame.
+                                    gpu_brush.ensure_layer_texture(
+                                        device, queue, kf_id,
+                                        &kf.raw_pixels,
+                                        kf.width, kf.height,
+                                        false,
+                                    );
+                                    Some(kf_id)
+                                } else {
+                                    None
+                                }
+                            } else {
+                                None
+                            }
+                        } else {
+                            None
+                        }
+                    } else {
+                        None
+                    }
+                } else {
+                    None
+                };
+
+                if !rendered_layer.has_content && gpu_canvas_kf.is_none() && raster_cache_kf.is_none() {
                    continue;
                }

@ -971,13 +1141,17 @@ impl egui_wgpu::CallbackTrait for VelloCallback {
                            &instance_resources.hdr_texture_view,
                        ) {
                            // GPU canvas blit path: if a live GPU canvas exists for this
-                            // raster layer, blit it directly into the HDR buffer (premultiplied
-                            // linear → Rgba16Float), bypassing the sRGB intermediate entirely.
+                            // raster layer (active tool B canvas, legacy painting_canvas, or
+                            // raster layer texture cache), blit it directly into the HDR buffer
+                            // (premultiplied linear → Rgba16Float), bypassing Vello entirely.
                            // Vello path: render to sRGB buffer → srgb_to_linear → HDR buffer.
-                            let used_gpu_canvas = if let Some(kf_id) = gpu_canvas_kf {
+                            let used_gpu_canvas = if let Some(kf_id) = gpu_canvas_kf.or(raster_cache_kf) {
                                let mut used = false;
                                if let Ok(gpu_brush) = shared.gpu_brush.lock() {
-                                    if let Some(canvas) = gpu_brush.canvases.get(&kf_id) {
+                                    // Try tool canvases first, then the layer texture cache.
+                                    let canvas = gpu_brush.canvases.get(&kf_id)
+                                        .or_else(|| gpu_brush.raster_layer_cache.get(&kf_id));
+                                    if let Some(canvas) = canvas {
                                        let camera = crate::gpu_brush::CameraParams {
                                            pan_x:      self.ctx.pan_offset.x,
                                            pan_y:      self.ctx.pan_offset.y,
@ -1220,6 +1394,9 @@ impl egui_wgpu::CallbackTrait for VelloCallback {
            let blit_params = if let Some(ref td) = self.ctx.transform_display {
                // During transform: show the display canvas (compute shader output) instead of float.
                Some((td.display_canvas_id, td.x, td.y, td.w, td.h))
+            } else if let Some(ref tr) = self.ctx.active_tool_render.as_ref().filter(|tr| tr.layer_id.is_none()) {
+                // Unified raster tool active on the float: show B canvas instead of float's own canvas.
+                Some((tr.b_canvas_id, tr.x, tr.y, tr.width, tr.height))
            } else if let Some(ref float_sel) = self.ctx.selection.raster_floating {
                // Regular float blit.
                Some((float_sel.canvas_id, float_sel.x, float_sel.y, float_sel.width, float_sel.height))
@ -1268,6 +1445,17 @@ impl egui_wgpu::CallbackTrait for VelloCallback {
            buffer_pool.next_frame();
            drop(buffer_pool);

+            // --- Frame timing report ---
+            let _t_end = std::time::Instant::now();
+            let total_ms = (_t_end - _t_prepare_start).as_secs_f64() * 1000.0;
+            let removals_ms = (_t_after_removals - _t_prepare_start).as_secs_f64() * 1000.0;
+            let gpu_dispatches_ms = (_t_after_gpu_dispatches - _t_after_removals).as_secs_f64() * 1000.0;
+            let scene_build_ms = (_t_after_scene_build - _t_after_gpu_dispatches).as_secs_f64() * 1000.0;
+            let composite_ms = (_t_end - _t_after_scene_build).as_secs_f64() * 1000.0;
+            crate::debug_overlay::update_prepare_timing(
+                total_ms, removals_ms, gpu_dispatches_ms, scene_build_ms, composite_ms,
+            );
+
            // For drag preview and other overlays, we still need a scene
            // Create an empty scene - the composited result is already in hdr_texture
            vello::Scene::new()
@ -2517,6 +2705,27 @@ pub struct StagePane {
    pending_gradient_op: Option<PendingGradientOp>,
    /// GPU ops for Warp/Liquify to dispatch in prepare().
    pending_warp_ops: Vec<PendingWarpOp>,
+
+    // ── New unified raster tool state ─────────────────────────────────────────
+    /// The active `RasterTool` implementation plus its GPU workspace.
+    /// Set on mousedown; cleared (and workspace queued for removal) on commit/cancel.
+    active_raster_tool: Option<(Box<dyn crate::raster_tool::RasterTool>, crate::raster_tool::RasterWorkspace)>,
+    /// Canvas UUIDs to remove from `GpuBrushEngine` at the top of the next `prepare()`.
+    /// Drains into `VelloRenderContext::pending_canvas_removals` each frame.
+    pending_canvas_removals: Vec<uuid::Uuid>,
+    /// First-frame canvas init packet for the active raster tool.  Forwarded to
+    /// `VelloRenderContext` on the mousedown frame; cleared after one forwarding.
+    pending_workspace_init: Option<crate::raster_tool::WorkspaceInitPacket>,
+    /// Keyframe UUIDs whose `raster_layer_cache` entry must be removed so fresh
+    /// `raw_pixels` are re-uploaded.  Drained into `VelloRenderContext` each frame.
+    pending_layer_cache_removals: Vec<uuid::Uuid>,
+    /// True when the unified raster tool has finished (mouseup) and is waiting for
+    /// the GPU readback result.  Cleared in render_content() after the result arrives.
+    active_tool_awaiting_readback: bool,
+    /// B-canvas UUID to readback into RASTER_READBACK_RESULTS on the next prepare().
+    /// Set on mouseup when `tool.finish()` returns true; forwarded to VelloRenderContext.
+    pending_tool_readback_b: Option<uuid::Uuid>,
+
    /// Synthetic drag/click override for test mode replay (debug builds only)
    #[cfg(debug_assertions)]
    replay_override: Option<ReplayDragState>,
@ -2906,6 +3115,12 @@ impl StagePane {
            gradient_state: None,
            pending_gradient_op: None,
            pending_warp_ops: Vec::new(),
+            active_raster_tool: None,
+            pending_canvas_removals: Vec::new(),
+            pending_workspace_init: None,
+            pending_layer_cache_removals: Vec::new(),
+            active_tool_awaiting_readback: false,
+            pending_tool_readback_b: None,
            #[cfg(debug_assertions)]
            replay_override: None,
        }
@ -5147,6 +5362,268 @@ impl StagePane {
        mask
    }

+    /// Reserve the three A/B/C canvas ids and build a [`crate::raster_tool::RasterWorkspace`]
+    /// for a new raster tool operation.
+    ///
+    /// Called on **mousedown** before any tool-specific code runs.  Only UUIDs and
+    /// CPU-side pixel snapshots are produced here: the returned
+    /// [`crate::raster_tool::WorkspaceInitPacket`] must be stored in
+    /// `self.pending_workspace_init` so that [`VelloCallback::prepare`] can create the
+    /// GPU textures on the first frame.
+    ///
+    /// - If a floating selection is active, the workspace targets it (Float path);
+    ///   the float is left active and is not committed.
+    /// - Otherwise, any lingering float is committed first, then the active raster
+    ///   layer's keyframe at the playback time (created if missing) becomes the
+    ///   workspace source (Layer path).
+    ///
+    /// When the source has no pixel data yet, a zero-filled buffer of
+    /// `width * height * 4` bytes is used instead.
+    ///
+    /// Returns `None` when there is no raster target (no active layer, or the active
+    /// layer is not a raster layer).
+    fn begin_raster_workspace(
+        shared: &mut SharedPaneState,
+    ) -> Option<(crate::raster_tool::RasterWorkspace, crate::raster_tool::WorkspaceInitPacket)> {
+        use crate::raster_tool::{WorkspaceInitPacket, WorkspaceSource, RasterWorkspace};
+        use lightningbeam_core::layer::AnyLayer;
+
+        // Byte length of a 4-bytes-per-pixel buffer.  Widen to usize *before*
+        // multiplying so the product cannot overflow in u32 for large canvases.
+        let rgba_len = |w: u32, h: u32| w as usize * h as usize * 4;
+
+        let (source, pixels, width, height, x, y) =
+            if let Some(ref float) = shared.selection.raster_floating {
+                // ── Float-active path ─────────────────────────────────────
+                // Do NOT commit the float; it remains active.
+                let pixels = if float.pixels.is_empty() {
+                    vec![0u8; rgba_len(float.width, float.height)]
+                } else {
+                    float.pixels.clone()
+                };
+                (WorkspaceSource::Float, pixels, float.width, float.height, float.x, float.y)
+            } else {
+                // ── Layer-active path ─────────────────────────────────────
+                // Commit any lingering float so buffer_before reflects the fully-composited canvas.
+                Self::commit_raster_floating_now(shared);
+
+                let layer_id = (*shared.active_layer_id)?;
+                let time = *shared.playback_time;
+
+                let (doc_w, doc_h) = {
+                    let doc = shared.action_executor.document();
+                    (doc.width as u32, doc.height as u32)
+                };
+
+                // Ensure the keyframe exists before reading its ID.
+                {
+                    let doc = shared.action_executor.document_mut();
+                    if let Some(AnyLayer::Raster(rl)) = doc.get_layer_mut(&layer_id) {
+                        rl.ensure_keyframe_at(time, doc_w, doc_h);
+                    } else {
+                        return None; // not a raster layer
+                    }
+                }
+
+                // Read keyframe id, dimensions and pixels.
+                let doc = shared.action_executor.document();
+                let AnyLayer::Raster(rl) = doc.get_layer(&layer_id)? else { return None };
+                let kf = rl.keyframe_at(time)?;
+                let pixels = if kf.raw_pixels.is_empty() {
+                    vec![0u8; rgba_len(kf.width, kf.height)]
+                } else {
+                    kf.raw_pixels.clone()
+                };
+                let source = WorkspaceSource::Layer {
+                    layer_id,
+                    time,
+                    kf_id: kf.id,
+                    canvas_w: doc_w,
+                    canvas_h: doc_h,
+                };
+                // A layer workspace always sits at the document origin.
+                (source, pixels, kf.width, kf.height, 0, 0)
+            };
+
+        // Fresh ids for the three workspace canvases; the textures themselves are
+        // created later from the init packet.
+        let a_id = uuid::Uuid::new_v4();
+        let b_id = uuid::Uuid::new_v4();
+        let c_id = uuid::Uuid::new_v4();
+
+        let ws = RasterWorkspace {
+            a_canvas_id: a_id,
+            b_canvas_id: b_id,
+            c_canvas_id: c_id,
+            mask_texture: None,
+            width,
+            height,
+            x,
+            y,
+            source,
+            // Snapshot kept on the CPU side for undo / cancel.
+            before_pixels: pixels.clone(),
+        };
+        let init = WorkspaceInitPacket {
+            a_canvas_id: a_id,
+            a_pixels: pixels,
+            b_canvas_id: b_id,
+            c_canvas_id: c_id,
+            width,
+            height,
+        };
+        Some((ws, init))
+    }
+
+    /// Unified raster stroke handler using the [`crate::raster_tool::RasterTool`] trait.
+    ///
+    /// Handles all paint-style brush tools (Paint, Pencil, Airbrush, Eraser, etc.).
+    /// Returns immediately unless there is an active layer and it is a raster layer.
+    /// - **mousedown**: calls `begin_raster_workspace()` + instantiates `BrushRasterTool`.
+    /// - **drag**: calls `tool.update()` each frame.
+    /// - **mouseup**: calls `tool.finish()`, schedules GPU B-canvas readback if committed.
+    ///
+    /// `def` supplies the blend mode and brush parameters; `world_pos` is the pointer
+    /// position handed to the tool's `begin()` / `update()`.
+    fn handle_unified_raster_stroke_tool(
+        &mut self,
+        ui: &mut egui::Ui,
+        response: &egui::Response,
+        world_pos: egui::Vec2,
+        def: &'static dyn crate::tools::RasterToolDef,
+        shared: &mut SharedPaneState,
+    ) {
+        use lightningbeam_core::tool::ToolState;
+        use lightningbeam_core::raster_layer::RasterBlendMode;
+        use crate::raster_tool::{BrushRasterTool, RasterTool, WorkspaceSource};
+
+        let Some(active_layer_id) = *shared.active_layer_id else { return };
+
+        // Only operate on raster layers
+        let is_raster = shared.action_executor.document()
+            .get_layer(&active_layer_id)
+            .map_or(false, |l| matches!(l, lightningbeam_core::layer::AnyLayer::Raster(_)));
+        if !is_raster { return; }
+
+        let blend_mode = def.blend_mode();
+
+        // ----------------------------------------------------------------
+        // Mouse down: initialise the workspace and start the tool
+        // ----------------------------------------------------------------
+        // A new stroke can only start while no tool is active (this includes the
+        // period where a finished tool is still waiting for its readback).
+        let stroke_start = (self.rsp_primary_pressed(ui) && response.hovered()
+                            && self.active_raster_tool.is_none())
+                        || (self.rsp_clicked(response) && self.active_raster_tool.is_none());
+        if stroke_start {
+            // Build brush settings from the tool definition.
+            let bp = def.brush_params(shared.raster_settings);
+            let (mut b, radius, opacity, hardness, spacing) =
+                (bp.base_settings, bp.radius, bp.opacity, bp.hardness, bp.spacing);
+            b.radius_log      = radius.ln() - b.pressure_radius_gain * 0.5;
+            b.hardness        = hardness;
+            b.opaque          = opacity;
+            b.dabs_per_radius = spacing;
+            if matches!(blend_mode, RasterBlendMode::Smudge) {
+                b.dabs_per_actual_radius = 0.0;
+                b.smudge_radius_log = shared.raster_settings.smudge_strength;
+            }
+            if matches!(blend_mode, RasterBlendMode::BlurSharpen) {
+                b.dabs_per_actual_radius = 0.0;
+            }
+            let color = if matches!(blend_mode, RasterBlendMode::Erase) {
+                [1.0f32, 1.0, 1.0, 1.0]
+            } else {
+                let c = if shared.raster_settings.brush_use_fg {
+                    *shared.stroke_color
+                } else {
+                    *shared.fill_color
+                };
+                // sRGB-encoded 8-bit channel → linear float (standard sRGB transfer curve).
+                let s2l = |v: u8| -> f32 {
+                    let f = v as f32 / 255.0;
+                    if f <= 0.04045 { f / 12.92 } else { ((f + 0.055) / 1.055).powf(2.4) }
+                };
+                [s2l(c.r()), s2l(c.g()), s2l(c.b()), c.a() as f32 / 255.0]
+            };
+
+            if let Some((ws, init)) = Self::begin_raster_workspace(shared) {
+                let mut tool = Box::new(BrushRasterTool::new(color, b, blend_mode));
+                self.raster_last_compute_time = ui.input(|i| i.time);
+                tool.begin(&ws, world_pos, 0.0, shared.raster_settings);
+                self.pending_workspace_init = Some(init);
+                *shared.tool_state = ToolState::DrawingRasterStroke { points: vec![] };
+                self.active_raster_tool = Some((tool, ws));
+            }
+        }
+
+        // ----------------------------------------------------------------
+        // Per-frame update: fires every frame while stroke is active so
+        // time-based brushes (airbrush) accumulate dabs even when stationary.
+        // ----------------------------------------------------------------
+        if self.active_raster_tool.is_some()
+            && matches!(*shared.tool_state, ToolState::DrawingRasterStroke { .. })
+            && !stroke_start
+        {
+            let current_time = ui.input(|i| i.time);
+            // Clamp dt so a long stall between frames cannot produce a huge time step.
+            let dt = (current_time - self.raster_last_compute_time).clamp(0.0, 0.1) as f32;
+            self.raster_last_compute_time = current_time;
+            if let Some((ref mut tool, ref ws)) = self.active_raster_tool {
+                tool.update(ws, world_pos, dt, shared.raster_settings);
+            }
+        }
+
+        // Keep egui repainting while a stroke is in progress.
+        if matches!(*shared.tool_state, ToolState::DrawingRasterStroke { .. }) {
+            ui.ctx().request_repaint();
+        }
+
+        // ----------------------------------------------------------------
+        // Mouse up: finish the tool, trigger readback if needed
+        // ----------------------------------------------------------------
+        let stroke_end = self.rsp_drag_stopped(response)
+            || (self.rsp_any_released(ui)
+                && self.active_raster_tool.is_some()
+                && matches!(*shared.tool_state, ToolState::DrawingRasterStroke { .. }));
+        if stroke_end {
+            *shared.tool_state = ToolState::Idle;
+            // `None` when no tool is active; otherwise whether the tool asks for its
+            // B canvas to be read back and committed.
+            let needs_commit = self.active_raster_tool
+                .as_mut()
+                .map(|(tool, ws)| tool.finish(&*ws));
+            match needs_commit {
+                Some(true) => {
+                    if let Some((_, ws)) = self.active_raster_tool.as_ref() {
+                        self.painting_float = matches!(ws.source, WorkspaceSource::Float);
+                        // Float strokes carry no layer/time of their own, so a nil
+                        // UUID and time 0.0 are recorded as placeholders.
+                        let (undo_layer_id, undo_time) = match &ws.source {
+                            WorkspaceSource::Layer { layer_id, time, .. } => (*layer_id, *time),
+                            WorkspaceSource::Float => (uuid::Uuid::nil(), 0.0),
+                        };
+                        self.pending_undo_before = Some((
+                            undo_layer_id, undo_time, ws.width, ws.height,
+                            ws.before_pixels.clone(),
+                        ));
+                        self.pending_tool_readback_b = Some(ws.b_canvas_id);
+                        self.active_tool_awaiting_readback = true;
+                        // Keep active_raster_tool alive until render_content() consumes the result.
+                    }
+                }
+                Some(false) => {
+                    // No commit (no dabs were placed); discard immediately.
+                    if let Some((_, ws)) = self.active_raster_tool.take() {
+                        self.pending_canvas_removals.extend(ws.canvas_ids());
+                    }
+                }
+                None => {}
+            }
+        }
+    }
+
    fn lift_selection_to_float(shared: &mut SharedPaneState) {
        use lightningbeam_core::layer::AnyLayer;
        use lightningbeam_core::selection::RasterFloatingSelection;
@ -9875,7 +10352,7 @@ impl StagePane {
                        shared.action_executor.document().get_layer(&id)
                    }).map_or(false, |l| matches!(l, lightningbeam_core::layer::AnyLayer::Raster(_)));
                    if is_raster {
-                        self.handle_raster_stroke_tool(ui, &response, world_pos, &crate::tools::paint::PAINT, shared);
+                        self.handle_unified_raster_stroke_tool(ui, &response, world_pos, &crate::tools::paint::PAINT, shared);
                    } else {
                        self.handle_draw_tool(ui, &response, world_pos, shared);
                    }
@ -10512,6 +10989,9 @@ impl PaneRenderer for StagePane {
                                }
                            }
                            float.pixels = pixels;
+                            // Invalidate the float's GPU canvas so the lazy-init
+                            // in prepare() re-uploads the fresh pixels next frame.
+                            self.pending_canvas_removals.push(float.canvas_id);
                        }
                    }
                    self.stroke_clip_selection = None;
@ -10560,6 +11040,14 @@ impl PaneRenderer for StagePane {
                        self.pending_canvas_removal = Some(kf_id);
                    }
                }
+                // Unified tool cleanup: clear active_raster_tool and queue A/B/C for removal.
+                // Runs after both the float and layer branches.
+                if self.active_tool_awaiting_readback {
+                    self.active_tool_awaiting_readback = false;
+                    if let Some((_, ws)) = self.active_raster_tool.take() {
+                        self.pending_canvas_removals.extend(ws.canvas_ids());
+                    }
+                }
            }
        }

@ -11078,6 +11566,26 @@ impl PaneRenderer for StagePane {
                }))
        });

+        // Scan for raster keyframes whose texture_dirty flag was set since last frame
+        // (e.g. by undo/redo or a stroke action execute/rollback). Must run BEFORE
+        // document_arc() is called below so that Arc::make_mut does not clone the document.
+        {
+            let doc = shared.action_executor.document_mut();
+            /// Pushes the id of every raster keyframe in `layers` whose
+            /// `texture_dirty` flag is set onto `out`, clearing the flag as it goes.
+            /// Only the layers contained directly in the given slice are inspected.
+            fn collect_dirty(layers: &mut [lightningbeam_core::layer::AnyLayer], out: &mut Vec<uuid::Uuid>) {
+                use lightningbeam_core::layer::AnyLayer;
+                for layer in layers {
+                    // Non-raster layers carry no texture cache entry.
+                    let AnyLayer::Raster(raster) = layer else { continue };
+                    for keyframe in &mut raster.keyframes {
+                        if !keyframe.texture_dirty {
+                            continue;
+                        }
+                        keyframe.texture_dirty = false;
+                        out.push(keyframe.id);
+                    }
+                }
+            }
+            collect_dirty(&mut doc.root.children, &mut self.pending_layer_cache_removals);
+        }
+
        // Use egui's custom painting callback for Vello
        // document_arc() returns Arc<Document> - cheap pointer copy, not deep clone
        let callback = VelloCallback { ctx: VelloRenderContext {
@ -11116,6 +11624,23 @@ impl PaneRenderer for StagePane {
            pending_canvas_removal: self.pending_canvas_removal.take(),
            painting_float: self.painting_float,
            brush_preview_pixels: shared.brush_preview_pixels.clone(),
+            active_tool_render: self.active_raster_tool.as_ref().map(|(_, ws)| {
+                crate::raster_tool::ActiveToolRender {
+                    b_canvas_id: ws.b_canvas_id,
+                    x: ws.x, y: ws.y,
+                    width: ws.width, height: ws.height,
+                    layer_id: match &ws.source {
+                        crate::raster_tool::WorkspaceSource::Layer { layer_id, .. } => Some(*layer_id),
+                        crate::raster_tool::WorkspaceSource::Float => None,
+                    },
+                }
+            }),
+            pending_canvas_removals: std::mem::take(&mut self.pending_canvas_removals),
+            pending_workspace_init: self.pending_workspace_init.take(),
+            pending_tool_gpu_work: self.active_raster_tool.as_mut()
+                .and_then(|(tool, _)| tool.take_pending_gpu_work()),
+            pending_layer_cache_removals: std::mem::take(&mut self.pending_layer_cache_removals),
+            pending_tool_readback_b: self.pending_tool_readback_b.take(),
        }};

        let cb = egui_wgpu::Callback::new_paint_callback(
--- a/lightningbeam-ui/lightningbeam-editor/src/raster_tool.rs
+++ b/lightningbeam-ui/lightningbeam-editor/src/raster_tool.rs
@ -0,0 +1,758 @@
+//! Unified raster tool interface.
+//!
+//! Every raster tool operates on three GPU textures of identical dimensions:
+//!
+//! | Buffer | Access | Purpose |
+//! |--------|--------|---------|
+//! | **A** | Read-only  | Source pixels, uploaded from layer/float at mousedown. |
+//! | **B** | Write-only | Output / display. Compositor shows B while the tool is active. |
+//! | **C** | Read+Write | Scratch. Dabs accumulate here across the stroke; composite A+C→B each frame. |
+//!
+//! All three are `Rgba8Unorm` with the same pixel dimensions.  The framework
+//! reserves their UUIDs in `begin_raster_workspace` on mousedown and creates the
+//! textures in `prepare()` from a [`WorkspaceInitPacket`]; tools only dispatch shaders.
+
+use std::sync::Arc;
+use uuid::Uuid;
+use eframe::egui;
+
+// ── WorkspaceSource ──────────────────────────────────────────────────────────
+
+/// Describes whether a raster tool workspace targets a raster layer keyframe or the floating selection.
+#[derive(Clone, Debug)]
+pub enum WorkspaceSource {
+    /// Operating on a raster layer's keyframe.
+    Layer {
+        layer_id: Uuid, // id of the raster layer being edited
+        time:     f64,  // playback time used to look up the keyframe
+        /// The keyframe's own UUID.  NOTE(review): the workspace's A canvas gets a freshly generated UUID, so this does not appear to be the A-canvas key — confirm.
+        kf_id:    Uuid,
+        /// Full document canvas dimensions (the workspace/keyframe dimensions may differ).
+        canvas_w: u32,
+        canvas_h: u32,
+    },
+    /// Operating on the floating selection.
+    Float,
+}
+
+// ── RasterWorkspace ───────────────────────────────────────────────────────────
+
+/// GPU buffer IDs and metadata for a single tool operation.
+///
+/// Created by `StagePane::begin_raster_workspace` on mousedown.  All three canvas UUIDs
+/// index into `GpuBrushEngine::canvases` once `prepare()` has created the textures, and stay
+/// valid while the tool is active.  They are queued for removal in `pending_canvas_removals`
+/// once the tool is finished or discarded.
+#[derive(Debug)]
+pub struct RasterWorkspace {
+    /// A canvas (Rgba8Unorm) — source pixels captured at mousedown; read-only for tools.
+    pub a_canvas_id: Uuid,
+    /// B canvas (Rgba8Unorm) — output / display; compositor shows this while active.
+    pub b_canvas_id: Uuid,
+    /// C canvas (Rgba8Unorm) — scratch; tools accumulate dabs here across the stroke.
+    pub c_canvas_id: Uuid,
+    /// Optional R8Unorm selection mask (same pixel dimensions as A/B/C).
+    /// `None` means the entire workspace is selected.
+    pub mask_texture: Option<Arc<wgpu::Texture>>,
+    /// Pixel dimensions.  A, B and C are expected to be this size (`validate` asserts it).
+    pub width:  u32,
+    pub height: u32,
+    /// Top-left position in document-pixel space.
+    /// `(0, 0)` for a layer workspace; `(float.x, float.y)` for a float workspace.
+    pub x: i32,
+    pub y: i32,
+    /// Where the workspace came from — drives commit behaviour.
+    pub source: WorkspaceSource,
+    /// CPU snapshot taken at mousedown for undo / cancel.
+    /// Expected length is `width * height * 4` (sRGB premultiplied RGBA); `validate` asserts it.
+    pub before_pixels: Vec<u8>,
+}
+
+impl RasterWorkspace {
+    /// Consistency check for the workspace's GPU canvases and CPU snapshot.
+    ///
+    /// Verifies that the A, B and C canvases exist in `gpu.canvases`, that each
+    /// has exactly the dimensions declared by this workspace, and that
+    /// `before_pixels` holds `width * height * 4` bytes.
+    ///
+    /// # Panics
+    ///
+    /// Panics (by design) with a message naming the offending buffer when a
+    /// canvas is missing, a canvas has the wrong size, or `before_pixels` has
+    /// the wrong length.
+    pub fn validate(&self, gpu: &crate::gpu_brush::GpuBrushEngine) {
+        for (name, id) in [
+            ("A", self.a_canvas_id),
+            ("B", self.b_canvas_id),
+            ("C", self.c_canvas_id),
+        ] {
+            let canvas = gpu.canvases.get(&id).unwrap_or_else(|| {
+                panic!(
+                    "RasterWorkspace::validate: buffer '{}' (id={}) not found in GpuBrushEngine",
+                    name, id
+                )
+            });
+            assert_eq!(
+                canvas.width, self.width,
+                "RasterWorkspace::validate: buffer '{}' width {} != workspace width {}",
+                name, canvas.width, self.width
+            );
+            assert_eq!(
+                canvas.height, self.height,
+                "RasterWorkspace::validate: buffer '{}' height {} != workspace height {}",
+                name, canvas.height, self.height
+            );
+        }
+        // Widen to usize before multiplying: the product of two u32 dimensions
+        // times 4 can overflow u32 for large canvases.
+        let expected = self.width as usize * self.height as usize * 4;
+        assert_eq!(
+            self.before_pixels.len(), expected,
+            "RasterWorkspace::validate: before_pixels.len()={} != expected {}",
+            self.before_pixels.len(), expected
+        );
+    }
+
+    /// Returns the three canvas UUIDs in A, B, C order (convenient for bulk removal).
+    pub fn canvas_ids(&self) -> [Uuid; 3] {
+        [self.a_canvas_id, self.b_canvas_id, self.c_canvas_id]
+    }
+}
+
+// ── WorkspaceInitPacket ───────────────────────────────────────────────────────
+
+/// Data sent to `prepare()` on the first frame to create and upload the A/B/C canvases.
+///
+/// The canvas UUIDs are generated up front in `begin_raster_workspace()` (UI thread).
+/// The actual `wgpu::Texture` creation and pixel upload happens in `prepare()`.
+pub struct WorkspaceInitPacket {
+    /// A canvas UUID (same value as `RasterWorkspace::a_canvas_id`).
+    pub a_canvas_id: Uuid,
+    /// Pixel data to upload to A.  Length must equal `width * height * 4`.
+    pub a_pixels: Vec<u8>,
+    /// B canvas UUID (same value as `RasterWorkspace::b_canvas_id`).
+    pub b_canvas_id: Uuid,
+    /// C canvas UUID (same value as `RasterWorkspace::c_canvas_id`).
+    pub c_canvas_id: Uuid,
+    pub width:  u32, // pixel width shared by A, B and C
+    pub height: u32, // pixel height shared by A, B and C
+}
+
+// ── ActiveToolRender ──────────────────────────────────────────────────────────
+
+/// Passed to `VelloRenderContext` so the compositor can blit the tool's B output
+/// in the correct position in the layer stack.  Built each frame from the active workspace.
+///
+/// While an `ActiveToolRender` is set:
+/// - If `layer_id == Some(id)`: blit B at that layer's compositor slot.
+/// - If `layer_id == None`: blit B at the float's compositor slot.
+#[derive(Clone, Debug)]
+pub struct ActiveToolRender {
+    /// B canvas to blit (the workspace's `b_canvas_id`).
+    pub b_canvas_id: Uuid,
+    /// Top-left position of the B canvas in document space (the workspace's `x` / `y`).
+    pub x: i32,
+    pub y: i32,
+    /// Pixel dimensions of the B canvas (the workspace's `width` / `height`).
+    pub width:  u32,
+    pub height: u32,
+    /// `Some(layer_id)` → workspace source is a layer; B replaces this layer's render slot.
+    /// `None`           → workspace source is the float; B replaces the float render slot.
+    pub layer_id: Option<Uuid>,
+}
+
+// ── PendingGpuWork ────────────────────────────────────────────────────────────
+
+/// A unit of GPU work to execute in `VelloCallback::prepare()`.
+///
+/// Tools compute dab lists and other CPU-side data in `update()` (UI thread),
+/// store them as a `Box<dyn PendingGpuWork>`, and return that work through
+/// `RasterTool::take_pending_gpu_work()` each frame.  `prepare()` then calls
+/// `execute()` with the render-thread `device`/`queue`/`gpu`.
+///
+/// `execute()` takes `&self` so the work object is not consumed by running it; it lives
+/// in the `VelloRenderContext` (which is immutable in `prepare()`).
+pub trait PendingGpuWork: Send + Sync {
+    fn execute( // runs the queued work against the brush engine
+        &self,
+        device: &wgpu::Device,
+        queue:  &wgpu::Queue,
+        gpu:    &mut crate::gpu_brush::GpuBrushEngine, // engine that owns the canvases
+    );
+}
+
+// ── RasterTool trait ──────────────────────────────────────────────────────────
+
+/// Unified interface for all raster tools.
+///
+/// All methods run on the UI thread.  They update the tool's internal state
+/// and store pending GPU op descriptors in fields that `StagePane` forwards
+/// to `VelloRenderContext` for execution by `VelloCallback::prepare()`.
+pub trait RasterTool: Send + Sync {
+    /// Called on **mousedown** after `begin_raster_workspace` has built the workspace and
+    /// reserved the A, B, and C canvas UUIDs (the textures are created later in `prepare()`).
+    /// The tool should initialise its internal state and optionally queue an initial GPU
+    /// dispatch (e.g. identity composite for transform so the handle frame appears immediately).
+    fn begin(
+        &mut self,
+        ws:       &RasterWorkspace,
+        pos:      egui::Vec2,
+        dt:       f32,
+        settings: &crate::tools::RasterToolSettings,
+    );
+
+    /// Called every frame while the pointer is held (including the first drag frame).
+    /// The tool should accumulate new work into C and queue a composite A+C→B pass.
+    /// `dt` is the elapsed time in seconds since the previous call; used by time-based
+    /// brushes (airbrush, etc.) to fire dabs at the correct rate when stationary.
+    fn update(
+        &mut self,
+        ws:       &RasterWorkspace,
+        pos:      egui::Vec2,
+        dt:       f32,
+        settings: &crate::tools::RasterToolSettings,
+    );
+
+    /// Called on **pointer release**.  Returns `true` if a GPU readback of B should
+    /// be performed and the result committed to the document.  Returns `false` if
+    /// the operation was a no-op (e.g. the pointer never moved).
+    fn finish(&mut self, ws: &RasterWorkspace) -> bool;
+
+    /// Called on **Escape** or tool switch mid-stroke.  The caller restores the
+    /// source pixels from `ws.before_pixels` without creating an undo entry; the
+    /// tool just cleans up internal state.
+    fn cancel(&mut self, ws: &RasterWorkspace);
+
+    /// Called once per frame (in the VelloCallback construction, UI thread) to
+    /// extract pending GPU work accumulated by `begin()` / `update()`.
+    ///
+    /// The tool clears its internal pending work and returns it.  `prepare()` on
+    /// the render thread then calls `work.execute()`.  Default: returns `None` (no GPU work).
+    fn take_pending_gpu_work(&mut self) -> Option<Box<dyn PendingGpuWork>> {
+        None
+    }
+}
+
+// ── BrushRasterTool ───────────────────────────────────────────────────────────
+
+use lightningbeam_core::brush_engine::{BrushEngine, GpuDab, StrokeState};
+use lightningbeam_core::brush_settings::BrushSettings;
+use lightningbeam_core::raster_layer::{RasterBlendMode, StrokePoint, StrokeRecord};
+
+/// One frame's worth of brush GPU work: render the frame's dabs into C, then
+/// composite A+C→B.
+struct PendingBrushWork {
+    /// Dabs to render this frame; may be empty (composite-only work).
+    dabs: Vec<GpuDab>,
+    /// Bounding box handed to `render_dabs` together with `dabs`.
+    bbox: (i32, i32, i32, i32),
+    /// Canvas A id (composite input).
+    a_id: Uuid,
+    /// Canvas B id (composite output).
+    b_id: Uuid,
+    /// Canvas C id (dab accumulation target and composite input).
+    c_id: Uuid,
+    /// Canvas width forwarded to both GPU calls.
+    canvas_w: u32,
+    /// Canvas height forwarded to both GPU calls.
+    canvas_h: u32,
+}
+
+impl PendingGpuWork for PendingBrushWork {
+    /// Renders any queued dabs into C and then unconditionally composites
+    /// A + C → B.
+    fn execute(
+        &self,
+        device: &wgpu::Device,
+        queue: &wgpu::Queue,
+        gpu: &mut crate::gpu_brush::GpuBrushEngine,
+    ) {
+        let Self { dabs, bbox, a_id, b_id, c_id, canvas_w, canvas_h } = self;
+        let (width, height) = (*canvas_w, *canvas_h);
+
+        // Step 1: only touch C when this frame actually produced dabs.
+        if !dabs.is_empty() {
+            gpu.render_dabs(device, queue, *c_id, dabs, *bbox, width, height);
+        }
+
+        // Step 2: the composite always runs, so B shows A's content even on a
+        // frame without dabs.  On begin() with an empty C this initialises
+        // B = A, avoiding a transparent flash.
+        gpu.composite_a_c_to_b(device, queue, *a_id, *c_id, *b_id, width, height);
+    }
+}
+
+/// Raster tool for paint brushes (Normal blend mode).
+///
+/// Each `update()` call computes new dabs for that frame and stores them as
+/// `PendingBrushWork`.  `take_pending_gpu_work()` hands the work to `prepare()`
+/// which dispatches the dab and composite shaders on the render thread.
+pub struct BrushRasterTool {
+    /// Paint colour copied into every `StrokeRecord` this tool builds.
+    color:        [f32; 4],
+    /// Brush settings cloned into every `StrokeRecord` this tool builds.
+    brush:        BrushSettings,
+    /// Blend mode copied into every `StrokeRecord` this tool builds.
+    blend_mode:   RasterBlendMode,
+    /// Engine state passed mutably to `BrushEngine::compute_dabs` on every call.
+    stroke_state: StrokeState,
+    /// Previous stroke point; paired with the next point to form a 2-point segment.
+    last_point:   Option<StrokePoint>,
+    /// GPU work queued by `begin()` / `update()` until `take_pending_gpu_work()` drains it.
+    pending:      Option<Box<PendingBrushWork>>,
+    /// True after at least one non-empty frame (so finish() knows a commit is needed).
+    has_dabs:     bool,
+    /// Offset to convert world coordinates to canvas-local coordinates.
+    /// Recorded from `ws.x` / `ws.y` in `begin()`.
+    canvas_offset_x: i32,
+    canvas_offset_y: i32,
+}
+
+impl BrushRasterTool {
+    /// Builds a brush tool with fresh stroke state and no queued GPU work.
+    ///
+    /// `color` — linear premultiplied RGBA, matches the format expected by `GpuDab`.
+    pub fn new(color: [f32; 4], brush: BrushSettings, blend_mode: RasterBlendMode) -> Self {
+        Self {
+            color,
+            brush,
+            blend_mode,
+            stroke_state: StrokeState::new(),
+            last_point: None,
+            pending: None,
+            has_dabs: false,
+            canvas_offset_x: 0,
+            canvas_offset_y: 0,
+        }
+    }
+
+    /// Converts `pos` to canvas-local coordinates by subtracting the offset and
+    /// wraps it in a `StrokePoint` with full pressure, no tilt and a zero
+    /// timestamp.
+    fn make_stroke_point(pos: egui::Vec2, off_x: i32, off_y: i32) -> StrokePoint {
+        let local_x = pos.x - off_x as f32;
+        let local_y = pos.y - off_y as f32;
+        StrokePoint {
+            x: local_x,
+            y: local_y,
+            pressure: 1.0,
+            tilt_x: 0.0,
+            tilt_y: 0.0,
+            timestamp: 0.0,
+        }
+    }
+
+    /// Runs the brush engine for the segment ending at `pt` and, when it
+    /// yields dabs, replaces the queued `PendingBrushWork` with this frame's.
+    ///
+    /// `pt` always becomes the new `last_point`, whether or not dabs fired.
+    fn dispatch_dabs(&mut self, ws: &RasterWorkspace, pt: StrokePoint, dt: f32) {
+        // With a previous point the engine gets a 2-point segment and can
+        // interpolate dabs along it; the very first point is sent alone.
+        let mut segment = Vec::with_capacity(2);
+        segment.extend(self.last_point.take());
+        segment.push(pt.clone());
+
+        let record = StrokeRecord {
+            brush_settings: self.brush.clone(),
+            color: self.color,
+            blend_mode: self.blend_mode,
+            tool_params: [0.0; 4],
+            points: segment,
+        };
+        let (dabs, bbox) = BrushEngine::compute_dabs(&record, &mut self.stroke_state, dt);
+        self.last_point = Some(pt);
+
+        if dabs.is_empty() {
+            return;
+        }
+
+        self.has_dabs = true;
+        let work = PendingBrushWork {
+            dabs,
+            bbox,
+            a_id: ws.a_canvas_id,
+            b_id: ws.b_canvas_id,
+            c_id: ws.c_canvas_id,
+            canvas_w: ws.width,
+            canvas_h: ws.height,
+        };
+        self.pending = Some(Box::new(work));
+    }
+}
+
+impl RasterTool for BrushRasterTool {
+    /// Starts a new stroke at `pos`.
+    ///
+    /// All per-stroke state is reset first, so an instance that is reused
+    /// (or was cancelled) never interpolates from the previous stroke's last
+    /// point and never reports stale dabs from `finish()`.  The first point is
+    /// then dispatched with `dt = 0`, and a composite-only job is queued if no
+    /// dab fired.
+    fn begin(&mut self, ws: &RasterWorkspace, pos: egui::Vec2, _dt: f32, _settings: &crate::tools::RasterToolSettings) {
+        // Fresh stroke: drop anything left over from an earlier stroke.
+        self.stroke_state = StrokeState::new();
+        self.last_point = None;
+        self.pending = None;
+        self.has_dabs = false;
+
+        self.canvas_offset_x = ws.x;
+        self.canvas_offset_y = ws.y;
+        let pt = Self::make_stroke_point(pos, ws.x, ws.y);
+        self.dispatch_dabs(ws, pt, 0.0);
+        // Always ensure a composite is queued on begin() so B is initialised from A
+        // on the first frame even if no dabs fired (large spacing, etc.).
+        if self.pending.is_none() {
+            self.pending = Some(Box::new(PendingBrushWork {
+                dabs:     vec![],
+                bbox:     (0, 0, ws.width as i32, ws.height as i32),
+                a_id:     ws.a_canvas_id,
+                b_id:     ws.b_canvas_id,
+                c_id:     ws.c_canvas_id,
+                canvas_w: ws.width,
+                canvas_h: ws.height,
+            }));
+        }
+    }
+
+    /// Extends the stroke to `pos`, passing `dt` through to the brush engine.
+    fn update(&mut self, ws: &RasterWorkspace, pos: egui::Vec2, dt: f32, _settings: &crate::tools::RasterToolSettings) {
+        let pt = Self::make_stroke_point(pos, ws.x, ws.y);
+        self.dispatch_dabs(ws, pt, dt);
+    }
+
+    /// Returns `true` when at least one dab was produced during the stroke.
+    fn finish(&mut self, _ws: &RasterWorkspace) -> bool {
+        self.has_dabs
+    }
+
+    /// Discards queued GPU work and every piece of per-stroke state, leaving
+    /// the tool as if no stroke had been started.
+    fn cancel(&mut self, _ws: &RasterWorkspace) {
+        self.pending = None;
+        self.has_dabs = false;
+        // Without these two resets a later stroke would start with the
+        // cancelled stroke's last point and engine state.
+        self.last_point = None;
+        self.stroke_state = StrokeState::new();
+    }
+
+    /// Moves the queued brush work (if any) out of the tool.
+    fn take_pending_gpu_work(&mut self) -> Option<Box<dyn PendingGpuWork>> {
+        self.pending.take().map(|w| w as Box<dyn PendingGpuWork>)
+    }
+}
+
+// ── EffectBrushTool ───────────────────────────────────────────────────────────
+
+/// Raster tool for effect brushes (Blur, Sharpen, Dodge, Burn, Sponge, Desaturate).
+///
+/// C holds a per-pixel influence weight in its R channel (0–255).  The
+/// composite pass applies the effect to A, scaled by C.r, and writes to B:
+///   `B = lerp(A, effect(A), C.r)`
+///
+/// Treating C as an influence map (instead of accumulating modified pixels)
+/// keeps overlapping dabs from compounding the effect beyond the C.r cap (255).
+///
+/// # GPU implementation (TODO)
+/// Needs a dedicated `effect_brush_composite.wgsl` shader that reads A and C,
+/// applies the blend-mode-specific filter to A, and blends by C.r → B.
+pub struct EffectBrushTool {
+    /// Brush settings supplied at construction.
+    brush: BrushSettings,
+    /// Blend mode supplied at construction.
+    blend_mode: RasterBlendMode,
+    /// Set by `update()`, reported by `finish()`, cleared by `cancel()`.
+    has_dabs: bool,
+}
+
+impl EffectBrushTool {
+    /// Creates an effect brush that has not produced any dabs yet.
+    pub fn new(brush: BrushSettings, blend_mode: RasterBlendMode) -> Self {
+        Self {
+            has_dabs: false,
+            brush,
+            blend_mode,
+        }
+    }
+}
+
+// GPU shaders not yet implemented; take_pending_gpu_work returns None (default).
+impl RasterTool for EffectBrushTool {
+    /// No-op until the GPU path exists.
+    fn begin(
+        &mut self,
+        _ws: &RasterWorkspace,
+        _pos: egui::Vec2,
+        _dt: f32,
+        _settings: &crate::tools::RasterToolSettings,
+    ) {
+    }
+
+    /// Placeholder: only records that the pointer was dragged.
+    ///
+    /// NOTE(review): no GPU work is queued by this tool, yet this flag makes
+    /// `finish()` request a readback of B — confirm the caller tolerates that.
+    fn update(
+        &mut self,
+        _ws: &RasterWorkspace,
+        _pos: egui::Vec2,
+        _dt: f32,
+        _settings: &crate::tools::RasterToolSettings,
+    ) {
+        self.has_dabs = true; // placeholder
+    }
+
+    /// Reports whether `update()` ran since construction or the last cancel.
+    fn finish(&mut self, _ws: &RasterWorkspace) -> bool {
+        self.has_dabs
+    }
+
+    /// Clears the drag flag.
+    fn cancel(&mut self, _ws: &RasterWorkspace) {
+        self.has_dabs = false;
+    }
+}
+
+// ── SmudgeTool ────────────────────────────────────────────────────────────────
+
+/// Raster tool for the smudge brush.
+///
+/// Intended flow:
+/// - `begin()`: copy A → C so C starts with the source pixels for color pickup.
+/// - `update()`: dispatch smudge dabs using `blend_mode=2` (reads C as source,
+///   writes smear to C); then composite C over A → B.
+///   Because the smudge shader reads from `canvas_src` (C.src) and writes to
+///   `canvas_dst` (C.dst), existing dabs are preserved in the smear history.
+///
+/// # GPU implementation (TODO)
+/// Needs an initial A → C copy in `begin()` (via GPU copy command).
+/// The smudge dab dispatch then uses `render_dabs(c_id, smudge_dabs, ...)`.
+/// The composite pass is `composite_a_c_to_b` (same as BrushRasterTool).
+pub struct SmudgeTool {
+    /// Brush settings supplied at construction.
+    brush: BrushSettings,
+    /// Set by `update()`, reported by `finish()`, cleared by `cancel()`.
+    has_dabs: bool,
+}
+
+impl SmudgeTool {
+    /// Creates a smudge tool that has not produced any dabs yet.
+    pub fn new(brush: BrushSettings) -> Self {
+        Self {
+            has_dabs: false,
+            brush,
+        }
+    }
+}
+
+// GPU shaders not yet implemented; take_pending_gpu_work returns None (default).
+impl RasterTool for SmudgeTool {
+    /// No-op until the GPU path exists.
+    fn begin(
+        &mut self,
+        _ws: &RasterWorkspace,
+        _pos: egui::Vec2,
+        _dt: f32,
+        _settings: &crate::tools::RasterToolSettings,
+    ) {
+    }
+
+    /// Placeholder: only records that the pointer was dragged.
+    ///
+    /// NOTE(review): no GPU work is queued by this tool, yet this flag makes
+    /// `finish()` request a readback of B — confirm the caller tolerates that.
+    fn update(
+        &mut self,
+        _ws: &RasterWorkspace,
+        _pos: egui::Vec2,
+        _dt: f32,
+        _settings: &crate::tools::RasterToolSettings,
+    ) {
+        self.has_dabs = true; // placeholder
+    }
+
+    /// Reports whether `update()` ran since construction or the last cancel.
+    fn finish(&mut self, _ws: &RasterWorkspace) -> bool {
+        self.has_dabs
+    }
+
+    /// Clears the drag flag.
+    fn cancel(&mut self, _ws: &RasterWorkspace) {
+        self.has_dabs = false;
+    }
+}
+
+// ── GradientRasterTool ────────────────────────────────────────────────────────
+
+use crate::gpu_brush::GpuGradientStop;
+use lightningbeam_core::gradient::{GradientExtend, GradientType, ShapeGradient};
+
+/// Converts every stop of `gradient` into a `GpuGradientStop`, preserving
+/// order, via `GpuGradientStop::from_srgb_u8(position, r, g, b, a)`.
+fn gradient_stops_to_gpu(gradient: &ShapeGradient) -> Vec<GpuGradientStop> {
+    let stops = gradient.stops.iter();
+    let mut gpu_stops = Vec::with_capacity(stops.size_hint().0);
+    for stop in stops {
+        let rgba = &stop.color;
+        gpu_stops.push(GpuGradientStop::from_srgb_u8(
+            stop.position,
+            rgba.r,
+            rgba.g,
+            rgba.b,
+            rgba.a,
+        ));
+    }
+    gpu_stops
+}
+
+/// Maps a `GradientExtend` to the integer code stored in
+/// `PendingGradientWork::extend_mode`.
+///
+/// NOTE(review): presumably these codes must match the gradient shader's
+/// constants — confirm against the shader source.
+fn gradient_extend_to_u32(extend: GradientExtend) -> u32 {
+    const PAD: u32 = 0;
+    const REFLECT: u32 = 1;
+    const REPEAT: u32 = 2;
+
+    match extend {
+        GradientExtend::Pad => PAD,
+        GradientExtend::Reflect => REFLECT,
+        GradientExtend::Repeat => REPEAT,
+    }
+}
+
+/// Maps a `GradientType` to the integer code stored in
+/// `PendingGradientWork::kind`.
+///
+/// NOTE(review): presumably these codes must match the gradient shader's
+/// constants — confirm against the shader source.
+fn gradient_kind_to_u32(kind: GradientType) -> u32 {
+    const LINEAR: u32 = 0;
+    const RADIAL: u32 = 1;
+
+    match kind {
+        GradientType::Linear => LINEAR,
+        GradientType::Radial => RADIAL,
+    }
+}
+
+/// Queued gradient fill: everything `apply_gradient_fill` needs for one call.
+struct PendingGradientWork {
+    /// Canvas A id, passed as the first canvas argument.
+    a_id: Uuid,
+    /// Canvas B id, passed as the second canvas argument.
+    b_id: Uuid,
+    /// Gradient stops already converted to their GPU representation.
+    stops: Vec<GpuGradientStop>,
+    /// Canvas-local start point of the gradient.
+    start: (f32, f32),
+    /// Canvas-local end point of the gradient.
+    end: (f32, f32),
+    /// Opacity taken from the tool settings.
+    opacity: f32,
+    /// Integer code produced by `gradient_extend_to_u32`.
+    extend_mode: u32,
+    /// Integer code produced by `gradient_kind_to_u32`.
+    kind: u32,
+}
+
+impl PendingGpuWork for PendingGradientWork {
+    /// Forwards the stored parameters to `GpuBrushEngine::apply_gradient_fill`.
+    fn execute(
+        &self,
+        device: &wgpu::Device,
+        queue: &wgpu::Queue,
+        gpu: &mut crate::gpu_brush::GpuBrushEngine,
+    ) {
+        let Self { a_id, b_id, stops, start, end, opacity, extend_mode, kind } = self;
+        gpu.apply_gradient_fill(
+            device,
+            queue,
+            a_id,
+            b_id,
+            stops,
+            *start,
+            *end,
+            *opacity,
+            *extend_mode,
+            *kind,
+        );
+    }
+}
+
+/// Raster tool for gradient fills.
+///
+/// `begin()` records the canvas-local start position.
+/// `update()` recomputes gradient parameters from settings and queues a
+/// `PendingGradientWork` that calls `apply_gradient_fill` in `prepare()`.
+/// `finish()` returns whether any gradient was dispatched.
+pub struct GradientRasterTool {
+    /// Canvas-local position where the drag started.
+    start_canvas:   egui::Vec2,
+    /// Canvas-local position of the most recent pointer sample.
+    end_canvas:     egui::Vec2,
+    /// Gradient fill queued by `update()` until `take_pending_gpu_work()` drains it.
+    pending:        Option<Box<PendingGradientWork>>,
+    /// True once `update()` has queued at least one fill.
+    has_dispatched: bool,
+}
+
+impl GradientRasterTool {
+    /// Creates an idle gradient tool: both endpoints at the origin, nothing
+    /// queued and nothing dispatched.
+    pub fn new() -> Self {
+        Self {
+            start_canvas:   egui::Vec2::ZERO,
+            end_canvas:     egui::Vec2::ZERO,
+            pending:        None,
+            has_dispatched: false,
+        }
+    }
+}
+
+impl Default for GradientRasterTool {
+    /// Equivalent to [`GradientRasterTool::new`].
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RasterTool for GradientRasterTool {
+    /// Starts a new gradient drag at `pos`.
+    ///
+    /// Per-drag state is cleared first so a reused instance cannot report a
+    /// previous drag's dispatch from `finish()` or leak stale queued work.
+    /// Both endpoints are then set to the canvas-local click position.
+    fn begin(&mut self, ws: &RasterWorkspace, pos: egui::Vec2, _dt: f32, _settings: &crate::tools::RasterToolSettings) {
+        self.pending        = None;
+        self.has_dispatched = false;
+
+        let canvas_pos = pos - egui::vec2(ws.x as f32, ws.y as f32);
+        self.start_canvas = canvas_pos;
+        self.end_canvas   = canvas_pos;
+    }
+
+    /// Moves the gradient end point to `pos` and queues a fill built from the
+    /// current `settings`, replacing any fill still waiting to be taken.
+    fn update(&mut self, ws: &RasterWorkspace, pos: egui::Vec2, _dt: f32, settings: &crate::tools::RasterToolSettings) {
+        self.end_canvas = pos - egui::vec2(ws.x as f32, ws.y as f32);
+        let gradient = &settings.gradient;
+        self.pending = Some(Box::new(PendingGradientWork {
+            a_id:        ws.a_canvas_id,
+            b_id:        ws.b_canvas_id,
+            stops:       gradient_stops_to_gpu(gradient),
+            start:       (self.start_canvas.x, self.start_canvas.y),
+            end:         (self.end_canvas.x,   self.end_canvas.y),
+            opacity:     settings.gradient_opacity,
+            extend_mode: gradient_extend_to_u32(gradient.extend),
+            kind:        gradient_kind_to_u32(gradient.kind),
+        }));
+        self.has_dispatched = true;
+    }
+
+    /// Returns `true` when `update()` queued at least one fill during this drag.
+    fn finish(&mut self, _ws: &RasterWorkspace) -> bool { self.has_dispatched }
+
+    /// Drops any queued fill and clears the dispatch flag.
+    fn cancel(&mut self, _ws: &RasterWorkspace) {
+        self.pending        = None;
+        self.has_dispatched = false;
+    }
+
+    /// Moves the queued gradient fill (if any) out of the tool.
+    fn take_pending_gpu_work(&mut self) -> Option<Box<dyn PendingGpuWork>> {
+        self.pending.take().map(|w| w as Box<dyn PendingGpuWork>)
+    }
+}
+
+// ── TransformRasterTool ───────────────────────────────────────────────────────
+
+use crate::gpu_brush::RasterTransformGpuParams;
+
+/// Queued transform pass: the two canvas ids plus the parameters handed to
+/// `render_transform`.
+struct PendingTransformWork {
+    /// Canvas A id, passed as the first canvas argument.
+    a_id: Uuid,
+    /// Canvas B id, passed as the second canvas argument.
+    b_id: Uuid,
+    /// Transform parameters passed by value to the GPU engine.
+    params: RasterTransformGpuParams,
+}
+
+impl PendingGpuWork for PendingTransformWork {
+    /// Forwards the stored parameters to `GpuBrushEngine::render_transform`.
+    fn execute(
+        &self,
+        device: &wgpu::Device,
+        queue: &wgpu::Queue,
+        gpu: &mut crate::gpu_brush::GpuBrushEngine,
+    ) {
+        let Self { a_id, b_id, params } = self;
+        gpu.render_transform(device, queue, a_id, b_id, *params);
+    }
+}
+
+/// Raster tool for affine transforms (move, scale, rotate, shear).
+///
+/// `begin()` stores the initial canvas dimensions and queues an identity
+/// transform so B is initialised on the first frame.
+/// `update()` recomputes the inverse affine matrix from the current handle
+/// positions and queues a new `PendingTransformWork`.
+///
+/// The inverse matrix maps output pixel coordinates back to source pixel
+/// coordinates:  `src = M_inv * dst + b`
+/// where `M_inv = [[a00, a01], [a10, a11]]` and `b = [b0, b1]`.
+///
+/// # GPU implementation
+/// Fully wired — uses `GpuBrushEngine::render_transform`.  Handle interaction
+/// logic (drag, rotate, scale) is handled by the tool's `update()` caller in
+/// `stage.rs` which computes and passes in the `RasterTransformGpuParams`.
+pub struct TransformRasterTool {
+    /// Transform queued by `set_transform()` until `take_pending_gpu_work()` drains it.
+    pending:        Option<Box<PendingTransformWork>>,
+    /// True once `set_transform()` has queued at least one transform.
+    has_dispatched: bool,
+    /// Canvas width recorded from the workspace in `begin()`.
+    canvas_w:       u32,
+    /// Canvas height recorded from the workspace in `begin()`.
+    canvas_h:       u32,
+}
+
+impl TransformRasterTool {
+    /// Creates an idle transform tool: nothing queued, nothing dispatched and
+    /// zero canvas dimensions.
+    pub fn new() -> Self {
+        Self {
+            pending:        None,
+            has_dispatched: false,
+            canvas_w:       0,
+            canvas_h:       0,
+        }
+    }
+
+    /// Queue a transform with the given inverse-affine matrix.
+    /// Called by the stage handler after computing handle positions.
+    ///
+    /// Replaces any transform that is still waiting to be taken and marks the
+    /// tool as having dispatched work.
+    pub fn set_transform(
+        &mut self,
+        ws:     &RasterWorkspace,
+        params: RasterTransformGpuParams,
+    ) {
+        self.pending = Some(Box::new(PendingTransformWork {
+            a_id:   ws.a_canvas_id,
+            b_id:   ws.b_canvas_id,
+            params,
+        }));
+        self.has_dispatched = true;
+    }
+}
+
+impl Default for TransformRasterTool {
+    /// Equivalent to [`TransformRasterTool::new`].
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RasterTool for TransformRasterTool {
+    /// Records the workspace dimensions and queues an identity transform so B
+    /// shows the source immediately.
+    fn begin(
+        &mut self,
+        ws: &RasterWorkspace,
+        _pos: egui::Vec2,
+        _dt: f32,
+        _settings: &crate::tools::RasterToolSettings,
+    ) {
+        let (width, height) = (ws.width, ws.height);
+        self.canvas_w = width;
+        self.canvas_h = height;
+
+        // Identity matrix, zero translation, same source and destination size.
+        self.set_transform(
+            ws,
+            RasterTransformGpuParams {
+                a00: 1.0,
+                a01: 0.0,
+                a10: 0.0,
+                a11: 1.0,
+                b0: 0.0,
+                b1: 0.0,
+                src_w: width,
+                src_h: height,
+                dst_w: width,
+                dst_h: height,
+                _pad0: 0,
+                _pad1: 0,
+            },
+        );
+    }
+
+    /// Intentionally empty: handle interaction and matrix updates are driven
+    /// from stage.rs via `set_transform()`.
+    fn update(
+        &mut self,
+        _ws: &RasterWorkspace,
+        _pos: egui::Vec2,
+        _dt: f32,
+        _settings: &crate::tools::RasterToolSettings,
+    ) {
+    }
+
+    /// Returns `true` once any transform (including the identity queued by
+    /// `begin()`) has been queued.
+    fn finish(&mut self, _ws: &RasterWorkspace) -> bool {
+        self.has_dispatched
+    }
+
+    /// Clears the dispatch flag and drops any queued transform.
+    fn cancel(&mut self, _ws: &RasterWorkspace) {
+        self.has_dispatched = false;
+        self.pending = None;
+    }
+
+    /// Moves the queued transform (if any) out of the tool.
+    fn take_pending_gpu_work(&mut self) -> Option<Box<dyn PendingGpuWork>> {
+        match self.pending.take() {
+            Some(work) => Some(work as Box<dyn PendingGpuWork>),
+            None => None,
+        }
+    }
+}
+
+// ── WarpRasterTool ────────────────────────────────────────────────────────────
+
+/// Raster tool for warp / mesh deformation.
+///
+/// Uses a displacement buffer (managed by `GpuBrushEngine`) that maps each
+/// output pixel to a source offset.  The displacement grid is updated by
+/// dragging control points; the warp shader reads anchor pixels + displacement
+/// → B each frame.
+///
+/// # GPU implementation (TODO)
+/// Requires: `create_displacement_buf`, `apply_warp` already exist in
+/// `GpuBrushEngine`.  Wire brush-drag interaction to update displacement
+/// entries and call `apply_warp`.
+pub struct WarpRasterTool {
+    /// Set by `update()`, reported by `finish()`, cleared by `cancel()`.
+    has_dispatched: bool,
+}
+
+impl WarpRasterTool {
+    /// Creates a warp tool that has not dispatched anything yet.
+    pub fn new() -> Self { Self { has_dispatched: false } }
+}
+
+impl Default for WarpRasterTool {
+    /// Equivalent to [`WarpRasterTool::new`].
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RasterTool for WarpRasterTool {
+    /// No-op until the GPU path is wired.
+    fn begin(&mut self, _ws: &RasterWorkspace, _pos: egui::Vec2, _dt: f32, _settings: &crate::tools::RasterToolSettings) {}
+    /// Placeholder: only records that the pointer was dragged.
+    ///
+    /// NOTE(review): no GPU work is queued by this tool, yet this flag makes
+    /// `finish()` request a readback of B — confirm the caller tolerates that.
+    fn update(&mut self, _ws: &RasterWorkspace, _pos: egui::Vec2, _dt: f32, _settings: &crate::tools::RasterToolSettings) {
+        self.has_dispatched = true; // placeholder
+    }
+    /// Reports whether `update()` ran since construction or the last cancel.
+    fn finish(&mut self, _ws: &RasterWorkspace) -> bool { self.has_dispatched }
+    /// Clears the dispatch flag.
+    fn cancel(&mut self, _ws: &RasterWorkspace) { self.has_dispatched = false; }
+    // take_pending_gpu_work: default (None) — full GPU wiring is TODO.
+}
+
+// ── LiquifyRasterTool ─────────────────────────────────────────────────────────
+
+/// Raster tool for liquify (per-pixel displacement painting).
+///
+/// Similar to `WarpRasterTool` but uses a full per-pixel displacement map
+/// (grid_cols = grid_rows = 0 in `apply_warp`) painted by brush strokes.
+/// Each dab accumulates displacement in the push/pull/swirl direction.
+///
+/// # GPU implementation (TODO)
+/// Requires: a dab-to-displacement shader that accumulates per-pixel offsets
+/// into the displacement buffer, then `apply_warp` reads it → B.
+pub struct LiquifyRasterTool {
+    /// Set by `update()`, reported by `finish()`, cleared by `cancel()`.
+    has_dispatched: bool,
+}
+
+impl LiquifyRasterTool {
+    /// Creates a liquify tool that has not dispatched anything yet.
+    pub fn new() -> Self { Self { has_dispatched: false } }
+}
+
+impl Default for LiquifyRasterTool {
+    /// Equivalent to [`LiquifyRasterTool::new`].
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RasterTool for LiquifyRasterTool {
+    /// No-op until the GPU path is wired.
+    fn begin(&mut self, _ws: &RasterWorkspace, _pos: egui::Vec2, _dt: f32, _settings: &crate::tools::RasterToolSettings) {}
+    /// Placeholder: only records that the pointer was dragged.
+    ///
+    /// NOTE(review): no GPU work is queued by this tool, yet this flag makes
+    /// `finish()` request a readback of B — confirm the caller tolerates that.
+    fn update(&mut self, _ws: &RasterWorkspace, _pos: egui::Vec2, _dt: f32, _settings: &crate::tools::RasterToolSettings) {
+        self.has_dispatched = true; // placeholder
+    }
+    /// Reports whether `update()` ran since construction or the last cancel.
+    fn finish(&mut self, _ws: &RasterWorkspace) -> bool { self.has_dispatched }
+    /// Clears the dispatch flag.
+    fn cancel(&mut self, _ws: &RasterWorkspace) { self.has_dispatched = false; }
+    // take_pending_gpu_work: default (None) — full GPU wiring is TODO.
+}
+
+// ── SelectionTool ─────────────────────────────────────────────────────────────
+
+/// Raster selection tool (Magic Wand / Quick Select).
+///
+/// C (RGBA8) acts as the growing selection; C.r = mask value (0 or 255).
+/// Each `update()` frame a flood-fill / region-grow shader extends C.r.
+/// The composite pass draws A + a tinted overlay from C.r → B so the user
+/// sees the growing selection boundary.
+///
+/// `finish()` returns false (commit does not write pixels back to the layer;
+/// instead the caller extracts C.r into the standalone `R8Unorm` selection
+/// texture via `shared.raster_selection`).
+///
+/// # GPU implementation (TODO)
+/// Requires: a flood-fill compute shader seeded by the click position that
+/// grows the selection in C.r; and a composite shader that tints selected
+/// pixels blue/cyan for preview.
+pub struct SelectionTool {
+    /// Set by `update()` and cleared by `cancel()`; `finish()` does not
+    /// consult it.
+    has_selection: bool,
+}
+
+impl SelectionTool {
+    /// Creates a selection tool with no selection recorded.
+    pub fn new() -> Self { Self { has_selection: false } }
+}
+
+impl Default for SelectionTool {
+    /// Equivalent to [`SelectionTool::new`].
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RasterTool for SelectionTool {
+    /// No-op until the GPU path is wired.
+    fn begin(&mut self, _ws: &RasterWorkspace, _pos: egui::Vec2, _dt: f32, _settings: &crate::tools::RasterToolSettings) {}
+    /// Placeholder: only records that the pointer was dragged.
+    fn update(&mut self, _ws: &RasterWorkspace, _pos: egui::Vec2, _dt: f32, _settings: &crate::tools::RasterToolSettings) {
+        self.has_selection = true; // placeholder
+    }
+    /// Selection tools never trigger a pixel readback/commit on mouseup.
+    /// The caller reads C.r directly into the selection mask texture.
+    fn finish(&mut self, _ws: &RasterWorkspace) -> bool { false }
+    /// Clears the selection flag.
+    fn cancel(&mut self, _ws: &RasterWorkspace) { self.has_selection = false; }
+    // take_pending_gpu_work: default (None) — full GPU wiring is TODO.
+}