//! Video decoding and management for Lightningbeam
|
|
//!
|
|
//! This module provides FFmpeg-based video decoding with LRU frame caching
|
|
//! for efficient video playback and preview.
|
|
|
|
use std::sync::{Arc, Mutex};
|
|
use std::num::NonZeroUsize;
|
|
use std::collections::HashMap;
|
|
use ffmpeg_next as ffmpeg;
|
|
use lru::LruCache;
|
|
use uuid::Uuid;
|
|
|
|
/// Metadata about a video file
#[derive(Debug, Clone)]
pub struct VideoMetadata {
    /// Frame width in pixels (original, unscaled).
    pub width: u32,
    /// Frame height in pixels (original, unscaled).
    pub height: u32,
    /// Average frame rate in frames per second.
    pub fps: f64,
    /// Duration in seconds; 0.0 when the container reports no duration.
    pub duration: f64,
    /// Whether the file contains at least one audio stream.
    pub has_audio: bool,
}
|
|
|
|
/// Video decoder with LRU frame caching
pub struct VideoDecoder {
    /// Source file path; the container is reopened from this path on every seek.
    path: String,
    _width: u32, // Original video width (kept for reference, unused after construction)
    _height: u32, // Original video height (kept for reference, unused after construction)
    output_width: u32, // Scaled output width
    output_height: u32, // Scaled output height
    /// Average frame rate; used to round requested timestamps to frame boundaries.
    fps: f64,
    _duration: f64, // Duration in seconds (kept for reference, unused after construction)
    /// Stream time base in seconds per tick (converts seconds <-> stream timestamps).
    time_base: f64,
    /// Index of the video stream inside the container.
    stream_index: usize,
    frame_cache: LruCache<i64, Vec<u8>>, // timestamp -> tightly packed RGBA data
    /// Demuxer context; None until the first decode/seek opens it.
    input: Option<ffmpeg::format::context::Input>,
    /// Video decoder; None until the first decode/seek opens it.
    decoder: Option<ffmpeg::decoder::Video>,
    last_decoded_ts: i64, // Track the last decoded frame timestamp (-1 before any decode)
    keyframe_positions: Vec<i64>, // Sorted keyframe timestamps for fast seeking (may be empty if deferred)
}
|
|
|
|
impl VideoDecoder {
|
|
/// Create a new video decoder
|
|
///
|
|
/// `max_width` and `max_height` specify the maximum output dimensions.
|
|
/// Video will be scaled down if larger, preserving aspect ratio.
|
|
/// `build_keyframes` controls whether to build the keyframe index immediately (slow)
|
|
/// or defer it for async building later.
|
|
fn new(path: String, cache_size: usize, max_width: Option<u32>, max_height: Option<u32>, build_keyframes: bool) -> Result<Self, String> {
|
|
ffmpeg::init().map_err(|e| e.to_string())?;
|
|
|
|
let input = ffmpeg::format::input(&path)
|
|
.map_err(|e| format!("Failed to open video: {}", e))?;
|
|
|
|
let video_stream = input.streams()
|
|
.best(ffmpeg::media::Type::Video)
|
|
.ok_or("No video stream found")?;
|
|
|
|
let stream_index = video_stream.index();
|
|
|
|
let context_decoder = ffmpeg::codec::context::Context::from_parameters(
|
|
video_stream.parameters()
|
|
).map_err(|e| e.to_string())?;
|
|
|
|
let decoder = context_decoder.decoder().video()
|
|
.map_err(|e| e.to_string())?;
|
|
|
|
let width = decoder.width();
|
|
let height = decoder.height();
|
|
let time_base = f64::from(video_stream.time_base());
|
|
|
|
// Calculate output dimensions (scale down if larger than max)
|
|
let (output_width, output_height) = if let (Some(max_w), Some(max_h)) = (max_width, max_height) {
|
|
// Calculate scale to fit within max dimensions while preserving aspect ratio
|
|
let scale = (max_w as f32 / width as f32).min(max_h as f32 / height as f32).min(1.0);
|
|
((width as f32 * scale) as u32, (height as f32 * scale) as u32)
|
|
} else {
|
|
(width, height)
|
|
};
|
|
|
|
// Try to get duration from stream, fallback to container
|
|
let duration = if video_stream.duration() > 0 {
|
|
video_stream.duration() as f64 * time_base
|
|
} else if input.duration() > 0 {
|
|
input.duration() as f64 / f64::from(ffmpeg::ffi::AV_TIME_BASE)
|
|
} else {
|
|
// If no duration available, estimate from frame count and fps
|
|
let fps = f64::from(video_stream.avg_frame_rate());
|
|
if video_stream.frames() > 0 && fps > 0.0 {
|
|
video_stream.frames() as f64 / fps
|
|
} else {
|
|
0.0 // Unknown duration
|
|
}
|
|
};
|
|
|
|
let fps = f64::from(video_stream.avg_frame_rate());
|
|
|
|
// Optionally build keyframe index for fast seeking
|
|
let keyframe_positions = if build_keyframes {
|
|
eprintln!("[Video Decoder] Building keyframe index for {}", path);
|
|
let positions = Self::build_keyframe_index(&path, stream_index)?;
|
|
eprintln!("[Video Decoder] Found {} keyframes", positions.len());
|
|
positions
|
|
} else {
|
|
eprintln!("[Video Decoder] Deferring keyframe index building for {}", path);
|
|
Vec::new()
|
|
};
|
|
|
|
Ok(Self {
|
|
path,
|
|
_width: width,
|
|
_height: height,
|
|
output_width,
|
|
output_height,
|
|
fps,
|
|
_duration: duration,
|
|
time_base,
|
|
stream_index,
|
|
frame_cache: LruCache::new(
|
|
NonZeroUsize::new(cache_size).unwrap()
|
|
),
|
|
input: None,
|
|
decoder: None,
|
|
last_decoded_ts: -1,
|
|
keyframe_positions,
|
|
})
|
|
}
|
|
|
|
/// Build keyframe index for this decoder
|
|
/// This can be called asynchronously after decoder creation
|
|
fn build_and_set_keyframe_index(&mut self) -> Result<(), String> {
|
|
eprintln!("[Video Decoder] Building keyframe index for {}", self.path);
|
|
let positions = Self::build_keyframe_index(&self.path, self.stream_index)?;
|
|
eprintln!("[Video Decoder] Found {} keyframes", positions.len());
|
|
self.keyframe_positions = positions;
|
|
Ok(())
|
|
}
|
|
|
|
/// Get the output width (scaled dimensions)
|
|
pub fn get_output_width(&self) -> u32 {
|
|
self.output_width
|
|
}
|
|
|
|
/// Get the output height (scaled dimensions)
|
|
pub fn get_output_height(&self) -> u32 {
|
|
self.output_height
|
|
}
|
|
|
|
/// Decode a frame at the specified timestamp (public wrapper)
|
|
pub fn decode_frame(&mut self, timestamp: f64) -> Result<Vec<u8>, String> {
|
|
self.get_frame(timestamp)
|
|
}
|
|
|
|
/// Build an index of all keyframe positions in the video
|
|
/// This enables fast seeking by knowing exactly where keyframes are
|
|
fn build_keyframe_index(path: &str, stream_index: usize) -> Result<Vec<i64>, String> {
|
|
let mut input = ffmpeg::format::input(path)
|
|
.map_err(|e| format!("Failed to open video for indexing: {}", e))?;
|
|
|
|
let mut keyframes = Vec::new();
|
|
|
|
// Scan through all packets to find keyframes
|
|
for (stream, packet) in input.packets() {
|
|
if stream.index() == stream_index {
|
|
// Check if this packet is a keyframe
|
|
if packet.is_key() {
|
|
if let Some(pts) = packet.pts() {
|
|
keyframes.push(pts);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Ensure keyframes are sorted (they should be already)
|
|
keyframes.sort_unstable();
|
|
|
|
Ok(keyframes)
|
|
}
|
|
|
|
/// Find the nearest keyframe at or before the target timestamp
|
|
/// Returns the keyframe timestamp, or 0 if target is before first keyframe
|
|
fn find_nearest_keyframe_before(&self, target_ts: i64) -> i64 {
|
|
// Binary search to find the largest keyframe <= target_ts
|
|
match self.keyframe_positions.binary_search(&target_ts) {
|
|
Ok(idx) => self.keyframe_positions[idx], // Exact match
|
|
Err(0) => 0, // Target is before first keyframe, seek to start
|
|
Err(idx) => self.keyframe_positions[idx - 1], // Use previous keyframe
|
|
}
|
|
}
|
|
|
|
/// Get a decoded frame at the specified timestamp
|
|
fn get_frame(&mut self, timestamp: f64) -> Result<Vec<u8>, String> {
|
|
use std::time::Instant;
|
|
let t_start = Instant::now();
|
|
|
|
// Round timestamp to nearest frame boundary to improve cache hits
|
|
// This ensures that timestamps like 1.0001s and 0.9999s both map to frame 1.0s
|
|
let frame_duration = 1.0 / self.fps;
|
|
let rounded_timestamp = (timestamp / frame_duration).round() * frame_duration;
|
|
|
|
// Convert timestamp to frame timestamp
|
|
let frame_ts = (rounded_timestamp / self.time_base) as i64;
|
|
|
|
// Check cache
|
|
if let Some(cached_frame) = self.frame_cache.get(&frame_ts) {
|
|
eprintln!("[Video Timing] Cache hit for ts={:.3}s ({}ms)", timestamp, t_start.elapsed().as_millis());
|
|
return Ok(cached_frame.clone());
|
|
}
|
|
|
|
// Determine if we need to seek
|
|
// Seek if: no decoder open, going backwards, or jumping forward more than 2 seconds
|
|
let need_seek = self.decoder.is_none()
|
|
|| frame_ts < self.last_decoded_ts
|
|
|| frame_ts > self.last_decoded_ts + (2.0 / self.time_base) as i64;
|
|
|
|
if need_seek {
|
|
let t_seek_start = Instant::now();
|
|
|
|
// Find the nearest keyframe at or before our target using the index
|
|
// This is the exact keyframe position, so we can seek directly to it
|
|
let keyframe_ts_stream = self.find_nearest_keyframe_before(frame_ts);
|
|
|
|
// Convert from stream timebase to AV_TIME_BASE (microseconds) for container-level seek
|
|
// input.seek() with stream=-1 expects AV_TIME_BASE units, not stream units
|
|
let keyframe_seconds = keyframe_ts_stream as f64 * self.time_base;
|
|
let keyframe_ts_av = (keyframe_seconds * 1_000_000.0) as i64; // AV_TIME_BASE = 1000000
|
|
|
|
eprintln!("[Video Seek] Target: {} | Keyframe(stream): {} | Keyframe(AV): {} | Index size: {}",
|
|
frame_ts, keyframe_ts_stream, keyframe_ts_av, self.keyframe_positions.len());
|
|
|
|
// Reopen input
|
|
let mut input = ffmpeg::format::input(&self.path)
|
|
.map_err(|e| format!("Failed to reopen video: {}", e))?;
|
|
|
|
// Seek directly to the keyframe with a 1-unit window
|
|
// Can't use keyframe_ts..keyframe_ts (empty) or ..= (not supported)
|
|
input.seek(keyframe_ts_av, keyframe_ts_av..(keyframe_ts_av + 1))
|
|
.map_err(|e| format!("Seek failed: {}", e))?;
|
|
|
|
eprintln!("[Video Timing] Seek call took {}ms", t_seek_start.elapsed().as_millis());
|
|
|
|
let context_decoder = ffmpeg::codec::context::Context::from_parameters(
|
|
input.streams().best(ffmpeg::media::Type::Video).unwrap().parameters()
|
|
).map_err(|e| e.to_string())?;
|
|
|
|
let decoder = context_decoder.decoder().video()
|
|
.map_err(|e| e.to_string())?;
|
|
|
|
self.input = Some(input);
|
|
self.decoder = Some(decoder);
|
|
// Set last_decoded_ts to just before the seek target so forward playback works
|
|
// Without this, every frame would trigger a new seek
|
|
self.last_decoded_ts = frame_ts - 1;
|
|
}
|
|
|
|
let input = self.input.as_mut().unwrap();
|
|
let decoder = self.decoder.as_mut().unwrap();
|
|
|
|
// Decode frames until we find the one closest to our target timestamp
|
|
let mut best_frame_data: Option<Vec<u8>> = None;
|
|
let mut best_frame_ts: Option<i64> = None;
|
|
let t_decode_start = Instant::now();
|
|
let mut decode_count = 0;
|
|
let mut scale_time_ms = 0u128;
|
|
|
|
for (stream, packet) in input.packets() {
|
|
if stream.index() == self.stream_index {
|
|
decoder.send_packet(&packet)
|
|
.map_err(|e| e.to_string())?;
|
|
|
|
let mut frame = ffmpeg::util::frame::Video::empty();
|
|
while decoder.receive_frame(&mut frame).is_ok() {
|
|
decode_count += 1;
|
|
let current_frame_ts = frame.timestamp().unwrap_or(0);
|
|
self.last_decoded_ts = current_frame_ts; // Update last decoded position
|
|
|
|
// Check if this frame is closer to our target than the previous best
|
|
let is_better = match best_frame_ts {
|
|
None => true,
|
|
Some(best_ts) => {
|
|
(current_frame_ts - frame_ts).abs() < (best_ts - frame_ts).abs()
|
|
}
|
|
};
|
|
|
|
if is_better {
|
|
let t_scale_start = Instant::now();
|
|
|
|
// Convert to RGBA and scale to output size
|
|
let mut scaler = ffmpeg::software::scaling::context::Context::get(
|
|
frame.format(),
|
|
frame.width(),
|
|
frame.height(),
|
|
ffmpeg::format::Pixel::RGBA,
|
|
self.output_width,
|
|
self.output_height,
|
|
ffmpeg::software::scaling::flag::Flags::BILINEAR,
|
|
).map_err(|e| e.to_string())?;
|
|
|
|
let mut rgb_frame = ffmpeg::util::frame::Video::empty();
|
|
scaler.run(&frame, &mut rgb_frame)
|
|
.map_err(|e| e.to_string())?;
|
|
|
|
// Remove stride padding to create tightly packed RGBA data
|
|
let width = self.output_width as usize;
|
|
let height = self.output_height as usize;
|
|
let stride = rgb_frame.stride(0);
|
|
let row_size = width * 4; // RGBA = 4 bytes per pixel
|
|
let source_data = rgb_frame.data(0);
|
|
|
|
let mut packed_data = Vec::with_capacity(row_size * height);
|
|
for y in 0..height {
|
|
let row_start = y * stride;
|
|
let row_end = row_start + row_size;
|
|
packed_data.extend_from_slice(&source_data[row_start..row_end]);
|
|
}
|
|
|
|
scale_time_ms += t_scale_start.elapsed().as_millis();
|
|
best_frame_data = Some(packed_data);
|
|
best_frame_ts = Some(current_frame_ts);
|
|
}
|
|
|
|
// If we've reached or passed the target timestamp, we can stop
|
|
if current_frame_ts >= frame_ts {
|
|
// Found our frame, cache and return it
|
|
if let Some(data) = best_frame_data {
|
|
let total_time = t_start.elapsed().as_millis();
|
|
let decode_time = t_decode_start.elapsed().as_millis();
|
|
eprintln!("[Video Timing] ts={:.3}s | Decoded {} frames in {}ms | Scale: {}ms | Total: {}ms",
|
|
timestamp, decode_count, decode_time, scale_time_ms, total_time);
|
|
self.frame_cache.put(frame_ts, data.clone());
|
|
return Ok(data);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
eprintln!("[Video Decoder] ERROR: Failed to decode frame for timestamp {}", timestamp);
|
|
Err("Failed to decode frame".to_string())
|
|
}
|
|
}
|
|
|
|
/// Probe video file for metadata without creating a full decoder
|
|
pub fn probe_video(path: &str) -> Result<VideoMetadata, String> {
|
|
ffmpeg::init().map_err(|e| e.to_string())?;
|
|
|
|
let input = ffmpeg::format::input(path)
|
|
.map_err(|e| format!("Failed to open video: {}", e))?;
|
|
|
|
let video_stream = input.streams()
|
|
.best(ffmpeg::media::Type::Video)
|
|
.ok_or("No video stream found")?;
|
|
|
|
let context_decoder = ffmpeg::codec::context::Context::from_parameters(
|
|
video_stream.parameters()
|
|
).map_err(|e| e.to_string())?;
|
|
|
|
let decoder = context_decoder.decoder().video()
|
|
.map_err(|e| e.to_string())?;
|
|
|
|
let width = decoder.width();
|
|
let height = decoder.height();
|
|
let time_base = f64::from(video_stream.time_base());
|
|
|
|
// Try to get duration from stream, fallback to container
|
|
let duration = if video_stream.duration() > 0 {
|
|
video_stream.duration() as f64 * time_base
|
|
} else if input.duration() > 0 {
|
|
input.duration() as f64 / f64::from(ffmpeg::ffi::AV_TIME_BASE)
|
|
} else {
|
|
// If no duration available, estimate from frame count and fps
|
|
let fps = f64::from(video_stream.avg_frame_rate());
|
|
if video_stream.frames() > 0 && fps > 0.0 {
|
|
video_stream.frames() as f64 / fps
|
|
} else {
|
|
0.0 // Unknown duration
|
|
}
|
|
};
|
|
|
|
let fps = f64::from(video_stream.avg_frame_rate());
|
|
|
|
// Check for audio stream
|
|
let has_audio = input.streams()
|
|
.best(ffmpeg::media::Type::Audio)
|
|
.is_some();
|
|
|
|
Ok(VideoMetadata {
|
|
width,
|
|
height,
|
|
fps,
|
|
duration,
|
|
has_audio,
|
|
})
|
|
}
|
|
|
|
/// A single decoded video frame with RGBA data
#[derive(Debug, Clone)]
pub struct VideoFrame {
    /// Frame width in pixels (the decoder's scaled output width).
    pub width: u32,
    /// Frame height in pixels (the decoder's scaled output height).
    pub height: u32,
    /// Tightly packed RGBA pixel data (width * height * 4 bytes), shared
    /// cheaply via Arc for zero-copy rendering.
    pub rgba_data: Arc<Vec<u8>>,
    /// Timestamp in seconds this frame was requested/decoded for.
    pub timestamp: f64,
}
|
|
|
|
/// Manages video decoders and frame caching for multiple video clips
pub struct VideoManager {
    /// Pool of video decoders, one per clip; Arc<Mutex<..>> so background
    /// threads can decode without holding the VideoManager itself.
    decoders: HashMap<Uuid, Arc<Mutex<VideoDecoder>>>,

    /// Frame cache: (clip_id, timestamp_ms) -> frame.
    /// Stores raw RGBA data for zero-copy rendering.
    /// NOTE(review): this map is unbounded; entries are only removed by
    /// unload_video / clear_frame_cache.
    frame_cache: HashMap<(Uuid, i64), Arc<VideoFrame>>,

    /// Thumbnail cache: clip_id -> Vec of (timestamp, rgba_data).
    /// Low-resolution (128px width) thumbnails for scrubbing.
    thumbnail_cache: HashMap<Uuid, Vec<(f64, Arc<Vec<u8>>)>>,

    /// Maximum number of frames to cache per decoder (passed to each
    /// VideoDecoder's LRU cache).
    cache_size: usize,
}
|
|
|
|
impl VideoManager {
|
|
/// Create a new video manager with default cache size
|
|
pub fn new() -> Self {
|
|
Self::with_cache_size(20)
|
|
}
|
|
|
|
/// Create a new video manager with specified cache size
|
|
pub fn with_cache_size(cache_size: usize) -> Self {
|
|
Self {
|
|
decoders: HashMap::new(),
|
|
frame_cache: HashMap::new(),
|
|
thumbnail_cache: HashMap::new(),
|
|
cache_size,
|
|
}
|
|
}
|
|
|
|
/// Load a video file and create a decoder for it
|
|
///
|
|
/// `target_width` and `target_height` specify the maximum dimensions
|
|
/// for decoded frames. Video will be scaled down if larger.
|
|
///
|
|
/// The keyframe index is NOT built during this call - use `build_keyframe_index_async`
|
|
/// in a background thread to build it asynchronously.
|
|
pub fn load_video(
|
|
&mut self,
|
|
clip_id: Uuid,
|
|
path: String,
|
|
target_width: u32,
|
|
target_height: u32,
|
|
) -> Result<VideoMetadata, String> {
|
|
// First probe the video for metadata
|
|
let metadata = probe_video(&path)?;
|
|
|
|
// Create decoder with target dimensions, without building keyframe index
|
|
let decoder = VideoDecoder::new(
|
|
path,
|
|
self.cache_size,
|
|
Some(target_width),
|
|
Some(target_height),
|
|
false, // Don't build keyframe index synchronously
|
|
)?;
|
|
|
|
// Store decoder in pool
|
|
self.decoders.insert(clip_id, Arc::new(Mutex::new(decoder)));
|
|
|
|
Ok(metadata)
|
|
}
|
|
|
|
/// Build keyframe index for a loaded video asynchronously
|
|
///
|
|
/// This should be called from a background thread after load_video()
|
|
/// to avoid blocking the UI during import.
|
|
pub fn build_keyframe_index(&self, clip_id: &Uuid) -> Result<(), String> {
|
|
let decoder_arc = self.decoders.get(clip_id)
|
|
.ok_or_else(|| format!("Video clip {} not found", clip_id))?;
|
|
|
|
let mut decoder = decoder_arc.lock()
|
|
.map_err(|e| format!("Failed to lock decoder: {}", e))?;
|
|
|
|
decoder.build_and_set_keyframe_index()
|
|
}
|
|
|
|
/// Get a decoded frame for a specific clip at a specific timestamp
|
|
///
|
|
/// Returns None if the clip is not loaded or decoding fails.
|
|
/// Frames are cached for performance.
|
|
pub fn get_frame(&mut self, clip_id: &Uuid, timestamp: f64) -> Option<Arc<VideoFrame>> {
|
|
// Convert timestamp to milliseconds for cache key
|
|
let timestamp_ms = (timestamp * 1000.0) as i64;
|
|
let cache_key = (*clip_id, timestamp_ms);
|
|
|
|
// Check frame cache first
|
|
if let Some(cached_frame) = self.frame_cache.get(&cache_key) {
|
|
return Some(Arc::clone(cached_frame));
|
|
}
|
|
|
|
// Get decoder for this clip
|
|
let decoder_arc = self.decoders.get(clip_id)?;
|
|
let mut decoder = decoder_arc.lock().ok()?;
|
|
|
|
// Decode the frame
|
|
let rgba_data = decoder.get_frame(timestamp).ok()?;
|
|
let width = decoder.output_width;
|
|
let height = decoder.output_height;
|
|
|
|
// Create VideoFrame and cache it
|
|
let frame = Arc::new(VideoFrame {
|
|
width,
|
|
height,
|
|
rgba_data: Arc::new(rgba_data),
|
|
timestamp,
|
|
});
|
|
|
|
self.frame_cache.insert(cache_key, Arc::clone(&frame));
|
|
|
|
Some(frame)
|
|
}
|
|
|
|
/// Generate thumbnails for a video clip (single batch version - use generate_thumbnails_progressive instead)
|
|
///
|
|
/// Thumbnails are generated every 5 seconds at 128px width.
|
|
/// This should be called in a background thread to avoid blocking.
|
|
/// Thumbnails are inserted into the cache progressively as they're generated,
|
|
/// allowing the UI to display them immediately.
|
|
///
|
|
/// DEPRECATED: Use generate_thumbnails_progressive which releases the lock between thumbnails.
|
|
pub fn generate_thumbnails(&mut self, clip_id: &Uuid, duration: f64) -> Result<(), String> {
|
|
let decoder_arc = self.decoders.get(clip_id)
|
|
.ok_or("Clip not loaded")?
|
|
.clone();
|
|
|
|
let mut decoder = decoder_arc.lock()
|
|
.map_err(|e| format!("Failed to lock decoder: {}", e))?;
|
|
|
|
// Initialize thumbnail cache entry with empty vec
|
|
self.thumbnail_cache.insert(*clip_id, Vec::new());
|
|
|
|
let interval = 5.0; // Generate thumbnail every 5 seconds
|
|
let mut t = 0.0;
|
|
|
|
while t < duration {
|
|
// Decode frame at this timestamp
|
|
if let Ok(rgba_data) = decoder.get_frame(t) {
|
|
// Decode already scaled to output dimensions, but we want 128px width for thumbnails
|
|
// We need to scale down further
|
|
let current_width = decoder.output_width;
|
|
let current_height = decoder.output_height;
|
|
|
|
// Calculate thumbnail dimensions (128px width, maintain aspect ratio)
|
|
let thumb_width = 128u32;
|
|
let aspect_ratio = current_height as f32 / current_width as f32;
|
|
let thumb_height = (thumb_width as f32 * aspect_ratio) as u32;
|
|
|
|
// Simple nearest-neighbor downsampling for thumbnails
|
|
let thumb_data = downsample_rgba(
|
|
&rgba_data,
|
|
current_width,
|
|
current_height,
|
|
thumb_width,
|
|
thumb_height,
|
|
);
|
|
|
|
// Insert thumbnail into cache immediately so UI can display it
|
|
if let Some(thumbnails) = self.thumbnail_cache.get_mut(clip_id) {
|
|
thumbnails.push((t, Arc::new(thumb_data)));
|
|
}
|
|
}
|
|
|
|
t += interval;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Get the decoder Arc for a clip (for external thumbnail generation)
|
|
/// This allows external code to decode frames without holding the VideoManager lock
|
|
pub fn get_decoder(&self, clip_id: &Uuid) -> Option<Arc<Mutex<VideoDecoder>>> {
|
|
self.decoders.get(clip_id).cloned()
|
|
}
|
|
|
|
/// Insert a thumbnail into the cache (for external thumbnail generation)
|
|
pub fn insert_thumbnail(&mut self, clip_id: &Uuid, timestamp: f64, data: Arc<Vec<u8>>) {
|
|
self.thumbnail_cache
|
|
.entry(*clip_id)
|
|
.or_insert_with(Vec::new)
|
|
.push((timestamp, data));
|
|
}
|
|
|
|
/// Get the thumbnail closest to the specified timestamp
|
|
///
|
|
/// Returns None if no thumbnails have been generated for this clip.
|
|
pub fn get_thumbnail_at(&self, clip_id: &Uuid, timestamp: f64) -> Option<(u32, u32, Arc<Vec<u8>>)> {
|
|
let thumbnails = self.thumbnail_cache.get(clip_id)?;
|
|
|
|
if thumbnails.is_empty() {
|
|
return None;
|
|
}
|
|
|
|
// Binary search for closest thumbnail
|
|
let idx = thumbnails.binary_search_by(|(t, _)| {
|
|
t.partial_cmp(×tamp).unwrap_or(std::cmp::Ordering::Equal)
|
|
}).unwrap_or_else(|idx| {
|
|
// If exact match not found, pick the closest
|
|
if idx == 0 {
|
|
0
|
|
} else if idx >= thumbnails.len() {
|
|
thumbnails.len() - 1
|
|
} else {
|
|
// Compare distance to previous and next
|
|
let prev_dist = (thumbnails[idx - 1].0 - timestamp).abs();
|
|
let next_dist = (thumbnails[idx].0 - timestamp).abs();
|
|
if prev_dist < next_dist {
|
|
idx - 1
|
|
} else {
|
|
idx
|
|
}
|
|
}
|
|
});
|
|
|
|
let (_, rgba_data) = &thumbnails[idx];
|
|
|
|
// Return (width, height, data)
|
|
// Thumbnails are always 128px width
|
|
let thumb_width = 128;
|
|
let thumb_height = (rgba_data.len() / (thumb_width * 4)) as u32;
|
|
|
|
Some((thumb_width as u32, thumb_height, Arc::clone(rgba_data)))
|
|
}
|
|
|
|
/// Remove a video clip and its cached data
|
|
pub fn unload_video(&mut self, clip_id: &Uuid) {
|
|
self.decoders.remove(clip_id);
|
|
|
|
// Remove all cached frames for this clip
|
|
self.frame_cache.retain(|(id, _), _| id != clip_id);
|
|
|
|
// Remove thumbnails
|
|
self.thumbnail_cache.remove(clip_id);
|
|
}
|
|
|
|
/// Clear all frame caches (useful for memory management)
|
|
pub fn clear_frame_cache(&mut self) {
|
|
self.frame_cache.clear();
|
|
}
|
|
}
|
|
|
|
impl Default for VideoManager {
|
|
fn default() -> Self {
|
|
Self::new()
|
|
}
|
|
}
|
|
|
|
/// Simple nearest-neighbor downsampling for RGBA images
///
/// Public wrapper around the internal `downsample_rgba`. `src` is expected to
/// be a tightly packed RGBA buffer of `src_width * src_height * 4` bytes;
/// the result is a tightly packed `dst_width * dst_height * 4` byte buffer.
pub fn downsample_rgba_public(
    src: &[u8],
    src_width: u32,
    src_height: u32,
    dst_width: u32,
    dst_height: u32,
) -> Vec<u8> {
    downsample_rgba(src, src_width, src_height, dst_width, dst_height)
}
|
|
|
|
/// Simple nearest-neighbor downsampling for RGBA images (internal).
///
/// For each destination pixel, samples the source pixel at the floor of the
/// proportionally scaled coordinate. `src` must be tightly packed RGBA
/// (`src_width * src_height * 4` bytes); the result is tightly packed RGBA
/// at `dst_width * dst_height`.
fn downsample_rgba(
    src: &[u8],
    src_width: u32,
    src_height: u32,
    dst_width: u32,
    dst_height: u32,
) -> Vec<u8> {
    // Source pixels advanced per destination pixel, per axis.
    let step_x = src_width as f32 / dst_width as f32;
    let step_y = src_height as f32 / dst_height as f32;

    let mut out = Vec::with_capacity((dst_width * dst_height * 4) as usize);

    for dy in 0..dst_height {
        let sy = (dy as f32 * step_y) as u32;
        for dx in 0..dst_width {
            let sx = (dx as f32 * step_x) as u32;
            // Byte offset of the sampled source pixel (4 bytes per RGBA pixel).
            let base = ((sy * src_width + sx) * 4) as usize;
            out.extend_from_slice(&src[base..base + 4]);
        }
    }

    out
}
|
|
|
|
/// Extracted audio data from a video file
#[derive(Debug, Clone)]
pub struct ExtractedAudio {
    /// Interleaved (packed) f32 samples; for stereo: L, R, L, R, ...
    pub samples: Vec<f32>,
    /// Number of audio channels.
    pub channels: u32,
    /// Sample rate in Hz.
    pub sample_rate: u32,
    /// Duration in seconds (samples per channel / sample rate).
    pub duration: f64,
}
|
|
|
|
/// Extract audio from a video file
|
|
///
|
|
/// This function performs the slow FFmpeg decoding without holding any locks.
|
|
/// The caller can then quickly add the audio to the DAW backend in a background thread.
|
|
///
|
|
/// Returns None if the video has no audio stream.
|
|
pub fn extract_audio_from_video(path: &str) -> Result<Option<ExtractedAudio>, String> {
|
|
ffmpeg::init().map_err(|e| e.to_string())?;
|
|
|
|
// Open video file
|
|
let mut input = ffmpeg::format::input(path)
|
|
.map_err(|e| format!("Failed to open video: {}", e))?;
|
|
|
|
// Find audio stream
|
|
let audio_stream_opt = input.streams()
|
|
.best(ffmpeg::media::Type::Audio);
|
|
|
|
// Return None if no audio stream
|
|
if audio_stream_opt.is_none() {
|
|
return Ok(None);
|
|
}
|
|
|
|
let audio_stream = audio_stream_opt.unwrap();
|
|
let audio_index = audio_stream.index();
|
|
|
|
// Get audio properties
|
|
let context_decoder = ffmpeg::codec::context::Context::from_parameters(
|
|
audio_stream.parameters()
|
|
).map_err(|e| e.to_string())?;
|
|
|
|
let mut audio_decoder = context_decoder.decoder().audio()
|
|
.map_err(|e| e.to_string())?;
|
|
|
|
let sample_rate = audio_decoder.rate();
|
|
let channels = audio_decoder.channels() as u32;
|
|
|
|
// Decode all audio frames
|
|
let mut audio_samples: Vec<f32> = Vec::new();
|
|
|
|
for (stream, packet) in input.packets() {
|
|
if stream.index() == audio_index {
|
|
audio_decoder.send_packet(&packet)
|
|
.map_err(|e| e.to_string())?;
|
|
|
|
let mut audio_frame = ffmpeg::util::frame::Audio::empty();
|
|
while audio_decoder.receive_frame(&mut audio_frame).is_ok() {
|
|
// Convert audio to f32 packed format
|
|
let format = audio_frame.format();
|
|
let frame_channels = audio_frame.channels() as usize;
|
|
|
|
// Create resampler to convert to f32 packed
|
|
let mut resampler = ffmpeg::software::resampling::context::Context::get(
|
|
format,
|
|
audio_frame.channel_layout(),
|
|
sample_rate,
|
|
ffmpeg::format::Sample::F32(ffmpeg::format::sample::Type::Packed),
|
|
audio_frame.channel_layout(),
|
|
sample_rate,
|
|
).map_err(|e| e.to_string())?;
|
|
|
|
let mut resampled_frame = ffmpeg::util::frame::Audio::empty();
|
|
resampler.run(&audio_frame, &mut resampled_frame)
|
|
.map_err(|e| e.to_string())?;
|
|
|
|
// Extract f32 samples (interleaved format)
|
|
let data_ptr = resampled_frame.data(0).as_ptr() as *const f32;
|
|
let total_samples = resampled_frame.samples() * frame_channels;
|
|
let samples_slice = unsafe {
|
|
std::slice::from_raw_parts(data_ptr, total_samples)
|
|
};
|
|
|
|
audio_samples.extend_from_slice(samples_slice);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Flush audio decoder
|
|
audio_decoder.send_eof().map_err(|e| e.to_string())?;
|
|
let mut audio_frame = ffmpeg::util::frame::Audio::empty();
|
|
while audio_decoder.receive_frame(&mut audio_frame).is_ok() {
|
|
let format = audio_frame.format();
|
|
let frame_channels = audio_frame.channels() as usize;
|
|
|
|
let mut resampler = ffmpeg::software::resampling::context::Context::get(
|
|
format,
|
|
audio_frame.channel_layout(),
|
|
sample_rate,
|
|
ffmpeg::format::Sample::F32(ffmpeg::format::sample::Type::Packed),
|
|
audio_frame.channel_layout(),
|
|
sample_rate,
|
|
).map_err(|e| e.to_string())?;
|
|
|
|
let mut resampled_frame = ffmpeg::util::frame::Audio::empty();
|
|
resampler.run(&audio_frame, &mut resampled_frame)
|
|
.map_err(|e| e.to_string())?;
|
|
|
|
let data_ptr = resampled_frame.data(0).as_ptr() as *const f32;
|
|
let total_samples = resampled_frame.samples() * frame_channels;
|
|
let samples_slice = unsafe {
|
|
std::slice::from_raw_parts(data_ptr, total_samples)
|
|
};
|
|
|
|
audio_samples.extend_from_slice(samples_slice);
|
|
}
|
|
|
|
// Calculate duration
|
|
let total_samples_per_channel = audio_samples.len() / channels as usize;
|
|
let duration = total_samples_per_channel as f64 / sample_rate as f64;
|
|
|
|
Ok(Some(ExtractedAudio {
|
|
samples: audio_samples,
|
|
channels,
|
|
sample_rate,
|
|
duration,
|
|
}))
|
|
}
|