use std::sync::{Arc, Mutex};
use std::num::NonZeroUsize;

use ffmpeg_next as ffmpeg;
use lru::LruCache;

use daw_backend::WaveformPeak;
use image::{RgbaImage, ImageEncoder};

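/// Metadata reported to the frontend once a video file has been loaded.
/// `width`/`height` are the scaled output dimensions, not the source size;
/// the `audio_*` fields are `Some` only when the container has an audio stream.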
#[derive(serde::Serialize, Clone)]
pub struct VideoFileMetadata {
    pub pool_index: usize,
    pub width: u32,
    pub height: u32,
    pub fps: f64,
    pub duration: f64,
    pub has_audio: bool,
    pub audio_pool_index: Option<usize>,
    pub audio_duration: Option<f64>,
    pub audio_sample_rate: Option<u32>,
    pub audio_channels: Option<u32>,
    pub audio_waveform: Option<Vec<WaveformPeak>>,
}

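/// One pooled decoder plus an LRU cache of packed RGBA frames.
///
/// Rough cache footprint, assuming the 800x600 cap applied in
/// `video_load_file`: a frame is at most 800 * 600 * 4 ≈ 1.9 MB, so the
/// default 20-frame cache tops out around 38 MB per decoder.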
struct VideoDecoder {
    path: String,
    width: u32,          // Original video width
    height: u32,         // Original video height
    output_width: u32,   // Scaled output width
    output_height: u32,  // Scaled output height
    fps: f64,
    duration: f64,
    time_base: f64,
    stream_index: usize,
    frame_cache: LruCache<i64, Vec<u8>>, // timestamp -> packed RGBA data
    input: Option<ffmpeg::format::context::Input>,
    decoder: Option<ffmpeg::decoder::Video>,
    last_decoded_ts: i64, // Timestamp of the last decoded frame
}

impl VideoDecoder {
    fn new(path: String, cache_size: usize, max_width: Option<u32>, max_height: Option<u32>) -> Result<Self, String> {
        ffmpeg::init().map_err(|e| e.to_string())?;

        let input = ffmpeg::format::input(&path)
            .map_err(|e| format!("Failed to open video: {}", e))?;

        let video_stream = input.streams()
            .best(ffmpeg::media::Type::Video)
            .ok_or("No video stream found")?;

        let stream_index = video_stream.index();

        let context_decoder = ffmpeg::codec::context::Context::from_parameters(
            video_stream.parameters()
        ).map_err(|e| e.to_string())?;

        let decoder = context_decoder.decoder().video()
            .map_err(|e| e.to_string())?;

        let width = decoder.width();
        let height = decoder.height();
        let time_base = f64::from(video_stream.time_base());

        // Calculate output dimensions (scale down if larger than max)
        let (output_width, output_height) = if let (Some(max_w), Some(max_h)) = (max_width, max_height) {
            // Scale to fit within the max dimensions while preserving aspect ratio
            let scale = (max_w as f32 / width as f32).min(max_h as f32 / height as f32).min(1.0);
            ((width as f32 * scale) as u32, (height as f32 * scale) as u32)
        } else {
            (width, height)
        };

        // Try to get the duration from the stream, falling back to the container
        let duration = if video_stream.duration() > 0 {
            video_stream.duration() as f64 * time_base
        } else if input.duration() > 0 {
            input.duration() as f64 / f64::from(ffmpeg::ffi::AV_TIME_BASE)
        } else {
            // If no duration is available, estimate from frame count and fps
            let fps = f64::from(video_stream.avg_frame_rate());
            if video_stream.frames() > 0 && fps > 0.0 {
                video_stream.frames() as f64 / fps
            } else {
                0.0 // Unknown duration
            }
        };

        let fps = f64::from(video_stream.avg_frame_rate());

        Ok(Self {
            path,
            width,
            height,
            output_width,
            output_height,
            fps,
            duration,
            time_base,
            stream_index,
            frame_cache: LruCache::new(
                // Reject a zero cache size instead of panicking on unwrap
                NonZeroUsize::new(cache_size).ok_or("cache_size must be non-zero")?
            ),
            input: None,
            decoder: None,
            last_decoded_ts: -1,
        })
    }

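    /// Decodes and returns the tightly packed RGBA frame nearest `timestamp`
    /// (in seconds). Sequential playback reuses the open decoder; backward
    /// jumps and forward jumps of more than ~2 s reopen the input and seek.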
    fn get_frame(&mut self, timestamp: f64) -> Result<Vec<u8>, String> {
        use std::time::Instant;
        let t_start = Instant::now();

        // Convert the timestamp in seconds to stream time_base units
        let frame_ts = (timestamp / self.time_base) as i64;

        // Check cache
        if let Some(cached_frame) = self.frame_cache.get(&frame_ts) {
            eprintln!("[Video Timing] Cache hit for ts={:.3}s ({}ms)", timestamp, t_start.elapsed().as_millis());
            return Ok(cached_frame.clone());
        }

        // Determine if we need to seek.
        // Seek if: no decoder open, going backwards, or jumping forward more than 2 seconds
        let need_seek = self.decoder.is_none()
            || frame_ts < self.last_decoded_ts
            || frame_ts > self.last_decoded_ts + (2.0 / self.time_base) as i64;

        if need_seek {
            let t_seek_start = Instant::now();

            // Reopen the input
            let mut input = ffmpeg::format::input(&self.path)
                .map_err(|e| format!("Failed to reopen video: {}", e))?;

            // Seek to the timestamp. Input::seek expects AV_TIME_BASE units,
            // not stream time_base units, so convert from seconds directly.
            let seek_ts = (timestamp * f64::from(ffmpeg::ffi::AV_TIME_BASE)) as i64;
            input.seek(seek_ts, ..seek_ts)
                .map_err(|e| format!("Seek failed: {}", e))?;

            let context_decoder = ffmpeg::codec::context::Context::from_parameters(
                input.streams().best(ffmpeg::media::Type::Video).unwrap().parameters()
            ).map_err(|e| e.to_string())?;

            let decoder = context_decoder.decoder().video()
                .map_err(|e| e.to_string())?;

            self.input = Some(input);
            self.decoder = Some(decoder);
            self.last_decoded_ts = -1; // Reset since we seeked

            eprintln!("[Video Timing] Seek took {}ms", t_seek_start.elapsed().as_millis());
        }

        let input = self.input.as_mut().unwrap();
        let decoder = self.decoder.as_mut().unwrap();

        // Decode frames until we find the one closest to the target timestamp
        let mut best_frame_data: Option<Vec<u8>> = None;
        let mut best_frame_ts: Option<i64> = None;
        let t_decode_start = Instant::now();
        let mut decode_count = 0;
        let mut scale_time_ms = 0u128;

        for (stream, packet) in input.packets() {
            if stream.index() == self.stream_index {
                decoder.send_packet(&packet)
                    .map_err(|e| e.to_string())?;

                let mut frame = ffmpeg::util::frame::Video::empty();
                while decoder.receive_frame(&mut frame).is_ok() {
                    decode_count += 1;
                    let current_frame_ts = frame.timestamp().unwrap_or(0);
                    self.last_decoded_ts = current_frame_ts; // Update last decoded position

                    // Check if this frame is closer to the target than the previous best
                    let is_better = match best_frame_ts {
                        None => true,
                        Some(best_ts) => {
                            (current_frame_ts - frame_ts).abs() < (best_ts - frame_ts).abs()
                        }
                    };

                    if is_better {
                        let t_scale_start = Instant::now();

                        // Convert to RGBA and scale to the output size
                        let mut scaler = ffmpeg::software::scaling::context::Context::get(
                            frame.format(),
                            frame.width(),
                            frame.height(),
                            ffmpeg::format::Pixel::RGBA,
                            self.output_width,
                            self.output_height,
                            ffmpeg::software::scaling::flag::Flags::BILINEAR,
                        ).map_err(|e| e.to_string())?;

                        let mut rgb_frame = ffmpeg::util::frame::Video::empty();
                        scaler.run(&frame, &mut rgb_frame)
                            .map_err(|e| e.to_string())?;

                        // Remove stride padding to create tightly packed RGBA data
                        let width = self.output_width as usize;
                        let height = self.output_height as usize;
                        let stride = rgb_frame.stride(0);
                        let row_size = width * 4; // RGBA = 4 bytes per pixel
                        let source_data = rgb_frame.data(0);

                        let mut packed_data = Vec::with_capacity(row_size * height);
                        for y in 0..height {
                            let row_start = y * stride;
                            let row_end = row_start + row_size;
                            packed_data.extend_from_slice(&source_data[row_start..row_end]);
                        }

                        scale_time_ms += t_scale_start.elapsed().as_millis();
                        best_frame_data = Some(packed_data);
                        best_frame_ts = Some(current_frame_ts);
                    }

                    // Once we have reached or passed the target timestamp, stop
                    if current_frame_ts >= frame_ts {
                        // Found our frame: cache and return it
                        if let Some(data) = best_frame_data.take() {
                            let total_time = t_start.elapsed().as_millis();
                            let decode_time = t_decode_start.elapsed().as_millis();
                            eprintln!("[Video Timing] ts={:.3}s | Decoded {} frames in {}ms | Scale: {}ms | Total: {}ms",
                                timestamp, decode_count, decode_time, scale_time_ms, total_time);
                            self.frame_cache.put(frame_ts, data.clone());
                            return Ok(data);
                        }
                        break;
                    }
                }
            }
        }

        // The stream ended before reaching the target timestamp (e.g. a request
        // past the last frame); fall back to the best frame decoded so far.
        if let Some(data) = best_frame_data {
            self.frame_cache.put(frame_ts, data.clone());
            return Ok(data);
        }

        eprintln!("[Video Decoder] ERROR: Failed to decode frame for timestamp {}", timestamp);
        Err("Failed to decode frame".to_string())
    }
}

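/// Shared pool of open video decoders, indexed by `pool_index`.
/// Each decoder sits behind its own mutex so frame requests for different
/// files do not serialize on a single lock.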
pub struct VideoState {
    pool: Vec<Arc<Mutex<VideoDecoder>>>,
    next_pool_index: usize,
    cache_size: usize,
}

impl Default for VideoState {
    fn default() -> Self {
        Self {
            pool: Vec::new(),
            next_pool_index: 0,
            cache_size: 20, // Default cache size (frames per decoder)
        }
    }
}

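/// Loads a video file: registers any audio track (plus its waveform) with the
/// DAW backend and adds a pooled decoder whose output is capped at 800x600.
///
/// A hypothetical frontend call, assuming Tauri v2's `invoke`; the clip path
/// is illustrative only:
/// ```typescript
/// import { invoke } from '@tauri-apps/api/core';
/// const meta = await invoke('video_load_file', { path: '/path/to/clip.mp4' });
/// console.log(meta.width, meta.height, meta.fps, meta.has_audio);
/// ```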
#[tauri::command]
pub async fn video_load_file(
    video_state: tauri::State<'_, Arc<Mutex<VideoState>>>,
    audio_state: tauri::State<'_, Arc<Mutex<crate::audio::AudioState>>>,
    path: String,
) -> Result<VideoFileMetadata, String> {
    ffmpeg::init().map_err(|e| e.to_string())?;

    // Open the input to check for an audio stream
    let mut input = ffmpeg::format::input(&path)
        .map_err(|e| format!("Failed to open video: {}", e))?;

    let audio_stream_opt = input.streams()
        .best(ffmpeg::media::Type::Audio);

    let has_audio = audio_stream_opt.is_some();

    // Extract audio if present
    let (audio_pool_index, audio_duration, audio_sample_rate, audio_channels, audio_waveform) = if has_audio {
        let audio_stream = audio_stream_opt.unwrap();
        let audio_index = audio_stream.index();

        // Get audio properties
        let context_decoder = ffmpeg::codec::context::Context::from_parameters(
            audio_stream.parameters()
        ).map_err(|e| e.to_string())?;

        let mut audio_decoder = context_decoder.decoder().audio()
            .map_err(|e| e.to_string())?;

        let sample_rate = audio_decoder.rate();
        let channels = audio_decoder.channels() as u32;

        // Decode all audio frames
        let mut audio_samples: Vec<f32> = Vec::new();

        for (stream, packet) in input.packets() {
            if stream.index() == audio_index {
                audio_decoder.send_packet(&packet)
                    .map_err(|e| e.to_string())?;

                let mut audio_frame = ffmpeg::util::frame::Audio::empty();
                while audio_decoder.receive_frame(&mut audio_frame).is_ok() {
                    // Convert audio to packed (interleaved) f32 format
                    let format = audio_frame.format();
                    let frame_channels = audio_frame.channels() as usize;

                    // Create a resampler to convert to packed f32.
                    // (Rebuilding it per frame works but is wasteful; it could
                    // be hoisted out of the loop.)
                    let mut resampler = ffmpeg::software::resampling::context::Context::get(
                        format,
                        audio_frame.channel_layout(),
                        sample_rate,
                        ffmpeg::format::Sample::F32(ffmpeg::format::sample::Type::Packed),
                        audio_frame.channel_layout(),
                        sample_rate,
                    ).map_err(|e| e.to_string())?;

                    let mut resampled_frame = ffmpeg::util::frame::Audio::empty();
                    resampler.run(&audio_frame, &mut resampled_frame)
                        .map_err(|e| e.to_string())?;

                    // Extract f32 samples (interleaved)
                    let data_ptr = resampled_frame.data(0).as_ptr() as *const f32;
                    let total_samples = resampled_frame.samples() * frame_channels;
                    let samples_slice = unsafe {
                        std::slice::from_raw_parts(data_ptr, total_samples)
                    };

                    audio_samples.extend_from_slice(samples_slice);
                }
            }
        }

        // Flush the audio decoder
        audio_decoder.send_eof().map_err(|e| e.to_string())?;
        let mut audio_frame = ffmpeg::util::frame::Audio::empty();
        while audio_decoder.receive_frame(&mut audio_frame).is_ok() {
            let format = audio_frame.format();
            let frame_channels = audio_frame.channels() as usize;

            let mut resampler = ffmpeg::software::resampling::context::Context::get(
                format,
                audio_frame.channel_layout(),
                sample_rate,
                ffmpeg::format::Sample::F32(ffmpeg::format::sample::Type::Packed),
                audio_frame.channel_layout(),
                sample_rate,
            ).map_err(|e| e.to_string())?;

            let mut resampled_frame = ffmpeg::util::frame::Audio::empty();
            resampler.run(&audio_frame, &mut resampled_frame)
                .map_err(|e| e.to_string())?;

            let data_ptr = resampled_frame.data(0).as_ptr() as *const f32;
            let total_samples = resampled_frame.samples() * frame_channels;
            let samples_slice = unsafe {
                std::slice::from_raw_parts(data_ptr, total_samples)
            };

            audio_samples.extend_from_slice(samples_slice);
        }

        // Calculate the audio duration
        let total_samples_per_channel = audio_samples.len() / channels as usize;
        let audio_duration = total_samples_per_channel as f64 / sample_rate as f64;

        // Generate the waveform (~300 peaks per second, clamped)
        let target_peaks = ((audio_duration * 300.0) as usize).clamp(1000, 20000);
        let waveform = generate_waveform(&audio_samples, channels, target_peaks);

        // Send the audio to the DAW backend
        let mut audio_state_guard = audio_state.lock().unwrap();
        let audio_pool_index = audio_state_guard.next_pool_index;
        audio_state_guard.next_pool_index += 1;

        if let Some(controller) = &mut audio_state_guard.controller {
            controller.add_audio_file(
                path.clone(),
                audio_samples,
                channels,
                sample_rate,
            );
        }
        drop(audio_state_guard);

        (Some(audio_pool_index), Some(audio_duration), Some(sample_rate), Some(channels), Some(waveform))
    } else {
        (None, None, None, None, None)
    };

    // Create the video decoder with max dimensions for playback (800x600).
    // This scales down high-resolution videos to reduce data transfer.
    let mut video_state_guard = video_state.lock().unwrap();
    let pool_index = video_state_guard.next_pool_index;
    video_state_guard.next_pool_index += 1;

    let decoder = VideoDecoder::new(path, video_state_guard.cache_size, Some(800), Some(600))?;

    let metadata = VideoFileMetadata {
        pool_index,
        width: decoder.output_width, // Report scaled dimensions to the frontend
        height: decoder.output_height,
        fps: decoder.fps,
        duration: decoder.duration,
        has_audio,
        audio_pool_index,
        audio_duration,
        audio_sample_rate,
        audio_channels,
        audio_waveform,
    };

    video_state_guard.pool.push(Arc::new(Mutex::new(decoder)));

    Ok(metadata)
}

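/// Downmixes interleaved samples to mono (average across channels) and
/// reduces them to min/max peak pairs for drawing.
///
/// Worked example: 60 s of stereo at 44.1 kHz is 2,646,000 samples per
/// channel; `target_peaks` = clamp(60 * 300, 1000, 20000) = 18,000, so each
/// peak summarizes 2,646,000 / 18,000 = 147 samples.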
fn generate_waveform(audio_data: &[f32], channels: u32, target_peaks: usize) -> Vec<WaveformPeak> {
    let total_samples = audio_data.len();
    let samples_per_channel = total_samples / channels as usize;
    let samples_per_peak = (samples_per_channel / target_peaks).max(1);

    let mut waveform = Vec::new();

    for peak_idx in 0..target_peaks {
        let start_sample = peak_idx * samples_per_peak;
        let end_sample = ((peak_idx + 1) * samples_per_peak).min(samples_per_channel);

        if start_sample >= samples_per_channel {
            break;
        }

        let mut min_val = 0.0f32;
        let mut max_val = 0.0f32;

        for sample_idx in start_sample..end_sample {
            // Average across channels
            let mut channel_sum = 0.0f32;
            for ch in 0..channels as usize {
                let idx = sample_idx * channels as usize + ch;
                if idx < total_samples {
                    channel_sum += audio_data[idx];
                }
            }
            let avg_sample = channel_sum / channels as f32;

            min_val = min_val.min(avg_sample);
            max_val = max_val.max(avg_sample);
        }

        waveform.push(WaveformPeak {
            min: min_val,
            max: max_val,
        });
    }

    waveform
}

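/// Decodes one frame and streams it back over an IPC channel as raw bytes
/// (packed RGBA, or JPEG when `use_jpeg` is set).
///
/// A hypothetical frontend call, assuming Tauri v2's `Channel` API; the
/// `drawFrame` callback is illustrative:
/// ```typescript
/// import { invoke, Channel } from '@tauri-apps/api/core';
/// const channel = new Channel<ArrayBuffer>();
/// channel.onmessage = (bytes) => drawFrame(bytes);
/// await invoke('video_get_frame', { poolIndex: 0, timestamp: 1.25, useJpeg: true, channel });
/// ```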
#[tauri::command]
pub async fn video_get_frame(
    state: tauri::State<'_, Arc<Mutex<VideoState>>>,
    pool_index: usize,
    timestamp: f64,
    use_jpeg: bool,
    channel: tauri::ipc::Channel,
) -> Result<(), String> {
    use std::time::Instant;

    let video_state = state.lock().unwrap();

    let decoder = video_state.pool.get(pool_index)
        .ok_or("Invalid pool index")?
        .clone();

    // Release the pool lock before decoding so other commands are not blocked
    drop(video_state);

    let mut decoder = decoder.lock().unwrap();
    let frame_data = decoder.get_frame(timestamp)?;

    let data_to_send = if use_jpeg {
        let t_compress_start = Instant::now();

        // Get the frame dimensions from the decoder
        let width = decoder.output_width;
        let height = decoder.output_height;

        // Create an image from the raw RGBA data
        let img = RgbaImage::from_raw(width, height, frame_data)
            .ok_or("Failed to create image from frame data")?;

        // Convert RGBA to RGB (JPEG does not support alpha)
        let rgb_img = image::DynamicImage::ImageRgba8(img).to_rgb8();

        // Encode to JPEG at quality 85 (a good size/quality balance)
        let mut jpeg_data = Vec::new();
        let mut encoder = image::codecs::jpeg::JpegEncoder::new_with_quality(&mut jpeg_data, 85);
        encoder.encode(
            rgb_img.as_raw(),
            rgb_img.width(),
            rgb_img.height(),
            image::ColorType::Rgb8
        ).map_err(|e| format!("JPEG encoding failed: {}", e))?;

        let compress_time = t_compress_start.elapsed().as_millis();
        let original_size = width as usize * height as usize * 4;
        let compressed_size = jpeg_data.len();
        let ratio = original_size as f32 / compressed_size as f32;

        eprintln!("[Video JPEG] Compressed {}KB -> {}KB ({:.1}x) in {}ms",
            original_size / 1024, compressed_size / 1024, ratio, compress_time);

        jpeg_data
    } else {
        frame_data
    };

    // Send binary data through the channel (bypasses JSON serialization);
    // InvokeResponseBody::Raw sends raw bytes without JSON encoding
    channel.send(tauri::ipc::InvokeResponseBody::Raw(data_to_send))
        .map_err(|e| format!("Channel send error: {}", e))?;

    Ok(())
}

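/// Sets the frame-cache size used when constructing new decoders.
/// Decoders already in the pool keep the cache size they were created with;
/// only subsequent `video_load_file` calls pick up the new value.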
#[tauri::command]
pub async fn video_set_cache_size(
    state: tauri::State<'_, Arc<Mutex<VideoState>>>,
    cache_size: usize,
) -> Result<(), String> {
    let mut video_state = state.lock().unwrap();
    video_state.cache_size = cache_size;
    Ok(())
}

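/// Returns `(output_width, output_height, fps)` for a pooled decoder, i.e.
/// the scaled playback dimensions rather than the source dimensions.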
#[tauri::command]
pub async fn video_get_pool_info(
    state: tauri::State<'_, Arc<Mutex<VideoState>>>,
    pool_index: usize,
) -> Result<(u32, u32, f64), String> {
    let video_state = state.lock().unwrap();
    let decoder = video_state.pool.get(pool_index)
        .ok_or("Invalid pool index")?
        .lock().unwrap();

    Ok((
        decoder.output_width, // Scaled dimensions
        decoder.output_height,
        decoder.fps
    ))
}