From b35ccbe271e909989e27081265a7e7328e7a8298 Mon Sep 17 00:00:00 2001 From: Marc Plano-Lesay Date: Mon, 13 Oct 2025 16:52:25 +1100 Subject: [PATCH] feat: implement cbz writing and pdf reading --- Cargo.lock | 156 +++++++++++++++++++++++++++ Cargo.toml | 1 + src/formats/cbz.rs | 41 ++++++- src/formats/mod.rs | 27 +++-- src/formats/pdf.rs | 198 +++++++++++++++++++++++++++++++++- src/job.rs | 24 +---- src/main.rs | 102 ++++++++++++++---- tests/cbz_writer_tests.rs | 96 +++++++++++++++++ tests/job_and_format_tests.rs | 4 +- tests/pdf_reader_tests.rs | 51 +++++++++ 10 files changed, 643 insertions(+), 57 deletions(-) create mode 100644 tests/cbz_writer_tests.rs create mode 100644 tests/pdf_reader_tests.rs diff --git a/Cargo.lock b/Cargo.lock index 9b478d4..f260a51 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -40,6 +40,15 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4aa90d7ce82d4be67b64039a3d588d38dbcc6736577de4a847025ce5b0c468d1" +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anstream" version = "0.6.15" @@ -232,6 +241,7 @@ dependencies = [ "image", "indicatif", "log", + "lopdf", "pdf-writer", "pretty_env_logger", "rayon", @@ -268,6 +278,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chrono" +version = "0.4.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" +dependencies = [ + "iana-time-zone", + "num-traits", + "windows-link", +] + [[package]] name = "cipher" version = "0.4.4" @@ -349,6 +370,12 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + [[package]] name = "cpufeatures" version = "0.2.14" @@ -484,6 +511,15 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + [[package]] name = "env_logger" version = "0.10.2" @@ -676,6 +712,30 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +[[package]] +name = "iana-time-zone" +version = "0.1.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + [[package]] name = "image" version = "0.25.8" @@ -847,6 +907,12 @@ dependencies = [ "zlib-rs", ] +[[package]] +name = "linked-hash-map" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" + [[package]] name = "linux-raw-sys" version = "0.11.0" @@ -884,6 +950,25 @@ dependencies = [ "imgref", ] +[[package]] +name = "lopdf" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e775e4ee264e8a87d50a9efef7b67b4aa988cf94e75630859875fc347e6c872b" +dependencies = [ + "chrono", + "encoding_rs", + "flate2", + "itoa", + "linked-hash-map", + "log", + "md5", + "nom", + "rayon", + "time", + "weezl", +] + [[package]] name = "lzma-rust2" version = "0.13.0" @@ -904,6 +989,12 @@ dependencies = [ "rayon", ] +[[package]] +name = "md5" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" + [[package]] name = "memchr" version = "2.7.4" @@ -1613,10 +1704,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d" dependencies = [ "deranged", + "itoa", "num-conv", "powerfmt", "serde", "time-core", + "time-macros", ] [[package]] @@ -1625,6 +1718,16 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b" +[[package]] +name = "time-macros" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3" +dependencies = [ + "num-conv", + "time-core", +] + [[package]] name = "toml" version = "0.8.19" @@ -1826,12 +1929,65 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "windows-link" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + 
"windows-link", +] + [[package]] name = "windows-sys" version = "0.52.0" diff --git a/Cargo.toml b/Cargo.toml index 599d6a1..473f8c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ rayon = "1.10.0" tabled = "0.20.0" walkdir = "2.5.0" zip = "6.0.0" +lopdf = "0.32.0" [dev-dependencies] tempfile = "3.12.0" diff --git a/src/formats/cbz.rs b/src/formats/cbz.rs index 886ab43..8319496 100644 --- a/src/formats/cbz.rs +++ b/src/formats/cbz.rs @@ -1,6 +1,6 @@ use std::ffi::OsStr; use std::fs::File; -use std::io::Read; +use std::io::{Read, Write}; use std::path::Path; use anyhow::Result; @@ -9,7 +9,7 @@ use zip::ZipArchive; use crate::model::{Document, ImagePage}; -use super::FormatReader; +use super::{FormatReader, FormatWriter}; pub struct CbzReader; @@ -51,3 +51,40 @@ impl FormatReader for CbzReader { Ok(Document::new(pages)) } } + +pub struct CbzWriter; + +impl FormatWriter for CbzWriter { + fn write(&self, doc: &Document, output: &Path) -> Result<()> { + use zip::write::SimpleFileOptions; + let file = File::create(output)?; + let mut zip = zip::ZipWriter::new(file); + let options = SimpleFileOptions::default(); + + for (idx, page) in doc.pages.iter().enumerate() { + let mut name = page.name.clone(); + if Path::new(&name).extension().and_then(OsStr::to_str) != Some("jpg") { + name = format!("{:03}.jpg", idx + 1); + } + zip.start_file(&name, options)?; + if let Some(dct) = &page.jpeg_dct { + zip.write_all(dct)?; + } else { + // Encode to JPEG + let rgb = page.image.to_rgb8(); + let (w, h) = (rgb.width(), rgb.height()); + let mut cursor = std::io::Cursor::new(Vec::new()); + { + let mut enc = + image::codecs::jpeg::JpegEncoder::new_with_quality(&mut cursor, 85); + enc.encode(&rgb.into_raw(), w, h, image::ColorType::Rgb8.into())?; + } + let data = cursor.into_inner(); + zip.write_all(&data)?; + } + } + + zip.finish()?; + Ok(()) + } +} diff --git a/src/formats/mod.rs b/src/formats/mod.rs index 0ad3ec0..0bac2ce 100644 --- a/src/formats/mod.rs +++ b/src/formats/mod.rs @@ -8,6 +8,9 @@ use crate::model::Document; pub mod cbz; pub mod pdf; +use cbz::{CbzReader, CbzWriter}; +use pdf::{PdfReader, PdfWriter}; + #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum FormatId { Cbz, @@ -17,18 +20,12 @@ pub enum FormatId { impl FormatId { #[allow(dead_code)] pub fn can_read(self) -> bool { - match self { - FormatId::Cbz => true, - FormatId::Pdf => false, // planned but not implemented yet - } + get_reader(self).is_some() } #[allow(dead_code)] pub fn can_write(self) -> bool { - match self { - FormatId::Pdf => true, - FormatId::Cbz => false, // planned but not implemented yet - } + get_writer(self).is_some() } pub fn detect_from_path(path: &Path) -> Option { @@ -47,3 +44,17 @@ pub trait FormatReader: Send + Sync { pub trait FormatWriter: Send + Sync { fn write(&self, doc: &Document, output: &Path) -> Result<()>; } + +pub fn get_reader(format: FormatId) -> Option> { + match format { + FormatId::Cbz => Some(Box::new(CbzReader)), + FormatId::Pdf => Some(Box::new(PdfReader)), + } +} + +pub fn get_writer(format: FormatId) -> Option> { + match format { + FormatId::Pdf => Some(Box::new(PdfWriter)), + FormatId::Cbz => Some(Box::new(CbzWriter)), + } +} diff --git a/src/formats/pdf.rs b/src/formats/pdf.rs index 2da0469..929d9cb 100644 --- a/src/formats/pdf.rs +++ b/src/formats/pdf.rs @@ -1,12 +1,13 @@ use anyhow::Result; -use image::{codecs::jpeg::JpegEncoder, ColorType}; +use image::codecs::jpeg::JpegEncoder; +use image::ColorType; use pdf_writer::{Content, Filter, Finish, Name, Pdf, Rect, Ref}; use 
std::io::Cursor; use std::path::Path; -use crate::model::Document; +use crate::model::{Document, ImagePage}; -use super::FormatWriter; +use super::{FormatReader, FormatWriter}; pub struct PdfWriter; @@ -77,3 +78,194 @@ impl FormatWriter for PdfWriter { Ok(()) } } + +pub struct PdfReader; + +impl FormatReader for PdfReader { + fn read(&self, input: &Path) -> Result { + use lopdf::{Document as LoDocument, Object}; + + let doc = LoDocument::load(input)?; + let pages_map = doc.get_pages(); + let mut image_pages: Vec = Vec::new(); + + for (idx, (_page_num, page_id)) in pages_map.iter().enumerate() { + // Fetch page object + let page_obj = doc.get_object(*page_id)?; + let page_dict = match page_obj.as_dict() { + Ok(d) => d, + Err(_) => continue, + }; + + // Resolve Resources dictionary (can be a reference or inline dict) + let (mut xobjects_opt, mut content_refs): (Option, Vec>) = + (None, Vec::new()); + + if let Ok(obj) = page_dict.get(b"Resources") { + match obj { + Object::Reference(id) => { + if let Ok(Object::Dictionary(d)) = doc.get_object(*id) { + // Extract XObject dict if present + if let Ok(Object::Reference(xid)) = d.get(b"XObject") { + if let Ok(Object::Dictionary(xd)) = doc.get_object(*xid) { + xobjects_opt = Some(xd.clone()); + } + } else if let Ok(Object::Dictionary(xd)) = d.get(b"XObject") { + xobjects_opt = Some(xd.clone()); + } + } + } + Object::Dictionary(d) => { + if let Ok(Object::Reference(xid)) = d.get(b"XObject") { + if let Ok(Object::Dictionary(xd)) = doc.get_object(*xid) { + xobjects_opt = Some(xd.clone()); + } + } else if let Ok(Object::Dictionary(xd)) = d.get(b"XObject") { + xobjects_opt = Some(xd.clone()); + } + } + _ => {} + } + } + + // Try to track which XObjects are used by parsing Content streams for /Name Do + if let Ok(contents_obj) = page_dict.get(b"Contents") { + match contents_obj { + Object::Reference(cid) => { + if let Ok(Object::Stream(stream)) = doc.get_object(*cid) { + content_refs.extend(extract_xobject_names(&stream.content)); + } + } + Object::Array(arr) => { + for o in arr { + if let Object::Reference(cid) = o { + if let Ok(Object::Stream(stream)) = doc.get_object(*cid) { + content_refs.extend(extract_xobject_names(&stream.content)); + } + } + } + } + Object::Stream(stream) => { + content_refs.extend(extract_xobject_names(&stream.content)); + } + _ => {} + } + } + + // If we have XObjects, pick the first image (prefer one referenced in content) + if let Some(xobjects) = xobjects_opt { + // Build ordered keys: first those referenced in content, then the rest + let mut keys: Vec> = xobjects.iter().map(|(k, _)| k.clone()).collect(); + // Stable sort by whether referenced first + keys.sort_by_key(|k| { + let name = if k.starts_with(b"/") { + k[1..].to_vec() + } else { + k.clone() + }; + match content_refs.iter().position(|r| *r == name) { + Some(pos) => pos as i32, + None => i32::MAX, + } + }); + + for key in keys { + if let Ok(&Object::Reference(obj_id)) = xobjects.get(&key) { + if let Ok(Object::Stream(stream)) = doc.get_object(obj_id) { + let dict = &stream.dict; + let is_image = matches!(dict.get(b"Subtype"), Ok(Object::Name(n)) if n == b"Image"); + if !is_image { + continue; + } + + let is_dct = match dict.get(b"Filter") { + Ok(Object::Name(n)) => n == b"DCTDecode", + Ok(Object::Array(arr)) => arr + .iter() + .any(|o| matches!(o, Object::Name(n) if n == b"DCTDecode")), + _ => false, + }; + + let data = stream.content.clone(); + if is_dct { + if let Ok(img) = image::load_from_memory(&data) { + let name = format!("{:03}.jpg", idx + 1); + 
image_pages.push(ImagePage { + name, + image: img, + jpeg_dct: Some(data), + }); + break; + } else { + // If JPEG parsing failed, skip + continue; + } + } else if let Ok(img) = image::load_from_memory(&data) { + // Fallback: try to decode arbitrary image stream + let name = format!("{:03}.jpg", idx + 1); + image_pages.push(ImagePage { + name, + image: img, + jpeg_dct: None, + }); + break; + } + } + } + } + } + } + + Ok(Document::new(image_pages)) + } +} + +// Helper to extract XObject names used in a content stream by scanning for "/Name Do" +fn extract_xobject_names(content: &[u8]) -> Vec> { + // This is a naive scanner but often sufficient: tokens separated by whitespace, looking for "/name Do" + let mut names = Vec::new(); + let s = content; + let mut i = 0; + while i < s.len() { + // skip whitespace + while i < s.len() && s[i].is_ascii_whitespace() { + i += 1; + } + if i >= s.len() { + break; + } + if s[i] == b'/' { + // read name + let start = i + 1; + i += 1; + while i < s.len() && !s[i].is_ascii_whitespace() { + i += 1; + } + let name = s[start..i].to_vec(); + // skip whitespace + while i < s.len() && s[i].is_ascii_whitespace() { + i += 1; + } + // check for Do operator after possible inline graphics state + // We will just check if next token is Do + let mut j = i; + while j < s.len() && s[j].is_ascii_whitespace() { + j += 1; + } + let op_start = j; + while j < s.len() && (s[j] as char).is_ascii_alphabetic() { + j += 1; + } + if &s[op_start..j] == b"Do" { + names.push(name); + } + i = j; + } else { + // skip token + while i < s.len() && !s[i].is_ascii_whitespace() { + i += 1; + } + } + } + names +} diff --git a/src/job.rs b/src/job.rs index 473ba1a..7f2043d 100644 --- a/src/job.rs +++ b/src/job.rs @@ -5,9 +5,7 @@ use anyhow::Result; use indicatif::{ProgressBar, ProgressStyle}; use rayon::prelude::*; -use crate::formats::cbz::CbzReader; -use crate::formats::pdf::PdfWriter; -use crate::formats::{FormatId, FormatReader, FormatWriter}; +use crate::formats::{get_reader, get_writer, FormatId}; #[derive(Debug, Clone)] pub struct Job { @@ -34,22 +32,6 @@ impl Job { } } -fn get_reader(format: FormatId) -> Box { - match format { - FormatId::Cbz => Box::new(CbzReader), - // Placeholder for future formats - FormatId::Pdf => unimplemented!("Reading PDF not implemented"), - } -} - -fn get_writer(format: FormatId) -> Box { - match format { - FormatId::Pdf => Box::new(PdfWriter), - // Placeholder for future formats - FormatId::Cbz => unimplemented!("Writing CBZ not implemented"), - } -} - pub fn process_jobs(jobs: Vec) -> Result<()> { let pb = ProgressBar::new(jobs.len() as u64); pb.enable_steady_tick(Duration::from_millis(300)); @@ -59,8 +41,8 @@ pub fn process_jobs(jobs: Vec) -> Result<()> { jobs.par_iter().for_each(|job| { // Build the pipeline for each job - let reader = get_reader(job.from); - let writer = get_writer(job.to); + let reader = get_reader(job.from).expect("No reader registered for selected input format"); + let writer = get_writer(job.to).expect("No writer registered for selected output format"); let doc = reader.read(&job.input_path).expect("Failed to read input"); writer diff --git a/src/main.rs b/src/main.rs index 7b902b8..7ac9262 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,6 +11,21 @@ use walkdir::WalkDir; use cbz2pdf::formats::FormatId; use cbz2pdf::job::{process_jobs, Job}; +#[derive(clap::ValueEnum, Clone, Debug)] +enum CliFormat { + Cbz, + Pdf, +} + +impl From for FormatId { + fn from(value: CliFormat) -> Self { + match value { + CliFormat::Cbz => FormatId::Cbz, + 
CliFormat::Pdf => FormatId::Pdf, + } + } +} + #[derive(Parser)] #[command()] struct Cli { @@ -18,7 +33,7 @@ struct Cli { short = 'i', long = "input", value_hint = ValueHint::FilePath, - help = "Path to CBZ file or directory containing CBZ files" + help = "Path to input file or directory" )] input_path: String, @@ -27,10 +42,20 @@ struct Cli { long = "output-directory", default_value = ".", value_hint = ValueHint::FilePath, - help = "Output directory for PDF files" + help = "Output directory for converted files" )] output_dir: String, + #[arg( + long = "from", + value_enum, + help = "Input format. If omitted, auto-detect from file extension" + )] + from: Option, + + #[arg(long = "to", value_enum, default_value = "pdf", help = "Output format")] + to: CliFormat, + #[arg(short = 'p', long, help = "Ask for confirmation before doing anything")] interactive: bool, } @@ -43,29 +68,51 @@ fn main() -> Result<(), Box> { let input_path = Path::new(&cli.input_path); let output_dir = Path::new(&cli.output_dir); + let from_opt: Option = cli.from.map(Into::into); + let to_fmt: FormatId = cli.to.into(); + + // Validate target capability early + if !to_fmt.can_write() { + eprintln!("Selected output format is not supported for writing: {to_fmt:?}"); + std::process::exit(1); + } + let mut jobs: Vec = Vec::new(); if input_path.is_file() { - if let Some(FormatId::Cbz) = FormatId::detect_from_path(input_path) { - jobs.push(Job::new( - input_path.to_path_buf(), - output_dir.to_path_buf(), - FormatId::Cbz, - FormatId::Pdf, - )); - } else { - eprintln!("Unsupported input file format"); + let detected = FormatId::detect_from_path(input_path); + let from_fmt = from_opt.or(detected).unwrap_or_else(|| { + eprintln!( + "Could not detect input format from file extension and no --from was provided" + ); + std::process::exit(1); + }); + + if !from_fmt.can_read() { + eprintln!("Selected/Detected input format is not supported for reading: {from_fmt:?}"); std::process::exit(1); } + + jobs.push(Job::new( + input_path.to_path_buf(), + output_dir.to_path_buf(), + from_fmt, + to_fmt, + )); } else if input_path.is_dir() { - jobs.extend(walk_directory(input_path, output_dir)); + jobs.extend(walk_directory(input_path, output_dir, from_opt, to_fmt)); } else { eprintln!( - "Invalid input path. Please provide a CBZ file or a directory containing CBZ files." + "Invalid input path. Please provide a valid file or a directory containing supported files." 
); std::process::exit(1); } + if jobs.is_empty() { + eprintln!("No matching inputs found to process."); + std::process::exit(1); + } + jobs.sort_by_key(|j| j.input_path.clone().into_os_string().into_string()); let proceed = if cli.interactive { @@ -105,20 +152,33 @@ fn main() -> Result<(), Box> { Ok(()) } -fn walk_directory(directory: &Path, output_dir: &Path) -> Vec { +fn walk_directory( + directory: &Path, + output_dir: &Path, + from_opt: Option, + to_fmt: FormatId, +) -> Vec { debug!("Walking {directory:?}"); let mut jobs = Vec::new(); for entry in WalkDir::new(directory) { let entry = entry.unwrap(); let path = entry.path(); if path.is_file() { - if let Some(FormatId::Cbz) = FormatId::detect_from_path(path) { - jobs.push(Job::new( - path.to_path_buf(), - output_dir.to_path_buf(), - FormatId::Cbz, - FormatId::Pdf, - )); + let detected = FormatId::detect_from_path(path); + let from_fmt_opt = match from_opt { + Some(fixed) => detected.filter(|d| *d == fixed), + None => detected, + }; + + if let Some(from_fmt) = from_fmt_opt { + if from_fmt.can_read() && to_fmt.can_write() { + jobs.push(Job::new( + path.to_path_buf(), + output_dir.to_path_buf(), + from_fmt, + to_fmt, + )); + } } } } diff --git a/tests/cbz_writer_tests.rs b/tests/cbz_writer_tests.rs new file mode 100644 index 0000000..2fbd54e --- /dev/null +++ b/tests/cbz_writer_tests.rs @@ -0,0 +1,96 @@ +use std::fs::File; +use std::io::Read; + +use cbz2pdf::formats::cbz::CbzWriter; +use cbz2pdf::formats::FormatWriter; +use cbz2pdf::model::{Document, ImagePage}; + +fn make_tiny_jpeg() -> (Vec, image::DynamicImage) { + let img = image::DynamicImage::new_rgb8(1, 1).to_rgb8(); + let mut buf = Vec::new(); + { + let mut cursor = std::io::Cursor::new(&mut buf); + let mut enc = image::codecs::jpeg::JpegEncoder::new_with_quality(&mut cursor, 80); + enc.encode(&img, 1, 1, image::ColorType::Rgb8.into()) + .unwrap(); + } + let decoded = image::load_from_memory(&buf).unwrap(); + (buf, decoded) +} + +#[test] +fn cbz_writer_preserves_dct_and_renames_non_jpg() { + // Prepare a page with original JPEG DCT data but a non-jpg name. + let (jpeg_dct, decoded) = make_tiny_jpeg(); + let page = ImagePage { + name: "cover.png".to_string(), + image: decoded, + jpeg_dct: Some(jpeg_dct.clone()), + }; + let doc = Document::new(vec![page]); + + let temp_dir = tempfile::tempdir().expect("create temp dir"); + let cbz_path = temp_dir.path().join("out.cbz"); + + let writer = CbzWriter; + writer.write(&doc, &cbz_path).expect("write cbz"); + + // Open the CBZ and verify it contains 001.jpg with the exact JPEG data. + let f = File::open(&cbz_path).unwrap(); + let mut zip = zip::ZipArchive::new(f).unwrap(); + + // There should be exactly one file named 001.jpg + let mut found = false; + for i in 0..zip.len() { + let mut file = zip.by_index(i).unwrap(); + let name = file.enclosed_name().unwrap().to_owned(); + if name.file_name().unwrap() == "001.jpg" { + let mut data = Vec::new(); + file.read_to_end(&mut data).unwrap(); + assert_eq!( + data, jpeg_dct, + "writer should preserve original JPEG DCT bytes" + ); + found = true; + } + } + assert!(found, "001.jpg not found in zip"); +} + +#[test] +fn cbz_writer_keeps_jpg_name() { + // If the page already has a .jpg name, the writer should keep it. 
+ let (jpeg_dct, decoded) = make_tiny_jpeg(); + let page = ImagePage { + name: "page01.jpg".to_string(), + image: decoded, + jpeg_dct: Some(jpeg_dct), + }; + let doc = Document::new(vec![page]); + + let temp_dir = tempfile::tempdir().expect("create temp dir"); + let cbz_path = temp_dir.path().join("out.cbz"); + + let writer = CbzWriter; + writer.write(&doc, &cbz_path).expect("write cbz"); + + let f = File::open(&cbz_path).unwrap(); + let mut zip = zip::ZipArchive::new(f).unwrap(); + + let mut names = Vec::new(); + for i in 0..zip.len() { + let file = zip.by_index(i).unwrap(); + let name = file + .enclosed_name() + .unwrap() + .file_name() + .unwrap() + .to_owned(); + names.push(name.to_string_lossy().to_string()); + } + assert_eq!( + names, + vec!["page01.jpg"], + "existing .jpg name should be kept" + ); +} diff --git a/tests/job_and_format_tests.rs b/tests/job_and_format_tests.rs index 41b5e6d..55c9742 100644 --- a/tests/job_and_format_tests.rs +++ b/tests/job_and_format_tests.rs @@ -34,7 +34,7 @@ fn job_new_sets_output_extension() { #[test] fn format_capabilities_consistent() { assert!(FormatId::Cbz.can_read()); - assert!(!FormatId::Cbz.can_write()); + assert!(FormatId::Cbz.can_write()); assert!(FormatId::Pdf.can_write()); - assert!(!FormatId::Pdf.can_read()); + assert!(FormatId::Pdf.can_read()); } diff --git a/tests/pdf_reader_tests.rs b/tests/pdf_reader_tests.rs new file mode 100644 index 0000000..32be07f --- /dev/null +++ b/tests/pdf_reader_tests.rs @@ -0,0 +1,51 @@ +use cbz2pdf::formats::pdf::{PdfReader, PdfWriter}; +use cbz2pdf::formats::{FormatReader, FormatWriter}; +use cbz2pdf::model::{Document, ImagePage}; + +fn make_small_jpeg(w: u32, h: u32, rgb: [u8; 3]) -> (Vec, image::DynamicImage) { + let mut img = image::ImageBuffer::, _>::new(w, h); + for p in img.pixels_mut() { + *p = image::Rgb(rgb); + } + let dynimg = image::DynamicImage::ImageRgb8(img); + + let mut buf = Vec::new(); + { + let mut cursor = std::io::Cursor::new(&mut buf); + let mut enc = image::codecs::jpeg::JpegEncoder::new_with_quality(&mut cursor, 85); + let rgb8 = dynimg.to_rgb8(); + enc.encode(&rgb8, w, h, image::ColorType::Rgb8.into()) + .unwrap(); + } + (buf, dynimg) +} + +#[test] +fn pdf_reader_extracts_jpeg_xobject_and_preserves_dct() { + // Build a PDF with one JPEG-backed page + let (jpeg_dct, dynimg) = make_small_jpeg(3, 2, [10, 20, 30]); + let page = ImagePage { + name: "p1.jpg".into(), + image: dynimg.clone(), + jpeg_dct: Some(jpeg_dct.clone()), + }; + let doc = Document::new(vec![page]); + + let temp_dir = tempfile::tempdir().expect("tmpdir"); + let pdf_path = temp_dir.path().join("in.pdf"); + + PdfWriter.write(&doc, &pdf_path).expect("write pdf"); + + // Read back with PdfReader + let out = PdfReader.read(&pdf_path).expect("read pdf"); + assert_eq!(out.pages.len(), 1, "should have one page extracted"); + let p = &out.pages[0]; + assert_eq!(p.image.width(), dynimg.width()); + assert_eq!(p.image.height(), dynimg.height()); + assert!(p.jpeg_dct.is_some(), "should preserve DCT for JPEG images"); + assert_eq!( + p.jpeg_dct.as_ref().unwrap(), + &jpeg_dct, + "JPEG bytes should match" + ); +}
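
Usage note (not part of the patch): the new get_reader/get_writer registry in src/formats/mod.rs replaces the hard-coded match in job.rs, so a caller can drive a conversion directly without going through the CLI. The sketch below is a minimal example of that, assuming cbz2pdf is used as a library with the public API shown above; the function name `convert` and the use of anyhow in the caller are my own, not part of the crate.

use std::path::Path;

use anyhow::{anyhow, Result};
use cbz2pdf::formats::{get_reader, get_writer, FormatId};

fn convert(input: &Path, output: &Path) -> Result<()> {
    // Detect both formats from the file extensions, as main.rs now does for --from.
    let from = FormatId::detect_from_path(input)
        .ok_or_else(|| anyhow!("unrecognised input format: {input:?}"))?;
    let to = FormatId::detect_from_path(output)
        .ok_or_else(|| anyhow!("unrecognised output format: {output:?}"))?;

    // The registry returns None for unsupported directions instead of panicking,
    // which is what can_read()/can_write() are now defined in terms of.
    let reader = get_reader(from).ok_or_else(|| anyhow!("no reader for {from:?}"))?;
    let writer = get_writer(to).ok_or_else(|| anyhow!("no writer for {to:?}"))?;

    let doc = reader.read(input)?;
    writer.write(&doc, output)
}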
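
The PDF reader keeps the original DCTDecode bytes in ImagePage::jpeg_dct when a page's image XObject is a JPEG, and the CBZ writer stores those bytes untouched, so a PDF-to-CBZ conversion is lossless for JPEG-backed pages. A minimal sketch of that round trip, using only the types added in this patch (the function name `pdf_to_cbz` and the paths are illustrative):

use std::path::Path;

use anyhow::Result;
use cbz2pdf::formats::cbz::CbzWriter;
use cbz2pdf::formats::pdf::PdfReader;
use cbz2pdf::formats::{FormatReader, FormatWriter};

fn pdf_to_cbz(input: &Path, output: &Path) -> Result<()> {
    // PdfReader extracts at most one image XObject per page, preferring one
    // referenced by a "/Name Do" operator in the page's content stream.
    let doc = PdfReader.read(input)?;
    for page in &doc.pages {
        if page.jpeg_dct.is_none() {
            // Pages without preserved JPEG bytes are re-encoded by CbzWriter at quality 85.
            eprintln!("re-encoding {}", page.name);
        }
    }
    CbzWriter.write(&doc, output)
}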
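
The same conversion can also be queued through the existing parallel job pipeline, which is what main.rs does once --from/--to are resolved. A small sketch under the same assumptions (library use, anyhow in the caller, hypothetical paths):

use std::path::PathBuf;

use cbz2pdf::formats::FormatId;
use cbz2pdf::job::{process_jobs, Job};

fn main() -> anyhow::Result<()> {
    let jobs = vec![Job::new(
        PathBuf::from("input/issue-01.pdf"),
        PathBuf::from("out"),
        FormatId::Pdf,
        FormatId::Cbz,
    )];
    // process_jobs runs jobs in parallel with rayon behind a progress bar and
    // panics (via expect) if a job's format has no registered reader or writer.
    process_jobs(jobs)
}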