From 034f0b142c5422607a6c38383bda1df3f854555d Mon Sep 17 00:00:00 2001 From: Marc Plano-Lesay Date: Fri, 10 Oct 2025 16:34:52 +1100 Subject: [PATCH] chore: refactor ahead of supporting more conversion types --- Cargo.lock | 1 + Cargo.toml | 3 + src/formats/cbz.rs | 53 ++++++++++ src/formats/mod.rs | 49 +++++++++ src/formats/pdf.rs | 79 ++++++++++++++ src/job.rs | 74 +++++++++++++ src/lib.rs | 3 + src/main.rs | 189 +++++++--------------------------- src/model.rs | 32 ++++++ tests/cbz_reader_tests.rs | 57 ++++++++++ tests/job_and_format_tests.rs | 40 +++++++ tests/pdf_writer_smoke.rs | 38 +++++++ 12 files changed, 468 insertions(+), 150 deletions(-) create mode 100644 src/formats/cbz.rs create mode 100644 src/formats/mod.rs create mode 100644 src/formats/pdf.rs create mode 100644 src/job.rs create mode 100644 src/lib.rs create mode 100644 src/model.rs create mode 100644 tests/cbz_reader_tests.rs create mode 100644 tests/job_and_format_tests.rs create mode 100644 tests/pdf_writer_smoke.rs diff --git a/Cargo.lock b/Cargo.lock index 0795aa2..309ca3d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -236,6 +236,7 @@ dependencies = [ "pretty_env_logger", "rayon", "tabled", + "tempfile", "walkdir", "zip", ] diff --git a/Cargo.toml b/Cargo.toml index e6ae54c..599d6a1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,3 +16,6 @@ rayon = "1.10.0" tabled = "0.20.0" walkdir = "2.5.0" zip = "6.0.0" + +[dev-dependencies] +tempfile = "3.12.0" diff --git a/src/formats/cbz.rs b/src/formats/cbz.rs new file mode 100644 index 0000000..886ab43 --- /dev/null +++ b/src/formats/cbz.rs @@ -0,0 +1,53 @@ +use std::ffi::OsStr; +use std::fs::File; +use std::io::Read; +use std::path::Path; + +use anyhow::Result; +use rayon::prelude::*; +use zip::ZipArchive; + +use crate::model::{Document, ImagePage}; + +use super::FormatReader; + +pub struct CbzReader; + +impl FormatReader for CbzReader { + fn read(&self, input: &Path) -> Result { + let mut zip = ZipArchive::new(File::open(input)?)?; + let mut files: Vec<(String, Vec)> = Vec::new(); + for i in 0..zip.len() { + let mut file = zip.by_index(i)?; + let mut image_data = Vec::new(); + let name = file + .enclosed_name() + .expect("Failed to read file name") + .to_owned(); + if name.extension() == Some(OsStr::new("jpg")) { + file.read_to_end(&mut image_data)?; + files.push(( + name.file_name() + .expect("Failed to read file name") + .to_string_lossy() + .to_string(), + image_data, + )); + } + } + + let mut pages: Vec = Vec::new(); + files + .par_iter() + .map(|(name, data)| ImagePage { + name: name.clone(), + image: image::load_from_memory(data).expect("Failed to decode image"), + jpeg_dct: Some(data.clone()), + }) + .collect_into_vec(&mut pages); + + pages.par_sort_by_key(|p| p.name.clone()); + + Ok(Document::new(pages)) + } +} diff --git a/src/formats/mod.rs b/src/formats/mod.rs new file mode 100644 index 0000000..0ad3ec0 --- /dev/null +++ b/src/formats/mod.rs @@ -0,0 +1,49 @@ +use std::ffi::OsStr; +use std::path::Path; + +use anyhow::Result; + +use crate::model::Document; + +pub mod cbz; +pub mod pdf; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FormatId { + Cbz, + Pdf, +} + +impl FormatId { + #[allow(dead_code)] + pub fn can_read(self) -> bool { + match self { + FormatId::Cbz => true, + FormatId::Pdf => false, // planned but not implemented yet + } + } + + #[allow(dead_code)] + pub fn can_write(self) -> bool { + match self { + FormatId::Pdf => true, + FormatId::Cbz => false, // planned but not implemented yet + } + } + + pub fn detect_from_path(path: &Path) -> Option { + match path.extension().and_then(OsStr::to_str) { + Some("cbz") => Some(FormatId::Cbz), + Some("pdf") => Some(FormatId::Pdf), + _ => None, + } + } +} + +pub trait FormatReader: Send + Sync { + fn read(&self, input: &Path) -> Result; +} + +pub trait FormatWriter: Send + Sync { + fn write(&self, doc: &Document, output: &Path) -> Result<()>; +} diff --git a/src/formats/pdf.rs b/src/formats/pdf.rs new file mode 100644 index 0000000..2da0469 --- /dev/null +++ b/src/formats/pdf.rs @@ -0,0 +1,79 @@ +use anyhow::Result; +use image::{codecs::jpeg::JpegEncoder, ColorType}; +use pdf_writer::{Content, Filter, Finish, Name, Pdf, Rect, Ref}; +use std::io::Cursor; +use std::path::Path; + +use crate::model::Document; + +use super::FormatWriter; + +pub struct PdfWriter; + +impl FormatWriter for PdfWriter { + fn write(&self, doc: &Document, output: &Path) -> Result<()> { + let a4 = Rect::new(0.0, 0.0, 595.0, 842.0); + + let mut pdf = Pdf::new(); + let catalog_id = Ref::new(1); + let page_tree_id = Ref::new(2); + pdf.catalog(catalog_id).pages(page_tree_id); + + let mut pages = Vec::new(); + let image_count = doc.pages.len(); + + for (pos, page) in doc.pages.iter().enumerate() { + let page_id = Ref::new(pos as i32 + 10); + let image_id = Ref::new(image_count as i32 + 10 + pos as i32); + let content_id = Ref::new(image_count as i32 * 3 + 10 + pos as i32); + pages.push(page_id); + let mut page_obj = pdf.page(page_id); + let image_name = Name(b"Im1"); + + page_obj.media_box(a4); + page_obj.parent(page_tree_id); + page_obj.contents(content_id); + page_obj.resources().x_objects().pair(image_name, image_id); + page_obj.finish(); + + // Prefer embedding original JPEG DCT stream if available to avoid re-encoding. + let jpeg_buf = if let Some(dct) = &page.jpeg_dct { + dct.clone() + } else { + // Fallback: encode the image to JPEG on the fly + let rgb = page.image.to_rgb8(); + let (w, h) = (rgb.width(), rgb.height()); + let mut cursor = Cursor::new(Vec::new()); + { + let mut encoder = JpegEncoder::new_with_quality(&mut cursor, 85); + encoder + .encode(&rgb.into_raw(), w, h, ColorType::Rgb8.into()) + .expect("Failed to encode image to JPEG for PDF"); + } + cursor.into_inner() + }; + + let mut pdf_image = pdf.image_xobject(image_id, &jpeg_buf); + pdf_image.filter(Filter::DctDecode); + pdf_image.width(page.image.width() as i32); + pdf_image.height(page.image.height() as i32); + pdf_image.color_space().device_rgb(); + pdf_image.bits_per_component(8); + pdf_image.finish(); + + let mut content = Content::new(); + content.save_state(); + content.transform([a4.x2, 0.0, 0.0, a4.y2, 0.0, 0.0]); + content.x_object(image_name); + content.restore_state(); + pdf.stream(content_id, &content.finish()); + } + + let page_count = pages.len(); + pdf.pages(page_tree_id).kids(pages).count(page_count as i32); + + std::fs::write(output, pdf.finish())?; + + Ok(()) + } +} diff --git a/src/job.rs b/src/job.rs new file mode 100644 index 0000000..473ba1a --- /dev/null +++ b/src/job.rs @@ -0,0 +1,74 @@ +use std::path::PathBuf; +use std::time::Duration; + +use anyhow::Result; +use indicatif::{ProgressBar, ProgressStyle}; +use rayon::prelude::*; + +use crate::formats::cbz::CbzReader; +use crate::formats::pdf::PdfWriter; +use crate::formats::{FormatId, FormatReader, FormatWriter}; + +#[derive(Debug, Clone)] +pub struct Job { + pub from: FormatId, + pub to: FormatId, + pub input_path: PathBuf, + pub output_path: PathBuf, +} + +impl Job { + pub fn new(input_path: PathBuf, output_dir: PathBuf, from: FormatId, to: FormatId) -> Self { + let mut output_path = output_dir.join(input_path.file_name().unwrap()); + match to { + FormatId::Pdf => output_path.set_extension("pdf"), + FormatId::Cbz => output_path.set_extension("cbz"), + }; + + Self { + from, + to, + input_path, + output_path, + } + } +} + +fn get_reader(format: FormatId) -> Box { + match format { + FormatId::Cbz => Box::new(CbzReader), + // Placeholder for future formats + FormatId::Pdf => unimplemented!("Reading PDF not implemented"), + } +} + +fn get_writer(format: FormatId) -> Box { + match format { + FormatId::Pdf => Box::new(PdfWriter), + // Placeholder for future formats + FormatId::Cbz => unimplemented!("Writing CBZ not implemented"), + } +} + +pub fn process_jobs(jobs: Vec) -> Result<()> { + let pb = ProgressBar::new(jobs.len() as u64); + pb.enable_steady_tick(Duration::from_millis(300)); + pb.set_style(ProgressStyle::with_template( + "[{elapsed_precise}] {wide_bar} {pos:>7}/{len:7} {msg}", + )?); + + jobs.par_iter().for_each(|job| { + // Build the pipeline for each job + let reader = get_reader(job.from); + let writer = get_writer(job.to); + + let doc = reader.read(&job.input_path).expect("Failed to read input"); + writer + .write(&doc, &job.output_path) + .expect("Failed to write output"); + pb.inc(1); + }); + + pb.finish(); + Ok(()) +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..e27d539 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,3 @@ +pub mod formats; +pub mod job; +pub mod model; diff --git a/src/main.rs b/src/main.rs index 195a5b5..7b902b8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,20 +2,14 @@ use anyhow::Result; use clap::{Parser, ValueHint}; use dialoguer::theme::ColorfulTheme; use dialoguer::Confirm; -use image::DynamicImage; -use indicatif::{ProgressBar, ProgressStyle}; use log::*; -use pdf_writer::{Content, Filter, Finish, Name, Pdf, Rect, Ref}; -use rayon::prelude::*; -use std::ffi::OsStr; -use std::fs::File; -use std::io::Read; -use std::path::{Path, PathBuf}; -use std::time::Duration; +use std::path::Path; use tabled::builder::Builder; use tabled::settings::Style; use walkdir::WalkDir; -use zip::ZipArchive; + +use cbz2pdf::formats::FormatId; +use cbz2pdf::job::{process_jobs, Job}; #[derive(Parser)] #[command()] @@ -49,10 +43,20 @@ fn main() -> Result<(), Box> { let input_path = Path::new(&cli.input_path); let output_dir = Path::new(&cli.output_dir); - let mut jobs = Vec::new(); + let mut jobs: Vec = Vec::new(); - if input_path.is_file() && input_path.extension() == Some(OsStr::new("cbz")) { - jobs.push(Job::new(input_path.to_path_buf(), output_dir.to_path_buf())); + if input_path.is_file() { + if let Some(FormatId::Cbz) = FormatId::detect_from_path(input_path) { + jobs.push(Job::new( + input_path.to_path_buf(), + output_dir.to_path_buf(), + FormatId::Cbz, + FormatId::Pdf, + )); + } else { + eprintln!("Unsupported input file format"); + std::process::exit(1); + } } else if input_path.is_dir() { jobs.extend(walk_directory(input_path, output_dir)); } else { @@ -62,22 +66,30 @@ fn main() -> Result<(), Box> { std::process::exit(1); } - jobs.sort_by_key(|j| j.cbz_path.clone().into_os_string().into_string()); + jobs.sort_by_key(|j| j.input_path.clone().into_os_string().into_string()); let proceed = if cli.interactive { let mut table_builder = Builder::default(); table_builder.push_record(["From", "To"]); jobs.iter().for_each(|job| { table_builder.push_record(vec![ - job.cbz_path.clone().into_os_string().into_string().unwrap(), - job.pdf_path.clone().into_os_string().into_string().unwrap(), + job.input_path + .clone() + .into_os_string() + .into_string() + .unwrap(), + job.output_path + .clone() + .into_os_string() + .into_string() + .unwrap(), ]); }); let mut table = table_builder.build(); table.with(Style::rounded()); - println!("{}", table); + println!("{table}"); Confirm::with_theme(&ColorfulTheme::default()) .with_prompt("Convert?") @@ -94,145 +106,22 @@ fn main() -> Result<(), Box> { } fn walk_directory(directory: &Path, output_dir: &Path) -> Vec { - debug!("Walking {:?}", directory); + debug!("Walking {directory:?}"); let mut jobs = Vec::new(); for entry in WalkDir::new(directory) { let entry = entry.unwrap(); let path = entry.path(); - if path.is_file() && path.extension() == Some(OsStr::new("cbz")) { - jobs.push(Job::new(path.to_path_buf(), output_dir.to_path_buf())); + if path.is_file() { + if let Some(FormatId::Cbz) = FormatId::detect_from_path(path) { + jobs.push(Job::new( + path.to_path_buf(), + output_dir.to_path_buf(), + FormatId::Cbz, + FormatId::Pdf, + )); + } } } jobs } - -struct ImageFile { - pub name: String, - pub data: Vec, -} - -struct DecodedImageFile { - pub name: String, - pub data: Vec, - pub image: DynamicImage, -} - -impl From<&ImageFile> for DecodedImageFile { - fn from(value: &ImageFile) -> Self { - let image = image::load_from_memory(&value.data).unwrap(); - Self { - name: value.name.clone(), - data: value.data.clone(), - image, - } - } -} - -struct Job { - pub cbz_path: PathBuf, - pub pdf_path: PathBuf, -} - -impl Job { - fn new(cbz_path: PathBuf, output_dir: PathBuf) -> Self { - let mut output_path = output_dir.join(cbz_path.file_name().unwrap()); - output_path.set_extension("pdf"); - - Self { - cbz_path, - pdf_path: output_path, - } - } -} - -fn convert_cbz(cbz_path: &Path, output_path: &Path) -> Result<()> { - let a4 = Rect::new(0.0, 0.0, 595.0, 842.0); - - let mut zip = ZipArchive::new(File::open(cbz_path)?)?; - let mut files = Vec::new(); - for i in 0..zip.len() { - let mut file = zip.by_index(i)?; - let mut image_data = Vec::new(); - let name = file.enclosed_name().expect("Failed to read file name"); - if name.extension() == Some(OsStr::new("jpg")) { - file.read_to_end(&mut image_data)?; - files.push(ImageFile { - name: name - .file_name() - .expect("Failed to read file name") - .to_string_lossy() - .to_string(), - data: image_data, - }); - } - } - - let mut images = Vec::new(); - files - .par_iter() - .map(DecodedImageFile::from) - .collect_into_vec(&mut images); - images.par_sort_by_key(|img| img.name.clone()); - - let mut pdf = Pdf::new(); - let catalog_id = Ref::new(1); - let page_tree_id = Ref::new(2); - pdf.catalog(catalog_id).pages(page_tree_id); - - let mut pages = Vec::new(); - let image_count = images.len(); - - for (pos, image) in images.iter().enumerate() { - let page_id = Ref::new(pos as i32 + 10); - let image_id = Ref::new(image_count as i32 + 10 + pos as i32); - let content_id = Ref::new(image_count as i32 * 3 + 10 + pos as i32); - pages.push(page_id); - let mut page = pdf.page(page_id); - let image_name = Name(b"Im1"); - - page.media_box(a4); - page.parent(page_tree_id); - page.contents(content_id); - page.resources().x_objects().pair(image_name, image_id); - page.finish(); - - let mut pdf_image = pdf.image_xobject(image_id, &image.data); - pdf_image.filter(Filter::DctDecode); - pdf_image.width(image.image.width() as i32); - pdf_image.height(image.image.height() as i32); - pdf_image.color_space().device_rgb(); - pdf_image.bits_per_component(8); - pdf_image.finish(); - - let mut content = Content::new(); - content.save_state(); - content.transform([a4.x2, 0.0, 0.0, a4.y2, 0.0, 0.0]); - content.x_object(image_name); - content.restore_state(); - pdf.stream(content_id, &content.finish()); - } - - let page_count = pages.len(); - pdf.pages(page_tree_id).kids(pages).count(page_count as i32); - - std::fs::write(output_path, pdf.finish())?; - - Ok(()) -} - -fn process_jobs(jobs: Vec) -> Result<()> { - let pb = ProgressBar::new(jobs.len() as u64); - pb.enable_steady_tick(Duration::from_millis(300)); - pb.set_style(ProgressStyle::with_template( - "[{elapsed_precise}] {wide_bar} {pos:>7}/{len:7} {msg}", - )?); - - jobs.par_iter().for_each(|entry| { - convert_cbz(&entry.cbz_path, &entry.pdf_path).unwrap(); - pb.inc(1); - }); - - pb.finish(); - Ok(()) -} diff --git a/src/model.rs b/src/model.rs new file mode 100644 index 0000000..99ef9a2 --- /dev/null +++ b/src/model.rs @@ -0,0 +1,32 @@ +use image::DynamicImage; + +#[derive(Default, Debug, Clone)] +#[allow(dead_code)] +pub struct Metadata { + pub title: Option, + pub author: Option, +} + +#[derive(Debug, Clone)] +pub struct ImagePage { + pub name: String, + pub image: DynamicImage, + // If available, carry the original JPEG DCT stream to avoid re-encoding. + pub jpeg_dct: Option>, +} + +#[derive(Debug, Clone)] +pub struct Document { + pub pages: Vec, + #[allow(dead_code)] + pub metadata: Metadata, +} + +impl Document { + pub fn new(pages: Vec) -> Self { + Self { + pages, + metadata: Metadata::default(), + } + } +} diff --git a/tests/cbz_reader_tests.rs b/tests/cbz_reader_tests.rs new file mode 100644 index 0000000..ef760a5 --- /dev/null +++ b/tests/cbz_reader_tests.rs @@ -0,0 +1,57 @@ +use std::fs::File; +use std::io::Write; + +use cbz2pdf::formats::cbz::CbzReader; +use cbz2pdf::formats::FormatReader; + +#[test] +fn cbz_reader_reads_jpgs_and_sorts_by_name() { + // Build a temporary CBZ with 3 jpgs (including in a subdir) and one non-jpg + let temp_dir = tempfile::tempdir().expect("create temp dir"); + let cbz_path = temp_dir.path().join("book.cbz"); + + { + let file = File::create(&cbz_path).expect("create cbz"); + let mut zip = zip::ZipWriter::new(file); + let options = zip::write::SimpleFileOptions::default(); + + // out of order names + zip.start_file("002.jpg", options).unwrap(); + // Create a tiny JPEG using the JPEG encoder + let img = image::DynamicImage::new_rgb8(1, 1).to_rgb8(); + let mut buf = Vec::new(); + { + let mut cursor = std::io::Cursor::new(&mut buf); + let mut enc = image::codecs::jpeg::JpegEncoder::new_with_quality(&mut cursor, 80); + enc.encode(&img, 1, 1, image::ColorType::Rgb8.into()) + .unwrap(); + } + zip.write_all(&buf).unwrap(); + + zip.start_file("001.jpg", options).unwrap(); + zip.write_all(&buf).unwrap(); + + // nested path should be accepted + zip.start_file("subdir/003.jpg", options).unwrap(); + zip.write_all(&buf).unwrap(); + + // non-jpg should be ignored + zip.start_file("notes.txt", options).unwrap(); + zip.write_all(b"hello").unwrap(); + + zip.finish().unwrap(); + } + + let reader = CbzReader; + let doc = reader.read(&cbz_path).expect("read cbz"); + + assert_eq!(doc.pages.len(), 3, "should include only jpg files"); + let names: Vec = doc.pages.iter().map(|p| p.name.clone()).collect(); + assert_eq!( + names, + vec!["001.jpg", "002.jpg", "003.jpg"], + "pages sorted by name" + ); + + // temp_dir goes out of scope and cleans up automatically +} diff --git a/tests/job_and_format_tests.rs b/tests/job_and_format_tests.rs new file mode 100644 index 0000000..41b5e6d --- /dev/null +++ b/tests/job_and_format_tests.rs @@ -0,0 +1,40 @@ +use std::path::PathBuf; + +use cbz2pdf::formats::FormatId; +use cbz2pdf::job::Job; + +#[test] +fn detect_from_path_recognizes_extensions() { + let cbz = PathBuf::from("/tmp/book.cbz"); + let pdf = PathBuf::from("/tmp/book.pdf"); + assert_eq!(FormatId::detect_from_path(&cbz), Some(FormatId::Cbz)); + assert_eq!(FormatId::detect_from_path(&pdf), Some(FormatId::Pdf)); + assert_eq!( + FormatId::detect_from_path(&PathBuf::from("/tmp/book.txt")), + None + ); +} + +#[test] +fn job_new_sets_output_extension() { + let input = PathBuf::from("/tmp/book.cbz"); + let outdir = PathBuf::from("/tmp"); + let job = Job::new(input.clone(), outdir.clone(), FormatId::Cbz, FormatId::Pdf); + assert!(job.output_path.ends_with("book.pdf")); + + let job2 = Job::new( + PathBuf::from("/tmp/book.pdf"), + outdir, + FormatId::Pdf, + FormatId::Cbz, + ); + assert!(job2.output_path.ends_with("book.cbz")); +} + +#[test] +fn format_capabilities_consistent() { + assert!(FormatId::Cbz.can_read()); + assert!(!FormatId::Cbz.can_write()); + assert!(FormatId::Pdf.can_write()); + assert!(!FormatId::Pdf.can_read()); +} diff --git a/tests/pdf_writer_smoke.rs b/tests/pdf_writer_smoke.rs new file mode 100644 index 0000000..d1cac13 --- /dev/null +++ b/tests/pdf_writer_smoke.rs @@ -0,0 +1,38 @@ +use std::fs; +use std::io::Read; + +use cbz2pdf::formats::pdf::PdfWriter; +use cbz2pdf::formats::FormatWriter; +use cbz2pdf::model::{Document, ImagePage}; + +#[test] +fn pdf_writer_writes_valid_pdf_header() { + // Build a simple 2x2 red image page + let mut img = image::DynamicImage::new_rgb8(2, 2).to_rgb8(); + for p in img.pixels_mut() { + *p = image::Rgb([255, 0, 0]); + } + let page = ImagePage { + name: "page1.jpg".to_string(), + image: image::DynamicImage::ImageRgb8(img), + jpeg_dct: None, + }; + let doc = Document::new(vec![page]); + + let temp_dir = tempfile::tempdir().expect("create temp dir"); + let output = temp_dir.path().join("out.pdf"); + + let writer = PdfWriter; + writer.write(&doc, &output).expect("failed to write PDF"); + + // Assert file exists and has PDF header + let mut f = fs::File::open(&output).expect("pdf not created"); + let mut header = [0u8; 5]; + f.read_exact(&mut header).expect("cannot read header"); + assert_eq!(&header, b"%PDF-", "missing PDF header"); + + let meta = fs::metadata(&output).unwrap(); + assert!(meta.len() > 0, "empty pdf"); + + // temp_dir cleans up automatically on drop +}