chore: refactor ahead of supporting more conversion types
All checks were successful
Checking Renovate configuration / validate (pull_request) Successful in 1m5s
Build and test / Clippy (pull_request) Successful in 3m25s
Checking yaml / Run yamllint (pull_request) Successful in 11s
Build and test / Tests (pull_request) Successful in 4m12s
Build and test / Build AMD64 (pull_request) Successful in 4m12s
Build and test / Generate Documentation (pull_request) Successful in 3m52s

This commit is contained in:
Marc Plano-Lesay 2025-10-10 16:34:52 +11:00
parent 48b560d85e
commit 034f0b142c
Signed by: kernald
GPG key ID: 66A41B08CC62A6CF
12 changed files with 468 additions and 150 deletions

1
Cargo.lock generated
View file

@ -236,6 +236,7 @@ dependencies = [
"pretty_env_logger",
"rayon",
"tabled",
"tempfile",
"walkdir",
"zip",
]

View file

@ -16,3 +16,6 @@ rayon = "1.10.0"
tabled = "0.20.0"
walkdir = "2.5.0"
zip = "6.0.0"
[dev-dependencies]
tempfile = "3.12.0"

53
src/formats/cbz.rs Normal file
View file

@ -0,0 +1,53 @@
use std::ffi::OsStr;
use std::fs::File;
use std::io::Read;
use std::path::Path;
use anyhow::Result;
use rayon::prelude::*;
use zip::ZipArchive;
use crate::model::{Document, ImagePage};
use super::FormatReader;
pub struct CbzReader;
impl FormatReader for CbzReader {
fn read(&self, input: &Path) -> Result<Document> {
let mut zip = ZipArchive::new(File::open(input)?)?;
let mut files: Vec<(String, Vec<u8>)> = Vec::new();
for i in 0..zip.len() {
let mut file = zip.by_index(i)?;
let mut image_data = Vec::new();
let name = file
.enclosed_name()
.expect("Failed to read file name")
.to_owned();
if name.extension() == Some(OsStr::new("jpg")) {
file.read_to_end(&mut image_data)?;
files.push((
name.file_name()
.expect("Failed to read file name")
.to_string_lossy()
.to_string(),
image_data,
));
}
}
let mut pages: Vec<ImagePage> = Vec::new();
files
.par_iter()
.map(|(name, data)| ImagePage {
name: name.clone(),
image: image::load_from_memory(data).expect("Failed to decode image"),
jpeg_dct: Some(data.clone()),
})
.collect_into_vec(&mut pages);
pages.par_sort_by_key(|p| p.name.clone());
Ok(Document::new(pages))
}
}

49
src/formats/mod.rs Normal file
View file

@ -0,0 +1,49 @@
use std::ffi::OsStr;
use std::path::Path;
use anyhow::Result;
use crate::model::Document;
pub mod cbz;
pub mod pdf;
/// Identifies a document format the converter knows about.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FormatId {
    /// Comic book archive (`.cbz`).
    Cbz,
    /// Portable Document Format (`.pdf`).
    Pdf,
}

impl FormatId {
    /// Returns `true` if documents in this format can be read.
    // Not called by the binary yet; only the integration tests use it.
    #[allow(dead_code)]
    pub fn can_read(self) -> bool {
        match self {
            FormatId::Cbz => true,
            FormatId::Pdf => false, // planned but not implemented yet
        }
    }

    /// Returns `true` if documents can be written in this format.
    // Not called by the binary yet; only the integration tests use it.
    #[allow(dead_code)]
    pub fn can_write(self) -> bool {
        match self {
            FormatId::Pdf => true,
            FormatId::Cbz => false, // planned but not implemented yet
        }
    }

    /// Guesses the format of `path` from its file extension.
    ///
    /// The comparison ignores ASCII case, so `book.CBZ` and `book.cbz` are
    /// both detected as [`FormatId::Cbz`]. Returns `None` when the path has no
    /// extension, the extension is not valid UTF-8, or it is not a known one.
    pub fn detect_from_path(path: &Path) -> Option<FormatId> {
        let extension = path.extension().and_then(OsStr::to_str)?;
        if extension.eq_ignore_ascii_case("cbz") {
            Some(FormatId::Cbz)
        } else if extension.eq_ignore_ascii_case("pdf") {
            Some(FormatId::Pdf)
        } else {
            None
        }
    }
}
/// An input format: turns a file on disk into a [`Document`].
///
/// Implementors must be `Send + Sync`.
pub trait FormatReader: Send + Sync {
/// Reads the file at `input` and returns its content as a [`Document`],
/// or an error if that fails.
fn read(&self, input: &Path) -> Result<Document>;
}
/// An output format: serialises a [`Document`] to a file on disk.
///
/// Implementors must be `Send + Sync`.
pub trait FormatWriter: Send + Sync {
/// Writes `doc` to the file at `output`, or returns an error if that fails.
fn write(&self, doc: &Document, output: &Path) -> Result<()>;
}

79
src/formats/pdf.rs Normal file
View file

@ -0,0 +1,79 @@
use anyhow::Result;
use image::{codecs::jpeg::JpegEncoder, ColorType};
use pdf_writer::{Content, Filter, Finish, Name, Pdf, Rect, Ref};
use std::io::Cursor;
use std::path::Path;
use crate::model::Document;
use super::FormatWriter;
pub struct PdfWriter;
impl FormatWriter for PdfWriter {
fn write(&self, doc: &Document, output: &Path) -> Result<()> {
let a4 = Rect::new(0.0, 0.0, 595.0, 842.0);
let mut pdf = Pdf::new();
let catalog_id = Ref::new(1);
let page_tree_id = Ref::new(2);
pdf.catalog(catalog_id).pages(page_tree_id);
let mut pages = Vec::new();
let image_count = doc.pages.len();
for (pos, page) in doc.pages.iter().enumerate() {
let page_id = Ref::new(pos as i32 + 10);
let image_id = Ref::new(image_count as i32 + 10 + pos as i32);
let content_id = Ref::new(image_count as i32 * 3 + 10 + pos as i32);
pages.push(page_id);
let mut page_obj = pdf.page(page_id);
let image_name = Name(b"Im1");
page_obj.media_box(a4);
page_obj.parent(page_tree_id);
page_obj.contents(content_id);
page_obj.resources().x_objects().pair(image_name, image_id);
page_obj.finish();
// Prefer embedding original JPEG DCT stream if available to avoid re-encoding.
let jpeg_buf = if let Some(dct) = &page.jpeg_dct {
dct.clone()
} else {
// Fallback: encode the image to JPEG on the fly
let rgb = page.image.to_rgb8();
let (w, h) = (rgb.width(), rgb.height());
let mut cursor = Cursor::new(Vec::new());
{
let mut encoder = JpegEncoder::new_with_quality(&mut cursor, 85);
encoder
.encode(&rgb.into_raw(), w, h, ColorType::Rgb8.into())
.expect("Failed to encode image to JPEG for PDF");
}
cursor.into_inner()
};
let mut pdf_image = pdf.image_xobject(image_id, &jpeg_buf);
pdf_image.filter(Filter::DctDecode);
pdf_image.width(page.image.width() as i32);
pdf_image.height(page.image.height() as i32);
pdf_image.color_space().device_rgb();
pdf_image.bits_per_component(8);
pdf_image.finish();
let mut content = Content::new();
content.save_state();
content.transform([a4.x2, 0.0, 0.0, a4.y2, 0.0, 0.0]);
content.x_object(image_name);
content.restore_state();
pdf.stream(content_id, &content.finish());
}
let page_count = pages.len();
pdf.pages(page_tree_id).kids(pages).count(page_count as i32);
std::fs::write(output, pdf.finish())?;
Ok(())
}
}

74
src/job.rs Normal file
View file

@ -0,0 +1,74 @@
use std::path::PathBuf;
use std::time::Duration;
use anyhow::Result;
use indicatif::{ProgressBar, ProgressStyle};
use rayon::prelude::*;
use crate::formats::cbz::CbzReader;
use crate::formats::pdf::PdfWriter;
use crate::formats::{FormatId, FormatReader, FormatWriter};
/// One conversion to perform: read `input_path` as `from`, write
/// `output_path` as `to`.
#[derive(Debug, Clone)]
pub struct Job {
    /// Format of the input file.
    pub from: FormatId,
    /// Format to produce.
    pub to: FormatId,
    /// File to convert.
    pub input_path: PathBuf,
    /// File to create.
    pub output_path: PathBuf,
}

impl Job {
    /// Builds a job that converts `input_path` into a file inside `output_dir`.
    ///
    /// The output keeps the input's file name, with its extension replaced by
    /// the one matching `to`.
    ///
    /// # Panics
    ///
    /// Panics if `input_path` has no file name.
    pub fn new(input_path: PathBuf, output_dir: PathBuf, from: FormatId, to: FormatId) -> Self {
        let file_name = input_path.file_name().unwrap();
        let extension = match to {
            FormatId::Pdf => "pdf",
            FormatId::Cbz => "cbz",
        };
        let output_path = output_dir.join(file_name).with_extension(extension);

        Self {
            from,
            to,
            input_path,
            output_path,
        }
    }
}
fn get_reader(format: FormatId) -> Box<dyn FormatReader> {
match format {
FormatId::Cbz => Box::new(CbzReader),
// Placeholder for future formats
FormatId::Pdf => unimplemented!("Reading PDF not implemented"),
}
}
fn get_writer(format: FormatId) -> Box<dyn FormatWriter> {
match format {
FormatId::Pdf => Box::new(PdfWriter),
// Placeholder for future formats
FormatId::Cbz => unimplemented!("Writing CBZ not implemented"),
}
}
pub fn process_jobs(jobs: Vec<Job>) -> Result<()> {
let pb = ProgressBar::new(jobs.len() as u64);
pb.enable_steady_tick(Duration::from_millis(300));
pb.set_style(ProgressStyle::with_template(
"[{elapsed_precise}] {wide_bar} {pos:>7}/{len:7} {msg}",
)?);
jobs.par_iter().for_each(|job| {
// Build the pipeline for each job
let reader = get_reader(job.from);
let writer = get_writer(job.to);
let doc = reader.read(&job.input_path).expect("Failed to read input");
writer
.write(&doc, &job.output_path)
.expect("Failed to write output");
pb.inc(1);
});
pb.finish();
Ok(())
}

3
src/lib.rs Normal file
View file

@ -0,0 +1,3 @@
pub mod formats;
pub mod job;
pub mod model;

View file

@ -2,20 +2,14 @@ use anyhow::Result;
use clap::{Parser, ValueHint};
use dialoguer::theme::ColorfulTheme;
use dialoguer::Confirm;
use image::DynamicImage;
use indicatif::{ProgressBar, ProgressStyle};
use log::*;
use pdf_writer::{Content, Filter, Finish, Name, Pdf, Rect, Ref};
use rayon::prelude::*;
use std::ffi::OsStr;
use std::fs::File;
use std::io::Read;
use std::path::{Path, PathBuf};
use std::time::Duration;
use std::path::Path;
use tabled::builder::Builder;
use tabled::settings::Style;
use walkdir::WalkDir;
use zip::ZipArchive;
use cbz2pdf::formats::FormatId;
use cbz2pdf::job::{process_jobs, Job};
#[derive(Parser)]
#[command()]
@ -49,10 +43,20 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let input_path = Path::new(&cli.input_path);
let output_dir = Path::new(&cli.output_dir);
let mut jobs = Vec::new();
let mut jobs: Vec<Job> = Vec::new();
if input_path.is_file() && input_path.extension() == Some(OsStr::new("cbz")) {
jobs.push(Job::new(input_path.to_path_buf(), output_dir.to_path_buf()));
if input_path.is_file() {
if let Some(FormatId::Cbz) = FormatId::detect_from_path(input_path) {
jobs.push(Job::new(
input_path.to_path_buf(),
output_dir.to_path_buf(),
FormatId::Cbz,
FormatId::Pdf,
));
} else {
eprintln!("Unsupported input file format");
std::process::exit(1);
}
} else if input_path.is_dir() {
jobs.extend(walk_directory(input_path, output_dir));
} else {
@ -62,22 +66,30 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
std::process::exit(1);
}
jobs.sort_by_key(|j| j.cbz_path.clone().into_os_string().into_string());
jobs.sort_by_key(|j| j.input_path.clone().into_os_string().into_string());
let proceed = if cli.interactive {
let mut table_builder = Builder::default();
table_builder.push_record(["From", "To"]);
jobs.iter().for_each(|job| {
table_builder.push_record(vec![
job.cbz_path.clone().into_os_string().into_string().unwrap(),
job.pdf_path.clone().into_os_string().into_string().unwrap(),
job.input_path
.clone()
.into_os_string()
.into_string()
.unwrap(),
job.output_path
.clone()
.into_os_string()
.into_string()
.unwrap(),
]);
});
let mut table = table_builder.build();
table.with(Style::rounded());
println!("{}", table);
println!("{table}");
Confirm::with_theme(&ColorfulTheme::default())
.with_prompt("Convert?")
@ -94,145 +106,22 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
}
fn walk_directory(directory: &Path, output_dir: &Path) -> Vec<Job> {
debug!("Walking {:?}", directory);
debug!("Walking {directory:?}");
let mut jobs = Vec::new();
for entry in WalkDir::new(directory) {
let entry = entry.unwrap();
let path = entry.path();
if path.is_file() && path.extension() == Some(OsStr::new("cbz")) {
jobs.push(Job::new(path.to_path_buf(), output_dir.to_path_buf()));
if path.is_file() {
if let Some(FormatId::Cbz) = FormatId::detect_from_path(path) {
jobs.push(Job::new(
path.to_path_buf(),
output_dir.to_path_buf(),
FormatId::Cbz,
FormatId::Pdf,
));
}
}
}
jobs
}
struct ImageFile {
pub name: String,
pub data: Vec<u8>,
}
struct DecodedImageFile {
pub name: String,
pub data: Vec<u8>,
pub image: DynamicImage,
}
impl From<&ImageFile> for DecodedImageFile {
fn from(value: &ImageFile) -> Self {
let image = image::load_from_memory(&value.data).unwrap();
Self {
name: value.name.clone(),
data: value.data.clone(),
image,
}
}
}
struct Job {
pub cbz_path: PathBuf,
pub pdf_path: PathBuf,
}
impl Job {
fn new(cbz_path: PathBuf, output_dir: PathBuf) -> Self {
let mut output_path = output_dir.join(cbz_path.file_name().unwrap());
output_path.set_extension("pdf");
Self {
cbz_path,
pdf_path: output_path,
}
}
}
fn convert_cbz(cbz_path: &Path, output_path: &Path) -> Result<()> {
let a4 = Rect::new(0.0, 0.0, 595.0, 842.0);
let mut zip = ZipArchive::new(File::open(cbz_path)?)?;
let mut files = Vec::new();
for i in 0..zip.len() {
let mut file = zip.by_index(i)?;
let mut image_data = Vec::new();
let name = file.enclosed_name().expect("Failed to read file name");
if name.extension() == Some(OsStr::new("jpg")) {
file.read_to_end(&mut image_data)?;
files.push(ImageFile {
name: name
.file_name()
.expect("Failed to read file name")
.to_string_lossy()
.to_string(),
data: image_data,
});
}
}
let mut images = Vec::new();
files
.par_iter()
.map(DecodedImageFile::from)
.collect_into_vec(&mut images);
images.par_sort_by_key(|img| img.name.clone());
let mut pdf = Pdf::new();
let catalog_id = Ref::new(1);
let page_tree_id = Ref::new(2);
pdf.catalog(catalog_id).pages(page_tree_id);
let mut pages = Vec::new();
let image_count = images.len();
for (pos, image) in images.iter().enumerate() {
let page_id = Ref::new(pos as i32 + 10);
let image_id = Ref::new(image_count as i32 + 10 + pos as i32);
let content_id = Ref::new(image_count as i32 * 3 + 10 + pos as i32);
pages.push(page_id);
let mut page = pdf.page(page_id);
let image_name = Name(b"Im1");
page.media_box(a4);
page.parent(page_tree_id);
page.contents(content_id);
page.resources().x_objects().pair(image_name, image_id);
page.finish();
let mut pdf_image = pdf.image_xobject(image_id, &image.data);
pdf_image.filter(Filter::DctDecode);
pdf_image.width(image.image.width() as i32);
pdf_image.height(image.image.height() as i32);
pdf_image.color_space().device_rgb();
pdf_image.bits_per_component(8);
pdf_image.finish();
let mut content = Content::new();
content.save_state();
content.transform([a4.x2, 0.0, 0.0, a4.y2, 0.0, 0.0]);
content.x_object(image_name);
content.restore_state();
pdf.stream(content_id, &content.finish());
}
let page_count = pages.len();
pdf.pages(page_tree_id).kids(pages).count(page_count as i32);
std::fs::write(output_path, pdf.finish())?;
Ok(())
}
fn process_jobs(jobs: Vec<Job>) -> Result<()> {
let pb = ProgressBar::new(jobs.len() as u64);
pb.enable_steady_tick(Duration::from_millis(300));
pb.set_style(ProgressStyle::with_template(
"[{elapsed_precise}] {wide_bar} {pos:>7}/{len:7} {msg}",
)?);
jobs.par_iter().for_each(|entry| {
convert_cbz(&entry.cbz_path, &entry.pdf_path).unwrap();
pb.inc(1);
});
pb.finish();
Ok(())
}

32
src/model.rs Normal file
View file

@ -0,0 +1,32 @@
use image::DynamicImage;
/// Optional descriptive information attached to a document.
///
/// Both fields default to `None`.
// NOTE(review): dead_code is allowed presumably because nothing reads these
// fields yet; confirm once a format starts using them.
#[derive(Default, Debug, Clone)]
#[allow(dead_code)]
pub struct Metadata {
/// Title of the document, if known.
pub title: Option<String>,
/// Author of the document, if known.
pub author: Option<String>,
}
/// A single page of a document, represented as an image.
#[derive(Debug, Clone)]
pub struct ImagePage {
/// Name of the page.
pub name: String,
/// The decoded image.
pub image: DynamicImage,
/// If available, the original JPEG data for this page, carried along so
/// that writers can avoid re-encoding.
pub jpeg_dct: Option<Vec<u8>>,
}
/// A format-independent document: a sequence of image pages plus metadata.
#[derive(Debug, Clone)]
pub struct Document {
    /// The pages, in document order.
    pub pages: Vec<ImagePage>,
    /// Descriptive information about the document.
    #[allow(dead_code)]
    pub metadata: Metadata,
}

impl Document {
    /// Creates a document from `pages`, with default (empty) metadata.
    pub fn new(pages: Vec<ImagePage>) -> Self {
        let metadata = Metadata::default();
        Document { pages, metadata }
    }
}

57
tests/cbz_reader_tests.rs Normal file
View file

@ -0,0 +1,57 @@
use std::fs::File;
use std::io::Write;
use cbz2pdf::formats::cbz::CbzReader;
use cbz2pdf::formats::FormatReader;
/// A CBZ containing three JPEGs (one nested, written out of order) and a text
/// file must yield exactly the three JPEG pages, sorted by file name.
#[test]
fn cbz_reader_reads_jpgs_and_sorts_by_name() {
    // One tiny 1x1 JPEG, reused as the content of every image entry.
    let pixel = image::DynamicImage::new_rgb8(1, 1).to_rgb8();
    let mut jpeg = Vec::new();
    image::codecs::jpeg::JpegEncoder::new_with_quality(std::io::Cursor::new(&mut jpeg), 80)
        .encode(&pixel, 1, 1, image::ColorType::Rgb8.into())
        .unwrap();

    let temp_dir = tempfile::tempdir().expect("create temp dir");
    let cbz_path = temp_dir.path().join("book.cbz");

    let mut archive = zip::ZipWriter::new(File::create(&cbz_path).expect("create cbz"));
    let options = zip::write::SimpleFileOptions::default();
    // Deliberately out of order, with one entry inside a sub-directory.
    for entry in ["002.jpg", "001.jpg", "subdir/003.jpg"] {
        archive.start_file(entry, options).unwrap();
        archive.write_all(&jpeg).unwrap();
    }
    // A non-jpg entry that the reader must skip.
    archive.start_file("notes.txt", options).unwrap();
    archive.write_all(b"hello").unwrap();
    // `finish` hands back the file, which is closed when dropped here.
    archive.finish().unwrap();

    let doc = CbzReader.read(&cbz_path).expect("read cbz");

    assert_eq!(doc.pages.len(), 3, "should include only jpg files");
    let names: Vec<String> = doc.pages.iter().map(|page| page.name.clone()).collect();
    assert_eq!(
        names,
        vec!["001.jpg", "002.jpg", "003.jpg"],
        "pages sorted by name"
    );
    // Dropping `temp_dir` removes the temporary directory.
}

View file

@ -0,0 +1,40 @@
use std::path::PathBuf;
use cbz2pdf::formats::FormatId;
use cbz2pdf::job::Job;
/// Known extensions map to their format; anything else maps to `None`.
#[test]
fn detect_from_path_recognizes_extensions() {
    let cases = [
        ("/tmp/book.cbz", Some(FormatId::Cbz)),
        ("/tmp/book.pdf", Some(FormatId::Pdf)),
        ("/tmp/book.txt", None),
    ];

    for (path, expected) in cases {
        assert_eq!(FormatId::detect_from_path(&PathBuf::from(path)), expected);
    }
}
/// The output file keeps the input's stem and takes the target format's extension.
#[test]
fn job_new_sets_output_extension() {
    let out_dir = PathBuf::from("/tmp");

    let to_pdf = Job::new(
        PathBuf::from("/tmp/book.cbz"),
        out_dir.clone(),
        FormatId::Cbz,
        FormatId::Pdf,
    );
    assert!(to_pdf.output_path.ends_with("book.pdf"));

    let to_cbz = Job::new(
        PathBuf::from("/tmp/book.pdf"),
        out_dir,
        FormatId::Pdf,
        FormatId::Cbz,
    );
    assert!(to_cbz.output_path.ends_with("book.cbz"));
}
/// CBZ is read-only and PDF is write-only for now.
#[test]
fn format_capabilities_consistent() {
    let cbz = FormatId::Cbz;
    assert!(cbz.can_read());
    assert!(!cbz.can_write());

    let pdf = FormatId::Pdf;
    assert!(pdf.can_write());
    assert!(!pdf.can_read());
}

38
tests/pdf_writer_smoke.rs Normal file
View file

@ -0,0 +1,38 @@
use std::fs;
use std::io::Read;
use cbz2pdf::formats::pdf::PdfWriter;
use cbz2pdf::formats::FormatWriter;
use cbz2pdf::model::{Document, ImagePage};
/// Smoke test: writing a one-page document without original JPEG bytes
/// produces a non-empty file that starts with the PDF magic bytes.
#[test]
fn pdf_writer_writes_valid_pdf_header() {
    // A 2x2 solid red page with no JPEG data attached.
    let red = image::RgbImage::from_pixel(2, 2, image::Rgb([255, 0, 0]));
    let doc = Document::new(vec![ImagePage {
        name: "page1.jpg".to_string(),
        image: image::DynamicImage::ImageRgb8(red),
        jpeg_dct: None,
    }]);

    let temp_dir = tempfile::tempdir().expect("create temp dir");
    let output = temp_dir.path().join("out.pdf");

    PdfWriter.write(&doc, &output).expect("failed to write PDF");

    // The file must exist and begin with "%PDF-".
    let mut header = [0u8; 5];
    let mut pdf_file = fs::File::open(&output).expect("pdf not created");
    pdf_file.read_exact(&mut header).expect("cannot read header");
    assert_eq!(&header, b"%PDF-", "missing PDF header");

    let size = fs::metadata(&output).unwrap().len();
    assert!(size > 0, "empty pdf");
    // Dropping `temp_dir` removes the temporary directory.
}