feat: implement cbz writing and pdf reading
All checks were successful
Build and test / Clippy (pull_request) Successful in 44s
Build and test / Tests (pull_request) Successful in 48s
Checking yaml / Run yamllint (pull_request) Successful in 5s
Checking Renovate configuration / validate (pull_request) Successful in 1m4s
Build and test / Build AMD64 (pull_request) Successful in 49s
Build and test / Generate Documentation (pull_request) Successful in 59s

This commit is contained in:
Marc Plano-Lesay 2025-10-13 16:52:25 +11:00
parent 3aa68fbe12
commit b35ccbe271
Signed by: kernald
GPG key ID: 66A41B08CC62A6CF
10 changed files with 643 additions and 57 deletions

View file

@ -1,6 +1,6 @@
use std::ffi::OsStr;
use std::fs::File;
use std::io::Read;
use std::io::{Read, Write};
use std::path::Path;
use anyhow::Result;
@ -9,7 +9,7 @@ use zip::ZipArchive;
use crate::model::{Document, ImagePage};
use super::FormatReader;
use super::{FormatReader, FormatWriter};
pub struct CbzReader;
@ -51,3 +51,40 @@ impl FormatReader for CbzReader {
Ok(Document::new(pages))
}
}
/// Format writer that packs the pages of a [`Document`] into a CBZ (zip) archive.
pub struct CbzWriter;

impl FormatWriter for CbzWriter {
    /// Creates the archive at `output` and stores one JPEG entry per page.
    ///
    /// A page keeps its own name only when that name already has a `jpg`
    /// extension; otherwise the entry is named after its 1-based position,
    /// zero-padded to three digits. Pages that carry original JPEG bytes are
    /// stored verbatim, all others are re-encoded as JPEG at quality 85.
    fn write(&self, doc: &Document, output: &Path) -> Result<()> {
        use image::codecs::jpeg::JpegEncoder;
        use zip::write::SimpleFileOptions;

        let mut archive = zip::ZipWriter::new(File::create(output)?);
        let entry_options = SimpleFileOptions::default();

        for (index, page) in doc.pages.iter().enumerate() {
            let has_jpg_extension =
                Path::new(&page.name).extension().and_then(OsStr::to_str) == Some("jpg");
            let entry_name = if has_jpg_extension {
                page.name.clone()
            } else {
                format!("{:03}.jpg", index + 1)
            };
            archive.start_file(&entry_name, entry_options)?;

            match &page.jpeg_dct {
                // Original JPEG bytes are available: copy them through untouched.
                Some(raw_jpeg) => archive.write_all(raw_jpeg)?,
                // No original bytes: encode the decoded pixels into a scratch buffer first.
                None => {
                    let pixels = page.image.to_rgb8();
                    let (width, height) = pixels.dimensions();
                    let mut encoded = std::io::Cursor::new(Vec::new());
                    JpegEncoder::new_with_quality(&mut encoded, 85).encode(
                        pixels.as_raw(),
                        width,
                        height,
                        image::ColorType::Rgb8.into(),
                    )?;
                    archive.write_all(encoded.get_ref())?;
                }
            }
        }

        archive.finish()?;
        Ok(())
    }
}

View file

@ -8,6 +8,9 @@ use crate::model::Document;
pub mod cbz;
pub mod pdf;
use cbz::{CbzReader, CbzWriter};
use pdf::{PdfReader, PdfWriter};
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FormatId {
Cbz,
@ -17,18 +20,12 @@ pub enum FormatId {
impl FormatId {
#[allow(dead_code)]
pub fn can_read(self) -> bool {
match self {
FormatId::Cbz => true,
FormatId::Pdf => false, // planned but not implemented yet
}
get_reader(self).is_some()
}
#[allow(dead_code)]
pub fn can_write(self) -> bool {
match self {
FormatId::Pdf => true,
FormatId::Cbz => false, // planned but not implemented yet
}
get_writer(self).is_some()
}
pub fn detect_from_path(path: &Path) -> Option<FormatId> {
@ -47,3 +44,17 @@ pub trait FormatReader: Send + Sync {
/// A writer that serializes a [`Document`] into one concrete output format.
///
/// Implementors must be `Send + Sync`, as required by the supertrait bounds.
pub trait FormatWriter: Send + Sync {
    /// Writes `doc` to the path `output`, returning an error if writing fails.
    fn write(&self, doc: &Document, output: &Path) -> Result<()>;
}
/// Looks up the reader implementation registered for `format`.
///
/// Every current [`FormatId`] variant has a reader, so this presently always
/// returns `Some`; the `Option` leaves room for formats without one.
pub fn get_reader(format: FormatId) -> Option<Box<dyn FormatReader>> {
    let reader: Box<dyn FormatReader> = match format {
        FormatId::Cbz => Box::new(CbzReader),
        FormatId::Pdf => Box::new(PdfReader),
    };
    Some(reader)
}
/// Looks up the writer implementation registered for `format`.
///
/// Every current [`FormatId`] variant has a writer, so this presently always
/// returns `Some`; the `Option` leaves room for formats without one.
pub fn get_writer(format: FormatId) -> Option<Box<dyn FormatWriter>> {
    let writer: Box<dyn FormatWriter> = match format {
        FormatId::Cbz => Box::new(CbzWriter),
        FormatId::Pdf => Box::new(PdfWriter),
    };
    Some(writer)
}

View file

@ -1,12 +1,13 @@
use anyhow::Result;
use image::{codecs::jpeg::JpegEncoder, ColorType};
use image::codecs::jpeg::JpegEncoder;
use image::ColorType;
use pdf_writer::{Content, Filter, Finish, Name, Pdf, Rect, Ref};
use std::io::Cursor;
use std::path::Path;
use crate::model::Document;
use crate::model::{Document, ImagePage};
use super::FormatWriter;
use super::{FormatReader, FormatWriter};
pub struct PdfWriter;
@ -77,3 +78,194 @@ impl FormatWriter for PdfWriter {
Ok(())
}
}
/// Format reader that rebuilds a [`Document`] from the raster images embedded in a PDF.
pub struct PdfReader;

impl FormatReader for PdfReader {
    /// Loads the PDF at `input` and extracts at most one image per page.
    ///
    /// For each page, the image XObjects listed in the page's own `Resources`
    /// are tried in order: first those painted by the page content (`/Name Do`),
    /// in paint order, then the remaining ones in dictionary order. The first
    /// stream that the `image` crate can decode becomes the page. `DCTDecode`
    /// streams also keep their original JPEG bytes so writers can pass them
    /// through without re-encoding.
    ///
    /// Pages whose object is not a dictionary, that have no XObject resources,
    /// or that contain no decodable image are skipped. `Resources` inherited
    /// from a parent page-tree node are not looked up.
    ///
    /// # Errors
    /// Returns an error if `lopdf` cannot load the file, or if a page id from
    /// the page tree cannot be resolved to an object.
    fn read(&self, input: &Path) -> Result<Document> {
        let doc = lopdf::Document::load(input)?;
        let pages_map = doc.get_pages();
        let mut image_pages: Vec<ImagePage> = Vec::new();

        for (idx, page_id) in pages_map.values().enumerate() {
            let page_dict = match doc.get_object(*page_id)?.as_dict() {
                Ok(dict) => dict,
                Err(_) => continue,
            };
            let xobjects = match Self::page_xobjects(&doc, page_dict) {
                Some(xobjects) => xobjects,
                None => continue,
            };
            let referenced = Self::referenced_xobject_names(&doc, page_dict);
            if let Some(page) = Self::first_image(&doc, xobjects, &referenced, idx) {
                image_pages.push(page);
            }
        }

        Ok(Document::new(image_pages))
    }
}

impl PdfReader {
    /// Returns `obj` as a dictionary, following one level of indirection if it is a reference.
    fn resolve_dict<'a>(
        doc: &'a lopdf::Document,
        obj: &'a lopdf::Object,
    ) -> Option<&'a lopdf::Dictionary> {
        use lopdf::Object;

        match obj {
            Object::Dictionary(dict) => Some(dict),
            Object::Reference(id) => match doc.get_object(*id) {
                Ok(Object::Dictionary(dict)) => Some(dict),
                _ => None,
            },
            _ => None,
        }
    }

    /// Finds the `Resources -> XObject` dictionary of a page (either level may be a reference).
    fn page_xobjects<'a>(
        doc: &'a lopdf::Document,
        page: &'a lopdf::Dictionary,
    ) -> Option<&'a lopdf::Dictionary> {
        let resources = Self::resolve_dict(doc, page.get(b"Resources").ok()?)?;
        Self::resolve_dict(doc, resources.get(b"XObject").ok()?)
    }

    /// Returns the decoded bytes of a content stream.
    ///
    /// Content streams are usually compressed (e.g. `FlateDecode`), so the raw
    /// bytes cannot be scanned for operators. When the stream has no filter, or
    /// the filter cannot be decoded, the raw bytes are used as a best effort.
    fn content_bytes(stream: &lopdf::Stream) -> Vec<u8> {
        stream
            .decompressed_content()
            .unwrap_or_else(|_| stream.content.clone())
    }

    /// Collects, in paint order, the XObject names invoked by the page's content stream(s).
    fn referenced_xobject_names(
        doc: &lopdf::Document,
        page: &lopdf::Dictionary,
    ) -> Vec<Vec<u8>> {
        use lopdf::Object;

        let mut names = Vec::new();
        let mut scan = |stream: &lopdf::Stream| {
            names.extend(extract_xobject_names(&Self::content_bytes(stream)));
        };

        // `Contents` may be a single stream reference, an array of references,
        // or (non-conforming but tolerated) a direct stream.
        match page.get(b"Contents") {
            Ok(Object::Reference(id)) => {
                if let Ok(Object::Stream(stream)) = doc.get_object(*id) {
                    scan(stream);
                }
            }
            Ok(Object::Array(items)) => {
                for item in items {
                    if let Object::Reference(id) = item {
                        if let Ok(Object::Stream(stream)) = doc.get_object(*id) {
                            scan(stream);
                        }
                    }
                }
            }
            Ok(Object::Stream(stream)) => scan(stream),
            _ => {}
        }
        names
    }

    /// Picks the first decodable image XObject, preferring those named in `referenced`.
    ///
    /// `idx` is the 0-based page position and is only used to build the page name.
    fn first_image(
        doc: &lopdf::Document,
        xobjects: &lopdf::Dictionary,
        referenced: &[Vec<u8>],
        idx: usize,
    ) -> Option<ImagePage> {
        use lopdf::Object;

        let mut candidates: Vec<(&Vec<u8>, &Object)> = xobjects.iter().collect();
        // Stable sort: referenced entries first (in paint order), the rest keep
        // their dictionary order.
        candidates.sort_by_key(|(key, _)| {
            let name: &[u8] = if key.starts_with(b"/") {
                &key[1..]
            } else {
                key.as_slice()
            };
            referenced
                .iter()
                .position(|r| r.as_slice() == name)
                .unwrap_or(usize::MAX)
        });

        for (_, entry) in candidates {
            let stream = match entry {
                Object::Reference(id) => match doc.get_object(*id) {
                    Ok(Object::Stream(stream)) => stream,
                    _ => continue,
                },
                _ => continue,
            };

            let is_image =
                matches!(stream.dict.get(b"Subtype"), Ok(Object::Name(n)) if n == b"Image");
            if !is_image {
                continue;
            }

            let is_dct = match stream.dict.get(b"Filter") {
                Ok(Object::Name(n)) => n == b"DCTDecode",
                Ok(Object::Array(filters)) => filters
                    .iter()
                    .any(|o| matches!(o, Object::Name(n) if n == b"DCTDecode")),
                _ => false,
            };

            // Image data is decoded from the raw stream bytes: a DCT stream is a
            // JPEG file as-is; anything else is a best-effort attempt.
            let decoded = match image::load_from_memory(&stream.content) {
                Ok(decoded) => decoded,
                Err(_) => continue,
            };

            return Some(ImagePage {
                name: format!("{:03}.jpg", idx + 1),
                image: decoded,
                // Keep the original JPEG bytes only when the stream really is DCT-encoded.
                jpeg_dct: is_dct.then(|| stream.content.clone()),
            });
        }
        None
    }
}
/// Extracts the XObject names painted by a (decoded) PDF content stream.
///
/// Scans `content` for a name operand immediately followed by the `Do`
/// operator (`/Name Do`) and returns each such name, without the leading `/`,
/// in order of appearance. Duplicates are kept.
///
/// Tokens are split using the PDF lexical rules: whitespace (including NUL)
/// and the delimiter characters `( ) < > [ ] { } / %` end a token, so
/// `cm/Im0 Do` is recognised even without a space before the name. `%`
/// comments and `( ... )` string literals are skipped, so text inside them is
/// never mistaken for an operator.
///
/// This is still a lightweight scanner, not a full parser: inline image data
/// (`BI ... ID ... EI`) is not skipped and `#xx` escapes in names are returned
/// undecoded.
fn extract_xobject_names(content: &[u8]) -> Vec<Vec<u8>> {
    /// PDF whitespace characters.
    fn is_whitespace(byte: u8) -> bool {
        matches!(byte, 0x00 | b'\t' | b'\n' | 0x0C | b'\r' | b' ')
    }

    /// PDF delimiter characters, which terminate a token without whitespace.
    fn is_delimiter(byte: u8) -> bool {
        matches!(
            byte,
            b'(' | b')' | b'<' | b'>' | b'[' | b']' | b'{' | b'}' | b'/' | b'%'
        )
    }

    /// Index just past the run of regular characters starting at `from`.
    fn regular_run_end(content: &[u8], from: usize) -> usize {
        content[from..]
            .iter()
            .position(|&b| is_whitespace(b) || is_delimiter(b))
            .map_or(content.len(), |offset| from + offset)
    }

    /// Index just past the literal string opening at `open`, honouring nested
    /// parentheses and backslash escapes. An unterminated string runs to the end.
    fn skip_literal_string(content: &[u8], open: usize) -> usize {
        let mut depth = 0usize;
        let mut i = open;
        while i < content.len() {
            match content[i] {
                b'\\' => i += 1, // the escaped byte is skipped by the increment below
                b'(' => depth += 1,
                b')' => {
                    depth -= 1;
                    if depth == 0 {
                        return i + 1;
                    }
                }
                _ => {}
            }
            i += 1;
        }
        content.len()
    }

    let mut names = Vec::new();
    // The name operand seen immediately before the current token, if any.
    let mut pending_name: Option<&[u8]> = None;
    let mut i = 0;

    while i < content.len() {
        let byte = content[i];
        match byte {
            _ if is_whitespace(byte) => i += 1,
            b'%' => {
                // A comment runs to the end of the line and does not separate
                // an operand from its operator.
                i = content[i..]
                    .iter()
                    .position(|&b| b == b'\n' || b == b'\r')
                    .map_or(content.len(), |offset| i + offset);
            }
            b'(' => {
                i = skip_literal_string(content, i);
                pending_name = None;
            }
            b'/' => {
                let end = regular_run_end(content, i + 1);
                pending_name = Some(&content[i + 1..end]);
                i = end;
            }
            _ if is_delimiter(byte) => {
                i += 1;
                pending_name = None;
            }
            _ => {
                // A regular token: an operator or a non-name operand.
                let end = regular_run_end(content, i);
                let operand = pending_name.take();
                if &content[i..end] == b"Do" {
                    if let Some(name) = operand {
                        names.push(name.to_vec());
                    }
                }
                i = end;
            }
        }
    }

    names
}

View file

@ -5,9 +5,7 @@ use anyhow::Result;
use indicatif::{ProgressBar, ProgressStyle};
use rayon::prelude::*;
use crate::formats::cbz::CbzReader;
use crate::formats::pdf::PdfWriter;
use crate::formats::{FormatId, FormatReader, FormatWriter};
use crate::formats::{get_reader, get_writer, FormatId};
#[derive(Debug, Clone)]
pub struct Job {
@ -34,22 +32,6 @@ impl Job {
}
}
/// Returns the reader implementation for `format`.
///
/// # Panics
/// Panics via `unimplemented!` when `format` is `FormatId::Pdf`, since no PDF
/// reader is wired in here.
fn get_reader(format: FormatId) -> Box<dyn FormatReader> {
    match format {
        FormatId::Cbz => Box::new(CbzReader),
        // Placeholder for future formats
        FormatId::Pdf => unimplemented!("Reading PDF not implemented"),
    }
}
/// Returns the writer implementation for `format`.
///
/// # Panics
/// Panics via `unimplemented!` when `format` is `FormatId::Cbz`, since no CBZ
/// writer is wired in here.
fn get_writer(format: FormatId) -> Box<dyn FormatWriter> {
    match format {
        FormatId::Pdf => Box::new(PdfWriter),
        // Placeholder for future formats
        FormatId::Cbz => unimplemented!("Writing CBZ not implemented"),
    }
}
pub fn process_jobs(jobs: Vec<Job>) -> Result<()> {
let pb = ProgressBar::new(jobs.len() as u64);
pb.enable_steady_tick(Duration::from_millis(300));
@ -59,8 +41,8 @@ pub fn process_jobs(jobs: Vec<Job>) -> Result<()> {
jobs.par_iter().for_each(|job| {
// Build the pipeline for each job
let reader = get_reader(job.from);
let writer = get_writer(job.to);
let reader = get_reader(job.from).expect("No reader registered for selected input format");
let writer = get_writer(job.to).expect("No writer registered for selected output format");
let doc = reader.read(&job.input_path).expect("Failed to read input");
writer

View file

@ -11,6 +11,21 @@ use walkdir::WalkDir;
use cbz2pdf::formats::FormatId;
use cbz2pdf::job::{process_jobs, Job};
// Format names accepted on the command line (used by the `--from` and `--to`
// options). Plain `//` comments are used deliberately: `///` doc comments on a
// clap `ValueEnum` are picked up as help text and would change the CLI output.
#[derive(clap::ValueEnum, Clone, Debug)]
enum CliFormat {
    // CBZ comic book archive.
    Cbz,
    // PDF document.
    Pdf,
}
/// Maps each command-line format choice onto the library's format identifier.
impl From<CliFormat> for FormatId {
    /// Converts one-to-one: every `CliFormat` variant has a same-named `FormatId`.
    fn from(value: CliFormat) -> Self {
        match value {
            CliFormat::Pdf => Self::Pdf,
            CliFormat::Cbz => Self::Cbz,
        }
    }
}
#[derive(Parser)]
#[command()]
struct Cli {
@ -18,7 +33,7 @@ struct Cli {
short = 'i',
long = "input",
value_hint = ValueHint::FilePath,
help = "Path to CBZ file or directory containing CBZ files"
help = "Path to input file or directory"
)]
input_path: String,
@ -27,10 +42,20 @@ struct Cli {
long = "output-directory",
default_value = ".",
value_hint = ValueHint::FilePath,
help = "Output directory for PDF files"
help = "Output directory for converted files"
)]
output_dir: String,
#[arg(
long = "from",
value_enum,
help = "Input format. If omitted, auto-detect from file extension"
)]
from: Option<CliFormat>,
#[arg(long = "to", value_enum, default_value = "pdf", help = "Output format")]
to: CliFormat,
#[arg(short = 'p', long, help = "Ask for confirmation before doing anything")]
interactive: bool,
}
@ -43,29 +68,51 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let input_path = Path::new(&cli.input_path);
let output_dir = Path::new(&cli.output_dir);
let from_opt: Option<FormatId> = cli.from.map(Into::into);
let to_fmt: FormatId = cli.to.into();
// Validate target capability early
if !to_fmt.can_write() {
eprintln!("Selected output format is not supported for writing: {to_fmt:?}");
std::process::exit(1);
}
let mut jobs: Vec<Job> = Vec::new();
if input_path.is_file() {
if let Some(FormatId::Cbz) = FormatId::detect_from_path(input_path) {
jobs.push(Job::new(
input_path.to_path_buf(),
output_dir.to_path_buf(),
FormatId::Cbz,
FormatId::Pdf,
));
} else {
eprintln!("Unsupported input file format");
let detected = FormatId::detect_from_path(input_path);
let from_fmt = from_opt.or(detected).unwrap_or_else(|| {
eprintln!(
"Could not detect input format from file extension and no --from was provided"
);
std::process::exit(1);
});
if !from_fmt.can_read() {
eprintln!("Selected/Detected input format is not supported for reading: {from_fmt:?}");
std::process::exit(1);
}
jobs.push(Job::new(
input_path.to_path_buf(),
output_dir.to_path_buf(),
from_fmt,
to_fmt,
));
} else if input_path.is_dir() {
jobs.extend(walk_directory(input_path, output_dir));
jobs.extend(walk_directory(input_path, output_dir, from_opt, to_fmt));
} else {
eprintln!(
"Invalid input path. Please provide a CBZ file or a directory containing CBZ files."
"Invalid input path. Please provide a valid file or a directory containing supported files."
);
std::process::exit(1);
}
if jobs.is_empty() {
eprintln!("No matching inputs found to process.");
std::process::exit(1);
}
jobs.sort_by_key(|j| j.input_path.clone().into_os_string().into_string());
let proceed = if cli.interactive {
@ -105,20 +152,33 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
Ok(())
}
fn walk_directory(directory: &Path, output_dir: &Path) -> Vec<Job> {
fn walk_directory(
directory: &Path,
output_dir: &Path,
from_opt: Option<FormatId>,
to_fmt: FormatId,
) -> Vec<Job> {
debug!("Walking {directory:?}");
let mut jobs = Vec::new();
for entry in WalkDir::new(directory) {
let entry = entry.unwrap();
let path = entry.path();
if path.is_file() {
if let Some(FormatId::Cbz) = FormatId::detect_from_path(path) {
jobs.push(Job::new(
path.to_path_buf(),
output_dir.to_path_buf(),
FormatId::Cbz,
FormatId::Pdf,
));
let detected = FormatId::detect_from_path(path);
let from_fmt_opt = match from_opt {
Some(fixed) => detected.filter(|d| *d == fixed),
None => detected,
};
if let Some(from_fmt) = from_fmt_opt {
if from_fmt.can_read() && to_fmt.can_write() {
jobs.push(Job::new(
path.to_path_buf(),
output_dir.to_path_buf(),
from_fmt,
to_fmt,
));
}
}
}
}