Python PDF和图片互转

将多张图片合并成PDF,或从PDF中提取图片。

PyMuPDF库

在运行下面的代码之前,你需要先安装PyMuPDF库:

pip install PyMuPDF

tqdm库

代码使用tqdm库来显示进度条。如果你还没有安装,请先运行以下命令:

pip install tqdm

PDF提取图片

import fitz  # PyMuPDF
import os
from tqdm import tqdm

# Extract every embedded image from each PDF in the current directory.
# Images from "<name>.pdf" are written to a sibling folder "<name>/" as
# "extracted_image_page<P>_<N>.<ext>" (P = 1-based page, N = 1-based image index).

# Match the extension case-insensitively so "REPORT.PDF" is picked up too,
# consistent with how the image-merging script matches image extensions.
pdf_files = [f for f in os.listdir('.') if f.lower().endswith('.pdf')]

for pdf_file in pdf_files:
    # Output folder is named after the PDF with its extension removed.
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    pdf_folder = os.path.splitext(pdf_file)[0]
    os.makedirs(pdf_folder, exist_ok=True)

    # Context managers guarantee the document and the progress bar are
    # closed even if an extraction or a file write raises part-way through.
    with fitz.open(pdf_file) as pdf, tqdm(
        total=pdf.page_count, desc=f"Extracting images from {pdf_file}"
    ) as progress_bar:
        for page_num, page in enumerate(pdf, start=1):
            for img_num, img in enumerate(page.get_images(), start=1):
                # The first field of each image entry is its xref number.
                xref = img[0]
                base_image = pdf.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                image_filename = f"extracted_image_page{page_num}_{img_num}.{image_ext}"
                image_filepath = os.path.join(pdf_folder, image_filename)

                with open(image_filepath, 'wb') as img_file:
                    img_file.write(image_bytes)

            # Advance only after the page is fully processed so the bar
            # reflects completed work.
            progress_bar.update(1)

PDF提取图片-加速版

import fitz  # PyMuPDF
import os
import multiprocessing
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

def extract_images_from_page(args):
    """Extract all images of one PDF page and write them to disk.

    Designed as a multiprocessing worker: each call opens its own document
    handle from the path instead of receiving an open document.

    Args:
        args: A ``(pdf_path, page_num, pdf_folder)`` tuple, where
            ``page_num`` is the 0-based page index and ``pdf_folder`` is the
            existing directory that receives the image files.

    Returns:
        None. Files are written as
        ``extracted_image_page<page_num+1>_<img_num>.<ext>`` in ``pdf_folder``.
    """
    pdf_path, page_num, pdf_folder = args

    # The context manager closes the document even if extraction or a file
    # write raises, so a failing page does not leak the handle in the worker.
    with fitz.open(pdf_path) as pdf:
        page = pdf[page_num]

        for img_num, img in enumerate(page.get_images(), start=1):
            # The first field of each image entry is its xref number.
            xref = img[0]
            base_image = pdf.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            image_filename = f"extracted_image_page{page_num+1}_{img_num}.{image_ext}"
            image_filepath = os.path.join(pdf_folder, image_filename)

            with open(image_filepath, 'wb') as img_file:
                img_file.write(image_bytes)
    
def prepare_extraction(pdf_file):
    """Create the output folder for a PDF and build its per-page task list.

    Args:
        pdf_file: Path of the PDF to process.

    Returns:
        A list of ``(pdf_path, page_num, pdf_folder)`` tuples, one per page
        (``page_num`` is 0-based), in page order.
    """
    # Output folder is named after the PDF with its extension removed.
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    pdf_folder = os.path.splitext(pdf_file)[0]
    os.makedirs(pdf_folder, exist_ok=True)

    # Only the page count is needed here; the context manager closes the
    # document right away, even if reading it fails.
    with fitz.open(pdf_file) as pdf:
        page_count = pdf.page_count

    return [(pdf_file, page_num, pdf_folder) for page_num in range(page_count)]

if __name__ == "__main__":
    # Match the extension case-insensitively so "REPORT.PDF" is picked up too.
    pdf_files = [f for f in os.listdir('.') if f.lower().endswith('.pdf')]

    # Build every per-page task up front. The progress total is taken from the
    # task list, so each PDF is opened only once (inside prepare_extraction)
    # and no document handle is left open just to count pages.
    tasks = []
    for pdf_file in pdf_files:
        tasks.extend(prepare_extraction(pdf_file))

    # Use roughly 80% of the available CPU cores, but always at least one.
    num_cores = max(1, int(cpu_count() * 0.8))

    # The context manager closes the progress bar even if a worker raises.
    with tqdm(total=len(tasks), desc="Extracting images from PDFs") as progress_bar:
        if tasks:  # do not spawn worker processes when there is nothing to do
            with Pool(processes=num_cores) as pool:
                # Results carry no data; each completed item is one finished page.
                for _ in pool.imap_unordered(extract_images_from_page, tasks):
                    progress_bar.update(1)

多张图片合并成PDF文档-加速版

from PIL import Image
import os
import re
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

def natural_keys(text):
    """Return a natural-sort key for *text*.

    Runs of decimal digits are compared by numeric value and everything else
    is compared case-insensitively, so ``"img2"`` sorts before ``"img10"``.

    Args:
        text: The string (typically a file name) to build a key for.

    Returns:
        A list alternating lower-cased text chunks (even positions) and
        ``int`` values (odd positions).
    """
    # re.split with a capturing group always puts the non-matching text at
    # even indices and the captured digit runs at odd indices. Using that
    # parity instead of str.isdigit() avoids calling int() on characters such
    # as "²", which isdigit() accepts but \d does not match and int() rejects.
    parts = re.split(r'(\d+)', text)
    return [int(part) if index % 2 else part.lower() for index, part in enumerate(parts)]

def convert_image_to_pdf(image_path):
    """Load an image file and return it converted to RGB mode.

    Despite the name, no PDF is produced here; the returned image is later
    passed to ``Image.save`` by the caller.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A new image object in ``'RGB'`` mode.
    """
    # convert() returns a separate image, so the source image (and the file
    # handle it holds) can be closed as soon as the conversion is done.
    with Image.open(image_path) as source:
        return source.convert('RGB')

def main():
    """Merge all images in the current directory into ``output.pdf``.

    Image files are selected by extension (case-insensitive), ordered with
    natural sorting, converted to RGB in parallel, and saved as a multi-page
    PDF with one image per page. Nothing is written when no image is found.
    """
    # Collect the image files of the current directory in natural order.
    image_files = sorted(
        [f for f in os.listdir('.') if f.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'))],
        key=natural_keys
    )

    # Nothing to merge: return before spawning any worker process.
    if not image_files:
        return

    # Use roughly 80% of the CPU cores, at least one, and never more workers
    # than there are images to convert.
    num_cores = max(1, min(len(image_files), int(cpu_count() * 0.8)))

    with Pool(num_cores) as pool:
        # imap (not imap_unordered) keeps the results in input order, which
        # determines the page order of the PDF.
        images = list(tqdm(pool.imap(convert_image_to_pdf, image_files), total=len(image_files)))

    # Save the first image as the PDF and append the remaining ones as pages.
    pdf_filename = "output.pdf"
    images[0].save(pdf_filename, save_all=True, append_images=images[1:], quality=100, subsampling=0)

# Run main() only when this file is executed as a script. With multiprocessing,
# worker processes may import this module (e.g. under the "spawn" start
# method), and this guard keeps them from re-running main().
if __name__ == '__main__':
    main()