将多张图片合并成PDF,或从PDF中提取图片。
PyMuPDF库
在运行下面的代码之前,你需要先安装PyMuPDF库:
pip install PyMuPDF
tqdm库
你还需要安装tqdm库来显示进度条。如果尚未安装,请先运行以下命令:
pip install tqdm
PDF提取图片
import fitz # PyMuPDF
import os
from tqdm import tqdm
# Collect every PDF file in the current working directory.
pdf_files = [f for f in os.listdir('.') if f.endswith('.pdf')]

for pdf_file in pdf_files:
    # The context manager closes the document even if extraction or a file
    # write fails partway through.
    with fitz.open(pdf_file) as pdf:
        # Images are written to a folder named after the PDF (name minus
        # extension); an already existing folder is reused.
        pdf_folder = os.path.splitext(pdf_file)[0]
        os.makedirs(pdf_folder, exist_ok=True)

        # One progress-bar step per page; the bar is closed on any exit path.
        with tqdm(total=pdf.page_count, desc=f"Extracting images from {pdf_file}") as progress_bar:
            for page_num, page in enumerate(pdf):
                progress_bar.update(1)

                # Save every image referenced by this page.
                for img_num, img in enumerate(page.get_images(), start=1):
                    xref = img[0]
                    base_image = pdf.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]

                    # File name carries the 1-based page number and the
                    # 1-based index of the image within that page.
                    image_filename = f"extracted_image_page{page_num+1}_{img_num}.{image_ext}"
                    image_filepath = os.path.join(pdf_folder, image_filename)

                    with open(image_filepath, 'wb') as img_file:
                        img_file.write(image_bytes)
PDF提取图片-加速版
import fitz # PyMuPDF
import os
import multiprocessing
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
def extract_images_from_page(args):
    """Extract every image on one PDF page and write each to disk.

    The parameters arrive as a single tuple so the function can be handed
    directly to ``Pool.imap_unordered``.

    Args:
        args: ``(pdf_path, page_num, pdf_folder)`` -- the PDF to open, the
            zero-based index of the page to process, and the directory the
            image files are written into.
    """
    pdf_path, page_num, pdf_folder = args

    # Every call opens its own document handle; the ``with`` block closes it
    # even when extraction or a file write raises.
    with fitz.open(pdf_path) as pdf:
        page = pdf[page_num]
        for img_num, img in enumerate(page.get_images(), start=1):
            xref = img[0]
            base_image = pdf.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            # File name carries the 1-based page number and the 1-based index
            # of the image within that page.
            image_filename = f"extracted_image_page{page_num+1}_{img_num}.{image_ext}"
            image_filepath = os.path.join(pdf_folder, image_filename)
            with open(image_filepath, 'wb') as img_file:
                img_file.write(image_bytes)
def prepare_extraction(pdf_file):
    """Create the output folder for a PDF and build its per-page task list.

    Args:
        pdf_file: Path of the PDF file to process.

    Returns:
        A list with one ``(pdf_path, page_num, pdf_folder)`` tuple per page,
        where ``page_num`` is the zero-based page index and ``pdf_folder`` is
        the PDF path with its extension removed.
    """
    # The output directory is the PDF path minus its extension.
    output_dir, _extension = os.path.splitext(pdf_file)
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    # The document is opened only long enough to read its page count.
    document = fitz.open(pdf_file)
    page_total = len(document)
    document.close()

    return [(pdf_file, index, output_dir) for index in range(page_total)]
if __name__ == "__main__":
    pdf_files = [f for f in os.listdir('.') if f.endswith('.pdf')]

    # Count the pages of all PDFs for the progress-bar total. Each document
    # is closed right after its page count is read so no handles are leaked.
    total_pages = 0
    for pdf_file in pdf_files:
        with fitz.open(pdf_file) as doc:
            total_pages += doc.page_count

    # Use roughly 80% of the available CPU cores, but always at least one.
    num_cores = max(1, int(cpu_count() * 0.8))

    # The progress bar is closed on any exit path, including worker errors.
    with tqdm(total=total_pages, desc="Extracting images from PDFs") as progress_bar:
        with Pool(processes=num_cores) as pool:
            for pdf_file in pdf_files:
                args = prepare_extraction(pdf_file)
                # Pages may finish in any order; each finished page advances
                # the bar by one step.
                for _ in pool.imap_unordered(extract_images_from_page, args):
                    progress_bar.update(1)
多张图片合并成PDF文档-加速版
from PIL import Image
import os
import re
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
# Sort key that orders file names "naturally" (img2 before img10).
def natural_keys(text):
    """Return a sort key that compares digit runs in *text* numerically.

    The text is split into alternating non-digit and digit chunks. Digit
    chunks become ``int`` values; all other chunks are lower-cased so the
    comparison is case-insensitive.

    Args:
        text: The string (typically a file name) to build a key for.

    Returns:
        A list alternating ``str`` and ``int`` items, starting with a
        (possibly empty) ``str``.
    """
    # ``isdecimal`` is used rather than ``isdigit``: ``isdigit`` is also true
    # for characters such as superscript "²", which ``\d`` does not match and
    # ``int()`` cannot parse, so such names would raise ValueError.
    return [
        int(chunk) if chunk.isdecimal() else chunk.lower()
        for chunk in re.split(r'(\d+)', text)
    ]
# Load one image and normalise it to RGB so it can be written into a PDF.
def convert_image_to_pdf(image_path):
    """Open *image_path* and return an RGB copy of the image.

    Despite its name, this function does not write a PDF; it only loads the
    image and converts it to the RGB mode.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A new ``PIL.Image.Image`` in RGB mode.
    """
    # ``convert`` returns a new, fully loaded image, so the source file can be
    # closed right away instead of staying open (which Pillow may otherwise do,
    # notably for multi-frame formats such as GIF and TIFF).
    with Image.open(image_path) as img:
        return img.convert('RGB')
# Entry point: merge all images in the current directory into one PDF.
def main():
    """Convert every image in the current directory into ``output.pdf``.

    Image files are picked by extension, ordered with ``natural_keys``,
    converted to RGB in a process pool, and saved as a multi-page PDF with
    one image per page. Nothing is written when no image is found.
    """
    # Recognised image extensions (matched case-insensitively).
    extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')
    image_files = [name for name in os.listdir('.') if name.lower().endswith(extensions)]
    image_files.sort(key=natural_keys)

    # Use roughly 80% of the available CPU cores, but always at least one.
    num_cores = max(1, int(cpu_count() * 0.8))

    # ``imap`` keeps the results in input order, so the pages follow the
    # sorted file order.
    with Pool(num_cores) as pool:
        converted = pool.imap(convert_image_to_pdf, image_files)
        images = list(tqdm(converted, total=len(image_files)))

    # Without any image there is nothing to save.
    if not images:
        return

    # The first image starts the PDF; the others are appended as extra pages.
    pdf_filename = "output.pdf"
    first_page, *other_pages = images
    first_page.save(pdf_filename, save_all=True, append_images=other_pages, quality=100, subsampling=0)


if __name__ == '__main__':
    main()