**多张图片合并成PDF或PDF提取图片。**

## PyMuPDF库

在运行下面的代码之前,你需要先安装`PyMuPDF`库:

```
pip install PyMuPDF
```

## tqdm库

你需要安装`tqdm`库来显示进度条,如果你还没有安装的话,请先运行这个命令:

```
pip install tqdm
```

## PDF提取图片

```python
import fitz  # PyMuPDF
import os
from tqdm import tqdm


def extract_images(pdf_file):
    """Save every image embedded in ``pdf_file`` into a folder named after it.

    The folder name is the PDF file name without its extension; it is created
    if missing. Each image is written as
    ``extracted_image_page<page>_<index>.<ext>`` (both numbers are 1-based).
    """
    pdf_folder = os.path.splitext(pdf_file)[0]
    # exist_ok avoids the check-then-create race of isdir() followed by mkdir().
    os.makedirs(pdf_folder, exist_ok=True)

    # Both context managers guarantee the document and the progress bar are
    # closed even when extraction or a file write fails part-way through.
    with fitz.open(pdf_file) as pdf, tqdm(
        total=pdf.page_count, desc=f"Extracting images from {pdf_file}"
    ) as progress_bar:
        for page_num, page in enumerate(pdf, start=1):
            for img_num, img in enumerate(page.get_images(), start=1):
                xref = img[0]  # first field of an image entry is its xref
                base_image = pdf.extract_image(xref)
                if not base_image:
                    # Nothing could be extracted for this xref; skip it.
                    continue
                image_filename = (
                    f"extracted_image_page{page_num}_{img_num}.{base_image['ext']}"
                )
                image_filepath = os.path.join(pdf_folder, image_filename)
                with open(image_filepath, 'wb') as img_file:
                    img_file.write(base_image["image"])
            progress_bar.update(1)


def main():
    """Extract the images of every PDF found in the current directory."""
    # Case-insensitive match so that files such as "SCAN.PDF" are included.
    pdf_files = [f for f in os.listdir('.') if f.lower().endswith('.pdf')]
    for pdf_file in pdf_files:
        extract_images(pdf_file)


if __name__ == "__main__":
    main()
```

## PDF提取图片-加速版

```python
import fitz  # PyMuPDF
import os
import multiprocessing
from multiprocessing import Pool, cpu_count
from tqdm import tqdm


def extract_images_from_page(args):
    """Worker: write all images found on one PDF page to disk.

    ``args`` is a ``(pdf_path, page_num, pdf_folder)`` tuple; ``page_num`` is
    zero-based. Files are named ``extracted_image_page<page>_<index>.<ext>``
    with 1-based numbers.
    """
    pdf_path, page_num, pdf_folder = args
    # The task only carries the path and page index, so the document is
    # reopened here; the with-block closes it even if a write fails.
    with fitz.open(pdf_path) as pdf:
        page = pdf[page_num]
        for img_num, img in enumerate(page.get_images(), start=1):
            xref = img[0]  # first field of an image entry is its xref
            base_image = pdf.extract_image(xref)
            if not base_image:
                # Nothing could be extracted for this xref; skip it.
                continue
            image_filename = (
                f"extracted_image_page{page_num + 1}_{img_num}.{base_image['ext']}"
            )
            image_filepath = os.path.join(pdf_folder, image_filename)
            with open(image_filepath, 'wb') as img_file:
                img_file.write(base_image["image"])


def prepare_extraction(pdf_file):
    """Create the output folder for ``pdf_file`` and return one task per page.

    Returns a list of ``(pdf_path, page_num, pdf_folder)`` tuples suitable for
    ``extract_images_from_page``.
    """
    pdf_folder = os.path.splitext(pdf_file)[0]
    # exist_ok avoids the check-then-create race of isdir() followed by mkdir().
    os.makedirs(pdf_folder, exist_ok=True)
    with fitz.open(pdf_file) as pdf:
        page_count = pdf.page_count
    return [(pdf_file, page_num, pdf_folder) for page_num in range(page_count)]


if __name__ == "__main__":
    # Case-insensitive match so that files such as "SCAN.PDF" are included.
    pdf_files = [f for f in os.listdir('.') if f.lower().endswith('.pdf')]

    # Build the full task list once. Its length is the total page count, so no
    # PDF has to be opened a second time (and left unclosed) just to count pages.
    tasks = []
    for pdf_file in pdf_files:
        tasks.extend(prepare_extraction(pdf_file))

    # Use roughly 80% of the CPU cores, but always at least one.
    num_cores = max(1, int(cpu_count() * 0.8))

    # One flat task list keeps all workers busy across PDF boundaries.
    with Pool(processes=num_cores) as pool, tqdm(
        total=len(tasks), desc="Extracting images from PDFs"
    ) as progress_bar:
        for _ in pool.imap_unordered(extract_images_from_page, tasks):
            progress_bar.update(1)
```

## 多张图片合并成PDF文档-加速版

该脚本依赖`Pillow`库,如果你还没有安装的话,请先运行`pip install Pillow`。

```python
from PIL import Image
import os
import re
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

# File extensions (lower-case) that are treated as images.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')


def natural_keys(text):
    """Return a sort key that orders embedded numbers numerically.

    With this key "img2.png" sorts before "img10.png". Text parts are compared
    case-insensitively.
    """
    # isdecimal() matches exactly what r'\d' splits out; isdigit() would also
    # accept characters such as "²" that int() cannot parse.
    return [int(c) if c.isdecimal() else c.lower() for c in re.split(r'(\d+)', text)]


def convert_image_to_pdf(image_path):
    """Load ``image_path`` and return it as an RGB ``PIL.Image.Image``."""
    # convert() returns a new, fully loaded image, so the source file can be
    # closed as soon as the with-block ends.
    with Image.open(image_path) as img:
        return img.convert('RGB')


def main():
    """Merge all images in the current directory into ``output.pdf``."""
    image_files = sorted(
        (f for f in os.listdir('.') if f.lower().endswith(IMAGE_EXTENSIONS)),
        key=natural_keys,
    )
    # Nothing to merge: skip spawning worker processes altogether.
    if not image_files:
        return

    # Use roughly 80% of the CPU cores, but always at least one.
    num_cores = max(1, int(cpu_count() * 0.8))

    # imap (not imap_unordered) keeps the pages in sorted order.
    with Pool(num_cores) as pool:
        images = list(tqdm(pool.imap(convert_image_to_pdf, image_files), total=len(image_files)))

    # Save the first image as the PDF and append the remaining ones as pages.
    pdf_filename = "output.pdf"
    images[0].save(pdf_filename, save_all=True, append_images=images[1:], quality=100, subsampling=0)


if __name__ == '__main__':
    main()
```

最后修改:2024 年 10 月 07 日

© 允许规范转载

赞

如果觉得我的文章对你有用,请随意赞赏