Manipulate PDF in Python

  • PyPDF2: open source, free. 老牌 Python PDF 项目.

  • PyMuPDF: open source (AGPL), need to buy a license for closed-source commercial projects. 全面且强大, 基于 C 写的 MuPDF. 但是 Python 包自带预编译的 MuPDF binary, 所以无需 yum, apt install. 美中不足就是商用需要 License.

  • pdf2image: open source, free. 功能很简单, 将 PDF 转化为 Image 图片, 底层用的是 poppler 这个 PDF Render 工具. 需要先用 yum, apt install 安装 poppler CLI 之后才能使用.

  • pdfminer: MIT.

PyPDF2 Example:

# -*- coding: utf-8 -*-

import io
from pathlib import Path
from PyPDF2 import PdfReader, PdfWriter

# Locate the sample PDF relative to this script: it lives one directory up.
dir_here = Path(__file__).absolute().parent
path_w2_pdf = dir_here.parent / "w2.pdf"

# Read the whole file into memory and parse it from a bytes stream.
reader = PdfReader(io.BytesIO(path_w2_pdf.read_bytes()))

# Split the document: every page becomes its own single-page PDF, written
# next to this script as "<1-based page number>.pdf".
for page, src_page in enumerate(reader.pages, start=1):
    writer = PdfWriter()
    writer.add_page(src_page)

    # Serialize into memory first, then dump the bytes to disk in one call.
    buffer = io.BytesIO()
    writer.write(buffer)

    path_dst = dir_here / f"{page}.pdf"
    path_dst.write_bytes(buffer.getvalue())

PyMuPDF Example:

# -*- coding: utf-8 -*-

"""
Reference:

- https://pypi.org/project/PyMuPDF/
"""

from io import BytesIO
import fitz  # This is the PyMuPDF import name
from pathlib import Path

dir_here = Path(__file__).absolute().parent

# --- Set the sample PDF file path you want to test with
path_w2_pdf = dir_here.parent / "w2.pdf"

path_pdf = path_w2_pdf

# You can use either ``fitz.Document(filename=...)``
# or use bytes stream ``fitz.Document(stream=...)`` to read the PDF content
doc = fitz.Document(stream=path_pdf.read_bytes())

# Repair any issues (hopefully) before we hit them
# See this https://github.com/pymupdf/PyMuPDF/issues/856
# ``doc.write(...)`` already returns the cleaned PDF as bytes, so the result
# can be re-opened directly without an intermediate in-memory buffer.
new_content = doc.write(clean=True, garbage=4)
doc.close()
doc = fitz.Document(stream=new_content)

for page_num, page in enumerate(doc, start=1):
    # --- split page
    # ``page_num`` is 1-based (used in file names); ``insert_pdf`` page
    # ranges are 0-based and inclusive, so copy exactly this one page.
    page_index = page_num - 1
    doc1 = fitz.Document()  # new empty PDF
    doc1.insert_pdf(doc, from_page=page_index, to_page=page_index)
    path_page_pdf = dir_here / f"{path_pdf.stem}-page-{page_num}.pdf"
    doc1.save(f"{path_page_pdf}")
    doc1.close()

    # --- convert page to image
    pix: fitz.Pixmap = page.get_pixmap(dpi=200)
    path_page_png = dir_here / f"{path_pdf.stem}-{page_num}.png"
    pix.save(f"{path_page_png}", output="png")

pikepdf Example:

# -*- coding: utf-8 -*-

from pathlib import Path
from pikepdf import Pdf, PdfImage

# The sample PDF sits one directory above this script.
dir_here = Path(__file__).absolute().parent
path_w2_pdf = dir_here.parent / "w2.pdf"

pdf = Pdf.open(str(path_w2_pdf))

# Split the document: each source page is appended to a fresh empty PDF and
# saved next to this script as "page-<1-based page number>.pdf".
for page_num, page in enumerate(pdf.pages, start=1):
    path_dst = dir_here / f"page-{page_num}.pdf"
    dst = Pdf.new()
    dst.pages.append(page)
    dst.save(str(path_dst))

pdf2image Example:

# -*- coding: utf-8 -*-

"""
- Pypi: https://pypi.org/project/pdf2image/

Dependencies:

Mac:

- Install `poppler for Mac <https://macappstore.org/poppler/>`_
- do ``brew install poppler``
- use ``brew list poppler`` to figure out the poppler bin folder, on my computer it is ``/opt/homebrew/Cellar/poppler/22.08.0/bin/``

Linux (Redhat):

- Install poppler for Linux ``sudo yum install poppler-utils``
- Check it is installed ``yum list poppler-utils``
"""

from pathlib import Path
from pdf2image import convert_from_path

# The sample PDF sits one directory above this script.
dir_here = Path(__file__).absolute().parent
path_w2_pdf = dir_here.parent / "w2.pdf"

# Render every page of the PDF to an image at 300 DPI in PNG format.
# If the poppler binaries need to be pointed to explicitly (e.g. the Mac
# Homebrew location "/opt/homebrew/Cellar/poppler/22.08.0/bin/"), pass that
# folder as ``poppler_path=...``; this is not needed on Linux.
render_options = dict(dpi=300, fmt="png")
images = convert_from_path(str(path_w2_pdf), **render_options)

# Save each rendered page next to this script as "page-<1-based number>.png".
for page_num, image in enumerate(images, start=1):
    path_output = dir_here / f"page-{page_num}.png"
    image.save(str(path_output))