PDF Annotation Extractor
A Python script that extracts annotations (highlights, comments, underlines, etc.) from PDF files. By default, it shows only the annotated text; use --with-context to also show the surrounding text.
Setup
# Create virtual environment
python -m venv pdf_env
# Activate virtual environment:
# Linux/Mac:
source pdf_env/bin/activate
# Windows:
pdf_env\Scripts\activate
# Install required package
pip install PyMuPDF
# Optional: Set alias for python3 (if needed)
# Linux/Mac (add to ~/.bashrc or ~/.bash_profile):
alias python=python3
alias pip=pip3
# Windows (PowerShell profile):
Set-Alias python python3
Set-Alias pip pip3
Usage
# Basic usage (shows annotations only):
python pdf.py document.pdf
# With context (also shows up to 100 characters before and after each annotation):
python pdf.py document.pdf --with-context
Output
- Shows the page number, annotation type, comment (if any), and annotated text
- Optional context around annotations with --with-context flag
- Separators: a row of 80 = characters between annotations, and --- between sections within the same annotation
Script Code
import fitz # PyMuPDF
import sys
import argparse
# Annotation subtypes whose covered text is extracted; all other subtypes
# are skipped.
_TEXT_ANNOT_TYPES = frozenset(
    {"Text", "FreeText", "Highlight", "Underline", "StrikeOut", "Squiggly"}
)


def extract_annotations(pdf_path, show_context=False, *, context_chars=100):
    """Print the text-related annotations found in a PDF file.

    For every annotation whose type is one of ``_TEXT_ANNOT_TYPES`` the page
    number, the annotation type, the annotation comment (if any) and the text
    lying under the annotation rectangle are printed. Annotations are
    separated by a row of 80 ``=`` characters; the sections of a single
    annotation are separated by ``---`` lines.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        show_context: When true, also print text surrounding each annotation.
        context_chars: Number of characters of page text shown before and
            after the annotated text when ``show_context`` is true.

    Returns:
        None. Results and error messages are written to standard output;
        errors are reported rather than raised.
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        return

    try:
        # Using the document as a context manager closes it even when
        # extraction fails part-way through.
        with doc:
            annotations = _collect_annotations(doc, show_context, context_chars)
    except Exception as e:
        print(f"Error reading annotations: {e}")
        return

    if annotations:
        separator = "\n" + "=" * 80 + "\n"
        print(separator.join(annotations))
    else:
        print("No annotations found in the PDF.")


def _collect_annotations(doc, show_context, context_chars):
    """Return one formatted string per text-related annotation in *doc*."""
    results = []
    for page_number, page in enumerate(doc, start=1):
        # Full page text is only needed for context lookups, so it is
        # extracted lazily and at most once per page.
        page_text = None
        for annot in page.annots():
            annot_type = annot.type[1] if annot.type else "Unknown"
            if annot_type not in _TEXT_ANNOT_TYPES:
                continue

            content = annot.info.get("content", "")
            rect = annot.rect
            # Text located under the annotation rectangle.
            annotated_text = page.get_text("text", clip=rect).strip()

            context = ""
            if show_context:
                if annotated_text:
                    if page_text is None:
                        page_text = page.get_text()
                    context = _surrounding_text(
                        page_text, annotated_text, context_chars
                    )
                if not context:
                    # The clipped text could not be located in the page
                    # text; fall back to the text block under the rectangle.
                    context = _enclosing_block_text(page, rect)

            results.append(
                _format_annotation(
                    page_number, annot_type, content, annotated_text, context
                )
            )
    return results


def _surrounding_text(page_text, needle, margin):
    """Return *needle* plus *margin* characters either side, or "" if absent."""
    start = page_text.find(needle)
    if start == -1:
        return ""
    begin = max(0, start - margin)
    end = start + len(needle) + margin  # slicing clamps at the end of the text
    return page_text[begin:end].strip()


def _enclosing_block_text(page, rect):
    """Return the text of the first page block intersecting *rect*, or ""."""
    for block in page.get_text("blocks"):
        # The first four items of a block are its rectangle coordinates and
        # the fifth is its text, as used by the indexing below.
        if rect.intersects(fitz.Rect(block[:4])):
            return block[4].strip()
    return ""


def _format_annotation(page_number, annot_type, content, annotated_text, context):
    """Join the non-empty sections of one annotation with ``---`` lines."""
    sections = [f"Page {page_number}: [{annot_type}]"]
    if content:
        sections.append(f"Comment: {content}")
    if annotated_text:
        sections.append(f"Annotated Text: {annotated_text}")
    if context:
        sections.append(f"Context: {context}")
    return "\n---\n".join(sections)
if __name__ == "__main__":
    # Command-line entry point: one positional PDF path plus an optional
    # flag that turns on the surrounding-text output.
    cli = argparse.ArgumentParser()
    cli.add_argument("pdf_file", help="PDF file path")
    cli.add_argument(
        "--with-context",
        action="store_true",
        help="Include context around annotations",
    )
    options = cli.parse_args()
    extract_annotations(options.pdf_file, options.with_context)
Last updated: 2025-08-26 20:00 UTC