# deyeChargeSpeed/docs/extract_pdf.py

import os
import pypdf
import re

# Source PDF, resolved relative to the directory containing this script.
pdf_path = os.path.join(
    os.path.dirname(__file__),
    "Modbus储能-组串-微逆宁波德业V118-1.pdf",
)

def extract_text_from_pdf(pdf_path) -> str:
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    The file is opened with ``pypdf.PdfReader`` and the pages are visited in
    the order the reader yields them. A newline is appended after each
    page's extracted text, so a document with no pages yields an empty
    string.
    """
    reader = pypdf.PdfReader(pdf_path)
    # Build the result with a single join rather than growing a string with
    # ``+=`` once per page.
    return "".join(page.extract_text() + "\n" for page in reader.pages)

full_text = extract_text_from_pdf(pdf_path)

# Register numbers to look for in the extracted text.
registers = [94, 245, 320]

# One pre-compiled pattern per register, built once instead of per line.
# re.ASCII restricts \b to ASCII word characters: in the default Unicode
# mode CJK characters count as \w, so a register number written directly
# after Chinese text (no space in between) has no word boundary in front of
# it and would be silently skipped.
register_patterns = [
    (reg, re.compile(rf"\b{reg}\b", re.ASCII)) for reg in registers
]

# Number of lines of context written before and after each matching line.
context_lines = 5

print("Searching for registers...")

# Split text into lines for easier processing
lines = full_text.split('\n')

# Never populated by this script; kept so the module-level name stays defined.
found_registers = {}

with open("pdf_output.txt", "w", encoding="utf-8") as f:
    for i, line in enumerate(lines):
        for reg, pattern in register_patterns:
            if not pattern.search(line):
                continue
            f.write(f"MATCH {reg}: {line.strip()}\n")
            # Write the neighbouring lines, clamped to the bounds of the
            # text, skipping the matching line itself.
            start = max(0, i - context_lines)
            end = min(len(lines), i + context_lines + 1)
            for j in range(start, end):
                if j != i:
                    f.write(f"   CTX: {lines[j].strip()}\n")
            f.write("-" * 20 + "\n")

print("Done. Check pdf_output.txt")