# Files
# deyeChargeSpeed/docs/extract_pdf.py
#
# 40 lines
# 1.1 KiB
# Python

import os
import pypdf
import re
# Path to the register manual PDF, built relative to this script's directory.
pdf_path = os.path.join(
    os.path.dirname(__file__),
    "Modbus储能-组串-微逆宁波德业V118-1.pdf",
)
def extract_text_from_pdf(pdf_path):
    """Return the extracted text of every page of the PDF at *pdf_path*.

    Pages are read in document order and the text of each page is followed
    by a newline character, so page boundaries always fall on a line break.
    """
    reader = pypdf.PdfReader(pdf_path)
    # Build the result with a single join instead of growing a string with
    # += once per page, which is quadratic in the worst case.
    return "".join(page.extract_text() + "\n" for page in reader.pages)
# Extract the whole document once; everything below works on plain text.
full_text = extract_text_from_pdf(pdf_path)

# Registers to look for
registers = [94, 245, 320]

# One pre-compiled pattern per register, built once up front instead of being
# re-assembled for every (line, register) pair. The \b anchors keep 94 from
# matching inside longer numbers such as 194 or 940.
register_patterns = [(reg, re.compile(rf"\b{reg}\b")) for reg in registers]

# Number of neighbouring lines written before and after each matching line.
CONTEXT_LINES = 5

print("Searching for registers...")

# Split text into lines for easier processing
lines = full_text.split('\n')

with open("pdf_output.txt", "w", encoding="utf-8") as f:
    for i, line in enumerate(lines):
        for reg, pattern in register_patterns:
            if not pattern.search(line):
                continue
            f.write(f"MATCH {reg}: {line.strip()}\n")
            # Write context: the surrounding lines, clamped to the bounds of
            # the document, skipping the matching line itself.
            start = max(0, i - CONTEXT_LINES)
            end = min(len(lines), i + CONTEXT_LINES + 1)
            for j in range(start, end):
                if j != i:
                    f.write(f" CTX: {lines[j].strip()}\n")
            # Separator between match records.
            f.write("-" * 20 + "\n")

print("Done. Check pdf_output.txt")