PdfParser.py 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003
  1. from __future__ import annotations
  2. import calendar
  3. import codecs
  4. import collections
  5. import mmap
  6. import os
  7. import re
  8. import time
  9. import zlib
  10. from typing import TYPE_CHECKING, Any, List, NamedTuple, Union
  11. # see 7.9.2.2 Text String Type on page 86 and D.3 PDFDocEncoding Character Set
  12. # on page 656
  13. def encode_text(s: str) -> bytes:
  14. return codecs.BOM_UTF16_BE + s.encode("utf_16_be")
# Byte values where PDFDocEncoding differs from a plain chr(byte)
# interpretation; decode_text() falls back to chr(byte) for bytes not
# listed here.  See PDF 32000-1:2008 Annex D.3 (PDFDocEncoding).
PDFDocEncoding = {
    0x16: "\u0017",
    0x18: "\u02D8",
    0x19: "\u02C7",
    0x1A: "\u02C6",
    0x1B: "\u02D9",
    0x1C: "\u02DD",
    0x1D: "\u02DB",
    0x1E: "\u02DA",
    0x1F: "\u02DC",
    0x80: "\u2022",
    0x81: "\u2020",
    0x82: "\u2021",
    0x83: "\u2026",
    0x84: "\u2014",
    0x85: "\u2013",
    0x86: "\u0192",
    0x87: "\u2044",
    0x88: "\u2039",
    0x89: "\u203A",
    0x8A: "\u2212",
    0x8B: "\u2030",
    0x8C: "\u201E",
    0x8D: "\u201C",
    0x8E: "\u201D",
    0x8F: "\u2018",
    0x90: "\u2019",
    0x91: "\u201A",
    0x92: "\u2122",
    0x93: "\uFB01",
    0x94: "\uFB02",
    0x95: "\u0141",
    0x96: "\u0152",
    0x97: "\u0160",
    0x98: "\u0178",
    0x99: "\u017D",
    0x9A: "\u0131",
    0x9B: "\u0142",
    0x9C: "\u0153",
    0x9D: "\u0161",
    0x9E: "\u017E",
    0xA0: "\u20AC",
}
  58. def decode_text(b):
  59. if b[: len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE:
  60. return b[len(codecs.BOM_UTF16_BE) :].decode("utf_16_be")
  61. else:
  62. return "".join(PDFDocEncoding.get(byte, chr(byte)) for byte in b)
  63. class PdfFormatError(RuntimeError):
  64. """An error that probably indicates a syntactic or semantic error in the
  65. PDF file structure"""
  66. pass
  67. def check_format_condition(condition: bool, error_message: str) -> None:
  68. if not condition:
  69. raise PdfFormatError(error_message)
class IndirectReferenceTuple(NamedTuple):
    # Bare (object_id, generation) pair; IndirectReference layers PDF
    # serialization and comparison semantics on top of it.
    object_id: int
    generation: int
  73. class IndirectReference(IndirectReferenceTuple):
  74. def __str__(self) -> str:
  75. return f"{self.object_id} {self.generation} R"
  76. def __bytes__(self) -> bytes:
  77. return self.__str__().encode("us-ascii")
  78. def __eq__(self, other: object) -> bool:
  79. if self.__class__ is not other.__class__:
  80. return False
  81. assert isinstance(other, IndirectReference)
  82. return other.object_id == self.object_id and other.generation == self.generation
  83. def __ne__(self, other):
  84. return not (self == other)
  85. def __hash__(self) -> int:
  86. return hash((self.object_id, self.generation))
class IndirectObjectDef(IndirectReference):
    # Same (object_id, generation) pair, but serialized as the header of an
    # object definition: "<id> <gen> obj".
    def __str__(self) -> str:
        return f"{self.object_id} {self.generation} obj"
class XrefTable:
    """Mutable PDF cross-reference table.

    Entries read from an existing file are kept in ``existing_entries``;
    entries created after reading finished go into ``new_entries``.
    ``deleted_entries`` records freed object IDs with their next generation
    number so ``write()`` can emit them as the xref free-list (entry 0 with
    generation 65536 is the mandatory list head).
    """

    def __init__(self):
        self.existing_entries = {}  # object ID => (offset, generation)
        self.new_entries = {}  # object ID => (offset, generation)
        self.deleted_entries = {0: 65536}  # object ID => generation
        self.reading_finished = False

    def __setitem__(self, key, value):
        # While the original file is still being read, assignments record
        # pre-existing entries; afterwards they are new entries to write out.
        if self.reading_finished:
            self.new_entries[key] = value
        else:
            self.existing_entries[key] = value
        if key in self.deleted_entries:
            del self.deleted_entries[key]

    def __getitem__(self, key):
        # New entries shadow existing ones.
        try:
            return self.new_entries[key]
        except KeyError:
            return self.existing_entries[key]

    def __delitem__(self, key):
        # Deleting bumps the generation so the ID could later be reused.
        if key in self.new_entries:
            generation = self.new_entries[key][1] + 1
            del self.new_entries[key]
            self.deleted_entries[key] = generation
        elif key in self.existing_entries:
            generation = self.existing_entries[key][1] + 1
            self.deleted_entries[key] = generation
        elif key in self.deleted_entries:
            generation = self.deleted_entries[key]
        else:
            msg = f"object ID {key} cannot be deleted because it doesn't exist"
            raise IndexError(msg)

    def __contains__(self, key):
        return key in self.existing_entries or key in self.new_entries

    def __len__(self) -> int:
        return len(
            set(self.existing_entries.keys())
            | set(self.new_entries.keys())
            | set(self.deleted_entries.keys())
        )

    def keys(self):
        # Live object IDs: existing minus deleted, plus everything new.
        return (
            set(self.existing_entries.keys()) - set(self.deleted_entries.keys())
        ) | set(self.new_entries.keys())

    def write(self, f):
        """Write an xref section for the new/deleted entries to *f*.

        Returns the file offset of the section start (the startxref value).
        Deleted entries are emitted as "f" records chained into a linked
        free-list; each record points at the next freed object ID.
        """
        keys = sorted(set(self.new_entries.keys()) | set(self.deleted_entries.keys()))
        deleted_keys = sorted(set(self.deleted_entries.keys()))
        startxref = f.tell()
        f.write(b"xref\n")
        while keys:
            # find a contiguous sequence of object IDs
            prev = None
            for index, key in enumerate(keys):
                if prev is None or prev + 1 == key:
                    prev = key
                else:
                    contiguous_keys = keys[:index]
                    keys = keys[index:]
                    break
            else:
                contiguous_keys = keys
                keys = None
            # Subsection header: first object ID and entry count.
            f.write(b"%d %d\n" % (contiguous_keys[0], len(contiguous_keys)))
            for object_id in contiguous_keys:
                if object_id in self.new_entries:
                    f.write(b"%010d %05d n \n" % self.new_entries[object_id])
                else:
                    this_deleted_object_id = deleted_keys.pop(0)
                    check_format_condition(
                        object_id == this_deleted_object_id,
                        f"expected the next deleted object ID to be {object_id}, "
                        f"instead found {this_deleted_object_id}",
                    )
                    try:
                        next_in_linked_list = deleted_keys[0]
                    except IndexError:
                        next_in_linked_list = 0
                    f.write(
                        b"%010d %05d f \n"
                        % (next_in_linked_list, self.deleted_entries[object_id])
                    )
        return startxref
  171. class PdfName:
  172. def __init__(self, name):
  173. if isinstance(name, PdfName):
  174. self.name = name.name
  175. elif isinstance(name, bytes):
  176. self.name = name
  177. else:
  178. self.name = name.encode("us-ascii")
  179. def name_as_str(self) -> str:
  180. return self.name.decode("us-ascii")
  181. def __eq__(self, other):
  182. return (
  183. isinstance(other, PdfName) and other.name == self.name
  184. ) or other == self.name
  185. def __hash__(self) -> int:
  186. return hash(self.name)
  187. def __repr__(self) -> str:
  188. return f"{self.__class__.__name__}({repr(self.name)})"
  189. @classmethod
  190. def from_pdf_stream(cls, data):
  191. return cls(PdfParser.interpret_name(data))
  192. allowed_chars = set(range(33, 127)) - {ord(c) for c in "#%/()<>[]{}"}
  193. def __bytes__(self) -> bytes:
  194. result = bytearray(b"/")
  195. for b in self.name:
  196. if b in self.allowed_chars:
  197. result.append(b)
  198. else:
  199. result.extend(b"#%02X" % b)
  200. return bytes(result)
class PdfArray(List[Any]):
    # A PDF array: a plain list whose elements are serialized via pdf_repr().
    def __bytes__(self) -> bytes:
        return b"[ " + b" ".join(pdf_repr(x) for x in self) + b" ]"
# UserDict is parameterized only for the type checker; at runtime use the
# plain class so older Pythons without subscriptable UserDict still work.
if TYPE_CHECKING:
    _DictBase = collections.UserDict[Union[str, bytes], Any]
else:
    _DictBase = collections.UserDict
class PdfDict(_DictBase):
    """PDF dictionary with attribute access: ``d.Key`` reads ``d[b"Key"]``.

    Attribute reads decode bytes values as PDF text strings; keys ending in
    ``Date`` are additionally parsed into ``time.struct_time`` (normalized
    to UTC when the string carries a +/- timezone offset).
    """

    def __setattr__(self, key, value):
        # "data" is UserDict's backing store; every other attribute write
        # stores into the dictionary under an ASCII-encoded key.
        if key == "data":
            collections.UserDict.__setattr__(self, key, value)
        else:
            self[key.encode("us-ascii")] = value

    def __getattr__(self, key):
        try:
            value = self[key.encode("us-ascii")]
        except KeyError as e:
            raise AttributeError(key) from e
        if isinstance(value, bytes):
            value = decode_text(value)
        if key.endswith("Date"):
            # PDF date format: D:YYYYMMDDHHmmSSOHH'mm' (trailing parts and
            # the timezone are optional).
            if value.startswith("D:"):
                value = value[2:]
            relationship = "Z"
            if len(value) > 17:
                # value[14] is the timezone relationship: "+", "-" or "Z".
                relationship = value[14]
                offset = int(value[15:17]) * 60
                if len(value) > 20:
                    offset += int(value[18:20])
            # Truncate the format to however many date components are present.
            format = "%Y%m%d%H%M%S"[: len(value) - 2]
            value = time.strptime(value[: len(format) + 2], format)
            if relationship in ["+", "-"]:
                offset *= 60
                if relationship == "+":
                    offset *= -1
                value = time.gmtime(calendar.timegm(value) + offset)
        return value

    def __bytes__(self) -> bytes:
        # Serialize as <</Key value ...>>; None-valued keys are omitted.
        out = bytearray(b"<<")
        for key, value in self.items():
            if value is None:
                continue
            value = pdf_repr(value)
            out.extend(b"\n")
            out.extend(bytes(PdfName(key)))
            out.extend(b" ")
            out.extend(value)
        out.extend(b"\n>>")
        return bytes(out)
  250. class PdfBinary:
  251. def __init__(self, data):
  252. self.data = data
  253. def __bytes__(self) -> bytes:
  254. return b"<%s>" % b"".join(b"%02X" % b for b in self.data)
  255. class PdfStream:
  256. def __init__(self, dictionary, buf):
  257. self.dictionary = dictionary
  258. self.buf = buf
  259. def decode(self):
  260. try:
  261. filter = self.dictionary.Filter
  262. except AttributeError:
  263. return self.buf
  264. if filter == b"FlateDecode":
  265. try:
  266. expected_length = self.dictionary.DL
  267. except AttributeError:
  268. expected_length = self.dictionary.Length
  269. return zlib.decompress(self.buf, bufsize=int(expected_length))
  270. else:
  271. msg = f"stream filter {repr(self.dictionary.Filter)} unknown/unsupported"
  272. raise NotImplementedError(msg)
  273. def pdf_repr(x):
  274. if x is True:
  275. return b"true"
  276. elif x is False:
  277. return b"false"
  278. elif x is None:
  279. return b"null"
  280. elif isinstance(x, (PdfName, PdfDict, PdfArray, PdfBinary)):
  281. return bytes(x)
  282. elif isinstance(x, (int, float)):
  283. return str(x).encode("us-ascii")
  284. elif isinstance(x, time.struct_time):
  285. return b"(D:" + time.strftime("%Y%m%d%H%M%SZ", x).encode("us-ascii") + b")"
  286. elif isinstance(x, dict):
  287. return bytes(PdfDict(x))
  288. elif isinstance(x, list):
  289. return bytes(PdfArray(x))
  290. elif isinstance(x, str):
  291. return pdf_repr(encode_text(x))
  292. elif isinstance(x, bytes):
  293. # XXX escape more chars? handle binary garbage
  294. x = x.replace(b"\\", b"\\\\")
  295. x = x.replace(b"(", b"\\(")
  296. x = x.replace(b")", b"\\)")
  297. return b"(" + x + b")"
  298. else:
  299. return bytes(x)
class PdfParser:
    """Based on
    https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf
    Supports PDF up to 1.4
    """

    def __init__(self, filename=None, f=None, buf=None, start_offset=0, mode="rb"):
        # Exactly one data source is expected: an open file object ``f``, a
        # bytes-like ``buf``, or a ``filename`` to open; ``buf`` and ``f``
        # together are rejected.  ``start_offset`` is where the PDF begins
        # inside the buffer.
        if buf and f:
            msg = "specify buf or f or filename, but not both buf and f"
            raise RuntimeError(msg)
        self.filename = filename
        self.buf = buf
        self.f = f
        self.start_offset = start_offset
        self.should_close_buf = False
        self.should_close_file = False
        if filename is not None and f is None:
            self.f = f = open(filename, mode)
            self.should_close_file = True
        if f is not None:
            self.buf = buf = self.get_buf_from_file(f)
            self.should_close_buf = True
            if not filename and hasattr(f, "name"):
                self.filename = f.name
        self.cached_objects = {}  # IndirectReference => parsed object
        if buf:
            self.read_pdf_info()
        else:
            # Empty/new document: initialize all state to blank values.
            self.file_size_total = self.file_size_this = 0
            self.root = PdfDict()
            self.root_ref = None
            self.info = PdfDict()
            self.info_ref = None
            self.page_tree_root = {}
            self.pages = []
            self.orig_pages = []
            self.pages_ref = None
            self.last_xref_section_offset = None
            self.trailer_dict = {}
            self.xref_table = XrefTable()
        self.xref_table.reading_finished = True
        if f:
            self.seek_end()

    def __enter__(self) -> PdfParser:
        return self

    def __exit__(self, *args: object) -> None:
        self.close()

    def start_writing(self) -> None:
        # Release the read buffer (it may be an mmap over the file) before
        # appending to the file.
        self.close_buf()
        self.seek_end()

    def close_buf(self) -> None:
        # bytes objects have no close(); mmap/memoryview do.
        try:
            self.buf.close()
        except AttributeError:
            pass
        self.buf = None

    def close(self) -> None:
        if self.should_close_buf:
            self.close_buf()
        if self.f is not None and self.should_close_file:
            self.f.close()
            self.f = None

    def seek_end(self) -> None:
        self.f.seek(0, os.SEEK_END)

    def write_header(self) -> None:
        self.f.write(b"%PDF-1.4\n")

    def write_comment(self, s):
        self.f.write(f"% {s}\n".encode())

    def write_catalog(self) -> IndirectReference:
        """Write a fresh /Catalog and /Pages object; returns the catalog ref."""
        self.del_root()
        self.root_ref = self.next_object_id(self.f.tell())
        self.pages_ref = self.next_object_id(0)
        self.rewrite_pages()
        self.write_obj(self.root_ref, Type=PdfName(b"Catalog"), Pages=self.pages_ref)
        self.write_obj(
            self.pages_ref,
            Type=PdfName(b"Pages"),
            Count=len(self.pages),
            Kids=self.pages,
        )
        return self.root_ref

    def rewrite_pages(self) -> None:
        """Rewrite all original pages under the new flat /Pages node and drop
        the old page tree from the xref table."""
        pages_tree_nodes_to_delete = []
        for i, page_ref in enumerate(self.orig_pages):
            page_info = self.cached_objects[page_ref]
            del self.xref_table[page_ref.object_id]
            pages_tree_nodes_to_delete.append(page_info[PdfName(b"Parent")])
            if page_ref not in self.pages:
                # the page has been deleted
                continue
            # make dict keys into strings for passing to write_page
            stringified_page_info = {}
            for key, value in page_info.items():
                # key should be a PdfName
                stringified_page_info[key.name_as_str()] = value
            stringified_page_info["Parent"] = self.pages_ref
            new_page_ref = self.write_page(None, **stringified_page_info)
            for j, cur_page_ref in enumerate(self.pages):
                if cur_page_ref == page_ref:
                    # replace the page reference with the new one
                    self.pages[j] = new_page_ref
        # delete redundant Pages tree nodes from xref table
        for pages_tree_node_ref in pages_tree_nodes_to_delete:
            while pages_tree_node_ref:
                pages_tree_node = self.cached_objects[pages_tree_node_ref]
                if pages_tree_node_ref.object_id in self.xref_table:
                    del self.xref_table[pages_tree_node_ref.object_id]
                pages_tree_node_ref = pages_tree_node.get(b"Parent", None)
        self.orig_pages = []

    def write_xref_and_trailer(self, new_root_ref=None):
        """Append the xref section and trailer; links to the previous xref
        section via /Prev when one exists (incremental update)."""
        if new_root_ref:
            self.del_root()
            self.root_ref = new_root_ref
        if self.info:
            self.info_ref = self.write_obj(None, self.info)
        start_xref = self.xref_table.write(self.f)
        num_entries = len(self.xref_table)
        trailer_dict = {b"Root": self.root_ref, b"Size": num_entries}
        if self.last_xref_section_offset is not None:
            trailer_dict[b"Prev"] = self.last_xref_section_offset
        if self.info:
            trailer_dict[b"Info"] = self.info_ref
        self.last_xref_section_offset = start_xref
        self.f.write(
            b"trailer\n"
            + bytes(PdfDict(trailer_dict))
            + b"\nstartxref\n%d\n%%%%EOF" % start_xref
        )

    def write_page(self, ref, *objs, **dict_obj):
        """Write a page object; *ref* may be an index into self.pages, an
        IndirectReference, or None to allocate a new object ID."""
        if isinstance(ref, int):
            ref = self.pages[ref]
        if "Type" not in dict_obj:
            dict_obj["Type"] = PdfName(b"Page")
        if "Parent" not in dict_obj:
            dict_obj["Parent"] = self.pages_ref
        return self.write_obj(ref, *objs, **dict_obj)

    def write_obj(self, ref, *objs, **dict_obj):
        """Write one indirect object (dict, extra objects, optional stream)
        at the current file position and record it in the xref table."""
        f = self.f
        if ref is None:
            ref = self.next_object_id(f.tell())
        else:
            self.xref_table[ref.object_id] = (f.tell(), ref.generation)
        f.write(bytes(IndirectObjectDef(*ref)))
        stream = dict_obj.pop("stream", None)
        if stream is not None:
            # /Length must reflect the raw stream payload size.
            dict_obj["Length"] = len(stream)
        if dict_obj:
            f.write(pdf_repr(dict_obj))
        for obj in objs:
            f.write(pdf_repr(obj))
        if stream is not None:
            f.write(b"stream\n")
            f.write(stream)
            f.write(b"\nendstream\n")
        f.write(b"endobj\n")
        return ref

    def del_root(self) -> None:
        # Remove the current catalog and its page tree root from the xref
        # table so they are rewritten on the next write_catalog().
        if self.root_ref is None:
            return
        del self.xref_table[self.root_ref.object_id]
        del self.xref_table[self.root[b"Pages"].object_id]

    @staticmethod
    def get_buf_from_file(f):
        """Return a read-only bytes-like view over *f*: its buffer for
        BytesIO-like objects, an mmap for real files, b"" for empty files."""
        if hasattr(f, "getbuffer"):
            return f.getbuffer()
        elif hasattr(f, "getvalue"):
            return f.getvalue()
        else:
            try:
                return mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
            except ValueError:  # cannot mmap an empty file
                return b""

    def read_pdf_info(self) -> None:
        """Parse trailer, xref table(s), catalog, info and the page tree."""
        self.file_size_total = len(self.buf)
        self.file_size_this = self.file_size_total - self.start_offset
        self.read_trailer()
        self.root_ref = self.trailer_dict[b"Root"]
        self.info_ref = self.trailer_dict.get(b"Info", None)
        self.root = PdfDict(self.read_indirect(self.root_ref))
        if self.info_ref is None:
            self.info = PdfDict()
        else:
            self.info = PdfDict(self.read_indirect(self.info_ref))
        check_format_condition(b"Type" in self.root, "/Type missing in Root")
        check_format_condition(
            self.root[b"Type"] == b"Catalog", "/Type in Root is not /Catalog"
        )
        check_format_condition(b"Pages" in self.root, "/Pages missing in Root")
        check_format_condition(
            isinstance(self.root[b"Pages"], IndirectReference),
            "/Pages in Root is not an indirect reference",
        )
        self.pages_ref = self.root[b"Pages"]
        self.page_tree_root = self.read_indirect(self.pages_ref)
        self.pages = self.linearize_page_tree(self.page_tree_root)
        # save the original list of page references
        # in case the user modifies, adds or deletes some pages
        # and we need to rewrite the pages and their list
        self.orig_pages = self.pages[:]

    def next_object_id(self, offset=None):
        """Allocate the next unused object ID; if *offset* is given, record
        it in the xref table at that file offset."""
        try:
            # TODO: support reuse of deleted objects
            reference = IndirectReference(max(self.xref_table.keys()) + 1, 0)
        except ValueError:
            # Empty table: object IDs start at 1.
            reference = IndirectReference(1, 0)
        if offset is not None:
            self.xref_table[reference.object_id] = (offset, 0)
        return reference

    # Character classes from the PDF spec (7.2.2): delimiters and the six
    # whitespace characters, as regex fragments over bytes.
    delimiter = rb"[][()<>{}/%]"
    delimiter_or_ws = rb"[][()<>{}/%\000\011\012\014\015\040]"
    whitespace = rb"[\000\011\012\014\015\040]"
    whitespace_or_hex = rb"[\000\011\012\014\015\0400-9a-fA-F]"
    whitespace_optional = whitespace + b"*"
    whitespace_mandatory = whitespace + b"+"
    # No "\012" aka "\n" or "\015" aka "\r":
    whitespace_optional_no_nl = rb"[\000\011\014\040]*"
    newline_only = rb"[\r\n]+"
    newline = whitespace_optional_no_nl + newline_only + whitespace_optional_no_nl
    # Matches the trailer at the very end of the buffer (greedy dict match
    # anchored at $) ...
    re_trailer_end = re.compile(
        whitespace_mandatory
        + rb"trailer"
        + whitespace_optional
        + rb"<<(.*>>)"
        + newline
        + rb"startxref"
        + newline
        + rb"([0-9]+)"
        + newline
        + rb"%%EOF"
        + whitespace_optional
        + rb"$",
        re.DOTALL,
    )
    # ... and earlier (non-anchored, non-greedy) trailers of prior revisions.
    re_trailer_prev = re.compile(
        whitespace_optional
        + rb"trailer"
        + whitespace_optional
        + rb"<<(.*?>>)"
        + newline
        + rb"startxref"
        + newline
        + rb"([0-9]+)"
        + newline
        + rb"%%EOF"
        + whitespace_optional,
        re.DOTALL,
    )

    def read_trailer(self):
        """Locate and parse the last trailer, then follow /Prev chains."""
        # Only search the final 16 KiB of the buffer for the trailer.
        search_start_offset = len(self.buf) - 16384
        if search_start_offset < self.start_offset:
            search_start_offset = self.start_offset
        m = self.re_trailer_end.search(self.buf, search_start_offset)
        check_format_condition(m, "trailer end not found")
        # make sure we found the LAST trailer
        last_match = m
        while m:
            last_match = m
            m = self.re_trailer_end.search(self.buf, m.start() + 16)
        if not m:
            m = last_match
        trailer_data = m.group(1)
        self.last_xref_section_offset = int(m.group(2))
        self.trailer_dict = self.interpret_trailer(trailer_data)
        self.xref_table = XrefTable()
        self.read_xref_table(xref_section_offset=self.last_xref_section_offset)
        if b"Prev" in self.trailer_dict:
            self.read_prev_trailer(self.trailer_dict[b"Prev"])

    def read_prev_trailer(self, xref_section_offset):
        """Parse the xref section and trailer of an earlier file revision,
        recursing while /Prev entries keep pointing further back."""
        trailer_offset = self.read_xref_table(xref_section_offset=xref_section_offset)
        m = self.re_trailer_prev.search(
            self.buf[trailer_offset : trailer_offset + 16384]
        )
        check_format_condition(m, "previous trailer not found")
        trailer_data = m.group(1)
        check_format_condition(
            int(m.group(2)) == xref_section_offset,
            "xref section offset in previous trailer doesn't match what was expected",
        )
        trailer_dict = self.interpret_trailer(trailer_data)
        if b"Prev" in trailer_dict:
            self.read_prev_trailer(trailer_dict[b"Prev"])

    re_whitespace_optional = re.compile(whitespace_optional)
    re_name = re.compile(
        whitespace_optional
        + rb"/([!-$&'*-.0-;=?-Z\\^-z|~]+)(?="
        + delimiter_or_ws
        + rb")"
    )
    re_dict_start = re.compile(whitespace_optional + rb"<<")
    re_dict_end = re.compile(whitespace_optional + rb">>" + whitespace_optional)

    @classmethod
    def interpret_trailer(cls, trailer_data):
        """Parse the trailer dictionary body (between << and >>) into a dict
        keyed by raw name bytes; /Size and /Root are mandatory."""
        trailer = {}
        offset = 0
        while True:
            m = cls.re_name.match(trailer_data, offset)
            if not m:
                # No further key: the remainder must be exactly the >> end.
                m = cls.re_dict_end.match(trailer_data, offset)
                check_format_condition(
                    m and m.end() == len(trailer_data),
                    "name not found in trailer, remaining data: "
                    + repr(trailer_data[offset:]),
                )
                break
            key = cls.interpret_name(m.group(1))
            value, offset = cls.get_value(trailer_data, m.end())
            trailer[key] = value
        check_format_condition(
            b"Size" in trailer and isinstance(trailer[b"Size"], int),
            "/Size not in trailer or not an integer",
        )
        check_format_condition(
            b"Root" in trailer and isinstance(trailer[b"Root"], IndirectReference),
            "/Root not in trailer or not an indirect reference",
        )
        return trailer

    re_hashes_in_name = re.compile(rb"([^#]*)(#([0-9a-fA-F]{2}))?")

    @classmethod
    def interpret_name(cls, raw, as_text=False):
        """Resolve #XX hex escapes in a raw name; returns bytes, or str when
        *as_text* is true."""
        name = b""
        for m in cls.re_hashes_in_name.finditer(raw):
            if m.group(3):
                name += m.group(1) + bytearray.fromhex(m.group(3).decode("us-ascii"))
            else:
                name += m.group(1)
        if as_text:
            return name.decode("utf-8")
        else:
            return bytes(name)

    # One compiled pattern per PDF object type, each tolerating leading
    # whitespace and requiring a delimiter/whitespace lookahead after
    # keywords so e.g. "nullx" doesn't match "null".
    re_null = re.compile(whitespace_optional + rb"null(?=" + delimiter_or_ws + rb")")
    re_true = re.compile(whitespace_optional + rb"true(?=" + delimiter_or_ws + rb")")
    re_false = re.compile(whitespace_optional + rb"false(?=" + delimiter_or_ws + rb")")
    re_int = re.compile(
        whitespace_optional + rb"([-+]?[0-9]+)(?=" + delimiter_or_ws + rb")"
    )
    re_real = re.compile(
        whitespace_optional
        + rb"([-+]?([0-9]+\.[0-9]*|[0-9]*\.[0-9]+))(?="
        + delimiter_or_ws
        + rb")"
    )
    re_array_start = re.compile(whitespace_optional + rb"\[")
    re_array_end = re.compile(whitespace_optional + rb"]")
    re_string_hex = re.compile(
        whitespace_optional + rb"<(" + whitespace_or_hex + rb"*)>"
    )
    re_string_lit = re.compile(whitespace_optional + rb"\(")
    re_indirect_reference = re.compile(
        whitespace_optional
        + rb"([-+]?[0-9]+)"
        + whitespace_mandatory
        + rb"([-+]?[0-9]+)"
        + whitespace_mandatory
        + rb"R(?="
        + delimiter_or_ws
        + rb")"
    )
    re_indirect_def_start = re.compile(
        whitespace_optional
        + rb"([-+]?[0-9]+)"
        + whitespace_mandatory
        + rb"([-+]?[0-9]+)"
        + whitespace_mandatory
        + rb"obj(?="
        + delimiter_or_ws
        + rb")"
    )
    re_indirect_def_end = re.compile(
        whitespace_optional + rb"endobj(?=" + delimiter_or_ws + rb")"
    )
    re_comment = re.compile(
        rb"(" + whitespace_optional + rb"%[^\r\n]*" + newline + rb")*"
    )
    re_stream_start = re.compile(whitespace_optional + rb"stream\r?\n")
    re_stream_end = re.compile(
        whitespace_optional + rb"endstream(?=" + delimiter_or_ws + rb")"
    )

    @classmethod
    def get_value(cls, data, offset, expect_indirect=None, max_nesting=-1):
        """Parse one PDF object at *offset* in *data*.

        Returns (value, offset-after-object); the offset is None when
        max_nesting ran out, signalling a truncated parse to the caller.
        When *expect_indirect* is given, the data must define exactly that
        indirect object.
        """
        if max_nesting == 0:
            return None, None
        m = cls.re_comment.match(data, offset)
        if m:
            offset = m.end()
        m = cls.re_indirect_def_start.match(data, offset)
        if m:
            check_format_condition(
                int(m.group(1)) > 0,
                "indirect object definition: object ID must be greater than 0",
            )
            check_format_condition(
                int(m.group(2)) >= 0,
                "indirect object definition: generation must be non-negative",
            )
            check_format_condition(
                expect_indirect is None
                or expect_indirect
                == IndirectReference(int(m.group(1)), int(m.group(2))),
                "indirect object definition different than expected",
            )
            object, offset = cls.get_value(data, m.end(), max_nesting=max_nesting - 1)
            if offset is None:
                return object, None
            m = cls.re_indirect_def_end.match(data, offset)
            check_format_condition(m, "indirect object definition end not found")
            return object, m.end()
        check_format_condition(
            not expect_indirect, "indirect object definition not found"
        )
        m = cls.re_indirect_reference.match(data, offset)
        if m:
            check_format_condition(
                int(m.group(1)) > 0,
                "indirect object reference: object ID must be greater than 0",
            )
            check_format_condition(
                int(m.group(2)) >= 0,
                "indirect object reference: generation must be non-negative",
            )
            return IndirectReference(int(m.group(1)), int(m.group(2))), m.end()
        m = cls.re_dict_start.match(data, offset)
        if m:
            # Dictionary: parse alternating key/value objects until >>.
            offset = m.end()
            result = {}
            m = cls.re_dict_end.match(data, offset)
            while not m:
                key, offset = cls.get_value(data, offset, max_nesting=max_nesting - 1)
                if offset is None:
                    return result, None
                value, offset = cls.get_value(data, offset, max_nesting=max_nesting - 1)
                result[key] = value
                if offset is None:
                    return result, None
                m = cls.re_dict_end.match(data, offset)
            offset = m.end()
            m = cls.re_stream_start.match(data, offset)
            if m:
                # A stream follows: /Length gives the raw payload size.
                try:
                    stream_len_str = result.get(b"Length")
                    stream_len = int(stream_len_str)
                except (TypeError, ValueError) as e:
                    msg = f"bad or missing Length in stream dict ({stream_len_str})"
                    raise PdfFormatError(msg) from e
                stream_data = data[m.end() : m.end() + stream_len]
                m = cls.re_stream_end.match(data, m.end() + stream_len)
                check_format_condition(m, "stream end not found")
                offset = m.end()
                result = PdfStream(PdfDict(result), stream_data)
            else:
                result = PdfDict(result)
            return result, offset
        m = cls.re_array_start.match(data, offset)
        if m:
            offset = m.end()
            result = []
            m = cls.re_array_end.match(data, offset)
            while not m:
                value, offset = cls.get_value(data, offset, max_nesting=max_nesting - 1)
                result.append(value)
                if offset is None:
                    return result, None
                m = cls.re_array_end.match(data, offset)
            return result, m.end()
        m = cls.re_null.match(data, offset)
        if m:
            return None, m.end()
        m = cls.re_true.match(data, offset)
        if m:
            return True, m.end()
        m = cls.re_false.match(data, offset)
        if m:
            return False, m.end()
        m = cls.re_name.match(data, offset)
        if m:
            return PdfName(cls.interpret_name(m.group(1))), m.end()
        m = cls.re_int.match(data, offset)
        if m:
            return int(m.group(1)), m.end()
        m = cls.re_real.match(data, offset)
        if m:
            # XXX Decimal instead of float???
            return float(m.group(1)), m.end()
        m = cls.re_string_hex.match(data, offset)
        if m:
            # filter out whitespace
            hex_string = bytearray(
                b for b in m.group(1) if b in b"0123456789abcdefABCDEF"
            )
            if len(hex_string) % 2 == 1:
                # append a 0 if the length is not even - yes, at the end
                hex_string.append(ord(b"0"))
            return bytearray.fromhex(hex_string.decode("us-ascii")), m.end()
        m = cls.re_string_lit.match(data, offset)
        if m:
            return cls.get_literal_string(data, m.end())
        # return None, offset # fallback (only for debugging)
        msg = f"unrecognized object: {repr(data[offset : offset + 32])}"
        raise PdfFormatError(msg)

    # Tokens that need special treatment inside a literal string, in group
    # order: 1 escaped char, 2 octal escape, 3 line continuation (backslash
    # + newline), 5 bare newline, 6 "(", 7 ")".
    re_lit_str_token = re.compile(
        rb"(\\[nrtbf()\\])|(\\[0-9]{1,3})|(\\(\r\n|\r|\n))|(\r\n|\r|\n)|(\()|(\))"
    )
    # Both single-byte and int keys: indexing a bytes object yields an int,
    # so escaped_chars[m.group(1)[1]] needs the int keys.
    escaped_chars = {
        b"n": b"\n",
        b"r": b"\r",
        b"t": b"\t",
        b"b": b"\b",
        b"f": b"\f",
        b"(": b"(",
        b")": b")",
        b"\\": b"\\",
        ord(b"n"): b"\n",
        ord(b"r"): b"\r",
        ord(b"t"): b"\t",
        ord(b"b"): b"\b",
        ord(b"f"): b"\f",
        ord(b"("): b"(",
        ord(b")"): b")",
        ord(b"\\"): b"\\",
    }

    @classmethod
    def get_literal_string(cls, data, offset):
        """Parse a literal string body (after the opening parenthesis),
        resolving escapes and balanced nested parentheses.

        Returns (string bytes, offset after the closing parenthesis).
        """
        nesting_depth = 0
        result = bytearray()
        for m in cls.re_lit_str_token.finditer(data, offset):
            # Copy the literal run before this token verbatim.
            result.extend(data[offset : m.start()])
            if m.group(1):
                result.extend(cls.escaped_chars[m.group(1)[1]])
            elif m.group(2):
                # Octal escape \ddd.
                result.append(int(m.group(2)[1:], 8))
            elif m.group(3):
                # Escaped line break: line continuation, emits nothing.
                pass
            elif m.group(5):
                # Unescaped EOL is normalized to a single \n.
                result.extend(b"\n")
            elif m.group(6):
                result.extend(b"(")
                nesting_depth += 1
            elif m.group(7):
                if nesting_depth == 0:
                    return bytes(result), m.end()
                result.extend(b")")
                nesting_depth -= 1
            offset = m.end()
        msg = "unfinished literal string"
        raise PdfFormatError(msg)

    re_xref_section_start = re.compile(whitespace_optional + rb"xref" + newline)
    re_xref_subsection_start = re.compile(
        whitespace_optional
        + rb"([0-9]+)"
        + whitespace_mandatory
        + rb"([0-9]+)"
        + whitespace_optional
        + newline_only
    )
    re_xref_entry = re.compile(rb"([0-9]{10}) ([0-9]{5}) ([fn])( \r| \n|\r\n)")

    def read_xref_table(self, xref_section_offset):
        """Read one xref section into self.xref_table; earlier sections never
        overwrite entries already loaded from later ones.

        Returns the offset just past the section (where the trailer starts).
        """
        subsection_found = False
        m = self.re_xref_section_start.match(
            self.buf, xref_section_offset + self.start_offset
        )
        check_format_condition(m, "xref section start not found")
        offset = m.end()
        while True:
            m = self.re_xref_subsection_start.match(self.buf, offset)
            if not m:
                check_format_condition(
                    subsection_found, "xref subsection start not found"
                )
                break
            subsection_found = True
            offset = m.end()
            first_object = int(m.group(1))
            num_objects = int(m.group(2))
            for i in range(first_object, first_object + num_objects):
                m = self.re_xref_entry.match(self.buf, offset)
                check_format_condition(m, "xref entry not found")
                offset = m.end()
                is_free = m.group(3) == b"f"
                if not is_free:
                    generation = int(m.group(2))
                    new_entry = (int(m.group(1)), generation)
                    if i not in self.xref_table:
                        self.xref_table[i] = new_entry
        return offset

    def read_indirect(self, ref, max_nesting=-1):
        """Parse the indirect object *ref* points at and cache the result."""
        offset, generation = self.xref_table[ref[0]]
        check_format_condition(
            generation == ref[1],
            f"expected to find generation {ref[1]} for object ID {ref[0]} in xref "
            f"table, instead found generation {generation} at offset {offset}",
        )
        value = self.get_value(
            self.buf,
            offset + self.start_offset,
            expect_indirect=IndirectReference(*ref),
            max_nesting=max_nesting,
        )[0]
        self.cached_objects[ref] = value
        return value

    def linearize_page_tree(self, node=None):
        """Flatten the /Pages tree into an ordered list of page references."""
        if node is None:
            node = self.page_tree_root
        check_format_condition(
            node[b"Type"] == b"Pages", "/Type of page tree node is not /Pages"
        )
        pages = []
        for kid in node[b"Kids"]:
            kid_object = self.read_indirect(kid)
            if kid_object[b"Type"] == b"Page":
                pages.append(kid)
            else:
                pages.extend(self.linearize_page_tree(node=kid_object))
        return pages