Fix incorrect parsing of bfrange (#631) (#763)

nsfisis · k00ni · web-flow · commit 8440edbf58c8 · 2025-03-31T15:16:09.000+02:00
* Fix incorrect parsing of bfrange (#631) Previously, the regular expression for the single-offset bfrange mapping unintentionally matched portions of the bracketed array form. This produced an extremely large code range, leading to out-of-memory errors. Now, we capture the `<from>` and `<to>` ranges first and then distinguish between the single-offset (`<xxxx>`) and array (`[<xxxx> <xxxx> ...]`) forms. This ensures that the single-offset regex won't accidentally match bracketed segments. * Apply suggestions from code review * Add a check that the regexp patterns match --------- Co-authored-by: Konrad Abicht <hi@inspirito.de>
diff --git a/src/Smalot/PdfParser/Font.php b/src/Smalot/PdfParser/Font.php
@@ -216,45 +216,53 @@ public function loadTranslateTable(): array
             // Support for multiple bfrange sections
             if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $matches)) {
                 foreach ($matches['sections'] as $section) {
-                    // Support for : <srcCode1> <srcCode2> <dstString>
-                    $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is';
+                    /**
+                     * Regexp to capture <from>, <to>, and either <offset> or [...] items.
+                     * - (?P<from>...) Source range's start
+                     * - (?P<to>...)   Source range's end
+                     * - (?P<dest>...) Destination range's offset or each char code
+                     *                 Some PDF files put 2-byte Unicode values on new lines, so \r\n is allowed inside [...]
+                     */
+                    $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *(?P<dest><[0-9A-F]+>|\[[\r\n<>0-9A-F ]+\])[ \r\n]+/is';
 
                     preg_match_all($regexp, $section, $matches);
 
                     foreach ($matches['from'] as $key => $from) {
                         $char_from = hexdec($from);
                         $char_to = hexdec($matches['to'][$key]);
-                        $offset = hexdec($matches['offset'][$key]);
+                        $dest = $matches['dest'][$key];
 
-                        for ($char = $char_from; $char <= $char_to; ++$char) {
-                            $this->table[$char] = self::uchr($char - $char_from + $offset);
-                        }
-                    }
+                        if (1 === preg_match('/^<(?P<offset>[0-9A-F]+)>$/i', $dest, $offset_matches)) {
+                            // Support for : <srcCode1> <srcCode2> <dstString>
+                            $offset = hexdec($offset_matches['offset']);
 
-                    // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
-                    // Some PDF file has 2-byte Unicode values on new lines > added \r\n
-                    $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is';
-
-                    preg_match_all($regexp, $section, $matches);
+                            for ($char = $char_from; $char <= $char_to; ++$char) {
+                                $this->table[$char] = self::uchr($char - $char_from + $offset);
+                            }
+                        } else {
+                            // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
+                            $strings = [];
+                            $matched = preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $dest, $strings);
+                            if (false === $matched || 0 === $matched) {
+                                continue;
+                            }
 
-                    foreach ($matches['from'] as $key => $from) {
-                        $char_from = hexdec($from);
-                        $strings = [];
-
-                        preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $matches['strings'][$key], $strings);
-
-                        foreach ($strings['string'] as $position => $string) {
-                            $parts = preg_split(
-                                '/([0-9A-F]{4})/i',
-                                $string,
-                                0,
-                                \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
-                            );
-                            $text = '';
-                            foreach ($parts as $part) {
-                                $text .= self::uchr(hexdec($part));
+                            foreach ($strings['string'] as $position => $string) {
+                                $parts = preg_split(
+                                    '/([0-9A-F]{4})/i',
+                                    $string,
+                                    0,
+                                    \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
+                                );
+                                if (false === $parts) {
+                                    continue;
+                                }
+                                $text = '';
+                                foreach ($parts as $part) {
+                                    $text .= self::uchr(hexdec($part));
+                                }
+                                $this->table[$char_from + $position] = $text;
                             }
-                            $this->table[$char_from + $position] = $text;
                         }
                     }
                 }
diff --git a/tests/PHPUnit/Integration/FontTest.php b/tests/PHPUnit/Integration/FontTest.php
@@ -238,6 +238,81 @@ public function testLoadTranslateTable(): void
         $this->assertEquals('y', $table[92]);
     }
 
+    /**
+     * Tests loadTranslateTable() with bfrange entries in array form, including a malformed array and odd-length hex strings.
+     *
+     * @see https://github.com/smalot/pdfparser/issues/631
+     */
+    public function testLoadTranslateTableIssue631(): void
+    {
+        $document = new Document();
+
+        $content = '<</Type/Font /Subtype /Type0 /ToUnicode 2 0 R>>';
+        $header = Header::parse($content, $document);
+        $font = new Font($document, $header);
+
+        $content = '/CIDInit /ProcSet findresource begin
+1 dict begin
+begincmap
+/CIDSystemInfo
+<< /Registry (Adobe)
+/Ordering (UCS)
+/Supplement 0
+>> def
+/CMapName /Adobe-Identity-UCS def
+/CMapType 2 def
+1 beginbfrange
+<0677> <0689> [<FB1F> <FEDF0672> <FEE00672> <FEDF0673> <FEE00673> <FEDF0675> <FEE00675> <06B5FE8E> <06B5FE8E> <06B6FE8E> <06B6FE8E> <06B7FE8E> <06B7FE8E> <06B8FE8E> <06B8FE8E> <06F4> <0667> <FEDFFB51> <FEE0FB51>]
+<0690> <0693> [<FFFF> <FFFFFFFF> <FFFFFFFFFFFF> <FFFFFFFFFFFFFFFF>]
+<0694> <0695> [F<> 123 <>00]
+<0696> <0701> [<1> <23> <456> <7890> <ABCDE> <F12345>]
+endbfrange
+endcmap
+CMapName currentdict /CMap defineresource pop
+end
+end';
+        $unicode = new PDFObject($document, null, $content);
+
+        $document->setObjects(['1_0' => $font, '2_0' => $unicode]);
+
+        $font->init();
+        // Call loadTranslateTable() directly after init() and inspect the table it returns
+        $table = $font->loadTranslateTable();
+
+        $this->assertEquals(29, \count($table));
+
+        // Test ranges: 19 + 4 + 6 asserted entries; nothing is asserted for the malformed <0694> <0695> line
+        $this->assertEquals("\u{FB1F}", $table[0x0677]);
+        $this->assertEquals("\u{FEDF}\u{0672}", $table[0x0678]);
+        $this->assertEquals("\u{FEE0}\u{0672}", $table[0x0679]);
+        $this->assertEquals("\u{FEDF}\u{0673}", $table[0x067A]);
+        $this->assertEquals("\u{FEE0}\u{0673}", $table[0x067B]);
+        $this->assertEquals("\u{FEDF}\u{0675}", $table[0x067C]);
+        $this->assertEquals("\u{FEE0}\u{0675}", $table[0x067D]);
+        $this->assertEquals("\u{06B5}\u{FE8E}", $table[0x067E]);
+        $this->assertEquals("\u{06B5}\u{FE8E}", $table[0x067F]);
+        $this->assertEquals("\u{06B6}\u{FE8E}", $table[0x0680]);
+        $this->assertEquals("\u{06B6}\u{FE8E}", $table[0x0681]);
+        $this->assertEquals("\u{06B7}\u{FE8E}", $table[0x0682]);
+        $this->assertEquals("\u{06B7}\u{FE8E}", $table[0x0683]);
+        $this->assertEquals("\u{06B8}\u{FE8E}", $table[0x0684]);
+        $this->assertEquals("\u{06B8}\u{FE8E}", $table[0x0685]);
+        $this->assertEquals("\u{06F4}", $table[0x0686]);
+        $this->assertEquals("\u{0667}", $table[0x0687]);
+        $this->assertEquals("\u{FEDF}\u{FB51}", $table[0x0688]);
+        $this->assertEquals("\u{FEE0}\u{FB51}", $table[0x0689]);
+        $this->assertEquals("\u{FFFF}", $table[0x0690]);
+        $this->assertEquals("\u{FFFF}\u{FFFF}", $table[0x0691]);
+        $this->assertEquals("\u{FFFF}\u{FFFF}\u{FFFF}", $table[0x0692]);
+        $this->assertEquals("\u{FFFF}\u{FFFF}\u{FFFF}\u{FFFF}", $table[0x0693]);
+        $this->assertEquals("\u{0001}", $table[0x0696]);
+        $this->assertEquals("\u{0023}", $table[0x0697]);
+        $this->assertEquals("\u{0456}", $table[0x0698]);
+        $this->assertEquals("\u{7890}", $table[0x0699]);
+        $this->assertEquals("\u{ABCD}\u{000E}", $table[0x069A]);
+        $this->assertEquals("\u{F123}\u{0045}", $table[0x069B]);
+    }
+
     public function testDecodeHexadecimal(): void
     {
         $hexa = '<322041>';