Skip to content

Commit 8440edb

Browse files
nsfisis and k00ni authored
Fix incorrect parsing of bfrange (#631) (#763)
* Fix incorrect parsing of bfrange (#631)

  Previously, the regular expression for the single-offset bfrange mapping unintentionally matched portions of the bracketed array form. This produced an extremely large code range, which led to out-of-memory errors.

  Now, we capture the `<from>` and `<to>` ranges first and then distinguish between the single-offset (`<xxxx>`) and array (`[<xxxx> <xxxx> ...]`) forms. This ensures that the single-offset regex won't accidentally match bracketed segments.

* Apply suggestions from code review

* Add check if the regexp patterns match

---------

Co-authored-by: Konrad Abicht <hi@inspirito.de>
1 parent 0ddcc54 commit 8440edb

File tree

2 files changed

+112
-29
lines changed

2 files changed

+112
-29
lines changed

src/Smalot/PdfParser/Font.php

+37-29
Original file line numberDiff line numberDiff line change
@@ -216,45 +216,53 @@ public function loadTranslateTable(): array
216216
// Support for multiple bfrange sections
217217
if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $matches)) {
218218
foreach ($matches['sections'] as $section) {
219-
// Support for : <srcCode1> <srcCode2> <dstString>
220-
$regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is';
219+
/**
220+
* Regexp to capture <from>, <to>, and either <offset> or [...] items.
221+
* - (?P<from>...) Source range's start
222+
* - (?P<to>...) Source range's end
223+
* - (?P<dest>...) Destination range's offset or each char code
224+
* Some PDF files put 2-byte Unicode values on new lines, so \r\n is also accepted inside [...]
225+
*/
226+
$regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *(?P<dest><[0-9A-F]+>|\[[\r\n<>0-9A-F ]+\])[ \r\n]+/is';
221227

222228
preg_match_all($regexp, $section, $matches);
223229

224230
foreach ($matches['from'] as $key => $from) {
225231
$char_from = hexdec($from);
226232
$char_to = hexdec($matches['to'][$key]);
227-
$offset = hexdec($matches['offset'][$key]);
233+
$dest = $matches['dest'][$key];
228234

229-
for ($char = $char_from; $char <= $char_to; ++$char) {
230-
$this->table[$char] = self::uchr($char - $char_from + $offset);
231-
}
232-
}
235+
if (1 === preg_match('/^<(?P<offset>[0-9A-F]+)>$/i', $dest, $offset_matches)) {
236+
// Support for : <srcCode1> <srcCode2> <dstString>
237+
$offset = hexdec($offset_matches['offset']);
233238

234-
// Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
235-
// Some PDF file has 2-byte Unicode values on new lines > added \r\n
236-
$regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is';
237-
238-
preg_match_all($regexp, $section, $matches);
239+
for ($char = $char_from; $char <= $char_to; ++$char) {
240+
$this->table[$char] = self::uchr($char - $char_from + $offset);
241+
}
242+
} else {
243+
// Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
244+
$strings = [];
245+
$matched = preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $dest, $strings);
246+
if (false === $matched || 0 === $matched) {
247+
continue;
248+
}
239249

240-
foreach ($matches['from'] as $key => $from) {
241-
$char_from = hexdec($from);
242-
$strings = [];
243-
244-
preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $matches['strings'][$key], $strings);
245-
246-
foreach ($strings['string'] as $position => $string) {
247-
$parts = preg_split(
248-
'/([0-9A-F]{4})/i',
249-
$string,
250-
0,
251-
\PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
252-
);
253-
$text = '';
254-
foreach ($parts as $part) {
255-
$text .= self::uchr(hexdec($part));
250+
foreach ($strings['string'] as $position => $string) {
251+
$parts = preg_split(
252+
'/([0-9A-F]{4})/i',
253+
$string,
254+
0,
255+
\PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
256+
);
257+
if (false === $parts) {
258+
continue;
259+
}
260+
$text = '';
261+
foreach ($parts as $part) {
262+
$text .= self::uchr(hexdec($part));
263+
}
264+
$this->table[$char_from + $position] = $text;
256265
}
257-
$this->table[$char_from + $position] = $text;
258266
}
259267
}
260268
}

tests/PHPUnit/Integration/FontTest.php

+75
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,81 @@ public function testLoadTranslateTable(): void
238238
$this->assertEquals('y', $table[92]);
239239
}
240240

241+
/**
242+
* Tests loadTranslateTable with a bfrange definition that lists every destination character.
243+
*
244+
* @see https://github.com/smalot/pdfparser/issues/631
245+
*/
246+
public function testLoadTranslateTableIssue631(): void
247+
{
248+
$document = new Document();
249+
250+
$content = '<</Type/Font /Subtype /Type0 /ToUnicode 2 0 R>>';
251+
$header = Header::parse($content, $document);
252+
$font = new Font($document, $header);
253+
254+
$content = '/CIDInit /ProcSet findresource begin
255+
1 dict begin
256+
begincmap
257+
/CIDSystemInfo
258+
<< /Registry (Adobe)
259+
/Ordering (UCS)
260+
/Supplement 0
261+
>> def
262+
/CMapName /Adobe-Identity-UCS def
263+
/CMapType 2 def
264+
1 beginbfrange
265+
<0677> <0689> [<FB1F> <FEDF0672> <FEE00672> <FEDF0673> <FEE00673> <FEDF0675> <FEE00675> <06B5FE8E> <06B5FE8E> <06B6FE8E> <06B6FE8E> <06B7FE8E> <06B7FE8E> <06B8FE8E> <06B8FE8E> <06F4> <0667> <FEDFFB51> <FEE0FB51>]
266+
<0690> <0693> [<FFFF> <FFFFFFFF> <FFFFFFFFFFFF> <FFFFFFFFFFFFFFFF>]
267+
<0694> <0695> [F<> 123 <>00]
268+
<0696> <0701> [<1> <23> <456> <7890> <ABCDE> <F12345>]
269+
endbfrange
270+
endcmap
271+
CMapName currentdict /CMap defineresource pop
272+
end
273+
end';
274+
$unicode = new PDFObject($document, null, $content);
275+
276+
$document->setObjects(['1_0' => $font, '2_0' => $unicode]);
277+
278+
$font->init();
279+
// Test reload
280+
$table = $font->loadTranslateTable();
281+
282+
$this->assertEquals(29, \count($table));
283+
284+
// Test ranges
285+
$this->assertEquals("\u{FB1F}", $table[0x0677]);
286+
$this->assertEquals("\u{FEDF}\u{0672}", $table[0x0678]);
287+
$this->assertEquals("\u{FEE0}\u{0672}", $table[0x0679]);
288+
$this->assertEquals("\u{FEDF}\u{0673}", $table[0x067A]);
289+
$this->assertEquals("\u{FEE0}\u{0673}", $table[0x067B]);
290+
$this->assertEquals("\u{FEDF}\u{0675}", $table[0x067C]);
291+
$this->assertEquals("\u{FEE0}\u{0675}", $table[0x067D]);
292+
$this->assertEquals("\u{06B5}\u{FE8E}", $table[0x067E]);
293+
$this->assertEquals("\u{06B5}\u{FE8E}", $table[0x067F]);
294+
$this->assertEquals("\u{06B6}\u{FE8E}", $table[0x0680]);
295+
$this->assertEquals("\u{06B6}\u{FE8E}", $table[0x0681]);
296+
$this->assertEquals("\u{06B7}\u{FE8E}", $table[0x0682]);
297+
$this->assertEquals("\u{06B7}\u{FE8E}", $table[0x0683]);
298+
$this->assertEquals("\u{06B8}\u{FE8E}", $table[0x0684]);
299+
$this->assertEquals("\u{06B8}\u{FE8E}", $table[0x0685]);
300+
$this->assertEquals("\u{06F4}", $table[0x0686]);
301+
$this->assertEquals("\u{0667}", $table[0x0687]);
302+
$this->assertEquals("\u{FEDF}\u{FB51}", $table[0x0688]);
303+
$this->assertEquals("\u{FEE0}\u{FB51}", $table[0x0689]);
304+
$this->assertEquals("\u{FFFF}", $table[0x0690]);
305+
$this->assertEquals("\u{FFFF}\u{FFFF}", $table[0x0691]);
306+
$this->assertEquals("\u{FFFF}\u{FFFF}\u{FFFF}", $table[0x0692]);
307+
$this->assertEquals("\u{FFFF}\u{FFFF}\u{FFFF}\u{FFFF}", $table[0x0693]);
308+
$this->assertEquals("\u{0001}", $table[0x0696]);
309+
$this->assertEquals("\u{0023}", $table[0x0697]);
310+
$this->assertEquals("\u{0456}", $table[0x0698]);
311+
$this->assertEquals("\u{7890}", $table[0x0699]);
312+
$this->assertEquals("\u{ABCD}\u{000E}", $table[0x069A]);
313+
$this->assertEquals("\u{F123}\u{0045}", $table[0x069B]);
314+
}
315+
241316
public function testDecodeHexadecimal(): void
242317
{
243318
$hexa = '<322041>';

0 commit comments

Comments
 (0)