Skip to content

Commit 6dda581

Browse files
authored
🔀 Merge pull request #205 from nevans/parser/better-faster-cleaner-fetch
⚡ Simpler, faster `msg-att` parser (for fetch responses)
2 parents 8106847 + ef38f8e commit 6dda581

File tree

1 file changed

+175
-141
lines changed

1 file changed

+175
-141
lines changed

‎lib/net/imap/response_parser.rb

Lines changed: 175 additions & 141 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,17 @@ def case_insensitive__nstring
427427
alias nz_number number
428428
alias nz_number? number?
429429

430+
# valid number ranges are not enforced by parser
431+
# nz-number64 = digit-nz *DIGIT
432+
# ; Unsigned 63-bit integer
433+
# ; (0 < n <= 9,223,372,036,854,775,807)
434+
alias nz_number64 nz_number
435+
436+
# valid number ranges are not enforced by parser
437+
# uniqueid = nz-number
438+
# ; Strictly ascending
439+
alias uniqueid nz_number
440+
430441
# [RFC3501 & RFC9051:]
431442
# response = *(continue-req / response-data) response-done
432443
#
@@ -607,49 +618,93 @@ def response_data__simple_numeric
607618
alias mailbox_data__exists response_data__simple_numeric
608619
alias mailbox_data__recent response_data__simple_numeric
609620

621+
# RFC3501 & RFC9051:
622+
# msg-att = "(" (msg-att-dynamic / msg-att-static)
623+
# *(SP (msg-att-dynamic / msg-att-static)) ")"
624+
#
625+
# msg-att-dynamic = "FLAGS" SP "(" [flag-fetch *(SP flag-fetch)] ")"
626+
# RFC5257 (ANNOTATE extension):
627+
# msg-att-dynamic =/ "ANNOTATION" SP
628+
# ( "(" entry-att *(SP entry-att) ")" /
629+
# "(" entry *(SP entry) ")" )
630+
# RFC7162 (CONDSTORE extension):
631+
# msg-att-dynamic =/ fetch-mod-resp
632+
# fetch-mod-resp = "MODSEQ" SP "(" permsg-modsequence ")"
633+
# RFC8970 (PREVIEW extension):
634+
# msg-att-dynamic =/ "PREVIEW" SP nstring
635+
#
636+
# RFC3501:
637+
# msg-att-static = "ENVELOPE" SP envelope /
638+
# "INTERNALDATE" SP date-time /
639+
# "RFC822" [".HEADER" / ".TEXT"] SP nstring /
640+
# "RFC822.SIZE" SP number /
641+
# "BODY" ["STRUCTURE"] SP body /
642+
# "BODY" section ["<" number ">"] SP nstring /
643+
# "UID" SP uniqueid
644+
# RFC3516 (BINARY extension):
645+
# msg-att-static =/ "BINARY" section-binary SP (nstring / literal8)
646+
# / "BINARY.SIZE" section-binary SP number
647+
# RFC8514 (SAVEDATE extension):
648+
# msg-att-static =/ "SAVEDATE" SP (date-time / nil)
649+
# RFC8474 (OBJECTID extension):
650+
# msg-att-static =/ fetch-emailid-resp / fetch-threadid-resp
651+
# fetch-emailid-resp = "EMAILID" SP "(" objectid ")"
652+
# fetch-threadid-resp = "THREADID" SP ( "(" objectid ")" / nil )
653+
# RFC9051:
654+
# msg-att-static = "ENVELOPE" SP envelope /
655+
# "INTERNALDATE" SP date-time /
656+
# "RFC822.SIZE" SP number64 /
657+
# "BODY" ["STRUCTURE"] SP body /
658+
# "BODY" section ["<" number ">"] SP nstring /
659+
# "BINARY" section-binary SP (nstring / literal8) /
660+
# "BINARY.SIZE" section-binary SP number /
661+
# "UID" SP uniqueid
662+
#
663+
# Re https://www.rfc-editor.org/errata/eid7246, I'm adding "offset" to the
664+
# official "BINARY" ABNF, like so:
665+
#
666+
# msg-att-static =/ "BINARY" section-binary ["<" number ">"] SP
667+
# (nstring / literal8)
610668
def msg_att(n)
611-
match(T_LPAR)
669+
lpar
612670
attr = {}
613671
while true
614-
token = lookahead
615-
case token.symbol
616-
when T_RPAR
617-
shift_token
618-
break
619-
when T_SPACE
620-
shift_token
621-
next
622-
end
623-
case token.value
624-
when /\A(?:ENVELOPE)\z/ni
625-
name, val = envelope_data
626-
when /\A(?:FLAGS)\z/ni
627-
name, val = flags_data
628-
when /\A(?:INTERNALDATE)\z/ni
629-
name, val = internaldate_data
630-
when /\A(?:RFC822(?:\.HEADER|\.TEXT)?)\z/ni
631-
name, val = rfc822_text
632-
when /\A(?:RFC822\.SIZE)\z/ni
633-
name, val = rfc822_size
634-
when /\A(?:BODY(?:STRUCTURE)?)\z/ni
635-
name, val = body_data
636-
when /\A(?:UID)\z/ni
637-
name, val = uid_data
638-
when /\A(?:MODSEQ)\z/ni
639-
name, val = modseq_data
640-
else
641-
parse_error("unknown attribute `%s' for {%d}", token.value, n)
642-
end
672+
name = msg_att__label; SP!
673+
val =
674+
case name
675+
when "UID" then uniqueid
676+
when "FLAGS" then flag_list
677+
when "BODY" then body
678+
when /\ABODY\[/ni then nstring
679+
when "BODYSTRUCTURE" then body
680+
when "ENVELOPE" then envelope
681+
when "INTERNALDATE" then date_time
682+
when "RFC822.SIZE" then number64
683+
when "RFC822" then nstring # not in rev2
684+
when "RFC822.HEADER" then nstring # not in rev2
685+
when "RFC822.TEXT" then nstring # not in rev2
686+
when "MODSEQ" then parens__modseq # CONDSTORE
687+
else parse_error("unknown attribute `%s' for {%d}", name, n)
688+
end
643689
attr[name] = val
690+
break unless SP?
691+
break if lookahead_rpar?
644692
end
645-
return attr
646-
end
647-
648-
def envelope_data
649-
token = match(T_ATOM)
650-
name = token.value.upcase
651-
match(T_SPACE)
652-
return name, envelope
693+
rpar
694+
attr
695+
end
696+
697+
# appends "[section]" and "<partial>" to the base label
698+
def msg_att__label
699+
case (name = tagged_ext_label)
700+
when /\A(?:RFC822(?:\.HEADER|\.TEXT)?)\z/ni
701+
# ignoring "[]" fixes https://bugs.ruby-lang.org/issues/5620
702+
lbra? and rbra
703+
when "BODY"
704+
peek_lbra? and name << section and
705+
peek_str?("<") and name << atom # partial
706+
end
707+
name
653708
end
654709

655710
def envelope
@@ -687,58 +742,10 @@ def envelope
687742
return result
688743
end
689744

690-
def flags_data
691-
token = match(T_ATOM)
692-
name = token.value.upcase
693-
match(T_SPACE)
694-
return name, flag_list
695-
end
696-
697-
def internaldate_data
698-
token = match(T_ATOM)
699-
name = token.value.upcase
700-
match(T_SPACE)
701-
token = match(T_QUOTED)
702-
return name, token.value
703-
end
704-
705-
def rfc822_text
706-
token = match(T_ATOM)
707-
name = token.value.upcase
708-
token = lookahead
709-
if token.symbol == T_LBRA
710-
shift_token
711-
match(T_RBRA)
712-
end
713-
match(T_SPACE)
714-
return name, nstring
715-
end
716-
717-
def rfc822_size
718-
token = match(T_ATOM)
719-
name = token.value.upcase
720-
match(T_SPACE)
721-
return name, number
722-
end
723-
724-
def body_data
725-
token = match(T_ATOM)
726-
name = token.value.upcase
727-
token = lookahead
728-
if token.symbol == T_SPACE
729-
shift_token
730-
return name, body
731-
end
732-
name.concat(section)
733-
token = lookahead
734-
if token.symbol == T_ATOM
735-
name.concat(token.value)
736-
shift_token
737-
end
738-
match(T_SPACE)
739-
data = nstring
740-
return name, data
741-
end
745+
# date-time = DQUOTE date-day-fixed "-" date-month "-" date-year
746+
# SP time SP zone DQUOTE
747+
alias date_time quoted
748+
alias ndatetime nquoted
742749

743750
# RFC-3501 & RFC-9051:
744751
# body = "(" (body-type-1part / body-type-mpart) ")"
@@ -996,48 +1003,78 @@ def body_extension
9961003
end
9971004
end
9981005

1006+
# section = "[" [section-spec] "]"
9991007
def section
1000-
str = String.new
1001-
token = match(T_LBRA)
1002-
str.concat(token.value)
1003-
token = match(T_ATOM, T_NUMBER, T_RBRA)
1004-
if token.symbol == T_RBRA
1005-
str.concat(token.value)
1006-
return str
1007-
end
1008-
str.concat(token.value)
1009-
token = lookahead
1010-
if token.symbol == T_SPACE
1011-
shift_token
1012-
str.concat(token.value)
1013-
token = match(T_LPAR)
1014-
str.concat(token.value)
1015-
while true
1016-
token = lookahead
1017-
case token.symbol
1018-
when T_RPAR
1019-
str.concat(token.value)
1020-
shift_token
1021-
break
1022-
when T_SPACE
1023-
shift_token
1024-
str.concat(token.value)
1025-
end
1026-
str.concat(format_string(astring))
1027-
end
1028-
end
1029-
token = match(T_RBRA)
1030-
str.concat(token.value)
1031-
return str
1008+
str = +lbra
1009+
str << section_spec unless peek_rbra?
1010+
str << rbra
1011+
end
1012+
1013+
# section-spec = section-msgtext / (section-part ["." section-text])
1014+
# section-msgtext = "HEADER" /
1015+
# "HEADER.FIELDS" [".NOT"] SP header-list /
1016+
# "TEXT"
1017+
# ; top-level or MESSAGE/RFC822 or
1018+
# ; MESSAGE/GLOBAL part
1019+
# section-part = nz-number *("." nz-number)
1020+
# ; body part reference.
1021+
# ; Allows for accessing nested body parts.
1022+
# section-text = section-msgtext / "MIME"
1023+
# ; text other than actual body part (headers,
1024+
# ; etc.)
1025+
#
1026+
# n.b: we could "cheat" here and just grab all text inside the brackets,
1027+
# but literals would need special treatment.
1028+
def section_spec
1029+
str = "".b
1030+
str << atom # grabs everything up to "SP header-list" or "]"
1031+
str << " " << header_list if SP?
1032+
str
10321033
end
10331034

1034-
def format_string(str)
1035-
case str
1035+
# header-list = "(" header-fld-name *(SP header-fld-name) ")"
1036+
def header_list
1037+
str = +""
1038+
str << lpar << header_fld_name
1039+
str << " " << header_fld_name while SP?
1040+
str << rpar
1041+
end
1042+
1043+
# RFC3501 & RFC9051:
1044+
# header-fld-name = astring
1045+
#
1046+
# Although RFC3501 allows any astring, RFC5322-valid header names are one
1047+
# or more of the printable US-ASCII characters, except SP and colon. So
1048+
# empty string isn't valid, and literals aren't needed and should not be
1049+
# used. This syntax is unchanged by [I18N-HDRS] (RFC6532).
1050+
#
1051+
# RFC5322:
1052+
# optional-field = field-name ":" unstructured CRLF
1053+
# field-name = 1*ftext
1054+
# ftext = %d33-57 / ; Printable US-ASCII
1055+
# %d59-126 ; characters not including
1056+
# ; ":".
1057+
#
1058+
# Atom and quoted should be sufficient.
1059+
#
1060+
# TODO: Use original source string, rather than decode and re-encode.
1061+
# TODO: or at least, DRY up this code with the send_command formatting.
1062+
def header_fld_name
1063+
case (str = astring)
10361064
when ""
1065+
warn '%s header-fld-name is an invalid RFC5322 field-name: ""' %
1066+
[self.class]
10371067
return '""'
10381068
when /[\x80-\xff\r\n]/n
1069+
warn "%s header-fld-name %p has invalid RFC5322 field-name char: %p" %
1070+
[self.class, str, $&]
10391071
# literal
10401072
return "{" + str.bytesize.to_s + "}" + CRLF + str
1073+
when /[^\x21-\x39\x3b-\xfe]/n
1074+
warn "%s header-fld-name %p has invalid RFC5322 field-name char: %p" %
1075+
[self.class, str, $&]
1076+
# invalid quoted string
1077+
return '"' + str.gsub(/["\\]/n, "\\\\\\&") + '"'
10411078
when /[(){ \x00-\x1f\x7f%*"\\]/n
10421079
# quoted string
10431080
return '"' + str.gsub(/["\\]/n, "\\\\\\&") + '"'
@@ -1047,23 +1084,6 @@ def format_string(str)
10471084
end
10481085
end
10491086

1050-
def uid_data
1051-
token = match(T_ATOM)
1052-
name = token.value.upcase
1053-
match(T_SPACE)
1054-
return name, number
1055-
end
1056-
1057-
def modseq_data
1058-
token = match(T_ATOM)
1059-
name = token.value.upcase
1060-
match(T_SPACE)
1061-
match(T_LPAR)
1062-
modseq = number
1063-
match(T_RPAR)
1064-
return name, modseq
1065-
end
1066-
10671087
def mailbox_data__flags
10681088
token = match(T_ATOM)
10691089
name = token.value.upcase
@@ -1631,6 +1651,20 @@ def charset
16311651
end
16321652
end
16331653

1654+
# RFC7162:
1655+
# mod-sequence-value = 1*DIGIT
1656+
# ;; Positive unsigned 63-bit integer
1657+
# ;; (mod-sequence)
1658+
# ;; (1 <= n <= 9,223,372,036,854,775,807).
1659+
alias mod_sequence_value nz_number64
1660+
1661+
# RFC7162:
1662+
# permsg-modsequence = mod-sequence-value
1663+
# ;; Per-message mod-sequence.
1664+
alias permsg_modsequence mod_sequence_value
1665+
1666+
def parens__modseq; lpar; _ = permsg_modsequence; rpar; _ end
1667+
16341668
# RFC-4315 (UIDPLUS) or RFC9051 (IMAP4rev2):
16351669
# uid-set = (uniqueid / uid-range) *("," uid-set)
16361670
# uid-range = (uniqueid ":" uniqueid)

0 commit comments

Comments
 (0)