Skip to content

Commit

Permalink
#34 add diacritic support
Browse files Browse the repository at this point in the history
  • Loading branch information
bzick committed Feb 13, 2025
1 parent ac8e5e7 commit e25eb8c
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 55 deletions.
110 changes: 56 additions & 54 deletions stream.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ func (s *Stream) PrevToken() *Token {
}

// NextToken returns next token from the stream.
// If next token doesn't exist, the method returns TypeUndef token.
// If the next token doesn't exist, the method returns a TypeUndef token.
// Do not save a result (Token) into variables — the next token may be changed at any time.
func (s *Stream) NextToken() *Token {
if s.current.next != nil {
Expand All @@ -269,8 +269,7 @@ func (s *Stream) NextToken() *Token {
}

// GoNextIfNextIs moves the stream pointer to the next token if the next token has specific token keys.
// If keys matched pointer will be updated and the method returned true.
// Otherwise, returned false.
// If a key matches, the pointer is updated and the method returns true. Otherwise, it returns false.
func (s *Stream) GoNextIfNextIs(key TokenKey, otherKeys ...TokenKey) bool {
if s.NextToken().Is(key, otherKeys...) {
s.GoNext()
Expand All @@ -280,68 +279,71 @@ func (s *Stream) GoNextIfNextIs(key TokenKey, otherKeys ...TokenKey) bool {
}

// GetSnippet returns slice of tokens.
// Slice generated from current token position and include tokens before and after current token.
// The slice is generated from the current token position and includes a number of tokens before and after the current token.
func (s *Stream) GetSnippet(before, after int) []Token {
var segment []Token
if s.current == undefToken {
if s.prev != nil && before > s.prev.id-s.head.id {
before = s.prev.id - s.head.id
} else {
before = 0
}
} else if before > s.current.id-s.head.id {
before = s.current.id - s.head.id
if s.current == nil {
return nil
}
if after > s.len-before-1 {
after = s.len - before - 1
snippet := make([]Token, before+after+1)
start := 0
end := before + after
snippet[before] = Token{
id: s.current.id,
key: s.current.key,
value: s.current.value,
line: s.current.line,
offset: s.current.offset,
indent: s.current.indent,
string: s.current.string,
}
segment = make([]Token, before+after+1)
if len(segment) == 0 {
return segment
}
var ptr *Token
if s.next != nil {
ptr = s.next
} else if s.prev != nil {
ptr = s.prev
} else {
ptr = s.current
}
for p := ptr; p != nil; p, before = ptr.prev, before-1 {
segment[before] = Token{
id: ptr.id,
key: ptr.key,
value: ptr.value,
line: ptr.line,
offset: ptr.offset,
indent: ptr.indent,
string: ptr.string,
}
if before <= 0 {
break
if s.current.prev != nil && before > 0 {
ptr := s.current.prev
for i := 1; i <= before; i++ {
snippet[before-i] = Token{
id: ptr.id,
key: ptr.key,
value: ptr.value,
line: ptr.line,
offset: ptr.offset,
indent: ptr.indent,
string: ptr.string,
}
ptr = ptr.prev
if ptr == nil {
start = before - i
break
}
}
}
for p, i := ptr.next, 1; p != nil; p, i = p.next, i+1 {
segment[before+i] = Token{
id: p.id,
key: p.key,
value: p.value,
line: p.line,
offset: p.offset,
indent: p.indent,
string: p.string,
}
if i >= after {
break
if s.current.next != nil && after > 0 {
ptr := s.current.next
for i := 1; i <= after; i++ {
snippet[before+i] = Token{ // before - is offset
id: ptr.id,
key: ptr.key,
value: ptr.value,
line: ptr.line,
offset: ptr.offset,
indent: ptr.indent,
string: ptr.string,
}
ptr = ptr.next
if ptr == nil {
end = -i
break
}
}
}
return segment
if start == 0 && end == before+after {
return snippet
}
return snippet[start:end]
}

// GetSnippetAsString returns tokens before and after current token as string.
// GetSnippetAsString returns tokens before and after the current token as a string.
// `maxStringLength` specifies max length of each token string.
// Zero — unlimited token string length.
// If string is greater than maxLength method removes some runes in the middle of the string.
// If a string is longer than maxStringLength, the method removes some runes from the middle of the string.
func (s *Stream) GetSnippetAsString(before, after, maxStringLength int) string {
segments := s.GetSnippet(before, after)
str := make([]string, len(segments))
Expand Down
7 changes: 6 additions & 1 deletion tokenizer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,11 +124,16 @@ func TestTokenize(t *testing.T) {
{key: TokenKeyword, value: []byte("оди́н")},
{key: TokenKeyword, value: []byte("дома")},
}},
{"जब मैंने सुबह", []Token{
{key: TokenKeyword, value: []byte("जब")},
{key: TokenKeyword, value: []byte("मैंने")},
{key: TokenKeyword, value: []byte("सुबह")},
}},
}
for _, v := range diacritic {
t.Run(v.value, func(t *testing.T) {
stream := tokenizer.ParseBytes([]byte(v.value))
require.Equal(t, v.tokens, stream.GetSnippet(0, 0))
require.Equal(t, v.tokens, stream.GetSnippet(0, 1))
})
}
})
Expand Down

0 comments on commit e25eb8c

Please sign in to comment.