Skip to content

Commit a68e04c

Browse files
committed
Reduce memory usage when calculating repository languages
1 parent 1e0758a commit a68e04c

File tree

5 files changed

+85
-33
lines changed

5 files changed

+85
-33
lines changed

modules/git/languagestats/language_stats_gogit.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package languagestats
77

88
import (
99
"bytes"
10+
"context"
1011
"io"
1112

1213
"code.gitea.io/gitea/modules/analyze"
@@ -20,8 +21,8 @@ import (
2021
"github.com/go-git/go-git/v5/plumbing/object"
2122
)
2223

23-
// GetLanguageStats calculates language stats for git repository at specified commit
24-
func GetLanguageStats(repo *git_module.Repository, commitID string) (map[string]int64, error) {
24+
// CalcLanguageStats calculates language stats for git repository at specified commit
25+
func CalcLanguageStats(ctx context.Context, repo *git_module.Repository, commitID string) (map[string]int64, error) {
2526
r, err := git.PlainOpen(repo.Path)
2627
if err != nil {
2728
return nil, err

modules/git/languagestats/language_stats_nogogit.go

Lines changed: 20 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package languagestats
77

88
import (
99
"bytes"
10+
"context"
1011
"io"
1112

1213
"code.gitea.io/gitea/modules/analyze"
@@ -18,8 +19,8 @@ import (
1819
"github.com/go-enry/go-enry/v2"
1920
)
2021

21-
// GetLanguageStats calculates language stats for git repository at specified commit
22-
func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64, error) {
22+
// CalcLanguageStats calculates language stats for git repository at specified commit
23+
func CalcLanguageStats(ctx context.Context, repo *git.Repository, commitID string) (map[string]int64, error) {
2324
// We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary.
2425
// so let's create a batch stdin and stdout
2526
batchStdinWriter, batchReader, cancel, err := repo.CatFileBatch(repo.Ctx)
@@ -59,11 +60,6 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
5960

6061
tree := commit.Tree
6162

62-
entries, err := tree.ListEntriesRecursiveWithSize()
63-
if err != nil {
64-
return nil, err
65-
}
66-
6763
checker, err := attribute.NewBatchChecker(repo, commitID, attribute.LinguistAttributes)
6864
if err != nil {
6965
return nil, err
@@ -82,18 +78,12 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
8278
firstExcludedLanguage := ""
8379
firstExcludedLanguageSize := int64(0)
8480

85-
for _, f := range entries {
86-
select {
87-
case <-repo.Ctx.Done():
88-
return sizes, repo.Ctx.Err()
89-
default:
90-
}
91-
81+
if err := tree.IterateEntriesRecursive(ctx, func(ctx context.Context, f *git.TreeEntry) error {
9282
contentBuf.Reset()
9383
content = contentBuf.Bytes()
9484

9585
if f.Size() == 0 {
96-
continue
86+
return nil
9787
}
9888

9989
isVendored := optional.None[bool]()
@@ -104,19 +94,19 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
10494
attrLinguistGenerated := optional.None[bool]()
10595
if err == nil {
10696
if isVendored = attrs.GetVendored(); isVendored.ValueOrDefault(false) {
107-
continue
97+
return nil
10898
}
10999

110100
if attrLinguistGenerated = attrs.GetGenerated(); attrLinguistGenerated.ValueOrDefault(false) {
111-
continue
101+
return nil
112102
}
113103

114104
if isDocumentation = attrs.GetDocumentation(); isDocumentation.ValueOrDefault(false) {
115-
continue
105+
return nil
116106
}
117107

118108
if isDetectable = attrs.GetDetectable(); !isDetectable.ValueOrDefault(true) {
119-
continue
109+
return nil
120110
}
121111

122112
if hasLanguage := attrs.GetLanguage(); hasLanguage.Value() != "" {
@@ -130,27 +120,27 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
130120

131121
// this language will always be added to the size
132122
sizes[language] += f.Size()
133-
continue
123+
return nil
134124
}
135125
}
136126

137127
if (!isVendored.Has() && analyze.IsVendor(f.Name())) ||
138128
enry.IsDotFile(f.Name()) ||
139129
(!isDocumentation.Has() && enry.IsDocumentation(f.Name())) ||
140130
enry.IsConfiguration(f.Name()) {
141-
continue
131+
return nil
142132
}
143133

144134
// If content can not be read or file is too big just do detection by filename
145135

146136
if f.Size() <= bigFileSize {
147137
if err := writeID(f.ID.String()); err != nil {
148-
return nil, err
138+
return err
149139
}
150140
_, _, size, err := git.ReadBatchLine(batchReader)
151141
if err != nil {
152142
log.Debug("Error reading blob: %s Err: %v", f.ID.String(), err)
153-
return nil, err
143+
return err
154144
}
155145

156146
sizeToRead := size
@@ -162,11 +152,11 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
162152

163153
_, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead))
164154
if err != nil {
165-
return nil, err
155+
return err
166156
}
167157
content = contentBuf.Bytes()
168158
if err := git.DiscardFull(batchReader, discard); err != nil {
169-
return nil, err
159+
return err
170160
}
171161
}
172162

@@ -178,14 +168,14 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
178168
isGenerated = enry.IsGenerated(f.Name(), content)
179169
}
180170
if isGenerated {
181-
continue
171+
return nil
182172
}
183173

184174
// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
185175
// - e.g. do all the detection tests using the filename first, before reading content.
186176
language := analyze.GetCodeLanguage(f.Name(), content)
187177
if language == "" {
188-
continue
178+
return nil
189179
}
190180

191181
// group languages, such as Pug -> HTML; SCSS -> CSS
@@ -206,6 +196,9 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
206196
firstExcludedLanguage = language
207197
firstExcludedLanguageSize += f.Size()
208198
}
199+
return nil
200+
}, git.TrustedCmdArgs{"--long"}); err != nil {
201+
return sizes, err
209202
}
210203

211204
// If there are no included languages add the first excluded language

modules/git/parse_nogogit.go

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,13 @@ func ParseTreeEntries(data []byte) ([]*TreeEntry, error) {
2222
// parseTreeEntries FIXME this function's design is not right, it should not make the caller read all data into memory
2323
func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
2424
entries := make([]*TreeEntry, 0, bytes.Count(data, []byte{'\n'})+1)
25+
return entries, iterateTreeEntries(data, ptree, func(entry *TreeEntry) error {
26+
entries = append(entries, entry)
27+
return nil
28+
})
29+
}
30+
31+
func iterateTreeEntries(data []byte, ptree *Tree, f func(entry *TreeEntry) error) error {
2532
for pos := 0; pos < len(data); {
2633
posEnd := bytes.IndexByte(data[pos:], '\n')
2734
if posEnd == -1 {
@@ -33,7 +40,7 @@ func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
3340
line := data[pos:posEnd]
3441
lsTreeLine, err := parseLsTreeLine(line)
3542
if err != nil {
36-
return nil, err
43+
return err
3744
}
3845
entry := &TreeEntry{
3946
ptree: ptree,
@@ -44,9 +51,11 @@ func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
4451
sized: lsTreeLine.Size.Has(),
4552
}
4653
pos = posEnd + 1
47-
entries = append(entries, entry)
54+
if err := f(entry); err != nil {
55+
return err
56+
}
4857
}
49-
return entries, nil
58+
return nil
5059
}
5160

5261
func catBatchParseTreeEntries(objectFormat ObjectFormat, ptree *Tree, rd *bufio.Reader, sz int64) ([]*TreeEntry, error) {

modules/git/tree_nogogit.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
package git
77

88
import (
9+
"bufio"
10+
"context"
911
"io"
1012
"strings"
1113
)
@@ -122,3 +124,50 @@ func (t *Tree) ListEntriesRecursiveFast() (Entries, error) {
122124
func (t *Tree) ListEntriesRecursiveWithSize() (Entries, error) {
123125
return t.listEntriesRecursive(TrustedCmdArgs{"--long"})
124126
}
127+
128+
// IterateEntriesRecursive iterates over the entries of the current tree recursively, including all subtrees
129+
// extraArgs could be "-l" to get the size, which is slower
130+
func (t *Tree) IterateEntriesRecursive(ctx context.Context, f func(ctx context.Context, entry *TreeEntry) error, extraArgs TrustedCmdArgs) error {
131+
reader, writer := io.Pipe()
132+
done := make(chan error)
133+
134+
go func(t *Tree, done chan error, writer *io.PipeWriter) {
135+
runErr := NewCommand("ls-tree", "-t", "-r").
136+
AddArguments(extraArgs...).
137+
AddDynamicArguments(t.ID.String()).
138+
Run(ctx, &RunOpts{
139+
Dir: t.repo.Path,
140+
Stdout: writer,
141+
})
142+
143+
_ = writer.Close()
144+
145+
done <- runErr
146+
}(t, done, writer)
147+
148+
scanner := bufio.NewScanner(reader)
149+
for scanner.Scan() {
150+
if err := scanner.Err(); err != nil {
151+
return err
152+
}
153+
154+
data := scanner.Bytes()
155+
if err := iterateTreeEntries(data, t, func(entry *TreeEntry) error {
156+
if err := f(ctx, entry); err != nil {
157+
return err
158+
}
159+
160+
select {
161+
case <-ctx.Done():
162+
return ctx.Err()
163+
case runErr := <-done:
164+
return runErr
165+
default:
166+
return nil
167+
}
168+
}); err != nil {
169+
return err
170+
}
171+
}
172+
return nil
173+
}

modules/indexer/stats/db.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ func (db *DBIndexer) Index(id int64) error {
6363
}
6464

6565
// Calculate and save language statistics to database
66-
stats, err := languagestats.GetLanguageStats(gitRepo, commitID)
66+
stats, err := languagestats.CalcLanguageStats(ctx, gitRepo, commitID)
6767
if err != nil {
6868
if !setting.IsInTesting {
6969
log.Error("Unable to get language stats for ID %s for default branch %s in %s. Error: %v", commitID, repo.DefaultBranch, repo.FullName(), err)

0 commit comments

Comments
 (0)