Skip to content

Commit c43230c

Browse files
committed
feat: improve converting subtitles to txt, remove duplicate lines
1 parent dbed6e5 commit c43230c

File tree

2 files changed

+36
-16
lines changed

2 files changed

+36
-16
lines changed

lib/subtitles-summary.js

+10 -7
Original file line number | Diff line number | Diff line change
@@ -29,13 +29,16 @@ export async function summarizeVideo(id, repo, callback = () => {}) {
2929
})
3030
}
3131
export function cleanTranscript (transcript) {
32-
return transcript
33-
.split('\n')
34-
.map(line => {
35-
if (/^\d+/.test(line)) return
36-
line = line.replace(/\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}/g, '')
37-
return line.replace(/<[^>]+>/g, '').trim()
38-
}).filter(Boolean).join('\n').trim()
32+
const lines = []
33+
for (let line of transcript.split('\n')) {
34+
line = line.trim()
35+
if (/^\d+/.test(line)) continue
36+
if (line.match(/^\d+:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}/)) continue
37+
if (lines.length === 0 || !lines[lines.length - 1].startsWith(line)) {
38+
lines.push(line)
39+
}
40+
}
41+
return lines.filter(Boolean).join('\n').trim()
3942
}
4043

4144

test/subtitles.summary.test.js

+26 -9
Original file line number | Diff line number | Diff line change
@@ -5,21 +5,38 @@ import { cleanTranscript } from '../lib/subtitles-summary.js'
55
test('cleans transcript', () => {
66
const transcript = `
77
1
8-
00:00:01,240 --> 00:00:06,879
9-
<font color="white" size=".72c">when we are about to do something</font>
8+
00:00:01,240 --> 00:00:04,630
9+
10+
Open ai writes to the
1011
1112
2
12-
00:00:04,160 --> 00:00:10,519
13-
<font color="white" size=".72c">new</font>
13+
00:00:04,630 --> 00:00:04,640
14+
Open ai writes to the
15+
1416
1517
3
16-
00:00:06,879 --> 00:00:14,080
17-
<font color="white" size=".72c">Eh after finishing</font>
18+
00:00:04,640 --> 00:00:09,110
19+
Open ai writes to the
20+
US administration their position Well it is
21+
22+
4
23+
00:00:09,110 --> 00:00:09,120
24+
US administration their position Well it is
25+
26+
27+
5
28+
00:00:09,120 --> 00:00:12,509
29+
US administration their position Well it is
30+
necessary that it be free from copyright from
31+
32+
6
33+
00:00:12,509 --> 00:00:12,519
34+
necessary that it be free from copyright from
1835
`.trim()
1936
const expectedCleanTranscript = `
20-
when we are about to do something
21-
new
22-
Eh after finishing
37+
Open ai writes to the
38+
US administration their position Well it is
39+
necessary that it be free from copyright from
2340
`.trim()
2441

2542
assert.strictEqual(cleanTranscript(transcript), expectedCleanTranscript)

0 commit comments

Comments
 (0)