-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHallucinationRemover.py
66 lines (44 loc) · 2.13 KB
/
HallucinationRemover.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def HallucinationRemover(transcript, inputname):
import shutil
pathaudio = './FinishedAudio/'
pathrejectaudio = './RejectAudio/'
remove_list = []
for transcriptcount, entry in enumerate(transcript): # can't directly subscript entries from pandas dataframes in loop for some reason, will have to access through the whole transcript
splitentry = entry.split('|')
audioname = splitentry[0]
loadedAudioFile = pathaudio + audioname
transcribedtext = splitentry[1]
transcribedtextbag = transcribedtext.split(' ')
transcribedtextsize = len(transcribedtext)
if (transcribedtextsize > 199): #remove text entries which are too long
remove_list.append(transcriptcount)
loadedrejectedaudio = pathrejectaudio + audioname
shutil.move(loadedAudioFile, loadedrejectedaudio)
continue
repeatdetected = 0
print("HallucinationRemover: Analyzing Line" + str(transcriptcount))
for wordcount, word in enumerate(transcribedtextbag):
if wordcount == 0:
potentialrepeatword = word
else:
if potentialrepeatword == word:
repeatdetected = repeatdetected + 1
else:
potentialrepeatword = word
repeatdetected = 0
if repeatdetected > 2:
remove_list.append(transcriptcount)
loadedrejectedaudio = pathrejectaudio + audioname
shutil.move(loadedAudioFile, loadedrejectedaudio)
break
elif potentialrepeatword == 'the' and repeatdetected > 0:
remove_list.append(transcriptcount)
loadedrejectedaudio = pathrejectaudio + audioname
shutil.move(loadedAudioFile, loadedrejectedaudio)
break
transcriptcleaned = [a for i, a in enumerate(transcript) if i not in remove_list] #remove bad entries from transcript
cleanedoutputname = inputname + "_cleaned.txt"
f = open(cleanedoutputname, 'a')
for item in transcriptcleaned:
f.write(item)
f.close()