-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathancient_to_modern.py
108 lines (104 loc) · 5.44 KB
/
ancient_to_modern.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import unicodedata
import re
lowercase_vowels_without_accents = {'α', 'ε', 'η', 'ι', 'ο', 'υ', 'ω'}
diphthongs = {'αι','αυ','ει','ευ','ου','οι'}
accents_i_do_not_care_about = {'\u0313', '\u0314','\u0345'} # the psili, the daseia, and the iota subscript
modern_tono = '\u0301'
diaeresis = '\u0308'
def remove_accents(text): return ''.join(c for c in text if not unicodedata.combining(c))
def count_accents(text): return sum(1 for char in text if unicodedata.category(char) == 'Mn')
def count_vowels(text): return len([char for char in text.lower() if char in lowercase_vowels_without_accents])
def ancient_text_to_modern_pronunciation(ancient_text):
"""
This functions takes an Ancient Greek text, removes its accents, and adds accents such that the words are pronounced with a modern pronunciation
"""
text_with_accent_chars_separate = unicodedata.normalize('NFD',ancient_text)
words = text_with_accent_chars_separate.split()
output_words_list = []
for word in words:
# strip the accents that don't matter
word = ''.join([char for char in word if char not in accents_i_do_not_care_about])
# count remaining accents
num_accents = count_accents(word)
# if no accent, no accents
if num_accents == 0:
output_words_list.append(word + ' ')
continue
# if one vowel, no accents
vowel_count = count_vowels(word)
if vowel_count == 1:
output_words_list.append(remove_accents(word) + ' ')
continue
# if only two vowels and they make a diphthong
if vowel_count == 2 and not all(not bool(re.search(re.escape(chars), remove_accents(word))) for chars in diphthongs):
# if the diphthong has no diaeresis, no accents
if "\u0308" not in word:
output_words_list.append(remove_accents(word) + ' ')
continue
# if the diphthong has only a diaeresis, keep word as-is
elif num_accents == 1:
output_words_list.append(word + ' ')
continue
# if the diphthong has a diaeresis and one oxia, keep word as-is (tono has same unicode representation as oxia)
elif num_accents == 2 and '\u0301' in word:
output_words_list.append(word + ' ')
continue
# if the diphthong has a diaeresis and either one perispomeni or one vareia, replace perispomeni/vareia with tono
elif num_accents == 2:
if '\u0300' in word:
output_words_list.append(word.replace('\u0300', '\u0301') + ' ')
continue
elif '\u0342' in word:
output_words_list.append(word.replace('\u0342', '\u0301') + ' ')
continue
# if the diphthong has a diaeresis and some other combination of accents, give up
else:
output_words_list.append(word + ' ')
continue
# list each letter and its accents
non_accent_chars = [char for char in word if not unicodedata.combining(char)]
num_non_accent_chars = len(non_accent_chars)
accents_in_word = [char for char in word if unicodedata.combining(char)]
accent_character_locations = []
position = -1
for char in word:
if not unicodedata.combining(char):
position+=1
else:
accent_character_locations.append(position)
# if only one accent, modern accent on that letter
if num_accents == 1:
word = ''
for i in range(num_non_accent_chars):
word += non_accent_chars[i]
if i == accent_character_locations[0]:
word += modern_tono
output_words_list.append(word + ' ')
continue
# if exactly 2 accents, and both are either oxiae, perispomeni, or vareia, modern tono only on the first one
if num_accents == 2:
if accents_in_word[0] in {'\u0300', '\u0301', '\u0342'} and accents_in_word[1] in {'\u0300', '\u0301', '\u0342'}:
word = ''
for i in range(num_non_accent_chars):
word += non_accent_chars[i]
if i == accent_character_locations[0]:
word += modern_tono
output_words_list.append(word + ' ')
continue
# if one diaeresis and either an oxia, a perispomeni, or a vareia, keep diaeresis as-is (ancient and modern diaeresis have the same unicode representations), and turn the other accent into a tono
if diaeresis in accents_in_word:
if '\u0301' in accents_in_word[0] or '\u0301' in accents_in_word[1]:
output_words_list.append(word + ' ')
continue
elif '\u0342' in accents_in_word[0] or '\u0342' in accents_in_word[1]:
output_words_list.append(word.replace('\u0342', '\u0301') + ' ')
continue
elif '\u0300' in accents_in_word[0] or '\u0300' in accents_in_word[1]:
output_words_list.append(word.replace('\u0300', '\u0301') + ' ')
continue
# giving up on other cases
print(word) # list words it's given up on
output_words_list.append(word + ' ')
# recombine accents into letters
output_words = unicodedata.normalize('NFC',''.join(output_words_list))
return(output_words)