-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWordRecognition.py
56 lines (53 loc) · 1.82 KB
/
WordRecognition.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
"""
# Word文件读取
Author: shq
Packages: os, pywin32 (win32com.client), python-docx (docx)
"""
import os, win32com.client
from docx import Document
def all_words(file_dir):
    """
    Collect the paths of all Word documents beneath ``file_dir``.

    The directory tree is traversed recursively with ``os.walk``. A file is
    kept when its name ends with ``.docx`` or ``.doc``; the comparison is
    case-sensitive.

    Args:
        file_dir: Root directory to search.

    Returns:
        A list of paths (containing directory joined with the file name),
        in the order ``os.walk`` yields them.
    """
    word_extensions = ('.docx', '.doc')
    found = []
    for folder, _subdirs, names in os.walk(file_dir):
        # Keep only the entries whose extension marks them as Word files.
        found.extend(
            os.path.join(folder, name)
            for name in names
            if name.endswith(word_extensions)
        )
    return found
def _read_docx_text(path):
    """Return the text of a .docx file: all paragraphs first, then table rows."""
    doc = Document(path)
    pieces = [paragraph.text + "\n" for paragraph in doc.paragraphs]
    for table in doc.tables:
        for row in table.rows:
            # Cells of one row are tab-separated; each row ends with a newline.
            pieces.extend(cell.text + "\t" for cell in row.cells)
            pieces.append("\n")
    return "".join(pieces)


def _read_doc_text(path):
    """Return the paragraph text of a legacy .doc file via Word COM automation."""
    app = win32com.client.Dispatch("Word.Application")
    try:
        # Word runs as a separate process, so a relative path would not be
        # resolved against this script's working directory.
        doc = app.Documents.Open(os.path.abspath(path))
        try:
            # Range.Text is expected to carry Word's own paragraph/cell end
            # marks; strip them so both formats produce "\n"-terminated lines.
            return "".join(
                para.Range.Text.rstrip("\r\x07") + "\n"
                for para in doc.Paragraphs
            )
        finally:
            doc.Close(False)  # close without saving changes
    finally:
        # Always shut the Word instance down, even when reading fails.
        app.Quit()


def read_word(word_dir):
    """
    Read the text of every Word document found under ``word_dir``.

    Each document's text is also written to
    ``./text/<file name>_text.txt`` (UTF-8); the ``./text/`` directory is
    created on demand. Documents that share a file name in different
    sub-folders therefore write to the same output file.

    Args:
        word_dir: Directory that is searched recursively via ``all_words``.

    Returns:
        A list with one text string per document, in the order returned by
        ``all_words``.
    """
    output_dir = './text/'
    result_texts = []
    for word_path in all_words(word_dir):
        extension = os.path.splitext(word_path)[1]
        if extension == '.docx':  # current Word format
            document_text = _read_docx_text(word_path)
        elif extension == '.doc':  # legacy Word format
            document_text = _read_doc_text(word_path)
        else:
            # Not expected given the filter in all_words; skip defensively.
            continue
        result_texts.append(document_text)

        os.makedirs(output_dir, exist_ok=True)
        out_path = output_dir + os.path.basename(word_path) + '_text.txt'
        with open(out_path, 'w', encoding='utf-8') as file:
            file.write(document_text)
    return result_texts
if __name__ == "__main__":
    # Script entry point: extract every Word document under the sample
    # folder and print each document's text with a 1-based counter.
    for index, content in enumerate(read_word(".\\testfiles"), start=1):
        print(f"Document {index} Content:\n{content}\n")