-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathepub-metadata-extractor.py
80 lines (69 loc) · 2.92 KB
/
epub-metadata-extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from bs4 import BeautifulSoup
import argparse
import ebooklib.epub
import json
import nltk
import yaml
# Download the tokenizer models used by nltk.word_tokenize(). Older NLTK
# releases load the 'punkt' package, while newer releases load 'punkt_tab'
# instead; requesting both keeps the script working with either.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Command-line interface: one positional epub path plus two optional settings.
parser = argparse.ArgumentParser(description='Count words and estimate pages in an epub file.')
parser.add_argument('file', help='The epub file to analyze.')
parser.add_argument(
    '--words_per_page',
    type=int,
    default=280,
    help='The number of words per page for the page estimate. Default is 280 for fiction works. Non-fiction works can be set to 230.',
)
parser.add_argument(
    '--output_format',
    choices=['text', 'json', 'yaml'],
    default='text',
    help='The output format. Can be "text", "json", or "yaml". Default is "text".',
)
args = parser.parse_args()

# The page estimate divides by this value, so reject zero and negative numbers
# up front with a usage error instead of failing later with a traceback.
if args.words_per_page <= 0:
    parser.error('--words_per_page must be a positive integer.')
def count_words_and_pages(file, words_per_page):
    """Count the words in an epub file and estimate its number of pages.

    Args:
        file: Path of the epub file to read.
        words_per_page: Number of words assumed to fit on one page. Must be
            greater than zero.

    Returns:
        A ``(total_words, total_pages)`` tuple. ``total_pages`` is
        ``total_words / words_per_page`` passed through the built-in
        ``round()``.

    Raises:
        ValueError: If ``words_per_page`` is zero or negative.
    """
    # Fail before the (comparatively slow) parsing work when the divisor
    # cannot produce a meaningful page estimate.
    if words_per_page <= 0:
        raise ValueError('words_per_page must be greater than zero')

    book = ebooklib.epub.read_epub(file, options={'ignore_ncx': True})

    total_words = 0
    for item in book.get_items():
        # Only HTML documents carry readable text; skip every other item type.
        if not isinstance(item, ebooklib.epub.EpubHtml):
            continue
        text = BeautifulSoup(item.content, 'html.parser').get_text()
        # word_tokenize() returns punctuation marks as separate tokens, so
        # only tokens containing at least one letter or digit count as words.
        total_words += sum(
            1
            for token in nltk.word_tokenize(text)
            if any(char.isalnum() for char in token)
        )

    # Estimate pages
    total_pages = total_words / words_per_page
    return total_words, round(total_pages)
def extract_metadata(file):
    """Read a handful of metadata fields from an epub file.

    Each field is looked up in the 'DC' metadata namespace and reduced to the
    value of its first entry, or ``None`` when the book has no such entry.

    Args:
        file: Path of the epub file to read.

    Returns:
        A ``(isbn, title, author, booklicense, description)`` tuple taken from
        the ``identifier``, ``title``, ``creator``, ``rights`` and
        ``description`` fields respectively. Note that ``isbn`` is simply the
        first identifier listed; nothing here checks that it is an ISBN.
    """
    book = ebooklib.epub.read_epub(file, options={'ignore_ncx': True})

    def first_value(field):
        # Each metadata entry is indexable; position 0 holds the value.
        values = [entry[0] for entry in book.get_metadata('DC', field)]
        if not values:
            return None
        return values[0]

    fields = ('identifier', 'title', 'creator', 'rights', 'description')
    return tuple(first_value(field) for field in fields)
# Gather the word statistics and the metadata for the requested file.
words, pages = count_words_and_pages(args.file, args.words_per_page)
isbn, title, author, booklicense, description = extract_metadata(args.file)

# One record shared by every output format. Key order matters for the JSON
# output, which serializes keys in insertion order.
report = {
    'title': title,
    'author': author,
    'isbn': isbn,
    'license': booklicense,
    'description': description,
    'total_words': words,
    'total_pages': pages,
}

# Write the report to stdout in the format chosen on the command line.
if args.output_format == 'json':
    print(json.dumps(report, ensure_ascii=False))
elif args.output_format == 'yaml':
    print(yaml.dump(report, allow_unicode=True))
elif args.output_format == 'text':
    # Human-readable label for each record key, in display order.
    text_labels = (
        ('Title', 'title'),
        ('Author', 'author'),
        ('ISBN', 'isbn'),
        ('License', 'license'),
        ('Description', 'description'),
        ('Total words', 'total_words'),
        ('Total pages', 'total_pages'),
    )
    for label, key in text_labels:
        print(f'{label}: {report[key]}')