Skip to content

Commit f5f2612

Browse files
author
johnjor
committed
Parameterized image fetcher function. Fixed handling for anchor with no href attribute.
1 parent a7db5d0 commit f5f2612

File tree

4 files changed

+45
-8
lines changed

4 files changed

+45
-8
lines changed

README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,23 @@
11
# html3docx
2-
A fork of https://github.com/pqzx/html2docx. This version will focus on expedient changes for our particular use case.
2+
A fork of https://github.com/pqzx/html2docx. This version will focus on expedient changes for our particular use case,
3+
and thus will receive infrequent updates.
34

45
Dependencies: `python-docx` & `bs4`
56

67
### To install
78

89
`pip install html3docx`
910

11+
PyPI: https://pypi.org/project/html3docx/
12+
1013
### Improvements
1114

1215
- Fix for KeyError when handling an img tag without a src attribute.
1316
- Images with a width attribute will be scaled according to that width.
1417
- Fix for AttributeError when handling a leading br tag, either at the top of the HTML snippet, or within a td or th cell.
1518
- Fix for IndexError when a table has more cells in later rows than in the first row.
19+
- Parameterized image fetcher function.
20+
- Fix for KeyError when handling an anchor with no href attribute.
1621

1722
## Original README
1823

htmldocx/h2d.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,11 @@
1717
import urllib.request
1818
from urllib.parse import urlparse
1919
from html.parser import HTMLParser
20+
from typing import Callable
2021

2122
import docx, docx.table
2223
from docx import Document
23-
from docx.shared import RGBColor, Pt, Inches
24+
from docx.shared import RGBColor, Inches
2425
from docx.enum.text import WD_COLOR, WD_ALIGN_PARAGRAPH
2526
from docx.image.image import Image
2627
from docx.oxml import OxmlElement
@@ -196,7 +197,8 @@ def __init__(self,
196197
ul_style="List Bullet",
197198
ol_style="List Number",
198199
table_style=DEFAULT_TABLE_STYLE,
199-
paragraph_style=DEFAULT_PARAGRAPH_STYLE):
200+
paragraph_style=DEFAULT_PARAGRAPH_STYLE,
201+
custom_image_fetcher: Callable = fetch_image):
200202
super().__init__()
201203
self.options = {
202204
'fix-html': True,
@@ -214,6 +216,7 @@ def __init__(self,
214216
self.paragraph_style = paragraph_style
215217
self.ul_style = ul_style
216218
self.ol_style = ol_style
219+
self.image_fetcher = custom_image_fetcher
217220

218221
def set_initial_attrs(self, document=None):
219222
self.tags = {
@@ -224,9 +227,9 @@ def set_initial_attrs(self, document=None):
224227
self.doc = document
225228
else:
226229
self.doc = Document()
227-
self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup
230+
self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup
228231
self.document = self.doc
229-
self.include_tables = True #TODO add this option back in?
232+
self.include_tables = True # TODO add this option back in?
230233
self.include_images = self.options['images']
231234
self.include_styles = self.options['styles']
232235
self.paragraph = None
@@ -240,6 +243,7 @@ def copy_settings_from(self, other):
240243
self.paragraph_style = other.paragraph_style
241244
self.ul_style = other.ul_style
242245
self.ol_style = other.ol_style
246+
self.image_fetcher = other.image_fetcher
243247

244248
def get_cell_html(self, soup):
245249
# Returns string of td element with opening and closing <td> tags removed
@@ -351,7 +355,7 @@ def handle_img(self, current_attrs):
351355
src_is_url = is_url(src)
352356
if src_is_url:
353357
try:
354-
image = fetch_image(src)
358+
image = self.image_fetcher(src)
355359
except urllib.error.URLError:
356360
image = None
357361
else:
@@ -598,7 +602,7 @@ def handle_data(self, data):
598602
# https://html.spec.whatwg.org/#interactive-content
599603
link = self.tags.get('a')
600604
if link:
601-
self.handle_link(link['href'], data)
605+
self.handle_link(link.get("href", ""), data)
602606
else:
603607
# If there's a link, don't put the data directly in the run
604608
self.run = self.paragraph.add_run(data)

tests/test.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import os
2+
import io
3+
import urllib.request
24
from pathlib import Path
35
import unittest
46
from docx import Document
@@ -257,7 +259,6 @@ def test_leading_br(self):
257259
self.document.add_heading('Test: Leading BR', level=1)
258260
self.parser.add_html_to_document("<br /><p>Hello</p>", self.document)
259261

260-
261262
def test_unbalanced_table(self):
262263
# A table with more td elements in latter rows than in the first
263264
self.document.add_heading(
@@ -292,6 +293,30 @@ def test_custom_ul(self):
292293
]:
293294
custom_parser.add_html_to_document(snippet, self.document)
294295

296+
def test_custom_image_fetcher(self):
    """Exercise HtmlToDocx with a caller-supplied ``custom_image_fetcher``.

    The HTML snippets reference ``oops.githubusercontent.com``; the custom
    fetcher rewrites that host to ``raw.githubusercontent.com`` before
    downloading. One snippet has a top-level image and one has an image
    inside a table cell.
    """
    # URLs the fetcher was asked for (after rewriting). Recorded before any
    # network access so the assertions below do not depend on connectivity.
    requested_urls = []

    def custom_fetcher(url):
        # str.replace returns a new string; the result must be rebound,
        # otherwise the original (unrewritten) URL is the one requested.
        url = url.replace("oops.githubusercontent.com", "raw.githubusercontent.com")
        requested_urls.append(url)
        try:
            with urllib.request.urlopen(url) as response:
                # NOTE: this opens whatever URL the HTML supplies. That is
                # acceptable here because the test controls the input; a
                # production fetcher should validate scheme and host first.
                return io.BytesIO(response.read())
        except urllib.error.URLError:
            # Best effort: report "no image" instead of failing the parse.
            return None

    custom_parser = HtmlToDocx(custom_image_fetcher=custom_fetcher)

    self.document.add_heading(
        'Test: Handling Images with Custom Fetch Function',
        level=1
    )
    custom_parser.add_html_to_document(
        "<img src='https://oops.githubusercontent.com/pqzx/h2d/master/testimg.png' />", self.document)
    custom_parser.add_html_to_document(
        "<table><tbody><tr><td>"
        "<img src='https://oops.githubusercontent.com/pqzx/h2d/master/testimg.png' />"
        "</td></tr></tbody></table>", self.document)

    # The custom fetcher must actually have been used, and every URL it
    # requested must have had its host rewritten.
    self.assertTrue(requested_urls, "custom image fetcher was never called")
    for fetched_url in requested_urls:
        self.assertNotIn("oops.githubusercontent.com", fetched_url)
319+
295320

296321
if __name__ == '__main__':
297322
unittest.main()

tests/text1.html

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ <h1>heading 1</h1>
4343
<p style="text-align: left;">asdfsa</p>
4444
<p style="text-align: left;"><a class="fr-green fr-strong" href="abc://def.ghi" rel="noopener noreferrer"
4545
target="_blank">link</a></p>
46+
<p style="text-align: left;"><a class="fr-green fr-strong nohref" rel="noopener noreferrer"
47+
target="_blank">link</a></p>
48+
4649
<ol>
4750
<li>Ordered list</li>
4851
<ul>

0 commit comments

Comments
 (0)