@@ -27,22 +27,6 @@ class PageSummaryEnhancer(SummaryEnhancer):
27
27
BASE64_IMAGE_KEY = "base64_image"
28
28
DEFAULT_PAGE_NR = 1
29
29
30
- async def _acreate_summary (self , information : list [Document ], config : Optional [RunnableConfig ]) -> list [Document ]:
31
- # group infos by page, defaulting to page 1 if no page metadata
32
- if self ._chunker_settings :
33
- filtered_information = [
34
- info for info in information if len (info .page_content ) > self ._chunker_settings .max_size
35
- ]
36
- else :
37
- filtered_information = information
38
- grouped = [
39
- [info for info in filtered_information if info .metadata .get ("page" , self .DEFAULT_PAGE_NR ) == page ]
40
- for page in {info_piece .metadata .get ("page" , self .DEFAULT_PAGE_NR ) for info_piece in filtered_information }
41
- ]
42
-
43
- summary_tasks = [self ._asummarize_page (info_group , config ) for info_group in tqdm (grouped )]
44
- return await gather (* summary_tasks )
45
-
46
30
async def _asummarize_page (self , page_pieces : list [Document ], config : Optional [RunnableConfig ]) -> Document :
47
31
full_page_content = " " .join ([piece .page_content for piece in page_pieces ])
48
32
summary = await self ._summarizer .ainvoke (full_page_content , config )
@@ -52,3 +36,26 @@ async def _asummarize_page(self, page_pieces: list[Document], config: Optional[R
52
36
meta ["type" ] = ContentType .SUMMARY .value
53
37
54
38
return Document (metadata = meta , page_content = summary )
39
+
40
async def _acreate_summary(self, information: list[Document], config: Optional[RunnableConfig]) -> list[Document]:
    """Summarize the given pieces page by page.

    Pieces are grouped by their ``"page"`` metadata entry; a piece without
    that entry is assigned to ``DEFAULT_PAGE_NR``. When chunker settings are
    configured, a page whose pieces (joined with a single space) are shorter
    than ``max_size`` is skipped and produces no summary.

    Parameters
    ----------
    information : list[Document]
        The pieces to summarize.
    config : Optional[RunnableConfig]
        Passed through unchanged to ``_asummarize_page`` for every page.

    Returns
    -------
    list[Document]
        One ``_asummarize_page`` result per retained page, ordered by the
        first appearance of each page in ``information``.
    """
    # Group in a single pass. Dicts preserve insertion order, so the pages
    # come out in order of first appearance and each group keeps the input
    # order of its pieces.
    # NOTE: this requires the page value to be hashable.
    pieces_by_page: dict[object, list[Document]] = {}
    for piece in information:
        page = piece.metadata.get("page", self.DEFAULT_PAGE_NR)
        pieces_by_page.setdefault(page, []).append(piece)

    settings = self._chunker_settings
    grouped = []
    for group in pieces_by_page.values():
        # The size check uses the joined text, i.e. it includes the
        # separating spaces between the pieces of the page.
        if settings and len(" ".join(item.page_content for item in group)) < settings.max_size:
            continue
        grouped.append(group)

    summary_tasks = [self._asummarize_page(info_group, config) for info_group in tqdm(grouped)]

    return await gather(*summary_tasks)
0 commit comments